-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathconfig.yaml
114 lines (97 loc) · 3.55 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# ========== Pipeline Control ==========
# Selects which span of the pipeline is executed.
# Both keys take one of: "CHUNK", "GENERATE", "TRAIN".
pipeline:
  # First stage to run.
  from_stage: "CHUNK"
  # Last stage to run; the pipeline runs from from_stage to to_stage.
  to_stage: "TRAIN"
# ========== Global Settings ==========
# Directory containing source documents.
path_to_knowledgebase: './knowledgebase'
# Hugging Face username.
hub_username: 'AdamLucek'
# Optional: leave as null and use the HF_TOKEN env variable instead,
# which keeps the token out of version control.
hub_token: null
# ========== Document Chunking ==========
# Settings for the CHUNK stage.
chunker_config:
  # File the chunked knowledgebase is written to. The later sections
  # reference this same path as local_knowledgebase_path.
  output_path: "./output/knowledgebase.json"

  # Name of the chunker to use.
  chunker: "RecursiveTokenChunker"

  # Arguments for the selected chunker.
  chunker_arguments:
    # Size and overlap are presumably measured in the unit named by
    # length_type -- TODO confirm against the chunker implementation.
    chunk_size: 400
    chunk_overlap: 0
    length_type: "character"
    # Double quotes are required here so that \n is read as a newline.
    separators: ["\n\n", "\n", ".", "?", "!", " ", ""]
    keep_separator: true
    is_separator_regex: false

  # Optional: Push chunks to Hugging Face Hub
  upload_config:
    push_to_hub: true
    hub_private: false
    hub_dataset_id: "AdamLucek/quickb-kb"
# ========== Question Generation ==========
# Settings for the GENERATE stage.
question_generation:
  # File the generated training data is written to. The training
  # section references this same path as local_train_path.
  output_path: "./output/train_data.json"

  # LLM/Embedding Configuration
  litellm_config:
    model: "openai/gpt-4o-mini"
    # Optional: Custom API endpoint
    model_api_base: null
    embedding_model: "text-embedding-3-large"
    # Optional: Custom embedding endpoint
    embedding_api_base: null

  # Input dataset settings
  input_dataset_config:
    # Options: "local", "hub"
    dataset_source: "local"
    # Same file as chunker_config.output_path.
    local_knowledgebase_path: "./output/knowledgebase.json"
    # Hub alternative:
    # knowledgebase_dataset_id: "AdamLucek/quickb-kb"

  # Performance settings
  # Parallel question generation
  max_workers: 150
  # Rate limits; null = no limit
  llm_calls_per_minute: null
  embedding_calls_per_minute: null

  # Question deduplication
  deduplication_enabled: true
  # Batch size for embedding calculation
  dedup_embedding_batch_size: 2048
  # Semantic Similarity Threshold
  similarity_threshold: 0.85

  # Optional: Push training data to Hub
  upload_config:
    push_to_hub: true
    hub_private: false
    hub_dataset_id: "AdamLucek/quickb-qa"
# ========== Model Training ==========
# Settings for the TRAIN stage.
training:
  # Model configuration
  model_settings:
    # Base model:
    model_id: "nomic-ai/modernbert-embed-base"

    # Matryoshka dimensions (must be descending)
    matryoshka_dimensions: [768, 512, 256, 128, 64]
    # The "128" in this metric name matches one of the dimensions listed
    # above. NOTE(review): presumably it must stay in sync with that
    # list if the dimensions change -- confirm.
    metric_for_best_model: "eval_dim_128_cosine_ndcg@10"
    max_seq_length: 1024
    trust_remote_code: false

  # Training data configuration
  train_dataset_config:
    # Options: "local", "hub"
    dataset_source: "local"
    # Same files as question_generation.output_path and
    # chunker_config.output_path respectively.
    local_train_path: "./output/train_data.json"
    local_knowledgebase_path: "./output/knowledgebase.json"
    # Hub alternatives:
    # train_dataset_id: "AdamLucek/quickb-qa"
    # knowledgebase_dataset_id: "AdamLucek/quickb-kb"

  # Training hyperparameters
  training_arguments:
    output_path: "./output/modernbert_quickb"
    # Options: "cuda", "mps", "cpu"
    device: "cuda"
    epochs: 4
    batch_size: 32
    gradient_accumulation_steps: 16
    learning_rate: 2.0e-5
    warmup_ratio: 0.1
    lr_scheduler_type: "cosine"
    # NOTE(review): the fused optimizer, tf32 and bf16 are presumably
    # CUDA-oriented; verify they are still valid before switching
    # device to "mps" or "cpu".
    optim: "adamw_torch_fused"
    tf32: true
    bf16: true
    # Options: "batch_sampler", "no_duplicates", "group_by_label"
    batch_sampler: "no_duplicates"
    eval_strategy: "epoch"
    save_strategy: "epoch"
    logging_steps: 10
    save_total_limit: 3
    load_best_model_at_end: true
    report_to: "none"

  # Optional: Push trained model to Hub
  upload_config:
    push_to_hub: true
    hub_private: false
    hub_model_id: "AdamLucek/modernbert-embed-quickb"