Huggingface Trainer

1. Huggingface Trainer

Original config:

1
# =============================================================================
2
# Training Configuration
3
# =============================================================================
4
output_dir: null
5
do_train: false
6
do_eval: false
7
do_predict: false
8
num_train_epochs: 3.0
9
max_steps: -1
10
resume_from_checkpoint: null
11
12
# =============================================================================
13
# Evaluation Configuration
14
# =============================================================================
15
eval_steps: null
16
eval_delay: 0
17
# Options: 'no', 'steps', 'epoch'
18
eval_strategy: 'no'
19
20
# =============================================================================
21
# Batch & Gradient Configuration
22
# =============================================================================
23
per_device_train_batch_size: 8
24
per_device_eval_batch_size: 8
25
auto_find_batch_size: false
26
gradient_accumulation_steps: 1
27
gradient_checkpointing: false
28
29
# =============================================================================
30
# Optimizer & Learning Rate
31
# =============================================================================
32
learning_rate: 5e-5
33
weight_decay: 0.0
34
lr_scheduler_type: linear
35
warmup_ratio: 0.0
36
warmup_steps: 0
37
38
# =============================================================================
39
# Logging Configuration
40
# =============================================================================
41
# Options: 'debug', 'info', 'warning', 'error', 'critical', 'passive'
42
log_level: passive
43
logging_dir: null
44
log_on_each_node: true
45
# Options: 'no', 'steps', 'epoch'
46
logging_strategy: steps
47
logging_first_step: false
48
logging_steps: 500
49
50
# =============================================================================
51
# Model Saving Configuration
52
# =============================================================================
53
# Options: 'no', 'steps', 'epoch', 'best'
54
save_strategy: steps
55
save_steps: 500
56
save_total_limit: null
57
save_only_model: false
58
59
# =============================================================================
60
# Random Seeds
61
# =============================================================================
62
seed: 42
63
data_seed: null
64
65
# =============================================================================
66
# Hardware & Performance
67
# =============================================================================
68
use_ipex: false
69
bf16: false
70
fp16: false
71
tf32: null
72
torch_compile: false
73
torch_compile_backend: 'inductor'
74
use_liger_kernel: false
75
76
# =============================================================================
77
# DataLoader Configuration
78
# =============================================================================
79
dataloader_drop_last: false
80
dataloader_num_workers: 0
81
dataloader_prefetch_factor: null
82
dataloader_pin_memory: true
83
dataloader_persistent_workers: false
84
remove_unused_columns: true
85
label_names: null
86
87
# =============================================================================
88
# Experiment Tracking
89
# =============================================================================
90
run_name: null
91
report_to: null
92
93
# =============================================================================
94
# Hugging Face Hub Integration
95
# =============================================================================
96
push_to_hub: false
97
hub_model_id: null
98
# Options: 'end', 'every_save', 'checkpoint', 'all_checkpoints'
99
hub_strategy: 'every_save'
100
hub_revision: null

A commonly used config:

1
# =============================================================================
2
# Training Configuration
3
# =============================================================================
4
output_dir: ./output
5
do_train: true
6
do_eval: false
7
do_predict: false
8
num_train_epochs: 1.0
9
max_steps: -1
10
resume_from_checkpoint: null
11
12
# =============================================================================
13
# Evaluation Configuration
14
# =============================================================================
15
eval_steps: null
16
eval_delay: 0
17
# Options: 'no', 'steps', 'epoch'
18
eval_strategy: 'no'
19
20
# =============================================================================
21
# Batch & Gradient Configuration
22
# =============================================================================
23
per_device_train_batch_size: 8
24
per_device_eval_batch_size: 8
25
auto_find_batch_size: false
26
gradient_accumulation_steps: 1
27
gradient_checkpointing: true
28
29
# =============================================================================
30
# Optimizer & Learning Rate
31
# =============================================================================
32
learning_rate: 1e-5
33
weight_decay: 0.0
34
lr_scheduler_type: cosine
35
warmup_ratio: 0.03
36
warmup_steps: 0
37
38
# =============================================================================
39
# Logging Configuration
40
# =============================================================================
41
# Options: 'debug', 'info', 'warning', 'error', 'critical', 'passive'
42
log_level: passive
43
logging_dir: ./logs
44
log_on_each_node: false
45
# Options: 'no', 'steps', 'epoch'
46
logging_strategy: steps
47
logging_first_step: true
48
logging_steps: 1
49
50
# =============================================================================
51
# Model Saving Configuration
52
# =============================================================================
53
# Options: 'no', 'steps', 'epoch', 'best'
54
save_strategy: steps
55
save_steps: 500
56
save_total_limit: null
57
save_only_model: false
58
59
# =============================================================================
60
# Random Seeds
61
# =============================================================================
62
seed: 42
63
data_seed: null
64
65
# =============================================================================
66
# Hardware & Performance
67
# =============================================================================
68
use_ipex: false
69
bf16: true
70
fp16: false
71
tf32: true
72
torch_compile: false
73
torch_compile_backend: 'inductor'
74
use_liger_kernel: true
75
76
# =============================================================================
77
# DataLoader Configuration
78
# =============================================================================
79
dataloader_drop_last: false
80
dataloader_num_workers: 8
81
dataloader_prefetch_factor: 2
82
dataloader_pin_memory: true
83
dataloader_persistent_workers: false
84
remove_unused_columns: true
85
label_names: null
86
87
# =============================================================================
88
# Experiment Tracking
89
# =============================================================================
90
run_name: null
91
report_to: wandb
92
93
# =============================================================================
94
# Hugging Face Hub Integration
95
# =============================================================================
96
push_to_hub: true
97
hub_model_id: null
98
# Options: 'end', 'every_save', 'checkpoint', 'all_checkpoints'
99
hub_strategy: 'all_checkpoints'
100
hub_revision: null

Some `Trainer` methods you can call, and some you may need to override in a subclass:

1
from transformers import (
2
AutoConfig,
3
AutoModel,
4
AutoModelForSequenceClassification,
5
AutoTokenizer,
6
Trainer,
7
)
8
9
# Load model and tokenizer
10
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
11
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
12
13
trainer = Trainer(
14
model=model,
15
args=training_args,
16
train_dataset=train_dataset,
17
eval_dataset=eval_dataset,
18
processing_class=tokenizer,
19
)
20
21
config = AutoConfig.from_pretrained(pretrained_model_name_or_path="bert-base-uncased")
22
model = AutoModel.from_config(config)
23
model_size = sum(t.numel() for t in model.parameters())
24
print(f"Model size: {model_size / 1000**2:.1f}M parameters")
25
# methods you can use
26
trainer.pop_callback()
27
trainer.remove_callback()
28
trainer.get_num_trainable_parameters()
29
trainer.get_learning_rates()
30
trainer.floating_point_ops(inputs=inputs)
31
trainer.init_hf_repo()
32
trainer.create_model_card()
33
trainer.push_to_hub()
34
35
36
# methods that you may need to modify
37
train_dataloader = trainer.get_train_dataloader()
38
eval_dataloader = trainer.get_eval_dataloader()
39
trainer.create_optimizer()
40
trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer)
41
trainer.log(logs=logs, start_time=start_time)
42
trainer.training_step(model=model, inputs=inputs, num_items_in_batch=num_items_in_batch)
43
trainer.compute_loss(
44
model=model, inputs=inputs, return_outputs=return_outputs, num_items_in_batch=num_items_in_batch
45
)
46
trainer.save_model()
47
trainer.evaluate()
48
trainer.evaluation_loop()
49
trainer.predict()
50
trainer.prediction_step()
51
52
trainer.train(resume_from_checkpoint=args.resume_from_checkpoint)