Hello, I'm currently using the HF Trainer + LoRA + DeepSpeed ZeRO-2 for training. The problem is that the F1 score I get during evaluation while training is higher than the F1 I get on the same data after I merge the LoRA adapter and test the merged model. I also set load_best_model_at_end. Can anybody tell me what my merge step is doing wrong?
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=collator,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader if training_args.do_eval else None,
    compute_metrics=compute_metrics,  # if training_args.do_eval else None
)

if training_args.do_eval and callback_args.get("patient", None):
    trainer.add_callback(
        EarlyStoppingCallback(early_stopping_patience=callback_args.patient)
    )

# Training
checkpoint = False
if training_args.resume_from_checkpoint is not None:
    checkpoint = training_args.resume_from_checkpoint
trainer.train(resume_from_checkpoint=checkpoint)

# Merge the LoRA weights into the base model on rank 0 and save the merged checkpoint
if trainer.is_local_process_zero() and use_lora:
    model = trainer.model.merge_and_unload()
    model.save_pretrained(
        f"{training_args.output_dir}/merged_model",
        safe_serialization=True,
        max_shard_size="5GB",  # adjust this value based on your needs
    )
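
For what it's worth, this is a sanity check I can run right before saving (only a sketch, not my actual code): it prints which checkpoint load_best_model_at_end restored, then compares logits on one eval batch before and after merging. It assumes the model returns .logits, that I can pull a collated batch from the trainer's eval dataloader, and that I would then save merged directly instead of calling merge_and_unload() a second time:

import torch

# Which checkpoint did load_best_model_at_end restore?
print("best metric:", trainer.state.best_metric)
print("best checkpoint:", trainer.state.best_model_checkpoint)

# Compare logits before and after merging on a single eval batch.
trainer.model.eval()
batch = next(iter(trainer.get_eval_dataloader()))
batch = {k: v.to(trainer.model.device) for k, v in batch.items() if torch.is_tensor(v)}

with torch.no_grad():
    logits_adapter = trainer.model(**batch).logits.float().cpu()

merged = trainer.model.merge_and_unload()  # folds the LoRA deltas into the base weights
merged.eval()
with torch.no_grad():
    logits_merged = merged(**batch).logits.float().cpu()

# If the merge itself is lossless, this difference should be down at bf16 rounding level.
print("max abs logits diff:", (logits_adapter - logits_merged).abs().max().item())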
Here is my training config:
data_args:
  train_data:
    # annotate_path: "data/public_train/ocr_llm_fix.json"
    # image_path: "data/public_train/train-images"
    annotate_path: "data/warn_up/ocr_llm.json"
    image_path: "data/warn_up/warmup-images"
  val_data:
    annotate_path: "data/warn_up/ocr_llm.json"
    image_path: "data/warn_up/warmup-images"
  max_length: 2500
  # min_pixels: 200704
  # max_pixels: 1003520
  # cache_dir: "./data_cache"

training_args:
  freeze_base: False
  bf16: True
  use_lora: True
  learning_rate: 5e-5
  lr_scheduler_type: "cosine"
  warmup_ratio: 0.1
  weight_decay: 0.0
  gradient_checkpointing: true
  gradient_checkpointing_kwargs: {"use_reentrant": False}
  save_safetensors: False
  torch_compile_backend: "cudagraphs"
  use_liger_kernel: False  # BUG if True
  quantization: 0  # 0 = no quantization, 4 = 4-bit, 8 = 8-bit
  output_dir: "model/dump/dummp_1"
  overwrite_output_dir: true
  num_train_epochs: 1
  per_device_train_batch_size: 2
  per_device_eval_batch_size: 2
  gradient_accumulation_steps: 8
  dataloader_num_workers: 36
  do_train: true
  do_eval: true
  eval_strategy: "steps"  # evaluation is done (and logged) every eval_steps
  logging_strategy: "steps"
  save_strategy: "steps"
  eval_steps: 1
  save_steps: 1
  save_total_limit: 5
  # seed: null
  label_names: ["labels"]
  resume_from_checkpoint: False
  load_best_model_at_end: True
  prediction_loss_only: False
  metric_for_best_model: "f1"
  neftune_noise_alpha: 5

model_args:
  base_model: "LLaMA-Factory/models/qwen2_vl_lora_sft_v2"
  extra_layers: 4
  num_class: 4
  model_kwargs:
    token: "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"
    use_cache: False
    attn_implementation: "flash_attention_2"  # flash_attention_2, sdpa, eager

attn_implementation: "flash_attention_2"
tokenizer_args:
  cache_dir: "./tokenizer_data"
  token: "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"

lora_args:
  use_dora: False
  r: 16
  lora_alpha: 32
  lora_dropout: 0.05
  bias: "none"
  target_modules: [
    .*\.down_proj,  # Add $ to match end of string
    .*\.gate_proj,
    .*\.v_proj,
    .*\.up_proj,
    .*\.k_proj,
    .*\.o_proj,
    .*\.q_proj
  ]  # "all-linear"
  modules_to_save: [
    ^encoder_layers.*,  # Add ^ to match start of string
    ^classification_layer
  ]

wandb_args:
  run_name: "qwen2-vl-reasoning-tuned-based_dump"  # "My goodfellas"
  logging_steps: 1
  report_to: "wandb"

callback_args:
  patient: null

huggingface_args:
  push_to_hub: false
  hub_private_repo: true
  hub_model_id: "qwen2vl7b-cls"
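
And this is roughly how I test the merged checkpoint afterwards. It is only a sketch: MyQwen2VLClassifier and build_eval_dataloader are placeholders for my custom model class (Qwen2-VL base + extra_layers + classification head) and my data code, and the F1 averaging has to match whatever compute_metrics uses during training:

import torch
from sklearn.metrics import f1_score

# Load the merged checkpoint with the same custom class used for training.
model = MyQwen2VLClassifier.from_pretrained(
    "model/dump/dummp_1/merged_model",
    torch_dtype=torch.bfloat16,
).cuda().eval()

preds, refs = [], []
for batch in build_eval_dataloader():  # same val data, collator and preprocessing as during training
    labels = batch.pop("labels")
    batch = {k: v.cuda() for k, v in batch.items() if torch.is_tensor(v)}
    with torch.no_grad():
        logits = model(**batch).logits
    preds.extend(logits.argmax(dim=-1).cpu().tolist())
    refs.extend(labels.tolist())

# average= must match what compute_metrics uses (e.g. "macro" vs "weighted").
print("f1:", f1_score(refs, preds, average="macro"))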