Hi all,
I have fine-tuned this model with the Hugging Face Trainer, but the training loss didn't decrease, and when I run evaluation after training I get a CUDA error after a few iterations.
How can I improve the model?
My dataset has about 140K rows.
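For context, I load the base model in 4-bit before calling train(). Below is a rough sketch of that step; the model name and the exact quantization settings are placeholders, not necessarily the values I used:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "path/to/base-model"  # placeholder for the actual base model

# Assumed 4-bit quantization config (QLoRA-style)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)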
import os
import torch
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from transformers.trainer_utils import get_last_checkpoint
from peft import prepare_model_for_kbit_training, get_peft_model

def train(model, tokenizer, train_dataset, eval_dataset, output_dir):
    # Enable gradient checkpointing to reduce memory usage
    model.gradient_checkpointing_enable()

    # Prepare the quantized model for k-bit (QLoRA) training
    model = prepare_model_for_kbit_training(model)

    # Create the PEFT config and wrap the model with LoRA adapters
    peft_config = create_peft_config(find_all_linear_names(model))
    model = get_peft_model(model, peft_config)

    # Print information about trainable parameters
    print_trainable_parameters(model)
    # Set up the training arguments
    training_args = TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        num_train_epochs=1,
        warmup_steps=150,
        learning_rate=1e-6,
        lr_scheduler_type="constant",
        fp16=True,
        logging_steps=100,
        output_dir=output_dir,
        optim="paged_adamw_32bit",
        save_strategy="steps",
        save_steps=1000,
        # Evaluation during training is currently disabled:
        # evaluation_strategy="steps",
        # eval_steps=1000,
        # load_best_model_at_end=True,
        # metric_for_best_model="eval_loss",
        # greater_is_better=False,
    )
    # Check for the last checkpoint in the output directory
    last_checkpoint = None
    if os.path.isdir(output_dir):
        last_checkpoint = get_last_checkpoint(output_dir)
    if last_checkpoint:
        print(f"Resuming training from the last checkpoint: {last_checkpoint}")
    else:
        print("No checkpoint found, starting training from scratch.")
    # Initialize the Trainer (resume_from_checkpoint is passed to trainer.train(), not the constructor)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    # Disable KV caching during training; re-enable it later for inference
    model.config.use_cache = False

    # Launch training, resuming from the last checkpoint if one exists
    print("Training...")
    trainer.train(resume_from_checkpoint=last_checkpoint)
    # Evaluate and log metrics
    print("Evaluating...")
    eval_result = trainer.evaluate()
    trainer.log_metrics("eval", eval_result)
    trainer.save_metrics("eval", eval_result)

    # Save the final model (adapter weights)
    print("Saving the model...")
    model.save_pretrained(output_dir)

    # Clean up to free memory
    del model, trainer
    torch.cuda.empty_cache()
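For reference, the helper functions called in train() (find_all_linear_names, create_peft_config, print_trainable_parameters) look roughly like this; this is a minimal sketch of a standard QLoRA setup, and the LoRA hyperparameters shown are assumptions rather than my exact values:

import bitsandbytes as bnb
from peft import LoraConfig

def find_all_linear_names(model):
    # Collect the names of all 4-bit linear modules so LoRA can target them
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, bnb.nn.Linear4bit):
            names = name.split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    lora_module_names.discard("lm_head")  # keep the LM head out of the LoRA targets
    return list(lora_module_names)

def create_peft_config(modules):
    # LoRA configuration for causal language modeling (assumed hyperparameters)
    return LoraConfig(
        r=16,
        lora_alpha=64,
        target_modules=modules,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
    )

def print_trainable_parameters(model):
    # Report how many parameters are actually trainable after wrapping with LoRA
    trainable, total = 0, 0
    for _, param in model.named_parameters():
        total += param.numel()
        if param.requires_grad:
            trainable += param.numel()
    print(f"trainable params: {trainable} || all params: {total} || trainable%: {100 * trainable / total:.2f}")

I then call the whole thing with something like train(model, tokenizer, train_dataset, eval_dataset, "outputs/").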