Mmed_Llama_3_8b_retraining

Hi dears
i have retrained this model using huggingface trainer , but my loss didn’t decrease, and after training when i want to evaluate after few iteration return me cuda error

How can i improve model ?

My Data is 140K Rows

def train(model, tokenizer, train_dataset, eval_dataset, output_dir):
    # Enable gradient checkpointing to reduce memory usage
    model.gradient_checkpointing_enable()

    # Apply specific model preparation for k-bit training
    model = prepare_model_for_kbit_training(model)

    # Create PEFT config and wrap the model
    peft_config = create_peft_config(find_all_linear_names(model))
    model = get_peft_model(model, peft_config)

    # Print information about trainable parameters
    print_trainable_parameters(model)

    # Set up the Trainer with training and evaluation datasets
    training_args = TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=2,
        num_train_epochs=1,
        warmup_steps=150,
        learning_rate=1e-6,
        lr_scheduler_type='constant',
        fp16=True,
        logging_steps=100,
        output_dir=output_dir,
        optim="paged_adamw_32bit",
        
        save_strategy="steps",
        save_steps=1000
      
    )
    #   load_best_model_at_end=True,
        # metric_for_best_model='eval_loss',
        # greater_is_better=False
    # evaluation_strategy="steps",
        # eval_steps=1000,
    # # Check for last checkpoint in the output directory
    # last_checkpoint = None
    # if os.path.isdir(output_dir):
    #     # last_checkpoint = TrainingArguments.la(output_dir)
    #     if last_checkpoint:
    #         print(f"Resuming training from the last checkpoint: {last_checkpoint}")
    #     else:
    #         print("No checkpoint found, starting training from scratch.")

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
        resume_from_checkpoint=last_checkpoint,
        eval_dataset=eval_dataset,
    )
 
    # Disable caching during training, re-enable later for inference
    model.config.use_cache = False

    # Launch training and evaluation
    print("Training and evaluating...")
    # trainer.train()

    # Evaluate and log metrics
    print("Evaluating...")
    eval_result = trainer.evaluate()
    trainer.log_metrics("eval", eval_result)
    trainer.save_metrics("eval", eval_result)

    # Save the best model
    print("Saving best checkpoint of the model...")
    model.save_pretrained(output_dir)

    # Clean up to free memory
    del model, trainer
    torch.cuda.empty_cache()

Hi, im also struggling in this problem, can i ask your pytorch, cuda, transformers version?