Now I am getting:
element 0 of tensors does not require grad and does not have a grad_fn
This happens when I load the output_dir checkpoint instead of the save_pretrained data and attempt to fine-tune it further.
Edit: I realized I needed to enable gradient checkpointing, and now I get a new error:
len(optimizer_state["found_inf_per_device"]) > 0
AssertionError: No inf checks were recorded for this optimizer.
Here is how I am loading my model (I still need to DRY this up; just testing for now):
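BNB_CONFIG is defined earlier in the script and not shown here; it is roughly a 4-bit BitsAndBytesConfig along the lines of the sketch below (this is a reconstruction for context, the exact flags in my real config may differ):

import torch
from transformers import BitsAndBytesConfig

# Rough sketch of BNB_CONFIG -- the exact flags in the real script may differ
BNB_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

And the loader itself: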
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM

def get_model(model_path, base_model):
    print('Detecting if model path is for a base model (not already fine-tuned)...')
    if model_path == base_model:
        # Base model: quantize, prepare for k-bit training, and wrap with a fresh LoRA adapter
        print('Detected model path is for a base model (not already fine-tuned).')
        print(f'Getting {model_path} base model...')
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            trust_remote_code=True,
            quantization_config=BNB_CONFIG,
            cache_dir=hugging_face_cache_dir
        )
        print(f'Got {model_path} base model.')
        print('Enabling model gradient checkpointing...')
        model.gradient_checkpointing_enable()
        print('Enabled model gradient checkpointing.')
        print('Preparing model for kbit training...')
        model = prepare_model_for_kbit_training(model)
        print('Prepared model for kbit training.')
        print(f'Getting target_modules for base model {base_model}...')
        if base_model == FALCON_7B:
            target_modules = ["query_key_value"]
        elif base_model == MISTRAL_7B:
            target_modules = ["q_proj", "v_proj"]
        else:
            raise Exception(f'Unable to determine target_modules for base model {base_model}!')
        print(f'Got target_modules for base model {base_model}: {", ".join(target_modules)}.')
        print('Setting up LoraConfig...')
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=target_modules,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM"
        )
        print('Set up LoraConfig.')
        print('Getting peft model...')
        model = get_peft_model(model, lora_config)
        print('Got peft model.')
    else:
        # Already fine-tuned: load the checkpoint directory directly
        print('Detected model path is NOT for a base model (this is a fine-tuned model).')
        print(f'Getting {model_path} fine-tuned model...')
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            trust_remote_code=True,
            quantization_config=BNB_CONFIG,
            cache_dir=hugging_face_cache_dir
        )
        print(f'Got {model_path} fine-tuned model.')
        print('Enabling model gradient checkpointing...')
        model.gradient_checkpointing_enable()
        print('Enabled model gradient checkpointing.')
        print('Preparing model for kbit training...')
        model = prepare_model_for_kbit_training(model)
        print('Prepared model for kbit training.')
        print('Unsetting model.config.use_cache...')
        model.config.use_cache = False
        print('Unset model.config.use_cache.')
    return model
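For what it's worth, a quick sanity check on the model this returns (a throwaway sketch, not part of my real script) to see whether anything actually requires grad:

# Throwaway sanity check -- model_path / base_model as in the calls above
model = get_model(model_path, base_model)
trainable = [n for n, p in model.named_parameters() if p.requires_grad]
print(f'{len(trainable)} parameter tensors require grad')
# In the base-model branch, where the PEFT wrapper is applied, this also works:
# model.print_trainable_parameters()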
And here is my training code:
import os
import transformers
from transformers import Trainer, TrainingArguments

def llm_training(model_path, base_model, training_tag):
    model = get_model(model_path, base_model)
    tokenizer = get_tokenizer(base_model)
    tokenized_dataset = get_tokenized_dataset(tokenizer)
    session_trainer_output_dir = os.path.join(trainer_output_dir, get_datetime_str(), training_tag)
    print('Setting up training arguments...')
    training_args = TrainingArguments(
        per_device_train_batch_size=11,
        gradient_accumulation_steps=8,
        num_train_epochs=1,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir=session_trainer_output_dir,
        overwrite_output_dir=True,
        save_strategy='epoch',
        save_total_limit=3,
        optim='paged_adamw_8bit',
        lr_scheduler_type="cosine",
        warmup_ratio=0.05,
        remove_unused_columns=True
    )
    print('Set up training arguments.')
    print('Constructing Trainer object...')
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset['train'],
        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
        # optimizers=(optimizer, None)
    )
    print('Constructed Trainer object.')
    print('Initiating training...')
    trainer.train()
    print('Initiated training (completed).')
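For completeness, the failing second run is kicked off roughly like this, with model_path pointing at the previous run's checkpoint directory (the paths below are placeholders, not my real ones):

# Hypothetical call site -- paths are placeholders
llm_training(
    model_path='trainer_output/2024-01-01_00-00-00/first-run/checkpoint-500',
    base_model=MISTRAL_7B,
    training_tag='second-run',
)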