I am trying to fine-tune a model after expanding its vocabulary, and I am running into problems when the vocabulary is resized.
# Expand the tokenizer's vocabulary with the new tokens, then grow the model's
# input (and tied output) embedding matrix to match the new vocab size.
# NOTE(review): resize_token_embeddings initializes the NEW rows randomly, which
# can destabilize early fp16 training; consider pad_to_multiple_of=8 (or 64) so
# the resized matrix stays tensor-core friendly — confirm against the installed
# transformers version.
tokenizer.add_tokens(list(new_tokens))
model.resize_token_embeddings(len(tokenizer))

# Checkpoints go under output_dir; the final model under model_save_path.
output_dir = "opt-125-alpaca-ne-vocab-expansion"
model_save_path = output_dir + "/saved_model"

trainer = Trainer(
    model=model,
    train_dataset=mapped_dataset,
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,  # effective batch size = 1 * 4 = 4
        warmup_steps=100,
        save_strategy="steps",
        save_steps=500,
        num_train_epochs=1,
        learning_rate=1e-5,
        fp16=True,  # mixed-precision training (requires a CUDA device)
        logging_steps=1,
        output_dir=output_dir,
    ),
    # Causal-LM collator: mlm=False makes the labels a copy of the input ids
    # (the model shifts them internally for next-token prediction).
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)