Hello! I’m trying to fine-tune the bofenghuang/vigogne-instruct-7b model for a text-classification task.
I have already managed to successfully fine-tune camemBERT and flauBERT models for this task.
But now I face a problem because this model is not handled the same way: I have to load the Llama-7b weights from huggyllama and then apply bofenghuang/vigogne-instruct-7b on top of them with PeftModel, as sketched just below.
I have added a pad_token to the tokenizer because of the padding in tokenize_function().
I have also set a batch_size and reduced it, but that doesn't solve the problem.
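That two-step load boils down to this (a condensed sketch of what my full training function further down does, nothing more):

tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", padding_side="right", use_fast=False)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # Llama has no pad token by default
model = LlamaForCausalLM.from_pretrained("huggyllama/llama-7b", num_labels=14)
model.resize_token_embeddings(len(tokenizer))  # account for the added [PAD] token
model = PeftModel.from_pretrained(model, "bofenghuang/vigogne-instruct-7b")  # apply the vigogne LoRA adapter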
Here is my full code to train the vigogne model:
from datasets import load_from_disk
from peft import PeftModel
from transformers import (DataCollatorWithPadding, LlamaForCausalLM, LlamaTokenizer,
                          Trainer, TrainingArguments)
import torch


def vigogne_fine_tunning_themas():
    new_dataset_dict = load_from_disk('./files/dataset_one_thema_v2')

    base_model_name_or_path = "huggyllama/llama-7b"
    lora_model_name_or_path = "bofenghuang/vigogne-instruct-7b"

    # Llama has no pad token, so add one for the padding used in tokenize_function()
    tokenizer = LlamaTokenizer.from_pretrained(base_model_name_or_path, padding_side="right", use_fast=False)
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    def tokenize_function(article):
        return tokenizer(article["article"], padding="max_length", truncation=True)

    tokenized_datasets = new_dataset_dict.map(tokenize_function, batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns(['identifier', 'categories', 'article', 'themas_ids'])
    tokenized_datasets = tokenized_datasets.rename_column("themas", "labels")
    tokenized_datasets = tokenized_datasets.with_format("torch")

    # small dataset for testing
    train_dataset = tokenized_datasets["train_dataset_themas"].shuffle(seed=42).select(range(10))
    eval_dataset = tokenized_datasets["val_dataset_themas"].shuffle(seed=42).select(range(10))
    # train_dataset = tokenized_datasets["train_dataset_themas"]
    # eval_dataset = tokenized_datasets["val_dataset_themas"]

    data_collator = DataCollatorWithPadding(tokenizer)

    thema_label2id_en = {"agronomy": 0, "business": 1, "design": 2, "digital": 3,
                         "environment": 4, "learning": 5, "medical": 6, "people": 7,
                         "production": 8, "resource": 9, "science": 10, "security": 11,
                         "society": 12, "transport": 13}
    thema_id2label_en = {0: "agronomy", 1: "business", 2: "design", 3: "digital",
                         4: "environment", 5: "learning", 6: "medical", 7: "people",
                         8: "production", 9: "resource", 10: "science", 11: "security",
                         12: "society", 13: "transport"}

    # load the base Llama weights, then apply the vigogne LoRA adapter on top
    model = LlamaForCausalLM.from_pretrained(base_model_name_or_path, num_labels=14,
                                             id2label=thema_id2label_en, label2id=thema_label2id_en)
    model.resize_token_embeddings(len(tokenizer))  # account for the added [PAD] token
    model = PeftModel.from_pretrained(model, lora_model_name_or_path)

    batch_size = 7  # reduce this number
    training_args = TrainingArguments(output_dir="./vigogne_finetuned_one_thema_balanced",
                                      evaluation_strategy="epoch",
                                      per_device_train_batch_size=batch_size,
                                      per_device_eval_batch_size=batch_size,
                                      push_to_hub=True)
    trainer = Trainer(
        model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
    )
    trainer.train()
    trainer.save_model(output_dir="./vigogne_fine_tuned/one_thema_balanced")
    trainer.push_to_hub(commit_message="finetuned vigogne model on balanced one thema dataset")
I get this output:
Loading cached processed dataset at /home/debian/innodrive-data/flask/files/dataset_one_thema_v2/train_dataset_themas/cache-40f24be49726430c.arrow
Loading cached processed dataset at /home/debian/innodrive-data/flask/files/dataset_one_thema_v2/val_dataset_themas/cache-e3406b6dcfd4a056.arrow
Loading cached processed dataset at /home/debian/innodrive-data/flask/files/dataset_one_thema_v2/test_dataset_themas/cache-7b1f3e8c012443a4.arrow
Loading cached shuffled indices for dataset at /home/debian/innodrive-data/flask/files/dataset_one_thema_v2/train_dataset_themas/cache-b610dc39803c6f86.arrow
Loading cached shuffled indices for dataset at /home/debian/innodrive-data/flask/files/dataset_one_thema_v2/val_dataset_themas/cache-be61289d61e459de.arrow
Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]
Killed
It looks like the process is killed while loading the checkpoint shards, probably because it runs out of memory, so I have also tried to load the model like this:
model = LlamaForCausalLM.from_pretrained(
    base_model_name_or_path,
    torch_dtype=torch.float16,
    device_map="auto",
    num_labels=14,
    id2label=thema_id2label_en,
    label2id=thema_label2id_en,
)
in place of this line:
model = LlamaForCausalLM.from_pretrained(base_model_name_or_path, num_labels=14,
                                         id2label=thema_id2label_en, label2id=thema_label2id_en)
But this time the error is:
RuntimeError: "addmm_impl_cpu_" not implemented for 'Half'
Could you help me, please?