I’m trying to fine-tune a model for my thesis using the following dataset for question-and-answer tasks: SzegedAI/MILQA at main, in the following code: `
import os
import json
import torch
from transformers import DefaultDataCollator, AutoModelForQuestionAnswering, AutoTokenizer, Trainer, TrainingArguments, TrainerCallback
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
def process_data(json_file):
with open(json_file, "r", encoding="utf-8") as f:
data = json.load(f)
inputs = []
for item in data["data"]:
for paragraph in item["paragraphs"]:
context = paragraph["context"]
qa_list = [{'Question': qa["question"]} for qa in paragraph["qas"]]
inputs.append({'Context': context, 'Questions_Answers': qa_list})
return inputs
class PlotCallback(TrainerCallback):
def __init__(self):
self.train_losses = []
self.eval_losses = []
self.eval_steps = []
def on_log(self, args, state, control, logs=None, **kwargs):
if state.is_world_process_zero:
if 'loss' in logs:
self.train_losses.append((state.global_step, logs['loss']))
if 'eval_loss' in logs:
self.eval_losses.append((state.global_step, logs['eval_loss']))
self.eval_steps.append(state.global_step)
def on_train_end(self, args, state, control, **kwargs):
# Training loss diagram
plt.figure(figsize=(10, 6))
plt.plot([step_loss[0] for step_loss in self.train_losses], [step_loss[1] for step_loss in self.train_losses], label='Training Loss', color='blue')
plt.title('Training Loss over Training Steps')
plt.xlabel('Training Steps')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()
class MyDataCollator(DefaultDataCollator):
def __init__(self, tokenizer):
super().__init__()
self.tokenizer = tokenizer
def collate_batch(self, examples):
input_ids = []
attention_masks = []
labels = []
for example in examples:
context_inputs = self.tokenizer(example['Context'], return_tensors="pt", padding=True, truncation=True)
context_input_ids = context_inputs["input_ids"]
context_attention_mask = context_inputs["attention_mask"]
for qa in example['Questions_Answers']:
question_inputs = self.tokenizer(qa['Question'], return_tensors="pt", padding=True, truncation=True)
question_input_ids = question_inputs["input_ids"]
question_attention_mask = question_inputs["attention_mask"]
input_ids.append(torch.cat([context_input_ids.squeeze(0), question_input_ids.squeeze(0)], dim=0))
attention_masks.append(torch.cat([context_attention_mask.squeeze(0), question_attention_mask.squeeze(0)], dim=0))
labels.append(question_input_ids.squeeze(0)) # Use question_input_ids as labels
return {
"input_ids": torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True),
"attention_mask": torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True),
"labels": torch.nn.utils.rnn.pad_sequence(labels, batch_first=True)
}
class MyDataset(Dataset):
def __init__(self, batch):
self.inputs = batch['input_ids']
self.attention_masks = batch['attention_mask']
self.labels = batch['labels']
def __len__(self):
return len(self.inputs)
def __getitem__(self, idx):
return {
'input_ids': self.inputs[idx],
'attention_mask': self.attention_masks[idx],
'labels': self.labels[idx],
}
with open("train.MILQA-2023-03-27.squad.s.json", "r", encoding="utf-8") as f:
data = json.load(f)
with open("test.MILQA-2023-03-27.squad.s.json", "r", encoding="utf-8") as f:
eval_data = json.load(f)
model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = MyDataCollator(tokenizer)
train_inputs = process_data("train.MILQA-2023-03-27.squad.s.json")
eval_inputs = process_data("test.MILQA-2023-03-27.squad.s.json")
train_batch = data_collator.collate_batch(train_inputs)
eval_batch = data_collator.collate_batch(eval_inputs)
eval_dataset = MyDataset(eval_batch)
train_dataset = MyDataset(train_batch)
output_dir = "./finetuned_model"
if not os.path.exists(output_dir):
os.makedirs(output_dir)
training_args = TrainingArguments(
output_dir=output_dir,
overwrite_output_dir=True,
num_train_epochs=2,
learning_rate=2e-4,
per_device_train_batch_size=2,
warmup_ratio=0.1,
lr_scheduler_type="linear",
save_strategy="epoch",
eval_steps=500,
save_steps=500,
)
trainer = Trainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
callbacks=[PlotCallback()],
)
trainer.train()
tokenizer.save_pretrained(output_dir)
However, I’m encountering the following errors:
C:\Users\Levente\Desktop\minigpt\.venv\Scripts\python.exe C:\Users\Levente\Desktop\minigpt\train.py
2024-05-05 12:48:37.957206: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-05 12:48:38.591477: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Some weights of LlamaForQuestionAnswering were not initialized from the model checkpoint at TinyLlama/TinyLlama-1.1B-Chat-v1.0 and are newly initialized: ['embed_tokens.weight', 'layers.0.input_layernorm.weight', 'layers.0.mlp.down_proj.weight', 'layers.0.mlp.gate_proj.weight', 'layers.0.mlp.up_proj.weight', 'layers.0.post_attention_layernorm.weight', 'layers.0.self_attn.k_proj.weight', 'layers.0.self_attn.o_proj.weight', 'layers.0.self_attn.q_proj.weight', 'layers.0.self_attn.v_proj.weight', 'layers.1.input_layernorm.weight', 'layers.1.mlp.down_proj.weight', 'layers.1.mlp.gate_proj.weight', 'layers.1.mlp.up_proj.weight', 'layers.1.post_attention_layernorm.weight', 'layers.1.self_attn.k_proj.weight', 'layers.1.self_attn.o_proj.weight', 'layers.1.self_attn.q_proj.weight', 'layers.1.self_attn.v_proj.weight', 'layers.10.input_layernorm.weight', 'layers.10.mlp.down_proj.weight', 'layers.10.mlp.gate_proj.weight', 'layers.10.mlp.up_proj.weight', 'layers.10.post_attention_layernorm.weight', 'layers.10.self_attn.k_proj.weight', 'layers.10.self_attn.o_proj.weight', 'layers.10.self_attn.q_proj.weight', 'layers.10.self_attn.v_proj.weight', 'layers.11.input_layernorm.weight', 'layers.11.mlp.down_proj.weight', 'layers.11.mlp.gate_proj.weight', 'layers.11.mlp.up_proj.weight', 'layers.11.post_attention_layernorm.weight', 'layers.11.self_attn.k_proj.weight', 'layers.11.self_attn.o_proj.weight', 'layers.11.self_attn.q_proj.weight', 'layers.11.self_attn.v_proj.weight', 'layers.12.input_layernorm.weight', 'layers.12.mlp.down_proj.weight', 'layers.12.mlp.gate_proj.weight', 'layers.12.mlp.up_proj.weight', 'layers.12.post_attention_layernorm.weight', 'layers.12.self_attn.k_proj.weight', 'layers.12.self_attn.o_proj.weight', 'layers.12.self_attn.q_proj.weight', 'layers.12.self_attn.v_proj.weight', 'layers.13.input_layernorm.weight', 'layers.13.mlp.down_proj.weight', 'layers.13.mlp.gate_proj.weight', 'layers.13.mlp.up_proj.weight', 'layers.13.post_attention_layernorm.weight', 'layers.13.self_attn.k_proj.weight', 'layers.13.self_attn.o_proj.weight', 'layers.13.self_attn.q_proj.weight', 'layers.13.self_attn.v_proj.weight', 'layers.14.input_layernorm.weight', 'layers.14.mlp.down_proj.weight', 'layers.14.mlp.gate_proj.weight', 'layers.14.mlp.up_proj.weight', 'layers.14.post_attention_layernorm.weight', 'layers.14.self_attn.k_proj.weight', 'layers.14.self_attn.o_proj.weight', 'layers.14.self_attn.q_proj.weight', 'layers.14.self_attn.v_proj.weight', 'layers.15.input_layernorm.weight', 'layers.15.mlp.down_proj.weight', 'layers.15.mlp.gate_proj.weight', 'layers.15.mlp.up_proj.weight', 'layers.15.post_attention_layernorm.weight', 'layers.15.self_attn.k_proj.weight', 'layers.15.self_attn.o_proj.weight', 'layers.15.self_attn.q_proj.weight', 'layers.15.self_attn.v_proj.weight', 'layers.16.input_layernorm.weight', 'layers.16.mlp.down_proj.weight', 'layers.16.mlp.gate_proj.weight', 'layers.16.mlp.up_proj.weight', 'layers.16.post_attention_layernorm.weight', 'layers.16.self_attn.k_proj.weight', 'layers.16.self_attn.o_proj.weight', 'layers.16.self_attn.q_proj.weight', 'layers.16.self_attn.v_proj.weight', 'layers.17.input_layernorm.weight', 'layers.17.mlp.down_proj.weight', 'layers.17.mlp.gate_proj.weight', 'layers.17.mlp.up_proj.weight', 'layers.17.post_attention_layernorm.weight', 'layers.17.self_attn.k_proj.weight', 'layers.17.self_attn.o_proj.weight', 'layers.17.self_attn.q_proj.weight', 'layers.17.self_attn.v_proj.weight', 'layers.18.input_layernorm.weight', 'layers.18.mlp.down_proj.weight', 'layers.18.mlp.gate_proj.weight', 'layers.18.mlp.up_proj.weight', 'layers.18.post_attention_layernorm.weight', 'layers.18.self_attn.k_proj.weight', 'layers.18.self_attn.o_proj.weight', 'layers.18.self_attn.q_proj.weight', 'layers.18.self_attn.v_proj.weight', 'layers.19.input_layernorm.weight', 'layers.19.mlp.down_proj.weight', 'layers.19.mlp.gate_proj.weight', 'layers.19.mlp.up_proj.weight', 'layers.19.post_attention_layernorm.weight', 'layers.19.self_attn.k_proj.weight', 'layers.19.self_attn.o_proj.weight', 'layers.19.self_attn.q_proj.weight', 'layers.19.self_attn.v_proj.weight', 'layers.2.input_layernorm.weight', 'layers.2.mlp.down_proj.weight', 'layers.2.mlp.gate_proj.weight', 'layers.2.mlp.up_proj.weight', 'layers.2.post_attention_layernorm.weight', 'layers.2.self_attn.k_proj.weight', 'layers.2.self_attn.o_proj.weight', 'layers.2.self_attn.q_proj.weight', 'layers.2.self_attn.v_proj.weight', 'layers.20.input_layernorm.weight', 'layers.20.mlp.down_proj.weight', 'layers.20.mlp.gate_proj.weight', 'layers.20.mlp.up_proj.weight', 'layers.20.post_attention_layernorm.weight', 'layers.20.self_attn.k_proj.weight', 'layers.20.self_attn.o_proj.weight', 'layers.20.self_attn.q_proj.weight', 'layers.20.self_attn.v_proj.weight', 'layers.21.input_layernorm.weight', 'layers.21.mlp.down_proj.weight', 'layers.21.mlp.gate_proj.weight', 'layers.21.mlp.up_proj.weight', 'layers.21.post_attention_layernorm.weight', 'layers.21.self_attn.k_proj.weight', 'layers.21.self_attn.o_proj.weight', 'layers.21.self_attn.q_proj.weight', 'layers.21.self_attn.v_proj.weight', 'layers.3.input_layernorm.weight', 'layers.3.mlp.down_proj.weight', 'layers.3.mlp.gate_proj.weight', 'layers.3.mlp.up_proj.weight', 'layers.3.post_attention_layernorm.weight', 'layers.3.self_attn.k_proj.weight', 'layers.3.self_attn.o_proj.weight', 'layers.3.self_attn.q_proj.weight', 'layers.3.self_attn.v_proj.weight', 'layers.4.input_layernorm.weight', 'layers.4.mlp.down_proj.weight', 'layers.4.mlp.gate_proj.weight', 'layers.4.mlp.up_proj.weight', 'layers.4.post_attention_layernorm.weight', 'layers.4.self_attn.k_proj.weight', 'layers.4.self_attn.o_proj.weight', 'layers.4.self_attn.q_proj.weight', 'layers.4.self_attn.v_proj.weight', 'layers.5.input_layernorm.weight', 'layers.5.mlp.down_proj.weight', 'layers.5.mlp.gate_proj.weight', 'layers.5.mlp.up_proj.weight', 'layers.5.post_attention_layernorm.weight', 'layers.5.self_attn.k_proj.weight', 'layers.5.self_attn.o_proj.weight', 'layers.5.self_attn.q_proj.weight', 'layers.5.self_attn.v_proj.weight', 'layers.6.input_layernorm.weight', 'layers.6.mlp.down_proj.weight', 'layers.6.mlp.gate_proj.weight', 'layers.6.mlp.up_proj.weight', 'layers.6.post_attention_layernorm.weight', 'layers.6.self_attn.k_proj.weight', 'layers.6.self_attn.o_proj.weight', 'layers.6.self_attn.q_proj.weight', 'layers.6.self_attn.v_proj.weight', 'layers.7.input_layernorm.weight', 'layers.7.mlp.down_proj.weight', 'layers.7.mlp.gate_proj.weight', 'layers.7.mlp.up_proj.weight', 'layers.7.post_attention_layernorm.weight', 'layers.7.self_attn.k_proj.weight', 'layers.7.self_attn.o_proj.weight', 'layers.7.self_attn.q_proj.weight', 'layers.7.self_attn.v_proj.weight', 'layers.8.input_layernorm.weight', 'layers.8.mlp.down_proj.weight', 'layers.8.mlp.gate_proj.weight', 'layers.8.mlp.up_proj.weight', 'layers.8.post_attention_layernorm.weight', 'layers.8.self_attn.k_proj.weight', 'layers.8.self_attn.o_proj.weight', 'layers.8.self_attn.q_proj.weight', 'layers.8.self_attn.v_proj.weight', 'layers.9.input_layernorm.weight', 'layers.9.mlp.down_proj.weight', 'layers.9.mlp.gate_proj.weight', 'layers.9.mlp.up_proj.weight', 'layers.9.post_attention_layernorm.weight', 'layers.9.self_attn.k_proj.weight', 'layers.9.self_attn.o_proj.weight', 'layers.9.self_attn.q_proj.weight', 'layers.9.self_attn.v_proj.weight', 'norm.weight', 'qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
0%| | 0/21118 [00:00<?, ?it/s]Traceback (most recent call last):
File "C:\Users\Levente\Desktop\minigpt\train.py", line 136, in <module>
trainer.train()
File "C:\Users\Levente\Desktop\minigpt\.venv\Lib\site-packages\transformers\trainer.py", line 1859, in train
return inner_training_loop(
^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Levente\Desktop\minigpt\.venv\Lib\site-packages\transformers\trainer.py", line 2203, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Levente\Desktop\minigpt\.venv\Lib\site-packages\transformers\trainer.py", line 3138, in training_step
loss = self.compute_loss(model, inputs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\Levente\Desktop\minigpt\.venv\Lib\site-packages\transformers\trainer.py", line 3179, in compute_loss
raise ValueError(
ValueError: The model did not return a loss from the inputs, only the following keys: start_logits,end_logits. For reference, the inputs it received are input_ids,attention_mask.
Process finished with exit code 1
What could be the solution to the problem to make the program run successfully? Unfortunately, time is pressing, and I’m quite puzzled.
Unfortunately, I’ve tried many things, but I’ve encountered various other problems. The data processing works fine, but the issue always arises when I pass it to the trainer. There was a problem with passing a list containing dictionaries, and another problem occurred with “Scalar tensor has no len()”. What could be the solution? Could you help me fix the code?