I'm trying to use a PyTorch Dataset class to build my own training task, but it seems to make the model worse. After training for 4 epochs the model outputs empty strings; only a small fraction of the inputs get the right answer. I'd really appreciate it if someone could help me out!
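For context, the training file has one tab-separated English/French pair per line (e.g. "Go.\tVa !", as in the comment in read() below). A minimal sketch of what I expect one pair to turn into after tokenization (the sentence is just the example from the file; the exact token ids depend on the t5-small tokenizer):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
src, tgt = "Go.\tVa !".split("\t")          # one line from fra-eng.txt
enc = tokenizer(src, text_target=tgt, truncation=True, max_length=128)
print(enc.keys())                           # input_ids, attention_mask, labels

Here is the full script: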
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np
import torch
from torch.utils.data import Dataset
def compute_metrics(eval_preds):
    metric = evaluate.load("sacrebleu")
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing to remove the "\n", "\t" and so on
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    # Print a few pairs to compare predictions with references by eye
    for i in range(10):
        print(decoded_preds[i])
        print(decoded_labels[i])
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}
class MyDataset(Dataset):
    def __init__(self, file_name, tokenizer):
        self.text1 = []
        self.text2 = []
        self.read(file_name)
        # Tokenize everything up front; text_target fills in the labels
        self.encoding = tokenizer(self.text1, text_target=self.text2, truncation=True,
                                  max_length=128, padding=True, return_tensors="pt")

    def read(self, file_name):  # Train data is like: "Go.\tVa !"
        with open(file_name, "r", encoding="utf-8") as file:
            while True:
                line = file.readline()
                if line == "":
                    break
                parts = line.rstrip("\n").split("\t")
                self.text1.append(parts[0])
                self.text2.append(parts[1])

    def __getitem__(self, index):
        return {k: v[index].clone().detach() for k, v in self.encoding.items()}

    def __len__(self):
        return len(self.text1)
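# Quick sanity check of one dataset item (kept commented out so nothing else changes);
# the keys should be input_ids, attention_mask and labels:
# ds = MyDataset("fra-eng.txt", tokenizer)
# print(len(ds))
# print({k: v.shape for k, v in ds[0].items()})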
def train():
    train_dataset = MyDataset(train_file, tokenizer)
    eval_dataset = MyDataset(eval_file, tokenizer)
    training_args = Seq2SeqTrainingArguments(
        output_dir="save_model",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        num_train_epochs=4,
        evaluation_strategy="no",
        save_strategy="epoch",
        save_total_limit=1,
        predict_with_generate=True,
    )
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    print(trainer.evaluate())
    trainer.train()
    print(trainer.evaluate())
if __name__ == "__main__":
    model_name = "t5-small"
    train_file = "fra-eng.txt"
    eval_file = "fra-eng.txt"
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda:0")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train()
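This is roughly how I check the model after training (a minimal sketch; the checkpoint path is just an example, and I feed the raw English sentence exactly as in training, without any task prefix):

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

ckpt = "save_model/checkpoint-xxx"   # example path; whichever checkpoint was saved last
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt).to("cuda:0")

inputs = tokenizer("Go.", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))   # usually prints an empty string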