Hello everyone,
I'm trying to learn how to train translation models, but I get an error telling me to enable padding and truncation, even though I already pass those arguments both when creating the tokenizer and when calling it to build model_inputs:
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`translation` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
from datasets import load_dataset
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    DataCollatorWithPadding,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    pipeline,
)
dataset = load_dataset('opus_books', 'en-ru')
model_checkpoint = "Helsinki-NLP/opus-mt-en-ru"  # the pretrained model we will fine-tune
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# model = pipeline("translation", model=model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", padding=True, truncation=True)
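# Note: padding/truncation/return_tensors passed to from_pretrained() are only stored
# as init kwargs; they do not become defaults for tokenizer() calls, so they still
# have to be passed at call time (as preprocess_function below already does).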
# device = torch.device('cpu')
# model.to(device)
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  # TODO: try this one
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, max_length=64)
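# DataCollatorForSeq2Seq pads inputs and labels to the longest sequence in each batch
# and, given the model, also prepares decoder_input_ids from the labels.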
split_datasets = dataset['train'].train_test_split(train_size=0.9, seed=20)  # train_test_split splits the data into train/test parts
split_datasets['validation'] = split_datasets.pop('test')
print(split_datasets["train"][1]["translation"])
max_length = 128
def preprocess_function(examples):  # preprocessing function for the dataset
    inputs = [ex['en'] for ex in examples['translation']]  # with batched map this builds up to 1000 sentences per call
    targets = [ex['ru'] for ex in examples['translation']]
model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, padding=True, truncation=True)
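    # padding=True here is optional: DataCollatorForSeq2Seq already pads each batch dynamically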
return model_inputs
tokenized_datasets = split_datasets.map(
preprocess_function,
batched=True,
# remove_columns=split_datasets['train'].column_names,
)
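# Note: with remove_columns commented out, the raw `translation` column
# (a list of dicts) is still present in tokenized_datasets.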
training_args = Seq2SeqTrainingArguments(
    'test-trainer',
remove_unused_columns=False,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
num_train_epochs=5,
learning_rate=2e-5,
weight_decay=0.01
)
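# Note: remove_unused_columns=False makes the Trainer keep every dataset column,
# so the nested `translation` feature reaches the data collator, which cannot turn
# lists of dicts into tensors; this is what triggers the ValueError above.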
trainer = Seq2SeqTrainer(
model,
training_args,
train_dataset=tokenized_datasets['train'],
eval_dataset=tokenized_datasets['validation'],
data_collator=data_collator,
tokenizer=tokenizer,
)
trainer.train()
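For reference, here is a minimal sketch of the two changes that should make this error go away with the setup above; either one alone is enough, since each keeps the raw `translation` column away from the data collator. Everything else stays as in the original script:

# Sketch of the likely fix, assuming the same variables as above.
# 1) Drop the raw columns during map so only tokenized fields reach the collator:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets['train'].column_names,  # drops every raw column, including `translation`
)

# 2) Leave remove_unused_columns at its default (True) so the Trainer drops
#    any column the model's forward() does not accept:
training_args = Seq2SeqTrainingArguments(
    'test-trainer',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
)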