It asks me to add padding or truncation, but I have already done it

Hello everyone

I’m trying to learn how to train translation models, and I get an error saying that I need to use padding and truncation, even though I already set them when creating the tokenizer and again when calling it to get model_inputs:

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (translation in this case) have excessive nesting (inputs type list where type int is expected).

import transformers
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Trainer, pipeline, \
                         DataCollatorWithPadding, Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainingArguments



dataset = load_dataset('opus_books', 'en-ru')
model_checkpoint = "Helsinki-NLP/opus-mt-en-ru"                                   # the model we are going to fine-tune
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# model = pipeline("translation", model=model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", padding=True, truncation=True)

# device = torch.device('cpu')
# model.to(device)


# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)                      # TODO: try this one
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, max_length=64)




split_datasets = dataset['train'].train_test_split(train_size=0.9, seed=20)      # train_test_split splits the dataset into train and test parts
split_datasets['validation'] = split_datasets.pop('test')
print(split_datasets["train"][1]["translation"])


max_length = 128

def preprocess_function(examples):                              # function for preprocessing the dataset
    inputs = [ex['en'] for ex in examples['translation']]       # this will create 1000 sentences (one batch from map)
    targets = [ex['ru'] for ex in examples['translation']]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, padding=True, truncation=True)
    return model_inputs


tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    # remove_columns=split_datasets['train'].column_names,
)



training_args = Seq2SeqTrainingArguments(
    'test-trainer',
    remove_unused_columns=False,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01
)


trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)


trainer.train()




The way you implemented model_inputs is okay for a single example, but in a batch all the tensors need to be of the same length.
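
For instance (a minimal illustration, not taken from your script), PyTorch refuses to stack rows of unequal length into one tensor:

import torch
torch.tensor([[1, 2, 3], [4, 5]])   # ValueError: expected sequence of length 3 at dim 1 (got 2)

That is essentially the situation the error message is complaining about.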

Hence, instead of padding=True, pass either 'max_length', which pads every input in the batch up to the defined max_length, or 'longest', which pads the inputs only up to the length of the longest sequence in the batch. As you may be able to deduce, 'longest' saves computation compared to 'max_length'.
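
As a sketch (same variable names as in your preprocess_function, only the padding argument changes), the call could look like either of these:

# pad every example up to the fixed max_length
model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, padding="max_length", truncation=True)

# or pad only up to the longest sequence in the current batch
model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, padding="longest", truncation=True)

Both "max_length" and "longest" are valid values for the tokenizer's padding argument.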