Hugging day to everyone
I’m training a translation model. I had big problems setting up CUDA, so at first I trained on CPU; today I finally got CUDA working, but then it gave me an error, which I (I think) fixed, then another one appeared, and another…
And I was so stupid that I didn’t use git
Now I’m getting an error telling me to enable padding and truncation, even though I already put them everywhere I could! The traceback and my full script are below; thank you for any help!
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`translation` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
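For context, when I call the tokenizer directly on a tiny hand-made batch (outside the training script, sentences invented just for this check), it pads and truncates to equal-length tensors without complaining:

# Quick standalone check, separate from the training script: the tokenizer
# pads/truncates a toy batch into equal-length tensors (sentences are made up).
from transformers import AutoTokenizer

check_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")
toy_batch = check_tokenizer(
    ["Hello, how are you?", "This is a much longer English sentence that should still be fine."],
    text_target=["Bonjour, comment allez-vous ?", "Ceci est une phrase beaucoup plus longue."],
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(toy_batch["input_ids"].shape)  # both rows padded to the same length
print(toy_batch["labels"].shape)

So the error only shows up inside the Trainer. Here is the full script: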
from datasets import interleave_datasets, load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, DataCollatorWithPadding, \
Seq2SeqTrainingArguments, Seq2SeqTrainer
folder_name = 'model-14'
# ENG:
d1 = load_dataset('ted_talks_iwslt', language_pair=("en", "fr"), year="2016")
d2 = load_dataset('opus_books', 'en-fr')
dataset = interleave_datasets([d1['train'], d2['train']], stopping_strategy="all_exhausted")
split_datasets = dataset.train_test_split(train_size=0.9, seed=20)
split_datasets['validation'] = split_datasets.pop('test')
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt", padding=True, truncation=True)
max_length = 128
def preprocess_function(examples):
    inputs = [ex['en'] for ex in examples['translation']]
    targets = [ex['fr'] for ex in examples['translation']]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, padding=True, truncation=True)
    return model_inputs

tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    # remove_columns=split_datasets['train'].column_names,
)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, max_length=64)
batch = data_collator([{'input_ids': ex['input_ids'], 'attention_mask': ex['attention_mask'],
                        'labels': ex['labels']} for ex in tokenized_datasets['train']])
for i in range(1, 3):
    print(tokenized_datasets['train'][i]['labels'])
args = Seq2SeqTrainingArguments(
    folder_name,
    evaluation_strategy='no',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    remove_unused_columns=False,
    predict_with_generate=True,
    # fp16=True,
    push_to_hub=False
)
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
a = trainer.train()
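One thing I noticed while debugging (not sure if it matters): since remove_columns is commented out in map() and I set remove_unused_columns=False, the raw translation column (a dict of plain strings) should still be sitting in the tokenized dataset and getting handed to the collator. This is just the quick inspection I ran, plus the experiment I plan to try next; the variable names are only for this check, and the column names come from my own datasets:

# Check which columns survive .map(); 'translation' holds raw strings that
# the collator cannot turn into tensors.
print(tokenized_datasets['train'].column_names)

# Experiment: keep only the tensor columns and drop everything else.
cols_to_drop = [c for c in tokenized_datasets['train'].column_names
                if c not in ('input_ids', 'attention_mask', 'labels')]
stripped_train = tokenized_datasets['train'].remove_columns(cols_to_drop)
print(stripped_train.column_names)

Is that the right direction, or am I misreading the error?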