I’m trying to fine-tune the DialoGPT-large model, but I’m still really new to ML and am probably misusing the Trainer API. I already went through the tutorial and the Colab examples, but I still can’t figure out the issue.
error:
Traceback (most recent call last):
File "/.../main.py", line 26, in <module>
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
code:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
### CONFIG
model_name = "microsoft/DialoGPT-large"

# Load the pretrained tokenizer and causal-LM weights from the Hub.
print("Loading model... ", end='', flush=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
print("DONE")
### FINE TUNING ###
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
raw_datasets = load_dataset("glue", "mrpc")
# DialoGPT / GPT-2 tokenizers ship without a pad token; reuse EOS so the
# padding collator can batch variable-length sequences.
# (The original assigned this twice — the duplicate line is removed.)
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(example):
    """Tokenize a batch of MRPC sentence pairs.

    With ``Dataset.map(..., batched=True)`` each column in *example* is a
    list of strings. Passing the two columns as separate positional
    arguments (``text`` and ``text_pair``) makes the tokenizer pair
    ``sentence1[i]`` with ``sentence2[i]``.

    The original wrapped both columns in a single list —
    ``tokenizer([list_a, list_b])`` — so each "input" was itself a list of
    strings, which the fast tokenizer rejects with
    ``TypeError: TextEncodeInput must be Union[TextInputSequence, ...]``.
    """
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)
# Tokenize every split in batches, then let the collator pad each batch
# dynamically at load time.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

from transformers import Trainer, TrainingArguments

# Default hyperparameters; checkpoints go under ./test-trainer.
training_args = TrainingArguments("test-trainer")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

# Sanity-check the prediction shapes on the validation split.
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)