from datasets import load_dataset, DatasetDict
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, TrainingArguments, Trainer

# DATA_PATH points at the folder holding the three CSV splits
dataset = load_dataset('csv', data_files={'train': f"{DATA_PATH}\\train.csv", 'test': f"{DATA_PATH}\\test.csv", 'validation': f"{DATA_PATH}\\validation.csv"}, column_names=['text', 'label'], split=['train', 'test', 'validation'])
dataset = DatasetDict({'train': dataset[0], 'test': dataset[1], 'validation': dataset[2]})
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
dataset = dataset.map(tokenize_function, batched=True)
FINAL_DS = dataset
training_stuff = {
    "batch_size": 64,
    "epochs": 4,
    "learning_rate": 1e-5,
    "weight_decay": 0.01
}
training_args = TrainingArguments(
    output_dir="Models/DistilBert",
    per_device_train_batch_size=training_stuff["batch_size"],
    evaluation_strategy="steps",
    num_train_epochs=training_stuff["epochs"],
    fp16=True,
    save_steps=100,
    eval_steps=50,
    logging_steps=10,
    weight_decay=training_stuff["weight_decay"],
    learning_rate=training_stuff["learning_rate"],
    save_total_limit=64,
    remove_unused_columns=False,
    push_to_hub=False,
    report_to='tensorboard',
    load_best_model_at_end=True,
)
# Sequence-classification head with 3 labels, so the Trainer gets logits and a loss
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
    id2label={0: 'Biased', 1: 'Non-biased', 2: 'No agreement'},
    label2id={'Biased': 0, 'Non-biased': 1, 'No agreement': 2},
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=FINAL_DS['train'],
    eval_dataset=FINAL_DS['validation'],
    tokenizer=tokenizer,
)
train_results = trainer.train()
When I run this, I get the following error:
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`text` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
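For context, here is a quick check of what the mapped dataset still contains (assuming the CSV files really do have `text` and `label` columns):

# Columns the Trainer's data collator will see, since remove_unused_columns=False keeps them all
print(FINAL_DS['train'].column_names)        # e.g. ['text', 'label', 'input_ids', 'attention_mask']
print(type(FINAL_DS['train'][0]['text']))    # the raw text is still a plain string

Is the problem that these raw string columns are still being handed to the default data collator, and if so, what is the right way to drop them?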