from datasets import load_dataset
imdb = load_dataset("imdb")
Currently this splits my dataset into only train/test (plus an unsupervised split), but I want a validation split as well for my text classification task:
DatasetDict({
train: Dataset({
features: ['text', 'label'],
num_rows: 25000
})
test: Dataset({
features: ['text', 'label'],
num_rows: 25000
})
unsupervised: Dataset({
features: ['text', 'label'],
num_rows: 50000
})
})
and I don’t know how to do that. I tried
train_testvalid = imdb["test"].train_test_split(test=0.1)
but it gives me this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[43], line 1
----> 1 train_testvalid = imdb["test"].train_test_split(test=0.1)
File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:558, in transmit_format.<locals>.wrapper(*args, **kwargs)
551 self_format = {
552 "type": self._format_type,
553 "format_kwargs": self._format_kwargs,
554 "columns": self._format_columns,
555 "output_all_columns": self._output_all_columns,
556 }
557 # apply actual function
--> 558 out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
559 datasets: List["Dataset"] = list(out.values()) if isinstance(out, dict) else [out]
560 # re-apply format to the output
File /opt/conda/lib/python3.10/site-packages/datasets/fingerprint.py:482, in fingerprint_transform.<locals>._fingerprint.<locals>.wrapper(*args, **kwargs)
478 validate_fingerprint(kwargs[fingerprint_name])
480 # Call actual function
--> 482 out = func(dataset, *args, **kwargs)
484 # Update fingerprint of in-place transforms + update in-place history of transforms
486 if inplace: # update after calling func so that the fingerprint doesn't change if the function fails
TypeError: Dataset.train_test_split() got an unexpected keyword argument 'test'
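Based on the error message, I suspect the keyword should be test_size rather than test. A sketch of what I'm trying now, splitting the 25,000-row test set 50/50 into validation/test (the ratio and the seed are arbitrary choices on my part):

from datasets import DatasetDict

# carve the original test split into validation + test
test_valid = imdb["test"].train_test_split(test_size=0.5, seed=42)

imdb_splits = DatasetDict({
    "train": imdb["train"],
    "validation": test_valid["train"],  # 12,500 rows
    "test": test_valid["test"],         # 12,500 rows
})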
Once I've split it into train, test, and validation, what do I pass in here?
training_args = TrainingArguments(
output_dir="distilbert-fine-tuned-classification",
learning_rate=2e-5,
per_device_train_batch_size=16,
per_device_eval_batch_size=16,
num_train_epochs=2,
weight_decay=0.01,
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
push_to_hub=True,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_imdb["train"],
eval_dataset=tokenized_imdb["test"],
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
)
trainer.train()
The evaluation dataset should still be test, right? Because you don't see validation during the training process?
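Or should it be the other way around — eval_dataset set to validation (so that evaluation_strategy="epoch" and load_best_model_at_end use it during training), with test held out for a single final evaluation? A sketch of that wiring, assuming the imdb_splits above have been tokenized into tokenized_imdb with train/validation/test keys:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["validation"],  # evaluated every epoch
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()

# keep the test split unseen until the very end
test_metrics = trainer.evaluate(tokenized_imdb["test"])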