Hi everyone, I am trying to learn how to fine-tune a pretrained model and then use it for predictions. This is my code:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
import torch
# Define a simple accuracy metric
def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)
    return {"accuracy": (preds == labels).mean()}
# Load the dataset and split it once, so the train and eval sets don't overlap
# (calling train_test_split twice reshuffles each time and leaks eval examples into training)
dataset = load_dataset("imdb", split='train[:1%]')
split = dataset.train_test_split(test_size=0.1)
small_train_dataset = split['train']
small_eval_dataset = split['test']
# Load the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True)
small_train_dataset = small_train_dataset.map(tokenize_function, batched=True)
small_eval_dataset = small_eval_dataset.map(tokenize_function, batched=True)
small_train_dataset = small_train_dataset.rename_column("label", "labels")
small_eval_dataset = small_eval_dataset.rename_column("label", "labels")
small_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
small_eval_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
# Define training arguments
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01
)
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics
)
# Train the model
trainer.train()
# Evaluate the model
validation_results = trainer.evaluate()
print(validation_results)
Now I am trying to make a prediction with the fine-tuned model, like this:
inputs = tokenizer(dataset[0]['text'], padding="max_length", truncation=True, return_tensors="pt")
predictions = trainer.predict(test_dataset=inputs)
I am getting this error when I try to make the prediction:
IndexError                                Traceback (most recent call last)
Cell In[8], line 7
      3 inputs = tokenizer(dataset[0]['text'], padding="max_length", truncation=True, return_tensors="pt")
      6 # Make predictions
----> 7 predictions = trainer.predict(test_dataset=inputs)

File C:\Python311\Lib\site-packages\transformers\trainer.py:3305, in Trainer.predict(self, test_dataset, ignore_keys, metric_key_prefix)
   3302 start_time = time.time()
   3304 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 3305 output = eval_loop(
   3306     test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix
   3307 )
   3308 total_batch_size = self.args.eval_batch_size * self.args.world_size
   3309 if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:

File C:\Python311\Lib\site-packages\transformers\trainer.py:3408, in Trainer.evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
   3406 observed_num_examples = 0
   3407 # Main evaluation loop
-> 3408 for step, inputs in enumerate(dataloader):
   3409     # Update the observed num examples
   3410     observed_batch_size = find_batch_size(inputs)
   3411     if observed_batch_size is not None:

File C:\Python311\Lib\site-packages\accelerate\data_loader.py:454, in DataLoaderShard.__iter__(self)
    452 # We iterate one batch ahead to check when we are at the end
    453 try:
--> 454     current_batch = next(dataloader_iter)
    455 except StopIteration:
    456     yield

File C:\Python311\Lib\site-packages\torch\utils\data\dataloader.py:631, in _BaseDataLoaderIter.__next__(self)
    628 if self._sampler_iter is None:
    629     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    630     self._reset()  # type: ignore[call-arg]
--> 631 data = self._next_data()
    632 self._num_yielded += 1
    633 if self._dataset_kind == _DatasetKind.Iterable and \
    634         self._IterableDataset_len_called is not None and \
    635         self._num_yielded > self._IterableDataset_len_called:

File C:\Python311\Lib\site-packages\torch\utils\data\dataloader.py:675, in _SingleProcessDataLoaderIter._next_data(self)
    673 def _next_data(self):
    674     index = self._next_index()  # may raise StopIteration
--> 675     data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    676     if self._pin_memory:
    677         data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

File C:\Python311\Lib\site-packages\torch\utils\data\_utils\fetch.py:51, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
     49         data = self.dataset.__getitems__(possibly_batched_index)
     50     else:
---> 51         data = [self.dataset[idx] for idx in possibly_batched_index]
     52 else:
     53     data = self.dataset[possibly_batched_index]

File C:\Python311\Lib\site-packages\torch\utils\data\_utils\fetch.py:51, in <listcomp>(.0)
     49         data = self.dataset.__getitems__(possibly_batched_index)
     50     else:
---> 51         data = [self.dataset[idx] for idx in possibly_batched_index]
     52 else:
     53     data = self.dataset[possibly_batched_index]

File C:\Python311\Lib\site-packages\transformers\tokenization_utils_base.py:255, in BatchEncoding.__getitem__(self, item)
    253     return self.data[item]
    254 elif self._encodings is not None:
--> 255     return self._encodings[item]
    256 elif isinstance(item, slice):
    257     return {key: self.data[key][item] for key in self.data.keys()}

IndexError: list index out of range
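My guess is that trainer.predict expects a dataset (something the DataLoader can index row by row), not the BatchEncoding the tokenizer returns, which would explain the IndexError once the loader asks for an index past the first entry. Would wrapping the single example in a one-row datasets.Dataset be the right fix? Something like this (untested sketch; single_example and pred_dataset are just names I made up):

from datasets import Dataset

# Tokenize one example without return_tensors, so every field is a plain list
single_example = tokenizer(dataset[0]['text'], padding="max_length", truncation=True)
# Wrap each field in a list to build a one-row Dataset that predict() can iterate over
pred_dataset = Dataset.from_dict({k: [v] for k, v in single_example.items()})
predictions = trainer.predict(test_dataset=pred_dataset)
print(np.argmax(predictions.predictions, axis=1))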
Can anyone please help me resolve this issue?
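P.S. As a workaround, I believe I can skip the Trainer and call the fine-tuned model directly, roughly like this (sketch for a single example):

model.eval()
inputs = tokenizer(dataset[0]['text'], padding="max_length", truncation=True, return_tensors="pt").to(model.device)
with torch.no_grad():
    logits = model(**inputs).logits
print(torch.argmax(logits, dim=-1))

But I would still like to understand how trainer.predict is meant to be used for single examples.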