Loading WiC dataset for fine-tuning

I’m attempting to load the WiC dataset given to me for a class final project in order to fine-tune a BERT model, but I keep getting errors.

I think it could be a problem with __getitem__, but I’m not certain how I would change that to fit this dataset.

My code is:

from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import torch

# tokenizer and model (checkpoint assumed: bert-base-uncased)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

dataset = load_dataset('json', data_files={'train': 'train.jsonl', 'test': 'test.jsonl'})


train_texts, train_labels = dataset['train'], dataset['train']['label']
test_texts, test_labels = dataset['test'], dataset['test']['label']
encoded_dataset_train = train_texts.map(lambda examples: tokenizer(examples['sentence1']), batched=True)
encoded_dataset_test = test_texts.map(lambda examples: tokenizer(examples['sentence1']), batched=True)


class WiCDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset = WiCDataset(encoded_dataset_train, train_labels)
test_dataset = WiCDataset(encoded_dataset_test, test_labels)

training_args = TrainingArguments("test_trainer")
trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)

trainer.train()

This returns the following output:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_15176/367029658.py in <module>
      3     model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)
      4 
----> 5 trainer.train()

c:\users\kjp19\appdata\local\programs\python\python38\lib\site-packages\transformers\trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1288             self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control)
   1289 
-> 1290             for step, inputs in enumerate(epoch_iterator):
   1291 
   1292                 # Skip past any already trained steps if resuming training

c:\users\kjp19\appdata\local\programs\python\python38\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
    519             if self._sampler_iter is None:
    520                 self._reset()
--> 521             data = self._next_data()
    522             self._num_yielded += 1
    523             if self._dataset_kind == _DatasetKind.Iterable and \

c:\users\kjp19\appdata\local\programs\python\python38\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
    559     def _next_data(self):
    560         index = self._next_index()  # may raise StopIteration
--> 561         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    562         if self._pin_memory:
    563             data = _utils.pin_memory.pin_memory(data)

c:\users\kjp19\appdata\local\programs\python\python38\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
     42     def fetch(self, possibly_batched_index):
     43         if self.auto_collation:
---> 44             data = [self.dataset[idx] for idx in possibly_batched_index]
     45         else:
     46             data = self.dataset[possibly_batched_index]

c:\users\kjp19\appdata\local\programs\python\python38\lib\site-packages\torch\utils\data\_utils\fetch.py in <listcomp>(.0)
     42     def fetch(self, possibly_batched_index):
     43         if self.auto_collation:
---> 44             data = [self.dataset[idx] for idx in possibly_batched_index]
     45         else:
     46             data = self.dataset[possibly_batched_index]

~\AppData\Local\Temp/ipykernel_15176/2427074494.py in __getitem__(self, idx)
     13 
     14     def __getitem__(self, idx):
---> 15         item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
     16         item['labels'] = torch.tensor(self.labels[idx])
     17         return item

AttributeError: 'Dataset' object has no attribute 'items'

Hi,

I see you are first working with a HuggingFace Dataset (the object returned by the load_dataset function), and then converting it to a PyTorch Dataset. That conversion is exactly what triggers the error: after .map(), encoded_dataset_train is still a HuggingFace Dataset, not a plain dict of encodings, so it has no .items() method for your __getitem__ to call.
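
If you did want to keep your custom WiCDataset class, the minimal fix would be to build the encodings with the tokenizer directly, since that returns a dict-like BatchEncoding which does support .items() — a sketch, assuming the same train.jsonl/test.jsonl files and the tokenizer defined in your script:

# tokenize the raw sentence lists directly; the result is a
# BatchEncoding (a dict of lists), not a HuggingFace Dataset
train_encodings = tokenizer(dataset['train']['sentence1'], padding='max_length', truncation=True)
test_encodings = tokenizer(dataset['test']['sentence1'], padding='max_length', truncation=True)

train_dataset = WiCDataset(train_encodings, dataset['train']['label'])
test_dataset = WiCDataset(test_encodings, dataset['test']['label'])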

That said, the custom PyTorch class is not actually required. Also, you can tokenize your training and test splits in one go:

from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import torch

# tokenizer and model (checkpoint assumed: bert-base-uncased)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# load local data as HuggingFace Dataset
dataset = load_dataset('json', data_files={'train': 'train.jsonl', 'test': 'test.jsonl'})

def preprocess_data(examples):
    # encode a batch of sentences
    encoding = tokenizer(examples["sentence1"], padding="max_length", truncation=True)
    # add labels as a list
    encoding["labels"] = examples["label"]

    return encoding

# tokenize sentences + add labels (batched=True so each call gets a batch)
encoded_dataset = dataset.map(preprocess_data, batched=True)
# turn into a PyTorch-formatted dataset (columns returned as tensors)
encoded_dataset.set_format("torch")

training_args = TrainingArguments("test_trainer")
trainer = Trainer(
    model=model, args=training_args, train_dataset=encoded_dataset["train"], eval_dataset=encoded_dataset["test"])

trainer.train()
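
One more note: WiC (Word-in-Context) is a sentence-pair task, so if your jsonl files follow the usual format with both a "sentence1" and a "sentence2" field, you'll likely want to encode the two sentences together, so that BERT sees each pair joined by a [SEP] token — a sketch, assuming those field names:

def preprocess_pairs(examples):
    # encode each (sentence1, sentence2) pair as a single sequence;
    # the tokenizer inserts [SEP] between the two sentences
    encoding = tokenizer(examples["sentence1"], examples["sentence2"],
                         padding="max_length", truncation=True)
    encoding["labels"] = examples["label"]
    return encoding

encoded_dataset = dataset.map(preprocess_pairs, batched=True)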