I’m attempting to load the WiC dataset given to me for a class final project to fine-tune a BERT model but keep getting errors.
I think it could be a problem with getitems but I’m not certain how I would change that to fit this dataset.
My Code is:
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import torch
dataset = load_dataset('json', data_files={'train': 'train.jsonl', 'test': 'test.jsonl'})
train_texts, train_labels = dataset['train'], dataset['train']['label']
test_texts, test_labels = dataset['test'], dataset['test']['label']
encoded_dataset_train = train_texts.map(lambda examples: tokenizer(examples['sentence1']), batched=True)
encoded_dataset_test = test_texts.map(lambda examples: tokenizer(examples['sentence1']), batched=True)
class WiCDataset(torch.utils.data.Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = labels
def __getitem__(self, idx):
item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
item['labels'] = torch.tensor(self.labels[idx])
return item
def __len__(self):
return len(self.labels)
train_dataset = WiCDataset(encoded_dataset_train, train_labels)
test_dataset = WiCDataset(encoded_dataset_test, test_labels)
training_args = TrainingArguments("test_trainer")
trainer = Trainer(
model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)
trainer.train()
Returns the output:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_15176/367029658.py in <module>
3 model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)
4
----> 5 trainer.train()
c:\users\kjp19\appdata\local\programs\python\python38\lib\site-packages\transformers\trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1288 self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control)
1289
-> 1290 for step, inputs in enumerate(epoch_iterator):
1291
1292 # Skip past any already trained steps if resuming training
c:\users\kjp19\appdata\local\programs\python\python38\lib\site-packages\torch\utils\data\dataloader.py in __next__(self)
519 if self._sampler_iter is None:
520 self._reset()
--> 521 data = self._next_data()
522 self._num_yielded += 1
523 if self._dataset_kind == _DatasetKind.Iterable and \
c:\users\kjp19\appdata\local\programs\python\python38\lib\site-packages\torch\utils\data\dataloader.py in _next_data(self)
559 def _next_data(self):
560 index = self._next_index() # may raise StopIteration
--> 561 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
562 if self._pin_memory:
563 data = _utils.pin_memory.pin_memory(data)
c:\users\kjp19\appdata\local\programs\python\python38\lib\site-packages\torch\utils\data\_utils\fetch.py in fetch(self, possibly_batched_index)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
c:\users\kjp19\appdata\local\programs\python\python38\lib\site-packages\torch\utils\data\_utils\fetch.py in <listcomp>(.0)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
~\AppData\Local\Temp/ipykernel_15176/2427074494.py in __getitem__(self, idx)
13
14 def __getitem__(self, idx):
---> 15 item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
16 item['labels'] = torch.tensor(self.labels[idx])
17 return item
AttributeError: 'Dataset' object has no attribute 'items'