Hi! I am trying to reproduce this notebook: https://colab.research.google.com/drive/1-JIJlao4dI-Ilww_NnTc0rxtp-ymgDgM?usp=sharing. However, I cannot get it to work in Colab: I hit an OOM error every time, with both the GPU and None runtimes. I also downloaded it and ran it on my laptop, with no luck either.
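For what it is worth, this is the kind of reduced-memory configuration I would try next in Colab. It is just a sketch on my side: fp16 needs a GPU (and apex on older transformers versions), and gradient_accumulation_steps trades more optimizer steps for less per-step memory while keeping the effective batch size.

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,   # smaller per-step batches use less memory
    gradient_accumulation_steps=4,   # effective batch size stays at 4 * 4 = 16
    fp16=True,                       # half precision roughly halves activation memory
    logging_dir='./logs',
)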
Despite this, I wanted to adapt that tutorial to use a different pre-trained model and a local CSV dataset of my own. This is my code:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
model_name = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
train_dataset, validation_dataset = load_dataset(
    "csv",
    delimiter="\t",
    data_files={"train": "spanish_train.csv", "validation": "spanish_val.csv"},
    split=["train", "validation"],
)
train_dataset.remove_column_("id_str")
train_dataset.rename_column_("TWEET", "tweet")
train_dataset.rename_column_("LABEL", "label")
validation_dataset.remove_column_("id_str")
validation_dataset.rename_column_("TWEET", "tweet")
validation_dataset.rename_column_("LABEL", "label")
def tokenize(batch):
    return tokenizer(batch["tweet"], padding=True, truncation=True, max_length=10000)
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=10)
validation_dataset = validation_dataset.map(tokenize, batched=True, batch_size=10)
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
validation_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
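# (My understanding: set_format('torch', columns=[...]) makes the dataset
# return torch tensors for exactly the listed columns, so all of them have
# to be numeric; the raw 'tweet' text is no longer returned after this.)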
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluate_during_training=True,
    logging_dir='./logs',
)
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset
)
trainer.train()
On this last line I am getting this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-93-3435b262f1ae> in <module>()
----> 1 trainer.train()
10 frames
/usr/local/lib/python3.6/dist-packages/transformers/trainer.py in train(self, model_path)
490 self._past = None
491
--> 492 for step, inputs in enumerate(epoch_iterator):
493
494 # Skip past any already trained steps if resuming training
/usr/local/lib/python3.6/dist-packages/tqdm/notebook.py in __iter__(self, *args, **kwargs)
213 def __iter__(self, *args, **kwargs):
214 try:
--> 215 for obj in super(tqdm_notebook, self).__iter__(*args, **kwargs):
216 # return super(tqdm...) will not catch exception
217 yield obj
/usr/local/lib/python3.6/dist-packages/tqdm/std.py in __iter__(self)
1102 fp_write=getattr(self.fp, 'write', sys.stderr.write))
1103
-> 1104 for obj in iterable:
1105 yield obj
1106 # Update and possibly print the progressbar.
/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in __next__(self)
361
362 def __next__(self):
--> 363 data = self._next_data()
364 self._num_yielded += 1
365 if self._dataset_kind == _DatasetKind.Iterable and \
/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
401 def _next_data(self):
402 index = self._next_index() # may raise StopIteration
--> 403 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
404 if self._pin_memory:
405 data = _utils.pin_memory.pin_memory(data)
/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
/usr/local/lib/python3.6/dist-packages/nlp/arrow_dataset.py in __getitem__(self, key)
717 format_columns=self._format_columns,
718 output_all_columns=self._output_all_columns,
--> 719 format_kwargs=self._format_kwargs,
720 )
721
/usr/local/lib/python3.6/dist-packages/nlp/arrow_dataset.py in _getitem(self, key, format_type, format_columns, output_all_columns, format_kwargs)
705 format_columns=format_columns,
706 output_all_columns=output_all_columns,
--> 707 format_kwargs=format_kwargs,
708 )
709 return outputs
/usr/local/lib/python3.6/dist-packages/nlp/arrow_dataset.py in _convert_outputs(self, outputs, format_type, format_columns, output_all_columns, format_kwargs)
617 continue
618 if format_columns is None or k in format_columns:
--> 619 v = map_nested(command, v, **map_nested_kwargs)
620 output_dict[k] = v
621 return output_dict
/usr/local/lib/python3.6/dist-packages/nlp/utils/py_utils.py in map_nested(function, data_struct, dict_only, map_list, map_tuple, map_numpy)
189 return np.array(mapped)
190 # Singleton
--> 191 return function(data_struct)
192
193
TypeError: new(): invalid data type 'str'
Does anyone know what I am doing wrong?
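For reference, my own guess is that the label column may still contain strings, since the crash happens while converting a batch to torch tensors. This is the check and workaround I would try before set_format (a sketch, assuming the nlp column API; the label names in label2id are made up for illustration):

# Inspect the column types; if 'label' is a string feature, torch cannot
# convert it, which would explain "new(): invalid data type 'str'".
print(train_dataset.features)

# Hypothetical mapping from my CSV's string labels to integers.
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

def encode_labels(batch):
    return {"label": [label2id[l] for l in batch["label"]]}

# Run this before set_format('torch', ...) so that 'label' is numeric.
train_dataset = train_dataset.map(encode_labels, batched=True)
validation_dataset = validation_dataset.map(encode_labels, batched=True)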