When I freshly train the token classification model (DistilBertForTokenClassification) and run a prediction on a single sentence that I type out manually, it runs fine, but when I run it over my dataset it fails with RuntimeError: CUDA error: device-side assert triggered.
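The error itself doesn't say much, because device-side asserts are reported asynchronously. To get a more useful traceback I've been rerunning the offending example with the model on CPU (a minimal sketch; failing_text is a hypothetical stand-in for whichever row the prediction loop dies on):

    import os
    import torch

    # Setting this before any CUDA work makes the assert surface at the call that caused it.
    os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

    # Re-running the same example on CPU raises the underlying error (e.g. an
    # out-of-range index) as a normal Python exception instead of a device-side assert.
    failing_text = "whichever row the prediction loop dies on"  # hypothetical stand-in
    model_cpu = model.to("cpu")
    inputs = tokenizer(failing_text, return_tensors="pt")
    with torch.no_grad():
        outputs = model_cpu(**inputs)

So far this hasn't pointed me at anything obvious, which is why I'm including the full training and prediction code below.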
This is how I’m performing the training:
if __name__ == '__main__':
    import torch

    class WNUTDataset(torch.utils.data.Dataset):
        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx])
            return item

        def __len__(self):
            return len(self.labels)

    train_encodings.pop("offset_mapping")  # we don't want to pass this to the model
    val_encodings.pop("offset_mapping")
    train_dataset = WNUTDataset(train_encodings, train_labels)
    val_dataset = WNUTDataset(val_encodings, val_labels)

    from transformers import DistilBertForTokenClassification
    model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))
    model.resize_token_embeddings(len(tokenizer))

    from transformers import Trainer, TrainingArguments

    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,                  # the instantiated 🤗 Transformers model to be trained
        args=training_args,           # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=val_dataset      # evaluation dataset
    )

    trainer.train()
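For completeness, train_encodings / train_labels and the tag mappings referenced above are built roughly like this, following the offset-mapping alignment from the Hugging Face token classification example (a sketch; the variable names match the ones used above, but train_texts / train_tags / val_texts / val_tags are stand-ins for my own splits and the details may differ slightly):

    import numpy as np
    from transformers import DistilBertTokenizerFast

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

    unique_tags = set(tag for doc in train_tags for tag in doc)
    tag2id = {tag: id for id, tag in enumerate(unique_tags)}
    id2tag = {id: tag for tag, id in tag2id.items()}

    # used later in ner_normalize to map input ids back to wordpieces
    inverse_vocab = {v: k for k, v in tokenizer.get_vocab().items()}

    train_encodings = tokenizer(train_texts, is_split_into_words=True,
                                return_offsets_mapping=True, padding=True, truncation=True)
    val_encodings = tokenizer(val_texts, is_split_into_words=True,
                              return_offsets_mapping=True, padding=True, truncation=True)

    def encode_tags(tags, encodings):
        labels = [[tag2id[tag] for tag in doc] for doc in tags]
        encoded_labels = []
        for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
            # start with -100 everywhere so special tokens and subword pieces are ignored by the loss
            doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
            arr_offset = np.array(doc_offset)
            # only the first subword of each word (offset starting at 0 and non-empty) gets a real label
            doc_enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = doc_labels
            encoded_labels.append(doc_enc_labels.tolist())
        return encoded_labels

    train_labels = encode_tags(train_tags, train_encodings)
    val_labels = encode_tags(val_tags, val_encodings)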
This is the function that I'm using for prediction:
def ner_normalize(text, clean=False):
    if clean:
        text = clean_posts(text)

    inputs = tokenizer(text, return_tensors="pt")
    labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0)

    # Running the prediction
    outputs = model(**inputs, token_type_ids=None)

    text = [inverse_vocab[i] for i in inputs['input_ids'][0].numpy()]
    preds = [id2tag[i.item()] for i in torch.argmax(outputs['logits'], axis=-1)[0]]

    collapsed_tokens = []
    collapsed_preds = []
    curr_token = ''
    curr_pred = None
    for i, subword_pred_pair in enumerate(zip(text, preds)):
        subword, pred = subword_pred_pair
        if subword.startswith('##'):
            curr_token += subword[2:]
            if i == len(text) - 1:
                collapsed_tokens.append(curr_token)
                collapsed_preds.append(curr_pred)
        else:
            collapsed_tokens.append(curr_token)
            collapsed_preds.append(curr_pred)
            curr_token = subword
            curr_pred = pred

    outputs = zip(collapsed_tokens[2:], collapsed_preds[2:])
    normalized = ' '.join(
        [x[0] if x[1] == "O" else "CODE" for x in outputs]
    )
    return normalized
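Calling this on a single hand-typed sentence works as expected; it's only when I map it over the dataset that the assert is triggered. Roughly how I'm applying it (the DataFrame and column name below are hypothetical stand-ins for how I iterate over the data):

    # works fine on a single manually typed sentence
    print(ner_normalize("update the config file and restart the service"))

    # fails partway through with the CUDA device-side assert
    normalized_posts = [ner_normalize(post, clean=True) for post in df['text']]

Any idea what could cause the prediction to work on an ad-hoc sentence but blow up on the dataset?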