RuntimeError: CUDA error: device-side assert triggered

When I freshly train the token classification model (DistilBertForTokenClassification) and run a prediction on a single sentence that I type out manually, it runs fine. But when I try to run it on my dataset, it fails with `RuntimeError: CUDA error: device-side assert triggered`.

This is how I’m performing the training:

if __name__ == '__main__':
    import torch

    class WNUTDataset(torch.utils.data.Dataset):
        """Map-style dataset pairing encoded inputs with their labels.

        ``encodings`` is any object exposing ``.items()`` whose values can be
        indexed by example position; ``labels`` is an indexable, sized
        collection with one entry per example.
        """

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __len__(self):
            # The dataset size is driven by the labels, not the encodings.
            return len(self.labels)

        def __getitem__(self, idx):
            """Return example ``idx`` as a dict of freshly built tensors."""
            sample = {}
            for field, column in self.encodings.items():
                sample[field] = torch.tensor(column[idx])
            # Set last so it overrides any 'labels' field in the encodings.
            sample['labels'] = torch.tensor(self.labels[idx])
            return sample

    # The offset mapping must not be forwarded to the model, so strip it from
    # both splits before the encodings are wrapped in datasets.
    for _split_encodings in (train_encodings, val_encodings):
        _split_encodings.pop("offset_mapping")
    train_dataset = WNUTDataset(train_encodings, train_labels)
    val_dataset = WNUTDataset(val_encodings, val_labels)

    from transformers import (
        DistilBertForTokenClassification,
        Trainer,
        TrainingArguments,
    )

    # One output class per distinct tag.
    model = DistilBertForTokenClassification.from_pretrained(
        'distilbert-base-cased',
        num_labels=len(unique_tags),
    )

    # Keep the embedding matrix in step with the tokenizer's vocabulary size.
    model.resize_token_embeddings(len(tokenizer))

    training_args = TrainingArguments(
        output_dir='./results',          # where results are written
        logging_dir='./logs',            # where logs are written
        logging_steps=10,
        num_train_epochs=3,              # number of passes over the training set
        per_device_train_batch_size=16,  # training batch size per device
        per_device_eval_batch_size=64,   # evaluation batch size per device
        warmup_steps=500,                # warmup steps for the LR scheduler
        weight_decay=0.01,               # weight decay strength
    )

    trainer = Trainer(
        model=model,                  # the model instantiated above
        args=training_args,           # the arguments defined above
        train_dataset=train_dataset,  # data used for training
        eval_dataset=val_dataset,     # data used for evaluation
    )
    trainer.train()

This is the function that I’m using for prediction.

def _collapse_wordpieces(subwords, subword_preds):
    """Merge ``##`` continuation pieces back into whole tokens.

    Each merged token keeps the prediction of its first piece. A token is
    emitted when the *next* non-continuation piece is seen, so the returned
    lists start with an empty placeholder (``''`` / ``None``) and the final
    non-continuation piece (normally the closing special token) is never
    emitted.

    Returns:
        A ``(tokens, preds)`` pair of equally long lists.
    """
    collapsed_tokens = []
    collapsed_preds = []
    curr_token = ''
    curr_pred = None
    last_index = len(subwords) - 1
    for i, (subword, pred) in enumerate(zip(subwords, subword_preds)):
        if subword.startswith('##'):
            curr_token += subword[2:]
            # A trailing continuation piece has no successor to flush it.
            if i == last_index:
                collapsed_tokens.append(curr_token)
                collapsed_preds.append(curr_pred)
        else:
            # Flush the token built so far, then start a new one.
            collapsed_tokens.append(curr_token)
            collapsed_preds.append(curr_pred)
            curr_token = subword
            curr_pred = pred
    return collapsed_tokens, collapsed_preds


def ner_normalize(text, clean=False):
    """Tag ``text`` with the token classifier and mask every non-"O" token.

    Args:
        text: The raw string to normalize.
        clean: When true, ``text`` is first passed through ``clean_posts``.

    Returns:
        The whole-word tokens joined by single spaces, where every token whose
        predicted tag is not ``"O"`` is replaced by the literal ``"CODE"``.

    Note:
        Input is truncated to the model's maximum sequence length, so text
        beyond that limit is absent from the result. Without truncation, an
        over-long input indexes past the position-embedding table; on a GPU
        that surfaces as ``CUDA error: device-side assert triggered``.
    """
    if clean:
        text = clean_posts(text)

    # Falls back to the tokenizer's own limit when the config has no such field.
    max_length = getattr(model.config, "max_position_embeddings", None)
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )
    # The model may live on a GPU after training; inputs must be on its device.
    inputs = inputs.to(model.device)

    # Run in eval mode (dropout off) without autograd, then restore the
    # caller's train/eval mode so this function has no lasting side effect.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # token_type_ids=None is kept from the original call.
            # NOTE(review): confirm this model's forward accepts the argument.
            outputs = model(**inputs, token_type_ids=None)
    finally:
        model.train(was_training)

    # .tolist() copies to host memory, so this works for CPU and GPU tensors.
    subwords = [inverse_vocab[i] for i in inputs['input_ids'][0].tolist()]
    predicted_ids = torch.argmax(outputs['logits'], dim=-1)[0].tolist()
    subword_preds = [id2tag[i] for i in predicted_ids]

    tokens, token_preds = _collapse_wordpieces(subwords, subword_preds)

    # Skip the first two entries: the empty placeholder and the leading
    # special token.
    return ' '.join(
        token if pred == "O" else "CODE"
        for token, pred in zip(tokens[2:], token_preds[2:])
    )

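A device-side assert can have several causes, and the CUDA message alone does not say which one applies. Typically it is an out-of-range index, such as a token id, position, or label outside the range the model expects. Please run your code on the CPU to get a specific error message.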