Hello,
I’m getting the following errors on evaluation using the Trainer API with a pretrained longformer model:
indexSelectLargeIndex: block: [31,0,0], thread: [32,0,0] Assertion srcIndex < srcSelectDimSize
failed.
Traceback (most recent call last):
File “/home/alexanderrasmussen/folder/sentiment_2.py”, line 62, in
trainer.train()
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/transformers/trainer.py”, line 1885, in train
return inner_training_loop(
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/transformers/trainer.py”, line 2311, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval)
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/transformers/trainer.py”, line 2721, in _maybe_log_save_evaluate
metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/transformers/trainer.py”, line 3572, in evaluate
output = eval_loop(
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/transformers/trainer.py”, line 3757, in evaluation_loop
loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys)
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/transformers/trainer.py”, line 3971, in prediction_step
loss, outputs = self.compute_loss(model, inputs, return_outputs=True)
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/transformers/trainer.py”, line 3264, in compute_loss
outputs = model(**inputs)
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/torch/nn/modules/module.py”, line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/torch/nn/modules/module.py”, line 1541, in _call_impl
return forward_call(*args, **kwargs)
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/transformers/models/longformer/modeling_longformer.py”, line 1916, in forward
outputs = self.longformer(
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/torch/nn/modules/module.py”, line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/torch/nn/modules/module.py”, line 1541, in _call_impl
return forward_call(*args, **kwargs)
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/transformers/models/longformer/modeling_longformer.py”, line 1729, in forward
encoder_outputs = self.encoder(
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/torch/nn/modules/module.py”, line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/torch/nn/modules/module.py”, line 1541, in _call_impl
return forward_call(*args, **kwargs)
File “/home/alexanderrasmussen/.conda/envs/cs224n-gpu/lib/python3.10/site-packages/transformers/models/longformer/modeling_longformer.py”, line 1282, in forward
is_global_attn = is_index_global_attn.flatten().any().item()
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA
to enable device-side assertions.
My Python code is very simple and I’m having a really hard time figuring out where the error is coming from:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import torch
import numpy as np
import pandas as pd
import transformers
import evaluate
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from torch.utils.data import DataLoader
from torch.nn.functional import softmax
from transformers import LongformerTokenizer, LongformerForSequenceClassification
f = 'rev_data.csv'
df = pd.read_csv(f)
df = df[['review_text', 'rating']]
upper = df['rating'].quantile(0.75)
lower = df['rating'].quantile(0.25)
df = df.loc[(df.rating >= upper) | (df.rating <= lower)]
df['label'] = (df.rating >= upper).astype(int)
dataset = Dataset.from_pandas(df)
dataset = dataset.class_encode_column('label')
dataset = dataset.train_test_split(train_size=0.8, seed=42)
dataset_clean = dataset['train'].train_test_split(train_size=0.75, seed=42)
dataset_clean['validation'] = dataset_clean.pop('test')
dataset_clean['test'] = dataset['test']
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
tokenized_dataset = dataset_clean.map(lambda batch: tokenizer(batch['review_text']),
batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['review_text', 'rating'])
data_collator = DataCollatorWithPadding(
tokenizer=tokenizer, pad_to_multiple_of=512)
model = LongformerForSequenceClassification.from_pretrained(
'allenai/longformer-base-4096')
training_args = TrainingArguments('test-trainer', evaluation_strategy='epoch')
training_args.set_dataloader(train_batch_size=1, eval_batch_size=1)
def compute_metrics(eval_preds):
metric = evaluate.load('glue', 'mrpc')
logits, labels = eval_preds
predictions = np.argmax(logits, axis=-1)
return metric.compute(predictions=predictions, references=labels)
trainer = Trainer(
model,
training_args,
train_dataset=tokenized_dataset['train'],
eval_dataset=tokenized_dataset['validation'],
data_collator=data_collator,
tokenizer=tokenizer,
compute_metrics=compute_metrics
)
trainer.train()
If you have any ideas, I would be very grateful to hear them!