ValueError: Unable to create tensor

Hi,

I want to use the sms_spam dataset to fine-tune the distilbert-base-uncased model with the LoRA approach, but I get the following error:

ValueError Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:748, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
747 if not is_tensor(value):
→ 748 tensor = as_tensor(value)
750 # Removing this for now in favor of controlling the shape with prepend_batch_axis
751 # # at-least2d
752 # if tensor.ndim > 2:
753 # tensor = tensor.squeeze(0)
754 # elif tensor.ndim < 2:
755 # tensor = tensor[None, :]

File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:720, in BatchEncoding.convert_to_tensors.<locals>.as_tensor(value, dtype)
719 return torch.tensor(np.array(value))
→ 720 return torch.tensor(value)

ValueError: too many dimensions 'str'

The above exception was the direct cause of the following exception:

ValueError Traceback (most recent call last)
Cell In[7], line 36
13 training_args = TrainingArguments(
14 output_dir="my_model",
15 learning_rate=1e-3,
(…)
23 remove_unused_columns=False
24 )
26 trainer = Trainer(
27 model=model,
28 args=training_args,
(…)
33 compute_metrics=compute_metrics,
34 )
—> 36 trainer.train()

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1537, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1535 hf_hub_utils.enable_progress_bars()
1536 else:
→ 1537 return inner_training_loop(
1538 args=args,
1539 resume_from_checkpoint=resume_from_checkpoint,
1540 trial=trial,
1541 ignore_keys_for_eval=ignore_keys_for_eval,
1542 )

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1821, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1818 rng_to_sync = True
1820 step = -1
→ 1821 for step, inputs in enumerate(epoch_iterator):
1822 total_batched_samples += 1
1824 if self.args.include_num_input_tokens_seen:

File /opt/conda/lib/python3.10/site-packages/accelerate/data_loader.py:448, in DataLoaderShard.__iter__(self)
446 # We iterate one batch ahead to check when we are at the end
447 try:
→ 448 current_batch = next(dataloader_iter)
449 except StopIteration:
450 yield

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:633, in _BaseDataLoaderIter.__next__(self)
630 if self._sampler_iter is None:
631 # TODO(https://github.com/pytorch/pytorch/issues/76750)
632 self._reset() # type: ignore[call-arg]
→ 633 data = self._next_data()
634 self._num_yielded += 1
635 if self._dataset_kind == _DatasetKind.Iterable and \
636 self._IterableDataset_len_called is not None and \
637 self._num_yielded > self._IterableDataset_len_called:

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:677, in _SingleProcessDataLoaderIter._next_data(self)
675 def _next_data(self):
676 index = self._next_index() # may raise StopIteration
→ 677 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
678 if self._pin_memory:
679 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:54, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
52 else:
53 data = self.dataset[possibly_batched_index]
—> 54 return self.collate_fn(data)

File /opt/conda/lib/python3.10/site-packages/transformers/data/data_collator.py:249, in DataCollatorWithPadding.__call__(self, features)
248 def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
→ 249 batch = self.tokenizer.pad(
250 features,
251 padding=self.padding,
252 max_length=self.max_length,
253 pad_to_multiple_of=self.pad_to_multiple_of,
254 return_tensors=self.return_tensors,
255 )
256 if "label" in batch:
257 batch["labels"] = batch["label"]

File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:3299, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
3296 batch_outputs[key] = []
3297 batch_outputs[key].append(value)
→ 3299 return BatchEncoding(batch_outputs, tensor_type=return_tensors)

File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:223, in BatchEncoding.__init__(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)
219 n_sequences = encoding[0].n_sequences
221 self._n_sequences = n_sequences
→ 223 self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:764, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
759 if key == "overflowing_tokens":
760 raise ValueError(
761 "Unable to create tensor returning overflowing tokens of different lengths. "
762 "Please see if a fast version of this tokenizer is available to have this feature available."
763 ) from e
→ 764 raise ValueError(
765 "Unable to create tensor, you should probably activate truncation and/or padding with"
766 " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"
767 f" features ({key} in this case) have excessive nesting (inputs type list where type int is"
768 " expected)."
769 ) from e
771 return self

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (sms in this case) have excessive nesting (inputs type list where type int is expected).

Could anyone help me with this issue? Here is my code:

import numpy as np
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

dataset = load_dataset("sms_spam", split="train").train_test_split(
    test_size=0.2, shuffle=True, seed=23
)
splits = ["train", "test"]

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["sms"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(preprocess_function, batched=True)
test_dataset = tokenized_dataset["test"]
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label={0: "not spam", 1: "spam"},
    label2id={"not spam": 0, "spam": 1},
)

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_lin", "k_lin", "v_lin"],
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, config)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

train_lora = tokenized_dataset["train"].rename_column("label", "labels")
test_lora = tokenized_dataset["test"].rename_column("label", "labels")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="my_model",
    learning_rate=1e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_lora,
    eval_dataset=test_lora,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Consider creating your own data collator, as this will let you more easily debug inputs at the batch level.
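For example, a minimal sketch of such a collator (my own illustration, not tested against your exact setup; the class name DebugCollator and the KEEP set are made up, and it assumes the raw sms string column reaches the collator because remove_unused_columns=False keeps every dataset column):

from transformers import DataCollatorWithPadding

class DebugCollator(DataCollatorWithPadding):
    # Hypothetical debugging helper, not part of the original code.
    # It keeps only fields that can be turned into tensors, so the raw
    # "sms" strings never reach torch.tensor() (the "too many dimensions 'str'" step).
    KEEP = {"input_ids", "attention_mask", "labels", "label"}

    def __call__(self, features):
        # Uncomment to inspect exactly what each batch contains:
        # print({k: type(v) for k, v in features[0].items()})
        cleaned = [{k: v for k, v in f.items() if k in self.KEEP} for f in features]
        return super().__call__(cleaned)

data_collator = DebugCollator(tokenizer=tokenizer)

If the inspection confirms that sms arrives as a plain str, the simpler fixes are to drop the text column when mapping (dataset.map(preprocess_function, batched=True, remove_columns=["sms"])) or to leave remove_unused_columns at its default of True so the Trainer strips unused columns itself.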