Hi,
I want to fine-tune the distilbert-base-uncased model on the sms_spam dataset with the LoRA approach, but I get the following error:
```
ValueError                                Traceback (most recent call last)
File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:748, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
    747 if not is_tensor(value):
--> 748     tensor = as_tensor(value)
    750     # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
    751     # # at-least2d
    752     # if tensor.ndim > 2:
    753     #     tensor = tensor.squeeze(0)
    754     # elif tensor.ndim < 2:
    755     #     tensor = tensor[None, :]

File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:720, in BatchEncoding.convert_to_tensors.<locals>.as_tensor(value, dtype)
    719         return torch.tensor(np.array(value))
--> 720     return torch.tensor(value)

ValueError: too many dimensions 'str'

The above exception was the direct cause of the following exception:

ValueError                                Traceback (most recent call last)
Cell In[7], line 36
     13 training_args = TrainingArguments(
     14     output_dir="my_model",
     15     learning_rate=1e-3,
   (...)
     23     remove_unused_columns=False
     24 )
     26 trainer = Trainer(
     27     model=model,
     28     args=training_args,
   (...)
     33     compute_metrics=compute_metrics,
     34 )
---> 36 trainer.train()

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1537, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1535     hf_hub_utils.enable_progress_bars()
   1536 else:
-> 1537     return inner_training_loop(
   1538         args=args,
   1539         resume_from_checkpoint=resume_from_checkpoint,
   1540         trial=trial,
   1541         ignore_keys_for_eval=ignore_keys_for_eval,
   1542     )

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1821, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1818     rng_to_sync = True
   1820 step = -1
-> 1821 for step, inputs in enumerate(epoch_iterator):
   1822     total_batched_samples += 1
   1824     if self.args.include_num_input_tokens_seen:

File /opt/conda/lib/python3.10/site-packages/accelerate/data_loader.py:448, in DataLoaderShard.__iter__(self)
    446 # We iterate one batch ahead to check when we are at the end
    447 try:
--> 448     current_batch = next(dataloader_iter)
    449 except StopIteration:
    450     yield

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:633, in _BaseDataLoaderIter.__next__(self)
    630 if self._sampler_iter is None:
    631     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    632     self._reset()  # type: ignore[call-arg]
--> 633 data = self._next_data()
    634 self._num_yielded += 1
    635 if self._dataset_kind == _DatasetKind.Iterable and \
    636         self._IterableDataset_len_called is not None and \
    637         self._num_yielded > self._IterableDataset_len_called:

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:677, in _SingleProcessDataLoaderIter._next_data(self)
    675 def _next_data(self):
    676     index = self._next_index()  # may raise StopIteration
--> 677     data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    678     if self._pin_memory:
    679         data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:54, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
     52 else:
     53     data = self.dataset[possibly_batched_index]
---> 54 return self.collate_fn(data)

File /opt/conda/lib/python3.10/site-packages/transformers/data/data_collator.py:249, in DataCollatorWithPadding.__call__(self, features)
    248 def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
--> 249     batch = self.tokenizer.pad(
    250         features,
    251         padding=self.padding,
    252         max_length=self.max_length,
    253         pad_to_multiple_of=self.pad_to_multiple_of,
    254         return_tensors=self.return_tensors,
    255     )
    256     if "label" in batch:
    257         batch["labels"] = batch["label"]

File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:3299, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
   3296         batch_outputs[key] = []
   3297     batch_outputs[key].append(value)
-> 3299 return BatchEncoding(batch_outputs, tensor_type=return_tensors)

File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:223, in BatchEncoding.__init__(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)
    219     n_sequences = encoding[0].n_sequences
    221 self._n_sequences = n_sequences
--> 223 self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)

File /opt/conda/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:764, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
    759 if key == "overflowing_tokens":
    760     raise ValueError(
    761         "Unable to create tensor returning overflowing tokens of different lengths. "
    762         "Please see if a fast version of this tokenizer is available to have this feature available."
    763     ) from e
--> 764 raise ValueError(
    765     "Unable to create tensor, you should probably activate truncation and/or padding with"
    766     " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"
    767     f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is"
    768     " expected)."
    769 ) from e
    771 return self

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`sms` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
```
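If I read the last line correctly, the collator fails on the `sms` feature, which is the raw text column of the dataset rather than the tokenized inputs. A quick check I ran after the preprocessing step (shown in the code below), which I may be misreading, suggests the string column is indeed still present after tokenization:

```python
# Columns of the tokenized train split: the raw `sms` string column
# survives the .map() call alongside the tokenized features.
print(tokenized_dataset["train"].column_names)
# ['sms', 'label', 'input_ids', 'attention_mask']
```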
Could anyone help me with this issue? Here is my code:
```python
import numpy as np
from torch.utils.data import DataLoader
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

dataset = load_dataset("sms_spam", split="train").train_test_split(
    test_size=0.2, shuffle=True, seed=23
)
splits = ["train", "test"]

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["sms"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(preprocess_function, batched=True)

test_dataset = tokenized_dataset["test"]
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label={0: "not spam", 1: "spam"},
    label2id={"not spam": 0, "spam": 1},
)

config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_lin", "k_lin", "v_lin"],
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, config)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}

train_lora = tokenized_dataset["train"].rename_column("label", "labels")
test_lora = tokenized_dataset["test"].rename_column("label", "labels")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="my_model",
    learning_rate=1e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_lora,
    eval_dataset=test_lora,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
```
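For what it's worth, here are two workarounds I'm considering, based on my (possibly wrong) guess that `remove_unused_columns=False` keeps the raw `sms` strings in each batch and `DataCollatorWithPadding` cannot convert strings to tensors:

```python
# Option 1: drop the raw text column while tokenizing, so only
# tensorizable features ('label', 'input_ids', 'attention_mask')
# ever reach the collator.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["sms"],
)

# Option 2: restore the default so the Trainer drops columns the
# model's forward() does not accept.
training_args = TrainingArguments(
    output_dir="my_model",
    learning_rate=1e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    remove_unused_columns=True,  # default; was False above
)
```

Is one of these the recommended fix, or is something else going on?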