Unable to properly map tensors to examples

After using the map function to tokenize all splits via a lambda function, I’m unable to pass the encoded text/inputs forward through Trainer. Thanks in advance for any direction!

category_data = load_dataset("csv", data_files="testdatav2.csv")
category_data = category_data.remove_columns(["amazoncontactid", "regiid", "lineofbusiness", "primary_label"])
category_data = category_data['train']
train_testvalid = category_data.train_test_split(test_size=0.3)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
from datasets.dataset_dict import DatasetDict
cd = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
print(cd)
category_data = load_dataset("csv", data_files="testdatav2.csv")
category_data = category_data.remove_columns(["amazoncontactid", "regiid", "lineofbusiness", "primary_label"])
category_data = category_data['train']
train_testvalid = category_data.train_test_split(test_size=0.3)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
from datasets.dataset_dict import DatasetDict
cd = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
print(cd)
Using custom data configuration default-89c081370f72e624
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-89c081370f72e624/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%
1/1 [00:00<00:00, 31.60it/s]
DatasetDict({
    train: Dataset({
        features: ['transcript', 'idx'],
        num_rows: 858
    })
    test: Dataset({
        features: ['transcript', 'idx'],
        num_rows: 185
    })
    valid: Dataset({
        features: ['transcript', 'idx'],
        num_rows: 184
    })
})

Here is where I tokenize my examples:

from transformers import AutoTokenizer

model_transcripts = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_transcripts)
from transformers import DistilBertTokenizer
db_tokenizer = DistilBertTokenizer.from_pretrained(model_transcripts)
transcripts_encoded = cd.map(lambda examples: tokenizer(examples["transcript"]), batched=True)
print(transcripts_encoded)

transcripts_encoded = transcripts_encoded.set_format("torch",
                              columns=["input_ids", "attention_mask", "idx"])
print(transcripts_encoded)
None

I’ve checked my variable for any NaN values and can confirm there are none. Any help would be greatly appreciated!

Hi! set_format modifies the dataset in-place - it modifies the dataset’s “formatting state” - and returns None, which is why the reassigned variable prints None. To get a new dataset with the updated “formatting state”, use with_format with the same parameters.

If this doesn’t help, please provide a self-contained example with real/dummy data, so we can debug it.

Thanks! I just learned about “in place” operations, so just running transcripts_encoded.set_format(...) as opposed to transcripts_encoded = transcripts_encoded.set_format(...) also worked. The only issue now is that I get this TypeError in the traceback:

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-31-521ce7183e84> in <module>
     21     compute_metrics=compute_metrics,
     22 )
---> 23 trainer.train_dataset[0:1]
     24 # trainer.train()
     25 

/opt/conda/lib/python3.7/site-packages/datasets/arrow_dataset.py in __getitem__(self, key)
   2355         """Can be used to index columns (by string names) or rows (by integer index or iterable of indices or bools)."""
   2356         return self._getitem(
-> 2357             key,
   2358         )
   2359 

/opt/conda/lib/python3.7/site-packages/datasets/arrow_dataset.py in _getitem(self, key, decoded, **kwargs)
   2340         pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
   2341         formatted_output = format_table(
-> 2342             pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns
   2343         )
   2344         return formatted_output

/opt/conda/lib/python3.7/site-packages/datasets/formatting/formatting.py in format_table(table, key, formatter, format_columns, output_all_columns)
    507     python_formatter = PythonFormatter(features=None)
    508     if format_columns is None:
--> 509         return formatter(pa_table, query_type=query_type)
    510     elif query_type == "column":
    511         if key in format_columns:

/opt/conda/lib/python3.7/site-packages/datasets/formatting/formatting.py in __call__(self, pa_table, query_type)
    284             return self.format_column(pa_table)
    285         elif query_type == "batch":
--> 286             return self.format_batch(pa_table)
    287 
    288     def format_row(self, pa_table: pa.Table) -> RowFormat:

/opt/conda/lib/python3.7/site-packages/datasets/formatting/torch_formatter.py in format_batch(self, pa_table)
     95         if self.decoded:
     96             batch = self.python_features_decoder.decode_batch(batch)
---> 97         batch = self.recursive_tensorize(batch)
     98         for column_name in batch:
     99             batch[column_name] = self._consolidate(batch[column_name])

/opt/conda/lib/python3.7/site-packages/datasets/formatting/torch_formatter.py in recursive_tensorize(self, data_struct)
     75 
     76     def recursive_tensorize(self, data_struct: dict):
---> 77         return map_nested(self._recursive_tensorize, data_struct)
     78 
     79     def format_row(self, pa_table: pa.Table) -> dict:

/opt/conda/lib/python3.7/site-packages/datasets/utils/py_utils.py in map_nested(function, data_struct, dict_only, map_list, map_tuple, map_numpy, num_proc, parallel_min_length, types, disable_tqdm, desc)
    444         mapped = [
    445             _single_map_nested((function, obj, types, None, True, None))
--> 446             for obj in logging.tqdm(iterable, disable=disable_tqdm, desc=desc)
    447         ]
    448     else:

/opt/conda/lib/python3.7/site-packages/datasets/utils/py_utils.py in <listcomp>(.0)
    444         mapped = [
    445             _single_map_nested((function, obj, types, None, True, None))
--> 446             for obj in logging.tqdm(iterable, disable=disable_tqdm, desc=desc)
    447         ]
    448     else:

/opt/conda/lib/python3.7/site-packages/datasets/utils/py_utils.py in _single_map_nested(args)
    344     # Singleton first to spare some computation
    345     if not isinstance(data_struct, dict) and not isinstance(data_struct, types):
--> 346         return function(data_struct)
    347 
    348     # Reduce logging to keep things readable in multiprocessing with tqdm

/opt/conda/lib/python3.7/site-packages/datasets/formatting/torch_formatter.py in _recursive_tensorize(self, data_struct)
     72             if data_struct.dtype == object:  # torch tensors cannot be instantied from an array of objects
     73                 return self._consolidate([self.recursive_tensorize(substruct) for substruct in data_struct])
---> 74         return self._tensorize(data_struct)
     75 
     76     def recursive_tensorize(self, data_struct: dict):

/opt/conda/lib/python3.7/site-packages/datasets/formatting/torch_formatter.py in _tensorize(self, value)
     65             if isinstance(value, PIL.Image.Image):
     66                 value = np.asarray(value)
---> 67         return torch.tensor(value, **{**default_dtype, **self.torch_tensor_kwargs})
     68 
     69     def _recursive_tensorize(self, data_struct: dict):

TypeError: tensor() got an unexpected keyword argument 'column_names'

Which I assume means I’m on an outdated version of torch that doesn’t support the column_names parameter. I just installed torch/torchvision a few weeks ago, though?

Please check the signature of set_format - it’s columns, not column_names.

Yep that worked, thanks

I’m now back to the original issue of not being able to calculate the loss.
Is it possible that the inputs I’m passing forward are not structured correctly? Here’s a single example I’m passing through Trainer:

The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Transcript, idx. If Transcript, idx are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 646
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 82
  Number of trainable parameters = 66961162
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-55-cc350a9b0377> in <module>
     22 )
     23 # trainer.train_dataset[0:1]
---> 24 trainer.train()
     25 

/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1503             resume_from_checkpoint=resume_from_checkpoint,
   1504             trial=trial,
-> 1505             ignore_keys_for_eval=ignore_keys_for_eval,
   1506         )
   1507 

/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1747                         tr_loss_step = self.training_step(model, inputs)
   1748                 else:
-> 1749                     tr_loss_step = self.training_step(model, inputs)
   1750 
   1751                 if (

/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in training_step(self, model, inputs)
   2506 
   2507         with self.compute_loss_context_manager():
-> 2508             loss = self.compute_loss(model, inputs)
   2509 
   2510         if self.args.n_gpu > 1:

/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs)
   2552             if isinstance(outputs, dict) and "loss" not in outputs:
   2553                 raise ValueError(
-> 2554                     "The model did not return a loss from the inputs, only the following keys: "
   2555                     f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
   2556                 )

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask.
training_args = TrainingArguments(
    output_dir="transcripts_test",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_cd["train"],
    eval_dataset=tokenized_cd["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train_dataset[0:1]
# trainer.train()


{'idx': tensor([1.]),
'input_ids': tensor([[ 101, 2047, 3446, 2003, 2205, 6450, 2005, 1996, 2051, 1045,
2064, 5247, 3752, 4790, 1012, 2009, 2001, 2026, 5165, 2000,
6509, 2017, 2651, 999, 4067, 2017, 2005, 3967, 2075, 1996,
2047, 2259, 2335, 999, 2031, 1037, 6919, 2717, 1997, 1996,
2154, 999, 2994, 3647, 1998, 7965, 999, 2065, 2045, 2003,
2505, 2842, 2057, 2089, 2022, 2583, 2000, 6509, 2017, 2007,
3531, 2514, 2489, 2000, 3967, 2149, 2153, 1012, 1045, 2001,
2074, 2667, 2000, 17542, 2026, 15002, 1012, 2038, 2009, 2042,
13261, 1029, 7632, 1010, 2045, 999, 1045, 3246, 2017, 1005,
2128, 2725, 2092, 2651, 1012, 4067, 2017, 2005, 3967, 2075,
102]]),
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1]])}

​Is it possible that the inputs I’m passing forward are not structured correctly?

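Yes, you also need to pass the column with labels. The model only returns a loss when it receives a labels input, and the error above shows it received only input_ids and attention_mask. Your preprocessing removes the primary_label column with remove_columns; if that is your target, keep it, convert it to integer class ids if it is not already, and name the column labels so that Trainer forwards it to the model.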