How to pass your encoded train and validation splits forward to Trainer (ValueError)

I’m unable to pass my encoded data (with hidden states) through the Hugging Face Trainer. Below is the call to Trainer with its arguments and the full traceback. I’m not really sure where to begin with this error, as I believe I’ve satisfied all the requirements to pass the encoded data forward, unless the inputs passed should also include the labels.

from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}
from transformers import Trainer, TrainingArguments

batch_size = 10
logging_steps = len(transcripts_encoded["train"]) // batch_size
model_name = f"{model_checkpoint}-finetuned-transcripts"
training_args = TrainingArguments(output_dir=model_name,
                                 num_train_epochs=2,
                                 learning_rate=2e-5,
                                 per_device_train_batch_size=batch_size,
                                 per_device_eval_batch_size=batch_size,
                                 weight_decay=0.01,
                                 evaluation_strategy="epoch",
                                 disable_tqdm=False,
                                 logging_steps=logging_steps,
                                 push_to_hub=False,
                                 log_level="error")

from transformers import Trainer

trainer = Trainer(model=model, args=training_args,
                 compute_metrics=compute_metrics,
                 train_dataset=transcripts_encoded["train"],
                 eval_dataset=transcripts_encoded["valid"],
                 tokenizer=tokenizer)

trainer.train();

Here is the full traceback:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-124-76d295da3120> in <module>
     24                  tokenizer=tokenizer)
     25 
---> 26 trainer.train();

/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1503             resume_from_checkpoint=resume_from_checkpoint,
   1504             trial=trial,
-> 1505             ignore_keys_for_eval=ignore_keys_for_eval,
   1506         )
   1507 

/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1747                         tr_loss_step = self.training_step(model, inputs)
   1748                 else:
-> 1749                     tr_loss_step = self.training_step(model, inputs)
   1750 
   1751                 if (

/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in training_step(self, model, inputs)
   2506 
   2507         with self.compute_loss_context_manager():
-> 2508             loss = self.compute_loss(model, inputs)
   2509 
   2510         if self.args.n_gpu > 1:

/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs)
   2552             if isinstance(outputs, dict) and "loss" not in outputs:
   2553                 raise ValueError(
-> 2554                     "The model did not return a loss from the inputs, only the following keys: "
   2555                     f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
   2556                 )

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask.

I was expecting it to output the training details (f1, loss, accuracy, etc.). My assumption is that my encoded data with the hidden states is not properly structured for the model to train on with the arguments I’ve set.
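Based on the last line of the traceback (the model only received input_ids and attention_mask), my current guess is that the "Primary Label" column gets dropped because the Trainer doesn’t recognize it as a labels column. Here is an untested sketch of the kind of change I think might be needed, assuming the column has to be renamed to "labels" and the string labels converted to integer ids:

# Untested sketch: maybe the Trainer needs a column literally named "labels"
# holding integer class ids before the model can compute a loss.
transcripts_encoded = transcripts_encoded.rename_column("Primary Label", "labels")
transcripts_encoded = transcripts_encoded.class_encode_column("labels")  # e.g. "cancel" -> some integer id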

UPDATED MODEL CODE: here’s where I’m loading and splitting the data

category_data = load_dataset("csv", data_files="testdatafinal.csv")
category_data = category_data.remove_columns(["someid", "someid", "somedimension"])
category_data = category_data['train']
train_testvalid = category_data.train_test_split(test_size=0.3)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
from datasets.dataset_dict import DatasetDict
cd = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})
print(cd)

DatasetDict({
    train: Dataset({
        features: ['Transcript', 'Primary Label'],
        num_rows: 646
    })
    test: Dataset({
        features: ['Transcript', 'Primary Label'],
        num_rows: 139
    })
    valid: Dataset({
        features: ['Transcript', 'Primary Label'],
        num_rows: 139
    })
})

Here’s where I’m grabbing the model checkpoint

model_checkpoint = 'distilbert-base-uncased'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_checkpoint).to(device)
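As I understand it, this base AutoModel has no classification head and is only used for the hidden-state extraction below, so its output should contain hidden states but no logits and no loss. A quick check of that assumption:

# Checking my assumption: the base model's output should only expose hidden
# states (no logits, no loss), which is why it is only used for feature extraction.
sample = tokenizer("just checking the output keys", return_tensors="pt").to(device)
with torch.no_grad():
    print(model(**sample).keys())  # expecting odict_keys(['last_hidden_state'])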

Here’s where I’m setting the torch format on the encoded text

# set_format modifies the dataset in place (it returns None), so call it and keep a reference
transcripts_encoded.set_format("torch",
                               columns=["input_ids", "attention_mask", "Primary Label"])
transcripts_encoded_one = transcripts_encoded

Here’s where I’m extracting the hidden states and then mapping them as well

def extract_hidden_states(batch):
    # Place model inputs on the GPU/CPU
    inputs = {k: v.to(device) for k, v in batch.items()
              if k in tokenizer.model_input_names}
    # Extract the last hidden states
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    # Return the vector for the [CLS] token
    return {"hidden_state": last_hidden_state[:, 0].cpu().numpy()}

transcripts_hidden = transcripts_encoded.map(extract_hidden_states, batched=True)
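As a sanity check on the mapped output, I print the column names and the shape of the new hidden_state column (assuming a hidden size of 768 for distilbert-base-uncased, the train split should come out around (646, 768)):

import numpy as np

# The new "hidden_state" column should sit alongside the original columns,
# one [CLS] vector per row.
print(transcripts_hidden["train"].column_names)
print(np.array(transcripts_hidden["train"]["hidden_state"]).shape)  # expecting (646, 768)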

Calling AutoModelForSequenceClassification

from transformers import AutoModelForSequenceClassification

num_labels = 10
model =(AutoModelForSequenceClassification
       .from_pretrained(model_checkpoint, num_labels=num_labels)
       .to(device))
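To make sense of the error message, I checked my understanding of when this classification model actually returns a loss, using a hypothetical one-example batch rather than my real data:

import torch

# Untested sketch: the classification head only returns a loss when a `labels`
# tensor is passed to forward(); otherwise the output carries logits only,
# which is what the ValueError above is complaining about.
dummy = tokenizer("a hypothetical transcript", return_tensors="pt").to(device)
with torch.no_grad():
    no_labels = model(**dummy)                                         # no_labels.loss is None
    with_labels = model(**dummy, labels=torch.tensor([0]).to(device))  # with_labels.loss is a tensor
print(no_labels.loss, with_labels.loss)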

Accuracy Metrics

from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

Trainer

from transformers import Trainer, TrainingArguments

batch_size = 10
logging_steps = len(transcripts_encoded_one["train"]) // batch_size
model_name = f"{model_checkpoint}-finetuned-transcripts"
training_args = TrainingArguments(output_dir=model_name,
                                 num_train_epochs=2,
                                 learning_rate=2e-5,
                                 per_device_train_batch_size=batch_size,
                                 per_device_eval_batch_size=batch_size,
                                 weight_decay=0.01,
                                 evaluation_strategy="epoch",
                                 disable_tqdm=False,
                                 logging_steps=logging_steps,
                                 push_to_hub=False,
                                 log_level="error")

from transformers import Trainer

trainer = Trainer(model=model, args=training_args,
                 compute_metrics=compute_metrics,
                 train_dataset=transcripts_encoded_one["train"],
                 eval_dataset=transcripts_encoded_one["valid"],
                 tokenizer=tokenizer)

trainer.train();

I’ve tried passing "transcripts_encoded" (without hidden states) and "transcripts_hidden" (with hidden states) as the train and validation splits, and both produce the same error.

When I inspect my training dataset, I get the following:


trainer.train_dataset[0]

{'Primary Label': 'cancel',
 'input_ids': tensor([  101,  2047,  3446,  2003,  2205,  6450,  2005,  1996,  2051,  1045,
          2064,  5247,  3752,  4790,  1012,  2009,  2001,  2026,  5165,  2000,
          6509,  2017,  2651,   999,  4067,  2017,  2005,  3967,  2075,  1996,
          2047,  2259,  2335,   999,  2031,  1037,  6919,  2717,  1997,  1996,
          2154,   999,  2994,  3647,  1998,  7965,   999,  2065,  2045,  2003,
          2505,  2842,  2057,  2089,  2022,  2583,  2000,  6509,  2017,  2007,
          3531,  2514,  2489,  2000,  3967,  2149,  2153,  1012,  1045,  2001,
          2074,  2667,  2000, 17542,  2026, 15002,  1012,  2038,  2009,  2042,
         13261,  1029,  7632,  1010,  2045,   999,  1045,  3246,  2017,  1005,
          2128,  2725,  2092,  2651,  1012,  4067,  2017,  2005,  3967,  2075,
           102]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1])}

Is this correctly structured to pass forward and train on?