I’m unable to properly pass my encoded data (with hidden states) through Trainer via Huggingface. Below is the call to Trainer with arguments and the full traceback. I’m not really sure where to begin with this error as I believe I’ve satisfied all requirements to pass the encoded data forward unless the inputs passed should include the labels.
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
labels = pred.label_ids
pred = pred.predictions.argmax(-1)
f1 = f1_score(labels, pred, average="weighted")
acc = accuracy_score(labels, preds)
return {"accuracy": acc, "f1": f1}
from transformers import Trainer, TrainingArguments
batch_size = 10
logging_steps = len(transcripts_encoded["train"]) // batch_size
model_name = f"{model_checkpoint}-finetuned-transcripts"
training_args = TrainingArguments(output_dir=model_name,
num_train_epochs=2,
learning_rate=2e-5,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
weight_decay=0.01,
evaluation_strategy="epoch",
disable_tqdm=False,
logging_steps=logging_steps,
push_to_hub=False,
log_level="error")
from transformers import Trainer
trainer = Trainer(model=model, args=training_args,
compute_metrics=compute_metrics,
train_dataset=transcripts_encoded["train"],
eval_dataset=transcripts_encoded["valid"],
tokenizer=tokenizer)
trainer.train();
Here is the full traceback:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-124-76d295da3120> in <module>
24 tokenizer=tokenizer)
25
---> 26 trainer.train();
/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1503 resume_from_checkpoint=resume_from_checkpoint,
1504 trial=trial,
-> 1505 ignore_keys_for_eval=ignore_keys_for_eval,
1506 )
1507
/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1747 tr_loss_step = self.training_step(model, inputs)
1748 else:
-> 1749 tr_loss_step = self.training_step(model, inputs)
1750
1751 if (
/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in training_step(self, model, inputs)
2506
2507 with self.compute_loss_context_manager():
-> 2508 loss = self.compute_loss(model, inputs)
2509
2510 if self.args.n_gpu > 1:
/opt/conda/lib/python3.7/site-packages/transformers/trainer.py in compute_loss(self, model, inputs, return_outputs)
2552 if isinstance(outputs, dict) and "loss" not in outputs:
2553 raise ValueError(
-> 2554 "The model did not return a loss from the inputs, only the following keys: "
2555 f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}."
2556 )
ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask.
I was expecting to for it to the training details (f1, loss, accuracy etc). My assumption is that my encoded data with the hidden states is not properly structured for the model to train per the arguments set.
UPDATED MODEL CODE: here’s where I’m loading and splitting
category_data = load_dataset("csv", data_files="testdatafinal.csv")
category_data = category_data.remove_columns(["someid", "someid", "somedimension"])
category_data = category_data['train']
train_testvalid = category_data.train_test_split(test_size=0.3)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
from datasets.dataset_dict import DatasetDict
cd = DatasetDict({
'train': train_testvalid['train'],
'test': test_valid['test'],
'valid': test_valid['train']})
print(cd)
DatasetDict({
train: Dataset({
features: ['Transcript', 'Primary Label'],
num_rows: 646
})
test: Dataset({
features: ['Transcript', 'Primary Label'],
num_rows: 139
})
valid: Dataset({
features: ['Transcript', 'Primary Label'],
num_rows: 139
})
})
Here’s where I’m grabbing the model checkpoint
model_checkpoint = 'distilbert-base-uncased'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained(model_checkpoint).to(device)
Here’s where I’m mapping the encoded text
transcripts_encoded_one = transcripts_encoded.set_format("torch",
columns=["input_ids", "attention_mask", "Primary Label"])
Here’s where i’m extracting hidden states and then mapping as well
def extract_hidden_states(batch):
#Place model inputs on the GPU/CPU
inputs = {k:v.to(device) for k, v in batch.items()
if k in tokenizer.model_input_names}
#Extract last hidden states
with torch.no_grad():
last_hidden_state = model(**inputs).last_hidden_state
# Return vecot for [CLS] Token
return {"hidden_state": last_hidden_state[:,0].cpu().numpy()}
transcripts_hidden = transcripts_encoded.map(extract_hidden_states, batched=True)
Calling AutoModel
from transformers import AutoModelForSequenceClassification
num_labels = 10
model =(AutoModelForSequenceClassification
.from_pretrained(model_checkpoint, num_labels=num_labels)
.to(device))
Accuracy Metrics
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(pred):
labels = pred.label_ids
pred = pred.predictions.argmax(-1)
f1 = f1_score(labels, pred, average="weighted")
acc = accuracy_score(labels, preds)
return {"accuracy": acc, "f1": f1}
Trainer
from transformers import Trainer, TrainingArguments
batch_size = 10
logging_steps = len(transcripts_encoded_one["train"]) // batch_size
model_name = f"{model_checkpoint}-finetuned-transcripts"
training_args = TrainingArguments(output_dir=model_name,
num_train_epochs=2,
learning_rate=2e-5,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
weight_decay=0.01,
evaluation_strategy="epoch",
disable_tqdm=False,
logging_steps=logging_steps,
push_to_hub=False,
log_level="error")
from transformers import Trainer
trainer = Trainer(model=model, args=training_args,
compute_metrics=compute_metrics,
train_dataset=transcripts_encoded_one["train"],
eval_dataset=transcripts_encoded_one["valid"],
tokenizer=tokenizer)
trainer.train();
I’ve tried passing "transcripts_encoded(without hidden states) and "transcripts_hidden (with hidden states) as the train and validation splits and both produce the same error
When I inspect my training dataset I get the following
trainer.train_dataset[0]
{'Primary Label': 'cancel',
'input_ids': tensor([ 101, 2047, 3446, 2003, 2205, 6450, 2005, 1996, 2051, 1045,
2064, 5247, 3752, 4790, 1012, 2009, 2001, 2026, 5165, 2000,
6509, 2017, 2651, 999, 4067, 2017, 2005, 3967, 2075, 1996,
2047, 2259, 2335, 999, 2031, 1037, 6919, 2717, 1997, 1996,
2154, 999, 2994, 3647, 1998, 7965, 999, 2065, 2045, 2003,
2505, 2842, 2057, 2089, 2022, 2583, 2000, 6509, 2017, 2007,
3531, 2514, 2489, 2000, 3967, 2149, 2153, 1012, 1045, 2001,
2074, 2667, 2000, 17542, 2026, 15002, 1012, 2038, 2009, 2042,
13261, 1029, 7632, 1010, 2045, 999, 1045, 3246, 2017, 1005,
2128, 2725, 2092, 2651, 1012, 4067, 2017, 2005, 3967, 2075,
102]),
'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1])}
I believe this is correct to pass forward and train?