Hey there! I’m trying to solve an NER-like task, in which I have text tokens and a label for each token, in this form:
['i', 'want', 'to', 'fly', 'from', 'boston', 'at', '838', 'am', 'and', 'arrive', 'in', 'denver', 'at', '1110', 'in', 'the', 'morning']
['O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-depart_time.time', 'I-depart_time.time', 'O', 'O', 'O', 'B-toloc.city_name', 'O', 'B-arrive_time.time', 'O', 'O', 'B-arrive_time.period_of_day']
I’ve been following the TokenClassification guide from Hugging Face almost exactly, as it fits this problem perfectly. However, I get this error when calling trainer.train():
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-21-e3bc539dfde3> in <cell line: 1>()
----> 1 trainer_slots.train()
10 frames
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
3057 if size_average is not None or reduce is not None:
3058 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 3059 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
3060
3061
ValueError: Expected input batch_size (16) to match target batch_size (2048).
I’ve noticed that the target batch size is always a multiple of the input batch size (here 2048 = 16 × 128, i.e. my batch size times my max_length), and if I make the batch size bigger, the target batch size grows with it.
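As a quick sanity check on that arithmetic (a minimal sketch; it uses tokenized_dataset_slots, which is built further down in the snippets), the per-example lengths line up with the numbers in the error:

# Shape check (sketch): each example carries one label per token position,
# so a batch of 16 sequences padded to max_length=128 gives 16 * 128 = 2048 label entries.
example = tokenized_dataset_slots["train"][0]
print(len(example["input_ids"]))   # 128
print(len(example["labels"]))      # 128
print(16 * 128)                    # 2048 -- the "target batch_size" in the error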
Here’s my code in some snippets:
# Loading the model name and the tokenizer
from transformers import AutoTokenizer

model_slots_name = 'andgonzalez/bert-uncased-slot-filling'
tokenizer_slots = AutoTokenizer.from_pretrained(model_slots_name)
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer_slots(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True,
    )
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
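To rule out the alignment itself, I also ran the function on a single hand-made example (a quick sketch; the label ids here are just illustrative and already mapped to integers, using the same tokenizer as above), and the output looks as expected:

# Quick alignment check on one toy example (sketch; label ids already mapped).
sample = {
    "text": [["i", "want", "to", "fly", "from", "boston"]],
    "labels": [[0, 0, 0, 0, 0, 1]],
}
out = tokenize_and_align_labels(sample)
print(tokenizer_slots.convert_ids_to_tokens(out["input_ids"][0])[:8])
# ['[CLS]', 'i', 'want', 'to', 'fly', 'from', 'boston', '[SEP]']
print(out["labels"][0][:8])
# [-100, 0, 0, 0, 0, 0, 1, -100]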
# Loading the Dataset into HF Dataset and preprocessing it
train_df_slots = train_df[["text", "slots"]]
dev_df_slots = dev_df[["text", "slots"]]
test_df_slots = test_df[["text", "slots"]]
# Rename the column 'slots' to 'labels'
train_df_slots = train_df_slots.rename(columns={"slots": "labels"})
test_df_slots = test_df_slots.rename(columns={"slots": "labels"})
dev_df_slots = dev_df_slots.rename(columns={"slots": "labels"})
from datasets import Dataset, DatasetDict

# Create a dataset dictionary
dataset_dict = {
    "train": Dataset.from_pandas(train_df_slots),
    "test": Dataset.from_pandas(test_df_slots),
    "dev": Dataset.from_pandas(dev_df_slots),
}
# Turn it into a HuggingFace DatasetDict
dataset_slots = DatasetDict(dataset_dict)
print(dataset_slots)
# Transforming the labels into numerical values with label2id
def label2id_mapping_function(examples):
    examples["labels"] = [label2id_slots[i] for i in examples["labels"]]
    return examples
dataset_slots = dataset_slots.map(label2id_mapping_function)
print(f"Example of instance: {dataset_slots['train'][0]}")
# Tokenize the dataset
tokenized_dataset_slots = dataset_slots.map(tokenize_and_align_labels, batched=True, remove_columns=["text"])
print(f"Example of instance after tokenization: {tokenized_dataset_slots['train'][0]}")
print(f"Example of instance after tokenization: {tokenized_dataset_slots['dev'][0]}")
DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 4978
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 893
    })
    dev: Dataset({
        features: ['text', 'labels'],
        num_rows: 500
    })
})
Example of instance: {'text': ['i', 'want', 'to', 'fly', 'from', 'boston', 'at', '838', 'am', 'and', 'arrive', 'in', 'denver', 'at', '1110', 'in', 'the', 'morning'], 'labels': [0, 0, 0, 0, 0, 0, 1, 0, 2, 3, 0, 0, 0, 4, 0, 5, 0, 0, 6]}
Example of instance after tokenization: {'labels': [-100, 0, 0, 0, 0, 0, 0, 1, 0, -100, 2, 3, 0, 0, 0, 4, 0, -100, 5, 0, 0, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'input_ids': [101, 1045, 2215, 2000, 4875, 2013, 3731, 2012, 6640, 2620, 2572, 1998, 7180, 1999, 7573, 2012, 11118, 2692, 1999, 1996, 2851, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
Example of instance after tokenization: {'labels': [-100, 0, 0, 0, 0, 0, 1, 0, 2, -100, 3, 0, 0, 0, 4, 0, 5, -100, 0, 0, 6, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], 'input_ids': [101, 1045, 2215, 2000, 4875, 2013, 3731, 2012, 6640, 2620, 2572, 1998, 7180, 1999, 7573, 2012, 11118, 2692, 1999, 1996, 2851, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
# Loading the model
from transformers import AutoModelForSequenceClassification

model_slots = AutoModelForSequenceClassification.from_pretrained(
    model_slots_name,
    ignore_mismatched_sizes=True,
    label2id=label2id_slots,
    id2label=id2label_slots,
)
# Defining the Data Collator
from transformers import DataCollatorForTokenClassification
data_collator_slots = DataCollatorForTokenClassification(tokenizer=tokenizer_slots)
# Training
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

training_args_slots = TrainingArguments(
    output_dir="output_slots",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    metric_for_best_model="eval_f1",
    load_best_model_at_end=True,
)
trainer_slots = Trainer(
    model=model_slots,
    args=training_args_slots,
    train_dataset=tokenized_dataset_slots["train"],
    eval_dataset=tokenized_dataset_slots["dev"],
    tokenizer=tokenizer_slots,
    data_collator=data_collator_slots,
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=3, early_stopping_threshold=0.00
        )
    ],
)
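For what it’s worth, I can trigger what looks like the same error outside the Trainer with a single collated batch (a minimal sketch reusing the objects defined above):

# Minimal reproduction outside the Trainer (sketch): one forward pass with a
# single collated batch of 16 examples hits the same cross_entropy ValueError.
import torch

features = [tokenized_dataset_slots["train"][i] for i in range(16)]
batch = data_collator_slots(features)
print({k: tuple(v.shape) for k, v in batch.items()})
# e.g. {'input_ids': (16, 128), ..., 'labels': (16, 128)}

with torch.no_grad():
    outputs = model_slots(**batch)  # raises the ValueError in the loss computation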
I’ve seen a few similar posts, but none of the proposed answers helped in this case. Has this ever happened to you? I’d love some insight. Thanks in advance.