Hi,
I’ve been following the Main NLP Tasks – Token classification course. Everything went well: each step was clear and training was successful. Then I tried to use custom data that I load from generated JSON files, and after working through a few problems, I reached the last part: training.
I’m not sure, but I think the issue comes from my custom datasets, because when I use another dataset like Jean-Baptiste/wikiner_fr, no error is thrown. Here is an example of what my JSON files look like:
{
  "data": [
    {
      "id": 0,
      "tokens": ["39", "Rue", "de", "la", "Victoire", "76589", "Mont-Luçons"],
      "ner_tags": [17, 18, 18, 18, 18, 21, 19]
    },
    ...
  ]
}
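Since I suspect the data, a quick sanity check on a raw file seemed useful (the path below is just a placeholder for one of my generated files): it reports the highest ner_tags value, which has to stay strictly below the number of labels the model is configured with.

import json

# Placeholder path; the file has the same structure as the example above.
with open('outputHuggingFace/dataset-train-ea41.json', encoding='utf-8') as f:
    records = json.load(f)['data']

max_tag = max(tag for record in records for tag in record['ner_tags'])
print(f'Highest ner_tag found: {max_tag}')  # must be < number of labels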
Error: with my custom dataset loaded from these JSON files, training fails with the following traceback:
Traceback (most recent call last):
  File "/Users/.../src/main.py", line 176, in <module>
    trainer.train();
  File "/Users/.../.env/lib/python3.9/site-packages/transformers/trainer.py", line 1400, in train
    tr_loss_step = self.training_step(model, inputs)
  File "/Users/.../.env/lib/python3.9/site-packages/transformers/trainer.py", line 1984, in training_step
    loss = self.compute_loss(model, inputs)
  File "/Users/.../.env/lib/python3.9/site-packages/transformers/trainer.py", line 2016, in compute_loss
    outputs = model(**inputs)
  File "/Users/.../.env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/.../.env/lib/python3.9/site-packages/transformers/models/roberta/modeling_roberta.py", line 1417, in forward
    loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
  File "/Users/.../.env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/.../.env/lib/python3.9/site-packages/torch/nn/modules/loss.py", line 1163, in forward
    return F.cross_entropy(input, target, weight=self.weight,
  File "/Users/.../.env/lib/python3.9/site-packages/torch/nn/functional.py", line 2996, in cross_entropy
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
IndexError: Target 33 is out of bounds.
So the main point is the IndexError: Target 33 is out of bounds.
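If I understand the error correctly, F.cross_entropy raises it whenever a target class index is greater than or equal to the number of classes in the logits. A minimal reproduction, independent of my dataset:

import torch
import torch.nn.functional as F

logits = torch.randn(4, 33)             # 33 classes, so valid targets are 0..32
targets = torch.tensor([0, 5, 32, 33])  # 33 is one past the last valid index
F.cross_entropy(logits, targets)        # IndexError: Target 33 is out of bounds.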
The code:
from datasets import load_dataset, load_metric
import numpy as np
from transformers import (
    CamembertTokenizerFast,
    CamembertForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)

_DATASET_FILE_TYPE = 'json'
_MODEL_CHECKPOINT = 'camembert-base'
_NER_LABEL_NAMES = [
    'O',        # 0
    'B-LOC',    # 1
    'I-LOC',    # 2
    'B-PER',    # 3
    'I-PER',    # 4
    'B-MISC',   # 5
    'I-MISC',   # 6
    'B-ORG',    # 7
    'I-ORG',    # 8
    'B-TRE',    # 9
    'I-TRE',    # 10
    'B-CIV',    # 11
    'I-CIV',    # 12
    'B-NOM',    # 13
    'I-NOM',    # 14
    'B-PRE',    # 15
    'I-PRE',    # 16
    'B-ADR',    # 17
    'I-ADR',    # 18
    'B-CITY',   # 19
    'I-CITY',   # 20
    'B-CP',     # 21
    'I-CP',     # 22
    'B-CTC',    # 23
    'I-CTC',    # 24
    'B-ETAB',   # 25
    'I-ETAB',   # 26
    'B-PID',    # 27
    'I-PID',    # 28
    'B-SID',    # 29
    'I-SID',    # 30
    'B-FID',    # 31
    'I-FID',    # 32
]
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
    return new_labels
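For reference, on the first record above I would expect the alignment to behave like this (the word_ids below are invented for illustration; CamemBERT’s tokenizer may split the words differently):

# Hypothetical word_ids for the first record, assuming the tokenizer
# splits "Mont-Luçons" into three sub-tokens and adds <s>/</s> specials.
labels = [17, 18, 18, 18, 18, 21, 19]
word_ids = [None, 0, 1, 2, 3, 4, 5, 6, 6, 6, None]
print(align_labels_with_tokens(labels, word_ids))
# Expected: [-100, 17, 18, 18, 18, 18, 21, 19, 20, 20, -100]
# (19 = B-CITY becomes 20 = I-CITY on the continuation sub-tokens)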
def tokenize_and_align_labels(items):
    tokenized_inputs = tokenizer(
        items["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = items["ner_tags"]
    new_labels = []
    for index, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(index)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[_NER_LABEL_NAMES[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [_NER_LABEL_NAMES[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": all_metrics["overall_precision"],
        "recall": all_metrics["overall_recall"],
        "f1": all_metrics["overall_f1"],
        "accuracy": all_metrics["overall_accuracy"],
    }
metric = load_metric("seqeval")

dataset = load_dataset(
    _DATASET_FILE_TYPE,
    data_files={
        'train': '../../ai-data-entities-generation/outputHuggingFace/dataset-test-b283.json',
        'test': '../../ai-data-entities-generation/outputHuggingFace/dataset-train-ea41.json',
        'validation': '../../ai-data-entities-generation/outputHuggingFace/dataset-validation-218a.json'
    },
    field='data'
)
# Load the tokenizer from the pretrained CamemBERT checkpoint
tokenizer = CamembertTokenizerFast.from_pretrained(_MODEL_CHECKPOINT, padding=True)

tokenized_records = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset['train'].column_names
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

idToLabel = {str(index): label for index, label in enumerate(_NER_LABEL_NAMES)}
labelToId = {value: key for key, value in idToLabel.items()}

model = CamembertForTokenClassification.from_pretrained(
    _MODEL_CHECKPOINT,
    id2label=idToLabel,
    label2id=labelToId,
    num_labels=len(_NER_LABEL_NAMES)
)

# Check that the model's label count matches len(_NER_LABEL_NAMES)
print(f'{model.config.num_labels} must match: {len(_NER_LABEL_NAMES)}')

trainingArgs = TrainingArguments(
    output_dir='samm-camembert-finetuned-ner',
    evaluation_strategy='epoch',
    # save_strategy='epoch',
    # learning_rate=2e-5,
    # num_train_epochs=3,
    # weight_decay=0.01,
    # push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=trainingArgs,
    train_dataset=tokenized_records['train'],
    eval_dataset=tokenized_records['validation'],
    # data_collator=data_collator,
    # compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

trainer.train()
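For debugging, a check like this just before trainer.train() should reveal whether any tokenized example carries a label outside the model’s range (this snippet is my own addition, not part of the course code):

# Debugging aid: find labels outside 0..num_labels-1 (-100 is the
# ignore index used for special tokens, so it is allowed).
num_labels = len(_NER_LABEL_NAMES)
for split in tokenized_records:
    bad = {
        label
        for example in tokenized_records[split]
        for label in example['labels']
        if label != -100 and not 0 <= label < num_labels
    }
    if bad:
        print(f"{split}: out-of-range labels {sorted(bad)}")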
More useful information:
- Using the latest available version of transformers
- Python 3.9.10
- Running in a virtual env