[NER] Fine-tune with custom dataset - IndexError: Target out of bounds

Hi,

I’ve been following the Main NLP Tasks – Token classification course; everything went well, each step was clear, and training was successful. Then I tried to use custom data loaded from generated JSON files, and after working through a few problems I reached the last part: training.

I’m not sure, but I think the issue comes from my custom dataset, because when I use another dataset such as Jean-Baptiste/wikiner_fr, no error is thrown. Here is an example of what my JSON files look like:

{
	"data": [
		{
			"id": 0,
			"tokens": [
				"39",
				"Rue",
				"de",
				"la",
				"Victoire",
				"76589",
				"Mont-Luçons"
			],
			"ner_tags": [
				17,
				18,
				18,
				18,
				18,
				21,
				19
			]
		},
        ...
    ]
}

Error: this is the traceback I get when training on my custom dataset loaded from the JSON files.

Traceback (most recent call last):
  File "/Users/.../src/main.py", line 176, in <module>
    trainer.train();
  File "/Users/.../.env/lib/python3.9/site-packages/transformers/trainer.py", line 1400, in train
    tr_loss_step = self.training_step(model, inputs)
  File "/Users/.../.env/lib/python3.9/site-packages/transformers/trainer.py", line 1984, in training_step
    loss = self.compute_loss(model, inputs)
  File "/Users/.../.env/lib/python3.9/site-packages/transformers/trainer.py", line 2016, in compute_loss
    outputs = model(**inputs)
  File "/Users/.../.env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/.../.env/lib/python3.9/site-packages/transformers/models/roberta/modeling_roberta.py", line 1417, in forward
    loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
  File "/Users/.../.env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/.../.env/lib/python3.9/site-packages/torch/nn/modules/loss.py", line 1163, in forward
    return F.cross_entropy(input, target, weight=self.weight,
  File "/Users/.../.env/lib/python3.9/site-packages/torch/nn/functional.py", line 2996, in cross_entropy
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
IndexError: Target 33 is out of bounds.

So the main point is the IndexError: Target 33 is out of bounds.
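With 33 labels (see _NER_LABEL_NAMES in the code below), the valid class ids for the loss are 0 to 32, plus -100, which cross-entropy ignores; a target of 33 therefore means at least one ner_tag in the batch falls outside the label list. The error is easy to reproduce in isolation; the snippet below is only a sketch with random logits, not code from the training run:

import torch
import torch.nn.functional as F

logits = torch.randn(4, 33)  # 4 tokens, one score per label
F.cross_entropy(logits, torch.tensor([0, 17, 32, 18]))  # fine: every target is within 0..32
F.cross_entropy(logits, torch.tensor([0, 17, 33, 18]))  # IndexError: Target 33 is out of bounds.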

The code:

from datasets import load_dataset, load_metric
import numpy as np
from transformers import CamembertTokenizerFast, CamembertForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer

_DATASET_FILE_TYPE = 'json'
_MODEL_CHECKPOINT = 'camembert-base'
_NER_LABEL_NAMES = [
	'O', # 0
	'B-LOC', # 1
	'I-LOC', # 2
	'B-PER', # 3
	'I-PER', # 4
	'B-MISC', # 5
	'I-MISC', # 6
	'B-ORG', # 7
	'I-ORG', # 8
	'B-TRE', # 9
	'I-TRE', # 10
	'B-CIV', # 11
	'I-CIV', # 12
	'B-NOM', # 13
	'I-NOM', # 14
	'B-PRE', # 15
	'I-PRE', # 16
	'B-ADR', # 17
	'I-ADR', # 18
	'B-CITY', # 19
	'I-CITY', # 20
	'B-CP', # 21
	'I-CP', # 22
	'B-CTC', # 23
	'I-CTC', # 24
	'B-ETAB', # 25
	'I-ETAB', # 26
	'B-PID', # 27
	'I-PID', # 28
	'B-SID', # 29
	'I-SID', # 30
	'B-FID', # 31
	'I-FID' # 32
]
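# 33 labels in total, so every ner_tag in the data must be in the range 0..32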

def align_labels_with_tokens(labels, word_ids):
	new_labels = []
	current_word = None
	for word_id in word_ids:
		if word_id != current_word:
			# Start of a new word
			current_word = word_id
			label = -100 if word_id is None else labels[word_id]
			new_labels.append(label)
		elif word_id is None:
			# Special token
			new_labels.append(-100)
		else:
			# Same word as previous token
			label = labels[word_id]
			# If the label is B-XXX we change it to I-XXX
			if label % 2 == 1:
				label += 1
			new_labels.append(label)

	return new_labels
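# A hypothetical walk-through of align_labels_with_tokens (the word_ids below
# are made up, as if a fast tokenizer split "Mont-Luçons" into two subword
# pieces, with None marking the special tokens):
#   align_labels_with_tokens([17, 18, 18, 18, 18, 21, 19],
#                            [None, 0, 1, 2, 3, 4, 5, 6, 6, None])
#   -> [-100, 17, 18, 18, 18, 18, 21, 19, 20, -100]
# The B-XXX -> I-XXX shift on repeated word ids works because B tags have odd
# ids and the matching I tag sits right after them in _NER_LABEL_NAMES
# (here B-CITY = 19 becomes I-CITY = 20).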

def tokenize_and_align_labels(items):
	tokenized_inputs = tokenizer(
		items["tokens"], truncation=True, is_split_into_words=True
	)
	all_labels = items["ner_tags"]
	new_labels = []
	for index, labels in enumerate(all_labels):
		word_ids = tokenized_inputs.word_ids(index)
		new_labels.append(align_labels_with_tokens(labels, word_ids))
	tokenized_inputs["labels"] = new_labels

	return tokenized_inputs

def compute_metrics(eval_preds):
	logits, labels = eval_preds
	predictions = np.argmax(logits, axis=-1)

	# Remove ignored index (special tokens) and convert to labels
	true_labels = [[_NER_LABEL_NAMES[l] for l in label if l != -100] for label in labels]
	true_predictions = [
		[_NER_LABEL_NAMES[p] for (p, l) in zip(prediction, label) if l != -100]
		for prediction, label in zip(predictions, labels)
	]
	all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
	return {
		"precision": all_metrics["overall_precision"],
		"recall": all_metrics["overall_recall"],
		"f1": all_metrics["overall_f1"],
		"accuracy": all_metrics["overall_accuracy"],
	}

metric = load_metric("seqeval")
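# (In recent versions of datasets, load_metric is deprecated; the replacement is
# evaluate.load("seqeval") from the separate evaluate package.)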

dataset = load_dataset(
	_DATASET_FILE_TYPE,
	data_files={
		'train': '../../ai-data-entities-generation/outputHuggingFace/dataset-test-b283.json',
		'test': '../../ai-data-entities-generation/outputHuggingFace/dataset-train-ea41.json',
		'validation': '../../ai-data-entities-generation/outputHuggingFace/dataset-validation-218a.json'
	},
	field='data'
)
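# Note: the 'train' split points at a dataset-test-*.json file and 'test' at a
# dataset-train-*.json file; worth double-checking that this mapping is intentional.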

# Load the tokenizer from the pretrained camemBERT
tokenizer = CamembertTokenizerFast.from_pretrained(_MODEL_CHECKPOINT, padding=True)

tokenized_records = dataset.map(
	tokenize_and_align_labels,
	batched=True,
	remove_columns=dataset['train'].column_names
)
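# After the map, each split only keeps the model inputs (input_ids,
# attention_mask and labels); tokens, ner_tags and id were removed above.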

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

idToLabel = {str(index): label for index, label in enumerate(_NER_LABEL_NAMES)}
labelToId = {value: key for key, value in idToLabel.items()}

model = CamembertForTokenClassification.from_pretrained(
	_MODEL_CHECKPOINT,
	id2label=idToLabel,
	label2id=labelToId,
	num_labels=len(_NER_LABEL_NAMES)
)
# Sanity check: the model's num_labels must match the size of _NER_LABEL_NAMES
print(f'{model.config.num_labels} must match: {len(_NER_LABEL_NAMES)}')
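# A hard check (a suggested addition, not in the original script) would stop the
# run early if these two ever diverge:
# assert model.config.num_labels == len(_NER_LABEL_NAMES)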
trainingArgs = TrainingArguments(
	output_dir='samm-camembert-finetuned-ner',
	evaluation_strategy='epoch',
	# save_strategy='epoch',
	# learning_rate=2e-5,
	# num_train_epochs=3,
	# weight_decay=0.01,
	# push_to_hub=False
)
trainer = Trainer(
	model=model,
	args=trainingArgs,
	train_dataset=tokenized_records['train'],
	eval_dataset=tokenized_records['validation'],
	# data_collator=data_collator,
	# compute_metrics=compute_metrics,
	tokenizer=tokenizer
)
trainer.train()

More useful information:

  • Using the latest available version of transformers
  • Python 3.9.10
  • Running in a virtual env

What’s the length of idToLabel and labelToId?

Both lengths are 33, so the valid label ids go from 0 to 32.

Up! Help needed.

Well, after looking for a bug in the code, it turned out to be the data I had been using: some of the generated ner_tags fell outside the 0 to 32 range. I regenerated the data in the correct format, and training is finally running.
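For reference, a quick pass over the raw JSON before training catches this kind of problem immediately. The snippet below is only a sketch: check_tags is a made-up helper, and it assumes the file structure shown at the top of this post.

import json

def check_tags(path, num_labels=33):
	# 33 labels means valid ner_tag ids are 0..32; anything else crashes the loss
	with open(path) as f:
		records = json.load(f)['data']
	for record in records:
		bad = [tag for tag in record['ner_tags'] if not 0 <= tag < num_labels]
		if bad:
			print(f"id {record['id']}: out-of-range tags {bad}")

for split_file in ['dataset-train-ea41.json', 'dataset-test-b283.json', 'dataset-validation-218a.json']:
	check_tags(f'../../ai-data-entities-generation/outputHuggingFace/{split_file}')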