[NER] Fine-tune with custom dataset - Index Error: Target out of bounds

Hi,

I’ve been following the Main NLP Tasks – Token classification course, everything was good, each step is clear and training was successful. Then I’ve tried to use custom data that I load from generated json files and after facing some problems, I reach the last part: Training.

I’m not sure, but I think this issue is due to my custom datasets, because when I use another dataset like the Jean-Baptiste/wikiner_fr, no error is throw. Here is a example of what my JSON files look like:

{
	"data": [
		{
			"id": 0,
			"tokens": [
				"39",
				"Rue",
				"de",
				"la",
				"Victoire",
				"76589",
				"Mont-Luçons"
			],
			"ner_tags": [
				17,
				18,
				18,
				18,
				18,
				21,
				19
			]
		},
        ...
    ]
}

Error: I’m facing the following issue with my custom dataset loaded from JSON files.

Traceback (most recent call last):
  File "/Users/.../src/main.py", line 176, in <module>
    trainer.train();
  File "/Users/.../.env/lib/python3.9/site-packages/transformers/trainer.py", line 1400, in train
    tr_loss_step = self.training_step(model, inputs)
  File "/Users/.../.env/lib/python3.9/site-packages/transformers/trainer.py", line 1984, in training_step
    loss = self.compute_loss(model, inputs)
  File "/Users/.../.env/lib/python3.9/site-packages/transformers/trainer.py", line 2016, in compute_loss
    outputs = model(**inputs)
  File "/Users/.../.env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/.../.env/lib/python3.9/site-packages/transformers/models/roberta/modeling_roberta.py", line 1417, in forward
    loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
  File "/Users/.../.env/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/.../.env/lib/python3.9/site-packages/torch/nn/modules/loss.py", line 1163, in forward
    return F.cross_entropy(input, target, weight=self.weight,
  File "/Users/.../.env/lib/python3.9/site-packages/torch/nn/functional.py", line 2996, in cross_entropy
    return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
IndexError: Target 33 is out of bounds.

So the main point is the IndexError: Target 33 is out of bounds.

The code:

from datasets import load_dataset, load_metric
import numpy as np
from transformers import CamembertTokenizerFast, CamembertForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer

_DATASET_FILE_TYPE = 'json';
_MODEL_CHECKPOINT = 'camembert-base';
_NER_LABEL_NAMES = [
	'O', # 0
	'B-LOC', # 1
	'I-LOC', # 2
	'B-PER', # 3
	'I-PER', # 4
	'B-MISC', # 5
	'I-MISC', # 6
	'B-ORG', # 7
	'I-ORG', # 8
	'B-TRE', # 9
	'I-TRE', # 10
	'B-CIV', # 11
	'I-CIV', # 12
	'B-NOM', # 13
	'I-NOM', # 14
	'B-PRE', # 15
	'I-PRE', # 16
	'B-ADR', # 17
	'I-ADR', # 18
	'B-CITY', # 19
	'I-CITY', # 20
	'B-CP', # 21
	'I-CP', # 22
	'B-CTC', # 23
	'I-CTC', # 24
	'B-ETAB', # 25
	'I-ETAB', # 26
	'B-PID', # 27
	'I-PID', # 28
	'B-SID', # 29
	'I-SID', # 30
	'B-FID', # 31
	'I-FID' # 32
];

def align_labels_with_tokens(labels, word_ids):
	new_labels = []
	current_word = None
	for word_id in word_ids:
		if word_id != current_word:
			# Start of a new word
			current_word = word_id
			label = -100 if word_id is None else labels[word_id]
			new_labels.append(label)
		elif word_id is None:
			# Special token
			new_labels.append(-100)
		else:
			# Same word as previous token
			label = labels[word_id]
			# If the label is B-XXX we change it to I-XXX
			if label % 2 == 1:
				label += 1
			new_labels.append(label)

	return new_labels

def tokenize_and_align_labels(items):
	tokenized_inputs = tokenizer(
		items["tokens"], truncation=True, is_split_into_words=True
	)
	all_labels = items["ner_tags"]
	new_labels = []
	for index, labels in enumerate(all_labels):
		word_ids = tokenized_inputs.word_ids(index)
		new_labels.append(align_labels_with_tokens(labels, word_ids))
	tokenized_inputs["labels"] = new_labels

	return tokenized_inputs

def compute_metrics(eval_preds):
	logits, labels = eval_preds
	predictions = np.argmax(logits, axis=-1)

	# Remove ignored index (special tokens) and convert to labels
	true_labels = [[_NER_LABEL_NAMES[l] for l in label if l != -100] for label in labels]
	true_predictions = [
		[_NER_LABEL_NAMES[p] for (p, l) in zip(prediction, label) if l != -100]
		for prediction, label in zip(predictions, labels)
	]
	all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
	return {
		"precision": all_metrics["overall_precision"],
		"recall": all_metrics["overall_recall"],
		"f1": all_metrics["overall_f1"],
		"accuracy": all_metrics["overall_accuracy"],
	};

metric = load_metric("seqeval")

dataset = load_dataset(
	_DATASET_FILE_TYPE,
	data_files={
		'train': '../../ai-data-entities-generation/outputHuggingFace/dataset-test-b283.json',
		'test': '../../ai-data-entities-generation/outputHuggingFace/dataset-train-ea41.json',
		'validation': '../../ai-data-entities-generation/outputHuggingFace/dataset-validation-218a.json'
	},
	field='data'
);

# Load the tokenizer from the pretrained camemBERT
tokenizer = CamembertTokenizerFast.from_pretrained(_MODEL_CHECKPOINT, padding=True);

tokenized_records = dataset.map(
	tokenize_and_align_labels,
	batched=True,
	remove_columns=dataset['train'].column_names
);

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer);

idToLabel = {str(index): label for index, label in enumerate(_NER_LABEL_NAMES)};
labelToId = {value: key for key, value in idToLabel.items()};

model = CamembertForTokenClassification.from_pretrained(
	_MODEL_CHECKPOINT,
	id2label=idToLabel,
	label2id=labelToId,
	num_labels=len(_NER_LABEL_NAMES)
);
# Check if the size of labels is equal to _NER_LABEL_NAMES
print(f'{model.config.num_labels} must match: {len(_NER_LABEL_NAMES)}');
trainingArgs = TrainingArguments(
	output_dir='samm-camembert-finetuned-ner',
	evaluation_strategy='epoch',
	# save_strategy='epoch',
	# learning_rate=2e-5,
	# num_train_epochs=3,
	# weight_decay=0.01,
	# push_to_hub=False
);
trainer = Trainer(
	model=model,
	args=trainingArgs,
	train_dataset=tokenized_records['train'],
	eval_dataset=tokenized_records['validation'],
	# data_collator=data_collator,
	# compute_metrics=compute_metrics,
	tokenizer=tokenizer
);
trainer.train();

More useful information:

  • Using the latest version available for transformers
  • Python 3.9.10
  • Running in a virtual env

What’s the length of idToLabel and labelToId?

Both length are 33.

Up ! Help needed.

Well, after looking for a bug in the code, it finally was the data I’ve been using. So I regenerate the data well formatted, and the training is finally running.

what is the issue in data and how you formatted

Hi

I’m having the same Issue, it would be great if you can share what was the issue in the data and how you formatted.