Hi,
I am fine tuning "bert-base-uncased’ model for NEE using “wnut_17”.
For finetuning, I am saving the train and test dataset using .to_json() method of datasets library. However while loading that locally saved file using load_datasets() method, the structure of the attribute are changing.
def tokenize_and_align_labels(examples):
tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
labels = []
for i, label in enumerate(examples[f"ner_tags"]):
word_ids = tokenized_inputs.word_ids(batch_index=i) # Map tokens to their respective word.
previous_word_idx = None
label_ids = []
for word_idx in word_ids: # Set the special tokens to -100.
if word_idx is None:
label_ids.append(-100)
elif word_idx != previous_word_idx: # Only label the first token of a given word.
label_ids.append(label[word_idx])
else:
label_ids.append(-100)
previous_word_idx = word_idx
labels.append(label_ids)
tokenized_inputs["labels"] = labels
return tokenized_inputs
wnut = load_dataset("wnut_17")
tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)
features = tokenized_wnut['train'].features
print("output1-->")
print(features)
tokenized_wnut["train"].to_json("train_ner1.json")
raw_datasets = load_dataset("json", data_files={'train': 'train_ner1.json'})
re_features = raw_datasets['train'].features
print("output2-->")
print(re_features)
output1–>
{‘id’: Value(dtype=‘string’, id=None), ‘tokens’: Sequence(feature=Value(dtype=‘string’, id=None), length=-1, id=None), ‘ner_tags’: Sequence(feature=ClassLabel(num_classes=13, names=[‘O’, ‘B-corporation’, ‘I-corporation’, ‘B-creative-work’, ‘I-creative-work’, ‘B-group’, ‘I-group’, ‘B-location’, ‘I-location’, ‘B-person’, ‘I-person’, ‘B-product’, ‘I-product’], id=None), length=-1, id=None), ‘input_ids’: Sequence(feature=Value(dtype=‘int32’, id=None), length=-1, id=None), ‘attention_mask’: Sequence(feature=Value(dtype=‘int8’, id=None), length=-1, id=None), ‘labels’: Sequence(feature=Value(dtype=‘int64’, id=None), length=-1, id=None)}output2–>
{‘id’: Value(dtype=‘string’, id=None), ‘tokens’: Sequence(feature=Value(dtype=‘string’, id=None), length=-1, id=None), ‘ner_tags’: Sequence(feature=Value(dtype=‘int64’, id=None), length=-1, id=None), ‘input_ids’: Sequence(feature=Value(dtype=‘int64’, id=None), length=-1, id=None), ‘attention_mask’: Sequence(feature=Value(dtype=‘int64’, id=None), length=-1, id=None), ‘labels’: Sequence(feature=Value(dtype=‘int64’, id=None), length=-1, id=None)}
I am not understanding why ner_tags attribute format is changed while exporting dataset into json and importing back. Model expects labels to be string but importing json converts them into integer and getting below error:
if label.startswith(“B-”) and label.replace(“B-”, “I-”) in label_list:
AttributeError: ‘int’ object has no attribute ‘startswith’
-Thanks,
Dinesh