I am trying to create a custom dataset using flat files. The data elements are 'Tokens' and 'NER Tags' strings. I am able to create the ClassLabels for the training dataset and now want to apply the same ClassLabels to the test and validation datasets. I have tried several approaches, but none have worked. Any help or insight will be greatly appreciated. The code segment is below:
def convert_to_featured_dataset(data):
    """Build a ``Dataset`` with ``tokens`` and ``ner_tags`` columns from *data*.

    Args:
        data: Iterable of sentences. Each sentence is an iterable of
            indexable items; index 0 is read as the NER tag and index 1
            as the token.

    Returns:
        The result of ``Dataset.from_dict`` over the collected columns,
        created with the features declared below.
    """
    # NOTE(review): both columns are declared as scalar strings, while every
    # row appended below is a list of strings -- confirm this schema is the
    # intended one.
    features = Features(
        {"tokens": Value(dtype="string"), "ner_tags": Value(dtype="string")}
    )
    formatted_data = {"tokens": [], "ner_tags": []}
    for sentence in data:
        formatted_data["tokens"].append(
            [token_data[1] for token_data in sentence]
        )
        formatted_data["ner_tags"].append(
            [token_data[0] for token_data in sentence]
        )
    return Dataset.from_dict(formatted_data, features=features)
...
...
# Works (per the author): build the training split, then encode the
# "ner_tags" column with class_encode_column.
train_dataset = convert_to_featured_dataset(train_data)
train_dataset = train_dataset.class_encode_column("ner_tags")
# Keep the encoded column's feature and a copy of the full feature mapping;
# both are referenced again when handling the test split.
label_feature = train_dataset.features["ner_tags"]
test_features = train_dataset.features.copy()
...
...
#Does not work
def convert_ids(rec):
    """Replace the string tags in ``rec["ner_tags"]`` with class-label ids.

    Every entry of ``rec["ner_tags"]`` is passed to
    ``label_feature.str2int`` on its own, so an entry may be either a single
    tag string or a list of tag strings.

    Passing the whole value to ``str2int`` in one call failed with
    ``ValueError: Invalid string class label ['O', 'O', 'B-RATINGS_AVERAGE',
    ...]``, i.e. a whole list was treated as one label.
    NOTE(review): presumably under ``batched=True`` each entry is one
    sentence's tag list -- confirm against the actual column contents.

    Args:
        rec: Mapping with a ``"ner_tags"`` entry (a single example or a
            batch); it is modified in place.

    Returns:
        The same ``rec`` object with ``"ner_tags"`` replaced by ids.
    """
    rec["ner_tags"] = [label_feature.str2int(tags) for tags in rec["ner_tags"]]
    return rec
test_dataset = convert_to_featured_dataset(test_data)
# Earlier attempts, reported as not working
# (ValueError: Invalid string class label ['O'...):
#   test_dataset = test_dataset.map(convert_ids, batched=True)
#   test_features["ner_tags"] = ClassLabel(label_feature.feature.names)
#   test_dataset = test_dataset.cast(test_features)
# Current attempt: cast the tag column to the feature taken from the
# training split.
test_dataset = test_dataset.cast_column(
    column="ner_tags",
    feature=label_feature,
)
Thank you for your time and help.