I’m trying to train the bert-base-romanian-cased-v1 model on a custom dataset. Here is a sample of my dataset:
{"tokens":["In","țesuturile","extrapulmonare","rezultă","CO2","care","pătrunde","în","eritrocite","unde","se","formează","HCO3","si","H+","In","eritrocit","reactia","CO2+H2O","H2CO3","este","catalizata","de","o","enzima","numita","anhidraza","carbonica","H2CO3","HCO3+","H+","HCO3","poate","traversa","membrana","eritrocitara","in","timp","ce","H+","va","fi","neutralizat","in","eritrocit","conform","reactiei","HbO2","+","H+","HHb+","+","O2","In","acest","fel","hemoglobina","cedeaza","O2","tesuturilor","atasand","in","schimb","protoni","Prin","cresterea","concentratiei","de","protoni","scaderea","ph","efectul","Bohr","duce","la","eliberarea","oxigenului","in","timp","ce","cresterea","presiunii","partiale","a","O2","in","plaman","duce","la","eliberarea","H+","In","capilarele","pulmonare","procesul","are","loc","invers","datorita","presiunii","partiale","crescute","a","O2","cu","eliberarea","CO2","care","este","expirat"],"id":4,"space_after":[true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true],"ner_tags":["O","O","OCIT","O","OCIT","OCIT","OCIT","O","OCIT","O","O","OCIT","OCIT","O","OCIT","O","OCIT","OCIT","OCIT","OCIT","O","OCIT","O","O","O","OCIT","OCIT","OCIT","OCIT","OCIT","OCIT","OCIT","O","OCIT","O","OCIT","O","O","O","OCIT","O","OCIT","OCIT","O","OCIT","O","OCIT","OCIT","O","OCIT","OCIT","O","OCIT","O","O","O","OCIT","OCIT","OCIT","OCIT","OCIT","O","O","O","O","OCIT","OCIT","O","O","OCIT","OCIT","O","OCIT","OCIT","O","O","O","O","O","O","OCIT","O","OCIT","OCIT","OCIT","O","OCIT","OCIT","O","O","OCIT","O","OCIT","OCIT","O","OCIT","O","O","OCIT","O","OCIT","OCIT","OCIT","OCIT","O","O","OCIT","OCIT","O","OCIT"],"ner_ids":[0,0,2,0,2,2,2,0,2,0,0,2,2,0,2,0,2,2,2,2,0,2,0,0,0,2,2,2,2,2,2,2,0,2,0,2,0,0,0,2,0,2,2,0,2,0,2,2,0,2,2,0,2,0,0,0,2,2,2,2,2,0,0,0,0,2,2,0,0,2,2,0,2,2,0,0,0,0,0,0,2,0,2,2,2,0,2,2,0,0,2,0,2,2,0,2,0,0,2,0,2,2,2,2,0,0,2,2,0,2]},
{"tokens":["George","Hoyt","Whipple","a","fost","un","medic","patolog","și","cercetător","american","în","domeniul","biomedical","A","efectuat","cercetări","sistematice","în","domeniul","pigmenților","biliari","și","a","realizat","o","dietă","împotriva","anemiei","pernicioase","A","efectuat","teste","pe","câini","cărora","le-a","provocat","anemie","prin","prelevarea","unei","cantități","de","sânge","și","cărora","prin","alimentarea","cu","carne","în","special","ficat","care","este","bogat","în","fier","fasole","le-a","stimulat","generarea","de","eritrocite","La","stabilirea","acestui","regim","alimentar","au","mai","efectuat","studii","și","colaboratorii","săi","Minot","și","Murphy","Astfel","Whipple","a","ajuns","la","concluzia","că","fierul","joacă","un","rol","important","în","producția","de","globule","roșii","de","către","măduva","oaselor","Mai","târziu","Arzt","W","Castle","a","pus","în","evidență","o","substanță","aflată","în","sucul","gastric","factor","intrinsec","care","inhibă","anihilarea","vitaminei","B12","din","acidul","gastric","și","astfel","această","vitamină","este","resorbită","în","intestinul","subțire","ccea","ce","este","vital","pentru","producerea","de","eritrocite","În","1948","și","în","anul","următor","Karl","August","Folkers","și","Alexander","Todd","au","pus","în","evidență","și","factorul","extrinsec","aflat","la","nivelul","ficatului","În","1907","Whipple","a","descris","ceea","ce","ulterior","avea","să","fie","denumită","maladia","Whipple"],"id":29,"space_after":[true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true,true],"ner_tags":["O","GASTRO","GASTRO","GASTRO","O","O","O","GASTRO","O","O","O","O","O","O","GASTRO","GASTRO","O","O","O","O","GASTRO","GASTRO","O","GASTRO","GASTRO","O","GASTRO","O","GASTRO","GASTRO","GASTRO","GASTRO","O","O","GASTRO","O","O","GASTRO","GASTRO","O","O","O","O","O","O","O","O","O","O","O","O","O","O","GASTRO","GASTRO","O","O","O","O","GASTRO","O","GASTRO","GASTRO","O","GASTRO","O","O","O","O","O","O","O","GASTRO","O","O","O","O","GASTRO","O","GASTRO","O","GASTRO","GASTRO","O","O","O","O","GASTRO","O","O","O","O","O","O","O","O","GASTRO","O","O","GASTRO","GASTRO","O","O","GASTRO","O","GASTRO","GASTRO","O","O","O","O","O","O","O","O","GASTRO","O","GASTRO","GASTRO","GASTRO","GASTRO","GASTRO","GASTRO","O","O","GASTRO","O","O","GASTRO","GASTRO","O","GASTRO","O","GASTRO","O","GASTRO","O","O","O","O","O","O","GASTRO","O","O","O","O","O","O","O","O","GASTRO","O","O","GASTRO","O","O","O","O","O","O","GASTRO","GASTRO","O","O","GASTRO","O","O","GASTRO","GASTRO","O","O","O","O","O","O","GASTRO","O","GASTRO","GASTRO"],"ner_ids":[0,1,1,1,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,1,1,0,1,0,1,1,1,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,0,0,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,1,1]},
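(Each per-token array has to line up with "tokens" one-to-one, so a quick consistency check over the file can be run first. This is a minimal sketch, assuming output.txt holds a JSON list of records like the ones above.)

import json

with open('output.txt', 'r') as f:
    records = json.load(f)

# Every per-token array must be exactly as long as "tokens";
# any mismatch would break the label alignment during tokenization.
for rec in records:
    n = len(rec["tokens"])
    for key in ("ner_tags", "ner_ids", "space_after"):
        assert len(rec[key]) == n, f"record {rec['id']}: {key} has {len(rec[key])} entries, expected {n}"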
And below is my code:
import datasets
from datasets import Dataset, ClassLabel, Sequence
import torch
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
import json
from sklearn.model_selection import train_test_split
tokenizer = BertTokenizerFast.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1", do_lower_case=True, model_max_length=512)
with open('output.txt', 'r') as f:
    data = json.load(f)
dataset = Dataset.from_list(data)
label_names = sorted(set(label for labels in dataset["ner_tags"] for label in labels))
dataset = dataset.cast_column("ner_tags", Sequence(ClassLabel(names=label_names)))
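# Note (based on the two sample records above; illustration only): sorted() orders the
# distinct tags as ["GASTRO", "O", "OCIT"], so the ClassLabel cast encodes
# GASTRO -> 0, O -> 1, OCIT -> 2.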
print(dataset.features["ner_tags"])
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
train_datasetDataset = Dataset.from_dict(train_dataset)
val_datasetDataset = Dataset.from_dict(val_dataset)
print(len(train_datasetDataset))
print(train_datasetDataset[0])
print(len(val_datasetDataset))
print(val_datasetDataset[0])
def tokenize_and_align_labels(examples, label_all_tokens=True):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_ids"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special tokens ([CLS], [SEP], padding) belong to no word: mask them with -100
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a new word gets that word's label
                label_ids.append(label[word_idx])
            else:
                # Remaining sub-tokens of the same word: repeat the label, or mask with -100
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs
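# A minimal sketch (illustration only, not part of the training run) of what the
# alignment does on a tiny batch: sub-word pieces inherit their word's label, and
# the special tokens at the edges get -100.
demo = tokenize_and_align_labels({"tokens": [["George", "Whipple"]], "ner_ids": [[0, 1]]})
print(tokenizer.convert_ids_to_tokens(demo["input_ids"][0]))
print(demo["labels"][0])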
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)
model = AutoModelForTokenClassification.from_pretrained("dumitrescustefan/bert-base-romanian-uncased-v1", num_labels=3)
from transformers import TrainingArguments, Trainer
args = TrainingArguments(
    "test-ner",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
    remove_unused_columns=False,
)
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = datasets.load_metric("seqeval")
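# Smoke test for seqeval (illustration): scoring the gold labels against themselves
# should return 1.0 for precision, recall, f1 and accuracy.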
example = dataset[0]
label_list = dataset.features["ner_tags"].feature.names
labels = [label_list[i] for i in example["ner_tags"]]
metric.compute(predictions=[labels], references=[labels])
def compute_metrics(eval_preds):
    pred_logits, labels = eval_preds
    # argmax is enough here: softmax would not change the ranking of the logits
    pred_logits = np.argmax(pred_logits, axis=2)
    # Drop every position whose label is -100 (special tokens and masked sub-tokens)
    predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(pred_logits, labels)
    ]
    results = metric.compute(predictions=predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }
trainer = Trainer(
    model,
    args,
    train_dataset=train_datasetDataset,
    eval_dataset=val_datasetDataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
torch.cuda.empty_cache()
trainer.train()
When calling trainer.train() I get:

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['tokens', 'id', 'space_after', 'ner_tags', 'ner_ids']
Any help is appreciated!