Thank you @sgugger, you are right: I checked the targets and they are indeed floats. Here is my code:
from datasets import load_dataset
from transformers import (
    BertForTokenClassification,
    BertTokenizerFast,
    Trainer,
    TrainingArguments,
)

dataset = load_dataset("ptc.py")
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(
        example["tokens"],
        is_split_into_words=True, padding=True, truncation=True,
        return_offsets_mapping=True,
    )
    labels = []
    for doc, doc_labels in zip(tokenized_inputs.encodings, example["label"]):
        doc_encoded_labels = []
        i = 0
        last_word_id = None
        for word_id in doc.word_ids:
            if word_id is None:
                # special tokens ([CLS], [SEP], padding) get the ignore index
                doc_encoded_labels.append(-100)
            elif word_id == last_word_id:
                # only the first sub-token of each word keeps its label
                doc_encoded_labels.append(-100)
            else:
                last_word_id = word_id
                doc_encoded_labels.append(doc_labels[i])
                i += 1
        labels.append(doc_encoded_labels)
    tokenized_inputs["label"] = labels
    return tokenized_inputs

encoded_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
)

model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=15)
model.train()

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                                 # the instantiated 🤗 Transformers model to be trained
    args=training_args,                          # training arguments, defined above
    train_dataset=encoded_dataset['train'],      # training dataset
    eval_dataset=encoded_dataset['validation'],  # evaluation dataset
)
trainer.train()
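
For reference, this is roughly how I checked the targets after tokenization (a minimal sketch against the encoded_dataset built above):

# inspect the feature type and dtype of the aligned labels
print(encoded_dataset["train"].features["label"])
sample_labels = encoded_dataset["train"][0]["label"]
print(type(sample_labels[0]), sample_labels[:10])

This is where the values come out as floats instead of ints.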
And here is the dataset loading script (trimmed to the relevant parts):
"""PTC: The Propaganda Technique Classification Dataset."""
BUILDER_CONFIGS = [
PtcConfig(
name="jsonl",
version=datasets.Version("2.0.0", ""),
description="jsonl",
),
]
def _info(self):
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=datasets.Features({
"tokens": datasets.Sequence(datasets.Value("string")),
"label": datasets.Sequence(
datasets.features.ClassLabel(
names = [
"O",
"Appeal_to_Authority",
"Appeal_to_fear-prejudice",
"Bandwagon,Reductio_ad_hitlerum",
"Black-and-White_Fallacy",
"Causal_Oversimplification",
"Doubt",
"Exaggeration,Minimisation",
"Flag-Waving",
"Loaded_Language",
"Name_Calling,Labeling",
"Repetition",
"Slogans",
"Thought-terminating_Cliches",
"Whataboutism,Straw_Men,Red_Herring",
]
)
),
}),
supervised_keys=None, # TODO find out what is this
homepage=_HOMEPAGE,
citation=_CITATION,
)
def _split_generators(self, dl_manager):
#data_dir = dl_manager.download_and_extract(_URLS)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": "train.jsonl",
"split": "train",
}
),
datasets.SplitGenerator(
name=datasets.Split.VALIDATION,
gen_kwargs={
"filepath": "dev.jsonl",
"split": "dev",
}
),
]
def _generate_examples(self, filepath, split):
with open(filepath, encoding='utf-8') as f:
for id_, row in enumerate(f):
data = json.loads(row)
yield id_, {
"tokens": data["tokens"],
"label": [] if split == "test" else data["label"] ,
}
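
To sanity-check the loading script on its own, this is the kind of quick check I run (a minimal sketch, assuming train.jsonl and dev.jsonl sit next to ptc.py):

from datasets import load_dataset

raw = load_dataset("ptc.py")
print(raw)                             # split names and sizes
print(raw["train"].features["label"])  # Sequence(ClassLabel(num_classes=15, ...))
print(raw["train"][0]["label"][:10])   # raw labels, before tokenize_and_align_labels

If the labels are already floats here, the problem is in the script; otherwise it must be introduced by the map call above.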