@lhoestq can you help me with this? The data seems to load but I canât train on it.
Here are the 2 lines of the data file I am using:
{"id": "0", "chunk_tags": [ "B-NP", "B-VP", "B-NP", "I-NP", "B-VP", "I-VP", "B-NP", "I-NP", "O" ], "ner_tags": [ "B-ORG", "O", "B-MISC", "O", "O", "O", "B-MISC", "O", "O" ], "pos_tags": [ "NNP", "VBZ", "JJ", "NN", "TO", "VB", "JJ", "NN", "." ], "tokens": [ "EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", "." ]}
{"id": "0", "chunk_tags": [ "B-NP", "B-VP", "B-NP", "I-NP", "B-VP", "I-VP", "B-NP", "I-NP", "O" ], "ner_tags": [ "B-ORG", "O", "B-MISC", "O", "O", "O", "B-MISC", "O", "O" ], "pos_tags": [ "NNP", "VBZ", "JJ", "NN", "TO", "VB", "JJ", "NN", "." ], "tokens": [ "EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", "." ]}
And here is the code:
import json
import datasets
from transformers import BertForTokenClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
features = datasets.Features(
{
"id": datasets.Value("string"),
"tokens": datasets.Sequence(datasets.Value("string")),
"pos_tags": datasets.Sequence(
datasets.features.ClassLabel(
names=[
'"',
"''",
"#",
"$",
"(",
")",
",",
".",
":",
"``",
"CC",
"CD",
"DT",
"EX",
"FW",
"IN",
"JJ",
"JJR",
"JJS",
"LS",
"MD",
"NN",
"NNP",
"NNPS",
"NNS",
"NN|SYM",
"PDT",
"POS",
"PRP",
"PRP$",
"RB",
"RBR",
"RBS",
"RP",
"SYM",
"TO",
"UH",
"VB",
"VBD",
"VBG",
"VBN",
"VBP",
"VBZ",
"WDT",
"WP",
"WP$",
"WRB"
]
)
),
"chunk_tags": datasets.Sequence(
datasets.features.ClassLabel(
names=[
"O",
"B-ADJP",
"I-ADJP",
"B-ADVP",
"I-ADVP",
"B-CONJP",
"I-CONJP",
"B-INTJ",
"I-INTJ",
"B-LST",
"I-LST",
"B-NP",
"I-NP",
"B-PP",
"I-PP",
"B-PRT",
"I-PRT",
"B-SBAR",
"I-SBAR",
"B-UCP",
"I-UCP",
"B-VP",
"I-VP"
]
)
),
"ner_tags": datasets.Sequence(
datasets.features.ClassLabel(
names=[
"O",
"B-PER",
"I-PER",
"B-ORG",
"I-ORG",
"B-LOC",
"I-LOC",
"B-MISC",
"I-MISC",
# "B-SSN",
# "I-SSN",
# "B-CITY",
# "I-CITY"
]
)
),
}
)
dataFiles = {
"train": "./ADPConll/ADPConll_train.json",
"validation": "./ADPConll/ADPConll_valid.json",
"test": "./ADPConll/ADPConll_test.json"
}
dataset = load_dataset('json', data_files=dataFiles, split='train')
# Call the features' object's encode_example() method on every feature value.
dataset = dataset.map(features.encode_example, features=features)
TrainDF = dataset.to_pandas()
modelName = 'bert-base-cased'
# modelName = 'dbmdz/bert-large-cased-finetuned-conll03-english'
model = BertForTokenClassification.from_pretrained(modelName)
training_args = TrainingArguments(
output_dir='./results', # output directory
num_train_epochs=3, # total number of training epochs
per_device_train_batch_size=16, # 1440 batch size per device during training
per_device_eval_batch_size=16, # 64, # batch size for evaluation
warmup_steps=500, # number of warmup steps for learning rate scheduler
weight_decay=0.01, # strength of weight decay
logging_dir='./logs', # directory for storing logs
logging_steps=10,
)
trainer = Trainer(
model=model, # the instantiated đ¤ Transformers model to be trained
args=training_args, # training arguments, defined above
train_dataset=dataset,
eval_dataset=dataset
)
trainer.train()
The output I get is:
Using custom data configuration default-d850b3a6520ef12b
Reusing dataset json (/Users/greggwcasey/.cache/huggingface/datasets/json/default-d850b3a6520ef12b/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02)
100%|ââââââââââ| 2/2 [00:00<00:00, 173.15ex/s]
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: [âcls.predictions.transform.LayerNorm.weightâ, âcls.seq_relationship.biasâ, âcls.predictions.biasâ, âcls.seq_relationship.weightâ, âcls.predictions.transform.LayerNorm.biasâ, âcls.predictions.decoder.weightâ, âcls.predictions.transform.dense.weightâ, âcls.predictions.transform.dense.biasâ]
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: [âclassifier.weightâ, âclassifier.biasâ]
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
0%| | 0/3 [00:00<?, ?it/s]Traceback (most recent call last):
File â/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/torch/utils/data/dataloader.pyâ, line 517, in next
data = self._next_data()
File â/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/torch/utils/data/dataloader.pyâ, line 557, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File â/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.pyâ, line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File â/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.pyâ, line 44, in
data = [self.dataset[idx] for idx in possibly_batched_index]
File â/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/datasets/arrow_dataset.pyâ, line 1345, in getitem
return self._getitem(
File â/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/datasets/arrow_dataset.pyâ, line 1337, in _getitem
pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
File â/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/datasets/formatting/formatting.pyâ, line 365, in query_table
_check_valid_index_key(key, size)
File â/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/datasets/formatting/formatting.pyâ, line 308, in _check_valid_index_key
raise IndexError(f"Invalid key: {key} is out of bounds for size {size}")
IndexError: Invalid key: 0 is out of bounds for size 0
python-BaseException