Loading Custom Datasets

@lhoestq can you help me with this? The data seems to load but I can’t train on it.
Here are the 2 lines of the data file I am using:

{"id": "0", "chunk_tags": [ "B-NP", "B-VP", "B-NP", "I-NP", "B-VP", "I-VP", "B-NP", "I-NP", "O" ], "ner_tags": [ "B-ORG", "O", "B-MISC", "O", "O", "O", "B-MISC", "O", "O" ], "pos_tags": [ "NNP", "VBZ", "JJ", "NN", "TO", "VB", "JJ", "NN", "." ], "tokens": [ "EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", "." ]}
{"id": "0", "chunk_tags": [ "B-NP", "B-VP", "B-NP", "I-NP", "B-VP", "I-VP", "B-NP", "I-NP", "O" ], "ner_tags": [ "B-ORG", "O", "B-MISC", "O", "O", "O", "B-MISC", "O", "O" ], "pos_tags": [ "NNP", "VBZ", "JJ", "NN", "TO", "VB", "JJ", "NN", "." ], "tokens": [ "EU", "rejects", "German", "call", "to", "boycott", "British", "lamb", "." ]}

And here is the code:

import datasets
from datasets import load_dataset
from transformers import BertForTokenClassification, Trainer, TrainingArguments

features = datasets.Features(
    {
        "id": datasets.Value("string"),
        "tokens": datasets.Sequence(datasets.Value("string")),
        "pos_tags": datasets.Sequence(
            datasets.features.ClassLabel(
                names=[
                    '"',
                    "''",
                    "#",
                    "$",
                    "(",
                    ")",
                    ",",
                    ".",
                    ":",
                    "``",
                    "CC",
                    "CD",
                    "DT",
                    "EX",
                    "FW",
                    "IN",
                    "JJ",
                    "JJR",
                    "JJS",
                    "LS",
                    "MD",
                    "NN",
                    "NNP",
                    "NNPS",
                    "NNS",
                    "NN|SYM",
                    "PDT",
                    "POS",
                    "PRP",
                    "PRP$",
                    "RB",
                    "RBR",
                    "RBS",
                    "RP",
                    "SYM",
                    "TO",
                    "UH",
                    "VB",
                    "VBD",
                    "VBG",
                    "VBN",
                    "VBP",
                    "VBZ",
                    "WDT",
                    "WP",
                    "WP$",
                    "WRB"
                ]
            )
        ),
        "chunk_tags": datasets.Sequence(
            datasets.features.ClassLabel(
                names=[
                    "O",
                    "B-ADJP",
                    "I-ADJP",
                    "B-ADVP",
                    "I-ADVP",
                    "B-CONJP",
                    "I-CONJP",
                    "B-INTJ",
                    "I-INTJ",
                    "B-LST",
                    "I-LST",
                    "B-NP",
                    "I-NP",
                    "B-PP",
                    "I-PP",
                    "B-PRT",
                    "I-PRT",
                    "B-SBAR",
                    "I-SBAR",
                    "B-UCP",
                    "I-UCP",
                    "B-VP",
                    "I-VP"
                ]
            )
        ),
        "ner_tags": datasets.Sequence(
            datasets.features.ClassLabel(
                names=[
                    "O",
                    "B-PER",
                    "I-PER",
                    "B-ORG",
                    "I-ORG",
                    "B-LOC",
                    "I-LOC",
                    "B-MISC",
                    "I-MISC",
                    # "B-SSN",
                    # "I-SSN",
                    # "B-CITY",
                    # "I-CITY"
                ]
            )
        ),
    }
)
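# (For reference: each ClassLabel maps a tag string to an integer id, so for
# the ner_tags list above, features["ner_tags"].feature.str2int("B-ORG") == 3.)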

dataFiles = {
    "train": "./ADPConll/ADPConll_train.json",
    "validation": "./ADPConll/ADPConll_valid.json",
    "test": "./ADPConll/ADPConll_test.json"
}
dataset = load_dataset('json', data_files=dataFiles, split='train')
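# Note: split='train' returns a single Dataset built from the train file only;
# the validation and test files listed above are not loaded here.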

# Encode the string tags as their ClassLabel integer ids by applying
# features.encode_example to every example.
dataset = dataset.map(features.encode_example, features=features)
TrainDF = dataset.to_pandas()  # not used below

modelName = 'bert-base-cased'
# modelName = 'dbmdz/bert-large-cased-finetuned-conll03-english'
model = BertForTokenClassification.from_pretrained(modelName)
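# (No num_labels is passed here, so the classification head defaults to
# 2 labels, while the ner_tags feature above defines 9 classes.)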

training_args = TrainingArguments(
    output_dir='./results',  # output directory
    num_train_epochs=3,  # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=16,   # batch size per device during evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='./logs',  # directory for storing logs
    logging_steps=10,
)


trainer = Trainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=dataset,  # the two-example dataset prepared above
    eval_dataset=dataset,   # same split reused for evaluation
)

trainer.train()

The output I get is:

Using custom data configuration default-d850b3a6520ef12b
Reusing dataset json (/Users/greggwcasey/.cache/huggingface/datasets/json/default-d850b3a6520ef12b/0.0.0/83d5b3a2f62630efc6b5315f00f20209b4ad91a00ac586597caee3a4da0bef02)
100%|██████████| 2/2 [00:00<00:00, 173.15ex/s]
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%| | 0/3 [00:00<?, ?it/s]Traceback (most recent call last):
  File "/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 517, in __next__
    data = self._next_data()
  File "/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 557, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 1345, in __getitem__
    return self._getitem(
  File "/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/datasets/arrow_dataset.py", line 1337, in _getitem
    pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
  File "/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/datasets/formatting/formatting.py", line 365, in query_table
    _check_valid_index_key(key, size)
  File "/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/datasets/formatting/formatting.py", line 308, in _check_valid_index_key
    raise IndexError(f"Invalid key: {key} is out of bounds for size {size}")
IndexError: Invalid key: 0 is out of bounds for size 0
python-BaseException
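For what it's worth, my current guess (unverified) is that the examples are never tokenized, so the dataset has no input_ids; since Trainer by default drops columns that the model's forward() doesn't accept, every column would be removed, leaving an empty table and explaining the "size 0" error. Below is a minimal sketch of the tokenize-and-align step I think may be missing. It assumes a fast tokenizer (so word_ids() is available) and that ner_tags are the labels to train on:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(modelName)

def tokenize_and_align(example):
    # Tokenize the pre-split words; each sub-word piece inherits the label of
    # the word it came from, and special tokens get -100 so the loss skips them.
    encoded = tokenizer(example["tokens"], is_split_into_words=True, truncation=True)
    encoded["labels"] = [
        -100 if word_id is None else example["ner_tags"][word_id]
        for word_id in encoded.word_ids()
    ]
    return encoded

tokenized = dataset.map(tokenize_and_align, remove_columns=dataset.column_names)
# The model head would also need to match the 9 NER classes, e.g.:
# model = BertForTokenClassification.from_pretrained(modelName, num_labels=9)

Is that the right direction, or is something wrong with how I declare the features?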