Thanks Quentin, this has been very helpful.
I had to change pos, chunk, and ner in the features (from pos_tags, chunk_tags, and ner_tags), but other than that I got much further.
What I am working on now is the call to
trainer.train()
which is giving me this error:
0%|          | 0/3 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 517, in __next__
data = self._next_data()
File "/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 557, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
return self.collate_fn(data)
File "/Users/greggwcasey/Google Drive/PycharmProjectsLocal/ADP_Project_NER/venv/lib/python3.8/site-packages/transformers/data/data_collator.py", line 80, in default_data_collator
batch[k] = torch.tensor([f[k] for f in features])
RuntimeError: Could not infer dtype of pyarrow.lib.Field
python-BaseException
At the point of error, k = {str} '_schema' and v = {Schema:5} (SEE IMAGE), so I suspect that I am not setting a parameter correctly. I think my issue is that chunk, ner, and pos are lists of int64 instead of lists of strings (see the quick check sketched after the code listing below).
The code I have (as small as I think I can make it) is:
import datasets
from transformers import BertForTokenClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
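# Feature schema for the JSON files: id and words, plus ClassLabel sequences for pos, chunk, and ner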
features = datasets.Features(
    {
        "id": datasets.Value("string"),
        "words": datasets.Sequence(datasets.Value("string")),
        "pos": datasets.Sequence(
            datasets.features.ClassLabel(
                names=[
                    '"', "''", "#", "$", "(", ")", ",", ".", ":", "``",
                    "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS",
                    "LS", "MD", "NN", "NNP", "NNPS", "NNS", "NN|SYM", "PDT",
                    "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM",
                    "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ",
                    "WDT", "WP", "WP$", "WRB",
                ]
            )
        ),
        "chunk": datasets.Sequence(
            datasets.features.ClassLabel(
                names=[
                    "O", "B-ADJP", "I-ADJP", "B-ADVP", "I-ADVP", "B-CONJP",
                    "I-CONJP", "B-INTJ", "I-INTJ", "B-LST", "I-LST", "B-NP",
                    "I-NP", "B-PP", "I-PP", "B-PRT", "I-PRT", "B-SBAR",
                    "I-SBAR", "B-UCP", "I-UCP", "B-VP", "I-VP",
                ]
            )
        ),
        "ner": datasets.Sequence(
            datasets.features.ClassLabel(
                names=[
                    "O", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC",
                    "B-MISC", "I-MISC", "B-SSN", "I-SSN", "B-CITY", "I-CITY",
                ]
            )
        ),
    }
)
def ADPLoadData():
    dataFiles = {
        "train": "./ADPConll/ADPConll_train.json",
        "validation": "./ADPConll/ADPConll_valid.json",
        "test": "./ADPConll/ADPConll_test.json",
    }
    # dataset = load_dataset('json', data_files='./ADPConll/ADPConll_train.json')
    dataset = load_dataset('json', data_files=dataFiles)
    return dataset
ADPDataset = ADPLoadData()
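# Re-encode every example against the schema above so pos/chunk/ner carry the ClassLabel feature type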
ADPDataset = ADPDataset.map(features.encode_example, features=features)
print(ADPDataset)
modelName = 'dbmdz/bert-large-cased-finetuned-conll03-english'
model = BertForTokenClassification.from_pretrained(modelName)
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # 1440 batch size per device during training
    per_device_eval_batch_size=16,   # 64, # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)
trainer = Trainer(
    model=model,                                   # the instantiated 🤗 Transformers model to be trained
    args=training_args,                            # training arguments, defined above
    train_dataset=[ADPDataset.data['train']],      # training dataset
    eval_dataset=[ADPDataset.data['validation']],  # evaluation dataset
)
trainer.train()
trainer.save_model('./newModel')
trainer.evaluate()
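In case it helps with the diagnosis, here is a minimal check (just a sketch, reusing the ADPDataset variable from the code above) that can be run right after the map call to confirm whether pos, chunk, and ner ended up as ClassLabel integer ids or as plain strings:

print(ADPDataset["train"].features["ner"])  # feature type of the ner column, e.g. Sequence(ClassLabel(...))
print(ADPDataset["train"][0]["ner"])        # the values actually stored for the first example
print(ADPDataset["train"].column_names)     # all column names in the train split

If the first line prints a ClassLabel sequence, the second line should show integer ids rather than tag strings.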