I am training a simple custom NER model using a Hugging Face transformer.
My inputs are of different lengths, which I solve with truncation and padding.
I am training this on 2 GPUs.
I get the error below because the outputs are of different lengths:
ValueError: Caught ValueError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/tmp/ipykernel_1511906/3825976416.py", line 253, in forward
return [loss.to(device), torch.tensor(prediction).to(device)]
ValueError: expected sequence of length 15 at dim 1 (got 18)
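The failure is easy to reproduce in isolation: crf.decode returns one plain Python list per sequence, trimmed to that sequence's mask length, and torch.tensor cannot stack lists of unequal length. A minimal sketch with no model involved:

import torch

# Two "decoded" sequences of different lengths, as crf.decode can return
# when the attention masks in a batch have different numbers of real tokens.
ragged = [[1, 2, 3, 4], [1, 2]]
torch.tensor(ragged)  # ValueError: expected sequence of length 4 at dim 1 (got 2)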
Here is the complete code:
import os
import warnings
import compress_json
from collections import Counter
import tqdm
import random
warnings.filterwarnings('ignore')
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"]= "true"
from torchcrf import CRF
from transformers import BertTokenizerFast as BertTokenizer
from transformers import BertForTokenClassification, AdamW, BertModel, BertConfig
import torch.nn as nn
import torch.nn.functional as F
log_soft = F.log_softmax
from transformers import (Trainer,TrainingArguments)
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.empty_cache()
print(device)
train_data = [
    # Character spans are half-open [start, end) offsets into 'text'.
    {'text': "My name is Jon. I live in Germany.",
     'spans': [{'start': 11, 'end': 14, 'label': 'name', 'ngram': 'Jon'},
               {'start': 26, 'end': 33, 'label': 'country', 'ngram': 'Germany'}]
    },
    {'text': "My name is Jony. I live in Russia. I am good and back from school.",
     'spans': [{'start': 11, 'end': 15, 'label': 'name', 'ngram': 'Jony'},
               {'start': 27, 'end': 33, 'label': 'country', 'ngram': 'Russia'}]
    },
    {'text': "My name is Tony. I live in Poland.",
     'spans': [{'start': 11, 'end': 15, 'label': 'name', 'ngram': 'Tony'},
               {'start': 27, 'end': 33, 'label': 'country', 'ngram': 'Poland'}]
    },
    {'text': "My name is Yun. I live in Holland. I am not.",
     'spans': [{'start': 11, 'end': 14, 'label': 'name', 'ngram': 'Yun'},
               {'start': 26, 'end': 33, 'label': 'country', 'ngram': 'Holland'}]
    }
]
model_checkpoint = "spanbert-base"
tokenizer = BertTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)
def isin(a, b):
    # True when the half-open intervals a = (start, end) and b = (start, end)
    # overlap, e.g. isin((11, 14), (8, 12)) is True.
    return a[1] > b[0] and a[0] < b[1]
def tokenize_and_align_labels(examples, label2id, max_length=512):
    # Body elided in the original post; it is expected to tokenize
    # examples['texts'] with truncation and padding, and attach aligned
    # label ids plus an 'offset_mapping' entry (see the sketch below).
    pass
    return tokenized_inputs
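For context, here is a minimal sketch of what such a function typically looks like, using the standard offset-mapping alignment recipe for token classification. This is an assumption about the elided body, not the original implementation: it reuses the global tokenizer and the isin helper above, and pads label rows with 'O' so the CRF never sees an invalid id.

def tokenize_and_align_labels(examples, label2id, max_length=512):
    # Tokenize with fixed-length padding so every row has max_length entries.
    tokenized_inputs = tokenizer(
        examples['texts'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_offsets_mapping=True,
    )
    all_labels = []
    for i, spans in enumerate(examples['tag_names']):
        labels, prev_span = [], None
        for start, end in tokenized_inputs['offset_mapping'][i]:
            if start == end:  # special or padding token: (0, 0) offsets
                labels.append(label2id['O'])
                prev_span = None
                continue
            # First annotated span whose character range overlaps this token.
            span = next((s for s in spans
                         if isin((start, end), (s['start'], s['end']))), None)
            if span is None:
                labels.append(label2id['O'])
            elif span is prev_span:
                labels.append(label2id['I-' + span['tag']])
            else:
                labels.append(label2id['B-' + span['tag']])
            prev_span = span
        all_labels.append(labels)
    tokenized_inputs['labels'] = all_labels
    return tokenized_inputs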
train_set = [
[
x['text'],
[{'start': y["start"], 'end': y["end"], 'tag': y["label"], 'text': y["ngram"]} for y in x['spans']]
] for x in train_data
]
## get label list
ori_label_list = []
for line in train_set:
ori_label_list += [entity['tag'] for entity in line[1]]
ori_label_list = sorted(list(set(ori_label_list)))
label_list = []
for prefix in 'BI':
label_list += [prefix + '-' + x for x in ori_label_list]
label_list += ['O']
label_list = sorted(list(set(label_list)))
label2id = {n: i for i, n in enumerate(label_list)}
id2label = {i: n for i, n in enumerate(label_list)}
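For the four training sentences above this deterministically yields five labels:

label_list  # ['B-country', 'B-name', 'I-country', 'I-name', 'O']
label2id    # {'B-country': 0, 'B-name': 1, 'I-country': 2, 'I-name': 3, 'O': 4}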
train_examples = {'texts': [x[0] for x in train_set], 'tag_names': [x[1] for x in train_set]}
train_data = tokenize_and_align_labels(train_examples, label2id)
# Offsets are only needed for alignment; drop them before batching.
_ = train_data.pop('offset_mapping')
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, examples):
        self.encodings = examples
        self.labels = examples['labels']
    def __getitem__(self, idx):
        # Convert every field of one example (input_ids, attention_mask,
        # labels, ...) to a tensor.
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)
train_data2 = MyDataset(train_data)
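A quick sanity check, assuming the fixed-length padding described above, that every example already has rectangular tensors (so the ragged shapes can only come from the model's output, not the data):

sample = train_data2[0]
print({k: tuple(v.shape) for k, v in sample.items()})
# e.g. {'input_ids': (512,), 'attention_mask': (512,), 'labels': (512,)} with max_length=512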
bert_model = BertForTokenClassification.from_pretrained(
model_checkpoint,
id2label=id2label,
label2id=label2id
)
bert_model.config.output_hidden_states = True  # forward() below reads the hidden states from outputs[1]
class BERT_CRF(nn.Module):
def __init__(self, bert_model, num_labels):
super(BERT_CRF, self).__init__()
self.bert = bert_model
self.config = self.bert.config
self.dropout = nn.Dropout(0.25)
self.classifier = nn.Linear(768, num_labels)
self.crf = CRF(num_labels, batch_first=True)
    def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # With output_hidden_states=True, outputs[1] is the tuple of hidden
        # states; average the last four layers as the token representation.
        sequence_output = torch.stack((outputs[1][-1], outputs[1][-2], outputs[1][-3], outputs[1][-4])).mean(dim=0)
        sequence_output = self.dropout(sequence_output)
        emission = self.classifier(sequence_output)  # [batch, seq_len, num_labels]
        if labels is not None:
            labels = labels.reshape(attention_mask.size()[0], attention_mask.size()[1])
            loss = -self.crf(log_soft(emission, 2), labels, mask=attention_mask.type(torch.uint8), reduction='mean')
            prediction = self.crf.decode(emission, mask=attention_mask.type(torch.uint8))
            # This is the line from the traceback: crf.decode returns
            # per-sequence lists of different lengths, so torch.tensor() fails.
            return [loss.to(device), torch.tensor(prediction).to(device)]
        else:
            prediction = self.crf.decode(emission, mask=attention_mask.type(torch.uint8))
            return prediction
model = BERT_CRF(bert_model, num_labels=len(label2id))
model.to(device)
args = TrainingArguments(
"spanbert_1",
# evaluation_strategy="epoch",
save_strategy="epoch",
learning_rate=2e-5,
num_train_epochs=2,
weight_decay=0.01,
per_device_train_batch_size=2,
)
trainer = Trainer(
model=model,
args=args,
train_dataset=train_data2,
tokenizer=tokenizer)
trainer.train()
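For reference, the mismatch comes from the return statement flagged above: under multi-GPU training each replica's output must be gatherable, but crf.decode trims every decoded sequence to its own mask length. One way around it, sketched here as an assumption rather than a confirmed fix, is to pad the decoded sequences to the padded input width before turning them into a tensor:

# Inside BERT_CRF.forward, replacing the failing return statement:
prediction = self.crf.decode(emission, mask=attention_mask.type(torch.uint8))
seq_len = attention_mask.size(1)
pad_id = 0  # id used to pad the decoded tags; label2id['O'] would also work
padded = [p + [pad_id] * (seq_len - len(p)) for p in prediction]
return [loss.to(device), torch.tensor(padded).to(device)]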