How to solve ValueError: expected sequence of length 15 at dim 1 (got 18) error in python

I am training a simple custom NER model using a Hugging Face model. My inputs have different lengths, which I handle with truncation and padding.

I am training this on 2 GPUs.

I get the error below because the outputs have different lengths:

ValueError: Caught ValueError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/tmp/ipykernel_1511906/3825976416.py", line 253, in forward
    return [loss.to(device), torch.tensor(prediction).to(device)]
ValueError: expected sequence of length 15 at dim 1 (got 18)

Here is the complete code:

import os
import warnings
import compress_json
from collections import Counter
import tqdm
import random
warnings.filterwarnings('ignore')
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"]= "true"
from torchcrf import CRF
from transformers import BertTokenizerFast as BertTokenizer, BertConfig
from transformers import BertForTokenClassification, AdamW, BertModel, BertConfig
import torch.nn as nn
import torch.nn.functional as F
log_soft = F.log_softmax
from transformers import (Trainer,TrainingArguments)
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.empty_cache()
print(device)

# Toy training examples: each record holds the raw 'text' plus a list of
# entity 'spans' (character offsets, entity label and the surface string).
# The start/end offsets count characters from 1 and include the last
# character, e.g. 'Jon' occupies characters 12-14 of the first text.
train_data=[
{'text': "My name is Jon. I live in Germany.",
'spans': [{'start': 12, 'end': 14, 'label': 'name', 'ngram': 'Jon'},
          {'start': 27, 'end': 33, 'label': 'country', 'ngram': 'Germany'}
          ]
 },

{'text': "My name is Jony. I live in Russia. I am good and back from school.",
'spans': [{'start': 12, 'end': 15, 'label': 'name', 'ngram': 'Jony'},
          {'start': 28, 'end': 33, 'label': 'country', 'ngram': 'Russia'}
          ]
 },
{'text': "My name is Tony. I live in Poland.",
'spans': [{'start': 12, 'end': 15, 'label': 'name', 'ngram': 'Tony'},
          {'start': 28, 'end': 33, 'label': 'country', 'ngram': 'Poland'}
          ]
 },
{'text': "My name is Yun. I live in Holland. I am not.",
'spans': [{'start': 12, 'end': 14, 'label': 'name', 'ngram': 'Yun'},
          {'start': 27, 'end': 33, 'label': 'country', 'ngram': 'Holland'}
          ]
 }
]


# Checkpoint identifier shared by the tokenizer and the model below.
# NOTE(review): whether this is a hub id or a local directory cannot be told from here.
model_checkpoint = "spanbert-base"
# BertTokenizer is the BertTokenizerFast alias from the imports.
# NOTE(review): add_prefix_space is normally a byte-level BPE (RoBERTa/GPT-2)
# tokenizer option; confirm it has any effect on a BERT WordPiece tokenizer.
tokenizer = BertTokenizer.from_pretrained(model_checkpoint,add_prefix_space=True)



def isin(a, b):
    """Return whether span ``a`` overlaps span ``b``.

    Both arguments are indexable pairs read as ``(start, end)`` via indices
    0 and 1. The comparisons are strict, so two spans that merely touch
    (one ends exactly where the other starts) do not count as overlapping.
    """
    # ``a`` finishes at or before the point where ``b`` begins: no overlap.
    if not (a[1] > b[0]):
        return False
    # Otherwise they overlap exactly when ``a`` begins before ``b`` finishes.
    return a[0] < b[1]


def tokenize_and_align_labels(examples, label2id, max_length=512):
    """Placeholder for the tokenisation / label-alignment step.

    NOTE(review): the implementation is not present here. As written the
    function does nothing and then returns ``tokenized_inputs``, a name that
    is not defined anywhere in the visible code, so calling it raises
    NameError unless that name exists globally elsewhere.

    Judging by how the result is used by the callers in this file, it is
    expected to be a mapping of per-example lists that contains at least
    the keys 'labels' and 'offset_mapping' - TODO restore the real body.
    """
    pass
    return tokenized_inputs


# Reshape every record into [text, entities]; each entity keeps its offsets
# while 'label' is exposed as 'tag' and 'ngram' as 'text'.
train_set = []
for record in train_data:
    entities = [
        {'start': span["start"], 'end': span["end"], 'tag': span["label"], 'text': span["ngram"]}
        for span in record['spans']
    ]
    train_set.append([record['text'], entities])


## get label list
# Unique entity types, in sorted order.
ori_label_list = sorted({entity['tag'] for sample in train_set for entity in sample[1]})

# BIO scheme: a B- and an I- tag for every entity type, plus the outside tag.
label_list = sorted(
    {prefix + '-' + tag for prefix in 'BI' for tag in ori_label_list} | {'O'}
)
label2id = {name: index for index, name in enumerate(label_list)}
id2label = dict(enumerate(label_list))

train_examples = {
    'texts': [sample[0] for sample in train_set],
    'tag_names': [sample[1] for sample in train_set],
}
train_data = tokenize_and_align_labels(train_examples, label2id)

# The character offsets are dropped before the features are wrapped in a dataset.
_ = train_data.pop('offset_mapping')




class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of per-example feature sequences.

    ``examples`` maps feature names to sequences indexed by example and must
    contain a 'labels' entry; the number of examples is the length of that
    entry.
    """

    def __init__(self, examples):
        self.encodings = examples
        self.labels = examples['labels']

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per feature."""
        item = {}
        for name, values in self.encodings.items():
            item[name] = torch.tensor(values[idx])
        # Labels are (re)assigned explicitly from the stored label sequence.
        item["labels"] = torch.tensor(self.labels[idx])
        return item


# Wrap the tokenized features so they can be indexed one example at a time.
train_data2=MyDataset(train_data)

# Token-classification BERT loaded from the checkpoint, configured with this
# task's label <-> id maps.
bert_model = BertForTokenClassification.from_pretrained(
                        model_checkpoint,
                        id2label=id2label,
                        label2id=label2id
)
# Make the model return per-layer hidden states: BERT_CRF.forward averages
# the last four of them to build its token representations.
bert_model.config.output_hidden_states=True


class BERT_CRF(nn.Module):
    """BERT encoder followed by a linear emission layer and a CRF tagger.

    Token representations are the mean of the encoder's last four hidden
    layers; a linear layer turns them into per-tag emission scores which a
    CRF uses for the training loss and for Viterbi decoding.
    """

    def __init__(self, bert_model, num_labels, pad_label_id=-100):
        """Build the tagger.

        Args:
            bert_model: encoder exposing ``.config`` and returning the tuple
                of per-layer hidden states as its second output (i.e. run
                with ``output_hidden_states=True``).
            num_labels: number of tags scored by the linear layer and CRF.
            pad_label_id: value written into the prediction tensor at
                positions past the end of each decoded sequence. Defaults
                to -100, the usual "ignore" label id.
        """
        super().__init__()
        self.bert = bert_model
        self.config = self.bert.config
        self.dropout = nn.Dropout(0.25)

        # Follow the encoder's width when the config exposes it; 768 (the
        # previously hard-coded value) stays as the fallback.
        hidden_size = getattr(self.config, "hidden_size", 768)
        self.classifier = nn.Linear(hidden_size, num_labels)

        self.crf = CRF(num_labels, batch_first=True)
        self.pad_label_id = pad_label_id

    @staticmethod
    def _pad_paths(paths, length, pad_value):
        """Right-pad every decoded tag path to ``length`` with ``pad_value``."""
        return [list(path) + [pad_value] * (length - len(path)) for path in paths]

    def forward(self, input_ids, attention_mask, labels=None, token_type_ids=None):
        """Score, decode and (when labels are given) compute the CRF loss.

        Args:
            input_ids: token ids, batch first.
            attention_mask: same shape as ``input_ids``; non-zero marks real
                tokens. It is also used as the CRF mask.
            labels: optional tag ids; reshaped to the shape of
                ``attention_mask`` before use.
            token_type_ids: optional segment ids forwarded to the encoder.

        Returns:
            With ``labels``: ``[loss, predictions]`` where ``loss`` is the
            mean negative CRF log-likelihood and ``predictions`` is an
            integer tensor shaped like ``attention_mask`` whose masked-out
            tail is filled with ``pad_label_id``.
            Without ``labels``: the raw decoded paths, a list with one list
            of tag ids per sequence (lengths follow the mask).
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # outputs[1] is expected to be the tuple of per-layer hidden states
        # (requires output_hidden_states=True on the encoder config).
        hidden_states = outputs[1]
        sequence_output = torch.stack(tuple(hidden_states[-4:])).mean(dim=0)
        sequence_output = self.dropout(sequence_output)

        emission = self.classifier(sequence_output)

        mask = attention_mask.type(torch.uint8)
        prediction = self.crf.decode(emission, mask=mask)

        # Inference: nothing to reshape or score against.
        if labels is None:
            return prediction

        batch_size, seq_len = attention_mask.size()[0], attention_mask.size()[1]
        labels = labels.reshape(batch_size, seq_len)

        loss = -self.crf(log_soft(emission, 2), labels, mask=mask, reduction='mean')

        # The CRF decodes only the unmasked positions, so the paths differ in
        # length whenever the batch contains padding. torch.tensor() needs a
        # rectangular list ("expected sequence of length ..."), hence pad
        # every path to the full sequence length first.
        padded = self._pad_paths(prediction, seq_len, self.pad_label_id)

        # Keep outputs on the device this replica runs on (rather than a
        # global device) so multi-GPU gathering sees each tensor where it
        # was produced.
        return [loss, torch.tensor(padded, device=emission.device)]

# One tag per entry of label2id; the count sizes both the linear emission
# layer and the CRF inside BERT_CRF.
model = BERT_CRF(bert_model, num_labels=len(label2id))
model.to(device)

# "spanbert_1" is the first positional argument of TrainingArguments
# (presumably the output directory - confirm against the installed version).
args = TrainingArguments(
    "spanbert_1",
    # evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=2,
    weight_decay=0.01,
    per_device_train_batch_size=2,

)

# NOTE(review): no eval dataset and no data_collator are passed; confirm how
# batches are padded/collated for the features produced above.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data2,
    tokenizer=tokenizer)

trainer.train()

Hi Team, I noticed that the lists passed to torch.tensor(prediction) are of different lengths. The call outputs = self.bert(input_ids, attention_mask=attention_mask) gives me outputs of different lengths, even though I have used padding and truncation in the tokenizer.

Please suggest how I can make the output length consistent.

Have you found a solution to that? I have a similar problem

1 Like

Ideally, you would initiate your tokenizer like this:

# AutoTokenizer is not among the question's imports, so bring it in here.
from transformers import AutoTokenizer

tokenizer_id_or_path = "spanbert-base"
tokenizer_max_len = 512
# `model_max_length` is the documented keyword for the tokenizer's maximum
# input length (the older `max_len` spelling is deprecated); it is also the
# attribute read back later as `tokenizer.model_max_length`.
tokenizer_config = {
    'pretrained_model_name_or_path': tokenizer_id_or_path,
    'model_max_length': tokenizer_max_len,
}
tokenizer = AutoTokenizer.from_pretrained(**tokenizer_config)

Then you would apply the tokenizer to your data just like this:

# The question's train_examples dict stores the raw strings under 'texts'
# (there is no 'text' key), so read that key to avoid a KeyError.
texts = list(train_examples['texts'])
# padding='max_length' pads every example to the same fixed length, so all
# returned rows have identical size.
train_data = tokenizer(texts, return_tensors='pt', padding='max_length', truncation=True, max_length=tokenizer.model_max_length)

Note that I am using padding='max_length' and set max_length=tokenizer.model_max_length. I hope that helps :slight_smile: