Hi all,
EDIT: I forgot to state that I am on transformers 4.6.1 and Python 3.7.
On Colab, I am trying to pre-train a BertForMaskedLM on a random sample of half of WikiText-103. I am using a simple custom Dataset class together with DataCollatorForLanguageModeling, as follows.
import torch
import torchtext
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import re
import random
from transformers import BertForMaskedLM, BertModel, BertConfig, BertTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import PreTrainedTokenizer
wiki_train, wiki_valid, wiki_test = torchtext.datasets.WikiText103(
    root='data', split=('train', 'valid', 'test'))
# drop blank lines and " = Heading = " title lines from the raw examples
def scrub_titles_get_lines(dataset):
    pattern = re.compile(" =+.+ =+")
    title_scrubbed = []
    for example in dataset:
        if not example.isspace() and not pattern.match(example):
            title_scrubbed.append(example)
    return title_scrubbed
class LineByLineBertDataset(Dataset):
    def __init__(self, data, tokenizer: PreTrainedTokenizer, max_len=512):
        self.examples = data
        self.tokenizer = tokenizer
        self.max_length = max_len

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # tokenize one line, padded/truncated to max_length
        result = self.tokenizer(self.examples[i],
                                add_special_tokens=True,
                                truncation=True,
                                return_special_tokens_mask=True,
                                padding='max_length',
                                max_length=self.max_length,
                                return_tensors='pt')
        return result
configuration = BertConfig()
model = BertForMaskedLM(configuration)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
wiki_train = scrub_titles_get_lines(wiki_train)  # random.sample needs a list; the raw split is an iterable
wiki_train = random.sample(wiki_train, len(wiki_train) // 2)  # random half, list of strings
train_set = LineByLineBertDataset(wiki_train, tokenizer)
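# sanity check: a single item already carries a leading batch dimension of 1,
# because return_tensors='pt' always returns batched tensors
print(train_set[0]['input_ids'].size())  # torch.Size([1, 512])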
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/BERT_TEST",
    overwrite_output_dir=True,
    num_train_epochs=1,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_set,
)
trainer.train()
However, I get an error in the forward() method of the model:
/usr/local/lib/python3.7/dist-packages/transformers/models/bert/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
923 elif input_ids is not None:
924 input_shape = input_ids.size()
--> 925 batch_size, seq_length = input_shape
926 elif inputs_embeds is not None:
927 input_shape = inputs_embeds.size()[:-1]
ValueError: too many values to unpack (expected 2)
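If I read the traceback right, forward() expects input_ids to be 2-D. A quick standalone check (made-up shapes, purely to illustrate the unpack behavior):

import torch

ok = torch.randn(8, 512)
batch_size, seq_length = ok.size()   # fine: a 2-D shape unpacks into two values

bad = torch.randn(1, 8, 512)
batch_size, seq_length = bad.size()  # ValueError: too many values to unpack (expected 2)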
Each of the tensors in the batch encoding is of shape (8, 512). I know that somewhere in DataCollatorForLanguageModeling another dimension gets added. If I do:
res = tokenizer(wiki_train[:8],
                add_special_tokens=True,
                return_special_tokens_mask=True,
                truncation=True,
                padding='max_length',
                max_length=512,
                return_tensors='pt')
collated = data_collator([res])
collated['input_ids'].size()
Output: torch.Size([1, 8, 512])
So it seems that this first dimension may need to be squeezed out. However, I am not sure which parameter I can tweak to ensure the model sees a 2-D (batch_size, seq_length) tensor after collation.
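For concreteness, this is the kind of squeeze I have in mind in __getitem__ (untested sketch):

def __getitem__(self, i):
    result = self.tokenizer(self.examples[i],
                            add_special_tokens=True,
                            truncation=True,
                            return_special_tokens_mask=True,
                            padding='max_length',
                            max_length=self.max_length,
                            return_tensors='pt')
    # drop the leading batch dimension of 1 so each example is (max_length,)
    return {key: tensor.squeeze(0) for key, tensor in result.items()}

But I don't know whether that is the intended way to feed the collator.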
Any thoughts?