Hi. I have a problem. I am pretraining a RoBERTa MLM model from scratch on Slovak-language text in Python. I have trained my own BPE tokenizer and used it to tokenize the texts, obtaining a dictionary of encodings with max_length=256. Sample here.
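The sample and the exact preprocessing script are not reproduced here; a rough sketch of how such an encodings dictionary is typically produced with a trained tokenizer (the paths match my setup, the placeholder sentences are only illustrative) is:

from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('./tokenizers/pureBPE')

# Placeholder sentences; the real corpus is loaded from disk elsewhere.
texts = ['prvá veta.', 'druhá veta.']

encodings = tokenizer(
    texts,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_tensors='pt'
)
# encodings behaves like a dict with 'input_ids' and 'attention_mask' tensors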
The problem arises during training of the MLM model. Within each epoch the per-batch loss decreases, which suggests the model is training properly. However, when I use the trained model to predict the masked token, I get the same output for every input: the same tokens, in the same order, with the same probabilities, which is incorrect. In addition, special tokens such as <pad>, <s>, and </s> appear among the candidate tokens with very high probabilities.
from transformers import pipeline
from transformers import RobertaTokenizer, AutoModelForMaskedLM

model_path = 'PureBPEMLM_epoch_0'
tokenizer_path = './tokenizers/pureBPE'

tokenizer_ = RobertaTokenizer.from_pretrained(tokenizer_path)
model_ = AutoModelForMaskedLM.from_pretrained(model_path)

fill = pipeline(
    'fill-mask',
    model=model_,
    tokenizer=tokenizer_
)

fill('cez bratislavu tečie <mask> a žijú tam ryby.')
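For reference, fill() returns a list of dicts with 'score', 'token', 'token_str', and 'sequence' keys. This is the kind of check I run (the second prompt is just another arbitrary Slovak test sentence), and both prompts come back with identical token lists and scores:

# Compare the top-5 predictions for two different prompts.
prompts = [
    'cez bratislavu tečie <mask> a žijú tam ryby.',
    'na stole je <mask>.',  # arbitrary second test sentence
]
for prompt in prompts:
    for pred in fill(prompt, top_k=5):
        print(prompt, '->', pred['token_str'], round(pred['score'], 4))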
Below is the code for training the model. For context, the dataloader has a length of 13987 batches with a batch size of 64, i.e. 895168 sequences in total.
import pickle

import torch
import torch.optim as optim
from tqdm import tqdm
from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer

# Load the pre-tokenized encodings (input_ids, attention_mask, labels).
with open("./09032024_purebpe", 'rb') as file:
    encodings = pickle.load(file)

class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return self.encodings['input_ids'].shape[0]

    def __getitem__(self, i):
        return {key: tensor[i] for key, tensor in self.encodings.items()}

dataset = Dataset(encodings.data)

batch_size = 64
num_workers = 8
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

tokenizer_path = './tokenizers/pureBPE'
tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)
# Model configuration, optimizer, and training loop
config = RobertaConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=258,   # 256 tokens + 2, since RoBERTa's position ids start after padding_idx
    hidden_size=576,
    num_attention_heads=12,
    num_hidden_layers=6,
    hidden_dropout_prob=0.1,
    type_vocab_size=1
)
model = RobertaForMaskedLM(config)
print('Num parameters: ',model.num_parameters())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
epochs = 5
for epoch in range(epochs):
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # The model returns the MLM loss directly when labels are provided.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

    # Save after each epoch
    model_save_path = f'./PureBPEMLM_epoch_{epoch}'
    model.save_pretrained(model_save_path)
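For completeness, the labels in encodings are meant to hold the MLM targets (with -100 on positions that should not contribute to the loss). My exact masking code is not shown above; a minimal sketch of the usual approach, assuming 15% random masking and using make_mlm_inputs purely as an illustrative helper name, would be:

import torch

def make_mlm_inputs(input_ids, tokenizer, mask_prob=0.15):
    # Keep the original ids as targets; unmasked positions are set to -100
    # so the cross-entropy loss ignores them.
    labels = input_ids.clone()

    # Never mask special tokens (<s>, </s>, <pad>).
    special = (
        (input_ids == tokenizer.pad_token_id)
        | (input_ids == tokenizer.cls_token_id)
        | (input_ids == tokenizer.sep_token_id)
    )

    probs = torch.full(input_ids.shape, mask_prob)
    probs.masked_fill_(special, 0.0)
    masked = torch.bernoulli(probs).bool()

    labels[~masked] = -100
    masked_input_ids = input_ids.clone()
    masked_input_ids[masked] = tokenizer.mask_token_id
    return masked_input_ids, labels

In the standard Hugging Face setup this step is usually handled by DataCollatorForLanguageModeling; here the masked inputs and labels are stored directly in the pickled encodings.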
Could someone please advise me on how to solve this problem?
Thank you in advance for your answer.