Hi, I am trying to train xlm-roberta-base on a custom dataset. Here is the code for training (a rough sketch of the setup it relies on is shown first, then the training loop).
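The model, optimizer, scheduler, and dataloaders used below are created beforehand, roughly like this. This is only a sketch: the hyperparameters (batch_size, epochs, learning rate, max_length) and the toy texts/labels are placeholders standing in for my real data; I use 3 labels (0, 1, 2) as in the outputs further down.

import time
import numpy as np
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from transformers import (XLMRobertaForSequenceClassification,
                          XLMRobertaTokenizer,
                          get_linear_schedule_with_warmup)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

batch_size = 16   # assumption; matches the 16-element batches shown below
epochs = 4        # assumption

# toy placeholder data; the real dataset is loaded from file
texts = ['an example sentence', 'another example sentence']
labels = [0, 2]

tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
enc = tokenizer(texts, padding='max_length', truncation=True,
                max_length=128, return_tensors='pt')

train_dataset = TensorDataset(enc['input_ids'], enc['attention_mask'],
                              torch.tensor(labels))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# validation_dataloader is built the same way from the validation split

model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base', num_labels=3).to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * epochs)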
loss_values = []
# number of total steps for each epoch
print('total steps per epoch: ', len(train_dataloader) / batch_size)
# loop over epochs
for epoch_i in range(0, epochs):
    print('training on epoch: ', epoch_i)
    # set start time
    t0 = time.time()
    # reset total loss
    total_loss = 0
    # put model in training mode
    model.train()
    # loop through batches
    for step, batch in enumerate(train_dataloader):
        # progress update every 50 steps
        if step % 50 == 0 and not step == 0:
            print('training on step: ', step)
            print('total time used is: {0:.2f} s'.format(time.time() - t0))
        # load data from dataloader
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # clear any previously calculated gradients
        model.zero_grad()
        # get outputs
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        # get loss
        loss = outputs[0]
        # accumulate total loss
        total_loss += loss.item()
        # clip the norm of the gradients to 1.0
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # update optimizer
        optimizer.step()
        # update learning rate
        scheduler.step()
    # calculate the average loss over the training data
    avg_train_loss = total_loss / len(train_dataloader)
    # store the loss value for plotting the learning curve
    loss_values.append(avg_train_loss)
    print("average training loss: {0:.2f}".format(avg_train_loss))
Everything works fine, but when I test this model on the test data, it predicts every label as 0. Here is the evaluation code:
t0 = time.time()
# put model in evaluation mode
model.eval()
# save predictions
predictions, true_labels = [], []
# evaluate data for one epoch
for batch in validation_dataloader:
    # add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # validation
    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)
    # get output
    logits = outputs[0]
    # move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    final_prediction = np.argmax(logits, axis=-1).flatten()
    predictions.append(final_prediction)
    true_labels.append(label_ids)
print('total time used is: {0:.2f} s'.format(time.time() - t0))
The predictions come out as
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), …
whereas the true labels are
array([1, 1, 1, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0]),
array([0, 1, 2, 0, 2, 2, 2, 0, 1, 1, 2, 2, 0, 1, 1, 2]),
array([0, 0, 1, 0, 2, 2, 2, 0, 0, 0, 0, 1, 0, 2, 2, 2]),
array([2, 0, 1, 0, 0, 1, 0, 1, 1, 2, 0, 1, 1, 2, 1, 0]),
array([2, 0, 0, 0, 0, 1, 2, 0, 2, 1, 1, 1, 2, 1, 1, 2]),
array([0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 1, 1, 2, 2, 2, 2]),
array([1, 1, 0, 0, 2, 0, 2, 0, 2, 1, 1, 1, 1, 0, 1, 0]),
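To compare the two, I flatten the per-batch arrays and compute accuracy and per-class scores roughly like this (a sketch, assuming scikit-learn is installed; this is not part of the snippet above):

from sklearn.metrics import accuracy_score, classification_report

# concatenate the per-batch arrays into one flat array each
flat_preds = np.concatenate(predictions)
flat_labels = np.concatenate(true_labels)

print('accuracy:', accuracy_score(flat_labels, flat_preds))
print(classification_report(flat_labels, flat_preds, digits=3))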