I am new to Hugging Face. I trained and evaluated a model with the Trainer API, but whenever I use it for prediction, the model always predicts the same single class. It would be helpful if anyone could spot the bug.
My data has two text inputs per example (a query and a segment), which I pass to the tokenizer as a sentence pair.
Here is my code:
import torch
import collections
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from transformers import EarlyStoppingCallback, DataCollatorWithPadding, Trainer, TrainingArguments
from transformers import BertTokenizerFast, BertConfig, BertModel, BertForSequenceClassification
import os
import sys
sys.path.append(".")
sys.path.append("..")
model_name = "bert-base-uncased"
max_length = 512
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)
data = pd.read_csv('train_data.csv', sep='\t')  # pass sep by keyword; the second positional argument of read_csv is not the separator in newer pandas
gss = GroupShuffleSplit(train_size=.80, random_state = 2, n_splits=1)
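# GroupShuffleSplit keeps every row with the same QueryID in a single split,
# so no query leaks from training into validation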
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # "if self.labels:" raises "truth value of an array is ambiguous" for
        # numpy arrays, so test against None explicitly
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])
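# labels is optional so the same class can also wrap encodings without labels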
for train_idx, val_idx in gss.split(data.loc[:, data.columns != 'Label'], data['Label'], groups=data['QueryID']):
    train_ds = data.iloc[train_idx]
    val_ds = data.iloc[val_idx]

# factorize codes labels by order of first appearance, so factorizing train and
# validation independently can assign opposite codes to the two splits;
# sort=True makes the encoding deterministic across splits
train_label = pd.factorize(train_ds.Label, sort=True)[0]
valid_label = pd.factorize(val_ds.Label, sort=True)[0]
train_label = 1 - train_label  # because 0 should be relevant and 1 should be irrelevant
valid_label = 1 - valid_label
count = 0
print("Encodings generation.")
train_encodings = tokenizer(train_ds['Query'].tolist(), train_ds['Segment'].tolist(), truncation=True, padding=True, max_length=max_length)
valid_encodings = tokenizer(val_ds['Query'].tolist(), val_ds['Segment'].tolist(), truncation=True, padding=True, max_length=max_length)
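# passing Query and Segment as a pair makes the tokenizer build
# [CLS] query [SEP] segment [SEP] with matching token_type_ids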
print("Encodings generated.")
train_dataset = Dataset(train_encodings, train_label)
valid_dataset = Dataset(valid_encodings, valid_label)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2, return_dict=True).to("cuda")
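# the sequence-classification head on top of the pretrained encoder is
# randomly initialized and trained from scratch here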
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
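# DataCollatorWithPadding pads dynamically per batch; it is redundant here
# because padding=True above already padded every encoding to the same length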
training_args = TrainingArguments(
    output_dir='bert_{}'.format(count),
    num_train_epochs=5,
    logging_dir='log_bert_{}'.format(count),
    load_best_model_at_end=True,
    evaluation_strategy="epoch",
    # recent transformers versions require the save strategy to match the
    # evaluation strategy when load_best_model_at_end=True
    save_strategy="epoch")
print("Training begins:\n")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)])
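# EarlyStoppingCallback stops training once the monitored metric (eval_loss by
# default) fails to improve for 3 consecutive evaluations; it requires
# load_best_model_at_end=True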
print(trainer.train())
print('---------------------')
## test data
test_data = pd.read_csv("test_data.csv", sep='\t')
print("Encodings generation.")
test_encodings = tokenizer(test_data['Query'].tolist(), test_data['Segment'].tolist(), truncation=True, padding=True, max_length=max_length)
print("Encodings generated.")
# encode the test labels the same way as the training labels; without labels in
# the dataset, trainer.predict() returns label_ids=None and the
# classification_report call below fails
test_label = 1 - pd.factorize(test_data.Label, sort=True)[0]
test_dataset = Dataset(test_encodings, test_label)
# Make prediction
raw_pred, y_act, _ = trainer.predict(test_dataset)
# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)
print(classification_report(y_act, y_pred))
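For reference, a quick way to see how skewed the predictions are (a minimal sketch using the raw_pred and y_pred arrays above; the softmax is only for readability, argmax over the raw logits gives the same classes):

probs = torch.softmax(torch.from_numpy(raw_pred), dim=1).numpy()
print("predicted class counts:", np.bincount(y_pred, minlength=2))
print("mean probability per class:", probs.mean(axis=0))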