I am trying to adapt this tutorial (Google Colab) from multi-class to binary text classification, and I need advice because I am getting:
--------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Input In [15], in <cell line: 1>()
1 for epoch in range(EPOCHS):
----> 2 train(epoch)
Input In [14], in train(epoch)
12 targets = data['targets'].to(device, dtype = torch.long)
14 outputs = model(ids, mask)
---> 15 loss = loss_function(outputs, targets)
16 tr_loss += loss.item()
17 big_val, big_idx = torch.max(outputs.data, dim=1)
ValueError: Using a target size (torch.Size([4])) that is different to the input size (torch.Size([4, 1])) is deprecated. Please ensure they have the same size.
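I can reproduce the error in isolation with just the shapes, so I believe it comes from the [4, 1] model output against the [4] target batch (a quick check of mine, not tutorial code):

import torch
probs = torch.sigmoid(torch.randn(4, 1))  # what my model emits per batch: [batch, 1]
labels = torch.zeros(4)                   # what my dataset yields: [batch]
torch.nn.BCELoss()(probs, labels)         # raises the same ValueError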
I am using a Kaggle dataset from here: News Category Dataset | Kaggle
I have changed this line from the tutorial, since I now have a single binary output instead of four classes:
self.classifier = torch.nn.Linear(768, 4)
to
self.classifier = torch.nn.Linear(768, 1)
My code is:
# imports and setup
import json
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# load labels
with open('.../News_Category_Dataset_v3.json', 'r') as f:
    jdata = f.read()
jdata2 = [json.loads(line) for line in jdata.split('\n') if line]
df = pd.DataFrame.from_records(jdata2)
df["category"] = np.where(df["category"] == "CRIME", 1, 0)  # binary labels: CRIME = 1, else 0
y = df['category'].astype(int)
X = df['short_description']
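Since only CRIME maps to 1, the labels are very imbalanced (which is also why I ask about recall at the end). I checked with:

print(df['category'].value_counts())  # class counts: 0 vastly outnumbers 1
print(df['category'].mean())          # fraction of positive (CRIME) rows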
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 5
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        title = str(self.data.short_description[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.category[index], dtype=torch.long)
        }

    def __len__(self):
        return self.len
# Creating the dataset and dataloader for the neural network
train_size = 0.8
train_dataset = df.sample(frac=train_size, random_state=200)
test_dataset = df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))
training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)
train_params = {'batch_size': TRAIN_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
test_params = {'batch_size': VALID_BATCH_SIZE, 'shuffle': True, 'num_workers': 0}
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
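As a sanity check (my own debugging, not in the tutorial), one batch looks like this:

batch = next(iter(training_loader))
print(batch['ids'].shape)      # torch.Size([4, 512])
print(batch['targets'].shape)  # torch.Size([4])  <- 1-D, while my model outputs [4, 1]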
# Creating the customized model
class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)  # changed from 4 to 1 for binary

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]           # hidden state at the [CLS] position
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)      # raw logit, shape [batch, 1]
        return output
model = DistillBERTClass()
model.to(device)
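A quick forward pass confirms the output shape (again my own check, not from the tutorial):

with torch.no_grad():
    dummy_ids = torch.ones(2, 16, dtype=torch.long).to(device)
    dummy_mask = torch.ones(2, 16, dtype=torch.long).to(device)
    print(model(dummy_ids, dummy_mask).shape)  # torch.Size([2, 1]) -> raw logits, no sigmoid applied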
# Creating the loss function and optimizer
loss_function = torch.nn.BCELoss()
optimizer = torch.optim.Adam(params = model.parameters(), lr=LEARNING_RATE)
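My suspicion is that BCELoss expects probabilities (after a sigmoid) and float targets of the same shape as the input, unlike the CrossEntropyLoss in the tutorial, which took raw logits and long class indices. Would something like this be the right direction?

loss_function = torch.nn.BCEWithLogitsLoss()  # applies sigmoid internally, so the model can keep returning raw logits
# and in train():
# targets = data['targets'].to(device, dtype=torch.float)
# loss = loss_function(outputs.squeeze(-1), targets)  # [4, 1] -> [4] to match the target shape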
# Function to calculate the accuracy of the model
def calculate_accu(big_idx, targets):
    n_correct = (big_idx == targets).sum().item()
    return n_correct
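I also realise that with a single output column, torch.max(outputs.data, dim=1) will always return index 0, so I assume binary accuracy needs a threshold instead. My guess (the name calculate_accu_binary is mine, not from the tutorial):

def calculate_accu_binary(outputs, targets):
    # threshold the sigmoid of the single logit at 0.5
    preds = (torch.sigmoid(outputs).squeeze(-1) > 0.5).long()
    return (preds == targets).sum().item()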
# Defining the training function on the 80% of the dataset for tuning the distilbert model
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)
        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)  # <- this is the line that raises the ValueError
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calculate_accu(big_idx, targets)
        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)
        if _ % 5000 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")
        optimizer.zero_grad()
        loss.backward()
        # When using GPU
        optimizer.step()
    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct * 100) / nb_tr_examples}')
    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")
    return
for epoch in range(EPOCHS):
    train(epoch)
Can someone help me adapt the above for binary classification so that I avoid the ValueError?
Also, if I wanted to use recall instead of accuracy, how should I define that function (def calculate_recall:)?
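My first attempt, assuming preds and targets are 1-D tensors of 0s and 1s (please correct me if this is wrong):

def calculate_recall(preds, targets):
    # recall = true positives / all actual positives
    true_pos = ((preds == 1) & (targets == 1)).sum().item()
    actual_pos = (targets == 1).sum().item()
    return true_pos / actual_pos if actual_pos > 0 else 0.0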
I was not sure whether it is better to post this in the Beginners or the Models forum. Thank you.