Hi, this is pretty much done and working as expected, though it could do with more training data. Here is the code in case anyone wants to do something similar.
Model Training:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pandas as pd
import re
import string
data = pd.read_csv('/content/data.csv')
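# The CSV is assumed to have four columns: 'question', 'document_text',
# 'answer', and an integer 'label' in {0, 1, 2} (matching num_labels=3 below),
# so a row might look like:
#   question,document_text,answer,label
#   "what is the capital of france","france is a country in europe ...","paris",1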
# Split data into training and validation sets
train_data, val_data = train_test_split(data, test_size=0.3, random_state=42)
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    # Remove leading/trailing whitespace
    text = text.strip()
    # Collapse multiple spaces into a single space
    text = re.sub(r'\s+', ' ', text)
    return text
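# Illustrative example of what the cleanup does:
#   preprocess_text("  What's   the   answer? ")  ->  "whats the answer"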
# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, questions, document_text, answers, labels, tokenizer, max_length):
        self.questions = questions
        self.document_text = document_text
        self.answers = answers
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, index):
        question = preprocess_text(self.questions[index])
        answer = preprocess_text(self.answers[index])
        document_text = preprocess_text(self.document_text[index])
        label = self.labels[index]
        # Pack question, document, and answer into a single input sequence
        input_text = f"{question} {document_text} {answer}"
        encoding = self.tokenizer.encode_plus(
            input_text,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # squeeze() drops the batch dimension added by return_tensors='pt'
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': label
        }
# Set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.backends.cudnn.benchmark = True
# Define hyperparameters
batch_size = 16
max_length = 512
num_epochs = 50
learning_rate = 2e-5
# Load the pre-trained tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Example training data
train_questions = train_data['question'].values
train_document_text = train_data['document_text'].values
train_answers = train_data['answer'].values
train_labels = train_data['label'].values
# Example validation data
val_questions = val_data['question'].values
val_document_text = val_data['document_text'].values
val_answers = val_data['answer'].values
val_labels = val_data['label'].values
# Create the custom dataset and data loader for training
train_dataset = CustomDataset(train_questions, train_document_text, train_answers, train_labels, tokenizer, max_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Create the custom dataset and data loader for validation
val_dataset = CustomDataset(val_questions, val_document_text, val_answers, val_labels, tokenizer, max_length)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
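# Illustrative shapes for one batch from the training loader:
#   batch = next(iter(train_loader))
#   batch['input_ids'].shape       -> torch.Size([16, 512])
#   batch['attention_mask'].shape  -> torch.Size([16, 512])
#   batch['label'].shape           -> torch.Size([16])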
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)
model.to(device)
# Define the optimizer; no separate loss function is needed because the model
# computes cross-entropy loss internally whenever labels are passed to it
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    train_preds = []
    train_targets = []
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs.logits
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_preds.extend(torch.argmax(logits, dim=1).tolist())
        train_targets.extend(labels.tolist())
        print(f"Train Batch: Loss={loss.item()}")
    # Calculate training accuracy
    train_accuracy = accuracy_score(train_targets, train_preds)
    # Validation loop
    model.eval()
    val_loss = 0.0
    val_preds = []
    val_targets = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            val_loss += loss.item()
            val_preds.extend(torch.argmax(logits, dim=1).tolist())
            val_targets.extend(labels.tolist())
            print(f"Validation Batch: Loss={loss.item()}")
    # Calculate validation accuracy
    val_accuracy = accuracy_score(val_targets, val_preds)
    # Print training and validation loss and accuracy
    print(f"Epoch {epoch+1}:")
    print(f"Train Loss: {train_loss / len(train_loader)}")
    print(f"Train Accuracy: {train_accuracy}")
    print(f"Validation Loss: {val_loss / len(val_loader)}")
    print(f"Validation Accuracy: {val_accuracy}")
# Save the trained model
model.save_pretrained('./')
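Before deploying, a quick smoke test of the saved weights can catch export problems early. This is a minimal sketch, assuming it runs in the same directory the model was saved to; the input text is just a placeholder:

import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('./')  # weights saved above
model.eval()

text = "example question example document text example answer"  # placeholder input
encoding = tokenizer(text, truncation=True, padding='max_length',
                     max_length=512, return_tensors='pt')
with torch.no_grad():
    logits = model(**encoding).logits
print(torch.argmax(logits, dim=1).item())  # predicted class id: 0, 1, or 2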
And the Inference Function:
import json
import os
import logging
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
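# AWS Lambda only allows writes under /tmp, so the Hugging Face cache goes there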
os.environ['TRANSFORMERS_CACHE'] = '/tmp/'
logging.getLogger().setLevel(logging.DEBUG)
# Apply the same text cleanup used at training time, so inference inputs
# match what the model saw during fine-tuning
import re
import string

def preprocess_text(text):
    text = text.lower()
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    return text

# Define the CustomDataset class
class CustomDataset(Dataset):
    def __init__(self, questions, document_text, answers, tokenizer, max_length):
        self.questions = questions
        self.document_text = document_text  # a single string shared by every item
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, index):
        question = preprocess_text(self.questions[index])
        answer = preprocess_text(self.answers[index])
        document_text = preprocess_text(self.document_text)
        combined_text = f"{question} {document_text} {answer}"
        encoding = self.tokenizer.encode_plus(
            combined_text,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask
        }
def inference(loader, model, device):
    model.eval()
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            # .item() assumes a batch of exactly one example (batch_size=1 below)
            prediction = torch.argmax(logits, dim=1).item()
    return prediction
def lambda_handler(event, context):
    logging.info(f"event['body'] {event['body']}")
    input_data = json.loads(event['body'])
    logging.info(f"input_data {input_data}")
    question = input_data['question']
    document_text = input_data['document_text']
    answer = input_data['answer']
    # Load the pre-trained tokenizer
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base', cache_dir='/tmp/')
    # Load the fine-tuned model saved by the training script (loading inside the
    # handler is simple but slow; module scope would reuse it across warm starts)
    model = RobertaForSequenceClassification.from_pretrained('./')
    # Set the device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # Create a single-example dataset and loader for this request
    inference_dataset = CustomDataset([question], document_text,
                                      [answer], tokenizer, max_length=512)
    inference_loader = DataLoader(inference_dataset, batch_size=1)
    # Call the inference function
    try:
        prediction = inference(inference_loader, model, device)
        response = {
            'statusCode': 200,
            'headers': {
                'Content-Type': 'application/json',
                'Access-Control-Allow-Origin': '*',
                'Access-Control-Allow-Headers': 'Content-Type',
                'Access-Control-Allow-Methods': 'OPTIONS,POST'
            },
            'body': json.dumps(prediction)
        }
    except Exception as e:
        logging.error(e)
        response = {
            'statusCode': 500,
            'headers': {
                'Content-Type': 'application/json',
                'Access-Control-Allow-Origin': '*',
                'Access-Control-Allow-Headers': 'Content-Type',
                'Access-Control-Allow-Methods': 'OPTIONS,POST'
            },
            'body': json.dumps('Error occurred: ' + str(e))
        }
    return response
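For a quick local test before deploying, the handler can be called with a hand-built event in the shape it already expects (API Gateway proxy style, with a JSON string in event['body']); the values below are placeholders:

import json

event = {
    'body': json.dumps({
        'question': 'example question',
        'document_text': 'example document text',
        'answer': 'example answer'
    })
}
print(lambda_handler(event, None))  # e.g. {'statusCode': 200, ..., 'body': '1'}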
I’m happy to hear suggestions/comments if anyone has any.