Hi All,
I have a working script for fine-tuning a T5 model on a question answering task.
I want to adapt the script to fine-tune a GPT-2 model on the same question answering task with the same dataset. My dataset looks like:
"context" = …
"question" = …
"answer" = …
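Each record in augmented_data.json and validation_set.json has roughly this shape (the values below are placeholders; the real records also carry an "id" field, which the script drops):

[
  {"id": "...", "context": "...", "question": "...", "answer": "..."},
  ...
]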
I am getting an error with the adapted script. Could you please help me resolve it? The error is:
/usr/local/lib/python3.10/dist-packages/torch/nn/functional.py in embedding(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)
2208 # remove once script supports set_grad_enabled
2209     _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2210     return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
2211
2212
RuntimeError: CUDA error: device-side assert triggered
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
The adapted script for fine-tuning GPT-2 is:
import sys
import torch
import pandas as pd
import json
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
from datasets import Dataset
# Global variables
ignore_pad_token_for_loss = True
padding = "max_length"
def preprocess_data(question, context, answer, max_length=512):
    inputs = tokenizer(
        question,
        context,
        max_length=max_length,
        truncation="only_second",
        padding="max_length",
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt"
    )
    input_ids = inputs["input_ids"].squeeze()
    attention_mask = inputs["attention_mask"].squeeze()
    # Tokenize the answer as the target with text_target=...
    labels = tokenizer(text_target=answer, max_length=max_length, padding=padding, truncation=True)
    # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100
    # so that padding is ignored in the loss.
    if padding == "max_length" and ignore_pad_token_for_loss:
        labels["input_ids"] = [(l if l != tokenizer.pad_token_id else -100) for l in labels["input_ids"]]
    inputs["labels"] = labels["input_ids"]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": inputs["labels"]
    }
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Set device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# Load custom training dataset as a pandas DataFrame
with open("augmented_data.json", "r") as f:
    data = json.load(f)
data = pd.DataFrame(data)
data = data.drop(columns=["id"])  # remove the id column

# Convert the pandas DataFrame to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(data)
print(train_dataset.shape)
# Load custom validation dataset
with open("validation_set.json", "r") as f:
    data = json.load(f)
data = pd.DataFrame(data)
data = data.drop(columns=["id"])  # remove the id column

# Convert the pandas DataFrame to Hugging Face Dataset format
val_dataset = Dataset.from_pandas(data)
print(val_dataset.shape)
# Define tokenizer and add a padding token (GPT-2 has no pad token by default)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
# Preprocess datasets
preprocessed_train_dataset = []
preprocessed_val_dataset = []
for example in train_dataset:
    preprocessed_example = preprocess_data(example["question"], example["context"], example["answer"])
    preprocessed_train_dataset.append(preprocessed_example)
for example in val_dataset:
    preprocessed_example = preprocess_data(example["question"], example["context"], example["answer"])
    preprocessed_val_dataset.append(preprocessed_example)

# Convert the preprocessed datasets to Hugging Face Dataset format
tokenized_train_dataset = Dataset.from_dict(
    {key: [example[key] for example in preprocessed_train_dataset] for key in preprocessed_train_dataset[0].keys()})
print(tokenized_train_dataset)
tokenized_val_dataset = Dataset.from_dict(
    {key: [example[key] for example in preprocessed_val_dataset] for key in preprocessed_val_dataset[0].keys()})
print(tokenized_val_dataset)
# Define training arguments
training_args = TrainingArguments(
    output_dir="qa_model",
    evaluation_strategy="epoch",
    learning_rate=15e-6,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=100,
    weight_decay=0.01,
    push_to_hub=False,
    save_strategy="epoch",
)
# Define data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Instantiate the Trainer class
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
)
trainer.train()
trainer.save_model("final_result")
print(" Training is Over ")