How do I fix this? I'm new to transformers and I'm using Google Colab to fine-tune BERT for question answering on my own CSV data.
Please help; I don't know what else to try.
from google.colab import drive
drive.mount('/content/drive')
!pip install transformers
# json (and csv, io, re) are part of the standard library; no pip install needed
!pip install pandas
!pip install torch
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (BertTokenizerFast,
                          BertForQuestionAnswering,
                          TrainingArguments,
                          Trainer)
data = pd.read_csv('tandr_info.csv')
# .copy() so the renames and edits below don't trigger SettingWithCopyWarning
newdata = data[['Customer', 'Question', 'Answer']].copy()
newdata.rename(columns={'Customer': 'context', 'Question': 'question', 'Answer': 'answer'}, inplace=True)
# regex intended to drop strings that mix letters and digits, then lowercase
pattern = r'\b(?!.*[A-Za-z]+.*\d+)(?!.*\d+.*[A-Za-z]+.*).+\b'
for col in ['context', 'question', 'answer']:
    newdata[col] = (newdata[col]
                    .str.findall(pattern)
                    .str.join(' ')
                    .str.strip()
                    .str.lower())
newdata['answer'] = newdata['answer'].astype(str)
dataset = Dataset.from_pandas(newdata)
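# Optional sanity check: tokenize() below locates answers with context.find(),
# which returns -1 when the answer is not a literal substring of its context,
# so it's worth knowing up front how many rows that affects.
missing = (~newdata.apply(lambda row: str(row["answer"]) in str(row["context"]), axis=1)).sum()
print(f"{missing} of {len(newdata)} answers are not substrings of their context")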
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
def tokenize(batch):
    tokenized_batch = tokenizer(batch["question"], batch["context"],
                                max_length=512,
                                padding="max_length",
                                truncation=True,
                                return_offsets_mapping=True,
                                return_token_type_ids=True)
    # record the character positions of each answer inside its context;
    # str.find returns -1 when the answer is not a substring
    answer_starts = []
    answer_ends = []
    for i, context in enumerate(batch["context"]):
        answer_start = context.find(batch["answer"][i])
        answer_end = answer_start + len(batch["answer"][i])
        answer_starts.append(answer_start)
        answer_ends.append(answer_end)
    tokenized_batch["answer_start"] = answer_starts
    tokenized_batch["answer_end"] = answer_ends
    return tokenized_batch
tokenized_dataset = dataset.map(tokenize, batched=True)
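# Quick check that the stored character offsets really slice the answer back
# out of the context (map() keeps the original columns alongside the new ones):
sample = tokenized_dataset[0]
if sample["answer_start"] == -1:
    print("answer for example 0 was not found in its context")
else:
    print(sample["context"][sample["answer_start"]:sample["answer_end"]])  # should equal sample["answer"]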
def prepare_train_features(example):
    # default to the [CLS]/[SEP] token positions, then search the offset
    # mapping for tokens whose character spans line up with the answer
    # (note: offsets of the question and padding tokens also appear in
    # offset_mapping, so a match here can land outside the context)
    start_position = example["input_ids"].index(tokenizer.cls_token_id)
    end_position = example["input_ids"].index(tokenizer.sep_token_id)
    found_start = False
    found_end = False
    for i, (offset_start, offset_end) in enumerate(example["offset_mapping"]):
        if not found_start and offset_start == example["answer_start"]:
            start_position = i
            found_start = True
        if not found_end and offset_end == example["answer_end"]:
            end_position = i
            found_end = True
        if found_start and found_end:
            break
    if not found_start or not found_end:
        # no token boundary matched the character offsets exactly
        start_position = -1
        end_position = -1
    example["start_positions"] = start_position
    example["end_positions"] = end_position
    return example
prepared_dataset = tokenized_dataset.map(prepare_train_features, batched=False)
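# Count how many examples failed the exact character-to-token alignment and
# will be dropped by the filter below; if this is most of the data, very few
# training steps remain (consistent with the run ending after 4 global steps).
num_failed = sum(1 for ex in prepared_dataset if ex["start_positions"] == -1)
print(f"{num_failed} of {len(prepared_dataset)} examples have no aligned answer span")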
def filter_invalid_examples(example):
    return example["start_positions"] != -1 and example["end_positions"] != -1
filtered_dataset = prepared_dataset.filter(filter_invalid_examples, batched=False)
train_indices, eval_indices = train_test_split(list(range(len(filtered_dataset))), test_size=0.2, random_state=42)
train_dataset = filtered_dataset.select(train_indices)
eval_dataset = filtered_dataset.select(eval_indices)
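# With per_device_train_batch_size=32 (set below), steps per epoch is
# ceil(len(train_dataset) / 32); printing the split sizes shows why the
# whole run finishes after only a handful of global steps.
print(len(train_dataset), len(eval_dataset))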
def convert_to_tensors(example):
    example["input_ids"] = torch.tensor(example["input_ids"], dtype=torch.long)
    example["attention_mask"] = torch.tensor(example["attention_mask"], dtype=torch.long)
    return example
train_dataset = train_dataset.map(convert_to_tensors)
eval_dataset = eval_dataset.map(convert_to_tensors)
dataset_dict = DatasetDict({"train": train_dataset, "eval": eval_dataset})
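# Note: map() writes its output back to Arrow storage, so the tensors created
# in convert_to_tensors above don't survive; datasets' set_format is the
# supported way to hand the Trainer torch tensors and hide the extra
# string/offset columns. A possible sketch, using the columns created above:
columns = ["input_ids", "attention_mask", "token_type_ids",
           "start_positions", "end_positions"]
dataset_dict["train"].set_format(type="torch", columns=columns)
dataset_dict["eval"].set_format(type="torch", columns=columns)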
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
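# The "qa_outputs.weight / qa_outputs.bias were newly initialized" warning that
# appears here is expected: the bert-base-uncased checkpoint has no
# question-answering head, so a fresh one is attached and trained during fine-tuning.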
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Colab Notebooks',
    num_train_epochs=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    # logging_dir='/content/drive/MyDrive/Colab Notebooks',
    # logging_steps=10,
    learning_rate=2e-5,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_total_limit=2,
    # fp16=True,
    load_best_model_at_end=True,
    prediction_loss_only=True
)
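# load_best_model_at_end=True needs a checkpoint at each evaluation step;
# save_strategy="steps" defaults to save_steps=500, which matches
# eval_steps=500 above, but it seems safer to state the interval explicitly:
training_args.save_steps = 500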
After this I get:
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Then, when I run training:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["eval"],
)
trainer.train()
Step  Training Loss  Validation Loss

TrainOutput(global_step=4, training_loss=6.499783039093018, metrics={'train_runtime': 9.5548, 'train_samples_per_second': 7.954, 'train_steps_per_second': 0.419, 'total_flos': 19858553511936.0, 'train_loss': 6.499783039093018, 'epoch': 4.0})
How can I fix this? Training finishes after only 4 global steps, the validation-loss column stays empty, and the final training loss is still around 6.5.