You should probably TRAIN this model on a down-stream task with BertForQuestionAnswering

How do I fix this? I am new to transformers and I am using Google Colab.
Please help, I don't know what else to do.

from google.colab import drive
drive.mount('/content/drive')
!pip install transformers[torch]
!pip install datasets
!pip install pandas
!pip install torch
!pip install accelerate -U
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import (BertTokenizerFast,
                          BertForQuestionAnswering,
                          TrainingArguments,
                          Trainer)

data = pd.read_csv('tandr_info.csv')
newdata = data[['Customer', 'Question', 'Answer']].copy()
newdata.rename(columns={'Customer': 'context', 'Question': 'question', 'Answer': 'answer'}, inplace=True)
# Keep only tokens that do not mix letters and digits, then lowercase everything
newdata['answer'] = newdata['answer'].str.findall(r'\b(?!.*[A-Za-z]+.*\d+)(?!.*\d+.*[A-Za-z]+.*).+\b').str.join(' ').str.strip()
newdata['question'] = newdata['question'].str.findall(r'\b(?!.*[A-Za-z]+.*\d+)(?!.*\d+.*[A-Za-z]+.*).+\b').str.join(' ').str.strip()
newdata['context'] = newdata['context'].str.findall(r'\b(?!.*[A-Za-z]+.*\d+)(?!.*\d+.*[A-Za-z]+.*).+\b').str.join(' ').str.strip()
newdata['question'] = newdata['question'].str.lower()
newdata['answer'] = newdata['answer'].str.lower()
newdata['context'] = newdata['context'].str.lower()
newdata['answer'] = newdata['answer'].astype(str)
dataset = Dataset.from_pandas(newdata)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
def tokenize(batch):
  # Tokenize question/context pairs and keep the offset mapping so the
  # character-level answer spans can be aligned with token positions later
  tokenized_batch = tokenizer(batch["question"], batch["context"],
                              max_length=512,
                              padding="max_length",
                              truncation=True,
                              return_offsets_mapping=True,
                              return_token_type_ids=True)
  answer_starts = []
  answer_ends = []

  # Character positions of the answer inside each context (-1 if not found)
  for i, context in enumerate(batch["context"]):
    answer_start = context.find(batch["answer"][i])
    answer_end = answer_start + len(batch["answer"][i])
    answer_starts.append(answer_start)
    answer_ends.append(answer_end)

  tokenized_batch["answer_start"] = answer_starts
  tokenized_batch["answer_end"] = answer_ends
  return tokenized_batch
tokenized_dataset = dataset.map(tokenize, batched=True)
def prepare_train_features(example):
  # Placeholder defaults ([CLS] / first [SEP]); overwritten below if the
  # answer span is found, or set to -1 if it is not
  start_position = example["input_ids"].index(tokenizer.cls_token_id)
  end_position = example["input_ids"].index(tokenizer.sep_token_id)

  found_start = False
  found_end = False

  # Map the character-level answer span to token indices via the offset mapping
  for i, (offset_start, offset_end) in enumerate(example["offset_mapping"]):
    if not found_start and offset_start == example["answer_start"]:
      start_position = i
      found_start = True
    if not found_end and offset_end == example["answer_end"]:
      end_position = i
      found_end = True
    if found_start and found_end:
      break

  # Flag examples whose answer span could not be located so they can be filtered out
  if not found_start or not found_end:
    start_position = -1
    end_position = -1

  example["start_positions"] = start_position
  example["end_positions"] = end_position
  return example
  
prepared_dataset = tokenized_dataset.map(prepare_train_features, batched=False)
def filter_invalid_examples(example):
  # Drop examples where the answer span could not be aligned to tokens
  return example["start_positions"] != -1 and example["end_positions"] != -1

filtered_dataset = prepared_dataset.filter(filter_invalid_examples, batched=False)
train_indices, eval_indices = train_test_split(list(range(len(filtered_dataset))), test_size=0.2, random_state=42)
train_dataset = filtered_dataset.select(train_indices)
eval_dataset = filtered_dataset.select(eval_indices)


def convert_to_tensors(example):
  # Note: datasets stores mapped values back as lists, so this step is effectively
  # a no-op; the Trainer's default data collator builds the tensors at training time
  example["input_ids"] = torch.tensor(example["input_ids"], dtype=torch.long)
  example["attention_mask"] = torch.tensor(example["attention_mask"], dtype=torch.long)
  return example
train_dataset = train_dataset.map(convert_to_tensors)
eval_dataset = eval_dataset.map(convert_to_tensors)

dataset_dict = DatasetDict({"train": train_dataset, "eval": eval_dataset})
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

training_args = TrainingArguments(
    output_dir = '/content/drive/MyDrive/Colab Notebooks',
    num_train_epochs=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01, 
    #logging_dir='/content/drive/MyDrive/Colab Notebooks',
    #logging_steps=10,
    learning_rate=2e-5,
    evaluation_strategy ="steps",
    eval_steps=500,
    save_strategy = "steps",
    save_total_limit = 2, 
    #fp16=True,
    load_best_model_at_end=True,
    prediction_loss_only = True
)

After this I get:
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

When I run training,

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["eval"],
)

trainer.train()

Step Training Loss Validation Loss

TrainOutput(global_step=4, training_loss=6.499783039093018, metrics={'train_runtime': 9.5548, 'train_samples_per_second': 7.954, 'train_steps_per_second': 0.419, 'total_flos': 19858553511936.0, 'train_loss': 6.499783039093018, 'epoch': 4.0})

How can I fix this?

The warning on training is not an issue. It's telling you that you need to train the model, which you are doing, so you can ignore it. If you get the same message after you have trained your model, then there may be an issue.
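
As a quick check (the checkpoint path below is just a placeholder), saving after trainer.train() and reloading from your own directory should not reproduce the warning, because the qa_outputs weights are then part of the saved checkpoint:

# Save the fine-tuned model and tokenizer (the path is just an example)
trainer.save_model("/content/drive/MyDrive/qa_checkpoint")
tokenizer.save_pretrained("/content/drive/MyDrive/qa_checkpoint")

# Reloading from your own checkpoint should not print the
# "newly initialized" warning, since qa_outputs.* now exist on disk
model = BertForQuestionAnswering.from_pretrained("/content/drive/MyDrive/qa_checkpoint")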

What other issue are you seeing?

The train_loss of 6.499783039093018 is what I need to improve. My model is not working.

How big is your dataset?
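
Your TrainOutput shows global_step=4 over 4 epochs with a batch size of 32, which implies at most around 32 training examples. For example (reusing the variable names from your script), a quick size check at each stage shows how many rows survive the span-matching filter:

print("raw rows:          ", len(dataset))
print("after tokenization:", len(tokenized_dataset))
print("after filtering:   ", len(filtered_dataset))
print("train / eval:      ", len(train_dataset), len(eval_dataset))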

Have you tested the data preparation steps to make sure they are working as you want them to?
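
For instance, something like this (again reusing your variable names, and assuming the original answer column is still present on filtered_dataset) decodes the labelled token span back to text so you can compare it with the original answer:

sample = filtered_dataset[0]
start, end = sample["start_positions"], sample["end_positions"]
# Decode the tokens the model will be trained to point at
print("decoded span:   ", tokenizer.decode(sample["input_ids"][start:end + 1]))
print("original answer:", sample["answer"])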

How does the model converge over each epoch?
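
As a minimal sketch (a small tweak to the TrainingArguments you already have, not a definitive setup), switching the logging and evaluation cadence to per-epoch makes the loss curve visible for a dataset this small, and trainer.state.log_history keeps the numbers after the run:

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/Colab Notebooks',
    num_train_epochs=4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    logging_strategy="epoch",     # log training loss once per epoch
    evaluation_strategy="epoch",  # evaluate once per epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["eval"],
)
trainer.train()

# Per-epoch training and eval losses recorded during the run
for entry in trainer.state.log_history:
    print(entry)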