How to teach a gpt-2 for Q&A?

How to properly train a gpt-2 (1.5B) model for Question Answering using Transformers lib?
Is it possible with gtx 1080 (8gb)?
Because when i tried training 117m model (for the test) on a SQuAD Q&A dataset (part of it, just a 39000 samples from 87000), on the question “who are you?” it answered “What theHowWho was isInWhenThe of did,WhichWhere what many to a are andWhy in does wereOn. yearA withAlong do’s-During type hasForAtAfterAccording much canFromBy whichWhTo” (thats the exact copy of the gpt-2 answer after training almost on a half of the SQuAD dataset samples). Here is my code for training gpt-2 117m model:

from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel, TrainingArguments, Trainer
import torch
import time
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset

class MyTrainer(Trainer):
    def __init__(self, model, args, train_dataset, eval_dataset, data_collator, tokenizer):
        super().__init__(model=model, args=args, train_dataset=train_dataset, eval_dataset=eval_dataset, data_collator=data_collator, tokenizer=tokenizer)
    
    def compute_loss(self, model, inputs, return_outputs=False):
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        start_positions = inputs["start_positions"]
        end_positions = inputs["end_positions"]
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs[0]
        loss = torch.mean(loss)
        return (loss, outputs) if return_outputs else loss

    def compute_metrics(self, pred):
        start_pred, end_pred = pred
        start_positions = pred.label_ids[:, 0]
        end_positions = pred.label_ids[:, 1]
        start_correct = start_positions.eq(start_pred).sum().item()
        end_correct = end_positions.eq(end_pred).sum().item()
        total = start_positions.numel()
        return {
            "start_acc": start_correct / total,
            "end_acc": end_correct / total,
            "combined_acc": (start_correct + end_correct) / (2 * total),
        }

    
def get_token_positions(context, answer, encoded_dict):
    input_ids = encoded_dict['input_ids'][0].tolist()
    tokens = tokenizer.decode(input_ids).strip()

    start_idx = tokens.find(answer.strip())
    end_idx = start_idx + len(answer.strip())

    return start_idx, end_idx

time_fst = time.time()

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
dataset = load_dataset('squad')
model = GPT2DoubleHeadsModel.from_pretrained('gpt2').to('cuda')

print("time of initialisation of the model gpt-2: ")
print(time.time() - time_fst)

def preprocess_qa_dataset(dataset, tokenizer, split_name, num):
    time_f = time.time()

    input_ids = []
    attention_masks = []
    start_positions = []
    end_positions = []
    time_s = time.time()

    for example in dataset[split_name].select(range(num)):
        question = example['question']
        context = example['context']
        answer = example['answers']['text']
        answer = answer[0]
        encoded_dict = tokenizer.encode_plus(question, context, add_special_tokens=True, max_length=1024, padding='max_length', return_attention_mask = True, return_tensors='pt')
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        start_idx, end_idx = get_token_positions(context, answer, encoded_dict)
        start_positions.append(start_idx)
        end_positions.append(end_idx)


    print("for loop in preprocess_qa_dataset function time: ")
    print((time.time() - time_s))

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    start_positions = torch.tensor(start_positions)
    end_positions = torch.tensor(end_positions)

    dataset = TensorDataset(input_ids, attention_masks, start_positions, end_positions)

    print("time of the preprocess_qa_dataset function: ")
    print((time.time() - time_f))

    return dataset

train_dataset = preprocess_qa_dataset(dataset, tokenizer, split_name='train', num=75000)

eval_dataset = preprocess_qa_dataset(dataset, tokenizer, split_name='validation', num=10000)

train_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    learning_rate=2e-5,
    do_train=True,
    per_device_train_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    push_to_hub=False,
    logging_dir="./logs",
    logging_steps=100,
    eval_steps=500,
    label_names=["start_positions", "end_positions"],
    save_strategy="steps",
    load_best_model_at_end=True,
    dataloader_pin_memory=False
)

trainer = MyTrainer(
    model=model,
    args=train_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=lambda data: {'input_ids': torch.stack([item[0] for item in data]).to('cuda'),
                                'attention_mask': torch.stack([item[1] for item in data]).to('cuda'),
                                'start_positions': torch.stack([item[2] for item in data]).to('cuda'),
                                'end_positions': torch.stack([item[3] for item in data]).to('cuda')},
    tokenizer=tokenizer
)

time_t = time.time()

model.resize_token_embeddings(len(tokenizer))

trainer.train()

print("time of the \"trainer.train()\": ")
print(time_t)

and here is the code which i use for generating responds on gpt-2:

from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
import torch
import time
import sys

tokenizer = GPT2Tokenizer.from_pretrained('path_to_trained_model')
model = GPT2DoubleHeadsModel.from_pretrained('path_to_trained_model').to('cuda')

while(True):
    input_text = input("Your request here: ")
    if input_text == 'exit' or input_text == 'quit':
        sys.exit(0)
    time_start = time.time()
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to('cuda')
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device='cuda')
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=100, do_sample=False, no_repeat_ngram_size=2, early_stopping=True)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(output_text)
    print(time.time() - time_start)

I am only a beginner in AI, machine learning, etc
And i want to do a project for which i need trained (for my own purposes) AI