Hi,
Thanks for the reply. Here is my code:
import torch
import argparse
import os
import sys
import numpy as np
import torch.nn.functional as F
sys.path.append('..')
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments
from data_reader import GetDataAsPython
from sklearn.model_selection import train_test_split
from prepare_data import create_data, create_dataset
from transformers import T5Tokenizer
parser = argparse.ArgumentParser()
parser.add_argument('-e', '--epochs', type=int, default=100)
parser.add_argument('-bs', '--batch-size', type=int, default=1)
parser.add_argument('-lr', '--learning-rate', type=float, default=1e-4)
parser.add_argument('-gcv', '--gradient-clip-val', type=float, default=0.0)
parser.add_argument('-wd', '--weight-decay', type=float, default=0.01)
args = parser.parse_args()
# delete the previous logs and results directories
model_name = "t5"
os.system("rm -rf ./logs_" + model_name)
os.system("rm -rf ./results_" + model_name)
data = GetDataAsPython('../data_large.json')
train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels = create_data(data, ['no-array-constructor'])
tokenizer = T5Tokenizer.from_pretrained('t5-small')
print('tokenizer size before adding tokens: ', len(tokenizer))
tokenizer.add_tokens(['{', '}', '<', '>'])
print('tokenizer size after adding tokens: ', len(tokenizer))
train_dataset = create_dataset(train_inputs, train_labels, tokenizer, True)
val_dataset = create_dataset(val_inputs, val_labels, tokenizer, False)
test_dataset = create_dataset(test_inputs, test_labels, tokenizer, False)
training_args = TrainingArguments(
    output_dir='./results_' + model_name,
    num_train_epochs=args.epochs,
    per_device_train_batch_size=args.batch_size,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=args.weight_decay,
    logging_dir='./logs_' + model_name,
    logging_steps=10,
    do_eval=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',  # newer transformers versions require this to match evaluation_strategy when load_best_model_at_end is set
    learning_rate=args.learning_rate,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    # prediction_loss_only=True
)
model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True)
model.resize_token_embeddings(len(tokenizer))  # account for the tokens added above
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # optimizers expects an (optimizer, lr_scheduler) tuple; leaving the
    # scheduler as None lets the Trainer create its default one
    optimizers=(torch.optim.Adam(params=model.parameters(), lr=args.learning_rate), None),
    tokenizer=tokenizer,
    compute_metrics=compute_val_metrics  # defined elsewhere; a minimal stand-in is sketched below
)
trainer.train()
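I haven't included compute_val_metrics above; it just needs to be any function the Trainer can call with an EvalPrediction. If you want to run the script end to end, a minimal stand-in would be something like this (a sketch only; exact_match here is an illustrative metric, not my actual implementation):

import numpy as np

def compute_val_metrics(eval_pred):
    # Trainer passes an EvalPrediction holding the model logits and label ids
    predictions = eval_pred.predictions
    labels = eval_pred.label_ids
    if isinstance(predictions, tuple):  # some models return extra outputs
        predictions = predictions[0]
    # greedy token predictions from the logits
    predictions = np.argmax(predictions, axis=-1)
    # fraction of sequences reproduced token-for-token
    exact_match = (predictions == labels).all(axis=-1).mean()
    return {'exact_match': float(exact_match)}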
And here is the second file, where I create the dataset:
from sklearn.model_selection import train_test_split
import torch
def filter_rule(data, rule_type):
    # keep only the data points whose linter rule matches rule_type
    filtered_data = []
    for point in data:
        if point.linter_report.rule_id == rule_type:
            filtered_data.append(point)
    return filtered_data
def split_filtered(filtered_data, seed=11):
    inputs = [data_point.GetT5Representation()[0] for data_point in filtered_data]
    outputs = [data_point.GetT5Representation()[1] for data_point in filtered_data]
    # hold out 10% for test (or a single sample if the set is tiny)
    test_size = 0.1 if len(inputs) >= 10 else 1 / len(inputs)
    train_inputs, test_inputs, train_labels, test_labels = train_test_split(inputs, outputs, shuffle=True, random_state=seed, test_size=test_size)
    # carve a validation split out of what remains
    val_size = 0.1 if len(train_inputs) >= 10 else 1 / len(train_inputs)
    train_inputs, val_inputs, train_labels, val_labels = train_test_split(train_inputs, train_labels, shuffle=True, random_state=seed, test_size=val_size)
    return train_inputs, train_labels, val_inputs, val_labels, test_inputs, test_labels
def create_data(data, linter_warnings: list):
    train, train_labels = [], []
    val, val_labels = [], []
    test, test_labels = [], []
    for warning in linter_warnings:
        filtered_data = filter_rule(data, warning)
        train_w, train_w_labels, val_w, val_w_labels, test_w, test_w_labels = split_filtered(filtered_data)
        train += train_w
        train_labels += train_w_labels
        val += val_w
        val_labels += val_w_labels
        test += test_w
        test_labels += test_w_labels
    print('train size: ', len(train))
    print('val size: ', len(val))
    print('test size: ', len(test))
    return train, train_labels, val, val_labels, test, test_labels
class BugFixDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, targets):
        self.encodings = encodings
        self.target_encodings = targets

    def __getitem__(self, index):
        # pair the tokenized input with its target ids under the 'labels' key
        item = {key: torch.tensor(val[index]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.target_encodings['input_ids'][index], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])
def create_dataset(inputs, labels, tokenizer, training):
    # pad and truncate only at evaluation time; training samples stay unpadded
    pad = not training
    input_encodings = tokenizer(inputs, truncation=pad, padding=pad)
    label_encodings = tokenizer(labels, truncation=pad, padding=pad)
    return BugFixDataset(input_encodings, label_encodings)
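One caveat I'm aware of: with padding disabled for the training set, the default collator can only stack sequences of identical length, so this setup effectively relies on the batch size of 1. If I raised the batch size, the batches would need dynamic padding, roughly along these lines (a sketch using transformers' DataCollatorForSeq2Seq; it assumes __getitem__ hands back the raw id lists rather than pre-built tensors, so the collator can do the padding):

from transformers import DataCollatorForSeq2Seq

# pads every batch on the fly to its longest sequence; labels are padded
# with -100 so the padded positions are ignored by the loss
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)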