Hi,
I’m new here. I’m trying to apply transfer learning (not a complete fine-tunning) to a CodeGenModel (facebook research) with the codeparrot/apps database.
Could someone please take a look at my code? I think that I did something wrong, but I cannot understand exactly what. There’s definitly something wrong with the train code part.
Also, do you guys have a book or course recommendation to learn huggingface & pytorch? I’m really loving this huggingface community but I’m finding very challeging to “complete” this project.
Finally, I don’t understand how to implement a forward pass that works with this CodeGenModel. I would like to add layers to the top and run some kind of classification.
Here’s the code:
!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets
import pandas as pd
import transformers
from transformers import AutoTokenizer, CodeGenModel, CodeGenForCausalLM
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
checkpoint = "Salesforce/codegen-350M-mono"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = CodeGenForCausalLM.from_pretrained(checkpoint)
model.save_pretrained('./model')
torch.save(model.state_dict(), 'codegenModel.pth')
train_dataset = load_dataset("codeparrot/apps", "all", split="train")
valid_dataset = load_dataset("codeparrot/apps", "all", split="test")
train_data = train_dataset.remove_columns(['problem_id', 'starter_code', 'url'])
valid_data = valid_dataset.remove_columns(['problem_id', 'starter_code', 'url'])
train_data = train_dataset.rename_column('solutions', 'labels')
valid_data = valid_dataset.rename_column('solutions', 'labels')
if tokenizer.pad_token is None:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# process the examples in input and target text format and the eos token at the end
def add_eos_to_examples(example):
example['input_text'] = example['question']
example['target_text'] = example['labels']
return example
# tokenize the examples
def convert_to_features(example_batch):
input_encodings = tokenizer.batch_encode_plus(
example_batch['input_text'],
truncation=True,
padding='max_length',
add_special_tokens=True,
return_attention_mask=True,
return_token_type_ids=True,
max_length=1024,
)
target_encodings = tokenizer.batch_encode_plus(
example_batch['target_text'],
truncation=True,
padding='max_length',
add_special_tokens=True,
return_attention_mask=True,
return_token_type_ids=True,
max_length=1024,
)
encodings = {
'input_ids': input_encodings['input_ids'],
'attention_mask': input_encodings['attention_mask'],
'target_ids': target_encodings['input_ids'],
'token_type_ids': input_encodings['token_type_ids'],
'target_attention_mask': target_encodings['attention_mask']
}
return encodings
train_data = train_data.map(add_eos_to_examples)
train_data = train_data.map(convert_to_features, batched=True)
valid_data = valid_data.map(add_eos_to_examples)
valid_data = valid_data.map(convert_to_features, batched=True)
columns = ['input_ids', 'attention_mask', 'token_type_ids', 'target_attention_mask']
train_data.set_format(type='torch', columns=columns)
valid_data.set_format(type='torch', columns=columns)
torch.save(train_data, 'train_data.pt')
torch.save(valid_data, 'valid_data.pt')
for param in model.parameters():
param.requires_grad = False
import numpy as np
from datasets import load_metric
metric = load_metric("accuracy")
def compute_metrics(eval_pred):
logits, labels = eval_pred
predictions = np.argmax(logits, axis=-1)
return metric.compute(predictions=predictions, references=labels)
train_dataloader = DataLoader(train_data, batch_size=8)
eval_dataloader = DataLoader(valid_data, batch_size=8)
from torch.optim import AdamW
from transformers import get_scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()
num_epochs = 500
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)
from tqdm.auto import tqdm
progress_bar = tqdm(range(num_training_steps))
Train code:
model.train()
for epoch in range(num_epochs):
for batch in tqdm(train_dataloader):
max_length=1024,
outs = model.generate(input_ids=batch['input_ids'],
attention_mask=batch['attention_mask'],
max_length=1024)
outs = [tokenizer.decode(ids) for ids in outs]
optimizer.step(),
lr_scheduler.step(),
optimizer.zero_grad(),
progress_bar.update(1)