CodeGen Model - Transfer Learning, Train and Eval (codeparrot/apps database)

Hi,

I’m new here. I’m trying to apply transfer learning (not a complete fine-tunning) to a CodeGenModel (facebook research) with the codeparrot/apps database.

Could someone please take a look at my code? I think that I did something wrong, but I cannot understand exactly what. There’s definitly something wrong with the train code part.

Also, do you guys have a book or course recommendation to learn huggingface & pytorch? I’m really loving this huggingface community but I’m finding very challeging to “complete” this project.

Finally, I don’t understand how to implement a forward pass that works with this CodeGenModel. I would like to add layers to the top and run some kind of classification.

Here’s the code:

!pip install git+https://github.com/huggingface/transformers.git
!pip install datasets

import pandas as pd

import transformers
from transformers import AutoTokenizer, CodeGenModel, CodeGenForCausalLM
from datasets import load_dataset

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim

checkpoint = "Salesforce/codegen-350M-mono"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = CodeGenForCausalLM.from_pretrained(checkpoint)

model.save_pretrained('./model')
torch.save(model.state_dict(), 'codegenModel.pth')

train_dataset = load_dataset("codeparrot/apps", "all", split="train")
valid_dataset = load_dataset("codeparrot/apps", "all", split="test")


train_data = train_dataset.remove_columns(['problem_id', 'starter_code', 'url'])
valid_data = valid_dataset.remove_columns(['problem_id', 'starter_code', 'url'])

train_data = train_dataset.rename_column('solutions', 'labels')
valid_data = valid_dataset.rename_column('solutions', 'labels')


if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# process the examples in input and target text format and the eos token at the end 
def add_eos_to_examples(example):
    example['input_text'] = example['question']
    example['target_text'] = example['labels']
    return example

# tokenize the examples
def convert_to_features(example_batch):
    input_encodings = tokenizer.batch_encode_plus(
        example_batch['input_text'], 
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_attention_mask=True, 
        return_token_type_ids=True,
        max_length=1024,
        )
    
    target_encodings = tokenizer.batch_encode_plus(
        example_batch['target_text'], 
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_attention_mask=True, 
        return_token_type_ids=True,
        max_length=1024,
        )

    encodings = {
        'input_ids': input_encodings['input_ids'], 
        'attention_mask': input_encodings['attention_mask'],
        'target_ids': target_encodings['input_ids'],
        'token_type_ids': input_encodings['token_type_ids'],
        'target_attention_mask': target_encodings['attention_mask']
    }
    return encodings

train_data = train_data.map(add_eos_to_examples)
train_data = train_data.map(convert_to_features, batched=True)

valid_data = valid_data.map(add_eos_to_examples)
valid_data = valid_data.map(convert_to_features, batched=True)

columns = ['input_ids', 'attention_mask', 'token_type_ids', 'target_attention_mask']
train_data.set_format(type='torch', columns=columns)
valid_data.set_format(type='torch', columns=columns)

torch.save(train_data, 'train_data.pt')
torch.save(valid_data, 'valid_data.pt')

for param in model.parameters():
    param.requires_grad = False

import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

train_dataloader = DataLoader(train_data, batch_size=8)
eval_dataloader = DataLoader(valid_data, batch_size=8)

from torch.optim import AdamW
from transformers import get_scheduler


optimizer = AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

num_epochs = 500
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

Train code:

model.train()
for epoch in range(num_epochs):
    for batch in tqdm(train_dataloader):
        max_length=1024,
        outs = model.generate(input_ids=batch['input_ids'],         
                              attention_mask=batch['attention_mask'],
                              max_length=1024)
        outs = [tokenizer.decode(ids) for ids in outs]
        optimizer.step(),
        lr_scheduler.step(),
        optimizer.zero_grad(),
        progress_bar.update(1)