Hello!
I'm really new to training AI models or even programming in Python in general, so I've read and watched a lot of material over the past couple of days.
The goal is to train a T5-base model to learn the Solidity programming language (-> understand smart contracts).
For those who may not know what this is about: smart contracts are small programs that run on a blockchain such as Ethereum and automate certain processes, for example transferring money, and a lot more.
Anyone can deploy these smart contracts, and they can be really vulnerable to exploits.
So I'm trying to develop an AI that understands these smart contracts, which are written in Solidity.
Later on I want the model to understand and find known vulnerabilities in these smart contracts - but that's a "later" problem. For now I'd be happy to have a T5 model that simply understands Solidity.
I have downloaded thousands of Solidity programs that I can use to train the model.
I am not sure, though, whether my approach even makes sense at all: simply feeding the model some smart contracts without labeling them myself...
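One thing I came across while reading (I have not tried this, so please correct me if I misunderstood it): T5 is normally pre-trained with a span-corruption objective, so instead of the line-pair idea in my code below, the unlabeled contracts could probably be turned into denoising pairs, roughly like this:

import random

def make_denoising_pair(code_text, span_len=10):
    # replace one random span of whitespace-split tokens with a T5 sentinel token;
    # the target then reproduces that span between <extra_id_0> and <extra_id_1>
    tokens = code_text.split()
    if len(tokens) <= span_len:
        return None
    start = random.randrange(len(tokens) - span_len)
    source = tokens[:start] + ["<extra_id_0>"] + tokens[start + span_len:]
    target = ["<extra_id_0>"] + tokens[start:start + span_len] + ["<extra_id_1>"]
    return {"source": " ".join(source), "target": " ".join(target)}

For now, though, this is what I went with: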
Code:
# Step 1 - preprocess the dataset:
import os
import json
import random
def read_solidity_files(folder_path):
    # Walk the folder recursively and collect the raw source of every .sol file
    solidity_files = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".sol"):
                file_path = os.path.join(root, file)
                # errors="ignore" skips the odd file that isn't valid UTF-8
                with open(file_path, "r", encoding="utf-8", errors="ignore") as infile:
                    solidity_files.append(infile.read())
    return solidity_files
def preprocess_dataset(solidity_files):
    # Build (source, target) pairs where the target is simply the next line of code
    source_target_pairs = []
    for contract in solidity_files:
        lines = contract.split("\n")
        for i in range(len(lines) - 1):
            source = lines[i].strip()
            target = lines[i + 1].strip()
            if source and target:
                source_target_pairs.append({"source": source, "target": target})
    return source_target_pairs
def split_dataset(source_target_pairs, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    # Shuffle, then split 80/10/10; the test set is whatever remains after train and val
    total_len = len(source_target_pairs)
    train_len = int(train_ratio * total_len)
    val_len = int(val_ratio * total_len)
    random.shuffle(source_target_pairs)
    train_set = source_target_pairs[:train_len]
    val_set = source_target_pairs[train_len:train_len + val_len]
    test_set = source_target_pairs[train_len + val_len:]
    return train_set, val_set, test_set
folder_path = "smart-contracts/mainnet/00/"
solidity_files = read_solidity_files(folder_path)
source_target_pairs = preprocess_dataset(solidity_files)
train_set, val_set, test_set = split_dataset(source_target_pairs)
with open("train.json", "w") as train_file:
json.dump(train_set, train_file)
with open("val.json", "w") as val_file:
json.dump(val_set, val_file)
with open("test.json", "w") as test_file:
json.dump(test_set, test_file)
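# Quick sanity check I added: reload train.json and look at one pair, just to
# convince myself the line-pair preprocessing produced what I expect.
with open("train.json", "r") as check_file:
    check_pairs = json.load(check_file)
print(f"{len(check_pairs)} training pairs written")
print(check_pairs[0])  # e.g. something like {'source': 'pragma solidity ^0.8.0;', 'target': 'contract Foo {'}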
# Step 2 - tokenize dataset
from transformers import T5Tokenizer
from datasets import load_dataset
import os
tokenizer = T5Tokenizer.from_pretrained("t5-base")
# all solidity keywords
tokenizer.add_tokens([
"pragma", "solidity", "contract", "function", "public", "private", "view", "returns", "require", "assert", "modifier", "event", "emit", "mapping", "address", "payable", "constant", "memory", "calldata", "storage",
"internal", "external", "pure", "constructor", "fallback", "receive", "if", "else", "while", "for", "break", "continue", "return", "import", "using", "assembly", "struct", "enum", "library", "new", "delete", "bytes",
"int", "uint", "bool", "string", "interface", "this", "super", "selfdestruct", "transfer", "send", "call", "delegatecall", "staticcall", "keccak256", "sha256", "sha3", "ripemd160", "ecrecover", "addmod", "mulmod",
"is", "type", "block", "gasleft", "msg", "now", "tx", "wei", "finney", "szabo", "ether", "balance", "gas", "value", "data", "sender", "origin", "gasprice", "timestamp", "number", "difficulty", "gaslimit", "abstract",
"override", "virtual", "try", "catch", "revert", "Error", "Panic", "extends"
])
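# Just to see how a typical Solidity line is split now that the keywords above are
# single tokens (purely illustrative - the exact pieces depend on the SentencePiece vocab):
example_line = "function transfer(address to, uint amount) public returns (bool)"
print(tokenizer.tokenize(example_line))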
def tokenize_function(examples, tokenizer):
    # Tokenize source and target to a fixed length of 512; the target ids become the labels
    src_tokenized = tokenizer(examples['source'], truncation=True, padding='max_length', max_length=512)
    tgt_tokenized = tokenizer(examples['target'], truncation=True, padding='max_length', max_length=512)
    return {
        'input_ids': src_tokenized['input_ids'],
        'attention_mask': src_tokenized['attention_mask'],
        'labels': tgt_tokenized['input_ids']
    }
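# One refinement I read about but have not tried yet: replace the padding token ids
# in the labels with -100 so the loss ignores padded positions. A sketch of how
# tokenize_function could do that instead:
def tokenize_function_masked(examples, tokenizer):
    src_tokenized = tokenizer(examples['source'], truncation=True, padding='max_length', max_length=512)
    tgt_tokenized = tokenizer(examples['target'], truncation=True, padding='max_length', max_length=512)
    labels = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in label_seq]
        for label_seq in tgt_tokenized['input_ids']
    ]
    return {
        'input_ids': src_tokenized['input_ids'],
        'attention_mask': src_tokenized['attention_mask'],
        'labels': labels
    }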
train_data = load_dataset("json", data_files="train.json")["train"]
val_data = load_dataset("json", data_files="val.json")["train"]
test_data = load_dataset("json", data_files="test.json")["train"]
train_data = train_data.map(tokenize_function, fn_kwargs={'tokenizer': tokenizer}, batched=True)
val_data = val_data.map(tokenize_function, fn_kwargs={'tokenizer': tokenizer}, batched=True)
test_data = test_data.map(tokenize_function, fn_kwargs={'tokenizer': tokenizer}, batched=True)
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# Save tokenized datasets to disk
output_dir = "tokenized_datasets"
os.makedirs(output_dir, exist_ok=True)
train_data.save_to_disk(os.path.join(output_dir, "train_data"))
val_data.save_to_disk(os.path.join(output_dir, "val_data"))
test_data.save_to_disk(os.path.join(output_dir, "test_data"))
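# Quick check I added: reload one saved split to confirm it still has the expected
# columns and the 512-token sequence length before moving on to training.
from datasets import load_from_disk
reloaded = load_from_disk(os.path.join(output_dir, "train_data"))
print(reloaded)
print(len(reloaded[0]['input_ids']))  # expecting 512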
# Step 3 - train the model
import os
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_from_disk
# Load the T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("t5-base")
tokenizer.add_tokens([
"pragma", "solidity", "contract", "function", "public", "private", "view", "returns", "require", "assert", "modifier", "event", "emit", "mapping", "address", "payable", "constant", "memory", "calldata", "storage",
"internal", "external", "pure", "constructor", "fallback", "receive", "if", "else", "while", "for", "break", "continue", "return", "import", "using", "assembly", "struct", "enum", "library", "new", "delete", "bytes",
"int", "uint", "bool", "string", "interface", "this", "super", "selfdestruct", "transfer", "send", "call", "delegatecall", "staticcall", "keccak256", "sha256", "sha3", "ripemd160", "ecrecover", "addmod", "mulmod",
"is", "type", "block", "gasleft", "msg", "now", "tx", "wei", "finney", "szabo", "ether", "balance", "gas", "value", "data", "sender", "origin", "gasprice", "timestamp", "number", "difficulty", "gaslimit", "abstract",
"override", "virtual", "try", "catch", "revert", "Error", "Panic", "extends"
])
model.resize_token_embeddings(len(tokenizer))
# Load the tokenized datasets from disk
train_data = load_from_disk("tokenized_datasets/train_data")
val_data = load_from_disk("tokenized_datasets/val_data")
test_data = load_from_disk("tokenized_datasets/test_data")
# Set the data format for PyTorch
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# Define the training arguments
training_args = TrainingArguments(
    output_dir="t5_solidity_output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    logging_dir="t5_solidity_logs",
    learning_rate=3e-5,
    weight_decay=0.01,
    save_total_limit=3,
)
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
)
# Train the model
trainer.train()
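# My plan for a quick sanity check once training finishes: feed the trained model a
# single Solidity line and see what it generates as the "next line". No idea yet how
# well this will work, it is just an experiment.
model.eval()
sample = "function withdraw(uint amount) public {"
inputs = tokenizer(sample, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_length=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))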