Hello!
I'm really new to training AI models or even programming in Python in general, so I've read and watched a lot of material over the past couple of days.
The goal is to train a T5-base model to learn the Solidity programming language (-> understand smart contracts).
For those who may not know what this is about: smart contracts are small programs that run on a blockchain such as Ethereum and automate certain processes, for example transferring money, and a lot more.
Anyone can deploy these smart contracts, and they can be really vulnerable to exploits.
So I'm trying to develop an AI that understands these smart contracts, which are written in Solidity.
Later on I want the model to understand and find known vulnerabilities in these smart contracts - but that's a "later" problem. For now I'd be happy to have a T5 model that simply understands Solidity.
I have downloaded thousands of Solidity programs that I can use to train the model.
I am not sure, though, whether my approach even makes sense at all: simply feeding the model some smart contracts without labeling them myself...
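One thing I came across while reading (I have not tried this, so please correct me if I misunderstood it): T5 is normally pre-trained with a span-corruption objective, so instead of the line-pair idea in my code below, the unlabeled contracts could probably be turned into denoising pairs, roughly like this:

import random

def make_denoising_pair(code_text, span_len=10):
    # replace one random span of whitespace-split tokens with a T5 sentinel token;
    # the target then reproduces that span between <extra_id_0> and <extra_id_1>
    tokens = code_text.split()
    if len(tokens) <= span_len:
        return None
    start = random.randrange(len(tokens) - span_len)
    source = tokens[:start] + ["<extra_id_0>"] + tokens[start + span_len:]
    target = ["<extra_id_0>"] + tokens[start:start + span_len] + ["<extra_id_1>"]
    return {"source": " ".join(source), "target": " ".join(target)}

For now, though, this is what I went with: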
Code:
# Step 1 - preprocess the dataset:
import os
import json
import random
def read_solidity_files(folder_path):
    # Walk the folder recursively and collect the raw source of every .sol file
    solidity_files = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".sol"):
                file_path = os.path.join(root, file)
                # errors="ignore" skips the odd file that isn't valid UTF-8
                with open(file_path, "r", encoding="utf-8", errors="ignore") as infile:
                    solidity_files.append(infile.read())
    return solidity_files
def preprocess_dataset(solidity_files):
    # Build (source, target) pairs where the target is simply the next line of code
    source_target_pairs = []
    for contract in solidity_files:
        lines = contract.split("\n")
        for i in range(len(lines) - 1):
            source = lines[i].strip()
            target = lines[i + 1].strip()
            if source and target:
                source_target_pairs.append({"source": source, "target": target})
    return source_target_pairs
def split_dataset(source_target_pairs, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    # Shuffle, then split 80/10/10; the test set is whatever remains after train and val
    total_len = len(source_target_pairs)
    train_len = int(train_ratio * total_len)
    val_len = int(val_ratio * total_len)
    random.shuffle(source_target_pairs)
    train_set = source_target_pairs[:train_len]
    val_set = source_target_pairs[train_len:train_len + val_len]
    test_set = source_target_pairs[train_len + val_len:]
    return train_set, val_set, test_set
folder_path = "smart-contracts/mainnet/00/"
solidity_files = read_solidity_files(folder_path)
source_target_pairs = preprocess_dataset(solidity_files)
train_set, val_set, test_set = split_dataset(source_target_pairs)
with open("train.json", "w") as train_file:
json.dump(train_set, train_file)
with open("val.json", "w") as val_file:
json.dump(val_set, val_file)
with open("test.json", "w") as test_file:
json.dump(test_set, test_file)
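# Quick sanity check I added: reload train.json and look at one pair, just to
# convince myself the line-pair preprocessing produced what I expect.
with open("train.json", "r") as check_file:
    check_pairs = json.load(check_file)
print(f"{len(check_pairs)} training pairs written")
print(check_pairs[0])  # e.g. something like {'source': 'pragma solidity ^0.8.0;', 'target': 'contract Foo {'}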
# Step 2 - tokenize dataset
from transformers import T5Tokenizer
from datasets import load_dataset
import os
tokenizer = T5Tokenizer.from_pretrained("t5-base")
# all solidity keywords
tokenizer.add_tokens([
"pragma", "solidity", "contract", "function", "public", "private", "view", "returns", "require", "assert", "modifier", "event", "emit", "mapping", "address", "payable", "constant", "memory", "calldata", "storage",
"internal", "external", "pure", "constructor", "fallback", "receive", "if", "else", "while", "for", "break", "continue", "return", "import", "using", "assembly", "struct", "enum", "library", "new", "delete", "bytes",
"int", "uint", "bool", "string", "interface", "this", "super", "selfdestruct", "transfer", "send", "call", "delegatecall", "staticcall", "keccak256", "sha256", "sha3", "ripemd160", "ecrecover", "addmod", "mulmod",
"is", "type", "block", "gasleft", "msg", "now", "tx", "wei", "finney", "szabo", "ether", "balance", "gas", "value", "data", "sender", "origin", "gasprice", "timestamp", "number", "difficulty", "gaslimit", "abstract",
"override", "virtual", "try", "catch", "revert", "Error", "Panic", "extends"
])
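# Just to see how a typical Solidity line is split now that the keywords above are
# single tokens (purely illustrative - the exact pieces depend on the SentencePiece vocab):
example_line = "function transfer(address to, uint amount) public returns (bool)"
print(tokenizer.tokenize(example_line))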
def tokenize_function(examples, tokenizer):
    # Tokenize source and target to a fixed length of 512; the target ids become the labels
    src_tokenized = tokenizer(examples['source'], truncation=True, padding='max_length', max_length=512)
    tgt_tokenized = tokenizer(examples['target'], truncation=True, padding='max_length', max_length=512)
    return {
        'input_ids': src_tokenized['input_ids'],
        'attention_mask': src_tokenized['attention_mask'],
        'labels': tgt_tokenized['input_ids']
    }
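# One refinement I read about but have not tried yet: replace the padding token ids
# in the labels with -100 so the loss ignores padded positions. A sketch of how
# tokenize_function could do that instead:
def tokenize_function_masked(examples, tokenizer):
    src_tokenized = tokenizer(examples['source'], truncation=True, padding='max_length', max_length=512)
    tgt_tokenized = tokenizer(examples['target'], truncation=True, padding='max_length', max_length=512)
    labels = [
        [(tok if tok != tokenizer.pad_token_id else -100) for tok in label_seq]
        for label_seq in tgt_tokenized['input_ids']
    ]
    return {
        'input_ids': src_tokenized['input_ids'],
        'attention_mask': src_tokenized['attention_mask'],
        'labels': labels
    }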
train_data = load_dataset("json", data_files="train.json")["train"]
val_data = load_dataset("json", data_files="val.json")["train"]
test_data = load_dataset("json", data_files="test.json")["train"]
train_data = train_data.map(tokenize_function, fn_kwargs={'tokenizer': tokenizer}, batched=True)
val_data = val_data.map(tokenize_function, fn_kwargs={'tokenizer': tokenizer}, batched=True)
test_data = test_data.map(tokenize_function, fn_kwargs={'tokenizer': tokenizer}, batched=True)
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# Save tokenized datasets to disk
output_dir = "tokenized_datasets"
os.makedirs(output_dir, exist_ok=True)
train_data.save_to_disk(os.path.join(output_dir, "train_data"))
val_data.save_to_disk(os.path.join(output_dir, "val_data"))
test_data.save_to_disk(os.path.join(output_dir, "test_data"))
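# Quick check I added: reload one saved split to confirm it still has the expected
# columns and the 512-token sequence length before moving on to training.
from datasets import load_from_disk
reloaded = load_from_disk(os.path.join(output_dir, "train_data"))
print(reloaded)
print(len(reloaded[0]['input_ids']))  # expecting 512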
# Step 3 - train the model
import os
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_from_disk
# Load the T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("t5-base")
tokenizer.add_tokens([
"pragma", "solidity", "contract", "function", "public", "private", "view", "returns", "require", "assert", "modifier", "event", "emit", "mapping", "address", "payable", "constant", "memory", "calldata", "storage",
"internal", "external", "pure", "constructor", "fallback", "receive", "if", "else", "while", "for", "break", "continue", "return", "import", "using", "assembly", "struct", "enum", "library", "new", "delete", "bytes",
"int", "uint", "bool", "string", "interface", "this", "super", "selfdestruct", "transfer", "send", "call", "delegatecall", "staticcall", "keccak256", "sha256", "sha3", "ripemd160", "ecrecover", "addmod", "mulmod",
"is", "type", "block", "gasleft", "msg", "now", "tx", "wei", "finney", "szabo", "ether", "balance", "gas", "value", "data", "sender", "origin", "gasprice", "timestamp", "number", "difficulty", "gaslimit", "abstract",
"override", "virtual", "try", "catch", "revert", "Error", "Panic", "extends"
])
model.resize_token_embeddings(len(tokenizer))
# Load the tokenized datasets from disk
train_data = load_from_disk("tokenized_datasets/train_data")
val_data = load_from_disk("tokenized_datasets/val_data")
test_data = load_from_disk("tokenized_datasets/test_data")
# Set the data format for PyTorch
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# Define the training arguments
training_args = TrainingArguments(
    output_dir="t5_solidity_output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    logging_dir="t5_solidity_logs",
    learning_rate=3e-5,
    weight_decay=0.01,
    save_total_limit=3,
)
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
)
# Train the model
trainer.train()
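# My plan for a quick sanity check once training finishes: feed the trained model a
# single Solidity line and see what it generates as the "next line". No idea yet how
# well this will work, it is just an experiment.
model.eval()
sample = "function withdraw(uint amount) public {"
inputs = tokenizer(sample, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_length=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))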