How to train a T5 model to learn a programming language?

Hello!
I'm really new to training an AI, or even to programming in Python in general, so I've read and watched a lot of material over the past couple of days.
The goal is to train a T5-base model to learn the Solidity programming language (i.e. understand smart contracts).
For those who may not know what this is about: smart contracts are small programs running on a blockchain (Ethereum, for example) that automate processes such as transferring money.
Anyone can deploy these smart contracts, and they can be quite vulnerable to exploits.
So I'm trying to develop an AI that understands these smart contracts, which are written in Solidity.
Later on I want the model to understand and find known vulnerabilities in these smart contracts - but that's a "later" problem. For now I'd be happy to have a T5 model that simply understands Solidity.
I have downloaded thousands of Solidity programs that I can use to train the model.
I am not sure, though, whether my approach even makes sense at all: simply feeding the model some smart contracts without labeling them myself...
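To make that concrete, the preprocessing in Step 1 below turns each contract into simple next-line pairs, roughly like this (the toy contract is just made up):

# A toy contract such as:
#     contract Wallet {
#         address public owner;
#         uint256 public balance;
#     }
# becomes next-line prediction pairs:
example_pairs = [
    {"source": "contract Wallet {", "target": "address public owner;"},
    {"source": "address public owner;", "target": "uint256 public balance;"},
]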

Code:

# Step 1 - preprocess the dataset: 
import os
import json
import random

def read_solidity_files(folder_path):
    # Walk the folder recursively and collect the raw text of every .sol file
    solidity_files = []
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".sol"):
                file_path = os.path.join(root, file)
                # Some contracts contain non-UTF-8 bytes; don't let a single file crash the run
                with open(file_path, "r", encoding="utf-8", errors="ignore") as infile:
                    solidity_files.append(infile.read())
    return solidity_files

def preprocess_dataset(solidity_files):
    # Turn each contract into (current line -> next line) pairs for next-line prediction
    source_target_pairs = []
    for contract in solidity_files:
        lines = contract.split("\n")
        for i in range(len(lines) - 1):
            source = lines[i].strip()
            target = lines[i + 1].strip()
            if source and target:
                source_target_pairs.append({"source": source, "target": target})
    return source_target_pairs

def split_dataset(source_target_pairs, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1):
    # Shuffle, then split 80/10/10; the test set is whatever remains after train and val
    total_len = len(source_target_pairs)
    train_len = int(train_ratio * total_len)
    val_len = int(val_ratio * total_len)

    random.shuffle(source_target_pairs)

    train_set = source_target_pairs[:train_len]
    val_set = source_target_pairs[train_len:train_len+val_len]
    test_set = source_target_pairs[train_len+val_len:]

    return train_set, val_set, test_set

folder_path = "smart-contracts/mainnet/00/"

solidity_files = read_solidity_files(folder_path)
source_target_pairs = preprocess_dataset(solidity_files)
train_set, val_set, test_set = split_dataset(source_target_pairs)

with open("train.json", "w") as train_file:
    json.dump(train_set, train_file)

with open("val.json", "w") as val_file:
    json.dump(val_set, val_file)

with open("test.json", "w") as test_file:
    json.dump(test_set, test_file)
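
# Quick sanity check (optional, just a sketch): confirm the splits are non-empty
# and that a sample pair looks like two consecutive lines of Solidity.
print(f"train/val/test pairs: {len(train_set)}/{len(val_set)}/{len(test_set)}")
print("sample pair:", train_set[0])
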
# Step 2 - tokenize dataset
from transformers import T5Tokenizer
from datasets import load_dataset
import os

tokenizer = T5Tokenizer.from_pretrained("t5-base")
# Add Solidity keywords and common builtins as extra tokens so the tokenizer doesn't split them into subwords
tokenizer.add_tokens([
    "pragma", "solidity", "contract", "function", "public", "private", "view", "returns", "require", "assert", "modifier", "event", "emit", "mapping", "address", "payable", "constant", "memory", "calldata", "storage",
    "internal", "external", "pure", "constructor", "fallback", "receive", "if", "else", "while", "for", "break", "continue", "return", "import", "using", "assembly", "struct", "enum", "library", "new", "delete", "bytes",
    "int", "uint", "bool", "string", "interface", "this", "super", "selfdestruct", "transfer", "send", "call", "delegatecall", "staticcall", "keccak256", "sha256", "sha3", "ripemd160", "ecrecover", "addmod", "mulmod",
    "is", "type", "block", "gasleft", "msg", "now", "tx", "wei", "finney", "szabo", "ether", "balance", "gas", "value", "data", "sender", "origin", "gasprice", "timestamp", "number", "difficulty", "gaslimit", "abstract", 
    "override", "virtual", "try", "catch", "revert", "Error", "Panic", "extends"
])

def tokenize_function(examples, tokenizer):
    src_tokenized = tokenizer(examples['source'], truncation=True, padding='max_length', max_length=512)
    tgt_tokenized = tokenizer(examples['target'], truncation=True, padding='max_length', max_length=512)

    # Replace padding token ids in the labels with -100 so the loss ignores padded positions
    labels = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label]
        for label in tgt_tokenized['input_ids']
    ]

    return {
        'input_ids': src_tokenized['input_ids'],
        'attention_mask': src_tokenized['attention_mask'],
        'labels': labels
    }

train_data = load_dataset("json", data_files="train.json")["train"]
val_data = load_dataset("json", data_files="val.json")["train"]
test_data = load_dataset("json", data_files="test.json")["train"]

train_data = train_data.map(tokenize_function, fn_kwargs={'tokenizer': tokenizer}, batched=True)
val_data = val_data.map(tokenize_function, fn_kwargs={'tokenizer': tokenizer}, batched=True)
test_data = test_data.map(tokenize_function, fn_kwargs={'tokenizer': tokenizer}, batched=True)

train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Save tokenized datasets to disk
output_dir = "tokenized_datasets"
os.makedirs(output_dir, exist_ok=True)
train_data.save_to_disk(os.path.join(output_dir, "train_data"))
val_data.save_to_disk(os.path.join(output_dir, "val_data"))
test_data.save_to_disk(os.path.join(output_dir, "test_data"))
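
# Optional check (just a sketch): the added Solidity keywords should now map to a single
# token id each, and a decoded training example should look like a line of Solidity.
print(tokenizer.convert_tokens_to_ids(["pragma", "mapping", "keccak256"]))
print(tokenizer.decode(train_data[0]["input_ids"], skip_special_tokens=True))
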
# Step 3 - train the model
import os
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_from_disk

# Load the T5 model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained("t5-base")
# Re-add the same Solidity tokens as in Step 2 so the token ids match the tokenized datasets
tokenizer.add_tokens([
    "pragma", "solidity", "contract", "function", "public", "private", "view", "returns", "require", "assert", "modifier", "event", "emit", "mapping", "address", "payable", "constant", "memory", "calldata", "storage",
    "internal", "external", "pure", "constructor", "fallback", "receive", "if", "else", "while", "for", "break", "continue", "return", "import", "using", "assembly", "struct", "enum", "library", "new", "delete", "bytes",
    "int", "uint", "bool", "string", "interface", "this", "super", "selfdestruct", "transfer", "send", "call", "delegatecall", "staticcall", "keccak256", "sha256", "sha3", "ripemd160", "ecrecover", "addmod", "mulmod",
    "is", "type", "block", "gasleft", "msg", "now", "tx", "wei", "finney", "szabo", "ether", "balance", "gas", "value", "data", "sender", "origin", "gasprice", "timestamp", "number", "difficulty", "gaslimit", "abstract", 
    "override", "virtual", "try", "catch", "revert", "Error", "Panic", "extends"
])

# Grow the embedding matrix to cover the newly added tokens
model.resize_token_embeddings(len(tokenizer))

# Load the tokenized datasets from disk
train_data = load_from_disk("tokenized_datasets/train_data")
val_data = load_from_disk("tokenized_datasets/val_data")
test_data = load_from_disk("tokenized_datasets/test_data")

# Set the data format for PyTorch
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define the training arguments
training_args = TrainingArguments(
    output_dir="t5_solidity_output",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    logging_dir="t5_solidity_logs",
    learning_rate=3e-5,
    weight_decay=0.01,
    save_total_limit=3,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()
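
Once training finishes, my plan for a quick check is to save the model and ask it to continue a line of Solidity - just a sketch, and the output path and generation settings are arbitrary:

# Save the fine-tuned model and tokenizer (path is arbitrary)
trainer.save_model("t5_solidity_output/final")
tokenizer.save_pretrained("t5_solidity_output/final")

# Ask the model to continue a line of Solidity, matching the next-line objective used above
inputs = tokenizer("function withdraw(uint256 amount) public {", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))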