Increasing the speed of an LLM classification process

I am running the project in Colab on an A100 GPU and want to know if there is a way to speed up the process. Here is the code I am using for the classification task:
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import gc
import pandas as pd
import subprocess
import sys
import argparse
import re
from sklearn.metrics import classification_report, confusion_matrix
import transformers

# Install xlsxwriter if not already installed
try:
    import xlsxwriter
except ModuleNotFoundError:
    print("Installing xlsxwriter...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "xlsxwriter"])
    import xlsxwriter

# Log versions of key dependencies
with open("version_log.txt", "w") as f:
    f.write(f"Transformers version: {transformers.__version__}\n")
    f.write(f"PyTorch version: {torch.__version__}\n")
    f.write(f"CUDA version: {torch.version.cuda}\n")
print("Versions logged to version_log.txt")

# Argument parser for parameterization
parser = argparse.ArgumentParser(description="Inference script for vulnerability detection")
parser.add_argument('--model_name', type=str, default="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", help="Model name or path")
parser.add_argument('--functions_file', type=str, default="/content/processed_dataset/test_functions_long.json", help="Path to functions JSON")
parser.add_argument('--labels_file', type=str, default="/content/processed_dataset/test_labels_long.json", help="Path to labels JSON")
parser.add_argument('--output_file', type=str, default="/content/processed_dataset/results_long.xlsx", help="Path to output Excel file")
parser.add_argument('--batch_size', type=int, default=1, help="Batch size for inference")
parser.add_argument('--max_length', type=int, default=16384, help="Maximum sequence length")
parser.add_argument('--num_workers', type=int, default=4, help="Number of DataLoader workers")

# Remove Colab-specific arguments
if '-f' in sys.argv:
    f_index = sys.argv.index('-f')
    del sys.argv[f_index:f_index + 2]

args = parser.parse_args()

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(args.model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token to eos token if not defined
print("Loading model. This may take a few minutes...")
model = AutoModelForCausalLM.from_pretrained(
    args.model_name,
    torch_dtype=torch.float16,  # Use fp16 to save memory
    device_map="auto"           # Automatically determine device mapping
)
# Note: device_map="auto" already places the model, so an explicit
# model.to(device) is unnecessary here (and can error on dispatched models).
model.eval()
print(f"Model {args.model_name} loaded successfully.")

# Clean memory function
def clean_memory():
    print("\nRunning garbage collection...\n")
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

# Clean memory after loading

clean_memory()

# Define dataset class with updated prompt
class InferenceDataset(Dataset):
    def __init__(self, functions_data, tokenizer, max_length=args.max_length):
        self.functions_data = functions_data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.functions_data)

    def __getitem__(self, idx):
        item = self.functions_data[idx]
        code = item['func']
        idx = item['idx']
        prompt = (
            f"\nYou are a cybersecurity expert analyzing code for security vulnerabilities. "
            f"Analyze the following code and determine if it contains security vulnerabilities. "
            f"Begin your response with '<think>\n' and reason step by step, concisely but thoroughly, ending with '</think>'. "
            f"After completing your analysis, end your response with exactly this format: "
            f"[[**Prediction: yes**]] if the code is vulnerable, or "
            f"[[**Prediction: no**]] if the code is not vulnerable. "
            f"Do not add any extra text after the pattern. "
            f"Code:\n{code}"
        )
        encodings = self.tokenizer(
            prompt,
            truncation=True,              # Ensure truncation is enabled
            max_length=self.max_length,   # Set maximum sequence length
            padding="max_length",         # Pad sequences to max_length
            return_tensors="pt"
        )
        return {
            'input_ids': encodings['input_ids'].squeeze(0),
            'attention_mask': encodings['attention_mask'].squeeze(0),
            'idx': idx
        }

# Load data function
def load_data(functions_file, labels_file):
    with open(functions_file, 'r', encoding='utf-8') as f:
        functions_data = json.load(f)
    with open(labels_file, 'r', encoding='utf-8') as f:
        labels_data = json.load(f)
    idx_to_target = {item['idx']: item['target'] for item in labels_data}
    return functions_data, labels_data, idx_to_target

# Build dataset and dataloader with optimizations
def build_dataset_dataloader(functions_data, tokenizer):
    dataset = InferenceDataset(functions_data, tokenizer, max_length=args.max_length)
    dataloader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,  # Safe default for Colab (4)
        prefetch_factor=2
    )
    return dataloader

# Run inference with full text, thought process, and regex parsing
def run_inference(model, dataloader, tokenizer, device, idx_to_target):
    predictions = []
    true_labels = []
    idx_list = []
    sample_outputs = []
    batch_idx = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Processing batches", unit="batch"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            idxs = batch['idx'].tolist()
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=2048,
                temperature=0.6,
                top_p=0.95,
                pad_token_id=tokenizer.pad_token_id
            )
            for i, output in enumerate(outputs):
                generated_text = tokenizer.decode(output, skip_special_tokens=True)
                # print(f"\nFull Generated Text for idx {idxs[i]}:\n{generated_text}\n{'-' * 50}")
                # print(f"DEBUG: Raw text to parse:\n{repr(generated_text)}\n{'-' * 50}")  # Debug line
                # Pattern that matches all desired formats:
                # [[**Prediction: yes/no**]], ****Prediction: yes/no****, or **Prediction: yes/no**
                pattern = (
                    r"(?:\[\[\*\*Prediction:\s*(yes|no)\*\*\]\]"
                    r"|\*\*\*\*Prediction:\s*(yes|no)\*\*\*\*"
                    r"|\*\*Prediction:\s*(yes|no)\*\*)"
                )
                answer_match = re.search(pattern, generated_text, re.IGNORECASE)
                if answer_match:
                    # Get the first non-None group (whichever alternative matched)
                    answer = next(group for group in answer_match.groups() if group is not None).lower()
                    pred = 1 if answer == "yes" else 0
                else:
                    pred = None
                    print(f"Warning: No prediction pattern found for idx {idxs[i]}")
                    continue  # Skip this sample
                predictions.append(pred)
                true_labels.append(idx_to_target[idxs[i]])
                idx_list.append(idxs[i])
                if len(sample_outputs) < 5:
                    sample_outputs.append({
                        'idx': idxs[i],
                        'generated_text': generated_text,
                        'prediction': pred,
                        'true_label': idx_to_target[idxs[i]]
                    })
            clean_memory()
            batch_idx += 1
            if batch_idx % 10 == 0:
                gpu_util = subprocess.check_output("nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader", shell=True).decode().strip()
                print(f"GPU Utilization at batch {batch_idx}: {gpu_util}")
    return predictions, true_labels, idx_list, sample_outputs

def compute_metrics(predictions, true_labels):
    # Filter out None predictions
    valid_indices = [i for i, p in enumerate(predictions) if p is not None]
    valid_predictions = [predictions[i] for i in valid_indices]
    valid_true_labels = [true_labels[i] for i in valid_indices]
    if not valid_predictions:
        print("No valid predictions to compute metrics.")
        return 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0
    cm = confusion_matrix(valid_true_labels, valid_predictions)
    tn, fp, fn, tp = cm.ravel()
    accuracy = (tp + tn) / len(valid_predictions) if len(valid_predictions) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    print("\nClassification Report:")
    print(classification_report(valid_true_labels, valid_predictions, target_names=['Non-Vulnerable', 'Vulnerable']))
    return tp, fp, fn, tn, accuracy, precision, recall, f1

def write_outputs(idx_list, predictions, true_labels, tp, fp, fn, tn, accuracy, precision, recall, f1, output_file):
    # Filter out None predictions for the DataFrame
    valid_data = [(idx, pred, true) for idx, pred, true in zip(idx_list, predictions, true_labels) if pred is not None]
    if not valid_data:
        print("No valid data to write.")
        return
    idx_list_valid, predictions_valid, true_labels_valid = zip(*valid_data)
    results_df = pd.DataFrame({
        'idx': idx_list_valid,
        'Prediction': predictions_valid,
        'True_Label': true_labels_valid
    })
    metrics_df = pd.DataFrame({
        'Metric': ['True Positives (TP)', 'True Negatives (TN)', 'False Positives (FP)', 'False Negatives (FN)', 'Accuracy', 'Precision', 'Recall', 'F1-Score'],
        'Value': [tp, tn, fp, fn, accuracy, precision, recall, f1]
    })
    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        results_df.to_excel(writer, sheet_name='Predictions', index=False)
        metrics_df.to_excel(writer, sheet_name='Metrics', index=False)
    print(f"\nResults saved to {output_file}")

# Main execution flow
if __name__ == "__main__":
    # Load data
    functions_data, labels_data, idx_to_target = load_data(args.functions_file, args.labels_file)

    # Build dataset and dataloader
    dataloader = build_dataset_dataloader(functions_data, tokenizer)

    # Run inference
    predictions, true_labels, idx_list, sample_outputs = run_inference(model, dataloader, tokenizer, device, idx_to_target)

    # Compute metrics
    tp, fp, fn, tn, accuracy, precision, recall, f1 = compute_metrics(predictions, true_labels)

    # Write outputs
    write_outputs(idx_list, predictions, true_labels, tp, fp, fn, tn, accuracy, precision, recall, f1, args.output_file)

    # Clean memory one last time
    clean_memory()

There are various ways to do this, but the core idea is to cut out the unnecessary work happening under the hood when all you need is inference. In your script, the obvious candidates are padding every sample to max_length=16384 even with batch_size=1, calling clean_memory() (with a CUDA cache flush) on every batch, and shelling out to nvidia-smi during the loop. A sketch of the first two fixes follows.
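
Here is a minimal sketch of that idea against your script. It assumes the same `tokenizer`, `model`, `args`, and `functions_data` objects you already have; `build_prompt` is a placeholder for your existing prompt template, not a real helper in your code. The two main changes: pad each batch only to its longest prompt instead of always to 16K, and wrap generation in `torch.inference_mode()` while skipping the per-batch cache clearing.

import torch
from torch.utils.data import DataLoader

tokenizer.padding_side = "left"  # left padding is needed for batched decoder-only generation

def collate_prompts(batch):
    # batch is a list of dicts with 'func' and 'idx', as in your JSON
    prompts = [build_prompt(item["func"]) for item in batch]  # your existing prompt text
    enc = tokenizer(
        prompts,
        padding=True,               # dynamic padding: only to the longest prompt in this batch
        truncation=True,
        max_length=args.max_length,
        return_tensors="pt",
    )
    enc["idx"] = [item["idx"] for item in batch]
    return enc

dataloader = DataLoader(functions_data, batch_size=args.batch_size, shuffle=False,
                        collate_fn=collate_prompts, num_workers=args.num_workers)

@torch.inference_mode()             # slightly cheaper than no_grad for pure inference
def run_batch(enc):
    out = model.generate(
        input_ids=enc["input_ids"].to(model.device),
        attention_mask=enc["attention_mask"].to(model.device),
        max_new_tokens=2048,
        do_sample=True, temperature=0.6, top_p=0.95,
        pad_token_id=tokenizer.pad_token_id,
    )
    # Decode only the newly generated tokens, not the (long) prompt
    new_tokens = out[:, enc["input_ids"].shape[1]:]
    return tokenizer.batch_decode(new_tokens, skip_special_tokens=True)

With dynamic padding you can also raise batch_size above 1 and let the A100 do more work per generate() call; keep clean_memory() for the end of the run rather than every batch.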

You could also try a faster attention implementation (FlashAttention-2 or PyTorch's SDPA), or torch.compile to reduce the runtime overhead of Python and PyTorch (although torch.compile doesn't really work on Windows). In general, any speed-up method that works with plain PyTorch is available here as well; see the sketch below.
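
A sketch of both ideas, assuming the flash-attn package is installed in your runtime (otherwise fall back to attn_implementation="sdpa"). How much torch.compile helps with generate() varies by transformers and PyTorch version, so treat it as something to benchmark rather than a guaranteed win.

import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    args.model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",  # or "sdpa" if flash-attn is not installed
)
model.eval()

# Optional: compile the forward pass to reduce Python/framework overhead
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=False)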

Also, if you don't have enough VRAM, it's worth considering quantization. This is a big benefit, because anything that doesn't fit in VRAM gets offloaded and processed at an incredibly slow speed. There are also several quantization algorithms designed with speed in mind; a 4-bit example is sketched below.
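
A minimal 4-bit quantization sketch using bitsandbytes through transformers, assuming the bitsandbytes package is installed in the Colab runtime. It trades a little accuracy for a much smaller memory footprint; whether it is actually faster than bf16 on an A100 depends on the model and batch size, so benchmark it.

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute in bf16 even though weights are stored in 4-bit
)

model = AutoModelForCausalLM.from_pretrained(
    args.model_name,
    quantization_config=bnb_config,
    device_map="auto",
)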

In addition, there are third-party inference libraries (vLLM or text-generation-inference, for example) that can load the same Hugging Face models but are already optimized for throughput.
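
For example, a sketch of the same classification loop in vLLM, assuming `pip install vllm` works in your Colab runtime and again using `build_prompt` as a placeholder for your prompt template. vLLM batches and schedules requests internally, so you pass the whole list of prompts at once and parse each output with your existing regex.

from vllm import LLM, SamplingParams

llm = LLM(model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
          dtype="bfloat16",
          max_model_len=16384)

params = SamplingParams(temperature=0.6, top_p=0.95, max_tokens=2048)

prompts = [build_prompt(item["func"]) for item in functions_data]
outputs = llm.generate(prompts, params)   # returned in the same order as the prompts

for item, out in zip(functions_data, outputs):
    generated_text = out.outputs[0].text  # parse with your existing prediction regex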

Also, since your GPU generation is not old (the A100 supports bfloat16 natively), you can expect some performance improvement at almost no cost just by using torch.bfloat16 instead of torch.float16.
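
That change is a one-liner in your existing loading code:

model = AutoModelForCausalLM.from_pretrained(
    args.model_name,
    torch_dtype=torch.bfloat16,  # bf16 instead of fp16; natively supported on the A100
    device_map="auto",
)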

Thanks for the reply :folded_hands: :folded_hands:
I'll go through the methods you mentioned.
