Generate test cases with a GPT-2 model

Hello, my goal is to generate tests for REST APIs. The interface takes a URL and the number of test cases to generate as input. However, I have a problem: the test case generation produces output like this:

['pm.test("Test case", function () {\n    https://api.nasa.gov/mars-photos/api/v1/rovers/curiosity/photos?api_key=2kMSgESLKxbpMKgm3piy61AznbUV7QZbtRafmMY4&sol=0  _______________________________________________ Sent through the Full Disclosure mailing list https://nasa.gov/mailman/listinfo/fulldisclosure Web Archives & RSS: http://seclists.\n});', 
'pm.test("Test case", function () {\n    https://api.nasa.gov/mars-photos/api/v1/rovers/curiosity/photos?api_key=2kMSgESLKxbpMKgm3piy61AznbUV7QZbtRafmMY4&sol=0  _______________________________________________ Sent through the Full Disclosure mailing list https://nmap.org/mailman/listinfo/fulldisclosure Web Archives & RSS: http://seclists.\n});', 
'pm.test("Test case", function () {\n    https://api.nasa.gov/mars-photos/api/v1/rovers/curiosity/photos?api_key=2kMSgESLKxbpMKgm3piy61AznbUV7QZbtRafmMY4&sol=0  _______________________________________________ Sent through the Full Disclosure mailing list https://nasa.gov/mailman/listinfo/fulldisclosure Web Archives & RSS: https://seclists.\n});', 
'pm.test("Test case", function () {\n    https://api.nasa.gov/mars-photos/api/v1/rovers/curiosity/photos?api_key=2kMSgESLKxbpMKgm3piy61AznbUV7QZbtRafmMY4&sol=0  _______________________________________________ Sent through the Full Disclosure mailing list https://nasa.org/mailman/listinfo/fulldisclosure Web Archives & RSS: http://seclists.\n});', 
'pm.test("Test case", function () {\n    https://api.nasa.gov/mars-photos/api/v1/rovers/curiosity/photos?api_key=2kMSgESLKxbpMKgm3piy61AznbUV7QZbtRafmMY4&sol=0  _______________________________________________ Sent through the Full Disclosure mailing list https://nasa.gov/mailman/listinfo/fulldisclosure Web Archives & RSS http://seclists.org\n});']

Here’s my training code:

import warnings
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.nn.utils.rnn import pad_sequence

warnings.filterwarnings("ignore", category=FutureWarning)

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load the dataset
with open('dataset.json', 'r') as f:
    dataset = json.load(f)
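
# Expected shape of dataset.json (an assumption inferred from the accesses below):
# [
#   {"prompt": "https://api.example.com/v1/resource",
#    "testcase": [{"content": "pm.test(\"...\", function () { ... });"}]},
#   ...
# ]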

# Extract prompts and testcases from the dataset
prompts = [item['prompt'] for item in dataset]
testcases = [item.get('testcase', []) for item in dataset]

# Tokenize prompts and testcases
tokenized_prompts = [tokenizer(prompt, return_tensors='pt', padding=True, truncation=True) for prompt in prompts]
tokenized_testcases = [[tokenizer(testcase['content'], return_tensors='pt', padding=True, truncation=True) for testcase in test] for test in testcases]
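# Note: padding=True has no effect when tokenizing one sequence at a time;
# the actual padding happens in the Dataset below via padding='max_length'.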

# Find the maximum sequence length
max_lengths = []
for tp, test in zip(tokenized_prompts, tokenized_testcases):
    prompt_length = len(tp['input_ids'][0]) if len(tp['input_ids'][0]) > 0 else 0
    testcase_lengths = [len(tc['input_ids'][0]) for tc in test if len(tc['input_ids'][0]) > 0]
    if testcase_lengths:
        max_lengths.append(max(prompt_length, max(testcase_lengths)))
    else:
        max_lengths.append(prompt_length)

max_length = max(max_lengths)

# Define dataset class with the correct padding
class MyGPT2TestcaseGenerator(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        prompt = item['prompt']
        testcase = item.get('testcase', [])

        tokenized_prompt = self.tokenizer(prompt, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        tokenized_testcases = [self.tokenizer(tc['content'], truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt') for tc in testcase]

        return tokenized_prompt, tokenized_testcases

def collate_fn(batch):
    prompts, testcases = zip(*batch)
    prompt_input_ids = [prompt['input_ids'].squeeze(0) for prompt in prompts]

    padded_prompts = pad_sequence(prompt_input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    
    padded_testcases = []
    for test in testcases:
        if test:
            test_input_ids = [tc['input_ids'].squeeze(0) for tc in test]
            padded_testcases.append(pad_sequence(test_input_ids, batch_first=True, padding_value=tokenizer.pad_token_id))
        else:
            padded_testcases.append(torch.tensor([]))

    return padded_prompts, padded_testcases

class MyGPT2Model(nn.Module):
    def __init__(self):
        super(MyGPT2Model, self).__init__()
        self.gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')
        # Resize the embeddings so the added [PAD] token has a valid row;
        # otherwise pad_token_id falls outside the pretrained vocabulary,
        # and clamping it would silently map padding onto a real token.
        self.gpt2.resize_token_embeddings(len(tokenizer))

    def forward(self, input_ids):
        outputs = self.gpt2(input_ids)
        return outputs.logits

if __name__ == '__main__':
    train_dataset = MyGPT2TestcaseGenerator(dataset, tokenizer, max_length)
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, drop_last=True, collate_fn=collate_fn)

    model = MyGPT2Model()
    # Set ignore_index to tokenizer.pad_token_id
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    optimizer = optim.Adam(model.parameters(), lr=5e-5)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(10):  # Number of epochs
        total_loss = 0.0
        model.train()
        for batch in train_loader:
            prompts, testcases = batch
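            # Note: the collated testcases are never used below; the loss is
            # computed on the prompt tokens only (see the sketch after this script).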
            prompts = prompts.to(device)
            optimizer.zero_grad()
            outputs = model(prompts)
            # Shift the inputs to the right to align them with the targets
            shift_logits = outputs[..., :-1, :].contiguous()
            shift_labels = prompts[..., 1:].contiguous()
            loss = criterion(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}: average loss {total_loss / len(train_loader):.4f}")

    # Save the inner GPT-2 weights (not the wrapper's) so they can be loaded
    # straight into GPT2LMHeadModel at inference time; saving the wrapper's
    # state_dict prefixes every key with "gpt2.", and a strict=False load of
    # that checkpoint into a bare GPT2LMHeadModel silently matches nothing.
    torch.save(model.gpt2.state_dict(), 'testcase_generator_model.pth')
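
A likely contributor to the irrelevant output: the loop above computes the loss on the prompt tokens only, so the model never sees a prompt followed by its test case. Here is a minimal sketch of building combined training texts instead (the separator string and the build_training_texts helper are illustrative, not from the original code):

# Minimal sketch: turn each (prompt, testcase) pair into one training sequence
# so the model learns to continue a prompt with its test case.
# The "\n### TESTCASE ###\n" separator is an arbitrary choice.
def build_training_texts(dataset):
    texts = []
    for item in dataset:
        for tc in item.get('testcase', []):
            texts.append(item['prompt'] + "\n### TESTCASE ###\n" + tc['content'])
    return texts

train_texts = build_training_texts(dataset)
# These strings can then be tokenized and fed to the same shifted-label
# cross-entropy loss used in the training loop above.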

And my test code:

import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from flask import Flask, render_template, request, send_file
import json
import os
import re

class MyGPT2Model(nn.Module):
    def __init__(self):
        super(MyGPT2Model, self).__init__()
        self.gpt2 = GPT2LMHeadModel.from_pretrained('gpt2')

    def forward(self, input_ids):
        outputs = self.gpt2(input_ids)
        return outputs.logits

def is_valid_endpoint(url):
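    # Accept http(s) URLs with a dotted hostname followed by optional
    # path segments (query strings are allowed within a segment)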
    pattern = r'^https?://(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}(?:/[^/]*)*$'
    return re.match(pattern, url) is not None

def save_testcases_to_json(testcases, filename):
    with open(filename, 'w') as f:
        json.dump(testcases, f, indent=2)

def generate_testcases(prompt_text, num):
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    model = GPT2LMHeadModel.from_pretrained('gpt2')
    # Match the training-time vocabulary before loading the fine-tuned weights;
    # a strict load then fails loudly if the checkpoint keys don't line up,
    # instead of silently skipping them the way strict=False does.
    model.resize_token_embeddings(len(tokenizer))
    model.load_state_dict(torch.load('testcase_generator_model.pth'))
    model.eval()

    num = int(num)  # Ensure num is an integer

    input_ids = tokenizer.encode(prompt_text, return_tensors='pt')

    with torch.no_grad():
        # num_beams must be >= num_return_sequences for beam search
        outputs = model.generate(input_ids, max_length=100,
                                 num_return_sequences=num,
                                 num_beams=max(5, num),
                                 pad_token_id=tokenizer.pad_token_id)
        generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

    # Format the output into Postman test scripts
    postman_test_scripts = []
    for text in generated_texts:
        # Here, you'd typically want to do some processing to make it valid for Postman, if needed
        postman_test_scripts.append(f'pm.test("Test case", function () {{\n    {text}\n}});')

    return postman_test_scripts

app = Flask(__name__)

@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        url = request.form.get('url')
        num = request.form.get('num')

        if not is_valid_endpoint(url):
            return render_template('index.html', error='Invalid URL provided.')

        try:
            testcases = generate_testcases(url, num)
            save_testcases_to_json(testcases, 'generated_testcases.json')
            return render_template('index.html', success='Test cases generated and saved.', filename='generated_testcases.json')
        except Exception as e:
            return render_template('index.html', error=str(e))

    return render_template('index.html')

@app.route('/download/<filename>')
def download(filename):
    return send_file(filename, as_attachment=True)

if __name__ == '__main__':
    app.run(debug=True)
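
If the generations still look like generic web text, it's worth verifying that the checkpoint actually loads. A quick sanity check (a minimal sketch; the file name is the one saved by the training script above):

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# A strict=False load reports any mismatched keys instead of applying them,
# so printing them shows whether the fine-tuned weights are really used.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))
state_dict = torch.load('testcase_generator_model.pth')
missing, unexpected = model.load_state_dict(state_dict, strict=False)
print('missing keys:', missing)
print('unexpected keys:', unexpected)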

Hi @Franco6,
It looks like you’re exposing your api_key. Please edit your post.
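
For reference, a minimal way to keep the key out of the code and the post (NASA_API_KEY is an assumed environment variable name, not from the original code):

import os

# Read the key from the environment instead of hard-coding it
api_key = os.environ['NASA_API_KEY']
url = f'https://api.nasa.gov/mars-photos/api/v1/rovers/curiosity/photos?api_key={api_key}&sol=0'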