# Always clean and standardize your dataset.
def clean_dataset(data):
    """Deduplicate examples and strip the system prompt from completions.

    Two examples are duplicates when their raw ``(prompt, completion)``
    pairs are equal; only the first occurrence is kept. For every kept
    example, each occurrence of its ``system_prompt`` text is removed from
    its ``completion``.

    Args:
        data: Iterable of dict-like examples. Each must have ``'prompt'``
            and ``'completion'`` keys; ``'system_prompt'`` is optional.

    Returns:
        A new list of shallow-copied example dicts, in first-seen order.
        The input examples are left unmodified.

    Raises:
        KeyError: If an example lacks ``'prompt'`` or ``'completion'``.
    """
    cleaned = []
    seen = set()
    for ex in data:
        # The dedup key uses the raw completion, before any stripping.
        key = (ex['prompt'], ex['completion'])
        if key in seen:
            continue
        seen.add(key)

        # Copy so the caller's dicts are not rewritten behind their back.
        cleaned_ex = dict(ex)
        # A missing, None or empty system prompt means nothing to strip.
        system_prompt = ex.get('system_prompt') or ''
        if system_prompt:
            cleaned_ex['completion'] = ex['completion'].replace(system_prompt, '')
        cleaned.append(cleaned_ex)
    return cleaned
# Validate all tool calls for missing or invented fields.
def validate_tool_calls(tool_calls, schema):
    """Normalize each tool call's parameters against a schema.

    Parameters whose names are not in ``schema`` are dropped, and every
    schema field absent from a call is added with the value ``None``.

    Args:
        tool_calls: Iterable of dict-like tool calls. ``None`` or an empty
            iterable yields an empty list.
        schema: Collection of allowed field names. It must support ``in``
            and iteration (the example usage passes a set).

    Returns:
        A new list of shallow-copied calls, each with a rebuilt
        ``'parameters'`` dict. The input calls are left unmodified.
    """
    if not tool_calls:
        return []

    valid_calls = []
    for call in tool_calls:
        # A call with no (or None) parameters is treated as having none.
        params = call.get('parameters') or {}
        # Drop invented parameters that are not in the schema.
        kept = {k: v for k, v in params.items() if k in schema}
        # Fill in schema fields the call did not provide.
        for field in schema:
            kept.setdefault(field, None)
        valid_calls.append({**call, 'parameters': kept})
    return valid_calls
# Ensure plain-text prompts always request all required fields.
def enforce_required_fields(prompt, required_fields):
    """Append a request line for each required field the prompt omits.

    A field counts as present when its name occurs as a substring of the
    original ``prompt``. The check uses the original text, not the text
    being built, so wording in an appended request line (for example the
    word "field") cannot mask a later field.

    Args:
        prompt: The prompt text.
        required_fields: Iterable of field-name strings. A set or
            frozenset is sorted first so the output is deterministic;
            other iterables keep their order. Repeated names are
            requested once.

    Returns:
        ``prompt`` followed by one request line per missing field, or
        ``prompt`` unchanged when nothing is missing.
    """
    if isinstance(required_fields, (set, frozenset)):
        # Set iteration order is arbitrary; sort for reproducible prompts.
        required_fields = sorted(required_fields)

    # dict.fromkeys removes repeated names while preserving order.
    missing = [
        field for field in dict.fromkeys(required_fields)
        if field not in prompt
    ]
    requests = "".join(
        f"\nPlease provide the following required field: {field}"
        for field in missing
    )
    return prompt + requests
# Example usage:
# NOTE(review): load_your_data is not defined in this snippet; it appears to
# be a placeholder for the caller's own loader -- confirm before running.
data = load_your_data()
schema = {'field1', 'field2', 'field3'}

# Deduplicate and strip system prompts first, then normalize each example.
data = clean_dataset(data)
for ex in data:
    ex['tool_calls'] = validate_tool_calls(ex['tool_calls'], schema)
    ex['prompt'] = enforce_required_fields(ex['prompt'], schema)
# Optional: add randomness to reduce repeated answers.
import random
def shuffle_answers(answers):
    """Shuffle ``answers`` in place and return the same list.

    Args:
        answers: A mutable sequence (e.g. a list); it is reordered in
            place by ``random.shuffle``.

    Returns:
        The same ``answers`` object, now in random order.
    """
    random.shuffle(answers)
    return answers
# If the model is repeating previous answers, you may need to add noise or sample from a more diverse training set.
# Solution provided by Triskel Data Deterministic AI.