Prompt Regression Tester
Systematically test prompt changes to prevent regressions.
Test Case Format
{ "test_cases": [ { "id": "test_001", "input": "Summarize this article", "context": "Article text here...", "expected_behavior": "Concise 2-3 sentence summary", "baseline_output": "Output from v1.0 prompt", "must_include": ["main point", "conclusion"], "must_not_include": ["opinion", "speculation"] } ] }
Comparison Framework
def compare_prompts(old_prompt, new_prompt, test_cases):
    """Run every test case against both prompt versions and classify the deltas.

    Args:
        old_prompt: Baseline prompt template (used with ``str.format(**test)``).
        new_prompt: Candidate prompt template, formatted the same way.
        test_cases: Iterable of test-case dicts; each must carry an ``"id"``
            key plus whatever fields the prompt templates reference.

    Returns:
        dict with:
          - "test_cases": per-test comparison records (outputs, diff, scores, verdict)
          - "summary": counts keyed by verdict ("improvements"/"regressions"/"unchanged")
          - "breakages": analyze_breakage() results for every regression

    Note: relies on module-level helpers ``llm``, ``compute_diff``,
    ``score_output``, ``classify_change`` and ``analyze_breakage`` — presumably
    defined elsewhere in the project; verify they are in scope before use.
    """
    results = {
        "test_cases": [],
        "summary": {
            "total": len(test_cases),
            "improvements": 0,
            "regressions": 0,
            "unchanged": 0,
        },
        "breakages": [],
    }
    for test in test_cases:
        # Same test payload through both prompt versions, one LLM call each.
        old_output = llm(old_prompt.format(**test))
        new_output = llm(new_prompt.format(**test))
        comparison = {
            "test_id": test["id"],
            "old_output": old_output,
            "new_output": new_output,
            "diff": compute_diff(old_output, new_output),
            "scores": {
                "old": score_output(old_output, test),
                "new": score_output(new_output, test),
            },
            # classify_change must return one of the summary keys above.
            "verdict": classify_change(old_output, new_output, test),
        }
        results["test_cases"].append(comparison)
        results["summary"][comparison["verdict"]] += 1
        if comparison["verdict"] == "regressions":
            # Only regressions get a root-cause breakdown.
            results["breakages"].append(analyze_breakage(comparison, test))
    return results
Stability Metrics
def calculate_stability_metrics(results):
    """Aggregate the four stability signals for a compare_prompts() result.

    Args:
        results: The dict produced by compare_prompts().

    Returns:
        dict mapping metric name -> score, delegating each computation to a
        dedicated helper (presumably defined elsewhere in this module —
        confirm they are in scope).
    """
    return {
        "output_stability": measure_output_consistency(results),
        "format_stability": check_format_preservation(results),
        "constraint_adherence": check_constraints(results),
        "behavioral_consistency": measure_behavior_delta(results),
    }
def measure_output_consistency(results):
    """Return the mean semantic similarity between old and new outputs.

    Args:
        results: The dict produced by compare_prompts(); reads
            ``results["test_cases"]`` and each entry's ``old_output`` /
            ``new_output`` strings.

    Returns:
        float: average pairwise similarity, or 0.0 when there are no test
        cases (the original crashed with ZeroDivisionError on empty input).

    Note: ``semantic_similarity`` is an external helper — verify it is in scope.
    """
    similarities = [
        semantic_similarity(result["old_output"], result["new_output"])
        for result in results["test_cases"]
    ]
    if not similarities:
        # No data: report zero consistency rather than dividing by zero.
        return 0.0
    return sum(similarities) / len(similarities)
Breakage Analysis
def analyze_breakage(comparison, test_case):
    """Identify why the new prompt failed a regression test case.

    Args:
        comparison: Per-test record from compare_prompts() (reads
            "new_output" and "old_output").
        test_case: The test-case dict; optional keys "must_include",
            "must_not_include", "expected_format" and "expected_length"
            drive the individual checks.

    Returns:
        dict with the test id, the list of human-readable failure reasons,
        and 100-character previews of both outputs for the report.

    Note: ``check_format`` is an external helper — verify it is in scope.
    """
    reasons = []
    new_out = comparison["new_output"]
    # Missing required content (case-insensitive substring check).
    for keyword in test_case.get("must_include", []):
        if keyword.lower() not in new_out.lower():
            reasons.append(f"Missing required content: '{keyword}'")
    # Contains forbidden content.
    for keyword in test_case.get("must_not_include", []):
        if keyword.lower() in new_out.lower():
            reasons.append(f"Contains forbidden content: '{keyword}'")
    # Format violations.
    if not check_format(new_out, test_case.get("expected_format")):
        reasons.append("Output format violation")
    # Length issues: flag deviations beyond ±30% of the expected word count.
    expected_length = test_case.get("expected_length")
    if expected_length:
        actual_length = len(new_out.split())
        if abs(actual_length - expected_length) > expected_length * 0.3:
            reasons.append(
                f"Length deviation: expected ~{expected_length}, got {actual_length}"
            )
    return {
        "test_id": test_case["id"],
        "reasons": reasons,
        # Truncated previews keep the regression report compact.
        "old_output": comparison["old_output"][:100],
        "new_output": comparison["new_output"][:100],
    }
Fix Suggestions
def suggest_fixes(breakages):
    """Generate fix suggestions based on breakage patterns.

    Args:
        breakages: List of analyze_breakage() results; each entry carries
            "test_id" and a "reasons" list of failure strings.

    Returns:
        list of suggestion dicts ("issue", "affected_tests", "suggestion",
        "example"), one per distinct reason that matched a known pattern.
        Reasons outside the known patterns are grouped but produce no entry.
    """
    suggestions = []
    # Group affected test ids by the exact reason string.
    reason_groups = {}
    for breakage in breakages:
        for reason in breakage["reasons"]:
            reason_groups.setdefault(reason, []).append(breakage["test_id"])
    # Generate one suggestion per known reason pattern.
    for reason, test_ids in reason_groups.items():
        if "Missing required content" in reason:
            # split(..., 1) + strip: the original's split(':')[1] kept the
            # leading space, yielding "mention  'x'" with a double space.
            missing = reason.split(":", 1)[1].strip()
            suggestions.append({
                "issue": reason,
                "affected_tests": test_ids,
                "suggestion": "Add explicit instruction in prompt to include this content",
                "example": f"Make sure to mention {missing} in your response."
            })
        elif "format violation" in reason:
            suggestions.append({
                "issue": reason,
                "affected_tests": test_ids,
                "suggestion": "Add stricter format constraints to prompt",
                "example": "Output must follow this exact format: ..."
            })
    return suggestions
Report Generation
Prompt Regression Report
Summary
- Total tests: 50
- Improvements: 5 (10%)
- Regressions: 3 (6%)
- Unchanged: 42 (84%)
Stability Metrics
- Output stability: 0.87
- Format stability: 0.95
- Constraint adherence: 0.94
Regressions (3)
test_005: Missing required content
Old output: "The main benefit is cost savings..." New output: "This approach provides flexibility..." Issue: Missing required keyword 'cost' Fix: Add explicit instruction: "Mention cost implications in your response"
Recommended Actions
- Revert changes that caused regressions (tests: 005, 012, 023)
- Add format constraints for JSON output
- Run full test suite before deployment
Best Practices
- Test with diverse inputs
- Compare across multiple runs (LLMs are stochastic)
- Track metrics over time
- Automate in CI/CD
- Review all regressions before deploy
- Maintain test case library
Output Checklist
- Test cases defined (30+)
- Comparison runner
- Stability metrics
- Breakage analyzer
- Fix suggestions
- Diff visualizer
- Automated report
- CI integration