Agent Testing Harness
Design and implement comprehensive testing for AI agent systems.
When to Use
- Building reliable agent systems
- Validating agent behavior before deployment
- Creating regression tests for agents
- Implementing continuous testing for agent updates
- Evaluating agent performance metrics
Testing Pyramid for Agents
┌─────────────────────┐
│  End-to-End         │  Full workflow tests
│  Agent Tests        │  (expensive, few)
├─────────────────────┤
│  Integration        │  Multi-agent interaction
│  Tests              │  (medium cost, some)
├─────────────────────┤
│  Component          │  Individual agent behavior
│  Tests              │  (cheap, many)
├─────────────────────┤
│  Unit Tests         │  Tools, utilities, helpers
│                     │  (cheapest, most)
└─────────────────────┘
Unit Testing Agent Components
Testing Tools
import { describe, it, expect, vi } from 'vitest';
// Unit tests for SearchTool.execute: a valid query, an empty query, and a
// burst of concurrent requests that is expected to trip rate limiting.
describe('SearchTool', () => {
  // A single tool instance is shared by every test in this suite.
  const searchTool = new SearchTool();

  it('returns results for valid query', async () => {
    const result = await searchTool.execute({ query: 'test query' });

    expect(result.success).toBe(true);
    expect(result.data.results).toBeInstanceOf(Array);
    expect(result.data.results.length).toBeGreaterThan(0);
  });

  it('handles empty query gracefully', async () => {
    const result = await searchTool.execute({ query: '' });

    expect(result.success).toBe(false);
    expect(result.error.code).toBe('INVALID_INPUT');
  });

  it('respects rate limits', async () => {
    // Make many requests quickly. The calls are started together (not awaited
    // one by one) so that they overlap.
    const promises = Array.from({ length: 10 }, () =>
      searchTool.execute({ query: 'test' })
    );
    const results = await Promise.all(promises);

    // At least one of the overlapping calls must have been rejected as rate limited.
    const rateLimited = results.filter(r => r.error?.code === 'RATE_LIMITED');
    expect(rateLimited.length).toBeGreaterThan(0);
  });
});
Testing Prompts
// Unit tests for generateSystemPrompt: the rendered prompt must carry the
// required section headings and list each tool as "- name: description".
describe('SystemPrompt', () => {
  it('includes all required sections', () => {
    const rendered = generateSystemPrompt(config);
    const requiredHeadings = ['## Your Role', '## Available Tools', '## Constraints'];

    for (const heading of requiredHeadings) {
      expect(rendered).toContain(heading);
    }
  });

  it('correctly formats tool descriptions', () => {
    const tools = [
      { name: 'search', description: 'Search the web' },
      { name: 'calculate', description: 'Do math' }
    ];

    const rendered = generateSystemPrompt({ tools });

    expect(rendered).toContain('- search: Search the web');
    expect(rendered).toContain('- calculate: Do math');
  });
});
Component Testing Agents
Mock LLM Responses
/**
 * Test double for an LLM client. Canned completions are registered against
 * input patterns; `complete` returns the response of the first registered
 * pattern that matches the input, or a fixed default when none does.
 */
class MockLLM {
  /** Compiled matchers keyed by regex source, iterated in insertion order. */
  private responses: Map<string, { matcher: RegExp; response: string }> = new Map();

  /**
   * Registers (or replaces) the canned response for a pattern.
   *
   * @param inputPattern A RegExp, or a string that is compiled as regex source
   *   (so regex metacharacters in a string must be escaped by the caller).
   *   Matching is always case-insensitive. Other flags on a RegExp (for
   *   example `s`, `m`, `u`) are kept, except the stateful `g` and `y`.
   * @param response Text returned by `complete` when the pattern matches.
   * @throws SyntaxError if a string pattern is not valid regex source.
   */
  setResponse(inputPattern: RegExp | string, response: string): void {
    const matcher = MockLLM.compile(inputPattern);
    // Keyed by source, so re-registering the same pattern overwrites it.
    this.responses.set(matcher.source, { matcher, response });
  }

  /**
   * Returns the canned response for the first registered pattern matching `input`.
   *
   * @param input Prompt text to match against the registered patterns.
   * @returns The matching response, or 'Default mock response' if nothing matches.
   */
  async complete(input: string): Promise<string> {
    for (const { matcher, response } of this.responses.values()) {
      if (matcher.test(input)) {
        return response;
      }
    }
    return 'Default mock response';
  }

  /** Builds the case-insensitive, non-stateful matcher for a pattern. */
  private static compile(inputPattern: RegExp | string): RegExp {
    if (typeof inputPattern === 'string') {
      return new RegExp(inputPattern, 'i');
    }
    const flags = new Set(inputPattern.flags);
    flags.add('i');
    // `g` and `y` make RegExp.prototype.test depend on lastIndex between calls.
    flags.delete('g');
    flags.delete('y');
    return new RegExp(inputPattern.source, [...flags].join(''));
  }
}
// Component tests for ResearchAgent with the LLM replaced by MockLLM, so each
// test controls exactly what the model "answers".
describe('ResearchAgent', () => {
  let llm: MockLLM;
  let researcher: ResearchAgent;

  // Fresh mock and agent per test so canned responses never leak between tests.
  beforeEach(() => {
    llm = new MockLLM();
    researcher = new ResearchAgent({ llm: llm });
  });

  it('formulates search queries from task', async () => {
    const cannedPlan = JSON.stringify({ queries: ['query 1', 'query 2'] });
    llm.setResponse(/formulate.*search/i, cannedPlan);

    const plan = await researcher.planResearch('Find information about X');

    expect(plan.queries).toHaveLength(2);
  });

  it('synthesizes findings into report', async () => {
    llm.setResponse(
      /synthesize/i,
      'Based on the research, here are the key findings...'
    );
    const findings = [
      { source: 'source1', content: 'finding 1' },
      { source: 'source2', content: 'finding 2' }
    ];

    const report = await researcher.synthesize(findings);

    expect(report).toContain('key findings');
  });
});
Testing Agent Decision Making
// Component tests for Agent.decideTool: the chosen tool (or action) should fit
// the kind of task the agent is given.
describe('Agent Decision Making', () => {
  it('selects appropriate tool for task', async () => {
    const toolbox = [searchTool, calculatorTool, fileReaderTool];
    const chooser = new Agent({ tools: toolbox });

    // Math task should use calculator
    const forMath = await chooser.decideTool('Calculate 15% of 200');
    expect(forMath.tool).toBe('calculator');

    // Search task should use search
    const forSearch = await chooser.decideTool('Find the latest news about AI');
    expect(forSearch.tool).toBe('search');
  });

  it('handles ambiguous tasks appropriately', async () => {
    const chooser = new Agent({ tools: [searchTool, fileReaderTool] });

    const outcome = await chooser.decideTool('Read about quantum computing');

    // Should clarify or make reasonable choice
    const acceptableActions = ['search', 'clarify'];
    expect(acceptableActions).toContain(outcome.action);
  });
});
Integration Testing Multi-Agent Systems
// Integration tests for a supervisor coordinating researcher, writer and
// reviewer workers: the happy path, a failing worker, and a budget cap.
describe('Multi-Agent Workflow', () => {
  let supervisor: SupervisorAgent;
  let researcher: ResearchAgent;
  let writer: WriterAgent;
  let reviewer: ReviewerAgent;

  // Workers are rebuilt for every test, so a spy installed in one test never
  // affects another.
  beforeEach(() => {
    researcher = new ResearchAgent();
    writer = new WriterAgent();
    reviewer = new ReviewerAgent();
    supervisor = new SupervisorAgent({
      workers: [researcher, writer, reviewer]
    });
  });

  it('coordinates agents to complete task', async () => {
    const result = await supervisor.execute(
      'Write a blog post about renewable energy'
    );

    expect(result.success).toBe(true);
    expect(result.steps).toContainEqual(
      expect.objectContaining({ agent: 'researcher', status: 'completed' })
    );
    expect(result.steps).toContainEqual(
      expect.objectContaining({ agent: 'writer', status: 'completed' })
    );
  });

  it('handles agent failure gracefully', async () => {
    // Make researcher fail
    vi.spyOn(researcher, 'execute').mockRejectedValue(new Error('API Error'));

    const result = await supervisor.execute('Research topic X');

    expect(result.success).toBe(false);
    expect(result.error).toContain('researcher failed');
    expect(result.recoveryAttempts).toBeGreaterThan(0);
  });

  it('respects budget constraints', async () => {
    const result = await supervisor.execute(
      'Complex research task',
      { budgetUSD: 0.01 } // Very low budget
    );

    expect(result.totalCost).toBeLessThanOrEqual(0.01);
  });
});
End-to-End Agent Tests
// End-to-end tests that call a real LLM through ContentCreationAgent. They use
// long per-test timeouts because each test makes real API calls.
describe('E2E: Content Creation Pipeline', () => {
  // Use real LLM but with test account
  const agent = new ContentCreationAgent({
    llm: new OpenAI({ apiKey: process.env.TEST_API_KEY })
  });

  it('creates blog post from topic', async () => {
    const result = await agent.createContent({
      type: 'blog_post',
      topic: 'Benefits of unit testing',
      targetLength: 500
    });

    // Structure validation
    // NOTE(review): content.length counts characters; confirm targetLength is
    // expressed in the same unit.
    expect(result.title).toBeDefined();
    expect(result.content.length).toBeGreaterThan(400);
    expect(result.content.length).toBeLessThan(600);

    // Content validation
    expect(result.content.toLowerCase()).toContain('test');
    expect(result.sections.length).toBeGreaterThanOrEqual(3);
  }, 60000); // Long timeout for real API calls

  it('handles user feedback loop', async () => {
    const draft = await agent.createContent({
      type: 'blog_post',
      topic: 'AI in healthcare'
    });

    const revised = await agent.reviseContent(draft, {
      feedback: 'Make it more technical and add statistics'
    });

    expect(revised.content).not.toEqual(draft.content);
    // Check for more technical language: the revision must contain a
    // percentage such as "42%" or "42 percent".
    expect(revised.content).toMatch(/\d+%|\d+ percent/);
  }, 120000);
});
Evaluation Metrics
/** Per-test-case scores for an agent run; every score is in the range 0-1. */
interface AgentEvaluation {
  /** 0-1: Did it complete the task? */
  taskCompletion: number;
  /** 0-1: Is the output correct? */
  accuracy: number;
  /** 0-1: Token/time efficiency */
  efficiency: number;
  /** 0-1: No harmful outputs */
  safety: number;
  /** 0-1: Consistent results */
  reliability: number;
}
/**
 * Runs an agent over a set of test cases and scores each run.
 *
 * The helpers `assessEfficiency`, `assessSafety`, `testReliability`,
 * `compileReport` and `llmJudge` are called here but are not shown in this
 * snippet.
 */
class AgentEvaluator {
  /**
   * Executes every test case once, scores it, then runs the reliability pass
   * and compiles the final report.
   *
   * @param agent Agent under evaluation.
   * @param testCases Cases to execute, in order.
   * @returns The report built by `compileReport` from the per-case results and
   *   the reliability scores.
   */
  async evaluate(
    agent: Agent,
    testCases: TestCase[]
  ): Promise<EvaluationReport> {
    const results: EvaluationResult[] = [];

    // Cases run one at a time so that each measured duration covers only its
    // own execution.
    for (const testCase of testCases) {
      const startTime = Date.now();
      const result = await agent.execute(testCase.input);
      const duration = Date.now() - startTime;

      const evaluation: AgentEvaluation = {
        taskCompletion: this.assessCompletion(result, testCase.expected),
        accuracy: this.assessAccuracy(result, testCase.expected),
        efficiency: this.assessEfficiency(result, duration),
        safety: this.assessSafety(result),
        reliability: 1 // Will be calculated across runs
      };

      results.push({ testCase, result, evaluation, duration });
    }

    // Run reliability tests (same input multiple times)
    const reliabilityScores = await this.testReliability(agent, testCases);

    return this.compileReport(results, reliabilityScores);
  }

  /**
   * Scores task completion as the fraction of required output keys present.
   *
   * @returns 0 for a failed run; 1 for a successful run when no outputs are
   *   required; otherwise present / required.
   */
  private assessCompletion(result: Result, expected: Expected): number {
    if (!result.success) return 0;

    // Check required outputs are present
    const requiredKeys = Object.keys(expected.requiredOutputs ?? {});
    // Nothing required: a successful run is fully complete, mirroring how
    // assessAccuracy scores 1 when there is no ground truth.
    if (requiredKeys.length === 0) return 1;

    // A successful run with no output object counts every key as missing.
    const presentKeys = requiredKeys.filter(k => result.output?.[k] !== undefined);
    return presentKeys.length / requiredKeys.length;
  }

  /**
   * Scores accuracy against the expected ground truth.
   *
   * @returns 1 when the test case has no ground truth; otherwise the value
   *   produced by `llmJudge`.
   */
  private assessAccuracy(result: Result, expected: Expected): number {
    if (!expected.groundTruth) return 1; // No ground truth to compare

    // Use LLM to judge similarity
    return this.llmJudge(result.output, expected.groundTruth);
  }
}
Test Data Management
/** One evaluation scenario: the input handed to the agent plus what is expected back. */
interface TestCase {
  id: string;
  name: string;
  category: string;
  /** Input passed to the agent. */
  input: AgentInput;
  /** Expectations for the run; only `success` is mandatory. */
  expected: {
    success: boolean;
    requiredOutputs?: Record<string, any>;
    groundTruth?: string;
    maxDurationMs?: number;
    maxCostUSD?: number;
  };
  tags: string[];
}
/**
 * Test case factory.
 *
 * @param name Human-readable test case name.
 * @param input Task text; stored as `input.task`.
 * @param expected Expectation overrides merged over the default `{ success: true }`.
 * @returns A test case with an id from `generateId()`, category 'default' and no tags.
 */
function createTestCase(
  name: string,
  input: string,
  expected: Partial<TestCase['expected']>
): TestCase {
  return {
    id: generateId(),
    name,
    category: 'default',
    input: { task: input },
    // Success is expected unless the caller's overrides say otherwise.
    expected: { success: true, ...expected },
    tags: []
  };
}
// Example test suite: two code-review scenarios built with createTestCase.
// Each one requires a specific finding type to appear in the named output list.
const codeReviewTestCases: TestCase[] = [
  createTestCase(
    'Identifies security vulnerability',
    'Review this code: eval(userInput)',
    {
      requiredOutputs: {
        vulnerabilities: expect.arrayContaining([
          expect.objectContaining({ type: 'code_injection' })
        ])
      }
    }
  ),
  createTestCase(
    'Catches null pointer risk',
    'Review: const name = user.profile.name',
    {
      requiredOutputs: {
        warnings: expect.arrayContaining([
          expect.objectContaining({ type: 'null_safety' })
        ])
      }
    }
  )
];
Continuous Testing Pipeline
# .github/workflows/agent-tests.yml
# Tiered agent test pipeline: unit and component tests run first, integration
# tests run after both pass, and e2e tests run only on main.
name: Agent Tests

on:
  push:
    paths:
      # `**` is needed so that files inside the directories match the filter.
      - 'agents/**'
      - 'prompts/**'
  schedule:
    - cron: '0 0 * * *' # Daily regression

# NOTE(review): `--grep` is a Mocha-style flag; Vitest selects tests with
# `-t` / `--testNamePattern`. Confirm against the project's `test` script.
jobs:
  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: npm ci # install dependencies before running tests
      - run: npm test -- --grep "unit"

  component-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: npm ci
      - run: npm test -- --grep "component"
        env:
          MOCK_LLM: 'true' # env values are strings; quoted to make that explicit

  integration-tests:
    runs-on: ubuntu-latest
    needs: [unit-tests, component-tests]
    steps:
      - uses: actions/checkout@v4
      - run: npm ci
      - run: npm test -- --grep "integration"
        env:
          TEST_API_KEY: ${{ secrets.TEST_API_KEY }}

  e2e-tests:
    runs-on: ubuntu-latest
    needs: integration-tests
    if: github.ref == 'refs/heads/main'
    steps:
      - uses: actions/checkout@v4
      - run: npm ci
      - run: npm test -- --grep "e2e"
        env:
          TEST_API_KEY: ${{ secrets.TEST_API_KEY }}
Best Practices
- Mock at boundaries - Mock the LLM, not the agent logic
- Test deterministically - Set seeds, use fixed responses
- Measure what matters - Completion, accuracy, safety
- Automate regression - Catch prompt regressions
- Test failure modes - Agents should fail gracefully
- Budget for tests - Real API tests cost money
- Version test cases - Track them alongside prompt changes
- Use golden outputs - Compare against known-good results