Agent Testing Harness

Design and implement comprehensive testing for AI agent systems.

When to Use

Building reliable agent systems
Validating agent behavior before deployment
Creating regression tests for agents
Implementing continuous testing for agent updates
Evaluating agent performance metrics

Testing Pyramid for Agents

    ┌─────────────────────┐
    │   End-to-End        │  Full workflow tests
    │   Agent Tests       │  (expensive, few)
    ├─────────────────────┤
    │   Integration       │  Multi-agent interaction
    │   Tests             │  (medium cost, some)
    ├─────────────────────┤
    │   Component         │  Individual agent behavior
    │   Tests             │  (cheap, many)
    ├─────────────────────┤
    │   Unit Tests        │  Tools, utilities, helpers
    │                     │  (cheapest, most)
    └─────────────────────┘

Unit Testing Agent Components

Testing Tools

import { beforeEach, describe, expect, it, vi } from 'vitest';

/**
 * Unit tests for SearchTool.
 *
 * Covers three behaviours of `execute`: a successful query, rejection of an
 * empty query, and rate limiting under a burst of concurrent calls.
 */
describe('SearchTool', () => {
  const searchTool = new SearchTool();

  it('returns results for valid query', async () => {
    const result = await searchTool.execute({ query: 'test query' });

    expect(result.success).toBe(true);
    expect(result.data.results).toBeInstanceOf(Array);
    expect(result.data.results.length).toBeGreaterThan(0);
  });

  it('handles empty query gracefully', async () => {
    const result = await searchTool.execute({ query: '' });

    expect(result.success).toBe(false);
    expect(result.error.code).toBe('INVALID_INPUT');
  });

  it('respects rate limits', async () => {
    // Make many requests quickly. The comment sits on its own line so that it
    // cannot comment out the declaration that follows it.
    const promises = Array.from({ length: 10 }, () =>
      searchTool.execute({ query: 'test' })
    );

    // NOTE(review): assumes a rate-limited call resolves with an error payload
    // rather than rejecting; Promise.all would reject otherwise — confirm.
    const results = await Promise.all(promises);
    const rateLimited = results.filter(r => r.error?.code === 'RATE_LIMITED');

    expect(rateLimited.length).toBeGreaterThan(0);
  });
});

Testing Prompts

/**
 * Unit tests for generateSystemPrompt: checks that the rendered prompt
 * contains the expected section headings and one bullet per configured tool.
 */
describe('SystemPrompt', () => {
  it('includes all required sections', () => {
    const rendered = generateSystemPrompt(config);
    const requiredHeadings = ['## Your Role', '## Available Tools', '## Constraints'];

    // Checked in the same order as listed; the first missing heading fails the test.
    for (const heading of requiredHeadings) {
      expect(rendered).toContain(heading);
    }
  });

  it('correctly formats tool descriptions', () => {
    const tools = [
      { name: 'search', description: 'Search the web' },
      { name: 'calculate', description: 'Do math' }
    ];
    const rendered = generateSystemPrompt({ tools });
    const expectedBullets = ['- search: Search the web', '- calculate: Do math'];

    for (const bullet of expectedBullets) {
      expect(rendered).toContain(bullet);
    }
  });
});

Component Testing Agents

Mock LLM Responses

/**
 * Deterministic stand-in for an LLM client.
 *
 * Canned responses are registered against input patterns; `complete` returns
 * the response of the first registered pattern (in registration order) that
 * matches the input, or a fixed default when none match.
 */
class MockLLM {
  // Keyed by pattern source so re-registering the same pattern replaces its
  // response. Matchers are compiled once here instead of on every complete().
  private responses: Map<string, { matcher: RegExp; response: string }> = new Map();

  /**
   * Registers a canned response.
   *
   * @param inputPattern A RegExp, or a string interpreted as a regex source.
   *   A string that is not valid regex syntax is matched as literal text.
   *   Matching is always case-insensitive.
   * @param response Text returned by `complete` when the pattern matches.
   */
  setResponse(inputPattern: RegExp | string, response: string): void {
    const key = inputPattern instanceof RegExp ? inputPattern.source : inputPattern;
    this.responses.set(key, { matcher: MockLLM.compile(inputPattern), response });
  }

  /**
   * Returns the response of the first matching pattern.
   *
   * @param input Prompt text to match against the registered patterns.
   * @returns The canned response, or 'Default mock response' if nothing matches.
   */
  async complete(input: string): Promise<string> {
    for (const { matcher, response } of this.responses.values()) {
      if (matcher.test(input)) {
        return response;
      }
    }
    return 'Default mock response';
  }

  /** Builds the case-insensitive matcher used for one registered pattern. */
  private static compile(pattern: RegExp | string): RegExp {
    if (pattern instanceof RegExp) {
      // Keep the caller's flags (m, s, u, ...) but drop g/y: those make
      // RegExp.test stateful via lastIndex, which would make repeated
      // complete() calls alternate between matching and not matching.
      const kept = pattern.flags.replace(/[gy]/g, '');
      return new RegExp(pattern.source, kept.includes('i') ? kept : `${kept}i`);
    }

    try {
      return new RegExp(pattern, 'i');
    } catch {
      // Not valid regex syntax (e.g. an unbalanced parenthesis): match the
      // text literally instead of throwing on every later complete() call.
      const escaped = pattern.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
      return new RegExp(escaped, 'i');
    }
  }
}

/**
 * Component tests for ResearchAgent, run against a MockLLM so that the
 * model's replies are fixed and the assertions are deterministic.
 */
describe('ResearchAgent', () => {
  let agent: ResearchAgent;
  let mockLLM: MockLLM;

  // Fresh mock and agent per test so canned responses do not leak between tests.
  beforeEach(() => {
    mockLLM = new MockLLM();
    agent = new ResearchAgent({ llm: mockLLM });
  });

  it('formulates search queries from task', async () => {
    const cannedPlan = JSON.stringify({ queries: ['query 1', 'query 2'] });
    mockLLM.setResponse(/formulate.*search/i, cannedPlan);

    const plan = await agent.planResearch('Find information about X');

    expect(plan.queries).toHaveLength(2);
  });

  it('synthesizes findings into report', async () => {
    mockLLM.setResponse(
      /synthesize/i,
      'Based on the research, here are the key findings...'
    );
    const findings = [
      { source: 'source1', content: 'finding 1' },
      { source: 'source2', content: 'finding 2' }
    ];

    const report = await agent.synthesize(findings);

    expect(report).toContain('key findings');
  });
});

Testing Agent Decision Making

/**
 * Component tests for tool selection: checks which tool (or action) the
 * agent picks for clearly-typed tasks and for an ambiguous one.
 */
describe('Agent Decision Making', () => {
  it('selects appropriate tool for task', async () => {
    const agent = new Agent({
      tools: [searchTool, calculatorTool, fileReaderTool]
    });

    // Math task should use calculator
    const forMath = await agent.decideTool('Calculate 15% of 200');
    expect(forMath.tool).toBe('calculator');

    // Search task should use search
    const forSearch = await agent.decideTool('Find the latest news about AI');
    expect(forSearch.tool).toBe('search');
  });

  it('handles ambiguous tasks appropriately', async () => {
    const agent = new Agent({ tools: [searchTool, fileReaderTool] });
    const acceptableActions = ['search', 'clarify'];

    const outcome = await agent.decideTool('Read about quantum computing');

    // Should clarify or make reasonable choice
    expect(acceptableActions).toContain(outcome.action);
  });
});

Integration Testing Multi-Agent Systems

/**
 * Integration tests for a supervisor coordinating researcher, writer and
 * reviewer agents: the happy path, a failing worker, and a cost budget.
 */
describe('Multi-Agent Workflow', () => {
  let supervisor: SupervisorAgent;
  let researcher: ResearchAgent;
  let writer: WriterAgent;
  let reviewer: ReviewerAgent;

  // Rebuild the whole agent graph per test so spies and state do not leak.
  beforeEach(() => {
    researcher = new ResearchAgent();
    writer = new WriterAgent();
    reviewer = new ReviewerAgent();
    supervisor = new SupervisorAgent({
      workers: [researcher, writer, reviewer]
    });
  });

  it('coordinates agents to complete task', async () => {
    const result = await supervisor.execute(
      'Write a blog post about renewable energy'
    );

    expect(result.success).toBe(true);
    expect(result.steps).toContainEqual(
      expect.objectContaining({ agent: 'researcher', status: 'completed' })
    );
    expect(result.steps).toContainEqual(
      expect.objectContaining({ agent: 'writer', status: 'completed' })
    );
  });

  it('handles agent failure gracefully', async () => {
    // Make researcher fail. The stub must be a real statement: when it shared
    // a line with this comment it was commented out and no failure was injected.
    vi.spyOn(researcher, 'execute').mockRejectedValue(new Error('API Error'));

    const result = await supervisor.execute('Research topic X');

    expect(result.success).toBe(false);
    expect(result.error).toContain('researcher failed');
    expect(result.recoveryAttempts).toBeGreaterThan(0);
  });

  it('respects budget constraints', async () => {
    // Very low budget
    const result = await supervisor.execute('Complex research task', {
      budgetUSD: 0.01
    });

    expect(result.totalCost).toBeLessThanOrEqual(0.01);
  });
});

End-to-End Agent Tests

/**
 * End-to-end tests for the content creation pipeline.
 *
 * These call a real LLM using the key in the TEST_API_KEY environment
 * variable, so they are slow and carry extended per-test timeouts.
 */
describe('E2E: Content Creation Pipeline', () => {
  // Use real LLM but with test account. The comment is kept on its own line so
  // that it cannot comment out the agent construction below.
  const agent = new ContentCreationAgent({
    llm: new OpenAI({ apiKey: process.env.TEST_API_KEY })
  });

  it('creates blog post from topic', async () => {
    const result = await agent.createContent({
      type: 'blog_post',
      topic: 'Benefits of unit testing',
      targetLength: 500
    });

    // Structure validation
    expect(result.title).toBeDefined();
    // NOTE(review): `content.length` counts characters; confirm that
    // targetLength is also expressed in characters rather than words.
    expect(result.content.length).toBeGreaterThan(400);
    expect(result.content.length).toBeLessThan(600);

    // Content validation
    expect(result.content.toLowerCase()).toContain('test');
    expect(result.sections.length).toBeGreaterThanOrEqual(3);
  }, 60000); // Long timeout for real API calls

  it('handles user feedback loop', async () => {
    const draft = await agent.createContent({
      type: 'blog_post',
      topic: 'AI in healthcare'
    });

    const revised = await agent.reviseContent(draft, {
      feedback: 'Make it more technical and add statistics'
    });

    expect(revised.content).not.toEqual(draft.content);
    // Check for more technical language: at least one percentage figure.
    expect(revised.content).toMatch(/\d+%|\d+ percent/);
  }, 120000); // Two sequential real API calls, so twice the timeout
});

Evaluation Metrics

/**
 * Scores for a single agent run. Every field is a number in the range 0-1.
 */
interface AgentEvaluation {
  /** 0-1: Did it complete the task? */
  taskCompletion: number;
  /** 0-1: Is the output correct? */
  accuracy: number;
  /** 0-1: Token/time efficiency */
  efficiency: number;
  /** 0-1: No harmful outputs */
  safety: number;
  /** 0-1: Consistent results */
  reliability: number;
}

/**
 * Runs an agent over a set of test cases and scores each run.
 *
 * Relies on members that are not shown in this snippet: assessEfficiency,
 * assessSafety, testReliability, compileReport and llmJudge.
 */
class AgentEvaluator {
  /**
   * Executes every test case sequentially, scores each result, then runs the
   * reliability pass and compiles the report.
   *
   * @param agent Agent under evaluation.
   * @param testCases Cases to execute, in order.
   * @returns The report produced by `compileReport`.
   */
  async evaluate(
    agent: Agent,
    testCases: TestCase[]
  ): Promise<EvaluationReport> {
    const results: EvaluationResult[] = [];

    // Sequential on purpose: each run's wall-clock duration is measured.
    for (const testCase of testCases) {
      const startTime = Date.now();
      const result = await agent.execute(testCase.input);
      const duration = Date.now() - startTime;

      const evaluation: AgentEvaluation = {
        taskCompletion: this.assessCompletion(result, testCase.expected),
        accuracy: this.assessAccuracy(result, testCase.expected),
        efficiency: this.assessEfficiency(result, duration),
        safety: this.assessSafety(result),
        reliability: 1 // Will be calculated across runs
      };

      results.push({ testCase, result, evaluation, duration });
    }

    // Run reliability tests (same input multiple times)
    const reliabilityScores = await this.testReliability(agent, testCases);

    return this.compileReport(results, reliabilityScores);
  }

  /**
   * Fraction of required output keys that are present on the result.
   *
   * @returns 0 when the run failed; 1 when the run succeeded and nothing is
   *   required; otherwise present / required, in the range 0-1.
   */
  private assessCompletion(result: Result, expected: Expected): number {
    if (!result.success) return 0;

    // Check required outputs are present
    const requiredKeys = Object.keys(expected.requiredOutputs ?? {});

    // A successful run with no required outputs is fully complete. Dividing by
    // max(length, 1) scored it 0 because the numerator is also 0 in that case.
    if (requiredKeys.length === 0) return 1;

    // `?.` guards against a successful result that carries no output object.
    const presentCount = requiredKeys.filter(
      k => result.output?.[k] !== undefined
    ).length;

    return presentCount / requiredKeys.length;
  }

  /**
   * Accuracy of the output against the expected ground truth.
   *
   * @returns 1 when there is no ground truth to compare against; otherwise
   *   whatever `llmJudge` returns.
   */
  private assessAccuracy(result: Result, expected: Expected): number {
    if (!expected.groundTruth) return 1; // No ground truth to compare

    // Use LLM to judge similarity
    // NOTE(review): if llmJudge is async this returns a Promise despite the
    // `number` annotation — confirm against its definition.
    return this.llmJudge(result.output, expected.groundTruth);
  }
}

Test Data Management

/**
 * One evaluation scenario: the input handed to the agent plus the
 * expectations its result is scored against.
 */
interface TestCase {
  id: string;
  name: string;
  category: string;
  /** Input for the agent run. */
  input: AgentInput;
  /** Expectations for the run. */
  expected: {
    success: boolean;
    /** Only the keys are inspected by AgentEvaluator.assessCompletion. */
    requiredOutputs?: Record<string, any>;
    /** Reference answer; when absent, accuracy is not judged. */
    groundTruth?: string;
    // NOTE(review): presumably upper bounds on run time (ms) and spend (USD);
    // nothing in this file reads them — confirm against the evaluator.
    maxDurationMs?: number;
    maxCostUSD?: number;
  };
  tags: string[];
}

/**
 * Test case factory.
 *
 * Builds a TestCase with a generated id, the 'default' category and no tags.
 *
 * @param name Human-readable test case name.
 * @param input Task text; wrapped as `{ task: input }`.
 * @param expected Expectation overrides. `success` defaults to true and is
 *   replaced when `expected` supplies its own value.
 * @returns The assembled test case.
 */
function createTestCase(
  name: string,
  input: string,
  expected: Partial<TestCase['expected']>
): TestCase {
  return {
    id: generateId(),
    name,
    category: 'default',
    input: { task: input },
    // Spread last so caller-supplied fields win over the default.
    expected: { success: true, ...expected },
    tags: []
  };
}

/**
 * Example test suite for a code-review agent.
 *
 * Each case names the output key the agent must produce. The asymmetric
 * matchers describe the entries that key's array is expected to contain.
 */
const codeReviewTestCases: TestCase[] = [
  createTestCase(
    'Identifies security vulnerability',
    'Review this code: eval(userInput)',
    {
      requiredOutputs: {
        vulnerabilities: expect.arrayContaining([
          expect.objectContaining({ type: 'code_injection' })
        ])
      }
    }
  ),
  createTestCase(
    'Catches null pointer risk',
    'Review: const name = user.profile.name',
    {
      requiredOutputs: {
        warnings: expect.arrayContaining([
          expect.objectContaining({ type: 'null_safety' })
        ])
      }
    }
  )
];

Continuous Testing Pipeline

.github/workflows/agent-tests.yml

# Runs the agent test pyramid: unit and component tests in parallel, then
# integration tests, then end-to-end tests (main branch only).
name: Agent Tests

on:
  push:
    # Globs are required to match files inside the directories.
    paths:
      - 'agents/**'
      - 'prompts/**'
  schedule:
    - cron: '0 0 * * *' # Daily regression

jobs:
  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # Install dependencies so the test runner is available.
      - run: npm ci
      - run: npm test -- --grep "unit"

  component-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: npm ci
      - run: npm test -- --grep "component"
        env:
          MOCK_LLM: true

  integration-tests:
    runs-on: ubuntu-latest
    needs: [unit-tests, component-tests]
    steps:
      - uses: actions/checkout@v4
      - run: npm ci
      - run: npm test -- --grep "integration"
        env:
          TEST_API_KEY: ${{ secrets.TEST_API_KEY }}

  e2e-tests:
    runs-on: ubuntu-latest
    needs: integration-tests
    # Real-API tests cost money, so only run them on main.
    if: github.ref == 'refs/heads/main'
    steps:
      - uses: actions/checkout@v4
      - run: npm ci
      - run: npm test -- --grep "e2e"
        env:
          TEST_API_KEY: ${{ secrets.TEST_API_KEY }}

Best Practices

Mock at boundaries - Mock LLM, not agent logic
Test deterministically - Set seeds, use fixed responses
Measure what matters - Completion, accuracy, safety
Automate regression - Catch prompt regressions
Test failure modes - Agents should fail gracefully
Budget for tests - Real API tests cost money
Version test cases - Track alongside prompt changes
Use golden outputs - Compare against known-good results

agent-testing-harness

Safety Notice

Copy this and send it to your AI assistant to learn

.github/workflows/agent-tests.yml

Source Transparency

Related Skills

graphrag-patterns

agentic-rag

production-rag-checklist