Agent Error Recovery
Design fault-tolerant agent systems that recover gracefully from failures.
When to Use
- Building production-grade agent systems
- Handling API failures in agents
- Implementing autonomous error recovery
- Designing resilient multi-agent workflows
- Setting up monitoring and alerting
Error Classification
/**
 * Coarse failure buckets used to decide how an agent should react to an error.
 * Members are grouped by the kind of recovery that is expected to apply.
 */
enum ErrorCategory {
  // Transient - retry likely to succeed
  RATE_LIMIT = 'rate_limit',
  TIMEOUT = 'timeout',
  NETWORK = 'network',
  SERVICE_UNAVAILABLE = 'service_unavailable',

  // Recoverable - different approach may work
  INVALID_INPUT = 'invalid_input',
  CONTEXT_OVERFLOW = 'context_overflow',
  TOOL_FAILURE = 'tool_failure',

  // Terminal - cannot proceed
  AUTHENTICATION = 'authentication',
  AUTHORIZATION = 'authorization',
  NOT_FOUND = 'not_found',
  VALIDATION = 'validation',

  // Unknown
  UNKNOWN = 'unknown'
}
/**
 * Normalized description of a failure, as produced by error classification.
 */
interface AgentError {
  /** Bucket the failure was sorted into. */
  category: ErrorCategory;
  /** Short machine-readable identifier for the failure. */
  code: string;
  /** Human-readable message taken from the underlying error. */
  message: string;
  /** Whether repeating the operation may succeed. */
  retryable: boolean;
  /** Free-form details attached by the classifier. */
  context: Record<string, unknown>;
  /** When the error was classified. */
  timestamp: Date;
  /** Stack trace of the underlying error, when one was captured. */
  stackTrace?: string;
}
/**
 * Sorts a raw error into an {@link AgentError} by looking for known markers in
 * its message text. Matching is case-insensitive, and checks run in a fixed
 * order, so the first matching rule wins.
 *
 * @param error - The caught failure. Callers typically pass `caught as Error`,
 *   so values without a string `message` are tolerated and stringified.
 * @returns The classified error; anything unrecognized becomes `UNKNOWN`,
 *   is marked non-retryable, and carries the original stack trace if present.
 */
function classifyError(error: Error): AgentError {
  // A thrown value is not guaranteed to be an Error, so do not assume `message` exists.
  const message = typeof error?.message === 'string' ? error.message : String(error);
  const haystack = message.toLowerCase();
  const mentions = (...needles: string[]): boolean =>
    needles.some((needle) => haystack.includes(needle));

  const build = (
    category: ErrorCategory,
    code: string,
    retryable: boolean,
    context: Record<string, unknown> = {}
  ): AgentError => ({ category, code, message, retryable, context, timestamp: new Date() });

  // Rate limits
  if (mentions('429', 'rate limit')) {
    return build(ErrorCategory.RATE_LIMIT, 'RATE_LIMITED', true, {
      waitTime: extractWaitTime(error)
    });
  }

  // Timeouts
  if (mentions('timeout', 'etimedout')) {
    return build(ErrorCategory.TIMEOUT, 'TIMEOUT', true);
  }

  // Context overflow - can retry with truncated context
  if (mentions('context length', 'too long')) {
    return build(ErrorCategory.CONTEXT_OVERFLOW, 'CONTEXT_OVERFLOW', true);
  }

  // Service unavailable - transient, so a retry or a backup service may succeed
  if (mentions('503', 'service unavailable')) {
    return build(ErrorCategory.SERVICE_UNAVAILABLE, 'SERVICE_UNAVAILABLE', true);
  }

  // Network-level failures - transient
  if (mentions('econnreset', 'econnrefused', 'enotfound', 'network error')) {
    return build(ErrorCategory.NETWORK, 'NETWORK', true);
  }

  // Default
  return {
    ...build(ErrorCategory.UNKNOWN, 'UNKNOWN', false),
    stackTrace: error?.stack
  };
}
Recovery Strategies
Strategy 1: Retry with Backoff
/**
 * Tuning knobs for retrying an operation with a growing delay.
 */
interface RetryConfig {
  /** Maximum number of times the operation is attempted. */
  maxAttempts: number;
  /** Delay before the first retry, in milliseconds. */
  initialDelayMs: number;
  /** Upper bound on any single wait, in milliseconds. */
  maxDelayMs: number;
  /** Factor the delay is multiplied by after each failed attempt. */
  backoffMultiplier: number;
  /** Upper bound of the random extra wait added to each delay, in milliseconds. */
  jitterMs: number;
}
/**
 * Runs `operation`, retrying failures that `classifyError` marks as retryable.
 *
 * The wait before each retry is the current backoff delay plus a random jitter
 * of up to `config.jitterMs`, capped at `config.maxDelayMs`. After each wait the
 * backoff delay is multiplied by `config.backoffMultiplier`.
 *
 * @param operation - Async work to attempt; invoked once per attempt.
 * @param config - Attempt limit and backoff parameters.
 * @returns The value of the first attempt that resolves.
 * @throws RangeError if `config.maxAttempts` is not at least 1 (including NaN),
 *   before `operation` is ever invoked.
 * @throws The error from `operation` when it is classified as non-retryable or
 *   when the final attempt fails.
 */
async function retryWithBackoff<T>(
  operation: () => Promise<T>,
  config: RetryConfig
): Promise<T> {
  // Without this guard a zero/negative/NaN limit would skip every attempt and
  // leave nothing meaningful to throw.
  if (!(config.maxAttempts >= 1)) {
    throw new RangeError(`maxAttempts must be at least 1, got ${config.maxAttempts}`);
  }

  let delay = config.initialDelayMs;

  for (let attempt = 1; ; attempt++) {
    try {
      return await operation();
    } catch (error) {
      const classified = classifyError(error as Error);

      // Don't retry non-retryable errors
      if (!classified.retryable) {
        throw error;
      }

      // Last attempt - throw (also avoids a pointless sleep when no attempt follows)
      if (attempt + 1 > config.maxAttempts) {
        throw error;
      }

      // Calculate delay with jitter
      const jitter = Math.random() * config.jitterMs;
      const waitTime = Math.min(delay + jitter, config.maxDelayMs);

      console.log(`Attempt ${attempt} failed, retrying in ${waitTime}ms...`);
      await sleep(waitTime);

      // Increase delay for next attempt
      delay *= config.backoffMultiplier;
    }
  }
}
Strategy 2: Circuit Breaker
/**
 * States a circuit breaker can be in.
 */
enum CircuitState {
  CLOSED = 'closed', // Normal operation
  OPEN = 'open', // Failing, reject requests
  HALF_OPEN = 'half_open' // Testing if recovered
}
/**
 * Wraps an async operation and stops calling it after repeated failures.
 *
 * - CLOSED: calls pass through; `failureThreshold` failures without an
 *   intervening success open the circuit.
 * - OPEN: calls are rejected without invoking the operation until
 *   `resetTimeoutMs` has elapsed since the last recorded failure.
 * - HALF_OPEN: calls pass through as probes; `successThreshold` successes close
 *   the circuit, and any failure opens it again.
 */
class CircuitBreaker {
  private state = CircuitState.CLOSED;
  private failures = 0;
  private lastFailure?: Date;
  private successCount = 0;

  constructor(
    private config: {
      failureThreshold: number;
      resetTimeoutMs: number;
      successThreshold: number;
    }
  ) {}

  /**
   * Runs `operation` unless the circuit is open.
   *
   * @param operation - Async work guarded by the breaker.
   * @returns Whatever `operation` resolves to.
   * @throws Error with message 'Circuit breaker is OPEN' when calls are being rejected.
   * @throws Whatever `operation` rejects with; the failure is recorded first.
   */
  async execute<T>(operation: () => Promise<T>): Promise<T> {
    // Check if circuit should transition
    this.checkState();

    if (this.state === CircuitState.OPEN) {
      throw new Error('Circuit breaker is OPEN');
    }

    try {
      const result = await operation();
      this.onSuccess();
      return result;
    } catch (error) {
      this.onFailure();
      throw error;
    }
  }

  /** Moves an OPEN circuit to HALF_OPEN once the reset timeout has elapsed. */
  private checkState(): void {
    if (this.state !== CircuitState.OPEN || this.lastFailure === undefined) {
      return;
    }

    const elapsed = Date.now() - this.lastFailure.getTime();
    if (elapsed >= this.config.resetTimeoutMs) {
      this.state = CircuitState.HALF_OPEN;
      this.successCount = 0;
    }
  }

  /** Records a success: counts probes while HALF_OPEN, clears failures while CLOSED. */
  private onSuccess(): void {
    if (this.state === CircuitState.HALF_OPEN) {
      this.successCount++;
      if (this.successCount >= this.config.successThreshold) {
        this.state = CircuitState.CLOSED;
        this.failures = 0;
      }
    } else if (this.state === CircuitState.CLOSED) {
      this.failures = 0;
    }
    // A success that lands while OPEN belongs to a call started earlier; it
    // must not erase the failures that opened the circuit.
  }

  /** Records a failure and opens the circuit when warranted. */
  private onFailure(): void {
    this.failures++;
    this.lastFailure = new Date();

    // A failed probe reopens immediately, independent of the failure count.
    if (
      this.state === CircuitState.HALF_OPEN ||
      this.failures >= this.config.failureThreshold
    ) {
      this.state = CircuitState.OPEN;
    }
  }
}
Strategy 3: Fallback Chain
/**
 * One alternative way of producing a `T` when the primary operation fails.
 */
interface FallbackOption<T> {
  /** Label used when logging that this fallback is being tried. */
  name: string;
  /** Produces the substitute result. */
  execute: () => Promise<T>;
  /** Decides, from the classified primary failure, whether this fallback should be tried. */
  isApplicable: (error: AgentError) => boolean;
}
/**
 * Tries `primary`; if it rejects, walks `fallbacks` in order and returns the
 * result of the first applicable one that resolves.
 *
 * Applicability is judged against the classified *primary* failure. A fallback
 * that rejects is logged and skipped.
 *
 * @param primary - Preferred operation.
 * @param fallbacks - Alternatives, tried in array order.
 * @returns The primary result, or the first successful applicable fallback result.
 * @throws The primary failure when no applicable fallback succeeds.
 */
async function executeWithFallbacks<T>(
  primary: () => Promise<T>,
  fallbacks: FallbackOption<T>[]
): Promise<T> {
  let primaryFailure: unknown;

  try {
    return await primary();
  } catch (caught) {
    primaryFailure = caught;
  }

  const failure = classifyError(primaryFailure as Error);

  for (const option of fallbacks) {
    if (!option.isApplicable(failure)) {
      continue;
    }

    console.log(`Primary failed, trying fallback: ${option.name}`);
    try {
      return await option.execute();
    } catch {
      console.log(`Fallback ${option.name} also failed`);
    }
  }

  // All fallbacks failed
  throw primaryFailure;
}
// Example usage: each fallback is only tried for the failure category it can address.
const result = await executeWithFallbacks(
  () => callPrimaryAPI(),
  [
    {
      name: 'backup_api',
      execute: () => callBackupAPI(),
      isApplicable: (e) => e.category === ErrorCategory.SERVICE_UNAVAILABLE
    },
    {
      name: 'cached_response',
      execute: () => getCachedResponse(),
      isApplicable: (e) => e.category === ErrorCategory.TIMEOUT
    },
    {
      name: 'simplified_request',
      execute: () => callWithReducedParams(),
      isApplicable: (e) => e.category === ErrorCategory.CONTEXT_OVERFLOW
    }
  ]
);
Strategy 4: Self-Healing Agent
/**
 * Agent wrapper that, when a task fails, tries a category-specific healing
 * action and then re-runs the (possibly modified) task.
 */
class SelfHealingAgent {
  /**
   * Wait applied when a rate-limit error carries no usable `waitTime`.
   * NOTE(review): 1000 ms is a placeholder default - tune for the target API.
   */
  private static readonly DEFAULT_RATE_LIMIT_WAIT_MS = 1000;

  /**
   * Runs `task`, healing and retrying up to three attempts in total.
   *
   * @param task - Work to run; may be replaced by a healing action's `modifyTask`.
   * @returns The result of the first attempt that succeeds.
   * @throws The original error when no healing action exists for its category.
   * @throws Error 'Max healing attempts exceeded' (with the last failure attached
   *   as `cause`) when every attempt fails.
   */
  async execute(task: Task): Promise<Result> {
    const maxAttempts = 3;
    let lastError: unknown;

    for (let attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        return await this.runTask(task);
      } catch (error) {
        lastError = error;
        const classified = classifyError(error as Error);

        // Can we heal?
        const healingAction = this.determineHealingAction(classified);
        if (!healingAction) {
          throw error;
        }

        // No attempt remains to benefit from healing, so don't run it
        // (a rate-limit healing action would otherwise sleep for nothing).
        if (attempt === maxAttempts) {
          break;
        }

        console.log(`Attempting self-healing: ${healingAction.description}`);

        // Execute healing
        await healingAction.execute();

        // Modify task if needed
        task = healingAction.modifyTask?.(task) ?? task;
      }
    }

    const exhausted: Error & { cause?: unknown } = new Error('Max healing attempts exceeded');
    exhausted.cause = lastError;
    throw exhausted;
  }

  /**
   * Picks a healing action for the error's category.
   *
   * @returns The action to run before retrying, or `null` when the category has
   *   no healing strategy or the error lacks the detail the strategy needs.
   */
  private determineHealingAction(error: AgentError): HealingAction | null {
    switch (error.category) {
      case ErrorCategory.CONTEXT_OVERFLOW:
        return {
          description: 'Truncating context to fit limits',
          execute: async () => {},
          modifyTask: (task) => ({
            ...task,
            context: this.truncateContext(task.context)
          })
        };

      case ErrorCategory.TOOL_FAILURE: {
        const toolName = error.context.toolName;
        // Without a tool name there is nothing to exclude, so healing cannot help.
        if (typeof toolName !== 'string') {
          return null;
        }
        return {
          description: 'Switching to alternative tool',
          execute: async () => {
            this.toolRouter.excludeTool(toolName);
          }
        };
      }

      case ErrorCategory.RATE_LIMIT: {
        const hinted = error.context.waitTime;
        const waitTime =
          typeof hinted === 'number' && Number.isFinite(hinted) && hinted >= 0
            ? hinted
            : SelfHealingAgent.DEFAULT_RATE_LIMIT_WAIT_MS;
        return {
          description: `Waiting ${waitTime}ms for rate limit`,
          execute: async () => {
            await sleep(waitTime);
          }
        };
      }

      default:
        return null;
    }
  }
}
Error Recovery Workflow
┌─────────────────────────────────────────────────────────────┐
│                        Error Occurs                         │
└─────────────────────────────────────────────────────────────┘
                               │
                               ▼
┌─────────────────────────────────────────────────────────────┐
│                       Classify Error                        │
└─────────────────────────────────────────────────────────────┘
                               │
             ┌─────────────────┼─────────────────┐
             ▼                 ▼                 ▼
      ┌─────────────┐   ┌─────────────┐   ┌─────────────┐
      │  Transient  │   │ Recoverable │   │  Terminal   │
      └──────┬──────┘   └──────┬──────┘   └──────┬──────┘
             │                 │                 │
             ▼                 ▼                 ▼
      ┌─────────────┐   ┌─────────────┐   ┌─────────────┐
      │    Retry    │   │     Try     │   │    Log &    │
      │  w/Backoff  │   │  Fallback   │   │    Alert    │
      └──────┬──────┘   └──────┬──────┘   └──────┬──────┘
             │                 │                 │
             ▼                 ▼                 ▼
      ┌─────────────────────────────────────────────────┐
      │               Success or Escalate               │
      └─────────────────────────────────────────────────┘
Monitoring and Alerting
/**
 * Aggregate error statistics over a recent time window.
 */
interface ErrorMetrics {
  /** Number of errors in the window. */
  totalErrors: number;
  /** Error count in the window, keyed by category. */
  errorsByCategory: Map<ErrorCategory, number>;
  errorRate: number; // errors per minute
  recoveryRate: number; // successful recoveries
  mttr: number; // mean time to recover (ms)
}
/** Payload passed to the alert handler when an alert condition is met. */
interface ErrorAlert {
  severity: 'low' | 'medium' | 'high';
  message: string;
  errors: AgentError[];
}

/**
 * Collects classified errors and recoveries, raises alerts on error bursts and
 * repeated errors, and reports rolling metrics.
 *
 * Entries older than the metrics window are discarded whenever something is
 * recorded, so memory use stays bounded in a long-running process.
 */
class ErrorMonitor {
  /** Window inspected for alert conditions: the last minute. */
  private static readonly ALERT_WINDOW_MS = 60000;
  /** Window reported by `getMetrics`, and the retention period: 5 minutes. */
  private static readonly METRICS_WINDOW_MS = 5 * 60 * 1000;

  private errors: AgentError[] = [];
  private recoveries: { error: AgentError; recoveredAt: Date }[] = [];

  /**
   * @param onAlert - Receives every alert raised. Defaults to writing a
   *   one-line warning to the console.
   */
  constructor(
    private readonly onAlert: (alert: ErrorAlert) => void = (alert) =>
      console.warn(`[${alert.severity}] ${alert.message}`)
  ) {}

  /** Stores `error` and evaluates the alert conditions. */
  recordError(error: AgentError): void {
    this.errors.push(error);
    this.prune();
    this.checkAlerts();
  }

  /** Notes that `error` was recovered from, stamped with the current time. */
  recordRecovery(error: AgentError): void {
    this.recoveries.push({ error, recoveredAt: new Date() });
    this.prune();
  }

  /**
   * Computes metrics over the last 5 minutes.
   *
   * @returns Counts, errors per minute, recoveries per error (denominator
   *   floored at 1), and mean time from error timestamp to recovery in ms
   *   (0 when there were no recoveries).
   */
  getMetrics(): ErrorMetrics {
    const windowMs = ErrorMonitor.METRICS_WINDOW_MS;
    const recent = this.getRecentErrors(windowMs);

    const byCategory = new Map<ErrorCategory, number>();
    for (const e of recent) {
      byCategory.set(e.category, (byCategory.get(e.category) || 0) + 1);
    }

    const recentRecoveries = this.recoveries.filter(
      r => Date.now() - r.recoveredAt.getTime() < windowMs
    );

    const recoveryTimes = recentRecoveries.map(
      r => r.recoveredAt.getTime() - r.error.timestamp.getTime()
    );

    return {
      totalErrors: recent.length,
      errorsByCategory: byCategory,
      errorRate: recent.length / (windowMs / 60000),
      recoveryRate: recentRecoveries.length / Math.max(recent.length, 1),
      mttr: recoveryTimes.length > 0
        ? recoveryTimes.reduce((a, b) => a + b, 0) / recoveryTimes.length
        : 0
    };
  }

  /** Raises alerts for a high overall error count or a frequently repeated error. */
  private checkAlerts(): void {
    const recentErrors = this.getRecentErrors(ErrorMonitor.ALERT_WINDOW_MS); // Last minute

    // High error rate alert
    if (recentErrors.length > 10) {
      this.sendAlert({
        severity: 'high',
        message: `High error rate: ${recentErrors.length} errors in last minute`,
        errors: recentErrors
      });
    }

    // Repeated same error alert
    const errorCounts = new Map<string, number>();
    for (const e of recentErrors) {
      const key = `${e.category}:${e.code}`;
      errorCounts.set(key, (errorCounts.get(key) || 0) + 1);
    }

    for (const [key, count] of errorCounts) {
      if (count >= 5) {
        this.sendAlert({
          severity: 'medium',
          message: `Repeated error: ${key} occurred ${count} times`,
          errors: recentErrors.filter(e => `${e.category}:${e.code}` === key)
        });
      }
    }
  }

  /** Returns stored errors whose timestamp is less than `windowMs` old. */
  private getRecentErrors(windowMs: number): AgentError[] {
    const now = Date.now();
    return this.errors.filter(e => now - e.timestamp.getTime() < windowMs);
  }

  /** Hands an alert to the configured handler. */
  private sendAlert(alert: ErrorAlert): void {
    this.onAlert(alert);
  }

  /** Drops entries that have aged out of the metrics window. */
  private prune(): void {
    const now = Date.now();
    const keepMs = ErrorMonitor.METRICS_WINDOW_MS;
    this.errors = this.errors.filter(e => now - e.timestamp.getTime() < keepMs);
    this.recoveries = this.recoveries.filter(r => now - r.recoveredAt.getTime() < keepMs);
  }
}
Best Practices
- Classify all errors — know what you're dealing with
- Don't retry everything — some errors won't recover
- Use exponential backoff — avoid hammering failing services
- Set circuit breakers — protect downstream systems
- Log everything — debugging is hard without logs
- Have fallbacks — always have a Plan B
- Alert on patterns — single errors may be noise; patterns matter
- Test failure scenarios — practice chaos engineering