AgentForge

Example: Testing an Agent

Write a complete test suite for an AgentForge agent using MockLLM and TestHarness.

Overview

This example shows how to test the support triage agent with deterministic, reproducible tests.

Full Test Suite

import { describe, it, expect } from 'vitest';
import {
  createMockLLM, createTestHarness, defineTool, defineAgent,
} from '@ahzan-agentforge/core';
import { z } from 'zod';

// Agent building blocks are declared without an LLM — each test injects a MockLLM.

/**
 * Stub `classify-ticket` tool used by the tests below.
 *
 * Takes a ticket id and its content and always resolves to the same
 * classification (`technical` / `medium`), whatever the input is, so the
 * test outcomes never depend on the ticket text.
 */
const classifyTool = defineTool({
  name: 'classify-ticket',
  description: 'Classify a support ticket',
  input: z.object({
    ticketId: z.string(),
    content: z.string(),
  }),
  output: z.object({
    category: z.enum(['billing', 'technical', 'general']),
    priority: z.enum(['low', 'medium', 'high']),
  }),
  // Both fields are destructured to mirror the input schema; the stub ignores them.
  async execute({ ticketId, content }) {
    return { category: 'technical', priority: 'medium' };
  },
});

// Shared agent definition for every test in this file. It deliberately carries
// no LLM: each test hands this object to createTestHarness together with its
// own MockLLM, and some tests spread it to override `tools` or add a `budget`.
const agentConfig = {
  name: 'support-triage',
  description: 'Triages support tickets',
  tools: [classifyTool],
  systemPrompt: 'Classify and route support tickets.',
  // NOTE(review): presumably caps the number of agent loop steps — confirm against the AgentForge docs.
  maxSteps: 5,
};

// Deterministic test suite for the support-triage agent: every test scripts the
// LLM's turns up front with createMockLLM and runs the agent via createTestHarness.
describe('support-triage agent', () => {
  // Name under which both the stub tool and its failing variant are registered.
  const TOOL_NAME = 'classify-ticket';

  // Scripted LLM turn that requests a single classify-ticket call.
  const askToClassify = (ticketId, content) => ({
    toolCalls: [{ name: TOOL_NAME, input: { ticketId, content } }],
  });

  // Scripted LLM turn that carries plain text; MockLLM uses the `text` field.
  const say = (text) => ({ text });

  it('should classify a technical ticket', async () => {
    const llm = createMockLLM({
      responses: [
        askToClassify('123', '500 error on checkout'),
        say('Ticket #123 classified as technical, medium priority.'),
      ],
    });
    const harness = createTestHarness({ agent: agentConfig, llm });

    const outcome = await harness.run({
      task: 'Triage ticket #123: 500 error on checkout',
    });

    expect(outcome.status).toBe('completed');
    expect(outcome.toolCalls('classify-ticket')).toHaveLength(1);
    expect(outcome.hasError()).toBe(false);
  });

  it('should complete within budget', async () => {
    // The same figure is used for the configured budget and for the assertion.
    const tokenBudget = 10_000;

    const llm = createMockLLM({
      responses: [askToClassify('1', 'test'), say('Done.')],
      // NOTE(review): presumably the usage reported for each scripted response — confirm.
      defaultUsage: { inputTokens: 500, outputTokens: 200 },
    });
    const budgetedAgent = { ...agentConfig, budget: { maxTokens: tokenBudget } };
    const harness = createTestHarness({ agent: budgetedAgent, llm });

    const outcome = await harness.run({ task: 'Triage ticket' });

    expect(outcome.status).toBe('completed');
    expect(outcome.trace.summary.totalTokens).toBeLessThan(tokenBudget);
  });

  it('should handle tool errors gracefully', async () => {
    // Same name and input schema as the stub tool, but execution always throws.
    const failingTool = defineTool({
      name: TOOL_NAME,
      description: 'Classify a support ticket',
      input: z.object({
        ticketId: z.string(),
        content: z.string(),
      }),
      output: z.object({
        category: z.string(),
        priority: z.string(),
      }),
      async execute() {
        throw new Error('Database unavailable');
      },
      retry: { maxAttempts: 1 },
    });

    const llm = createMockLLM({
      responses: [
        askToClassify('1', 'test'),
        say('Unable to classify ticket due to a system error.'),
      ],
    });
    const agentWithFailingTool = { ...agentConfig, tools: [failingTool] };
    const harness = createTestHarness({ agent: agentWithFailingTool, llm });

    const outcome = await harness.run({ task: 'Triage ticket' });

    // The run is expected to finish even though a tool error was recorded.
    expect(outcome.status).toBe('completed');
    expect(outcome.hasError('tool')).toBe(true);
  });

  it('should step through execution', async () => {
    const llm = createMockLLM({
      responses: [askToClassify('1', 'test'), say('Classified.')],
    });
    const harness = createTestHarness({ agent: agentConfig, llm });
    const stepper = await harness.startDebug({ task: 'Triage ticket' });

    // First step: the LLM asks for classify-ticket, so the run is not done yet.
    const afterToolRequest = await stepper.next();
    expect(afterToolRequest.done).toBe(false);

    // Second step: the LLM answers with its final text, which ends the run.
    const afterFinalText = await stepper.next();
    expect(afterFinalText.done).toBe(true);

    const outcome = await stepper.finish();
    expect(outcome.status).toBe('completed');
  });
});

Key Points

  • Use createMockLLM() for deterministic responses
  • Use createTestHarness() for rich result assertions
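  • toolCalls() and hasError() are methods on the run result — call them (e.g. result.toolCalls('classify-ticket')) rather than reading them as properties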
  • MockLLM responses use a text field (not content) for the model's text reply
  • StepDebugger lets you advance a run one step at a time and verify execution order

Next Steps