AgentForge

Testing Recipes

Common testing patterns for agents: asserting tool-call order, recovering from tool errors, enforcing budget limits, and stepping through a run with the StepDebugger.

Test Tool Call Order

// Recipe: check that the recorded tool calls appear in the expected order.
it('should search then process', async () => {
  // Scripted LLM turns: two tool-call responses followed by a plain-text response.
  const scriptedTurns = [
    { toolCalls: [{ name: 'search', input: { q: 'test' } }] },
    { toolCalls: [{ name: 'process', input: { data: 'found' } }] },
    { text: 'Done processing.' },
  ];
  const llm = createMockLLM({ responses: scriptedTurns });
  const harness = createTestHarness({ agent: config, llm });

  const result = await harness.run({ task: 'Search and process' });

  // Compare each recorded call's name against the expected name at the same position.
  const recorded = result.toolCalls();
  ['search', 'process'].forEach((expectedName, position) => {
    expect(recorded[position].name).toBe(expectedName);
  });
});

Test Error Recovery

// Recipe: a tool that throws is expected to leave an error on its recorded call
// while the run itself still reports 'completed'.
it('should handle tool errors gracefully', async () => {
  // Tool whose execute always throws; retry is configured with maxAttempts: 1.
  const flakyApi = defineTool({
    name: 'flaky-api',
    description: 'Sometimes fails',
    input: z.object({}),
    output: z.object({ data: z.string() }),
    async execute() {
      throw new Error('API down');
    },
    retry: { maxAttempts: 1 },
  });

  // Scripted LLM: request the failing tool once, then answer in plain text.
  const llm = createMockLLM({
    responses: [
      { toolCalls: [{ name: 'flaky-api', input: {} }] },
      { text: 'The API is currently unavailable.' },
    ],
  });

  // Reuse the shared agent config, overriding only its tool list.
  const agent = { ...config, tools: [flakyApi] };
  const harness = createTestHarness({ agent, llm });

  const outcome = await harness.run({ task: 'Fetch data' });

  expect(outcome.status).toBe('completed');

  // Look up the calls recorded for this tool and inspect the first one.
  const flakyCalls = outcome.toolCalls('flaky-api');
  expect(flakyCalls[0].error).toBeDefined();
});

Test Budget Limits

// Recipe: a run whose token usage goes past the configured budget is expected
// to end with status 'failed'.
it('should stop when budget exceeded', async () => {
  const mock = createMockLLM({
    // Build 20 distinct response objects. Array(20).fill(obj) would place the
    // SAME object in every slot, so a mutation of one response by any consumer
    // would leak into all the others.
    responses: Array.from({ length: 20 }, () => ({ text: 'thinking...' })),
    // 15_000 tokens per response against a 20_000 budget below.
    // NOTE(review): presumably input and output tokens are summed per response,
    // so the budget is crossed on the second response; confirm against the harness.
    defaultUsage: { inputTokens: 10_000, outputTokens: 5_000 },
  });

  const harness = createTestHarness({
    // Reuse the shared agent config, adding only the token budget.
    agent: { ...config, budget: { maxTokens: 20_000 } },
    llm: mock,
  });

  const result = await harness.run({ task: 'Expensive task' });
  expect(result.status).toBe('failed');
});

Test with StepDebugger

// Recipe: advance a debug session one step at a time and check which tool each
// step calls before letting the run finish.
// NOTE(review): `harness` is not created in this snippet; it is assumed to be
// in scope already (e.g. built with createTestHarness in shared setup).
it('should create order before charging', async () => {
  const stepper = await harness.startDebug({ task: 'Process payment' });

  // Each next() result is expected to carry the tool call made at that step.
  for (const expectedTool of ['create-order', 'charge-payment']) {
    const paused = await stepper.next();
    expect(paused.step.toolCall?.name).toBe(expectedTool);
  }

  // Finish the session and check the final status.
  const outcome = await stepper.finish();
  expect(outcome.status).toBe('completed');
});

Next Steps