#!/usr/bin/env node
/**
 * Diligence Test Runner
 *
 * Runs end-to-end tests of the Worker-Reviewer loop.
 *
 * Modes:
 *   --workflow  Test MCP workflow mechanics only (no AI)
 *   --mock      Use mock Worker/Reviewer responses
 *   --live      Use real Claude API for Worker/Reviewer (requires ANTHROPIC_API_KEY)
 *
 * Options:
 *   --scenario=<id>  Run a single scenario (mock/live modes); default is every
 *                    scenario listed in scenarios/index.json
 *
 * Usage:
 *   node test/run-tests.mjs --workflow
 *   node test/run-tests.mjs --mock --scenario=blocking-voice
 *   node test/run-tests.mjs --live --scenario=permission-cache
 */

import { McpClient } from './mcp-client.mjs';
import { readFileSync, existsSync, unlinkSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';

const __dirname = dirname(fileURLToPath(import.meta.url));

// Parse CLI args
const SCENARIO_FLAG = '--scenario=';
const args = process.argv.slice(2);
// First recognised mode flag wins; with none given the runner defaults to --workflow.
const mode = args.find((a) => ['--workflow', '--mock', '--live'].includes(a)) || '--workflow';
const scenarioArg = args.find((a) => a.startsWith(SCENARIO_FLAG));
// Strip only the flag prefix so ids that themselves contain '=' are kept intact.
const scenarioId = scenarioArg ? scenarioArg.slice(SCENARIO_FLAG.length) : null;
// NOTE: parsed but not read anywhere else in this file.
const verbose = args.includes('--verbose') || args.includes('-v');

// Colors for output (ANSI escape sequences)
const colors = {
  reset: '\x1b[0m',
  green: '\x1b[32m',
  red: '\x1b[31m',
  yellow: '\x1b[33m',
  blue: '\x1b[34m',
  dim: '\x1b[2m',
};

/**
 * Print a message to stdout wrapped in an ANSI color.
 *
 * @param {string} msg - Text to print.
 * @param {string} [color='reset'] - Key of `colors`; unknown keys fall back to
 *   the reset code instead of printing the text "undefined".
 */
function log(msg, color = 'reset') {
  const prefix = colors[color] ?? colors.reset;
  console.log(`${prefix}${msg}${colors.reset}`);
}

/**
 * Print a blue section banner preceded by a blank line.
 *
 * @param {string} title - Section title.
 */
function logSection(title) {
  console.log(`\n${colors.blue}=== ${title} ===${colors.reset}`);
}

/**
 * Load one scenario definition from `scenarios/<id>.json` next to this file.
 *
 * @param {string} id - Scenario identifier (file name without extension).
 * @returns {object} Parsed scenario JSON.
 * @throws {Error} When the scenario file does not exist.
 */
function loadScenario(id) {
  const path = join(__dirname, 'scenarios', `${id}.json`);
  if (!existsSync(path)) {
    throw new Error(`Scenario not found: ${id}`);
  }
  return JSON.parse(readFileSync(path, 'utf-8'));
}

/**
 * Load every scenario listed in `scenarios/index.json`.
 *
 * @returns {object[]} Parsed scenarios, in index order.
 */
function loadAllScenarios() {
  const index = JSON.parse(readFileSync(join(__dirname, 'scenarios', 'index.json'), 'utf-8'));
  return index.scenarios.map((s) => loadScenario(s.id));
}

/**
 * Scenarios selected on the command line: the single `--scenario=<id>` one
 * when given, otherwise all indexed scenarios.
 *
 * @returns {object[]} Scenarios to run.
 */
function selectScenarios() {
  return scenarioId ? [loadScenario(scenarioId)] : loadAllScenarios();
}

/**
 * Delete the fixture's persisted workflow state file, if present, so each
 * test run starts from a clean slate.
 */
function cleanState() {
  const stateFile = join(__dirname, 'fixture', '.claude', '.diligence-state.json');
  if (existsSync(stateFile)) {
    unlinkSync(stateFile);
  }
}

/**
 * Disconnect the client and remove the state file. The state file is removed
 * even when `disconnect()` throws, so a failed shutdown cannot leak state
 * into the next run; the disconnect error still propagates.
 *
 * @param {McpClient} client - Client to shut down.
 */
async function shutdown(client) {
  try {
    await client.disconnect();
  } finally {
    cleanState();
  }
}

// ============================================================================
// Workflow Tests (no AI, just MCP mechanics)
// ============================================================================

/**
 * Drive the MCP server through the full workflow and check the text of each
 * response: start, propose, review (NEEDS_WORK then APPROVED), implement,
 * complete, abort, and the max-rounds limit.
 *
 * @returns {Promise<boolean>} `true` when every check passed.
 */
async function testWorkflow() {
  logSection('Workflow Tests');

  const client = new McpClient();
  let passed = 0;
  let failed = 0;

  // Log one PASS/FAIL line and update the counters.
  const check = (label, ok) => {
    log(` [${ok ? 'PASS' : 'FAIL'}] ${label}`, ok ? 'green' : 'red');
    if (ok) {
      passed += 1;
    } else {
      failed += 1;
    }
  };

  try {
    cleanState();
    await client.connect();
    log('Connected to MCP server', 'green');

    // Test 1: Status in conversation phase
    {
      const result = await client.status();
      check('Initial status is conversation', result.text.includes('Phase: conversation'));
    }

    // Test 2: Start workflow
    {
      const result = await client.start('Test task');
      check(
        'Start transitions to researching',
        result.text.includes('researching') && result.text.includes('Round: 1/5'),
      );
    }

    // Test 3: Cannot start again while in progress
    {
      const result = await client.start('Another task');
      check('Cannot start while in progress', result.isError && result.text.includes('Already in'));
    }

    // Test 4: Get worker brief
    {
      const result = await client.getWorkerBrief();
      check(
        'Worker brief contains task',
        result.text.includes('Worker Brief') && result.text.includes('Test task'),
      );
    }

    // Test 5: Submit proposal
    {
      const result = await client.propose('## Analysis\n\nProposed fix here');
      check('Proposal submitted', result.text.includes('Proposal submitted'));
    }

    // Test 6: Get reviewer brief includes proposal
    {
      const result = await client.getReviewerBrief();
      check(
        'Reviewer brief contains proposal',
        result.text.includes('Reviewer Brief') && result.text.includes('Proposed fix here'),
      );
    }

    // Test 7: Review with NEEDS_WORK
    {
      const result = await client.review('NEEDS_WORK', 'Missing broker event handling');
      check(
        'NEEDS_WORK increments round',
        result.text.includes('NEEDS_WORK') && result.text.includes('Round 2/5'),
      );
    }

    // Test 8: Worker brief now includes feedback
    {
      const result = await client.getWorkerBrief();
      check(
        'Worker brief includes previous feedback',
        result.text.includes('Previous Feedback') && result.text.includes('broker event'),
      );
    }

    // Test 9: Submit revised proposal
    {
      const result = await client.propose('## Revised\n\nNow with broker events');
      check('Revised proposal submitted', result.text.includes('Proposal submitted'));
    }

    // Test 10: Review with APPROVED
    {
      const result = await client.review('APPROVED', 'All checks pass');
      check(
        'APPROVED after review',
        result.text.includes('APPROVED') && result.text.includes('2 round'),
      );
    }

    // Test 11: Status shows approved
    {
      const result = await client.status();
      check('Status shows approved phase', result.text.includes('Phase: approved'));
    }

    // Test 12: Implement
    {
      const result = await client.implement();
      check('Implement starts implementation', result.text.includes('Implementation phase'));
    }

    // Test 13: Complete
    {
      const result = await client.complete('Fixed the bug');
      check(
        'Complete resets workflow',
        result.text.includes('Complete') && result.text.includes('Reset to conversation'),
      );
    }

    // Test 14: Back to conversation
    {
      const result = await client.status();
      check('Back to conversation phase', result.text.includes('Phase: conversation'));
    }

    // Test 15: Abort works
    {
      await client.start('Task to abort');
      const result = await client.abort('Changed my mind');
      check(
        'Abort resets workflow',
        result.text.includes('Aborted') && result.text.includes('Reset to conversation'),
      );
    }

    // Test 16: Max rounds enforcement
    // Five proposals with NEEDS_WORK after the first four; the fifth
    // NEEDS_WORK review below is the one expected to hit the limit.
    {
      await client.start('Task for max rounds');
      for (let i = 0; i < 5; i++) {
        await client.propose(`Proposal ${i + 1}`);
        if (i < 4) {
          await client.review('NEEDS_WORK', `Feedback ${i + 1}`);
        }
      }
      const result = await client.review('NEEDS_WORK', 'Still not good');
      check(
        'Max rounds resets workflow',
        result.text.includes('MAX ROUNDS') && result.text.includes('reset'),
      );
    }

    log(`\nWorkflow tests: ${passed} passed, ${failed} failed`, failed ? 'red' : 'green');
    return failed === 0;
  } finally {
    await shutdown(client);
  }
}

// ============================================================================
// Mock Tests (predefined Worker/Reviewer responses)
// ============================================================================

/**
 * Replay a scenario with canned Worker/Reviewer messages: a naive proposal
 * that is rejected, then the scenario's correct fix that is approved. The
 * revised proposal text is then checked against the scenario's
 * `validation_criteria`.
 *
 * @param {object} scenario - Parsed scenario JSON. Reads `name`, `id`, `task`,
 *   `naive_fix`, `correct_fix` and `validation_criteria`.
 * @returns {Promise<boolean>} `true` when all validation criteria were met.
 */
async function testWithMocks(scenario) {
  logSection(`Mock Test: ${scenario.name}`);

  const client = new McpClient();

  try {
    cleanState();
    await client.connect();

    // Start the workflow
    await client.start(scenario.task);
    log(`Started task: ${scenario.task.slice(0, 60)}...`, 'dim');

    // Round 1: Worker submits naive fix
    const naiveProposal = `## Analysis

${scenario.naive_fix.description}

### Changes
${scenario.naive_fix.changes.map((c) => `- ${c.file}: ${c.change}`).join('\n')}
`;

    await client.propose(naiveProposal);
    log('Worker submitted naive proposal', 'dim');

    // Round 1: Reviewer catches issues
    const issues = scenario.naive_fix.issues;
    const reviewFeedback = `NEEDS_WORK

Issues found:
${issues.map((issue, i) => `${i + 1}. ${issue}`).join('\n')}

The proposal misses critical aspects. Please address all issues.
`;

    await client.review('NEEDS_WORK', reviewFeedback);
    log(`Reviewer found ${issues.length} issues`, 'yellow');

    // Round 2: Worker submits correct fix
    const correctProposal = `## Revised Analysis

${scenario.correct_fix.description}

### Required Changes
${scenario.correct_fix.required_changes.map((c) =>
  `#### ${c.file}:${c.function}
- ${c.change}
- Reference: ${c.line_reference}`
).join('\n\n')}

### Broker Subscriptions
${scenario.correct_fix.required_broker_subscriptions.map((s) =>
  `- ${s.service} subscribes to ${s.event}: ${s.action}`
).join('\n')}

### Pattern References
${scenario.correct_fix.pattern_references.map((p) => `- ${p}`).join('\n')}
`;

    await client.propose(correctProposal);
    log('Worker submitted revised proposal', 'dim');

    // Round 2: Reviewer approves
    await client.review('APPROVED', 'All required changes identified. Pattern followed correctly.');
    log('Reviewer approved', 'green');

    // Validate the proposal
    const validation = scenario.validation_criteria;
    let validationPassed = true;

    log('\nValidation:', 'blue');

    // Check must_mention (case-insensitive substring match)
    for (const item of validation.must_mention || []) {
      const found = correctProposal.toLowerCase().includes(item.toLowerCase());
      log(` [${found ? 'PASS' : 'FAIL'}] Mentions: ${item}`, found ? 'green' : 'red');
      if (!found) validationPassed = false;
    }

    // Check pattern reference (case-sensitive substring match)
    if (validation.should_reference_pattern) {
      const found = correctProposal.includes(validation.should_reference_pattern);
      log(
        ` [${found ? 'PASS' : 'FAIL'}] References pattern: ${validation.should_reference_pattern}`,
        found ? 'green' : 'red',
      );
      if (!found) validationPassed = false;
    }

    // Complete the workflow
    await client.implement();
    await client.complete('Test completed');

    log(
      `\nMock test ${scenario.id}: ${validationPassed ? 'PASSED' : 'FAILED'}`,
      validationPassed ? 'green' : 'red',
    );
    return validationPassed;
  } finally {
    await shutdown(client);
  }
}

// ============================================================================
// Live Tests (real Claude API)
// ============================================================================

/**
 * Placeholder for running a scenario against the real Claude API. Currently
 * it only logs a notice and never contacts the API.
 *
 * @param {object} scenario - Parsed scenario JSON; only `name` is read.
 * @returns {Promise<null>} Always `null`, meaning "skipped" to the caller.
 */
async function testLive(scenario) {
  logSection(`Live Test: ${scenario.name}`);

  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    log('ANTHROPIC_API_KEY not set. Skipping live test.', 'yellow');
    return null;
  }

  log('Live tests with Claude API not yet implemented.', 'yellow');
  log('This would spawn real Worker and Reviewer sub-agents.', 'dim');

  // TODO: Implement Claude API integration
  // 1. Get worker brief
  // 2. Call Claude API with worker prompt + brief
  // 3. Submit Claude's proposal
  // 4. Get reviewer brief
  // 5. Call Claude API with reviewer prompt + brief
  // 6. Submit Claude's review
  // 7. Loop until approved or max rounds

  return null;
}

// ============================================================================
// Main
// ============================================================================

/**
 * Entry point: run the tests for the selected mode, print a summary and exit
 * the process with 0 when nothing failed or 1 when something did.
 */
async function main() {
  log('\n🔍 Diligence Test Runner\n', 'blue');
  log(`Mode: ${mode}`, 'dim');

  let allPassed = true;
  // Tracked separately so an all-skipped run is not reported as "passed".
  let executed = 0;
  let skipped = 0;

  switch (mode) {
    case '--workflow':
      allPassed = await testWorkflow();
      executed += 1;
      break;

    case '--mock': {
      for (const scenario of selectScenarios()) {
        const passed = await testWithMocks(scenario);
        executed += 1;
        if (!passed) allPassed = false;
      }
      break;
    }

    case '--live': {
      for (const scenario of selectScenarios()) {
        const result = await testLive(scenario);
        if (result === null) {
          // null means the scenario was skipped, not that it passed.
          skipped += 1;
          continue;
        }
        executed += 1;
        if (result === false) allPassed = false;
      }
      break;
    }
  }

  console.log();
  if (!allPassed) {
    log('✗ Some tests failed', 'red');
    process.exit(1);
  }
  if (executed === 0 && skipped > 0) {
    // Not a failure, but claiming success would be misleading.
    log(`No tests were run (${skipped} skipped)`, 'yellow');
    process.exit(0);
  }
  log('✓ All tests passed', 'green');
  process.exit(0);
}

main().catch((err) => {
  console.error('Error:', err);
  process.exit(1);
});