diligence/test/run-tests.mjs
Marc J. Schmidt bd178fcaf0 Initial release: MCP server enforcing Worker-Reviewer loop
Diligence prevents AI agents from shipping quick fixes that break things
by enforcing a research-propose-verify loop before any code changes.

Key features:
- Worker sub-agent researches and proposes with file:line citations
- Reviewer sub-agent independently verifies claims by searching codebase
- Iterates until approved (max 5 rounds)
- Loads project-specific context from .claude/CODEBASE_CONTEXT.md
- State persisted across sessions

Validated on a production codebase: caught an architectural mistake (broker
subscriptions on client-side code) that a naive agent would have shipped.
2026-01-22 06:22:59 +01:00
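
The tests below drive the loop through an MCP test client. For orientation, a minimal sketch of the approved-path call sequence they exercise (method names are the ones used in this file, via test/mcp-client.mjs; the task string is just an example):

    const client = new McpClient();
    await client.connect();
    await client.start('Fix the bug');             // conversation -> researching, round 1/5
    const brief = await client.getWorkerBrief();   // Worker researches against this brief
    await client.propose('## Analysis ...');       // proposal with file:line citations
    await client.getReviewerBrief();               // Reviewer independently verifies claims
    await client.review('APPROVED', 'Checks out'); // or 'NEEDS_WORK' to open the next round
    await client.implement();
    await client.complete('Done');                 // resets back to the conversation phase
    await client.disconnect();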


#!/usr/bin/env node
/**
 * Diligence Test Runner
 *
 * Runs end-to-end tests of the Worker-Reviewer loop.
 *
 * Modes:
 *   --workflow  Test MCP workflow mechanics only (no AI)
 *   --mock      Use mock Worker/Reviewer responses
 *   --live      Use real Claude API for Worker/Reviewer (requires ANTHROPIC_API_KEY)
 *
 * Usage:
 *   node test/run-tests.mjs --workflow
 *   node test/run-tests.mjs --mock --scenario=blocking-voice
 *   node test/run-tests.mjs --live --scenario=permission-cache
 */
import { McpClient } from './mcp-client.mjs';
import { readFileSync, existsSync, unlinkSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';
const __dirname = dirname(fileURLToPath(import.meta.url));
// Parse CLI args
const args = process.argv.slice(2);
const mode = args.find(a => ['--workflow', '--mock', '--live'].includes(a)) || '--workflow';
const scenarioArg = args.find(a => a.startsWith('--scenario='));
const scenarioId = scenarioArg ? scenarioArg.split('=')[1] : null;
const verbose = args.includes('--verbose') || args.includes('-v');
// Colors for output
const colors = {
  reset: '\x1b[0m',
  green: '\x1b[32m',
  red: '\x1b[31m',
  yellow: '\x1b[33m',
  blue: '\x1b[34m',
  dim: '\x1b[2m',
};
function log(msg, color = 'reset') {
  console.log(`${colors[color]}${msg}${colors.reset}`);
}

function logSection(title) {
  console.log(`\n${colors.blue}=== ${title} ===${colors.reset}`);
}
// Load scenario
function loadScenario(id) {
  const path = join(__dirname, 'scenarios', `${id}.json`);
  if (!existsSync(path)) {
    throw new Error(`Scenario not found: ${id}`);
  }
  return JSON.parse(readFileSync(path, 'utf-8'));
}
// Load all scenarios
function loadAllScenarios() {
  const index = JSON.parse(readFileSync(join(__dirname, 'scenarios', 'index.json'), 'utf-8'));
  return index.scenarios.map(s => loadScenario(s.id));
}
// Clean up state file before test
function cleanState() {
  const stateFile = join(__dirname, 'fixture', '.claude', '.diligence-state.json');
  if (existsSync(stateFile)) {
    unlinkSync(stateFile);
  }
}
// ============================================================================
// Workflow Tests (no AI, just MCP mechanics)
// ============================================================================
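// The tests below walk the full phase cycle the server enforces:
// conversation -> researching (start) -> approved (APPROVED review) ->
// implementation (implement) -> conversation (complete). Aborting and
// hitting the max-round limit both reset the state back to conversation.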
async function testWorkflow() {
  logSection('Workflow Tests');
  const client = new McpClient();
  let passed = 0;
  let failed = 0;

  try {
    cleanState();
    await client.connect();
    log('Connected to MCP server', 'green');

    // Test 1: Status in conversation phase
    {
      const result = await client.status();
      const ok = result.text.includes('Phase: conversation');
      log(` [${ok ? 'PASS' : 'FAIL'}] Initial status is conversation`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    // Test 2: Start workflow
    {
      const result = await client.start('Test task');
      const ok = result.text.includes('researching') && result.text.includes('Round: 1/5');
      log(` [${ok ? 'PASS' : 'FAIL'}] Start transitions to researching`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    // Test 3: Cannot start again while in progress
    {
      const result = await client.start('Another task');
      const ok = result.isError && result.text.includes('Already in');
      log(` [${ok ? 'PASS' : 'FAIL'}] Cannot start while in progress`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    // Test 4: Get worker brief
    {
      const result = await client.getWorkerBrief();
      const ok = result.text.includes('Worker Brief') && result.text.includes('Test task');
      log(` [${ok ? 'PASS' : 'FAIL'}] Worker brief contains task`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    // Test 5: Submit proposal
    {
      const result = await client.propose('## Analysis\n\nProposed fix here');
      const ok = result.text.includes('Proposal submitted');
      log(` [${ok ? 'PASS' : 'FAIL'}] Proposal submitted`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    // Test 6: Get reviewer brief includes proposal
    {
      const result = await client.getReviewerBrief();
      const ok = result.text.includes('Reviewer Brief') && result.text.includes('Proposed fix here');
      log(` [${ok ? 'PASS' : 'FAIL'}] Reviewer brief contains proposal`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    // Test 7: Review with NEEDS_WORK
    {
      const result = await client.review('NEEDS_WORK', 'Missing broker event handling');
      const ok = result.text.includes('NEEDS_WORK') && result.text.includes('Round 2/5');
      log(` [${ok ? 'PASS' : 'FAIL'}] NEEDS_WORK increments round`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    // Test 8: Worker brief now includes feedback
    {
      const result = await client.getWorkerBrief();
      const ok = result.text.includes('Previous Feedback') && result.text.includes('broker event');
      log(` [${ok ? 'PASS' : 'FAIL'}] Worker brief includes previous feedback`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    // Test 9: Submit revised proposal
    {
      const result = await client.propose('## Revised\n\nNow with broker events');
      const ok = result.text.includes('Proposal submitted');
      log(` [${ok ? 'PASS' : 'FAIL'}] Revised proposal submitted`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    // Test 10: Review with APPROVED
    {
      const result = await client.review('APPROVED', 'All checks pass');
      const ok = result.text.includes('APPROVED') && result.text.includes('2 round');
      log(` [${ok ? 'PASS' : 'FAIL'}] APPROVED after review`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    // Test 11: Status shows approved
    {
      const result = await client.status();
      const ok = result.text.includes('Phase: approved');
      log(` [${ok ? 'PASS' : 'FAIL'}] Status shows approved phase`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    // Test 12: Implement
    {
      const result = await client.implement();
      const ok = result.text.includes('Implementation phase');
      log(` [${ok ? 'PASS' : 'FAIL'}] Implement starts implementation`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    // Test 13: Complete
    {
      const result = await client.complete('Fixed the bug');
      const ok = result.text.includes('Complete') && result.text.includes('Reset to conversation');
      log(` [${ok ? 'PASS' : 'FAIL'}] Complete resets workflow`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    // Test 14: Back to conversation
    {
      const result = await client.status();
      const ok = result.text.includes('Phase: conversation');
      log(` [${ok ? 'PASS' : 'FAIL'}] Back to conversation phase`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    // Test 15: Abort works
    {
      await client.start('Task to abort');
      const result = await client.abort('Changed my mind');
      const ok = result.text.includes('Aborted') && result.text.includes('Reset to conversation');
      log(` [${ok ? 'PASS' : 'FAIL'}] Abort resets workflow`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    // Test 16: Max rounds enforcement
    {
      await client.start('Task for max rounds');
      for (let i = 0; i < 5; i++) {
        await client.propose(`Proposal ${i + 1}`);
        if (i < 4) {
          await client.review('NEEDS_WORK', `Feedback ${i + 1}`);
        }
      }
      const result = await client.review('NEEDS_WORK', 'Still not good');
      const ok = result.text.includes('MAX ROUNDS') && result.text.includes('reset');
      log(` [${ok ? 'PASS' : 'FAIL'}] Max rounds resets workflow`, ok ? 'green' : 'red');
      ok ? passed++ : failed++;
    }

    log(`\nWorkflow tests: ${passed} passed, ${failed} failed`, failed ? 'red' : 'green');
    return failed === 0;
  } finally {
    await client.disconnect();
    cleanState();
  }
}
// ============================================================================
// Mock Tests (predefined Worker/Reviewer responses)
// ============================================================================
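// Scenario shape, as read by testWithMocks below (inferred from the field accesses
// in this file; the actual JSON files under test/scenarios/ may carry extra keys):
//
//   {
//     "id": "blocking-voice",
//     "name": "...",
//     "task": "...",
//     "naive_fix":   { "description": "...", "changes": [{ "file": "...", "change": "..." }], "issues": ["..."] },
//     "correct_fix": {
//       "description": "...",
//       "required_changes": [{ "file": "...", "function": "...", "change": "...", "line_reference": "..." }],
//       "required_broker_subscriptions": [{ "service": "...", "event": "...", "action": "..." }],
//       "pattern_references": ["..."]
//     },
//     "validation_criteria": { "must_mention": ["..."], "should_reference_pattern": "..." }
//   }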
async function testWithMocks(scenario) {
  logSection(`Mock Test: ${scenario.name}`);
  const client = new McpClient();

  try {
    cleanState();
    await client.connect();

    // Start the workflow
    await client.start(scenario.task);
    log(`Started task: ${scenario.task.slice(0, 60)}...`, 'dim');

    // Round 1: Worker submits naive fix
    const naiveProposal = `## Analysis
${scenario.naive_fix.description}
### Changes
${scenario.naive_fix.changes.map(c => `- ${c.file}: ${c.change}`).join('\n')}
`;
    await client.propose(naiveProposal);
    log('Worker submitted naive proposal', 'dim');

    // Round 1: Reviewer catches issues
    const issues = scenario.naive_fix.issues;
    const reviewFeedback = `NEEDS_WORK
Issues found:
${issues.map((issue, i) => `${i + 1}. ${issue}`).join('\n')}
The proposal misses critical aspects. Please address all issues.
`;
    await client.review('NEEDS_WORK', reviewFeedback);
    log(`Reviewer found ${issues.length} issues`, 'yellow');

    // Round 2: Worker submits correct fix
    const correctProposal = `## Revised Analysis
${scenario.correct_fix.description}
### Required Changes
${scenario.correct_fix.required_changes.map(c =>
`#### ${c.file}:${c.function}
- ${c.change}
- Reference: ${c.line_reference}`
).join('\n\n')}
### Broker Subscriptions
${scenario.correct_fix.required_broker_subscriptions.map(s =>
`- ${s.service} subscribes to ${s.event}: ${s.action}`
).join('\n')}
### Pattern References
${scenario.correct_fix.pattern_references.map(p => `- ${p}`).join('\n')}
`;
    await client.propose(correctProposal);
    log('Worker submitted revised proposal', 'dim');

    // Round 2: Reviewer approves
    await client.review('APPROVED', 'All required changes identified. Pattern followed correctly.');
    log('Reviewer approved', 'green');

    // Validate the proposal
    const validation = scenario.validation_criteria;
    let validationPassed = true;
    log('\nValidation:', 'blue');

    // Check must_mention
    for (const item of validation.must_mention || []) {
      const found = correctProposal.toLowerCase().includes(item.toLowerCase());
      log(` [${found ? 'PASS' : 'FAIL'}] Mentions: ${item}`, found ? 'green' : 'red');
      if (!found) validationPassed = false;
    }

    // Check pattern reference
    if (validation.should_reference_pattern) {
      const found = correctProposal.includes(validation.should_reference_pattern);
      log(` [${found ? 'PASS' : 'FAIL'}] References pattern: ${validation.should_reference_pattern}`, found ? 'green' : 'red');
      if (!found) validationPassed = false;
    }

    // Complete the workflow
    await client.implement();
    await client.complete('Test completed');

    log(`\nMock test ${scenario.id}: ${validationPassed ? 'PASSED' : 'FAILED'}`, validationPassed ? 'green' : 'red');
    return validationPassed;
  } finally {
    await client.disconnect();
    cleanState();
  }
}
// ============================================================================
// Live Tests (real Claude API)
// ============================================================================
async function testLive(scenario) {
  logSection(`Live Test: ${scenario.name}`);
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    log('ANTHROPIC_API_KEY not set. Skipping live test.', 'yellow');
    return null;
  }

  log('Live tests with Claude API not yet implemented.', 'yellow');
  log('This would spawn real Worker and Reviewer sub-agents.', 'dim');

  // TODO: Implement Claude API integration
  // 1. Get worker brief
  // 2. Call Claude API with worker prompt + brief
  // 3. Submit Claude's proposal
  // 4. Get reviewer brief
  // 5. Call Claude API with reviewer prompt + brief
  // 6. Submit Claude's review
  // 7. Loop until approved or max rounds
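  //
  // One way steps 1-3 could look, shown only as a hedged sketch (assumes the
  // official @anthropic-ai/sdk package and an McpClient connected as in the
  // other tests; the model id is a placeholder and nothing below is wired up):
  //
  //   import Anthropic from '@anthropic-ai/sdk';
  //   const anthropic = new Anthropic({ apiKey });
  //   const brief = await client.getWorkerBrief();      // 1. worker brief from the MCP server
  //   const reply = await anthropic.messages.create({   // 2. let Claude act as the Worker
  //     model: 'claude-sonnet-4-5',                     //    placeholder model id
  //     max_tokens: 4096,
  //     messages: [{ role: 'user', content: brief.text }],
  //   });
  //   await client.propose(reply.content[0].text);      // 3. submit the Worker's proposal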
  return null;
}
// ============================================================================
// Main
// ============================================================================
async function main() {
  log('\n🔍 Diligence Test Runner\n', 'blue');
  log(`Mode: ${mode}`, 'dim');

  let allPassed = true;

  switch (mode) {
    case '--workflow':
      allPassed = await testWorkflow();
      break;
    case '--mock': {
      const scenarios = scenarioId ? [loadScenario(scenarioId)] : loadAllScenarios();
      for (const scenario of scenarios) {
        const passed = await testWithMocks(scenario);
        if (!passed) allPassed = false;
      }
      break;
    }
    case '--live': {
      const scenarios = scenarioId ? [loadScenario(scenarioId)] : loadAllScenarios();
      for (const scenario of scenarios) {
        const result = await testLive(scenario);
        if (result === false) allPassed = false;
      }
      break;
    }
  }

  console.log();
  if (allPassed) {
    log('✓ All tests passed', 'green');
    process.exit(0);
  } else {
    log('✗ Some tests failed', 'red');
    process.exit(1);
  }
}
main().catch(err => {
  console.error('Error:', err);
  process.exit(1);
});