Diligence prevents AI agents from shipping quick fixes that break things by enforcing a research-propose-verify loop before any code changes.

Key features:
- Worker sub-agent researches and proposes with file:line citations
- Reviewer sub-agent independently verifies claims by searching the codebase
- Iterates until approved (max 5 rounds)
- Loads project-specific context from .claude/CODEBASE_CONTEXT.md
- State is persisted across sessions

Validated on a production codebase: it caught an architectural mistake (broker subscriptions on client-side code) that a naive agent would have shipped.
416 lines
13 KiB
JavaScript
416 lines
13 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
 * Diligence Test Runner
 *
 * Runs end-to-end tests of the Worker-Reviewer loop.
 *
 * Modes:
 *   --workflow   Test MCP workflow mechanics only (no AI)
 *   --mock       Use mock Worker/Reviewer responses
 *   --live       Use real Claude API for Worker/Reviewer (requires ANTHROPIC_API_KEY)
 *
 * Usage:
 *   node test/run-tests.mjs --workflow
 *   node test/run-tests.mjs --mock --scenario=blocking-voice
 *   node test/run-tests.mjs --live --scenario=permission-cache
 */
|
|
|
|
import { McpClient } from './mcp-client.mjs';
|
|
import { readFileSync, existsSync, unlinkSync } from 'fs';
|
|
import { dirname, join } from 'path';
|
|
import { fileURLToPath } from 'url';
|
|
|
|
// ES modules do not provide __dirname; derive this module's directory from
// import.meta.url so scenario and fixture paths resolve relative to this file.
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
|
|
// Parse CLI args
|
|
/**
 * Parse the runner's command-line flags.
 *
 * The first of --workflow / --mock / --live wins; when none is given the mode
 * defaults to --workflow. The scenario id is everything after the first '='
 * of --scenario=, so ids that themselves contain '=' are kept intact.
 *
 * @param {string[]} argv - Arguments following the script path.
 * @returns {{mode: string, scenarioId: (string|null), verbose: boolean}}
 */
function parseCliArgs(argv) {
  const modeFlags = ['--workflow', '--mock', '--live'];
  const scenarioPrefix = '--scenario=';
  const scenarioArg = argv.find((arg) => arg.startsWith(scenarioPrefix));

  return {
    mode: argv.find((arg) => modeFlags.includes(arg)) ?? '--workflow',
    scenarioId: scenarioArg ? scenarioArg.slice(scenarioPrefix.length) : null,
    verbose: argv.includes('--verbose') || argv.includes('-v'),
  };
}

const args = process.argv.slice(2);
const { mode, scenarioId, verbose } = parseCliArgs(args);
|
|
|
|
// Colors for output
|
|
// ANSI SGR escape sequences used to colour terminal output. `reset` restores
// the terminal's default rendition. Frozen because the table is shared
// read-only by the logging helpers.
const colors = Object.freeze({
  reset: '\x1b[0m',
  green: '\x1b[32m',
  red: '\x1b[31m',
  yellow: '\x1b[33m',
  blue: '\x1b[34m',
  dim: '\x1b[2m',
});
|
|
|
|
/**
 * Print a message to stdout wrapped in an ANSI colour and a trailing reset.
 *
 * @param {*} msg - Text to print (interpolated into a template string).
 * @param {string} [color='reset'] - Key of the `colors` table. Names missing
 *   from the table fall back to `reset` instead of printing "undefined".
 */
function log(msg, color = 'reset') {
  const prefix = colors[color] ?? colors.reset;
  console.log(`${prefix}${msg}${colors.reset}`);
}
|
|
|
|
/**
 * Print a blue "=== title ===" banner, preceded by a blank line, to separate
 * groups of test output.
 *
 * @param {string} title - Heading text shown between the banner markers.
 */
function logSection(title) {
  const banner = `=== ${title} ===`;
  console.log(`\n${colors.blue}${banner}${colors.reset}`);
}
|
|
|
|
// Load scenario
|
|
/**
 * Read and parse one scenario definition from `scenarios/<id>.json`, resolved
 * relative to this module's directory.
 *
 * @param {string} id - Scenario identifier (the file name without `.json`).
 * @returns {*} The parsed JSON content of the scenario file.
 * @throws {Error} If the file does not exist, or if it is not valid JSON (the
 *   underlying parse error is attached as `cause`).
 */
function loadScenario(id) {
  const scenarioPath = join(__dirname, 'scenarios', `${id}.json`);
  if (!existsSync(scenarioPath)) {
    throw new Error(`Scenario not found: ${id}`);
  }

  const raw = readFileSync(scenarioPath, 'utf-8');
  try {
    return JSON.parse(raw);
  } catch (err) {
    // Name the offending file; a bare SyntaxError does not say which one broke.
    throw new Error(`Scenario ${id} is not valid JSON (${scenarioPath})`, { cause: err });
  }
}
|
|
|
|
// Load all scenarios
|
|
/**
 * Load every scenario listed in `scenarios/index.json`.
 *
 * The index is expected to hold a `scenarios` array whose entries each carry
 * an `id`; each id is resolved through `loadScenario`.
 *
 * @returns {Array<*>} Parsed scenarios, in index order.
 * @throws {Error} If the index has no `scenarios` array, or if any listed
 *   scenario cannot be loaded.
 */
function loadAllScenarios() {
  const indexPath = join(__dirname, 'scenarios', 'index.json');
  const index = JSON.parse(readFileSync(indexPath, 'utf-8'));

  if (!Array.isArray(index?.scenarios)) {
    throw new Error(`Scenario index has no "scenarios" array: ${indexPath}`);
  }
  return index.scenarios.map((entry) => loadScenario(entry.id));
}
|
|
|
|
// Clean up state file before test
|
|
/**
 * Delete the state file under the test fixture
 * (`fixture/.claude/.diligence-state.json`) so each test run starts clean.
 *
 * A missing file is not an error. The file is removed directly rather than
 * checked first, so there is no window between the check and the delete.
 *
 * @throws {Error} Any filesystem error other than the file not existing.
 */
function cleanState() {
  const stateFile = join(__dirname, 'fixture', '.claude', '.diligence-state.json');
  try {
    unlinkSync(stateFile);
  } catch (err) {
    if (err.code !== 'ENOENT') {
      throw err;
    }
  }
}
|
|
|
|
// ============================================================================
|
|
// Workflow Tests (no AI, just MCP mechanics)
|
|
// ============================================================================
|
|
|
|
/**
 * Drive the MCP workflow through a full scripted session and check the text
 * of each tool response. No AI is involved; only the client calls below.
 *
 * Sequence: status, start, a rejected second start, worker brief, propose,
 * reviewer brief, NEEDS_WORK review, revised propose, APPROVED review,
 * implement, complete, then abort and a five-proposal max-rounds run.
 *
 * The state file is cleaned before connecting; the client is disconnected and
 * the state file cleaned again even if a step throws.
 *
 * @returns {Promise<boolean>} true when every check passed.
 */
async function testWorkflow() {
  logSection('Workflow Tests');

  const client = new McpClient();
  let passed = 0;
  let failed = 0;

  // Record one check: print its PASS/FAIL line and bump the matching counter.
  const check = (ok, label) => {
    log(` [${ok ? 'PASS' : 'FAIL'}] ${label}`, ok ? 'green' : 'red');
    if (ok) {
      passed += 1;
    } else {
      failed += 1;
    }
  };

  // True when the response text contains every expected fragment.
  const hasAll = (result, ...fragments) =>
    fragments.every((fragment) => result.text.includes(fragment));

  try {
    cleanState();
    await client.connect();
    log('Connected to MCP server', 'green');

    // Test 1: Status in conversation phase
    check(hasAll(await client.status(), 'Phase: conversation'), 'Initial status is conversation');

    // Test 2: Start workflow
    check(
      hasAll(await client.start('Test task'), 'researching', 'Round: 1/5'),
      'Start transitions to researching',
    );

    // Test 3: Cannot start again while in progress
    {
      const result = await client.start('Another task');
      check(result.isError && hasAll(result, 'Already in'), 'Cannot start while in progress');
    }

    // Test 4: Get worker brief
    check(
      hasAll(await client.getWorkerBrief(), 'Worker Brief', 'Test task'),
      'Worker brief contains task',
    );

    // Test 5: Submit proposal
    check(
      hasAll(await client.propose('## Analysis\n\nProposed fix here'), 'Proposal submitted'),
      'Proposal submitted',
    );

    // Test 6: Get reviewer brief includes proposal
    check(
      hasAll(await client.getReviewerBrief(), 'Reviewer Brief', 'Proposed fix here'),
      'Reviewer brief contains proposal',
    );

    // Test 7: Review with NEEDS_WORK
    check(
      hasAll(
        await client.review('NEEDS_WORK', 'Missing broker event handling'),
        'NEEDS_WORK',
        'Round 2/5',
      ),
      'NEEDS_WORK increments round',
    );

    // Test 8: Worker brief now includes feedback
    check(
      hasAll(await client.getWorkerBrief(), 'Previous Feedback', 'broker event'),
      'Worker brief includes previous feedback',
    );

    // Test 9: Submit revised proposal
    check(
      hasAll(await client.propose('## Revised\n\nNow with broker events'), 'Proposal submitted'),
      'Revised proposal submitted',
    );

    // Test 10: Review with APPROVED
    check(
      hasAll(await client.review('APPROVED', 'All checks pass'), 'APPROVED', '2 round'),
      'APPROVED after review',
    );

    // Test 11: Status shows approved
    check(hasAll(await client.status(), 'Phase: approved'), 'Status shows approved phase');

    // Test 12: Implement
    check(hasAll(await client.implement(), 'Implementation phase'), 'Implement starts implementation');

    // Test 13: Complete
    check(
      hasAll(await client.complete('Fixed the bug'), 'Complete', 'Reset to conversation'),
      'Complete resets workflow',
    );

    // Test 14: Back to conversation
    check(hasAll(await client.status(), 'Phase: conversation'), 'Back to conversation phase');

    // Test 15: Abort works
    {
      await client.start('Task to abort');
      const result = await client.abort('Changed my mind');
      check(hasAll(result, 'Aborted', 'Reset to conversation'), 'Abort resets workflow');
    }

    // Test 16: Max rounds enforcement
    {
      await client.start('Task for max rounds');
      // Five proposals; the first four each get NEEDS_WORK inside the loop,
      // and the fifth is rejected below, which is the response under test.
      for (let round = 0; round < 5; round++) {
        await client.propose(`Proposal ${round + 1}`);
        if (round < 4) {
          await client.review('NEEDS_WORK', `Feedback ${round + 1}`);
        }
      }
      const result = await client.review('NEEDS_WORK', 'Still not good');
      check(hasAll(result, 'MAX ROUNDS', 'reset'), 'Max rounds resets workflow');
    }

    log(`\nWorkflow tests: ${passed} passed, ${failed} failed`, failed ? 'red' : 'green');
    return failed === 0;
  } finally {
    await client.disconnect();
    cleanState();
  }
}
|
|
|
|
// ============================================================================
|
|
// Mock Tests (predefined Worker/Reviewer responses)
|
|
// ============================================================================
|
|
|
|
/**
 * Replay a scenario through the workflow using scripted Worker/Reviewer text.
 *
 * Round 1 submits the scenario's naive fix and a NEEDS_WORK review listing its
 * issues; round 2 submits the correct fix and an APPROVED review. The revised
 * proposal text is then checked against the scenario's validation criteria
 * before the workflow is implemented and completed.
 *
 * List-valued scenario fields and `validation_criteria` are treated as empty
 * when absent, so a sparse scenario does not crash the run.
 *
 * @param {object} scenario - Parsed scenario. Fields read here: `id`, `name`,
 *   `task`, `naive_fix`, `correct_fix` and `validation_criteria`.
 * @returns {Promise<boolean>} true when every validation criterion is met.
 */
async function testWithMocks(scenario) {
  logSection(`Mock Test: ${scenario.name}`);

  const client = new McpClient();

  try {
    cleanState();
    await client.connect();

    // Start the workflow
    await client.start(scenario.task);
    log(`Started task: ${scenario.task.slice(0, 60)}...`, 'dim');

    // Round 1: Worker submits naive fix
    const naiveFix = scenario.naive_fix;
    const naiveChanges = (naiveFix.changes ?? [])
      .map((c) => `- ${c.file}: ${c.change}`)
      .join('\n');
    const naiveProposal = [
      '## Analysis',
      '',
      `${naiveFix.description}`,
      '',
      '### Changes',
      naiveChanges,
      '',
    ].join('\n');

    await client.propose(naiveProposal);
    log('Worker submitted naive proposal', 'dim');

    // Round 1: Reviewer catches issues
    const issues = naiveFix.issues ?? [];
    const reviewFeedback = [
      'NEEDS_WORK',
      '',
      'Issues found:',
      issues.map((issue, i) => `${i + 1}. ${issue}`).join('\n'),
      '',
      'The proposal misses critical aspects. Please address all issues.',
      '',
    ].join('\n');

    await client.review('NEEDS_WORK', reviewFeedback);
    log(`Reviewer found ${issues.length} issues`, 'yellow');

    // Round 2: Worker submits correct fix
    const correctFix = scenario.correct_fix;
    const requiredChanges = (correctFix.required_changes ?? [])
      .map((c) =>
        [`#### ${c.file}:${c.function}`, `- ${c.change}`, `- Reference: ${c.line_reference}`].join('\n'),
      )
      .join('\n\n');
    const brokerSubscriptions = (correctFix.required_broker_subscriptions ?? [])
      .map((s) => `- ${s.service} subscribes to ${s.event}: ${s.action}`)
      .join('\n');
    const patternReferences = (correctFix.pattern_references ?? [])
      .map((p) => `- ${p}`)
      .join('\n');
    const correctProposal = [
      '## Revised Analysis',
      '',
      `${correctFix.description}`,
      '',
      '### Required Changes',
      '',
      requiredChanges,
      '',
      '### Broker Subscriptions',
      '',
      brokerSubscriptions,
      '',
      '### Pattern References',
      '',
      patternReferences,
      '',
    ].join('\n');

    await client.propose(correctProposal);
    log('Worker submitted revised proposal', 'dim');

    // Round 2: Reviewer approves
    await client.review('APPROVED', 'All required changes identified. Pattern followed correctly.');
    log('Reviewer approved', 'green');

    // Validate the proposal
    const validation = scenario.validation_criteria ?? {};
    let validationPassed = true;

    log('\nValidation:', 'blue');

    // Check must_mention (case-insensitive substring match)
    const proposalLower = correctProposal.toLowerCase();
    for (const item of validation.must_mention || []) {
      const found = proposalLower.includes(item.toLowerCase());
      log(` [${found ? 'PASS' : 'FAIL'}] Mentions: ${item}`, found ? 'green' : 'red');
      if (!found) validationPassed = false;
    }

    // Check pattern reference (exact, case-sensitive substring match)
    if (validation.should_reference_pattern) {
      const found = correctProposal.includes(validation.should_reference_pattern);
      log(
        ` [${found ? 'PASS' : 'FAIL'}] References pattern: ${validation.should_reference_pattern}`,
        found ? 'green' : 'red',
      );
      if (!found) validationPassed = false;
    }

    // Complete the workflow
    await client.implement();
    await client.complete('Test completed');

    log(
      `\nMock test ${scenario.id}: ${validationPassed ? 'PASSED' : 'FAILED'}`,
      validationPassed ? 'green' : 'red',
    );
    return validationPassed;
  } finally {
    await client.disconnect();
    cleanState();
  }
}
|
|
|
|
// ============================================================================
|
|
// Live Tests (real Claude API)
|
|
// ============================================================================
|
|
|
|
/**
 * Placeholder for running a scenario against the real Claude API.
 *
 * Not implemented yet: it logs a notice and returns null whether or not
 * ANTHROPIC_API_KEY is set. Only the notice differs between the two cases.
 *
 * @param {object} scenario - Parsed scenario; only `name` is read here.
 * @returns {Promise<null>} Always null, meaning "skipped" rather than pass or fail.
 */
async function testLive(scenario) {
  logSection(`Live Test: ${scenario.name}`);

  if (!process.env.ANTHROPIC_API_KEY) {
    log('ANTHROPIC_API_KEY not set. Skipping live test.', 'yellow');
    return null;
  }

  log('Live tests with Claude API not yet implemented.', 'yellow');
  log('This would spawn real Worker and Reviewer sub-agents.', 'dim');

  // TODO: Implement Claude API integration
  // 1. Get worker brief
  // 2. Call Claude API with worker prompt + brief
  // 3. Submit Claude's proposal
  // 4. Get reviewer brief
  // 5. Call Claude API with reviewer prompt + brief
  // 6. Submit Claude's review
  // 7. Loop until approved or max rounds

  return null;
}
|
|
|
|
// ============================================================================
|
|
// Main
|
|
// ============================================================================
|
|
|
|
/**
 * Entry point: run the suite selected by `mode` and exit with status 0 when
 * nothing failed, or 1 otherwise.
 *
 * In --mock and --live modes the scenarios are either the single one named by
 * `scenarioId` or every scenario in the index. A live result counts as a
 * failure only when it is exactly `false`, so a skipped run (null) does not
 * fail the suite.
 *
 * @returns {Promise<void>} Terminates the process via process.exit.
 */
async function main() {
  log('\n🔍 Diligence Test Runner\n', 'blue');
  log(`Mode: ${mode}`, 'dim');

  // Shared by the scenario-driven modes; evaluated lazily so --workflow never
  // touches the scenario files.
  const selectScenarios = () => (scenarioId ? [loadScenario(scenarioId)] : loadAllScenarios());

  let allPassed = true;

  switch (mode) {
    case '--workflow':
      allPassed = await testWorkflow();
      break;

    case '--mock':
      for (const scenario of selectScenarios()) {
        const passed = await testWithMocks(scenario);
        if (!passed) allPassed = false;
      }
      break;

    case '--live':
      for (const scenario of selectScenarios()) {
        const result = await testLive(scenario);
        if (result === false) allPassed = false;
      }
      break;
  }

  console.log();
  if (allPassed) {
    log('✓ All tests passed', 'green');
    process.exit(0);
  } else {
    log('✗ Some tests failed', 'red');
    process.exit(1);
  }
}
|
|
|
|
// Run the suite. Any error that escapes main() is reported and turned into a
// non-zero exit status instead of an unhandled promise rejection.
main().catch((err) => {
  console.error('Error:', err);
  process.exit(1);
});
|