Files
diligence/test/dry-run.mjs
Marc J. Schmidt bd178fcaf0 Initial release: MCP server enforcing Worker-Reviewer loop
Diligence prevents AI agents from shipping quick fixes that break things
by enforcing a research-propose-verify loop before any code changes.

Key features:
- Worker sub-agent researches and proposes with file:line citations
- Reviewer sub-agent independently verifies claims by searching codebase
- Iterates until approved (max 5 rounds)
- Loads project-specific context from .claude/CODEBASE_CONTEXT.md
- State persisted across sessions

Validated on production codebase: caught architectural mistake (broker
subscriptions on client-side code) that naive agent would have shipped.
2026-01-22 06:22:59 +01:00

306 lines
9.1 KiB
JavaScript

#!/usr/bin/env node
/**
* Dry Run Test Against Real Project
*
* Runs the diligence MCP server against a real project (e.g., nexus) in dry-run mode.
* This tests the full workflow without making any code changes.
*
* Usage:
* node test/dry-run.mjs --project=/path/to/nexus --task="Fix permission cache"
* node test/dry-run.mjs --project=~/bude/codecharm/nexus --scenario=blocking-voice
*
* Options:
* --project=PATH Path to the project to test against
* --task=TEXT Task description to start the workflow with
* --scenario=ID Use a predefined scenario from test/scenarios/
* --interactive Run in interactive mode (prompts for input)
*/
import { spawn } from 'child_process';
import { createInterface } from 'readline';
import { dirname, join, resolve } from 'path';
import { fileURLToPath } from 'url';
import { existsSync, readFileSync } from 'fs';
// ES modules have no built-in __dirname; derive it from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);
// Parse CLI args
const args = process.argv.slice(2);
const projectArg = args.find(a => a.startsWith('--project='));
const taskArg = args.find(a => a.startsWith('--task='));
const scenarioArg = args.find(a => a.startsWith('--scenario='));
const interactive = args.includes('--interactive') || args.includes('-i');

/**
 * Turns a raw `--project=VALUE` argument into an absolute path.
 *
 * @param {string|undefined} arg - The full `--project=...` argument, if one was given.
 * @param {string|null|undefined} [home] - Home directory used to expand a leading `~`;
 *   defaults to the HOME (or USERPROFILE) environment variable.
 * @returns {string|null} `null` when no argument was given, `''` when the value is
 *   empty, otherwise the resolved absolute path.
 */
function resolveProjectPath(arg, home = process.env.HOME ?? process.env.USERPROFILE) {
  if (!arg) {
    return null;
  }
  // Take everything after the FIRST '=' so paths that contain '=' are not truncated.
  const raw = arg.slice(arg.indexOf('=') + 1);
  if (!raw) {
    return raw;
  }
  // Expand only a bare "~" or a "~/" prefix (not "~user"), and only when a home
  // directory is actually known; otherwise the value is left untouched.
  const expanded = home && /^~(?=$|[\\/])/.test(raw) ? home + raw.slice(1) : raw;
  return resolve(expanded);
}

// Resolve project path
const projectPath = resolveProjectPath(projectArg);
// Colors
// ANSI SGR escape sequences, keyed by the names the logging helpers accept.
const sgr = (code) => `\x1b[${code}m`;
const colors = {
  reset: sgr(0),
  green: sgr(32),
  red: sgr(31),
  yellow: sgr(33),
  blue: sgr(34),
  cyan: sgr(36),
  dim: sgr(2),
  bold: sgr(1),
};
/**
 * Prints a message wrapped in the escape sequence for `color`, followed by a reset.
 *
 * @param {*} msg - Value to print (interpolated into a string).
 * @param {string} [color='reset'] - Key into the `colors` table.
 */
function log(msg, color = 'reset') {
  const { [color]: start, reset: end } = colors;
  console.log(`${start}${msg}${end}`);
}
/**
 * Prints a bold cyan section banner, with a blank line before and after it.
 *
 * @param {*} title - Text shown between the banner rules.
 */
function logSection(title) {
  const banner = `═══ ${title} ═══`;
  const styled = `${colors.cyan}${colors.bold}${banner}${colors.reset}`;
  console.log(`\n${styled}\n`);
}
// Load scenario
/**
 * Reads and parses `scenarios/<id>.json` located next to this script.
 *
 * @param {string} id - Scenario identifier (file name without the `.json` extension).
 * @returns {*} The parsed JSON content of the scenario file.
 * @throws {Error} When the scenario file does not exist.
 */
function loadScenario(id) {
  const scenarioFile = join(__dirname, 'scenarios', `${id}.json`);
  if (existsSync(scenarioFile)) {
    const rawJson = readFileSync(scenarioFile, 'utf-8');
    return JSON.parse(rawJson);
  }
  throw new Error(`Scenario not found: ${id}`);
}
// Simple MCP client for dry run
/**
 * Minimal JSON-RPC client that spawns the server script (`../index.mjs`
 * relative to this file) and exchanges newline-delimited JSON messages with
 * it over the child's stdin/stdout.
 */
class DryRunClient {
  /**
   * @param {string} projectPath - Directory used as the server's working directory.
   * @param {{ requestTimeoutMs?: number }} [options] - `requestTimeoutMs` is how long
   *   a request may stay unanswered before it is rejected (default 30000).
   */
  constructor(projectPath, { requestTimeoutMs = 30000 } = {}) {
    this.projectPath = projectPath;
    this.serverPath = join(__dirname, '..', 'index.mjs');
    this.process = null;
    this.requestId = 0;
    // id -> { resolve, reject }; both callbacks also cancel the request's timeout.
    this.pendingRequests = new Map();
    this.readline = null;
    this.requestTimeoutMs = requestTimeoutMs;
  }

  /**
   * Spawns the server, wires up response handling and performs the
   * `initialize` / `notifications/initialized` handshake.
   *
   * @returns {Promise<void>} Resolves once the handshake is done; rejects if the
   *   server fails to spawn, closes early, or the initialize request fails.
   */
  async connect() {
    return new Promise((resolve, reject) => {
      const child = spawn('node', [this.serverPath], {
        stdio: ['pipe', 'pipe', 'pipe'],
        cwd: this.projectPath,
      });
      this.process = child;

      this.readline = createInterface({
        input: child.stdout,
        crlfDelay: Infinity,
      });

      this.readline.on('line', (line) => {
        let message;
        try {
          message = JSON.parse(line);
        } catch {
          // Ignore non-JSON lines
          return;
        }
        if (message === null || typeof message !== 'object') {
          return;
        }
        if (message.id === undefined || !this.pendingRequests.has(message.id)) {
          return;
        }
        const pending = this.pendingRequests.get(message.id);
        this.pendingRequests.delete(message.id);
        if (message.error) {
          pending.reject(new Error(message.error.message || JSON.stringify(message.error)));
        } else {
          pending.resolve(message.result);
        }
      });

      child.stderr.on('data', (data) => {
        // Show server stderr in debug mode
        if (process.env.DEBUG) {
          console.error(colors.dim + '[server] ' + data.toString() + colors.reset);
        }
      });

      child.on('error', (error) => {
        this._rejectAll(error);
        reject(error);
      });
      // Without a listener, a failed write to the server's stdin would surface
      // as an unhandled 'error' event on the stream.
      child.stdin.on('error', (error) => this._rejectAll(error));
      // 'close' fires after the stdio streams have ended, so any response that
      // was still buffered has been delivered before the rest are failed.
      child.on('close', (code, signal) => {
        this._rejectAll(new Error(`MCP server closed (code ${code}, signal ${signal})`));
      });

      // Initialize
      this._send({
        jsonrpc: '2.0',
        id: this.requestId++,
        method: 'initialize',
        params: {
          protocolVersion: '0.1.0',
          clientInfo: { name: 'dry-run-client', version: '1.0.0' },
          capabilities: {},
        },
      }).then(() => {
        this._sendNotification('notifications/initialized', {});
        resolve();
      }).catch(reject);
    });
  }

  /**
   * Rejects outstanding requests, closes the line reader and sends SIGTERM to
   * the server. Safe to call when not connected.
   *
   * @returns {Promise<void>}
   */
  async disconnect() {
    this._rejectAll(new Error('Client disconnected'));
    if (this.readline) {
      this.readline.close();
      this.readline = null;
    }
    if (this.process) {
      this.process.kill('SIGTERM');
      this.process = null;
    }
  }

  /**
   * Rejects every pending request with `error` and empties the pending map.
   *
   * @param {Error} error - Reason passed to each pending request's reject callback.
   */
  _rejectAll(error) {
    const outstanding = [...this.pendingRequests.values()];
    this.pendingRequests.clear();
    for (const pending of outstanding) {
      pending.reject(error);
    }
  }

  /**
   * Writes a request to the server and waits for the response with the same id.
   *
   * @param {{ id: number }} message - JSON-RPC request; `id` keys the pending entry.
   * @returns {Promise<*>} Resolves with the response's `result`; rejects on an
   *   error response, on timeout ('Request timeout'), or if the write throws.
   */
  _send(message) {
    return new Promise((resolve, reject) => {
      const timer = setTimeout(() => {
        if (this.pendingRequests.delete(message.id)) {
          reject(new Error('Request timeout'));
        }
      }, this.requestTimeoutMs);

      // Settling a request cancels its timer; a live timer would otherwise keep
      // the event loop running until it fires, long after the answer arrived.
      this.pendingRequests.set(message.id, {
        resolve: (value) => {
          clearTimeout(timer);
          resolve(value);
        },
        reject: (error) => {
          clearTimeout(timer);
          reject(error);
        },
      });

      try {
        this.process.stdin.write(`${JSON.stringify(message)}\n`);
      } catch (error) {
        this.pendingRequests.delete(message.id);
        clearTimeout(timer);
        reject(error);
      }
    });
  }

  /**
   * Writes a JSON-RPC notification (no id, so no response is awaited).
   *
   * @param {string} method - Notification method name.
   * @param {*} params - Notification parameters.
   */
  _sendNotification(method, params) {
    this.process.stdin.write(`${JSON.stringify({ jsonrpc: '2.0', method, params })}\n`);
  }

  /**
   * Sends a `tools/call` request.
   *
   * @param {string} name - Tool name.
   * @param {object} [args={}] - Tool arguments.
   * @returns {Promise<*>} `{ text, isError }` when the first content item carries
   *   a text value (including an empty string); otherwise the raw result.
   */
  async callTool(name, args = {}) {
    const result = await this._send({
      jsonrpc: '2.0',
      id: this.requestId++,
      method: 'tools/call',
      params: { name, arguments: args },
    });
    const text = result?.content?.[0]?.text;
    if (text != null) {
      return { text, isError: result.isError || false };
    }
    return result;
  }
}
// Interactive prompt
/**
 * Asks a single question on the terminal and resolves with the typed answer.
 *
 * @param {string} question - Text shown to the user.
 * @returns {Promise<string>} The line the user entered.
 */
function prompt(question) {
  const terminal = createInterface({ input: process.stdin, output: process.stdout });
  const ask = (done) => {
    terminal.question(question, (reply) => {
      // Close the interface before handing the answer back.
      terminal.close();
      done(reply);
    });
  };
  return new Promise(ask);
}
/**
 * Runs one dry-run session: validates the project path, determines the task
 * (scenario, --task, or interactive prompt), connects to the server, starts a
 * workflow, prints the first 50 lines of the worker brief, then aborts the
 * workflow and disconnects.
 *
 * Exits the process with code 1 when the project path or the task is missing.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the workflow cannot be started even after aborting an
 *   existing one, or when a server request fails.
 */
async function main() {
  // Everything after the FIRST '=' is the value, so "--task=set ttl=0" stays intact.
  const valueOf = (arg) => arg.slice(arg.indexOf('=') + 1);

  log('\n🔍 Diligence Dry Run\n', 'cyan');

  // Validate project path
  if (!projectPath) {
    log('Error: --project=PATH required', 'red');
    log('\nUsage:', 'dim');
    log(' node test/dry-run.mjs --project=~/bude/codecharm/nexus --task="Fix bug"', 'dim');
    process.exit(1);
  }
  if (!existsSync(projectPath)) {
    log(`Error: Project path not found: ${projectPath}`, 'red');
    process.exit(1);
  }

  // Check for CODEBASE_CONTEXT.md
  const contextPath = join(projectPath, '.claude', 'CODEBASE_CONTEXT.md');
  if (!existsSync(contextPath)) {
    log(`Warning: No .claude/CODEBASE_CONTEXT.md found in ${projectPath}`, 'yellow');
    log('The Worker and Reviewer will have limited context.', 'dim');
  } else {
    log(`Found: ${contextPath}`, 'green');
  }

  // Determine task
  let task;
  if (scenarioArg) {
    const scenarioId = valueOf(scenarioArg);
    const scenario = loadScenario(scenarioId);
    task = scenario.task;
    log(`Using scenario: ${scenario.name}`, 'blue');
  } else if (taskArg) {
    task = valueOf(taskArg);
  } else if (interactive) {
    task = await prompt('Enter task: ');
  } else {
    log('Error: Either --task=TEXT or --scenario=ID required', 'red');
    process.exit(1);
  }
  log(`\nProject: ${projectPath}`, 'dim');
  log(`Task: ${task}\n`, 'dim');

  // Connect to MCP server
  const client = new DryRunClient(projectPath);
  try {
    log('Connecting to MCP server...', 'dim');
    await client.connect();
    log('Connected!', 'green');

    // Check initial status
    logSection('Status');
    const status = await client.callTool('status');
    log(status.text, 'dim');

    // Start the workflow
    logSection('Starting Workflow');
    const startResult = await client.callTool('start', { task });
    log(startResult.text, startResult.isError ? 'red' : 'green');
    if (startResult.isError) {
      // Try to abort and restart
      log('\nAborting existing workflow...', 'yellow');
      await client.callTool('abort', { reason: 'Dry run restart' });
      const retryResult = await client.callTool('start', { task });
      log(retryResult.text, retryResult.isError ? 'red' : 'green');
      if (retryResult.isError) {
        // Nothing was started, so there is no brief to show; stop here.
        throw new Error(`Could not start workflow: ${retryResult.text}`);
      }
    }

    // Get worker brief
    logSection('Worker Brief');
    const workerBrief = await client.callTool('get_worker_brief');

    // Show truncated brief (a response without text is shown as empty)
    const briefLines = String(workerBrief.text ?? '').split('\n');
    const truncatedBrief = briefLines.slice(0, 50).join('\n');
    log(truncatedBrief, 'dim');
    if (briefLines.length > 50) {
      log(`\n... (${briefLines.length - 50} more lines)`, 'dim');
    }

    logSection('Dry Run Complete');
    log(`
${colors.yellow}What happens next in a real session:${colors.reset}
1. ${colors.bold}Worker Agent${colors.reset} (fresh sub-agent) receives the brief above
- Researches the codebase using Glob, Grep, Read tools
- Proposes a fix with file:line citations
- Submits via ${colors.cyan}diligence.propose${colors.reset}
2. ${colors.bold}Reviewer Agent${colors.reset} (fresh sub-agent) verifies the proposal
- Searches codebase to verify Worker's claims
- Checks against patterns in CODEBASE_CONTEXT.md
- Submits ${colors.green}APPROVED${colors.reset} or ${colors.yellow}NEEDS_WORK${colors.reset} via ${colors.cyan}diligence.review${colors.reset}
3. If ${colors.yellow}NEEDS_WORK${colors.reset}: Worker revises, Reviewer re-checks (up to 5 rounds)
4. If ${colors.green}APPROVED${colors.reset}: ${colors.cyan}diligence.implement${colors.reset} → code changes → ${colors.cyan}diligence.complete${colors.reset}
${colors.dim}This was a dry run - no code changes were made.${colors.reset}
`, 'reset');

    // Cleanup - abort the workflow
    await client.callTool('abort', { reason: 'Dry run completed' });
    log('Workflow aborted (dry run cleanup)', 'dim');
  } finally {
    await client.disconnect();
    log('\nDisconnected from MCP server.', 'dim');
  }
}
// Report any failure from main() and exit with a non-zero status.
const reportFatal = (failure) => {
  console.error('Error:', failure.message);
  process.exit(1);
};

main().catch(reportFatal);