Initial release: MCP server enforcing Worker-Reviewer loop
Diligence prevents AI agents from shipping quick fixes that break things by enforcing a research-propose-verify loop before any code changes.

Key features:
- Worker sub-agent researches and proposes with file:line citations
- Reviewer sub-agent independently verifies claims by searching the codebase
- Iterates until approved (max 5 rounds)
- Loads project-specific context from .claude/CODEBASE_CONTEXT.md
- State persisted across sessions

Validated on a production codebase: caught an architectural mistake (broker subscriptions in client-side code) that a naive agent would have shipped.
This commit is contained in:
305
test/dry-run.mjs
Normal file
305
test/dry-run.mjs
Normal file
@@ -0,0 +1,305 @@
|
||||
#!/usr/bin/env node
|
||||
/**
|
||||
* Dry Run Test Against Real Project
|
||||
*
|
||||
* Runs the diligence MCP server against a real project (e.g., nexus) in dry-run mode.
|
||||
* This tests the full workflow without making any code changes.
|
||||
*
|
||||
* Usage:
|
||||
* node test/dry-run.mjs --project=/path/to/nexus --task="Fix permission cache"
|
||||
* node test/dry-run.mjs --project=~/bude/codecharm/nexus --scenario=blocking-voice
|
||||
*
|
||||
* Options:
|
||||
* --project=PATH Path to the project to test against
|
||||
* --task=TEXT Task description to start the workflow with
|
||||
* --scenario=ID Use a predefined scenario from test/scenarios/
|
||||
* --interactive Run in interactive mode (prompts for input)
|
||||
*/
|
||||
|
||||
import { spawn } from 'child_process';
import { existsSync, readFileSync } from 'fs';
import { homedir } from 'os';
import { dirname, join, resolve } from 'path';
import { createInterface } from 'readline';
import { fileURLToPath } from 'url';
|
||||
|
||||
// Directory containing this script; scenario files and the server entry point
// are located relative to it.
const __dirname = dirname(fileURLToPath(import.meta.url));

// Parse CLI args
const args = process.argv.slice(2);
const projectArg = args.find((a) => a.startsWith('--project='));
const taskArg = args.find((a) => a.startsWith('--task='));
const scenarioArg = args.find((a) => a.startsWith('--scenario='));
const interactive = args.includes('--interactive') || args.includes('-i');

/**
 * Extracts the value of a `--name=value` argument.
 *
 * Everything after the FIRST "=" is returned, so values that themselves
 * contain "=" (for example `--task=set x=1`) are kept intact.
 *
 * @param {string} arg - Raw argument in `--name=value` form.
 * @returns {string} The value part (possibly empty).
 */
function optionValue(arg) {
  return arg.slice(arg.indexOf('=') + 1);
}

/**
 * Expands a leading `~` to the current user's home directory.
 *
 * Only `~` on its own and the `~/...` form are expanded; anything else
 * (including `~otheruser/...`) is returned untouched.
 *
 * @param {string} path - Path as typed on the command line.
 * @returns {string} The path with the home directory substituted.
 */
function expandHome(path) {
  if (path === '~') {
    return homedir();
  }
  if (path.startsWith('~/')) {
    return join(homedir(), path.slice(2));
  }
  return path;
}

// Resolve project path: null when --project is absent, '' when it is given
// without a value (both are rejected later as "missing"), otherwise absolute.
const rawProjectPath = projectArg ? optionValue(projectArg) : null;
const projectPath = rawProjectPath
  ? resolve(expandHome(rawProjectPath))
  : rawProjectPath;
|
||||
|
||||
// ANSI escape sequences for terminal styling, keyed by style name.
// Each entry is built from its numeric SGR code as ESC[<code>m.
const colors = Object.fromEntries(
  [
    ['reset', 0],
    ['green', 32],
    ['red', 31],
    ['yellow', 33],
    ['blue', 34],
    ['cyan', 36],
    ['dim', 2],
    ['bold', 1],
  ].map(([styleName, sgrCode]) => [styleName, `\x1b[${sgrCode}m`]),
);
|
||||
|
||||
/**
 * Prints a message to stdout wrapped in a color escape sequence.
 *
 * @param {*} msg - Text to print.
 * @param {string} [color='reset'] - Key into the `colors` table.
 */
function log(msg, color = 'reset') {
  const opening = colors[color];
  const closing = colors.reset;
  console.log(`${opening}${msg}${closing}`);
}
|
||||
|
||||
/**
 * Prints a bold cyan section banner, surrounded by blank lines.
 *
 * @param {string} title - Heading text shown between the rule marks.
 */
function logSection(title) {
  const { cyan, bold, reset } = colors;
  const banner = `${cyan}${bold}═══ ${title} ═══${reset}`;
  console.log(`\n${banner}\n`);
}
|
||||
|
||||
/**
 * Reads a predefined scenario from the `scenarios` directory next to this script.
 *
 * @param {string} id - Scenario identifier; the file `<id>.json` is loaded.
 * @returns {*} The parsed JSON content of the scenario file.
 * @throws {Error} When no file exists for the given id.
 */
function loadScenario(id) {
  const scenarioFile = join(__dirname, 'scenarios', `${id}.json`);
  if (existsSync(scenarioFile)) {
    const rawJson = readFileSync(scenarioFile, 'utf-8');
    return JSON.parse(rawJson);
  }
  throw new Error(`Scenario not found: ${id}`);
}
|
||||
|
||||
// Simple MCP client for dry run
/**
 * Minimal JSON-RPC client that drives the MCP server over stdio.
 *
 * The server (`../index.mjs` relative to this script) is spawned as a child
 * process with the target project as its working directory. Requests are
 * written to the child's stdin as one JSON document per line; responses are
 * read line by line from its stdout and matched to requests by `id`.
 *
 * Every request is settled exactly once: by its response, by the timeout, by
 * the server going away, or by `disconnect()`. Its timer is always cleared at
 * that point so no stray timer keeps the Node process alive afterwards.
 */
class DryRunClient {
  /** Milliseconds to wait for a response before a request is rejected. */
  static REQUEST_TIMEOUT_MS = 30000;

  /**
   * @param {string} projectPath - Directory used as the server's working directory.
   */
  constructor(projectPath) {
    this.projectPath = projectPath;
    this.serverPath = join(__dirname, '..', 'index.mjs');
    this.process = null;
    this.requestId = 0;
    // id -> { resolve, reject, timer } for requests still awaiting a response
    this.pendingRequests = new Map();
    this.readline = null;
  }

  /**
   * Spawns the server and performs the `initialize` handshake.
   *
   * @returns {Promise<void>} Resolves once the server answered `initialize`
   *   and the `notifications/initialized` notification was sent; rejects if
   *   the process cannot be spawned or the handshake fails.
   */
  async connect() {
    return new Promise((resolve, reject) => {
      const child = spawn('node', [this.serverPath], {
        stdio: ['pipe', 'pipe', 'pipe'],
        cwd: this.projectPath,
      });
      this.process = child;

      this.readline = createInterface({
        input: child.stdout,
        crlfDelay: Infinity,
      });
      this.readline.on('line', (line) => this._handleLine(line));

      child.stderr.on('data', (data) => {
        // Show server stderr in debug mode
        if (process.env.DEBUG) {
          console.error(colors.dim + '[server] ' + data.toString() + colors.reset);
        }
      });

      child.on('error', (err) => {
        this._rejectAllPending(err);
        reject(err);
      });

      // 'close' (not 'exit') fires after the stdio streams have ended, so any
      // response still buffered in stdout is delivered before we give up.
      child.on('close', (code, signal) => {
        this._rejectAllPending(
          new Error(`MCP server exited (code ${code}, signal ${signal})`),
        );
      });

      // Writing to a dead child raises an 'error' event on stdin; without a
      // listener that would crash this process instead of failing the request.
      child.stdin.on('error', (err) => this._rejectAllPending(err));

      // Initialize
      // NOTE(review): MCP spec revisions are date-stamped (e.g. "2024-11-05");
      // confirm the server accepts this protocolVersion value.
      this._send({
        jsonrpc: '2.0',
        id: this.requestId++,
        method: 'initialize',
        params: {
          protocolVersion: '0.1.0',
          clientInfo: { name: 'dry-run-client', version: '1.0.0' },
          capabilities: {},
        },
      }).then(() => {
        this._sendNotification('notifications/initialized', {});
        resolve();
      }).catch(reject);
    });
  }

  /**
   * Stops the server and releases everything the client holds.
   *
   * Requests still in flight are rejected, the stdout reader is closed and
   * the child process is sent SIGTERM. Safe to call when not connected.
   *
   * @returns {Promise<void>}
   */
  async disconnect() {
    this._rejectAllPending(new Error('Client disconnected'));
    if (this.readline) {
      this.readline.close();
      this.readline = null;
    }
    if (this.process) {
      this.process.kill('SIGTERM');
      this.process = null;
    }
  }

  /**
   * Sends a JSON-RPC request and waits for the matching response.
   *
   * @param {{id: number}} message - Complete JSON-RPC request including `id`.
   * @returns {Promise<*>} Resolves with the response's `result`; rejects with
   *   the response's error, on timeout, or when the message cannot be written.
   */
  _send(message) {
    return new Promise((resolve, reject) => {
      const timer = setTimeout(() => {
        if (this.pendingRequests.delete(message.id)) {
          reject(new Error('Request timeout'));
        }
      }, DryRunClient.REQUEST_TIMEOUT_MS);
      this.pendingRequests.set(message.id, { resolve, reject, timer });

      try {
        this.process.stdin.write(JSON.stringify(message) + '\n');
      } catch (err) {
        // e.g. not connected: fail now rather than after the full timeout
        this._takePending(message.id);
        reject(err);
      }
    });
  }

  /**
   * Sends a JSON-RPC notification (no `id`, no response expected).
   *
   * @param {string} method - Notification method name.
   * @param {object} params - Notification parameters.
   */
  _sendNotification(method, params) {
    this.process.stdin.write(JSON.stringify({ jsonrpc: '2.0', method, params }) + '\n');
  }

  /**
   * Handles one line of server output, settling the request it answers.
   * Lines that are not JSON, or that do not answer a pending request, are ignored.
   *
   * @param {string} line - A single line read from the server's stdout.
   */
  _handleLine(line) {
    let message;
    try {
      message = JSON.parse(line);
    } catch {
      // Ignore non-JSON lines
      return;
    }
    if (message?.id === undefined) {
      return;
    }
    const pending = this._takePending(message.id);
    if (!pending) {
      return;
    }
    if (message.error) {
      pending.reject(new Error(message.error.message || JSON.stringify(message.error)));
    } else {
      pending.resolve(message.result);
    }
  }

  /**
   * Removes a pending request and cancels its timeout.
   *
   * @param {number} id - Request id.
   * @returns {{resolve: Function, reject: Function}|undefined} The removed
   *   entry, or undefined when no request with that id is pending.
   */
  _takePending(id) {
    const pending = this.pendingRequests.get(id);
    if (pending) {
      clearTimeout(pending.timer);
      this.pendingRequests.delete(id);
    }
    return pending;
  }

  /**
   * Rejects every request that is still waiting for a response.
   *
   * @param {Error} error - Reason passed to each rejected request.
   */
  _rejectAllPending(error) {
    for (const [id, { reject, timer }] of this.pendingRequests) {
      clearTimeout(timer);
      this.pendingRequests.delete(id);
      reject(error);
    }
  }

  /**
   * Calls a server tool via `tools/call`.
   *
   * @param {string} name - Tool name.
   * @param {object} [args={}] - Tool arguments.
   * @returns {Promise<{text: string, isError: boolean}|*>} When the first
   *   content item carries a string `text`, that text plus the error flag;
   *   otherwise the raw result as returned by the server.
   */
  async callTool(name, args = {}) {
    const result = await this._send({
      jsonrpc: '2.0',
      id: this.requestId++,
      method: 'tools/call',
      params: { name, arguments: args },
    });
    const text = result?.content?.[0]?.text;
    if (typeof text === 'string') {
      return { text, isError: result.isError || false };
    }
    return result;
  }
}
|
||||
|
||||
/**
 * Asks a question on the terminal and waits for one line of input.
 *
 * @param {string} question - Text shown to the user before reading input.
 * @returns {Promise<string>} The line the user entered.
 */
function prompt(question) {
  const terminal = createInterface({ input: process.stdin, output: process.stdout });

  return new Promise((deliver) => {
    const onAnswer = (answer) => {
      // Release stdin before handing the answer back
      terminal.close();
      deliver(answer);
    };
    terminal.question(question, onAnswer);
  });
}
|
||||
|
||||
/**
 * Runs the dry run end to end.
 *
 * Validates the CLI input, connects to the MCP server, prints the current
 * status, starts a workflow for the task, shows the first 50 lines of the
 * worker brief and finally aborts the workflow again.
 *
 * The workflow is aborted and the server disconnected even when a step fails
 * part-way, so a failed dry run does not leave a started workflow behind.
 *
 * Exits the process with status 1 on invalid CLI input.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the workflow cannot be started or no worker brief is returned.
 */
async function main() {
  log('\n🔍 Diligence Dry Run\n', 'cyan');

  // Validate project path
  if (!projectPath) {
    log('Error: --project=PATH required', 'red');
    log('\nUsage:', 'dim');
    log(' node test/dry-run.mjs --project=~/bude/codecharm/nexus --task="Fix bug"', 'dim');
    process.exit(1);
  }

  if (!existsSync(projectPath)) {
    log(`Error: Project path not found: ${projectPath}`, 'red');
    process.exit(1);
  }

  // Check for CODEBASE_CONTEXT.md
  const contextPath = join(projectPath, '.claude', 'CODEBASE_CONTEXT.md');
  if (!existsSync(contextPath)) {
    log(`Warning: No .claude/CODEBASE_CONTEXT.md found in ${projectPath}`, 'yellow');
    log('The Worker and Reviewer will have limited context.', 'dim');
  } else {
    log(`Found: ${contextPath}`, 'green');
  }

  // Determine task. Values are taken as everything after the option prefix so
  // that text containing "=" (e.g. --task="set x=1") is not cut short.
  let task;
  if (scenarioArg) {
    const scenarioId = scenarioArg.slice('--scenario='.length);
    const scenario = loadScenario(scenarioId);
    task = scenario.task;
    log(`Using scenario: ${scenario.name}`, 'blue');
  } else if (taskArg) {
    task = taskArg.slice('--task='.length);
  } else if (interactive) {
    task = await prompt('Enter task: ');
  } else {
    log('Error: Either --task=TEXT or --scenario=ID required', 'red');
    process.exit(1);
  }

  if (!task) {
    log('Error: Task description is empty', 'red');
    process.exit(1);
  }

  log(`\nProject: ${projectPath}`, 'dim');
  log(`Task: ${task}\n`, 'dim');

  // Connect to MCP server
  const client = new DryRunClient(projectPath);
  // True while a workflow started by this run still has to be aborted
  let workflowStarted = false;

  try {
    log('Connecting to MCP server...', 'dim');
    await client.connect();
    log('Connected!', 'green');

    // Check initial status
    logSection('Status');
    const status = await client.callTool('status');
    log(status.text, 'dim');

    // Start the workflow
    logSection('Starting Workflow');
    const startResult = await client.callTool('start', { task });
    log(startResult.text, startResult.isError ? 'red' : 'green');

    if (startResult.isError) {
      // Try to abort and restart
      log('\nAborting existing workflow...', 'yellow');
      await client.callTool('abort', { reason: 'Dry run restart' });
      const retryResult = await client.callTool('start', { task });
      log(retryResult.text, retryResult.isError ? 'red' : 'green');
      if (retryResult.isError) {
        // Continuing would only show a brief for a workflow that never started
        throw new Error(`Could not start workflow: ${retryResult.text}`);
      }
    }
    workflowStarted = true;

    // Get worker brief
    logSection('Worker Brief');
    const workerBrief = await client.callTool('get_worker_brief');
    if (workerBrief.isError || typeof workerBrief.text !== 'string') {
      throw new Error(
        `Could not get worker brief: ${workerBrief.text ?? 'no text content returned'}`,
      );
    }

    // Show truncated brief
    const briefLines = workerBrief.text.split('\n');
    const truncatedBrief = briefLines.slice(0, 50).join('\n');
    log(truncatedBrief, 'dim');
    if (briefLines.length > 50) {
      log(`\n... (${briefLines.length - 50} more lines)`, 'dim');
    }

    logSection('Dry Run Complete');

    log(`
${colors.yellow}What happens next in a real session:${colors.reset}

1. ${colors.bold}Worker Agent${colors.reset} (fresh sub-agent) receives the brief above
- Researches the codebase using Glob, Grep, Read tools
- Proposes a fix with file:line citations
- Submits via ${colors.cyan}diligence.propose${colors.reset}

2. ${colors.bold}Reviewer Agent${colors.reset} (fresh sub-agent) verifies the proposal
- Searches codebase to verify Worker's claims
- Checks against patterns in CODEBASE_CONTEXT.md
- Submits ${colors.green}APPROVED${colors.reset} or ${colors.yellow}NEEDS_WORK${colors.reset} via ${colors.cyan}diligence.review${colors.reset}

3. If ${colors.yellow}NEEDS_WORK${colors.reset}: Worker revises, Reviewer re-checks (up to 5 rounds)

4. If ${colors.green}APPROVED${colors.reset}: ${colors.cyan}diligence.implement${colors.reset} → code changes → ${colors.cyan}diligence.complete${colors.reset}

${colors.dim}This was a dry run - no code changes were made.${colors.reset}
`, 'reset');

    // Cleanup - abort the workflow
    await client.callTool('abort', { reason: 'Dry run completed' });
    workflowStarted = false;
    log('Workflow aborted (dry run cleanup)', 'dim');
  } finally {
    if (workflowStarted) {
      // A step failed after the workflow was started: still try to abort it,
      // but never let a cleanup failure hide the original error.
      try {
        await client.callTool('abort', { reason: 'Dry run completed' });
        log('Workflow aborted (dry run cleanup)', 'dim');
      } catch (cleanupError) {
        log(`Warning: could not abort workflow: ${cleanupError.message}`, 'yellow');
      }
    }
    await client.disconnect();
    log('\nDisconnected from MCP server.', 'dim');
  }
}
|
||||
|
||||
// Entry point: run the dry run; on any failure print its message and exit with status 1.
main().then(undefined, (failure) => {
  console.error('Error:', failure.message);
  process.exit(1);
});
|
||||
Reference in New Issue
Block a user