Diligence prevents AI agents from shipping quick fixes that break things by enforcing a research-propose-verify loop before any code changes. Key features: a Worker sub-agent researches and proposes with file:line citations; a Reviewer sub-agent independently verifies the claims by searching the codebase; the loop iterates until approved (max 5 rounds); project-specific context is loaded from .claude/CODEBASE_CONTEXT.md; state is persisted across sessions. Validated on a production codebase: it caught an architectural mistake (broker subscriptions in client-side code) that a naive agent would have shipped.
306 lines
9.1 KiB
JavaScript
306 lines
9.1 KiB
JavaScript
#!/usr/bin/env node
|
|
/**
 * Dry Run Test Against Real Project
 *
 * Runs the diligence MCP server against a real project (e.g., nexus) in dry-run mode.
 * This tests the full workflow without making any code changes.
 *
 * Usage:
 *   node test/dry-run.mjs --project=/path/to/nexus --task="Fix permission cache"
 *   node test/dry-run.mjs --project=~/bude/codecharm/nexus --scenario=blocking-voice
 *
 * Options:
 *   --project=PATH       Path to the project to test against
 *   --task=TEXT          Task description to start the workflow with
 *   --scenario=ID        Use a predefined scenario from test/scenarios/
 *   --interactive, -i    Run in interactive mode (prompts for input)
 */
|
|
|
|
import { spawn } from 'child_process';
|
|
import { createInterface } from 'readline';
|
|
import { dirname, join, resolve } from 'path';
|
|
import { fileURLToPath } from 'url';
|
|
import { existsSync, readFileSync } from 'fs';
|
|
|
|
const __dirname = dirname(fileURLToPath(import.meta.url));

/**
 * Returns the value part of a `--name=value` argument.
 *
 * Everything after the FIRST `=` is kept, so values that themselves contain
 * `=` (e.g. `--task=Fix a=b`) are not truncated.
 *
 * @param {string} arg - A raw CLI argument such as `--project=/tmp/x`.
 * @returns {string} The text after the first `=`, or `''` if there is none.
 */
function optionValue(arg) {
  const separator = arg.indexOf('=');
  return separator === -1 ? '' : arg.slice(separator + 1);
}

/**
 * Expands a leading `~` to the current user's home directory.
 *
 * Only a bare `~` or a `~` followed by a path separator is expanded; `~name`
 * is left untouched. The path is also returned unchanged when neither HOME
 * nor USERPROFILE is set, instead of producing an `undefined/...` path.
 *
 * @param {string} path - Path as typed on the command line.
 * @returns {string} The path with a leading `~` replaced where applicable.
 */
function expandHome(path) {
  const home = process.env.HOME || process.env.USERPROFILE;
  if (!home || !/^~(?=$|[\\/])/.test(path)) {
    return path;
  }
  return home + path.slice(1);
}

// Parse CLI args
const args = process.argv.slice(2);
const projectArg = args.find((a) => a.startsWith('--project='));
const taskArg = args.find((a) => a.startsWith('--task='));
const scenarioArg = args.find((a) => a.startsWith('--scenario='));
const interactive = args.includes('--interactive') || args.includes('-i');

// Resolve project path (null when --project was not given; '' when given empty)
let projectPath = projectArg ? optionValue(projectArg) : null;
if (projectPath) {
  projectPath = resolve(expandHome(projectPath));
}
|
|
|
|
// Colors
// ANSI escape sequences used to style terminal output. `reset` clears any
// styling applied by the other entries. Frozen because the table is shared by
// every logging helper in this file and must not be mutated at runtime.
const colors = Object.freeze({
  reset: '\x1b[0m',
  green: '\x1b[32m',
  red: '\x1b[31m',
  yellow: '\x1b[33m',
  blue: '\x1b[34m',
  cyan: '\x1b[36m',
  dim: '\x1b[2m',
  bold: '\x1b[1m',
});
|
|
|
|
/**
 * Prints a message to stdout wrapped in a colour escape code and followed by
 * the reset code.
 *
 * @param {string} msg - Text to print.
 * @param {string} [color='reset'] - Key of the `colors` table. A name that is
 *   not a string entry of the table falls back to `reset`, so a typo never
 *   prints the literal text "undefined" in front of the message.
 */
function log(msg, color = 'reset') {
  const code = typeof colors[color] === 'string' ? colors[color] : colors.reset;
  console.log(`${code}${msg}${colors.reset}`);
}
|
|
|
|
/**
 * Prints a section banner (`═══ title ═══`) in bold cyan, with a blank line
 * before and after it.
 *
 * @param {string} title - Heading text shown between the rules.
 */
function logSection(title) {
  const { cyan, bold, reset } = colors;
  const banner = `═══ ${title} ═══`;
  console.log(`\n${cyan}${bold}${banner}${reset}\n`);
}
|
|
|
|
// Load scenario
/**
 * Reads and parses `scenarios/<id>.json` from the directory of this script.
 *
 * @param {string} id - Scenario identifier (file name without `.json`).
 * @returns {*} The parsed JSON content of the scenario file.
 * @throws {Error} `Scenario not found: <id>` when the file does not exist, or
 *   an error naming the file when its content is not valid JSON (the original
 *   parse error is attached as `cause`).
 */
function loadScenario(id) {
  const scenarioFile = join(__dirname, 'scenarios', `${id}.json`);
  if (!existsSync(scenarioFile)) {
    throw new Error(`Scenario not found: ${id}`);
  }

  const raw = readFileSync(scenarioFile, 'utf-8');
  try {
    return JSON.parse(raw);
  } catch (err) {
    // A bare "Unexpected token" message does not say which file is broken.
    throw new Error(`Invalid JSON in scenario file ${scenarioFile}: ${err.message}`, {
      cause: err,
    });
  }
}
|
|
|
|
// Simple MCP client for dry run
/**
 * Minimal JSON-RPC 2.0 client that talks to the MCP server over the stdio of
 * a spawned `node` child process, one JSON message per line.
 *
 * Each request is tracked in `pendingRequests` until a response with the same
 * id arrives, the request times out, the server exits, or the client
 * disconnects. The timeout timer is always cleared when a request settles so
 * that it cannot keep the Node.js event loop alive after the run is finished.
 */
class DryRunClient {
  /**
   * @param {string} projectPath - Directory used as the server's working directory.
   * @param {{ requestTimeoutMs?: number }} [options]
   * @param {number} [options.requestTimeoutMs=30000] - Milliseconds to wait
   *   for a response before a request is rejected with `Request timeout`.
   */
  constructor(projectPath, { requestTimeoutMs = 30000 } = {}) {
    this.projectPath = projectPath;
    this.serverPath = join(__dirname, '..', 'index.mjs');
    this.requestTimeoutMs = requestTimeoutMs;
    this.process = null;
    this.requestId = 0;
    // id -> { resolve, reject, timer }
    this.pendingRequests = new Map();
    this.readline = null;
  }

  /**
   * Spawns the server, wires up its stdio and performs the `initialize`
   * request followed by the `notifications/initialized` notification.
   *
   * @returns {Promise<void>} Resolves once the server answered `initialize`.
   *   Rejects if the process cannot be spawned, exits early, its stdin
   *   fails, or the request times out.
   */
  async connect() {
    const child = spawn('node', [this.serverPath], {
      stdio: ['pipe', 'pipe', 'pipe'],
      cwd: this.projectPath,
    });
    this.process = child;

    this.readline = createInterface({
      input: child.stdout,
      crlfDelay: Infinity,
    });
    this.readline.on('line', (line) => this._handleLine(line));

    child.stderr.on('data', (data) => {
      // Show server stderr in debug mode
      if (process.env.DEBUG) {
        console.error(colors.dim + '[server] ' + data.toString() + colors.reset);
      }
    });

    // Fail in-flight requests immediately instead of letting them run into
    // the timeout. The stdin handler also prevents an unhandled 'error' event
    // (e.g. EPIPE) from crashing the script when the server has gone away.
    child.on('error', (err) => this._rejectAll(err));
    child.stdin.on('error', (err) => this._rejectAll(err));
    child.on('exit', (code, signal) => {
      this._rejectAll(new Error(`MCP server exited (code: ${code}, signal: ${signal})`));
    });

    // Initialize
    await this._send({
      jsonrpc: '2.0',
      id: this.requestId++,
      method: 'initialize',
      params: {
        protocolVersion: '0.1.0',
        clientInfo: { name: 'dry-run-client', version: '1.0.0' },
        capabilities: {},
      },
    });
    this._sendNotification('notifications/initialized', {});
  }

  /**
   * Rejects requests that are still pending, closes the stdout line reader
   * and terminates the server process with SIGTERM. Safe to call when not
   * connected.
   *
   * @returns {Promise<void>}
   */
  async disconnect() {
    this._rejectAll(new Error('Client disconnected'));
    if (this.readline) {
      this.readline.close();
      this.readline = null;
    }
    if (this.process) {
      this.process.kill('SIGTERM');
      this.process = null;
    }
  }

  /**
   * Handles one line of server stdout: settles the pending request whose id
   * matches the message. Lines that are not JSON, and messages without a
   * known id, are ignored.
   *
   * @param {string} line - One line read from the server's stdout.
   */
  _handleLine(line) {
    let message;
    try {
      message = JSON.parse(line);
    } catch {
      // Ignore non-JSON lines
      return;
    }

    const id = message?.id;
    if (id === undefined || !this.pendingRequests.has(id)) {
      return;
    }

    const { resolve, reject } = this._takePending(id);
    if (message.error) {
      reject(new Error(message.error.message || JSON.stringify(message.error)));
    } else {
      resolve(message.result);
    }
  }

  /**
   * Removes a pending request and cancels its timeout timer.
   *
   * @param {number} id - Request id.
   * @returns {{ resolve: Function, reject: Function, timer: * } | undefined}
   *   The removed entry, or `undefined` if the id is not pending.
   */
  _takePending(id) {
    const entry = this.pendingRequests.get(id);
    if (!entry) {
      return undefined;
    }
    this.pendingRequests.delete(id);
    clearTimeout(entry.timer);
    return entry;
  }

  /**
   * Rejects every pending request with the given error.
   *
   * @param {Error} error - Reason passed to each request's `reject`.
   */
  _rejectAll(error) {
    for (const id of [...this.pendingRequests.keys()]) {
      this._takePending(id).reject(error);
    }
  }

  /**
   * Writes a JSON-RPC request to the server and waits for its response.
   *
   * @param {{ id: number }} message - Request object; `id` correlates the response.
   * @returns {Promise<*>} Resolves with the response's `result`; rejects with
   *   the response's error, `Request timeout`, or a connection error.
   */
  _send(message) {
    return new Promise((resolve, reject) => {
      if (!this.process) {
        reject(new Error('Not connected to MCP server'));
        return;
      }

      const timer = setTimeout(() => {
        if (this._takePending(message.id)) {
          reject(new Error('Request timeout'));
        }
      }, this.requestTimeoutMs);
      this.pendingRequests.set(message.id, { resolve, reject, timer });

      try {
        this.process.stdin.write(JSON.stringify(message) + '\n');
      } catch (err) {
        // Do not leave a timer and map entry behind for a request never sent.
        this._takePending(message.id);
        reject(err);
      }
    });
  }

  /**
   * Writes a JSON-RPC notification (no id, no response expected).
   *
   * @param {string} method - Notification method name.
   * @param {object} params - Notification parameters.
   */
  _sendNotification(method, params) {
    this.process.stdin.write(JSON.stringify({ jsonrpc: '2.0', method, params }) + '\n');
  }

  /**
   * Calls a server tool via `tools/call`.
   *
   * @param {string} name - Tool name.
   * @param {object} [args={}] - Tool arguments.
   * @returns {Promise<{ text: *, isError: boolean } | *>} When the first
   *   content item of the result carries a `text`, an object with that text
   *   and the result's error flag; otherwise the raw result.
   */
  async callTool(name, args = {}) {
    const result = await this._send({
      jsonrpc: '2.0',
      id: this.requestId++,
      method: 'tools/call',
      params: { name, arguments: args },
    });

    const text = result?.content?.[0]?.text;
    if (text !== undefined && text !== null) {
      return { text, isError: Boolean(result.isError) };
    }
    return result;
  }
}
|
|
|
|
// Interactive prompt
/**
 * Shows `question` on stdout and waits for one line of input on stdin.
 *
 * @param {string} question - Text displayed before the input.
 * @returns {Promise<string>} Resolves with the answer passed to the readline
 *   callback, after the readline interface has been closed.
 */
function prompt(question) {
  return new Promise((done) => {
    const terminal = createInterface({ input: process.stdin, output: process.stdout });
    const onAnswer = (reply) => {
      terminal.close();
      done(reply);
    };
    terminal.question(question, onAnswer);
  });
}
|
|
|
|
/**
 * Entry point of the dry run.
 *
 * Validates the CLI input, connects to the MCP server, starts a workflow for
 * the task, prints the first lines of the Worker brief and an explanation of
 * the remaining steps, then aborts the workflow and disconnects.
 *
 * Exits the process with status 1 when the project path or the task is
 * missing or invalid. If anything fails after the workflow was started, the
 * workflow is still aborted (best effort) so that a failed dry run does not
 * leave an active workflow behind.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the scenario cannot be loaded, a server call fails,
 *   the workflow cannot be started, or the Worker brief has no text.
 */
async function main() {
  log('\n🔍 Diligence Dry Run\n', 'cyan');

  // Validate project path
  if (!projectPath) {
    log('Error: --project=PATH required', 'red');
    log('\nUsage:', 'dim');
    log(' node test/dry-run.mjs --project=~/bude/codecharm/nexus --task="Fix bug"', 'dim');
    process.exit(1);
  }

  if (!existsSync(projectPath)) {
    log(`Error: Project path not found: ${projectPath}`, 'red');
    process.exit(1);
  }

  // Check for CODEBASE_CONTEXT.md
  const contextPath = join(projectPath, '.claude', 'CODEBASE_CONTEXT.md');
  if (!existsSync(contextPath)) {
    log(`Warning: No .claude/CODEBASE_CONTEXT.md found in ${projectPath}`, 'yellow');
    log('The Worker and Reviewer will have limited context.', 'dim');
  } else {
    log(`Found: ${contextPath}`, 'green');
  }

  // Determine task. Values are taken after the option prefix rather than by
  // splitting on '=', so a task such as "Fix a=b" is not cut at the second '='.
  let task;
  if (scenarioArg) {
    const scenarioId = scenarioArg.slice('--scenario='.length);
    const scenario = loadScenario(scenarioId);
    task = scenario.task;
    log(`Using scenario: ${scenario.name}`, 'blue');
  } else if (taskArg) {
    task = taskArg.slice('--task='.length);
  } else if (interactive) {
    task = await prompt('Enter task: ');
  } else {
    log('Error: Either --task=TEXT or --scenario=ID required', 'red');
    process.exit(1);
  }

  if (typeof task !== 'string' || task.trim() === '') {
    log('Error: Task description is empty', 'red');
    process.exit(1);
  }

  log(`\nProject: ${projectPath}`, 'dim');
  log(`Task: ${task}\n`, 'dim');

  // Connect to MCP server
  const client = new DryRunClient(projectPath);
  // True while a workflow started by this run still has to be aborted.
  let workflowActive = false;

  try {
    log('Connecting to MCP server...', 'dim');
    await client.connect();
    log('Connected!', 'green');

    // Check initial status
    logSection('Status');
    const status = await client.callTool('status');
    log(status.text, 'dim');

    // Start the workflow
    logSection('Starting Workflow');
    let startResult = await client.callTool('start', { task });
    log(startResult.text, startResult.isError ? 'red' : 'green');

    if (startResult.isError) {
      // Try to abort and restart
      log('\nAborting existing workflow...', 'yellow');
      await client.callTool('abort', { reason: 'Dry run restart' });
      startResult = await client.callTool('start', { task });
      log(startResult.text, startResult.isError ? 'red' : 'green');
      if (startResult.isError) {
        throw new Error('Could not start the workflow, even after aborting the existing one');
      }
    }
    workflowActive = true;

    // Get worker brief
    logSection('Worker Brief');
    const workerBrief = await client.callTool('get_worker_brief');
    if (typeof workerBrief?.text !== 'string') {
      throw new Error('get_worker_brief returned no text');
    }

    // Show truncated brief
    const previewLineCount = 50;
    const briefLines = workerBrief.text.split('\n');
    log(briefLines.slice(0, previewLineCount).join('\n'), 'dim');
    if (briefLines.length > previewLineCount) {
      log(`\n... (${briefLines.length - previewLineCount} more lines)`, 'dim');
    }

    logSection('Dry Run Complete');

    log(`
${colors.yellow}What happens next in a real session:${colors.reset}

1. ${colors.bold}Worker Agent${colors.reset} (fresh sub-agent) receives the brief above
- Researches the codebase using Glob, Grep, Read tools
- Proposes a fix with file:line citations
- Submits via ${colors.cyan}diligence.propose${colors.reset}

2. ${colors.bold}Reviewer Agent${colors.reset} (fresh sub-agent) verifies the proposal
- Searches codebase to verify Worker's claims
- Checks against patterns in CODEBASE_CONTEXT.md
- Submits ${colors.green}APPROVED${colors.reset} or ${colors.yellow}NEEDS_WORK${colors.reset} via ${colors.cyan}diligence.review${colors.reset}

3. If ${colors.yellow}NEEDS_WORK${colors.reset}: Worker revises, Reviewer re-checks (up to 5 rounds)

4. If ${colors.green}APPROVED${colors.reset}: ${colors.cyan}diligence.implement${colors.reset} → code changes → ${colors.cyan}diligence.complete${colors.reset}

${colors.dim}This was a dry run - no code changes were made.${colors.reset}
`, 'reset');

    // Cleanup - abort the workflow. The flag is cleared first so that a
    // failure of this call is reported as-is and not retried in `finally`.
    workflowActive = false;
    await client.callTool('abort', { reason: 'Dry run completed' });
    log('Workflow aborted (dry run cleanup)', 'dim');
  } finally {
    if (workflowActive) {
      // Something failed after the workflow was started: abort it anyway,
      // without masking the original error if the abort fails as well.
      try {
        await client.callTool('abort', { reason: 'Dry run failed' });
        log('Workflow aborted (dry run cleanup)', 'dim');
      } catch (cleanupErr) {
        log(`Warning: could not abort the workflow: ${cleanupErr.message}`, 'yellow');
      }
    }
    await client.disconnect();
    log('\nDisconnected from MCP server.', 'dim');
  }
}
|
|
|
|
/**
 * Last-resort handler for a rejected `main()`: prints the error message to
 * stderr and terminates the process with exit status 1.
 *
 * @param {Error} failure - The rejection reason.
 */
function reportFatalError(failure) {
  console.error('Error:', failure.message);
  process.exit(1);
}

main().catch(reportFatalError);
|