Initial release: MCP server enforcing Worker-Reviewer loop
Diligence prevents AI agents from shipping quick fixes that break things by enforcing a research-propose-verify loop before any code changes.

Key features:
- Worker sub-agent researches and proposes with file:line citations
- Reviewer sub-agent independently verifies claims by searching the codebase
- Iterates until approved (max 5 rounds)
- Loads project-specific context from .claude/CODEBASE_CONTEXT.md
- State persisted across sessions

Validated on a production codebase: caught an architectural mistake (broker subscriptions in client-side code) that a naive agent would have shipped.
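The Worker-Reviewer loop described above can be sketched as a round-limited control loop. This is a minimal sketch only: `runWorker` and `runReviewer` are hypothetical stand-ins for spawning the sub-agents, and the proposal/review shapes are illustrative, not the actual MCP tool API.

```javascript
const MAX_ROUNDS = 5;

// Drive the Worker-Reviewer loop: propose, verify, iterate until approved.
// runWorker(task, feedback) -> proposal; runReviewer(task, proposal) -> review.
async function diligenceLoop(task, runWorker, runReviewer) {
  let feedback = null;
  for (let round = 1; round <= MAX_ROUNDS; round++) {
    // Worker researches and proposes, seeded with any prior Reviewer feedback
    const proposal = await runWorker(task, feedback);
    // Reviewer independently verifies the proposal's claims
    const review = await runReviewer(task, proposal);
    if (review.verdict === 'APPROVED') {
      return { proposal, rounds: round };
    }
    feedback = review.issues; // NEEDS_WORK: iterate with the specific issues
  }
  throw new Error(`Not approved after ${MAX_ROUNDS} rounds`);
}
```

The round cap bounds cost when the two agents cannot converge; the real server enforces the same limit.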
test/compare-approaches.mjs (Normal file, 255 lines)
@@ -0,0 +1,255 @@
#!/usr/bin/env node

/**
 * Comparison Test: Naive vs Diligence Approach
 *
 * This script coordinates testing of both approaches:
 * 1. Naive: A single agent analyzes and proposes a fix
 * 2. Diligence: Worker-Reviewer loop with separate agents
 *
 * The test uses a real bug from the nexus codebase.
 *
 * Usage:
 *   node test/compare-approaches.mjs
 */

import { writeFileSync, mkdirSync, existsSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';

const __dirname = dirname(fileURLToPath(import.meta.url));
const RESULTS_DIR = join(__dirname, 'results');

// Ensure the results directory exists
if (!existsSync(RESULTS_DIR)) {
  mkdirSync(RESULTS_DIR, { recursive: true });
}

const TEST_BUG = {
  id: 'B1',
  name: 'Blocked users can join/answer DM voice calls',
  task: `Fix bug B1: Blocked users can join DM voice calls.

When user A blocks user B, user B should NOT be able to:
1. Answer incoming DM calls from user A
2. Start new calls to user A (already works)
3. Join DM voice channel with user A (already works in joinVoiceChannel)

The bug is that answerDmCall() has no blocking check.

Analyze the codebase and propose a COMPLETE fix.`,

  // What naive agents typically miss
  naive_misses: [
    'declineDmCall() also needs blocking check for consistency',
    'notifyDmCall() should filter blocked users from notifications',
    'blockUser() should clean up existing voice calls',
    'Need to subscribe to BusUserBlockChange for mid-call kick',
    'Should follow the pattern from chat.service.ts where permission=visibility, actions have separate checks',
  ],

  // Required elements for a complete fix
  required_elements: [
    'answerDmCall blocking check',
    'declineDmCall blocking check',
    'notification filtering',
    'voice cleanup in blockUser()',
    'BusUserBlockChange subscription',
    'chat.service.ts pattern reference',
  ],
};

// Prompts for the test
const NAIVE_PROMPT = `You are analyzing a bug in the nexus codebase.

BUG: ${TEST_BUG.task}

Your job is to:
1. Search the codebase to understand the current implementation
2. Identify all files that need changes
3. Propose a complete fix

DO NOT use any diligence MCP tools. Just analyze and propose.

Be thorough - check for:
- Similar patterns in the codebase
- Broker events that might be relevant
- All places where blocking should be enforced
- Edge cases (what if block happens mid-call?)

Output your analysis and proposed fix.`;

const WORKER_PROMPT = `You are a Worker agent in the diligence workflow.

Your brief has been loaded with:
- The task description
- Codebase context (architecture, patterns)
- Any previous feedback

Your job:
1. Research the codebase thoroughly
2. Trace data flow from origin to all consumers
3. Find existing patterns for similar functionality
4. Identify ALL files that need changes
5. Propose a fix with file:line citations for every claim

IMPORTANT:
- Cite specific file:line for every claim
- Search for similar patterns (how does chat handle blocking?)
- Don't miss broker events
- Consider edge cases (mid-call blocking)

Submit your proposal via mcp__diligence__propose when ready.`;

const REVIEWER_PROMPT = `You are a Reviewer agent in the diligence workflow.

Your brief has been loaded with:
- The Worker's proposal
- The task description
- Codebase context

Your job:
1. VERIFY every claim by searching the codebase yourself
2. Check if the proposal follows existing patterns
3. Look for missing broker events or edge cases
4. Do NOT trust the Worker's citations - verify them

For each claim in the proposal:
- Search for the file/line cited
- Verify it says what the Worker claims
- Check if there are related issues the Worker missed

Submit your review via mcp__diligence__review:
- APPROVED if all checks pass
- NEEDS_WORK with specific issues if not

Be strict - missing one broker event subscription can cause production bugs.`;

function log(msg) {
  const timestamp = new Date().toISOString().slice(11, 19);
  console.log(`[${timestamp}] ${msg}`);
}

function saveResult(name, content) {
  const timestamp = new Date().toISOString().slice(0, 10);
  const filename = `${timestamp}-${name}.md`;
  const path = join(RESULTS_DIR, filename);
  writeFileSync(path, content);
  log(`Saved: ${path}`);
  return path;
}

// Generate the test instructions
function generateTestInstructions() {
  const instructions = `# Diligence Comparison Test

## Test Bug
**ID:** ${TEST_BUG.id}
**Name:** ${TEST_BUG.name}

## Task
${TEST_BUG.task}

---

## Phase 1: Naive Approach (WITHOUT Diligence)

In a Claude Code session, paste this prompt:

\`\`\`
${NAIVE_PROMPT}
\`\`\`

Save the output as the "naive proposal".

---

## Phase 2: Diligence Approach (WITH Worker-Reviewer Loop)

### Step 1: Start the workflow
\`\`\`
mcp__diligence__start with task: "${TEST_BUG.task.split('\n')[0]}"
\`\`\`

### Step 2: Spawn Worker Agent
\`\`\`
1. Call mcp__diligence__get_worker_brief
2. Use Task tool with subagent_type="Explore" and this prompt:
   "${WORKER_PROMPT.replace(/\n/g, ' ').slice(0, 200)}..."
3. Worker should research and call mcp__diligence__propose
\`\`\`

### Step 3: Spawn Reviewer Agent
\`\`\`
1. Call mcp__diligence__get_reviewer_brief
2. Use Task tool with subagent_type="Explore" and this prompt:
   "${REVIEWER_PROMPT.replace(/\n/g, ' ').slice(0, 200)}..."
3. Reviewer should verify and call mcp__diligence__review
\`\`\`

### Step 4: Loop or Complete
- If NEEDS_WORK: spawn a new Worker with the updated brief
- If APPROVED: call mcp__diligence__implement

Save the final approved proposal as the "diligence proposal".

---

## Phase 3: Compare Results

### Checklist - What Naive Typically Misses
${TEST_BUG.naive_misses.map(m => `- [ ] ${m}`).join('\n')}

### Required Elements for Complete Fix
${TEST_BUG.required_elements.map(e => `- [ ] ${e}`).join('\n')}

### Scoring
- Naive proposal: count how many required elements it includes
- Diligence proposal: count how many required elements it includes
- Did diligence catch issues that naive missed?

---

## Expected Outcome

The naive approach will likely:
- Add a blocking check to answerDmCall() only
- Miss the other 5 required elements

The diligence approach should:
- Catch missing elements during review
- Iterate until all elements are addressed
- Produce a more complete proposal
`;

  return instructions;
}

// Main
async function main() {
  log('Generating comparison test instructions...');

  const instructions = generateTestInstructions();
  const path = saveResult('comparison-test-instructions', instructions);

  console.log('\n' + '='.repeat(60));
  console.log('COMPARISON TEST READY');
  console.log('='.repeat(60));
  console.log(`\nInstructions saved to: ${path}`);
  console.log('\nTo run the test:');
  console.log('1. Open the instructions file');
  console.log('2. Start a Claude Code session in ~/bude/codecharm/nexus');
  console.log('3. Run Phase 1 (naive) and save the output');
  console.log('4. Run Phase 2 (diligence) and save the output');
  console.log('5. Compare using the checklist in Phase 3');
  console.log('\n');

  // Also print the naive prompt for immediate use
  console.log('='.repeat(60));
  console.log('NAIVE PROMPT (for quick testing):');
  console.log('='.repeat(60));
  console.log(NAIVE_PROMPT);
  console.log('\n');
}

main().catch(console.error);
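Phase 3's scoring step ("count how many required elements each proposal includes") can be roughed out mechanically against the script's `required_elements` list. A minimal sketch, assuming naive first-token keyword matching; `scoreProposal` is illustrative glue, not part of the script above, and a human check remains the real scoring.

```javascript
// Count which required elements a proposal text mentions.
// Matching on the first token of each element ('answerDmCall',
// 'declineDmCall', 'BusUserBlockChange', ...) is a rough proxy only.
function scoreProposal(proposalText, requiredElements) {
  const text = proposalText.toLowerCase();
  const hit = requiredElements.filter(element =>
    text.includes(element.split(' ')[0].toLowerCase()));
  return {
    score: hit.length,
    total: requiredElements.length,
    missing: requiredElements.filter(e => !hit.includes(e)),
  };
}
```

Running both saved proposals through this gives a quick first pass at the Phase 3 checklist before the manual comparison.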