Diligence prevents AI agents from shipping quick fixes that break things by enforcing a research-propose-verify loop before any code changes.

Key features:
- A Worker sub-agent researches the task and proposes a fix with file:line citations
- A Reviewer sub-agent independently verifies every claim by searching the codebase itself
- The loop iterates until the proposal is approved (max 5 rounds)
- Project-specific context is loaded from .claude/CODEBASE_CONTEXT.md
- State is persisted across sessions

Validated on a production codebase: the loop caught an architectural mistake (broker subscriptions in client-side code) that a naive agent would have shipped.
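The core of the workflow is the loop itself. A minimal sketch in JavaScript, assuming hypothetical `workerPropose()` and `reviewerVerify()` stand-ins for spawning the sub-agents (the real workflow drives the `mcp__diligence__*` tools from Claude Code rather than calling functions directly):

```js
// Sketch only: workerPropose/reviewerVerify are hypothetical stand-ins for the
// Worker and Reviewer sub-agents; the actual flow uses the mcp__diligence__* tools.
async function diligenceLoop(task, maxRounds = 5) {
  let feedback = null;
  for (let round = 1; round <= maxRounds; round++) {
    // Worker: research the codebase and propose a fix with file:line citations.
    const proposal = await workerPropose(task, feedback);
    // Reviewer: independently re-search the codebase and verify every citation.
    const review = await reviewerVerify(task, proposal);
    if (review.verdict === 'APPROVED') return proposal; // safe to implement
    feedback = review.issues; // NEEDS_WORK: feed the issues back to a fresh Worker
  }
  throw new Error(`No approved proposal after ${maxRounds} rounds`);
}
```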
#!/usr/bin/env node
/**
 * Comparison Test: Naive vs Diligence Approach
 *
 * This script coordinates testing of both approaches:
 * 1. Naive: A single agent analyzes and proposes a fix
 * 2. Diligence: Worker-Reviewer loop with separate agents
 *
 * The test uses a real bug from the nexus codebase.
 *
 * Usage:
 *   node test/compare-approaches.mjs
 */

import { writeFileSync, mkdirSync, existsSync } from 'fs';
import { dirname, join } from 'path';
import { fileURLToPath } from 'url';

const __dirname = dirname(fileURLToPath(import.meta.url));
const RESULTS_DIR = join(__dirname, 'results');

// Ensure results directory exists
if (!existsSync(RESULTS_DIR)) {
  mkdirSync(RESULTS_DIR, { recursive: true });
}

const TEST_BUG = {
  id: 'B1',
  name: 'Blocked users can join/answer DM voice calls',
  task: `Fix bug B1: Blocked users can join DM voice calls.

When user A blocks user B, user B should NOT be able to:
1. Answer incoming DM calls from user A
2. Start new calls to user A (already works)
3. Join DM voice channel with user A (already works in joinVoiceChannel)

The bug is that answerDmCall() has no blocking check.

Analyze the codebase and propose a COMPLETE fix.`,

  // What naive agents typically miss
  naive_misses: [
    'declineDmCall() also needs blocking check for consistency',
    'notifyDmCall() should filter blocked users from notifications',
    'blockUser() should clean up existing voice calls',
    'Need to subscribe to BusUserBlockChange for mid-call kick',
    'Should follow the pattern from chat.service.ts where permission=visibility, actions have separate checks',
  ],

  // Required elements for a complete fix
  required_elements: [
    'answerDmCall blocking check',
    'declineDmCall blocking check',
    'notification filtering',
    'voice cleanup in blockUser()',
    'BusUserBlockChange subscription',
    'chat.service.ts pattern reference',
  ],
};

// Prompts for the test
const NAIVE_PROMPT = `You are analyzing a bug in the nexus codebase.

BUG: ${TEST_BUG.task}

Your job is to:
1. Search the codebase to understand the current implementation
2. Identify all files that need changes
3. Propose a complete fix

DO NOT use any diligence MCP tools. Just analyze and propose.

Be thorough - check for:
- Similar patterns in the codebase
- Broker events that might be relevant
- All places where blocking should be enforced
- Edge cases (what if block happens mid-call?)

Output your analysis and proposed fix.`;

const WORKER_PROMPT = `You are a Worker agent in the diligence workflow.

Your brief has been loaded with:
- The task description
- Codebase context (architecture, patterns)
- Any previous feedback

Your job:
1. Research the codebase thoroughly
2. Trace data flow from origin to all consumers
3. Find existing patterns for similar functionality
4. Identify ALL files that need changes
5. Propose a fix with file:line citations for every claim

IMPORTANT:
- Cite specific file:line for every claim
- Search for similar patterns (how does chat handle blocking?)
- Don't miss broker events
- Consider edge cases (mid-call blocking)

Submit your proposal via mcp__diligence__propose when ready.`;

const REVIEWER_PROMPT = `You are a Reviewer agent in the diligence workflow.

Your brief has been loaded with:
- The Worker's proposal
- The task description
- Codebase context

Your job:
1. VERIFY every claim by searching the codebase yourself
2. Check if the proposal follows existing patterns
3. Look for missing broker events or edge cases
4. Do NOT trust the Worker's citations - verify them

For each claim in the proposal:
- Search for the file/line cited
- Verify it says what the Worker claims
- Check if there are related issues the Worker missed

Submit your review via mcp__diligence__review:
- APPROVED if all checks pass
- NEEDS_WORK with specific issues if not

Be strict - missing one broker event subscription can cause production bugs.`;

// Log a message with an HH:MM:SS timestamp prefix
function log(msg) {
  const timestamp = new Date().toISOString().slice(11, 19);
  console.log(`[${timestamp}] ${msg}`);
}

// Save content to results/<YYYY-MM-DD>-<name>.md and return the path
function saveResult(name, content) {
  const timestamp = new Date().toISOString().slice(0, 10);
  const filename = `${timestamp}-${name}.md`;
  const path = join(RESULTS_DIR, filename);
  writeFileSync(path, content);
  log(`Saved: ${path}`);
  return path;
}

// Generate the test instructions
function generateTestInstructions() {
  const instructions = `# Diligence Comparison Test

## Test Bug
**ID:** ${TEST_BUG.id}
**Name:** ${TEST_BUG.name}

## Task
${TEST_BUG.task}

---

## Phase 1: Naive Approach (WITHOUT Diligence)

In a Claude Code session, paste this prompt:

\`\`\`
${NAIVE_PROMPT}
\`\`\`

Save the output as the "naive proposal".

---

## Phase 2: Diligence Approach (WITH Worker-Reviewer Loop)

### Step 1: Start the workflow
\`\`\`
mcp__diligence__start with task: "${TEST_BUG.task.split('\n')[0]}"
\`\`\`

### Step 2: Spawn Worker Agent
\`\`\`
1. Call mcp__diligence__get_worker_brief
2. Use Task tool with subagent_type="Explore" and this prompt:
   "${WORKER_PROMPT.replace(/\n/g, ' ').slice(0, 200)}..."
3. Worker should research and call mcp__diligence__propose
\`\`\`

### Step 3: Spawn Reviewer Agent
\`\`\`
1. Call mcp__diligence__get_reviewer_brief
2. Use Task tool with subagent_type="Explore" and this prompt:
   "${REVIEWER_PROMPT.replace(/\n/g, ' ').slice(0, 200)}..."
3. Reviewer should verify and call mcp__diligence__review
\`\`\`

### Step 4: Loop or Complete
- If NEEDS_WORK: spawn new Worker with updated brief
- If APPROVED: call mcp__diligence__implement

Save the final approved proposal as the "diligence proposal".

---

## Phase 3: Compare Results

### Checklist - What Naive Typically Misses
${TEST_BUG.naive_misses.map(m => `- [ ] ${m}`).join('\n')}

### Required Elements for Complete Fix
${TEST_BUG.required_elements.map(e => `- [ ] ${e}`).join('\n')}

### Scoring
- Naive proposal: Count how many required elements it includes
- Diligence proposal: Count how many required elements it includes
- Did diligence catch issues that naive missed?

---

## Expected Outcome

The naive approach will likely:
- Add blocking check to answerDmCall() only
- Miss the other 5 required elements

The diligence approach should:
- Catch missing elements during review
- Iterate until all elements are addressed
- Produce a more complete proposal

`;

  return instructions;
}

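// Possible extension (not part of the original script): a rough, keyword-based score
// of a saved proposal against TEST_BUG.required_elements. This is only an indicative
// count; the manual checklist in the generated instructions remains the real comparison.
function scoreProposal(proposalText, elements = TEST_BUG.required_elements) {
  const text = proposalText.toLowerCase();
  const hits = elements.filter((element) => {
    // Use the leading token of each element (e.g. "answerDmCall") as the search keyword.
    const keyword = element.split(' ')[0].toLowerCase();
    return text.includes(keyword);
  });
  return { score: hits.length, total: elements.length, hits };
}
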
// Main
async function main() {
  log('Generating comparison test instructions...');

  const instructions = generateTestInstructions();
  const path = saveResult('comparison-test-instructions', instructions);

  console.log('\n' + '='.repeat(60));
  console.log('COMPARISON TEST READY');
  console.log('='.repeat(60));
  console.log(`\nInstructions saved to: ${path}`);
  console.log('\nTo run the test:');
  console.log('1. Open the instructions file');
  console.log('2. Start a Claude Code session in ~/bude/codecharm/nexus');
  console.log('3. Run Phase 1 (naive) and save the output');
  console.log('4. Run Phase 2 (diligence) and save the output');
  console.log('5. Compare using the checklist in Phase 3');
  console.log('\n');

  // Also print the naive prompt for immediate use
  console.log('='.repeat(60));
  console.log('NAIVE PROMPT (for quick testing):');
  console.log('='.repeat(60));
  console.log(NAIVE_PROMPT);
  console.log('\n');
}

main().catch(console.error);