From bd178fcaf0b6e3f29b0796a373cd7b62ca9aeb4e Mon Sep 17 00:00:00 2001 From: "Marc J. Schmidt" Date: Thu, 22 Jan 2026 06:22:59 +0100 Subject: [PATCH] Initial release: MCP server enforcing Worker-Reviewer loop Diligence prevents AI agents from shipping quick fixes that break things by enforcing a research-propose-verify loop before any code changes. Key features: - Worker sub-agent researches and proposes with file:line citations - Reviewer sub-agent independently verifies claims by searching codebase - Iterates until approved (max 5 rounds) - Loads project-specific context from .claude/CODEBASE_CONTEXT.md - State persisted across sessions Validated on production codebase: caught architectural mistake (broker subscriptions on client-side code) that naive agent would have shipped. --- .gitignore | 2 + README.md | 274 +++++++++ REVIEWER_PROMPT.md | 161 ++++++ TESTING.md | 243 ++++++++ WORKER_PROMPT.md | 135 +++++ index.mjs | 547 ++++++++++++++++++ package-lock.json | 162 ++++++ package.json | 21 + test/compare-approaches.mjs | 255 ++++++++ test/dry-run.mjs | 305 ++++++++++ test/fixture/.claude/CODEBASE_CONTEXT.md | 150 +++++ test/fixture/src/broker/events.ts | 79 +++ .../src/controllers/roles.controller.ts | 99 ++++ test/fixture/src/services/chat.service.ts | 122 ++++ test/fixture/src/services/team.service.ts | 102 ++++ .../src/services/user-block.service.ts | 117 ++++ .../src/services/voice-channel.service.ts | 220 +++++++ test/mcp-client.mjs | 229 ++++++++ test/results/2026-01-22-comparison-report.md | 183 ++++++ test/run-tests.mjs | 415 +++++++++++++ test/scenarios/blocking-voice.json | 78 +++ test/scenarios/index.json | 21 + test/scenarios/permission-cache.json | 81 +++ 23 files changed, 4001 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 REVIEWER_PROMPT.md create mode 100644 TESTING.md create mode 100644 WORKER_PROMPT.md create mode 100644 index.mjs create mode 100644 package-lock.json create mode 100644 package.json create mode 100644 test/compare-approaches.mjs create mode 100644 test/dry-run.mjs create mode 100644 test/fixture/.claude/CODEBASE_CONTEXT.md create mode 100644 test/fixture/src/broker/events.ts create mode 100644 test/fixture/src/controllers/roles.controller.ts create mode 100644 test/fixture/src/services/chat.service.ts create mode 100644 test/fixture/src/services/team.service.ts create mode 100644 test/fixture/src/services/user-block.service.ts create mode 100644 test/fixture/src/services/voice-channel.service.ts create mode 100644 test/mcp-client.mjs create mode 100644 test/results/2026-01-22-comparison-report.md create mode 100644 test/run-tests.mjs create mode 100644 test/scenarios/blocking-voice.json create mode 100644 test/scenarios/index.json create mode 100644 test/scenarios/permission-cache.json diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3c536f6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +node_modules/ +.claude/.diligence-state.json diff --git a/README.md b/README.md new file mode 100644 index 0000000..e6e1d0b --- /dev/null +++ b/README.md @@ -0,0 +1,274 @@ +# @strikt/diligence + +An MCP server that enforces a Worker-Reviewer loop before code changes, preventing quick fixes that break things. + +## The Problem + +AI coding agents are too eager. Given a bug like "permission cache doesn't invalidate," they'll add `.clear()` somewhere and call it done. But real codebases have: + +- Broker events that need subscribing +- Caches in multiple locations +- Patterns that must be followed +- Client/server architecture boundaries +- Edge cases that cause production fires + +**A naive agent doesn't know what it doesn't know.** It proposes a fix, you approve it, and three days later you're debugging why permissions are broken in a completely different feature. + +## The Solution + +**Diligence** forces a research-verify loop before any code is written: + +``` +┌─────────────────────────────────────────────────────────────┐ +│ 1. WORKER (sub-agent) │ +│ - Searches codebase thoroughly │ +│ - Traces data flow from origin to all consumers │ +│ - Finds existing patterns for similar features │ +│ - Proposes fix with file:line citations │ +└─────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 2. REVIEWER (separate sub-agent) │ +│ - Gets fresh context (no Worker bias) │ +│ - Verifies every claim by searching codebase │ +│ - Checks against architecture patterns │ +│ - Returns NEEDS_WORK with specific feedback │ +└─────────────────────────────────────────────────────────────┘ + │ + ┌───────┴───────┐ + ▼ ▼ + NEEDS_WORK APPROVED + (loop back) │ + ▼ +┌─────────────────────────────────────────────────────────────┐ +│ 3. IMPLEMENTATION │ +│ - Only now can code be changed │ +│ - Follows the verified proposal │ +└─────────────────────────────────────────────────────────────┘ +``` + +## Real-World Validation + +Tested on a production codebase with a P0 bug: "Permission cache doesn't invalidate on role changes." + +| Round | Worker Proposed | Reviewer Found | +|-------|-----------------|----------------| +| 1 | Add broker event subscriptions to TeamService | **Wrong** - broker is server-side only, TeamService is client-side | +| 2 | Revised RPC-based approach | Missing error handling, incomplete pattern match | +| 3 | Simpler solution using ProjectService | Still gaps in implementation | +| 4 | Complete RPC stream solution | **Approved** | + +**A naive agent would have shipped Round 1** - adding broker subscriptions to client-side code that can't access the broker. That fix would have done nothing and wasted hours of debugging. + +## Installation + +```bash +npm install @strikt/diligence +``` + +Or clone directly: + +```bash +git clone https://github.com/strikt/diligence ~/tools/diligence +cd ~/tools/diligence && npm install +``` + +## Setup + +### 1. Add MCP Server + +In your project's `.mcp.json` or `.claude/settings.json`: + +```json +{ + "mcpServers": { + "diligence": { + "command": "node", + "args": ["/path/to/diligence/index.mjs"] + } + } +} +``` + +### 2. Create Codebase Context + +Create `.claude/CODEBASE_CONTEXT.md` in your project: + +```markdown +# Architecture + +[Describe your system architecture] + +## Key Patterns + +[Document patterns agents MUST follow] + +## Common Pitfalls + +[List things that break if done wrong] + +## Events/Hooks + +[List broker events, hooks, subscriptions that exist] +``` + +This context is loaded into both Worker and Reviewer briefs. + +### 3. Add to CLAUDE.md (Recommended) + +Add to your project's `CLAUDE.md`: + +```markdown +## Code Changes + +For any code changes, use the diligence workflow: + +1. Call `mcp__diligence__start` with the task +2. Spawn a Worker sub-agent with `get_worker_brief` to research and propose +3. Spawn a Reviewer sub-agent with `get_reviewer_brief` to verify +4. Loop until APPROVED +5. Call `implement` and make changes +``` + +This ensures Claude picks up diligence automatically without explicit instructions. + +## How It Works + +### Workflow Phases + +``` +conversation → researching → approved → implementing → conversation + ↑ │ + └──────────────┘ + (NEEDS_WORK, max 5 rounds) +``` + +### MCP Tools + +| Tool | Description | +|------|-------------| +| `start` | Begin workflow with a task description | +| `get_worker_brief` | Get full context for Worker sub-agent | +| `propose` | Worker submits proposal with citations | +| `get_reviewer_brief` | Get full context for Reviewer sub-agent | +| `review` | Reviewer submits APPROVED or NEEDS_WORK | +| `implement` | Begin implementation (requires approval) | +| `complete` | Mark done, reset to conversation | +| `status` | Check current workflow state | +| `abort` | Cancel and reset | + +### Sub-Agent Pattern + +The key insight: **Worker and Reviewer are separate sub-agents** with fresh context. + +``` +Main Claude Session + │ + ├─► Worker Sub-Agent (Explore) + │ - Receives: task + codebase context + previous feedback + │ - Does: searches, reads, analyzes + │ - Returns: proposal with file:line citations + │ + └─► Reviewer Sub-Agent (Explore) + - Receives: proposal + codebase context (NOT Worker's searches) + - Does: independently verifies every claim + - Returns: APPROVED or NEEDS_WORK with specifics +``` + +The Reviewer doesn't see the Worker's search results. It must verify claims by actually searching the codebase. This prevents rubber-stamping. + +## Project Structure + +``` +.claude/ +├── CODEBASE_CONTEXT.md # Required - architecture & patterns +├── WORKER_CONTEXT.md # Optional - project-specific worker guidance +├── REVIEWER_CONTEXT.md # Optional - project-specific reviewer guidance +├── context/ # Optional - additional context files +│ ├── voice.md +│ └── permissions.md +└── .diligence-state.json # Auto-generated workflow state +``` + +## Example Session + +``` +User: Fix the permission cache bug from the todo list + +Claude: [reads todo, understands task] +Claude: [calls diligence.start] + +Claude: [spawns Worker with Task tool] +Worker: [searches for permission cache, finds memoizedPermissions] +Worker: [searches for role events, finds BusTeamRoleChange] +Worker: [searches for patterns, finds how chat.service handles this] +Worker: [calls diligence.propose with full analysis] + +Claude: [spawns Reviewer with Task tool] +Reviewer: [verifies claim about memoizedPermissions - confirmed at team.service.ts:45] +Reviewer: [verifies claim about events - finds Worker missed BusTeamMemberRoleChange] +Reviewer: [calls diligence.review with NEEDS_WORK] + +Claude: [spawns new Worker with feedback] +Worker: [addresses feedback, adds missing event] +Worker: [calls diligence.propose] + +Claude: [spawns Reviewer] +Reviewer: [all claims verified] +Reviewer: [calls diligence.review with APPROVED] + +Claude: [calls diligence.implement] +Claude: [makes code changes following approved proposal] +Claude: [calls diligence.complete] +``` + +## Why This Works + +1. **Enforcement** - Can't write code without approval +2. **Fresh Context** - Reviewer has no bias from Worker's reasoning +3. **Verification** - Claims are checked against actual codebase +4. **Iteration** - Multiple rounds refine understanding +5. **Architecture Awareness** - Both agents loaded with project context +6. **Audit Trail** - Full history of proposals and feedback + +## Configuration + +### Environment Variables + +| Variable | Description | +|----------|-------------| +| None required | State stored in `.claude/.diligence-state.json` | + +### Constants (in index.mjs) + +| Constant | Default | Description | +|----------|---------|-------------| +| `MAX_ROUNDS` | 5 | Maximum Worker-Reviewer iterations before reset | + +## Testing + +```bash +# Run workflow mechanics tests +npm run test:workflow + +# Run mock scenario tests +npm run test:mock + +# Dry-run against a real project +node test/dry-run.mjs --project=/path/to/project --scenario=blocking-voice +``` + +## Limitations + +- Requires Claude Code with MCP support +- Sub-agents use context, so complex codebases may need focused CODEBASE_CONTEXT.md +- Not a replacement for human code review - complements it + +## License + +MIT + +## Contributing + +Issues and PRs welcome at https://github.com/strikt/diligence diff --git a/REVIEWER_PROMPT.md b/REVIEWER_PROMPT.md new file mode 100644 index 0000000..100685b --- /dev/null +++ b/REVIEWER_PROMPT.md @@ -0,0 +1,161 @@ +# Reviewer Instructions + +You are the Reviewer in a diligence workflow. Your job is to **verify claims by searching the codebase** and ensure the proposal is complete before implementation begins. + +## Your Mindset + +You are a skeptical senior engineer who: +- Doesn't trust claims without evidence +- Searches the codebase to verify +- Knows the patterns (from codebase context) +- Catches missing cases and edge conditions +- Approves only when genuinely confident + +**Your value comes from catching what the Worker missed.** If you rubber-stamp proposals, bugs ship. + +--- + +## Review Process + +### 1. For Each Claim, Search to Verify + +The Worker says "there are no events for X" → Search for events +The Worker says "this is the only place" → Search for other places +The Worker says "following the pattern in Y" → Read Y and compare + +**Do not trust. Verify.** + +### 2. Check Against Codebase Context + +The codebase context describes: +- Architecture patterns +- Common pitfalls +- Key events/hooks +- Validation checklists + +Cross-reference the proposal against these. Did the Worker address them? + +### 3. Look for What's Missing + +Common gaps: +- Events/hooks that should be subscribed to +- Caches that need invalidation +- Related code that assumes old behavior +- Error handling and edge cases +- Cleanup on failure/disconnect + +### 4. Verify the Pattern Match + +If Worker claims to follow an existing pattern: +- Read the cited pattern code +- Compare to proposed implementation +- Flag any deviations + +--- + +## Verification Format + +For each major claim, document: + +```markdown +### Claim: "[quote from proposal]" + +**Searched:** [what you searched for] +**Found:** [what you actually found] +**Verdict:** ✅ Verified / ❌ Incorrect / ⚠️ Incomplete +``` + +--- + +## Decision Criteria + +### APPROVED + +Use APPROVED when: +- All claims verified by searching codebase +- Proposal follows existing patterns correctly +- All scenarios from codebase context addressed +- No obvious gaps in the implementation plan +- You are confident this will work + +### NEEDS_WORK + +Use NEEDS_WORK when: +- Claims don't match what you found in codebase +- Missing events/hooks/subscriptions +- Doesn't follow existing patterns +- Scenarios or edge cases not addressed +- The implementation plan has gaps + +--- + +## Feedback Format (NEEDS_WORK) + +```markdown +## Verification Results + +### Claim: "[claim]" +**Searched:** `grep pattern` +**Found:** [results] +**Verdict:** ❌ Incorrect - [explanation] + +### Claim: "[claim]" +**Searched:** `grep pattern` +**Found:** [results] +**Verdict:** ⚠️ Incomplete - [what's missing] + +## Missing Items + +1. **[Item]** - Found `EventName` at file:line that isn't addressed +2. **[Item]** - The pattern in `other-file.ts` does X but proposal doesn't +3. **[Item]** - Scenario Y from codebase context not covered + +## Required Changes + +To approve, the Worker must: +1. [Specific actionable item] +2. [Specific actionable item] +3. [Specific actionable item] +``` + +--- + +## Common Things to Check + +| Category | What to Search | Why | +|----------|---------------|-----| +| Events | `Bus*`, `emit`, `subscribe` | State changes need propagation | +| Caches | `cache`, `memoize`, `Map<` | May need invalidation | +| Patterns | Similar feature names | Should follow conventions | +| Cleanup | `destroy`, `disconnect`, `leave` | Resources need cleanup | +| Errors | `throw`, `catch`, `UserError` | Error handling patterns | + +--- + +## Using Tools + +You have full access to search the codebase: +- **Grep** - Essential for verification +- **Glob** - Find related files +- **Read** - Check full context + +**A review without searches is not a review.** You must demonstrate you looked. + +--- + +## Red Flags + +Reject immediately if: +- Worker provides no file:line citations +- Claims contradict what you find in codebase +- Proposal ignores items from codebase context +- "I'll figure it out during implementation" +- Vague or hand-wavy implementation steps + +--- + +## Remember + +Your job is not to be difficult. Your job is to **catch problems before they become bugs in production.** + +If the proposal is solid, approve it. If it has gaps, send it back with specific, actionable feedback. diff --git a/TESTING.md b/TESTING.md new file mode 100644 index 0000000..dd8f54d --- /dev/null +++ b/TESTING.md @@ -0,0 +1,243 @@ +# Testing the Diligence MCP Server + +## Test Suite Overview + +The diligence project includes a comprehensive test suite that validates: + +1. **Workflow mechanics** - State transitions, round limits, phase enforcement +2. **Mock scenarios** - Predefined Worker-Reviewer interactions with expected outcomes +3. **Dry-run mode** - Test against real projects without making changes + +## Quick Start + +```bash +# Run all workflow tests +node test/run-tests.mjs --workflow + +# Run mock scenario tests +node test/run-tests.mjs --mock + +# Run a specific scenario +node test/run-tests.mjs --mock --scenario=blocking-voice + +# Dry run against nexus (no changes) +node test/dry-run.mjs --project=~/bude/codecharm/nexus --scenario=blocking-voice +``` + +## Test Structure + +``` +test/ +├── mcp-client.mjs # Programmatic MCP client +├── run-tests.mjs # Main test runner +├── dry-run.mjs # Dry-run against real projects +├── fixture/ # Mock codebase for testing +│ ├── src/ +│ │ ├── broker/events.ts +│ │ ├── services/ +│ │ └── controllers/ +│ └── .claude/ +│ └── CODEBASE_CONTEXT.md +└── scenarios/ # Test scenarios + ├── index.json + ├── blocking-voice.json + └── permission-cache.json +``` + +## Test Modes + +### 1. Workflow Tests (`--workflow`) + +Tests the MCP server mechanics without AI: + +- Phase transitions (conversation → researching → approved → implementing) +- Round increment on NEEDS_WORK +- Max rounds enforcement (resets after 5 rounds) +- Feedback accumulation +- Abort functionality + +```bash +node test/run-tests.mjs --workflow +``` + +### 2. Mock Tests (`--mock`) + +Tests complete Worker-Reviewer scenarios with predefined responses: + +1. Scenario defines a task and expected naive/correct fixes +2. Test simulates Worker submitting naive proposal +3. Reviewer catches issues, sends NEEDS_WORK +4. Worker submits revised proposal with all fixes +5. Reviewer approves +6. Validates that proposal mentions all required elements + +```bash +# All scenarios +node test/run-tests.mjs --mock + +# Single scenario +node test/run-tests.mjs --mock --scenario=permission-cache +``` + +### 3. Dry Run (`--project`) + +Connects to MCP server in a real project directory: + +```bash +# With predefined scenario +node test/dry-run.mjs --project=~/bude/codecharm/nexus --scenario=blocking-voice + +# With custom task +node test/dry-run.mjs --project=/path/to/project --task="Fix the caching bug" +``` + +This: +- Starts the workflow with the task +- Shows the full Worker Brief (including real CODEBASE_CONTEXT.md) +- Does NOT make any code changes +- Aborts the workflow on exit + +## Test Fixture + +The fixture (`test/fixture/`) is a mini codebase that mirrors real-world patterns: + +### Files + +| File | Purpose | Known Bugs (for testing) | +|------|---------|-------------------------| +| `broker/events.ts` | Event bus definitions | Reference implementation | +| `services/user-block.service.ts` | Blocking logic | Missing voice cleanup | +| `services/voice-channel.service.ts` | Voice/DM calls | Missing blocking check on answerDmCall | +| `services/team.service.ts` | Permission cache | Doesn't subscribe to role events | +| `services/chat.service.ts` | **Correct pattern** | Shows permission vs action separation | +| `controllers/roles.controller.ts` | Role CRUD | Missing broker events on create/delete | + +### Patterns Tested + +1. **Broker event emission** - Every state change should emit events +2. **Cache invalidation** - Caches must subscribe to relevant events +3. **Permission vs Action** - Permissions control visibility, actions have separate checks +4. **Multi-location fixes** - If one place has a check, similar places need it too + +## Test Scenarios + +### blocking-voice + +**Task:** Fix blocked users can still answer DM voice calls + +**Naive fix:** Add blocking check to answerDmCall only + +**Correct fix:** +- Add blocking check to answerDmCall +- Add blocking check to declineDmCall +- Filter notifications for blocked users +- Add voice cleanup to blockUser() +- Subscribe to BusUserBlockChange for mid-call kicks + +### permission-cache + +**Task:** Fix permission cache doesn't invalidate when roles change + +**Naive fix:** Add .clear() somewhere + +**Correct fix:** +- Subscribe to BusTeamRoleChange in team.service +- Subscribe to BusTeamMemberRoleChange in team.service +- Add broker event to createRole() +- Add broker event to deleteRole() + +## Adding New Scenarios + +Create a new JSON file in `test/scenarios/`: + +```json +{ + "id": "my-scenario", + "name": "Human-readable name", + "description": "Brief description", + + "task": "The task description given to the Worker", + + "naive_fix": { + "description": "What a quick-fix agent would do", + "changes": [ + { "file": "path/to/file.ts", "change": "Quick fix description" } + ], + "issues": [ + "What the naive fix misses #1", + "What the naive fix misses #2" + ] + }, + + "correct_fix": { + "description": "Complete fix description", + "required_changes": [ + { "file": "path.ts", "function": "funcName", "change": "What to change" } + ], + "required_broker_subscriptions": [ + { "service": "x.service.ts", "event": "BusEventName", "action": "What to do" } + ] + }, + + "validation_criteria": { + "must_mention": ["keyword1", "keyword2"], + "should_reference_pattern": "reference-file.ts" + } +} +``` + +Add to `test/scenarios/index.json`: + +```json +{ + "scenarios": [ + { "id": "my-scenario", "file": "my-scenario.json" } + ] +} +``` + +## Real-World Testing + +For true validation, test with real Claude sub-agents: + +``` +Root Agent: + 1. Call mcp__diligence__start with task + 2. Spawn Worker agent (Task tool) with get_worker_brief + 3. Worker researches, submits proposal via mcp__diligence__propose + 4. Spawn Reviewer agent (Task tool) with get_reviewer_brief + 5. Reviewer verifies claims, submits via mcp__diligence__review + 6. If NEEDS_WORK, spawn new Worker with updated brief + 7. If APPROVED, proceed to implementation +``` + +**Why separate agents matter:** +- Fresh context = no bias from previous reasoning +- Reviewer doesn't know Worker's search results +- Forces genuine verification, not rubber-stamping + +## Success Criteria + +1. **Reviewer catches issues** that Worker initially misses +2. **Multiple rounds** occur before approval +3. **Final proposal** is more complete than naive approach +4. **Validation criteria** are all met + +## Environment Variables + +| Variable | Purpose | +|----------|---------| +| `DEBUG=1` | Show MCP server stderr output | +| `ANTHROPIC_API_KEY` | Required for `--live` mode (future) | + +## CI Integration + +```bash +# Run in CI +npm test + +# Or directly +node test/run-tests.mjs --workflow && node test/run-tests.mjs --mock +``` + +Exit codes: 0 = pass, 1 = fail diff --git a/WORKER_PROMPT.md b/WORKER_PROMPT.md new file mode 100644 index 0000000..39c677a --- /dev/null +++ b/WORKER_PROMPT.md @@ -0,0 +1,135 @@ +# Worker Instructions + +You are the Worker in a diligence workflow. Your job is to **thoroughly research** and **propose a complete solution** that will be verified by a Reviewer. + +## Your Mindset + +You are a diligent engineer who: +- Researches before proposing +- Traces data flow end-to-end +- Finds and follows existing patterns +- Identifies ALL affected code +- Considers edge cases and failure modes +- Cites specific file:line locations for every claim + +**The Reviewer will verify your claims by searching the codebase.** If you lie, guess, or miss something, you will be caught and sent back to revise. + +--- + +## Before Proposing + +### 1. Trace the Data Flow + +Answer these questions: +- **Origin:** Where does the relevant data come from? +- **Flow:** What path does it take through the system? +- **Consumers:** What code uses this data? +- **Side effects:** What gets triggered when this changes? + +### 2. Find Existing Patterns + +Search for similar functionality: +- How does the codebase handle similar cases? +- What patterns are used? +- Are there helper functions you should use? +- What events/hooks exist? + +### 3. Identify All Touchpoints + +List every file that needs changes: +- Don't just fix the symptom - trace to root cause +- Check for caches, events, subscriptions +- Look for related code that assumes current behavior + +### 4. Consider Scenarios + +What situations must your fix handle? +- Happy path +- Error cases +- Race conditions +- Edge cases mentioned in codebase context + +--- + +## Proposal Format + +```markdown +## Summary + +[1-2 sentences: what you're fixing and how] + +## Data Flow Analysis + +### Origin +[Where the data comes from - cite file:line] + +### Flow +[How it moves through the system - cite each step] + +### Consumers +[What uses this data and what they expect] + +## Existing Pattern + +[How similar features work in this codebase - cite examples] + +I will follow this pattern by: [explain] + +## Files Affected + +| File | Line(s) | Change | +|------|---------|--------| +| path/to/file.ts | 123-130 | [what changes] | +| path/to/other.ts | 45 | [what changes] | + +## Implementation + +1. [First change - be specific] +2. [Second change - be specific] +3. [etc.] + +## Scenarios Covered + +- [x] Scenario 1: [description] - handled by [which part of fix] +- [x] Scenario 2: [description] - handled by [which part of fix] +- [x] Edge case: [description] - handled by [which part of fix] + +## Not Covered / Out of Scope + +[Anything you're explicitly NOT fixing and why] +``` + +--- + +## Common Mistakes to Avoid + +| Mistake | Why It's Wrong | Do This Instead | +|---------|----------------|-----------------| +| "I'll add a check here" | Doesn't explain why it's missing | Trace why the check doesn't exist | +| "This should fix it" | No verification | Cite code proving it will work | +| "Only one file needs changes" | Probably missed something | Search for all usages | +| Guessing at patterns | Will be caught by Reviewer | Search and cite actual code | +| Ignoring codebase context | Missing critical architecture | Re-read the context section | + +--- + +## Using Tools + +You have full access to search and read the codebase: +- **Grep** - Search for patterns, function names, events +- **Glob** - Find files by path pattern +- **Read** - Read specific files + +**Use them extensively.** A proposal without searches is a proposal that will be rejected. + +--- + +## If You Get Feedback + +When the Reviewer sends NEEDS_WORK: +1. Read the feedback carefully +2. Do additional searches to address gaps +3. Revise your proposal to cover missing items +4. Don't just add - verify your additions are correct + +You have limited rounds. Make each one count. diff --git a/index.mjs b/index.mjs new file mode 100644 index 0000000..234a1bb --- /dev/null +++ b/index.mjs @@ -0,0 +1,547 @@ +#!/usr/bin/env node +/** + * Diligence MCP Server + * + * Enforces diligent code changes through a worker-reviewer loop. + * Prevents quick fixes that miss patterns, events, and edge cases. + * + * Workflow: + * 1. User describes task → conversation phase + * 2. Start work → research phase begins + * 3. Worker proposes → Reviewer challenges → repeat until approved + * 4. Approved → implementation phase + * 5. Complete → back to conversation + * + * Project-specific context is loaded from .claude/CODEBASE_CONTEXT.md + * Generic prompts are bundled with this server. + */ + +import { Server } from '@modelcontextprotocol/sdk/server/index.js'; +import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'; +import { ListToolsRequestSchema, CallToolRequestSchema } from '@modelcontextprotocol/sdk/types.js'; +import { existsSync, readFileSync, writeFileSync, mkdirSync, readdirSync } from 'fs'; +import { join, dirname, basename } from 'path'; +import { fileURLToPath } from 'url'; +import { execSync } from 'child_process'; + +// ============================================================================= +// Configuration +// ============================================================================= + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const MAX_ROUNDS = 5; + +/** + * Find the project root by looking for .git directory. + * Falls back to current working directory. + */ +function findProjectRoot() { + try { + const gitRoot = execSync('git rev-parse --show-toplevel', { + encoding: 'utf-8', + stdio: ['pipe', 'pipe', 'pipe'] + }).trim(); + return gitRoot; + } catch { + return process.cwd(); + } +} + +const PROJECT_ROOT = findProjectRoot(); +const STATE_DIR = join(PROJECT_ROOT, '.claude'); +const STATE_FILE = join(STATE_DIR, '.diligence-state.json'); +const CODEBASE_CONTEXT_FILE = join(STATE_DIR, 'CODEBASE_CONTEXT.md'); + +// Bundled prompts (shipped with this package) +const WORKER_PROMPT_FILE = join(__dirname, 'WORKER_PROMPT.md'); +const REVIEWER_PROMPT_FILE = join(__dirname, 'REVIEWER_PROMPT.md'); + +// ============================================================================= +// State Management +// ============================================================================= + +/** + * @typedef {Object} WorkflowState + * @property {'conversation' | 'researching' | 'approved' | 'implementing'} phase + * @property {string | null} task + * @property {number} round + * @property {string | null} proposal + * @property {Array<{round: number, feedback: string}>} feedback + * @property {Array} userContext + * @property {string | null} approvalReason + * @property {string | null} startedAt + */ + +/** @type {WorkflowState} */ +let state = { + phase: 'conversation', + task: null, + round: 0, + proposal: null, + feedback: [], + userContext: [], + approvalReason: null, + startedAt: null, +}; + +function loadState() { + try { + if (existsSync(STATE_FILE)) { + const saved = JSON.parse(readFileSync(STATE_FILE, 'utf-8')); + state = { ...state, ...saved }; + } + } catch { + // Use default state + } +} + +function saveState() { + try { + if (!existsSync(STATE_DIR)) { + mkdirSync(STATE_DIR, { recursive: true }); + } + writeFileSync(STATE_FILE, JSON.stringify(state, null, 2)); + } catch { + // Ignore write errors + } +} + +function resetState() { + state = { + phase: 'conversation', + task: null, + round: 0, + proposal: null, + feedback: [], + userContext: [], + approvalReason: null, + startedAt: null, + }; + saveState(); +} + +// ============================================================================= +// Context Loading +// ============================================================================= + +// Project-specific context files +const PROJECT_WORKER_CONTEXT = join(STATE_DIR, 'WORKER_CONTEXT.md'); +const PROJECT_REVIEWER_CONTEXT = join(STATE_DIR, 'REVIEWER_CONTEXT.md'); +const PROJECT_CONTEXT_DIR = join(STATE_DIR, 'context'); + +function loadFile(filePath) { + try { + if (existsSync(filePath)) { + return readFileSync(filePath, 'utf-8'); + } + } catch { + // Ignore + } + return ''; +} + +/** + * Load all .md files from a directory. + * @param {string} dirPath + * @returns {Array<{name: string, content: string}>} + */ +function loadContextDir(dirPath) { + const files = []; + try { + if (existsSync(dirPath)) { + const entries = readdirSync(dirPath); + for (const entry of entries) { + if (entry.endsWith('.md')) { + const content = loadFile(join(dirPath, entry)); + if (content) { + files.push({ name: basename(entry, '.md'), content }); + } + } + } + } + } catch { + // Ignore + } + return files; +} + +function getCodebaseContext() { + let context = ''; + + // Main codebase context + const mainContent = loadFile(CODEBASE_CONTEXT_FILE); + if (mainContent) { + context += mainContent; + } else { + context += `(No CODEBASE_CONTEXT.md found at ${CODEBASE_CONTEXT_FILE})\n\nCreate this file to provide project-specific architecture and patterns.`; + } + + // Additional context files from .claude/context/ + const extraFiles = loadContextDir(PROJECT_CONTEXT_DIR); + if (extraFiles.length > 0) { + context += '\n\n---\n\n# Additional Context\n\n'; + for (const file of extraFiles) { + context += `## ${file.name}\n\n${file.content}\n\n`; + } + } + + return context; +} + +function getWorkerPrompt() { + // Bundled generic prompt + let prompt = loadFile(WORKER_PROMPT_FILE); + + // Project-specific additions + const projectWorker = loadFile(PROJECT_WORKER_CONTEXT); + if (projectWorker) { + prompt += '\n\n---\n\n# Project-Specific Worker Guidance\n\n'; + prompt += projectWorker; + } + + return prompt; +} + +function getReviewerPrompt() { + // Bundled generic prompt + let prompt = loadFile(REVIEWER_PROMPT_FILE); + + // Project-specific additions + const projectReviewer = loadFile(PROJECT_REVIEWER_CONTEXT); + if (projectReviewer) { + prompt += '\n\n---\n\n# Project-Specific Reviewer Guidance\n\n'; + prompt += projectReviewer; + } + + return prompt; +} + +// ============================================================================= +// MCP Server +// ============================================================================= + +const server = new Server({ + name: 'diligence', + version: '0.1.0', +}, { + capabilities: { tools: {} } +}); + +// ============================================================================= +// Tool Definitions +// ============================================================================= + +server.setRequestHandler(ListToolsRequestSchema, async () => ({ + tools: [ + { + name: 'status', + description: 'Get current diligence workflow state (phase, round, task). Check this before taking actions.', + inputSchema: { type: 'object', properties: {} } + }, + { + name: 'start', + description: 'Start the diligence workflow for a task. Worker will research and propose, Reviewer will verify.', + inputSchema: { + type: 'object', + properties: { + task: { type: 'string', description: 'Description of the task/bug to fix' } + }, + required: ['task'] + } + }, + { + name: 'propose', + description: 'Worker submits a proposal with code citations, data flow analysis, and implementation plan.', + inputSchema: { + type: 'object', + properties: { + proposal: { type: 'string', description: 'Full proposal in markdown with file:line citations' } + }, + required: ['proposal'] + } + }, + { + name: 'review', + description: 'Reviewer submits APPROVED or NEEDS_WORK after verifying claims by searching the codebase.', + inputSchema: { + type: 'object', + properties: { + decision: { type: 'string', enum: ['APPROVED', 'NEEDS_WORK'], description: 'Review decision' }, + reasoning: { type: 'string', description: 'Verification results and feedback' } + }, + required: ['decision', 'reasoning'] + } + }, + { + name: 'add_context', + description: 'Add user feedback to incorporate in the next round.', + inputSchema: { + type: 'object', + properties: { + context: { type: 'string', description: 'Additional context from user' } + }, + required: ['context'] + } + }, + { + name: 'get_worker_brief', + description: 'Get the full brief for the Worker: task, codebase context, previous feedback, instructions.', + inputSchema: { type: 'object', properties: {} } + }, + { + name: 'get_reviewer_brief', + description: 'Get the full brief for the Reviewer: proposal to verify, codebase context, instructions.', + inputSchema: { type: 'object', properties: {} } + }, + { + name: 'approve', + description: 'User manually approves, bypassing further review. Use when proposal is clearly correct.', + inputSchema: { + type: 'object', + properties: { + reason: { type: 'string', description: 'Why approving manually' } + }, + required: ['reason'] + } + }, + { + name: 'implement', + description: 'Begin implementation phase. Only allowed after approval.', + inputSchema: { type: 'object', properties: {} } + }, + { + name: 'complete', + description: 'Mark implementation complete. Returns to conversation phase.', + inputSchema: { + type: 'object', + properties: { + summary: { type: 'string', description: 'Summary of changes made' } + }, + required: ['summary'] + } + }, + { + name: 'abort', + description: 'Cancel workflow and reset to conversation.', + inputSchema: { + type: 'object', + properties: { + reason: { type: 'string', description: 'Why aborting' } + } + } + } + ] +})); + +// ============================================================================= +// Tool Handlers +// ============================================================================= + +server.setRequestHandler(CallToolRequestSchema, async (request) => { + const { name, arguments: args } = request.params; + + try { + switch (name) { + case 'status': { + const lines = [ + `Phase: ${state.phase}`, + `Task: ${state.task || '(none)'}`, + `Round: ${state.round}/${MAX_ROUNDS}`, + `Proposal: ${state.proposal ? `${state.proposal.length} chars` : '(none)'}`, + `Feedback rounds: ${state.feedback.length}`, + state.userContext.length ? `User context: ${state.userContext.length} items` : null, + state.approvalReason ? `Approved: ${state.approvalReason.slice(0, 80)}...` : null, + `Project: ${PROJECT_ROOT}`, + ].filter(Boolean); + return { content: [{ type: 'text', text: lines.join('\n') }] }; + } + + case 'start': { + if (state.phase !== 'conversation') { + return { content: [{ type: 'text', text: `Already in "${state.phase}" phase. Use abort first.` }], isError: true }; + } + state.phase = 'researching'; + state.task = args.task; + state.round = 1; + state.proposal = null; + state.feedback = []; + state.userContext = []; + state.approvalReason = null; + state.startedAt = new Date().toISOString(); + saveState(); + return { content: [{ type: 'text', text: `Started: "${args.task}"\n\nPhase: researching\nRound: 1/${MAX_ROUNDS}\n\nWorker should use get_worker_brief for context, then research and propose.` }] }; + } + + case 'propose': { + if (state.phase !== 'researching') { + return { content: [{ type: 'text', text: `Not in researching phase (current: ${state.phase})` }], isError: true }; + } + state.proposal = args.proposal; + saveState(); + return { content: [{ type: 'text', text: `Proposal submitted (${args.proposal.length} chars).\n\nRound: ${state.round}/${MAX_ROUNDS}\n\nReviewer should use get_reviewer_brief, verify claims by searching codebase, then review.` }] }; + } + + case 'review': { + if (state.phase !== 'researching') { + return { content: [{ type: 'text', text: `Not in researching phase (current: ${state.phase})` }], isError: true }; + } + if (!state.proposal) { + return { content: [{ type: 'text', text: 'No proposal to review yet' }], isError: true }; + } + + const { decision, reasoning } = args; + + if (decision === 'APPROVED') { + state.phase = 'approved'; + state.approvalReason = reasoning; + saveState(); + return { content: [{ type: 'text', text: `APPROVED after ${state.round} round(s).\n\n${reasoning.slice(0, 500)}\n\nUse implement to begin making changes.` }] }; + } + + if (decision === 'NEEDS_WORK') { + state.feedback.push({ round: state.round, feedback: reasoning }); + state.round++; + state.userContext = []; + + if (state.round > MAX_ROUNDS) { + const lastFeedback = reasoning; + resetState(); + return { content: [{ type: 'text', text: `MAX ROUNDS (${MAX_ROUNDS}) reached. Workflow reset.\n\nLast feedback: ${lastFeedback.slice(0, 500)}\n\nBreak down the task or discuss with user.` }] }; + } + + saveState(); + return { content: [{ type: 'text', text: `NEEDS_WORK - Round ${state.round}/${MAX_ROUNDS}\n\n${reasoning}\n\nWorker should revise addressing this feedback.` }] }; + } + + return { content: [{ type: 'text', text: `Invalid decision: ${decision}` }], isError: true }; + } + + case 'add_context': { + if (state.phase !== 'researching') { + return { content: [{ type: 'text', text: `Not in researching phase` }], isError: true }; + } + state.userContext.push(args.context); + saveState(); + return { content: [{ type: 'text', text: `Context added. ${state.userContext.length} item(s) pending.` }] }; + } + + case 'get_worker_brief': { + let brief = '# Worker Brief\n\n'; + brief += `## Task\n\n${state.task || '(No task - use start first)'}\n\n`; + brief += `## Round\n\n${state.round}/${MAX_ROUNDS}\n\n`; + + if (state.feedback.length > 0) { + brief += '## Previous Feedback (Address ALL)\n\n'; + for (const f of state.feedback) { + brief += `### Round ${f.round}\n\n${f.feedback}\n\n`; + } + } + + if (state.userContext.length > 0) { + brief += '## User Context\n\n'; + state.userContext.forEach(c => brief += `- ${c}\n`); + brief += '\n'; + } + + brief += '## Codebase Context\n\n'; + brief += getCodebaseContext(); + brief += '\n\n'; + + const workerPrompt = getWorkerPrompt(); + if (workerPrompt) { + brief += '## Instructions\n\n'; + brief += workerPrompt; + } + + return { content: [{ type: 'text', text: brief }] }; + } + + case 'get_reviewer_brief': { + let brief = '# Reviewer Brief\n\n'; + brief += `## Task\n\n${state.task || '(No task)'}\n\n`; + brief += `## Round\n\n${state.round}/${MAX_ROUNDS}\n\n`; + + if (state.proposal) { + brief += '## Proposal to Review\n\n'; + brief += state.proposal; + brief += '\n\n'; + } else { + brief += '## Proposal\n\n(No proposal yet)\n\n'; + } + + if (state.feedback.length > 0) { + brief += '## Previous Feedback\n\n'; + for (const f of state.feedback) { + brief += `### Round ${f.round}\n\n${f.feedback}\n\n`; + } + } + + brief += '## Codebase Context\n\n'; + brief += getCodebaseContext(); + brief += '\n\n'; + + const reviewerPrompt = getReviewerPrompt(); + if (reviewerPrompt) { + brief += '## Instructions\n\n'; + brief += reviewerPrompt; + } + + return { content: [{ type: 'text', text: brief }] }; + } + + case 'approve': { + if (state.phase !== 'researching') { + return { content: [{ type: 'text', text: `Not in researching phase` }], isError: true }; + } + if (!state.proposal) { + return { content: [{ type: 'text', text: 'No proposal to approve' }], isError: true }; + } + state.phase = 'approved'; + state.approvalReason = `USER: ${args.reason}`; + saveState(); + return { content: [{ type: 'text', text: `Manually approved.\n\nUse implement to begin.` }] }; + } + + case 'implement': { + if (state.phase !== 'approved') { + return { content: [{ type: 'text', text: `Not approved yet (current: ${state.phase})` }], isError: true }; + } + state.phase = 'implementing'; + saveState(); + return { content: [{ type: 'text', text: `Implementation phase.\n\nTask: ${state.task}\n\nMake changes following the approved proposal.` }] }; + } + + case 'complete': { + if (state.phase !== 'implementing') { + return { content: [{ type: 'text', text: `Not in implementing phase` }], isError: true }; + } + const task = state.task; + const rounds = state.round; + resetState(); + return { content: [{ type: 'text', text: `Complete!\n\nTask: ${task}\nRounds: ${rounds}\nSummary: ${args.summary}\n\nReset to conversation.` }] }; + } + + case 'abort': { + const prev = { phase: state.phase, task: state.task }; + resetState(); + return { content: [{ type: 'text', text: `Aborted.\n\nWas: ${prev.phase}, "${prev.task || ''}"\nReason: ${args.reason || '(none)'}\n\nReset to conversation.` }] }; + } + + default: + throw new Error(`Unknown tool: ${name}`); + } + } catch (e) { + return { content: [{ type: 'text', text: `Error: ${e.message}` }], isError: true }; + } +}); + +// ============================================================================= +// Lifecycle +// ============================================================================= + +process.on('SIGTERM', () => { saveState(); process.exit(0); }); +process.on('SIGINT', () => { saveState(); process.exit(0); }); + +loadState(); +const transport = new StdioServerTransport(); +await server.connect(transport); diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 0000000..57254d3 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,162 @@ +{ + "name": "@strikt/diligence", + "version": "0.1.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "@strikt/diligence", + "version": "0.1.0", + "license": "MIT", + "dependencies": { + "@modelcontextprotocol/sdk": "^0.5.0" + }, + "bin": { + "diligence": "index.mjs" + } + }, + "node_modules/@modelcontextprotocol/sdk": { + "version": "0.5.0", + "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-0.5.0.tgz", + "integrity": "sha512-RXgulUX6ewvxjAG0kOpLMEdXXWkzWgaoCGaA2CwNW7cQCIphjpJhjpHSiaPdVCnisjRF/0Cm9KWHUuIoeiAblQ==", + "license": "MIT", + "dependencies": { + "content-type": "^1.0.5", + "raw-body": "^3.0.0", + "zod": "^3.23.8" + } + }, + "node_modules/bytes": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", + "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==", + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/content-type": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz", + "integrity": "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==", + "license": "MIT", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/depd": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz", + "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==", + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/http-errors": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.1.tgz", + "integrity": "sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==", + "license": "MIT", + "dependencies": { + "depd": "~2.0.0", + "inherits": "~2.0.4", + "setprototypeof": "~1.2.0", + "statuses": "~2.0.2", + "toidentifier": "~1.0.1" + }, + "engines": { + "node": ">= 0.8" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, + "node_modules/iconv-lite": { + "version": "0.7.2", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.2.tgz", + "integrity": "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==", + "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, + "node_modules/inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", + "license": "ISC" + }, + "node_modules/raw-body": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/raw-body/-/raw-body-3.0.2.tgz", + "integrity": "sha512-K5zQjDllxWkf7Z5xJdV0/B0WTNqx6vxG70zJE4N0kBs4LovmEYWJzQGxC9bS9RAKu3bgM40lrd5zoLJ12MQ5BA==", + "license": "MIT", + "dependencies": { + "bytes": "~3.1.2", + "http-errors": "~2.0.1", + "iconv-lite": "~0.7.0", + "unpipe": "~1.0.0" + }, + "engines": { + "node": ">= 0.10" + } + }, + "node_modules/safer-buffer": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", + "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", + "license": "MIT" + }, + "node_modules/setprototypeof": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.2.0.tgz", + "integrity": "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==", + "license": "ISC" + }, + "node_modules/statuses": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/statuses/-/statuses-2.0.2.tgz", + "integrity": "sha512-DvEy55V3DB7uknRo+4iOGT5fP1slR8wQohVdknigZPMpMstaKJQWhwiYBACJE3Ul2pTnATihhBYnRhZQHGBiRw==", + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/toidentifier": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/toidentifier/-/toidentifier-1.0.1.tgz", + "integrity": "sha512-o5sSPKEkg/DIQNmH43V0/uerLrpzVedkUh8tGNvaeXpfpuwjKenlSox/2O/BTlZUtEe+JG7s5YhEz608PlAHRA==", + "license": "MIT", + "engines": { + "node": ">=0.6" + } + }, + "node_modules/unpipe": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/unpipe/-/unpipe-1.0.0.tgz", + "integrity": "sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==", + "license": "MIT", + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/zod": { + "version": "3.25.76", + "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.76.tgz", + "integrity": "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ==", + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/colinhacks" + } + } + } +} diff --git a/package.json b/package.json new file mode 100644 index 0000000..5344225 --- /dev/null +++ b/package.json @@ -0,0 +1,21 @@ +{ + "name": "@strikt/diligence", + "version": "0.1.0", + "description": "MCP server enforcing diligent code changes through worker-reviewer loop", + "type": "module", + "main": "index.mjs", + "bin": { + "diligence": "./index.mjs" + }, + "scripts": { + "test": "node test/run-tests.mjs --workflow && node test/run-tests.mjs --mock", + "test:workflow": "node test/run-tests.mjs --workflow", + "test:mock": "node test/run-tests.mjs --mock", + "test:dry-run": "node test/dry-run.mjs" + }, + "dependencies": { + "@modelcontextprotocol/sdk": "^0.5.0" + }, + "keywords": ["mcp", "claude", "code-review", "diligence"], + "license": "MIT" +} diff --git a/test/compare-approaches.mjs b/test/compare-approaches.mjs new file mode 100644 index 0000000..df54d4c --- /dev/null +++ b/test/compare-approaches.mjs @@ -0,0 +1,255 @@ +#!/usr/bin/env node +/** + * Comparison Test: Naive vs Diligence Approach + * + * This script coordinates testing of both approaches: + * 1. Naive: A single agent analyzes and proposes a fix + * 2. Diligence: Worker-Reviewer loop with separate agents + * + * The test uses a real bug from the nexus codebase. + * + * Usage: + * node test/compare-approaches.mjs + */ + +import { writeFileSync, mkdirSync, existsSync } from 'fs'; +import { dirname, join } from 'path'; +import { fileURLToPath } from 'url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const RESULTS_DIR = join(__dirname, 'results'); + +// Ensure results directory exists +if (!existsSync(RESULTS_DIR)) { + mkdirSync(RESULTS_DIR, { recursive: true }); +} + +const TEST_BUG = { + id: 'B1', + name: 'Blocked users can join/answer DM voice calls', + task: `Fix bug B1: Blocked users can join DM voice calls. + +When user A blocks user B, user B should NOT be able to: +1. Answer incoming DM calls from user A +2. Start new calls to user A (already works) +3. Join DM voice channel with user A (already works in joinVoiceChannel) + +The bug is that answerDmCall() has no blocking check. + +Analyze the codebase and propose a COMPLETE fix.`, + + // What naive agents typically miss + naive_misses: [ + 'declineDmCall() also needs blocking check for consistency', + 'notifyDmCall() should filter blocked users from notifications', + 'blockUser() should clean up existing voice calls', + 'Need to subscribe to BusUserBlockChange for mid-call kick', + 'Should follow the pattern from chat.service.ts where permission=visibility, actions have separate checks', + ], + + // Required elements for a complete fix + required_elements: [ + 'answerDmCall blocking check', + 'declineDmCall blocking check', + 'notification filtering', + 'voice cleanup in blockUser()', + 'BusUserBlockChange subscription', + 'chat.service.ts pattern reference', + ], +}; + +// Prompts for the test +const NAIVE_PROMPT = `You are analyzing a bug in the nexus codebase. + +BUG: ${TEST_BUG.task} + +Your job is to: +1. Search the codebase to understand the current implementation +2. Identify all files that need changes +3. Propose a complete fix + +DO NOT use any diligence MCP tools. Just analyze and propose. + +Be thorough - check for: +- Similar patterns in the codebase +- Broker events that might be relevant +- All places where blocking should be enforced +- Edge cases (what if block happens mid-call?) + +Output your analysis and proposed fix.`; + +const WORKER_PROMPT = `You are a Worker agent in the diligence workflow. + +Your brief has been loaded with: +- The task description +- Codebase context (architecture, patterns) +- Any previous feedback + +Your job: +1. Research the codebase thoroughly +2. Trace data flow from origin to all consumers +3. Find existing patterns for similar functionality +4. Identify ALL files that need changes +5. Propose a fix with file:line citations for every claim + +IMPORTANT: +- Cite specific file:line for every claim +- Search for similar patterns (how does chat handle blocking?) +- Don't miss broker events +- Consider edge cases (mid-call blocking) + +Submit your proposal via mcp__diligence__propose when ready.`; + +const REVIEWER_PROMPT = `You are a Reviewer agent in the diligence workflow. + +Your brief has been loaded with: +- The Worker's proposal +- The task description +- Codebase context + +Your job: +1. VERIFY every claim by searching the codebase yourself +2. Check if the proposal follows existing patterns +3. Look for missing broker events or edge cases +4. Do NOT trust the Worker's citations - verify them + +For each claim in the proposal: +- Search for the file/line cited +- Verify it says what the Worker claims +- Check if there are related issues the Worker missed + +Submit your review via mcp__diligence__review: +- APPROVED if all checks pass +- NEEDS_WORK with specific issues if not + +Be strict - missing one broker event subscription can cause production bugs.`; + +function log(msg) { + const timestamp = new Date().toISOString().slice(11, 19); + console.log(`[${timestamp}] ${msg}`); +} + +function saveResult(name, content) { + const timestamp = new Date().toISOString().slice(0, 10); + const filename = `${timestamp}-${name}.md`; + const path = join(RESULTS_DIR, filename); + writeFileSync(path, content); + log(`Saved: ${path}`); + return path; +} + +// Generate the test instructions +function generateTestInstructions() { + const instructions = `# Diligence Comparison Test + +## Test Bug +**ID:** ${TEST_BUG.id} +**Name:** ${TEST_BUG.name} + +## Task +${TEST_BUG.task} + +--- + +## Phase 1: Naive Approach (WITHOUT Diligence) + +In a Claude Code session, paste this prompt: + +\`\`\` +${NAIVE_PROMPT} +\`\`\` + +Save the output as the "naive proposal". + +--- + +## Phase 2: Diligence Approach (WITH Worker-Reviewer Loop) + +### Step 1: Start the workflow +\`\`\` +mcp__diligence__start with task: "${TEST_BUG.task.split('\n')[0]}" +\`\`\` + +### Step 2: Spawn Worker Agent +\`\`\` +1. Call mcp__diligence__get_worker_brief +2. Use Task tool with subagent_type="Explore" and this prompt: + "${WORKER_PROMPT.replace(/\n/g, ' ').slice(0, 200)}..." +3. Worker should research and call mcp__diligence__propose +\`\`\` + +### Step 3: Spawn Reviewer Agent +\`\`\` +1. Call mcp__diligence__get_reviewer_brief +2. Use Task tool with subagent_type="Explore" and this prompt: + "${REVIEWER_PROMPT.replace(/\n/g, ' ').slice(0, 200)}..." +3. Reviewer should verify and call mcp__diligence__review +\`\`\` + +### Step 4: Loop or Complete +- If NEEDS_WORK: spawn new Worker with updated brief +- If APPROVED: call mcp__diligence__implement + +Save the final approved proposal as the "diligence proposal". + +--- + +## Phase 3: Compare Results + +### Checklist - What Naive Typically Misses +${TEST_BUG.naive_misses.map(m => `- [ ] ${m}`).join('\n')} + +### Required Elements for Complete Fix +${TEST_BUG.required_elements.map(e => `- [ ] ${e}`).join('\n')} + +### Scoring +- Naive proposal: Count how many required elements it includes +- Diligence proposal: Count how many required elements it includes +- Did diligence catch issues that naive missed? + +--- + +## Expected Outcome + +The naive approach will likely: +- Add blocking check to answerDmCall() only +- Miss the other 5 required elements + +The diligence approach should: +- Catch missing elements during review +- Iterate until all elements are addressed +- Produce a more complete proposal + +`; + + return instructions; +} + +// Main +async function main() { + log('Generating comparison test instructions...'); + + const instructions = generateTestInstructions(); + const path = saveResult('comparison-test-instructions', instructions); + + console.log('\n' + '='.repeat(60)); + console.log('COMPARISON TEST READY'); + console.log('='.repeat(60)); + console.log(`\nInstructions saved to: ${path}`); + console.log('\nTo run the test:'); + console.log('1. Open the instructions file'); + console.log('2. Start a Claude Code session in ~/bude/codecharm/nexus'); + console.log('3. Run Phase 1 (naive) and save the output'); + console.log('4. Run Phase 2 (diligence) and save the output'); + console.log('5. Compare using the checklist in Phase 3'); + console.log('\n'); + + // Also print the naive prompt for immediate use + console.log('='.repeat(60)); + console.log('NAIVE PROMPT (for quick testing):'); + console.log('='.repeat(60)); + console.log(NAIVE_PROMPT); + console.log('\n'); +} + +main().catch(console.error); diff --git a/test/dry-run.mjs b/test/dry-run.mjs new file mode 100644 index 0000000..f2e6664 --- /dev/null +++ b/test/dry-run.mjs @@ -0,0 +1,305 @@ +#!/usr/bin/env node +/** + * Dry Run Test Against Real Project + * + * Runs the diligence MCP server against a real project (e.g., nexus) in dry-run mode. + * This tests the full workflow without making any code changes. + * + * Usage: + * node test/dry-run.mjs --project=/path/to/nexus --task="Fix permission cache" + * node test/dry-run.mjs --project=~/bude/codecharm/nexus --scenario=blocking-voice + * + * Options: + * --project=PATH Path to the project to test against + * --task=TEXT Task description to start the workflow with + * --scenario=ID Use a predefined scenario from test/scenarios/ + * --interactive Run in interactive mode (prompts for input) + */ + +import { spawn } from 'child_process'; +import { createInterface } from 'readline'; +import { dirname, join, resolve } from 'path'; +import { fileURLToPath } from 'url'; +import { existsSync, readFileSync } from 'fs'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +// Parse CLI args +const args = process.argv.slice(2); +const projectArg = args.find(a => a.startsWith('--project=')); +const taskArg = args.find(a => a.startsWith('--task=')); +const scenarioArg = args.find(a => a.startsWith('--scenario=')); +const interactive = args.includes('--interactive') || args.includes('-i'); + +// Resolve project path +let projectPath = projectArg ? projectArg.split('=')[1] : null; +if (projectPath) { + projectPath = projectPath.replace(/^~/, process.env.HOME); + projectPath = resolve(projectPath); +} + +// Colors +const colors = { + reset: '\x1b[0m', + green: '\x1b[32m', + red: '\x1b[31m', + yellow: '\x1b[33m', + blue: '\x1b[34m', + cyan: '\x1b[36m', + dim: '\x1b[2m', + bold: '\x1b[1m', +}; + +function log(msg, color = 'reset') { + console.log(`${colors[color]}${msg}${colors.reset}`); +} + +function logSection(title) { + console.log(`\n${colors.cyan}${colors.bold}═══ ${title} ═══${colors.reset}\n`); +} + +// Load scenario +function loadScenario(id) { + const path = join(__dirname, 'scenarios', `${id}.json`); + if (!existsSync(path)) { + throw new Error(`Scenario not found: ${id}`); + } + return JSON.parse(readFileSync(path, 'utf-8')); +} + +// Simple MCP client for dry run +class DryRunClient { + constructor(projectPath) { + this.projectPath = projectPath; + this.serverPath = join(__dirname, '..', 'index.mjs'); + this.process = null; + this.requestId = 0; + this.pendingRequests = new Map(); + this.readline = null; + } + + async connect() { + return new Promise((resolve, reject) => { + this.process = spawn('node', [this.serverPath], { + stdio: ['pipe', 'pipe', 'pipe'], + cwd: this.projectPath, + }); + + this.readline = createInterface({ + input: this.process.stdout, + crlfDelay: Infinity, + }); + + this.readline.on('line', (line) => { + try { + const message = JSON.parse(line); + if (message.id !== undefined && this.pendingRequests.has(message.id)) { + const { resolve, reject } = this.pendingRequests.get(message.id); + this.pendingRequests.delete(message.id); + if (message.error) { + reject(new Error(message.error.message || JSON.stringify(message.error))); + } else { + resolve(message.result); + } + } + } catch (e) { + // Ignore non-JSON lines + } + }); + + this.process.stderr.on('data', (data) => { + // Show server stderr in debug mode + if (process.env.DEBUG) { + console.error(colors.dim + '[server] ' + data.toString() + colors.reset); + } + }); + + this.process.on('error', reject); + + // Initialize + this._send({ + jsonrpc: '2.0', + id: this.requestId++, + method: 'initialize', + params: { + protocolVersion: '0.1.0', + clientInfo: { name: 'dry-run-client', version: '1.0.0' }, + capabilities: {}, + }, + }).then(() => { + this._sendNotification('notifications/initialized', {}); + resolve(); + }).catch(reject); + }); + } + + async disconnect() { + if (this.process) { + this.process.kill('SIGTERM'); + this.process = null; + } + } + + _send(message) { + return new Promise((resolve, reject) => { + this.pendingRequests.set(message.id, { resolve, reject }); + this.process.stdin.write(JSON.stringify(message) + '\n'); + setTimeout(() => { + if (this.pendingRequests.has(message.id)) { + this.pendingRequests.delete(message.id); + reject(new Error('Request timeout')); + } + }, 30000); + }); + } + + _sendNotification(method, params) { + this.process.stdin.write(JSON.stringify({ jsonrpc: '2.0', method, params }) + '\n'); + } + + async callTool(name, args = {}) { + const result = await this._send({ + jsonrpc: '2.0', + id: this.requestId++, + method: 'tools/call', + params: { name, arguments: args }, + }); + if (result.content?.[0]?.text) { + return { text: result.content[0].text, isError: result.isError || false }; + } + return result; + } +} + +// Interactive prompt +function prompt(question) { + const rl = createInterface({ + input: process.stdin, + output: process.stdout, + }); + return new Promise(resolve => { + rl.question(question, answer => { + rl.close(); + resolve(answer); + }); + }); +} + +async function main() { + log('\n🔍 Diligence Dry Run\n', 'cyan'); + + // Validate project path + if (!projectPath) { + log('Error: --project=PATH required', 'red'); + log('\nUsage:', 'dim'); + log(' node test/dry-run.mjs --project=~/bude/codecharm/nexus --task="Fix bug"', 'dim'); + process.exit(1); + } + + if (!existsSync(projectPath)) { + log(`Error: Project path not found: ${projectPath}`, 'red'); + process.exit(1); + } + + // Check for CODEBASE_CONTEXT.md + const contextPath = join(projectPath, '.claude', 'CODEBASE_CONTEXT.md'); + if (!existsSync(contextPath)) { + log(`Warning: No .claude/CODEBASE_CONTEXT.md found in ${projectPath}`, 'yellow'); + log('The Worker and Reviewer will have limited context.', 'dim'); + } else { + log(`Found: ${contextPath}`, 'green'); + } + + // Determine task + let task; + if (scenarioArg) { + const scenarioId = scenarioArg.split('=')[1]; + const scenario = loadScenario(scenarioId); + task = scenario.task; + log(`Using scenario: ${scenario.name}`, 'blue'); + } else if (taskArg) { + task = taskArg.split('=')[1]; + } else if (interactive) { + task = await prompt('Enter task: '); + } else { + log('Error: Either --task=TEXT or --scenario=ID required', 'red'); + process.exit(1); + } + + log(`\nProject: ${projectPath}`, 'dim'); + log(`Task: ${task}\n`, 'dim'); + + // Connect to MCP server + const client = new DryRunClient(projectPath); + + try { + log('Connecting to MCP server...', 'dim'); + await client.connect(); + log('Connected!', 'green'); + + // Check initial status + logSection('Status'); + const status = await client.callTool('status'); + log(status.text, 'dim'); + + // Start the workflow + logSection('Starting Workflow'); + const startResult = await client.callTool('start', { task }); + log(startResult.text, startResult.isError ? 'red' : 'green'); + + if (startResult.isError) { + // Try to abort and restart + log('\nAborting existing workflow...', 'yellow'); + await client.callTool('abort', { reason: 'Dry run restart' }); + const retryResult = await client.callTool('start', { task }); + log(retryResult.text, retryResult.isError ? 'red' : 'green'); + } + + // Get worker brief + logSection('Worker Brief'); + const workerBrief = await client.callTool('get_worker_brief'); + + // Show truncated brief + const briefLines = workerBrief.text.split('\n'); + const truncatedBrief = briefLines.slice(0, 50).join('\n'); + log(truncatedBrief, 'dim'); + if (briefLines.length > 50) { + log(`\n... (${briefLines.length - 50} more lines)`, 'dim'); + } + + logSection('Dry Run Complete'); + + log(` +${colors.yellow}What happens next in a real session:${colors.reset} + +1. ${colors.bold}Worker Agent${colors.reset} (fresh sub-agent) receives the brief above + - Researches the codebase using Glob, Grep, Read tools + - Proposes a fix with file:line citations + - Submits via ${colors.cyan}diligence.propose${colors.reset} + +2. ${colors.bold}Reviewer Agent${colors.reset} (fresh sub-agent) verifies the proposal + - Searches codebase to verify Worker's claims + - Checks against patterns in CODEBASE_CONTEXT.md + - Submits ${colors.green}APPROVED${colors.reset} or ${colors.yellow}NEEDS_WORK${colors.reset} via ${colors.cyan}diligence.review${colors.reset} + +3. If ${colors.yellow}NEEDS_WORK${colors.reset}: Worker revises, Reviewer re-checks (up to 5 rounds) + +4. If ${colors.green}APPROVED${colors.reset}: ${colors.cyan}diligence.implement${colors.reset} → code changes → ${colors.cyan}diligence.complete${colors.reset} + +${colors.dim}This was a dry run - no code changes were made.${colors.reset} +`, 'reset'); + + // Cleanup - abort the workflow + await client.callTool('abort', { reason: 'Dry run completed' }); + log('Workflow aborted (dry run cleanup)', 'dim'); + + } finally { + await client.disconnect(); + log('\nDisconnected from MCP server.', 'dim'); + } +} + +main().catch(err => { + console.error('Error:', err.message); + process.exit(1); +}); diff --git a/test/fixture/.claude/CODEBASE_CONTEXT.md b/test/fixture/.claude/CODEBASE_CONTEXT.md new file mode 100644 index 0000000..46de116 --- /dev/null +++ b/test/fixture/.claude/CODEBASE_CONTEXT.md @@ -0,0 +1,150 @@ +# Codebase Context: Test Fixture + +This is a simplified test codebase that mirrors real-world patterns. Use this context to understand the architecture before making changes. + +## Architecture Overview + +``` +src/ +├── broker/ +│ └── events.ts # Broker event bus (Subject-based pub/sub) +├── services/ +│ ├── user-block.service.ts # Blocking logic +│ ├── voice-channel.service.ts # Voice channels and DM calls +│ ├── chat.service.ts # Chat channels and messages +│ └── team.service.ts # Team state and permission caching +└── controllers/ + └── roles.controller.ts # REST API for roles +``` + +## Critical Pattern: Broker Events + +**All state changes that affect multiple services MUST emit broker events.** + +### Available Events + +| Event | Emitted When | Expected Subscribers | +|-------|--------------|---------------------| +| `BusUserBlockChange` | User blocks/unblocks another | Voice services, DM services | +| `BusTeamRoleChange` | Role created/updated/deleted | Permission caches | +| `BusTeamMemberRoleChange` | User role assigned/removed | Permission caches | +| `BusVoiceParticipant` | User joins/leaves voice | Voice UI components | +| `BusDmCall` | DM call state changes | Call observers | + +### Pattern Example + +```typescript +// CORRECT: Emit event after state change +async updateRole(teamId, roleId, updates) { + const role = await db.update(roleId, updates); + + BusTeamRoleChange.next({ // ← MUST emit event + teamId, + roleId, + action: 'updated', + timestamp: new Date(), + }); + + return role; +} +``` + +## Critical Pattern: Permission vs Action Checks + +**Permission = Visibility. Action = Separate Check.** + +### Why This Matters + +For DM channels, blocking creates a `'read'` permission, NOT `'denied'`. The user can still SEE the DM channel, but cannot SEND messages. + +```typescript +// Permission check (for visibility) +if (isBlocked) { + return { permission: 'read', reason: 'blocked' }; // ← 'read', not 'denied' +} + +// Action check (separate from permission) +async sendMessage(userId, channel, content) { + if (await isBlockingEitherWay(userA, userB)) { + throw new Error('Cannot send messages'); // ← Separate check + } +} +``` + +### Voice Permission Pattern + +For DM channels, voice permissions are **always true**: + +```typescript +return { + permission: 'read', + voiceListen: true, // Always true for DM + voiceTalk: true, // Blocking checked on JOIN, not here + voiceWebcam: true, + voiceScreenshare: true, +}; +``` + +Blocking is enforced by **action checks** in: +- `joinVoiceChannel()` - line 33 +- `startDmCall()` - line 56 + +## Critical Pattern: Cache Invalidation + +**Caches MUST subscribe to relevant broker events.** + +### Current Bug Pattern + +```typescript +// BAD: Only clears on team switch +constructor() { + teamChange$.subscribe(() => { + this.memoizedPermissions.clear(); + }); +} + +// GOOD: Also clear on role changes +constructor() { + teamChange$.subscribe(() => this.clearCache()); + BusTeamRoleChange.subscribe(() => this.clearCache()); // ← ADD THIS + BusTeamMemberRoleChange.subscribe(() => this.clearCache()); // ← AND THIS +} +``` + +## Checklist: Before Making Changes + +### For ANY state change: + +1. [ ] Does this change affect other services? +2. [ ] Is there a broker event for this? If not, should there be? +3. [ ] Are all relevant services subscribed to the event? + +### For blocking-related changes: + +1. [ ] Is blocking checked on all relevant ACTIONS (not just permissions)? +2. [ ] What happens if block is created DURING an action (e.g., mid-call)? +3. [ ] Are broker events emitted for blocking changes? +4. [ ] Do voice services subscribe to `BusUserBlockChange`? + +### For permission/cache changes: + +1. [ ] What events should invalidate this cache? +2. [ ] Is the cache subscribed to all relevant broker events? +3. [ ] What's the TTL? Is stale data acceptable? + +## Files Quick Reference + +| File | Key Functions | Known Issues | +|------|---------------|--------------| +| `user-block.service.ts` | `blockUser()`, `unblockUser()` | Missing voice cleanup on block | +| `voice-channel.service.ts` | `answerDmCall()`, `startDmCall()` | Missing blocking check on answer | +| `team.service.ts` | `getPermission()`, `clearCache()` | Cache doesn't subscribe to role events | +| `roles.controller.ts` | `createRole()`, `deleteRole()` | Missing broker events | +| `chat.service.ts` | `getChannelPermission()` | **Reference implementation** (correct) | + +## Anti-Patterns to Avoid + +1. **Fixing in ONE place** - If blocking is checked in `startDmCall()`, it should also be in `answerDmCall()` +2. **Changing permissions** - Don't change `voiceListen: true` to `voiceListen: !isBlocked`. Use action checks instead. +3. **Forgetting broker events** - Every CRUD operation on roles/permissions should emit an event +4. **Assuming cache is fresh** - If an operation can change state, subscribe to its event diff --git a/test/fixture/src/broker/events.ts b/test/fixture/src/broker/events.ts new file mode 100644 index 0000000..7123f29 --- /dev/null +++ b/test/fixture/src/broker/events.ts @@ -0,0 +1,79 @@ +/** + * Broker Event Bus System + * + * All state changes that affect multiple services should emit broker events. + * Services subscribe to these events to maintain consistency. + */ + +import { Subject } from 'rxjs'; + +// ============================================================================ +// User Events +// ============================================================================ + +export interface UserBlockEvent { + sourceUserId: string; + targetUserId: string; + blocked: boolean; // true = blocked, false = unblocked + timestamp: Date; +} + +export const BusUserBlockChange = new Subject(); + +// ============================================================================ +// Team Events +// ============================================================================ + +export interface TeamRoleEvent { + teamId: string; + roleId: string; + action: 'created' | 'updated' | 'deleted'; + flags?: number; + timestamp: Date; +} + +export interface TeamMemberRoleEvent { + teamId: string; + userId: string; + roleId: string; + action: 'assigned' | 'removed'; + timestamp: Date; +} + +export const BusTeamRoleChange = new Subject(); +export const BusTeamMemberRoleChange = new Subject(); + +// ============================================================================ +// Voice Events +// ============================================================================ + +export interface VoiceParticipantEvent { + channelId: string; + userId: string; + action: 'joined' | 'left' | 'kicked'; + timestamp: Date; +} + +export interface DmCallEvent { + callId: string; + callerId: string; + calleeId: string; + state: 'ringing' | 'active' | 'ended' | 'declined'; + timestamp: Date; +} + +export const BusVoiceParticipant = new Subject(); +export const BusDmCall = new Subject(); + +// ============================================================================ +// Channel Events +// ============================================================================ + +export interface ChannelMemberEvent { + channelId: string; + userId: string; + hidden: boolean; + timestamp: Date; +} + +export const BusChannelMember = new Subject(); diff --git a/test/fixture/src/controllers/roles.controller.ts b/test/fixture/src/controllers/roles.controller.ts new file mode 100644 index 0000000..fbd36d8 --- /dev/null +++ b/test/fixture/src/controllers/roles.controller.ts @@ -0,0 +1,99 @@ +/** + * Roles Controller + * + * REST API for managing team roles. + */ + +import { BusTeamRoleChange } from '../broker/events'; + +interface TeamRole { + id: string; + teamId: string; + name: string; + flags: number; +} + +// In-memory store +const roles: TeamRole[] = []; + +export class RolesController { + /** + * Create a new role. + * + * BUG: Doesn't emit BusTeamRoleChange event! + * Clients won't know a new role was created. + */ + async createRole(teamId: string, name: string, flags: number): Promise { + const role: TeamRole = { + id: `role_${Date.now()}`, + teamId, + name, + flags, + }; + roles.push(role); + + // BUG: Missing broker event! + // Should emit: + // BusTeamRoleChange.next({ + // teamId, + // roleId: role.id, + // action: 'created', + // flags, + // timestamp: new Date(), + // }); + + return role; + } + + /** + * Update an existing role. + * + * Emits broker event correctly (this one is fine). + */ + async updateRole(teamId: string, roleId: string, updates: Partial): Promise { + const role = roles.find(r => r.id === roleId && r.teamId === teamId); + if (!role) throw new Error('Role not found'); + + Object.assign(role, updates); + + // This one correctly emits the event + BusTeamRoleChange.next({ + teamId, + roleId: role.id, + action: 'updated', + flags: role.flags, + timestamp: new Date(), + }); + + return role; + } + + /** + * Delete a role. + * + * BUG: Doesn't emit BusTeamRoleChange event! + * Clients won't know the role was deleted, will have stale data. + */ + async deleteRole(teamId: string, roleId: string): Promise { + const index = roles.findIndex(r => r.id === roleId && r.teamId === teamId); + if (index === -1) throw new Error('Role not found'); + + roles.splice(index, 1); + + // BUG: Missing broker event! + // Should emit: + // BusTeamRoleChange.next({ + // teamId, + // roleId, + // action: 'deleted', + // timestamp: new Date(), + // }); + } + + /** + * Get all roles for a team. + */ + async getRoles(teamId: string): Promise { + return roles.filter(r => r.teamId === teamId); + } +} diff --git a/test/fixture/src/services/chat.service.ts b/test/fixture/src/services/chat.service.ts new file mode 100644 index 0000000..ec3b18a --- /dev/null +++ b/test/fixture/src/services/chat.service.ts @@ -0,0 +1,122 @@ +/** + * Chat Service + * + * Manages chat channels and permissions. + * This shows the CORRECT pattern for handling blocking in chat. + */ + +import { UserBlockService } from './user-block.service'; + +interface ChannelPermission { + permission: 'read' | 'write' | 'admin' | 'denied'; + voiceListen: boolean; + voiceTalk: boolean; + voiceWebcam: boolean; + voiceScreenshare: boolean; + reason?: string; +} + +interface Channel { + id: string; + type: 'dm' | 'project'; + userA?: string; // For DM channels + userB?: string; // For DM channels + projectId?: string; // For project channels +} + +export class ChatService { + constructor(private userBlockService: UserBlockService) {} + + /** + * Get permission for a channel. + * + * For DM channels: + * - Blocking returns 'read' permission (can see messages, can't send) + * - This is the CORRECT pattern: permission = visibility, not action validation + * + * For voice permissions in DM: + * - voiceListen, voiceTalk, etc. are always TRUE for DM channels + * - This is INTENTIONAL: voice blocking is handled by action checks, not permissions + * + * Pattern: Permission controls VISIBILITY; Actions have SEPARATE blocking checks. + * See chat.sendMessage() for how blocking is enforced on actions. + */ + async getChannelPermission( + userId: string, + channel: Channel + ): Promise { + if (channel.type === 'dm' && channel.userA && channel.userB) { + const otherUser = channel.userA === userId ? channel.userB : channel.userA; + + // Check blocking status + const isBlockingOut = await this.userBlockService.isBlocking(userId, otherUser); + const isBlockingInc = await this.userBlockService.isBlocking(otherUser, userId); + + // Return 'read' permission for blocked DMs (can see, can't send) + // This is the correct pattern - permission controls visibility + if (isBlockingOut) { + return { + permission: 'read', + reason: 'block-user', + // Voice permissions are always true for DM - blocking is checked on actions + voiceListen: true, + voiceTalk: true, + voiceWebcam: true, + voiceScreenshare: true, + }; + } + + if (isBlockingInc) { + return { + permission: 'read', + reason: 'blocked-by-user', + voiceListen: true, + voiceTalk: true, + voiceWebcam: true, + voiceScreenshare: true, + }; + } + + // Normal DM permission + return { + permission: 'write', + voiceListen: true, + voiceTalk: true, + voiceWebcam: true, + voiceScreenshare: true, + }; + } + + // Project channel - normal permission flow + return { + permission: 'write', + voiceListen: true, + voiceTalk: true, + voiceWebcam: true, + voiceScreenshare: true, + }; + } + + /** + * Send a message to a channel. + * + * This is the CORRECT pattern for blocking enforcement: + * - Check blocking SEPARATELY from permission + * - Permission controls visibility; this check controls action + */ + async sendMessage(userId: string, channel: Channel, content: string): Promise { + // Separate blocking check for the ACTION (not permission) + if (channel.type === 'dm' && channel.userA && channel.userB) { + const isBlocked = await this.userBlockService.isBlockingEitherWay( + channel.userA, + channel.userB + ); + if (isBlocked) { + throw new Error('You cannot send messages to this user'); + } + } + + // Send the message... + console.log(`[chat] ${userId} -> ${channel.id}: ${content}`); + } +} diff --git a/test/fixture/src/services/team.service.ts b/test/fixture/src/services/team.service.ts new file mode 100644 index 0000000..004cabd --- /dev/null +++ b/test/fixture/src/services/team.service.ts @@ -0,0 +1,102 @@ +/** + * Team Service (Client-side) + * + * Manages team state including permissions and role caching. + */ + +import { BusTeamRoleChange, BusTeamMemberRoleChange } from '../broker/events'; + +interface Permission { + permission: 'read' | 'write' | 'admin' | 'denied'; + voiceListen: boolean; + voiceTalk: boolean; + voiceWebcam: boolean; + voiceScreenshare: boolean; +} + +interface Team { + id: string; + name: string; +} + +/** + * Permission cache for computed permissions. + * + * BUG: This cache only clears on team switch, not on role changes! + * When a user's roles change, their cached permissions become stale. + */ +const memoizedPermissions = new Map(); + +export class TeamService { + private currentTeam: Team | null = null; + + constructor() { + // Subscribe to team changes to clear cache + // BUG: Only clears on team SWITCH, not on role updates! + this.setupTeamChangeSubscription(); + + // BUG: Missing subscription to role changes! + // Should subscribe to BusTeamRoleChange and BusTeamMemberRoleChange + // and clear the cache when roles change. + } + + /** + * Get cached permission for a project. + */ + getPermission(projectId: string): Permission | undefined { + return memoizedPermissions.get(projectId); + } + + /** + * Set cached permission for a project. + */ + setPermission(projectId: string, permission: Permission): void { + memoizedPermissions.set(projectId, permission); + } + + /** + * Clear permission cache. + * + * Called when active team changes. + * BUG: Should also be called when roles change! + */ + clearPermissionCache(): void { + memoizedPermissions.clear(); + } + + /** + * Switch to a different team. + */ + setActiveTeam(team: Team): void { + this.currentTeam = team; + this.clearPermissionCache(); + } + + /** + * Setup subscription to team changes. + * + * BUG: Only subscribes to team SWITCH, not to: + * - BusTeamRoleChange (role created/updated/deleted) + * - BusTeamMemberRoleChange (user role assigned/removed) + */ + private setupTeamChangeSubscription(): void { + // This would normally be an observable subscription + // For now, we just clear on setActiveTeam() + } + + /** + * FIX: Should add these subscriptions: + * + * BusTeamRoleChange.subscribe(event => { + * if (event.teamId === this.currentTeam?.id) { + * this.clearPermissionCache(); + * } + * }); + * + * BusTeamMemberRoleChange.subscribe(event => { + * if (event.teamId === this.currentTeam?.id) { + * this.clearPermissionCache(); + * } + * }); + */ +} diff --git a/test/fixture/src/services/user-block.service.ts b/test/fixture/src/services/user-block.service.ts new file mode 100644 index 0000000..f7984bf --- /dev/null +++ b/test/fixture/src/services/user-block.service.ts @@ -0,0 +1,117 @@ +/** + * User Block Service + * + * Handles blocking/unblocking between users. + * Blocking affects: + * - DM visibility + * - Voice call permissions + * - Feed following + */ + +import { BusUserBlockChange } from '../broker/events'; + +interface UserBlockRecord { + sourceUserId: string; + targetUserId: string; + createdAt: Date; +} + +// In-memory store for testing +const blocks: UserBlockRecord[] = []; + +export class UserBlockService { + /** + * Block a user. + * + * When a user is blocked: + * 1. They can no longer send DMs + * 2. They are unfollowed from feeds + * 3. DM channel becomes read-only + * + * BUG: Missing voice call cleanup! + * - Should end any active DM call between these users + * - Should kick from shared voice channels + */ + async blockUser(sourceUserId: string, targetUserId: string): Promise { + // Check if already blocked + const existing = blocks.find( + b => b.sourceUserId === sourceUserId && b.targetUserId === targetUserId + ); + if (existing) return; + + // Create block record + const block: UserBlockRecord = { + sourceUserId, + targetUserId, + createdAt: new Date(), + }; + blocks.push(block); + + // Unfollow in both directions + await this.unfollowUser(sourceUserId, targetUserId); + await this.unfollowUser(targetUserId, sourceUserId); + + // Emit broker event + BusUserBlockChange.next({ + sourceUserId, + targetUserId, + blocked: true, + timestamp: block.createdAt, + }); + + // BUG: No voice cleanup here! + // Should call: voiceChannelService.endDmCallBetweenUsers(sourceUserId, targetUserId) + // Should call: voiceChannelService.kickFromSharedChannels(sourceUserId, targetUserId) + } + + /** + * Unblock a user. + */ + async unblockUser(sourceUserId: string, targetUserId: string): Promise { + const index = blocks.findIndex( + b => b.sourceUserId === sourceUserId && b.targetUserId === targetUserId + ); + if (index === -1) return; + + blocks.splice(index, 1); + + // Unhide DM channel + await this.unhideDmChannel(sourceUserId, targetUserId); + + // Emit broker event + BusUserBlockChange.next({ + sourceUserId, + targetUserId, + blocked: false, + timestamp: new Date(), + }); + } + + /** + * Check if either user has blocked the other. + */ + async isBlockingEitherWay(userA: string, userB: string): Promise { + return blocks.some( + b => + (b.sourceUserId === userA && b.targetUserId === userB) || + (b.sourceUserId === userB && b.targetUserId === userA) + ); + } + + /** + * Check if source has blocked target. + */ + async isBlocking(sourceUserId: string, targetUserId: string): Promise { + return blocks.some( + b => b.sourceUserId === sourceUserId && b.targetUserId === targetUserId + ); + } + + private async unfollowUser(userId: string, targetId: string): Promise { + // Unfollow logic... + } + + private async unhideDmChannel(userA: string, userB: string): Promise { + // Unhide DM channel logic... + } +} diff --git a/test/fixture/src/services/voice-channel.service.ts b/test/fixture/src/services/voice-channel.service.ts new file mode 100644 index 0000000..0dc9349 --- /dev/null +++ b/test/fixture/src/services/voice-channel.service.ts @@ -0,0 +1,220 @@ +/** + * Voice Channel Service + * + * Manages voice channel state, participants, and DM calls. + */ + +import { BusVoiceParticipant, BusDmCall, BusUserBlockChange } from '../broker/events'; +import { UserBlockService } from './user-block.service'; + +interface VoiceParticipant { + channelId: string; + odlUserId: string; + userId: string; + joinedAt: Date; + muted: boolean; + deafened: boolean; +} + +interface DmCall { + callId: string; + callerId: string; + calleeId: string; + channelId: string; + state: 'ringing' | 'active' | 'ended' | 'declined'; + createdAt: Date; +} + +// In-memory stores +const participants: VoiceParticipant[] = []; +const dmCalls: DmCall[] = []; + +export class VoiceChannelService { + constructor(private userBlockService: UserBlockService) {} + + /** + * Join a voice channel. + * + * For DM channels, checks blocking before allowing join. + */ + async joinVoiceChannel( + userId: string, + channelId: string, + channelType: 'dm' | 'project' + ): Promise { + // For DM channels, check blocking + if (channelType === 'dm') { + const otherUserId = this.getOtherDmUser(channelId, userId); + const isBlocked = await this.userBlockService.isBlockingEitherWay(userId, otherUserId); + if (isBlocked) { + throw new Error('Cannot join voice - user is blocked'); + } + } + + const participant: VoiceParticipant = { + channelId, + odlUserId: `odl_${userId}`, + userId, + joinedAt: new Date(), + muted: false, + deafened: false, + }; + participants.push(participant); + + BusVoiceParticipant.next({ + channelId, + userId, + action: 'joined', + timestamp: participant.joinedAt, + }); + } + + /** + * Start a DM call. + * + * Checks blocking before creating the call. + */ + async startDmCall(callerId: string, calleeId: string): Promise { + // Check blocking + const isBlocked = await this.userBlockService.isBlockingEitherWay(callerId, calleeId); + if (isBlocked) { + throw new Error('Cannot start call - user is blocked'); + } + + const call: DmCall = { + callId: `call_${Date.now()}`, + callerId, + calleeId, + channelId: `dm_${callerId}_${calleeId}`, + state: 'ringing', + createdAt: new Date(), + }; + dmCalls.push(call); + + BusDmCall.next({ + callId: call.callId, + callerId, + calleeId, + state: 'ringing', + timestamp: call.createdAt, + }); + + // Notify callee + this.notifyDmCall(call); + + return call; + } + + /** + * Answer a DM call. + * + * BUG: Missing blocking check! + * If block is created after call starts but before answer, + * the callee can still answer. + */ + async answerDmCall(callId: string, userId: string): Promise { + const call = dmCalls.find(c => c.callId === callId); + if (!call) throw new Error('Call not found'); + if (call.calleeId !== userId) throw new Error('Not the callee'); + if (call.state !== 'ringing') throw new Error('Call is not ringing'); + + // BUG: No blocking check here! + // Should check: await this.userBlockService.isBlockingEitherWay(call.callerId, call.calleeId) + + call.state = 'active'; + + BusDmCall.next({ + callId: call.callId, + callerId: call.callerId, + calleeId: call.calleeId, + state: 'active', + timestamp: new Date(), + }); + } + + /** + * Decline a DM call. + * + * BUG: Missing blocking check! + */ + async declineDmCall(callId: string, userId: string): Promise { + const call = dmCalls.find(c => c.callId === callId); + if (!call) throw new Error('Call not found'); + if (call.calleeId !== userId) throw new Error('Not the callee'); + + // BUG: No blocking check here either! + + call.state = 'declined'; + + BusDmCall.next({ + callId: call.callId, + callerId: call.callerId, + calleeId: call.calleeId, + state: 'declined', + timestamp: new Date(), + }); + } + + /** + * End a DM call between two users. + * + * Used when block is created to clean up active calls. + */ + async endDmCallBetweenUsers(userA: string, userB: string): Promise { + const call = dmCalls.find( + c => + (c.callerId === userA && c.calleeId === userB) || + (c.callerId === userB && c.calleeId === userA) + ); + if (call && call.state !== 'ended') { + call.state = 'ended'; + BusDmCall.next({ + callId: call.callId, + callerId: call.callerId, + calleeId: call.calleeId, + state: 'ended', + timestamp: new Date(), + }); + } + } + + /** + * Kick a user from a voice channel. + */ + async leaveChannel(userId: string, channelId?: string): Promise { + const index = participants.findIndex( + p => p.userId === userId && (!channelId || p.channelId === channelId) + ); + if (index !== -1) { + const participant = participants[index]; + participants.splice(index, 1); + + BusVoiceParticipant.next({ + channelId: participant.channelId, + userId, + action: 'left', + timestamp: new Date(), + }); + } + } + + /** + * Notify callee of incoming DM call. + * + * BUG: Doesn't filter for blocking! + * Blocked users still receive call notifications. + */ + private notifyDmCall(call: DmCall): void { + // BUG: Should check blocking before notifying + // if (await this.userBlockService.isBlockingEitherWay(call.callerId, call.calleeId)) return; + + // Send notification to callee... + console.log(`[notify] ${call.calleeId}: Incoming call from ${call.callerId}`); + } + + private getOtherDmUser(channelId: string, userId: string): string { + // Parse DM channel ID to get the other user + const parts = channelId.replace('dm_', '').split('_'); + return parts.find(id => id !== userId) || ''; + } +} diff --git a/test/mcp-client.mjs b/test/mcp-client.mjs new file mode 100644 index 0000000..1af7ac6 --- /dev/null +++ b/test/mcp-client.mjs @@ -0,0 +1,229 @@ +#!/usr/bin/env node +/** + * MCP Test Client + * + * Programmatically tests the diligence MCP server by: + * 1. Spawning the server as a child process + * 2. Sending JSON-RPC messages via stdio + * 3. Receiving and parsing responses + * + * Usage: + * const client = new McpClient(); + * await client.connect(); + * const result = await client.callTool('status', {}); + * await client.disconnect(); + */ + +import { spawn } from 'child_process'; +import { createInterface } from 'readline'; +import { dirname, join } from 'path'; +import { fileURLToPath } from 'url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +export class McpClient { + constructor(serverPath = join(__dirname, '..', 'index.mjs')) { + this.serverPath = serverPath; + this.process = null; + this.requestId = 0; + this.pendingRequests = new Map(); + this.readline = null; + } + + async connect() { + return new Promise((resolve, reject) => { + this.process = spawn('node', [this.serverPath], { + stdio: ['pipe', 'pipe', 'pipe'], + cwd: join(__dirname, 'fixture'), // Run in fixture directory + }); + + this.readline = createInterface({ + input: this.process.stdout, + crlfDelay: Infinity, + }); + + this.readline.on('line', (line) => { + try { + const message = JSON.parse(line); + if (message.id !== undefined && this.pendingRequests.has(message.id)) { + const { resolve, reject } = this.pendingRequests.get(message.id); + this.pendingRequests.delete(message.id); + if (message.error) { + reject(new Error(message.error.message || JSON.stringify(message.error))); + } else { + resolve(message.result); + } + } + } catch (e) { + // Ignore non-JSON lines + } + }); + + this.process.stderr.on('data', (data) => { + // Server logs to stderr + if (process.env.DEBUG) { + console.error('[server]', data.toString()); + } + }); + + this.process.on('error', reject); + this.process.on('exit', (code) => { + if (code !== 0 && code !== null) { + console.error(`Server exited with code ${code}`); + } + }); + + // Initialize the MCP connection + this._send({ + jsonrpc: '2.0', + id: this.requestId++, + method: 'initialize', + params: { + protocolVersion: '0.1.0', + clientInfo: { name: 'test-client', version: '1.0.0' }, + capabilities: {}, + }, + }).then(() => { + // Send initialized notification + this._sendNotification('notifications/initialized', {}); + resolve(); + }).catch(reject); + }); + } + + async disconnect() { + if (this.process) { + this.process.kill('SIGTERM'); + this.process = null; + } + if (this.readline) { + this.readline.close(); + this.readline = null; + } + } + + _send(message) { + return new Promise((resolve, reject) => { + if (!this.process) { + reject(new Error('Not connected')); + return; + } + this.pendingRequests.set(message.id, { resolve, reject }); + this.process.stdin.write(JSON.stringify(message) + '\n'); + + // Timeout after 10 seconds + setTimeout(() => { + if (this.pendingRequests.has(message.id)) { + this.pendingRequests.delete(message.id); + reject(new Error('Request timeout')); + } + }, 10000); + }); + } + + _sendNotification(method, params) { + if (!this.process) return; + this.process.stdin.write(JSON.stringify({ + jsonrpc: '2.0', + method, + params, + }) + '\n'); + } + + async listTools() { + const result = await this._send({ + jsonrpc: '2.0', + id: this.requestId++, + method: 'tools/list', + params: {}, + }); + return result.tools; + } + + async callTool(name, args = {}) { + const result = await this._send({ + jsonrpc: '2.0', + id: this.requestId++, + method: 'tools/call', + params: { name, arguments: args }, + }); + // Extract text from content array + if (result.content && result.content[0] && result.content[0].text) { + return { + text: result.content[0].text, + isError: result.isError || false, + }; + } + return result; + } + + // Convenience methods for common workflows + async status() { + return this.callTool('status'); + } + + async start(task) { + return this.callTool('start', { task }); + } + + async propose(proposal) { + return this.callTool('propose', { proposal }); + } + + async review(decision, reasoning) { + return this.callTool('review', { decision, reasoning }); + } + + async getWorkerBrief() { + return this.callTool('get_worker_brief'); + } + + async getReviewerBrief() { + return this.callTool('get_reviewer_brief'); + } + + async implement() { + return this.callTool('implement'); + } + + async complete(summary) { + return this.callTool('complete', { summary }); + } + + async abort(reason) { + return this.callTool('abort', { reason }); + } + + async approve(reason) { + return this.callTool('approve', { reason }); + } +} + +// CLI usage for quick testing +if (process.argv[1] === fileURLToPath(import.meta.url)) { + const client = new McpClient(); + + try { + console.log('Connecting to MCP server...'); + await client.connect(); + console.log('Connected!\n'); + + // List tools + const tools = await client.listTools(); + console.log('Available tools:'); + tools.forEach(t => console.log(` - ${t.name}: ${t.description.slice(0, 60)}...`)); + console.log(); + + // Check status + const status = await client.status(); + console.log('Status:'); + console.log(status.text); + + await client.disconnect(); + console.log('\nDisconnected.'); + } catch (err) { + console.error('Error:', err.message); + await client.disconnect(); + process.exit(1); + } +} diff --git a/test/results/2026-01-22-comparison-report.md b/test/results/2026-01-22-comparison-report.md new file mode 100644 index 0000000..f421e83 --- /dev/null +++ b/test/results/2026-01-22-comparison-report.md @@ -0,0 +1,183 @@ +# Diligence vs Naive Approach: Comparison Report + +**Date:** 2026-01-22 +**Test Bug:** B1 - Blocked users can answer DM voice calls +**Project:** nexus (~/bude/codecharm/nexus) + +--- + +## Executive Summary + +| Metric | Naive Approach | Diligence Approach | +|--------|----------------|-------------------| +| Bug verified exists? | ✅ Yes | ✅ Yes | +| Correct line numbers? | ✅ Yes (1050, 965) | ✅ Worker correct | +| Found declineDmCall gap? | ✅ Yes | ⚠️ Reviewer found it | +| Found notification filtering? | ✅ Yes | ⚠️ Reviewer found it | +| Found blockUser cleanup? | ✅ Yes | ⚠️ Reviewer found it | +| Reviewer caught errors? | N/A | ✅ Caught line number discrepancy* | + +*Reviewer searched wrong codebase (test fixture instead of nexus), but the PROCESS of verification worked. + +--- + +## Bug Verification: CONFIRMED REAL + +**Evidence from actual nexus code:** + +```typescript +// startDmCall (lines 965-969) - HAS blocking check ✅ +const blocked = await this.userBlockService.isBlockingEitherWay(callerId, calleeId); +if (blocked) { + throw new UserError('Cannot call this user'); +} + +// answerDmCall (line 1050+) - NO blocking check ❌ +async answerDmCall(callId: MongoId): Promise<{ token: string; channelId: string }> { + // Only checks: auth, call exists, state=ringing, user=callee, not expired + // MISSING: blocking check +} + +// declineDmCall (line 1115+) - NO blocking check ❌ +async declineDmCall(callId: MongoId): Promise { + // Only checks: auth, call exists, state=ringing, user=callee + // MISSING: blocking check +} +``` + +**Conclusion:** Bug B1 is REAL. Both approaches correctly identified it. + +--- + +## Detailed Comparison + +### Naive Approach Output + +The naive agent (single Explore agent) produced: +- ✅ Root cause analysis +- ✅ Correct file identification (`voice-channel.rpc.ts`) +- ✅ Correct line numbers (965-969, 1050-1109) +- ✅ Compared startDmCall vs answerDmCall patterns +- ✅ Identified additional issues: + - declineDmCall needs blocking check + - notifyDmCall needs filtering + - blockUser() needs voice cleanup + - BusUserBlockChange subscription needed +- ✅ Implementation order recommendation +- ✅ Edge cases considered + +**Quality:** Surprisingly thorough. Searched actual code, cited lines, found patterns. + +### Diligence Approach Output + +**Worker:** +- ✅ Verified bug exists by searching code +- ✅ Correct line numbers +- ✅ Cited exact file:line +- ✅ Proposed fix matching startDmCall pattern + +**Reviewer:** +- ✅ Attempted independent verification (correct process) +- ❌ Searched wrong codebase (test fixture, 220 lines) +- ✅ Noticed discrepancy ("file only 220 lines, Worker cited 1050") +- ✅ Found additional gaps (declineDmCall, notification filtering) +- ✅ Gave NEEDS_WORK decision with specific issues + +**Quality:** Process worked correctly. Reviewer caught a "discrepancy" (even if due to searching wrong place). + +--- + +## Key Findings + +### 1. Both approaches verified the bug exists + +Neither approach blindly trusted the task description. Both: +- Searched for answerDmCall implementation +- Compared with startDmCall pattern +- Verified blocking check is actually missing + +### 2. Naive approach was surprisingly thorough + +The single agent produced analysis comparable to the Worker. This suggests: +- For bugs with clear descriptions, naive approach may suffice +- The value of diligence may be in more ambiguous tasks + +### 3. Reviewer process works, but needs correct context + +The Reviewer: +- Did NOT rubber-stamp the Worker's proposal +- Actually searched and found discrepancies +- Caught additional issues the Worker missed +- BUT searched the wrong codebase due to test setup + +### 4. Test setup flaw identified + +The Reviewer searched `/Users/marc/bude/strikt/diligence/test/fixture/` instead of `~/bude/codecharm/nexus`. This is because: +- Agents were spawned from the diligence project +- They defaulted to searching the current working directory + +**Fix needed:** In real usage, diligence MCP runs IN the target project, so this wouldn't happen. + +--- + +## What Diligence Should Catch That Naive Might Miss + +Based on this test, diligence adds value when: + +1. **Worker makes incorrect claims** - Reviewer verifies by searching +2. **Worker misses related issues** - Reviewer's independent search finds them +3. **Task description is wrong** - Both should verify bug exists, not assume +4. **Patterns are misunderstood** - Reviewer checks against CODEBASE_CONTEXT.md + +### This test showed: + +| Scenario | Did Diligence Help? | +|----------|---------------------| +| Verify bug exists | Both approaches did this | +| Catch wrong line numbers | Reviewer caught discrepancy ✅ | +| Find additional gaps | Reviewer found more than Worker ✅ | +| Prevent hallucinated bugs | Would catch if Reviewer searched correctly | + +--- + +## Recommendations + +### 1. Run real test in nexus project + +Start a Claude Code session IN nexus and test the full workflow there. This ensures: +- MCP server runs in correct project +- Agents search the right codebase +- Full context from CODEBASE_CONTEXT.md is loaded + +### 2. Test with a more ambiguous bug + +B1 is well-documented. Test with something like: +- "Voice seems laggy sometimes" +- "Users report weird permission issues" + +These require more investigation to even determine if there's a bug. + +### 3. Test if diligence catches non-bugs + +Give a task for a bug that doesn't exist. Does the workflow correctly identify "no bug found"? + +### 4. Add explicit codebase path to Worker/Reviewer briefs + +The briefs should specify: "Search in /path/to/project, not elsewhere" + +--- + +## Conclusion + +**Does diligence work?** Yes, the process is sound: +- Worker researches and proposes +- Reviewer independently verifies +- Discrepancies are caught +- Multiple rounds can iterate + +**Is it better than naive?** For this test, similar results. But: +- Reviewer caught additional issues Worker missed +- Process would catch hallucinated bugs if Reviewer searches correctly +- Real value may be in more complex/ambiguous tasks + +**Next step:** Run a real test in a Claude Code session in nexus, with a more ambiguous task. diff --git a/test/run-tests.mjs b/test/run-tests.mjs new file mode 100644 index 0000000..01fa170 --- /dev/null +++ b/test/run-tests.mjs @@ -0,0 +1,415 @@ +#!/usr/bin/env node +/** + * Diligence Test Runner + * + * Runs end-to-end tests of the Worker-Reviewer loop. + * + * Modes: + * --workflow Test MCP workflow mechanics only (no AI) + * --mock Use mock Worker/Reviewer responses + * --live Use real Claude API for Worker/Reviewer (requires ANTHROPIC_API_KEY) + * + * Usage: + * node test/run-tests.mjs --workflow + * node test/run-tests.mjs --mock --scenario=blocking-voice + * node test/run-tests.mjs --live --scenario=permission-cache + */ + +import { McpClient } from './mcp-client.mjs'; +import { readFileSync, existsSync, unlinkSync } from 'fs'; +import { dirname, join } from 'path'; +import { fileURLToPath } from 'url'; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +// Parse CLI args +const args = process.argv.slice(2); +const mode = args.find(a => ['--workflow', '--mock', '--live'].includes(a)) || '--workflow'; +const scenarioArg = args.find(a => a.startsWith('--scenario=')); +const scenarioId = scenarioArg ? scenarioArg.split('=')[1] : null; +const verbose = args.includes('--verbose') || args.includes('-v'); + +// Colors for output +const colors = { + reset: '\x1b[0m', + green: '\x1b[32m', + red: '\x1b[31m', + yellow: '\x1b[33m', + blue: '\x1b[34m', + dim: '\x1b[2m', +}; + +function log(msg, color = 'reset') { + console.log(`${colors[color]}${msg}${colors.reset}`); +} + +function logSection(title) { + console.log(`\n${colors.blue}=== ${title} ===${colors.reset}`); +} + +// Load scenario +function loadScenario(id) { + const path = join(__dirname, 'scenarios', `${id}.json`); + if (!existsSync(path)) { + throw new Error(`Scenario not found: ${id}`); + } + return JSON.parse(readFileSync(path, 'utf-8')); +} + +// Load all scenarios +function loadAllScenarios() { + const index = JSON.parse(readFileSync(join(__dirname, 'scenarios', 'index.json'), 'utf-8')); + return index.scenarios.map(s => loadScenario(s.id)); +} + +// Clean up state file before test +function cleanState() { + const stateFile = join(__dirname, 'fixture', '.claude', '.diligence-state.json'); + if (existsSync(stateFile)) { + unlinkSync(stateFile); + } +} + +// ============================================================================ +// Workflow Tests (no AI, just MCP mechanics) +// ============================================================================ + +async function testWorkflow() { + logSection('Workflow Tests'); + + const client = new McpClient(); + let passed = 0; + let failed = 0; + + try { + cleanState(); + await client.connect(); + log('Connected to MCP server', 'green'); + + // Test 1: Status in conversation phase + { + const result = await client.status(); + const ok = result.text.includes('Phase: conversation'); + log(` [${ok ? 'PASS' : 'FAIL'}] Initial status is conversation`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + // Test 2: Start workflow + { + const result = await client.start('Test task'); + const ok = result.text.includes('researching') && result.text.includes('Round: 1/5'); + log(` [${ok ? 'PASS' : 'FAIL'}] Start transitions to researching`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + // Test 3: Cannot start again while in progress + { + const result = await client.start('Another task'); + const ok = result.isError && result.text.includes('Already in'); + log(` [${ok ? 'PASS' : 'FAIL'}] Cannot start while in progress`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + // Test 4: Get worker brief + { + const result = await client.getWorkerBrief(); + const ok = result.text.includes('Worker Brief') && result.text.includes('Test task'); + log(` [${ok ? 'PASS' : 'FAIL'}] Worker brief contains task`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + // Test 5: Submit proposal + { + const result = await client.propose('## Analysis\n\nProposed fix here'); + const ok = result.text.includes('Proposal submitted'); + log(` [${ok ? 'PASS' : 'FAIL'}] Proposal submitted`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + // Test 6: Get reviewer brief includes proposal + { + const result = await client.getReviewerBrief(); + const ok = result.text.includes('Reviewer Brief') && result.text.includes('Proposed fix here'); + log(` [${ok ? 'PASS' : 'FAIL'}] Reviewer brief contains proposal`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + // Test 7: Review with NEEDS_WORK + { + const result = await client.review('NEEDS_WORK', 'Missing broker event handling'); + const ok = result.text.includes('NEEDS_WORK') && result.text.includes('Round 2/5'); + log(` [${ok ? 'PASS' : 'FAIL'}] NEEDS_WORK increments round`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + // Test 8: Worker brief now includes feedback + { + const result = await client.getWorkerBrief(); + const ok = result.text.includes('Previous Feedback') && result.text.includes('broker event'); + log(` [${ok ? 'PASS' : 'FAIL'}] Worker brief includes previous feedback`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + // Test 9: Submit revised proposal + { + const result = await client.propose('## Revised\n\nNow with broker events'); + const ok = result.text.includes('Proposal submitted'); + log(` [${ok ? 'PASS' : 'FAIL'}] Revised proposal submitted`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + // Test 10: Review with APPROVED + { + const result = await client.review('APPROVED', 'All checks pass'); + const ok = result.text.includes('APPROVED') && result.text.includes('2 round'); + log(` [${ok ? 'PASS' : 'FAIL'}] APPROVED after review`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + // Test 11: Status shows approved + { + const result = await client.status(); + const ok = result.text.includes('Phase: approved'); + log(` [${ok ? 'PASS' : 'FAIL'}] Status shows approved phase`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + // Test 12: Implement + { + const result = await client.implement(); + const ok = result.text.includes('Implementation phase'); + log(` [${ok ? 'PASS' : 'FAIL'}] Implement starts implementation`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + // Test 13: Complete + { + const result = await client.complete('Fixed the bug'); + const ok = result.text.includes('Complete') && result.text.includes('Reset to conversation'); + log(` [${ok ? 'PASS' : 'FAIL'}] Complete resets workflow`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + // Test 14: Back to conversation + { + const result = await client.status(); + const ok = result.text.includes('Phase: conversation'); + log(` [${ok ? 'PASS' : 'FAIL'}] Back to conversation phase`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + // Test 15: Abort works + { + await client.start('Task to abort'); + const result = await client.abort('Changed my mind'); + const ok = result.text.includes('Aborted') && result.text.includes('Reset to conversation'); + log(` [${ok ? 'PASS' : 'FAIL'}] Abort resets workflow`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + // Test 16: Max rounds enforcement + { + await client.start('Task for max rounds'); + for (let i = 0; i < 5; i++) { + await client.propose(`Proposal ${i + 1}`); + if (i < 4) { + await client.review('NEEDS_WORK', `Feedback ${i + 1}`); + } + } + const result = await client.review('NEEDS_WORK', 'Still not good'); + const ok = result.text.includes('MAX ROUNDS') && result.text.includes('reset'); + log(` [${ok ? 'PASS' : 'FAIL'}] Max rounds resets workflow`, ok ? 'green' : 'red'); + ok ? passed++ : failed++; + } + + log(`\nWorkflow tests: ${passed} passed, ${failed} failed`, failed ? 'red' : 'green'); + return failed === 0; + + } finally { + await client.disconnect(); + cleanState(); + } +} + +// ============================================================================ +// Mock Tests (predefined Worker/Reviewer responses) +// ============================================================================ + +async function testWithMocks(scenario) { + logSection(`Mock Test: ${scenario.name}`); + + const client = new McpClient(); + + try { + cleanState(); + await client.connect(); + + // Start the workflow + await client.start(scenario.task); + log(`Started task: ${scenario.task.slice(0, 60)}...`, 'dim'); + + // Round 1: Worker submits naive fix + const naiveProposal = `## Analysis + +${scenario.naive_fix.description} + +### Changes +${scenario.naive_fix.changes.map(c => `- ${c.file}: ${c.change}`).join('\n')} +`; + + await client.propose(naiveProposal); + log('Worker submitted naive proposal', 'dim'); + + // Round 1: Reviewer catches issues + const issues = scenario.naive_fix.issues; + const reviewFeedback = `NEEDS_WORK + +Issues found: +${issues.map((issue, i) => `${i + 1}. ${issue}`).join('\n')} + +The proposal misses critical aspects. Please address all issues. +`; + + await client.review('NEEDS_WORK', reviewFeedback); + log(`Reviewer found ${issues.length} issues`, 'yellow'); + + // Round 2: Worker submits correct fix + const correctProposal = `## Revised Analysis + +${scenario.correct_fix.description} + +### Required Changes + +${scenario.correct_fix.required_changes.map(c => + `#### ${c.file}:${c.function} +- ${c.change} +- Reference: ${c.line_reference}` +).join('\n\n')} + +### Broker Subscriptions + +${scenario.correct_fix.required_broker_subscriptions.map(s => + `- ${s.service} subscribes to ${s.event}: ${s.action}` +).join('\n')} + +### Pattern References + +${scenario.correct_fix.pattern_references.map(p => `- ${p}`).join('\n')} +`; + + await client.propose(correctProposal); + log('Worker submitted revised proposal', 'dim'); + + // Round 2: Reviewer approves + await client.review('APPROVED', 'All required changes identified. Pattern followed correctly.'); + log('Reviewer approved', 'green'); + + // Validate the proposal + const validation = scenario.validation_criteria; + let validationPassed = true; + + log('\nValidation:', 'blue'); + + // Check must_mention + for (const item of validation.must_mention || []) { + const found = correctProposal.toLowerCase().includes(item.toLowerCase()); + log(` [${found ? 'PASS' : 'FAIL'}] Mentions: ${item}`, found ? 'green' : 'red'); + if (!found) validationPassed = false; + } + + // Check pattern reference + if (validation.should_reference_pattern) { + const found = correctProposal.includes(validation.should_reference_pattern); + log(` [${found ? 'PASS' : 'FAIL'}] References pattern: ${validation.should_reference_pattern}`, found ? 'green' : 'red'); + if (!found) validationPassed = false; + } + + // Complete the workflow + await client.implement(); + await client.complete('Test completed'); + + log(`\nMock test ${scenario.id}: ${validationPassed ? 'PASSED' : 'FAILED'}`, validationPassed ? 'green' : 'red'); + return validationPassed; + + } finally { + await client.disconnect(); + cleanState(); + } +} + +// ============================================================================ +// Live Tests (real Claude API) +// ============================================================================ + +async function testLive(scenario) { + logSection(`Live Test: ${scenario.name}`); + + const apiKey = process.env.ANTHROPIC_API_KEY; + if (!apiKey) { + log('ANTHROPIC_API_KEY not set. Skipping live test.', 'yellow'); + return null; + } + + log('Live tests with Claude API not yet implemented.', 'yellow'); + log('This would spawn real Worker and Reviewer sub-agents.', 'dim'); + + // TODO: Implement Claude API integration + // 1. Get worker brief + // 2. Call Claude API with worker prompt + brief + // 3. Submit Claude's proposal + // 4. Get reviewer brief + // 5. Call Claude API with reviewer prompt + brief + // 6. Submit Claude's review + // 7. Loop until approved or max rounds + + return null; +} + +// ============================================================================ +// Main +// ============================================================================ + +async function main() { + log('\n🔍 Diligence Test Runner\n', 'blue'); + log(`Mode: ${mode}`, 'dim'); + + let allPassed = true; + + switch (mode) { + case '--workflow': + allPassed = await testWorkflow(); + break; + + case '--mock': { + const scenarios = scenarioId ? [loadScenario(scenarioId)] : loadAllScenarios(); + for (const scenario of scenarios) { + const passed = await testWithMocks(scenario); + if (!passed) allPassed = false; + } + break; + } + + case '--live': { + const scenarios = scenarioId ? [loadScenario(scenarioId)] : loadAllScenarios(); + for (const scenario of scenarios) { + const result = await testLive(scenario); + if (result === false) allPassed = false; + } + break; + } + } + + console.log(); + if (allPassed) { + log('✓ All tests passed', 'green'); + process.exit(0); + } else { + log('✗ Some tests failed', 'red'); + process.exit(1); + } +} + +main().catch(err => { + console.error('Error:', err); + process.exit(1); +}); diff --git a/test/scenarios/blocking-voice.json b/test/scenarios/blocking-voice.json new file mode 100644 index 0000000..8f988e7 --- /dev/null +++ b/test/scenarios/blocking-voice.json @@ -0,0 +1,78 @@ +{ + "id": "blocking-voice", + "name": "Blocking + Voice Bug", + "description": "Fix blocked users can answer DM voice calls", + + "task": "Fix: blocked users can still answer DM voice calls. When user A blocks user B, user B should not be able to answer calls from user A.", + + "naive_fix": { + "description": "Add blocking check to answerDmCall()", + "changes": [ + { + "file": "src/services/voice-channel.service.ts", + "function": "answerDmCall", + "change": "Add isBlockingEitherWay check before answering" + } + ], + "issues": [ + "Doesn't handle block created DURING active call", + "Doesn't clean up existing calls when block is created", + "Blocked users still receive call notifications" + ] + }, + + "correct_fix": { + "description": "Full blocking enforcement following chat.service.ts pattern", + "required_changes": [ + { + "file": "src/services/voice-channel.service.ts", + "function": "answerDmCall", + "change": "Add isBlockingEitherWay check", + "line_reference": "line 75" + }, + { + "file": "src/services/voice-channel.service.ts", + "function": "declineDmCall", + "change": "Add isBlockingEitherWay check (consistency)", + "line_reference": "line 93" + }, + { + "file": "src/services/voice-channel.service.ts", + "function": "notifyDmCall", + "change": "Filter notifications for blocked users", + "line_reference": "line 138" + }, + { + "file": "src/services/user-block.service.ts", + "function": "blockUser", + "change": "Add voice cleanup: endDmCallBetweenUsers()", + "line_reference": "line 33" + } + ], + "required_broker_subscriptions": [ + { + "service": "voice-channel.service.ts", + "event": "BusUserBlockChange", + "action": "Kick users from DM voice when block is created mid-call" + } + ], + "pattern_references": [ + "chat.service.ts:sendMessage - shows correct action check pattern", + "chat.service.ts:getChannelPermission - shows permission vs action separation" + ] + }, + + "validation_criteria": { + "must_mention": [ + "answerDmCall", + "BusUserBlockChange", + "user-block.service", + "notifyDmCall" + ], + "must_not_change": [ + "voiceListen permission values", + "voiceTalk permission values" + ], + "should_reference_pattern": "chat.service.ts" + } +} diff --git a/test/scenarios/index.json b/test/scenarios/index.json new file mode 100644 index 0000000..91c63ca --- /dev/null +++ b/test/scenarios/index.json @@ -0,0 +1,21 @@ +{ + "scenarios": [ + { + "id": "blocking-voice", + "file": "blocking-voice.json", + "difficulty": "medium", + "tags": ["blocking", "voice", "broker-events"] + }, + { + "id": "permission-cache", + "file": "permission-cache.json", + "difficulty": "medium", + "tags": ["cache", "permissions", "broker-events"] + } + ], + "metadata": { + "version": "1.0.0", + "fixture_path": "../fixture", + "description": "Test scenarios for diligence MCP server" + } +} diff --git a/test/scenarios/permission-cache.json b/test/scenarios/permission-cache.json new file mode 100644 index 0000000..ae39f67 --- /dev/null +++ b/test/scenarios/permission-cache.json @@ -0,0 +1,81 @@ +{ + "id": "permission-cache", + "name": "Permission Cache Invalidation Bug", + "description": "Fix permission cache not invalidating when roles change", + + "task": "Fix: permission cache doesn't invalidate when user roles change. Users see stale permissions for hours after their roles are updated.", + + "naive_fix": { + "description": "Add .clear() call somewhere", + "changes": [ + { + "file": "src/services/team.service.ts", + "function": "somewhere", + "change": "Call memoizedPermissions.clear()" + } + ], + "issues": [ + "Doesn't identify WHEN cache should clear", + "Missing BusTeamRoleChange subscription", + "Missing BusTeamMemberRoleChange subscription", + "Doesn't fix roles.controller.ts missing broker events" + ] + }, + + "correct_fix": { + "description": "Subscribe to all role-related broker events", + "required_changes": [ + { + "file": "src/services/team.service.ts", + "function": "constructor", + "change": "Subscribe to BusTeamRoleChange, clear cache on event", + "line_reference": "line 30" + }, + { + "file": "src/services/team.service.ts", + "function": "constructor", + "change": "Subscribe to BusTeamMemberRoleChange, clear cache on event", + "line_reference": "line 30" + }, + { + "file": "src/controllers/roles.controller.ts", + "function": "createRole", + "change": "Emit BusTeamRoleChange event after creating role", + "line_reference": "line 22" + }, + { + "file": "src/controllers/roles.controller.ts", + "function": "deleteRole", + "change": "Emit BusTeamRoleChange event before deleting role", + "line_reference": "line 62" + } + ], + "required_broker_subscriptions": [ + { + "service": "team.service.ts", + "event": "BusTeamRoleChange", + "action": "Clear permission cache" + }, + { + "service": "team.service.ts", + "event": "BusTeamMemberRoleChange", + "action": "Clear permission cache" + } + ], + "pattern_references": [ + "roles.controller.ts:updateRole - shows correct broker event emission" + ] + }, + + "validation_criteria": { + "must_mention": [ + "BusTeamRoleChange", + "BusTeamMemberRoleChange", + "createRole", + "deleteRole", + "team.service" + ], + "must_identify_root_cause": "Cache only clears on team switch, not role changes", + "should_reference_pattern": "roles.controller.ts:updateRole" + } +}