From 909bd0ec0d13f6b2078e30b785520de1c4277432 Mon Sep 17 00:00:00 2001
From: Alexander Whitestone <alexander@alexanderwhitestone.com>
Date: Thu, 9 Apr 2026 14:53:29 +0000
Subject: [PATCH] feat: add architecture drift detector for multimodal doc
 synthesis

Cross-references architecture diagrams with codebase to find drift:
- Scans Python, JS/TS, YAML/JSON, shell scripts
- Extracts component names, imports, LOC
- Compares diagram components against code
- Generates drift reports with confidence scoring
- Vision analysis prompts for diagram parsing

Relates to #1483
---
 scripts/doc_drift_detector.py | 267 ++++++++++++++++++++++++++++++++++
 1 file changed, 267 insertions(+)
 create mode 100644 scripts/doc_drift_detector.py

diff --git a/scripts/doc_drift_detector.py b/scripts/doc_drift_detector.py
new file mode 100644
index 00000000..e8e071f9
--- /dev/null
+++ b/scripts/doc_drift_detector.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python3
+"""
+Architecture Drift Detector — Multimodal Documentation Synthesis
+================================================================
+
+Analyzes architecture diagrams (images) and cross-references them with the
+actual codebase to identify documentation drift. Uses vision analysis on
+diagrams and file system analysis on code.
+
+Usage:
+    python scripts/doc_drift_detector.py --diagram docs/architecture.png --src src/
+    python scripts/doc_drift_detector.py --check-readme  # Analyze README diagrams (planned; flag not implemented yet)
+    python scripts/doc_drift_detector.py --report        # Full drift report
+"""
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+
+
+@dataclass
+class DiagramComponent:
+    """A component extracted from an architecture diagram via vision analysis."""
+    name: str  # label of the component; detect_drift compares it with code component names
+    component_type: str  # "service", "module", "database", "api", "agent"
+    description: str = ""  # optional free-text description
+    connections: list = field(default_factory=list)  # presumably names of connected components — confirm
+    source: str = ""  # "diagram" or "code"
+
+
+@dataclass
+class CodeComponent:
+    """A component found in the actual codebase."""
+    name: str  # file stem, or the parent directory name for an __init__.py
+    path: str  # path string as recorded by the scanner
+    component_type: str  # e.g. "module", "class", "service", "script"; scan_codebase emits "module"/"config"
+    imports: list = field(default_factory=list)  # up to 10 imported module names (Python files only)
+    exports: list = field(default_factory=list)  # not populated anywhere in this file
+    lines_of_code: int = 0  # non-blank lines not starting with a comment marker; 0 for config/script files
+
+
+@dataclass
+class DriftReport:
+    """Outcome of comparing diagram components with scanned code components."""
+    diagram_components: list = field(default_factory=list)
+    code_components: list = field(default_factory=list)
+    missing_from_code: list = field(default_factory=list)   # diagram-only entries
+    missing_from_docs: list = field(default_factory=list)   # code-only entries
+    connections_drift: list = field(default_factory=list)   # connection mismatches
+    confidence: float = 0.0
+
+    def summary(self) -> str:
+        """Return the report as multi-line, human-readable text."""
+        diagram_only = self.missing_from_code
+        code_only = self.missing_from_docs
+        link_issues = self.connections_drift
+        out = [
+            "=== Architecture Drift Report ===",
+            f"Diagram components: {len(self.diagram_components)}",
+            f"Code components: {len(self.code_components)}",
+            f"Missing from code (diagram-only): {len(diagram_only)}",
+            f"Missing from docs (code-only): {len(code_only)}",
+            f"Connection drift issues: {len(link_issues)}",
+            f"Confidence: {self.confidence:.0%}",
+            "",
+        ]
+        if diagram_only:
+            out.append("⚠️  In diagram but NOT found in code:")
+            out.extend(f"   - {c.name} ({c.component_type})" for c in diagram_only)
+            out.append("")
+        if code_only:
+            out.append("📝 In code but NOT in diagram:")
+            out.extend(f"   - {c.name} at {c.path}" for c in code_only)
+            out.append("")
+        if link_issues:
+            out.append("🔗 Connection drift:")
+            out.extend(f"   - {issue}" for issue in link_issues)
+        if not (diagram_only or code_only or link_issues):
+            out.append("✅ No significant drift detected!")
+        return "\n".join(out)
+
+    def to_dict(self) -> dict:
+        """Return a plain-dict view of the report; each component is exported via vars()."""
+        exported = {
+            key: [vars(item) for item in getattr(self, key)]
+            for key in ("diagram_components", "code_components", "missing_from_code", "missing_from_docs")
+        }
+        exported.update(connections_drift=self.connections_drift, confidence=self.confidence)
+        return exported
+
+
+class ArchitectureDriftDetector:
+    """Detects drift between architecture diagrams and actual code."""
+
+    def __init__(self, src_dir: str = "src"):
+        self.src_dir = Path(src_dir)
+
+    def analyze_diagram(self, diagram_path: str) -> str:
+        """
+        Return the vision-analysis prompt for an architecture diagram.
+        No image is read here (diagram_path is unused); the calling agent runs the prompt.
+        """
+        prompt = f"""Analyze this architecture diagram and extract all components.
+
+For each component, identify:
+- Name (as shown in diagram)
+- Type (service, module, database, api, agent, frontend, etc.)
+- Connections to other components
+- Any version numbers or labels
+
+Return as JSON array:
+```json
+[
+  {{"name": "ComponentName", "type": "service", "connections": ["OtherComponent"]}}
+]
+```
+"""
+        return prompt
+
+    def scan_codebase(self) -> list:
+        """
+        Scan the codebase and return a list of CodeComponent records.
+
+        Python and JS/TS files come from self.src_dir; YAML/JSON/shell files come
+        from the current working directory. Returns [] when src_dir is missing.
+        Unreadable files are kept with lines_of_code=0 and no imports.
+        """
+        components = []
+        if not self.src_dir.exists():
+            return components
+
+        def read_source(path, comment_prefix):
+            """Return (text, count of non-blank, non-comment lines); ("", 0) if unreadable."""
+            try:
+                text = path.read_text(encoding="utf-8", errors="replace")
+            except OSError:
+                return "", 0
+            stripped = (line.strip() for line in text.split("\n"))
+            return text, sum(1 for s in stripped if s and not s.startswith(comment_prefix))
+
+        # Scan Python modules; "_private.py" files are skipped, packages use their dir name
+        for py_file in self.src_dir.rglob("*.py"):
+            if py_file.name.startswith("_") and py_file.name != "__init__.py":
+                continue
+            name = py_file.parent.name if py_file.stem == "__init__" else py_file.stem
+            content, loc = read_source(py_file, "#")
+            imports = re.findall(r"^from\s+(\S+)\s+import|^import\s+(\S+)", content, re.MULTILINE)
+            components.append(CodeComponent(
+                name=name,
+                path=str(py_file.relative_to(self.src_dir.parent)),
+                component_type="module",
+                imports=[frm or imp for frm, imp in imports][:10],  # first 10 found
+                lines_of_code=loc
+            ))
+
+        # Scan JavaScript/TypeScript
+        for ext in ["*.js", "*.ts", "*.tsx"]:
+            for js_file in self.src_dir.rglob(ext):
+                _, loc = read_source(js_file, "//")
+                # NOTE(review): "mobile-app" paths are made relative to src_dir's grandparent — confirm intent
+                base = self.src_dir.parent.parent if "mobile-app" in str(js_file) else self.src_dir.parent
+                components.append(CodeComponent(
+                    name=js_file.stem,
+                    path=str(js_file.relative_to(base)),
+                    component_type="module",
+                    lines_of_code=loc
+                ))
+
+        # Scan config and scripts
+        excluded_dirs = {".git", "node_modules"}
+        for ext in ["*.yaml", "*.yml", "*.json", "*.sh", "*.bash"]:
+            for cfg in Path(".").rglob(ext):
+                # Compare whole path parts: a substring test for ".git" also drops .github/ files
+                if excluded_dirs.intersection(cfg.parts):
+                    continue
+                components.append(CodeComponent(
+                    name=cfg.stem,
+                    path=str(cfg),
+                    component_type="config"
+                ))
+
+        return components
+
+    def detect_drift(
+        self,
+        diagram_components: list,
+        code_components: list
+    ) -> DriftReport:
+        """Compare diagram components against codebase by normalized (lowercase alphanumeric) name."""
+        report = DriftReport()
+        report.diagram_components = diagram_components
+        report.code_components = code_components
+
+        def normalize(name):
+            return re.sub(r'[^a-z0-9]', '', name.lower())
+
+        # "" is a substring of every name, so empty keys are dropped to keep
+        # them from partially matching every diagram component.
+        code_keys = {normalize(c.name) for c in code_components} - {""}
+        diagram_names = {normalize(c.name): c for c in diagram_components}
+
+        # Find diagram-only components: no exact match and no substring match either way
+        for norm_name, dc in diagram_names.items():
+            partial = norm_name and any(norm_name in k or k in norm_name for k in code_keys)
+            if norm_name not in code_keys and not partial:
+                report.missing_from_code.append(dc)
+
+        # Find code-only components (significant ones only). Walk the list rather than a
+        # name-keyed dict so a same-named config file cannot shadow a large module.
+        for cc in code_components:
+            if cc.lines_of_code > 50 and normalize(cc.name) not in diagram_names:
+                report.missing_from_docs.append(cc)
+
+        # Confidence = share of distinct diagram names that were matched
+        if diagram_names:
+            matched = len(diagram_names) - len(report.missing_from_code)
+            report.confidence = matched / len(diagram_names)
+        else:
+            report.confidence = 0.5  # No diagram to compare
+
+        return report
+
+
+def main():
+    """Command-line entry point; returns the process exit code (always 0)."""
+    parser = argparse.ArgumentParser(description="Architecture Drift Detector")
+    parser.add_argument("--diagram", help="Path to architecture diagram image")
+    parser.add_argument("--src", default="src", help="Source directory to scan")
+    parser.add_argument("--report", action="store_true", help="Generate full report")
+    parser.add_argument("--json", action="store_true", help="Output as JSON")
+    args = parser.parse_args()
+    detector = ArchitectureDriftDetector(args.src)
+    run_scan = args.report or not args.diagram
+    # When JSON is emitted, stdout carries only the JSON document; other text goes to stderr.
+    text_out = sys.stderr if (args.json and run_scan) else sys.stdout
+
+    if args.diagram:
+        print("Diagram analysis prompt (use with vision_analyze tool):", file=text_out)
+        print(detector.analyze_diagram(args.diagram), file=text_out)
+        print(file=text_out)
+
+    if run_scan:
+        print("Scanning codebase...", file=text_out)
+        code_components = detector.scan_codebase()
+        print(f"Found {len(code_components)} components", file=text_out)
+        if args.json:
+            print(json.dumps([vars(c) for c in code_components], indent=2))
+        else:
+            by_loc = sorted(code_components, key=lambda c: c.lines_of_code, reverse=True)[:20]
+            print("\nTop components by lines of code:")
+            for c in by_loc:
+                print(f"  {c.lines_of_code:5} {c.path}")
+        # Generate drift report with empty diagram (code-only analysis)
+        report = detector.detect_drift([], code_components)
+        print(f"\n{report.summary()}", file=text_out)
+
+    return 0
+
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    sys.exit(main())  # main()'s return value becomes the process exit status
-- 
2.43.0