#!/usr/bin/env python3
"""
Architecture Drift Detector — Multimodal Documentation Synthesis
================================================================
Analyzes architecture diagrams (images) and cross-references them with
the actual codebase to identify documentation drift.

Uses vision analysis on diagrams and file system analysis on code.

Usage:
    python scripts/doc_drift_detector.py --diagram docs/architecture.png --src src/
    python scripts/doc_drift_detector.py --report          # Full drift report
    python scripts/doc_drift_detector.py --report --json   # Components as JSON on stdout
"""

import argparse
import json
import os
import re
import subprocess
import sys
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional

# Matches the module name in "from X import ..." / "import X" statements.
_IMPORT_RE = re.compile(r"^from\s+(\S+)\s+import|^import\s+(\S+)", re.MULTILINE)

# Directory names that are never scanned, compared against whole path parts
# so that e.g. ".github" is not mistaken for ".git".
_SKIPPED_DIRS = frozenset({".git", "node_modules"})

_JS_PATTERNS = ("*.js", "*.ts", "*.tsx")
_CONFIG_PATTERNS = ("*.yaml", "*.yml", "*.json", "*.sh", "*.bash")

# Only the first few imports of a module are kept on its component record.
_MAX_IMPORTS = 10

# Code components at or below this size are not reported as undocumented.
_MIN_SIGNIFICANT_LOC = 50

# Prompt handed to the vision tool; the image itself is supplied by the caller.
_DIAGRAM_PROMPT = """Analyze this architecture diagram and extract all components.

For each component, identify:
- Name (as shown in diagram)
- Type (service, module, database, api, agent, frontend, etc.)
- Connections to other components
- Any version numbers or labels

Return as JSON array:
```json
[
  {"name": "ComponentName", "type": "service", "connections": ["OtherComponent"]}
]
```
"""


@dataclass
class DiagramComponent:
    """A component extracted from an architecture diagram via vision analysis."""

    name: str
    component_type: str  # "service", "module", "database", "api", "agent"
    description: str = ""
    connections: list = field(default_factory=list)
    source: str = ""  # "diagram" or "code"


@dataclass
class CodeComponent:
    """A component found in the actual codebase."""

    name: str
    path: str
    component_type: str  # "module", "class", "service", "script"
    imports: list = field(default_factory=list)
    exports: list = field(default_factory=list)
    lines_of_code: int = 0


@dataclass
class DriftReport:
    """Documentation drift analysis results."""

    diagram_components: list = field(default_factory=list)
    code_components: list = field(default_factory=list)
    missing_from_code: list = field(default_factory=list)  # In diagram but not code
    missing_from_docs: list = field(default_factory=list)  # In code but not diagram
    connections_drift: list = field(default_factory=list)  # Connection mismatches
    confidence: float = 0.0

    def summary(self) -> str:
        """Return a human-readable, multi-line summary of the report."""
        lines = [
            "=== Architecture Drift Report ===",
            f"Diagram components: {len(self.diagram_components)}",
            f"Code components: {len(self.code_components)}",
            f"Missing from code (diagram-only): {len(self.missing_from_code)}",
            f"Missing from docs (code-only): {len(self.missing_from_docs)}",
            f"Connection drift issues: {len(self.connections_drift)}",
            f"Confidence: {self.confidence:.0%}",
            "",
        ]

        if self.missing_from_code:
            lines.append("⚠️ In diagram but NOT found in code:")
            for c in self.missing_from_code:
                lines.append(f" - {c.name} ({c.component_type})")
            lines.append("")

        if self.missing_from_docs:
            lines.append("📝 In code but NOT in diagram:")
            for c in self.missing_from_docs:
                lines.append(f" - {c.name} at {c.path}")
            lines.append("")

        if self.connections_drift:
            lines.append("🔗 Connection drift:")
            for c in self.connections_drift:
                lines.append(f" - {c}")

        if not (self.missing_from_code or self.missing_from_docs or self.connections_drift):
            lines.append("✅ No significant drift detected!")

        return "\n".join(lines)

    def to_dict(self) -> dict:
        """Return the report as a plain dict suitable for ``json.dumps``."""
        return {
            "diagram_components": [vars(c) for c in self.diagram_components],
            "code_components": [vars(c) for c in self.code_components],
            "missing_from_code": [vars(c) for c in self.missing_from_code],
            "missing_from_docs": [vars(c) for c in self.missing_from_docs],
            "connections_drift": self.connections_drift,
            "confidence": self.confidence,
        }


def _normalize(name: str) -> str:
    """Lower-case *name* and drop everything except ASCII letters and digits."""
    return re.sub(r"[^a-z0-9]", "", name.lower())


def _has_match(name: str, candidates) -> bool:
    """Return True if normalized *name* equals or overlaps any candidate name."""
    if name in candidates:
        return True
    if not name:
        # An empty string is a substring of everything; never treat it as a
        # partial match.
        return False
    return any(other and (name in other or other in name) for other in candidates)


def _is_skipped(path: Path, root: Path) -> bool:
    """Return True if *path* lies inside an excluded directory below *root*."""
    return any(part in _SKIPPED_DIRS for part in path.relative_to(root).parts)


def _read_source(path: Path) -> str:
    """Read *path* as text; an unreadable file is treated as empty."""
    try:
        return path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        return ""


def _count_code_lines(text: str, comment_prefix: str) -> int:
    """Count lines that are neither blank nor start with *comment_prefix*."""
    return sum(
        1
        for line in text.split("\n")
        if line.strip() and not line.strip().startswith(comment_prefix)
    )


class ArchitectureDriftDetector:
    """Detects drift between architecture diagrams and actual code."""

    def __init__(self, src_dir: str = "src"):
        self.src_dir = Path(src_dir)

    def analyze_diagram(self, diagram_path: str) -> str:
        """
        Build the prompt used to extract components from an architecture diagram.

        Returns the prompt for vision analysis — the actual analysis is done
        by the calling agent. ``diagram_path`` is accepted for interface
        compatibility; it is not embedded in the prompt.
        """
        return _DIAGRAM_PROMPT

    def scan_codebase(self) -> list:
        """
        Scan the codebase to find actual components/modules.

        Python and JavaScript/TypeScript files are searched below
        ``self.src_dir``; config and shell files are searched below the
        current working directory. Returns a list of ``CodeComponent``;
        the list is empty when ``self.src_dir`` does not exist.
        """
        components = []

        if not self.src_dir.exists():
            return components

        components.extend(self._scan_python())
        components.extend(self._scan_javascript())
        components.extend(self._scan_configs())
        return components

    def _scan_python(self) -> list:
        """Collect one component per public Python module under ``src_dir``."""
        found = []
        for py_file in sorted(self.src_dir.rglob("*.py")):
            if not py_file.is_file() or _is_skipped(py_file, self.src_dir):
                continue
            # Private modules are skipped; a package's __init__.py stands
            # for the package and takes the directory's name.
            if py_file.name.startswith("_") and py_file.name != "__init__.py":
                continue
            name = py_file.parent.name if py_file.stem == "__init__" else py_file.stem

            # A read failure yields empty content, so the LOC count and the
            # import list both describe this file and never a previous one.
            content = _read_source(py_file)
            import_list = [a or b for a, b in _IMPORT_RE.findall(content)]

            found.append(CodeComponent(
                name=name,
                path=str(py_file.relative_to(self.src_dir.parent)),
                component_type="module",
                imports=import_list[:_MAX_IMPORTS],
                lines_of_code=_count_code_lines(content, "#"),
            ))
        return found

    def _scan_javascript(self) -> list:
        """Collect one component per JavaScript/TypeScript file under ``src_dir``."""
        found = []
        for pattern in _JS_PATTERNS:
            for js_file in sorted(self.src_dir.rglob(pattern)):
                if not js_file.is_file() or _is_skipped(js_file, self.src_dir):
                    continue
                # NOTE(review): paths containing "mobile-app" are reported
                # relative to one directory higher — presumably a repository
                # layout special case; confirm with the owner.
                if "mobile-app" in str(js_file):
                    anchor = self.src_dir.parent.parent
                else:
                    anchor = self.src_dir.parent
                found.append(CodeComponent(
                    name=js_file.stem,
                    path=str(js_file.relative_to(anchor)),
                    component_type="module",
                    lines_of_code=_count_code_lines(_read_source(js_file), "//"),
                ))
        return found

    def _scan_configs(self) -> list:
        """Collect config and shell files below the current working directory."""
        root = Path(".")
        found = []
        for pattern in _CONFIG_PATTERNS:
            for cfg in sorted(root.rglob(pattern)):
                if not cfg.is_file() or _is_skipped(cfg, root):
                    continue
                found.append(CodeComponent(
                    name=cfg.stem,
                    path=str(cfg),
                    component_type="config",
                ))
        return found

    def detect_drift(
        self,
        diagram_components: list,
        code_components: list
    ) -> DriftReport:
        """
        Compare diagram components against codebase.

        Names are matched case-insensitively on their alphanumeric characters;
        a name that contains, or is contained in, a name on the other side
        counts as a match in both directions. Only code components with more
        than 50 lines of code are reported as missing from the docs.
        ``confidence`` is the share of diagram components found in code, or
        0.5 when there is no diagram to compare against.
        """
        report = DriftReport()
        report.diagram_components = diagram_components
        report.code_components = code_components

        code_names = {_normalize(c.name): c for c in code_components}
        diagram_names = {_normalize(c.name): c for c in diagram_components}

        # Diagram-only components.
        report.missing_from_code = [
            dc for norm, dc in diagram_names.items()
            if not _has_match(norm, code_names)
        ]

        # Code-only components (significant ones only).
        report.missing_from_docs = [
            cc for norm, cc in code_names.items()
            if cc.lines_of_code > _MIN_SIGNIFICANT_LOC
            and not _has_match(norm, diagram_names)
        ]

        # Confidence based on match rate.
        if diagram_components:
            matched = len(diagram_components) - len(report.missing_from_code)
            report.confidence = matched / len(diagram_components)
        else:
            report.confidence = 0.5  # No diagram to compare

        return report


def main(argv: Optional[list] = None) -> int:
    """
    Command-line entry point.

    ``argv`` defaults to ``sys.argv[1:]``. When JSON is emitted, stdout
    carries only the JSON document and all human-readable text goes to
    stderr. Returns the process exit code.
    """
    parser = argparse.ArgumentParser(description="Architecture Drift Detector")
    parser.add_argument("--diagram", help="Path to architecture diagram image")
    parser.add_argument("--src", default="src", help="Source directory to scan")
    parser.add_argument("--report", action="store_true", help="Generate full report")
    parser.add_argument("--json", action="store_true", help="Output as JSON")
    args = parser.parse_args(argv)

    detector = ArchitectureDriftDetector(args.src)

    scan = args.report or not args.diagram
    # Keep stdout machine-readable whenever a JSON document will be written.
    text_out = sys.stderr if (args.json and scan) else sys.stdout

    if args.diagram:
        print("Diagram analysis prompt (use with vision_analyze tool):", file=text_out)
        print(detector.analyze_diagram(args.diagram), file=text_out)
        print(file=text_out)

    if scan:
        print("Scanning codebase...", file=text_out)
        code_components = detector.scan_codebase()
        print(f"Found {len(code_components)} components", file=text_out)

        if args.json:
            print(json.dumps([vars(c) for c in code_components], indent=2))
        else:
            # Show top components by LOC
            by_loc = sorted(code_components, key=lambda c: c.lines_of_code, reverse=True)[:20]
            print("\nTop components by lines of code:", file=text_out)
            for c in by_loc:
                print(f" {c.lines_of_code:5} {c.path}", file=text_out)

        # Generate drift report with empty diagram (code-only analysis)
        report = detector.detect_drift([], code_components)
        print(f"\n{report.summary()}", file=text_out)

    return 0


if __name__ == "__main__":
    sys.exit(main())