""" Knowledge Gap Identifier — Pipeline 10.7 Cross-references code, docs, and tests to find gaps: - Undocumented functions/classes - Untested code paths - Documented but missing implementations - Test files without corresponding source Produces a gap report with severity and suggestions. """ from __future__ import annotations import ast import os import re from dataclasses import dataclass, field from enum import Enum from pathlib import Path from typing import Dict, List, Optional, Set class GapSeverity(Enum): INFO = "info" WARNING = "warning" ERROR = "error" class GapType(Enum): UNDOCUMENTED = "undocumented" UNTESTED = "untested" MISSING_IMPLEMENTATION = "missing_implementation" ORPHAN_TEST = "orphan_test" STALE_DOC = "stale_doc" @dataclass class Gap: """A single knowledge gap.""" gap_type: GapType severity: GapSeverity file: str line: Optional[int] name: str description: str suggestion: str @dataclass class GapReport: """Full gap analysis report.""" repo_path: str gaps: List[Gap] = field(default_factory=list) stats: Dict[str, int] = field(default_factory=dict) def summary(self) -> str: lines = [f"Gap Report for {self.repo_path}", "=" * 40] by_type = {} for g in self.gaps: by_type.setdefault(g.gap_type.value, []).append(g) for gtype, items in sorted(by_type.items()): lines.append(f"\n{gtype.upper()} ({len(items)}):") for g in items: loc = f"{g.file}:{g.line}" if g.line else g.file lines.append(f" [{g.severity.value}] {g.name} @ {loc}") lines.append(f" {g.description}") lines.append(f"\nTotal gaps: {len(self.gaps)}") self.stats = {k: len(v) for k, v in by_type.items()} return "\n".join(lines) def to_dict(self) -> dict: return { "repo_path": self.repo_path, "total_gaps": len(self.gaps), "stats": {k: len(v) for k, v in {gt: [g for g in self.gaps if g.gap_type == gt] for gt in GapType}.items() if v}, "gaps": [ { "type": g.gap_type.value, "severity": g.severity.value, "file": g.file, "line": g.line, "name": g.name, "description": g.description, "suggestion": g.suggestion, } for g in self.gaps ], } def _collect_python_files(root: Path) -> List[Path]: """Collect .py files, excluding venv/node_modules/.git.""" skip = {".git", "venv", "env", ".venv", "node_modules", "__pycache__", ".tox", ".mypy_cache"} files = [] for dirpath, dirnames, filenames in os.walk(root): dirnames[:] = [d for d in dirnames if d not in skip] for f in filenames: if f.endswith(".py"): files.append(Path(dirpath) / f) return files def _extract_python_symbols(filepath: Path) -> Set[str]: """Extract top-level function and class names from a Python file.""" symbols = set() try: source = filepath.read_text(encoding="utf-8", errors="replace") tree = ast.parse(source, filename=str(filepath)) except (SyntaxError, UnicodeDecodeError): return symbols for node in ast.iter_child_nodes(tree): if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): symbols.add(node.name) return symbols def _extract_doc_symbols(filepath: Path) -> Set[str]: """Extract function/class names mentioned in markdown docs.""" symbols = set() try: text = filepath.read_text(encoding="utf-8", errors="replace") except (UnicodeDecodeError, OSError): return symbols # Match backtick-quoted identifiers: `ClassName`, `func_name`, `func()` for m in re.finditer(r"`([A-Za-z_]\w+)(?:\(\))?`", text): symbols.add(m.group(1)) # Match ## ClassName or ### func_name headings for m in re.finditer(r"^#{1,4}\s+(\w+)", text, re.MULTILINE): symbols.add(m.group(1)) return symbols def _collect_test_files(root: Path) -> Dict[str, Path]: """Map test module names to their file paths.""" test_map = {} for dirpath, dirnames, filenames in os.walk(root): dirnames[:] = [d for d in dirnames if d not in {".git", "venv", "node_modules"}] for f in filenames: if f.startswith("test_") and f.endswith(".py"): # test_foo.py -> foo module_name = f[5:-3] test_map[module_name] = Path(dirpath) / f return test_map class KnowledgeGapIdentifier: """Analyzes a repo for knowledge gaps between code, docs, and tests.""" def analyze(self, repo_path: str) -> GapReport: root = Path(repo_path).resolve() report = GapReport(repo_path=str(root)) if not root.is_dir(): report.gaps.append(Gap( gap_type=GapType.UNDOCUMENTED, severity=GapSeverity.ERROR, file=str(root), line=None, name="repo", description="Path is not a directory", suggestion="Provide a valid repo directory", )) return report # Collect artifacts py_files = _collect_python_files(root) doc_files = list(root.glob("docs/**/*.md")) + list(root.glob("*.md")) test_map = _collect_test_files(root / "tests") if (root / "tests").is_dir() else {} # Extract symbols from each source file source_symbols: Dict[str, Set[str]] = {} # relative_path -> symbols all_source_symbols: Set[str] = set() for pf in py_files: rel = str(pf.relative_to(root)) # Skip test files and setup/config if "/tests/" in rel or rel.startswith("tests/") or rel.startswith("test_"): continue if pf.name in ("setup.py", "conftest.py", "conf.py"): continue syms = _extract_python_symbols(pf) if syms: source_symbols[rel] = syms all_source_symbols.update(syms) # Extract documented symbols doc_symbols: Set[str] = set() for df in doc_files: doc_symbols.update(_extract_doc_symbols(df)) # Extract test-covered symbols tested_modules: Set[str] = set(test_map.keys()) # --- Find gaps --- # 1. Undocumented: source symbols not in any doc for rel_path, syms in source_symbols.items(): for sym in sorted(syms): if sym.startswith("_") and not sym.startswith("__"): continue # Skip private if sym not in doc_symbols: report.gaps.append(Gap( gap_type=GapType.UNDOCUMENTED, severity=GapSeverity.WARNING, file=rel_path, line=None, name=sym, description=f"{sym} defined in {rel_path} but not referenced in any docs", suggestion=f"Add documentation for {sym} in a .md file", )) # 2. Untested: source modules without a corresponding test file for rel_path in source_symbols: module_name = Path(rel_path).stem if module_name not in tested_modules and module_name not in ("__init__", "main", "config"): report.gaps.append(Gap( gap_type=GapType.UNTESTED, severity=GapSeverity.ERROR, file=rel_path, line=None, name=module_name, description=f"No test file found for {rel_path}", suggestion=f"Create tests/test_{module_name}.py", )) # 3. Missing implementation: doc references symbol not in any source referenced_but_missing = doc_symbols - all_source_symbols for sym in sorted(referenced_but_missing): # Filter out common non-code terms if sym.lower() in {"todo", "fixme", "note", "example", "usage", "api", "install", "setup", "config", "license", "contributing", "changelog", "readme", "python", "bash", "json", "yaml", "http", "url", "cli", "gui", "ui", "api", "rest"}: continue if len(sym) < 3: continue report.gaps.append(Gap( gap_type=GapType.MISSING_IMPLEMENTATION, severity=GapSeverity.INFO, file="(docs)", line=None, name=sym, description=f"{sym} referenced in docs but not found in source code", suggestion=f"Verify if {sym} should be implemented or update docs", )) # 4. Orphan tests: test files without matching source for test_mod, test_path in test_map.items(): if test_mod not in tested_modules and not any( test_mod in Path(f).stem for f in source_symbols ): # Check if any source file partially matches matches_source = any(test_mod.replace("_", "-") in f or test_mod.replace("_", "") in Path(f).stem for f in source_symbols) if not matches_source: rel = str(test_path.relative_to(root)) report.gaps.append(Gap( gap_type=GapType.ORPHAN_TEST, severity=GapSeverity.WARNING, file=rel, line=None, name=test_mod, description=f"Test file {rel} exists but no matching source module found", suggestion=f"Verify if the source was renamed or removed", )) return report