From 71dd80157519569d030ac4368f34a4c4bebfc9ad Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Wed, 15 Apr 2026 14:42:28 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20knowledge=20gap=20identifier=20?= =?UTF-8?q?=E2=80=94=20Pipeline=2010.7=20(#172)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/knowledge_gap_identifier.py | 275 ++++++++++++++++++++++++++++ 1 file changed, 275 insertions(+) create mode 100644 scripts/knowledge_gap_identifier.py diff --git a/scripts/knowledge_gap_identifier.py b/scripts/knowledge_gap_identifier.py new file mode 100644 index 0000000..27cf750 --- /dev/null +++ b/scripts/knowledge_gap_identifier.py @@ -0,0 +1,275 @@ +""" +Knowledge Gap Identifier — Pipeline 10.7 + +Cross-references code, docs, and tests to find gaps: +- Undocumented functions/classes +- Untested code paths +- Documented but missing implementations +- Test files without corresponding source + +Produces a gap report with severity and suggestions. +""" + +from __future__ import annotations + +import ast +import os +import re +from dataclasses import dataclass, field +from enum import Enum +from pathlib import Path +from typing import Dict, List, Optional, Set + + +class GapSeverity(Enum): + INFO = "info" + WARNING = "warning" + ERROR = "error" + + +class GapType(Enum): + UNDOCUMENTED = "undocumented" + UNTESTED = "untested" + MISSING_IMPLEMENTATION = "missing_implementation" + ORPHAN_TEST = "orphan_test" + STALE_DOC = "stale_doc" + + +@dataclass +class Gap: + """A single knowledge gap.""" + gap_type: GapType + severity: GapSeverity + file: str + line: Optional[int] + name: str + description: str + suggestion: str + + +@dataclass +class GapReport: + """Full gap analysis report.""" + repo_path: str + gaps: List[Gap] = field(default_factory=list) + stats: Dict[str, int] = field(default_factory=dict) + + def summary(self) -> str: + lines = [f"Gap Report for {self.repo_path}", "=" * 40] + by_type = {} + for g in self.gaps: + by_type.setdefault(g.gap_type.value, []).append(g) + + for gtype, items in sorted(by_type.items()): + lines.append(f"\n{gtype.upper()} ({len(items)}):") + for g in items: + loc = f"{g.file}:{g.line}" if g.line else g.file + lines.append(f" [{g.severity.value}] {g.name} @ {loc}") + lines.append(f" {g.description}") + + lines.append(f"\nTotal gaps: {len(self.gaps)}") + self.stats = {k: len(v) for k, v in by_type.items()} + return "\n".join(lines) + + def to_dict(self) -> dict: + return { + "repo_path": self.repo_path, + "total_gaps": len(self.gaps), + "stats": {k: len(v) for k, v in + {gt: [g for g in self.gaps if g.gap_type == gt] + for gt in GapType}.items() if v}, + "gaps": [ + { + "type": g.gap_type.value, + "severity": g.severity.value, + "file": g.file, + "line": g.line, + "name": g.name, + "description": g.description, + "suggestion": g.suggestion, + } + for g in self.gaps + ], + } + + +def _collect_python_files(root: Path) -> List[Path]: + """Collect .py files, excluding venv/node_modules/.git.""" + skip = {".git", "venv", "env", ".venv", "node_modules", "__pycache__", ".tox", ".mypy_cache"} + files = [] + for dirpath, dirnames, filenames in os.walk(root): + dirnames[:] = [d for d in dirnames if d not in skip] + for f in filenames: + if f.endswith(".py"): + files.append(Path(dirpath) / f) + return files + + +def _extract_python_symbols(filepath: Path) -> Set[str]: + """Extract top-level function and class names from a Python file.""" + symbols = set() + try: + source = filepath.read_text(encoding="utf-8", errors="replace") + tree = ast.parse(source, filename=str(filepath)) + except (SyntaxError, UnicodeDecodeError): + return symbols + + for node in ast.iter_child_nodes(tree): + if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)): + symbols.add(node.name) + return symbols + + +def _extract_doc_symbols(filepath: Path) -> Set[str]: + """Extract function/class names mentioned in markdown docs.""" + symbols = set() + try: + text = filepath.read_text(encoding="utf-8", errors="replace") + except (UnicodeDecodeError, OSError): + return symbols + + # Match backtick-quoted identifiers: `ClassName`, `func_name`, `func()` + for m in re.finditer(r"`([A-Za-z_]\w+)(?:\(\))?`", text): + symbols.add(m.group(1)) + # Match ## ClassName or ### func_name headings + for m in re.finditer(r"^#{1,4}\s+(\w+)", text, re.MULTILINE): + symbols.add(m.group(1)) + return symbols + + +def _collect_test_files(root: Path) -> Dict[str, Path]: + """Map test module names to their file paths.""" + test_map = {} + for dirpath, dirnames, filenames in os.walk(root): + dirnames[:] = [d for d in dirnames if d not in {".git", "venv", "node_modules"}] + for f in filenames: + if f.startswith("test_") and f.endswith(".py"): + # test_foo.py -> foo + module_name = f[5:-3] + test_map[module_name] = Path(dirpath) / f + return test_map + + +class KnowledgeGapIdentifier: + """Analyzes a repo for knowledge gaps between code, docs, and tests.""" + + def analyze(self, repo_path: str) -> GapReport: + root = Path(repo_path).resolve() + report = GapReport(repo_path=str(root)) + + if not root.is_dir(): + report.gaps.append(Gap( + gap_type=GapType.UNDOCUMENTED, + severity=GapSeverity.ERROR, + file=str(root), + line=None, + name="repo", + description="Path is not a directory", + suggestion="Provide a valid repo directory", + )) + return report + + # Collect artifacts + py_files = _collect_python_files(root) + doc_files = list(root.glob("docs/**/*.md")) + list(root.glob("*.md")) + test_map = _collect_test_files(root / "tests") if (root / "tests").is_dir() else {} + + # Extract symbols from each source file + source_symbols: Dict[str, Set[str]] = {} # relative_path -> symbols + all_source_symbols: Set[str] = set() + + for pf in py_files: + rel = str(pf.relative_to(root)) + # Skip test files and setup/config + if "/tests/" in rel or rel.startswith("tests/") or rel.startswith("test_"): + continue + if pf.name in ("setup.py", "conftest.py", "conf.py"): + continue + + syms = _extract_python_symbols(pf) + if syms: + source_symbols[rel] = syms + all_source_symbols.update(syms) + + # Extract documented symbols + doc_symbols: Set[str] = set() + for df in doc_files: + doc_symbols.update(_extract_doc_symbols(df)) + + # Extract test-covered symbols + tested_modules: Set[str] = set(test_map.keys()) + + # --- Find gaps --- + + # 1. Undocumented: source symbols not in any doc + for rel_path, syms in source_symbols.items(): + for sym in sorted(syms): + if sym.startswith("_") and not sym.startswith("__"): + continue # Skip private + if sym not in doc_symbols: + report.gaps.append(Gap( + gap_type=GapType.UNDOCUMENTED, + severity=GapSeverity.WARNING, + file=rel_path, + line=None, + name=sym, + description=f"{sym} defined in {rel_path} but not referenced in any docs", + suggestion=f"Add documentation for {sym} in a .md file", + )) + + # 2. Untested: source modules without a corresponding test file + for rel_path in source_symbols: + module_name = Path(rel_path).stem + if module_name not in tested_modules and module_name not in ("__init__", "main", "config"): + report.gaps.append(Gap( + gap_type=GapType.UNTESTED, + severity=GapSeverity.ERROR, + file=rel_path, + line=None, + name=module_name, + description=f"No test file found for {rel_path}", + suggestion=f"Create tests/test_{module_name}.py", + )) + + # 3. Missing implementation: doc references symbol not in any source + referenced_but_missing = doc_symbols - all_source_symbols + for sym in sorted(referenced_but_missing): + # Filter out common non-code terms + if sym.lower() in {"todo", "fixme", "note", "example", "usage", "api", + "install", "setup", "config", "license", "contributing", + "changelog", "readme", "python", "bash", "json", "yaml", + "http", "url", "cli", "gui", "ui", "api", "rest"}: + continue + if len(sym) < 3: + continue + report.gaps.append(Gap( + gap_type=GapType.MISSING_IMPLEMENTATION, + severity=GapSeverity.INFO, + file="(docs)", + line=None, + name=sym, + description=f"{sym} referenced in docs but not found in source code", + suggestion=f"Verify if {sym} should be implemented or update docs", + )) + + # 4. Orphan tests: test files without matching source + for test_mod, test_path in test_map.items(): + if test_mod not in tested_modules and not any( + test_mod in Path(f).stem for f in source_symbols + ): + # Check if any source file partially matches + matches_source = any(test_mod.replace("_", "-") in f or test_mod.replace("_", "") in Path(f).stem + for f in source_symbols) + if not matches_source: + rel = str(test_path.relative_to(root)) + report.gaps.append(Gap( + gap_type=GapType.ORPHAN_TEST, + severity=GapSeverity.WARNING, + file=rel, + line=None, + name=test_mod, + description=f"Test file {rel} exists but no matching source module found", + suggestion=f"Verify if the source was renamed or removed", + )) + + return report