scripts/knowledge_gap_identifier.py

"""
Knowledge Gap Identifier — Pipeline 10.7

Cross-references code, docs, and tests to find gaps:
- Undocumented functions/classes
- Untested code paths
- Documented but missing implementations
- Test files without corresponding source

Produces a gap report with severity and suggestions.
"""

from __future__ import annotations

import ast
import os
import re
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional, Set


class GapSeverity(Enum):
    """Severity level attached to a reported gap.

    The string values are what the report emits (e.g. ``[warning]`` in the
    text summary and the ``"severity"`` field of the dict form).
    """
    # In this file's analyzer: INFO marks doc-only references, WARNING marks
    # undocumented symbols and orphan tests, ERROR marks untested modules and
    # an invalid repo path.
    INFO = "info"
    WARNING = "warning"
    ERROR = "error"


class GapType(Enum):
    """Category of a knowledge gap between code, docs, and tests.

    The string values are used as grouping keys in the text summary and as
    the ``"type"`` field of the dict form.
    """
    # Source symbol that no markdown doc references (also used by the
    # analyzer to report a repo path that is not a directory).
    UNDOCUMENTED = "undocumented"
    # Source module with no corresponding test_<module>.py file.
    UNTESTED = "untested"
    # Symbol referenced in docs that no source file defines.
    MISSING_IMPLEMENTATION = "missing_implementation"
    # Test file with no matching source module.
    ORPHAN_TEST = "orphan_test"
    # NOTE(review): not assigned by any code visible in this file — confirm
    # whether another producer emits it or it is reserved for future use.
    STALE_DOC = "stale_doc"


@dataclass
class Gap:
    """A single knowledge gap.

    Attributes:
        gap_type: Category of the gap.
        severity: How serious the gap is considered.
        file: Path (or placeholder such as ``"(docs)"``) the gap relates to.
        line: Optional line number within ``file``; ``None`` when unknown.
            The analyzer in this file always passes ``None``.
        name: Symbol or module name the gap is about.
        description: Human-readable explanation of the gap.
        suggestion: Human-readable hint on how to close the gap.
    """
    gap_type: GapType
    severity: GapSeverity
    file: str
    line: Optional[int]
    name: str
    description: str
    suggestion: str


@dataclass
class GapReport:
    """Full gap analysis report.

    Attributes:
        repo_path: Path of the analyzed repository.
        gaps: Every gap found, in the order it was recorded.
        stats: Gap count per gap-type value (e.g. ``{"untested": 3}``).
            Refreshed as a side effect of ``summary()``.
    """
    repo_path: str
    gaps: List[Gap] = field(default_factory=list)
    stats: Dict[str, int] = field(default_factory=dict)

    def _count_by_type(self) -> Dict[str, int]:
        """Return the number of gaps per gap-type value, in first-seen order."""
        counts: Dict[str, int] = {}
        for g in self.gaps:
            key = g.gap_type.value
            counts[key] = counts.get(key, 0) + 1
        return counts

    def summary(self) -> str:
        """Render the report as human-readable text.

        Gaps are grouped by gap-type value (groups sorted alphabetically);
        within a group the original recording order is kept.

        Side effect: ``self.stats`` is replaced with the per-type counts.

        Returns:
            The multi-line summary string.
        """
        lines = [f"Gap Report for {self.repo_path}", "=" * 40]
        by_type: Dict[str, List[Gap]] = {}
        for g in self.gaps:
            by_type.setdefault(g.gap_type.value, []).append(g)

        for gtype, items in sorted(by_type.items()):
            lines.append(f"\n{gtype.upper()} ({len(items)}):")
            for g in items:
                # Compare against None explicitly so a line number of 0 is
                # still shown instead of being treated as "no line".
                loc = f"{g.file}:{g.line}" if g.line is not None else g.file
                lines.append(f"  [{g.severity.value}] {g.name} @ {loc}")
                lines.append(f"    {g.description}")

        lines.append(f"\nTotal gaps: {len(self.gaps)}")
        self.stats = self._count_by_type()
        return "\n".join(lines)

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict form of the report.

        ``"stats"`` is keyed by gap-type *value* strings (the same keys that
        ``summary()`` stores in ``self.stats``), so the result can be passed
        straight to ``json.dumps``. Only types with at least one gap appear.
        """
        return {
            "repo_path": self.repo_path,
            "total_gaps": len(self.gaps),
            "stats": self._count_by_type(),
            "gaps": [
                {
                    "type": g.gap_type.value,
                    "severity": g.severity.value,
                    "file": g.file,
                    "line": g.line,
                    "name": g.name,
                    "description": g.description,
                    "suggestion": g.suggestion,
                }
                for g in self.gaps
            ],
        }


def _collect_python_files(root: Path) -> List[Path]:
    """Walk ``root`` and return every ``*.py`` file outside ignored directories.

    Directories named after VCS metadata, virtual environments, dependency
    folders, and tool caches are pruned from the walk, so nothing beneath
    them is returned. Paths come back in ``os.walk`` order.
    """
    excluded_dirs = {".git", "venv", "env", ".venv", "node_modules", "__pycache__", ".tox", ".mypy_cache"}
    collected = []
    for current_dir, subdirs, names in os.walk(root):
        # Assigning through the slice prunes the walk in place.
        subdirs[:] = [name for name in subdirs if name not in excluded_dirs]
        base = Path(current_dir)
        collected.extend(base / name for name in names if name.endswith(".py"))
    return collected


def _extract_python_symbols(filepath: Path) -> Set[str]:
    """Extract top-level function and class names from a Python file.

    Only direct children of the module are considered; nested functions and
    methods are not returned.

    Args:
        filepath: Path of the Python file to parse.

    Returns:
        The set of top-level ``def``, ``async def`` and ``class`` names. An
        empty set is returned when the file cannot be read or parsed, so one
        bad file never aborts a whole-repo scan.
    """
    symbols: Set[str] = set()
    try:
        source = filepath.read_text(encoding="utf-8", errors="replace")
        tree = ast.parse(source, filename=str(filepath))
    except (OSError, SyntaxError, ValueError):
        # OSError: unreadable/missing file or broken symlink.
        # ValueError: e.g. NUL bytes in the source on older Python versions
        # (also covers UnicodeDecodeError, which is a ValueError subclass).
        return symbols

    for node in ast.iter_child_nodes(tree):
        if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            symbols.add(node.name)
    return symbols


def _extract_doc_symbols(filepath: Path) -> Set[str]:
    """Extract function/class names mentioned in markdown docs.

    Two heuristics are applied to the file text:

    * backtick-quoted identifiers of at least two characters, with an
      optional trailing ``()`` (e.g. ``ClassName``, ``func_name``, ``func()``);
    * the first word of a level 1-4 ATX heading (``# Word`` .. ``#### Word``).

    Args:
        filepath: Path of the markdown file to scan.

    Returns:
        The set of candidate symbol names; empty if the file cannot be read.
    """
    symbols: Set[str] = set()
    try:
        text = filepath.read_text(encoding="utf-8", errors="replace")
    except (UnicodeDecodeError, OSError):
        return symbols

    # Match backtick-quoted identifiers: `ClassName`, `func_name`, `func()`
    for m in re.finditer(r"`([A-Za-z_]\w+)(?:\(\))?`", text):
        symbols.add(m.group(1))
    # Match ## ClassName or ### func_name headings. Only spaces/tabs may
    # separate the hashes from the word: `\s` would also match newlines, so
    # an empty heading line would capture the first word of the next line.
    for m in re.finditer(r"^#{1,4}[ \t]+(\w+)", text, re.MULTILINE):
        symbols.add(m.group(1))
    return symbols


def _collect_test_files(root: Path) -> Dict[str, Path]:
    """Map test module names to their file paths.

    Every ``test_<name>.py`` file under ``root`` is recorded as
    ``<name> -> path``. The same directories that source collection ignores
    (VCS metadata, virtual environments, dependency folders, tool caches) are
    pruned, so vendored test files cannot masquerade as project tests.

    The walk is sorted, so when two test files share a module name the one
    that comes last in sorted traversal order wins deterministically.

    Args:
        root: Directory to search (typically the repo's ``tests`` folder).

    Returns:
        Mapping from module name (file name without ``test_`` prefix and
        ``.py`` suffix) to the test file's path.
    """
    skip = {".git", "venv", "env", ".venv", "node_modules", "__pycache__", ".tox", ".mypy_cache"}
    test_map: Dict[str, Path] = {}
    for dirpath, dirnames, filenames in os.walk(root):
        # In-place assignment prunes the walk; sorting makes it reproducible.
        dirnames[:] = sorted(d for d in dirnames if d not in skip)
        for f in sorted(filenames):
            if f.startswith("test_") and f.endswith(".py"):
                # test_foo.py -> foo
                module_name = f[5:-3]
                test_map[module_name] = Path(dirpath) / f
    return test_map


class KnowledgeGapIdentifier:
    """Analyzes a repo for knowledge gaps between code, docs, and tests.

    The analysis is name-based: top-level symbols found in source files are
    compared with identifiers mentioned in markdown docs, and source module
    names are compared with ``test_<module>.py`` files under ``tests/``.
    """

    # File names that are never treated as source modules.
    _NON_SOURCE_FILES = frozenset({"setup.py", "conftest.py", "conf.py"})
    # Module stems that are not expected to have a dedicated test file.
    _UNTESTED_EXEMPT = frozenset({"__init__", "main", "config"})
    # Words that often appear in backticks/headings but are not code symbols.
    _NON_CODE_TERMS = frozenset({
        "todo", "fixme", "note", "example", "usage", "api",
        "install", "setup", "config", "license", "contributing",
        "changelog", "readme", "python", "bash", "json", "yaml",
        "http", "url", "cli", "gui", "ui", "rest",
    })

    def analyze(self, repo_path: str) -> GapReport:
        """Scan ``repo_path`` and return a report of every gap found.

        Args:
            repo_path: Path to the repository root.

        Returns:
            A ``GapReport`` whose gaps are listed in check order:
            undocumented, untested, missing implementation, orphan tests.
            If ``repo_path`` is not a directory the report contains a single
            ERROR gap describing that and nothing else.
        """
        root = Path(repo_path).resolve()
        report = GapReport(repo_path=str(root))

        if not root.is_dir():
            # There is no dedicated gap type for a bad input path; it is
            # reported as an ERROR-severity UNDOCUMENTED gap.
            report.gaps.append(Gap(
                gap_type=GapType.UNDOCUMENTED,
                severity=GapSeverity.ERROR,
                file=str(root),
                line=None,
                name="repo",
                description="Path is not a directory",
                suggestion="Provide a valid repo directory",
            ))
            return report

        # Collect artifacts
        py_files = _collect_python_files(root)
        doc_files = list(root.glob("docs/**/*.md")) + list(root.glob("*.md"))
        tests_dir = root / "tests"
        test_map = _collect_test_files(tests_dir) if tests_dir.is_dir() else {}

        # Extract symbols from each source file
        source_paths: List[str] = []  # every source file, even symbol-less ones
        source_symbols: Dict[str, Set[str]] = {}  # relative_path -> symbols
        all_source_symbols: Set[str] = set()

        for pf in py_files:
            rel_path = pf.relative_to(root)
            if self._is_non_source(rel_path):
                continue
            rel = str(rel_path)
            source_paths.append(rel)

            syms = _extract_python_symbols(pf)
            if syms:
                source_symbols[rel] = syms
                all_source_symbols.update(syms)

        # Extract documented symbols
        doc_symbols: Set[str] = set()
        for df in doc_files:
            doc_symbols.update(_extract_doc_symbols(df))

        # --- Find gaps ---
        report.gaps.extend(self._undocumented_gaps(source_symbols, doc_symbols))
        report.gaps.extend(self._untested_gaps(source_symbols, set(test_map)))
        report.gaps.extend(
            self._missing_implementation_gaps(doc_symbols, all_source_symbols)
        )
        report.gaps.extend(self._orphan_test_gaps(test_map, source_paths, root))

        return report

    def _is_non_source(self, rel_path: Path) -> bool:
        """Return True for test files and setup/config files.

        Works on path components rather than a ``/``-joined string so the
        check behaves the same regardless of the platform's path separator.
        """
        parts = rel_path.parts
        in_tests_dir = "tests" in parts[:-1]
        # A top-level entry starting with "test_" (file or directory) and any
        # nested test_*.py file are test code, not source.
        looks_like_test = parts[0].startswith("test_") or rel_path.name.startswith("test_")
        return in_tests_dir or looks_like_test or rel_path.name in self._NON_SOURCE_FILES

    def _undocumented_gaps(self, source_symbols: Dict[str, Set[str]],
                           doc_symbols: Set[str]) -> List[Gap]:
        """Return a WARNING gap for each public source symbol absent from docs."""
        gaps: List[Gap] = []
        for rel_path, syms in source_symbols.items():
            for sym in sorted(syms):
                if sym.startswith("_") and not sym.startswith("__"):
                    continue  # Skip private
                if sym not in doc_symbols:
                    gaps.append(Gap(
                        gap_type=GapType.UNDOCUMENTED,
                        severity=GapSeverity.WARNING,
                        file=rel_path,
                        line=None,
                        name=sym,
                        description=f"{sym} defined in {rel_path} but not referenced in any docs",
                        suggestion=f"Add documentation for {sym} in a .md file",
                    ))
        return gaps

    def _untested_gaps(self, source_symbols: Dict[str, Set[str]],
                       tested_modules: Set[str]) -> List[Gap]:
        """Return an ERROR gap for each symbol-bearing module without a test file."""
        gaps: List[Gap] = []
        for rel_path in source_symbols:
            module_name = Path(rel_path).stem
            if module_name in tested_modules or module_name in self._UNTESTED_EXEMPT:
                continue
            gaps.append(Gap(
                gap_type=GapType.UNTESTED,
                severity=GapSeverity.ERROR,
                file=rel_path,
                line=None,
                name=module_name,
                description=f"No test file found for {rel_path}",
                suggestion=f"Create tests/test_{module_name}.py",
            ))
        return gaps

    def _missing_implementation_gaps(self, doc_symbols: Set[str],
                                     all_source_symbols: Set[str]) -> List[Gap]:
        """Return an INFO gap for each doc-referenced name no source file defines."""
        gaps: List[Gap] = []
        for sym in sorted(doc_symbols - all_source_symbols):
            # Filter out common non-code terms and very short tokens.
            if sym.lower() in self._NON_CODE_TERMS or len(sym) < 3:
                continue
            gaps.append(Gap(
                gap_type=GapType.MISSING_IMPLEMENTATION,
                severity=GapSeverity.INFO,
                file="(docs)",
                line=None,
                name=sym,
                description=f"{sym} referenced in docs but not found in source code",
                suggestion=f"Verify if {sym} should be implemented or update docs",
            ))
        return gaps

    @staticmethod
    def _has_matching_source(test_mod: str, source_paths: List[str]) -> bool:
        """Return True if any source file plausibly corresponds to ``test_mod``.

        Matching is deliberately lenient: the test module name (or its
        underscore-free form) may appear anywhere in a source file's stem, or
        its dashed form may appear anywhere in the relative path.
        """
        dashed = test_mod.replace("_", "-")
        squashed = test_mod.replace("_", "")
        for rel in source_paths:
            stem = Path(rel).stem
            if test_mod in stem or squashed in stem or dashed in rel:
                return True
        return False

    def _orphan_test_gaps(self, test_map: Dict[str, Path],
                          source_paths: List[str], root: Path) -> List[Gap]:
        """Return a WARNING gap for each test file with no matching source.

        The comparison is made against source files, not against the set of
        test modules itself (every test module is trivially in that set, which
        would make the check unable to ever fire).
        """
        gaps: List[Gap] = []
        for test_mod, test_path in test_map.items():
            if self._has_matching_source(test_mod, source_paths):
                continue
            rel = str(test_path.relative_to(root))
            gaps.append(Gap(
                gap_type=GapType.ORPHAN_TEST,
                severity=GapSeverity.WARNING,
                file=rel,
                line=None,
                name=test_mod,
                description=f"Test file {rel} exists but no matching source module found",
                suggestion="Verify if the source was renamed or removed",
            ))
        return gaps
feat: knowledge gap identifier — Pipeline 10.7 (#172) 2026-04-15 14:42:28 +00:00			`"""`
			`Knowledge Gap Identifier — Pipeline 10.7`

			`Cross-references code, docs, and tests to find gaps:`
			`- Undocumented functions/classes`
			`- Untested code paths`
			`- Documented but missing implementations`
			`- Test files without corresponding source`

			`Produces a gap report with severity and suggestions.`
			`"""`

			`from __future__ import annotations`

			`import ast`
			`import os`
			`import re`
			`from dataclasses import dataclass, field`
			`from enum import Enum`
			`from pathlib import Path`
			`from typing import Dict, List, Optional, Set`


			`class GapSeverity(Enum):`
			`INFO = "info"`
			`WARNING = "warning"`
			`ERROR = "error"`


			`class GapType(Enum):`
			`UNDOCUMENTED = "undocumented"`
			`UNTESTED = "untested"`
			`MISSING_IMPLEMENTATION = "missing_implementation"`
			`ORPHAN_TEST = "orphan_test"`
			`STALE_DOC = "stale_doc"`


			`@dataclass`
			`class Gap:`
			`"""A single knowledge gap."""`
			`gap_type: GapType`
			`severity: GapSeverity`
			`file: str`
			`line: Optional[int]`
			`name: str`
			`description: str`
			`suggestion: str`


			`@dataclass`
			`class GapReport:`
			`"""Full gap analysis report."""`
			`repo_path: str`
			`gaps: List[Gap] = field(default_factory=list)`
			`stats: Dict[str, int] = field(default_factory=dict)`

			`def summary(self) -> str:`
			`lines = [f"Gap Report for {self.repo_path}", "=" * 40]`
			`by_type = {}`
			`for g in self.gaps:`
			`by_type.setdefault(g.gap_type.value, []).append(g)`

			`for gtype, items in sorted(by_type.items()):`
			`lines.append(f"\n{gtype.upper()} ({len(items)}):")`
			`for g in items:`
			`loc = f"{g.file}:{g.line}" if g.line else g.file`
			`lines.append(f" [{g.severity.value}] {g.name} @ {loc}")`
			`lines.append(f" {g.description}")`

			`lines.append(f"\nTotal gaps: {len(self.gaps)}")`
			`self.stats = {k: len(v) for k, v in by_type.items()}`
			`return "\n".join(lines)`

			`def to_dict(self) -> dict:`
			`return {`
			`"repo_path": self.repo_path,`
			`"total_gaps": len(self.gaps),`
			`"stats": {k: len(v) for k, v in`
			`{gt: [g for g in self.gaps if g.gap_type == gt]`
			`for gt in GapType}.items() if v},`
			`"gaps": [`
			`{`
			`"type": g.gap_type.value,`
			`"severity": g.severity.value,`
			`"file": g.file,`
			`"line": g.line,`
			`"name": g.name,`
			`"description": g.description,`
			`"suggestion": g.suggestion,`
			`}`
			`for g in self.gaps`
			`],`
			`}`


			`def _collect_python_files(root: Path) -> List[Path]:`
			`"""Collect .py files, excluding venv/node_modules/.git."""`
			`skip = {".git", "venv", "env", ".venv", "node_modules", "__pycache__", ".tox", ".mypy_cache"}`
			`files = []`
			`for dirpath, dirnames, filenames in os.walk(root):`
			`dirnames[:] = [d for d in dirnames if d not in skip]`
			`for f in filenames:`
			`if f.endswith(".py"):`
			`files.append(Path(dirpath) / f)`
			`return files`


			`def _extract_python_symbols(filepath: Path) -> Set[str]:`
			`"""Extract top-level function and class names from a Python file."""`
			`symbols = set()`
			`try:`
			`source = filepath.read_text(encoding="utf-8", errors="replace")`
			`tree = ast.parse(source, filename=str(filepath))`
			`except (SyntaxError, UnicodeDecodeError):`
			`return symbols`

			`for node in ast.iter_child_nodes(tree):`
			`if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):`
			`symbols.add(node.name)`
			`return symbols`


			`def _extract_doc_symbols(filepath: Path) -> Set[str]:`
			`"""Extract function/class names mentioned in markdown docs."""`
			`symbols = set()`
			`try:`
			`text = filepath.read_text(encoding="utf-8", errors="replace")`
			`except (UnicodeDecodeError, OSError):`
			`return symbols`

			# Match backtick-quoted identifiers: `ClassName`, `func_name`, `func()`
			for m in re.finditer(r"`([A-Za-z_]\w+)(?:\(\))?`", text):
			`symbols.add(m.group(1))`
			`# Match ## ClassName or ### func_name headings`
			`for m in re.finditer(r"^#{1,4}\s+(\w+)", text, re.MULTILINE):`
			`symbols.add(m.group(1))`
			`return symbols`


			`def _collect_test_files(root: Path) -> Dict[str, Path]:`
			`"""Map test module names to their file paths."""`
			`test_map = {}`
			`for dirpath, dirnames, filenames in os.walk(root):`
			`dirnames[:] = [d for d in dirnames if d not in {".git", "venv", "node_modules"}]`
			`for f in filenames:`
			`if f.startswith("test_") and f.endswith(".py"):`
			`# test_foo.py -> foo`
			`module_name = f[5:-3]`
			`test_map[module_name] = Path(dirpath) / f`
			`return test_map`


			`class KnowledgeGapIdentifier:`
			`"""Analyzes a repo for knowledge gaps between code, docs, and tests."""`

			`def analyze(self, repo_path: str) -> GapReport:`
			`root = Path(repo_path).resolve()`
			`report = GapReport(repo_path=str(root))`

			`if not root.is_dir():`
			`report.gaps.append(Gap(`
			`gap_type=GapType.UNDOCUMENTED,`
			`severity=GapSeverity.ERROR,`
			`file=str(root),`
			`line=None,`
			`name="repo",`
			`description="Path is not a directory",`
			`suggestion="Provide a valid repo directory",`
			`))`
			`return report`

			`# Collect artifacts`
			`py_files = _collect_python_files(root)`
			`doc_files = list(root.glob("docs/*/.md")) + list(root.glob("*.md"))`
			`test_map = _collect_test_files(root / "tests") if (root / "tests").is_dir() else {}`

			`# Extract symbols from each source file`
			`source_symbols: Dict[str, Set[str]] = {} # relative_path -> symbols`
			`all_source_symbols: Set[str] = set()`

			`for pf in py_files:`
			`rel = str(pf.relative_to(root))`
			`# Skip test files and setup/config`
			`if "/tests/" in rel or rel.startswith("tests/") or rel.startswith("test_"):`
			`continue`
			`if pf.name in ("setup.py", "conftest.py", "conf.py"):`
			`continue`

			`syms = _extract_python_symbols(pf)`
			`if syms:`
			`source_symbols[rel] = syms`
			`all_source_symbols.update(syms)`

			`# Extract documented symbols`
			`doc_symbols: Set[str] = set()`
			`for df in doc_files:`
			`doc_symbols.update(_extract_doc_symbols(df))`

			`# Extract test-covered symbols`
			`tested_modules: Set[str] = set(test_map.keys())`

			`# --- Find gaps ---`

			`# 1. Undocumented: source symbols not in any doc`
			`for rel_path, syms in source_symbols.items():`
			`for sym in sorted(syms):`
			`if sym.startswith("_") and not sym.startswith("__"):`
			`continue # Skip private`
			`if sym not in doc_symbols:`
			`report.gaps.append(Gap(`
			`gap_type=GapType.UNDOCUMENTED,`
			`severity=GapSeverity.WARNING,`
			`file=rel_path,`
			`line=None,`
			`name=sym,`
			`description=f"{sym} defined in {rel_path} but not referenced in any docs",`
			`suggestion=f"Add documentation for {sym} in a .md file",`
			`))`

			`# 2. Untested: source modules without a corresponding test file`
			`for rel_path in source_symbols:`
			`module_name = Path(rel_path).stem`
			`if module_name not in tested_modules and module_name not in ("__init__", "main", "config"):`
			`report.gaps.append(Gap(`
			`gap_type=GapType.UNTESTED,`
			`severity=GapSeverity.ERROR,`
			`file=rel_path,`
			`line=None,`
			`name=module_name,`
			`description=f"No test file found for {rel_path}",`
			`suggestion=f"Create tests/test_{module_name}.py",`
			`))`

			`# 3. Missing implementation: doc references symbol not in any source`
			`referenced_but_missing = doc_symbols - all_source_symbols`
			`for sym in sorted(referenced_but_missing):`
			`# Filter out common non-code terms`
			`if sym.lower() in {"todo", "fixme", "note", "example", "usage", "api",`
			`"install", "setup", "config", "license", "contributing",`
			`"changelog", "readme", "python", "bash", "json", "yaml",`
			`"http", "url", "cli", "gui", "ui", "api", "rest"}:`
			`continue`
			`if len(sym) < 3:`
			`continue`
			`report.gaps.append(Gap(`
			`gap_type=GapType.MISSING_IMPLEMENTATION,`
			`severity=GapSeverity.INFO,`
			`file="(docs)",`
			`line=None,`
			`name=sym,`
			`description=f"{sym} referenced in docs but not found in source code",`
			`suggestion=f"Verify if {sym} should be implemented or update docs",`
			`))`

			`# 4. Orphan tests: test files without matching source`
			`for test_mod, test_path in test_map.items():`
			`if test_mod not in tested_modules and not any(`
			`test_mod in Path(f).stem for f in source_symbols`
			`):`
			`# Check if any source file partially matches`
			`matches_source = any(test_mod.replace("_", "-") in f or test_mod.replace("_", "") in Path(f).stem`
			`for f in source_symbols)`
			`if not matches_source:`
			`rel = str(test_path.relative_to(root))`
			`report.gaps.append(Gap(`
			`gap_type=GapType.ORPHAN_TEST,`
			`severity=GapSeverity.WARNING,`
			`file=rel,`
			`line=None,`
			`name=test_mod,`
			`description=f"Test file {rel} exists but no matching source module found",`
			`suggestion=f"Verify if the source was renamed or removed",`
			`))`

			`return report`