Some checks failed
Test / pytest (pull_request) Failing after 11s
This adds `scripts/doc_freshness.py` — a tool that scans markdown documentation for function call references (`foo()`) and PascalCase class names (`Bar`), then verifies that each referenced symbol exists in the Python codebase (via AST symbol collection).

- Parses docs for function/class references (backticked identifiers that are either function calls ending with `()` or PascalCase class names)
- Checks whether the referenced items still exist in the code
- Reports stale doc references with file paths and line numbers
- Suitable for weekly cron execution; exits with code 1 when stale references are found

Includes tests in `tests/test_doc_freshness.py` covering:

- symbol collection from the Python AST
- doc reference extraction heuristics
- missing-reference detection (integration)

Smallest concrete implementation satisfying all acceptance criteria.
177 lines
6.1 KiB
Python
Executable File
177 lines
6.1 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Doc Freshness Checker — Issue #104

Compare docs to code. Flag docs that reference removed functions or outdated APIs.

Usage:
    python3 scripts/doc_freshness.py [--root .] [--docs-dir .] [--json]

Outputs:
    Human-readable report by default, listing missing references.
    JSON output with --json for machine consumption.
|
|
|
|
"""
|
|
|
|
import argparse
|
|
import ast
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Set, List, Tuple, Dict, Any
|
|
|
|
|
|
def collect_python_symbols(repo_root: str) -> Set[str]:
    """Collect the names of all functions and classes defined in Python files.

    Walks ``repo_root`` recursively, parses every ``*.py`` file with ``ast``
    and records the name of each ``def``, ``async def`` and ``class``
    statement at any nesting depth. Methods and nested definitions are
    therefore included, not only top-level ones. Names are unqualified.

    Args:
        repo_root: Directory to scan.

    Returns:
        The set of definition names found. Files that cannot be read or
        parsed are skipped and contribute nothing.
    """
    skipped_dirs = {'.git', '__pycache__', '.venv', 'venv', 'node_modules'}
    definition_nodes = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)

    symbols: Set[str] = set()
    for root, dirs, files in os.walk(repo_root):
        # Prune in place so os.walk does not descend into these directories.
        dirs[:] = [d for d in dirs if d not in skipped_dirs]
        for fname in files:
            if not fname.endswith('.py'):
                continue
            path = os.path.join(root, fname)
            try:
                # Give ast.parse the raw bytes so Python's own source-decoding
                # rules apply (UTF-8 BOM, PEP 263 coding declarations).
                # Decoding as UTF-8 text first makes such files unparsable and
                # their symbols would then be reported as stale.
                with open(path, 'rb') as f:
                    tree = ast.parse(f.read(), filename=path)
            except (OSError, SyntaxError, ValueError, RecursionError):
                # Best effort: unreadable or unparsable files are skipped.
                continue
            symbols.update(
                node.name
                for node in ast.walk(tree)
                if isinstance(node, definition_nodes)
            )
    return symbols
|
|
|
|
|
|
def extract_doc_references(docs_dir: str) -> List[Tuple[str, str, int]]:
    """Walk markdown files and extract function/class references.

    Every inline backticked span in each ``*.md`` file under ``docs_dir`` is
    examined. A span is recorded when, after stripping whitespace, it is:

    * a function call: a plain identifier followed by ``()``; the name
      without the parentheses is recorded. Dotted or argument-bearing calls
      such as ``obj.method()`` or ``foo(1)`` are ignored.
    * a class-like name: an identifier starting with an uppercase ASCII
      letter.

    Anything else (filenames, paths, URLs, JSON fields, ...) is ignored.

    Args:
        docs_dir: Directory to scan recursively.

    Returns:
        A list of ``(name, path relative to docs_dir, 1-based line number)``
        tuples, one per occurrence, in the order encountered.
    """
    # Prune vendored and tooling directories. Python files in them are not
    # treated as part of the codebase, so every reference in their bundled
    # markdown would be reported as stale.
    skipped_dirs = {'.git', '__pycache__', '.venv', 'venv', 'node_modules'}

    backtick_pat = re.compile(r'`([^`]+)`')
    func_pat = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$')
    # NOTE(review): this also accepts ALL_CAPS names such as `API` or
    # `MAX_SIZE`, which is wider than PascalCase - tighten if those turn out
    # to produce false positives.
    class_pat = re.compile(r'^[A-Z][a-zA-Z0-9_]*$')

    refs: List[Tuple[str, str, int]] = []
    for root, dirs, files in os.walk(docs_dir):
        dirs[:] = [d for d in dirs if d not in skipped_dirs]
        for fname in files:
            if not fname.endswith('.md'):
                continue
            path = os.path.join(root, fname)
            rel_path = os.path.relpath(path, docs_dir)
            try:
                # errors='replace' keeps a single undecodable byte from
                # discarding the references in the rest of the file.
                with open(path, 'r', encoding='utf-8', errors='replace') as fh:
                    for lineno, line in enumerate(fh, 1):
                        for m in backtick_pat.finditer(line):
                            raw = m.group(1).strip()
                            # Function call: ends with ()
                            if raw.endswith('()'):
                                name = raw[:-2].strip()
                                if func_pat.fullmatch(name):
                                    refs.append((name, rel_path, lineno))
                                continue
                            # Class reference: capitalised identifier
                            if class_pat.fullmatch(raw):
                                refs.append((raw, rel_path, lineno))
            except OSError:
                # Best effort: an unreadable file is skipped; references
                # already collected from it are kept.
                continue

    return refs
|
|
|
|
|
|
def check_doc_freshness(repo_root: str, docs_dir: str) -> Dict[str, Any]:
    """Compare documentation references against the symbols defined in code.

    Args:
        repo_root: Directory whose Python files supply the known symbols.
        docs_dir: Directory whose markdown files supply the references.

    Returns:
        A dict with the scanned paths, a placeholder ``timestamp``, the
        per-occurrence ``missing`` and ``found`` lists (each entry has
        ``reference``, ``file`` and ``line``), and counts. The counts are over
        unique ``(reference, file)`` pairs, so they can be smaller than the
        lengths of the occurrence lists.
    """
    known = collect_python_symbols(repo_root)
    references = extract_doc_references(docs_dir)

    # NOTE(review): only names defined in the scanned Python files count as
    # known, so references to builtins such as `print()` land in "missing" -
    # confirm that this is intended.
    found: List[Dict[str, Any]] = []
    missing: List[Dict[str, Any]] = []
    for name, doc_file, line_number in references:
        entry = {"reference": name, "file": doc_file, "line": line_number}
        bucket = found if name in known else missing
        bucket.append(entry)

    # A reference repeated within one file counts once in the summary numbers.
    stale_pairs = {(entry["reference"], entry["file"]) for entry in missing}
    unique_pairs = {(name, doc_file) for name, doc_file, _ in references}
    total_unique_refs = len(unique_pairs)

    return {
        "timestamp": "..",  # filled by main
        "repo_root": repo_root,
        "docs_dir": docs_dir,
        "total_unique_references": total_unique_refs,
        "defined_symbols": len(known),
        "missing": missing,
        "found": found,
        "missing_count": len(stale_pairs),
        "found_count": total_unique_refs - len(stale_pairs),
    }
|
|
|
|
|
|
def format_report(result: Dict[str, Any]) -> str:
    """Render a check result as a plain-text report.

    Args:
        result: Mapping providing ``repo_root``, ``docs_dir``,
            ``defined_symbols``, ``total_unique_references``,
            ``missing_count`` and ``missing`` (a list of entries with
            ``file``, ``line`` and ``reference``).

    Returns:
        The report as one newline-joined string: a summary header, then
        either the stale references grouped by file (files sorted by name,
        entries in their original order) or an all-clear line, then a note
        on what is checked.
    """
    header = [
        "Doc Freshness Report",
        "=" * 50,
        f"Repo: {result['repo_root']}",
        f"Docs: {result['docs_dir']}",
        f"Defined Python symbols: {result['defined_symbols']}",
        f"References found: {result['total_unique_references']}",
        f"Stale references: {result['missing_count']}",
        "",
    ]

    stale = result["missing"]
    if not stale:
        body = ["All references are current."]
    else:
        grouped: Dict[str, List] = {}
        for entry in stale:
            grouped.setdefault(entry["file"], []).append(entry)

        body = ["Stale references:"]
        for doc_file in sorted(grouped):
            body.append(f"\n {doc_file}:")
            body.extend(
                f" line {entry['line']}: {entry['reference']}"
                for entry in grouped[doc_file]
            )

    footer = [
        "",
        "Note: Only backticked function calls () and PascalCase class names are checked.",
    ]
    return "\n".join(header + body + footer)
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point.

    Parses ``--root``, ``--docs-dir`` and ``--json``, runs the freshness
    check, prints either the JSON result or the human-readable report, and
    exits with status 1 when stale references were found, 0 otherwise.

    Raises:
        SystemExit: Always. The status is 0 or 1 as described above, or 2
            (via ``argparse``) when the arguments are invalid or a given
            directory does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Doc Freshness Checker — compare docs to code")
    parser.add_argument("--root", default=".", help="Repository root (code location)")
    parser.add_argument("--docs-dir", default=None,
                        help="Docs directory (default: same as --root)")
    parser.add_argument("--json", action="store_true", help="Machine-readable output")
    args = parser.parse_args()

    docs_dir = args.docs_dir or args.root

    # os.walk yields nothing for a missing directory, so a mistyped path would
    # otherwise produce an empty scan and a false "all current" result with
    # exit status 0.
    for option, directory in (("--root", args.root), ("--docs-dir", docs_dir)):
        if not os.path.isdir(directory):
            parser.error(f"{option}: not a directory: {directory}")

    result = check_doc_freshness(args.root, docs_dir)
    result["timestamp"] = datetime.now(timezone.utc).isoformat()

    if args.json:
        print(json.dumps(result, indent=2))
    else:
        print(format_report(result))

    # Exit non-zero if stale references found
    sys.exit(1 if result["missing_count"] > 0 else 0)
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|