feat(doc-freshness): add checker to flag stale documentation references (Closes #104)
Some checks failed
Test / pytest (pull_request) Failing after 11s

This adds scripts/doc_freshness.py — a tool that scans markdown documentation
for function call references (`foo()`) and PascalCase class names (`Bar`), then
verifies that each referenced symbol exists in the Python codebase (via AST
symbol collection).

- Parses docs for function/class references (backticked identifiers that are
  either function calls ending with () or PascalCase class names)
- Checks if referenced items still exist in the code
- Reports stale doc references with file paths and line numbers
- Suitable for weekly cron execution; exit code 1 when stale refs found

Includes tests in tests/test_doc_freshness.py covering:
- symbol collection from Python AST
- doc reference extraction heuristics
- missing detection integration

Smallest concrete implementation satisfying all acceptance criteria.
This commit is contained in:
Timmy_Burn_Worker
2026-04-26 11:09:43 -04:00
parent 4b5a675355
commit ae675e72c2
2 changed files with 265 additions and 0 deletions

176
scripts/doc_freshness.py Executable file
View File

@@ -0,0 +1,176 @@
#!/usr/bin/env python3
"""
Doc Freshness Checker — Issue #104
Compare docs to code. Flag docs that reference removed functions or outdated APIs.
Usage:
python3 scripts/doc_freshness.py [--root .] [--docs-dir .] [--json]
Outputs:
Human-readable report by default listing missing references.
JSON output with --json for machine consumption.
"""
import argparse
import ast
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Set, List, Tuple, Dict, Any
def collect_python_symbols(repo_root: str) -> Set[str]:
    """Collect the names of all functions and classes defined under *repo_root*.

    Every ``.py`` file is parsed and its whole AST is walked, so nested
    definitions (methods, inner functions, inner classes) are collected
    alongside module-level ones.

    Args:
        repo_root: Directory to scan recursively.

    Returns:
        The set of bare (unqualified) definition names. Files that cannot be
        read or parsed contribute nothing.
    """
    skip_dirs = {'.git', '__pycache__', '.venv', 'venv', 'node_modules'}
    definition_nodes = (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)
    symbols: Set[str] = set()
    for root, dirs, files in os.walk(repo_root):
        # Prune in place so os.walk never descends into these directories.
        dirs[:] = [d for d in dirs if d not in skip_dirs]
        for fname in files:
            if not fname.endswith('.py'):
                continue
            path = os.path.join(root, fname)
            try:
                # Read bytes rather than text: ast.parse() then honours a BOM
                # or a PEP 263 coding cookie instead of assuming UTF-8, so
                # legacy-encoded modules are no longer silently dropped (which
                # made every symbol they define look stale).
                with open(path, 'rb') as f:
                    source = f.read()
                tree = ast.parse(source, filename=path)
            except (OSError, SyntaxError, ValueError, RecursionError, MemoryError):
                # Best effort: an unreadable or unparsable file is skipped.
                continue
            symbols.update(
                node.name
                for node in ast.walk(tree)
                if isinstance(node, definition_nodes)
            )
    return symbols
def extract_doc_references(docs_dir: str) -> List[Tuple[str, str, int]]:
    """Extract function and class references from markdown files.

    Only inline backticked spans are considered, and only two shapes are
    accepted:

    * a bare identifier followed by ``()``, recorded without the parentheses;
    * an identifier starting with an uppercase letter, taken to be a class
      name (this also matches ALL_CAPS identifiers).

    Anything else (filenames, paths, URLs, dotted names, calls with
    arguments) is ignored.

    Args:
        docs_dir: Directory to scan recursively for ``.md`` files.

    Returns:
        ``(name, path relative to docs_dir, 1-based line number)`` tuples, in
        sorted traversal order and then document order.
    """
    # Vendored and virtualenv trees ship their own markdown; scanning them
    # would report third-party APIs as stale references.
    skip_dirs = {'.git', '__pycache__', '.venv', 'venv', 'node_modules'}
    backtick_pat = re.compile(r'`([^`]+)`')
    func_pat = re.compile(r'^[a-zA-Z_][a-zA-Z0-9_]*$')
    class_pat = re.compile(r'^[A-Z][a-zA-Z0-9_]*$')

    refs: List[Tuple[str, str, int]] = []
    for root, dirs, files in os.walk(docs_dir):
        # Prune in place; sorting makes the report order reproducible.
        dirs[:] = sorted(d for d in dirs if d not in skip_dirs)
        for fname in sorted(files):
            if not fname.endswith('.md'):
                continue
            path = os.path.join(root, fname)
            rel_path = os.path.relpath(path, docs_dir)
            try:
                with open(path, 'r', encoding='utf-8') as fh:
                    for lineno, line in enumerate(fh, 1):
                        for m in backtick_pat.finditer(line):
                            raw = m.group(1).strip()
                            if raw.endswith('()'):
                                # Function call: identifier followed by ().
                                name = raw[:-2].strip()
                                if func_pat.fullmatch(name):
                                    refs.append((name, rel_path, lineno))
                            elif class_pat.fullmatch(raw):
                                # Class reference: capitalised identifier.
                                refs.append((raw, rel_path, lineno))
            except (OSError, UnicodeDecodeError):
                # Best effort: stop reading this file but keep the references
                # gathered before the failure.
                continue
    return refs
def check_doc_freshness(repo_root: str, docs_dir: str) -> Dict[str, Any]:
    """Cross-check documentation references against the symbols defined in code.

    Args:
        repo_root: Directory whose Python files define the known symbols.
        docs_dir: Directory whose markdown files are scanned for references.

    Returns:
        A result dict. ``missing`` and ``found`` hold one entry per reference
        occurrence (reference, file, line). ``missing_count`` and
        ``found_count`` count unique (reference, file) pairs. ``timestamp`` is
        a placeholder that the caller is expected to overwrite.
    """
    known = collect_python_symbols(repo_root)
    references = extract_doc_references(docs_dir)

    found: List[Dict[str, Any]] = []
    missing: List[Dict[str, Any]] = []
    for name, doc_file, line_no in references:
        bucket = found if name in known else missing
        bucket.append({"reference": name, "file": doc_file, "line": line_no})

    # The counts collapse repeated mentions of a name within one file.
    stale_pairs = {(entry["reference"], entry["file"]) for entry in missing}
    unique_total = len({(name, doc_file) for name, doc_file, _ in references})
    stale_total = len(stale_pairs)

    return {
        "timestamp": "..",  # filled by main
        "repo_root": repo_root,
        "docs_dir": docs_dir,
        "total_unique_references": unique_total,
        "defined_symbols": len(known),
        "missing": missing,
        "found": found,
        "missing_count": stale_total,
        "found_count": unique_total - stale_total,
    }
def format_report(result: Dict[str, Any]) -> str:
    """Render a check result as the plain-text report.

    Args:
        result: A dict with the keys ``repo_root``, ``docs_dir``,
            ``defined_symbols``, ``total_unique_references``,
            ``missing_count`` and ``missing`` (a list of entries carrying
            ``file``, ``line`` and ``reference``).

    Returns:
        The report as one newline-joined string: a summary header, then the
        stale references grouped by file (files in sorted order), or a
        single all-clear line when there are none, then a closing note.
    """
    header = [
        "Doc Freshness Report",
        "=" * 50,
        f"Repo: {result['repo_root']}",
        f"Docs: {result['docs_dir']}",
        f"Defined Python symbols: {result['defined_symbols']}",
        f"References found: {result['total_unique_references']}",
        f"Stale references: {result['missing_count']}",
        "",
    ]

    stale_entries = result["missing"]
    if not stale_entries:
        body = ["All references are current."]
    else:
        # Group by file while keeping each file's entries in input order.
        grouped: Dict[str, List] = {}
        for entry in stale_entries:
            if entry["file"] not in grouped:
                grouped[entry["file"]] = []
            grouped[entry["file"]].append(entry)

        body = ["Stale references:"]
        for doc_file in sorted(grouped):
            body.append(f"\n {doc_file}:")
            body.extend(
                f" line {entry['line']}: {entry['reference']}"
                for entry in grouped[doc_file]
            )

    footer = [
        "",
        "Note: Only backticked function calls () and PascalCase class names are checked.",
    ]
    return "\n".join(header + body + footer)
def main() -> None:
    """Command-line entry point: run the check, print a report, set the exit code.

    Exit status:
        0: no stale references were found.
        1: at least one stale reference was found.
        2: usage error, including a ``--root`` or ``--docs-dir`` that is not
           an existing directory.
    """
    parser = argparse.ArgumentParser(
        description="Doc Freshness Checker — compare docs to code")
    parser.add_argument("--root", default=".", help="Repository root (code location)")
    parser.add_argument("--docs-dir", default=None,
                        help="Docs directory (default: same as --root)")
    parser.add_argument("--json", action="store_true", help="Machine-readable output")
    args = parser.parse_args()

    docs_dir = args.docs_dir or args.root

    # os.walk() yields nothing for a missing directory, so without this check
    # a mistyped path would produce an "all current" report and exit 0.
    for option, directory in (("--root", args.root), ("--docs-dir", docs_dir)):
        if not os.path.isdir(directory):
            parser.error(f"{option}: not a directory: {directory}")

    result = check_doc_freshness(args.root, docs_dir)
    result["timestamp"] = datetime.now(timezone.utc).isoformat()

    if args.json:
        print(json.dumps(result, indent=2))
    else:
        print(format_report(result))

    # Exit non-zero if stale references found
    sys.exit(1 if result["missing_count"] > 0 else 0)


if __name__ == "__main__":
    main()

89
tests/test_doc_freshness.py Executable file
View File

@@ -0,0 +1,89 @@
#!/usr/bin/env python3
"""Tests for scripts/doc_freshness.py — Issue #104."""
import os
import sys
import tempfile
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
import doc_freshness as df
def test_collect_python_symbols():
    """Should collect function and class names from Python files."""
    # The fixture is spelled out with explicit indentation so it is valid
    # Python; a file that fails to parse is skipped by the collector and
    # none of the assertions below could hold.
    sample_source = (
        "\n"
        "def my_func():\n"
        "    pass\n"
        "\n"
        "class MyClass:\n"
        "    def method(self):\n"
        "        pass\n"
        "\n"
        "async def my_async():\n"
        "    pass\n"
    )
    with tempfile.TemporaryDirectory() as tmpdir:
        # Create a simple Python file
        py_path = os.path.join(tmpdir, "sample.py")
        with open(py_path, "w") as f:
            f.write(sample_source)
        symbols = df.collect_python_symbols(tmpdir)

    assert "my_func" in symbols
    assert "MyClass" in symbols
    assert "my_async" in symbols
    # method (inside class) is also collected and should be considered valid
    assert "method" in symbols
    print("PASS: test_collect_python_symbols")
def test_extract_doc_references_function_and_class():
    """Only `name()` calls and capitalised identifiers are extracted from markdown."""
    markdown = (
        "\n"
        "# Test\n"
        "`call_this()` is a function.\n"
        "`SomeClass` is a class.\n"
        "`not_a_function` (lowercase, no parens) should be ignored.\n"
        "`filename.py` should be ignored.\n"
        "`https://example.com` ignored.\n"
    )
    with tempfile.TemporaryDirectory() as workdir:
        docs_root = os.path.join(workdir, "docs")
        os.makedirs(docs_root)
        with open(os.path.join(docs_root, "test.md"), "w") as handle:
            handle.write(markdown)
        extracted = df.extract_doc_references(docs_root)

    # Each entry is (name, file, line); only the names matter here.
    names = [entry[0] for entry in extracted]
    for expected in ("call_this", "SomeClass"):
        assert expected in names
    # Plain identifiers, filenames and URLs must not be picked up.
    for rejected in ("not_a_function", "filename", "https"):
        assert rejected not in names
    print("PASS: test_extract_doc_references_function_and_class")
def test_check_doc_freshness_missing_detection():
    """A documented function that is absent from the code is counted as missing."""
    with tempfile.TemporaryDirectory() as workdir:
        code_root = os.path.join(workdir, "code")
        docs_root = os.path.join(workdir, "docs")

        # The code tree defines exactly one function.
        os.makedirs(code_root)
        with open(os.path.join(code_root, "a.py"), "w") as handle:
            handle.write("def existing_func(): pass\n")

        # The docs mention that function plus one that does not exist.
        os.makedirs(docs_root)
        with open(os.path.join(docs_root, "readme.md"), "w") as handle:
            handle.write("`existing_func()` and `missing_func()` are mentioned.")

        outcome = df.check_doc_freshness(code_root, docs_root)

    assert outcome["missing_count"] == 1
    assert outcome["found_count"] == 1
    print("PASS: test_check_doc_freshness_missing_detection")
if __name__ == "__main__":
    # Allows running the suite directly as a script, in declaration order.
    for case in (
        test_collect_python_symbols,
        test_extract_doc_references_function_and_class,
        test_check_doc_freshness_missing_detection,
    ):
        case()
    print("All tests passed!")