Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Payne
ceb7e0bd0c feat: add doc link validator script (closes #103)
Some checks failed
Test / pytest (pull_request) Failing after 30s
Add scripts/validate_doc_links.py — scans all markdown files in the
repository, extracts inline and autolinks, and verifies each URL via
HTTP HEAD request (with GET fallback for servers that reject HEAD).

Features:
  --root           : repository root to scan (default: repo root)
  --fail-on-broken : exit 1 if any broken links found
  --json           : emit JSON report for CI consumption
  --ignore         : comma-separated URL prefixes to skip

Ignores non-HTTP URLs, localhost/127.0.0.1, and private IP ranges.
Requires only Python stdlib — no external dependencies.

Smoke-tested against this repo: 2 unique URLs checked, 0 broken.
Addresses 4.8: Doc Link Validator acceptance criteria.

Closes #103
2026-04-25 20:55:19 -04:00
5 changed files with 131 additions and 795 deletions

View File

@@ -1,95 +0,0 @@
# Architecture: STEP35-compounding-intelligence-99
**Generated by:** `scripts/architecture_doc_generator.py`
## Entry Points
- `scripts/architecture_doc_generator.py`
- `scripts/refactoring_opportunity_finder.py`
- `scripts/automation_opportunity_finder.py`
- `scripts/bootstrapper.py`
- `scripts/dead_code_detector.py`
- `scripts/dedup.py`
- `scripts/dependency_graph.py`
- `scripts/freshness.py`
- `scripts/gitea_issue_parser.py`
- `scripts/harvester.py`
- `scripts/improvement_proposals.py`
- `scripts/knowledge_staleness_check.py`
- `scripts/perf_bottleneck_finder.py`
- `scripts/pr_complexity_scorer.py`
- `scripts/priority_rebalancer.py`
- `quality_gate.py`
- `scripts/sampler.py`
- `scripts/session_metadata.py`
- `scripts/session_pair_harvester.py`
- `scripts/session_reader.py`
- `scripts/test_automation_opportunity_finder.py`
- `scripts/test_bootstrapper.py`
- `scripts/test_diff_analyzer.py`
- `tests/test_freshness.py`
- `scripts/test_gitea_issue_parser.py`
- `scripts/test_harvest_prompt.py`
- `scripts/test_harvest_prompt_comprehensive.py`
- `scripts/test_harvester_pipeline.py`
- `scripts/test_improvement_proposals.py`
- `tests/test_knowledge_gap_identifier.py`
- `scripts/test_knowledge_staleness.py`
- `tests/test_quality_gate.py`
- `scripts/test_refactoring_opportunity_finder.py`
- `scripts/test_session_pair_harvester.py`
- `scripts/validate_knowledge.py`
## Module Dependencies
| Module | Imports |
|--------|---------|
| `quality_gate` | `quality_gate` |
| `scripts.harvester` | `scripts.session_reader` |
| `scripts.session_metadata` | `scripts.session_reader` |
| `scripts.test_bootstrapper` | `scripts.bootstrapper` |
| `scripts.test_harvester_pipeline` | `scripts.harvester, scripts.session_reader` |
| `scripts.test_pr_complexity_scorer` | `scripts.pr_complexity_scorer` |
| `scripts.test_priority_rebalancer` | `scripts.priority_rebalancer` |
| `scripts.test_session_pair_harvester` | `scripts.session_pair_harvester` |
| `tests.test_dedup` | `scripts.dedup` |
| `tests.test_knowledge_gap_identifier` | `scripts.knowledge_gap_identifier` |
| `tests.test_perf_bottleneck_finder` | `scripts.perf_bottleneck_finder` |
| `tests.test_quality_gate` | `quality_gate` |
## ASCII Diagram
```
*quality_gate*
└─> quality_gate
*scripts.bootstrapper*
*scripts.dedup*
*scripts.harvester*
└─> scripts.session_reader
[scripts.knowledge_gap_identifier]
*scripts.perf_bottleneck_finder*
*scripts.pr_complexity_scorer*
*scripts.priority_rebalancer*
*scripts.session_metadata*
└─> scripts.session_reader
*scripts.session_pair_harvester*
*scripts.session_reader*
*scripts.test_bootstrapper*
└─> scripts.bootstrapper
*scripts.test_harvester_pipeline*
└─> scripts.harvester
└─> scripts.session_reader
[scripts.test_pr_complexity_scorer]
└─> scripts.pr_complexity_scorer
[scripts.test_priority_rebalancer]
└─> scripts.priority_rebalancer
*scripts.test_session_pair_harvester*
└─> scripts.session_pair_harvester
[tests.test_dedup]
└─> scripts.dedup
*tests.test_knowledge_gap_identifier*
└─> scripts.knowledge_gap_identifier
[tests.test_perf_bottleneck_finder]
└─> scripts.perf_bottleneck_finder
*tests.test_quality_gate*
└─> quality_gate
```
_Generated automatically. Keep this file in sync with code changes by re-running the generator._

View File

@@ -1,179 +0,0 @@
#!/usr/bin/env python3
"""
Architecture Doc Generator — 4.4
Analyzes codebase structure and generates an architecture overview:
- Maps module dependencies (Python imports within the repo)
- Identifies entry points (main guards, CLI scripts)
- Generates ASCII diagram of module relationships
- Produces one ARCHITECTURE.md per repo
Usage:
python3 scripts/architecture_doc_generator.py [repo_root]
If no repo_root given, uses current directory.
Outputs ARCHITECTURE.md to the repo root.
"""
import argparse
import re
import sys
from collections import defaultdict
from pathlib import Path
def scan_python_files(root: Path):
"""Find all .py files under root, excluding tests/ and .git/."""
py_files = []
for path in root.rglob("*.py"):
parts = path.parts
if any(p.startswith('.') for p in parts if p != '.'):
continue
if 'test' in parts:
continue
if any(x in parts for x in ('venv', 'node_modules', '__pycache__', 'dist', 'build')):
continue
py_files.append(path)
return sorted(py_files)
def module_id(path: Path, root: Path) -> str:
"""Return a readable module identifier."""
rel = path.relative_to(root)
if rel.parent == Path('.'):
return path.stem
return str(rel.with_suffix('')).replace('/', '.')
def extract_imports(path: Path) -> list[str]:
"""Extract top-level import names from a Python file."""
try:
text = path.read_text(errors='ignore')
except Exception:
return []
imports = set()
# import X or import X.Y.Z
for m in re.finditer(r'^\s*import\s+([a-zA-Z0-9_.]+)', text, re.MULTILINE):
imports.add(m.group(1).split('.')[0])
# from X import Y (handles absolute and relative: from .X import Y)
for m in re.finditer(r'^\s*from\s+(\.+)?([a-zA-Z0-9_.]+)\s+import', text, re.MULTILINE):
imports.add(m.group(2).split('.')[0])
return sorted(imports)
def build_dependency_graph(py_files: list[Path], root: Path) -> dict[str, set[str]]:
"""Build adjacency: local_module -> set(local_modules it imports)."""
graph = defaultdict(set)
# Collect all local module identifiers
local_ids = set()
for p in py_files:
local_ids.add(module_id(p, root))
for path in py_files:
src_mod = module_id(path, root)
for imp in extract_imports(path):
# Match import to a local module by stem or by full dotted prefix
target = None
# Exact match
if imp in local_ids:
target = imp
else:
# Find module whose stem equals imp, or whose dotted name ends with .imp
for mid in local_ids:
if mid.split('.')[-1] == imp or mid == imp:
target = mid
break
if target:
graph[src_mod].add(target)
return {k: sorted(v) for k, v in graph.items()}
def find_entry_points(py_files: list[Path]) -> list[Path]:
"""Files with if __name__ == '__main__' guard or executable scripts."""
entries = []
for path in py_files:
try:
text = path.read_text(errors='ignore')
except Exception:
continue
if 'if __name__' in text and '__main__' in text:
entries.append(path)
return sorted(entries, key=lambda p: (not (p.stat().st_mode & 0o111), p.name))
def ascii_diagram(graph: dict[str, list[str]], entries: list[Path], root: Path) -> str:
"""Generate a simple ASCII box-and-arrow diagram."""
lines = []
entry_names = {module_id(p, root) for p in entries}
# All nodes
nodes = sorted(set(graph.keys()) | set().union(*graph.values()))
for node in nodes:
is_entry = node in entry_names
label = f"*{node}*" if is_entry else f"[{node}]"
lines.append(label)
for dep in graph.get(node, []):
lines.append(f" └─> {dep}")
return '\n'.join(lines)
def generate_markdown(root: Path, graph: dict, entries: list[Path], diagram: str) -> str:
root_name = root.name
md = []
md.append(f"# Architecture: {root_name}")
md.append("")
md.append("**Generated by:** `scripts/architecture_doc_generator.py`")
md.append("")
md.append("## Entry Points")
if entries:
for p in entries:
rel = p.relative_to(root)
md.append(f"- `{rel}`")
else:
md.append("_No entry points detected._")
md.append("")
md.append("## Module Dependencies")
if graph:
md.append("| Module | Imports |")
md.append("|--------|---------|")
for mod in sorted(graph.keys()):
deps = ', '.join(sorted(graph[mod])) if graph[mod] else '_none_'
md.append(f"| `{mod}` | `{deps}` |")
else:
md.append("_No dependencies detected._")
md.append("")
md.append("## ASCII Diagram")
md.append("```")
md.append(diagram)
md.append("```")
md.append("")
md.append("_Generated automatically. Keep this file in sync with code changes by re-running the generator._")
return '\n'.join(md)
def main():
parser = argparse.ArgumentParser(description="Generate architecture documentation")
parser.add_argument("repo_root", nargs="?", default=".", help="Repository root (default: current directory)")
args = parser.parse_args()
root = Path(args.repo_root).resolve()
py_files = scan_python_files(root)
if not py_files:
print("No Python files found — nothing to do.", file=sys.stderr)
sys.exit(1)
graph = build_dependency_graph(py_files, root)
entries = find_entry_points(py_files)
diagram = ascii_diagram(graph, entries, root)
markdown = generate_markdown(root, graph, entries, diagram)
out_path = root / "ARCHITECTURE.md"
out_path.write_text(markdown, encoding='utf-8')
print(f"Written: {out_path}")
print(f" Modules scanned: {len(py_files)}")
print(f" Entry points: {len(entries)}")
print(f" Dependency edges: {sum(len(v) for v in graph.values())}")
if __name__ == "__main__":
main()

View File

@@ -1,351 +0,0 @@
#!/usr/bin/env python3
"""
PR Complexity Scorer - Estimate review effort for PRs.
"""
import argparse
import json
import os
import re
import sys
from dataclasses import dataclass, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
import urllib.request
import urllib.error
GITEA_BASE = "https://forge.alexanderwhitestone.com/api/v1"
DEPENDENCY_FILES = {
"requirements.txt", "pyproject.toml", "setup.py", "setup.cfg",
"Pipfile", "poetry.lock", "package.json", "yarn.lock", "Gemfile",
"go.mod", "Cargo.toml", "pom.xml", "build.gradle"
}
TEST_PATTERNS = [
r"tests?/.*\.py$", r".*_test\.py$", r"test_.*\.py$",
r"spec/.*\.rb$", r".*_spec\.rb$",
r"__tests__/", r".*\.test\.(js|ts|jsx|tsx)$"
]
WEIGHT_FILES = 0.25
WEIGHT_LINES = 0.25
WEIGHT_DEPS = 0.30
WEIGHT_TEST_COV = 0.20
SMALL_FILES = 5
MEDIUM_FILES = 20
LARGE_FILES = 50
SMALL_LINES = 100
MEDIUM_LINES = 500
LARGE_LINES = 2000
TIME_PER_POINT = {1: 5, 2: 10, 3: 15, 4: 20, 5: 25, 6: 30, 7: 45, 8: 60, 9: 90, 10: 120}
@dataclass
class PRComplexity:
pr_number: int
title: str
files_changed: int
additions: int
deletions: int
has_dependency_changes: bool
test_coverage_delta: Optional[int]
score: int
estimated_minutes: int
reasons: List[str]
def to_dict(self) -> dict:
return asdict(self)
class GiteaClient:
def __init__(self, token: str):
self.token = token
self.base_url = GITEA_BASE.rstrip("/")
def _request(self, path: str, params: Dict = None) -> Any:
url = f"{self.base_url}{path}"
if params:
qs = "&".join(f"{k}={v}" for k, v in params.items() if v is not None)
url += f"?{qs}"
req = urllib.request.Request(url)
req.add_header("Authorization", f"token {self.token}")
req.add_header("Content-Type", "application/json")
try:
with urllib.request.urlopen(req, timeout=30) as resp:
return json.loads(resp.read().decode())
except urllib.error.HTTPError as e:
print(f"API error {e.code}: {e.read().decode()[:200]}", file=sys.stderr)
return None
except urllib.error.URLError as e:
print(f"Network error: {e}", file=sys.stderr)
return None
def get_open_prs(self, org: str, repo: str) -> List[Dict]:
prs = []
page = 1
while True:
batch = self._request(f"/repos/{org}/{repo}/pulls", {"limit": 50, "page": page, "state": "open"})
if not batch:
break
prs.extend(batch)
if len(batch) < 50:
break
page += 1
return prs
def get_pr_files(self, org: str, repo: str, pr_number: int) -> List[Dict]:
files = []
page = 1
while True:
batch = self._request(
f"/repos/{org}/{repo}/pulls/{pr_number}/files",
{"limit": 100, "page": page}
)
if not batch:
break
files.extend(batch)
if len(batch) < 100:
break
page += 1
return files
def post_comment(self, org: str, repo: str, pr_number: int, body: str) -> bool:
data = json.dumps({"body": body}).encode("utf-8")
req = urllib.request.Request(
f"{self.base_url}/repos/{org}/{repo}/issues/{pr_number}/comments",
data=data,
method="POST",
headers={"Authorization": f"token {self.token}", "Content-Type": "application/json"}
)
try:
with urllib.request.urlopen(req, timeout=30) as resp:
return resp.status in (200, 201)
except urllib.error.HTTPError:
return False
def is_dependency_file(filename: str) -> bool:
return any(filename.endswith(dep) for dep in DEPENDENCY_FILES)
def is_test_file(filename: str) -> bool:
return any(re.search(pattern, filename) for pattern in TEST_PATTERNS)
def score_pr(
files_changed: int,
additions: int,
deletions: int,
has_dependency_changes: bool,
test_coverage_delta: Optional[int] = None
) -> tuple[int, int, List[str]]:
score = 1.0
reasons = []
# Files changed
if files_changed <= SMALL_FILES:
fscore = 1.0
reasons.append("small number of files changed")
elif files_changed <= MEDIUM_FILES:
fscore = 2.0
reasons.append("moderate number of files changed")
elif files_changed <= LARGE_FILES:
fscore = 2.5
reasons.append("large number of files changed")
else:
fscore = 3.0
reasons.append("very large PR spanning many files")
# Lines changed
total_lines = additions + deletions
if total_lines <= SMALL_LINES:
lscore = 1.0
reasons.append("small change size")
elif total_lines <= MEDIUM_LINES:
lscore = 2.0
reasons.append("moderate change size")
elif total_lines <= LARGE_LINES:
lscore = 3.0
reasons.append("large change size")
else:
lscore = 4.0
reasons.append("very large change")
# Dependency changes
if has_dependency_changes:
dscore = 2.5
reasons.append("dependency changes (architectural impact)")
else:
dscore = 0.0
# Test coverage delta
tscore = 0.0
if test_coverage_delta is not None:
if test_coverage_delta > 0:
reasons.append(f"test additions (+{test_coverage_delta} test files)")
tscore = -min(2.0, test_coverage_delta / 2.0)
elif test_coverage_delta < 0:
reasons.append(f"test removals ({abs(test_coverage_delta)} test files)")
tscore = min(2.0, abs(test_coverage_delta) * 0.5)
else:
reasons.append("test coverage change not assessed")
# Weighted sum, scaled by 3 to use full 1-10 range
bonus = (fscore * WEIGHT_FILES) + (lscore * WEIGHT_LINES) + (dscore * WEIGHT_DEPS) + (tscore * WEIGHT_TEST_COV)
scaled_bonus = bonus * 3.0
score = 1.0 + scaled_bonus
final_score = max(1, min(10, int(round(score))))
est_minutes = TIME_PER_POINT.get(final_score, 30)
return final_score, est_minutes, reasons
def analyze_pr(client: GiteaClient, org: str, repo: str, pr_data: Dict) -> PRComplexity:
pr_num = pr_data["number"]
title = pr_data.get("title", "")
files = client.get_pr_files(org, repo, pr_num)
additions = sum(f.get("additions", 0) for f in files)
deletions = sum(f.get("deletions", 0) for f in files)
filenames = [f.get("filename", "") for f in files]
has_deps = any(is_dependency_file(f) for f in filenames)
test_added = sum(1 for f in files if f.get("status") == "added" and is_test_file(f.get("filename", "")))
test_removed = sum(1 for f in files if f.get("status") == "removed" and is_test_file(f.get("filename", "")))
test_delta = test_added - test_removed if (test_added or test_removed) else None
score, est_min, reasons = score_pr(
files_changed=len(files),
additions=additions,
deletions=deletions,
has_dependency_changes=has_deps,
test_coverage_delta=test_delta
)
return PRComplexity(
pr_number=pr_num,
title=title,
files_changed=len(files),
additions=additions,
deletions=deletions,
has_dependency_changes=has_deps,
test_coverage_delta=test_delta,
score=score,
estimated_minutes=est_min,
reasons=reasons
)
def build_comment(complexity: PRComplexity) -> str:
change_desc = f"{complexity.files_changed} files, +{complexity.additions}/-{complexity.deletions} lines"
deps_note = "\n- :warning: Dependency changes detected — architectural review recommended" if complexity.has_dependency_changes else ""
test_note = ""
if complexity.test_coverage_delta is not None:
if complexity.test_coverage_delta > 0:
test_note = f"\n- :+1: {complexity.test_coverage_delta} test file(s) added"
elif complexity.test_coverage_delta < 0:
test_note = f"\n- :warning: {abs(complexity.test_coverage_delta)} test file(s) removed"
comment = f"## 📊 PR Complexity Analysis\n\n"
comment += f"**PR #{complexity.pr_number}: {complexity.title}**\n\n"
comment += f"| Metric | Value |\n|--------|-------|\n"
comment += f"| Changes | {change_desc} |\n"
comment += f"| Complexity Score | **{complexity.score}/10** |\n"
comment += f"| Estimated Review Time | ~{complexity.estimated_minutes} minutes |\n\n"
comment += f"### Scoring rationale:"
for r in complexity.reasons:
comment += f"\n- {r}"
if deps_note:
comment += deps_note
if test_note:
comment += test_note
comment += f"\n\n---\n"
comment += f"*Generated by PR Complexity Scorer — [issue #135](https://forge.alexanderwhitestone.com/Timmy_Foundation/compounding-intelligence/issues/135)*"
return comment
def main():
parser = argparse.ArgumentParser(description="PR Complexity Scorer")
parser.add_argument("--org", default="Timmy_Foundation")
parser.add_argument("--repo", default="compounding-intelligence")
parser.add_argument("--token", default=os.environ.get("GITEA_TOKEN") or os.path.expanduser("~/.config/gitea/token"))
parser.add_argument("--dry-run", action="store_true")
parser.add_argument("--apply", action="store_true")
parser.add_argument("--output", default="metrics/pr_complexity.json")
args = parser.parse_args()
token_path = args.token
if os.path.exists(token_path):
with open(token_path) as f:
token = f.read().strip()
else:
token = args.token
if not token:
print("ERROR: No Gitea token provided", file=sys.stderr)
sys.exit(1)
client = GiteaClient(token)
print(f"Fetching open PRs for {args.org}/{args.repo}...")
prs = client.get_open_prs(args.org, args.repo)
if not prs:
print("No open PRs found.")
sys.exit(0)
print(f"Found {len(prs)} open PR(s). Analyzing...")
results = []
Path(args.output).parent.mkdir(parents=True, exist_ok=True)
for pr in prs:
pr_num = pr["number"]
title = pr.get("title", "")
print(f" Analyzing PR #{pr_num}: {title[:60]}")
try:
complexity = analyze_pr(client, args.org, args.repo, pr)
results.append(complexity.to_dict())
comment = build_comment(complexity)
if args.dry_run:
print(f" → Score: {complexity.score}/10, Est: {complexity.estimated_minutes}min [DRY-RUN]")
elif args.apply:
success = client.post_comment(args.org, args.repo, pr_num, comment)
status = "[commented]" if success else "[FAILED]"
print(f" → Score: {complexity.score}/10, Est: {complexity.estimated_minutes}min {status}")
else:
print(f" → Score: {complexity.score}/10, Est: {complexity.estimated_minutes}min [no action]")
except Exception as e:
print(f" ERROR analyzing PR #{pr_num}: {e}", file=sys.stderr)
with open(args.output, "w") as f:
json.dump({
"org": args.org,
"repo": args.repo,
"timestamp": datetime.now(timezone.utc).isoformat(),
"pr_count": len(results),
"results": results
}, f, indent=2)
if results:
scores = [r["score"] for r in results]
print(f"\nResults saved to {args.output}")
print(f"Summary: {len(results)} PRs, scores range {min(scores):.0f}-{max(scores):.0f}")
else:
print("\nNo results to save.")
if __name__ == "__main__":
main()

View File

@@ -1,170 +0,0 @@
#!/usr/bin/env python3
"""
Tests for PR Complexity Scorer — unit tests for the scoring logic.
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
from pr_complexity_scorer import (
score_pr,
is_dependency_file,
is_test_file,
TIME_PER_POINT,
SMALL_FILES,
MEDIUM_FILES,
LARGE_FILES,
SMALL_LINES,
MEDIUM_LINES,
LARGE_LINES,
)
PASS = 0
FAIL = 0
def test(name):
def decorator(fn):
global PASS, FAIL
try:
fn()
PASS += 1
print(f" [PASS] {name}")
except AssertionError as e:
FAIL += 1
print(f" [FAIL] {name}: {e}")
except Exception as e:
FAIL += 1
print(f" [FAIL] {name}: Unexpected error: {e}")
return decorator
def assert_eq(a, b, msg=""):
if a != b:
raise AssertionError(f"{msg} expected {b!r}, got {a!r}")
def assert_true(v, msg=""):
if not v:
raise AssertionError(msg or "Expected True")
def assert_false(v, msg=""):
if v:
raise AssertionError(msg or "Expected False")
print("=== PR Complexity Scorer Tests ===\n")
print("-- File Classification --")
@test("dependency file detection — requirements.txt")
def _():
assert_true(is_dependency_file("requirements.txt"))
assert_true(is_dependency_file("src/requirements.txt"))
assert_false(is_dependency_file("requirements_test.txt"))
@test("dependency file detection — pyproject.toml")
def _():
assert_true(is_dependency_file("pyproject.toml"))
assert_false(is_dependency_file("myproject.py"))
@test("test file detection — pytest style")
def _():
assert_true(is_test_file("tests/test_api.py"))
assert_true(is_test_file("test_module.py"))
assert_true(is_test_file("src/module_test.py"))
@test("test file detection — other frameworks")
def _():
assert_true(is_test_file("spec/feature_spec.rb"))
assert_true(is_test_file("__tests__/component.test.js"))
assert_false(is_test_file("testfixtures/helper.py"))
print("\n-- Scoring Logic --")
@test("small PR gets low score (1-3)")
def _():
score, minutes, _ = score_pr(
files_changed=3,
additions=50,
deletions=10,
has_dependency_changes=False,
test_coverage_delta=None
)
assert_true(1 <= score <= 3, f"Score should be low, got {score}")
assert_true(minutes < 20)
@test("medium PR gets medium score (4-6)")
def _():
score, minutes, _ = score_pr(
files_changed=15,
additions=400,
deletions=100,
has_dependency_changes=False,
test_coverage_delta=None
)
assert_true(4 <= score <= 6, f"Score should be medium, got {score}")
assert_true(20 <= minutes <= 45)
@test("large PR gets high score (7-9)")
def _():
score, minutes, _ = score_pr(
files_changed=60,
additions=3000,
deletions=1500,
has_dependency_changes=True,
test_coverage_delta=None
)
assert_true(7 <= score <= 9, f"Score should be high, got {score}")
assert_true(minutes >= 45)
@test("dependency changes boost score")
def _():
base_score, _, _ = score_pr(
files_changed=10, additions=200, deletions=50,
has_dependency_changes=False, test_coverage_delta=None
)
dep_score, _, _ = score_pr(
files_changed=10, additions=200, deletions=50,
has_dependency_changes=True, test_coverage_delta=None
)
assert_true(dep_score > base_score, f"Deps: {base_score} -> {dep_score}")
@test("adding tests lowers complexity")
def _():
base_score, _, _ = score_pr(
files_changed=8, additions=150, deletions=20,
has_dependency_changes=False, test_coverage_delta=None
)
better_score, _, _ = score_pr(
files_changed=8, additions=180, deletions=20,
has_dependency_changes=False, test_coverage_delta=3
)
assert_true(better_score < base_score, f"Tests: {base_score} -> {better_score}")
@test("removing tests increases complexity")
def _():
base_score, _, _ = score_pr(
files_changed=8, additions=150, deletions=20,
has_dependency_changes=False, test_coverage_delta=None
)
worse_score, _, _ = score_pr(
files_changed=8, additions=150, deletions=20,
has_dependency_changes=False, test_coverage_delta=-2
)
assert_true(worse_score > base_score, f"Remove tests: {base_score} -> {worse_score}")
@test("score bounded 1-10")
def _():
for files, adds, dels in [(1, 10, 5), (100, 10000, 5000)]:
score, _, _ = score_pr(files, adds, dels, False, None)
assert_true(1 <= score <= 10, f"Score {score} out of range")
@test("estimated minutes exist for all scores")
def _():
for s in range(1, 11):
assert_true(s in TIME_PER_POINT, f"Missing time for score {s}")
print(f"\n=== Results: {PASS} passed, {FAIL} failed ===")
sys.exit(0 if FAIL == 0 else 1)

131
scripts/validate_doc_links.py Executable file
View File

@@ -0,0 +1,131 @@
#!/usr/bin/env python3
"""
Doc Link Validator — Extract and verify all documentation links.
Issue: #103 — 4.8: Doc Link Validator
Acceptance:
Extracts links from docs | HTTP HEAD check | Reports broken links
(Weekly cron/CI integration out of scope for this minimal script)
"""
import argparse
import re
import sys
from pathlib import Path
from typing import List, Tuple, Optional
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse
# Markdown link patterns
INLINE_LINK_RE = re.compile(r'\[[^\]]*\]\(([^)\s]+)(?:\s+"[^"]*")?\)')
AUTOLINK_RE = re.compile(r'<([^>]+)>')
def extract_links(content: str) -> List[str]:
urls = [m.group(1) for m in INLINE_LINK_RE.finditer(content)]
urls += [m.group(1) for m in AUTOLINK_RE.finditer(content)]
return urls
def is_ignorable(url: str, ignore_prefixes: List[str]) -> bool:
p = urlparse(url)
if p.scheme not in ('http', 'https'):
return True
host = p.netloc.split(':')[0]
if host in ('localhost', '127.0.0.1', '::1'):
return True
# Private IPv4 ranges
if re.match(r'^(10\.|192\.168\.|172\.(1[6-9]|2[0-9]|3[01])\.)', host):
return True
for prefix in ignore_prefixes:
if url.startswith(prefix):
return True
return False
def check_url(url: str, timeout: float = 8.0) -> Tuple[bool, Optional[int], str]:
try:
req = Request(url, method='HEAD')
req.add_header('User-Agent', 'DocLinkValidator/1.0')
try:
with urlopen(req, timeout=timeout) as resp:
return True, resp.getcode(), "OK"
except HTTPError as e:
if e.code in (405, 403, 400):
req2 = Request(url, method='GET')
req2.add_header('User-Agent', 'DocLinkValidator/1.0')
req2.add_header('Range', 'bytes=0-1')
with urlopen(req2, timeout=timeout) as resp2:
return True, resp2.getcode(), "OK via GET"
return False, e.code, e.reason
except URLError as e:
return False, None, str(e.reason) if hasattr(e, 'reason') else str(e)
except Exception as e:
return False, None, str(e)
def main() -> int:
p = argparse.ArgumentParser(description="Validate documentation links")
p.add_argument('--root', default='.', help='Repository root')
p.add_argument('--fail-on-broken', action='store_true', help='Exit non-zero if broken links found')
p.add_argument('--json', action='store_true', help='Emit JSON report')
p.add_argument('--ignore', default='', help='Comma-separated URL prefixes to ignore')
args = p.parse_args()
root = Path(args.root).resolve()
ignore_prefixes = [x.strip() for x in args.ignore.split(',') if x.strip()]
md_files = list(root.rglob('*.md'))
if not md_files:
print("No markdown files found.", file=sys.stderr)
return 1
print(f"Scanning {len(md_files)} markdown files")
all_links: List[Tuple[Path, str]] = []
for md in md_files:
content = md.read_text(errors='replace')
for m in INLINE_LINK_RE.finditer(content):
all_links.append((md, m.group(1)))
for m in AUTOLINK_RE.finditer(content):
all_links.append((md, m.group(1)))
print(f"Raw link occurrences: {len(all_links)}")
# De-duplicate by URL, keep first file context
first_file: dict[str, Path] = {}
unique_urls: List[str] = []
for file, url in all_links:
if url not in first_file:
first_file[url] = file
unique_urls.append(url)
print(f"Unique URLs to check: {len(unique_urls)}")
broken: List[dict] = []
ok_count = 0
for url in unique_urls:
if is_ignorable(url, ignore_prefixes):
continue
ok, code, reason = check_url(url)
if ok:
ok_count += 1
else:
broken.append({"url": url, "file": str(first_file[url]), "error": reason})
print(f"OK: {ok_count} Broken: {len(broken)}")
if broken:
print("\nBroken links:")
for b in broken:
print(f" [{b['file']}] {b['url']}{b['error']}")
if args.json:
print(json.dumps({"scanned": len(unique_urls), "ok": ok_count,
"broken": len(broken), "broken_links": broken}, indent=2))
return 1 if (args.fail_on_broken and broken) else 0
if __name__ == '__main__':
sys.exit(main())