#!/usr/bin/env python3
"""
Doc Link Validator — Extract and verify all documentation links.

Issue: #103 — 4.8: Doc Link Validator
Acceptance: Extracts links from docs | HTTP HEAD check | Reports broken links
(Weekly cron/CI integration out of scope for this minimal script)
"""

import argparse
import json
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple
from urllib.error import HTTPError, URLError
from urllib.parse import urlparse
from urllib.request import Request, urlopen

# Markdown link patterns: inline links [text](url "title") and autolinks <url>
INLINE_LINK_RE = re.compile(r'\[[^\]]*\]\(([^)\s]+)(?:\s+"[^"]*")?\)')
AUTOLINK_RE = re.compile(r'<([^>]+)>')


def extract_links(content: str) -> List[str]:
    """Return all link targets found in a markdown document."""
    urls = [m.group(1) for m in INLINE_LINK_RE.finditer(content)]
    urls += [m.group(1) for m in AUTOLINK_RE.finditer(content)]
    return urls


def is_ignorable(url: str, ignore_prefixes: List[str]) -> bool:
    """Skip non-HTTP(S) links, local/private hosts, and user-ignored prefixes."""
    p = urlparse(url)
    if p.scheme not in ('http', 'https'):
        return True
    host = p.netloc.split(':')[0]
    if host in ('localhost', '127.0.0.1', '::1'):
        return True
    # Private IPv4 ranges
    if re.match(r'^(10\.|192\.168\.|172\.(1[6-9]|2[0-9]|3[01])\.)', host):
        return True
    for prefix in ignore_prefixes:
        if url.startswith(prefix):
            return True
    return False


def check_url(url: str, timeout: float = 8.0) -> Tuple[bool, Optional[int], str]:
    """Probe a URL with HEAD; return (ok, status_code, reason)."""
    try:
        req = Request(url, method='HEAD')
        req.add_header('User-Agent', 'DocLinkValidator/1.0')
        try:
            with urlopen(req, timeout=timeout) as resp:
                return True, resp.getcode(), "OK"
        except HTTPError as e:
            # Some servers reject HEAD; retry those with a minimal ranged GET.
            if e.code in (405, 403, 400):
                req2 = Request(url, method='GET')
                req2.add_header('User-Agent', 'DocLinkValidator/1.0')
                req2.add_header('Range', 'bytes=0-1')
                with urlopen(req2, timeout=timeout) as resp2:
                    return True, resp2.getcode(), "OK via GET"
            return False, e.code, e.reason
    except URLError as e:
        return False, None, str(e.reason) if hasattr(e, 'reason') else str(e)
    except Exception as e:
        return False, None, str(e)


def main() -> int:
    p = argparse.ArgumentParser(description="Validate documentation links")
    p.add_argument('--root', default='.', help='Repository root')
    p.add_argument('--fail-on-broken', action='store_true',
                   help='Exit non-zero if broken links found')
    p.add_argument('--json', action='store_true', help='Emit JSON report')
    p.add_argument('--ignore', default='',
                   help='Comma-separated URL prefixes to ignore')
    args = p.parse_args()

    root = Path(args.root).resolve()
    ignore_prefixes = [x.strip() for x in args.ignore.split(',') if x.strip()]

    md_files = list(root.rglob('*.md'))
    if not md_files:
        print("No markdown files found.", file=sys.stderr)
        return 1
    print(f"Scanning {len(md_files)} markdown files")

    all_links: List[Tuple[Path, str]] = []
    for md in md_files:
        content = md.read_text(errors='replace')
        for url in extract_links(content):
            all_links.append((md, url))
    print(f"Raw link occurrences: {len(all_links)}")

    # De-duplicate by URL, keep first file context
    first_file: Dict[str, Path] = {}
    unique_urls: List[str] = []
    for file, url in all_links:
        if url not in first_file:
            first_file[url] = file
            unique_urls.append(url)
    print(f"Unique URLs to check: {len(unique_urls)}")

    broken: List[dict] = []
    ok_count = 0
    for url in unique_urls:
        if is_ignorable(url, ignore_prefixes):
            continue
        ok, _code, reason = check_url(url)
        if ok:
            ok_count += 1
        else:
            broken.append({"url": url, "file": str(first_file[url]), "error": reason})

    print(f"OK: {ok_count}  Broken: {len(broken)}")
    if broken:
        print("\nBroken links:")
        for b in broken:
            print(f"  [{b['file']}] {b['url']} — {b['error']}")

    if args.json:
        print(json.dumps({
            "scanned": len(unique_urls),
            "ok": ok_count,
            "broken": len(broken),
            "broken_links": broken,
        }, indent=2))

    return 1 if (args.fail_on_broken and broken) else 0


if __name__ == '__main__':
    sys.exit(main())
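
# Example invocations (illustrative sketch; the filename doc_link_validator.py is
# an assumption, but the flags match the argparse definitions above):
#
#   python doc_link_validator.py --root . --fail-on-broken
#   python doc_link_validator.py --root ./docs --ignore "https://internal.example.com" --json
#
# With --json, the report printed by main() has this shape (sample values are
# illustrative; the keys mirror the dict built in main()):
#
#   {
#     "scanned": 42,
#     "ok": 40,
#     "broken": 2,
#     "broken_links": [
#       {"url": "https://example.com/gone", "file": "docs/setup.md", "error": "Not Found"}
#     ]
#   }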