Add scripts/validate_doc_links.py — scans all markdown files in the repository, extracts inline links and autolinks, and verifies each URL via an HTTP HEAD request (with a GET fallback for servers that reject HEAD).

Features:

- `--root`: repository root to scan (default: repo root)
- `--fail-on-broken`: exit 1 if any broken links are found
- `--json`: emit a JSON report for CI consumption
- `--ignore`: comma-separated URL prefixes to skip

Ignores non-HTTP URLs, localhost/127.0.0.1, and private IPv4 ranges. Requires only the Python stdlib — no external dependencies.

Smoke-tested against this repo: 2 unique URLs checked, 0 broken.

Addresses 4.8: Doc Link Validator acceptance criteria. Closes #103
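For reference, here is a minimal sketch of how a CI step might consume the `--json` report. The wrapper script and the `--ignore` prefix shown are hypothetical examples, not part of this PR, and it assumes the JSON object is the first `{`-delimited block in the validator's stdout.

```python
#!/usr/bin/env python3
"""Hypothetical CI gate (not part of this PR): run the validator with --json
and fail the build when broken links are reported."""
import json
import subprocess
import sys

result = subprocess.run(
    [sys.executable, "scripts/validate_doc_links.py",
     "--root", ".", "--json",
     "--ignore", "https://intranet.example.com"],  # example ignore prefix
    capture_output=True, text=True, check=False,
)

# The JSON report is printed after the human-readable summary; assume it
# starts at the first '{' in stdout.
report = json.loads(result.stdout[result.stdout.index("{"):])
print(f"checked={report['scanned']} ok={report['ok']} broken={report['broken']}")
for link in report["broken_links"]:
    print(f"  {link['file']}: {link['url']} ({link['error']})")
sys.exit(1 if report["broken"] else 0)
```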
132 lines
4.4 KiB
Python
Executable File
#!/usr/bin/env python3
"""
Doc Link Validator — Extract and verify all documentation links.

Issue: #103 — 4.8: Doc Link Validator

Acceptance:
    Extracts links from docs | HTTP HEAD check | Reports broken links
    (Weekly cron/CI integration out of scope for this minimal script)
"""

import argparse
import json
import re
import sys
from pathlib import Path
from typing import List, Tuple, Optional
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from urllib.parse import urlparse

# Markdown link patterns
INLINE_LINK_RE = re.compile(r'\[[^\]]*\]\(([^)\s]+)(?:\s+"[^"]*")?\)')
AUTOLINK_RE = re.compile(r'<([^>]+)>')
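# Note: AUTOLINK_RE matches any <...> span, including raw HTML tags; those
# non-URL matches are dropped later by is_ignorable(), since they carry no
# http/https scheme.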


def extract_links(content: str) -> List[str]:
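    """Return all inline-link and autolink URL targets found in markdown text."""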
    urls = [m.group(1) for m in INLINE_LINK_RE.finditer(content)]
    urls += [m.group(1) for m in AUTOLINK_RE.finditer(content)]
    return urls


def is_ignorable(url: str, ignore_prefixes: List[str]) -> bool:
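    """Return True for URLs that should be skipped: non-HTTP schemes, localhost,
    private IPv4 ranges, and user-supplied ignore prefixes."""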
    p = urlparse(url)
    if p.scheme not in ('http', 'https'):
        return True
    host = p.netloc.split(':')[0]
    if host in ('localhost', '127.0.0.1', '::1'):
        return True
    # Private IPv4 ranges
    if re.match(r'^(10\.|192\.168\.|172\.(1[6-9]|2[0-9]|3[01])\.)', host):
        return True
    for prefix in ignore_prefixes:
        if url.startswith(prefix):
            return True
    return False


def check_url(url: str, timeout: float = 8.0) -> Tuple[bool, Optional[int], str]:
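    """Check one URL and return (ok, status_code, reason).

    Sends a HEAD request first; on 400/403/405 it retries with a ranged GET,
    since some servers reject HEAD.
    """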
    try:
        req = Request(url, method='HEAD')
        req.add_header('User-Agent', 'DocLinkValidator/1.0')
        try:
            with urlopen(req, timeout=timeout) as resp:
                return True, resp.getcode(), "OK"
        except HTTPError as e:
            if e.code in (405, 403, 400):
                req2 = Request(url, method='GET')
                req2.add_header('User-Agent', 'DocLinkValidator/1.0')
                req2.add_header('Range', 'bytes=0-1')
                with urlopen(req2, timeout=timeout) as resp2:
                    return True, resp2.getcode(), "OK via GET"
            return False, e.code, e.reason
    except URLError as e:
        return False, None, str(e.reason) if hasattr(e, 'reason') else str(e)
    except Exception as e:
        return False, None, str(e)


def main() -> int:
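    """Scan markdown files under --root, check each unique URL, and report."""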
    p = argparse.ArgumentParser(description="Validate documentation links")
    p.add_argument('--root', default='.', help='Repository root')
    p.add_argument('--fail-on-broken', action='store_true', help='Exit non-zero if broken links found')
    p.add_argument('--json', action='store_true', help='Emit JSON report')
    p.add_argument('--ignore', default='', help='Comma-separated URL prefixes to ignore')
    args = p.parse_args()

    root = Path(args.root).resolve()
    ignore_prefixes = [x.strip() for x in args.ignore.split(',') if x.strip()]

    md_files = list(root.rglob('*.md'))
    if not md_files:
        print("No markdown files found.", file=sys.stderr)
        return 1

    print(f"Scanning {len(md_files)} markdown files")

    all_links: List[Tuple[Path, str]] = []
    for md in md_files:
        content = md.read_text(errors='replace')
        for m in INLINE_LINK_RE.finditer(content):
            all_links.append((md, m.group(1)))
        for m in AUTOLINK_RE.finditer(content):
            all_links.append((md, m.group(1)))

    print(f"Raw link occurrences: {len(all_links)}")

    # De-duplicate by URL, keep first file context
    first_file: dict[str, Path] = {}
    unique_urls: List[str] = []
    for file, url in all_links:
        if url not in first_file:
            first_file[url] = file
            unique_urls.append(url)

    print(f"Unique URLs to check: {len(unique_urls)}")

    broken: List[dict] = []
    ok_count = 0
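    # Ignorable URLs are skipped outright, so ok_count + len(broken) can be
    # smaller than the unique-URL total printed above.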
    for url in unique_urls:
        if is_ignorable(url, ignore_prefixes):
            continue
        ok, code, reason = check_url(url)
        if ok:
            ok_count += 1
        else:
            broken.append({"url": url, "file": str(first_file[url]), "error": reason})

    print(f"OK: {ok_count} Broken: {len(broken)}")
    if broken:
        print("\nBroken links:")
        for b in broken:
            print(f" [{b['file']}] {b['url']} — {b['error']}")

    if args.json:
        print(json.dumps({"scanned": len(unique_urls), "ok": ok_count,
                          "broken": len(broken), "broken_links": broken}, indent=2))

    return 1 if (args.fail_on_broken and broken) else 0


if __name__ == '__main__':
    sys.exit(main())