#!/usr/bin/env python3
"""Check local markdown links.

Scans markdown files for local links and fails on broken targets.

Ignores:

- external URLs (http/https)
- anchors (#section)
- mailto: and tel:
- links inside fenced code blocks
- generated/build directories
"""

from __future__ import annotations

import argparse
import re
import sys
from pathlib import Path
from typing import Iterable

# A line whose stripped text opens or closes a fenced code block (```...).
CODE_FENCE_RE = re.compile(r"^```")

# Inline markdown link [text](target); the (?<!!) lookbehind excludes images.
LINK_RE = re.compile(r"(?<!!)\[[^\]]+\]\(([^)]+)\)")

# Directory names never scanned for markdown files: VCS metadata, caches,
# build output, vendored trees.
DEFAULT_SKIP_DIRS = {
    ".git", ".gitea", ".pytest_cache", "__pycache__",
    "build", "dist", "node_modules", "llama-cpp-fork",
}
def should_ignore_target(target: str) -> bool:
    """Return True for link targets that do not reference a local file.

    Empty targets, external URLs, mailto:/tel: links, and pure anchors
    are all considered non-local and skipped by the checker.
    """
    stripped = target.strip()
    if not stripped:
        return True
    # str.startswith accepts a tuple: one call covers every prefix.
    return stripped.startswith(("http://", "https://", "mailto:", "tel:", "#"))
def normalize_target(target: str) -> str:
    """Reduce a raw link target to a bare path.

    Strips surrounding whitespace, an optional <...> wrapper, and any
    #fragment suffix; may return an empty string (e.g. for "#only").
    """
    cleaned = target.strip()
    if cleaned.startswith("<") and cleaned.endswith(">"):
        cleaned = cleaned[1:-1].strip()
    # partition returns the whole string when "#" is absent, so this
    # covers both the fragment and no-fragment cases in one expression.
    return cleaned.partition("#")[0]
def iter_markdown_files(root: Path, skip_dirs: set[str] | None = None) -> Iterable[Path]:
|
|
skip_dirs = skip_dirs or DEFAULT_SKIP_DIRS
|
|
for path in root.rglob("*.md"):
|
|
if any(part in skip_dirs for part in path.relative_to(root).parts):
|
|
continue
|
|
yield path
|
|
|
|
|
|
def iter_links(path: Path) -> Iterable[tuple[int, str]]:
    """Yield ``(line_number, target)`` for each markdown link in *path*.

    Lines inside fenced code blocks are skipped; fence delimiter lines
    themselves are never scanned. Line numbers are 1-based.
    """
    inside_fence = False
    text = path.read_text(encoding="utf-8")
    for number, raw_line in enumerate(text.splitlines(), start=1):
        # A fence delimiter toggles state and is consumed without scanning.
        if CODE_FENCE_RE.match(raw_line.strip()):
            inside_fence = not inside_fence
            continue
        if inside_fence:
            continue
        for hit in LINK_RE.finditer(raw_line):
            yield number, hit.group(1)
def resolve_target(source: Path, target: str, root: Path) -> Path:
    """Resolve a link target to an absolute filesystem path.

    A leading slash means repo-root-relative (resolved against *root*),
    not filesystem-absolute; everything else resolves against the
    directory containing *source*.
    """
    if not target.startswith("/"):
        return (source.parent / target).resolve()
    return (root / target.lstrip("/")).resolve()
def find_broken_links(root: Path, skip_dirs: set[str] | None = None) -> list[dict]:
    """Collect a record for every local markdown link whose target is missing.

    Each record has keys ``source`` (file path), ``line`` (1-based line
    number), ``target`` (normalized link target), and ``resolved``
    (absolute path that was checked). Returns an empty list when all
    local links resolve.
    """
    root = root.resolve()
    failures: list[dict] = []
    for md_path in iter_markdown_files(root, skip_dirs=skip_dirs):
        for line_no, raw in iter_links(md_path):
            if should_ignore_target(raw):
                continue
            cleaned = normalize_target(raw)
            if not cleaned:
                # Target reduced to nothing (e.g. anchor-only after <>).
                continue
            candidate = resolve_target(md_path, cleaned, root)
            if candidate.exists():
                continue
            failures.append(
                {
                    "source": str(md_path),
                    "line": line_no,
                    "target": cleaned,
                    "resolved": str(candidate),
                }
            )
    return failures
def main() -> int:
    """CLI entry point: scan a repo root, report broken local markdown links.

    Returns 0 when every local link resolves, 1 otherwise (one line
    printed per broken link, paths shown relative to the scanned root).
    """
    parser = argparse.ArgumentParser(description="Fail on broken local markdown links.")
    parser.add_argument("root", nargs="?", default=".", help="Repo root to scan (default: .)")
    args = parser.parse_args()

    root = Path(args.root)
    broken = find_broken_links(root)
    if broken:
        print("Broken local markdown links found:")
        resolved_root = root.resolve()
        for entry in broken:
            # Records store absolute paths; report them relative to root.
            rel = Path(entry["source"]).relative_to(resolved_root)
            print(f"{rel}:{entry['line']}: missing target -> {entry['target']}")
        return 1

    print("PASS: No broken local markdown links")
    return 0
if __name__ == "__main__":
    # Propagate main()'s status (0 = clean, 1 = broken links) to the shell.
    sys.exit(main())