#!/usr/bin/env python3
"""Check local markdown links.

Scans markdown files for local links and fails on broken targets.

Ignores:

- external URLs (http/https)
- anchors (#section)
- mailto: and tel:
- links inside fenced code blocks
- generated/build directories
"""

from __future__ import annotations

import argparse
import re
import sys
from pathlib import Path
from typing import Iterable

# A line whose stripped text opens or closes a fenced code block (```...).
CODE_FENCE_RE = re.compile(r"^```")

# Inline markdown link [text](target); the (?<!!) lookbehind excludes images.
LINK_RE = re.compile(r"(?<!!)\[[^\]]+\]\(([^)]+)\)")

# Directory names never scanned for markdown files: VCS metadata, caches,
# build output, vendored trees.
DEFAULT_SKIP_DIRS = {
    ".git", ".gitea", ".pytest_cache", "__pycache__",
    "build", "dist", "node_modules", "llama-cpp-fork",
}
def should_ignore_target(target: str) -> bool:
    """Return True for link targets that do not reference a local file.

    Empty targets, external URLs, mailto:/tel: links, and pure anchors
    are all considered non-local and skipped by the checker.
    """
    stripped = target.strip()
    if not stripped:
        return True
    # str.startswith accepts a tuple: one call covers every prefix.
    return stripped.startswith(("http://", "https://", "mailto:", "tel:", "#"))
def normalize_target(target: str) -> str:
    """Reduce a raw link target to a bare path.

    Strips surrounding whitespace, an optional <...> wrapper, and any
    #fragment suffix; may return an empty string (e.g. for "#only").
    """
    cleaned = target.strip()
    if cleaned.startswith("<") and cleaned.endswith(">"):
        cleaned = cleaned[1:-1].strip()
    # partition returns the whole string when "#" is absent, so this
    # covers both the fragment and no-fragment cases in one expression.
    return cleaned.partition("#")[0]
def iter_markdown_files(root: Path, skip_dirs: set[str] | None = None) -> Iterable[Path]:
|
|
skip_dirs = skip_dirs or DEFAULT_SKIP_DIRS
|
|
for path in root.rglob("*.md"):
|
|
if any(part in skip_dirs for part in path.relative_to(root).parts):
|
|
continue
|
|
yield path
|
|
|
|
|
|
def iter_links(path: Path) -> Iterable[tuple[int, str]]:
    """Yield ``(line_number, target)`` for each markdown link in *path*.

    Lines inside fenced code blocks are skipped; fence delimiter lines
    themselves are never scanned. Line numbers are 1-based.
    """
    inside_fence = False
    text = path.read_text(encoding="utf-8")
    for number, raw_line in enumerate(text.splitlines(), start=1):
        # A fence delimiter toggles state and is consumed without scanning.
        if CODE_FENCE_RE.match(raw_line.strip()):
            inside_fence = not inside_fence
            continue
        if inside_fence:
            continue
        for hit in LINK_RE.finditer(raw_line):
            yield number, hit.group(1)
def resolve_target(source: Path, target: str, root: Path) -> Path:
    """Resolve a link target to an absolute filesystem path.

    A leading slash means repo-root-relative (resolved against *root*),
    not filesystem-absolute; everything else resolves against the
    directory containing *source*.
    """
    if not target.startswith("/"):
        return (source.parent / target).resolve()
    return (root / target.lstrip("/")).resolve()
def find_broken_links(root: Path, skip_dirs: set[str] | None = None) -> list[dict]:
    """Collect a record for every local markdown link whose target is missing.

    Each record has keys ``source`` (file path), ``line`` (1-based line
    number), ``target`` (normalized link target), and ``resolved``
    (absolute path that was checked). Returns an empty list when all
    local links resolve.
    """
    root = root.resolve()
    failures: list[dict] = []
    for md_path in iter_markdown_files(root, skip_dirs=skip_dirs):
        for line_no, raw in iter_links(md_path):
            if should_ignore_target(raw):
                continue
            cleaned = normalize_target(raw)
            if not cleaned:
                # Target reduced to nothing (e.g. anchor-only after <>).
                continue
            candidate = resolve_target(md_path, cleaned, root)
            if candidate.exists():
                continue
            failures.append(
                {
                    "source": str(md_path),
                    "line": line_no,
                    "target": cleaned,
                    "resolved": str(candidate),
                }
            )
    return failures
def main() -> int:
    """CLI entry point: scan a repo root, report broken local markdown links.

    Returns 0 when every local link resolves, 1 otherwise (one line
    printed per broken link, paths shown relative to the scanned root).
    """
    parser = argparse.ArgumentParser(description="Fail on broken local markdown links.")
    parser.add_argument("root", nargs="?", default=".", help="Repo root to scan (default: .)")
    args = parser.parse_args()

    root = Path(args.root)
    broken = find_broken_links(root)
    if broken:
        print("Broken local markdown links found:")
        resolved_root = root.resolve()
        for entry in broken:
            # Records store absolute paths; report them relative to root.
            rel = Path(entry["source"]).relative_to(resolved_root)
            print(f"{rel}:{entry['line']}: missing target -> {entry['target']}")
        return 1

    print("PASS: No broken local markdown links")
    return 0
if __name__ == "__main__":
    # Propagate main()'s status (0 = clean, 1 = broken links) to the shell.
    sys.exit(main())