Files
timmy-home/scripts/big_brain_repo_audit.py
Alexander Whitestone 303ae44411
Some checks failed
Smoke Test / smoke (push) Failing after 11s
feat: add big-brain nexus audit artifact (#656)
Merge PR #656
2026-04-14 22:18:20 +00:00

281 lines
8.5 KiB
Python

#!/usr/bin/env python3
"""Build a Big Brain audit artifact for a repository via Ollama.
The script creates a markdown context bundle from a repo, prompts an Ollama model
for an architecture/security audit, and writes the final report to disk.
"""
from __future__ import annotations
import argparse
import json
import os
import urllib.request
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable
# Directory names pruned from the os.walk traversal (VCS metadata, virtualenvs,
# dependency trees, caches, and build output) — nothing here is worth auditing.
IGNORED_DIRS = {
    ".git",
    ".hg",
    ".svn",
    ".venv",
    "venv",
    "node_modules",
    "__pycache__",
    ".mypy_cache",
    ".pytest_cache",
    "dist",
    "build",
    "coverage",
}
# File suffixes treated as reviewable text; anything else is skipped unless it
# matches a well-known extension-less name (see _is_text_file).
TEXT_SUFFIXES = {
    ".py",
    ".js",
    ".mjs",
    ".cjs",
    ".ts",
    ".tsx",
    ".jsx",
    ".html",
    ".css",
    ".md",
    ".txt",
    ".json",
    ".yaml",
    ".yml",
    ".sh",
    ".ini",
    ".cfg",
    ".toml",
}
# Exact repo-relative paths that get the largest priority boost (+100) when
# choosing which excerpts make it into the bounded context bundle.
PRIORITY_FILENAMES = {
    "README.md",
    "CLAUDE.md",
    "POLICY.md",
    "DEVELOPMENT.md",
    "BROWSER_CONTRACT.md",
    "index.html",
    "app.js",
    "style.css",
    "server.py",
    "gofai_worker.js",
    "provenance.json",
    "tests/test_provenance.py",
}
# Substrings that mark a path as belonging to an interesting subtree (+25).
# NOTE(review): matched with `in`, so "tests/" also matches nested ".../tests/".
PRIORITY_SNIPPETS = (
    "tests/",
    "docs/",
    "nexus/",
    "intelligence/deepdive/",
    "scaffold/deepdive/",
    "bin/",
)
@dataclass(frozen=True)
class RepoFile:
path: str
abs_path: Path
size_bytes: int
line_count: int
def to_dict(self) -> dict[str, int | str]:
return {
"path": self.path,
"size_bytes": self.size_bytes,
"line_count": self.line_count,
}
def _is_text_file(path: Path) -> bool:
    """True when *path* looks like reviewable text, by suffix or well-known name."""
    if path.name in {"Dockerfile", "Makefile"}:
        return True
    return path.suffix.lower() in TEXT_SUFFIXES
def collect_repo_files(repo_root: str | Path) -> list[dict[str, int | str]]:
    """Walk *repo_root* and return a manifest of its text files, sorted by path.

    Each entry is ``{"path", "size_bytes", "line_count"}`` with the path
    repo-relative in POSIX form.  Directories in IGNORED_DIRS are pruned from
    the walk.  Files that cannot be read or stat'ed (permissions, broken
    symlinks, files deleted mid-walk) are skipped instead of aborting the
    entire scan, which the original read_text/stat calls would have done.
    """
    root = Path(repo_root).resolve()
    files: list[RepoFile] = []
    for current_root, dirnames, filenames in os.walk(root):
        # Prune in place (and sort for determinism) so os.walk never descends
        # into ignored directories.
        dirnames[:] = sorted(d for d in dirnames if d not in IGNORED_DIRS)
        base = Path(current_root)
        for filename in sorted(filenames):
            path = base / filename
            if not _is_text_file(path):
                continue
            try:
                text = path.read_text(errors="replace")
                size = path.stat().st_size
            except OSError:
                # Unreadable file: skip it rather than crash the whole audit.
                continue
            files.append(
                RepoFile(
                    path=path.relative_to(root).as_posix(),
                    abs_path=path,
                    size_bytes=size,
                    # An empty file still counts as one line in the manifest.
                    line_count=len(text.splitlines()) or 1,
                )
            )
    return [item.to_dict() for item in sorted(files, key=lambda item: item.path)]
def _priority_score(path: str) -> tuple[int, int, str]:
    """Sort key ranking higher-priority paths first, then shorter, then lexical."""
    weight = 0
    if path in PRIORITY_FILENAMES:
        weight += 100
    if any(marker in path for marker in PRIORITY_SNIPPETS):
        weight += 25
    if "/" not in path:
        weight += 20  # top-level files are usually entry points
    if path.startswith("tests/"):
        weight += 10
    if path.endswith("README.md"):
        weight += 10
    # Negated so that an ascending sorted() places the highest score first.
    return (-weight, len(path), path)
def _numbered_excerpt(path: Path, max_chars: int) -> str:
lines = path.read_text(errors="replace").splitlines()
rendered: list[str] = []
total = 0
for idx, line in enumerate(lines, start=1):
numbered = f"{idx}|{line}"
if rendered and total + len(numbered) + 1 > max_chars:
rendered.append("...[truncated]...")
break
rendered.append(numbered)
total += len(numbered) + 1
return "\n".join(rendered)
def render_context_bundle(
    repo_root: str | Path,
    repo_name: str,
    max_chars_per_file: int = 6000,
    max_total_chars: int = 120000,
) -> str:
    """Build the markdown context bundle: header, file manifest, file excerpts.

    Excerpts are appended in priority order (see _priority_score) until the
    bundle would exceed *max_total_chars*; each excerpt is itself capped at
    *max_chars_per_file* by _numbered_excerpt.

    Fix: the manifest line previously rendered as
    ``- {path}{line_count} lines, ...`` — the path ran straight into the line
    count with no separator (e.g. "app.js120 lines").  A " — " separator is
    now emitted between path and stats.
    """
    root = Path(repo_root).resolve()
    # Rehydrate RepoFile records (with abs_path) from the serialized manifest.
    files = [
        RepoFile(Path(item["path"]).as_posix(), root / str(item["path"]), int(item["size_bytes"]), int(item["line_count"]))
        for item in collect_repo_files(root)
    ]
    lines: list[str] = [
        f"# Audit Context Bundle — {repo_name}",
        "",
        f"Generated: {datetime.now(timezone.utc).isoformat()}",
        f"Repo root: {root}",
        f"Text files indexed: {len(files)}",
        "",
        "## File manifest",
    ]
    for item in files:
        # Separator fixed: path and stats are now visually distinct.
        lines.append(f"- {item.path} — {item.line_count} lines, {item.size_bytes} bytes")
    lines.extend(["", "## Selected file excerpts"])
    total_chars = len("\n".join(lines))
    for item in sorted(files, key=lambda f: _priority_score(f.path)):
        excerpt = _numbered_excerpt(item.abs_path, max_chars_per_file)
        # `block` mirrors exactly what the appended lines contribute once joined,
        # so the running total tracks the final bundle size.
        block = f"\n### {item.path}\n```text\n{excerpt}\n```\n"
        if total_chars + len(block) > max_total_chars:
            break
        lines.append(f"### {item.path}")
        lines.append("```text")
        lines.append(excerpt)
        lines.append("```")
        lines.append("")
        total_chars += len(block)
    return "\n".join(lines).rstrip() + "\n"
def build_audit_prompt(repo_name: str, context_bundle: str) -> str:
    """Assemble the single user-message audit prompt wrapped around *context_bundle*."""
    segments = [
        f"You are auditing the repository {repo_name}.\n\n",
        "Use only the supplied context bundle. Be concrete, skeptical, and reference file:line locations.\n\n",
        "Return markdown with these sections exactly:\n",
        "1. Architecture summary\n",
        "2. Top 5 structural issues\n",
        "3. Top 3 recommended refactors\n",
        "4. Security concerns\n",
        "5. Follow-on issue candidates\n\n",
        "Rules:\n",
        "- Every issue and refactor must cite at least one file:line reference.\n",
        "- Prefer contradictions, dead code, duplicate ownership, stale docs, brittle boundaries, and unsafe execution paths.\n",
        "- If docs and code disagree, say so plainly.\n",
        "- Keep it actionable for a Gitea issue/PR workflow.\n\n",
        "Context bundle:\n\n",
        context_bundle,
    ]
    return "".join(segments)
def call_ollama_chat(prompt: str, model: str, ollama_url: str, num_ctx: int = 32768, timeout: int = 600) -> str:
    """POST *prompt* to Ollama's /api/chat endpoint and return the reply text.

    Handles both the chat response shape ({"message": {"content": ...}}) and
    the generate shape ({"response": ...}).  Raises ValueError for anything else.
    """
    body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,
        "options": {"num_ctx": num_ctx},
    }
    endpoint = f"{ollama_url.rstrip('/')}/api/chat"
    req = urllib.request.Request(
        endpoint,
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=timeout) as response:
        parsed = json.loads(response.read().decode())
    message = parsed.get("message")
    if isinstance(message, dict):
        return message.get("content", "")
    if "response" in parsed:
        return str(parsed["response"])
    raise ValueError(f"Unexpected Ollama response shape: {parsed}")
def generate_audit_report(
    repo_root: str | Path,
    repo_name: str,
    model: str,
    ollama_url: str,
    num_ctx: int,
    context_out: str | Path | None = None,
) -> tuple[str, str]:
    """Render the context bundle, query Ollama, and return (bundle, report).

    When *context_out* is given, the bundle is also written to that path
    (parent directories are created as needed) before the model call.
    """
    bundle = render_context_bundle(repo_root, repo_name=repo_name)
    if context_out:
        destination = Path(context_out)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(bundle)
    audit = call_ollama_chat(
        build_audit_prompt(repo_name, bundle),
        model=model,
        ollama_url=ollama_url,
        num_ctx=num_ctx,
    )
    return bundle, audit
def main() -> None:
    """CLI entry point: parse arguments, run the audit, write the report."""
    parser = argparse.ArgumentParser(description="Generate a Big Brain repo audit artifact via Ollama")
    parser.add_argument("--repo-root", required=True, help="Path to the repository to audit")
    parser.add_argument("--repo-name", required=True, help="Repository name, e.g. Timmy_Foundation/the-nexus")
    # Model/endpoint defaults can be overridden via environment variables.
    parser.add_argument("--model", default=os.environ.get("BIG_BRAIN_MODEL", "gemma4:latest"))
    parser.add_argument("--ollama-url", default=os.environ.get("OLLAMA_URL", "http://localhost:11434"))
    parser.add_argument("--num-ctx", type=int, default=int(os.environ.get("BIG_BRAIN_NUM_CTX", "32768")))
    parser.add_argument("--context-out", default=None, help="Optional path to save the generated context bundle")
    parser.add_argument("--report-out", required=True, help="Path to save the generated markdown audit")
    args = parser.parse_args()

    _, report = generate_audit_report(
        repo_root=args.repo_root,
        repo_name=args.repo_name,
        model=args.model,
        ollama_url=args.ollama_url,
        num_ctx=args.num_ctx,
        context_out=args.context_out,
    )

    destination = Path(args.report_out)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(report)
    print(f"Audit report saved to {destination}")


if __name__ == "__main__":
    main()