#!/usr/bin/env python3
"""Build a Big Brain audit artifact for a repository via Ollama.

The script creates a markdown context bundle from a repo, prompts an Ollama model
for an architecture/security audit, and writes the final report to disk.
"""
from __future__ import annotations

import argparse
import json
import os
import urllib.request
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable
# Directory names pruned during the repository walk: VCS metadata,
# virtualenvs, dependency caches, and build/coverage outputs.
IGNORED_DIRS = {
    ".git", ".hg", ".svn",
    ".venv", "venv",
    "node_modules",
    "__pycache__", ".mypy_cache", ".pytest_cache",
    "dist", "build", "coverage",
}
# File suffixes treated as human-readable text and therefore indexable.
TEXT_SUFFIXES = {
    # source code
    ".py", ".js", ".mjs", ".cjs", ".ts", ".tsx", ".jsx", ".sh",
    # markup / docs / styling
    ".html", ".css", ".md", ".txt",
    # config / data
    ".json", ".yaml", ".yml", ".ini", ".cfg", ".toml",
}
# Exact repo-relative paths that get the largest priority boost when
# selecting excerpts for the context bundle (docs, entry points, contracts).
PRIORITY_FILENAMES = {
    "README.md", "CLAUDE.md", "POLICY.md", "DEVELOPMENT.md", "BROWSER_CONTRACT.md",
    "index.html", "app.js", "style.css", "server.py", "gofai_worker.js",
    "provenance.json", "tests/test_provenance.py",
}
# Path fragments whose presence anywhere in a repo-relative path grants a
# smaller priority boost (see _priority_score).
PRIORITY_SNIPPETS = (
    "tests/",
    "docs/",
    "nexus/",
    "intelligence/deepdive/",
    "scaffold/deepdive/",
    "bin/",
)
@dataclass(frozen=True)
class RepoFile:
    """Immutable record for one text file discovered in the repository."""

    # Repo-relative POSIX-style path, used for manifest display and sorting.
    path: str
    # Absolute location on disk, used when reading excerpts.
    abs_path: Path
    size_bytes: int
    line_count: int

    def to_dict(self) -> dict[str, int | str]:
        """Serialize the manifest-facing fields; abs_path is deliberately omitted."""
        return {
            "path": self.path,
            "size_bytes": self.size_bytes,
            "line_count": self.line_count,
        }
def _is_text_file(path: Path) -> bool:
    """Return True when *path* looks like indexable text.

    A file qualifies by suffix (TEXT_SUFFIXES, case-insensitive) or by being
    one of the conventional extensionless build files.
    """
    if path.name in {"Dockerfile", "Makefile"}:
        return True
    return path.suffix.lower() in TEXT_SUFFIXES
def collect_repo_files(repo_root: str | Path) -> list[dict[str, int | str]]:
    """Walk *repo_root* and return a manifest of text files, sorted by path.

    Each entry is a dict with ``path`` (repo-relative POSIX path),
    ``size_bytes``, and ``line_count``.  Directories named in IGNORED_DIRS
    are pruned; only files accepted by _is_text_file are included.
    """
    root = Path(repo_root).resolve()
    files: list[RepoFile] = []

    for current_root, dirnames, filenames in os.walk(root):
        # Prune ignored directories in place so os.walk never descends into
        # them; sorting keeps the traversal order deterministic.
        dirnames[:] = sorted(d for d in dirnames if d not in IGNORED_DIRS)
        base = Path(current_root)
        for filename in sorted(filenames):
            path = base / filename
            if not _is_text_file(path):
                continue
            rel_path = path.relative_to(root).as_posix()
            # Decode as UTF-8 explicitly (with replacement) so line counts do
            # not vary with the platform's locale encoding.
            text = path.read_text(encoding="utf-8", errors="replace")
            files.append(
                RepoFile(
                    path=rel_path,
                    abs_path=path,
                    size_bytes=path.stat().st_size,
                    # An empty file still counts as one line in the manifest.
                    line_count=len(text.splitlines()) or 1,
                )
            )

    return [item.to_dict() for item in sorted(files, key=lambda item: item.path)]
def _priority_score(path: str) -> tuple[int, int, str]:
    """Sort key that ranks *path* for excerpt inclusion.

    Higher-priority paths sort first: the accumulated score is negated,
    with ties broken by shorter path, then lexicographic order.
    """
    score = 100 if path in PRIORITY_FILENAMES else 0
    if any(snippet in path for snippet in PRIORITY_SNIPPETS):
        score += 25
    if "/" not in path:  # top-level files are usually entry points / docs
        score += 20
    if path.startswith("tests/"):
        score += 10
    if path.endswith("README.md"):
        score += 10
    return (-score, len(path), path)
def _numbered_excerpt(path: Path, max_chars: int) -> str:
|
|
lines = path.read_text(errors="replace").splitlines()
|
|
rendered: list[str] = []
|
|
total = 0
|
|
for idx, line in enumerate(lines, start=1):
|
|
numbered = f"{idx}|{line}"
|
|
if rendered and total + len(numbered) + 1 > max_chars:
|
|
rendered.append("...[truncated]...")
|
|
break
|
|
rendered.append(numbered)
|
|
total += len(numbered) + 1
|
|
return "\n".join(rendered)
|
|
|
|
|
|
def render_context_bundle(
    repo_root: str | Path,
    repo_name: str,
    max_chars_per_file: int = 6000,
    max_total_chars: int = 120000,
) -> str:
    """Render a markdown context bundle: header, file manifest, then excerpts.

    Excerpts are added in priority order (see _priority_score) until adding
    one would push the bundle past *max_total_chars*.
    """
    root = Path(repo_root).resolve()
    repo_files: list[RepoFile] = []
    for entry in collect_repo_files(root):
        rel = Path(entry["path"]).as_posix()
        repo_files.append(
            RepoFile(rel, root / str(entry["path"]), int(entry["size_bytes"]), int(entry["line_count"]))
        )

    out: list[str] = [
        f"# Audit Context Bundle — {repo_name}",
        "",
        f"Generated: {datetime.now(timezone.utc).isoformat()}",
        f"Repo root: {root}",
        f"Text files indexed: {len(repo_files)}",
        "",
        "## File manifest",
    ]
    out.extend(
        f"- {entry.path} — {entry.line_count} lines, {entry.size_bytes} bytes"
        for entry in repo_files
    )
    out.extend(["", "## Selected file excerpts"])

    budget_used = len("\n".join(out))
    for entry in sorted(repo_files, key=lambda f: _priority_score(f.path)):
        excerpt = _numbered_excerpt(entry.abs_path, max_chars_per_file)
        block = f"\n### {entry.path}\n```text\n{excerpt}\n```\n"
        # NOTE(review): the first over-budget file stops excerpting entirely
        # (break, not continue) — presumably intentional so priority order is
        # respected strictly; confirm before changing.
        if budget_used + len(block) > max_total_chars:
            break
        out.extend([f"### {entry.path}", "```text", excerpt, "```", ""])
        budget_used += len(block)

    return "\n".join(out).rstrip() + "\n"
def build_audit_prompt(repo_name: str, context_bundle: str) -> str:
    """Assemble the full audit prompt for the model.

    The prompt pins the output format (five numbered markdown sections) and
    the citation rules, then appends the context bundle verbatim.
    """
    header = f"You are auditing the repository {repo_name}.\n\n"
    instructions = (
        "Use only the supplied context bundle. Be concrete, skeptical, and reference file:line locations.\n\n"
        "Return markdown with these sections exactly:\n"
        "1. Architecture summary\n"
        "2. Top 5 structural issues\n"
        "3. Top 3 recommended refactors\n"
        "4. Security concerns\n"
        "5. Follow-on issue candidates\n\n"
    )
    rules = (
        "Rules:\n"
        "- Every issue and refactor must cite at least one file:line reference.\n"
        "- Prefer contradictions, dead code, duplicate ownership, stale docs, brittle boundaries, and unsafe execution paths.\n"
        "- If docs and code disagree, say so plainly.\n"
        "- Keep it actionable for a Gitea issue/PR workflow.\n\n"
    )
    return header + instructions + rules + "Context bundle:\n\n" + context_bundle
def call_ollama_chat(prompt: str, model: str, ollama_url: str, num_ctx: int = 32768, timeout: int = 600) -> str:
    """POST *prompt* to the Ollama ``/api/chat`` endpoint and return the reply.

    Accepts both the chat response shape (``message.content``) and the
    generate shape (``response``); raises ValueError for anything else.
    """
    body = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "stream": False,  # request one JSON document, not NDJSON chunks
        "options": {"num_ctx": num_ctx},
    }
    endpoint = f"{ollama_url.rstrip('/')}/api/chat"
    request = urllib.request.Request(
        endpoint,
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=timeout) as response:
        data = json.loads(response.read().decode())
    if "message" in data and isinstance(data["message"], dict):
        return data["message"].get("content", "")
    if "response" in data:
        return str(data["response"])
    raise ValueError(f"Unexpected Ollama response shape: {data}")
def generate_audit_report(
    repo_root: str | Path,
    repo_name: str,
    model: str,
    ollama_url: str,
    num_ctx: int,
    context_out: str | Path | None = None,
) -> tuple[str, str]:
    """Build the context bundle, query Ollama, and return (bundle, report).

    When *context_out* is given, the bundle is also written there; parent
    directories are created as needed.
    """
    context_bundle = render_context_bundle(repo_root, repo_name=repo_name)
    if context_out:
        context_path = Path(context_out)
        context_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8: the bundle may contain non-ASCII characters (e.g.
        # the em dash in its header), which can break locale-default encoders.
        context_path.write_text(context_bundle, encoding="utf-8")
    prompt = build_audit_prompt(repo_name, context_bundle)
    report = call_ollama_chat(prompt, model=model, ollama_url=ollama_url, num_ctx=num_ctx)
    return context_bundle, report
def main() -> None:
    """CLI entry point: parse arguments, run the audit, and save the report."""
    parser = argparse.ArgumentParser(description="Generate a Big Brain repo audit artifact via Ollama")
    parser.add_argument("--repo-root", required=True, help="Path to the repository to audit")
    parser.add_argument("--repo-name", required=True, help="Repository name, e.g. Timmy_Foundation/the-nexus")
    parser.add_argument("--model", default=os.environ.get("BIG_BRAIN_MODEL", "gemma4:latest"))
    parser.add_argument("--ollama-url", default=os.environ.get("OLLAMA_URL", "http://localhost:11434"))
    parser.add_argument("--num-ctx", type=int, default=int(os.environ.get("BIG_BRAIN_NUM_CTX", "32768")))
    parser.add_argument("--context-out", default=None, help="Optional path to save the generated context bundle")
    parser.add_argument("--report-out", required=True, help="Path to save the generated markdown audit")
    args = parser.parse_args()

    _, report = generate_audit_report(
        repo_root=args.repo_root,
        repo_name=args.repo_name,
        model=args.model,
        ollama_url=args.ollama_url,
        num_ctx=args.num_ctx,
        context_out=args.context_out,
    )

    out_path = Path(args.report_out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: the model reply may contain non-ASCII text, which would
    # crash write_text() under a narrow locale-default encoding.
    out_path.write_text(report, encoding="utf-8")
    print(f"Audit report saved to {out_path}")
# Allow importing this module without side effects; run the CLI only when
# the file is executed directly.
if __name__ == "__main__":
    main()