Assembles relevant knowledge from the store into a compact 2k-token context block for session injection. Features: filtering by repo, agent type, and global scope; confidence-based sorting (pitfalls first, then patterns and facts); per-repo and per-agent markdown knowledge files; graceful empty-store handling; a JSON output mode for programmatic use; and token-count-aware truncation at line boundaries. Closes #11
#!/usr/bin/env python3
|
|
"""
|
|
Bootstrapper — assemble pre-session context from knowledge store.
|
|
|
|
Reads the knowledge store and produces a compact context block (2k tokens max)
|
|
that can be injected into a new session so it starts with situational awareness.
|
|
|
|
Usage:
|
|
python3 bootstrapper.py --repo the-nexus --agent mimo-sprint
|
|
python3 bootstrapper.py --repo timmy-home --global
|
|
python3 bootstrapper.py --global
|
|
python3 bootstrapper.py --repo the-nexus --max-tokens 1000
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
# Resolve knowledge root relative to this script's parent.
# Expected layout: <repo>/knowledge/{index.json, repos/*.md, agents/*.md, global/*.md}
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
KNOWLEDGE_DIR = REPO_ROOT / "knowledge"
INDEX_PATH = KNOWLEDGE_DIR / "index.json"

# Approximate token count: ~4 chars per token for English text
# (used by estimate_tokens / truncate_to_tokens; a heuristic, not exact).
CHARS_PER_TOKEN = 4

# Category sort priority (lower = shown first).
# Categories not listed here fall back to priority 5 in sort_facts,
# i.e. they sort after everything listed below.
CATEGORY_PRIORITY = {
    "pitfall": 0,
    "tool-quirk": 1,
    "pattern": 2,
    "fact": 3,
    "question": 4,
}
|
|
|
|
|
|
def load_index(index_path: Path = INDEX_PATH) -> dict:
    """Load and validate the knowledge index.

    Args:
        index_path: Path to index.json (defaults to the repo knowledge dir).

    Returns:
        The parsed index dict. On a missing file, unparseable JSON, or a
        missing ``facts`` key, returns an empty index skeleton instead of
        raising, so callers keep the graceful empty-store behavior.
    """
    empty_index = {"version": 1, "total_facts": 0, "facts": []}

    if not index_path.exists():
        return empty_index

    try:
        # Explicit encoding: don't depend on the locale default.
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as exc:
        # A corrupt index should degrade to "no knowledge", not crash the
        # session bootstrap — consistent with the missing-file path above.
        print(f"WARNING: could not parse {index_path}: {exc}", file=sys.stderr)
        return empty_index

    if "facts" not in data:
        # Plain string (was an f-string with no placeholders — lint F541).
        print("WARNING: index.json missing 'facts' key", file=sys.stderr)
        return empty_index

    return data
|
|
|
|
|
|
def filter_facts(
    facts: list[dict],
    repo: Optional[str] = None,
    agent: Optional[str] = None,
    include_global: bool = True,
) -> list[dict]:
    """Filter facts by repo, agent, and global scope."""

    def _keep(entry: dict) -> bool:
        entry_repo = entry.get("repo", "global")
        entry_agent = entry.get("agent", "")
        # Repo-scoped facts are kept regardless of their agent tag.
        if repo and entry_repo == repo:
            return True
        # Agent-scoped facts are kept regardless of their repo tag.
        if agent and entry_agent == agent:
            return True
        # Universal facts: global scope with no agent restriction.
        return include_global and entry_repo == "global" and not entry_agent

    return [entry for entry in facts if _keep(entry)]
|
|
|
|
|
|
def sort_facts(facts: list[dict]) -> list[dict]:
    """
    Sort facts by: confidence (desc), then category priority, then fact text.
    Most reliable and most dangerous facts come first.
    """
    # Negated confidence gives a descending sort inside an ascending tuple key;
    # unknown categories fall back to priority 5 (after every known category).
    return sorted(
        facts,
        key=lambda entry: (
            -entry.get("confidence", 0.5),
            CATEGORY_PRIORITY.get(entry.get("category", "fact"), 5),
            entry.get("fact", ""),
        ),
    )
|
|
|
|
|
|
def load_repo_knowledge(repo: str) -> Optional[str]:
    """Load per-repo knowledge markdown if it exists.

    Args:
        repo: Repository name; maps to knowledge/repos/<repo>.md.

    Returns:
        The stripped file contents, or None when no file exists.
    """
    repo_path = KNOWLEDGE_DIR / "repos" / f"{repo}.md"
    if repo_path.exists():
        # Explicit encoding: read_text() otherwise uses the locale default,
        # which breaks on non-ASCII notes under e.g. a C/POSIX locale.
        return repo_path.read_text(encoding="utf-8").strip()
    return None
|
|
|
|
|
|
def load_agent_knowledge(agent: str) -> Optional[str]:
    """Load per-agent knowledge markdown if it exists.

    Args:
        agent: Agent type name; maps to knowledge/agents/<agent>.md.

    Returns:
        The stripped file contents, or None when no file exists.
    """
    agent_path = KNOWLEDGE_DIR / "agents" / f"{agent}.md"
    if agent_path.exists():
        # Explicit encoding: read_text() otherwise uses the locale default,
        # which breaks on non-ASCII notes under e.g. a C/POSIX locale.
        return agent_path.read_text(encoding="utf-8").strip()
    return None
|
|
|
|
|
|
def load_global_knowledge() -> list[str]:
    """Load all global knowledge markdown files.

    Returns:
        Stripped contents of every non-empty knowledge/global/*.md file,
        in sorted filename order; empty list when the directory is absent.
    """
    global_dir = KNOWLEDGE_DIR / "global"
    if not global_dir.exists():
        return []

    chunks = []
    # Sorted for deterministic output ordering across filesystems.
    for md_file in sorted(global_dir.glob("*.md")):
        # Explicit encoding: read_text() otherwise uses the locale default.
        content = md_file.read_text(encoding="utf-8").strip()
        if content:
            chunks.append(content)
    return chunks
|
|
|
|
|
|
def render_facts_section(facts: list[dict], category: str, label: str) -> str:
    """Render a section of facts for a single category."""
    selected = [entry for entry in facts if entry.get("category") == category]
    if not selected:
        return ""

    rendered = [f"### {label}\n"]
    for entry in selected:
        confidence = entry.get("confidence", 0.5)
        text = entry.get("fact", "")
        scope = entry.get("repo", "")
        bullet = f"- [{confidence:.0%}]"
        # Non-global facts carry their repo as a parenthesized tag.
        if scope and scope != "global":
            rendered.append(f"{bullet} ({scope}) {text}")
        else:
            rendered.append(f"{bullet} {text}")

    return "\n".join(rendered) + "\n"
|
|
|
|
|
|
def estimate_tokens(text: str) -> int:
    """Rough token estimate via integer division by the chars-per-token ratio."""
    char_count = len(text)
    return char_count // CHARS_PER_TOKEN
|
|
|
|
|
|
def truncate_to_tokens(text: str, max_tokens: int) -> str:
    """Truncate text to approximately max_tokens, cutting at line boundaries."""
    limit = max_tokens * CHARS_PER_TOKEN
    if len(text) <= limit:
        return text

    # Prefer the last newline inside the budget so no line is chopped
    # mid-way; fall back to a hard character cut when none exists.
    head = text[:limit]
    cut = head.rfind("\n")
    if cut > 0:
        head = head[:cut]

    return head + "\n\n[... truncated to fit context window ...]"
|
|
|
|
|
|
# (category, section label) pairs in display order. The trailing "question"
# entry is rendered only in the repo-scoped section, matching the original
# per-section tables this list replaces.
_CATEGORY_LABELS = [
    ("pitfall", "PITFALLS"),
    ("tool-quirk", "QUIRKS"),
    ("pattern", "PATTERNS"),
    ("fact", "FACTS"),
    ("question", "OPEN QUESTIONS"),
]


def _append_fact_sections(sections: list[str], facts: list[dict], include_questions: bool) -> None:
    """Append one rendered markdown section per category that has facts."""
    pairs = _CATEGORY_LABELS if include_questions else _CATEGORY_LABELS[:-1]
    for cat, label in pairs:
        section = render_facts_section(facts, cat, label)
        if section:
            sections.append(section)


def build_bootstrap_context(
    repo: Optional[str] = None,
    agent: Optional[str] = None,
    include_global: bool = True,
    max_tokens: int = 2000,
    index_path: Path = INDEX_PATH,
) -> str:
    """
    Build the full bootstrap context block.

    Args:
        repo: Repository to scope facts and markdown notes to.
        agent: Agent type to scope facts and markdown notes to.
        include_global: Whether universal (global, agent-less) facts are kept.
        max_tokens: Approximate token budget for the final output.
        index_path: Override for the knowledge index location.

    Returns a markdown string suitable for injection into a session prompt.
    """
    index = load_index(index_path)
    facts = index.get("facts", [])

    # Filter, then sort (confidence desc, pitfalls first).
    filtered = filter_facts(facts, repo=repo, agent=agent, include_global=include_global)
    sorted_facts = sort_facts(filtered)

    sections = ["## What You Know (bootstrapped)\n"]

    # Per-repo markdown knowledge
    if repo:
        repo_md = load_repo_knowledge(repo)
        if repo_md:
            sections.append(f"### Repo Notes: {repo}\n")
            sections.append(repo_md + "\n")

    # Structured facts grouped by source. A fact matching both repo and agent
    # appears in both groups — presumably intentional emphasis; confirm.
    if sorted_facts:
        repo_facts = [f for f in sorted_facts if f.get("repo") == repo] if repo else []
        # BUG FIX: use the same missing-key default as filter_facts
        # (fact.get("repo", "global")), so facts without a "repo" field —
        # which the filter treats as global — are no longer silently dropped.
        global_facts = [f for f in sorted_facts if f.get("repo", "global") == "global"]
        agent_facts = [f for f in sorted_facts if f.get("agent") == agent] if agent else []

        if repo_facts:
            sections.append(f"### Repo: {repo}\n")
            _append_fact_sections(sections, repo_facts, include_questions=True)

        if global_facts:
            sections.append("### Global\n")
            _append_fact_sections(sections, global_facts, include_questions=False)

        if agent_facts:
            sections.append(f"### Agent Notes ({agent})\n")
            _append_fact_sections(sections, agent_facts, include_questions=False)

    # Per-agent markdown knowledge
    if agent:
        agent_md = load_agent_knowledge(agent)
        if agent_md:
            sections.append(f"### Agent Profile: {agent}\n")
            sections.append(agent_md + "\n")

    # Global markdown knowledge
    global_chunks = load_global_knowledge()
    if global_chunks:
        sections.append("### Global Notes\n")
        sections.extend(chunk + "\n" for chunk in global_chunks)

    # Graceful empty-store messaging (only the header was emitted above).
    if len(sections) == 1:
        sections.append("_No relevant knowledge found. Starting fresh._\n")
        if not facts:
            sections.append(
                "_Knowledge store is empty. Run the harvester to populate it._\n"
            )

    # Join and enforce the token budget, cutting at a line boundary.
    return truncate_to_tokens("\n".join(sections), max_tokens)
|
|
|
|
|
|
def _build_arg_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the bootstrapper."""
    parser = argparse.ArgumentParser(
        description="Assemble pre-session context from knowledge store"
    )
    parser.add_argument(
        "--repo",
        type=str,
        default=None,
        help="Repository name to filter facts by",
    )
    parser.add_argument(
        "--agent",
        type=str,
        default=None,
        help="Agent type to filter facts by (e.g., mimo-sprint, groq-fast)",
    )
    parser.add_argument(
        "--global",
        dest="include_global",
        action="store_true",
        default=True,
        help="Include global facts (default: true)",
    )
    parser.add_argument(
        "--no-global",
        dest="include_global",
        action="store_false",
        help="Exclude global facts",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=2000,
        help="Maximum token count for output (default: 2000)",
    )
    parser.add_argument(
        "--index",
        type=str,
        default=None,
        help="Path to index.json (default: knowledge/index.json)",
    )
    parser.add_argument(
        "--json",
        dest="output_json",
        action="store_true",
        help="Output raw JSON instead of markdown",
    )
    return parser


def main():
    """CLI entry point: print markdown context (default) or raw JSON facts."""
    args = _build_arg_parser().parse_args()

    index_path = Path(args.index) if args.index else INDEX_PATH

    if not args.output_json:
        # Markdown mode: full bootstrap context
        context = build_bootstrap_context(
            repo=args.repo,
            agent=args.agent,
            include_global=args.include_global,
            max_tokens=args.max_tokens,
            index_path=index_path,
        )
        print(context)
        return 0

    # JSON mode: return the filtered, sorted facts
    index = load_index(index_path)
    facts = index.get("facts", [])
    matched = sort_facts(
        filter_facts(
            facts,
            repo=args.repo,
            agent=args.agent,
            include_global=args.include_global,
        )
    )
    payload = {
        "repo": args.repo,
        "agent": args.agent,
        "include_global": args.include_global,
        "total_indexed": len(facts),
        "matched": len(matched),
        "facts": matched,
    }
    print(json.dumps(payload, indent=2))
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status
    # (raise SystemExit(code) is equivalent to sys.exit(code)).
    raise SystemExit(main())
|