feat: build bootstrapper.py - pre-session context assembler

Assembles relevant knowledge from the store into a compact 2k-token context block for session injection.

Features:
- Filter by repo, agent type, and global scope
- Sort by confidence, then by category (pitfalls first, then patterns, then facts)
- Per-repo and per-agent markdown knowledge files
- Graceful empty-store handling
- JSON output mode for programmatic use
- Token-count-aware truncation at line boundaries

Closes #11
2026-04-14 14:05:30 -04:00
parent 102ef67a8e
commit b65256bf76
4 changed files with 1050 additions and 132 deletions
--- a/scripts/bootstrapper.py
+++ b/scripts/bootstrapper.py
@@ -0,0 +1,359 @@
+#!/usr/bin/env python3
+"""
+Bootstrapper — assemble pre-session context from knowledge store.
+
+Reads the knowledge store and produces a compact context block (2k tokens max)
+that can be injected into a new session so it starts with situational awareness.
+
+Usage:
+    python3 bootstrapper.py --repo the-nexus --agent mimo-sprint
+    python3 bootstrapper.py --repo timmy-home --global
+    python3 bootstrapper.py --global
+    python3 bootstrapper.py --repo the-nexus --max-tokens 1000
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Optional
+
+# The directory above this script's directory is treated as the repo root;
+# the knowledge store and its index live underneath it.
+SCRIPT_DIR = Path(__file__).resolve().parent
+REPO_ROOT = SCRIPT_DIR.parent
+KNOWLEDGE_DIR = REPO_ROOT.joinpath("knowledge")
+INDEX_PATH = KNOWLEDGE_DIR.joinpath("index.json")
+
+# Token estimation heuristic: roughly 4 characters per token of English text
+CHARS_PER_TOKEN = 4
+
+# Rank of each fact category; a lower rank is shown first
+CATEGORY_PRIORITY = {
+    category: rank
+    for rank, category in enumerate(
+        ("pitfall", "tool-quirk", "pattern", "fact", "question")
+    )
+}
+
+
+def _empty_index() -> dict:
+    """Return a fresh index structure representing an empty knowledge store."""
+    return {"version": 1, "total_facts": 0, "facts": []}
+
+
+def load_index(index_path: Path = INDEX_PATH) -> dict:
+    """
+    Load and validate the knowledge index.
+
+    Args:
+        index_path: Location of the JSON index file.
+
+    Returns:
+        The parsed index as a dict containing a "facts" list. An empty index
+        ({"version": 1, "total_facts": 0, "facts": []}) is returned when the
+        file is missing or blank, and also (with a warning on stderr) when it
+        is not valid JSON or does not have the expected shape.
+    """
+    if not index_path.exists():
+        return _empty_index()
+
+    # Read explicitly as UTF-8 so the result does not depend on the locale.
+    with open(index_path, encoding="utf-8") as f:
+        raw = f.read()
+
+    # A zero-byte or whitespace-only file is an empty store, not an error.
+    if not raw.strip():
+        return _empty_index()
+
+    try:
+        data = json.loads(raw)
+    except json.JSONDecodeError as exc:
+        # Starting a session without context beats aborting with a traceback.
+        print(f"WARNING: {index_path} is not valid JSON ({exc})", file=sys.stderr)
+        return _empty_index()
+
+    if not isinstance(data, dict) or "facts" not in data:
+        print("WARNING: index.json missing 'facts' key", file=sys.stderr)
+        return _empty_index()
+
+    # Downstream code iterates over the facts, so anything but a list is unusable.
+    if not isinstance(data["facts"], list):
+        print("WARNING: index.json 'facts' is not a list", file=sys.stderr)
+        return _empty_index()
+
+    return data
+
+
+def filter_facts(
+    facts: list[dict],
+    repo: Optional[str] = None,
+    agent: Optional[str] = None,
+    include_global: bool = True,
+) -> list[dict]:
+    """
+    Select the facts relevant to a repo / agent / global scope.
+
+    A fact is kept when any of these holds:
+      * ``repo`` is given and equals the fact's repo (its agent is ignored);
+      * ``agent`` is given and equals the fact's agent;
+      * ``include_global`` is set, the fact's repo is "global" (the default
+        when the key is absent) and the fact has no agent.
+
+    Input order is preserved and each fact appears at most once.
+    """
+
+    def is_relevant(entry: dict) -> bool:
+        entry_repo = entry.get("repo", "global")
+        entry_agent = entry.get("agent", "")
+
+        repo_hit = bool(repo) and entry_repo == repo
+        agent_hit = bool(agent) and entry_agent == agent
+        # Universal facts: global scope with no agent restriction
+        universal = include_global and entry_repo == "global" and not entry_agent
+        return bool(repo_hit or agent_hit or universal)
+
+    return [entry for entry in facts if is_relevant(entry)]
+
+
+def sort_facts(facts: list[dict]) -> list[dict]:
+    """
+    Return a new list of facts in display order.
+
+    Ordering is by confidence (highest first; 0.5 when absent), then by
+    category rank from CATEGORY_PRIORITY ("fact" is assumed when the key is
+    absent; categories not in the table get rank 5), and finally by fact text.
+    """
+    return sorted(
+        facts,
+        key=lambda entry: (
+            -entry.get("confidence", 0.5),
+            CATEGORY_PRIORITY.get(entry.get("category", "fact"), 5),
+            entry.get("fact", ""),
+        ),
+    )
+
+
+def load_repo_knowledge(repo: str) -> Optional[str]:
+    """
+    Load per-repo knowledge markdown if it exists.
+
+    Args:
+        repo: Repository name; the notes are read from
+            KNOWLEDGE_DIR/repos/<repo>.md.
+
+    Returns:
+        The file content with surrounding whitespace stripped, or None when
+        there is no such file.
+    """
+    repo_path = KNOWLEDGE_DIR / "repos" / f"{repo}.md"
+    # is_file() rather than exists(): a directory with that name is not notes.
+    if not repo_path.is_file():
+        return None
+    # Decode as UTF-8 explicitly; the platform default encoding varies.
+    return repo_path.read_text(encoding="utf-8").strip()
+
+
+def load_agent_knowledge(agent: str) -> Optional[str]:
+    """
+    Load per-agent knowledge markdown if it exists.
+
+    Args:
+        agent: Agent type; the notes are read from
+            KNOWLEDGE_DIR/agents/<agent>.md.
+
+    Returns:
+        The file content with surrounding whitespace stripped, or None when
+        there is no such file.
+    """
+    agent_path = KNOWLEDGE_DIR / "agents" / f"{agent}.md"
+    # is_file() rather than exists(): a directory with that name is not notes.
+    if not agent_path.is_file():
+        return None
+    # Decode as UTF-8 explicitly; the platform default encoding varies.
+    return agent_path.read_text(encoding="utf-8").strip()
+
+
+def load_global_knowledge() -> list[str]:
+    """
+    Load all global knowledge markdown files.
+
+    Reads every ``*.md`` file directly inside KNOWLEDGE_DIR/global, in sorted
+    path order.
+
+    Returns:
+        One whitespace-stripped string per non-empty file; an empty list when
+        the directory is absent or holds no usable files.
+    """
+    global_dir = KNOWLEDGE_DIR / "global"
+    if not global_dir.is_dir():
+        return []
+
+    chunks = []
+    for md_file in sorted(global_dir.glob("*.md")):
+        # The glob also matches directories named like "x.md"; skip them.
+        if not md_file.is_file():
+            continue
+        # Decode as UTF-8 explicitly; the platform default encoding varies.
+        content = md_file.read_text(encoding="utf-8").strip()
+        if content:
+            chunks.append(content)
+    return chunks
+
+
+def render_facts_section(facts: list[dict], category: str, label: str) -> str:
+    """
+    Render a section of facts for a single category.
+
+    Args:
+        facts: Fact dicts; the keys read are "category", "confidence",
+            "fact" and "repo", all optional.
+        category: Category to select. Facts without a "category" key count
+            as "fact", matching the default that sort_facts applies, so they
+            are no longer dropped from the output.
+        label: Heading text for the section.
+
+    Returns:
+        A markdown "### <label>" heading followed by one bullet per matching
+        fact (input order kept), or "" when no fact matches.
+    """
+    cat_facts = [f for f in facts if f.get("category", "fact") == category]
+    if not cat_facts:
+        return ""
+
+    lines = [f"### {label}\n"]
+    for f in cat_facts:
+        conf = f.get("confidence", 0.5)
+        fact_text = f.get("fact", "")
+        repo_tag = f.get("repo", "")
+        # Only facts tied to a specific repo get a tag; global ones stay bare.
+        prefix = f"({repo_tag}) " if repo_tag and repo_tag != "global" else ""
+        lines.append(f"- [{conf:.0%}] {prefix}{fact_text}")
+
+    return "\n".join(lines) + "\n"
+
+
+def estimate_tokens(text: str) -> int:
+    """Approximate the token count of *text* from its character length."""
+    char_count = len(text)
+    return char_count // CHARS_PER_TOKEN
+
+
+def truncate_to_tokens(text: str, max_tokens: int) -> str:
+    """
+    Truncate text to approximately max_tokens, cutting at line boundaries.
+
+    The character budget is ``max_tokens * CHARS_PER_TOKEN``. Text within the
+    budget is returned unchanged. Otherwise the text is cut back to the last
+    newline that still leaves room for a truncation marker, so the returned
+    string (marker included) never exceeds the budget.
+
+    Args:
+        text: The text to shorten.
+        max_tokens: Token budget; zero or negative values are treated as zero.
+
+    Returns:
+        The original text, or a shortened copy ending in the truncation
+        marker. When the budget is too small to hold the marker at all, a
+        plain prefix of the text (possibly empty) is returned instead.
+    """
+    marker = "\n\n[... truncated to fit context window ...]"
+
+    # Clamp so a negative budget cannot turn into a slice from the end.
+    max_chars = max(max_tokens, 0) * CHARS_PER_TOKEN
+    if len(text) <= max_chars:
+        return text
+
+    # Reserve space for the marker so the result stays within the budget.
+    body_budget = max_chars - len(marker)
+    if body_budget <= 0:
+        return text[:max_chars]
+
+    # Cut at last newline before the limit
+    truncated = text[:body_budget]
+    last_newline = truncated.rfind("\n")
+    if last_newline > 0:
+        truncated = truncated[:last_newline]
+
+    return truncated + marker
+
+
+def _render_fact_group(
+    facts: list[dict], heading: str, categories: list[tuple[str, str]]
+) -> list[str]:
+    """Return a group heading plus its non-empty per-category sections."""
+    if not facts:
+        return []
+
+    parts = [heading]
+    for cat, label in categories:
+        section = render_facts_section(facts, cat, label)
+        if section:
+            parts.append(section)
+    return parts
+
+
+def build_bootstrap_context(
+    repo: Optional[str] = None,
+    agent: Optional[str] = None,
+    include_global: bool = True,
+    max_tokens: int = 2000,
+    index_path: Path = INDEX_PATH,
+) -> str:
+    """
+    Build the full bootstrap context block.
+
+    The block is assembled in this order: title, per-repo markdown notes,
+    structured facts (repo group, global group, agent group), per-agent
+    markdown notes, and global markdown notes. A placeholder message is used
+    when nothing at all was found.
+
+    Args:
+        repo: Repository to select facts and notes for, if any.
+        agent: Agent type to select facts and notes for, if any.
+        include_global: Whether global facts and global markdown notes are
+            included.
+        max_tokens: Approximate token budget the result is truncated to.
+        index_path: Location of the JSON knowledge index.
+
+    Returns:
+        A markdown string suitable for injection into a session prompt.
+    """
+    index = load_index(index_path)
+    facts = index.get("facts", [])
+
+    # Filter, then sort
+    sorted_facts = sort_facts(
+        filter_facts(facts, repo=repo, agent=agent, include_global=include_global)
+    )
+
+    # Build sections
+    sections = ["## What You Know (bootstrapped)\n"]
+
+    # Per-repo markdown knowledge
+    if repo:
+        repo_md = load_repo_knowledge(repo)
+        if repo_md:
+            sections.append(f"### Repo Notes: {repo}\n")
+            sections.append(repo_md + "\n")
+
+    # Structured facts by category
+    if sorted_facts:
+        base_categories = [
+            ("pitfall", "PITFALLS"),
+            ("tool-quirk", "QUIRKS"),
+            ("pattern", "PATTERNS"),
+            ("fact", "FACTS"),
+        ]
+        # Open questions are only listed for the repo group.
+        repo_categories = base_categories + [("question", "OPEN QUESTIONS")]
+
+        shown: set[int] = set()
+
+        def unseen(candidates: list[dict]) -> list[dict]:
+            """Drop facts an earlier group already rendered; mark the rest."""
+            fresh = [f for f in candidates if id(f) not in shown]
+            shown.update(id(f) for f in fresh)
+            return fresh
+
+        # Group by source. A fact that matches several groups is rendered
+        # only in the first one, so the token budget is not spent on repeats.
+        repo_facts = unseen(
+            [f for f in sorted_facts if f.get("repo") == repo] if repo else []
+        )
+        # filter_facts treats a missing "repo" key as "global"; default the
+        # same way here so those facts are actually rendered.
+        global_facts = unseen(
+            [f for f in sorted_facts if f.get("repo", "global") == "global"]
+        )
+        agent_facts = unseen(
+            [f for f in sorted_facts if f.get("agent") == agent] if agent else []
+        )
+
+        sections.extend(
+            _render_fact_group(repo_facts, f"### Repo: {repo}\n", repo_categories)
+        )
+        sections.extend(
+            _render_fact_group(global_facts, "### Global\n", base_categories)
+        )
+        sections.extend(
+            _render_fact_group(
+                agent_facts, f"### Agent Notes ({agent})\n", base_categories
+            )
+        )
+
+    # Per-agent markdown knowledge
+    if agent:
+        agent_md = load_agent_knowledge(agent)
+        if agent_md:
+            sections.append(f"### Agent Profile: {agent}\n")
+            sections.append(agent_md + "\n")
+
+    # Global markdown knowledge, skipped when global scope is excluded
+    if include_global:
+        global_chunks = load_global_knowledge()
+        if global_chunks:
+            sections.append("### Global Notes\n")
+            sections.extend(chunk + "\n" for chunk in global_chunks)
+
+    # If nothing was found (only the title is present)
+    if len(sections) == 1:
+        sections.append("_No relevant knowledge found. Starting fresh._\n")
+        if not facts:
+            sections.append(
+                "_Knowledge store is empty. Run the harvester to populate it._\n"
+            )
+
+    # Join and truncate
+    context = "\n".join(sections)
+    return truncate_to_tokens(context, max_tokens)
+
+
+def main():
+    """
+    Command-line entry point.
+
+    Parses the command-line options, then prints either the markdown
+    bootstrap context or, with --json, the filtered and sorted facts as a
+    JSON document. Returns 0.
+    """
+    parser = argparse.ArgumentParser(
+        description="Assemble pre-session context from knowledge store"
+    )
+    add = parser.add_argument
+    add("--repo", type=str, default=None, help="Repository name to filter facts by")
+    add(
+        "--agent",
+        type=str,
+        default=None,
+        help="Agent type to filter facts by (e.g., mimo-sprint, groq-fast)",
+    )
+    # --global and --no-global write to the same destination; it starts out True.
+    add(
+        "--global",
+        dest="include_global",
+        action="store_true",
+        default=True,
+        help="Include global facts (default: true)",
+    )
+    add(
+        "--no-global",
+        dest="include_global",
+        action="store_false",
+        help="Exclude global facts",
+    )
+    add(
+        "--max-tokens",
+        type=int,
+        default=2000,
+        help="Maximum token count for output (default: 2000)",
+    )
+    add(
+        "--index",
+        type=str,
+        default=None,
+        help="Path to index.json (default: knowledge/index.json)",
+    )
+    add(
+        "--json",
+        dest="output_json",
+        action="store_true",
+        help="Output raw JSON instead of markdown",
+    )
+
+    args = parser.parse_args()
+
+    index_path = INDEX_PATH if not args.index else Path(args.index)
+
+    if not args.output_json:
+        # Markdown mode: full bootstrap context
+        print(
+            build_bootstrap_context(
+                repo=args.repo,
+                agent=args.agent,
+                include_global=args.include_global,
+                max_tokens=args.max_tokens,
+                index_path=index_path,
+            )
+        )
+        return 0
+
+    # JSON mode: return the filtered, sorted facts
+    all_facts = load_index(index_path).get("facts", [])
+    matched_facts = sort_facts(
+        filter_facts(
+            all_facts,
+            repo=args.repo,
+            agent=args.agent,
+            include_global=args.include_global,
+        )
+    )
+    report = {
+        "repo": args.repo,
+        "agent": args.agent,
+        "include_global": args.include_global,
+        "total_indexed": len(all_facts),
+        "matched": len(matched_facts),
+        "facts": matched_facts,
+    }
+    print(json.dumps(report, indent=2))
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())