feat: build bootstrapper.py - pre-session context assembler

Assembles relevant knowledge from the store into a compact 2k-token
context block for session injection.

Features:
- Filter by repo, agent type, and global scope
- Sort by confidence (pitfalls first, patterns, facts)
- Per-repo and per-agent markdown knowledge files
- Graceful empty-store handling
- JSON output mode for programmatic use
- Token-count-aware truncation at line boundaries

Closes #11
This commit is contained in:
Timmy
2026-04-14 14:05:30 -04:00
parent 102ef67a8e
commit b65256bf76
4 changed files with 1050 additions and 132 deletions

359
scripts/bootstrapper.py Normal file
View File

@@ -0,0 +1,359 @@
#!/usr/bin/env python3
"""
Bootstrapper — assemble pre-session context from knowledge store.
Reads the knowledge store and produces a compact context block (2k tokens max)
that can be injected into a new session so it starts with situational awareness.
Usage:
python3 bootstrapper.py --repo the-nexus --agent mimo-sprint
python3 bootstrapper.py --repo timmy-home --global
python3 bootstrapper.py --global
python3 bootstrapper.py --repo the-nexus --max-tokens 1000
"""
import argparse
import json
import sys
from pathlib import Path
from typing import Optional
# Resolve knowledge root relative to this script's parent
SCRIPT_DIR = Path(__file__).resolve().parent  # directory that contains this script
REPO_ROOT = SCRIPT_DIR.parent  # one level above the script directory
KNOWLEDGE_DIR = REPO_ROOT / "knowledge"  # root of the knowledge store
INDEX_PATH = KNOWLEDGE_DIR / "index.json"  # default location of the facts index
# Approximate token count: ~4 chars per token for English text
CHARS_PER_TOKEN = 4
# Category sort priority (lower = shown first).
# Categories missing from this map are ranked 5 by sort_facts(), i.e. after
# every category listed here.
CATEGORY_PRIORITY = {
    "pitfall": 0,
    "tool-quirk": 1,
    "pattern": 2,
    "fact": 3,
    "question": 4,
}
def load_index(index_path: Path = INDEX_PATH) -> dict:
    """Load and validate the knowledge index.

    Args:
        index_path: Location of the JSON index file.

    Returns:
        The parsed index mapping, which always holds a list under ``"facts"``.
        A fresh empty index (``version`` 1, ``total_facts`` 0, no facts) is
        returned when the file does not exist, cannot be decoded as UTF-8
        JSON, is not a JSON object, or has no list under ``"facts"``. Every
        case except the missing file also prints a warning to stderr.
    """
    empty_index = {"version": 1, "total_facts": 0, "facts": []}

    try:
        # Explicit encoding: JSON is UTF-8 and must not depend on the locale.
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # A missing index is the normal "empty store" case, so stay silent.
        return empty_index
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # An empty or corrupt file should degrade to an empty store rather
        # than abort context assembly with a traceback.
        print(f"WARNING: index.json is not valid JSON: {exc}", file=sys.stderr)
        return empty_index

    # A top-level list/null/scalar has no keys to look up; treat it the same
    # as an object without 'facts'.
    if not isinstance(data, dict) or "facts" not in data:
        print("WARNING: index.json missing 'facts' key", file=sys.stderr)
        return empty_index

    # Callers iterate over the facts, so anything but a list is unusable.
    if not isinstance(data["facts"], list):
        print("WARNING: index.json 'facts' is not a list", file=sys.stderr)
        return empty_index

    return data
def filter_facts(
    facts: list[dict],
    repo: Optional[str] = None,
    agent: Optional[str] = None,
    include_global: bool = True,
) -> list[dict]:
    """Select the facts relevant to the requested repo / agent / global scope.

    A fact is kept when any of the following holds:

    * ``repo`` is given and equals the fact's repo (whatever its agent is);
    * ``agent`` is given and equals the fact's agent;
    * ``include_global`` is set, the fact's repo is ``"global"`` and the fact
      carries no agent.

    A fact without a ``"repo"`` key counts as ``"global"``; one without an
    ``"agent"`` key counts as having no agent. Input order is preserved.
    """

    def is_relevant(entry: dict) -> bool:
        entry_repo = entry.get("repo", "global")
        entry_agent = entry.get("agent", "")
        if repo and entry_repo == repo:
            return True
        if agent and entry_agent == agent:
            return True
        # Universal facts: global scope and not tied to a particular agent.
        return bool(include_global and entry_repo == "global" and not entry_agent)

    return [entry for entry in facts if is_relevant(entry)]
def sort_facts(facts: list[dict]) -> list[dict]:
    """Return a new list of facts ordered for presentation.

    Ordering keys, in turn: confidence (highest first, 0.5 when absent),
    category rank from ``CATEGORY_PRIORITY`` (missing category counts as
    ``"fact"``; categories not in the map rank 5), then the fact text.
    """
    unknown_category_rank = 5
    return sorted(
        facts,
        key=lambda item: (
            -item.get("confidence", 0.5),
            CATEGORY_PRIORITY.get(item.get("category", "fact"), unknown_category_rank),
            item.get("fact", ""),
        ),
    )
def load_repo_knowledge(repo: str) -> Optional[str]:
    """Load per-repo knowledge markdown if it exists.

    Args:
        repo: Repository name; the notes are read from
            ``KNOWLEDGE_DIR / "repos" / "<repo>.md"``.

    Returns:
        The file's text with surrounding whitespace stripped, or ``None`` when
        no regular file exists at that path.
    """
    repo_path = KNOWLEDGE_DIR / "repos" / f"{repo}.md"
    # is_file() rather than exists(): a directory with that name would make
    # read_text() raise.
    if not repo_path.is_file():
        return None
    # Read as UTF-8 explicitly so the result does not depend on the locale's
    # default encoding.
    return repo_path.read_text(encoding="utf-8").strip()
def load_agent_knowledge(agent: str) -> Optional[str]:
    """Load per-agent knowledge markdown if it exists.

    Args:
        agent: Agent type; the notes are read from
            ``KNOWLEDGE_DIR / "agents" / "<agent>.md"``.

    Returns:
        The file's text with surrounding whitespace stripped, or ``None`` when
        no regular file exists at that path.
    """
    agent_path = KNOWLEDGE_DIR / "agents" / f"{agent}.md"
    # is_file() rather than exists(): a directory with that name would make
    # read_text() raise.
    if not agent_path.is_file():
        return None
    # Read as UTF-8 explicitly so the result does not depend on the locale's
    # default encoding.
    return agent_path.read_text(encoding="utf-8").strip()
def load_global_knowledge() -> list[str]:
    """Load all global knowledge markdown files.

    Returns:
        The stripped text of every non-empty ``*.md`` file directly inside
        ``KNOWLEDGE_DIR / "global"``, ordered by sorted path. The list is
        empty when that directory does not exist.
    """
    global_dir = KNOWLEDGE_DIR / "global"
    if not global_dir.is_dir():
        return []

    chunks = []
    for md_file in sorted(global_dir.glob("*.md")):
        # The glob also matches directories named "*.md"; reading one would
        # raise, so only regular files are considered.
        if not md_file.is_file():
            continue
        # Read as UTF-8 explicitly so the result does not depend on the
        # locale's default encoding.
        content = md_file.read_text(encoding="utf-8").strip()
        if content:
            chunks.append(content)
    return chunks
def render_facts_section(facts: list[dict], category: str, label: str) -> str:
    """Render a section of facts for a single category.

    Args:
        facts: Fact records; only those whose category equals ``category``
            are rendered. A record without a ``"category"`` key counts as
            ``"fact"``, the same default the sorter applies, so such records
            are not silently left out of the output.
        category: Category to select.
        label: Heading text placed after ``### ``.

    Returns:
        A markdown heading followed by one bullet per selected fact and a
        trailing newline, or ``""`` when no fact is selected. Each bullet
        shows the confidence as a percentage (0.5 when absent) and, for
        facts tied to a non-global repo, the repo name in parentheses.
    """
    cat_facts = [f for f in facts if f.get("category", "fact") == category]
    if not cat_facts:
        return ""

    lines = [f"### {label}\n"]
    for f in cat_facts:
        conf = f.get("confidence", 0.5)
        fact_text = f.get("fact", "")
        repo_tag = f.get("repo", "")
        if repo_tag and repo_tag != "global":
            lines.append(f"- [{conf:.0%}] ({repo_tag}) {fact_text}")
        else:
            lines.append(f"- [{conf:.0%}] {fact_text}")
    return "\n".join(lines) + "\n"
def estimate_tokens(text: str) -> int:
    """Approximate the token count of ``text`` from its length in characters.

    The estimate is the character count floor-divided by ``CHARS_PER_TOKEN``.
    """
    char_count = len(text)
    return char_count // CHARS_PER_TOKEN
def truncate_to_tokens(text: str, max_tokens: int) -> str:
    """Truncate text to approximately max_tokens, cutting at line boundaries.

    Args:
        text: The text to shorten.
        max_tokens: Token budget, converted to characters with
            ``CHARS_PER_TOKEN``. Negative values are treated as 0.

    Returns:
        ``text`` unchanged when it fits the budget. Otherwise a prefix of
        ``text`` cut back to the last newline inside the budget (when there
        is one past position 0), followed by a truncation notice. Room for
        the notice is reserved inside the budget, so the result only exceeds
        it when the budget is smaller than the notice itself.
    """
    notice = "\n\n[... truncated to fit context window ...]"

    # Clamp first: a negative limit would make the slice below count from the
    # end of the text and keep almost all of it.
    max_chars = max(max_tokens, 0) * CHARS_PER_TOKEN
    if len(text) <= max_chars:
        return text

    # Leave space for the notice so that prefix + notice stays within budget.
    keep_chars = max(max_chars - len(notice), 0)
    truncated = text[:keep_chars]

    # Cut at last newline before the limit
    last_newline = truncated.rfind("\n")
    if last_newline > 0:
        truncated = truncated[:last_newline]
    return truncated + notice
# (category, heading label) pairs rendered for every group of facts.
_BASE_FACT_SECTIONS = [
    ("pitfall", "PITFALLS"),
    ("tool-quirk", "QUIRKS"),
    ("pattern", "PATTERNS"),
    ("fact", "FACTS"),
]
# The repo group additionally lists open questions.
_REPO_FACT_SECTIONS = _BASE_FACT_SECTIONS + [("question", "OPEN QUESTIONS")]


def _render_fact_group(
    facts: list[dict],
    heading: str,
    categories: list[tuple[str, str]],
) -> list[str]:
    """Render ``heading`` plus one section per non-empty category, or nothing."""
    rendered = []
    for category, label in categories:
        section = render_facts_section(facts, category, label)
        if section:
            rendered.append(section)
    # A heading with nothing under it is noise, so it is dropped as well.
    return [heading, *rendered] if rendered else []


def build_bootstrap_context(
    repo: Optional[str] = None,
    agent: Optional[str] = None,
    include_global: bool = True,
    max_tokens: int = 2000,
    index_path: Path = INDEX_PATH,
) -> str:
    """
    Build the full bootstrap context block.

    The block is assembled in this order: title, per-repo markdown notes,
    structured facts (repo group, global group, agent group), per-agent
    markdown notes, global markdown notes. Every fact is shown in exactly one
    group: the repo group if it matches ``repo``, else the agent group if it
    matches ``agent``, else the global group. A fact without a ``"repo"`` key
    counts as global, matching the rule used when filtering.

    Args:
        repo: Repository to select facts and notes for, if any.
        agent: Agent type to select facts and notes for, if any.
        include_global: Whether global facts and global markdown notes are
            included.
        max_tokens: Approximate token budget the result is truncated to.
        index_path: Location of the JSON facts index.

    Returns a markdown string suitable for injection into a session prompt.
    """
    index = load_index(index_path)
    facts = index.get("facts", [])

    # Filter, then sort
    sorted_facts = sort_facts(
        filter_facts(facts, repo=repo, agent=agent, include_global=include_global)
    )

    # Build sections
    sections = ["## What You Know (bootstrapped)\n"]

    # Per-repo markdown knowledge
    if repo:
        repo_md = load_repo_knowledge(repo)
        if repo_md:
            sections.append(f"### Repo Notes: {repo}\n")
            sections.append(repo_md + "\n")

    # Structured facts by category. Each fact goes to the first group it
    # matches so nothing is listed twice and spends the token budget twice.
    repo_facts: list[dict] = []
    agent_facts: list[dict] = []
    global_facts: list[dict] = []
    for fact in sorted_facts:
        if repo and fact.get("repo", "global") == repo:
            repo_facts.append(fact)
        elif agent and fact.get("agent") == agent:
            agent_facts.append(fact)
        elif fact.get("repo", "global") == "global":
            global_facts.append(fact)

    sections.extend(
        _render_fact_group(repo_facts, f"### Repo: {repo}\n", _REPO_FACT_SECTIONS)
    )
    sections.extend(
        _render_fact_group(global_facts, "### Global\n", _BASE_FACT_SECTIONS)
    )
    sections.extend(
        _render_fact_group(
            agent_facts, f"### Agent Notes ({agent})\n", _BASE_FACT_SECTIONS
        )
    )

    # Per-agent markdown knowledge
    if agent:
        agent_md = load_agent_knowledge(agent)
        if agent_md:
            sections.append(f"### Agent Profile: {agent}\n")
            sections.append(agent_md + "\n")

    # Global markdown knowledge, only when global scope was requested
    if include_global:
        global_chunks = load_global_knowledge()
        if global_chunks:
            sections.append("### Global Notes\n")
            sections.extend(chunk + "\n" for chunk in global_chunks)

    # If nothing was found (only the title is present)
    if len(sections) == 1:
        sections.append("_No relevant knowledge found. Starting fresh._\n")
        if not facts:
            sections.append(
                "_Knowledge store is empty. Run the harvester to populate it._\n"
            )

    # Join and truncate
    context = "\n".join(sections)
    return truncate_to_tokens(context, max_tokens)
def main():
    """Command-line entry point.

    Parses the options, then prints either the markdown bootstrap context or,
    with ``--json``, the filtered and sorted facts plus match counts as JSON.
    Returns 0.
    """
    parser = argparse.ArgumentParser(
        description="Assemble pre-session context from knowledge store"
    )

    # Option table: (flag, keyword arguments for add_argument), in help order.
    cli_options = (
        (
            "--repo",
            {
                "type": str,
                "default": None,
                "help": "Repository name to filter facts by",
            },
        ),
        (
            "--agent",
            {
                "type": str,
                "default": None,
                "help": "Agent type to filter facts by (e.g., mimo-sprint, groq-fast)",
            },
        ),
        (
            "--global",
            {
                "dest": "include_global",
                "action": "store_true",
                "default": True,
                "help": "Include global facts (default: true)",
            },
        ),
        (
            "--no-global",
            {
                "dest": "include_global",
                "action": "store_false",
                "help": "Exclude global facts",
            },
        ),
        (
            "--max-tokens",
            {
                "type": int,
                "default": 2000,
                "help": "Maximum token count for output (default: 2000)",
            },
        ),
        (
            "--index",
            {
                "type": str,
                "default": None,
                "help": "Path to index.json (default: knowledge/index.json)",
            },
        ),
        (
            "--json",
            {
                "dest": "output_json",
                "action": "store_true",
                "help": "Output raw JSON instead of markdown",
            },
        ),
    )
    for flag, settings in cli_options:
        parser.add_argument(flag, **settings)

    args = parser.parse_args()
    index_path = INDEX_PATH if not args.index else Path(args.index)

    if not args.output_json:
        # Markdown mode: full bootstrap context
        print(
            build_bootstrap_context(
                repo=args.repo,
                agent=args.agent,
                include_global=args.include_global,
                max_tokens=args.max_tokens,
                index_path=index_path,
            )
        )
        return 0

    # JSON mode: return the filtered, sorted facts
    all_facts = load_index(index_path).get("facts", [])
    matched_facts = sort_facts(
        filter_facts(
            all_facts,
            repo=args.repo,
            agent=args.agent,
            include_global=args.include_global,
        )
    )
    report = {
        "repo": args.repo,
        "agent": args.agent,
        "include_global": args.include_global,
        "total_indexed": len(all_facts),
        "matched": len(matched_facts),
        "facts": matched_facts,
    }
    print(json.dumps(report, indent=2))
    return 0


if __name__ == "__main__":
    sys.exit(main())