Assembles relevant knowledge from the store into a compact 2k-token context block for session injection. Features: filtering by repo, agent type, and global scope; confidence-based sorting (pitfalls first, then patterns and facts); per-repo and per-agent markdown knowledge files; graceful empty-store handling; a JSON output mode for programmatic use; and token-count-aware truncation at line boundaries. Closes #11
#!/usr/bin/env python3
|
|
"""
|
|
Bootstrapper — assemble pre-session context from knowledge store.
|
|
|
|
Reads the knowledge store and produces a compact context block (2k tokens max)
|
|
that can be injected into a new session so it starts with situational awareness.
|
|
|
|
Usage:
|
|
python3 bootstrapper.py --repo the-nexus --agent mimo-sprint
|
|
python3 bootstrapper.py --repo timmy-home --global
|
|
python3 bootstrapper.py --global
|
|
python3 bootstrapper.py --repo the-nexus --max-tokens 1000
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
# Resolve knowledge root relative to this script's parent.
# Expected layout: <repo>/knowledge/{index.json, repos/*.md, agents/*.md, global/*.md}
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
KNOWLEDGE_DIR = REPO_ROOT / "knowledge"
INDEX_PATH = KNOWLEDGE_DIR / "index.json"

# Approximate token count: ~4 chars per token for English text
# (used by estimate_tokens / truncate_to_tokens; a heuristic, not exact).
CHARS_PER_TOKEN = 4

# Category sort priority (lower = shown first).
# Categories not listed here fall back to priority 5 in sort_facts,
# i.e. they sort after everything listed below.
CATEGORY_PRIORITY = {
    "pitfall": 0,
    "tool-quirk": 1,
    "pattern": 2,
    "fact": 3,
    "question": 4,
}
|
|
|
|
|
|
def load_index(index_path: Path = INDEX_PATH) -> dict:
    """Load and validate the knowledge index.

    Args:
        index_path: Path to index.json (defaults to the repo knowledge dir).

    Returns:
        The parsed index dict. On a missing file, unparseable JSON, or a
        missing ``facts`` key, returns an empty index skeleton instead of
        raising, so callers keep the graceful empty-store behavior.
    """
    empty_index = {"version": 1, "total_facts": 0, "facts": []}

    if not index_path.exists():
        return empty_index

    try:
        # Explicit encoding: don't depend on the locale default.
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as exc:
        # A corrupt index should degrade to "no knowledge", not crash the
        # session bootstrap — consistent with the missing-file path above.
        print(f"WARNING: could not parse {index_path}: {exc}", file=sys.stderr)
        return empty_index

    if "facts" not in data:
        # Plain string (was an f-string with no placeholders — lint F541).
        print("WARNING: index.json missing 'facts' key", file=sys.stderr)
        return empty_index

    return data
|
|
|
|
|
|
def filter_facts(
    facts: list[dict],
    repo: Optional[str] = None,
    agent: Optional[str] = None,
    include_global: bool = True,
) -> list[dict]:
    """Filter facts by repo, agent, and global scope."""

    def _keep(entry: dict) -> bool:
        entry_repo = entry.get("repo", "global")
        entry_agent = entry.get("agent", "")
        # Repo-scoped facts are kept regardless of their agent tag.
        if repo and entry_repo == repo:
            return True
        # Agent-scoped facts are kept regardless of their repo tag.
        if agent and entry_agent == agent:
            return True
        # Universal facts: global scope with no agent restriction.
        return include_global and entry_repo == "global" and not entry_agent

    return [entry for entry in facts if _keep(entry)]
|
|
|
|
|
|
def sort_facts(facts: list[dict]) -> list[dict]:
    """
    Sort facts by: confidence (desc), then category priority, then fact text.
    Most reliable and most dangerous facts come first.
    """
    # Negated confidence gives a descending sort inside an ascending tuple key;
    # unknown categories fall back to priority 5 (after every known category).
    return sorted(
        facts,
        key=lambda entry: (
            -entry.get("confidence", 0.5),
            CATEGORY_PRIORITY.get(entry.get("category", "fact"), 5),
            entry.get("fact", ""),
        ),
    )
|
|
|
|
|
|
def load_repo_knowledge(repo: str) -> Optional[str]:
    """Load per-repo knowledge markdown if it exists.

    Args:
        repo: Repository name; maps to knowledge/repos/<repo>.md.

    Returns:
        The stripped file contents, or None when no file exists.
    """
    repo_path = KNOWLEDGE_DIR / "repos" / f"{repo}.md"
    if repo_path.exists():
        # Explicit encoding: read_text() otherwise uses the locale default,
        # which breaks on non-ASCII notes under e.g. a C/POSIX locale.
        return repo_path.read_text(encoding="utf-8").strip()
    return None
|
|
|
|
|
|
def load_agent_knowledge(agent: str) -> Optional[str]:
    """Load per-agent knowledge markdown if it exists.

    Args:
        agent: Agent type name; maps to knowledge/agents/<agent>.md.

    Returns:
        The stripped file contents, or None when no file exists.
    """
    agent_path = KNOWLEDGE_DIR / "agents" / f"{agent}.md"
    if agent_path.exists():
        # Explicit encoding: read_text() otherwise uses the locale default,
        # which breaks on non-ASCII notes under e.g. a C/POSIX locale.
        return agent_path.read_text(encoding="utf-8").strip()
    return None
|
|
|
|
|
|
def load_global_knowledge() -> list[str]:
    """Load all global knowledge markdown files.

    Returns:
        Stripped contents of every non-empty knowledge/global/*.md file,
        in sorted filename order; empty list when the directory is absent.
    """
    global_dir = KNOWLEDGE_DIR / "global"
    if not global_dir.exists():
        return []

    chunks = []
    # Sorted for deterministic output ordering across filesystems.
    for md_file in sorted(global_dir.glob("*.md")):
        # Explicit encoding: read_text() otherwise uses the locale default.
        content = md_file.read_text(encoding="utf-8").strip()
        if content:
            chunks.append(content)
    return chunks
|
|
|
|
|
|
def render_facts_section(facts: list[dict], category: str, label: str) -> str:
    """Render a section of facts for a single category."""
    selected = [entry for entry in facts if entry.get("category") == category]
    if not selected:
        return ""

    rendered = [f"### {label}\n"]
    for entry in selected:
        confidence = entry.get("confidence", 0.5)
        text = entry.get("fact", "")
        scope = entry.get("repo", "")
        bullet = f"- [{confidence:.0%}]"
        # Non-global facts carry their repo as a parenthesized tag.
        if scope and scope != "global":
            rendered.append(f"{bullet} ({scope}) {text}")
        else:
            rendered.append(f"{bullet} {text}")

    return "\n".join(rendered) + "\n"
|
|
|
|
|
|
def estimate_tokens(text: str) -> int:
    """Rough token estimate via integer division by the chars-per-token ratio."""
    char_count = len(text)
    return char_count // CHARS_PER_TOKEN
|
|
|
|
|
|
def truncate_to_tokens(text: str, max_tokens: int) -> str:
    """Truncate text to approximately max_tokens, cutting at line boundaries."""
    limit = max_tokens * CHARS_PER_TOKEN
    if len(text) <= limit:
        return text

    # Prefer the last newline inside the budget so no line is chopped
    # mid-way; fall back to a hard character cut when none exists.
    head = text[:limit]
    cut = head.rfind("\n")
    if cut > 0:
        head = head[:cut]

    return head + "\n\n[... truncated to fit context window ...]"
|
|
|
|
|
|
# (category, section label) pairs in display order. The trailing "question"
# entry is rendered only in the repo-scoped section, matching the original
# per-section tables this list replaces.
_CATEGORY_LABELS = [
    ("pitfall", "PITFALLS"),
    ("tool-quirk", "QUIRKS"),
    ("pattern", "PATTERNS"),
    ("fact", "FACTS"),
    ("question", "OPEN QUESTIONS"),
]


def _append_fact_sections(sections: list[str], facts: list[dict], include_questions: bool) -> None:
    """Append one rendered markdown section per category that has facts."""
    pairs = _CATEGORY_LABELS if include_questions else _CATEGORY_LABELS[:-1]
    for cat, label in pairs:
        section = render_facts_section(facts, cat, label)
        if section:
            sections.append(section)


def build_bootstrap_context(
    repo: Optional[str] = None,
    agent: Optional[str] = None,
    include_global: bool = True,
    max_tokens: int = 2000,
    index_path: Path = INDEX_PATH,
) -> str:
    """
    Build the full bootstrap context block.

    Args:
        repo: Repository to scope facts and markdown notes to.
        agent: Agent type to scope facts and markdown notes to.
        include_global: Whether universal (global, agent-less) facts are kept.
        max_tokens: Approximate token budget for the final output.
        index_path: Override for the knowledge index location.

    Returns a markdown string suitable for injection into a session prompt.
    """
    index = load_index(index_path)
    facts = index.get("facts", [])

    # Filter, then sort (confidence desc, pitfalls first).
    filtered = filter_facts(facts, repo=repo, agent=agent, include_global=include_global)
    sorted_facts = sort_facts(filtered)

    sections = ["## What You Know (bootstrapped)\n"]

    # Per-repo markdown knowledge
    if repo:
        repo_md = load_repo_knowledge(repo)
        if repo_md:
            sections.append(f"### Repo Notes: {repo}\n")
            sections.append(repo_md + "\n")

    # Structured facts grouped by source. A fact matching both repo and agent
    # appears in both groups — presumably intentional emphasis; confirm.
    if sorted_facts:
        repo_facts = [f for f in sorted_facts if f.get("repo") == repo] if repo else []
        # BUG FIX: use the same missing-key default as filter_facts
        # (fact.get("repo", "global")), so facts without a "repo" field —
        # which the filter treats as global — are no longer silently dropped.
        global_facts = [f for f in sorted_facts if f.get("repo", "global") == "global"]
        agent_facts = [f for f in sorted_facts if f.get("agent") == agent] if agent else []

        if repo_facts:
            sections.append(f"### Repo: {repo}\n")
            _append_fact_sections(sections, repo_facts, include_questions=True)

        if global_facts:
            sections.append("### Global\n")
            _append_fact_sections(sections, global_facts, include_questions=False)

        if agent_facts:
            sections.append(f"### Agent Notes ({agent})\n")
            _append_fact_sections(sections, agent_facts, include_questions=False)

    # Per-agent markdown knowledge
    if agent:
        agent_md = load_agent_knowledge(agent)
        if agent_md:
            sections.append(f"### Agent Profile: {agent}\n")
            sections.append(agent_md + "\n")

    # Global markdown knowledge
    global_chunks = load_global_knowledge()
    if global_chunks:
        sections.append("### Global Notes\n")
        sections.extend(chunk + "\n" for chunk in global_chunks)

    # Graceful empty-store messaging (only the header was emitted above).
    if len(sections) == 1:
        sections.append("_No relevant knowledge found. Starting fresh._\n")
        if not facts:
            sections.append(
                "_Knowledge store is empty. Run the harvester to populate it._\n"
            )

    # Join and enforce the token budget, cutting at a line boundary.
    return truncate_to_tokens("\n".join(sections), max_tokens)
|
|
|
|
|
|
def _build_arg_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for the bootstrapper."""
    parser = argparse.ArgumentParser(
        description="Assemble pre-session context from knowledge store"
    )
    parser.add_argument(
        "--repo",
        type=str,
        default=None,
        help="Repository name to filter facts by",
    )
    parser.add_argument(
        "--agent",
        type=str,
        default=None,
        help="Agent type to filter facts by (e.g., mimo-sprint, groq-fast)",
    )
    parser.add_argument(
        "--global",
        dest="include_global",
        action="store_true",
        default=True,
        help="Include global facts (default: true)",
    )
    parser.add_argument(
        "--no-global",
        dest="include_global",
        action="store_false",
        help="Exclude global facts",
    )
    parser.add_argument(
        "--max-tokens",
        type=int,
        default=2000,
        help="Maximum token count for output (default: 2000)",
    )
    parser.add_argument(
        "--index",
        type=str,
        default=None,
        help="Path to index.json (default: knowledge/index.json)",
    )
    parser.add_argument(
        "--json",
        dest="output_json",
        action="store_true",
        help="Output raw JSON instead of markdown",
    )
    return parser


def main():
    """CLI entry point: print markdown context (default) or raw JSON facts."""
    args = _build_arg_parser().parse_args()

    index_path = Path(args.index) if args.index else INDEX_PATH

    if not args.output_json:
        # Markdown mode: full bootstrap context
        context = build_bootstrap_context(
            repo=args.repo,
            agent=args.agent,
            include_global=args.include_global,
            max_tokens=args.max_tokens,
            index_path=index_path,
        )
        print(context)
        return 0

    # JSON mode: return the filtered, sorted facts
    index = load_index(index_path)
    facts = index.get("facts", [])
    matched = sort_facts(
        filter_facts(
            facts,
            repo=args.repo,
            agent=args.agent,
            include_global=args.include_global,
        )
    )
    payload = {
        "repo": args.repo,
        "agent": args.agent,
        "include_global": args.include_global,
        "total_indexed": len(facts),
        "matched": len(matched),
        "facts": matched,
    }
    print(json.dumps(payload, indent=2))
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status
    # (raise SystemExit(code) is equivalent to sys.exit(code)).
    raise SystemExit(main())
|