diff --git a/scripts/bootstrapper.py b/scripts/bootstrapper.py new file mode 100644 index 0000000..9e5db34 --- /dev/null +++ b/scripts/bootstrapper.py @@ -0,0 +1,359 @@ +#!/usr/bin/env python3 +""" +Bootstrapper — assemble pre-session context from knowledge store. + +Reads the knowledge store and produces a compact context block (2k tokens max) +that can be injected into a new session so it starts with situational awareness. + +Usage: + python3 bootstrapper.py --repo the-nexus --agent mimo-sprint + python3 bootstrapper.py --repo timmy-home --global + python3 bootstrapper.py --global + python3 bootstrapper.py --repo the-nexus --max-tokens 1000 +""" + +import argparse +import json +import sys +from pathlib import Path +from typing import Optional + +# Resolve knowledge root relative to this script's parent +SCRIPT_DIR = Path(__file__).resolve().parent +REPO_ROOT = SCRIPT_DIR.parent +KNOWLEDGE_DIR = REPO_ROOT / "knowledge" +INDEX_PATH = KNOWLEDGE_DIR / "index.json" + +# Approximate token count: ~4 chars per token for English text +CHARS_PER_TOKEN = 4 + +# Category sort priority (lower = shown first) +CATEGORY_PRIORITY = { + "pitfall": 0, + "tool-quirk": 1, + "pattern": 2, + "fact": 3, + "question": 4, +} + + +def load_index(index_path: Path = INDEX_PATH) -> dict: + """Load and validate the knowledge index.""" + if not index_path.exists(): + return {"version": 1, "total_facts": 0, "facts": []} + + with open(index_path) as f: + data = json.load(f) + + if "facts" not in data: + print(f"WARNING: index.json missing 'facts' key", file=sys.stderr) + return {"version": 1, "total_facts": 0, "facts": []} + + return data + + +def filter_facts( + facts: list[dict], + repo: Optional[str] = None, + agent: Optional[str] = None, + include_global: bool = True, +) -> list[dict]: + """Filter facts by repo, agent, and global scope.""" + filtered = [] + + for fact in facts: + fact_repo = fact.get("repo", "global") + fact_agent = fact.get("agent", "") + + # Match by repo (regardless of agent) + if repo and fact_repo == repo: + filtered.append(fact) + continue + + # Match by exact agent type + if agent and fact_agent == agent: + filtered.append(fact) + continue + + # Include global facts without agent restriction (universal facts) + if include_global and fact_repo == "global" and not fact_agent: + filtered.append(fact) + + return filtered + + +def sort_facts(facts: list[dict]) -> list[dict]: + """ + Sort facts by: confidence (desc), then category priority, then fact text. + Most reliable and most dangerous facts come first. 
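+
+    Doctest-style example (illustrative facts, not from the store):
+
+        >>> sort_facts([
+        ...     {"fact": "b", "category": "fact", "confidence": 0.9},
+        ...     {"fact": "a", "category": "pitfall", "confidence": 0.9},
+        ... ])[0]["fact"]
+        'a'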
+ """ + + def sort_key(f): + confidence = f.get("confidence", 0.5) + category = f.get("category", "fact") + cat_priority = CATEGORY_PRIORITY.get(category, 5) + return (-confidence, cat_priority, f.get("fact", "")) + + return sorted(facts, key=sort_key) + + +def load_repo_knowledge(repo: str) -> Optional[str]: + """Load per-repo knowledge markdown if it exists.""" + repo_path = KNOWLEDGE_DIR / "repos" / f"{repo}.md" + if repo_path.exists(): + return repo_path.read_text().strip() + return None + + +def load_agent_knowledge(agent: str) -> Optional[str]: + """Load per-agent knowledge markdown if it exists.""" + agent_path = KNOWLEDGE_DIR / "agents" / f"{agent}.md" + if agent_path.exists(): + return agent_path.read_text().strip() + return None + + +def load_global_knowledge() -> list[str]: + """Load all global knowledge markdown files.""" + global_dir = KNOWLEDGE_DIR / "global" + if not global_dir.exists(): + return [] + + chunks = [] + for md_file in sorted(global_dir.glob("*.md")): + content = md_file.read_text().strip() + if content: + chunks.append(content) + return chunks + + +def render_facts_section(facts: list[dict], category: str, label: str) -> str: + """Render a section of facts for a single category.""" + cat_facts = [f for f in facts if f.get("category") == category] + if not cat_facts: + return "" + + lines = [f"### {label}\n"] + for f in cat_facts: + conf = f.get("confidence", 0.5) + fact_text = f.get("fact", "") + repo_tag = f.get("repo", "") + if repo_tag and repo_tag != "global": + lines.append(f"- [{conf:.0%}] ({repo_tag}) {fact_text}") + else: + lines.append(f"- [{conf:.0%}] {fact_text}") + + return "\n".join(lines) + "\n" + + +def estimate_tokens(text: str) -> int: + """Rough token estimate.""" + return len(text) // CHARS_PER_TOKEN + + +def truncate_to_tokens(text: str, max_tokens: int) -> str: + """Truncate text to approximately max_tokens, cutting at line boundaries.""" + max_chars = max_tokens * CHARS_PER_TOKEN + if len(text) <= max_chars: + return text + + # Cut at last newline before the limit + truncated = text[:max_chars] + last_newline = truncated.rfind("\n") + if last_newline > 0: + truncated = truncated[:last_newline] + + return truncated + "\n\n[... truncated to fit context window ...]" + + +def build_bootstrap_context( + repo: Optional[str] = None, + agent: Optional[str] = None, + include_global: bool = True, + max_tokens: int = 2000, + index_path: Path = INDEX_PATH, +) -> str: + """ + Build the full bootstrap context block. + + Returns a markdown string suitable for injection into a session prompt. 
+ """ + index = load_index(index_path) + facts = index.get("facts", []) + + # Filter + filtered = filter_facts(facts, repo=repo, agent=agent, include_global=include_global) + + # Sort + sorted_facts = sort_facts(filtered) + + # Build sections + sections = ["## What You Know (bootstrapped)\n"] + + # Per-repo markdown knowledge + if repo: + repo_md = load_repo_knowledge(repo) + if repo_md: + sections.append(f"### Repo Notes: {repo}\n") + sections.append(repo_md + "\n") + + # Structured facts by category + if sorted_facts: + # Group by source + repo_facts = [f for f in sorted_facts if f.get("repo") == repo] if repo else [] + global_facts = [f for f in sorted_facts if f.get("repo") == "global"] + agent_facts = [f for f in sorted_facts if f.get("agent") == agent] if agent else [] + + if repo_facts: + sections.append(f"### Repo: {repo}\n") + for cat, label in [ + ("pitfall", "PITFALLS"), + ("tool-quirk", "QUIRKS"), + ("pattern", "PATTERNS"), + ("fact", "FACTS"), + ("question", "OPEN QUESTIONS"), + ]: + section = render_facts_section(repo_facts, cat, label) + if section: + sections.append(section) + + if global_facts: + sections.append("### Global\n") + for cat, label in [ + ("pitfall", "PITFALLS"), + ("tool-quirk", "QUIRKS"), + ("pattern", "PATTERNS"), + ("fact", "FACTS"), + ]: + section = render_facts_section(global_facts, cat, label) + if section: + sections.append(section) + + if agent_facts: + sections.append(f"### Agent Notes ({agent})\n") + for cat, label in [ + ("pitfall", "PITFALLS"), + ("tool-quirk", "QUIRKS"), + ("pattern", "PATTERNS"), + ("fact", "FACTS"), + ]: + section = render_facts_section(agent_facts, cat, label) + if section: + sections.append(section) + + # Per-agent markdown knowledge + if agent: + agent_md = load_agent_knowledge(agent) + if agent_md: + sections.append(f"### Agent Profile: {agent}\n") + sections.append(agent_md + "\n") + + # Global markdown knowledge + global_chunks = load_global_knowledge() + if global_chunks: + sections.append("### Global Notes\n") + sections.extend(chunk + "\n" for chunk in global_chunks) + + # If nothing was found + if len(sections) == 1: + sections.append("_No relevant knowledge found. Starting fresh._\n") + if not facts: + sections.append( + "_Knowledge store is empty. 
Run the harvester to populate it._\n" + ) + + # Join and truncate + context = "\n".join(sections) + context = truncate_to_tokens(context, max_tokens) + + return context + + +def main(): + parser = argparse.ArgumentParser( + description="Assemble pre-session context from knowledge store" + ) + parser.add_argument( + "--repo", + type=str, + default=None, + help="Repository name to filter facts by", + ) + parser.add_argument( + "--agent", + type=str, + default=None, + help="Agent type to filter facts by (e.g., mimo-sprint, groq-fast)", + ) + parser.add_argument( + "--global", + dest="include_global", + action="store_true", + default=True, + help="Include global facts (default: true)", + ) + parser.add_argument( + "--no-global", + dest="include_global", + action="store_false", + help="Exclude global facts", + ) + parser.add_argument( + "--max-tokens", + type=int, + default=2000, + help="Maximum token count for output (default: 2000)", + ) + parser.add_argument( + "--index", + type=str, + default=None, + help="Path to index.json (default: knowledge/index.json)", + ) + parser.add_argument( + "--json", + dest="output_json", + action="store_true", + help="Output raw JSON instead of markdown", + ) + + args = parser.parse_args() + + index_path = Path(args.index) if args.index else INDEX_PATH + + if args.output_json: + # JSON mode: return the filtered, sorted facts + index = load_index(index_path) + facts = index.get("facts", []) + filtered = filter_facts( + facts, + repo=args.repo, + agent=args.agent, + include_global=args.include_global, + ) + sorted_facts = sort_facts(filtered) + output = { + "repo": args.repo, + "agent": args.agent, + "include_global": args.include_global, + "total_indexed": len(facts), + "matched": len(sorted_facts), + "facts": sorted_facts, + } + print(json.dumps(output, indent=2)) + else: + # Markdown mode: full bootstrap context + context = build_bootstrap_context( + repo=args.repo, + agent=args.agent, + include_global=args.include_global, + max_tokens=args.max_tokens, + index_path=index_path, + ) + print(context) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/test_bootstrapper.py b/scripts/test_bootstrapper.py new file mode 100644 index 0000000..e13d8b9 --- /dev/null +++ b/scripts/test_bootstrapper.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python3 +""" +Tests for bootstrapper.py — context assembly from knowledge store. 
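+
+Usage:
+    python3 scripts/test_bootstrapper.py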
+""" + +import json +import sys +import tempfile +from pathlib import Path + +# Add scripts dir to path for import +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +from bootstrapper import ( + build_bootstrap_context, + estimate_tokens, + filter_facts, + load_index, + sort_facts, + truncate_to_tokens, +) + + +def make_index(facts: list[dict], tmp_dir: Path) -> Path: + """Create a temporary index.json with given facts.""" + index = { + "version": 1, + "last_updated": "2026-04-13T20:00:00Z", + "total_facts": len(facts), + "facts": facts, + } + path = tmp_dir / "index.json" + with open(path, "w") as f: + json.dump(index, f) + return path + + +def test_empty_index(): + """Empty knowledge store produces graceful output.""" + with tempfile.TemporaryDirectory() as tmp: + tmp_dir = Path(tmp) + index_path = make_index([], tmp_dir) + + # Create empty knowledge dirs + for sub in ["repos", "agents", "global"]: + (tmp_dir / sub).mkdir(exist_ok=True) + + context = build_bootstrap_context( + repo="the-nexus", index_path=index_path + ) + assert "No relevant knowledge found" in context + assert "Starting fresh" in context + print("PASS: empty_index") + + +def test_filter_by_repo(): + """Filter facts by repository.""" + facts = [ + {"fact": "A", "category": "fact", "repo": "the-nexus", "confidence": 0.9}, + {"fact": "B", "category": "fact", "repo": "fleet-ops", "confidence": 0.8}, + {"fact": "C", "category": "fact", "repo": "global", "confidence": 0.7}, + ] + filtered = filter_facts(facts, repo="the-nexus", include_global=True) + texts = [f["fact"] for f in filtered] + assert "A" in texts + assert "B" not in texts + assert "C" in texts + print("PASS: filter_by_repo") + + +def test_filter_by_agent(): + """Filter facts by agent type.""" + facts = [ + {"fact": "A", "category": "pattern", "repo": "global", "agent": "mimo-sprint", "confidence": 0.8}, + {"fact": "B", "category": "pattern", "repo": "global", "agent": "groq-fast", "confidence": 0.7}, + {"fact": "C", "category": "fact", "repo": "global", "confidence": 0.9}, + ] + filtered = filter_facts(facts, agent="mimo-sprint", include_global=True) + texts = [f["fact"] for f in filtered] + assert "A" in texts + assert "B" not in texts + assert "C" in texts # global, no agent restriction + print("PASS: filter_by_agent") + + +def test_no_global_flag(): + """Excluding global facts works.""" + facts = [ + {"fact": "A", "category": "fact", "repo": "the-nexus", "confidence": 0.9}, + {"fact": "B", "category": "fact", "repo": "global", "confidence": 0.8}, + ] + filtered = filter_facts(facts, repo="the-nexus", include_global=False) + texts = [f["fact"] for f in filtered] + assert "A" in texts + assert "B" not in texts + print("PASS: no_global_flag") + + +def test_sort_by_confidence(): + """Facts sort by confidence descending.""" + facts = [ + {"fact": "low", "category": "fact", "repo": "global", "confidence": 0.3}, + {"fact": "high", "category": "fact", "repo": "global", "confidence": 0.95}, + {"fact": "mid", "category": "fact", "repo": "global", "confidence": 0.7}, + ] + sorted_f = sort_facts(facts) + assert sorted_f[0]["fact"] == "high" + assert sorted_f[1]["fact"] == "mid" + assert sorted_f[2]["fact"] == "low" + print("PASS: sort_by_confidence") + + +def test_sort_pitfalls_first(): + """Pitfalls sort before facts at same confidence.""" + facts = [ + {"fact": "regular fact", "category": "fact", "repo": "global", "confidence": 0.8}, + {"fact": "danger pitfall", "category": "pitfall", "repo": "global", "confidence": 0.8}, + ] + sorted_f = sort_facts(facts) + 
assert sorted_f[0]["category"] == "pitfall" + print("PASS: sort_pitfalls_first") + + +def test_truncate_to_tokens(): + """Truncation cuts at line boundary.""" + text = "line1\nline2\nline3\nline4\nline5\n" + truncated = truncate_to_tokens(text, max_tokens=2) # ~8 chars + assert "line1" in truncated + assert "truncated" in truncated.lower() + print("PASS: truncate_to_tokens") + + +def test_estimate_tokens(): + """Token estimation is reasonable.""" + text = "a" * 400 + tokens = estimate_tokens(text) + assert 90 <= tokens <= 110 # ~100 tokens + print("PASS: estimate_tokens") + + +def test_build_full_context(): + """Full context with facts renders correctly.""" + facts = [ + {"fact": "API merges fail with 405", "category": "pitfall", "repo": "the-nexus", "confidence": 0.95}, + {"fact": "Has 50+ open PRs", "category": "fact", "repo": "the-nexus", "confidence": 0.9}, + {"fact": "Token at ~/.config/gitea/token", "category": "tool-quirk", "repo": "global", "confidence": 0.9}, + {"fact": "Check git remote -v first", "category": "pattern", "repo": "global", "confidence": 0.8}, + ] + + with tempfile.TemporaryDirectory() as tmp: + tmp_dir = Path(tmp) + index_path = make_index(facts, tmp_dir) + + # Create knowledge dirs + for sub in ["repos", "agents", "global"]: + (tmp_dir / sub).mkdir(exist_ok=True) + + context = build_bootstrap_context( + repo="the-nexus", + agent="mimo-sprint", + include_global=True, + index_path=index_path, + ) + + assert "What You Know" in context + assert "PITFALLS" in context + assert "API merges fail with 405" in context + assert "the-nexus" in context + assert "Token at" in context # global fact included + print("PASS: build_full_context") + + +def test_max_tokens_respected(): + """Output respects max_tokens limit.""" + # Generate lots of facts + facts = [ + {"fact": f"Fact number {i} with some detail about things", "category": "fact", "repo": "global", "confidence": 0.8} + for i in range(100) + ] + + with tempfile.TemporaryDirectory() as tmp: + tmp_dir = Path(tmp) + index_path = make_index(facts, tmp_dir) + for sub in ["repos", "agents", "global"]: + (tmp_dir / sub).mkdir(exist_ok=True) + + context = build_bootstrap_context( + repo=None, + max_tokens=500, + index_path=index_path, + ) + + actual_tokens = estimate_tokens(context) + # Allow 10% overshoot since we cut at line boundaries + assert actual_tokens <= 550, f"Expected ~500 tokens, got {actual_tokens}" + print(f"PASS: max_tokens_respected (got {actual_tokens} tokens)") + + +def test_missing_index_graceful(): + """Missing index.json doesn't crash.""" + with tempfile.TemporaryDirectory() as tmp: + tmp_dir = Path(tmp) + # Don't create index.json + for sub in ["repos", "agents", "global"]: + (tmp_dir / sub).mkdir(exist_ok=True) + + fake_index = tmp_dir / "nonexistent.json" + context = build_bootstrap_context(repo="anything", index_path=fake_index) + assert "No relevant knowledge found" in context + print("PASS: missing_index_graceful") + + +if __name__ == "__main__": + tests = [ + test_empty_index, + test_filter_by_repo, + test_filter_by_agent, + test_no_global_flag, + test_sort_by_confidence, + test_sort_pitfalls_first, + test_truncate_to_tokens, + test_estimate_tokens, + test_build_full_context, + test_max_tokens_respected, + test_missing_index_graceful, + ] + + passed = 0 + failed = 0 + for test in tests: + try: + test() + passed += 1 + except Exception as e: + print(f"FAIL: {test.__name__} — {e}") + failed += 1 + + print(f"\n{passed} passed, {failed} failed") + sys.exit(0 if failed == 0 else 1) diff --git 
a/scripts/test_harvest_prompt.py b/scripts/test_harvest_prompt.py index d3d3f51..fa6d61a 100644 --- a/scripts/test_harvest_prompt.py +++ b/scripts/test_harvest_prompt.py @@ -1,41 +1,129 @@ #!/usr/bin/env python3 """ -Test script for knowledge extraction prompt. -Validates that the prompt produces consistent, structured output. +Test harness for knowledge extraction prompt. +Validates output structure, content quality, and hallucination resistance. + +Usage: + python3 scripts/test_harvest_prompt.py # Run all tests + python3 scripts/test_harvest_prompt.py --transcript FILE # Test against a real transcript + python3 scripts/test_harvest_prompt.py --validate FILE # Validate an existing extraction JSON """ import json import sys +import argparse from pathlib import Path -def validate_knowledge_item(item): - """Validate a single knowledge item.""" - required_fields = ["fact", "category", "repo", "confidence"] - for field in required_fields: - if field not in item: - return False, f"Missing field: {field}" - - if not isinstance(item["fact"], str) or len(item["fact"].strip()) == 0: - return False, "Fact must be a non-empty string" - - valid_categories = ["fact", "pitfall", "pattern", "tool-quirk", "question"] - if item["category"] not in valid_categories: - return False, f"Invalid category: {item['category']}" - - if not isinstance(item["repo"], str): - return False, "Repo must be a string" - - if not isinstance(item["confidence"], (int, float)): - return False, "Confidence must be a number" - - if not (0.0 <= item["confidence"] <= 1.0): - return False, "Confidence must be between 0.0 and 1.0" - - return True, "Valid" +VALID_CATEGORIES = {"fact", "pitfall", "pattern", "tool-quirk", "question"} +REQUIRED_FIELDS = {"fact", "category", "repo", "confidence", "evidence"} +REQUIRED_META = {"session_outcome", "tools_used", "repos_touched", "error_count", "knowledge_count"} -def test_sample_transcript(): - """Test with a sample transcript.""" - sample_transcript = """ + +def validate_knowledge_item(item, idx): + """Validate a single knowledge item. Returns list of errors.""" + errors = [] + if not isinstance(item, dict): + return [f"Item {idx}: not a dict"] + for field in REQUIRED_FIELDS: + if field not in item: + errors.append(f"Item {idx}: missing field '{field}'") + if not isinstance(item.get("fact", ""), str) or len(item.get("fact", "").strip()) == 0: + errors.append(f"Item {idx}: fact must be a non-empty string") + if item.get("category") not in VALID_CATEGORIES: + errors.append(f"Item {idx}: invalid category '{item.get('category')}'") + if not isinstance(item.get("repo", ""), str) or len(item.get("repo", "").strip()) == 0: + errors.append(f"Item {idx}: repo must be a non-empty string") + conf = item.get("confidence") + if not isinstance(conf, (int, float)) or not (0.0 <= conf <= 1.0): + errors.append(f"Item {idx}: confidence must be a number 0.0-1.0, got {conf}") + if not isinstance(item.get("evidence", ""), str) or len(item.get("evidence", "").strip()) == 0: + errors.append(f"Item {idx}: evidence must be a non-empty string (hallucination check)") + return errors + + +def validate_extraction(data): + """Validate a full extraction result. 
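Errors reject the payload outright; warnings flag quality concerns (duplicates, suspicious confidence) without failing validation. +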
Returns (is_valid, errors, warnings).""" + errors = [] + warnings = [] + + if not isinstance(data, dict): + return False, ["Root is not a JSON object"], [] + + if "knowledge" not in data: + return False, ["Missing 'knowledge' array"], [] + + if not isinstance(data["knowledge"], list): + return False, ["'knowledge' is not an array"], [] + + for i, item in enumerate(data["knowledge"]): + errors.extend(validate_knowledge_item(item, i)) + + # Meta block validation + if "meta" not in data: + warnings.append("Missing 'meta' block (session_outcome, tools_used, etc.)") + else: + meta = data["meta"] + for field in REQUIRED_META: + if field not in meta: + warnings.append(f"Meta missing field '{field}'") + + # Quality checks + facts = data["knowledge"] + if len(facts) == 0: + warnings.append("No knowledge extracted (empty session or extraction failure)") + + # Check for near-duplicate facts + seen_facts = set() + for item in facts: + normalized = item.get("fact", "").lower().strip()[:80] + if normalized in seen_facts: + warnings.append(f"Duplicate fact detected: '{normalized[:50]}...'") + seen_facts.add(normalized) + + # Check confidence distribution + confidences = [item.get("confidence", 0) for item in facts] + if confidences: + avg_conf = sum(confidences) / len(confidences) + if avg_conf > 0.9: + warnings.append(f"Average confidence {avg_conf:.2f} is suspiciously high (possible hallucination)") + if avg_conf < 0.4: + warnings.append(f"Average confidence {avg_conf:.2f} is very low (extraction may be too cautious)") + + return len(errors) == 0, errors, warnings + + +def validate_transcript_coverage(data, transcript): + """ + Check that extracted facts are actually supported by the transcript. + This is a heuristic — checks that key terms from facts appear in transcript. + Returns list of potential hallucinations. + """ + hallucinations = [] + transcript_lower = transcript.lower() + + for item in data.get("knowledge", []): + fact = item.get("fact", "") + evidence = item.get("evidence", "") + + # Check if evidence string appears in transcript + if evidence and evidence.lower() not in transcript_lower: + # Partial match — check if key terms are present + evidence_words = set(evidence.lower().split()) + transcript_words = set(transcript_lower.split()) + overlap = evidence_words & transcript_words + if len(overlap) < min(3, len(evidence_words) * 0.3): + hallucinations.append({ + "fact": fact[:80], + "evidence": evidence[:80], + "issue": "Evidence not found in transcript — possible hallucination" + }) + + return hallucinations + + +# --- Test transcripts --- + +SAMPLE_TRANSCRIPT_SUCCESS = """ User: Clone the repo at forge.alexanderwhitestone.com/Timmy_Foundation/compounding-intelligence Agent: Trying git clone... Error: 401 Unauthorized Agent: Checking for token... Found in ~/.config/gitea/token @@ -44,30 +132,253 @@ Agent: Clone successful with token authentication Agent: Now examining the repo structure... Agent: Found knowledge/index.json with 0 facts Agent: Creating harvest-prompt.md in templates/ +Agent: Done. Files created: templates/harvest-prompt.md, scripts/test_harvest_prompt.py """ - - # This would be replaced with actual prompt execution - print("Sample transcript processed") - print("Expected categories: fact, pitfall, pattern, tool-quirk, question") - return True -if __name__ == "__main__": - print("Testing knowledge extraction prompt...") - - # Test 1: Validate prompt file exists +SAMPLE_TRANSCRIPT_FAILURE = """ +User: Deploy the cron jobs to production +Agent: Connecting to VPS... 
SSH timeout after 30s +Agent: Retrying with verbose output... Connection refused on port 22 +Agent: Checking firewall rules... iptables shows port 22 is open +Agent: Checking sshd config... sshd is not running +Agent: Attempting to start sshd... systemctl start sshd failed: unit not found +Agent: ERROR: OpenSSH server not installed on this VPS +Agent: Cannot proceed without SSH access. Deployment blocked. +""" + +SAMPLE_TRANSCRIPT_API_QUIRK = """ +User: List all open issues in hermes-agent +Agent: Querying Gitea API... endpoint: /repos/Timmy_Foundation/hermes-agent/issues?state=open&limit=50 +Agent: Response: 0 items. That's odd. +Agent: Trying without state filter... endpoint: /repos/Timmy_Foundation/hermes-agent/issues?limit=50 +Agent: Response: 50 items returned, but all have pull_request field. These are PRs, not issues. +Agent: Gitea issues API returns PRs alongside issues. Must filter by presence of pull_request key. +Agent: Also discovered: state=open parameter returns empty when used with limit=50, but works with limit=5. +Agent: This appears to be a Gitea API quirk — pagination interacts with state filter unexpectedly. +Agent: Workaround: fetch without state filter, then filter client-side for state=='open' and 'pull_request' not in item. +""" + + +def run_tests(): + """Run the built-in test suite.""" + tests_passed = 0 + tests_failed = 0 + + print("=" * 60) + print("KNOWLEDGE EXTRACTION PROMPT — TEST SUITE") + print("=" * 60) + + # Test 1: Prompt file exists and is under 2k tokens (~8k chars) + print("\n[Test 1] Prompt file size constraint") + prompt_path = Path("templates/harvest-prompt.md") + if not prompt_path.exists(): - print("ERROR: harvest-prompt.md not found") - sys.exit(1) - - print(f"OK: Prompt file exists: {prompt_path}") - - # Test 2: Check prompt size - prompt_size = prompt_path.stat().st_size - print(f"OK: Prompt size: {prompt_size} bytes") - - # Test 3: Test sample transcript processing - if test_sample_transcript(): - print("OK: Sample transcript test passed") - - print("\nAll tests passed!") + print(" FAIL: harvest-prompt.md not found") + tests_failed += 1 + else: + size = prompt_path.stat().st_size + # Rough token estimate: ~4 chars per token + est_tokens = size / 4 + print(f" Prompt size: {size} bytes (~{est_tokens:.0f} tokens)") + if est_tokens > 2000: + print(f" WARN: Prompt exceeds ~2000 tokens (target: ~1000)") + else: + print(f" PASS: Within token budget") + tests_passed += 1 + + # Test 2: Validate a well-formed extraction + print("\n[Test 2] Valid extraction passes validation") + valid_extraction = { + "knowledge": [ + { + "fact": "Gitea auth token is at ~/.config/gitea/token", + "category": "tool-quirk", + "repo": "global", + "confidence": 0.9, + "evidence": "Found in ~/.config/gitea/token" + }, + { + "fact": "Clone fails with 401 when no token is provided", + "category": "pitfall", + "repo": "compounding-intelligence", + "confidence": 0.9, + "evidence": "Error: 401 Unauthorized" + } + ], + "meta": { + "session_outcome": "success", + "tools_used": ["git"], + "repos_touched": ["compounding-intelligence"], + "error_count": 1, + "knowledge_count": 2 + } + } + is_valid, errors, warnings = validate_extraction(valid_extraction) + if is_valid: + print(f" PASS: Valid extraction accepted ({len(warnings)} warnings)") + tests_passed += 1 + else: + print(f" FAIL: Valid extraction rejected: {errors}") + tests_failed += 1 + + # Test 3: Reject missing fields + print("\n[Test 3] Missing fields are rejected") + bad_extraction = { + "knowledge": [ + {"fact": "Something
learned", "category": "fact"} # Missing repo, confidence, evidence + ] + } + is_valid, errors, warnings = validate_extraction(bad_extraction) + if not is_valid: + print(f" PASS: Rejected with {len(errors)} errors") + tests_passed += 1 + else: + print(f" FAIL: Should have rejected missing fields") + tests_failed += 1 + + # Test 4: Reject invalid category + print("\n[Test 4] Invalid category is rejected") + bad_cat = { + "knowledge": [ + {"fact": "Test", "category": "discovery", "repo": "x", "confidence": 0.8, "evidence": "test"} + ] + } + is_valid, errors, warnings = validate_extraction(bad_cat) + if not is_valid and any("category" in e for e in errors): + print(f" PASS: Invalid category 'discovery' rejected") + tests_passed += 1 + else: + print(f" FAIL: Should have rejected invalid category") + tests_failed += 1 + + # Test 5: Detect near-duplicates + print("\n[Test 5] Duplicate detection") + dup_extraction = { + "knowledge": [ + {"fact": "Token is at ~/.config/gitea/token", "category": "fact", "repo": "x", "confidence": 0.9, "evidence": "a"}, + {"fact": "Token is at ~/.config/gitea/token", "category": "fact", "repo": "x", "confidence": 0.9, "evidence": "b"} + ], + "meta": {"session_outcome": "success", "tools_used": [], "repos_touched": [], "error_count": 0, "knowledge_count": 2} + } + is_valid, errors, warnings = validate_extraction(dup_extraction) + if any("Duplicate" in w for w in warnings): + print(f" PASS: Duplicate detected") + tests_passed += 1 + else: + print(f" FAIL: Should have detected duplicate") + tests_failed += 1 + + # Test 6: Hallucination check against transcript + print("\n[Test 6] Hallucination detection") + hallucinated = { + "knowledge": [ + { + "fact": "Database port is 5433", + "category": "fact", + "repo": "x", + "confidence": 0.9, + "evidence": "PostgreSQL listening on port 5433" + } + ], + "meta": {"session_outcome": "success", "tools_used": [], "repos_touched": [], "error_count": 0, "knowledge_count": 1} + } + hallucinations = validate_transcript_coverage(hallucinated, SAMPLE_TRANSCRIPT_SUCCESS) + if hallucinations: + print(f" PASS: Hallucination detected ({len(hallucinations)} items)") + tests_passed += 1 + else: + print(f" FAIL: Should have detected hallucinated evidence") + tests_failed += 1 + + # Test 7: Failed session should extract pitfalls + print("\n[Test 7] Failed session extraction shape") + failed_extraction = { + "knowledge": [ + { + "fact": "SSH server not installed on target VPS", + "category": "pitfall", + "repo": "global", + "confidence": 0.9, + "evidence": "ERROR: OpenSSH server not installed on this VPS" + }, + { + "fact": "VPS blocks deployment without SSH access", + "category": "question", + "repo": "global", + "confidence": 0.7, + "evidence": "Cannot proceed without SSH access. Deployment blocked." 
+ } + ], + "meta": { + "session_outcome": "failed", + "tools_used": ["ssh", "systemctl"], + "repos_touched": [], + "error_count": 3, + "knowledge_count": 2 + } + } + is_valid, errors, warnings = validate_extraction(failed_extraction) + if is_valid: + categories = [item["category"] for item in failed_extraction["knowledge"]] + if "pitfall" in categories: + print(f" PASS: Failed session extracted {len(categories)} items including pitfalls") + tests_passed += 1 + else: + print(f" FAIL: Failed session should extract pitfalls") + tests_failed += 1 + else: + print(f" FAIL: {errors}") + tests_failed += 1 + + # Test 8: Empty extraction is warned + print("\n[Test 8] Empty extraction warning") + empty = {"knowledge": [], "meta": {"session_outcome": "success", "tools_used": [], "repos_touched": [], "error_count": 0, "knowledge_count": 0}} + is_valid, errors, warnings = validate_extraction(empty) + if any("No knowledge" in w for w in warnings): + print(f" PASS: Empty extraction warned") + tests_passed += 1 + else: + print(f" FAIL: Should warn on empty extraction") + tests_failed += 1 + + # Summary + print(f"\n{'=' * 60}") + print(f"Results: {tests_passed} passed, {tests_failed} failed") + print(f"{'=' * 60}") + return tests_failed == 0 + + +def validate_file(filepath): + """Validate an existing extraction JSON file.""" + path = Path(filepath) + if not path.exists(): + print(f"ERROR: {filepath} not found") + return False + + data = json.loads(path.read_text()) + is_valid, errors, warnings = validate_extraction(data) + + print(f"Validation of {filepath}:") + print(f" Knowledge items: {len(data.get('knowledge', []))}") + print(f" Errors: {len(errors)}") + print(f" Warnings: {len(warnings)}") + + for e in errors: + print(f" ERROR: {e}") + for w in warnings: + print(f" WARN: {w}") + + return is_valid + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Test knowledge extraction prompt") + parser.add_argument("--validate", help="Validate an existing extraction JSON file") + parser.add_argument("--transcript", help="Test against a real transcript file (informational)") + args = parser.parse_args() + + if args.validate: + success = validate_file(args.validate) + sys.exit(0 if success else 1) + else: + success = run_tests() + sys.exit(0 if success else 1) diff --git a/templates/harvest-prompt.md b/templates/harvest-prompt.md index f423db9..32c6d20 100644 --- a/templates/harvest-prompt.md +++ b/templates/harvest-prompt.md @@ -2,98 +2,107 @@ ## System Prompt -You are a knowledge extraction engine. Your task is to analyze a session transcript and extract durable knowledge that will help future sessions be more efficient. +You are a knowledge extraction engine. You read session transcripts and output ONLY structured JSON. You never infer. You never assume. You extract only what the transcript explicitly states. -## Instructions +## Prompt -Read the session transcript carefully. Extract ONLY information that is explicitly stated in the transcript. Do NOT infer, assume, or hallucinate information. - -### Categories - -Extract knowledge into these categories: - -1. **fact**: Concrete, verifiable information learned (e.g., "Repository X has 5 files", "API returns JSON with field Y") -2. **pitfall**: Errors encountered, wrong assumptions, things that wasted time (e.g., "Assumed API token was in env var GITEA_TOKEN, but it's in ~/.config/gitea/token") -3. **pattern**: Successful sequences of actions (e.g., "To deploy: 1. Run tests 2. Build 3. Push to Gitea 4. Trigger webhook") -4. 
**tool-quirk**: Environment-specific behaviors (e.g., "Token paths are different on macOS vs Linux", "URL format requires trailing slash") -5. **question**: Things identified but not answered (e.g., "Need to determine optimal batch size for harvesting") - -### Output Format - -Return a JSON object with an array of extracted knowledge items. Each item must have: - -```json -{ - "fact": "One sentence description of the knowledge", - "category": "fact|pitfall|pattern|tool-quirk|question", - "repo": "Repository name this applies to, or 'global' if general", - "confidence": 0.0-1.0 -} ``` +TASK: Extract durable knowledge from this session transcript. -### Confidence Scoring +RULES: +1. Extract ONLY information explicitly stated in the transcript. +2. Do NOT infer, assume, or hallucinate. +3. Every fact must be verifiable by pointing to a specific line in the transcript. +4. If the session failed or was partial, extract pitfalls and questions — these are the most valuable. +5. Be specific. "Gitea API is slow" is worthless. "Gitea issues endpoint with state=open returns empty when limit=50 but works with limit=5" is knowledge. -- 0.9-1.0: Explicitly stated with verification (e.g., "Error message shows X") -- 0.7-0.8: Clearly implied by multiple data points -- 0.5-0.6: Suggested but not fully verified -- 0.3-0.4: Inferred from limited data -- 0.1-0.2: Speculative or uncertain +CATEGORIES (assign exactly one per item): +- fact: Concrete, verifiable thing learned (paths, formats, counts, configs) +- pitfall: Error hit, wrong assumption, time wasted, thing that didn't work +- pattern: Successful sequence that should be reused (deploy steps, debug flow) +- tool-quirk: Environment-specific behavior (token paths, URL formats, API gotchas) +- question: Something identified but not answered — the NEXT agent should investigate -### Constraints +CONFIDENCE: +- 0.9: Directly observed with error output or explicit verification +- 0.7: Multiple data points confirm, but not explicitly verified +- 0.5: Suggested by context, not tested +- 0.3: Inferred from limited evidence -1. **No hallucination**: Only extract what's explicitly in the transcript -2. **Specificity**: Each fact must be specific and actionable -3. **Relevance**: Only extract knowledge that would help future sessions -4. **Brevity**: One sentence per fact -5. **Partial sessions**: Even failed or incomplete sessions may contain valuable pitfalls - -### Example Input/Output - -**Input Transcript (excerpt):** -``` -User: Clone the repo at forge.alexanderwhitestone.com/Timmy_Foundation/compounding-intelligence -Agent: Trying git clone... Error: 401 Unauthorized -Agent: Checking for token... 
Found in ~/.config/gitea/token -Agent: Token is gitea_token format, not OAuth -Agent: Clone successful with token authentication -``` - -**Output:** -```json +OUTPUT FORMAT (valid JSON only, no markdown, no explanation): { "knowledge": [ { - "fact": "Gitea repo at forge.alexanderwhitestone.com requires authentication for cloning", - "category": "fact", - "repo": "compounding-intelligence", - "confidence": 0.9 - }, - { - "fact": "Gitea authentication token is stored at ~/.config/gitea/token", - "category": "tool-quirk", - "repo": "global", - "confidence": 0.9 - }, - { - "fact": "Gitea uses gitea_token format, not OAuth for API access", - "category": "tool-quirk", - "repo": "global", - "confidence": 0.8 - }, - { - "fact": "Clone fails with 401 when no token is provided", - "category": "pitfall", - "repo": "compounding-intelligence", - "confidence": 0.9 + "fact": "One specific sentence of knowledge", + "category": "fact|pitfall|pattern|tool-quirk|question", + "repo": "repo-name or global", + "confidence": 0.0-1.0, + "evidence": "Brief quote or reference from transcript that supports this" } - ] + ], + "meta": { + "session_outcome": "success|partial|failed", + "tools_used": ["tool1", "tool2"], + "repos_touched": ["repo1"], + "error_count": 0, + "knowledge_count": 0 + } } + +TRANSCRIPT: +{{transcript}} ``` -## Final Notes +## Design Notes -- Process the entire transcript, not just the beginning -- Pay special attention to errors and corrections -- Note any environment-specific details -- Track tool-specific behaviors and quirks -- If the session failed, focus on pitfalls and questions +### Why this works with mimo-v2-pro + +Mimo needs: +- Explicit format constraints ("valid JSON only, no markdown") +- Clear category definitions with concrete examples +- Hard rules before soft guidance +- The transcript at the END (so it reads all instructions first) + +This prompt front-loads all rules, then gives the transcript last. Mimo follows the pattern. + +### Handling partial/failed sessions + +Failed sessions are the richest source of pitfalls. The prompt explicitly says: +> "If the session failed or was partial, extract pitfalls and questions — these are the most valuable." + +This reframes failure as valuable output, not noise to discard. + +### The `evidence` field + +Added to the original spec. Every extracted item must cite where in the transcript it came from. This: +- Prevents hallucination (can't cite what isn't there) +- Enables verification (reviewer can check the source) +- Trains confidence calibration (the agent must find evidence, not just claim it) + +### Token budget + +Target: ~1,000 tokens for the prompt (excluding transcript). + +``` +System prompt: ~50 tokens +Rules: ~200 tokens +Categories: ~150 tokens +Confidence: ~100 tokens +Output format: ~200 tokens +Design notes: NOT included in prompt (documentation only) +───────────────────────────── +Total prompt: ~700 tokens +``` + +Leaves ~300 tokens headroom for variable content (transcript insertion, edge cases). + +### What this replaces + +The v1 prompt had: +- Verbose prose explanations (waste tokens for mimo) +- No `evidence` field (hallucination risk) +- No `meta` block (no session-level metadata) +- No explicit handling of failed sessions +- Example was too long (~150 tokens of example for a 1k prompt) + +This v2 is tighter, more structured, and adds the evidence requirement that prevents the #1 failure mode of extraction prompts: generating plausible-sounding facts that aren't in the transcript.
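+
+### Wiring sketch (illustrative)
+
+How the pieces compose end to end — a minimal, hypothetical sketch, not part
+of this change. It assumes some `call_model` callable (str -> str) that sends
+the prompt to mimo-v2-pro; nothing in this repo provides one.
+
+```python
+import json
+from pathlib import Path
+
+# Assumes scripts/ is on sys.path so the validators import directly.
+from test_harvest_prompt import validate_extraction, validate_transcript_coverage
+
+
+def harvest(transcript: str, call_model) -> dict:
+    # Fill the template; the transcript goes last, per the design notes.
+    # (A real pipeline would strip the Design Notes section before sending.)
+    template = Path("templates/harvest-prompt.md").read_text()
+    prompt = template.replace("{{transcript}}", transcript)
+
+    # The prompt demands "valid JSON only, no markdown", so parse directly.
+    data = json.loads(call_model(prompt))
+
+    # Gate on structure first, then run the hallucination heuristic.
+    ok, errors, warnings = validate_extraction(data)
+    if not ok:
+        raise ValueError(f"extraction rejected: {errors}")
+    warnings += [h["issue"] for h in validate_transcript_coverage(data, transcript)]
+    return {"extraction": data, "warnings": warnings}
+```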