# compounding-intelligence/scripts/knowledge_synthesizer.py

#!/usr/bin/env python3
"""
knowledge_synthesizer.py — Zero-shot knowledge synthesis for compounding intelligence.

Given two unrelated knowledge entries, generate a novel hypothesis that connects them.
Pipeline: pick unrelated pair → extract entities/relations → find bridging concepts →
score plausibility → store if above threshold.

Usage:
    python3 scripts/knowledge_synthesizer.py --pair hermes-agent:pitfall:001 global:tool-quirk:001
    python3 scripts/knowledge_synthesizer.py --auto --threshold 0.75
    python3 scripts/knowledge_synthesizer.py --dry-run  # show candidate pair without synthesizing
"""

import argparse
import json
import os
import sys
import time
import hashlib
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional, Tuple, List, Dict

# Absolute path of the directory that contains this script.
SCRIPT_DIR = Path(__file__).parent.absolute()
# Put the script directory first on the import path
# (presumably so sibling modules in scripts/ can be imported — confirm).
sys.path.insert(0, str(SCRIPT_DIR))

# Repository layout, derived from the script location.
REPO_ROOT = SCRIPT_DIR.parent
KNOWLEDGE_DIR = REPO_ROOT / "knowledge"  # holds index.json and the per-domain YAML files
TEMPLATE_PATH = SCRIPT_DIR.parent / "templates" / "synthesis-prompt.md"  # optional prompt override

# Default API configuration
# Each setting reads its SYNTHESIS_* environment variable first, then the
# HARVESTER_* variable (where one exists), then the hard-coded default.
DEFAULT_API_BASE = os.environ.get(
    "SYNTHESIS_API_BASE",
    os.environ.get("HARVESTER_API_BASE", "https://api.nousresearch.com/v1")
)
# Empty string when SYNTHESIS_API_KEY is not set.
DEFAULT_API_KEY = os.environ.get("SYNTHESIS_API_KEY", "")
DEFAULT_MODEL = os.environ.get(
    "SYNTHESIS_MODEL",
    os.environ.get("HARVESTER_MODEL", "xiaomi/mimo-v2-pro")
)

# Places to look for API keys if not in env
# find_api_key() tries these files in list order; "~" is expanded at import time.
API_KEY_PATHS = [
    os.path.expanduser("~/.config/nous/key"),
    os.path.expanduser("~/.hermes/keymaxxing/active/minimax.key"),
    os.path.expanduser("~/.config/openrouter/key"),
]


def find_api_key() -> str:
    """Return the first non-empty API key found in ``API_KEY_PATHS``.

    Candidate files are tried in list order. A candidate that is missing,
    unreadable, a directory, or not valid UTF-8 is skipped, so one bad entry
    cannot prevent a later, valid key file from being used.

    Returns:
        The key with surrounding whitespace stripped, or ``""`` when no
        candidate yields a non-empty key.
    """
    for path in API_KEY_PATHS:
        # EAFP: open directly instead of exists()+open(), which is racy and
        # still fails on directories and permission errors.
        try:
            with open(path, encoding='utf-8') as f:
                key = f.read().strip()
        except (OSError, UnicodeDecodeError):
            continue
        if key:
            return key
    return ""


def load_index() -> dict:
    """Load the knowledge index from ``KNOWLEDGE_DIR / "index.json"``.

    Returns:
        The parsed index. When the file does not exist, a fresh empty index
        (``version`` 1, no facts) is returned instead. A loaded index that
        has no ``facts`` entry (or a null one) gets an empty list so callers
        can always use ``index['facts']``.

    Raises:
        json.JSONDecodeError: if the file exists but is not valid JSON.
    """
    index_path = KNOWLEDGE_DIR / "index.json"
    if not index_path.exists():
        return {"version": 1, "total_facts": 0, "facts": []}
    # The index is written as UTF-8 with non-ASCII characters left unescaped,
    # so it must be read back as UTF-8 rather than with the locale encoding.
    with open(index_path, encoding='utf-8') as f:
        index = json.load(f)
    if isinstance(index, dict) and index.get("facts") is None:
        index["facts"] = []
    return index


def save_index(index: dict) -> None:
    """Write *index* to ``KNOWLEDGE_DIR / "index.json"`` as indented UTF-8 JSON.

    The knowledge directory is created if necessary. The JSON is serialised
    up front and written to a temporary sibling file that then replaces the
    real index, so a serialisation error or an interrupted write cannot leave
    a truncated ``index.json`` behind.

    Raises:
        TypeError: if *index* contains values that are not JSON-serialisable.
        OSError: if the file cannot be written or replaced.
    """
    KNOWLEDGE_DIR.mkdir(parents=True, exist_ok=True)
    index_path = KNOWLEDGE_DIR / "index.json"

    # Serialise before touching the filesystem: failure here leaves the
    # existing index untouched.
    payload = json.dumps(index, indent=2, ensure_ascii=False)

    # Same directory as the target so os.replace() stays on one filesystem.
    tmp_path = index_path.with_name(index_path.name + ".tmp")
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(payload)
        os.replace(tmp_path, index_path)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up after a failed write.
        if tmp_path.exists():
            tmp_path.unlink()


def next_sequence(facts: List[dict], domain: str, category: str) -> int:
    """Return one more than the highest sequence number used for ``domain:category``.

    Only facts whose ``id`` starts with ``"<domain>:<category>:"`` are
    considered; the sequence number is the text after the last colon. IDs
    whose final segment is not an integer are ignored. With no matching
    fact the result is 1.
    """
    wanted = f"{domain}:{category}:"
    highest = 0
    for entry in facts:
        identifier = entry.get('id', '')
        if not identifier.startswith(wanted):
            continue
        tail = identifier.rsplit(':', 1)[-1]
        try:
            number = int(tail)
        except ValueError:
            # Non-numeric suffix: not a sequence number, skip it.
            continue
        if number > highest:
            highest = number
    return highest + 1


def generate_id(domain: str, category: str, facts: List[dict]) -> str:
    """Build the next free fact ID of the form ``domain:category:NNN``.

    The numeric part comes from ``next_sequence`` and is zero-padded to at
    least three digits.
    """
    return "{}:{}:{:03d}".format(domain, category, next_sequence(facts, domain, category))


def facts_are_unrelated(f1: dict, f2: dict) -> bool:
    """Return True if neither fact lists the other's ID in its 'related' field.

    A missing or null 'related' field counts as "no links".
    """
    id1, id2 = f1['id'], f2['id']
    rel1 = set(f1.get('related') or [])
    rel2 = set(f2.get('related') or [])
    return (id2 not in rel1) and (id1 not in rel2)


def _bridged_pairs(facts: List[dict]) -> set:
    """Collect the ID pairs that some existing fact already links together.

    Every two IDs that appear together in one fact's 'related' list form a
    bridged pair. A stored synthesis lists both of its source facts there,
    so this is how an already-synthesized pair is recognised.
    """
    bridged = set()
    for fact in facts:
        related = list(fact.get('related') or [])
        for i, first in enumerate(related):
            for second in related[i + 1:]:
                bridged.add(frozenset((first, second)))
    return bridged


def _is_candidate(f1: dict, f2: dict, bridged: set) -> bool:
    """True if the facts are not directly related and not already bridged by a third fact."""
    return facts_are_unrelated(f1, f2) and frozenset((f1['id'], f2['id'])) not in bridged


def find_candidate_pair(facts: List[dict]) -> Optional[Tuple[dict, dict]]:
    """Pick two unrelated facts, preferring a pair from different domains.

    A pair qualifies when neither fact references the other and no existing
    fact already lists both of them as 'related'. Without the second
    condition the same pair would be selected again on every run, because a
    stored synthesis links to its sources but the sources are never linked
    to each other.

    Returns:
        ``(fact1, fact2)`` for the first qualifying pair (cross-domain pairs
        are tried first, in index order), or ``None`` if there is none.
    """
    bridged = _bridged_pairs(facts)

    # Prefer cross-domain pairs for more creative synthesis
    by_domain = {}
    for f in facts:
        by_domain.setdefault(f['domain'], []).append(f)

    # The relation is symmetric, so each unordered domain pair is checked once.
    domains = list(by_domain)
    for i, d1 in enumerate(domains):
        for d2 in domains[i + 1:]:
            for f1 in by_domain[d1]:
                for f2 in by_domain[d2]:
                    if _is_candidate(f1, f2, bridged):
                        return f1, f2

    # Single domain, or no cross-domain pair left: fall back to any pair.
    return find_candidate_pair_by_simple(facts)


def find_candidate_pair_by_simple(facts: List[dict]) -> Optional[Tuple[dict, dict]]:
    """Return the first qualifying pair in index order, ignoring domains.

    Uses the same qualification rule as ``find_candidate_pair``; returns
    ``None`` when no pair qualifies.
    """
    bridged = _bridged_pairs(facts)
    for i, f1 in enumerate(facts):
        for f2 in facts[i + 1:]:
            if _is_candidate(f1, f2, bridged):
                return f1, f2
    return None


def load_synthesis_prompt() -> str:
    """Return the synthesis system prompt.

    The text of ``TEMPLATE_PATH`` is used when that file exists; otherwise
    the built-in prompt below is returned.
    """
    if not TEMPLATE_PATH.exists():
        # No template on disk: use the inline fallback.
        return """You are a knowledge synthesis engine. Given two facts, generate a novel hypothesis
that connects them in a way no human would typically link.

TASK:
- Fact A: {fact_a}
- Fact B: {fact_b}

OUTPUT a single JSON object:
{
  "hypothesis": "one concise sentence linking the two facts in an actionable way",
  "plausibility": 0.0-1.0,
  "bridging_concepts": ["concept1", "concept2"],
  "suggested_tags": ["tag1", "tag2"]
}

RULES:
1. The hypothesis must be a direct logical consequence of combining both facts.
2. Do NOT restate either fact — produce a new insight.
3. Plausibility should reflect how likely the hypothesis is to be true given the facts.
4. If no meaningful connection exists, return {"hypothesis":"","plausibility":0.0}.
5. Output ONLY valid JSON, no markdown.
"""
    return TEMPLATE_PATH.read_text(encoding='utf-8')


def call_synthesis_llm(prompt: str, transcript: str, api_base: str, api_key: str, model: str) -> Optional[dict]:
    """Call a chat-completions endpoint to synthesize a hypothesis from two facts.

    Args:
        prompt: System prompt describing the synthesis task.
        transcript: User message containing the two facts.
        api_base: Base URL of the API; a trailing slash is tolerated.
        api_key: Bearer token sent in the Authorization header.
        model: Model identifier placed in the request payload.

    Returns:
        The dict produced by ``parse_synthesis_response``, or ``None`` when
        the request fails or the response cannot be parsed.
    """
    import urllib.request

    # Strip trailing slashes so a base like ".../v1/" does not produce
    # ".../v1//chat/completions".
    endpoint = f"{api_base.rstrip('/')}/chat/completions"

    messages = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": transcript}
    ]

    payload = json.dumps({
        "model": model,
        "messages": messages,
        "temperature": 0.7,  # More creative for synthesis
        "max_tokens": 512
    }).encode('utf-8')

    req = urllib.request.Request(
        endpoint,
        data=payload,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        },
        method="POST"
    )

    # Deliberately broad: this is a best-effort call and the caller falls
    # back to heuristic synthesis on None, so network errors, HTTP errors and
    # malformed response bodies are all reported and swallowed here.
    try:
        with urllib.request.urlopen(req, timeout=60) as resp:
            result = json.loads(resp.read().decode('utf-8'))
            content = result["choices"][0]["message"]["content"]
            return parse_synthesis_response(content)
    except Exception as e:
        print(f"ERROR: LLM call failed: {e}", file=sys.stderr)
        return None


def parse_synthesis_response(content: str) -> Optional[dict]:
    """Extract the synthesis JSON object from an LLM response.

    Three candidates are tried in order: the whole response, the contents of
    a Markdown code fence, and the widest ``{...}`` span that mentions
    ``"hypothesis"``. The first candidate that parses to a dict with a
    ``'hypothesis'`` key wins.

    Returns:
        The parsed dict, or ``None`` when *content* is not a string or no
        candidate yields a dict with a ``'hypothesis'`` key.
    """
    import re

    # The API may return null content; json.loads would raise TypeError on it.
    if not isinstance(content, str):
        return None

    candidates = [content]
    fenced = re.search(r'```(?:json)?\s*({.*?})\s*```', content, re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))
    # Try finding any JSON object
    loose = re.search(r'(\{.*"hypothesis".*\})', content, re.DOTALL)
    if loose:
        candidates.append(loose.group(1))

    for text in candidates:
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            continue
        # Every candidate gets the same validation, so callers can rely on
        # the 'hypothesis' key being present.
        if isinstance(data, dict) and 'hypothesis' in data:
            return data

    return None


def heuristic_synthesis(f1: dict, f2: dict) -> dict:
    """Rule-based fallback synthesis when no LLM is available.

    Builds a templated cross-domain hypothesis from the two facts' text and
    domains, with a fixed low plausibility of 0.4.

    Returns:
        A dict with the same keys an LLM result has: ``hypothesis``,
        ``plausibility``, ``bridging_concepts`` (first three merged tags) and
        ``suggested_tags`` (all merged tags).
    """
    # Merge tags, dropping duplicates but keeping first-seen order.
    # A plain set() would make the order, and therefore which three tags
    # become bridging concepts, vary between runs. Missing or null tags
    # count as none.
    merged = list(f1.get('tags') or []) + list(f2.get('tags') or [])
    tags = list(dict.fromkeys(merged))
    fact1 = f1['fact']
    fact2 = f2['fact']

    # Very basic heuristic: "By applying X from domain1 to domain2, we can Y"
    hypothesis = (
        f"Cross-domain insight: techniques from '{f1['domain']}' "
        f"might solve problems in '{f2['domain']}'. "
        f"Specifically: {fact1} could inform {fact2}"
    )

    return {
        "hypothesis": hypothesis,
        "plausibility": 0.4,  # Low confidence for heuristic
        "bridging_concepts": tags[:3],
        "suggested_tags": tags
    }


def synthesize_fact(fact1: dict, fact2: dict, api_base: str, api_key: str, model: str,
                    dry_run: bool = False) -> Optional[dict]:
    """Produce a synthesis result for two facts.

    In dry-run mode the two facts are printed (first 80 characters each) and
    ``None`` is returned. Otherwise the LLM is called when an API key is
    present; if there is no key or the call yields nothing, the heuristic
    fallback is used and a warning is written to stderr.
    """
    system_prompt = load_synthesis_prompt()
    user_message = f"FACT A:\n  {fact1['fact']}\n(domain={fact1['domain']}, category={fact1['category']}, tags={fact1.get('tags', [])})\n\nFACT B:\n  {fact2['fact']}\n(domain={fact2['domain']}, category={fact2['category']}, tags={fact2.get('tags', [])})"

    if dry_run:
        print(f"\n[DRY RUN] Would synthesize:")
        print(f"  Fact A: {fact1['fact'][:80]}")
        print(f"  Fact B: {fact2['fact'][:80]}")
        return None

    # Only contact the API when a key is available.
    outcome = (
        call_synthesis_llm(system_prompt, user_message, api_base, api_key, model)
        if api_key
        else None
    )
    if outcome is not None:
        return outcome

    print("WARNING: LLM synthesis failed or no API key; using heuristic fallback", file=sys.stderr)
    return heuristic_synthesis(fact1, fact2)


def fingerprint(text: str) -> str:
    """Return the MD5 hex digest of *text*, lower-cased and stripped of outer whitespace."""
    normalised = text.lower().strip()
    digest = hashlib.md5(normalised.encode('utf-8'))
    return digest.hexdigest()


def is_duplicate(hypothesis: str, existing_facts: List[dict]) -> bool:
    """Return True if some existing fact's text has the same fingerprint as *hypothesis*.

    Facts without a ``'fact'`` key are compared as the empty string.
    """
    target = fingerprint(hypothesis)
    return any(
        fingerprint(entry.get('fact', '')) == target
        for entry in existing_facts
    )


def _coerce_plausibility(value) -> float:
    """Turn an LLM-supplied plausibility into a float; unusable values become 0.0."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return 0.0
    # NaN compares False against everything and would slip past the
    # "below threshold" check, so treat it as no confidence at all.
    return score if score == score else 0.0


def store_synthesis(synth: dict, source_ids: List[str], index: dict, threshold: float = 0.5) -> bool:
    """Store a synthesized fact if its plausibility reaches the threshold.

    On success the new fact is appended to ``index['facts']``, the index is
    written with ``save_index`` and an entry is appended to
    ``knowledge/global/patterns.yaml`` (created with a header if missing).

    Args:
        synth: Synthesis result; reads ``hypothesis``, ``plausibility`` and
            ``suggested_tags``. The values may come straight from an LLM, so
            a plausibility given as a numeric string is accepted and a
            non-numeric or NaN one is treated as 0.0.
        source_ids: IDs of the facts the hypothesis was derived from; stored
            as the new fact's ``related`` list.
        index: The loaded knowledge index; mutated in place.
        threshold: Minimum plausibility required for storage.

    Returns:
        True if the fact was stored. False if the plausibility is below the
        threshold, the hypothesis is missing or empty, or it duplicates an
        existing fact.
    """
    plaus = _coerce_plausibility(synth.get('plausibility', 0.0))
    if plaus < threshold:
        print(f"Skipped: plausibility {plaus:.2f} below threshold {threshold}")
        return False

    # A missing or null hypothesis is handled like an empty one.
    hypothesis = str(synth.get('hypothesis') or '').strip()
    if not hypothesis or is_duplicate(hypothesis, index['facts']):
        print("Skipped: duplicate or empty hypothesis")
        return False

    tags = synth.get('suggested_tags') or []
    # A bare string would otherwise be split into single characters.
    tags = [tags] if isinstance(tags, str) else list(tags)

    # One timestamp, so all date fields agree even across midnight.
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    # Build new fact
    new_fact = {
        "fact": hypothesis,
        "category": "pattern",  # Synthesized connections become reusable patterns
        "domain": "global",     # Cross-domain synthesis is globally applicable
        "confidence": round(plaus, 2),
        "tags": tags,
        "related": list(source_ids),
        "first_seen": today,
        "last_confirmed": today,
        "source_count": 1,
    }

    # Generate ID
    new_fact['id'] = generate_id("global", "pattern", index['facts'])

    # Update index
    index['facts'].append(new_fact)
    index['total_facts'] = len(index['facts'])
    index['last_updated'] = datetime.now(timezone.utc).isoformat()

    # Write index
    save_index(index)

    # Append to YAML
    yaml_path = KNOWLEDGE_DIR / "global" / "patterns.yaml"
    yaml_path.parent.mkdir(parents=True, exist_ok=True)
    mode = 'a' if yaml_path.exists() else 'w'
    with open(yaml_path, mode, encoding='utf-8') as f:
        if mode == 'w':
            f.write(f"---\ndomain: global\ncategory: pattern\nversion: 1\nlast_updated: \"{today}\"\n---\n\n# Synthesized Patterns\n\n")
        f.write(f"\n- id: {new_fact['id']}\n")
        # json.dumps yields a double-quoted scalar with quotes, backslashes
        # and newlines escaped, so arbitrary hypothesis text cannot break
        # the YAML structure.
        f.write(f"  fact: {json.dumps(hypothesis, ensure_ascii=False)}\n")
        # Same rounded value as stored in the index.
        f.write(f"  confidence: {new_fact['confidence']}\n")
        if new_fact['tags']:
            f.write(f"  tags: {json.dumps(new_fact['tags'])}\n")
        f.write(f"  related: {json.dumps(new_fact['related'])}\n")
        f.write(f"  first_seen: \"{new_fact['first_seen']}\"\n")
        f.write(f"  last_confirmed: \"{new_fact['last_confirmed']}\"\n")

    print(f"✓ Stored synthesis as {new_fact['id']}: {hypothesis[:80]}")
    return True


def main():
    """Command-line entry point: select a fact pair, synthesize, and store the result.

    Exits with status 1 on usage errors, when fewer than two facts exist,
    when a requested fact ID is unknown, or when no candidate pair is found.
    A dry run exits with status 0 after printing the selected pair.
    """
    parser = argparse.ArgumentParser(description="Zero-shot knowledge synthesis")
    parser.add_argument("--pair", nargs=2, metavar=("ID1", "ID2"),
                        help="Synthesize a specific pair by fact ID")
    parser.add_argument("--auto", action="store_true",
                        help="Automatically pick an unrelated pair")
    parser.add_argument("--threshold", type=float, default=0.6,
                        help="Plausibility threshold for storage (default: 0.6)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show candidate pair without synthesizing or storing")
    parser.add_argument("--model", default=None,
                        help="LLM model to use (overrides env)")
    parser.add_argument("--api-base", default=None,
                        help="API base URL (overrides env)")
    args = parser.parse_args()

    # Resolve API credentials.
    # The environment variable wins; key files are only consulted when it is
    # unset, matching the "if not in env" note on API_KEY_PATHS.
    api_base = args.api_base or DEFAULT_API_BASE
    api_key = DEFAULT_API_KEY or find_api_key()
    model = args.model or DEFAULT_MODEL

    # --dry-run on its own is allowed and behaves like --auto.
    if not args.dry_run and not args.pair and not args.auto:
        print("ERROR: Must specify either --pair ID1 ID2 or --auto", file=sys.stderr)
        parser.print_help()
        sys.exit(1)

    # Load index
    index = load_index()
    facts = index['facts']

    if len(facts) < 2:
        print("ERROR: Need at least 2 facts in knowledge store to synthesize", file=sys.stderr)
        sys.exit(1)

    # Select facts
    f1, f2 = None, None
    if args.pair:
        id1, id2 = args.pair
        if id1 == id2:
            # Synthesizing a fact with itself cannot produce a connection.
            print(f"ERROR: --pair needs two different fact IDs (got {id1} twice)", file=sys.stderr)
            sys.exit(1)
        f1 = next((f for f in facts if f['id'] == id1), None)
        f2 = next((f for f in facts if f['id'] == id2), None)
        if not f1 or not f2:
            print(f"ERROR: Could not find facts with IDs {id1}, {id2}", file=sys.stderr)
            sys.exit(1)
        if not facts_are_unrelated(f1, f2):
            print(f"WARNING: Facts {id1} and {id2} are already related (may still synthesize)")
    else:
        # auto mode
        pair = find_candidate_pair(facts)
        if pair is None:
            # The plausibility threshold plays no part in pair selection, so
            # the message no longer suggests lowering it.
            print("ERROR: No unrelated fact pairs found — consider adding more facts", file=sys.stderr)
            sys.exit(1)
        f1, f2 = pair
        print(f"Selected pair:\n  {f1['id']}: {f1['fact'][:60]}\n  {f2['id']}: {f2['fact'][:60]}")

    # Synthesize
    synth = synthesize_fact(f1, f2, api_base, api_key, model, dry_run=args.dry_run)
    if synth is None:
        sys.exit(0)  # dry-run path

    # The result may come straight from an LLM, so neither the presence of
    # 'hypothesis' nor a numeric 'plausibility' is assumed when printing.
    plaus = synth.get('plausibility', 0.0)
    is_number = isinstance(plaus, (int, float)) and not isinstance(plaus, bool)
    plaus_text = f"{plaus:.2f}" if is_number else repr(plaus)
    print(f"\nHypothesis: {synth.get('hypothesis', '')}")
    print(f"Plausibility: {plaus_text}")
    print(f"Bridging concepts: {synth.get('bridging_concepts', [])}")

    # Store if acceptable
    store_synthesis(synth, [f1['id'], f2['id']], index, threshold=args.threshold)


if __name__ == '__main__':
    main()