compounding-intelligence/scripts/transcript_harvester.py

#!/usr/bin/env python3
"""
transcript_harvester.py — Rule-based knowledge extraction from Hermes session transcripts.

Extracts 5 knowledge categories without LLM inference:
  • qa_pair — user question + assistant answer
  • decision — explicit choice ("we decided to X", "I'll use Y")
  • pattern — solution/recipe ("the fix for Z is to do W")
  • preference — personal or team inclination ("I always", "I prefer")
  • error_fix — an error message linked to the later message that fixes it

Usage:
  python3 transcript_harvester.py --session ~/.hermes/sessions/session_xxx.jsonl
  python3 transcript_harvester.py --batch --sessions-dir ~/.hermes/sessions --limit 50
  python3 transcript_harvester.py --session session.jsonl --output knowledge/transcripts/
"""

import argparse
import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Import session_reader from the same scripts directory
SCRIPT_DIR = Path(__file__).parent.absolute()
sys.path.insert(0, str(SCRIPT_DIR))
from session_reader import read_session


# --- Pattern matchers --------------------------------------------------------

# Regexes that flag a message as stating a decision. is_decision() tries each
# one with re.search(..., re.IGNORECASE), so a match anywhere in the message
# text is enough.
DECISION_PATTERNS = [
    r"\b(we\s+(?:decided|chose|agreed|will|are going)\s+to\s+.*)",
    r"\b(I\s+will\s+use|I\s+choose|I\s+am going\s+to)\s+.*",
    r"\b(let's\s+(?:use|go\s+with|do|try))\s+.*",
    r"\b(the\s+(?:decision|choice)\s+is)\s+.*",
    r"\b(I'll\s+implement|I'll\s+deploy|I'll\s+create)\s+.*",
]

# Regexes that flag a message as describing a solution, recipe or procedure.
# Used by is_pattern() with re.search(..., re.IGNORECASE).
# NOTE(review): several entries are very permissive (e.g. "used ... to ...",
# "approach ..."), so expect false positives on ordinary prose.
PATTERN_PATTERNS = [
    r"\b(the\s+fix\s+for\s+.*\s+is\s+to\s+.*)",
    r"\b(solution:?\s+.*)",
    r"\b(approach:?\s+.*)",
    r"\b(procedure:?\s+.*)",
    r"\b(to\s+resolve\s+this.*?,\s+.*)",
    r"\b(used\s+.*\s+to\s+.*)",  # "used X to do Y"
    r"\b(by\s+doing\s+.*\s+we\s+.*)",
    r"\b(Here's\s+the\s+.*\s+process:?)",  # "Here's the deployment process:"
    r"\b(The\s+steps\s+are:?)",
    r"\b(steps\s+to\s+.*:?)",
    r"\b(Implementation\s+plan:?)",
    r"\b(\d+\.\s+.*\n\d+\.)",  # numbered multi-step (at least two steps detected by newlines)
]

# Regexes that flag a message as stating a personal or team preference.
# Used by is_preference() with re.search(..., re.IGNORECASE).
PREFERENCE_PATTERNS = [
    r"\b(I\s+(?:always|never|prefer|usually|typically|generally)\s+.*)",
    r"\b(I\s+like\s+.*)",
    r"\b(My\s+preference\s+is\s+.*)",
    r"\b(Alexander\s+(?:prefers|always|never).*)",
    r"\b(We\s+always\s+.*)",
]

# Keywords that flag a message as reporting an error or failure.
# Used by is_error() with re.search(..., re.IGNORECASE).
ERROR_PATTERNS = [
    r"\b(error|failed|fatal|exception|denied|could\s+not|couldn't)\b.*",
]

# Signals that a later message describes the fix for an earlier error.
# Used by is_fix_indicator() with re.search(..., re.IGNORECASE).
# NOTE(review): this comment previously said "within 2 messages"; the actual
# look-ahead distance is set inside extract_error_fix — confirm which is
# intended.
FIX_INDICATORS = [
    r"\b(fixed|resolved|added|generated|created|corrected|worked)\b",
    r"\b(the\s+key\s+is|solution\s+was|generate\s+a\s+new)\b",
]


def is_decision(text: str) -> bool:
    """Return True when any DECISION_PATTERNS regex matches ``text`` (case-insensitive)."""
    return any(
        re.search(regex, text, re.IGNORECASE) is not None
        for regex in DECISION_PATTERNS
    )

def is_pattern(text: str) -> bool:
    """Return True when any PATTERN_PATTERNS regex matches ``text`` (case-insensitive)."""
    return any(
        re.search(regex, text, re.IGNORECASE) is not None
        for regex in PATTERN_PATTERNS
    )

def is_preference(text: str) -> bool:
    """Return True when any PREFERENCE_PATTERNS regex matches ``text`` (case-insensitive)."""
    return any(
        re.search(regex, text, re.IGNORECASE) is not None
        for regex in PREFERENCE_PATTERNS
    )

def is_error(text: str) -> bool:
    """Return True when any ERROR_PATTERNS regex matches ``text`` (case-insensitive)."""
    return any(
        re.search(regex, text, re.IGNORECASE) is not None
        for regex in ERROR_PATTERNS
    )

def is_fix_indicator(text: str) -> bool:
    """Return True when any FIX_INDICATORS regex matches ``text`` (case-insensitive)."""
    return any(
        re.search(regex, text, re.IGNORECASE) is not None
        for regex in FIX_INDICATORS
    )


# --- Extractors --------------------------------------------------------------

def extract_qa_pair(messages: list[dict], idx: int) -> Optional[dict]:
    """Extract a question→answer pair starting at ``messages[idx]``.

    A pair is produced only when ``messages[idx]`` has role ``user``, the
    following message has role ``assistant``, the user text looks like a
    question (ends with ``?`` or starts with an interrogative/auxiliary word)
    and the answer contains at least three whitespace-separated words.

    Args:
        messages: Session messages as dicts; the keys read here are ``role``,
            ``content`` and ``timestamp``.
        idx: Index of the candidate question message.

    Returns:
        A dict with ``type`` (``"qa_pair"``), ``question``, ``answer`` and
        ``timestamp`` (taken from the question message), or ``None`` when the
        two messages do not form a usable pair.
    """
    if idx + 1 >= len(messages):
        return None
    asked, replied = messages[idx], messages[idx + 1]
    if asked.get('role') != 'user' or replied.get('role') != 'assistant':
        return None

    # ``dict.get(key, '')`` only falls back when the key is missing; a message
    # whose content is present but None (or otherwise not a string) would
    # crash on ``.strip()``, so such content is treated as empty.
    raw_question = asked.get('content')
    raw_answer = replied.get('content')
    question = raw_question.strip() if isinstance(raw_question, str) else ''
    answer = raw_answer.strip() if isinstance(raw_answer, str) else ''
    if not question or not answer:
        return None

    # Must be a real question. The trailing \b keeps words that merely start
    # with a question word ("Document ...", "Issue ...", "Candidate ...") from
    # qualifying; "does", "whom" and "whose" are listed explicitly because
    # they were previously accepted only through the prefix match.
    starts_like_question = re.match(
        r'^(how|what|why|when|where|who|whom|whose|which|can|do|does|is|are)\b',
        question,
        re.IGNORECASE,
    )
    if not (question.endswith('?') or starts_like_question):
        return None

    # Skip very short answers ("OK", "Yes").
    if len(answer.split()) < 3:
        return None

    return {
        "type": "qa_pair",
        "question": question,
        "answer": answer,
        "timestamp": asked.get('timestamp', ''),
    }


def extract_decision(messages: list[dict], idx: int) -> Optional[dict]:
    """Extract a decision statement from the message at ``idx``.

    Args:
        messages: Session messages as dicts; the keys read here are ``role``,
            ``content`` and ``timestamp``.
        idx: Index of the message to inspect (any role).

    Returns:
        A dict with ``type`` (``"decision"``), ``decision`` (the full stripped
        message text), ``by`` (the message role, ``"unknown"`` if absent) and
        ``timestamp``; ``None`` when the text is empty, is not a string, or
        does not satisfy ``is_decision``.
    """
    msg = messages[idx]
    # ``dict.get(key, '')`` only falls back when the key is missing; content
    # that is present but None / not a string would crash on ``.strip()``.
    content = msg.get('content')
    text = content.strip() if isinstance(content, str) else ''
    if not text or not is_decision(text):
        return None
    return {
        "type": "decision",
        "decision": text,
        "by": msg.get('role', 'unknown'),
        "timestamp": msg.get('timestamp', ''),
    }


def extract_pattern(messages: list[dict], idx: int) -> Optional[dict]:
    """Extract a pattern or solution description from the message at ``idx``.

    Args:
        messages: Session messages as dicts; the keys read here are ``role``,
            ``content`` and ``timestamp``.
        idx: Index of the message to inspect (any role).

    Returns:
        A dict with ``type`` (``"pattern"``), ``pattern`` (the full stripped
        message text), ``by`` (the message role, ``"unknown"`` if absent) and
        ``timestamp``; ``None`` when the text is empty, is not a string, or
        does not satisfy ``is_pattern``.
    """
    msg = messages[idx]
    # ``dict.get(key, '')`` only falls back when the key is missing; content
    # that is present but None / not a string would crash on ``.strip()``.
    content = msg.get('content')
    text = content.strip() if isinstance(content, str) else ''
    if not text or not is_pattern(text):
        return None
    return {
        "type": "pattern",
        "pattern": text,
        "by": msg.get('role', 'unknown'),
        "timestamp": msg.get('timestamp', ''),
    }


def extract_preference(messages: list[dict], idx: int) -> Optional[dict]:
    """Extract a stated preference from the message at ``idx``.

    Args:
        messages: Session messages as dicts; the keys read here are ``role``,
            ``content`` and ``timestamp``.
        idx: Index of the message to inspect (any role).

    Returns:
        A dict with ``type`` (``"preference"``), ``preference`` (the full
        stripped message text), ``by`` (the message role, ``"unknown"`` if
        absent) and ``timestamp``; ``None`` when the text is empty, is not a
        string, or does not satisfy ``is_preference``.
    """
    msg = messages[idx]
    # ``dict.get(key, '')`` only falls back when the key is missing; content
    # that is present but None / not a string would crash on ``.strip()``.
    content = msg.get('content')
    text = content.strip() if isinstance(content, str) else ''
    if not text or not is_preference(text):
        return None
    return {
        "type": "preference",
        "preference": text,
        "by": msg.get('role', 'unknown'),
        "timestamp": msg.get('timestamp', ''),
    }


def extract_error_fix(messages: list[dict], idx: int) -> Optional[dict]:
    """Link an error message at ``idx`` to the first later message that fixes it.

    The message at ``idx`` must satisfy ``is_error``. The messages after it
    (up to index ``idx + 7``) are then scanned in order, and the first one
    that either

    1. satisfies ``is_fix_indicator`` ("fixed", "resolved", ...), or
    2. opens with a fix decision ("I'll <action>", "I will <action>",
       "Let's <action>", "We will/should/need to <action>")

    is taken as the fix.

    Args:
        messages: Session messages as dicts; the keys read here are
            ``content`` and ``timestamp``.
        idx: Index of the candidate error message.

    Returns:
        A dict with ``type`` (``"error_fix"``), ``error``, ``fix``,
        ``error_timestamp`` and ``fix_timestamp``; ``None`` when the message
        is not an error or no fix is found inside the window.
    """
    msg = messages[idx]
    # ``dict.get(key, '')`` only falls back when the key is missing; content
    # that is present but None / not a string would crash in ``re.search`` or
    # on ``.strip()``, so it is treated as empty.
    content = msg.get('content')
    error_text = content.strip() if isinstance(content, str) else ''
    if not error_text or not is_error(error_text):
        return None

    window = min(idx + 8, len(messages))
    for j in range(idx + 1, window):
        follow_up = messages[j]
        follow_content = follow_up.get('content')
        if not isinstance(follow_content, str):
            continue  # nothing to match against; keep scanning the window
        follow_text = follow_content.strip()

        # Both signals yield the same entry, so they are checked together:
        # an explicit fix indicator, or a message that opens with a fix decision.
        opens_with_fix_decision = re.match(
            r"^(I'll|I will|Let's|We (will|should|need to))\s+\w+",
            follow_text,
            re.IGNORECASE,
        )
        if is_fix_indicator(follow_text) or opens_with_fix_decision:
            return {
                "type": "error_fix",
                "error": error_text,
                "fix": follow_text,
                "error_timestamp": msg.get('timestamp', ''),
                "fix_timestamp": follow_up.get('timestamp', ''),
            }
    return None
def harvest_session(messages: list[dict], session_id: str) -> dict:
    """Run every extractor over each message of one session and collect the hits.

    For each message index the extractors run in a fixed order (QA pair,
    decision, pattern, preference, error/fix); every entry they return is
    tagged with ``session_id`` and appended to the result in that order.

    Returns:
        A dict with ``session_id``, ``message_count``, ``entries`` and
        ``counts`` (number of entries per category).
    """
    total = len(messages)
    collected = []

    for position in range(total):
        # Order matters: it fixes the order of entries for a given message.
        for extractor in (
            extract_qa_pair,
            extract_decision,
            extract_pattern,
            extract_preference,
            extract_error_fix,  # may span several messages after ``position``
        ):
            found = extractor(messages, position)
            if found:
                found['session_id'] = session_id
                collected.append(found)

    # Tally per category in one pass; every category is reported even at zero.
    tally = {
        "qa_pair": 0,
        "decision": 0,
        "pattern": 0,
        "preference": 0,
        "error_fix": 0,
    }
    for item in collected:
        kind = item['type']
        if kind in tally:
            tally[kind] += 1

    return {
        "session_id": session_id,
        "message_count": total,
        "entries": collected,
        "counts": tally,
    }


def write_json_output(results: list[dict], output_path: Path) -> dict:
    """Aggregate per-session results and write them to ``output_path`` as JSON.

    Args:
        results: Per-session result dicts; only each item's ``entries`` list
            is read here.
        output_path: Destination file; it is overwritten if it exists.

    Returns:
        The dict that was serialized: ``harvester``, ``generated_at`` (UTC,
        ISO 8601), ``summary`` (``{"sessions": <count>}``), ``total_entries``
        and the flattened ``entries`` list.
    """
    all_entries = [entry for r in results for entry in r['entries']]
    summary = {"sessions": len(results)}

    output = {
        "harvester": "transcript_harvester",
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "summary": summary,
        "total_entries": len(all_entries),
        "entries": all_entries,
    }
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file must
    # be written as UTF-8 explicitly; the platform default encoding may be
    # unable to represent them (or would produce a non-UTF-8 JSON file).
    output_path.write_text(
        json.dumps(output, indent=2, ensure_ascii=False),
        encoding="utf-8",
    )
    return output


def write_report(results: list[dict], report_path: Path) -> None:
    """Write a human-readable markdown report to ``report_path``.

    The report holds a header (generation time in UTC, number of sessions),
    the total entry count per category, and up to three sample entries from
    each session.

    Args:
        results: Per-session result dicts; the keys read here are ``counts``,
            ``entries`` and ``session_id``.
        report_path: Destination file; it is overwritten if it exists.
    """
    lines = [
        "# Transcript Harvester Report",
        f"Generated: {datetime.now(timezone.utc).isoformat()}",
        f"Sessions processed: {len(results)}",
    ]

    # Seed the known categories so they are always listed (even at zero) and
    # in a stable order; categories not listed here are still accumulated
    # rather than raising KeyError.
    totals = {cat: 0 for cat in ['qa_pair', 'decision', 'pattern', 'preference', 'error_fix']}
    for r in results:
        for cat, cnt in r['counts'].items():
            totals[cat] = totals.get(cat, 0) + cnt

    lines.append("\n## Extracted Knowledge by Category\n")
    for cat, cnt in totals.items():
        lines.append(f"- **{cat}**: {cnt}")

    lines.append("\n## Sample Entries\n")
    for r in results:
        # Only the first three entries of each session are shown as samples.
        for entry in r['entries'][:3]:
            lines.append(f"\n### {entry['type'].upper()} ({r['session_id']})\n")
            if entry['type'] == 'qa_pair':
                lines.append(f"**Q:** {entry['question']}\n")
                lines.append(f"**A:** {entry['answer']}\n")
            elif entry['type'] == 'decision':
                lines.append(f"**Decision:** {entry['decision']}\n")
                lines.append(f"By: {entry['by']}\n")
            elif entry['type'] == 'pattern':
                lines.append(f"**Pattern:** {entry['pattern']}\n")
            elif entry['type'] == 'preference':
                lines.append(f"**Preference:** {entry['preference']}\n")
            elif entry['type'] == 'error_fix':
                lines.append(f"**Error:** {entry['error']}\n")
                lines.append(f"**Fixed by:** {entry['fix']}\n")

    # Transcript text can contain any Unicode; write UTF-8 explicitly instead
    # of depending on the platform default encoding.
    report_path.write_text("\n".join(lines), encoding="utf-8")


def find_recent_sessions(sessions_dir: Path, limit: int = 50) -> list[Path]:
    """Return up to ``limit`` ``*.jsonl`` files from ``sessions_dir``.

    Files are ordered by path in descending sort order, so "most recent"
    holds only if file names sort chronologically. A ``limit`` of zero or
    less returns every match.
    """
    candidates = list(sessions_dir.glob("*.jsonl"))
    candidates.sort(reverse=True)
    if limit <= 0:
        return candidates
    return candidates[:limit]


def main():
    """Command-line entry point: harvest one session or a batch and write outputs.

    With ``--session`` a single file is processed; otherwise ``--batch``
    processes up to ``--limit`` files from ``--sessions-dir``. When both are
    given, ``--session`` takes precedence. With neither, the help text is
    printed and the process exits with status 1.

    Results are written to ``transcript_knowledge.json`` and
    ``transcript_report.md`` inside the ``--output`` directory.
    """
    parser = argparse.ArgumentParser(description="Harvest knowledge from session transcripts")
    parser.add_argument('--session', help='Single session JSONL file')
    parser.add_argument('--batch', action='store_true', help='Batch mode')
    parser.add_argument('--sessions-dir', default=str(Path.home() / '.hermes' / 'sessions'),
                        help='Directory of session files')
    parser.add_argument('--output', default='knowledge/transcripts',
                        help='Output directory (default: knowledge/transcripts)')
    parser.add_argument('--limit', type=int, default=50,
                        help='Max sessions to process in batch (default: 50)')

    args = parser.parse_args()

    # Validate the mode before touching the filesystem, so an invocation
    # without --session/--batch does not leave an empty output directory.
    if not args.session and not args.batch:
        parser.print_help()
        sys.exit(1)

    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    results = []

    if args.session:
        messages = read_session(args.session)
        session_id = Path(args.session).stem
        results.append(harvest_session(messages, session_id))
    else:
        sessions_dir = Path(args.sessions_dir)
        sessions = find_recent_sessions(sessions_dir, args.limit)
        print(f"Processing {len(sessions)} sessions...")
        for sf in sessions:
            messages = read_session(str(sf))
            results.append(harvest_session(messages, sf.stem))

    # Write outputs
    json_path = output_dir / "transcript_knowledge.json"
    report_path = output_dir / "transcript_report.md"

    output = write_json_output(results, json_path)
    write_report(results, report_path)

    print(f"\nDone: {output['total_entries']} entries from {len(results)} sessions")
    print(f"Output: {json_path}")
    print(f"Report: {report_path}")

    # Print category totals
    totals = {}
    for r in results:
        for cat, cnt in r['counts'].items():
            totals[cat] = totals.get(cat, 0) + cnt
    print("\nCategory counts:")
    for cat, cnt in sorted(totals.items()):
        print(f"  {cat}: {cnt}")


if __name__ == '__main__':
    main()