#!/usr/bin/env python3
"""
transcript_harvester.py — Rule-based knowledge extraction from Hermes session transcripts.

Extracts 5 knowledge categories without LLM inference:
  • qa_pair    — user question + assistant answer
  • decision   — explicit choice ("we decided to X", "I'll use Y")
  • pattern    — solution/recipe ("the fix for Z is to do W")
  • preference — personal or team inclination ("I always", "I prefer")
  • error_fix  — an error message linked to the follow-up message that appears to fix it

Usage:
  python3 transcript_harvester.py --session ~/.hermes/sessions/session_xxx.jsonl
  python3 transcript_harvester.py --batch --sessions-dir ~/.hermes/sessions --limit 50
  python3 transcript_harvester.py --session session.jsonl --output knowledge/transcripts/
"""

import argparse
import json
import re
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Import session_reader from the same scripts directory
SCRIPT_DIR = Path(__file__).parent.absolute()
sys.path.insert(0, str(SCRIPT_DIR))

from session_reader import read_session

# --- Pattern matchers --------------------------------------------------------

DECISION_PATTERNS = [
    r"\b(we\s+(?:decided|chose|agreed|will|are going)\s+to\s+.*)",
    r"\b(I\s+will\s+use|I\s+choose|I\s+am going\s+to)\s+.*",
    r"\b(let's\s+(?:use|go\s+with|do|try))\s+.*",
    r"\b(the\s+(?:decision|choice)\s+is)\s+.*",
    r"\b(I'll\s+implement|I'll\s+deploy|I'll\s+create)\s+.*",
]

PATTERN_PATTERNS = [
    r"\b(the\s+fix\s+for\s+.*\s+is\s+to\s+.*)",
    r"\b(solution:?\s+.*)",
    r"\b(approach:?\s+.*)",
    r"\b(procedure:?\s+.*)",
    r"\b(to\s+resolve\s+this.*?,\s+.*)",
    r"\b(used\s+.*\s+to\s+.*)",  # "used X to do Y"
    r"\b(by\s+doing\s+.*\s+we\s+.*)",
    r"\b(Here's\s+the\s+.*\s+process:?)",  # "Here's the deployment process:"
    r"\b(The\s+steps\s+are:?)",
    r"\b(steps\s+to\s+.*:?)",
    r"\b(Implementation\s+plan:?)",
    r"\b(\d+\.\s+.*\n\d+\.)",  # numbered multi-step (at least two steps detected by newlines)
]

PREFERENCE_PATTERNS = [
    r"\b(I\s+(?:always|never|prefer|usually|typically|generally)\s+.*)",
    r"\b(I\s+like\s+.*)",
    r"\b(My\s+preference\s+is\s+.*)",
    r"\b(Alexander\s+(?:prefers|always|never).*)",
    r"\b(We\s+always\s+.*)",
]

ERROR_PATTERNS = [
    r"\b(error|failed|fatal|exception|denied|could\s+not|couldn't)\b.*",
]

# Phrases that mark a follow-up message as the fix for an earlier error.
# How far ahead the follow-up may be is set by extract_error_fix's `lookahead`.
FIX_INDICATORS = [
    r"\b(fixed|resolved|added|generated|created|corrected|worked)\b",
    r"\b(the\s+key\s+is|solution\s+was|generate\s+a\s+new)\b",
]

# Opening word of a question. The trailing \b keeps words such as "Isolate",
# "Document" or "Canary" from being mistaken for "is", "do" or "can".
_QUESTION_OPENER = re.compile(
    r"^(?:how|what|why|when|where|who|whom|whose|which|can|do|does|is|are)\b",
    re.IGNORECASE,
)

# A follow-up that opens by announcing an action ("I'll ...", "Let's ...",
# "We need to ...") is treated as the fix for a preceding error.
_FIX_DECISION = re.compile(
    r"^(I'll|I will|Let's|We (will|should|need to))\s+\w+",
    re.IGNORECASE,
)


def _matches_any(patterns: list[str], text: str) -> bool:
    """Return True when any regex in *patterns* matches *text* (case-insensitive)."""
    return any(re.search(p, text, re.IGNORECASE) for p in patterns)


def _message_text(msg: dict) -> str:
    """Return the stripped string content of *msg*.

    Yields "" when the 'content' key is missing, None, or not a string, so
    callers never call string methods on a non-string value.
    """
    content = msg.get('content')
    return content.strip() if isinstance(content, str) else ''


def is_decision(text: str) -> bool:
    """Return True when *text* matches any of DECISION_PATTERNS."""
    return _matches_any(DECISION_PATTERNS, text)


def is_pattern(text: str) -> bool:
    """Return True when *text* matches any of PATTERN_PATTERNS."""
    return _matches_any(PATTERN_PATTERNS, text)


def is_preference(text: str) -> bool:
    """Return True when *text* matches any of PREFERENCE_PATTERNS."""
    return _matches_any(PREFERENCE_PATTERNS, text)


def is_error(text: str) -> bool:
    """Return True when *text* matches any of ERROR_PATTERNS."""
    return _matches_any(ERROR_PATTERNS, text)


def is_fix_indicator(text: str) -> bool:
    """Return True when *text* matches any of FIX_INDICATORS."""
    return _matches_any(FIX_INDICATORS, text)


# --- Extractors --------------------------------------------------------------

def extract_qa_pair(messages: list[dict], idx: int) -> Optional[dict]:
    """Extract a question→answer pair: user question followed by assistant answer.

    Returns None unless messages[idx] is a 'user' message immediately followed
    by an 'assistant' message, the question ends with '?' or opens with a
    question word, and the answer has at least three whitespace-separated words.
    """
    if idx + 1 >= len(messages):
        return None

    curr = messages[idx]
    nxt = messages[idx + 1]
    if curr.get('role') != 'user' or nxt.get('role') != 'assistant':
        return None

    question = _message_text(curr)
    answer = _message_text(nxt)
    if not question or not answer:
        return None

    # Must be a real question (ends with ? or starts with a question word)
    if not (question.endswith('?') or _QUESTION_OPENER.match(question)):
        return None

    # Skip very short answers ("OK", "Yes")
    if len(answer.split()) < 3:
        return None

    return {
        "type": "qa_pair",
        "question": question,
        "answer": answer,
        "timestamp": curr.get('timestamp', ''),
    }


def extract_decision(messages: list[dict], idx: int) -> Optional[dict]:
    """Extract a decision statement from the message at *idx* (any role)."""
    msg = messages[idx]
    text = _message_text(msg)
    if not is_decision(text):
        return None
    return {
        "type": "decision",
        "decision": text,
        "by": msg.get('role', 'unknown'),
        "timestamp": msg.get('timestamp', ''),
    }


def extract_pattern(messages: list[dict], idx: int) -> Optional[dict]:
    """Extract a pattern or solution description from the message at *idx*."""
    msg = messages[idx]
    text = _message_text(msg)
    if not is_pattern(text):
        return None
    return {
        "type": "pattern",
        "pattern": text,
        "by": msg.get('role', 'unknown'),
        "timestamp": msg.get('timestamp', ''),
    }


def extract_preference(messages: list[dict], idx: int) -> Optional[dict]:
    """Extract a stated preference from the message at *idx*."""
    msg = messages[idx]
    text = _message_text(msg)
    if not is_preference(text):
        return None
    return {
        "type": "preference",
        "preference": text,
        "by": msg.get('role', 'unknown'),
        "timestamp": msg.get('timestamp', ''),
    }


def extract_error_fix(messages: list[dict], idx: int, lookahead: int = 7) -> Optional[dict]:
    """
    Link an error to its fix.

    The message at *idx* must match ERROR_PATTERNS. The next *lookahead*
    messages are then scanned in order and the first one that looks like a
    fix is paired with the error. Two shapes count as a fix:
      1. An explicit fix indicator ("fixed", "resolved", ...)
      2. A message opening with an action statement ("I'll generate", "Let's add")

    Returns None when the message is not an error or no fix is found in range.
    """
    msg = messages[idx]
    error_text = _message_text(msg)
    if not is_error(error_text):
        return None

    end = min(idx + 1 + lookahead, len(messages))
    for follow_up in messages[idx + 1:end]:
        follow_text = _message_text(follow_up)
        if is_fix_indicator(follow_text) or _FIX_DECISION.match(follow_text):
            return {
                "type": "error_fix",
                "error": error_text,
                "fix": follow_text,
                "error_timestamp": msg.get('timestamp', ''),
                "fix_timestamp": follow_up.get('timestamp', ''),
            }
    return None


def harvest_session(messages: list[dict], session_id: str) -> dict:
    """Extract knowledge entries from a session transcript.

    Every extractor is run against every message index, so one message may
    yield entries of several types. Each entry is tagged with *session_id*.

    Returns a dict with 'session_id', 'message_count', 'entries' and
    'counts' (number of entries per type; all five types are always present).
    """
    # Resolved at call time; this order is the per-message entry order.
    extractors = (
        extract_qa_pair,
        extract_decision,
        extract_pattern,
        extract_preference,
        extract_error_fix,  # may span multiple messages
    )

    entries = []
    for i in range(len(messages)):
        for extractor in extractors:
            entry = extractor(messages, i)
            if entry:
                entry['session_id'] = session_id
                entries.append(entry)

    tally = Counter(e['type'] for e in entries)
    categories = ('qa_pair', 'decision', 'pattern', 'preference', 'error_fix')
    return {
        "session_id": session_id,
        "message_count": len(messages),
        "entries": entries,
        "counts": {cat: tally[cat] for cat in categories},
    }


def write_json_output(results: list[dict], output_path: Path):
    """Write aggregated results to JSON at *output_path* and return the payload.

    The file is written as UTF-8 because the JSON is dumped with
    ensure_ascii=False and may contain non-ASCII transcript text.
    """
    all_entries = [entry for r in results for entry in r['entries']]

    output = {
        "harvester": "transcript_harvester",
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "summary": {"sessions": len(results)},
        "total_entries": len(all_entries),
        "entries": all_entries,
    }
    output_path.write_text(
        json.dumps(output, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    return output


def write_report(results: list[dict], report_path: Path):
    """Write a human-readable markdown report to *report_path* (UTF-8).

    The report holds per-category totals across all sessions, followed by up
    to the first three entries of each session as samples.
    """
    lines = []
    lines.append("# Transcript Harvester Report")
    lines.append(f"Generated: {datetime.now(timezone.utc).isoformat()}")
    lines.append(f"Sessions processed: {len(results)}")

    # Pre-seed the known categories so they are listed even when empty;
    # .get() tolerates a category that is not in the seed list.
    totals = {cat: 0 for cat in ['qa_pair', 'decision', 'pattern', 'preference', 'error_fix']}
    for r in results:
        for cat, cnt in r['counts'].items():
            totals[cat] = totals.get(cat, 0) + cnt

    lines.append("\n## Extracted Knowledge by Category\n")
    for cat, cnt in totals.items():
        lines.append(f"- **{cat}**: {cnt}")

    lines.append("\n## Sample Entries\n")
    for r in results:
        for entry in r['entries'][:3]:
            lines.append(f"\n### {entry['type'].upper()} ({r['session_id']})\n")
            if entry['type'] == 'qa_pair':
                lines.append(f"**Q:** {entry['question']}\n")
                lines.append(f"**A:** {entry['answer']}\n")
            elif entry['type'] == 'decision':
                lines.append(f"**Decision:** {entry['decision']}\n")
                lines.append(f"By: {entry['by']}\n")
            elif entry['type'] == 'pattern':
                lines.append(f"**Pattern:** {entry['pattern']}\n")
            elif entry['type'] == 'preference':
                lines.append(f"**Preference:** {entry['preference']}\n")
            elif entry['type'] == 'error_fix':
                lines.append(f"**Error:** {entry['error']}\n")
                lines.append(f"**Fixed by:** {entry['fix']}\n")

    report_path.write_text("\n".join(lines), encoding="utf-8")


def find_recent_sessions(sessions_dir: Path, limit: int = 50) -> list[Path]:
    """Find up to `limit` .jsonl session files, last-sorting paths first.

    Files are ordered by path in reverse, not by modification time.
    NOTE(review): this is "most recent first" only if session file names sort
    chronologically — confirm against the naming scheme.
    A `limit` of 0 or less returns every file.
    """
    sessions = sorted(sessions_dir.glob("*.jsonl"), reverse=True)
    return sessions[:limit] if limit > 0 else sessions


def main():
    """Command-line entry point: harvest one session or a batch and write outputs."""
    parser = argparse.ArgumentParser(description="Harvest knowledge from session transcripts")
    parser.add_argument('--session', help='Single session JSONL file')
    parser.add_argument('--batch', action='store_true', help='Batch mode')
    parser.add_argument('--sessions-dir',
                        default=str(Path.home() / '.hermes' / 'sessions'),
                        help='Directory of session files')
    parser.add_argument('--output', default='knowledge/transcripts',
                        help='Output directory (default: knowledge/transcripts)')
    parser.add_argument('--limit', type=int, default=50,
                        help='Max sessions to process in batch (default: 50)')
    args = parser.parse_args()

    # Validate the mode before touching the filesystem so a bad invocation
    # does not leave an empty output directory behind.
    if not (args.session or args.batch):
        parser.print_help()
        sys.exit(1)

    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    results = []
    if args.session:
        # --session takes precedence when both modes are given.
        messages = read_session(args.session)
        session_id = Path(args.session).stem
        results.append(harvest_session(messages, session_id))
    else:
        sessions_dir = Path(args.sessions_dir)
        sessions = find_recent_sessions(sessions_dir, args.limit)
        print(f"Processing {len(sessions)} sessions...")
        for sf in sessions:
            messages = read_session(str(sf))
            results.append(harvest_session(messages, sf.stem))

    # Write outputs
    json_path = output_dir / "transcript_knowledge.json"
    report_path = output_dir / "transcript_report.md"

    output = write_json_output(results, json_path)
    write_report(results, report_path)

    print(f"\nDone: {output['total_entries']} entries from {len(results)} sessions")
    print(f"Output: {json_path}")
    print(f"Report: {report_path}")

    # Print category totals
    totals = {}
    for r in results:
        for cat, cnt in r['counts'].items():
            totals[cat] = totals.get(cat, 0) + cnt
    print("\nCategory counts:")
    for cat, cnt in sorted(totals.items()):
        print(f" {cat}: {cnt}")


if __name__ == '__main__':
    main()