Compare commits
1 Commits
step35/89-
...
step35/195
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7bcec41d16 |
7958
knowledge/transcripts/transcript_knowledge.json
Normal file
7958
knowledge/transcripts/transcript_knowledge.json
Normal file
File diff suppressed because one or more lines are too long
12305
knowledge/transcripts/transcript_report.md
Normal file
12305
knowledge/transcripts/transcript_report.md
Normal file
File diff suppressed because one or more lines are too long
@@ -1,357 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test Generation Orchestrator — 3.10 (Compounding Intelligence)
|
||||
|
||||
Implements a continuous pipeline that:
|
||||
1. Maintains a queue of repositories to process.
|
||||
2. Runs all 9 test generators per repository.
|
||||
3. Stores results (tests written, pass rate, coverage delta).
|
||||
4. After processing all repos, checks for new code changes and re-queues.
|
||||
5. Runs continuously — never idle (loop with sleep).
|
||||
|
||||
Usage:
|
||||
python3 scripts/test_generation_orchestrator.py [--once] [--queue PATH] [--sleep N]
|
||||
|
||||
Options:
|
||||
--once Run a single cycle then exit (for cron/debug).
|
||||
--queue FILE Path to queue file (default: test_queue.txt at repo root).
|
||||
--sleep N Sleep seconds between cycles (default: 3600).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, asdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
# ── Configuration ────────────────────────────────────────────────────────────
# All paths derive from this script's location: scripts/<this file> → repo root.
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
# Default queue of repositories to process (one path per line, '#' comments).
DEFAULT_QUEUE = REPO_ROOT / "test_queue.txt"
# Per-cycle JSON result summaries are written here.
RESULTS_DIR = REPO_ROOT / "metrics" / "test_generation"
# Generated test files are written here, one subdirectory per cycle/repo.
GENERATED_TESTS_DIR = REPO_ROOT / "generated_tests"
# Created eagerly at import time so later writes cannot fail on a missing dir.
GENERATED_TESTS_DIR.mkdir(exist_ok=True)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Nine test generator names (registered below)
GENERATOR_NAMES = [
    "regression",
    "gap",
    "dead_code",
    "perf",
    "dependency",
    "diff",
    "refactoring",
    "automation",
    "security",
]
|
||||
|
||||
# ── Data Classes ─────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class GenResult:
    """Outcome of one generator run against one repository."""
    generator: str                           # generator name, e.g. "regression"
    repo: str                                # repository path as a string
    tests_written: int                       # number of tests emitted/discovered
    pass_rate: float                         # fraction of passing tests, 0.0-1.0
    coverage_delta: Optional[float] = None   # coverage percentage, if measured
    error: Optional[str] = None              # failure description, if any

    def as_dict(self):
        """Return a serializable dict of this result stamped with the current UTC time."""
        return {**asdict(self), "timestamp": datetime.now(timezone.utc).isoformat()}
|
||||
|
||||
# ── Queue Management ─────────────────────────────────────────────────────────
|
||||
|
||||
def load_queue(path: Path) -> List[str]:
    """Read repo paths from *path*, one per line; skips blanks and '#' comments."""
    if not path.exists():
        return []
    entries = []
    for raw in path.read_text().splitlines():
        # NOTE: the comment check is on the raw line, so an *indented* '#'
        # line is kept — preserved from the original behavior.
        if raw.strip() and not raw.startswith('#'):
            entries.append(raw.strip())
    return entries
|
||||
|
||||
def save_queue(path: Path, queue: List[str]) -> None:
    """Persist *queue* to *path*, one entry per line, with a trailing newline."""
    text = '\n'.join(queue)
    path.write_text(text + '\n')
|
||||
|
||||
# ── Code Change Detection ────────────────────────────────────────────────────
|
||||
|
||||
def has_new_code(repo_path: Path, last_commit: Optional[str]) -> bool:
    """Return True if *repo_path* has commits newer than *last_commit*.

    Errs on the side of True: an unreadable repo, a failed git call, or a
    missing baseline SHA all count as "has new code".
    """
    try:
        head = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            capture_output=True, text=True, cwd=repo_path, timeout=10,
        )
        if head.returncode != 0:
            return True
        head_sha = head.stdout.strip()
        if last_commit is None:
            return True
        if head_sha == last_commit:
            # exactly up to date
            return False
        ancestry = subprocess.run(
            ["git", "merge-base", "--is-ancestor", last_commit, head_sha],
            capture_output=True, cwd=repo_path, timeout=10,
        )
        # Exit status 0 means last_commit IS an ancestor of HEAD, i.e. new
        # commits have landed on top of it.
        return ancestry.returncode == 0
    except Exception:
        return True
|
||||
|
||||
# ── Test Generation Implementations ─────────────────────────────────────────
|
||||
|
||||
def generate_regression_tests(repo_path: Path, out_dir: Path) -> GenResult:
    """Generate regression-guard tests from recent fix commits.

    Scans the last 30 days of commits whose message mentions "fix" and, for up
    to 20 of them, emits one test per touched file (max 3 per commit) asserting
    the file still exists. On any failure, returns a zero-count GenResult with
    the error recorded instead of raising.
    """
    import re  # local: the module top level does not import re

    try:
        out_dir.mkdir(parents=True, exist_ok=True)
        log = subprocess.run(
            ["git", "log", "--since=30 days ago", "--grep=fix", "--oneline"],
            capture_output=True, text=True, cwd=repo_path, timeout=30
        )
        fixes = [line.split()[0] for line in log.stdout.strip().splitlines() if line]
        test_lines = []
        for sha in fixes[:20]:
            files_out = subprocess.run(
                ["git", "show", "--name-only", "--pretty=format:", sha],
                capture_output=True, text=True, cwd=repo_path, timeout=10
            )
            files = [f.strip() for f in files_out.stdout.splitlines() if f.strip()]
            for f in files[:3]:
                # BUG FIX: Path(f).stem may contain characters that are not
                # legal in a Python identifier (dashes, dots, spaces), which
                # made the generated file a SyntaxError. Sanitize the stem.
                stem = re.sub(r"\W", "_", Path(f).stem)
                test_lines.append(
                    f'''def test_regression_{sha[:7]}_{stem}():
    """Regression guard: commit {sha} touched {f}"""
    repo = Path("{repo_path}")
    assert (repo / "{f}").exists(), "File missing after fix commit"
'''
                )
        test_file = out_dir / "test_regression_autogenerated.py"
        test_file.write_text('''"""Auto-generated regression tests from fix commits."""
import pytest
from pathlib import Path

''' + '\n'.join(test_lines))
        return GenResult("regression", str(repo_path), tests_written=len(test_lines),
                         pass_rate=1.0, coverage_delta=0.0)
    except Exception as e:
        return GenResult("regression", str(repo_path), 0, 0.0, error=str(e))
|
||||
|
||||
def generate_gap_tests(repo_path: Path, out_dir: Path) -> GenResult:
    """Generate tests for untested modules using knowledge_gap_identifier.

    Asks KnowledgeGapIdentifier (a sibling script, imported lazily) for gaps of
    type UNTESTED and emits one import-smoke test per module, capped at 50.
    Any failure is captured in the returned GenResult.error rather than raised.
    """
    try:
        out_dir.mkdir(parents=True, exist_ok=True)
        # Lazy import so the orchestrator still loads when the sibling script
        # is absent; the ImportError surfaces via GenResult.error below.
        sys.path.insert(0, str(SCRIPT_DIR))
        from knowledge_gap_identifier import KnowledgeGapIdentifier, GapType
        kgi = KnowledgeGapIdentifier()
        report = kgi.analyze(str(repo_path))
        untested = [g for g in report.gaps if g.gap_type == GapType.UNTESTED]
        test_lines = []
        # Cap at 50 gaps to keep the generated file a manageable size.
        for gap in untested[:50]:
            # NOTE(review): gap.name is interpolated into a function name, so
            # it is assumed to be a valid Python identifier — confirm against
            # knowledge_gap_identifier's output.
            module_name = gap.name
            file_rel = gap.file
            module_path = repo_path / file_rel
            if module_path.exists():
                # NOTE(review): module_path is embedded in a double-quoted
                # string literal; backslashes in Windows paths would break the
                # generated code — presumably POSIX-only. Verify.
                test_lines.append(
                    f'''def test_{module_name}_exists():
    """Ensure {module_name} module exists (auto-generated from gap)."""
    import importlib.util
    spec = importlib.util.spec_from_file_location("{module_name}", "{module_path}")
    mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mod)
    assert mod is not None
'''
                )
        test_file = out_dir / "test_gap_autogenerated.py"
        test_file.write_text('''"""Auto-generated tests for previously untested modules."""
import pytest

''' + '\n'.join(test_lines))
        return GenResult("gap", str(repo_path), tests_written=len(test_lines),
                         pass_rate=1.0, coverage_delta=0.0)
    except Exception as e:
        return GenResult("gap", str(repo_path), 0, 0.0, error=str(e))
|
||||
|
||||
def _stub(name: str, desc: str):
    """Factory for stub generators that emit a single passing test.

    Returns a generator callable with the same (repo_path, out_dir) signature
    as the real generators; it writes one placeholder test named after *name*.
    """
    def _generator(repo_path: Path, out_dir: Path) -> GenResult:
        try:
            out_dir.mkdir(parents=True, exist_ok=True)
            target = out_dir / f"test_{name}_autogenerated.py"
            target.write_text(f'''"""Auto-generated {desc} tests (stub)."""
import pytest


def test_{name}_placeholder():
    assert True  # {name} test placeholder
''')
            return GenResult(name, str(repo_path), tests_written=1, pass_rate=1.0)
        except Exception as exc:
            return GenResult(name, str(repo_path), 0, 0.0, error=str(exc))
    return _generator
|
||||
|
||||
# Registry mapping each generator name to its implementation. "regression"
# and "gap" perform real analysis; the remaining seven are placeholder stubs.
GENERATORS = {
    "regression": generate_regression_tests,
    "gap": generate_gap_tests,
    "dead_code": _stub("dead_code", "dead-code"),
    "perf": _stub("perf", "performance"),
    "dependency": _stub("dependency", "dependency"),
    "diff": _stub("diff", "diff"),
    "refactoring": _stub("refactoring", "refactoring"),
    "automation": _stub("automation", "automation"),
    "security": _stub("security", "security"),
}
|
||||
|
||||
# ── Pytest Runner ─────────────────────────────────────────────────────────────
|
||||
|
||||
def run_pytest(generated_dir: Path, repo_path: Path) -> Dict:
    """Run pytest over *generated_dir* and parse pass/fail/coverage numbers.

    Returns a dict with keys: passed, failed, pass_rate, coverage, exit_code,
    raw_output (truncated to 500 chars). An empty directory short-circuits to
    a clean all-pass result without spawning a subprocess.
    """
    if not any(generated_dir.iterdir()):
        return {"passed": 0, "failed": 0, "pass_rate": 1.0, "coverage": None, "exit_code": 0, "raw_output": ""}

    cmd = [sys.executable, "-m", "pytest", str(generated_dir), "--tb=short", "-q"]
    have_cov = False
    try:
        import coverage  # noqa
    except ImportError:
        pass
    else:
        # coverage is installed: collect a JSON coverage report alongside.
        cov_dir = generated_dir.parent / "coverage_data"
        cov_dir.mkdir(exist_ok=True)
        cmd = [
            sys.executable, "-m", "pytest",
            str(generated_dir),
            f"--cov={repo_path}",
            f"--cov-report=json:{cov_dir / 'coverage.json'}",
            "--tb=short", "-q"
        ]
        have_cov = True

    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120, cwd=repo_path)
    combined = result.stdout + result.stderr

    import re
    match = re.search(r'(\d+) passed', combined)
    passed = int(match.group(1)) if match else 0
    match = re.search(r'(\d+) failed', combined)
    failed = int(match.group(1)) if match else 0
    total = passed + failed

    coverage_pct = None
    if have_cov:
        try:
            cov_file = generated_dir.parent / "coverage_data" / "coverage.json"
            if cov_file.exists():
                cov_payload = json.loads(cov_file.read_text())
                coverage_pct = float(cov_payload.get('totals', {}).get('percent_covered', 0.0))
        except Exception:
            coverage_pct = None  # best-effort: coverage stays unknown

    return {
        "passed": passed, "failed": failed,
        "pass_rate": passed / total if total > 0 else 1.0,
        "coverage": coverage_pct, "exit_code": result.returncode,
        "raw_output": combined[:500],
    }
|
||||
|
||||
# ── Per-Repo Processor ────────────────────────────────────────────────────────
|
||||
|
||||
def process_repo(repo_path: Path, queue: List[str]) -> None:
    """Run every registered generator against *repo_path* and record results.

    Writes generated tests under a fresh cycle directory, stores a JSON
    summary under RESULTS_DIR, and — when new commits have landed since the
    last recorded SHA — re-queues the repo by appending to *queue* in place.
    """
    repo_key = repo_path.name
    if not (repo_path / ".git").exists():
        print(f"  Skipping {repo_key}: not a git repo")
        return

    cycle_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    cycle_dir = GENERATED_TESTS_DIR / cycle_id / repo_key
    cycle_dir.mkdir(parents=True, exist_ok=True)

    cycle_results = []
    for gname in GENERATOR_NAMES:
        gen_func = GENERATORS.get(gname)
        if gen_func is None:
            print(f"  [{gname}] not registered, skipping")
            continue
        gen_out = cycle_dir / gname
        res = gen_func(repo_path, gen_out)
        pytest_res = run_pytest(gen_out, repo_path)
        res.pass_rate = pytest_res["pass_rate"]
        # Prefer the count pytest actually discovered over the generator's claim.
        total_tests = pytest_res["passed"] + pytest_res["failed"]
        if total_tests > 0:
            res.tests_written = total_tests
        if pytest_res["coverage"] is not None:
            res.coverage_delta = pytest_res["coverage"]
        # Exit codes 0-4 are ordinary pytest outcomes; anything else is an
        # internal/usage error worth recording on the result.
        if pytest_res["exit_code"] not in (0, 1, 2, 3, 4):
            res.error = (res.error or '') + f" pytest exit {pytest_res['exit_code']}"
        cycle_results.append(res.as_dict())
        status = "PASS" if pytest_res["passed"] == total_tests and total_tests > 0 else f"{pytest_res['failed']} fails"
        print(f"  [{gname}] {res.tests_written} tests, pass rate {pytest_res['pass_rate']:.0%} — {status}")

    # Store summary.
    # BUG FIX: the previous avg_pass_rate summed keys "tests_passed" and
    # "tests_failed" that GenResult never produces (always 0), and its
    # parenthesisation passed a bare int to sum() (TypeError at runtime).
    # Average the per-generator pass rates instead.
    avg_pass_rate = (
        sum(r.get("pass_rate", 0.0) for r in cycle_results) / len(cycle_results)
        if cycle_results else 0.0
    )
    summary = {
        "repo": str(repo_path),
        "cycle": cycle_id,
        "generators": cycle_results,
        "summary": {
            "total_tests_written": sum(r.get("tests_written", 0) for r in cycle_results),
            "avg_pass_rate": avg_pass_rate,
        }
    }
    out_json = RESULTS_DIR / f"{repo_key}_{cycle_id}.json"
    out_json.write_text(json.dumps(summary, indent=2))
    print(f"  Stored results: {out_json}")

    # Re-queue if new code landed since the last recorded SHA.
    last_commit_file = REPO_ROOT / ".orchestrator" / f"last_{repo_key}.txt"
    last_commit = last_commit_file.read_text().strip() if last_commit_file.exists() else None
    if has_new_code(repo_path, last_commit):
        print(f"  New commits detected — re-queuing {repo_key}")
        queue.append(str(repo_path))
        cur = subprocess.run(["git", "rev-parse", "HEAD"], capture_output=True, text=True, cwd=repo_path)
        if cur.returncode == 0:
            last_commit_file.parent.mkdir(parents=True, exist_ok=True)
            last_commit_file.write_text(cur.stdout.strip())
|
||||
|
||||
# ── Main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """CLI entry point: cycle over the repository queue until interrupted.

    Each cycle snapshots the queue, clears it, processes every repo, and lets
    process_repo re-append repos that still have fresh commits. The queue file
    is persisted after every cycle and on Ctrl-C.
    """
    parser = argparse.ArgumentParser(description="Test Generation Orchestrator")
    parser.add_argument("--once", action="store_true", help="Run single cycle then exit")
    parser.add_argument("--queue", type=Path, default=DEFAULT_QUEUE, help="Queue file path")
    parser.add_argument("--sleep", type=int, default=3600, help="Sleep seconds between cycles")
    args = parser.parse_args()

    queue = load_queue(args.queue)
    if not queue:
        print("[Orchestrator] Queue empty. Add repo paths (one per line) to test_queue.txt.")
        sys.exit(1)

    cycle = 0
    try:
        while True:
            cycle += 1
            print(f"\n[Orchestrator] Cycle {cycle} — {len(queue)} repos to process")
            # Snapshot, then clear: process_repo re-adds repos that need
            # another pass during the loop below.
            pending = list(queue)
            queue.clear()
            for repo_str in pending:
                repo_path = Path(repo_str).expanduser().resolve()
                if not repo_path.exists():
                    print(f"  Path missing: {repo_str} — skipping")
                    continue
                process_repo(repo_path, queue)  # queue may get appended during loop
            print(f"[Orchestrator] Cycle {cycle} complete. {len(queue)} repos re-queued for next cycle.")
            save_queue(args.queue, queue)
            if args.once:
                break
            print(f"[Orchestrator] Sleeping for {args.sleep} seconds...")
            time.sleep(args.sleep)
    except KeyboardInterrupt:
        save_queue(args.queue, queue)
        sys.exit(0)


if __name__ == "__main__":
    main()
|
||||
@@ -1,101 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Smoke tests for test_generation_orchestrator.py
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
# Add scripts dir to path for imports (orchestrator.py lives in scripts/)
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
sys.path.insert(0, str(SCRIPT_DIR))
|
||||
|
||||
from test_generation_orchestrator import (
|
||||
load_queue, save_queue, GenResult, has_new_code,
|
||||
_stub, GENERATOR_NAMES, GENERATORS
|
||||
)
|
||||
|
||||
def test_load_queue_empty_when_missing():
    """A queue path that does not exist loads as an empty list."""
    with tempfile.TemporaryDirectory() as tmp:
        missing = Path(tmp) / "nofile.txt"
        assert load_queue(missing) == []
|
||||
|
||||
def test_save_and_load_queue_roundtrip():
    """Comments and blank lines survive save_queue but are dropped on load."""
    with tempfile.TemporaryDirectory() as tmp:
        queue_file = Path(tmp) / "queue.txt"
        save_queue(queue_file, ["repo1", "# comment", "", "repo2"])
        assert load_queue(queue_file) == ["repo1", "repo2"]
|
||||
|
||||
def test_stub_generator_creates_test_file():
    """A stub generator writes one placeholder test and reports success."""
    with tempfile.TemporaryDirectory() as tmp:
        repo_dir = Path(tmp) / "repo"
        repo_dir.mkdir()
        out_dir = Path(tmp) / "out"
        result = _stub("testme", "testme-desc")(repo_dir, out_dir)
        assert result.tests_written == 1
        assert result.pass_rate == 1.0
        generated = out_dir / "test_testme_autogenerated.py"
        assert generated.exists()
        body = generated.read_text()
        assert "test_testme_placeholder" in body
        assert "assert True" in body
|
||||
|
||||
def test_all_nine_generators_registered():
    """Every declared generator name must map to a registered implementation."""
    assert len(GENERATOR_NAMES) == 9
    for name in GENERATOR_NAMES:
        assert name in GENERATORS, f"Generator {name} not in GENERATORS dict"
|
||||
|
||||
def test_genresult_serialization():
    """as_dict preserves every field and adds a timestamp."""
    d = GenResult("gap", "/fake", 5, 0.8, coverage_delta=2.5, error=None).as_dict()
    expected = {"generator": "gap", "tests_written": 5, "pass_rate": 0.8, "coverage_delta": 2.5}
    for key, value in expected.items():
        assert d[key] == value
    assert "timestamp" in d
|
||||
|
||||
def test_has_new_code_when_no_last():
    """With no recorded SHA, any repo counts as having new code."""
    with tempfile.TemporaryDirectory() as tmp:
        repo = Path(tmp) / "repo"
        repo.mkdir()

        def git(*argv):
            subprocess.run(["git", *argv], cwd=repo, check=True, capture_output=True)

        git("init")
        (repo / "file.txt").write_text("hello")
        git("add", ".")
        git("commit", "-m", "init")
        assert has_new_code(repo, None) is True
|
||||
|
||||
def test_has_new_code_when_behind():
    """A stored SHA that is an ancestor of HEAD means new code exists."""
    with tempfile.TemporaryDirectory() as tmp:
        repo = Path(tmp) / "repo"
        repo.mkdir()

        def git(*argv):
            return subprocess.run(["git", *argv], cwd=repo, check=True,
                                  capture_output=True, text=True)

        git("init")
        (repo / "f1").write_text("a")
        git("add", ".")
        git("commit", "-m", "first")
        first_sha = git("rev-parse", "HEAD").stdout.strip()
        # land a second commit on top of the recorded one
        (repo / "f2").write_text("b")
        git("add", ".")
        git("commit", "-m", "second")
        assert has_new_code(repo, first_sha) is True
|
||||
|
||||
def test_has_new_code_when_up_to_date():
    """When the stored SHA equals HEAD there is nothing new to process."""
    with tempfile.TemporaryDirectory() as tmp:
        repo = Path(tmp) / "repo"
        repo.mkdir()

        def git(*argv):
            return subprocess.run(["git", *argv], cwd=repo, check=True,
                                  capture_output=True, text=True)

        git("init")
        (repo / "f").write_text("a")
        git("add", ".")
        git("commit", "-m", "c")
        head = git("rev-parse", "HEAD").stdout.strip()
        assert has_new_code(repo, head) is False


if __name__ == "__main__":
    import pytest
    sys.exit(pytest.main([__file__, "-v"]))
|
||||
377
scripts/transcript_harvester.py
Executable file
377
scripts/transcript_harvester.py
Executable file
@@ -0,0 +1,377 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
transcript_harvester.py — Rule-based knowledge extraction from Hermes session transcripts.
|
||||
|
||||
Extracts 5 knowledge categories without LLM inference:
|
||||
• qa_pair — user question + assistant answer
|
||||
• decision — explicit choice ("we decided to X", "I'll use Y")
|
||||
• pattern — solution/recipe ("the fix for Z is to do W")
|
||||
• preference — personal or team inclination ("I always", "I prefer")
|
||||
• fact — concrete observed information (errors, paths, commands)
|
||||
|
||||
Usage:
|
||||
python3 transcript_harvester.py --session ~/.hermes/sessions/session_xxx.jsonl
|
||||
python3 transcript_harvester.py --batch --sessions-dir ~/.hermes/sessions --limit 50
|
||||
python3 transcript_harvester.py --session session.jsonl --output knowledge/transcripts/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# Import session_reader from the same scripts directory
|
||||
SCRIPT_DIR = Path(__file__).parent.absolute()
|
||||
sys.path.insert(0, str(SCRIPT_DIR))
|
||||
from session_reader import read_session
|
||||
|
||||
|
||||
# --- Pattern matchers --------------------------------------------------------
|
||||
|
||||
# Regexes that flag an explicit decision ("we decided to X", "let's use Y").
DECISION_PATTERNS = [
    r"\b(we\s+(?:decided|chose|agreed|will|are going)\s+to\s+.*)",
    r"\b(I\s+will\s+use|I\s+choose|I\s+am going\s+to)\s+.*",
    r"\b(let's\s+(?:use|go\s+with|do|try))\s+.*",
    r"\b(the\s+(?:decision|choice)\s+is)\s+.*",
    r"\b(I'll\s+implement|I'll\s+deploy|I'll\s+create)\s+.*",
]

# Regexes that flag a reusable solution/recipe description.
PATTERN_PATTERNS = [
    r"\b(the\s+fix\s+for\s+.*\s+is\s+to\s+.*)",
    r"\b(solution:?\s+.*)",
    r"\b(approach:?\s+.*)",
    r"\b(procedure:?\s+.*)",
    r"\b(to\s+resolve\s+this.*?,\s+.*)",
    r"\b(used\s+.*\s+to\s+.*)",  # "used X to do Y"
    r"\b(by\s+doing\s+.*\s+we\s+.*)",
    r"\b(Here's\s+the\s+.*\s+process:?)",  # "Here's the deployment process:"
    r"\b(The\s+steps\s+are:?)",
    r"\b(steps\s+to\s+.*:?)",
    r"\b(Implementation\s+plan:?)",
    r"\b(\d+\.\s+.*\n\d+\.)",  # numbered multi-step (at least two steps detected by newlines)
]

# Regexes that flag a stated personal or team preference.
PREFERENCE_PATTERNS = [
    r"\b(I\s+(?:always|never|prefer|usually|typically|generally)\s+.*)",
    r"\b(I\s+like\s+.*)",
    r"\b(My\s+preference\s+is\s+.*)",
    r"\b(Alexander\s+(?:prefers|always|never).*)",
    r"\b(We\s+always\s+.*)",
]

# Regexes that flag an error/failure report in a message.
ERROR_PATTERNS = [
    r"\b(error|failed|fatal|exception|denied|could\s+not|couldn't)\b.*",
]

# For a fix that follows an error within 2 messages
# NOTE(review): the comment above says "within 2 messages", but
# extract_error_fix actually scans up to 7 following messages — confirm
# which window is intended.
FIX_INDICATORS = [
    r"\b(fixed|resolved|added|generated|created|corrected|worked)\b",
    r"\b(the\s+key\s+is|solution\s+was|generate\s+a\s+new)\b",
]
|
||||
|
||||
|
||||
def is_decision(text: str) -> bool:
    """Return True when *text* matches any decision-announcement pattern."""
    return any(re.search(pat, text, re.IGNORECASE) for pat in DECISION_PATTERNS)
|
||||
|
||||
def is_pattern(text: str) -> bool:
    """Return True when *text* matches any solution/recipe pattern."""
    return any(re.search(pat, text, re.IGNORECASE) for pat in PATTERN_PATTERNS)
|
||||
|
||||
def is_preference(text: str) -> bool:
    """Return True when *text* matches any stated-preference pattern."""
    return any(re.search(pat, text, re.IGNORECASE) for pat in PREFERENCE_PATTERNS)
|
||||
|
||||
def is_error(text: str) -> bool:
    """Return True when *text* matches any error/failure pattern."""
    return any(re.search(pat, text, re.IGNORECASE) for pat in ERROR_PATTERNS)
|
||||
|
||||
def is_fix_indicator(text: str) -> bool:
    """Return True when *text* matches any fix-announcement pattern."""
    return any(re.search(pat, text, re.IGNORECASE) for pat in FIX_INDICATORS)
|
||||
|
||||
|
||||
# --- Extractors --------------------------------------------------------------
|
||||
|
||||
def extract_qa_pair(messages: list[dict], idx: int) -> Optional[dict]:
    """Extract a question→answer pair: a user question followed by an assistant answer.

    Returns None unless messages[idx] is a user message that looks like a real
    question (ends with '?' or starts with a WH-/auxiliary word) and the next
    message is a non-trivial assistant answer (3+ words).
    """
    if idx + 1 >= len(messages):
        return None
    asker = messages[idx]
    responder = messages[idx + 1]
    if asker.get('role') != 'user' or responder.get('role') != 'assistant':
        return None
    question = asker.get('content', '').strip()
    answer = responder.get('content', '').strip()
    if not (question and answer):
        return None
    looks_like_question = question.endswith('?') or bool(
        re.match(r'^(how|what|why|when|where|who|which|can|do|is|are)', question, re.IGNORECASE)
    )
    if not looks_like_question:
        return None
    # Skip trivially short answers ("OK", "Yes").
    if len(answer.split()) < 3:
        return None
    return {
        "type": "qa_pair",
        "question": question,
        "answer": answer,
        "timestamp": asker.get('timestamp', ''),
    }
|
||||
|
||||
|
||||
def extract_decision(messages: list[dict], idx: int) -> Optional[dict]:
    """Extract a decision statement from messages[idx], or None."""
    message = messages[idx]
    body = message.get('content', '').strip()
    if not is_decision(body):
        return None
    return {
        "type": "decision",
        "decision": body,
        "by": message.get('role', 'unknown'),
        "timestamp": message.get('timestamp', ''),
    }
|
||||
|
||||
|
||||
def extract_pattern(messages: list[dict], idx: int) -> Optional[dict]:
    """Extract a pattern/solution description from messages[idx], or None."""
    message = messages[idx]
    body = message.get('content', '').strip()
    if not is_pattern(body):
        return None
    return {
        "type": "pattern",
        "pattern": body,
        "by": message.get('role', 'unknown'),
        "timestamp": message.get('timestamp', ''),
    }
|
||||
|
||||
|
||||
def extract_preference(messages: list[dict], idx: int) -> Optional[dict]:
    """Extract a stated preference from messages[idx], or None."""
    message = messages[idx]
    body = message.get('content', '').strip()
    if not is_preference(body):
        return None
    return {
        "type": "preference",
        "preference": body,
        "by": message.get('role', 'unknown'),
        "timestamp": message.get('timestamp', ''),
    }
|
||||
|
||||
|
||||
def extract_error_fix(messages: list[dict], idx: int) -> Optional[dict]:
    """Link an error message to the fix that follows it.

    Two follow-up shapes count as a fix, scanning up to 7 later messages:
    1. An explicit fix indicator ("fixed", "resolved", ...).
    2. A fix decision ("I'll ...", "Let's ...", "We need to ...").
    Returns None when messages[idx] is not an error or no fix is found.
    """
    source = messages[idx]
    if not is_error(source.get('content', '')):
        return None
    error_text = source.get('content', '').strip()

    # Slicing clamps at the list end, matching the original min() window.
    for candidate in messages[idx + 1: idx + 8]:
        candidate_text = candidate.get('content', '').strip()
        announced_fix = is_fix_indicator(candidate_text)
        decided_fix = re.match(
            r"^(I'll|I will|Let's|We (will|should|need to))\s+\w+",
            candidate_text, re.IGNORECASE,
        )
        if announced_fix or decided_fix:
            return {
                "type": "error_fix",
                "error": error_text,
                "fix": candidate_text,
                "error_timestamp": source.get('timestamp', ''),
                "fix_timestamp": candidate.get('timestamp', ''),
            }
    return None
|
||||
def harvest_session(messages: list[dict], session_id: str) -> dict:
    """Run every extractor over every message and collect knowledge entries.

    Returns a dict with the session id, message count, the tagged entries
    (each stamped with session_id), and per-category counts.
    """
    # Extractor order matters: it fixes the relative order of entries found
    # at the same message index.
    extractors = (
        extract_qa_pair,
        extract_decision,
        extract_pattern,
        extract_preference,
        extract_error_fix,
    )
    entries = []
    for i in range(len(messages)):
        for extractor in extractors:
            found = extractor(messages, i)
            if found:
                found['session_id'] = session_id
                entries.append(found)

    categories = ('qa_pair', 'decision', 'pattern', 'preference', 'error_fix')
    return {
        "session_id": session_id,
        "message_count": len(messages),
        "entries": entries,
        "counts": {cat: sum(1 for e in entries if e['type'] == cat) for cat in categories},
    }
|
||||
|
||||
|
||||
def write_json_output(results: list[dict], output_path: Path):
    """Aggregate per-session results into one JSON document and write it.

    Returns the written payload so callers can report totals.
    """
    harvested = [entry for session in results for entry in session['entries']]
    payload = {
        "harvester": "transcript_harvester",
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "summary": {"sessions": len(results)},
        "total_entries": len(harvested),
        "entries": harvested,
    }
    output_path.write_text(json.dumps(payload, indent=2, ensure_ascii=False))
    return payload
|
||||
|
||||
|
||||
def write_report(results: list[dict], report_path: Path):
    """Render a human-readable markdown report of harvested sessions.

    Includes per-category totals plus up to three sample entries per session.
    """
    categories = ['qa_pair', 'decision', 'pattern', 'preference', 'error_fix']
    out = [
        "# Transcript Harvester Report",
        f"Generated: {datetime.now(timezone.utc).isoformat()}",
        f"Sessions processed: {len(results)}",
    ]

    totals = {cat: 0 for cat in categories}
    for session in results:
        for cat, cnt in session['counts'].items():
            totals[cat] += cnt

    out.append("\n## Extracted Knowledge by Category\n")
    out.extend(f"- **{cat}**: {cnt}" for cat, cnt in totals.items())

    out.append("\n## Sample Entries\n")
    for session in results:
        for entry in session['entries'][:3]:
            kind = entry['type']
            out.append(f"\n### {kind.upper()} ({session['session_id']})\n")
            if kind == 'qa_pair':
                out.append(f"**Q:** {entry['question']}\n")
                out.append(f"**A:** {entry['answer']}\n")
            elif kind == 'decision':
                out.append(f"**Decision:** {entry['decision']}\n")
                out.append(f"By: {entry['by']}\n")
            elif kind == 'pattern':
                out.append(f"**Pattern:** {entry['pattern']}\n")
            elif kind == 'preference':
                out.append(f"**Preference:** {entry['preference']}\n")
            elif kind == 'error_fix':
                out.append(f"**Error:** {entry['error']}\n")
                out.append(f"**Fixed by:** {entry['fix']}\n")

    report_path.write_text("\n".join(out))
|
||||
|
||||
|
||||
def find_recent_sessions(sessions_dir: Path, limit: int = 50) -> list[Path]:
    """Return up to *limit* session files, in descending filename order.

    NOTE(review): "recent" here means descending *filename* sort, not mtime —
    presumably session filenames embed timestamps; confirm. A limit of 0 or
    less returns all sessions.
    """
    ordered = sorted(sessions_dir.glob("*.jsonl"), reverse=True)
    if limit > 0:
        return ordered[:limit]
    return ordered
|
||||
|
||||
|
||||
def main():
    """CLI entry point: harvest one session or a batch, then write outputs.

    Writes transcript_knowledge.json and transcript_report.md into the output
    directory and prints a category tally to the console.
    """
    parser = argparse.ArgumentParser(description="Harvest knowledge from session transcripts")
    parser.add_argument('--session', help='Single session JSONL file')
    parser.add_argument('--batch', action='store_true', help='Batch mode')
    parser.add_argument('--sessions-dir', default=str(Path.home() / '.hermes' / 'sessions'),
                        help='Directory of session files')
    parser.add_argument('--output', default='knowledge/transcripts',
                        help='Output directory (default: knowledge/transcripts)')
    parser.add_argument('--limit', type=int, default=50,
                        help='Max sessions to process in batch (default: 50)')
    args = parser.parse_args()

    output_dir = Path(args.output)
    output_dir.mkdir(parents=True, exist_ok=True)

    results = []
    if args.session:
        results.append(harvest_session(read_session(args.session), Path(args.session).stem))
    elif args.batch:
        session_files = find_recent_sessions(Path(args.sessions_dir), args.limit)
        print(f"Processing {len(session_files)} sessions...")
        for session_file in session_files:
            results.append(harvest_session(read_session(str(session_file)), session_file.stem))
    else:
        parser.print_help()
        sys.exit(1)

    # Write outputs
    json_path = output_dir / "transcript_knowledge.json"
    report_path = output_dir / "transcript_report.md"
    output = write_json_output(results, json_path)
    write_report(results, report_path)

    print(f"\nDone: {output['total_entries']} entries from {len(results)} sessions")
    print(f"Output: {json_path}")
    print(f"Report: {report_path}")

    # Tally categories across all sessions for the console summary.
    totals = {}
    for session in results:
        for category, count in session['counts'].items():
            totals[category] = totals.get(category, 0) + count
    print("\nCategory counts:")
    for category, count in sorted(totals.items()):
        print(f"  {category}: {count}")


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user