feat(knowledge-gap): add CLI entry point for monthly execution (Closes #172)

- Adds main() with argparse for repo_path, --json, and --output flags
- Enables scheduled monthly execution via cron
- Fixes to_dict() stats keys to use the GapType string values (k.value) for JSON serialization
2026-04-26 11:23:36 -04:00
4 changed files with 42 additions and 20641 deletions
--- a/knowledge/transcripts/transcript_knowledge.json
+++ b/knowledge/transcripts/transcript_knowledge.json
--- a/knowledge/transcripts/transcript_report.md
+++ b/knowledge/transcripts/transcript_report.md
--- a/scripts/knowledge_gap_identifier.py
+++ b/scripts/knowledge_gap_identifier.py
@@ -75,7 +75,7 @@ class GapReport:
        return {
            "repo_path": self.repo_path,
            "total_gaps": len(self.gaps),
-            "stats": {k: len(v) for k, v in
+            "stats": {k.value: len(v) for k, v in
                      {gt: [g for g in self.gaps if g.gap_type == gt]
                       for gt in GapType}.items() if v},
            "gaps": [
@@ -273,3 +273,44 @@ class KnowledgeGapIdentifier:
                    ))

        return report
+
+def main() -> None:
+    """Parse CLI arguments, analyze the repository, and emit the gap report."""
+    import argparse
+    import json
+    parser = argparse.ArgumentParser(
+        description="Knowledge Gap Identifier — cross-reference code, docs, and tests to find gaps"
+    )
+    parser.add_argument(
+        "repo_path",
+        nargs="?",
+        default=".",
+        help="Path to repository root (default: current directory)"
+    )
+    parser.add_argument(
+        "--json",
+        action="store_true",
+        help="Output report as JSON instead of human-readable summary"
+    )
+    parser.add_argument(
+        "-o", "--output",
+        help="Write report to file instead of stdout"
+    )
+    args = parser.parse_args()
+
+    report = KnowledgeGapIdentifier().analyze(args.repo_path)
+    # default=str is a fallback for values json cannot encode natively.
+    if args.json:
+        output = json.dumps(report.to_dict(), indent=2, default=str)
+    else:
+        output = report.summary()
+    # Pin UTF-8 so the written file does not depend on the host locale's encoding.
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as fh:
+            print(output, file=fh)
+    else:
+        print(output)
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/transcript_harvester.py
+++ b/scripts/transcript_harvester.py
@@ -1,377 +0,0 @@
-#!/usr/bin/env python3
-"""
-transcript_harvester.py — Rule-based knowledge extraction from Hermes session transcripts.
-
-Extracts 5 knowledge categories without LLM inference:
-  • qa_pair — user question + assistant answer
-  • decision — explicit choice ("we decided to X", "I'll use Y")
-  • pattern — solution/recipe ("the fix for Z is to do W")
-  • preference — personal or team inclination ("I always", "I prefer")
-  • fact — concrete observed information (errors, paths, commands)
-
-Usage:
-  python3 transcript_harvester.py --session ~/.hermes/sessions/session_xxx.jsonl
-  python3 transcript_harvester.py --batch --sessions-dir ~/.hermes/sessions --limit 50
-  python3 transcript_harvester.py --session session.jsonl --output knowledge/transcripts/
-"""
-
-import argparse
-import json
-import re
-import sys
-from datetime import datetime, timezone
-from pathlib import Path
-from typing import Optional
-
-# Import session_reader from the same scripts directory
-SCRIPT_DIR = Path(__file__).parent.absolute()
-sys.path.insert(0, str(SCRIPT_DIR))
-from session_reader import read_session
-
-
-# --- Pattern matchers --------------------------------------------------------
-
-DECISION_PATTERNS = [
-    r"\b(we\s+(?:decided|chose|agreed|will|are going)\s+to\s+.*)",
-    r"\b(I\s+will\s+use|I\s+choose|I\s+am going\s+to)\s+.*",
-    r"\b(let's\s+(?:use|go\s+with|do|try))\s+.*",
-    r"\b(the\s+(?:decision|choice)\s+is)\s+.*",
-    r"\b(I'll\s+implement|I'll\s+deploy|I'll\s+create)\s+.*",
-]
-
-PATTERN_PATTERNS = [
-    r"\b(the\s+fix\s+for\s+.*\s+is\s+to\s+.*)",
-    r"\b(solution:?\s+.*)",
-    r"\b(approach:?\s+.*)",
-    r"\b(procedure:?\s+.*)",
-    r"\b(to\s+resolve\s+this.*?,\s+.*)",
-    r"\b(used\s+.*\s+to\s+.*)",  # "used X to do Y"
-    r"\b(by\s+doing\s+.*\s+we\s+.*)",
-    r"\b(Here's\s+the\s+.*\s+process:?)",  # "Here's the deployment process:"
-    r"\b(The\s+steps\s+are:?)",
-    r"\b(steps\s+to\s+.*:?)",
-    r"\b(Implementation\s+plan:?)",
-    r"\b(\d+\.\s+.*\n\d+\.)",  # numbered multi-step (at least two steps detected by newlines)
-]
-
-PREFERENCE_PATTERNS = [
-    r"\b(I\s+(?:always|never|prefer|usually|typically|generally)\s+.*)",
-    r"\b(I\s+like\s+.*)",
-    r"\b(My\s+preference\s+is\s+.*)",
-    r"\b(Alexander\s+(?:prefers|always|never).*)",
-    r"\b(We\s+always\s+.*)",
-]
-
-ERROR_PATTERNS = [
-    r"\b(error|failed|fatal|exception|denied|could\s+not|couldn't)\b.*",
-]
-
-# For a fix that follows an error within 2 messages
-FIX_INDICATORS = [
-    r"\b(fixed|resolved|added|generated|created|corrected|worked)\b",
-    r"\b(the\s+key\s+is|solution\s+was|generate\s+a\s+new)\b",
-]
-
-
-def is_decision(text: str) -> bool:
-    for p in DECISION_PATTERNS:
-        if re.search(p, text, re.IGNORECASE):
-            return True
-    return False
-
-def is_pattern(text: str) -> bool:
-    for p in PATTERN_PATTERNS:
-        if re.search(p, text, re.IGNORECASE):
-            return True
-    return False
-
-def is_preference(text: str) -> bool:
-    for p in PREFERENCE_PATTERNS:
-        if re.search(p, text, re.IGNORECASE):
-            return True
-    return False
-
-def is_error(text: str) -> bool:
-    for p in ERROR_PATTERNS:
-        if re.search(p, text, re.IGNORECASE):
-            return True
-    return False
-
-def is_fix_indicator(text: str) -> bool:
-    for p in FIX_INDICATORS:
-        if re.search(p, text, re.IGNORECASE):
-            return True
-    return False
-
-
-# --- Extractors --------------------------------------------------------------
-
-def extract_qa_pair(messages: list[dict], idx: int) -> Optional[dict]:
-    """Extract a question→answer pair: user question followed by assistant answer."""
-    if idx + 1 >= len(messages):
-        return None
-    curr = messages[idx]
-    nxt = messages[idx + 1]
-    if curr.get('role') != 'user' or nxt.get('role') != 'assistant':
-        return None
-    question = curr.get('content', '').strip()
-    answer = nxt.get('content', '').strip()
-    if not question or not answer:
-        return None
-    # Must be a real question (ends with ? or starts with WH-)
-    if not (question.endswith('?') or re.match(r'^(how|what|why|when|where|who|which|can|do|is|are)', question, re.IGNORECASE)):
-        return None
-    # Skip very short answers ("OK", "Yes")
-    if len(answer.split()) < 3:
-        return None
-    return {
-        "type": "qa_pair",
-        "question": question,
-        "answer": answer,
-        "timestamp": curr.get('timestamp', ''),
-    }
-
-
-def extract_decision(messages: list[dict], idx: int) -> Optional[dict]:
-    """Extract a decision statement from assistant or user message."""
-    msg = messages[idx]
-    text = msg.get('content', '').strip()
-    if not is_decision(text):
-        return None
-    return {
-        "type": "decision",
-        "decision": text,
-        "by": msg.get('role', 'unknown'),
-        "timestamp": msg.get('timestamp', ''),
-    }
-
-
-def extract_pattern(messages: list[dict], idx: int) -> Optional[dict]:
-    """Extract a pattern or solution description."""
-    msg = messages[idx]
-    text = msg.get('content', '').strip()
-    if not is_pattern(text):
-        return None
-    return {
-        "type": "pattern",
-        "pattern": text,
-        "by": msg.get('role', 'unknown'),
-        "timestamp": msg.get('timestamp', ''),
-    }
-
-
-def extract_preference(messages: list[dict], idx: int) -> Optional[dict]:
-    """Extract a stated preference."""
-    msg = messages[idx]
-    text = msg.get('content', '').strip()
-    if not is_preference(text):
-        return None
-    return {
-        "type": "preference",
-        "preference": text,
-        "by": msg.get('role', 'unknown'),
-        "timestamp": msg.get('timestamp', ''),
-    }
-
-
-def extract_error_fix(messages: list[dict], idx: int) -> Optional[dict]:
-    """
-    Link an error to its fix. Catch two patterns:
-    1. Error statement followed by explicit fix indicator ("fixed", "resolved")
-    2. Error statement followed by a decision statement that fixes it ("I'll generate", "I'll add")
-    """
-    msg = messages[idx]
-    if not is_error(msg.get('content', '')):
-        return None
-    error_text = msg.get('content', '').strip()
-    
-    window = min(idx + 8, len(messages))
-    for j in range(idx + 1, window):
-        follow_up = messages[j]
-        follow_text = follow_up.get('content', '').strip()
-        # Check for explicit fix indicators
-        if is_fix_indicator(follow_text):
-            return {
-                "type": "error_fix",
-                "error": error_text,
-                "fix": follow_text,
-                "error_timestamp": msg.get('timestamp', ''),
-                "fix_timestamp": follow_up.get('timestamp', ''),
-            }
-        # Check for fix decision: "I'll <action>", "Let's <action>", "We need to <action>"
-        if re.match(r"^(I'll|I will|Let's|We (will|should|need to))\s+\w+", follow_text, re.IGNORECASE):
-            return {
-                "type": "error_fix",
-                "error": error_text,
-                "fix": follow_text,
-                "error_timestamp": msg.get('timestamp', ''),
-                "fix_timestamp": follow_up.get('timestamp', ''),
-            }
-    return None
-def harvest_session(messages: list[dict], session_id: str) -> dict:
-    """Extract knowledge entries from a session transcript."""
-    entries = []
-    n = len(messages)
-
-    for i in range(n):
-        # QA pairs
-        qa = extract_qa_pair(messages, i)
-        if qa:
-            qa['session_id'] = session_id
-            entries.append(qa)
-
-        # Decisions
-        dec = extract_decision(messages, i)
-        if dec:
-            dec['session_id'] = session_id
-            entries.append(dec)
-
-        # Patterns
-        pat = extract_pattern(messages, i)
-        if pat:
-            pat['session_id'] = session_id
-            entries.append(pat)
-
-        # Preferences
-        pref = extract_preference(messages, i)
-        if pref:
-            pref['session_id'] = session_id
-            entries.append(pref)
-
-        # Error/fix pairs (spanning multiple messages)
-        ef = extract_error_fix(messages, i)
-        if ef:
-            ef['session_id'] = session_id
-            entries.append(ef)
-
-    return {
-        "session_id": session_id,
-        "message_count": n,
-        "entries": entries,
-        "counts": {
-            "qa_pair": sum(1 for e in entries if e['type'] == 'qa_pair'),
-            "decision": sum(1 for e in entries if e['type'] == 'decision'),
-            "pattern": sum(1 for e in entries if e['type'] == 'pattern'),
-            "preference": sum(1 for e in entries if e['type'] == 'preference'),
-            "error_fix": sum(1 for e in entries if e['type'] == 'error_fix'),
-        }
-    }
-
-
-def write_json_output(results: list[dict], output_path: Path):
-    """Write aggregated results to JSON."""
-    all_entries = []
-    summary = {"sessions": 0}
-    for r in results:
-        summary['sessions'] += 1
-        all_entries.extend(r['entries'])
-    
-    output = {
-        "harvester": "transcript_harvester",
-        "generated_at": datetime.now(timezone.utc).isoformat(),
-        "summary": summary,
-        "total_entries": len(all_entries),
-        "entries": all_entries,
-    }
-    output_path.write_text(json.dumps(output, indent=2, ensure_ascii=False))
-    return output
-
-
-def write_report(results: list[dict], report_path: Path):
-    """Write a human-readable markdown report."""
-    lines = []
-    lines.append("# Transcript Harvester Report")
-    lines.append(f"Generated: {datetime.now(timezone.utc).isoformat()}")
-    lines.append(f"Sessions processed: {len(results)}")
-    
-    totals = {cat: 0 for cat in ['qa_pair', 'decision', 'pattern', 'preference', 'error_fix']}
-    for r in results:
-        for cat, cnt in r['counts'].items():
-            totals[cat] += cnt  # BUG: should be += cnt
-    
-    lines.append("\n## Extracted Knowledge by Category\n")
-    for cat, cnt in totals.items():
-        lines.append(f"- **{cat}**: {cnt}")
-    
-    lines.append("\n## Sample Entries\n")
-    for r in results:
-        for entry in r['entries'][:3]:
-            lines.append(f"\n### {entry['type'].upper()} ({r['session_id']})\n")
-            if entry['type'] == 'qa_pair':
-                lines.append(f"**Q:** {entry['question']}\n")
-                lines.append(f"**A:** {entry['answer']}\n")
-            elif entry['type'] == 'decision':
-                lines.append(f"**Decision:** {entry['decision']}\n")
-                lines.append(f"By: {entry['by']}\n")
-            elif entry['type'] == 'pattern':
-                lines.append(f"**Pattern:** {entry['pattern']}\n")
-            elif entry['type'] == 'preference':
-                lines.append(f"**Preference:** {entry['preference']}\n")
-            elif entry['type'] == 'error_fix':
-                lines.append(f"**Error:** {entry['error']}\n")
-                lines.append(f"**Fixed by:** {entry['fix']}\n")
-    
-    report_path.write_text("\n".join(lines))
-
-
-def find_recent_sessions(sessions_dir: Path, limit: int = 50) -> list[Path]:
-    """Find up to `limit` most recent .jsonl session files."""
-    sessions = sorted(sessions_dir.glob("*.jsonl"), reverse=True)
-    return sessions[:limit] if limit > 0 else sessions
-
-
-def main():
-    parser = argparse.ArgumentParser(description="Harvest knowledge from session transcripts")
-    parser.add_argument('--session', help='Single session JSONL file')
-    parser.add_argument('--batch', action='store_true', help='Batch mode')
-    parser.add_argument('--sessions-dir', default=str(Path.home() / '.hermes' / 'sessions'),
-                        help='Directory of session files')
-    parser.add_argument('--output', default='knowledge/transcripts',
-                        help='Output directory (default: knowledge/transcripts)')
-    parser.add_argument('--limit', type=int, default=50,
-                        help='Max sessions to process in batch (default: 50)')
-    
-    args = parser.parse_args()
-    output_dir = Path(args.output)
-    output_dir.mkdir(parents=True, exist_ok=True)
-    
-    results = []
-    
-    if args.session:
-        messages = read_session(args.session)
-        session_id = Path(args.session).stem
-        results.append(harvest_session(messages, session_id))
-    elif args.batch:
-        sessions_dir = Path(args.sessions_dir)
-        sessions = find_recent_sessions(sessions_dir, args.limit)
-        print(f"Processing {len(sessions)} sessions...")
-        for sf in sessions:
-            messages = read_session(str(sf))
-            results.append(harvest_session(messages, sf.stem))
-    else:
-        parser.print_help()
-        sys.exit(1)
-    
-    # Write outputs
-    json_path = output_dir / "transcript_knowledge.json"
-    report_path = output_dir / "transcript_report.md"
-    
-    output = write_json_output(results, json_path)
-    write_report(results, report_path)
-    
-    print(f"\nDone: {output['total_entries']} entries from {len(results)} sessions")
-    print(f"Output: {json_path}")
-    print(f"Report: {report_path}")
-    
-    # Print category totals
-    totals = {}
-    for r in results:
-        for cat, cnt in r['counts'].items():
-            totals[cat] = totals.get(cat, 0) + cnt
-    print("\nCategory counts:")
-    for cat, cnt in sorted(totals.items()):
-        print(f"  {cat}: {cnt}")
-
-
-if __name__ == '__main__':
-    main()