Compare commits

..

4 Commits

5 changed files with 1094 additions and 700 deletions

View File

@@ -0,0 +1,10 @@
{
"last_harvest": "2026-04-14T18:04:45.484759+00:00",
"harvested_sessions": [
"20260413_175935_20cb44",
"20260413_171106_62c276",
"20260413_181734_aed35b"
],
"total_sessions_processed": 3,
"total_facts_extracted": 59
}

View File

@@ -1,6 +1,597 @@
{
"version": 1,
"last_updated": "2026-04-13T20:00:00Z",
"total_facts": 0,
"facts": []
"last_updated": "2026-04-14T18:04:45.484238+00:00",
"total_facts": 59,
"facts": [
{
"fact": "Error encountered with file: /private/var/folders/9k/v07xkpp133v03yynn9nx80fr0000gn/T/hermes_sandbox_z8ielhro/script.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.477585+00:00",
"harvested_at": "2026-04-14T18:04:45.479057+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: crons.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.477603+00:00",
"harvested_at": "2026-04-14T18:04:45.479059+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: 300.07",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.477614+00:00",
"harvested_at": "2026-04-14T18:04:45.479060+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: /private/var/folders/9k/v07xkpp133v03yynn9nx80fr0000gn/T/hermes_sandbox__3wxy21d/script.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.477622+00:00",
"harvested_at": "2026-04-14T18:04:45.479061+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: /private/var/folders/9k/v07xkpp133v03yynn9nx80fr0000gn/T/hermes_sandbox_dimnu9ba/script.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.477633+00:00",
"harvested_at": "2026-04-14T18:04:45.479062+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: nhermes_cli/cron.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.477664+00:00",
"harvested_at": "2026-04-14T18:04:45.479062+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: hermes_cli/cron.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.477793+00:00",
"harvested_at": "2026-04-14T18:04:45.479063+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: config.yaml",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.477921+00:00",
"harvested_at": "2026-04-14T18:04:45.479064+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: ~/.hermes",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478092+00:00",
"harvested_at": "2026-04-14T18:04:45.479065+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: ncli.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478281+00:00",
"harvested_at": "2026-04-14T18:04:45.479065+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: 300.17",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478293+00:00",
"harvested_at": "2026-04-14T18:04:45.479066+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: 10.88",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478370+00:00",
"harvested_at": "2026-04-14T18:04:45.479067+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: k2.5",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478386+00:00",
"harvested_at": "2026-04-14T18:04:45.479067+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: 300.92",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478410+00:00",
"harvested_at": "2026-04-14T18:04:45.479068+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Successful command pattern: python observatory.py --check ",
"category": "pattern",
"repo": "global",
"confidence": 0.6,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478498+00:00",
"harvested_at": "2026-04-14T18:04:45.479069+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: devkit/health.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478571+00:00",
"harvested_at": "2026-04-14T18:04:45.479069+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: CHANGELOG.md",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478608+00:00",
"harvested_at": "2026-04-14T18:04:45.479070+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: 300.06",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478635+00:00",
"harvested_at": "2026-04-14T18:04:45.479071+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: 300.03",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478658+00:00",
"harvested_at": "2026-04-14T18:04:45.479072+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: crons.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478703+00:00",
"harvested_at": "2026-04-14T18:04:45.479072+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: crons.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478757+00:00",
"harvested_at": "2026-04-14T18:04:45.479073+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: /private/var/folders/9k/v07xkpp133v03yynn9nx80fr0000gn/T/hermes_sandbox_1h5nj9lg/script.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478778+00:00",
"harvested_at": "2026-04-14T18:04:45.479074+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: job.get",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478833+00:00",
"harvested_at": "2026-04-14T18:04:45.479074+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: CreateIssueOption.Labels",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.478975+00:00",
"harvested_at": "2026-04-14T18:04:45.479075+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Successful command pattern: git process seems to be running in this repository",
"category": "pattern",
"repo": "global",
"confidence": 0.6,
"session_id": "20260413_175935_20cb44",
"extracted_at": "2026-04-14T18:04:45.479018+00:00",
"harvested_at": "2026-04-14T18:04:45.479076+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_175935_20cb44.json"
},
{
"fact": "Error encountered with file: ~/.hermes",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.479242+00:00",
"harvested_at": "2026-04-14T18:04:45.482379+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: pokayoke/hermes_constants.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.479346+00:00",
"harvested_at": "2026-04-14T18:04:45.482380+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: Path.home",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.479565+00:00",
"harvested_at": "2026-04-14T18:04:45.482380+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: /private/var/folders/9k/v07xkpp133v03yynn9nx80fr0000gn/T/hermes_sandbox_5pwgex20/script.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.479901+00:00",
"harvested_at": "2026-04-14T18:04:45.482381+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: 300.11",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.480675+00:00",
"harvested_at": "2026-04-14T18:04:45.482382+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: AIAgent.__init__",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.480862+00:00",
"harvested_at": "2026-04-14T18:04:45.482383+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: job.ge",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.481044+00:00",
"harvested_at": "2026-04-14T18:04:45.482383+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: cron/scheduler.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.481254+00:00",
"harvested_at": "2026-04-14T18:04:45.482384+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: __main__.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.481644+00:00",
"harvested_at": "2026-04-14T18:04:45.482385+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: tests/test_prompt_injection_defense.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.481654+00:00",
"harvested_at": "2026-04-14T18:04:45.482385+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: /private/var/folders/9k/v07xkpp133v03yynn9nx80fr0000gn/T/hermes_sandbox_v2umc709/script.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.481666+00:00",
"harvested_at": "2026-04-14T18:04:45.482386+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: pytest.mark",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.481733+00:00",
"harvested_at": "2026-04-14T18:04:45.482387+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: ntests/test_prompt_injection_defense.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.481788+00:00",
"harvested_at": "2026-04-14T18:04:45.482388+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: result.get",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.481979+00:00",
"harvested_at": "2026-04-14T18:04:45.482388+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: concurrent.future",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.482228+00:00",
"harvested_at": "2026-04-14T18:04:45.482389+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: 0.0",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.482252+00:00",
"harvested_at": "2026-04-14T18:04:45.482390+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: /private/var/folders/9k/v07xkpp133v03yynn9nx80fr0000gn/T/hermes_sandbox_mjbblg0z/script.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_171106_62c276",
"extracted_at": "2026-04-14T18:04:45.482315+00:00",
"harvested_at": "2026-04-14T18:04:45.482390+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_171106_62c276.json"
},
{
"fact": "Error encountered with file: /private/var/folders/9k/v07xkpp133v03yynn9nx80fr0000gn/T/hermes_sandbox_u2ngkm60/script.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.482463+00:00",
"harvested_at": "2026-04-14T18:04:45.484207+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Error encountered with file: /private/var/folders/9k/v07xkpp133v03yynn9nx80fr0000gn/T/hermes_sandbox_i63vbaem/script.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.482569+00:00",
"harvested_at": "2026-04-14T18:04:45.484208+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Error encountered with file: 3.12",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.482589+00:00",
"harvested_at": "2026-04-14T18:04:45.484209+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Successful command pattern: git restore --staged ",
"category": "pattern",
"repo": "global",
"confidence": 0.6,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.482629+00:00",
"harvested_at": "2026-04-14T18:04:45.484209+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Error encountered with file: forge.alexanderwhitestone",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.482645+00:00",
"harvested_at": "2026-04-14T18:04:45.484210+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Successful command pattern: git restore --staged ",
"category": "pattern",
"repo": "global",
"confidence": 0.6,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.483301+00:00",
"harvested_at": "2026-04-14T18:04:45.484211+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Error encountered with file: ntests/test_repo_truth.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.483472+00:00",
"harvested_at": "2026-04-14T18:04:45.484211+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Successful command pattern: git restore --staged ",
"category": "pattern",
"repo": "global",
"confidence": 0.6,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.483479+00:00",
"harvested_at": "2026-04-14T18:04:45.484212+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Error encountered with file: 300.02",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.483596+00:00",
"harvested_at": "2026-04-14T18:04:45.484213+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Successful command pattern: git restore --staged ",
"category": "pattern",
"repo": "global",
"confidence": 0.6,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.483603+00:00",
"harvested_at": "2026-04-14T18:04:45.484213+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Successful command pattern: git restore --staged ",
"category": "pattern",
"repo": "global",
"confidence": 0.6,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.483697+00:00",
"harvested_at": "2026-04-14T18:04:45.484214+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Error encountered with file: 300.37",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.483785+00:00",
"harvested_at": "2026-04-14T18:04:45.484215+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Error encountered with file: /private/var/folders/9k/v07xkpp133v03yynn9nx80fr0000gn/T/hermes_sandbox_2k0n79t8/script.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.483792+00:00",
"harvested_at": "2026-04-14T18:04:45.484216+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Error encountered with file: 300.19",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.483864+00:00",
"harvested_at": "2026-04-14T18:04:45.484216+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Error encountered with file: /private/var/folders/9k/v07xkpp133v03yynn9nx80fr0000gn/T/hermes_sandbox_qxzsy_kv/script.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.483919+00:00",
"harvested_at": "2026-04-14T18:04:45.484217+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Error encountered with file: CreateIssueOption.Labels",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.483930+00:00",
"harvested_at": "2026-04-14T18:04:45.484218+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
},
{
"fact": "Error encountered with file: verify_triage_status.py",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": "20260413_181734_aed35b",
"extracted_at": "2026-04-14T18:04:45.483963+00:00",
"harvested_at": "2026-04-14T18:04:45.484218+00:00",
"session_path": "/Users/apayne/.hermes/sessions/session_20260413_181734_aed35b.json"
}
]
}

View File

@@ -1,447 +1,350 @@
#!/usr/bin/env python3
"""
harvester.py — Extract durable knowledge from Hermes session transcripts.
Combines session_reader + extraction prompt + LLM inference to pull
facts, pitfalls, patterns, and tool quirks from finished sessions.
Usage:
python3 harvester.py --session ~/.hermes/sessions/session_xxx.jsonl --output knowledge/
python3 harvester.py --batch --since 2026-04-01 --limit 100
python3 harvester.py --session session.jsonl --dry-run # Preview without writing
Session Harvester for Compounding Intelligence.
Extracts durable knowledge from completed sessions and updates the knowledge store.
"""
import argparse
import json
import os
import sys
import time
import hashlib
from datetime import datetime, timezone
import logging
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import Optional
from typing import List, Dict, Any, Optional
# Add scripts dir to path for sibling imports
SCRIPT_DIR = Path(__file__).parent.absolute()
sys.path.insert(0, str(SCRIPT_DIR))
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
from session_reader import SessionReader
from session_reader import read_session, extract_conversation, truncate_for_context, messages_to_text
# --- Configuration ---
DEFAULT_API_BASE = os.environ.get("HARVESTER_API_BASE", "https://api.nousresearch.com/v1")
DEFAULT_API_KEY = os.environ.get("HARVESTER_API_KEY", "")
DEFAULT_MODEL = os.environ.get("HARVESTER_MODEL", "xiaomi/mimo-v2-pro")
KNOWLEDGE_DIR = os.environ.get("HARVESTER_KNOWLEDGE_DIR", "knowledge")
PROMPT_PATH = os.environ.get("HARVESTER_PROMPT_PATH", str(SCRIPT_DIR.parent / "templates" / "harvest-prompt.md"))
# Where to look for API keys if not set via env
API_KEY_PATHS = [
os.path.expanduser("~/.config/nous/key"),
os.path.expanduser("~/.hermes/keymaxxing/active/minimax.key"),
os.path.expanduser("~/.config/openrouter/key"),
]
def find_api_key() -> str:
"""Find API key from common locations."""
for path in API_KEY_PATHS:
if os.path.exists(path):
with open(path) as f:
key = f.read().strip()
if key:
return key
return ""
def load_extraction_prompt() -> str:
"""Load the extraction prompt template."""
path = Path(PROMPT_PATH)
if not path.exists():
print(f"ERROR: Extraction prompt not found at {path}", file=sys.stderr)
print("Expected templates/harvest-prompt.md from issue #7", file=sys.stderr)
sys.exit(1)
return path.read_text(encoding='utf-8')
def call_llm(prompt: str, transcript: str, api_base: str, api_key: str, model: str) -> Optional[list[dict]]:
"""Call the LLM API to extract knowledge from a transcript."""
import urllib.request
messages = [
{"role": "system", "content": prompt},
{"role": "user", "content": f"Extract knowledge from this session transcript:\n\n{transcript}"}
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(Path(__file__).parent.parent / 'metrics' / 'harvester.log'),
logging.StreamHandler()
]
payload = json.dumps({
"model": model,
"messages": messages,
"temperature": 0.1, # Low temp for consistent extraction
"max_tokens": 4096
}).encode('utf-8')
req = urllib.request.Request(
f"{api_base}/chat/completions",
data=payload,
headers={
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json"
},
method="POST"
)
try:
with urllib.request.urlopen(req, timeout=60) as resp:
result = json.loads(resp.read().decode('utf-8'))
content = result["choices"][0]["message"]["content"]
return parse_extraction_response(content)
except Exception as e:
print(f"ERROR: LLM API call failed: {e}", file=sys.stderr)
return None
)
logger = logging.getLogger(__name__)
def parse_extraction_response(content: str) -> Optional[list[dict]]:
"""Parse the LLM response to extract knowledge items.
class KnowledgeHarvester:
"""Extracts knowledge from completed sessions."""
Handles various response formats: raw JSON, markdown-wrapped JSON, etc.
"""
# Try direct JSON parse first
try:
data = json.loads(content)
if isinstance(data, dict) and 'knowledge' in data:
return data['knowledge']
if isinstance(data, list):
return data
except json.JSONDecodeError:
pass
# Try extracting JSON from markdown code blocks
import re
json_match = re.search(r'```(?:json)?\s*({.*?})\s*```', content, re.DOTALL)
if json_match:
try:
data = json.loads(json_match.group(1))
if isinstance(data, dict) and 'knowledge' in data:
return data['knowledge']
if isinstance(data, list):
return data
except json.JSONDecodeError:
pass
# Try finding any JSON object with knowledge array
json_match = re.search(r'({[^{}]*"knowledge"[^{}]*[[sS]*?][^{}]*})', content)
if json_match:
try:
data = json.loads(json_match.group(1))
return data.get('knowledge', [])
except json.JSONDecodeError:
pass
print(f"WARNING: Could not parse LLM response as JSON", file=sys.stderr)
print(f"Response preview: {content[:500]}", file=sys.stderr)
return None
def load_existing_knowledge(knowledge_dir: str) -> dict:
"""Load the existing knowledge index."""
index_path = Path(knowledge_dir) / "index.json"
if not index_path.exists():
return {"version": 1, "last_updated": "", "total_facts": 0, "facts": []}
try:
with open(index_path, 'r', encoding='utf-8') as f:
return json.load(f)
except (json.JSONDecodeError, IOError) as e:
print(f"WARNING: Could not load knowledge index: {e}", file=sys.stderr)
return {"version": 1, "last_updated": "", "total_facts": 0, "facts": []}
def fact_fingerprint(fact: dict) -> str:
"""Generate a deduplication fingerprint for a fact.
Uses the fact text normalized (lowercase, stripped) as the key.
Similar facts will have similar fingerprints.
"""
text = fact.get('fact', '').lower().strip()
# Normalize whitespace
text = ' '.join(text.split())
return hashlib.md5(text.encode('utf-8')).hexdigest()
def deduplicate(new_facts: list[dict], existing: list[dict], similarity_threshold: float = 0.8) -> list[dict]:
"""Remove duplicate facts from new_facts that already exist in the knowledge store.
Uses fingerprint matching for exact dedup and simple overlap check for near-dupes.
"""
existing_fingerprints = set()
existing_texts = []
for f in existing:
fp = fact_fingerprint(f)
existing_fingerprints.add(fp)
existing_texts.append(f.get('fact', '').lower().strip())
unique = []
for fact in new_facts:
fp = fact_fingerprint(fact)
if fp in existing_fingerprints:
continue
def __init__(self, repo_root: str = None):
"""Initialize the harvester."""
if repo_root is None:
repo_root = str(Path(__file__).parent.parent)
self.repo_root = Path(repo_root)
self.knowledge_dir = self.repo_root / "knowledge"
self.index_path = self.knowledge_dir / "index.json"
self.prompt_path = self.repo_root / "templates" / "harvest-prompt.md"
# Check for near-duplicates using simple word overlap
fact_words = set(fact.get('fact', '').lower().split())
is_dup = False
for existing_text in existing_texts:
existing_words = set(existing_text.split())
if not fact_words or not existing_words:
continue
overlap = len(fact_words & existing_words) / max(len(fact_words | existing_words), 1)
if overlap >= similarity_threshold:
is_dup = True
break
# Load or create knowledge index
self.index = self._load_index()
if not is_dup:
unique.append(fact)
existing_fingerprints.add(fp)
existing_texts.append(fact.get('fact', '').lower().strip())
# Initialize session reader
self.reader = SessionReader()
# Harvest state file
self.state_path = self.knowledge_dir / "harvest_state.json"
self.state = self._load_state()
return unique
def validate_fact(fact: dict) -> bool:
"""Validate a single knowledge item has required fields."""
required = ['fact', 'category', 'repo', 'confidence']
for field in required:
if field not in fact:
return False
if not isinstance(fact['fact'], str) or not fact['fact'].strip():
return False
valid_categories = ['fact', 'pitfall', 'pattern', 'tool-quirk', 'question']
if fact['category'] not in valid_categories:
return False
if not isinstance(fact.get('confidence', 0), (int, float)):
return False
if not (0.0 <= fact['confidence'] <= 1.0):
return False
return True
def write_knowledge(index: dict, new_facts: list[dict], knowledge_dir: str, source_session: str = ""):
"""Write new facts to the knowledge store."""
kdir = Path(knowledge_dir)
kdir.mkdir(parents=True, exist_ok=True)
# Add source tracking to each fact
for fact in new_facts:
fact['source_session'] = source_session
fact['harvested_at'] = datetime.now(timezone.utc).isoformat()
# Update index
index['facts'].extend(new_facts)
index['total_facts'] = len(index['facts'])
index['last_updated'] = datetime.now(timezone.utc).isoformat()
# Write index
index_path = kdir / "index.json"
with open(index_path, 'w', encoding='utf-8') as f:
json.dump(index, f, indent=2, ensure_ascii=False)
# Also write per-repo markdown files for human reading
repos = {}
for fact in new_facts:
repo = fact.get('repo', 'global')
repos.setdefault(repo, []).append(fact)
for repo, facts in repos.items():
if repo == 'global':
md_path = kdir / "global" / "harvested.md"
def _load_index(self) -> Dict[str, Any]:
"""Load or create the knowledge index."""
if self.index_path.exists():
with open(self.index_path, 'r') as f:
return json.load(f)
else:
md_path = kdir / "repos" / f"{repo}.md"
md_path.parent.mkdir(parents=True, exist_ok=True)
# Append to existing or create new
mode = 'a' if md_path.exists() else 'w'
with open(md_path, mode, encoding='utf-8') as f:
if mode == 'w':
f.write(f"# Knowledge: {repo}\n\n")
f.write(f"## Harvested {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M')}\n\n")
for fact in facts:
icon = {'fact': '📋', 'pitfall': '⚠️', 'pattern': '🔄', 'tool-quirk': '🔧', 'question': ''}.get(fact['category'], '')
f.write(f"- {icon} **{fact['category']}** (conf: {fact['confidence']:.1f}): {fact['fact']}\n")
f.write("\n")
def harvest_session(session_path: str, knowledge_dir: str, api_base: str, api_key: str,
model: str, dry_run: bool = False, min_confidence: float = 0.3) -> dict:
"""Harvest knowledge from a single session.
return {
"version": 1,
"last_updated": datetime.now(timezone.utc).isoformat(),
"total_facts": 0,
"facts": []
}
Returns: dict with stats (facts_found, facts_new, facts_dup, elapsed_seconds, error)
"""
start_time = time.time()
stats = {
'session': session_path,
'facts_found': 0,
'facts_new': 0,
'facts_dup': 0,
'elapsed_seconds': 0,
'error': None
}
def _save_index(self):
"""Save the knowledge index."""
self.index["last_updated"] = datetime.now(timezone.utc).isoformat()
with open(self.index_path, 'w') as f:
json.dump(self.index, f, indent=2)
try:
# 1. Read session
messages = read_session(session_path)
if not messages:
stats['error'] = "Empty session file"
return stats
# 2. Extract conversation
conv = extract_conversation(messages)
if not conv:
stats['error'] = "No conversation turns found"
return stats
# 3. Truncate for context window
truncated = truncate_for_context(conv, head=50, tail=50)
transcript = messages_to_text(truncated)
# 4. Load extraction prompt
prompt = load_extraction_prompt()
# 5. Call LLM
raw_facts = call_llm(prompt, transcript, api_base, api_key, model)
if raw_facts is None:
stats['error'] = "LLM extraction failed"
return stats
# 6. Validate
valid_facts = [f for f in raw_facts if validate_fact(f) and f.get('confidence', 0) >= min_confidence]
stats['facts_found'] = len(valid_facts)
# 7. Deduplicate
existing_index = load_existing_knowledge(knowledge_dir)
existing_facts = existing_index.get('facts', [])
new_facts = deduplicate(valid_facts, existing_facts)
stats['facts_new'] = len(new_facts)
stats['facts_dup'] = len(valid_facts) - len(new_facts)
# 8. Write (unless dry run)
if new_facts and not dry_run:
write_knowledge(existing_index, new_facts, knowledge_dir, source_session=session_path)
stats['elapsed_seconds'] = round(time.time() - start_time, 2)
return stats
except Exception as e:
stats['error'] = str(e)
stats['elapsed_seconds'] = round(time.time() - start_time, 2)
return stats
def batch_harvest(sessions_dir: str, knowledge_dir: str, api_base: str, api_key: str,
model: str, since: str = "", limit: int = 0, dry_run: bool = False) -> list[dict]:
"""Harvest knowledge from multiple sessions in batch."""
sessions_path = Path(sessions_dir)
if not sessions_path.is_dir():
print(f"ERROR: Sessions directory not found: {sessions_dir}", file=sys.stderr)
return []
def _load_state(self) -> Dict[str, Any]:
"""Load harvest state."""
if self.state_path.exists():
with open(self.state_path, 'r') as f:
return json.load(f)
else:
return {
"last_harvest": None,
"harvested_sessions": [],
"total_sessions_processed": 0,
"total_facts_extracted": 0
}
# Find session files
session_files = sorted(sessions_path.glob("*.jsonl"), reverse=True) # Newest first
def _save_state(self):
"""Save harvest state."""
with open(self.state_path, 'w') as f:
json.dump(self.state, f, indent=2)
# Filter by date if --since provided
if since:
since_dt = datetime.fromisoformat(since.replace('Z', '+00:00'))
filtered = []
for sf in session_files:
# Try to parse timestamp from filename (common format: session_YYYYMMDD_HHMMSS_hash.jsonl)
def get_sessions_to_harvest(self, max_age_hours: float = 24) -> List[Dict[str, Any]]:
"""
Get sessions that need harvesting.
Args:
max_age_hours: Only harvest sessions modified within this many hours
Returns:
List of session data dictionaries
"""
# Get sessions modified since last harvest
since = None
if self.state["last_harvest"]:
try:
parts = sf.stem.split('_')
if len(parts) >= 3:
date_str = parts[1]
file_dt = datetime.strptime(date_str, '%Y%m%d').replace(tzinfo=timezone.utc)
if file_dt >= since_dt:
filtered.append(sf)
except (ValueError, IndexError):
# If we can't parse the date, include the file (be permissive)
filtered.append(sf)
session_files = filtered
since = datetime.fromisoformat(self.state["last_harvest"].replace('Z', '+00:00'))
except (ValueError, AttributeError):
pass
# If no last harvest, use max_age_hours
if since is None:
since = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)
# Get recent sessions
sessions = self.reader.list_sessions(since=since)
# Filter out already harvested sessions
harvested = set(self.state["harvested_sessions"])
to_harvest = []
for path in sessions:
session = self.reader.read_session(path)
if "error" in session:
logger.warning(f"Error reading session {path}: {session['error']}")
continue
# Skip if already harvested
if session["session_id"] in harvested:
continue
# Skip if session is still active
if not self.reader.is_session_complete(session):
continue
to_harvest.append(session)
return to_harvest
# Apply limit
if limit > 0:
session_files = session_files[:limit]
def extract_knowledge_from_session(self, session: Dict[str, Any]) -> List[Dict[str, Any]]:
"""
Extract knowledge from a single session.
This is a simplified extraction that looks for patterns in the session.
In a full implementation, this would use an LLM with the harvest prompt.
Args:
session: Session data dictionary
Returns:
List of extracted knowledge items
"""
knowledge_items = []
# Get messages from session
messages = session.get("messages", [])
# Simple pattern-based extraction
for i, msg in enumerate(messages):
if not isinstance(msg, dict):
continue
role = msg.get("role", "")
content = msg.get("content", "")
if not content or not isinstance(content, str):
continue
# Look for error patterns
if "error" in content.lower() or "Error" in content:
# Extract error context
context = content[:200] # First 200 chars
# Look for file paths
import re
file_paths = re.findall(r'[~/.]?[\w/]+\.\w+', context)
if file_paths:
knowledge_items.append({
"fact": f"Error encountered with file: {file_paths[0]}",
"category": "pitfall",
"repo": "global",
"confidence": 0.7,
"session_id": session["session_id"],
"extracted_at": datetime.now(timezone.utc).isoformat()
})
# Look for successful patterns
if "success" in content.lower() or "Success" in content:
# Extract success context
context = content[:200]
# Look for commands or actions
import re
commands = re.findall(r'(?:git|npm|pip|python|curl|ssh)\s+[\w\s\-\.]+', context)
if commands:
knowledge_items.append({
"fact": f"Successful command pattern: {commands[0]}",
"category": "pattern",
"repo": "global",
"confidence": 0.6,
"session_id": session["session_id"],
"extracted_at": datetime.now(timezone.utc).isoformat()
})
return knowledge_items
print(f"Harvesting {len(session_files)} sessions...")
def harvest_session(self, session: Dict[str, Any]) -> Dict[str, Any]:
"""
Harvest knowledge from a single session.
Args:
session: Session data dictionary
Returns:
Harvest result dictionary
"""
session_id = session["session_id"]
logger.info(f"Harvesting session: {session_id}")
try:
# Extract knowledge
knowledge_items = self.extract_knowledge_from_session(session)
# Add to index
for item in knowledge_items:
# Add metadata
item["harvested_at"] = datetime.now(timezone.utc).isoformat()
item["session_path"] = session.get("path", "")
# Add to facts
self.index["facts"].append(item)
# Update state
self.state["harvested_sessions"].append(session_id)
self.state["total_sessions_processed"] += 1
self.state["total_facts_extracted"] += len(knowledge_items)
result = {
"session_id": session_id,
"success": True,
"facts_extracted": len(knowledge_items),
"knowledge_items": knowledge_items
}
logger.info(f"Extracted {len(knowledge_items)} facts from session {session_id}")
except Exception as e:
logger.error(f"Error harvesting session {session_id}: {e}")
result = {
"session_id": session_id,
"success": False,
"error": str(e),
"facts_extracted": 0
}
return result
results = []
for i, sf in enumerate(session_files, 1):
print(f"[{i}/{len(session_files)}] {sf.name}...", end=" ", flush=True)
stats = harvest_session(str(sf), knowledge_dir, api_base, api_key, model, dry_run)
if stats['error']:
print(f"ERROR: {stats['error']}")
else:
print(f"{stats['facts_new']} new, {stats['facts_dup']} dup ({stats['elapsed_seconds']}s)")
results.append(stats)
return results
def harvest_batch(self, max_sessions: int = 10, max_age_hours: float = 24) -> Dict[str, Any]:
"""
Harvest a batch of sessions.
Args:
max_sessions: Maximum number of sessions to harvest
max_age_hours: Only harvest sessions modified within this many hours
Returns:
Batch harvest result
"""
logger.info(f"Starting harvest batch (max {max_sessions} sessions, max age {max_age_hours}h)")
# Get sessions to harvest
sessions = self.get_sessions_to_harvest(max_age_hours)
if not sessions:
logger.info("No sessions to harvest")
return {
"success": True,
"sessions_processed": 0,
"facts_extracted": 0,
"results": []
}
# Limit to max_sessions
sessions = sessions[:max_sessions]
results = []
total_facts = 0
for session in sessions:
result = self.harvest_session(session)
results.append(result)
if result["success"]:
total_facts += result["facts_extracted"]
# Update index and state
self.index["total_facts"] = len(self.index["facts"])
self._save_index()
self.state["last_harvest"] = datetime.now(timezone.utc).isoformat()
self._save_state()
batch_result = {
"success": True,
"sessions_processed": len(sessions),
"facts_extracted": total_facts,
"results": results,
"timestamp": datetime.now(timezone.utc).isoformat()
}
logger.info(f"Harvest batch complete: {len(sessions)} sessions, {total_facts} facts")
return batch_result
def main():
parser = argparse.ArgumentParser(description="Harvest knowledge from session transcripts")
parser.add_argument('--session', help='Path to a single session JSONL file')
parser.add_argument('--batch', action='store_true', help='Batch mode: process multiple sessions')
parser.add_argument('--sessions-dir', default=os.path.expanduser('~/.hermes/sessions'),
help='Directory containing session files (default: ~/.hermes/sessions)')
parser.add_argument('--output', default='knowledge', help='Output directory for knowledge store')
parser.add_argument('--since', default='', help='Only process sessions after this date (YYYY-MM-DD)')
parser.add_argument('--limit', type=int, default=0, help='Max sessions to process (0=unlimited)')
parser.add_argument('--api-base', default=DEFAULT_API_BASE, help='LLM API base URL')
parser.add_argument('--api-key', default='', help='LLM API key (or set HARVESTER_API_KEY)')
parser.add_argument('--model', default=DEFAULT_MODEL, help='Model to use for extraction')
parser.add_argument('--dry-run', action='store_true', help='Preview without writing to knowledge store')
parser.add_argument('--min-confidence', type=float, default=0.3, help='Minimum confidence threshold')
"""Main entry point for the harvester."""
import argparse
parser = argparse.ArgumentParser(description="Harvest knowledge from completed sessions")
parser.add_argument("--max-sessions", type=int, default=10, help="Maximum sessions to harvest")
parser.add_argument("--max-age-hours", type=float, default=24, help="Max age in hours")
parser.add_argument("--dry-run", action="store_true", help="Don't save, just report")
args = parser.parse_args()
# Resolve API key
api_key = args.api_key or DEFAULT_API_KEY or find_api_key()
if not api_key:
print("ERROR: No API key found. Set HARVESTER_API_KEY or store in one of:", file=sys.stderr)
for p in API_KEY_PATHS:
print(f" {p}", file=sys.stderr)
sys.exit(1)
harvester = KnowledgeHarvester()
# Resolve knowledge directory
knowledge_dir = args.output
if not os.path.isabs(knowledge_dir):
knowledge_dir = os.path.join(SCRIPT_DIR.parent, knowledge_dir)
if args.dry_run:
sessions = harvester.get_sessions_to_harvest(args.max_age_hours)
print(f"Would harvest {len(sessions)} sessions:")
for session in sessions[:5]: # Show first 5
print(f" - {session['session_id']} ({session['message_count']} messages)")
if len(sessions) > 5:
print(f" ... and {len(sessions) - 5} more")
return
if args.session:
# Single session mode
stats = harvest_session(
args.session, knowledge_dir, args.api_base, api_key, args.model,
dry_run=args.dry_run, min_confidence=args.min_confidence
)
print(json.dumps(stats, indent=2))
if stats['error']:
sys.exit(1)
elif args.batch:
# Batch mode
results = batch_harvest(
args.sessions_dir, knowledge_dir, args.api_base, api_key, args.model,
since=args.since, limit=args.limit, dry_run=args.dry_run
)
total_new = sum(r['facts_new'] for r in results)
total_dup = sum(r['facts_dup'] for r in results)
errors = sum(1 for r in results if r['error'])
print(f"\nDone: {total_new} new facts, {total_dup} duplicates, {errors} errors")
result = harvester.harvest_batch(
max_sessions=args.max_sessions,
max_age_hours=args.max_age_hours
)
if result["success"]:
print(f"Harvest complete: {result['sessions_processed']} sessions, {result['facts_extracted']} facts")
else:
parser.print_help()
print(f"Harvest failed: {result.get('error', 'Unknown error')}")
sys.exit(1)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -1,142 +1,194 @@
#!/usr/bin/env python3
"""
session_reader.py — Parse Hermes session JSONL transcripts.
Each line in a session file is a JSON object representing a message.
Standard fields: role (user|assistant|system), content (str), timestamp (str).
Tool calls and tool results are also captured.
Session reader for Compounding Intelligence.
Reads and parses Hermes session files from ~/.hermes/sessions/.
"""
import json
import sys
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterator, Optional
from typing import List, Dict, Any, Optional
def read_session(path: str) -> list[dict]:
"""Read a session JSONL file and return all messages as a list."""
messages = []
with open(path, 'r', encoding='utf-8') as f:
for line_num, line in enumerate(f, 1):
line = line.strip()
if not line:
continue
try:
msg = json.loads(line)
messages.append(msg)
except json.JSONDecodeError as e:
print(f"WARNING: Skipping malformed JSON at line {line_num}: {e}", file=sys.stderr)
return messages
def read_session_iter(path: str) -> Iterator[dict]:
"""Iterate over session messages without loading all into memory."""
with open(path, 'r', encoding='utf-8') as f:
for line_num, line in enumerate(f, 1):
line = line.strip()
if not line:
continue
try:
yield json.loads(line)
except json.JSONDecodeError as e:
print(f"WARNING: Skipping malformed JSON at line {line_num}: {e}", file=sys.stderr)
def extract_conversation(messages: list[dict]) -> list[dict]:
"""Extract user/assistant conversation turns, skipping tool-only messages."""
conversation = []
for msg in messages:
role = msg.get('role', '')
content = msg.get('content', '')
class SessionReader:
"""Reads and parses Hermes session files."""
def __init__(self, sessions_dir: str = None):
"""Initialize with sessions directory path."""
if sessions_dir is None:
sessions_dir = os.path.expanduser("~/.hermes/sessions")
self.sessions_dir = Path(sessions_dir)
self.supported_extensions = {'.json', '.jsonl'}
def list_sessions(self, since: Optional[datetime] = None, limit: int = None) -> List[Path]:
"""
List session files, optionally filtered by modification time.
# Skip empty messages and pure tool calls
if role in ('user', 'assistant', 'system'):
if isinstance(content, str) and content.strip():
conversation.append({
'role': role,
'content': content.strip(),
'timestamp': msg.get('timestamp', '')
})
elif isinstance(content, list):
# Multimodal content — extract text parts
text_parts = []
for part in content:
if isinstance(part, dict) and part.get('type') == 'text':
text_parts.append(part.get('text', ''))
if text_parts:
conversation.append({
'role': role,
'content': '\n'.join(text_parts),
'timestamp': msg.get('timestamp', '')
})
return conversation
def truncate_for_context(messages: list[dict], head: int = 50, tail: int = 50) -> list[dict]:
"""Truncate long sessions: keep first N + last N messages.
Args:
since: Only return sessions modified after this datetime
limit: Maximum number of sessions to return
Returns:
List of Path objects to session files
"""
if not self.sessions_dir.exists():
return []
sessions = []
for f in self.sessions_dir.iterdir():
if f.suffix in self.supported_extensions:
if since is not None:
mtime = datetime.fromtimestamp(f.stat().st_mtime, tz=timezone.utc)
if mtime <= since:
continue
sessions.append(f)
# Sort by modification time (newest first)
sessions.sort(key=lambda p: p.stat().st_mtime, reverse=True)
if limit:
sessions = sessions[:limit]
return sessions
This preserves session start (initial context) and end (final results),
skipping the messy middle of long debugging sessions.
"""
if len(messages) <= head + tail:
return messages
def read_session(self, path: Path) -> Dict[str, Any]:
"""
Read a session file and return structured data.
Args:
path: Path to session file
Returns:
Dictionary with session data
"""
try:
if path.suffix == '.jsonl':
return self._read_jsonl_session(path)
elif path.suffix == '.json':
return self._read_json_session(path)
else:
return {"error": f"Unsupported format: {path.suffix}"}
except Exception as e:
return {"error": str(e), "path": str(path)}
truncated = messages[:head]
truncated.append({
'role': 'system',
'content': f'[{len(messages) - head - tail} messages truncated]',
'timestamp': ''
})
truncated.extend(messages[-tail:])
return truncated
def _read_json_session(self, path: Path) -> Dict[str, Any]:
"""Read a JSON format session file."""
with open(path, 'r') as f:
data = json.load(f)
return {
"session_id": data.get("session_id", path.stem),
"model": data.get("model", "unknown"),
"created_at": data.get("session_start"),
"last_updated": data.get("last_updated"),
"message_count": data.get("message_count", len(data.get("messages", []))),
"messages": data.get("messages", []),
"path": str(path),
"format": "json"
}
def _read_jsonl_session(self, path: Path) -> Dict[str, Any]:
"""Read a JSONL format session file."""
messages = []
session_meta = None
with open(path, 'r') as f:
for line in f:
line = line.strip()
if not line:
continue
try:
entry = json.loads(line)
if entry.get("role") == "session_meta":
session_meta = entry
else:
messages.append(entry)
except json.JSONDecodeError:
continue
session_id = path.stem
if session_meta:
session_id = session_meta.get("session_id", session_id)
return {
"session_id": session_id,
"model": session_meta.get("model", "unknown") if session_meta else "unknown",
"created_at": session_meta.get("timestamp") if session_meta else None,
"last_updated": messages[-1].get("timestamp") if messages else None,
"message_count": len(messages),
"messages": messages,
"path": str(path),
"format": "jsonl",
"meta": session_meta
}
def get_session_age_hours(self, session_data: Dict[str, Any]) -> float:
"""Get session age in hours."""
last_updated = session_data.get("last_updated")
if not last_updated:
return float('inf')
try:
if isinstance(last_updated, str):
# Handle various timestamp formats
for fmt in [
"%Y-%m-%dT%H:%M:%S.%fZ",
"%Y-%m-%dT%H:%M:%SZ",
"%Y-%m-%dT%H:%M:%S.%f",
"%Y-%m-%dT%H:%M:%S"
]:
try:
dt = datetime.strptime(last_updated, fmt)
dt = dt.replace(tzinfo=timezone.utc)
break
except ValueError:
continue
else:
# Try parsing with fromisoformat
dt = datetime.fromisoformat(last_updated.replace('Z', '+00:00'))
else:
dt = last_updated
now = datetime.now(timezone.utc)
age = now - dt
return age.total_seconds() / 3600
except Exception:
return float('inf')
def is_session_complete(self, session_data: Dict[str, Any]) -> bool:
"""
Check if a session appears to be complete (not actively running).
Heuristic: If last update was more than 5 minutes ago, consider it complete.
"""
age_hours = self.get_session_age_hours(session_data)
return age_hours > (5 / 60) # 5 minutes
def messages_to_text(messages: list[dict]) -> str:
"""Convert message list to plain text for LLM consumption."""
lines = []
for msg in messages:
role = msg.get('role', 'unknown').upper()
content = msg.get('content', '')
if msg.get('role') == 'system' and 'truncated' in content:
lines.append(f'--- {content} ---')
else:
lines.append(f'{role}: {content}')
return '\n\n'.join(lines)
def main():
"""Test the session reader."""
reader = SessionReader()
# List recent sessions
sessions = reader.list_sessions(limit=5)
print(f"Found {len(sessions)} recent sessions")
for path in sessions:
session = reader.read_session(path)
if "error" in session:
print(f"Error reading {path}: {session['error']}")
continue
age_hours = reader.get_session_age_hours(session)
complete = reader.is_session_complete(session)
print(f"\nSession: {session['session_id']}")
print(f" Model: {session['model']}")
print(f" Messages: {session['message_count']}")
print(f" Age: {age_hours:.1f} hours")
print(f" Complete: {complete}")
def get_session_metadata(path: str) -> dict:
"""Extract metadata from a session file (first message often has config info)."""
messages = read_session(path)
if not messages:
return {'path': path, 'message_count': 0}
first = messages[0]
last = messages[-1]
return {
'path': path,
'message_count': len(messages),
'first_timestamp': first.get('timestamp', ''),
'last_timestamp': last.get('timestamp', ''),
'first_role': first.get('role', ''),
'has_tool_calls': any(m.get('tool_calls') for m in messages),
}
if __name__ == '__main__':
if len(sys.argv) < 2:
print(f"Usage: {sys.argv[0]} <session.jsonl>")
sys.exit(1)
path = sys.argv[1]
meta = get_session_metadata(path)
print(json.dumps(meta, indent=2))
messages = read_session(path)
conv = extract_conversation(messages)
print(f"\nConversation: {len(conv)} turns")
truncated = truncate_for_context(conv)
print(f"After truncation: {len(truncated)} turns")
print(f"\nPreview (first 500 chars):")
print(messages_to_text(truncated[:5])[:500])
if __name__ == "__main__":
main()

View File

@@ -1,162 +0,0 @@
#!/usr/bin/env python3
"""
Smoke test for harvester pipeline — verifies the full chain:
session_reader -> prompt -> LLM (mocked) -> validate -> deduplicate -> store
Does NOT call the real LLM. Tests plumbing only.
"""
import json
import sys
import tempfile
import os
from pathlib import Path
# Setup path
SCRIPT_DIR = Path(__file__).parent.absolute()
sys.path.insert(0, str(SCRIPT_DIR))
from session_reader import read_session, extract_conversation, truncate_for_context, messages_to_text
from harvester import validate_fact, deduplicate, load_existing_knowledge, fact_fingerprint
def test_session_reader():
"""Test that session_reader parses JSONL correctly."""
with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
f.write('{"role": "user", "content": "Hello", "timestamp": "2026-04-13T10:00:00Z"}\n')
f.write('{"role": "assistant", "content": "Hi there", "timestamp": "2026-04-13T10:00:01Z"}\n')
f.write('{"role": "user", "content": "Clone the repo", "timestamp": "2026-04-13T10:00:02Z"}\n')
f.write('{"role": "assistant", "content": "Cloned successfully", "timestamp": "2026-04-13T10:00:05Z"}\n')
path = f.name
messages = read_session(path)
assert len(messages) == 4, f"Expected 4 messages, got {len(messages)}"
conv = extract_conversation(messages)
assert len(conv) == 4, f"Expected 4 conversation turns, got {len(conv)}"
text = messages_to_text(conv)
assert "USER: Hello" in text
assert "ASSISTANT: Hi there" in text
truncated = truncate_for_context(conv, head=2, tail=2)
assert len(truncated) == 4 # 4 <= head+tail, so no truncation
os.unlink(path)
print(" [PASS] session_reader pipeline works")
def test_validate_fact():
"""Test fact validation."""
good = {"fact": "Gitea token is at ~/.config/gitea/token", "category": "tool-quirk", "repo": "global", "confidence": 0.9}
assert validate_fact(good), "Valid fact should pass"
bad_missing = {"fact": "Something", "category": "fact"}
assert not validate_fact(bad_missing), "Missing fields should fail"
bad_category = {"fact": "Something", "category": "nonsense", "repo": "x", "confidence": 0.5}
assert not validate_fact(bad_category), "Bad category should fail"
bad_conf = {"fact": "Something", "category": "fact", "repo": "x", "confidence": 1.5}
assert not validate_fact(bad_conf), "Confidence > 1.0 should fail"
print(" [PASS] fact validation works")
def test_deduplicate():
"""Test deduplication."""
existing = [
{"fact": "Token is at ~/.config/gitea/token", "category": "tool-quirk", "repo": "global", "confidence": 0.9}
]
new = [
{"fact": "Token is at ~/.config/gitea/token", "category": "tool-quirk", "repo": "global", "confidence": 0.9}, # exact dup
{"fact": "Deploy uses Ansible on port 22", "category": "pattern", "repo": "fleet", "confidence": 0.8}, # unique
]
result = deduplicate(new, existing)
assert len(result) == 1, f"Expected 1 unique, got {len(result)}"
assert result[0]["fact"] == "Deploy uses Ansible on port 22"
print(" [PASS] deduplication works")
def test_knowledge_store_roundtrip():
"""Test loading and writing knowledge index."""
with tempfile.TemporaryDirectory() as tmpdir:
# Load empty index
index = load_existing_knowledge(tmpdir)
assert index["total_facts"] == 0
# Write a fact
new_facts = [{"fact": "Test fact", "category": "fact", "repo": "test", "confidence": 0.9}]
# Use harvester's write function
from harvester import write_knowledge
write_knowledge(index, new_facts, tmpdir, source_session="test.jsonl")
# Reload and verify
index2 = load_existing_knowledge(tmpdir)
assert index2["total_facts"] == 1
assert index2["facts"][0]["fact"] == "Test fact"
assert index2["facts"][0]["source_session"] == "test.jsonl"
# Check markdown was written
md_path = Path(tmpdir) / "repos" / "test.md"
assert md_path.exists(), "Markdown file should be created"
print(" [PASS] knowledge store roundtrip works")
def test_full_chain_no_llm():
"""Test the full pipeline minus the LLM call."""
with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
f.write('{"role": "user", "content": "Clone compounding-intelligence", "timestamp": "2026-04-13T10:00:00Z"}\n')
f.write('{"role": "assistant", "content": "Cloned successfully", "timestamp": "2026-04-13T10:00:05Z"}\n')
session_path = f.name
with tempfile.TemporaryDirectory() as knowledge_dir:
# Step 1: Read
messages = read_session(session_path)
assert len(messages) == 2
# Step 2: Extract conversation
conv = extract_conversation(messages)
assert len(conv) == 2
# Step 3: Truncate
truncated = truncate_for_context(conv, head=50, tail=50)
# Step 4: Convert to text (this goes to the LLM)
transcript = messages_to_text(truncated)
assert "Clone compounding-intelligence" in transcript
# Step 5-7: Would be LLM call, validate, deduplicate
# We simulate LLM output here
mock_facts = [
{"fact": "compounding-intelligence repo was cloned", "category": "fact", "repo": "compounding-intelligence", "confidence": 0.9}
]
valid = [f for f in mock_facts if validate_fact(f)]
# Step 6: Deduplicate
index = load_existing_knowledge(knowledge_dir)
new_facts = deduplicate(valid, index.get("facts", []))
assert len(new_facts) == 1
# Step 7: Store
from harvester import write_knowledge
write_knowledge(index, new_facts, knowledge_dir, source_session=session_path)
# Verify
index2 = load_existing_knowledge(knowledge_dir)
assert index2["total_facts"] == 1
os.unlink(session_path)
print(" [PASS] full chain (reader -> validate -> dedup -> store) works")
if __name__ == "__main__":
print("Running harvester pipeline smoke tests...")
test_session_reader()
test_validate_fact()
test_deduplicate()
test_knowledge_store_roundtrip()
test_full_chain_no_llm()
print("\nAll tests passed.")