Files
the-nexus/bin/memory_mine.py
Alexander Whitestone 60d82da70a
Some checks failed
Review Approval Gate / verify-review (pull_request) Failing after 10s
CI / test (pull_request) Failing after 57s
CI / validate (pull_request) Failing after 57s
fix: guard memory_mine.py against git commit shell injection (closes #1430)
2026-04-21 23:28:44 -04:00

267 lines
7.6 KiB
Python

#!/usr/bin/env python3
"""
memory_mine.py — Mine session transcripts into MemPalace.
Reads Hermes session logs (JSONL format) and stores summaries
in the palace. Supports batch mining, single-file processing,
and live directory watching.
Usage:
# Mine a single session file
python3 bin/memory_mine.py ~/.hermes/sessions/2026-04-13.jsonl
# Mine all sessions from last 7 days
python3 bin/memory_mine.py --days 7
# Mine a specific wing's sessions
python3 bin/memory_mine.py --wing wing_bezalel --days 14
# Dry run — show what would be mined
python3 bin/memory_mine.py --dry-run --days 7
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import sys
import time
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional
# ── Safety guard ───────────────────────────────────────────────────────
# Prevent accidental execution from git commit messages containing
# code examples with backticks (shell substitution). See issue #1430.
if os.environ.get("GIT_DIR") or os.environ.get("GIT_INDEX_FILE"):
# Running inside a git hook — exit silently to prevent
# shell substitution in commit messages from triggering mining.
sys.exit(0)
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("memory-mine")
REPO_ROOT = Path(__file__).resolve().parent.parent
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))
def parse_session_file(path: Path) -> list[dict]:
"""
Parse a JSONL session file into turns.
Each line is expected to be a JSON object with:
- role: "user" | "assistant" | "system" | "tool"
- content: text
- timestamp: ISO string (optional)
"""
turns = []
with open(path) as f:
for i, line in enumerate(f):
line = line.strip()
if not line:
continue
try:
turn = json.loads(line)
turns.append(turn)
except json.JSONDecodeError:
logger.debug(f"Skipping malformed line {i+1} in {path}")
return turns
def summarize_session(turns: list[dict], agent_name: str = "unknown") -> str:
"""
Generate a compact summary of a session's turns.
Keeps user messages and key agent responses, strips noise.
"""
if not turns:
return "Empty session."
user_msgs = []
agent_msgs = []
tool_calls = []
for turn in turns:
role = turn.get("role", "")
content = str(turn.get("content", ""))[:300]
if role == "user":
user_msgs.append(content)
elif role == "assistant":
agent_msgs.append(content)
elif role == "tool":
tool_name = turn.get("name", turn.get("tool", "unknown"))
tool_calls.append(f"{tool_name}: {content[:150]}")
parts = [f"Session by {agent_name}:"]
if user_msgs:
parts.append(f"\nUser asked ({len(user_msgs)} messages):")
for msg in user_msgs[:5]:
parts.append(f" - {msg[:200]}")
if len(user_msgs) > 5:
parts.append(f" ... and {len(user_msgs) - 5} more")
if agent_msgs:
parts.append(f"\nAgent responded ({len(agent_msgs)} messages):")
for msg in agent_msgs[:3]:
parts.append(f" - {msg[:200]}")
if tool_calls:
parts.append(f"\nTools used ({len(tool_calls)} calls):")
for tc in tool_calls[:5]:
parts.append(f" - {tc}")
return "\n".join(parts)
def mine_session(
path: Path,
wing: str,
palace_path: Optional[Path] = None,
dry_run: bool = False,
) -> Optional[str]:
"""
Mine a single session file into MemPalace.
Returns the document ID if stored, None on failure or dry run.
"""
try:
from agent.memory import AgentMemory
except ImportError:
logger.error("Cannot import agent.memory — is the repo in PYTHONPATH?")
return None
turns = parse_session_file(path)
if not turns:
logger.debug(f"Empty session file: {path}")
return None
agent_name = wing.replace("wing_", "")
summary = summarize_session(turns, agent_name)
if dry_run:
print(f"\n--- {path.name} ---")
print(summary[:500])
print(f"({len(turns)} turns)")
return None
mem = AgentMemory(agent_name=agent_name, wing=wing, palace_path=palace_path)
doc_id = mem.remember(
summary,
room="hermes",
source_file=str(path),
metadata={
"type": "mined_session",
"source": str(path),
"turn_count": len(turns),
"agent": agent_name,
"timestamp": datetime.now(timezone.utc).isoformat(),
},
)
if doc_id:
logger.info(f"Mined {path.name}{doc_id} ({len(turns)} turns)")
else:
logger.warning(f"Failed to mine {path.name}")
return doc_id
def find_session_files(
sessions_dir: Path,
days: int = 7,
pattern: str = "*.jsonl",
) -> list[Path]:
"""
Find session files from the last N days.
"""
cutoff = datetime.now() - timedelta(days=days)
files = []
if not sessions_dir.exists():
logger.warning(f"Sessions directory not found: {sessions_dir}")
return files
for path in sorted(sessions_dir.glob(pattern)):
# Use file modification time as proxy for session date
mtime = datetime.fromtimestamp(path.stat().st_mtime)
if mtime >= cutoff:
files.append(path)
return files
def main(argv: list[str] | None = None) -> int:
parser = argparse.ArgumentParser(
description="Mine session transcripts into MemPalace"
)
parser.add_argument(
"files", nargs="*", help="Session files to mine (JSONL format)"
)
parser.add_argument(
"--days", type=int, default=7,
help="Mine sessions from last N days (default: 7)"
)
parser.add_argument(
"--sessions-dir",
default=str(Path.home() / ".hermes" / "sessions"),
help="Directory containing session JSONL files"
)
parser.add_argument(
"--wing", default=None,
help="Wing name (default: auto-detect from MEMPALACE_WING env or 'wing_timmy')"
)
parser.add_argument(
"--palace-path", default=None,
help="Override palace path"
)
parser.add_argument(
"--dry-run", action="store_true",
help="Show what would be mined without storing"
)
args = parser.parse_args(argv)
wing = args.wing or os.environ.get("MEMPALACE_WING", "wing_timmy")
palace_path = Path(args.palace_path) if args.palace_path else None
if args.files:
files = [Path(f) for f in args.files]
else:
sessions_dir = Path(args.sessions_dir)
files = find_session_files(sessions_dir, days=args.days)
if not files:
logger.info("No session files found to mine.")
return 0
logger.info(f"Mining {len(files)} session files (wing={wing})")
mined = 0
failed = 0
for path in files:
result = mine_session(path, wing=wing, palace_path=palace_path, dry_run=args.dry_run)
if result:
mined += 1
elif result is None and not args.dry_run:
failed += 1
if args.dry_run:
logger.info(f"Dry run complete — {len(files)} files would be mined")
else:
logger.info(f"Mining complete — {mined} mined, {failed} failed")
return 0
if __name__ == "__main__":
sys.exit(main())