# Source: compounding-intelligence/scripts/harvester.py (351 lines, 12 KiB, Python)
#!/usr/bin/env python3
"""
Session Harvester for Compounding Intelligence.
Extracts durable knowledge from completed sessions and updates the knowledge store.
"""
import json
import os
import sys
import logging
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import List, Dict, Any, Optional
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent))
from session_reader import SessionReader
# Configure logging: mirror messages to metrics/harvester.log and stderr.
_LOG_PATH = Path(__file__).parent.parent / 'metrics' / 'harvester.log'
# FileHandler raises FileNotFoundError if the parent directory is missing,
# so guarantee it exists before configuring handlers (fresh checkouts).
_LOG_PATH.parent.mkdir(parents=True, exist_ok=True)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(_LOG_PATH),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
class KnowledgeHarvester:
    """Extracts knowledge from completed sessions.

    Maintains two JSON files under ``<repo_root>/knowledge``:

    * ``index.json`` — the accumulated list of extracted facts.
    * ``harvest_state.json`` — bookkeeping about which sessions were already
      harvested, so repeated runs are idempotent.
    """

    def __init__(self, repo_root: Optional[str] = None):
        """Initialize the harvester.

        Args:
            repo_root: Repository root directory. Defaults to the parent of
                the directory containing this script.
        """
        if repo_root is None:
            repo_root = str(Path(__file__).parent.parent)
        self.repo_root = Path(repo_root)
        self.knowledge_dir = self.repo_root / "knowledge"
        self.index_path = self.knowledge_dir / "index.json"
        self.prompt_path = self.repo_root / "templates" / "harvest-prompt.md"
        # Ensure the knowledge directory exists so _save_index/_save_state
        # never fail with FileNotFoundError on a fresh checkout.
        self.knowledge_dir.mkdir(parents=True, exist_ok=True)
        # Load or create knowledge index
        self.index = self._load_index()
        # Initialize session reader
        self.reader = SessionReader()
        # Harvest state file
        self.state_path = self.knowledge_dir / "harvest_state.json"
        self.state = self._load_state()

    def _load_index(self) -> Dict[str, Any]:
        """Load the knowledge index from disk, or return a fresh empty one."""
        if self.index_path.exists():
            with open(self.index_path, 'r') as f:
                return json.load(f)
        return {
            "version": 1,
            "last_updated": datetime.now(timezone.utc).isoformat(),
            "total_facts": 0,
            "facts": []
        }

    def _save_index(self) -> None:
        """Persist the knowledge index, stamping ``last_updated``."""
        self.index["last_updated"] = datetime.now(timezone.utc).isoformat()
        with open(self.index_path, 'w') as f:
            json.dump(self.index, f, indent=2)

    def _load_state(self) -> Dict[str, Any]:
        """Load harvest state from disk, or return a fresh empty state."""
        if self.state_path.exists():
            with open(self.state_path, 'r') as f:
                return json.load(f)
        return {
            "last_harvest": None,
            "harvested_sessions": [],
            "total_sessions_processed": 0,
            "total_facts_extracted": 0
        }

    def _save_state(self) -> None:
        """Persist harvest state."""
        with open(self.state_path, 'w') as f:
            json.dump(self.state, f, indent=2)

    def get_sessions_to_harvest(self, max_age_hours: float = 24) -> List[Dict[str, Any]]:
        """
        Get sessions that need harvesting.

        Args:
            max_age_hours: Fallback cutoff — only harvest sessions modified
                within this many hours when no last-harvest time is recorded.

        Returns:
            List of session data dictionaries that are complete and not yet
            harvested.
        """
        # Prefer the recorded last-harvest timestamp as the cutoff.
        since = None
        if self.state["last_harvest"]:
            try:
                # Accept both '...Z' and '...+00:00' ISO-8601 suffixes.
                since = datetime.fromisoformat(self.state["last_harvest"].replace('Z', '+00:00'))
            except (ValueError, AttributeError):
                pass  # malformed timestamp: fall back to max_age_hours below
        if since is None:
            since = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)
        # Get recent sessions
        sessions = self.reader.list_sessions(since=since)
        # Filter out already-harvested and still-active sessions.
        harvested = set(self.state["harvested_sessions"])
        to_harvest = []
        for path in sessions:
            session = self.reader.read_session(path)
            if "error" in session:
                logger.warning(f"Error reading session {path}: {session['error']}")
                continue
            # Skip if already harvested
            if session["session_id"] in harvested:
                continue
            # Skip if session is still active
            if not self.reader.is_session_complete(session):
                continue
            to_harvest.append(session)
        return to_harvest

    def extract_knowledge_from_session(self, session: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Extract knowledge from a single session.

        This is a simplified, pattern-based extraction. In a full
        implementation, this would use an LLM with the harvest prompt.

        Args:
            session: Session data dictionary

        Returns:
            List of extracted knowledge items
        """
        import re  # local import: `re` is only needed by this heuristic

        # Compile patterns once, outside the per-message loop.
        file_path_re = re.compile(r'[~/.]?[\w/]+\.\w+')
        command_re = re.compile(r'(?:git|npm|pip|python|curl|ssh)\s+[\w\s\-\.]+')

        knowledge_items: List[Dict[str, Any]] = []
        for msg in session.get("messages", []):
            if not isinstance(msg, dict):
                continue
            content = msg.get("content", "")
            if not content or not isinstance(content, str):
                continue
            lowered = content.lower()
            # Error patterns -> "pitfall" facts. A case-insensitive check
            # subsumes the previous separate case-sensitive "Error" test.
            if "error" in lowered:
                context = content[:200]  # first 200 chars of context
                file_paths = file_path_re.findall(context)
                if file_paths:
                    knowledge_items.append({
                        "fact": f"Error encountered with file: {file_paths[0]}",
                        "category": "pitfall",
                        "repo": "global",
                        "confidence": 0.7,
                        "session_id": session["session_id"],
                        "extracted_at": datetime.now(timezone.utc).isoformat()
                    })
            # Success patterns -> "pattern" facts (commands/actions).
            if "success" in lowered:
                context = content[:200]
                commands = command_re.findall(context)
                if commands:
                    knowledge_items.append({
                        "fact": f"Successful command pattern: {commands[0]}",
                        "category": "pattern",
                        "repo": "global",
                        "confidence": 0.6,
                        "session_id": session["session_id"],
                        "extracted_at": datetime.now(timezone.utc).isoformat()
                    })
        return knowledge_items

    def harvest_session(self, session: Dict[str, Any]) -> Dict[str, Any]:
        """
        Harvest knowledge from a single session.

        Args:
            session: Session data dictionary

        Returns:
            Harvest result dictionary with ``success``, ``facts_extracted``
            and, on success, the extracted ``knowledge_items``.
        """
        session_id = session["session_id"]
        logger.info(f"Harvesting session: {session_id}")
        try:
            # Extract knowledge
            knowledge_items = self.extract_knowledge_from_session(session)
            # Add each item to the in-memory index (persisted by the caller).
            for item in knowledge_items:
                item["harvested_at"] = datetime.now(timezone.utc).isoformat()
                item["session_path"] = session.get("path", "")
                self.index["facts"].append(item)
            # Update state
            self.state["harvested_sessions"].append(session_id)
            self.state["total_sessions_processed"] += 1
            self.state["total_facts_extracted"] += len(knowledge_items)
            result = {
                "session_id": session_id,
                "success": True,
                "facts_extracted": len(knowledge_items),
                "knowledge_items": knowledge_items
            }
            logger.info(f"Extracted {len(knowledge_items)} facts from session {session_id}")
        except Exception as e:
            # Best-effort: record the failure but let the batch continue.
            logger.error(f"Error harvesting session {session_id}: {e}")
            result = {
                "session_id": session_id,
                "success": False,
                "error": str(e),
                "facts_extracted": 0
            }
        return result

    def harvest_batch(self, max_sessions: int = 10, max_age_hours: float = 24) -> Dict[str, Any]:
        """
        Harvest a batch of sessions and persist the index and state.

        Args:
            max_sessions: Maximum number of sessions to harvest
            max_age_hours: Only harvest sessions modified within this many hours

        Returns:
            Batch harvest result
        """
        logger.info(f"Starting harvest batch (max {max_sessions} sessions, max age {max_age_hours}h)")
        sessions = self.get_sessions_to_harvest(max_age_hours)
        if not sessions:
            logger.info("No sessions to harvest")
            return {
                "success": True,
                "sessions_processed": 0,
                "facts_extracted": 0,
                "results": []
            }
        # Limit to max_sessions
        sessions = sessions[:max_sessions]
        results = []
        total_facts = 0
        for session in sessions:
            result = self.harvest_session(session)
            results.append(result)
            if result["success"]:
                total_facts += result["facts_extracted"]
        # Persist index and state once, after the whole batch.
        self.index["total_facts"] = len(self.index["facts"])
        self._save_index()
        self.state["last_harvest"] = datetime.now(timezone.utc).isoformat()
        self._save_state()
        batch_result = {
            "success": True,
            "sessions_processed": len(sessions),
            "facts_extracted": total_facts,
            "results": results,
            "timestamp": datetime.now(timezone.utc).isoformat()
        }
        logger.info(f"Harvest batch complete: {len(sessions)} sessions, {total_facts} facts")
        return batch_result
def main():
    """CLI entry point: parse arguments and run (or dry-run) a harvest batch."""
    import argparse
    parser = argparse.ArgumentParser(description="Harvest knowledge from completed sessions")
    parser.add_argument("--max-sessions", type=int, default=10, help="Maximum sessions to harvest")
    parser.add_argument("--max-age-hours", type=float, default=24, help="Max age in hours")
    parser.add_argument("--dry-run", action="store_true", help="Don't save, just report")
    args = parser.parse_args()

    harvester = KnowledgeHarvester()

    if args.dry_run:
        sessions = harvester.get_sessions_to_harvest(args.max_age_hours)
        print(f"Would harvest {len(sessions)} sessions:")
        for session in sessions[:5]:  # Show first 5
            # .get() guards the report against sessions lacking a
            # message_count key — a dry run should never crash.
            print(f" - {session['session_id']} ({session.get('message_count', '?')} messages)")
        if len(sessions) > 5:
            print(f" ... and {len(sessions) - 5} more")
        return

    result = harvester.harvest_batch(
        max_sessions=args.max_sessions,
        max_age_hours=args.max_age_hours
    )
    if result["success"]:
        print(f"Harvest complete: {result['sessions_processed']} sessions, {result['facts_extracted']} facts")
    else:
        print(f"Harvest failed: {result.get('error', 'Unknown error')}")
        sys.exit(1)


if __name__ == "__main__":
    main()