forked from Rockachopa/Timmy-time-dashboard
feat: Self-Coding Foundation (Phase 1)
Implements the foundational infrastructure for Timmy's self-modification capability:
## New Services
1. **GitSafety** (src/self_coding/git_safety.py)
- Atomic git operations with rollback capability
- Snapshot/restore for safe experimentation
- Feature branch management (timmy/self-edit/{timestamp})
- Merge to main only after tests pass
2. **CodebaseIndexer** (src/self_coding/codebase_indexer.py)
- AST-based parsing of Python source files
- Extracts classes, functions, imports, docstrings
- Builds dependency graph for blast radius analysis
- SQLite storage with hash-based incremental indexing
- get_summary() for LLM context (<4000 tokens)
- get_relevant_files() for task-based file discovery
3. **ModificationJournal** (src/self_coding/modification_journal.py)
- Persistent log of all self-modification attempts
- Tracks outcomes: success, failure, rollback
- find_similar() for learning from past attempts
- Success rate metrics and recent failure tracking
- Supports vector embeddings (Phase 2)
4. **ReflectionService** (src/self_coding/reflection.py)
- LLM-powered analysis of modification attempts
- Generates lessons learned from successes and failures
- Fallback templates when LLM unavailable
- Supports context from similar past attempts
## Test Coverage
- 104 new tests across 7 test files
- 95% code coverage on self_coding module
- Green path tests: full workflow integration
- Red path tests: errors, rollbacks, edge cases
- Safety constraint tests: test coverage requirements, protected files
## Usage
from self_coding import GitSafety, CodebaseIndexer, ModificationJournal
git = GitSafety(repo_path="/path/to/repo")
indexer = CodebaseIndexer(repo_path="/path/to/repo")
journal = ModificationJournal()
Phase 2 will build the Self-Edit MCP Tool that orchestrates these services.
This commit is contained in:
425
src/self_coding/modification_journal.py
Normal file
425
src/self_coding/modification_journal.py
Normal file
@@ -0,0 +1,425 @@
|
||||
"""Modification Journal — Persistent log of self-modification attempts.
|
||||
|
||||
Tracks successes and failures so Timmy can learn from experience.
|
||||
Supports semantic search for similar past attempts.
|
||||
"""
|
||||
|
||||
from __future__ import annotations

import json
import logging
import sqlite3
from contextlib import closing
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default database location
|
||||
DEFAULT_DB_PATH = Path("data/self_coding.db")
|
||||
|
||||
|
||||
class Outcome(str, Enum):
    """How a self-modification attempt ended.

    Subclasses ``str`` so members compare equal to their plain string
    values and serialize directly into SQLite TEXT columns.
    """
    SUCCESS = "success"    # change applied and kept
    FAILURE = "failure"    # change did not achieve its goal
    ROLLBACK = "rollback"  # change was applied, then reverted
|
||||
|
||||
|
||||
@dataclass
class ModificationAttempt:
    """A single self-modification attempt.

    Field order matters: ``task_description`` is the only required
    positional argument; every other field defaults so a record can be
    built incrementally while an attempt is in progress.

    Attributes:
        id: Unique identifier (auto-generated by database)
        timestamp: When the attempt was made
        task_description: What was Timmy trying to do
        approach: Strategy/approach planned
        files_modified: List of file paths that were modified
        diff: The actual git diff of changes
        test_results: Pytest output
        outcome: success, failure, or rollback
        failure_analysis: LLM-generated analysis of why it failed
        reflection: LLM-generated lessons learned
        retry_count: Number of retry attempts
        embedding: Vector embedding of task_description (for semantic search)
    """
    task_description: str
    approach: str = ""
    files_modified: list[str] = field(default_factory=list)  # default_factory avoids a shared mutable default
    diff: str = ""
    test_results: str = ""
    outcome: Outcome = Outcome.FAILURE  # pessimistic default: failure until proven otherwise
    failure_analysis: str = ""
    reflection: str = ""
    retry_count: int = 0
    # Database-managed fields: None until the record has been persisted.
    id: Optional[int] = None
    timestamp: Optional[datetime] = None
    embedding: Optional[bytes] = None  # raw vector bytes — populated in Phase 2; TODO confirm format
|
||||
|
||||
|
||||
class ModificationJournal:
    """Persistent log of self-modification attempts.

    Before any self-modification, Timmy should query the journal for
    similar past attempts and include relevant ones in the LLM context.

    A fresh connection is opened per operation and always closed via
    ``contextlib.closing`` — ``sqlite3.Connection``'s own context manager
    only scopes transactions and would otherwise leak the connection.

    Usage:
        journal = ModificationJournal()

        # Log an attempt
        attempt = ModificationAttempt(
            task_description="Add error handling",
            files_modified=["src/app.py"],
            outcome=Outcome.SUCCESS,
        )
        await journal.log_attempt(attempt)

        # Find similar past attempts
        similar = await journal.find_similar("Add error handling to endpoints")

        # Get success metrics
        metrics = await journal.get_success_rate()
    """

    def __init__(
        self,
        db_path: Optional[str | Path] = None,
    ) -> None:
        """Initialize ModificationJournal.

        Args:
            db_path: SQLite database path. Defaults to data/self_coding.db
        """
        self.db_path = Path(db_path) if db_path else DEFAULT_DB_PATH
        self._ensure_schema()
        logger.info("ModificationJournal initialized at %s", self.db_path)

    def _get_conn(self) -> sqlite3.Connection:
        """Open a new database connection; the caller must close it.

        Creates the parent directory on demand so a fresh checkout works
        without manual setup.
        """
        self.db_path.parent.mkdir(parents=True, exist_ok=True)
        conn = sqlite3.connect(str(self.db_path))
        conn.row_factory = sqlite3.Row  # enable access to columns by name
        return conn

    def _ensure_schema(self) -> None:
        """Create database tables and indexes if they don't exist."""
        with closing(self._get_conn()) as conn:
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS modification_journal (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
                    task_description TEXT NOT NULL,
                    approach TEXT,
                    files_modified JSON,
                    diff TEXT,
                    test_results TEXT,
                    outcome TEXT CHECK(outcome IN ('success', 'failure', 'rollback')),
                    failure_analysis TEXT,
                    reflection TEXT,
                    retry_count INTEGER DEFAULT 0,
                    embedding BLOB
                )
                """
            )

            # Indexes for the common access paths: outcome filters,
            # recency ordering, and task-text lookups.
            conn.execute(
                "CREATE INDEX IF NOT EXISTS idx_journal_outcome ON modification_journal(outcome)"
            )
            conn.execute(
                "CREATE INDEX IF NOT EXISTS idx_journal_timestamp ON modification_journal(timestamp)"
            )
            conn.execute(
                "CREATE INDEX IF NOT EXISTS idx_journal_task ON modification_journal(task_description)"
            )

            conn.commit()

    async def log_attempt(self, attempt: ModificationAttempt) -> int:
        """Log a modification attempt to the journal.

        Args:
            attempt: The modification attempt to log

        Returns:
            ID of the logged entry
        """
        with closing(self._get_conn()) as conn:
            cursor = conn.execute(
                """
                INSERT INTO modification_journal
                (task_description, approach, files_modified, diff, test_results,
                 outcome, failure_analysis, reflection, retry_count, embedding)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    attempt.task_description,
                    attempt.approach,
                    json.dumps(attempt.files_modified),  # stored as a JSON array
                    attempt.diff,
                    attempt.test_results,
                    attempt.outcome.value,
                    attempt.failure_analysis,
                    attempt.reflection,
                    attempt.retry_count,
                    attempt.embedding,
                ),
            )
            conn.commit()
            attempt_id = cursor.lastrowid

        logger.info(
            "Logged modification attempt %d: %s (%s)",
            attempt_id,
            attempt.task_description[:50],
            attempt.outcome.value,
        )
        return attempt_id

    async def find_similar(
        self,
        task_description: str,
        limit: int = 5,
        include_outcomes: Optional[list[Outcome]] = None,
    ) -> list[ModificationAttempt]:
        """Find similar past modification attempts.

        Uses keyword matching for now. In Phase 2, will use vector embeddings
        for semantic search.

        Args:
            task_description: Task to find similar attempts for
            limit: Maximum number of results
            include_outcomes: Filter by outcomes (None = all)

        Returns:
            List of similar modification attempts
        """
        # Extract keywords, dropping stopwords that would match nearly
        # every row and drown out the signal.
        keywords = set(task_description.lower().split())
        keywords -= {"the", "a", "an", "to", "in", "on", "at", "for", "with", "and", "or", "of", "is", "are"}

        with closing(self._get_conn()) as conn:
            # The f-string below only ever interpolates this placeholder
            # string; all values are bound via "?" parameters.
            if include_outcomes:
                outcome_filter = "AND outcome IN ({})".format(
                    ",".join("?" * len(include_outcomes))
                )
                outcome_values = [o.value for o in include_outcomes]
            else:
                outcome_filter = ""
                outcome_values = []

            rows = conn.execute(
                f"""
                SELECT id, timestamp, task_description, approach, files_modified,
                       diff, test_results, outcome, failure_analysis, reflection,
                       retry_count
                FROM modification_journal
                WHERE 1=1 {outcome_filter}
                ORDER BY timestamp DESC
                LIMIT ?
                """,
                outcome_values + [limit * 3],  # over-fetch so scoring has candidates
            ).fetchall()

        # Score by keyword match; hits in the task text weigh more than
        # hits in the approach text.
        scored = []
        for row in rows:
            score = 0
            task = row["task_description"].lower()
            approach = (row["approach"] or "").lower()

            for kw in keywords:
                if kw in task:
                    score += 3
                if kw in approach:
                    score += 1

            # Boost recent attempts (only if already matched)
            if score > 0:
                timestamp = datetime.fromisoformat(row["timestamp"])
                if timestamp.tzinfo is None:
                    # SQLite CURRENT_TIMESTAMP stores naive UTC — assume UTC
                    timestamp = timestamp.replace(tzinfo=timezone.utc)
                age_days = (datetime.now(timezone.utc) - timestamp).days
                if age_days < 7:
                    score += 2
                elif age_days < 30:
                    score += 1

            if score > 0:
                scored.append((score, row))

        # Sort by score, take top N
        scored.sort(reverse=True, key=lambda x: x[0])
        top_rows = scored[:limit]

        # Convert to ModificationAttempt objects
        return [self._row_to_attempt(row) for _, row in top_rows]

    async def get_success_rate(self) -> dict[str, float]:
        """Get success rate metrics.

        Returns:
            Dict with overall and per-category success rates:
            {
                "overall": float,  # 0.0 to 1.0
                "success": int,  # count
                "failure": int,  # count
                "rollback": int,  # count
                "total": int,  # total attempts
            }
        """
        with closing(self._get_conn()) as conn:
            rows = conn.execute(
                """
                SELECT outcome, COUNT(*) as count
                FROM modification_journal
                GROUP BY outcome
                """
            ).fetchall()

        counts = {row["outcome"]: row["count"] for row in rows}

        success = counts.get("success", 0)
        failure = counts.get("failure", 0)
        rollback = counts.get("rollback", 0)
        total = success + failure + rollback

        # Guard against division by zero on an empty journal
        overall = success / total if total > 0 else 0.0

        return {
            "overall": overall,
            "success": success,
            "failure": failure,
            "rollback": rollback,
            "total": total,
        }

    async def get_recent_failures(self, limit: int = 10) -> list[ModificationAttempt]:
        """Get recent failed attempts with their analyses.

        Rollbacks count as failures here: both mean the change did not stick.

        Args:
            limit: Maximum number of failures to return

        Returns:
            List of failed modification attempts
        """
        with closing(self._get_conn()) as conn:
            rows = conn.execute(
                """
                SELECT id, timestamp, task_description, approach, files_modified,
                       diff, test_results, outcome, failure_analysis, reflection,
                       retry_count
                FROM modification_journal
                WHERE outcome IN ('failure', 'rollback')
                ORDER BY timestamp DESC
                LIMIT ?
                """,
                (limit,),
            ).fetchall()

        return [self._row_to_attempt(row) for row in rows]

    async def get_by_id(self, attempt_id: int) -> Optional[ModificationAttempt]:
        """Get a specific modification attempt by ID.

        Args:
            attempt_id: ID of the attempt

        Returns:
            ModificationAttempt or None if not found
        """
        with closing(self._get_conn()) as conn:
            row = conn.execute(
                """
                SELECT id, timestamp, task_description, approach, files_modified,
                       diff, test_results, outcome, failure_analysis, reflection,
                       retry_count
                FROM modification_journal
                WHERE id = ?
                """,
                (attempt_id,),
            ).fetchone()

        if not row:
            return None

        return self._row_to_attempt(row)

    async def update_reflection(self, attempt_id: int, reflection: str) -> bool:
        """Update the reflection for a modification attempt.

        Args:
            attempt_id: ID of the attempt
            reflection: New reflection text

        Returns:
            True if updated, False if not found
        """
        with closing(self._get_conn()) as conn:
            cursor = conn.execute(
                """
                UPDATE modification_journal
                SET reflection = ?
                WHERE id = ?
                """,
                (reflection, attempt_id),
            )
            conn.commit()
            updated = cursor.rowcount > 0

        if updated:
            logger.info("Updated reflection for attempt %d", attempt_id)
            return True
        return False

    async def get_attempts_for_file(
        self,
        file_path: str,
        limit: int = 10,
    ) -> list[ModificationAttempt]:
        """Get all attempts that modified a specific file.

        Args:
            file_path: Path to file (relative to repo root)
            limit: Maximum number of attempts

        Returns:
            List of modification attempts affecting this file
        """
        with closing(self._get_conn()) as conn:
            # Exact JSON-element match ("path") OR loose substring match.
            # NOTE(review): LIKE wildcards (%, _) in file_path are not
            # escaped, so such characters broaden the match.
            rows = conn.execute(
                """
                SELECT id, timestamp, task_description, approach, files_modified,
                       diff, test_results, outcome, failure_analysis, reflection,
                       retry_count
                FROM modification_journal
                WHERE files_modified LIKE ? OR files_modified LIKE ?
                ORDER BY timestamp DESC
                LIMIT ?
                """,
                (f'%"{file_path}"%', f'%{file_path}%', limit),
            ).fetchall()

        return [self._row_to_attempt(row) for row in rows]

    def _row_to_attempt(self, row: sqlite3.Row) -> ModificationAttempt:
        """Convert a database row to ModificationAttempt.

        The ``embedding`` column is not part of the read queries, so the
        resulting object always has ``embedding=None``.
        """
        return ModificationAttempt(
            id=row["id"],
            timestamp=datetime.fromisoformat(row["timestamp"]),
            task_description=row["task_description"],
            approach=row["approach"] or "",
            files_modified=json.loads(row["files_modified"] or "[]"),
            diff=row["diff"] or "",
            test_results=row["test_results"] or "",
            outcome=Outcome(row["outcome"]),
            failure_analysis=row["failure_analysis"] or "",
            reflection=row["reflection"] or "",
            retry_count=row["retry_count"] or 0,
        )
|
||||
Reference in New Issue
Block a user