feat: code quality audit + autoresearch integration + infra hardening (#150)

Authored by: Alexander Whitestone
Date: 2026-03-08 12:50:44 -04:00
Committed by: GitHub
parent fd0ede0d51
commit ae3bb1cc21
186 changed files with 5129 additions and 3289 deletions

View File

@@ -25,11 +25,12 @@ def _get_model():
global _model, _has_embeddings
if _has_embeddings is False:
return None
if _model is not None:
return _model
from config import settings
# In test mode or low-memory environments, skip embedding model load
if settings.timmy_skip_embeddings:
_has_embeddings = False
@@ -37,7 +38,8 @@ def _get_model():
try:
from sentence_transformers import SentenceTransformer
_model = SentenceTransformer('all-MiniLM-L6-v2')
_model = SentenceTransformer("all-MiniLM-L6-v2")
_has_embeddings = True
return _model
except (ImportError, RuntimeError, Exception):
@@ -56,7 +58,7 @@ def _get_embedding_dimension() -> int:
def _compute_embedding(text: str) -> list[float]:
"""Compute embedding vector for text.
Uses sentence-transformers if available, otherwise returns
a simple hash-based vector for basic similarity.
"""
@@ -66,30 +68,31 @@ def _compute_embedding(text: str) -> list[float]:
return model.encode(text).tolist()
except Exception:
pass
# Fallback: simple character n-gram hash embedding
# Not as good but allows the system to work without heavy deps
dim = 384
vec = [0.0] * dim
text = text.lower()
# Generate character trigram features
for i in range(len(text) - 2):
trigram = text[i:i+3]
trigram = text[i : i + 3]
hash_val = hash(trigram) % dim
vec[hash_val] += 1.0
# Normalize
norm = sum(x*x for x in vec) ** 0.5
norm = sum(x * x for x in vec) ** 0.5
if norm > 0:
vec = [x/norm for x in vec]
vec = [x / norm for x in vec]
return vec
@dataclass
class MemoryEntry:
"""A memory entry with vector embedding."""
id: str = field(default_factory=lambda: str(uuid.uuid4()))
content: str = "" # The actual text content
source: str = "" # Where it came from (agent, user, system)
@@ -99,9 +102,7 @@ class MemoryEntry:
session_id: Optional[str] = None
metadata: Optional[dict] = None
embedding: Optional[list[float]] = None
timestamp: str = field(
default_factory=lambda: datetime.now(timezone.utc).isoformat()
)
timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
relevance_score: Optional[float] = None # Set during search
@@ -110,7 +111,7 @@ def _get_conn() -> sqlite3.Connection:
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(str(DB_PATH))
conn.row_factory = sqlite3.Row
# Try to load sqlite-vss extension
try:
conn.enable_load_extension(True)
@@ -119,7 +120,7 @@ def _get_conn() -> sqlite3.Connection:
_has_vss = True
except Exception:
_has_vss = False
# Create tables
conn.execute(
"""
@@ -137,24 +138,14 @@ def _get_conn() -> sqlite3.Connection:
)
"""
)
# Create indexes
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_memory_agent ON memory_entries(agent_id)"
)
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_memory_task ON memory_entries(task_id)"
)
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_memory_session ON memory_entries(session_id)"
)
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_memory_time ON memory_entries(timestamp)"
)
conn.execute(
"CREATE INDEX IF NOT EXISTS idx_memory_type ON memory_entries(context_type)"
)
conn.execute("CREATE INDEX IF NOT EXISTS idx_memory_agent ON memory_entries(agent_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_memory_task ON memory_entries(task_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_memory_session ON memory_entries(session_id)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_memory_time ON memory_entries(timestamp)")
conn.execute("CREATE INDEX IF NOT EXISTS idx_memory_type ON memory_entries(context_type)")
conn.commit()
return conn
@@ -170,7 +161,7 @@ def store_memory(
compute_embedding: bool = True,
) -> MemoryEntry:
"""Store a memory entry with optional embedding.
Args:
content: The text content to store
source: Source of the memory (agent name, user, system)
@@ -180,14 +171,14 @@ def store_memory(
session_id: Session identifier
metadata: Additional structured data
compute_embedding: Whether to compute vector embedding
Returns:
The stored MemoryEntry
"""
embedding = None
if compute_embedding:
embedding = _compute_embedding(content)
entry = MemoryEntry(
content=content,
source=source,
@@ -198,7 +189,7 @@ def store_memory(
metadata=metadata,
embedding=embedding,
)
conn = _get_conn()
conn.execute(
"""
@@ -222,7 +213,7 @@ def store_memory(
)
conn.commit()
conn.close()
return entry
@@ -235,7 +226,7 @@ def search_memories(
min_relevance: float = 0.0,
) -> list[MemoryEntry]:
"""Search for memories by semantic similarity.
Args:
query: Search query text
limit: Maximum results
@@ -243,18 +234,18 @@ def search_memories(
agent_id: Filter by agent
session_id: Filter by session
min_relevance: Minimum similarity score (0-1)
Returns:
List of MemoryEntry objects sorted by relevance
"""
query_embedding = _compute_embedding(query)
conn = _get_conn()
# Build query with filters
conditions = []
params = []
if context_type:
conditions.append("context_type = ?")
params.append(context_type)
@@ -264,9 +255,9 @@ def search_memories(
if session_id:
conditions.append("session_id = ?")
params.append(session_id)
where_clause = "WHERE " + " AND ".join(conditions) if conditions else ""
# Fetch candidates (we'll do in-memory similarity for now)
# For production with sqlite-vss, this would use vector similarity index
query_sql = f"""
@@ -276,10 +267,10 @@ def search_memories(
LIMIT ?
"""
params.append(limit * 3) # Get more candidates for ranking
rows = conn.execute(query_sql, params).fetchall()
conn.close()
# Compute similarity scores
results = []
for row in rows:
@@ -295,7 +286,7 @@ def search_memories(
embedding=json.loads(row["embedding"]) if row["embedding"] else None,
timestamp=row["timestamp"],
)
if entry.embedding:
# Cosine similarity
score = _cosine_similarity(query_embedding, entry.embedding)
@@ -308,7 +299,7 @@ def search_memories(
entry.relevance_score = score
if score >= min_relevance:
results.append(entry)
# Sort by relevance and return top results
results.sort(key=lambda x: x.relevance_score or 0, reverse=True)
return results[:limit]
@@ -316,9 +307,9 @@ def search_memories(
def _cosine_similarity(a: list[float], b: list[float]) -> float:
"""Compute cosine similarity between two vectors."""
dot = sum(x*y for x, y in zip(a, b))
norm_a = sum(x*x for x in a) ** 0.5
norm_b = sum(x*x for x in b) ** 0.5
dot = sum(x * y for x, y in zip(a, b))
norm_a = sum(x * x for x in a) ** 0.5
norm_b = sum(x * x for x in b) ** 0.5
if norm_a == 0 or norm_b == 0:
return 0.0
return dot / (norm_a * norm_b)
@@ -334,51 +325,47 @@ def _keyword_overlap(query: str, content: str) -> float:
return overlap / len(query_words)
def get_memory_context(query: str, max_tokens: int = 2000, **filters) -> str:
    """Get relevant memory context as formatted text for LLM prompts.

    Defect fixed: the diff residue left both the old multi-line signature
    and the new one-line signature back to back (a ``def`` with no body,
    which is a syntax error); only the new signature is kept.

    Args:
        query: Search query text.
        max_tokens: Approximate maximum tokens to include in the result.
        **filters: Additional filters forwarded to ``search_memories``
            (agent_id, session_id, etc.).

    Returns:
        Formatted context string for inclusion in prompts, or the empty
        string when no memory entry fits the budget.
    """
    memories = search_memories(query, limit=20, **filters)

    context_parts = []
    total_chars = 0
    # Rough chars-per-token approximation (~4 chars/token).
    max_chars = max_tokens * 4

    for mem in memories:
        formatted = f"[{mem.source}]: {mem.content}"
        # Entries arrive sorted by relevance, so stopping at the first
        # entry that would exceed the budget keeps the best ones.
        if total_chars + len(formatted) > max_chars:
            break
        context_parts.append(formatted)
        total_chars += len(formatted)

    if not context_parts:
        return ""

    return "Relevant context from memory:\n" + "\n\n".join(context_parts)
def recall_personal_facts(agent_id: Optional[str] = None) -> list[str]:
"""Recall personal facts about the user or system.
Args:
agent_id: Optional agent filter
Returns:
List of fact strings
"""
conn = _get_conn()
if agent_id:
rows = conn.execute(
"""
@@ -398,7 +385,7 @@ def recall_personal_facts(agent_id: Optional[str] = None) -> list[str]:
LIMIT 100
""",
).fetchall()
conn.close()
return [r["content"] for r in rows]
@@ -434,11 +421,11 @@ def update_personal_fact(memory_id: str, new_content: str) -> bool:
def store_personal_fact(fact: str, agent_id: Optional[str] = None) -> MemoryEntry:
"""Store a personal fact about the user or system.
Args:
fact: The fact to store
agent_id: Associated agent
Returns:
The stored MemoryEntry
"""
@@ -453,7 +440,7 @@ def store_personal_fact(fact: str, agent_id: Optional[str] = None) -> MemoryEntr
def delete_memory(memory_id: str) -> bool:
"""Delete a memory entry by ID.
Returns:
True if deleted, False if not found
"""
@@ -470,29 +457,27 @@ def delete_memory(memory_id: str) -> bool:
def get_memory_stats() -> dict:
"""Get statistics about the memory store.
Returns:
Dict with counts by type, total entries, etc.
"""
conn = _get_conn()
total = conn.execute(
"SELECT COUNT(*) as count FROM memory_entries"
).fetchone()["count"]
total = conn.execute("SELECT COUNT(*) as count FROM memory_entries").fetchone()["count"]
by_type = {}
rows = conn.execute(
"SELECT context_type, COUNT(*) as count FROM memory_entries GROUP BY context_type"
).fetchall()
for row in rows:
by_type[row["context_type"]] = row["count"]
with_embeddings = conn.execute(
"SELECT COUNT(*) as count FROM memory_entries WHERE embedding IS NOT NULL"
).fetchone()["count"]
conn.close()
return {
"total_entries": total,
"by_type": by_type,
@@ -503,20 +488,20 @@ def get_memory_stats() -> dict:
def prune_memories(older_than_days: int = 90, keep_facts: bool = True) -> int:
"""Delete old memories to manage storage.
Args:
older_than_days: Delete memories older than this
keep_facts: Whether to preserve fact-type memories
Returns:
Number of entries deleted
"""
from datetime import timedelta
cutoff = (datetime.now(timezone.utc) - timedelta(days=older_than_days)).isoformat()
conn = _get_conn()
if keep_facts:
cursor = conn.execute(
"""
@@ -530,9 +515,9 @@ def prune_memories(older_than_days: int = 90, keep_facts: bool = True) -> int:
"DELETE FROM memory_entries WHERE timestamp < ?",
(cutoff,),
)
deleted = cursor.rowcount
conn.commit()
conn.close()
return deleted