# timmy-config/bin/request_log.py

#!/usr/bin/env python3
"""
Request Log Telemetry — "Verify What Actually Happened"

Issue #446: [P2.5] request_log Telemetry Table

Every agent writes a row to request_log for every inference call.
No exceptions. No summarizing. Actual rows.

This module provides:
  - log_inference(): write a telemetry row
  - query_requests(): read recent telemetry
  - did_agent_call_provider(): answer verification questions

Database: ~/.local/timmy/request_log.db
Override via REQUEST_LOG_PATH environment variable.
"""

import os
import sqlite3
import sys
import json
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import Optional, Dict, Any, List

# Fallback database file, used when REQUEST_LOG_PATH is not set
# (matches ansible group_vars/wizards.yml).
DEFAULT_DB_PATH = Path.home().joinpath(".local", "timmy", "request_log.db")


def get_db_path() -> Path:
    """
    Resolve the location of the request_log database.

    A non-empty REQUEST_LOG_PATH environment variable takes precedence
    (with a leading ``~`` expanded); otherwise DEFAULT_DB_PATH is used.
    """
    override = os.environ.get("REQUEST_LOG_PATH")
    return Path(override).expanduser() if override else DEFAULT_DB_PATH


def ensure_db(db_path: Optional[Path] = None) -> Path:
    """
    Ensure the database file and the request_log schema exist.

    Every schema statement uses IF NOT EXISTS, so the script is applied on
    each call. This also repairs a file that exists but has no schema yet
    (for example an empty file left behind by an earlier sqlite3.connect).

    Args:
        db_path: Database file to initialise. Defaults to get_db_path().

    Returns:
        The path of the initialised database.

    Raises:
        OSError: If the parent directory cannot be created.
        sqlite3.Error: If the file cannot be opened or the schema applied.
    """
    db_path = Path(db_path) if db_path else get_db_path()
    db_path.parent.mkdir(parents=True, exist_ok=True)

    schema = """
CREATE TABLE IF NOT EXISTS request_log (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    timestamp TEXT NOT NULL DEFAULT (datetime('now')),
    agent_name TEXT NOT NULL,
    provider TEXT NOT NULL,
    model TEXT NOT NULL,
    endpoint TEXT NOT NULL,
    tokens_in INTEGER,
    tokens_out INTEGER,
    latency_ms INTEGER,
    status TEXT NOT NULL,
    error_message TEXT
);

CREATE INDEX IF NOT EXISTS idx_request_log_agent
    ON request_log (agent_name, timestamp);

CREATE INDEX IF NOT EXISTS idx_request_log_provider
    ON request_log (provider, timestamp);

CREATE INDEX IF NOT EXISTS idx_request_log_status
    ON request_log (status, timestamp);
"""
    conn = sqlite3.connect(str(db_path))
    try:
        conn.executescript(schema)
        conn.commit()
    finally:
        # Release the handle even when the schema script fails.
        conn.close()

    return db_path


def log_inference(
    *,
    agent_name: str,
    provider: str,
    model: str,
    endpoint: str,
    tokens_in: Optional[int] = None,
    tokens_out: Optional[int] = None,
    latency_ms: Optional[int] = None,
    status: str = "success",
    error_message: Optional[str] = None,
    db_path: Optional[Path] = None,
) -> Optional[int]:
    """
    Log a single inference request to the request_log table.

    The database is created on first use at the path actually being written
    to, so a db_path override gets its own schema instead of initialising
    the default database.

    Args:
        agent_name: Name of the agent making the call
        provider: Provider name (anthropic, openrouter, ollama, etc.)
        model: Model identifier
        endpoint: API endpoint called
        tokens_in: Input token count (optional but recommended)
        tokens_out: Output token count (optional but recommended)
        latency_ms: Latency in milliseconds (optional but recommended)
        status: One of 'success', 'error', 'timeout', 'fallback'
        error_message: Error text if status is error/timeout
        db_path: Override DB path (for testing)

    Returns:
        Row ID if inserted, None on failure
    """
    db = Path(db_path) if db_path else get_db_path()

    try:
        # Create the file and schema at the path we are about to write to.
        if not db.exists():
            ensure_db(db)

        conn = sqlite3.connect(str(db))
        try:
            cursor = conn.execute("""
                INSERT INTO request_log
                    (timestamp, agent_name, provider, model, endpoint,
                     tokens_in, tokens_out, latency_ms, status, error_message)
                VALUES (datetime('now'), ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                agent_name, provider, model, endpoint,
                tokens_in, tokens_out, latency_ms, status, error_message,
            ))
            row_id = cursor.lastrowid
            conn.commit()
        finally:
            # Close the connection even when the insert or commit fails.
            conn.close()
        return row_id
    except Exception as exc:
        # Never break production: telemetry stays best-effort, but the
        # failure is reported so dropped rows are not completely invisible.
        import logging

        logging.getLogger(__name__).warning(
            "request_log: failed to record inference for agent %r: %s",
            agent_name, exc,
        )
        return None

def query_requests(
    *,
    agent_name: Optional[str] = None,
    provider: Optional[str] = None,
    model: Optional[str] = None,
    hours: int = 1,
    status: Optional[str] = None,
    limit: int = 100,
    db_path: Optional[Path] = None,
) -> List[Dict[str, Any]]:
    """
    Query recent inference logs, newest first.

    Rows are ordered by timestamp and then by id, both descending, so rows
    written within the same second still come back in a stable order.

    Args:
        agent_name: Filter by agent name
        provider: Filter by provider
        model: Filter by model
        hours: Lookback window (default 1 hour)
        status: Filter by status ('success', 'error', etc.)
        limit: Max rows to return
        db_path: Override DB path

    Returns:
        List of matching records as dicts. An empty list is also returned
        when the database file is missing or the query fails.
    """
    db = Path(db_path) if db_path else get_db_path()

    if not db.exists():
        return []

    conditions = ["timestamp > datetime('now', '-' || ? || ' hours')"]
    params: List[Any] = [hours]

    # Column names are fixed here; only the values are bound as parameters.
    optional_filters = (
        ("agent_name", agent_name),
        ("provider", provider),
        ("model", model),
        ("status", status),
    )
    for column, value in optional_filters:
        if value:
            conditions.append(f"{column} = ?")
            params.append(value)

    where_clause = " AND ".join(conditions)
    params.append(limit)

    try:
        conn = sqlite3.connect(str(db))
        try:
            conn.row_factory = sqlite3.Row
            cursor = conn.execute(f"""
                SELECT * FROM request_log
                WHERE {where_clause}
                ORDER BY timestamp DESC, id DESC
                LIMIT ?
            """, params)
            return [dict(row) for row in cursor.fetchall()]
        finally:
            # Close the connection even when the query raises.
            conn.close()
    except Exception as exc:
        # Reads stay best-effort, but a failed query is reported so it is
        # not silently mistaken for "no matching rows".
        import logging

        logging.getLogger(__name__).warning(
            "request_log: query failed: %s", exc
        )
        return []


def did_agent_call_provider(
    agent_name: str,
    provider: str,
    hours: int = 1,
    min_success_count: int = 1,
    db_path: Optional[Path] = None,
) -> bool:
    """
    Answer: "Did agent X actually call provider Y in the last N hours?"

    Args:
        agent_name: Agent whose calls are being verified
        provider: Provider the agent is expected to have called
        hours: Lookback window (default 1 hour)
        min_success_count: Number of successful calls required
        db_path: Override DB path

    Returns:
        True if agent made at least min_success_count successful calls.
    """
    # A threshold of zero (or less) is met by any number of rows.
    if min_success_count <= 0:
        return True

    # Ask for exactly as many rows as the threshold needs. Relying on
    # query_requests' default limit of 100 would make any threshold above
    # 100 impossible to satisfy.
    rows = query_requests(
        agent_name=agent_name,
        provider=provider,
        hours=hours,
        status="success",
        limit=min_success_count,
        db_path=db_path,
    )
    return len(rows) >= min_success_count


def get_recent_activity_summary(
    hours: int = 1,
    db_path: Optional[Path] = None,
) -> Dict[str, Any]:
    """
    Get aggregate statistics for recent activity.

    Aggregates the request_log table directly, grouping rows from the
    lookback window by agent, provider, model and status.

    Args:
        hours: Lookback window (default 1 hour)
        db_path: Override DB path (defaults to get_db_path())

    Returns:
        {"by_agent_provider": [...]} with one dict per group holding
        agent_name, provider, model, status, call_count and avg_latency,
        or {"error": "..."} when the database is missing, cannot be opened,
        or the query fails.
    """
    db = Path(db_path) if db_path else get_db_path()
    if not db.exists():
        return {"error": "Database not found"}

    try:
        conn = sqlite3.connect(str(db))
        conn.row_factory = sqlite3.Row
    except Exception:
        return {"error": "db error"}

    try:
        cursor = conn.execute("""
            SELECT agent_name, provider, model, status,
                   COUNT(*) as call_count, AVG(latency_ms) as avg_latency
            FROM request_log
            WHERE timestamp > datetime('now', '-' || ? || ' hours')
            GROUP BY agent_name, provider, model, status
            ORDER BY agent_name, provider, model, status
        """, (hours,))
        return {"by_agent_provider": [dict(row) for row in cursor.fetchall()]}
    except Exception:
        return {"error": "query failed"}
    finally:
        # Close the connection on both the success and failure paths.
        conn.close()