Timmy-time-dashboard/src/timmy/agent.py

"""Agent creation with three-tier memory system.

Memory Architecture:
- Tier 1 (Hot): MEMORY.md — always loaded, ~300 lines
- Tier 2 (Vault): memory/ — structured markdown, append-only
- Tier 3 (Semantic): Vector search over vault files

Model Management:
- Pulls requested model automatically if not available
- Falls back through capability-based model chains
- Multi-modal support with vision model fallbacks

Handoff Protocol maintains continuity across sessions.
"""

import logging
from typing import TYPE_CHECKING, Union

import httpx
from agno.agent import Agent
from agno.db.sqlite import SqliteDb
from agno.models.ollama import Ollama

from config import check_ollama_model_available, settings
from timmy.prompts import get_system_prompt
from timmy.tools import create_full_toolkit

if TYPE_CHECKING:
    from timmy.backends import ClaudeBackend, GrokBackend

logger = logging.getLogger(__name__)

# Union type for callers that want to hint the return type.
TimmyAgent = Union[Agent, "GrokBackend", "ClaudeBackend"]

# Models known to be too small for reliable tool calling.
# These hallucinate tool calls as text, invoke tools randomly,
# and leak raw JSON into responses.
_SMALL_MODEL_PATTERNS = (
    "llama3.2",
    "phi-3",
    "gemma:2b",
    "tinyllama",
    "qwen2:0.5b",
    "qwen2:1.5b",
)


def _check_model_available(model_name: str) -> bool:
    """Check if an Ollama model is available locally."""
    return check_ollama_model_available(model_name)


def _pull_model(model_name: str) -> bool:
    """Attempt to pull a model from Ollama.

    Returns:
        True if successful or model already exists
    """
    try:
        import json
        import urllib.request

        logger.info("Pulling model: %s", model_name)

        url = settings.normalized_ollama_url
        req = urllib.request.Request(
            f"{url}/api/pull",
            method="POST",
            headers={"Content-Type": "application/json"},
            data=json.dumps({"name": model_name, "stream": False}).encode(),
        )

        with urllib.request.urlopen(req, timeout=300) as response:
            if response.status == 200:
                logger.info("Successfully pulled model: %s", model_name)
                return True
            else:
                logger.error("Failed to pull %s: HTTP %s", model_name, response.status)
                return False

    except Exception as exc:
        logger.error("Error pulling model %s: %s", model_name, exc)
        return False


def _resolve_model_with_fallback(
    requested_model: str | None = None,
    require_vision: bool = False,
    auto_pull: bool = True,
) -> tuple[str, bool]:
    """Resolve model with automatic pulling and fallback.

    Args:
        requested_model: Preferred model to use
        require_vision: Whether the model needs vision capabilities
        auto_pull: Whether to attempt pulling missing models

    Returns:
        Tuple of (model_name, is_fallback)
    """
    model = requested_model or settings.ollama_model

    # Check if requested model is available
    if _check_model_available(model):
        logger.debug("Using available model: %s", model)
        return model, False

    # Try to pull the requested model
    if auto_pull:
        logger.info("Model %s not available locally, attempting to pull...", model)
        if _pull_model(model):
            return model, False
        logger.warning("Failed to pull %s, checking fallbacks...", model)

    # Use appropriate configurable fallback chain (from settings / env vars)
    fallback_chain = settings.vision_fallback_models if require_vision else settings.fallback_models

    for fallback_model in fallback_chain:
        if _check_model_available(fallback_model):
            logger.warning("Using fallback model %s (requested: %s)", fallback_model, model)
            return fallback_model, True

        # Try to pull the fallback
        if auto_pull and _pull_model(fallback_model):
            logger.info("Pulled and using fallback model %s (requested: %s)", fallback_model, model)
            return fallback_model, True

    # Absolute last resort - return the requested model and hope for the best
    logger.error("No models available in fallback chain. Requested: %s", model)
    return model, False


def _model_supports_tools(model_name: str) -> bool:
    """Check if the configured model can reliably handle tool calling.

    Small models (< 7B) tend to hallucinate tool calls as text or invoke
    them randomly.  For these models, it's better to run tool-free and let
    the model answer directly from its training data.
    """
    model_lower = model_name.lower()
    for pattern in _SMALL_MODEL_PATTERNS:
        if pattern in model_lower:
            return False
    return True


def _warmup_model(model_name: str) -> bool:
    """Warm up an Ollama model by sending a minimal generation request.

    This prevents 'Server disconnected' errors on first request after cold model load.
    Cold loads can take 30-40s, so we use a 60s timeout.

    Args:
        model_name: Name of the Ollama model to warm up

    Returns:
        True if warmup succeeded, False otherwise (does not raise)
    """
    try:
        response = httpx.post(
            f"{settings.ollama_url}/api/generate",
            json={"model": model_name, "prompt": "hi", "options": {"num_predict": 1}},
            timeout=60.0,
        )
        response.raise_for_status()
        logger.info("Model %s warmed up successfully", model_name)
        return True
    except Exception as exc:
        logger.warning("Model warmup failed: %s — first request may disconnect", exc)
        return False


def _resolve_backend(requested: str | None) -> str:
    """Return the backend name to use.

    Priority (highest -> lowest):
      1. CLI flag passed directly to create_timmy()
      2. TIMMY_MODEL_BACKEND env var / .env setting
      3. 'ollama' (safe default -- no surprises)
    """
    if requested is not None:
        return requested

    return settings.timmy_model_backend  # "ollama" | "grok" | "claude"


def _build_tools_list(use_tools: bool, skip_mcp: bool, model_name: str) -> list:
    """Assemble the tools list based on model capability and MCP flags.

    Returns a list of Toolkit / MCPTools objects, or an empty list.
    """
    if not use_tools:
        logger.info("Tools disabled for model %s (too small for reliable tool calling)", model_name)
        return []

    tools_list: list = [create_full_toolkit()]

    # Add MCP tool servers (lazy-connected on first arun()).
    # Skipped when skip_mcp=True — MCP's stdio transport uses anyio cancel
    # scopes that conflict with asyncio background task cancellation (#72).
    if not skip_mcp:
        try:
            from timmy.mcp_tools import create_filesystem_mcp_tools, create_gitea_mcp_tools

            gitea_mcp = create_gitea_mcp_tools()
            if gitea_mcp:
                tools_list.append(gitea_mcp)

            fs_mcp = create_filesystem_mcp_tools()
            if fs_mcp:
                tools_list.append(fs_mcp)
        except Exception as exc:
            logger.debug("MCP tools unavailable: %s", exc)

    return tools_list


def _build_prompt(use_tools: bool, session_id: str) -> str:
    """Build the full system prompt with optional memory context."""
    base_prompt = get_system_prompt(tools_enabled=use_tools, session_id=session_id)

    try:
        from timmy.memory_system import memory_system

        memory_context = memory_system.get_system_context()
        if memory_context:
            # Smaller budget for small models — expanded prompt uses more tokens
            max_context = 2000 if not use_tools else 8000
            if len(memory_context) > max_context:
                memory_context = memory_context[:max_context] + "\n... [truncated]"
            return (
                f"{base_prompt}\n\n"
                f"## GROUNDED CONTEXT (verified sources — cite when using)\n\n"
                f"{memory_context}"
            )
    except Exception as exc:
        logger.warning("Failed to load memory context: %s", exc)

    return base_prompt


def _create_ollama_agent(
    *,
    db_file: str,
    model_name: str,
    tools_list: list,
    full_prompt: str,
    use_tools: bool,
) -> Agent:
    """Construct the Agno Agent with Ollama backend and warm up the model."""
    model_kwargs = {}
    if settings.ollama_num_ctx > 0:
        model_kwargs["options"] = {"num_ctx": settings.ollama_num_ctx}

    agent = Agent(
        name="Agent",
        model=Ollama(id=model_name, host=settings.ollama_url, timeout=300, **model_kwargs),
        db=SqliteDb(db_file=db_file),
        description=full_prompt,
        add_history_to_context=True,
        num_history_runs=20,
        markdown=False,
        tools=tools_list if tools_list else None,
        tool_call_limit=settings.max_agent_steps if use_tools else None,
        telemetry=settings.telemetry_enabled,
    )
    _warmup_model(model_name)
    return agent


def create_timmy(
    db_file: str = "timmy.db",
    backend: str | None = None,
    *,
    skip_mcp: bool = False,
    session_id: str = "unknown",
) -> TimmyAgent:
    """Instantiate the agent — Ollama, Grok, or Claude.

    Args:
        db_file:    SQLite file for Agno conversation memory (Ollama path only).
        backend:    "ollama" | "grok" | "claude" | None (reads config/env).
        skip_mcp:   If True, omit MCP tool servers (Gitea, filesystem).
                    Use for background tasks (thinking, QA) where MCP's
                    stdio cancel-scope lifecycle conflicts with asyncio
                    task cancellation.

    Returns an Agno Agent or backend-specific agent — all expose
    print_response(message, stream).
    """
    resolved = _resolve_backend(backend)

    if resolved == "claude":
        from timmy.backends import ClaudeBackend

        return ClaudeBackend()

    if resolved == "grok":
        from timmy.backends import GrokBackend

        return GrokBackend()

    # Default: Ollama via Agno.
    model_name, is_fallback = _resolve_model_with_fallback(
        requested_model=None,
        require_vision=False,
        auto_pull=True,
    )

    if not _check_model_available(model_name):
        logger.error(
            "Ollama unreachable and no local models available. "
            "Start Ollama with 'ollama serve' or use --backend claude explicitly."
        )

    if is_fallback:
        logger.info("Using fallback model %s (requested was unavailable)", model_name)

    use_tools = _model_supports_tools(model_name)
    tools_list = _build_tools_list(use_tools, skip_mcp, model_name)
    full_prompt = _build_prompt(use_tools, session_id)

    return _create_ollama_agent(
        db_file=db_file,
        model_name=model_name,
        tools_list=tools_list,
        full_prompt=full_prompt,
        use_tools=use_tools,
    )


class TimmyWithMemory:
    """Agent wrapper with explicit three-tier memory management."""

    def __init__(self, db_file: str = "timmy.db") -> None:
        from timmy.memory_system import memory_system

        self.agent = create_timmy(db_file=db_file)
        self.memory = memory_system
        self.session_active = True

        # Store initial context for reference
        self.initial_context = self.memory.get_system_context()

    def chat(self, message: str) -> str:
        """Simple chat interface that tracks in memory.

        Retries on transient Ollama errors (GPU contention, timeouts)
        with exponential backoff (#70).
        """
        import time

        # Check for user facts to extract
        self._extract_and_store_facts(message)

        # Retry with backoff — GPU contention causes ReadError/ReadTimeout
        max_retries = 3
        for attempt in range(1, max_retries + 1):
            try:
                result = self.agent.run(message, stream=False)
                return result.content if hasattr(result, "content") else str(result)
            except (
                httpx.ConnectError,
                httpx.ReadError,
                httpx.ReadTimeout,
                httpx.ConnectTimeout,
                ConnectionError,
                TimeoutError,
            ) as exc:
                if attempt < max_retries:
                    wait = min(2**attempt, 16)
                    logger.warning(
                        "Ollama contention on attempt %d/%d: %s. Waiting %ds before retry...",
                        attempt,
                        max_retries,
                        type(exc).__name__,
                        wait,
                    )
                    time.sleep(wait)
                else:
                    logger.error(
                        "Ollama unreachable after %d attempts: %s",
                        max_retries,
                        exc,
                    )
                    raise

    def _extract_and_store_facts(self, message: str) -> None:
        """Extract user facts from message and store in memory."""
        try:
            from timmy.conversation import conversation_manager

            name = conversation_manager.extract_user_name(message)
            if name:
                self.memory.update_user_fact("Name", name)
                self.memory.record_decision(f"Learned user's name: {name}")
        except Exception as exc:
            logger.warning("User name extraction failed: %s", exc)
            pass  # Best-effort extraction

    def end_session(self, summary: str = "Session completed") -> None:
        """End session and write handoff."""
        if self.session_active:
            self.memory.end_session(summary)
            self.session_active = False

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end_session()
        return False