"""Agent creation with three-tier memory system.

Memory Architecture:
- Tier 1 (Hot): MEMORY.md — always loaded, ~300 lines
- Tier 2 (Vault): memory/ — structured markdown, append-only
- Tier 3 (Semantic): Vector search over vault files

Model Management:
- Pulls requested model automatically if not available
- Falls back through capability-based model chains
- Multi-modal support with vision model fallbacks

Handoff Protocol maintains continuity across sessions.
"""

import logging
from typing import TYPE_CHECKING, Union

import httpx
from agno.agent import Agent
from agno.db.sqlite import SqliteDb
from agno.models.ollama import Ollama

from config import check_ollama_model_available, settings
from timmy.prompts import get_system_prompt
from timmy.tools import create_full_toolkit

if TYPE_CHECKING:
    from timmy.backends import ClaudeBackend, GrokBackend

logger = logging.getLogger(__name__)

# Union type for callers that want to hint the return type.
TimmyAgent = Union[Agent, "GrokBackend", "ClaudeBackend"]

# Models known to be too small for reliable tool calling.
# These hallucinate tool calls as text, invoke tools randomly,
# and leak raw JSON into responses.
_SMALL_MODEL_PATTERNS = (
    "llama3.2",
    "phi-3",
    "gemma:2b",
    "tinyllama",
    "qwen2:0.5b",
    "qwen2:1.5b",
)


def _check_model_available(model_name: str) -> bool:
    """Check if an Ollama model is available locally."""
    return check_ollama_model_available(model_name)


def _pull_model(model_name: str) -> bool:
    """Attempt to pull a model from Ollama.

    Sends a non-streaming POST to ``<ollama url>/api/pull`` and waits up to
    300 seconds for the response.

    Args:
        model_name: Name of the Ollama model to pull.

    Returns:
        True if successful or model already exists, False on any failure.
        Never raises: every exception is logged and reported as False.
    """
    try:
        import json
        import urllib.request

        logger.info("Pulling model: %s", model_name)

        url = settings.normalized_ollama_url
        req = urllib.request.Request(
            f"{url}/api/pull",
            method="POST",
            headers={"Content-Type": "application/json"},
            data=json.dumps({"name": model_name, "stream": False}).encode(),
        )

        # urlopen raises HTTPError for error statuses, so those land in the
        # ``except`` below; the ``else`` branch only covers non-200 successes.
        with urllib.request.urlopen(req, timeout=300) as response:
            if response.status == 200:
                logger.info("Successfully pulled model: %s", model_name)
                return True
            else:
                logger.error("Failed to pull %s: HTTP %s", model_name, response.status)
                return False
    except Exception as exc:
        logger.error("Error pulling model %s: %s", model_name, exc)
        return False


def _resolve_model_with_fallback(
    requested_model: str | None = None,
    require_vision: bool = False,
    auto_pull: bool = True,
) -> tuple[str, bool]:
    """Resolve model with automatic pulling and fallback.

    Resolution order:
    1. The requested model (or ``settings.ollama_model``) if already available.
    2. The requested model after a successful pull (when ``auto_pull``).
    3. The first entry of the fallback chain that is available, or that can
       be pulled (when ``auto_pull``).
    4. The requested model as a last resort, even though it is unavailable.

    Args:
        requested_model: Preferred model to use
        require_vision: Whether the model needs vision capabilities
        auto_pull: Whether to attempt pulling missing models

    Returns:
        Tuple of (model_name, is_fallback)
    """
    model = requested_model or settings.ollama_model

    # Check if requested model is available
    if _check_model_available(model):
        logger.debug("Using available model: %s", model)
        return model, False

    # Try to pull the requested model
    if auto_pull:
        logger.info("Model %s not available locally, attempting to pull...", model)
        if _pull_model(model):
            return model, False
        logger.warning("Failed to pull %s, checking fallbacks...", model)

    # Use appropriate configurable fallback chain (from settings / env vars)
    fallback_chain = (
        settings.vision_fallback_models if require_vision else settings.fallback_models
    )

    for fallback_model in fallback_chain:
        # The requested model was just found unavailable (and, with auto_pull,
        # unpullable); retrying it here would only repeat a slow failed pull.
        if fallback_model == model:
            continue

        if _check_model_available(fallback_model):
            logger.warning("Using fallback model %s (requested: %s)", fallback_model, model)
            return fallback_model, True

        # Try to pull the fallback
        if auto_pull and _pull_model(fallback_model):
            logger.info(
                "Pulled and using fallback model %s (requested: %s)",
                fallback_model,
                model,
            )
            return fallback_model, True

    # Absolute last resort - return the requested model and hope for the best
    logger.error("No models available in fallback chain. Requested: %s", model)
    return model, False


def _model_supports_tools(model_name: str) -> bool:
    """Check if the configured model can reliably handle tool calling.

    Small models (< 7B) tend to hallucinate tool calls as text or invoke
    them randomly. For these models, it's better to run tool-free and let
    the model answer directly from its training data.

    The check is a case-insensitive substring match of ``model_name``
    against ``_SMALL_MODEL_PATTERNS``.

    Args:
        model_name: Name of the model to check.

    Returns:
        False if the name matches a known-small pattern, True otherwise.
    """
    model_lower = model_name.lower()
    return not any(pattern in model_lower for pattern in _SMALL_MODEL_PATTERNS)


def _warmup_model(model_name: str) -> bool:
    """Warm up an Ollama model by sending a minimal generation request.

    This prevents 'Server disconnected' errors on first request after cold
    model load. Cold loads can take 30-40s, so we use a 60s timeout.

    Args:
        model_name: Name of the Ollama model to warm up

    Returns:
        True if warmup succeeded, False otherwise (does not raise)
    """
    try:
        # NOTE(review): this uses settings.ollama_url while _pull_model uses
        # settings.normalized_ollama_url — confirm whether that is intended.
        response = httpx.post(
            f"{settings.ollama_url}/api/generate",
            json={"model": model_name, "prompt": "hi", "options": {"num_predict": 1}},
            timeout=60.0,
        )
        response.raise_for_status()
        logger.info("Model %s warmed up successfully", model_name)
        return True
    except Exception as exc:
        logger.warning("Model warmup failed: %s — first request may disconnect", exc)
        return False


def _resolve_backend(requested: str | None) -> str:
    """Return the backend name to use.

    Priority (highest -> lowest):
      1. CLI flag passed directly to create_timmy()
      2. TIMMY_MODEL_BACKEND env var / .env setting
      3. 'ollama' (safe default -- no surprises)

    Args:
        requested: Explicit backend name, or None to defer to settings.

    Returns:
        ``requested`` when it is not None, else ``settings.timmy_model_backend``.
    """
    if requested is not None:
        return requested

    return settings.timmy_model_backend  # "ollama" | "grok" | "claude"


def _build_tools_list(use_tools: bool, skip_mcp: bool, model_name: str) -> list:
    """Assemble the tools list based on model capability and MCP flags.

    Args:
        use_tools: Whether the model should be given tools at all.
        skip_mcp: If True, do not add the Gitea / filesystem MCP tool servers.
        model_name: Model name, used only in the "tools disabled" log message.

    Returns a list of Toolkit / MCPTools objects, or an empty list.
    """
    if not use_tools:
        logger.info("Tools disabled for model %s (too small for reliable tool calling)", model_name)
        return []

    tools_list: list = [create_full_toolkit()]

    # Add MCP tool servers (lazy-connected on first arun()).
    # Skipped when skip_mcp=True — MCP's stdio transport uses anyio cancel
    # scopes that conflict with asyncio background task cancellation (#72).
    if not skip_mcp:
        try:
            from timmy.mcp_tools import create_filesystem_mcp_tools, create_gitea_mcp_tools

            gitea_mcp = create_gitea_mcp_tools()
            if gitea_mcp:
                tools_list.append(gitea_mcp)

            fs_mcp = create_filesystem_mcp_tools()
            if fs_mcp:
                tools_list.append(fs_mcp)
        except Exception as exc:
            # MCP is optional: keep whatever was assembled so far.
            logger.debug("MCP tools unavailable: %s", exc)

    return tools_list


def _build_prompt(use_tools: bool, session_id: str) -> str:
    """Build the full system prompt with optional memory context.

    Args:
        use_tools: Whether tools are enabled; forwarded to the prompt builder
            and used to choose the memory-context size budget.
        session_id: Session identifier forwarded to the prompt builder.

    Returns:
        The base system prompt, followed by a "GROUNDED CONTEXT" section when
        memory context is available. If loading the memory context fails, a
        warning is logged and the base prompt is returned on its own.
    """
    base_prompt = get_system_prompt(tools_enabled=use_tools, session_id=session_id)

    try:
        from timmy.memory_system import memory_system

        memory_context = memory_system.get_system_context()
        if memory_context:
            # Smaller budget for small models — expanded prompt uses more tokens
            max_context = 2000 if not use_tools else 8000
            if len(memory_context) > max_context:
                memory_context = memory_context[:max_context] + "\n... [truncated]"
            return (
                f"{base_prompt}\n\n"
                f"## GROUNDED CONTEXT (verified sources — cite when using)\n\n"
                f"{memory_context}"
            )
    except Exception as exc:
        logger.warning("Failed to load memory context: %s", exc)

    return base_prompt


def _create_ollama_agent(
    *,
    db_file: str,
    model_name: str,
    tools_list: list,
    full_prompt: str,
    use_tools: bool,
) -> Agent:
    """Construct the Agno Agent with Ollama backend and warm up the model.

    Args:
        db_file: SQLite file backing the agent's conversation store.
        model_name: Ollama model id to run.
        tools_list: Tools to attach; an empty list is passed on as None.
        full_prompt: System prompt, passed as the agent description.
        use_tools: When True, ``settings.max_agent_steps`` caps tool calls.

    Returns:
        The constructed Agent. A warmup failure does not prevent the return.
    """
    model_kwargs = {}
    # Only override the context window when a positive value is configured.
    if settings.ollama_num_ctx > 0:
        model_kwargs["options"] = {"num_ctx": settings.ollama_num_ctx}

    agent = Agent(
        name="Agent",
        model=Ollama(id=model_name, host=settings.ollama_url, timeout=300, **model_kwargs),
        db=SqliteDb(db_file=db_file),
        description=full_prompt,
        add_history_to_context=True,
        num_history_runs=20,
        markdown=False,
        tools=tools_list if tools_list else None,
        tool_call_limit=settings.max_agent_steps if use_tools else None,
        telemetry=settings.telemetry_enabled,
    )
    _warmup_model(model_name)
    return agent


def create_timmy(
    db_file: str = "timmy.db",
    backend: str | None = None,
    *,
    skip_mcp: bool = False,
    session_id: str = "unknown",
) -> TimmyAgent:
    """Instantiate the agent — Ollama, Grok, or Claude.

    Args:
        db_file: SQLite file for Agno conversation memory (Ollama path only).
        backend: "ollama" | "grok" | "claude" | None (reads config/env).
        skip_mcp: If True, omit MCP tool servers (Gitea, filesystem).
                  Use for background tasks (thinking, QA) where MCP's
                  stdio cancel-scope lifecycle conflicts with asyncio
                  task cancellation.
        session_id: Session identifier passed through to the system prompt.

    Returns an Agno Agent or backend-specific agent — all expose
    print_response(message, stream).
    """
    resolved = _resolve_backend(backend)

    if resolved == "claude":
        from timmy.backends import ClaudeBackend

        return ClaudeBackend()

    if resolved == "grok":
        from timmy.backends import GrokBackend

        return GrokBackend()

    # Default: Ollama via Agno.
    model_name, is_fallback = _resolve_model_with_fallback(
        requested_model=None,
        require_vision=False,
        auto_pull=True,
    )

    # Resolution can hand back an unavailable model as a last resort; log it
    # loudly but still build the agent.
    if not _check_model_available(model_name):
        logger.error(
            "Ollama unreachable and no local models available. "
            "Start Ollama with 'ollama serve' or use --backend claude explicitly."
        )

    if is_fallback:
        logger.info("Using fallback model %s (requested was unavailable)", model_name)

    use_tools = _model_supports_tools(model_name)
    tools_list = _build_tools_list(use_tools, skip_mcp, model_name)
    full_prompt = _build_prompt(use_tools, session_id)

    return _create_ollama_agent(
        db_file=db_file,
        model_name=model_name,
        tools_list=tools_list,
        full_prompt=full_prompt,
        use_tools=use_tools,
    )


class TimmyWithMemory:
    """Agent wrapper with explicit three-tier memory management.

    Usable as a context manager: leaving the ``with`` block ends the session.
    """

    def __init__(self, db_file: str = "timmy.db") -> None:
        """Create the agent and capture the initial memory context.

        Args:
            db_file: SQLite file passed to ``create_timmy``.
        """
        from timmy.memory_system import memory_system

        self.agent = create_timmy(db_file=db_file)
        self.memory = memory_system
        self.session_active = True

        # Store initial context for reference
        self.initial_context = self.memory.get_system_context()

    def chat(self, message: str) -> str:
        """Simple chat interface that tracks in memory.

        Retries on transient Ollama errors (GPU contention, timeouts)
        with exponential backoff (#70).

        Args:
            message: User message to send to the agent.

        Returns:
            The run result's ``content`` attribute when present, otherwise
            ``str(result)``.

        Raises:
            The last connection/timeout exception once all 3 attempts fail.
        """
        import time

        # Check for user facts to extract
        self._extract_and_store_facts(message)

        # Retry with backoff — GPU contention causes ReadError/ReadTimeout
        max_retries = 3
        for attempt in range(1, max_retries + 1):
            try:
                result = self.agent.run(message, stream=False)
                return result.content if hasattr(result, "content") else str(result)
            except (
                httpx.ConnectError,
                httpx.ReadError,
                httpx.ReadTimeout,
                httpx.ConnectTimeout,
                ConnectionError,
                TimeoutError,
            ) as exc:
                if attempt < max_retries:
                    # 2s, then 4s; the 16s cap only matters if retries grow.
                    wait = min(2**attempt, 16)
                    logger.warning(
                        "Ollama contention on attempt %d/%d: %s. "
                        "Waiting %ds before retry...",
                        attempt,
                        max_retries,
                        type(exc).__name__,
                        wait,
                    )
                    time.sleep(wait)
                else:
                    logger.error(
                        "Ollama unreachable after %d attempts: %s",
                        max_retries,
                        exc,
                    )
                    raise

    def _extract_and_store_facts(self, message: str) -> None:
        """Extract user facts from message and store in memory.

        Best-effort: any failure is logged as a warning and swallowed so
        that it never blocks the chat itself.
        """
        try:
            from timmy.conversation import conversation_manager

            name = conversation_manager.extract_user_name(message)
            if name:
                self.memory.update_user_fact("Name", name)
                self.memory.record_decision(f"Learned user's name: {name}")
        except Exception as exc:
            logger.warning("User name extraction failed: %s", exc)

    def end_session(self, summary: str = "Session completed") -> None:
        """End session and write handoff.

        Does nothing if the session has already been ended.
        """
        if self.session_active:
            self.memory.end_session(summary)
            self.session_active = False

    def __enter__(self):
        """Return this wrapper for use inside a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """End the session; return False so exceptions propagate."""
        self.end_session()
        return False