# Timmy-time-dashboard/src/config.py

import logging as _logging
import os
import sys
from datetime import UTC
from datetime import datetime as _datetime
from typing import Literal

from pydantic_settings import BaseSettings, SettingsConfigDict

# Timezone-aware (UTC) timestamp captured once, when this module is first imported.
APP_START_TIME: _datetime = _datetime.now(UTC)


def normalize_ollama_url(url: str) -> str:
    """Replace a ``localhost`` host with ``127.0.0.1`` to avoid IPv6 resolution delays.

    Only the host component is rewritten.  Hosts that merely contain the word
    (``mylocalhost.example.com``, ``localhost.example.com``) and occurrences in
    the path or query string are left untouched, so unrelated URLs are never
    corrupted.

    Args:
        url: Ollama base URL, with or without a scheme
            (``http://localhost:11434`` or ``localhost:11434``).

    Returns:
        The URL with a ``localhost`` host swapped for ``127.0.0.1``; any other
        input is returned unchanged.
    """
    import re

    # Group 1 captures everything that may legally precede the host:
    # leading whitespace, an optional "scheme://" (or bare "//") and optional
    # "user:pass@" credentials.  The look-ahead requires the host to end
    # right after "localhost" (port, path, query, fragment or end of string).
    return re.sub(
        r"^(\s*(?:(?:[a-z][a-z0-9+.\-]*:)?//)?(?:[^/?#@]*@)?)localhost(?=[:/?#]|$)",
        r"\g<1>127.0.0.1",
        url,
        count=1,
        flags=re.IGNORECASE,
    )


class Settings(BaseSettings):
    """Central configuration — all env-var access goes through this class.

    Every field below can be overridden by an environment variable or by an
    entry in the ``.env`` file (see ``model_config`` at the bottom of the
    class); unknown variables are ignored.  After validation,
    ``model_post_init`` fills ``gitea_token`` from a token file when it was
    not supplied explicitly.
    """

    # Display name for the primary agent — override with AGENT_NAME env var
    agent_name: str = "Agent"

    # Ollama host — override with OLLAMA_URL env var or .env file
    ollama_url: str = "http://localhost:11434"

    @property
    def normalized_ollama_url(self) -> str:
        """Return ollama_url with localhost replaced by 127.0.0.1."""
        return normalize_ollama_url(self.ollama_url)

    # LLM model passed to Agno/Ollama — override with OLLAMA_MODEL
    # qwen3:14b (Q5_K_M) is the primary model: tool calling F1 0.971, ~17.5 GB
    # at 32K context — optimal for M3 Max 36 GB (Issue #1063).
    # qwen3:30b exceeded memory budget at 32K+ context on 36 GB hardware.
    ollama_model: str = "qwen3:14b"

    # Fast routing model — override with OLLAMA_FAST_MODEL
    # qwen3:8b (Q6_K): tool calling F1 0.933 at ~45-55 tok/s (2x speed of 14B).
    # Use for routine tasks: simple tool calls, file reads, status checks.
    # Combined memory with qwen3:14b: ~17 GB — both can stay loaded simultaneously.
    # NOTE(review): the ~17 GB combined figure looks inconsistent with the
    # ~17.5 GB quoted for qwen3:14b alone at 32K context — confirm the numbers.
    ollama_fast_model: str = "qwen3:8b"

    # Maximum models loaded simultaneously in Ollama — override with OLLAMA_MAX_LOADED_MODELS
    # Set to 2 so qwen3:8b (fast) and qwen3:14b (primary) can stay hot
    # concurrently (~17 GB combined).
    # Requires Ollama ≥ 0.1.33.  Export this to the Ollama process environment:
    #   OLLAMA_MAX_LOADED_MODELS=2 ollama serve
    # or add it to your systemd/launchd unit before starting the harness.
    ollama_max_loaded_models: int = 2

    # Context window size for Ollama inference — override with OLLAMA_NUM_CTX
    # qwen3:14b at 32K: ~17.5 GB total (weights + KV cache) on M3 Max 36 GB.
    # Set to 0 to use model defaults.
    ollama_num_ctx: int = 32768

    # Fallback model chains — override with FALLBACK_MODELS / VISION_FALLBACK_MODELS
    # as comma-separated strings, e.g. FALLBACK_MODELS="qwen3:8b,qwen2.5:14b"
    # Or edit config/providers.yaml → fallback_chains for the canonical source.
    fallback_models: list[str] = [
        "qwen3:8b",
        "qwen2.5:14b",
        "qwen2.5:7b",
        "llama3.1:8b-instruct",
        "llama3.1",
        "llama3.2:3b",
    ]
    vision_fallback_models: list[str] = [
        "llama3.2:3b",
        "llava:7b",
        "qwen2.5-vl:3b",
        "moondream:1.8b",
    ]

    # Set DEBUG=true to enable /docs and /redoc (disabled by default)
    debug: bool = False

    # Telegram bot token — set via TELEGRAM_TOKEN env var or the /telegram/setup endpoint
    telegram_token: str = ""

    # Discord bot token — set via DISCORD_TOKEN env var or the /discord/setup endpoint
    discord_token: str = ""

    # ── Discord action confirmation ──────────────────────────────────────────
    # When True, dangerous tools (shell, write_file, python) require user
    # confirmation via Discord button before executing.
    discord_confirm_actions: bool = True
    # Seconds to wait for user confirmation before auto-rejecting.
    discord_confirm_timeout: int = 120

    # ── Backend selection ────────────────────────────────────────────────────
    # "ollama"  — always use Ollama (default, safe everywhere)
    # "auto"    — pick best available local backend, fall back to Ollama
    # "grok" / "claude" — presumably select the cloud backends configured in
    #             the sections below; confirm against the backend factory.
    timmy_model_backend: Literal["ollama", "grok", "claude", "auto"] = "ollama"

    # ── Grok (xAI) — opt-in premium cloud backend ────────────────────────
    # Grok is a premium augmentation layer — local-first ethos preserved.
    # Only used when explicitly enabled and query complexity warrants it.
    grok_enabled: bool = False
    xai_api_key: str = ""
    xai_base_url: str = "https://api.x.ai/v1"
    grok_default_model: str = "grok-3-fast"
    # NOTE(review): the default per-query maximum (200) exceeds the default
    # hard cap (100) declared on the next line — confirm which one wins.
    grok_max_sats_per_query: int = 200
    grok_sats_hard_cap: int = 100  # Absolute ceiling on sats per Grok query
    grok_free: bool = False  # Skip Lightning invoice when user has own API key

    # ── Database ──────────────────────────────────────────────────────────
    db_busy_timeout_ms: int = 5000  # SQLite PRAGMA busy_timeout (ms)

    # ── Claude (Anthropic) — cloud fallback backend ────────────────────────
    # Used when Ollama is offline and local inference isn't available.
    # Set ANTHROPIC_API_KEY to enable.  Default model is Haiku (fast + cheap).
    anthropic_api_key: str = ""
    claude_model: str = "haiku"

    # ── Content Moderation ──────────────────────────────────────────────
    # Three-layer moderation pipeline for AI narrator output.
    # Uses Llama Guard via Ollama with regex fallback.
    moderation_enabled: bool = True
    moderation_guard_model: str = "llama-guard3:1b"
    # Default confidence threshold — per-game profiles can override.
    moderation_threshold: float = 0.8

    # ── Spark Intelligence ────────────────────────────────────────────────
    # Enable/disable the Spark cognitive layer.
    # When enabled, Spark captures swarm events, runs EIDOS predictions,
    # consolidates memories, and generates advisory recommendations.
    spark_enabled: bool = True

    # ── Git / DevOps ──────────────────────────────────────────────────────
    git_default_repo_dir: str = "~/repos"

    # Repository root - auto-detected but can be overridden
    # This is the main project directory where .git lives
    repo_root: str = ""

    # ── Creative — Image Generation (Pixel) ───────────────────────────────
    flux_model_id: str = "black-forest-labs/FLUX.1-schnell"
    image_output_dir: str = "data/images"
    image_default_steps: int = 4

    # ── Creative — Music Generation (Lyra) ────────────────────────────────
    music_output_dir: str = "data/music"
    ace_step_model: str = "ace-step/ACE-Step-v1.5"

    # ── Creative — Video Generation (Reel) ────────────────────────────────
    video_output_dir: str = "data/video"
    wan_model_id: str = "Wan-AI/Wan2.1-T2V-1.3B"
    video_default_resolution: str = "480p"

    # ── Creative — Pipeline / Assembly ────────────────────────────────────
    creative_output_dir: str = "data/creative"
    video_transition_duration: float = 1.0
    default_video_codec: str = "libx264"

    # ── L402 Lightning ───────────────────────────────────────────────────
    # HMAC secrets for macaroon signing and invoice verification.
    # Generate with: python3 -c "import secrets; print(secrets.token_hex(32))"
    # In production (TIMMY_ENV=production), these MUST be set or the app will refuse to start.
    l402_hmac_secret: str = ""
    l402_macaroon_secret: str = ""
    lightning_backend: Literal["mock", "lnd"] = "mock"

    # ── Privacy / Sovereignty ────────────────────────────────────────────
    # Disable Agno telemetry for air-gapped/sovereign deployments.
    # Default is False (telemetry disabled) to align with sovereign AI vision.
    telemetry_enabled: bool = False

    # ── Sovereignty Metrics ──────────────────────────────────────────────
    # Alert when API cost per research task exceeds this threshold (USD).
    sovereignty_api_cost_alert_threshold: float = 1.00

    # CORS allowed origins for the web chat interface (Gitea Pages, etc.)
    # Set CORS_ORIGINS as a comma-separated list, e.g. "http://localhost:3000,https://example.com"
    cors_origins: list[str] = [
        "http://localhost:3000",
        "http://localhost:8000",
        "http://127.0.0.1:3000",
        "http://127.0.0.1:8000",
    ]

    # ── Matrix Frontend Integration ────────────────────────────────────────
    # URL of the Matrix frontend (Replit/Tailscale) for CORS.
    # When set, this origin is added to CORS allowed_origins.
    # Example: "http://100.124.176.28:8080" or "https://alexanderwhitestone.com"
    matrix_frontend_url: str = ""  # Empty = disabled

    # WebSocket authentication token for Matrix connections.
    # When set, clients must provide this token via ?token= query param
    # or in the first message as {"type": "auth", "token": "..."}.
    # Empty/unset = auth disabled (dev mode).
    matrix_ws_token: str = ""

    # Trusted hosts for the Host header check (TrustedHostMiddleware).
    # Set TRUSTED_HOSTS as a comma-separated list. Wildcards supported (e.g. "*.ts.net").
    # Defaults include localhost + Tailscale MagicDNS. Add your Tailscale IP if needed.
    trusted_hosts: list[str] = [
        "localhost",
        "127.0.0.1",
        "*.local",
        "*.ts.net",
        "testserver",
    ]

    # Environment mode: development | production
    # In production, security settings are strictly enforced.
    timmy_env: Literal["development", "production"] = "development"

    # ── Memory Management ──────────────────────────────────────────────
    # Auto-prune vector store memories older than this many days on startup.
    # Set to 0 to disable auto-pruning.
    memory_prune_days: int = 90
    # When True, fact-type memories are kept even when older than the TTL.
    memory_prune_keep_facts: bool = True
    # Maximum size in MB for the memory/notes/ vault directory.
    # When exceeded, a warning is logged. Set to 0 to disable.
    memory_vault_max_mb: int = 100
    # Auto-prune thoughts older than this many days. 0 = disabled.
    thoughts_prune_days: int = 90
    # Minimum thoughts to keep regardless of age.
    thoughts_prune_keep_min: int = 200
    # Auto-prune system events older than this many days. 0 = disabled.
    events_prune_days: int = 90
    # Minimum events to keep regardless of age.
    events_prune_keep_min: int = 200

    # ── Agentic Loop ──────────────────────────────────────────────────
    # Maximum steps the agentic loop will execute before stopping.
    max_agent_steps: int = 10

    # ── Test / Diagnostics ─────────────────────────────────────────────
    # Skip loading heavy embedding models (for tests / low-memory envs).
    timmy_skip_embeddings: bool = False
    # Embedding backend: "ollama" for Ollama, "local" for sentence-transformers.
    timmy_embedding_backend: Literal["ollama", "local"] = "local"
    # Ollama model to use for embeddings (e.g., "nomic-embed-text").
    ollama_embedding_model: str = "nomic-embed-text"
    # Disable CSRF middleware entirely (for tests).
    timmy_disable_csrf: bool = False
    # Mark the process as running in test mode.
    timmy_test_mode: bool = False

    # ── Brain / rqlite ─────────────────────────────────────────────────
    # URL of the local rqlite node for distributed memory.
    # Empty string means rqlite is not configured.
    rqlite_url: str = ""
    # Source identifier for brain memory entries.
    brain_source: str = "default"
    # Path override for the local brain SQLite database.
    brain_db_path: str = ""

    # ── Security Tuning ───────────────────────────────────────────────
    # Set to True in production to mark CSRF cookies as Secure (HTTPS only).
    csrf_cookie_secure: bool = False
    # Maximum size in bytes for chat API request bodies.
    chat_api_max_body_bytes: int = 1_048_576  # 1 MB

    # ── Self-Modification ──────────────────────────────────────────────
    # Enable self-modification capabilities. When enabled, the agent can
    # edit its own source code, run tests, and commit changes.
    self_modify_enabled: bool = False
    self_modify_max_retries: int = 2
    self_modify_allowed_dirs: str = "src,tests"
    self_modify_backend: str = "auto"  # "ollama", "anthropic", or "auto"

    # ── Work Orders ──────────────────────────────────────────────────
    # External users and agents can submit work orders for improvements.
    work_orders_enabled: bool = True
    work_orders_auto_execute: bool = False  # Master switch for auto-execution
    work_orders_auto_threshold: str = (
        "low"  # Max priority that auto-executes: "low" | "medium" | "high" | "none"
    )

    # ── Custom Weights & Models ──────────────────────────────────────
    # Directory for custom model weights (GGUF, safetensors, HF checkpoints).
    # Models placed here can be registered at runtime and assigned to agents.
    custom_weights_dir: str = "data/models"
    # Enable the reward model for scoring agent outputs (PRM-style).
    reward_model_enabled: bool = False
    # Reward model name (must be available via Ollama or a custom weight path).
    reward_model_name: str = ""
    # Minimum votes for majority-vote reward scoring (odd number recommended).
    reward_model_votes: int = 3

    # ── Browser Local Models (iPhone / WebGPU) ───────────────────────
    # Enable in-browser LLM inference via WebLLM for offline iPhone use.
    # When enabled, the mobile dashboard loads a small model directly
    # in the browser — no server or Ollama required.
    browser_model_enabled: bool = True
    # WebLLM model ID — must be a pre-compiled MLC model.
    # Recommended for iPhone: SmolLM2-360M (fast) or Qwen3-0.6B (smart).
    browser_model_id: str = "SmolLM2-360M-Instruct-q4f16_1-MLC"
    # Fallback to server when browser model is unavailable or too slow.
    browser_model_fallback: bool = True

    # ── Deep Focus Mode ─────────────────────────────────────────────
    # "deep" = single-problem context; "broad" = default multi-task.
    focus_mode: Literal["deep", "broad"] = "broad"

    # ── Default Thinking ──────────────────────────────────────────────
    # When enabled, the agent starts an internal thought loop on server start.
    thinking_enabled: bool = True
    thinking_interval_seconds: int = 300  # 5 minutes between thoughts
    thinking_timeout_seconds: int = 120  # max wall-clock time per thinking cycle
    thinking_distill_every: int = 10  # distill facts from thoughts every Nth thought
    thinking_issue_every: int = 20  # file Gitea issues from thoughts every Nth thought
    thinking_memory_check_every: int = 50  # check memory status every Nth thought
    thinking_idle_timeout_minutes: int = 60  # pause thoughts after N minutes without user input

    # ── Gitea Integration ─────────────────────────────────────────────
    # Local Gitea instance for issue tracking and self-improvement.
    # These values are passed as env vars to the gitea-mcp server process.
    gitea_url: str = "http://localhost:3000"
    gitea_token: str = ""  # GITEA_TOKEN env var; falls back to .timmy_gitea_token
    gitea_repo: str = "rockachopa/Timmy-time-dashboard"  # owner/repo
    gitea_enabled: bool = True

    # ── MCP Servers ────────────────────────────────────────────────────
    # External tool servers connected via Model Context Protocol (stdio).
    mcp_gitea_command: str = "gitea-mcp-server -t stdio"
    mcp_filesystem_command: str = "npx -y @modelcontextprotocol/server-filesystem"
    mcp_timeout: int = 15
    mcp_bridge_timeout: int = 60  # HTTP timeout for MCP bridge Ollama calls (seconds)

    # ── Backlog Triage Loop ────────────────────────────────────────────
    # Autonomous loop: fetch open issues, score, assign to agents.
    backlog_triage_enabled: bool = False
    # Seconds between triage cycles (default: 15 minutes).
    backlog_triage_interval_seconds: int = 900
    # When True, score and summarize but don't write to Gitea.
    backlog_triage_dry_run: bool = False
    # Create a daily triage summary issue/comment.
    backlog_triage_daily_summary: bool = True

    # ── Loop QA (Self-Testing) ─────────────────────────────────────────
    # Self-test orchestrator that probes capabilities alongside the thinking loop.
    loop_qa_enabled: bool = True
    loop_qa_interval_ticks: int = 5  # run 1 self-test every Nth thinking tick (~25 min)
    loop_qa_upgrade_threshold: int = 3  # consecutive failures → file task
    loop_qa_max_per_hour: int = 12  # safety throttle

    # ── Vassal Protocol (Autonomous Orchestrator) ─────────────────────
    # Timmy as lead decision-maker: triage backlog, dispatch agents, monitor health.
    # See timmy/vassal/ for implementation.
    vassal_enabled: bool = False  # off by default — enable when Qwen3-14B is loaded
    vassal_cycle_interval: int = 300  # seconds between orchestration cycles (5 min)
    vassal_max_dispatch_per_cycle: int = 10  # cap on new dispatches per cycle
    vassal_stuck_threshold_minutes: int = 120  # minutes before agent issue is "stuck"
    vassal_idle_threshold_minutes: int = 30  # minutes before agent is "idle"

    # ── Paperclip AI — orchestration bridge ────────────────────────────
    # URL where the Paperclip server listens.
    # For VPS deployment behind nginx, use the public domain.
    paperclip_url: str = "http://localhost:3100"
    # Enable/disable the Paperclip integration.
    paperclip_enabled: bool = False
    # API key or auth-gate cookie for authenticating with Paperclip.
    paperclip_api_key: str = ""
    # Timmy's agent ID in the Paperclip org chart.
    paperclip_agent_id: str = ""
    # Company ID in Paperclip — required for most API calls.
    paperclip_company_id: str = ""
    # Timeout in seconds for Paperclip HTTP calls.
    paperclip_timeout: int = 30
    # How often (seconds) Timmy polls Paperclip for work (0 = disabled).
    paperclip_poll_interval: int = 0

    # ── OpenFang — vendored agent runtime ─────────────────────────────
    # URL where the OpenFang sidecar listens.  Set to the Docker service
    # name when running in compose, or localhost for bare-metal dev.
    openfang_url: str = "http://localhost:8080"
    # Enable/disable OpenFang integration.  When disabled, the tool
    # executor falls back to Timmy's native (simulated) execution.
    openfang_enabled: bool = False
    # Timeout in seconds for OpenFang hand execution (some hands are slow).
    openfang_timeout: int = 120

    # ── Autoresearch — autonomous ML experiment loops ──────────────────
    # Integrates Karpathy's autoresearch pattern: agents modify training
    # code, run time-boxed experiments, evaluate metrics, and iterate.
    autoresearch_enabled: bool = False
    autoresearch_workspace: str = "data/experiments"
    autoresearch_time_budget: int = 300  # seconds per experiment run
    autoresearch_max_iterations: int = 100
    autoresearch_metric: str = "val_bpb"  # metric to optimise (lower = better)
    # M3 Max / Apple Silicon tuning (Issue #905).
    # dataset: "tinystories" (default, lower-entropy, recommended for Mac) or "openwebtext".
    autoresearch_dataset: str = "tinystories"
    # backend: "auto" detects MLX on Apple Silicon; "cpu" forces CPU fallback.
    autoresearch_backend: str = "auto"

    # ── Weekly Narrative Summary ───────────────────────────────────────
    # Generates a human-readable weekly summary of development activity.
    # Disabling this will stop the weekly narrative generation.
    weekly_narrative_enabled: bool = True
    weekly_narrative_lookback_days: int = 7
    weekly_narrative_output_dir: str = ".loop"

    # ── Local Hands (Shell + Git) ──────────────────────────────────────
    # Enable local shell/git execution hands.
    hands_shell_enabled: bool = True
    # Default timeout in seconds for shell commands.
    hands_shell_timeout: int = 60
    # Comma-separated additional command prefixes to allow.
    hands_shell_extra_allowed: str = ""
    # Enable the git hand for version-control operations.
    hands_git_enabled: bool = True
    # Default timeout for git operations.
    hands_git_timeout: int = 60

    # ── Hermes Health Monitor ─────────────────────────────────────────
    # Enable the Hermes system health monitor (memory, disk, Ollama, processes, network).
    hermes_enabled: bool = True
    # How often Hermes runs a full health cycle (seconds). Default: 5 minutes.
    hermes_interval_seconds: int = 300
    # Alert threshold: free memory below this triggers model unloading / alert (GB).
    hermes_memory_free_min_gb: float = 4.0
    # Alert threshold: free disk below this triggers cleanup / alert (GB).
    hermes_disk_free_min_gb: float = 10.0

    # ── Energy Budget Monitoring ───────────────────────────────────────
    # Enable energy budget monitoring (tracks CPU/GPU power during inference).
    energy_budget_enabled: bool = True
    # Watts threshold that auto-activates low power mode (on-battery only).
    energy_budget_watts_threshold: float = 15.0
    # Model to prefer in low power mode (smaller = more efficient).
    energy_low_power_model: str = "qwen3:1b"

    # ── Error Logging ─────────────────────────────────────────────────
    error_log_enabled: bool = True
    error_log_dir: str = "logs"
    error_log_max_bytes: int = 5_242_880  # 5 MB
    error_log_backup_count: int = 5
    error_feedback_enabled: bool = True  # Auto-create bug report tasks
    error_dedup_window_seconds: int = 300  # 5-min dedup window

    # ── Bannerlord / GABS ────────────────────────────────────────────
    # GABS (Game Action Bridge Server) TCP JSON-RPC endpoint.
    # The GABS mod runs inside the Windows VM and exposes a JSON-RPC server
    # on port 4825 that Timmy uses to read and act on Bannerlord game state.
    # Set GABS_HOST to the VM's LAN IP (e.g. "10.0.0.50") to enable.
    gabs_enabled: bool = False
    gabs_host: str = "127.0.0.1"
    gabs_port: int = 4825
    gabs_timeout: float = 5.0  # socket timeout in seconds
    # How often (seconds) the observer polls GABS for fresh game state.
    gabs_poll_interval: int = 60
    # Path to the Bannerlord journal inside the memory vault.
    # Relative to repo root.  Written by the GABS observer loop.
    gabs_journal_path: str = "memory/bannerlord/journal.md"

    # ── Scripture / Biblical Integration ──────────────────────────────
    # Enable the biblical text module.
    scripture_enabled: bool = True
    # Primary translation for retrieval and citation.
    scripture_translation: str = "ESV"
    # Meditation mode: sequential | thematic | lectionary
    scripture_meditation_mode: str = "sequential"
    # Background meditation interval in seconds (0 = disabled).
    scripture_meditation_interval: int = 0

    def _compute_repo_root(self) -> str:
        """Return the repository root, auto-detecting it when not configured.

        Resolution order:

        1. ``self.repo_root`` when it is non-empty (explicit override).
        2. The nearest directory, starting at the directory containing this
           file and walking up to the filesystem root, that contains a
           ``.git`` entry.
        3. The current working directory as a last resort.

        Returns:
            Path of the detected repository root.
        """
        if self.repo_root:
            return self.repo_root

        # Start at this file's own directory and walk upwards.  Starting two
        # levels higher (as an earlier revision did) skips the project root
        # when this module sits directly in ``<root>/src/``, so the project's
        # own .git would never be examined.
        path = os.path.dirname(os.path.abspath(__file__))
        while True:
            # ``exists`` rather than ``isdir``: .git may be a file
            # (worktrees / submodules) as well as a directory.
            if os.path.exists(os.path.join(path, ".git")):
                return path
            parent = os.path.dirname(path)
            if parent == path:  # reached the filesystem root
                break
            path = parent
        return os.getcwd()

    def model_post_init(self, __context) -> None:
        """Post-init: resolve gitea_token from file if not set via env.

        Candidate files are tried in priority order — the repo-local
        ``.timmy_gitea_token`` first, then the legacy
        ``~/.config/gitea/token`` — and the first non-empty one wins.
        Unreadable or undecodable files are skipped silently: the lookup is
        best-effort and must never prevent settings from loading.
        """
        if self.gitea_token:
            return

        # Priority: Timmy's own token → legacy admin token
        repo_root = self._compute_repo_root()
        candidates = (
            os.path.join(repo_root, ".timmy_gitea_token"),
            os.path.expanduser("~/.config/gitea/token"),
        )
        for token_path in candidates:
            if not os.path.isfile(token_path):
                continue
            try:
                # Context manager guarantees the handle is closed even when
                # the read fails.
                with open(token_path, encoding="utf-8") as token_file:
                    token = token_file.read().strip()
            except (OSError, UnicodeDecodeError):
                continue
            if token:
                self.gitea_token = token
                break

    # Load overrides from a UTF-8 ``.env`` file in the working directory and
    # ignore variables that do not correspond to a declared field.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )


# Module-level singleton: built once at import time from the process
# environment and the .env file.
settings = Settings()
# Ensure repo_root is computed if not set
if not settings.repo_root:
    settings.repo_root = settings._compute_repo_root()

# ── Model fallback configuration ────────────────────────────────────────────
# Fallback chains are now in settings.fallback_models / settings.vision_fallback_models.
# Override via env vars (FALLBACK_MODELS, VISION_FALLBACK_MODELS) or
# edit config/providers.yaml → fallback_chains.


def check_ollama_model_available(model_name: str) -> bool:
    """Check if a specific Ollama model is available locally.

    Queries ``GET {ollama_url}/api/tags`` (5-second timeout) and reports
    whether any installed model name starts with ``model_name``.

    Args:
        model_name: Model to look for, with or without a tag
            (``"qwen3:14b"`` or ``"qwen3"``).

    Returns:
        ``True`` when a matching model is installed.  ``False`` when the name
        is blank, the Ollama server cannot be reached, or the response is not
        the expected JSON shape — failures are logged at debug level and
        never raised.
    """
    # A blank name is a prefix of every string, so without this guard it
    # would be reported as "available" whenever any model is installed.
    if not model_name or not model_name.strip():
        return False

    try:
        import json
        import urllib.request

        url = settings.normalized_ollama_url
        req = urllib.request.Request(
            f"{url}/api/tags",
            method="GET",
            headers={"Accept": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=5) as response:
            data = json.loads(response.read().decode())
    except (OSError, ValueError) as exc:
        # URLError/timeouts are OSError; bad JSON or bad UTF-8 is ValueError.
        _startup_logger.debug("Ollama model check failed: %s", exc)
        return False

    # Tolerate unexpected payloads instead of raising AttributeError/TypeError.
    entries = data.get("models") if isinstance(data, dict) else None
    if not isinstance(entries, list):
        _startup_logger.debug(
            "Ollama model check failed: %s", "unexpected /api/tags payload"
        )
        return False

    installed = [
        entry["name"]
        for entry in entries
        if isinstance(entry, dict) and isinstance(entry.get("name"), str)
    ]
    # Prefix match.  This single test is equivalent to the historical
    # three-way check (exact name, name-without-tag, prefix): an exact match
    # and a match on the part before ":" are both prefixes of the full name.
    return any(name.startswith(model_name) for name in installed)


def get_effective_ollama_model() -> str:
    """Resolve the Ollama model to use, falling back when needed.

    The configured ``settings.ollama_model`` is returned when it is available
    locally.  Otherwise the entries of ``settings.fallback_models`` are probed
    in order and the first available one is returned (with a warning).  When
    nothing is available the configured model is returned unchanged.
    """
    preferred = settings.ollama_model
    if check_ollama_model_available(preferred):
        return preferred

    # Lazily probe the fallback chain; stops at the first hit.
    substitute = next(
        (
            candidate
            for candidate in settings.fallback_models
            if check_ollama_model_available(candidate)
        ),
        None,
    )
    if substitute is None:
        # Last resort - return user's setting and hope for the best
        return preferred

    _startup_logger.warning(
        "Requested model '%s' not available. Using fallback: %s",
        preferred,
        substitute,
    )
    return substitute


# ── Startup validation ───────────────────────────────────────────────────────
# Logger shared by the model-availability helpers and validate_startup().
_startup_logger = _logging.getLogger("config")
# Set to True once validate_startup() has completed (or was skipped in test
# mode) so later calls without force=True return immediately.
_startup_validated = False


def validate_startup(*, force: bool = False) -> None:
    """Check security-critical settings; call from entry points, not at import.

    Runs at most once per process unless ``force`` is true, and is skipped
    when the ``TIMMY_TEST_MODE`` environment variable equals ``"1"`` (again
    unless ``force`` is true).

    In production a missing L402 secret or a wildcard CORS origin is logged
    as an error and the process exits with status 1.  In development the
    same conditions only produce warnings.

    Args:
        force: Run the checks even if they already ran or test mode is set.
    """
    global _startup_validated

    if not force:
        if _startup_validated:
            return
        if os.environ.get("TIMMY_TEST_MODE") == "1":
            _startup_validated = True
            return

    required_secrets = (
        ("L402_HMAC_SECRET", settings.l402_hmac_secret),
        ("L402_MACAROON_SECRET", settings.l402_macaroon_secret),
    )
    unset = [env_name for env_name, value in required_secrets if not value]

    if settings.timmy_env != "production":
        # Development: surface the same problems, but only as warnings.
        if "*" in settings.cors_origins:
            _startup_logger.warning(
                "SEC: CORS_ORIGINS contains wildcard '*' — "
                "restrict to explicit origins before deploying to production."
            )
        if "L402_HMAC_SECRET" in unset:
            _startup_logger.warning(
                "SEC: L402_HMAC_SECRET is not set — "
                "set a unique secret in .env before deploying to production."
            )
        if "L402_MACAROON_SECRET" in unset:
            _startup_logger.warning(
                "SEC: L402_MACAROON_SECRET is not set — "
                "set a unique secret in .env before deploying to production."
            )
        _startup_validated = True
        return

    # Production: any violation is fatal.
    if unset:
        _startup_logger.error(
            "PRODUCTION SECURITY ERROR: The following secrets must be set: %s\n"
            'Generate with: python3 -c "import secrets; print(secrets.token_hex(32))"\n'
            "Set in .env file or environment variables.",
            ", ".join(unset),
        )
        sys.exit(1)
    if "*" in settings.cors_origins:
        _startup_logger.error(
            "PRODUCTION SECURITY ERROR: CORS wildcard '*' is not allowed "
            "in production. Set CORS_ORIGINS to explicit origins."
        )
        sys.exit(1)
    _startup_logger.info("Production mode: security secrets validated ✓")
    _startup_validated = True