This repository has been archived on 2026-03-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Timmy-time-dashboard/src/config.py
Alexander Whitestone 28d1905df4 feat: add vLLM as alternative inference backend (#1281)
Adds vLLM (high-throughput OpenAI-compatible inference server) as a
selectable backend alongside the existing Ollama and vllm-mlx backends.
vLLM's continuous batching gives 3-10x higher throughput on agentic workloads.

Changes:
- config.py: add `vllm` to timmy_model_backend Literal; add vllm_url /
  vllm_model settings (VLLM_URL / VLLM_MODEL env vars)
- cascade.py: add vllm provider type with _check_provider_available
  (hits /health) and _call_vllm (OpenAI-compatible completions)
- providers.yaml: add disabled-by-default vllm-local provider (priority 3,
  port 8001); bump OpenAI/Anthropic backup priorities to 4/5
- health.py: add _check_vllm/_check_vllm_sync with 30-second TTL cache;
  /health and /health/sovereignty reflect vLLM status when it is the
  active backend
- docker-compose.yml: add vllm service behind 'vllm' profile (GPU
  passthrough commented-out template included); add vllm-cache volume
- CLAUDE.md: add vLLM row to Service Fallback Matrix
- tests: 26 new unit tests covering availability checks, _call_vllm,
  providers.yaml validation, config options, and health helpers

Graceful fallback: if vLLM is unavailable, the cascade router automatically
falls back to Ollama, so the app keeps serving requests instead of crashing.

Fixes #1281

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-23 21:52:52 -04:00

import logging as _logging
import os
import sys
from datetime import UTC
from datetime import datetime as _datetime
from typing import Literal
from pydantic_settings import BaseSettings, SettingsConfigDict
APP_START_TIME: _datetime = _datetime.now(UTC)
def normalize_ollama_url(url: str) -> str:
"""Replace localhost with 127.0.0.1 to avoid IPv6 resolution delays."""
return url.replace("localhost", "127.0.0.1")
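# Example: normalize_ollama_url("http://localhost:11434") -> "http://127.0.0.1:11434"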
class Settings(BaseSettings):
"""Central configuration — all env-var access goes through this class."""
# Display name for the primary agent — override with AGENT_NAME env var
agent_name: str = "Agent"
# Ollama host — override with OLLAMA_URL env var or .env file
ollama_url: str = "http://localhost:11434"
@property
def normalized_ollama_url(self) -> str:
"""Return ollama_url with localhost replaced by 127.0.0.1."""
return normalize_ollama_url(self.ollama_url)
# LLM model passed to Agno/Ollama — override with OLLAMA_MODEL
# qwen3:14b (Q5_K_M) is the primary model: tool calling F1 0.971, ~17.5 GB
# at 32K context — optimal for M3 Max 36 GB (Issue #1063).
# qwen3:30b exceeded memory budget at 32K+ context on 36 GB hardware.
ollama_model: str = "qwen3:14b"
# Fast routing model — override with OLLAMA_FAST_MODEL
# qwen3:8b (Q6_K): tool calling F1 0.933 at ~45-55 tok/s (2x speed of 14B).
# Use for routine tasks: simple tool calls, file reads, status checks.
# Combined memory with qwen3:14b: ~17 GB — both can stay loaded simultaneously.
ollama_fast_model: str = "qwen3:8b"
# Context window size for Ollama inference — override with OLLAMA_NUM_CTX
# qwen3:14b at 32K: ~17.5 GB total (weights + KV cache) on M3 Max 36 GB.
# Set to 0 to use model defaults.
ollama_num_ctx: int = 32768
# Maximum models loaded simultaneously in Ollama — override with OLLAMA_MAX_LOADED_MODELS
# Set to 2 so Qwen3-8B and Qwen3-14B can stay hot concurrently (~17 GB combined).
# Requires Ollama ≥ 0.1.33. Export this to the Ollama process environment:
# OLLAMA_MAX_LOADED_MODELS=2 ollama serve
# or add it to your systemd/launchd unit before starting the harness.
ollama_max_loaded_models: int = 2
# Fallback model chains — override with FALLBACK_MODELS / VISION_FALLBACK_MODELS
# as comma-separated strings, e.g. FALLBACK_MODELS="qwen3:8b,qwen2.5:14b"
# Or edit config/providers.yaml → fallback_chains for the canonical source.
fallback_models: list[str] = [
"qwen3:8b",
"qwen2.5:14b",
"qwen2.5:7b",
"llama3.1:8b-instruct",
"llama3.1",
"llama3.2:3b",
]
vision_fallback_models: list[str] = [
"llama3.2:3b",
"llava:7b",
"qwen2.5-vl:3b",
"moondream:1.8b",
]
# Set DEBUG=true to enable /docs and /redoc (disabled by default)
debug: bool = False
# Telegram bot token — set via TELEGRAM_TOKEN env var or the /telegram/setup endpoint
telegram_token: str = ""
# Discord bot token — set via DISCORD_TOKEN env var or the /discord/setup endpoint
discord_token: str = ""
# ── Discord action confirmation ──────────────────────────────────────────
# When True, dangerous tools (shell, write_file, python) require user
# confirmation via Discord button before executing.
discord_confirm_actions: bool = True
# Seconds to wait for user confirmation before auto-rejecting.
discord_confirm_timeout: int = 120
# ── Backend selection ────────────────────────────────────────────────────
# "ollama" — always use Ollama (default, safe everywhere)
# "vllm" — use vLLM inference server (OpenAI-compatible, faster throughput)
# "auto" — pick best available local backend, fall back to Ollama
timmy_model_backend: Literal["ollama", "vllm", "grok", "claude", "auto"] = "ollama"
# ── vLLM backend ──────────────────────────────────────────────────────────
# vLLM is an OpenAI-compatible inference server optimised for continuous
# batching — 3-10x higher throughput than Ollama for agentic workloads.
# Start server: python -m vllm.entrypoints.openai.api_server \
# --model Qwen/Qwen2.5-14B-Instruct --port 8001
# Then set TIMMY_MODEL_BACKEND=vllm (or enable vllm-local in providers.yaml)
vllm_url: str = "http://localhost:8001"
vllm_model: str = "Qwen/Qwen2.5-14B-Instruct"
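# Example request to the vLLM server (illustrative sketch only; any
# OpenAI-compatible client works, and the payload below is the standard
# chat-completions shape rather than anything defined in this config):
#   curl http://localhost:8001/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "Qwen/Qwen2.5-14B-Instruct",
#          "messages": [{"role": "user", "content": "ping"}]}'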
# ── Grok (xAI) — opt-in premium cloud backend ────────────────────────
# Grok is a premium augmentation layer — local-first ethos preserved.
# Only used when explicitly enabled and query complexity warrants it.
grok_enabled: bool = False
xai_api_key: str = ""
xai_base_url: str = "https://api.x.ai/v1"
grok_default_model: str = "grok-3-fast"
grok_max_sats_per_query: int = 200
grok_sats_hard_cap: int = 100 # Absolute ceiling on sats per Grok query
grok_free: bool = False # Skip Lightning invoice when user has own API key
# ── Database ──────────────────────────────────────────────────────────
db_busy_timeout_ms: int = 5000 # SQLite PRAGMA busy_timeout (ms)
# ── Claude (Anthropic) — cloud fallback backend ────────────────────────
# Used when Ollama is offline and local inference isn't available.
# Set ANTHROPIC_API_KEY to enable. Default model is Haiku (fast + cheap).
anthropic_api_key: str = ""
claude_model: str = "haiku"
# ── Content Moderation ──────────────────────────────────────────────
# Three-layer moderation pipeline for AI narrator output.
# Uses Llama Guard via Ollama with regex fallback.
moderation_enabled: bool = True
moderation_guard_model: str = "llama-guard3:1b"
# Default confidence threshold — per-game profiles can override.
moderation_threshold: float = 0.8
# ── Spark Intelligence ────────────────────────────────────────────────
# Enable/disable the Spark cognitive layer.
# When enabled, Spark captures swarm events, runs EIDOS predictions,
# consolidates memories, and generates advisory recommendations.
spark_enabled: bool = True
# ── Git / DevOps ──────────────────────────────────────────────────────
git_default_repo_dir: str = "~/repos"
# Repository root - auto-detected but can be overridden
# This is the main project directory where .git lives
repo_root: str = ""
# ── Creative — Image Generation (Pixel) ───────────────────────────────
flux_model_id: str = "black-forest-labs/FLUX.1-schnell"
image_output_dir: str = "data/images"
image_default_steps: int = 4
# ── Creative — Music Generation (Lyra) ────────────────────────────────
music_output_dir: str = "data/music"
ace_step_model: str = "ace-step/ACE-Step-v1.5"
# ── Creative — Video Generation (Reel) ────────────────────────────────
video_output_dir: str = "data/video"
wan_model_id: str = "Wan-AI/Wan2.1-T2V-1.3B"
video_default_resolution: str = "480p"
# ── Creative — Pipeline / Assembly ────────────────────────────────────
creative_output_dir: str = "data/creative"
video_transition_duration: float = 1.0
default_video_codec: str = "libx264"
# ── L402 Lightning ───────────────────────────────────────────────────
# HMAC secrets for macaroon signing and invoice verification.
# Generate with: python3 -c "import secrets; print(secrets.token_hex(32))"
# In production (TIMMY_ENV=production), these MUST be set or the app will refuse to start.
l402_hmac_secret: str = ""
l402_macaroon_secret: str = ""
lightning_backend: Literal["mock", "lnd"] = "mock"
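# Example .env entries (illustrative; generate your own secrets with the
# command above):
#   L402_HMAC_SECRET=<64-char hex>
#   L402_MACAROON_SECRET=<64-char hex>
#   LIGHTNING_BACKEND=lnd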
# ── Privacy / Sovereignty ────────────────────────────────────────────
# Disable Agno telemetry for air-gapped/sovereign deployments.
# Default is False (telemetry disabled) to align with sovereign AI vision.
telemetry_enabled: bool = False
# ── Sovereignty Metrics ──────────────────────────────────────────────
# Alert when API cost per research task exceeds this threshold (USD).
sovereignty_api_cost_alert_threshold: float = 1.00
# CORS allowed origins for the web chat interface (Gitea Pages, etc.)
# Set CORS_ORIGINS as a comma-separated list, e.g. "http://localhost:3000,https://example.com"
cors_origins: list[str] = [
"http://localhost:3000",
"http://localhost:8000",
"http://127.0.0.1:3000",
"http://127.0.0.1:8000",
]
# ── Matrix Frontend Integration ────────────────────────────────────────
# URL of the Matrix frontend (Replit/Tailscale) for CORS.
# When set, this origin is added to CORS allowed_origins.
# Example: "http://100.124.176.28:8080" or "https://alexanderwhitestone.com"
matrix_frontend_url: str = "" # Empty = disabled
# WebSocket authentication token for Matrix connections.
# When set, clients must provide this token via ?token= query param
# or in the first message as {"type": "auth", "token": "..."}.
# Empty/unset = auth disabled (dev mode).
matrix_ws_token: str = ""
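# Example client-side auth (illustrative sketch; assumes the third-party
# `websockets` package and a placeholder `ws_url`, neither defined here):
#   ws = await websockets.connect(f"{ws_url}?token={settings.matrix_ws_token}")
#   # or, equivalently, authenticate with the first message after connecting:
#   await ws.send(json.dumps({"type": "auth", "token": settings.matrix_ws_token}))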
# Trusted hosts for the Host header check (TrustedHostMiddleware).
# Set TRUSTED_HOSTS as a comma-separated list. Wildcards supported (e.g. "*.ts.net").
# Defaults include localhost + Tailscale MagicDNS. Add your Tailscale IP if needed.
trusted_hosts: list[str] = [
"localhost",
"127.0.0.1",
"*.local",
"*.ts.net",
"testserver",
]
# Environment mode: development | production
# In production, security settings are strictly enforced.
timmy_env: Literal["development", "production"] = "development"
# ── Memory Management ──────────────────────────────────────────────
# Auto-prune vector store memories older than this many days on startup.
# Set to 0 to disable auto-pruning.
memory_prune_days: int = 90
# When True, fact-type memories are kept even when older than the TTL.
memory_prune_keep_facts: bool = True
# Maximum size in MB for the memory/notes/ vault directory.
# When exceeded, a warning is logged. Set to 0 to disable.
memory_vault_max_mb: int = 100
# Auto-prune thoughts older than this many days. 0 = disabled.
thoughts_prune_days: int = 90
# Minimum thoughts to keep regardless of age.
thoughts_prune_keep_min: int = 200
# Auto-prune system events older than this many days. 0 = disabled.
events_prune_days: int = 90
# Minimum events to keep regardless of age.
events_prune_keep_min: int = 200
# ── Agentic Loop ──────────────────────────────────────────────────
# Maximum steps the agentic loop will execute before stopping.
max_agent_steps: int = 10
# ── Test / Diagnostics ─────────────────────────────────────────────
# Skip loading heavy embedding models (for tests / low-memory envs).
timmy_skip_embeddings: bool = False
# Embedding backend: "ollama" for Ollama, "local" for sentence-transformers.
timmy_embedding_backend: Literal["ollama", "local"] = "local"
# Ollama model to use for embeddings (e.g., "nomic-embed-text").
ollama_embedding_model: str = "nomic-embed-text"
# Disable CSRF middleware entirely (for tests).
timmy_disable_csrf: bool = False
# Mark the process as running in test mode.
timmy_test_mode: bool = False
# ── Brain / rqlite ─────────────────────────────────────────────────
# URL of the local rqlite node for distributed memory.
# Empty string means rqlite is not configured.
rqlite_url: str = ""
# Source identifier for brain memory entries.
brain_source: str = "default"
# Path override for the local brain SQLite database.
brain_db_path: str = ""
# ── Security Tuning ───────────────────────────────────────────────
# Set to True in production to mark CSRF cookies as Secure (HTTPS only).
csrf_cookie_secure: bool = False
# Maximum size in bytes for chat API request bodies.
chat_api_max_body_bytes: int = 1_048_576 # 1 MB
# ── Self-Modification ──────────────────────────────────────────────
# Enable self-modification capabilities. When enabled, the agent can
# edit its own source code, run tests, and commit changes.
self_modify_enabled: bool = False
self_modify_max_retries: int = 2
self_modify_allowed_dirs: str = "src,tests"
self_modify_backend: str = "auto" # "ollama", "anthropic", or "auto"
# ── Work Orders ──────────────────────────────────────────────────
# External users and agents can submit work orders for improvements.
work_orders_enabled: bool = True
work_orders_auto_execute: bool = False # Master switch for auto-execution
work_orders_auto_threshold: str = (
"low" # Max priority that auto-executes: "low" | "medium" | "high" | "none"
)
# ── Custom Weights & Models ──────────────────────────────────────
# Directory for custom model weights (GGUF, safetensors, HF checkpoints).
# Models placed here can be registered at runtime and assigned to agents.
custom_weights_dir: str = "data/models"
# Enable the reward model for scoring agent outputs (PRM-style).
reward_model_enabled: bool = False
# Reward model name (must be available via Ollama or a custom weight path).
reward_model_name: str = ""
# Minimum votes for majority-vote reward scoring (odd number recommended).
reward_model_votes: int = 3
# ── Browser Local Models (iPhone / WebGPU) ───────────────────────
# Enable in-browser LLM inference via WebLLM for offline iPhone use.
# When enabled, the mobile dashboard loads a small model directly
# in the browser — no server or Ollama required.
browser_model_enabled: bool = True
# WebLLM model ID — must be a pre-compiled MLC model.
# Recommended for iPhone: SmolLM2-360M (fast) or Qwen3-0.6B (smart).
browser_model_id: str = "SmolLM2-360M-Instruct-q4f16_1-MLC"
# Fallback to server when browser model is unavailable or too slow.
browser_model_fallback: bool = True
# ── Deep Focus Mode ─────────────────────────────────────────────
# "deep" = single-problem context; "broad" = default multi-task.
focus_mode: Literal["deep", "broad"] = "broad"
# ── Default Thinking ──────────────────────────────────────────────
# When enabled, the agent starts an internal thought loop on server start.
thinking_enabled: bool = True
thinking_interval_seconds: int = 300 # 5 minutes between thoughts
thinking_timeout_seconds: int = 120 # max wall-clock time per thinking cycle
thinking_distill_every: int = 10 # distill facts from thoughts every Nth thought
thinking_issue_every: int = 20 # file Gitea issues from thoughts every Nth thought
thinking_memory_check_every: int = 50 # check memory status every Nth thought
thinking_idle_timeout_minutes: int = 60 # pause thoughts after N minutes without user input
# ── Gitea Integration ─────────────────────────────────────────────
# Local Gitea instance for issue tracking and self-improvement.
# These values are passed as env vars to the gitea-mcp server process.
gitea_url: str = "http://localhost:3000"
gitea_token: str = "" # GITEA_TOKEN env var; falls back to .timmy_gitea_token
gitea_repo: str = "rockachopa/Timmy-time-dashboard" # owner/repo
gitea_enabled: bool = True
# ── MCP Servers ────────────────────────────────────────────────────
# External tool servers connected via Model Context Protocol (stdio).
mcp_gitea_command: str = "gitea-mcp-server -t stdio"
mcp_filesystem_command: str = "npx -y @modelcontextprotocol/server-filesystem"
mcp_timeout: int = 15
mcp_bridge_timeout: int = 60 # HTTP timeout for MCP bridge Ollama calls (seconds)
# ── Backlog Triage Loop ────────────────────────────────────────────
# Autonomous loop: fetch open issues, score, assign to agents.
backlog_triage_enabled: bool = False
# Seconds between triage cycles (default: 15 minutes).
backlog_triage_interval_seconds: int = 900
# When True, score and summarize but don't write to Gitea.
backlog_triage_dry_run: bool = False
# Create a daily triage summary issue/comment.
backlog_triage_daily_summary: bool = True
# ── Loop QA (Self-Testing) ─────────────────────────────────────────
# Self-test orchestrator that probes capabilities alongside the thinking loop.
loop_qa_enabled: bool = True
loop_qa_interval_ticks: int = 5 # run 1 self-test every Nth thinking tick (~25 min)
loop_qa_upgrade_threshold: int = 3 # consecutive failures → file task
loop_qa_max_per_hour: int = 12 # safety throttle
# ── Vassal Protocol (Autonomous Orchestrator) ─────────────────────
# Timmy as lead decision-maker: triage backlog, dispatch agents, monitor health.
# See timmy/vassal/ for implementation.
vassal_enabled: bool = False # off by default — enable when Qwen3-14B is loaded
vassal_cycle_interval: int = 300 # seconds between orchestration cycles (5 min)
vassal_max_dispatch_per_cycle: int = 10 # cap on new dispatches per cycle
vassal_stuck_threshold_minutes: int = 120 # minutes before agent issue is "stuck"
vassal_idle_threshold_minutes: int = 30 # minutes before agent is "idle"
# ── Paperclip AI — orchestration bridge ────────────────────────────
# URL where the Paperclip server listens.
# For VPS deployment behind nginx, use the public domain.
paperclip_url: str = "http://localhost:3100"
# Enable/disable the Paperclip integration.
paperclip_enabled: bool = False
# API key or auth-gate cookie for authenticating with Paperclip.
paperclip_api_key: str = ""
# Timmy's agent ID in the Paperclip org chart.
paperclip_agent_id: str = ""
# Company ID in Paperclip — required for most API calls.
paperclip_company_id: str = ""
# Timeout in seconds for Paperclip HTTP calls.
paperclip_timeout: int = 30
# How often (seconds) Timmy polls Paperclip for work (0 = disabled).
paperclip_poll_interval: int = 0
# ── OpenFang — vendored agent runtime ─────────────────────────────
# URL where the OpenFang sidecar listens. Set to the Docker service
# name when running in compose, or localhost for bare-metal dev.
openfang_url: str = "http://localhost:8080"
# Enable/disable OpenFang integration. When disabled, the tool
# executor falls back to Timmy's native (simulated) execution.
openfang_enabled: bool = False
# Timeout in seconds for OpenFang hand execution (some hands are slow).
openfang_timeout: int = 120
# ── Autoresearch — autonomous ML experiment loops ──────────────────
# Integrates Karpathy's autoresearch pattern: agents modify training
# code, run time-boxed experiments, evaluate metrics, and iterate.
autoresearch_enabled: bool = False
autoresearch_workspace: str = "data/experiments"
autoresearch_time_budget: int = 300 # seconds per experiment run
autoresearch_max_iterations: int = 100
autoresearch_metric: str = "val_bpb" # metric to optimise (lower = better)
# M3 Max / Apple Silicon tuning (Issue #905).
# dataset: "tinystories" (default, lower-entropy, recommended for Mac) or "openwebtext".
autoresearch_dataset: str = "tinystories"
# backend: "auto" detects MLX on Apple Silicon; "cpu" forces CPU fallback.
autoresearch_backend: str = "auto"
# ── Weekly Narrative Summary ───────────────────────────────────────
# Generates a human-readable weekly summary of development activity.
# Disabling this will stop the weekly narrative generation.
weekly_narrative_enabled: bool = True
weekly_narrative_lookback_days: int = 7
weekly_narrative_output_dir: str = ".loop"
# ── Local Hands (Shell + Git) ──────────────────────────────────────
# Enable local shell/git execution hands.
hands_shell_enabled: bool = True
# Default timeout in seconds for shell commands.
hands_shell_timeout: int = 60
# Comma-separated additional command prefixes to allow.
hands_shell_extra_allowed: str = ""
# Enable the git hand for version-control operations.
hands_git_enabled: bool = True
# Default timeout for git operations.
hands_git_timeout: int = 60
# ── Hermes Health Monitor ─────────────────────────────────────────
# Enable the Hermes system health monitor (memory, disk, Ollama, processes, network).
hermes_enabled: bool = True
# How often Hermes runs a full health cycle (seconds). Default: 5 minutes.
hermes_interval_seconds: int = 300
# Alert threshold: free memory below this triggers model unloading / alert (GB).
hermes_memory_free_min_gb: float = 4.0
# Alert threshold: free disk below this triggers cleanup / alert (GB).
hermes_disk_free_min_gb: float = 10.0
# ── Energy Budget Monitoring ───────────────────────────────────────
# Enable energy budget monitoring (tracks CPU/GPU power during inference).
energy_budget_enabled: bool = True
# Watts threshold that auto-activates low power mode (on-battery only).
energy_budget_watts_threshold: float = 15.0
# Model to prefer in low power mode (smaller = more efficient).
energy_low_power_model: str = "qwen3:1b"
# ── Error Logging ─────────────────────────────────────────────────
error_log_enabled: bool = True
error_log_dir: str = "logs"
error_log_max_bytes: int = 5_242_880 # 5 MB
error_log_backup_count: int = 5
error_feedback_enabled: bool = True # Auto-create bug report tasks
error_dedup_window_seconds: int = 300 # 5-min dedup window
# ── Bannerlord / GABS ────────────────────────────────────────────
# GABS (Game Action Bridge Server) TCP JSON-RPC endpoint.
# The GABS mod runs inside the Windows VM and exposes a JSON-RPC server
# on port 4825 that Timmy uses to read and act on Bannerlord game state.
# Set GABS_HOST to the VM's LAN IP (e.g. "10.0.0.50") to enable.
gabs_enabled: bool = False
gabs_host: str = "127.0.0.1"
gabs_port: int = 4825
gabs_timeout: float = 5.0 # socket timeout in seconds
# How often (seconds) the observer polls GABS for fresh game state.
gabs_poll_interval: int = 60
# Path to the Bannerlord journal inside the memory vault.
# Relative to repo root. Written by the GABS observer loop.
gabs_journal_path: str = "memory/bannerlord/journal.md"
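# Example GABS request (illustrative sketch; the method name and the
# newline-delimited framing are assumptions about the mod's JSON-RPC server):
#   with socket.create_connection((settings.gabs_host, settings.gabs_port),
#                                 timeout=settings.gabs_timeout) as sock:
#       sock.sendall(json.dumps(
#           {"jsonrpc": "2.0", "id": 1, "method": "get_game_state"}
#       ).encode() + b"\n")
#       reply = json.loads(sock.makefile().readline())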
# ── Scripture / Biblical Integration ──────────────────────────────
# Enable the biblical text module.
scripture_enabled: bool = True
# Primary translation for retrieval and citation.
scripture_translation: str = "ESV"
# Meditation mode: sequential | thematic | lectionary
scripture_meditation_mode: str = "sequential"
# Background meditation interval in seconds (0 = disabled).
scripture_meditation_interval: int = 0
def _compute_repo_root(self) -> str:
"""Auto-detect repo root if not set."""
if self.repo_root:
return self.repo_root
# Walk up from this file to find .git
path = os.path.dirname(os.path.abspath(__file__))
path = os.path.dirname(os.path.dirname(path)) # src/ -> project root
while path != os.path.dirname(path):
if os.path.exists(os.path.join(path, ".git")):
return path
path = os.path.dirname(path)
return os.getcwd()
def model_post_init(self, __context) -> None:
"""Post-init: resolve gitea_token from file if not set via env."""
if not self.gitea_token:
# Priority: Timmy's own token → legacy admin token
repo_root = self._compute_repo_root()
timmy_token_path = os.path.join(repo_root, ".timmy_gitea_token")
legacy_token_path = os.path.expanduser("~/.config/gitea/token")
for token_path in (timmy_token_path, legacy_token_path):
try:
if os.path.isfile(token_path):
token = open(token_path).read().strip() # noqa: SIM115
if token:
self.gitea_token = token
break
except OSError:
pass
model_config = SettingsConfigDict(
env_file=".env",
env_file_encoding="utf-8",
extra="ignore",
)
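# Example .env file (illustrative; field names map to upper-cased env vars):
#   OLLAMA_MODEL=qwen3:14b
#   TIMMY_MODEL_BACKEND=vllm
#   VLLM_URL=http://localhost:8001
#   DEBUG=false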
settings = Settings()
# Ensure repo_root is computed if not set
if not settings.repo_root:
settings.repo_root = settings._compute_repo_root()
# ── Model fallback configuration ────────────────────────────────────────────
# Fallback chains are now in settings.fallback_models / settings.vision_fallback_models.
# Override via env vars (FALLBACK_MODELS, VISION_FALLBACK_MODELS) or
# edit config/providers.yaml → fallback_chains.
def check_ollama_model_available(model_name: str) -> bool:
"""Check if a specific Ollama model is available locally."""
try:
import json
import urllib.request
url = settings.normalized_ollama_url
req = urllib.request.Request(
f"{url}/api/tags",
method="GET",
headers={"Accept": "application/json"},
)
with urllib.request.urlopen(req, timeout=5) as response:
data = json.loads(response.read().decode())
models = [m.get("name", "") for m in data.get("models", [])]
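# Accept an exact "name:tag" match, a bare requested name that equals the
# installed model's base name, or an installed tag that begins with the
# requested name (e.g. requesting "qwen3:14b" matches "qwen3:14b-q5_K_M").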
return any(
model_name == m or model_name == m.split(":")[0] or m.startswith(model_name)
for m in models
)
except (OSError, ValueError) as exc:
_startup_logger.debug("Ollama model check failed: %s", exc)
return False
def get_effective_ollama_model() -> str:
"""Get the effective Ollama model, with fallback logic.
Walks the configurable ``settings.fallback_models`` chain when the
user's preferred model is not available locally.
"""
user_model = settings.ollama_model
if check_ollama_model_available(user_model):
return user_model
# Walk the configurable fallback chain
for fallback in settings.fallback_models:
if check_ollama_model_available(fallback):
_startup_logger.warning(
"Requested model '%s' not available. Using fallback: %s",
user_model,
fallback,
)
return fallback
# Last resort - return user's setting and hope for the best
return user_model
# ── Startup validation ───────────────────────────────────────────────────────
_startup_logger = _logging.getLogger("config")
_startup_validated = False
def validate_startup(*, force: bool = False) -> None:
"""Enforce security requirements — call from app entry points, not import.
Skipped in test mode (TIMMY_TEST_MODE=1) unless force=True.
In production: sys.exit(1) if required secrets are missing.
In development: log warnings only.
"""
global _startup_validated
if _startup_validated and not force:
return
if os.environ.get("TIMMY_TEST_MODE") == "1" and not force:
_startup_validated = True
return
if settings.timmy_env == "production":
_missing = []
if not settings.l402_hmac_secret:
_missing.append("L402_HMAC_SECRET")
if not settings.l402_macaroon_secret:
_missing.append("L402_MACAROON_SECRET")
if _missing:
_startup_logger.error(
"PRODUCTION SECURITY ERROR: The following secrets must be set: %s\n"
'Generate with: python3 -c "import secrets; print(secrets.token_hex(32))"\n'
"Set in .env file or environment variables.",
", ".join(_missing),
)
sys.exit(1)
if "*" in settings.cors_origins:
_startup_logger.error(
"PRODUCTION SECURITY ERROR: CORS wildcard '*' is not allowed "
"in production. Set CORS_ORIGINS to explicit origins."
)
sys.exit(1)
_startup_logger.info("Production mode: security secrets validated ✓")
else:
if "*" in settings.cors_origins:
_startup_logger.warning(
"SEC: CORS_ORIGINS contains wildcard '*'"
"restrict to explicit origins before deploying to production."
)
if not settings.l402_hmac_secret:
_startup_logger.warning(
"SEC: L402_HMAC_SECRET is not set — "
"set a unique secret in .env before deploying to production."
)
if not settings.l402_macaroon_secret:
_startup_logger.warning(
"SEC: L402_MACAROON_SECRET is not set — "
"set a unique secret in .env before deploying to production."
)
_startup_validated = True
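# Example entry-point usage (illustrative; the import path depends on how the
# package is laid out, "src.config" is an assumption):
#   from src.config import get_effective_ollama_model, validate_startup
#   validate_startup()  # enforce production secrets before serving
#   model = get_effective_ollama_model()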