diff --git a/.env.example b/.env.example
index 28ef7a74..5c9b731c 100644
--- a/.env.example
+++ b/.env.example
@@ -17,6 +17,11 @@
 # LLM model to use via Ollama (default: qwen3.5:latest)
 # OLLAMA_MODEL=qwen3.5:latest
 
+# Ollama context window size (default: 4096 tokens)
+# Set higher for more context, lower to save RAM. 0 = model default.
+# qwen3:30b + 4096 ctx ≈ 19GB VRAM; default ctx ≈ 45GB.
+# OLLAMA_NUM_CTX=4096
+
 # Enable FastAPI interactive docs at /docs and /redoc (default: false)
 # DEBUG=true
 
diff --git a/config/providers.yaml b/config/providers.yaml
index 20d5272b..2b67dd04 100644
--- a/config/providers.yaml
+++ b/config/providers.yaml
@@ -28,6 +28,7 @@ providers:
     - name: qwen3.5:latest
       default: true
       context_window: 128000
+      # Note: actual context is capped by OLLAMA_NUM_CTX (default 4096) to save RAM
       capabilities: [text, tools, json, streaming]
     - name: llama3.1:8b-instruct
       context_window: 128000
diff --git a/src/config.py b/src/config.py
index f4f139a2..2aa0d5dd 100644
--- a/src/config.py
+++ b/src/config.py
@@ -22,6 +22,11 @@ class Settings(BaseSettings):
     # llama3.2 (3B) hallucinated tool output consistently in testing.
     ollama_model: str = "qwen3.5:latest"
 
+    # Context window size for Ollama inference — override with OLLAMA_NUM_CTX
+    # qwen3:30b with default context eats 45GB on a 39GB Mac.
+    # 4096 keeps memory at ~19GB. Set to 0 to use model defaults.
+    ollama_num_ctx: int = 4096
+
     # Fallback model chains — override with FALLBACK_MODELS / VISION_FALLBACK_MODELS
     # as comma-separated strings, e.g. FALLBACK_MODELS="qwen3.5:latest,llama3.1"
     # Or edit config/providers.yaml → fallback_chains for the canonical source.
diff --git a/src/timmy/agent.py b/src/timmy/agent.py
index 2cac1fbb..911c3bdd 100644
--- a/src/timmy/agent.py
+++ b/src/timmy/agent.py
@@ -306,9 +306,12 @@ def create_timmy(
         logger.warning("Failed to load memory context: %s", exc)
 
     full_prompt = base_prompt
+    model_kwargs = {}
+    if settings.ollama_num_ctx > 0:
+        model_kwargs["options"] = {"num_ctx": settings.ollama_num_ctx}
     agent = Agent(
         name="Agent",
-        model=Ollama(id=model_name, host=settings.ollama_url, timeout=300),
+        model=Ollama(id=model_name, host=settings.ollama_url, timeout=300, **model_kwargs),
         db=SqliteDb(db_file=db_file),
         description=full_prompt,
         add_history_to_context=True,
diff --git a/src/timmy/agents/base.py b/src/timmy/agents/base.py
index ed8e8792..cc0ccfb1 100644
--- a/src/timmy/agents/base.py
+++ b/src/timmy/agents/base.py
@@ -73,9 +73,12 @@ class BaseAgent(ABC):
             if handler:
                 tool_instances.append(handler)
 
+        ollama_kwargs = {}
+        if settings.ollama_num_ctx > 0:
+            ollama_kwargs["options"] = {"num_ctx": settings.ollama_num_ctx}
         return Agent(
             name=self.name,
-            model=Ollama(id=self.model, host=settings.ollama_url, timeout=300),
+            model=Ollama(id=self.model, host=settings.ollama_url, timeout=300, **ollama_kwargs),
             description=system_prompt,
             tools=tool_instances if tool_instances else None,
             add_history_to_context=True,
diff --git a/tests/timmy/test_agent.py b/tests/timmy/test_agent.py
index 948105f5..55b8fb1d 100644
--- a/tests/timmy/test_agent.py
+++ b/tests/timmy/test_agent.py
@@ -67,6 +67,7 @@ def test_create_timmy_respects_custom_ollama_url():
     ):
         mock_settings.ollama_model = "llama3.2"
         mock_settings.ollama_url = custom_url
+        mock_settings.ollama_num_ctx = 4096
         mock_settings.timmy_model_backend = "ollama"
         mock_settings.airllm_model_size = "70b"
@@ -237,6 +238,7 @@ def test_create_timmy_includes_tools_for_large_model():
     ):
         mock_settings.ollama_model = "llama3.1"
         mock_settings.ollama_url = "http://localhost:11434"
+        mock_settings.ollama_num_ctx = 4096
         mock_settings.timmy_model_backend = "ollama"
         mock_settings.airllm_model_size = "70b"
         mock_settings.telemetry_enabled = False