forked from Rockachopa/Timmy-time-dashboard
Merge pull request '[loop-cycle-7] feat: add OLLAMA_NUM_CTX config (#83)' (#98) from fix/num-ctx-remaining into main
Reviewed-on: http://localhost:3000/rockachopa/Timmy-time-dashboard/pulls/98
This commit is contained in:
@@ -17,6 +17,11 @@
|
|||||||
# LLM model to use via Ollama (default: qwen3.5:latest)
|
# LLM model to use via Ollama (default: qwen3.5:latest)
|
||||||
# OLLAMA_MODEL=qwen3.5:latest
|
# OLLAMA_MODEL=qwen3.5:latest
|
||||||
|
|
||||||
|
# Ollama context window size (default: 4096 tokens)
|
||||||
|
# Set higher for more context, lower to save RAM. 0 = model default.
|
||||||
|
# qwen3:30b + 4096 ctx ≈ 19GB VRAM; default ctx ≈ 45GB.
|
||||||
|
# OLLAMA_NUM_CTX=4096
|
||||||
|
|
||||||
# Enable FastAPI interactive docs at /docs and /redoc (default: false)
|
# Enable FastAPI interactive docs at /docs and /redoc (default: false)
|
||||||
# DEBUG=true
|
# DEBUG=true
|
||||||
|
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ providers:
|
|||||||
- name: qwen3.5:latest
|
- name: qwen3.5:latest
|
||||||
default: true
|
default: true
|
||||||
context_window: 128000
|
context_window: 128000
|
||||||
|
# Note: actual context is capped by OLLAMA_NUM_CTX (default 4096; 0 = use the model default) to save RAM
|
||||||
capabilities: [text, tools, json, streaming]
|
capabilities: [text, tools, json, streaming]
|
||||||
- name: llama3.1:8b-instruct
|
- name: llama3.1:8b-instruct
|
||||||
context_window: 128000
|
context_window: 128000
|
||||||
|
|||||||
@@ -22,6 +22,11 @@ class Settings(BaseSettings):
|
|||||||
# llama3.2 (3B) hallucinated tool output consistently in testing.
|
# llama3.2 (3B) hallucinated tool output consistently in testing.
|
||||||
ollama_model: str = "qwen3.5:latest"
|
ollama_model: str = "qwen3.5:latest"
|
||||||
|
|
||||||
|
# Context window size for Ollama inference — override with OLLAMA_NUM_CTX
|
||||||
|
# qwen3:30b with its default context needs ~45GB, which does not fit on a 39GB Mac.
|
||||||
|
# 4096 keeps memory at ~19GB. Set to 0 to use model defaults.
|
||||||
|
ollama_num_ctx: int = 4096
|
||||||
|
|
||||||
# Fallback model chains — override with FALLBACK_MODELS / VISION_FALLBACK_MODELS
|
# Fallback model chains — override with FALLBACK_MODELS / VISION_FALLBACK_MODELS
|
||||||
# as comma-separated strings, e.g. FALLBACK_MODELS="qwen3.5:latest,llama3.1"
|
# as comma-separated strings, e.g. FALLBACK_MODELS="qwen3.5:latest,llama3.1"
|
||||||
# Or edit config/providers.yaml → fallback_chains for the canonical source.
|
# Or edit config/providers.yaml → fallback_chains for the canonical source.
|
||||||
|
|||||||
@@ -306,9 +306,12 @@ def create_timmy(
|
|||||||
logger.warning("Failed to load memory context: %s", exc)
|
logger.warning("Failed to load memory context: %s", exc)
|
||||||
full_prompt = base_prompt
|
full_prompt = base_prompt
|
||||||
|
|
||||||
|
model_kwargs = {}
|
||||||
|
if settings.ollama_num_ctx > 0:
|
||||||
|
model_kwargs["options"] = {"num_ctx": settings.ollama_num_ctx}
|
||||||
agent = Agent(
|
agent = Agent(
|
||||||
name="Agent",
|
name="Agent",
|
||||||
model=Ollama(id=model_name, host=settings.ollama_url, timeout=300),
|
model=Ollama(id=model_name, host=settings.ollama_url, timeout=300, **model_kwargs),
|
||||||
db=SqliteDb(db_file=db_file),
|
db=SqliteDb(db_file=db_file),
|
||||||
description=full_prompt,
|
description=full_prompt,
|
||||||
add_history_to_context=True,
|
add_history_to_context=True,
|
||||||
|
|||||||
@@ -73,9 +73,12 @@ class BaseAgent(ABC):
|
|||||||
if handler:
|
if handler:
|
||||||
tool_instances.append(handler)
|
tool_instances.append(handler)
|
||||||
|
|
||||||
|
ollama_kwargs = {}
|
||||||
|
if settings.ollama_num_ctx > 0:
|
||||||
|
ollama_kwargs["options"] = {"num_ctx": settings.ollama_num_ctx}
|
||||||
return Agent(
|
return Agent(
|
||||||
name=self.name,
|
name=self.name,
|
||||||
model=Ollama(id=self.model, host=settings.ollama_url, timeout=300),
|
model=Ollama(id=self.model, host=settings.ollama_url, timeout=300, **ollama_kwargs),
|
||||||
description=system_prompt,
|
description=system_prompt,
|
||||||
tools=tool_instances if tool_instances else None,
|
tools=tool_instances if tool_instances else None,
|
||||||
add_history_to_context=True,
|
add_history_to_context=True,
|
||||||
|
|||||||
@@ -67,6 +67,7 @@ def test_create_timmy_respects_custom_ollama_url():
|
|||||||
):
|
):
|
||||||
mock_settings.ollama_model = "llama3.2"
|
mock_settings.ollama_model = "llama3.2"
|
||||||
mock_settings.ollama_url = custom_url
|
mock_settings.ollama_url = custom_url
|
||||||
|
mock_settings.ollama_num_ctx = 4096
|
||||||
mock_settings.timmy_model_backend = "ollama"
|
mock_settings.timmy_model_backend = "ollama"
|
||||||
mock_settings.airllm_model_size = "70b"
|
mock_settings.airllm_model_size = "70b"
|
||||||
|
|
||||||
@@ -237,6 +238,7 @@ def test_create_timmy_includes_tools_for_large_model():
|
|||||||
):
|
):
|
||||||
mock_settings.ollama_model = "llama3.1"
|
mock_settings.ollama_model = "llama3.1"
|
||||||
mock_settings.ollama_url = "http://localhost:11434"
|
mock_settings.ollama_url = "http://localhost:11434"
|
||||||
|
mock_settings.ollama_num_ctx = 4096
|
||||||
mock_settings.timmy_model_backend = "ollama"
|
mock_settings.timmy_model_backend = "ollama"
|
||||||
mock_settings.airllm_model_size = "70b"
|
mock_settings.airllm_model_size = "70b"
|
||||||
mock_settings.telemetry_enabled = False
|
mock_settings.telemetry_enabled = False
|
||||||
|
|||||||
Reference in New Issue
Block a user