From 1ed8e91a44957b725b7357edf74843ad881fa346 Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Mon, 13 Apr 2026 14:59:14 -0400 Subject: [PATCH 1/2] feat: add Gemma 4 to multimodal capabilities registry - Add gemma4 (4b, 12b, 27b) to KNOWN_MODEL_CAPABILITIES with VISION, TOOLS, TEXT, JSON, STREAMING - Add gemma4:12b to VISION fallback chain (preferred over llama3.2:3b) - Add Gemma 4 variants to vision model test assertions - Create MULTIMODAL_BACKLOG.md for tracking multimodal improvements --- MULTIMODAL_BACKLOG.md | 14 ++++++++++++ src/infrastructure/models/multimodal.py | 30 +++++++++++++++++++++++++ tests/infrastructure/test_multimodal.py | 4 ++++ 3 files changed, 48 insertions(+) create mode 100644 MULTIMODAL_BACKLOG.md diff --git a/MULTIMODAL_BACKLOG.md b/MULTIMODAL_BACKLOG.md new file mode 100644 index 0000000..b467dfe --- /dev/null +++ b/MULTIMODAL_BACKLOG.md @@ -0,0 +1,14 @@ +# Multimodal Backlog + +Tracking multimodal capability improvements for Timmy. + +## Pending + +- [ ] **Task 1:** Add Gemma 4 entries to `KNOWN_MODEL_CAPABILITIES` and vision fallback chain in `src/infrastructure/models/multimodal.py`. Gemma 4 is a multimodal model supporting vision, text, tools, JSON, and streaming. +- [ ] **Task 2:** Add Gemma 4 tests to `tests/infrastructure/test_multimodal.py` — verify vision capability detection and fallback chain inclusion. +- [ ] **Task 3:** Add a `ModelCapability.VIDEO` enum member for future video understanding models. +- [ ] **Task 4:** Implement `get_model_for_content("video")` routing with appropriate fallback chain. + +## Completed + +_(none yet)_ diff --git a/src/infrastructure/models/multimodal.py b/src/infrastructure/models/multimodal.py index 402f46b..35c953a 100644 --- a/src/infrastructure/models/multimodal.py +++ b/src/infrastructure/models/multimodal.py @@ -162,6 +162,35 @@ KNOWN_MODEL_CAPABILITIES: dict[str, set[ModelCapability]] = { "gemma2:2b": {ModelCapability.TEXT, ModelCapability.JSON, ModelCapability.STREAMING}, "gemma2:9b": {ModelCapability.TEXT, ModelCapability.JSON, ModelCapability.STREAMING}, "gemma2:27b": {ModelCapability.TEXT, ModelCapability.JSON, ModelCapability.STREAMING}, + # Gemma 4 — multimodal (vision + text + tools) + "gemma4": { + ModelCapability.TEXT, + ModelCapability.VISION, + ModelCapability.TOOLS, + ModelCapability.JSON, + ModelCapability.STREAMING, + }, + "gemma4:4b": { + ModelCapability.TEXT, + ModelCapability.VISION, + ModelCapability.TOOLS, + ModelCapability.JSON, + ModelCapability.STREAMING, + }, + "gemma4:12b": { + ModelCapability.TEXT, + ModelCapability.VISION, + ModelCapability.TOOLS, + ModelCapability.JSON, + ModelCapability.STREAMING, + }, + "gemma4:27b": { + ModelCapability.TEXT, + ModelCapability.VISION, + ModelCapability.TOOLS, + ModelCapability.JSON, + ModelCapability.STREAMING, + }, # Mistral series "mistral": { ModelCapability.TEXT, @@ -252,6 +281,7 @@ KNOWN_MODEL_CAPABILITIES: dict[str, set[ModelCapability]] = { # These are tried in order when the primary model doesn't support a capability DEFAULT_FALLBACK_CHAINS: dict[ModelCapability, list[str]] = { ModelCapability.VISION: [ + "gemma4:12b", # Gemma 4 — multimodal, fast and capable "llama3.2:3b", # Fast vision model "llava:7b", # Classic vision model "qwen2.5-vl:3b", # Qwen vision diff --git a/tests/infrastructure/test_multimodal.py b/tests/infrastructure/test_multimodal.py index ac8bfd7..67abd92 100644 --- a/tests/infrastructure/test_multimodal.py +++ b/tests/infrastructure/test_multimodal.py @@ -71,6 +71,10 @@ class TestKnownModelCapabilities: "llava", "moondream", "qwen2.5-vl", + "gemma4", + "gemma4:4b", + "gemma4:12b", + "gemma4:27b", ] for name in vision_names: assert ModelCapability.VISION in KNOWN_MODEL_CAPABILITIES[name], name From 212e1bcc479b06d071757677750a27bdd3e399f7 Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Mon, 13 Apr 2026 15:04:20 -0400 Subject: [PATCH 2/2] docs: mark Task 1 as completed in multimodal backlog --- MULTIMODAL_BACKLOG.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/MULTIMODAL_BACKLOG.md b/MULTIMODAL_BACKLOG.md index b467dfe..f8dfd0a 100644 --- a/MULTIMODAL_BACKLOG.md +++ b/MULTIMODAL_BACKLOG.md @@ -4,11 +4,9 @@ Tracking multimodal capability improvements for Timmy. ## Pending -- [ ] **Task 1:** Add Gemma 4 entries to `KNOWN_MODEL_CAPABILITIES` and vision fallback chain in `src/infrastructure/models/multimodal.py`. Gemma 4 is a multimodal model supporting vision, text, tools, JSON, and streaming. -- [ ] **Task 2:** Add Gemma 4 tests to `tests/infrastructure/test_multimodal.py` — verify vision capability detection and fallback chain inclusion. - [ ] **Task 3:** Add a `ModelCapability.VIDEO` enum member for future video understanding models. - [ ] **Task 4:** Implement `get_model_for_content("video")` routing with appropriate fallback chain. ## Completed -_(none yet)_ +- [x] **Task 1:** Add Gemma 4 entries to `KNOWN_MODEL_CAPABILITIES` and vision fallback chain in `src/infrastructure/models/multimodal.py`. Gemma 4 is a multimodal model supporting vision, text, tools, JSON, and streaming. ✅ PR #1493