From 1ed8e91a44957b725b7357edf74843ad881fa346 Mon Sep 17 00:00:00 2001
From: Alexander Whitestone <alexpaynex@gmail.com>
Date: Mon, 13 Apr 2026 14:59:14 -0400
Subject: [PATCH 1/2] feat: add Gemma 4 to multimodal capabilities registry

- Add gemma4 (untagged, plus 4b, 12b, 27b) to KNOWN_MODEL_CAPABILITIES with VISION, TOOLS, TEXT, JSON, STREAMING
- Add gemma4:12b to VISION fallback chain (preferred over llama3.2:3b)
- Add Gemma 4 variants to vision model test assertions
- Create MULTIMODAL_BACKLOG.md for tracking multimodal improvements
---
 MULTIMODAL_BACKLOG.md                   | 14 ++++++++++++
 src/infrastructure/models/multimodal.py | 30 +++++++++++++++++++++++++
 tests/infrastructure/test_multimodal.py |  4 ++++
 3 files changed, 48 insertions(+)
 create mode 100644 MULTIMODAL_BACKLOG.md

diff --git a/MULTIMODAL_BACKLOG.md b/MULTIMODAL_BACKLOG.md
new file mode 100644
index 0000000..b467dfe
--- /dev/null
+++ b/MULTIMODAL_BACKLOG.md
@@ -0,0 +1,14 @@
+# Multimodal Backlog
+
+Tracking multimodal capability improvements for Timmy.
+
+## Pending
+
+- [ ] **Task 1:** Add Gemma 4 entries to `KNOWN_MODEL_CAPABILITIES` and vision fallback chain in `src/infrastructure/models/multimodal.py`. Gemma 4 is a multimodal model supporting vision, text, tools, JSON, and streaming.
+- [ ] **Task 2:** Add Gemma 4 tests to `tests/infrastructure/test_multimodal.py` — verify vision capability detection and fallback chain inclusion.
+- [ ] **Task 3:** Add a `ModelCapability.VIDEO` enum member for future video understanding models.
+- [ ] **Task 4:** Implement `get_model_for_content("video")` routing with appropriate fallback chain.
+
+## Completed
+
+_(none yet)_
diff --git a/src/infrastructure/models/multimodal.py b/src/infrastructure/models/multimodal.py
index 402f46b..35c953a 100644
--- a/src/infrastructure/models/multimodal.py
+++ b/src/infrastructure/models/multimodal.py
@@ -162,6 +162,35 @@ KNOWN_MODEL_CAPABILITIES: dict[str, set[ModelCapability]] = {
     "gemma2:2b": {ModelCapability.TEXT, ModelCapability.JSON, ModelCapability.STREAMING},
     "gemma2:9b": {ModelCapability.TEXT, ModelCapability.JSON, ModelCapability.STREAMING},
     "gemma2:27b": {ModelCapability.TEXT, ModelCapability.JSON, ModelCapability.STREAMING},
+    # Gemma 4 — multimodal (vision + text + tools)
+    "gemma4": {
+        ModelCapability.TEXT,
+        ModelCapability.VISION,
+        ModelCapability.TOOLS,
+        ModelCapability.JSON,
+        ModelCapability.STREAMING,
+    },
+    "gemma4:4b": {
+        ModelCapability.TEXT,
+        ModelCapability.VISION,
+        ModelCapability.TOOLS,
+        ModelCapability.JSON,
+        ModelCapability.STREAMING,
+    },
+    "gemma4:12b": {
+        ModelCapability.TEXT,
+        ModelCapability.VISION,
+        ModelCapability.TOOLS,
+        ModelCapability.JSON,
+        ModelCapability.STREAMING,
+    },
+    "gemma4:27b": {
+        ModelCapability.TEXT,
+        ModelCapability.VISION,
+        ModelCapability.TOOLS,
+        ModelCapability.JSON,
+        ModelCapability.STREAMING,
+    },
     # Mistral series
     "mistral": {
         ModelCapability.TEXT,
@@ -252,6 +281,7 @@ KNOWN_MODEL_CAPABILITIES: dict[str, set[ModelCapability]] = {
 # These are tried in order when the primary model doesn't support a capability
 DEFAULT_FALLBACK_CHAINS: dict[ModelCapability, list[str]] = {
     ModelCapability.VISION: [
+        "gemma4:12b",  # Gemma 4 — multimodal, fast and capable
         "llama3.2:3b",  # Fast vision model
         "llava:7b",  # Classic vision model
         "qwen2.5-vl:3b",  # Qwen vision
diff --git a/tests/infrastructure/test_multimodal.py b/tests/infrastructure/test_multimodal.py
index ac8bfd7..67abd92 100644
--- a/tests/infrastructure/test_multimodal.py
+++ b/tests/infrastructure/test_multimodal.py
@@ -71,6 +71,10 @@ class TestKnownModelCapabilities:
             "llava",
             "moondream",
             "qwen2.5-vl",
+            "gemma4",
+            "gemma4:4b",
+            "gemma4:12b",
+            "gemma4:27b",
         ]
         for name in vision_names:
             assert ModelCapability.VISION in KNOWN_MODEL_CAPABILITIES[name], name

From 212e1bcc479b06d071757677750a27bdd3e399f7 Mon Sep 17 00:00:00 2001
From: Alexander Whitestone <alexpaynex@gmail.com>
Date: Mon, 13 Apr 2026 15:04:20 -0400
Subject: [PATCH 2/2] docs: mark Task 1 as completed and remove Task 2 from multimodal backlog

---
 MULTIMODAL_BACKLOG.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/MULTIMODAL_BACKLOG.md b/MULTIMODAL_BACKLOG.md
index b467dfe..f8dfd0a 100644
--- a/MULTIMODAL_BACKLOG.md
+++ b/MULTIMODAL_BACKLOG.md
@@ -4,11 +4,9 @@ Tracking multimodal capability improvements for Timmy.
 
 ## Pending
 
-- [ ] **Task 1:** Add Gemma 4 entries to `KNOWN_MODEL_CAPABILITIES` and vision fallback chain in `src/infrastructure/models/multimodal.py`. Gemma 4 is a multimodal model supporting vision, text, tools, JSON, and streaming.
-- [ ] **Task 2:** Add Gemma 4 tests to `tests/infrastructure/test_multimodal.py` — verify vision capability detection and fallback chain inclusion.
 - [ ] **Task 3:** Add a `ModelCapability.VIDEO` enum member for future video understanding models.
 - [ ] **Task 4:** Implement `get_model_for_content("video")` routing with appropriate fallback chain.
 
 ## Completed
 
-_(none yet)_
+- [x] **Task 1:** Add Gemma 4 entries to `KNOWN_MODEL_CAPABILITIES` and vision fallback chain in `src/infrastructure/models/multimodal.py`. Gemma 4 is a multimodal model supporting vision, text, tools, JSON, and streaming. ✅ PR #1493