diff --git a/MULTIMODAL_BACKLOG.md b/MULTIMODAL_BACKLOG.md index 5bc50a3..6c34076 100644 --- a/MULTIMODAL_BACKLOG.md +++ b/MULTIMODAL_BACKLOG.md @@ -27,3 +27,9 @@ - [ ] Ingest PDF research papers on agentic workflows. - [ ] Analyze diagrams and charts to extract structural logic. - [ ] Synthesize findings into `Sovereign_Knowledge_Graph.md`. + +## General Tasks + +- [x] **Task 1:** Add Gemma 4 entries to `KNOWN_MODEL_CAPABILITIES` and vision fallback chain in `src/infrastructure/models/multimodal.py`. Gemma 4 is a multimodal model supporting vision, text, tools, JSON, and streaming. ✅ PR #1493 +- [ ] **Task 3:** Add a `ModelCapability.VIDEO` enum member for future video understanding models. +- [ ] **Task 4:** Implement `get_model_for_content("video")` routing with appropriate fallback chain. diff --git a/src/infrastructure/models/multimodal.py b/src/infrastructure/models/multimodal.py index 402f46b..35c953a 100644 --- a/src/infrastructure/models/multimodal.py +++ b/src/infrastructure/models/multimodal.py @@ -162,6 +162,35 @@ KNOWN_MODEL_CAPABILITIES: dict[str, set[ModelCapability]] = { "gemma2:2b": {ModelCapability.TEXT, ModelCapability.JSON, ModelCapability.STREAMING}, "gemma2:9b": {ModelCapability.TEXT, ModelCapability.JSON, ModelCapability.STREAMING}, "gemma2:27b": {ModelCapability.TEXT, ModelCapability.JSON, ModelCapability.STREAMING}, + # Gemma 4 — multimodal (vision + text + tools) + "gemma4": { + ModelCapability.TEXT, + ModelCapability.VISION, + ModelCapability.TOOLS, + ModelCapability.JSON, + ModelCapability.STREAMING, + }, + "gemma4:4b": { + ModelCapability.TEXT, + ModelCapability.VISION, + ModelCapability.TOOLS, + ModelCapability.JSON, + ModelCapability.STREAMING, + }, + "gemma4:12b": { + ModelCapability.TEXT, + ModelCapability.VISION, + ModelCapability.TOOLS, + ModelCapability.JSON, + ModelCapability.STREAMING, + }, + "gemma4:27b": { + ModelCapability.TEXT, + ModelCapability.VISION, + ModelCapability.TOOLS, + ModelCapability.JSON, + ModelCapability.STREAMING, + }, # Mistral series "mistral": { ModelCapability.TEXT, @@ -252,6 +281,7 @@ KNOWN_MODEL_CAPABILITIES: dict[str, set[ModelCapability]] = { # These are tried in order when the primary model doesn't support a capability DEFAULT_FALLBACK_CHAINS: dict[ModelCapability, list[str]] = { ModelCapability.VISION: [ + "gemma4:12b", # Gemma 4 — multimodal, fast and capable "llama3.2:3b", # Fast vision model "llava:7b", # Classic vision model "qwen2.5-vl:3b", # Qwen vision diff --git a/tests/infrastructure/test_multimodal.py b/tests/infrastructure/test_multimodal.py index 4ad1678..7be22b5 100644 --- a/tests/infrastructure/test_multimodal.py +++ b/tests/infrastructure/test_multimodal.py @@ -76,6 +76,10 @@ class TestKnownModelCapabilities: "llava", "moondream", "qwen2.5-vl", + "gemma4", + "gemma4:4b", + "gemma4:12b", + "gemma4:27b", ] for name in vision_names: assert ModelCapability.VISION in KNOWN_MODEL_CAPABILITIES[name], name