Merge feat/gemma4-multimodal-support: resolve MULTIMODAL_BACKLOG.md conflict
Combined main's rich epic structure with PR #1493's Task 3/4 items. Code changes to multimodal.py merged cleanly.
This commit is contained in:
@@ -27,3 +27,9 @@
 - [ ] Ingest PDF research papers on agentic workflows.
 - [ ] Analyze diagrams and charts to extract structural logic.
 - [ ] Synthesize findings into `Sovereign_Knowledge_Graph.md`.
+
+## General Tasks
+
+- [x] **Task 1:** Add Gemma 4 entries to `KNOWN_MODEL_CAPABILITIES` and vision fallback chain in `src/infrastructure/models/multimodal.py`. Gemma 4 is a multimodal model supporting vision, text, tools, JSON, and streaming. ✅ PR #1493
+- [ ] **Task 3:** Add a `ModelCapability.VIDEO` enum member for future video understanding models.
+- [ ] **Task 4:** Implement `get_model_for_content("video")` routing with appropriate fallback chain.
@@ -162,6 +162,35 @@ KNOWN_MODEL_CAPABILITIES: dict[str, set[ModelCapability]] = {
     "gemma2:2b": {ModelCapability.TEXT, ModelCapability.JSON, ModelCapability.STREAMING},
     "gemma2:9b": {ModelCapability.TEXT, ModelCapability.JSON, ModelCapability.STREAMING},
     "gemma2:27b": {ModelCapability.TEXT, ModelCapability.JSON, ModelCapability.STREAMING},
+    # Gemma 4 — multimodal (vision + text + tools)
+    "gemma4": {
+        ModelCapability.TEXT,
+        ModelCapability.VISION,
+        ModelCapability.TOOLS,
+        ModelCapability.JSON,
+        ModelCapability.STREAMING,
+    },
+    "gemma4:4b": {
+        ModelCapability.TEXT,
+        ModelCapability.VISION,
+        ModelCapability.TOOLS,
+        ModelCapability.JSON,
+        ModelCapability.STREAMING,
+    },
+    "gemma4:12b": {
+        ModelCapability.TEXT,
+        ModelCapability.VISION,
+        ModelCapability.TOOLS,
+        ModelCapability.JSON,
+        ModelCapability.STREAMING,
+    },
+    "gemma4:27b": {
+        ModelCapability.TEXT,
+        ModelCapability.VISION,
+        ModelCapability.TOOLS,
+        ModelCapability.JSON,
+        ModelCapability.STREAMING,
+    },
     # Mistral series
     "mistral": {
         ModelCapability.TEXT,
@@ -252,6 +281,7 @@ KNOWN_MODEL_CAPABILITIES: dict[str, set[ModelCapability]] = {
 # These are tried in order when the primary model doesn't support a capability
 DEFAULT_FALLBACK_CHAINS: dict[ModelCapability, list[str]] = {
     ModelCapability.VISION: [
+        "gemma4:12b",  # Gemma 4 — multimodal, fast and capable
         "llama3.2:3b",  # Fast vision model
         "llava:7b",  # Classic vision model
         "qwen2.5-vl:3b",  # Qwen vision
@@ -76,6 +76,10 @@ class TestKnownModelCapabilities:
             "llava",
             "moondream",
             "qwen2.5-vl",
+            "gemma4",
+            "gemma4:4b",
+            "gemma4:12b",
+            "gemma4:27b",
         ]
         for name in vision_names:
            assert ModelCapability.VISION in KNOWN_MODEL_CAPABILITIES[name], name
Reference in New Issue
Block a user