Compare commits
4 Commits
docs/archi
...
feat/add-v
| Author | SHA1 | Date | |
|---|---|---|---|
| 4b62f215e0 | |||
| 9d72b7d6cb | |||
| f238ce092a | |||
| d36e9e4d1d |
@@ -32,4 +32,5 @@
|
||||
|
||||
- [x] **Task 1:** Add Gemma 4 entries to `KNOWN_MODEL_CAPABILITIES` and vision fallback chain in `src/infrastructure/models/multimodal.py`. Gemma 4 is a multimodal model supporting vision, text, tools, JSON, and streaming. ✅ PR #1493
|
||||
- [x] **Task 3:** Add a `ModelCapability.VIDEO` enum member for future video understanding models. ✅ PR #1494
|
||||
- [ ] **Task 4:** Implement `get_model_for_content("video")` routing with appropriate fallback chain.
|
||||
- [x] **Task 4:** Implement `get_model_for_content("video")` routing with appropriate fallback chain. ✅ PR #1495
|
||||
- [x] **Task 5:** Add VIDEO-capable models to `KNOWN_MODEL_CAPABILITIES` when video understanding models become available in Ollama (e.g., video-llama, video-llava). ✅ (this PR)
|
||||
|
||||
@@ -192,6 +192,43 @@ KNOWN_MODEL_CAPABILITIES: dict[str, set[ModelCapability]] = {
|
||||
ModelCapability.JSON,
|
||||
ModelCapability.STREAMING,
|
||||
},
|
||||
# Video understanding models (Ollama)
|
||||
"video-llama": {
|
||||
ModelCapability.TEXT,
|
||||
ModelCapability.VISION,
|
||||
ModelCapability.VIDEO,
|
||||
ModelCapability.STREAMING,
|
||||
},
|
||||
"video-llama:7b": {
|
||||
ModelCapability.TEXT,
|
||||
ModelCapability.VISION,
|
||||
ModelCapability.VIDEO,
|
||||
ModelCapability.STREAMING,
|
||||
},
|
||||
"video-llama:13b": {
|
||||
ModelCapability.TEXT,
|
||||
ModelCapability.VISION,
|
||||
ModelCapability.VIDEO,
|
||||
ModelCapability.STREAMING,
|
||||
},
|
||||
"video-llava": {
|
||||
ModelCapability.TEXT,
|
||||
ModelCapability.VISION,
|
||||
ModelCapability.VIDEO,
|
||||
ModelCapability.STREAMING,
|
||||
},
|
||||
"video-llava:7b": {
|
||||
ModelCapability.TEXT,
|
||||
ModelCapability.VISION,
|
||||
ModelCapability.VIDEO,
|
||||
ModelCapability.STREAMING,
|
||||
},
|
||||
"video-llava:13b": {
|
||||
ModelCapability.TEXT,
|
||||
ModelCapability.VISION,
|
||||
ModelCapability.VIDEO,
|
||||
ModelCapability.STREAMING,
|
||||
},
|
||||
# Mistral series
|
||||
"mistral": {
|
||||
ModelCapability.TEXT,
|
||||
@@ -289,8 +326,11 @@ DEFAULT_FALLBACK_CHAINS: dict[ModelCapability, list[str]] = {
|
||||
"moondream:1.8b", # Tiny vision model (last resort)
|
||||
],
|
||||
ModelCapability.VIDEO: [
|
||||
# Video understanding models now available in Ollama (added in this PR)
|
||||
# Ordered by preference: smaller 7b variants first, larger 13b variants after
|
||||
"video-llama:7b", # Video-LLaMA — video understanding with temporal reasoning
|
||||
"video-llava:7b", # Video-LLaVA — video understanding via visual-language alignment
|
||||
"video-llama:13b", # Larger Video-LLaMA (more capable, slower)
|
||||
"video-llava:13b", # Larger Video-LLaVA
|
||||
"llava:7b", # Graceful degradation to vision-only model
|
||||
],
|
||||
|
||||
ModelCapability.TOOLS: [
|
||||
|
||||
@@ -94,6 +94,24 @@ class TestKnownModelCapabilities:
|
||||
for name, caps in KNOWN_MODEL_CAPABILITIES.items():
|
||||
assert ModelCapability.TEXT in caps, f"{name} should have TEXT"
|
||||
|
||||
def test_video_models_have_video(self):
|
||||
video_names = [
|
||||
"video-llama",
|
||||
"video-llama:7b",
|
||||
"video-llama:13b",
|
||||
"video-llava",
|
||||
"video-llava:7b",
|
||||
"video-llava:13b",
|
||||
]
|
||||
for name in video_names:
|
||||
assert ModelCapability.VIDEO in KNOWN_MODEL_CAPABILITIES[name], name
|
||||
assert ModelCapability.VISION in KNOWN_MODEL_CAPABILITIES[name], name
|
||||
assert ModelCapability.TEXT in KNOWN_MODEL_CAPABILITIES[name], name
|
||||
|
||||
def test_video_models_have_streaming(self):
|
||||
for name in ["video-llama", "video-llava"]:
|
||||
assert ModelCapability.STREAMING in KNOWN_MODEL_CAPABILITIES[name], name
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Default fallback chains
|
||||
@@ -110,6 +128,14 @@ class TestDefaultFallbackChains:
|
||||
def test_audio_chain_empty(self):
|
||||
assert DEFAULT_FALLBACK_CHAINS[ModelCapability.AUDIO] == []
|
||||
|
||||
def test_video_chain_non_empty(self):
|
||||
assert len(DEFAULT_FALLBACK_CHAINS[ModelCapability.VIDEO]) > 0
|
||||
|
||||
def test_video_chain_has_video_models(self):
|
||||
chain = DEFAULT_FALLBACK_CHAINS[ModelCapability.VIDEO]
|
||||
assert any("video-llama" in m for m in chain)
|
||||
assert any("video-llava" in m for m in chain)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers to build a manager without hitting the network
|
||||
@@ -457,6 +483,12 @@ class TestGetModelForContent:
|
||||
model, _ = mgr.get_model_for_content("IMAGE")
|
||||
assert model == "llava:7b"
|
||||
|
||||
def test_video_content_uses_video_fallback(self):
|
||||
mgr = _make_manager(["video-llama:7b", "llava:7b"])
|
||||
model, is_fb = mgr.get_model_for_content("video")
|
||||
# Should prefer video-llama from fallback chain
|
||||
assert model == "video-llama:7b"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module-level convenience functions
|
||||
|
||||
Reference in New Issue
Block a user