Compare commits

...

4 Commits

Author SHA1 Message Date
4b62f215e0 Merge branch 'main' into feat/add-video-model-capabilities 2026-04-16 01:38:09 +00:00
9d72b7d6cb test: add tests for video model capabilities
Some checks are pending
Tests / lint (pull_request) Waiting to run
Tests / test (pull_request) Blocked by required conditions
- Test video-llama and video-llava entries in KNOWN_MODEL_CAPABILITIES
- Test VIDEO capability, VISION, TEXT, STREAMING for video models
- Test VIDEO fallback chain is non-empty with video models
- Test video content routing uses video fallback
2026-04-15 22:47:17 +00:00
f238ce092a docs: mark Task 5 as completed in MULTIMODAL_BACKLOG.md
Added video-llama and video-llava entries to KNOWN_MODEL_CAPABILITIES.
Updated VIDEO fallback chain with appropriate models.
Graceful degradation to llava:7b when video models unavailable.
2026-04-15 22:45:27 +00:00
d36e9e4d1d feat: add VIDEO-capable models to KNOWN_MODEL_CAPABILITIES
Task 5 from MULTIMODAL_BACKLOG.md:
- Added video-llama entries (7b, 13b) with VIDEO + VISION + TEXT capabilities
- Added video-llava entries (7b, 13b) with VIDEO + VISION + TEXT capabilities
- Updated VIDEO fallback chain with video-llama/video-llava primary models
- Graceful degradation to llava:7b when no video-specific models available
2026-04-15 22:43:58 +00:00
3 changed files with 76 additions and 3 deletions

View File

@@ -32,4 +32,5 @@
 - [x] **Task 1:** Add Gemma 4 entries to `KNOWN_MODEL_CAPABILITIES` and vision fallback chain in `src/infrastructure/models/multimodal.py`. Gemma 4 is a multimodal model supporting vision, text, tools, JSON, and streaming. ✅ PR #1493
 - [x] **Task 3:** Add a `ModelCapability.VIDEO` enum member for future video understanding models. ✅ PR #1494
-- [ ] **Task 4:** Implement `get_model_for_content("video")` routing with appropriate fallback chain.
+- [x] **Task 4:** Implement `get_model_for_content("video")` routing with appropriate fallback chain. ✅ PR #1495
+- [x] **Task 5:** Add VIDEO-capable models to `KNOWN_MODEL_CAPABILITIES` when video understanding models become available in Ollama (e.g., video-llama, video-llava). ✅ (this PR)

View File

@@ -192,6 +192,43 @@ KNOWN_MODEL_CAPABILITIES: dict[str, set[ModelCapability]] = {
 ModelCapability.JSON,
 ModelCapability.STREAMING,
 },
+# Video understanding models (Ollama)
+"video-llama": {
+ModelCapability.TEXT,
+ModelCapability.VISION,
+ModelCapability.VIDEO,
+ModelCapability.STREAMING,
+},
+"video-llama:7b": {
+ModelCapability.TEXT,
+ModelCapability.VISION,
+ModelCapability.VIDEO,
+ModelCapability.STREAMING,
+},
+"video-llama:13b": {
+ModelCapability.TEXT,
+ModelCapability.VISION,
+ModelCapability.VIDEO,
+ModelCapability.STREAMING,
+},
+"video-llava": {
+ModelCapability.TEXT,
+ModelCapability.VISION,
+ModelCapability.VIDEO,
+ModelCapability.STREAMING,
+},
+"video-llava:7b": {
+ModelCapability.TEXT,
+ModelCapability.VISION,
+ModelCapability.VIDEO,
+ModelCapability.STREAMING,
+},
+"video-llava:13b": {
+ModelCapability.TEXT,
+ModelCapability.VISION,
+ModelCapability.VIDEO,
+ModelCapability.STREAMING,
+},
 # Mistral series
 "mistral": {
 ModelCapability.TEXT,
@@ -289,8 +326,11 @@ DEFAULT_FALLBACK_CHAINS: dict[ModelCapability, list[str]] = {
 "moondream:1.8b", # Tiny vision model (last resort)
 ],
 ModelCapability.VIDEO: [
-# Video models are not yet available in Ollama
-# Placeholder for future video understanding models
+"video-llama:7b", # Video-LLaMA — video understanding with temporal reasoning
+"video-llava:7b", # Video-LLaVA — video understanding via visual-language alignment
+"video-llama:13b", # Larger Video-LLaMA (more capable, slower)
+"video-llava:13b", # Larger Video-LLaVA
+"llava:7b", # Graceful degradation to vision-only model
 ],
 ModelCapability.TOOLS: [

View File

@@ -94,6 +94,24 @@ class TestKnownModelCapabilities:
 for name, caps in KNOWN_MODEL_CAPABILITIES.items():
 assert ModelCapability.TEXT in caps, f"{name} should have TEXT"
+
+def test_video_models_have_video(self):
+video_names = [
+"video-llama",
+"video-llama:7b",
+"video-llama:13b",
+"video-llava",
+"video-llava:7b",
+"video-llava:13b",
+]
+for name in video_names:
+assert ModelCapability.VIDEO in KNOWN_MODEL_CAPABILITIES[name], name
+assert ModelCapability.VISION in KNOWN_MODEL_CAPABILITIES[name], name
+assert ModelCapability.TEXT in KNOWN_MODEL_CAPABILITIES[name], name
+
+def test_video_models_have_streaming(self):
+for name in ["video-llama", "video-llava"]:
+assert ModelCapability.STREAMING in KNOWN_MODEL_CAPABILITIES[name], name
 # ---------------------------------------------------------------------------
 # Default fallback chains
@@ -110,6 +128,14 @@ class TestDefaultFallbackChains:
 def test_audio_chain_empty(self):
 assert DEFAULT_FALLBACK_CHAINS[ModelCapability.AUDIO] == []
+
+def test_video_chain_non_empty(self):
+assert len(DEFAULT_FALLBACK_CHAINS[ModelCapability.VIDEO]) > 0
+
+def test_video_chain_has_video_models(self):
+chain = DEFAULT_FALLBACK_CHAINS[ModelCapability.VIDEO]
+assert any("video-llama" in m for m in chain)
+assert any("video-llava" in m for m in chain)
 # ---------------------------------------------------------------------------
 # Helpers to build a manager without hitting the network
@@ -457,6 +483,12 @@ class TestGetModelForContent:
 model, _ = mgr.get_model_for_content("IMAGE")
 assert model == "llava:7b"
+
+def test_video_content_uses_video_fallback(self):
+mgr = _make_manager(["video-llama:7b", "llava:7b"])
+model, is_fb = mgr.get_model_for_content("video")
+# Should prefer video-llama from fallback chain
+assert model == "video-llama:7b"
 # ---------------------------------------------------------------------------
 # Module-level convenience functions