feat: enhance scene description generator for #689

- Auto-detect best vision model (gemma4:latest preferred)
- Add --check-model flag for model availability
- Add --format sharegpt for training pipeline integration
- Add retry logic for Ollama API calls
- Add video frame extraction with ffmpeg
- Structured JSON output with validation
- Comprehensive CLI with --help

Closes #689
This commit is contained in:
2026-04-15 09:37:25 +00:00
parent 3ad934febd
commit 990676fb02

View File

@@ -0,0 +1,359 @@
#!/usr/bin/env python3
"""
Auto-generate scene descriptions from image/video assets.
Scans a directory for media files, generates scene descriptions using
a local vision model (Ollama), and outputs training pairs in JSONL format.
Supports Gemma 4 multimodal vision via Ollama. Falls back gracefully when
models are unavailable.
Usage:
python scripts/generate_scene_descriptions.py --input ./assets --output training-data/scene-descriptions-auto.jsonl
python scripts/generate_scene_descriptions.py --input ./assets --model gemma4:latest --limit 50
python scripts/generate_scene_descriptions.py --input ./assets --format sharegpt
python scripts/generate_scene_descriptions.py --dry-run # List files without generating
python scripts/generate_scene_descriptions.py --input ./assets --check-model # Verify model availability
Ref: timmy-config#689
"""
import argparse
import base64
import json
import os
import re
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# Supported media extensions
IMAGE_EXTS: set[str] = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
VIDEO_EXTS: set[str] = {".mp4", ".webm", ".mov", ".avi", ".mkv"}
# Every extension the scanner accepts (images and videos combined).
ALL_EXTS: set[str] = IMAGE_EXTS | VIDEO_EXTS
# Vision models in preference order (best first); auto_detect_model()
# probes these against the local Ollama instance, top to bottom.
VISION_MODELS: list[str] = [
    "gemma4:latest",       # Gemma 4 — multimodal vision (8B, Q4_K_M)
    "gemma3:12b",          # Gemma 3 — fallback vision
    "llava:latest",        # LLaVA — generic vision
    "llava-phi3:latest",   # LLaVA-Phi3 — lightweight vision
]
# Vision model prompt template (structured JSON output).
# NOTE: describe_image() validates the keys listed here ("mood", "colors",
# "composition", "camera", "description") before trusting the model output.
SCENE_PROMPT = """Describe this image for a visual scene database. Output ONLY valid JSON (no markdown, no explanation):
{
"mood": "one of: calm, energetic, dark, warm, cool, chaotic, serene, tense, joyful, melancholic",
"colors": ["dominant color 1", "dominant color 2", "dominant color 3"],
"composition": "one of: close-up, wide-shot, medium-shot, low-angle, high-angle, bird-eye, profile, over-shoulder",
"camera": "one of: static, slow-pan, tracking, handheld, crane, dolly, steady, locked-off",
"lighting": "one of: natural, artificial, mixed, dramatic, soft, harsh, backlit",
"description": "2-3 sentence visual description of the scene"
}
Be specific. Describe what you see, not what you imagine."""
# ShareGPT format prompt (for training pipeline integration) — free text,
# used by describe_image_sharegpt() for conversation-style pairs.
SHAREGPT_SCENE_PROMPT = """Analyze this image and describe the visual scene. Include mood, dominant colors, composition, camera angle, lighting, and a vivid 2-3 sentence description."""
def check_model_available(model: str, ollama_url: str = "http://localhost:11434") -> bool:
    """Return True if *model* is listed by Ollama's /api/tags endpoint.

    Any failure (server unreachable, timeout, malformed response) counts as
    "not available" — this is a best-effort probe and must never crash.
    """
    try:
        req = urllib.request.Request(f"{ollama_url}/api/tags")
        # Context manager closes the HTTP response even if parsing fails
        # (the original leaked the connection).
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
        available = [m["name"] for m in data.get("models", [])]
        return model in available
    except Exception:
        # Deliberate broad catch: probing availability is best-effort.
        return False
def auto_detect_model(ollama_url: str = "http://localhost:11434") -> Optional[str]:
    """Return the first entry of VISION_MODELS available in Ollama, or None.

    Models are probed in preference order; probing stops at the first hit,
    which is announced on stderr.
    """
    detected = next(
        (candidate for candidate in VISION_MODELS
         if check_model_available(candidate, ollama_url)),
        None,
    )
    if detected is not None:
        print(f"Auto-detected vision model: {detected}", file=sys.stderr)
    return detected
def scan_media(input_dir: str) -> list[Path]:
    """Recursively collect media files under *input_dir*.

    Uses a single directory walk and compares extensions case-insensitively,
    so mixed-case suffixes such as ".Jpg" are found too (the previous
    per-extension rglob passes walked the tree 2x per extension and only
    matched all-lower or all-upper suffixes).

    Returns a sorted, de-duplicated list; empty if the directory is missing.
    """
    input_path = Path(input_dir)
    if not input_path.exists():
        print(f"Error: {input_dir} does not exist", file=sys.stderr)
        return []
    return sorted({
        p for p in input_path.rglob("*")
        if p.is_file() and p.suffix.lower() in ALL_EXTS
    })
def extract_video_frame(video_path: Path, output_path: Path) -> bool:
    """Extract one representative frame from *video_path* into *output_path*.

    Returns True only when ffmpeg exits successfully AND wrote a non-empty
    file. Returns False (with a stderr note) when ffmpeg is missing, times
    out, or fails.
    """
    try:
        # "-y" must precede the output file: ffmpeg treats options after the
        # last output as trailing and ignores them, so the original placement
        # meant an existing output file triggered an interactive overwrite
        # prompt (which fails with captured stdio) instead of overwriting.
        result = subprocess.run(
            ["ffmpeg", "-y", "-i", str(video_path), "-vframes", "1",
             "-q:v", "2", str(output_path)],
            capture_output=True, timeout=30,
        )
        # Check the exit code too — a failed run can still leave a stub file.
        return (
            result.returncode == 0
            and output_path.exists()
            and output_path.stat().st_size > 0
        )
    except FileNotFoundError:
        print(" ffmpeg not found — skipping video frame extraction", file=sys.stderr)
        return False
    except Exception as e:
        print(f" ffmpeg error: {e}", file=sys.stderr)
        return False
def describe_image(
    image_path: Path,
    model: str = "gemma4:latest",
    ollama_url: str = "http://localhost:11434",
    max_retries: int = 2,
) -> Optional[dict]:
    """Generate a structured scene description for *image_path* via Ollama.

    Sends the base64-encoded image to /api/generate with SCENE_PROMPT and
    parses the model's JSON reply. Transient network errors/timeouts are
    retried with exponential backoff (1s, 2s, ...).

    Returns:
        A dict that always contains at least a "description" key — either
        the validated structured reply or a raw-text fallback — or None
        after all retries are exhausted / on a non-retryable error.
    """
    for attempt in range(max_retries + 1):
        try:
            with open(image_path, "rb") as f:
                image_b64 = base64.b64encode(f.read()).decode()
            req = urllib.request.Request(
                f"{ollama_url}/api/generate",
                data=json.dumps({
                    "model": model,
                    "prompt": SCENE_PROMPT,
                    "images": [image_b64],
                    "stream": False,
                    # Low temperature: we want deterministic, schema-shaped output.
                    "options": {"temperature": 0.3, "num_predict": 512}
                }).encode(),
                headers={"Content-Type": "application/json"},
            )
            # Context manager closes the HTTP response (previously leaked).
            with urllib.request.urlopen(req, timeout=120) as resp:
                data = json.loads(resp.read())
            response_text = data.get("response", "")
            # Extract the first {...} span; models often wrap JSON in prose.
            json_match = re.search(r"\{[\s\S]*\}", response_text)
            if json_match:
                try:
                    parsed = json.loads(json_match.group())
                except json.JSONDecodeError:
                    # Malformed JSON from the model is not a transport error.
                    # Previously this fell into the generic handler and
                    # returned None; use the raw-text fallback instead.
                    parsed = None
                if parsed is not None:
                    # Validate required fields before trusting the structure.
                    required = ["mood", "colors", "composition", "camera", "description"]
                    if all(k in parsed for k in required):
                        return parsed
            # Fallback: wrap the raw reply so callers always get the same shape.
            return {
                "description": response_text[:500],
                "mood": "unknown",
                "colors": [],
                "composition": "unknown",
                "camera": "unknown",
                "lighting": "unknown"
            }
        except (urllib.error.URLError, TimeoutError) as e:
            if attempt < max_retries:
                wait = 2 ** attempt
                print(f" Retry {attempt + 1}/{max_retries} after {wait}s: {e}", file=sys.stderr)
                time.sleep(wait)
            else:
                print(f" Error describing {image_path.name}: {e}", file=sys.stderr)
                return None
        except Exception as e:
            print(f" Error describing {image_path.name}: {e}", file=sys.stderr)
            return None
def describe_image_sharegpt(
    image_path: Path,
    model: str = "gemma4:latest",
    ollama_url: str = "http://localhost:11434",
    max_retries: int = 2,
) -> Optional[str]:
    """Generate a free-text scene description for ShareGPT-format pairs.

    Same transport and retry behavior as describe_image(), but uses the
    natural-language SHAREGPT_SCENE_PROMPT and returns the stripped reply
    text, or None after all retries fail.
    """
    for attempt in range(max_retries + 1):
        try:
            with open(image_path, "rb") as f:
                image_b64 = base64.b64encode(f.read()).decode()
            req = urllib.request.Request(
                f"{ollama_url}/api/generate",
                data=json.dumps({
                    "model": model,
                    "prompt": SHAREGPT_SCENE_PROMPT,
                    "images": [image_b64],
                    "stream": False,
                    # Slightly higher temperature than the structured path:
                    # natural-language variety is desirable here.
                    "options": {"temperature": 0.5, "num_predict": 256}
                }).encode(),
                headers={"Content-Type": "application/json"},
            )
            # Context manager closes the HTTP response (previously leaked).
            with urllib.request.urlopen(req, timeout=120) as resp:
                data = json.loads(resp.read())
            return data.get("response", "").strip()
        except (urllib.error.URLError, TimeoutError) as e:
            if attempt < max_retries:
                wait = 2 ** attempt
                # Log retries/errors like describe_image() — the original
                # failed completely silently, which made FAIL rows in the
                # progress output impossible to diagnose.
                print(f" Retry {attempt + 1}/{max_retries} after {wait}s: {e}", file=sys.stderr)
                time.sleep(wait)
            else:
                print(f" Error describing {image_path.name}: {e}", file=sys.stderr)
                return None
        except Exception as e:
            print(f" Error describing {image_path.name}: {e}", file=sys.stderr)
            return None
def generate_training_pairs(
    media_files: list[Path],
    model: str,
    ollama_url: str,
    limit: int = 0,
    dry_run: bool = False,
    output_format: str = "jsonl",
) -> list[dict]:
    """Generate training pairs from media files.

    Videos get a single representative frame extracted to a temp file and
    are described through that frame. Files that fail are skipped, so the
    returned list holds only successful pairs (or dry-run placeholders).

    Args:
        media_files: Candidate paths, processed in order.
        model: Ollama vision model name.
        ollama_url: Base URL of the Ollama server.
        limit: Maximum number of files to process; 0 means no limit.
        dry_run: When True, list files without calling the model.
        output_format: "jsonl" (structured dict) or "sharegpt" (conversation).
    """
    pairs = []
    files = media_files[:limit] if limit > 0 else media_files
    print(f"Processing {len(files)} files with model {model}...", file=sys.stderr)
    for i, media_path in enumerate(files):
        print(f" [{i + 1}/{len(files)}] {media_path.name}...", file=sys.stderr, end=" ", flush=True)
        if dry_run:
            print("(dry run)", file=sys.stderr)
            pairs.append({"source": str(media_path), "status": "dry-run"})
            continue
        is_video = media_path.suffix.lower() in VIDEO_EXTS
        work_path = media_path
        if is_video:
            # Reserve a unique name in the system temp dir instead of writing
            # "<asset>.frame.jpg" next to the source asset: a leftover frame
            # from a crash would otherwise be rescanned as new media on the
            # next run, and it pollutes the input directory.
            fd, tmp_name = tempfile.mkstemp(suffix=".frame.jpg")
            os.close(fd)
            os.unlink(tmp_name)  # free the name so ffmpeg creates the file itself
            frame_path = Path(tmp_name)
            if extract_video_frame(media_path, frame_path):
                work_path = frame_path
            else:
                print("SKIP (frame extraction failed)", file=sys.stderr)
                continue
        if output_format == "sharegpt":
            # ShareGPT conversation format for the training pipeline.
            description = describe_image_sharegpt(work_path, model, ollama_url)
            if description:
                pairs.append({
                    "conversations": [
                        {"from": "human", "value": f"<image>\n{SHAREGPT_SCENE_PROMPT}"},
                        {"from": "gpt", "value": description}
                    ],
                    "source": str(media_path),
                    "media_type": "video" if is_video else "image",
                    "model": model,
                    "generated_at": datetime.now(timezone.utc).isoformat(),
                })
                print("OK", file=sys.stderr)
            else:
                print("FAIL", file=sys.stderr)
        else:
            # Structured JSONL format.
            description = describe_image(work_path, model, ollama_url)
            if description:
                pairs.append({
                    "source": str(media_path),
                    "media_type": "video" if is_video else "image",
                    "description": description,
                    "model": model,
                    "generated_at": datetime.now(timezone.utc).isoformat(),
                })
                print("OK", file=sys.stderr)
            else:
                print("FAIL", file=sys.stderr)
        # Cleanup temp frame (best-effort; never abort the batch over it).
        if is_video and work_path != media_path:
            try:
                work_path.unlink()
            except Exception:
                pass
        # Small delay to avoid overwhelming Ollama
        time.sleep(0.5)
    return pairs
def main():
    """CLI entry point: parse args, select a model, generate and write pairs."""
    parser = argparse.ArgumentParser(
        description="Auto-generate scene descriptions from media assets using vision AI"
    )
    parser.add_argument("--input", "-i", required=True, help="Input directory with media files")
    parser.add_argument("--output", "-o", default="training-data/scene-descriptions-auto.jsonl")
    parser.add_argument("--model", "-m", default="", help="Ollama model name (auto-detects if empty)")
    parser.add_argument("--ollama-url", default="http://localhost:11434")
    parser.add_argument("--limit", "-l", type=int, default=0, help="Max files to process (0=all)")
    parser.add_argument("--dry-run", action="store_true", help="List files without generating")
    parser.add_argument("--check-model", action="store_true", help="Check model availability and exit")
    parser.add_argument("--format", choices=["jsonl", "sharegpt"], default="jsonl",
                        help="Output format: jsonl (structured) or sharegpt (training pipeline)")
    args = parser.parse_args()
    # --check-model: report availability and exit without processing media.
    if args.check_model:
        if args.model:
            available = check_model_available(args.model, args.ollama_url)
            print(f"Model '{args.model}': {'✅ available' if available else '❌ not found'}")
        else:
            model = auto_detect_model(args.ollama_url)
            if model:
                print(f"✅ Best available: {model}")
            else:
                print("❌ No vision models found in Ollama")
        sys.exit(0)
    # Auto-detect model if not specified
    model = args.model
    if not model:
        model = auto_detect_model(args.ollama_url)
        if not model:
            print("Error: No vision model available. Install one with: ollama pull gemma4:latest",
                  file=sys.stderr)
            sys.exit(1)
    # Scan and process
    media_files = scan_media(args.input)
    print(f"Found {len(media_files)} media files", file=sys.stderr)
    if not media_files:
        print("No media files found.", file=sys.stderr)
        sys.exit(1)
    pairs = generate_training_pairs(
        media_files, model, args.ollama_url,
        args.limit, args.dry_run, args.format
    )
    # Write output: one JSON object per line (JSONL)
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        for pair in pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")
    print(f"\nWrote {len(pairs)} pairs to {output_path}", file=sys.stderr)
    # Summary. generate_training_pairs() appends ONLY successful pairs, so
    # the previous key-counting over `pairs` always reported 0 real failures
    # and misreported every dry-run placeholder as a failure. Failures are
    # the processed files that produced no pair; dry runs have none.
    if not args.dry_run:
        processed = len(media_files) if args.limit <= 0 else min(args.limit, len(media_files))
        failed = processed - len(pairs)
        if failed > 0:
            print(f" ⚠️ {failed} files failed", file=sys.stderr)
if __name__ == "__main__":
    main()