Merge PR #729: scripts/generate_scene_descriptions.py (added)

This commit is contained in:
Merge Bot
2026-04-16 05:03:55 +00:00
parent e8b5d5edb4
commit 17adc703f8

View File

@@ -0,0 +1,409 @@
#!/usr/bin/env python3
"""
Auto-generate scene descriptions from image/video assets.
Scans a directory for media files, generates scene descriptions using
a local vision model (Ollama), and outputs training pairs in JSONL format.
Supports Gemma 4 multimodal vision via Ollama. Falls back gracefully when
models are unavailable.
Usage:
python scripts/generate_scene_descriptions.py --input ./assets --output training-data/scene-descriptions-auto.jsonl
python scripts/generate_scene_descriptions.py --input ./assets --model gemma4:latest --limit 50
python scripts/generate_scene_descriptions.py --input ./assets --format sharegpt
python scripts/generate_scene_descriptions.py --dry-run # List files without generating
python scripts/generate_scene_descriptions.py --input ./assets --check-model # Verify model availability
Ref: timmy-config#689
"""
import argparse
import base64
import json
import os
import re
import subprocess
import sys
import time
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# Supported media extensions
IMAGE_EXTS: set[str] = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
VIDEO_EXTS: set[str] = {".mp4", ".webm", ".mov", ".avi", ".mkv"}
# Union of everything scan_media() will pick up.
ALL_EXTS: set[str] = IMAGE_EXTS | VIDEO_EXTS
# File size limit (50MB) — prevents unbounded memory usage on large images
# (images are base64-encoded fully in memory before being sent to Ollama).
MAX_FILE_SIZE: int = 50 * 1024 * 1024
# Vision models in preference order (best first); auto_detect_model() walks
# this list and uses the first one installed in the local Ollama instance.
VISION_MODELS: list[str] = [
    "gemma4:latest",  # Gemma 4 — multimodal vision (8B, Q4_K_M)
    "gemma3:12b",  # Gemma 3 — fallback vision
    "llava:latest",  # LLaVA — generic vision
    "llava-phi3:latest",  # LLaVA-Phi3 — lightweight vision
]
# Vision model prompt template (structured JSON output); describe_image()
# parses (and if necessary repairs) the model's JSON reply against this schema.
SCENE_PROMPT: str = """Describe this image for a visual scene database. Output ONLY valid JSON (no markdown, no explanation):
{
"mood": "one of: calm, energetic, dark, warm, cool, chaotic, serene, tense, joyful, melancholic",
"colors": ["dominant color 1", "dominant color 2", "dominant color 3"],
"composition": "one of: close-up, wide-shot, medium-shot, low-angle, high-angle, bird-eye, profile, over-shoulder",
"camera": "one of: static, slow-pan, tracking, handheld, crane, dolly, steady, locked-off",
"lighting": "one of: natural, artificial, mixed, dramatic, soft, harsh, backlit",
"description": "2-3 sentence visual description of the scene"
}
Be specific. Describe what you see, not what you imagine."""
# ShareGPT format prompt (for training pipeline integration) — free-form
# natural-language reply, no JSON parsing involved.
SHAREGPT_SCENE_PROMPT: str = """Analyze this image and describe the visual scene. Include mood, dominant colors, composition, camera angle, lighting, and a vivid 2-3 sentence description."""
def check_model_available(model: str, ollama_url: str = "http://localhost:11434") -> bool:
    """Return True if *model* is installed in the Ollama instance at *ollama_url*.

    Queries the ``/api/tags`` endpoint and matches against the listed model
    names. A bare model name (e.g. ``"gemma4"``) also matches its implicit
    ``":latest"`` tag, mirroring how the Ollama CLI resolves names.
    Any network or parse failure is treated as "not available".
    """
    try:
        req = urllib.request.Request(f"{ollama_url}/api/tags")
        resp = urllib.request.urlopen(req, timeout=10)
        data = json.loads(resp.read())
        available = [m["name"] for m in data.get("models", [])]
        # FIX: Ollama lists tagged names ("gemma4:latest"); an untagged query
        # like "gemma4" previously never matched even when installed.
        return model in available or (
            ":" not in model and f"{model}:latest" in available
        )
    except Exception:
        # Server down, unreachable, or malformed response — report unavailable.
        return False
def auto_detect_model(ollama_url: str = "http://localhost:11434") -> Optional[str]:
    """Return the first installed model from VISION_MODELS, or None if none are.

    Walks the preference-ordered VISION_MODELS list and probes each one
    against the Ollama instance; logs the winner to stderr.
    """
    detected = next(
        (candidate for candidate in VISION_MODELS
         if check_model_available(candidate, ollama_url)),
        None,
    )
    if detected is not None:
        print(f"Auto-detected vision model: {detected}", file=sys.stderr)
    return detected
def scan_media(input_dir: str) -> list[Path]:
    """Recursively collect supported media files under *input_dir*, sorted.

    Extensions are matched case-insensitively. This generalizes the previous
    per-extension lower/upper double scan, which missed mixed-case suffixes
    like ".Jpg" and could also match directories whose names end in an
    extension. Returns an empty list (with a stderr message) when the
    directory does not exist.
    """
    input_path = Path(input_dir)
    if not input_path.exists():
        print(f"Error: {input_dir} does not exist", file=sys.stderr)
        return []
    # Single walk + suffix check: no duplicate hits, so no dedup set needed.
    return sorted(
        p for p in input_path.rglob("*")
        if p.is_file() and p.suffix.lower() in ALL_EXTS
    )
def extract_video_frame(video_path: Path, output_path: Path) -> bool:
    """Grab one representative frame from *video_path* into *output_path*.

    Seeks 2 seconds into the video before capturing so black/title frames
    are skipped. Returns True only when ffmpeg produced a non-empty output
    file; missing ffmpeg or any other failure yields False.
    """
    command = [
        "ffmpeg",
        "-ss", "2",           # seek past intro/black frames
        "-i", str(video_path),
        "-vframes", "1",      # capture a single frame
        "-q:v", "2",          # high JPEG quality
        str(output_path),
        "-y",                 # overwrite existing output without prompting
    ]
    try:
        proc = subprocess.run(command, capture_output=True, timeout=30)
    except FileNotFoundError:
        print(" ffmpeg not found — skipping video frame extraction", file=sys.stderr)
        return False
    except Exception as exc:
        print(f" ffmpeg error: {exc}", file=sys.stderr)
        return False
    if proc.returncode != 0 and proc.stderr:
        print(f" ffmpeg stderr: {proc.stderr.decode(errors='replace')[:200]}", file=sys.stderr)
    return output_path.exists() and output_path.stat().st_size > 0
def describe_image(
    image_path: Path,
    model: str = "gemma4:latest",
    ollama_url: str = "http://localhost:11434",
    max_retries: int = 2,
) -> Optional[dict]:
    """Generate a structured scene description for *image_path* via Ollama.

    Returns a dict with mood/colors/composition/camera/lighting/description
    keys (strict JSON parse, field-by-field repair of truncated JSON, or a
    cleaned natural-language fallback — in that order), or None on failure.
    Transient network errors are retried with exponential backoff; oversized
    files are skipped up front so the in-memory base64 payload stays bounded.
    """
    # FIX #1: Check file size before reading into memory
    if image_path.stat().st_size > MAX_FILE_SIZE:
        print(f" Skipping {image_path.name}: exceeds {MAX_FILE_SIZE // (1024*1024)}MB limit", file=sys.stderr)
        return None
    for attempt in range(max_retries + 1):
        try:
            with open(image_path, "rb") as f:
                image_b64 = base64.b64encode(f.read()).decode()
            req = urllib.request.Request(
                f"{ollama_url}/api/generate",
                data=json.dumps({
                    "model": model,
                    "prompt": SCENE_PROMPT,
                    "images": [image_b64],
                    "stream": False,
                    # Low temperature for consistent structured output.
                    "options": {"temperature": 0.3, "num_predict": 1024}
                }).encode(),
                headers={"Content-Type": "application/json"},
            )
            resp = urllib.request.urlopen(req, timeout=120)
            data = json.loads(resp.read())
            response_text = data.get("response", "")
            # Locate the JSON payload. Prefer a complete {...} object; for
            # truncated output fall back to everything from the first "{".
            raw_json = None
            full_match = re.search(r"\{[\s\S]*\}", response_text)
            if full_match:
                raw_json = full_match.group()
            else:
                brace = re.search(r"\{", response_text)
                if brace:
                    # BUGFIX: previously only the "{" itself was extracted
                    # here (Match.group() of a single-char pattern), so
                    # truncated-but-parseable JSON always failed strict
                    # parsing. Take the full tail from the brace instead.
                    raw_json = response_text[brace.start():]
            if raw_json is not None:
                # Try strict parse first
                try:
                    parsed = json.loads(raw_json)
                    # "lighting" is intentionally not required — some models
                    # omit it; the repair path below fills it in when absent.
                    required = ["mood", "colors", "composition", "camera", "description"]
                    if all(k in parsed for k in required) and parsed.get("description"):
                        return parsed
                except json.JSONDecodeError:
                    # Attempt repair: scrape fields out of truncated JSON.
                    repaired = {}
                    for field in ["mood", "colors", "composition", "camera", "lighting", "description"]:
                        pat = rf'"\s*{field}"\s*:\s*"([^"]*)"'
                        m = re.search(pat, response_text)
                        if m:
                            repaired[field] = m.group(1)
                        elif field == "colors":
                            # colors is a list, not a quoted scalar.
                            colors_match = re.search(r'"colors"\s*:\s*\[([^\]]*)\]', response_text)
                            if colors_match:
                                repaired[field] = [c.strip().strip('"') for c in colors_match.group(1).split(",") if c.strip()]
                            else:
                                repaired[field] = []
                        else:
                            repaired[field] = "unknown"
                    # Only accept the repair if it recovered something real.
                    if repaired.get("description") or repaired.get("mood") != "unknown":
                        return repaired
            # Final fallback: natural language response, markdown stripped.
            clean = re.sub(r"[*_`#]", "", response_text).strip()
            clean = re.sub(r"\n{3,}", "\n\n", clean)
            return {
                "description": clean[:500] if clean else response_text[:500],
                "mood": "unknown",
                "colors": [],
                "composition": "unknown",
                "camera": "unknown",
                "lighting": "unknown"
            }
        except (urllib.error.URLError, TimeoutError) as e:
            # Transient network failure — exponential backoff, then retry.
            if attempt < max_retries:
                wait = 2 ** attempt
                print(f" Retry {attempt + 1}/{max_retries} after {wait}s: {e}", file=sys.stderr)
                time.sleep(wait)
            else:
                print(f" Error describing {image_path.name}: {e}", file=sys.stderr)
                return None
        except Exception as e:
            # Best-effort pipeline: any other failure (bad server JSON, I/O)
            # is logged and treated as "no description" — do not retry.
            print(f" Error describing {image_path.name}: {e}", file=sys.stderr)
            return None
def describe_image_sharegpt(
    image_path: Path,
    model: str = "gemma4:latest",
    ollama_url: str = "http://localhost:11434",
    max_retries: int = 2,
) -> Optional[str]:
    """Generate scene description in natural language for ShareGPT format.

    Sends the base64-encoded image to Ollama's /api/generate with the
    ShareGPT prompt and returns the stripped reply text. Network errors
    are retried with exponential backoff; any other failure returns None.
    """
    # FIX #1: Check file size before reading into memory
    if image_path.stat().st_size > MAX_FILE_SIZE:
        print(f" Skipping {image_path.name}: exceeds {MAX_FILE_SIZE // (1024*1024)}MB limit", file=sys.stderr)
        return None
    attempt = 0
    while attempt <= max_retries:
        try:
            image_b64 = base64.b64encode(image_path.read_bytes()).decode()
            body = json.dumps({
                "model": model,
                "prompt": SHAREGPT_SCENE_PROMPT,
                "images": [image_b64],
                "stream": False,
                "options": {"temperature": 0.5, "num_predict": 256},
            }).encode()
            request = urllib.request.Request(
                f"{ollama_url}/api/generate",
                data=body,
                headers={"Content-Type": "application/json"},
            )
            with urllib.request.urlopen(request, timeout=120) as resp:
                reply = json.loads(resp.read())
            return reply.get("response", "").strip()
        except (urllib.error.URLError, TimeoutError):
            if attempt >= max_retries:
                return None
            time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, ...
        except Exception:
            # Best-effort: silently give up on non-network failures.
            return None
        attempt += 1
def generate_training_pairs(
    media_files: list[Path],
    model: str,
    ollama_url: str,
    limit: int = 0,
    dry_run: bool = False,
    output_format: str = "jsonl",
) -> list[dict]:
    """Generate training pairs from media files.

    For each file: videos first get a single representative frame extracted
    (the file is skipped entirely if extraction fails); the image/frame is
    then described by the vision model and packaged as either a structured
    JSONL record or a ShareGPT conversation. Temporary frame files are
    always removed via try/finally, even if description raises.

    Args:
        media_files: Candidate image/video paths (e.g. from scan_media).
        model: Ollama model name passed to the describe_* helpers.
        ollama_url: Base URL of the Ollama server.
        limit: Process at most this many files (0 = no limit).
        dry_run: If True, emit {"source", "status": "dry-run"} stubs only.
        output_format: "jsonl" (structured) or "sharegpt" (conversation).

    Returns:
        List of generated pair dicts; files that failed are omitted.
    """
    pairs = []
    files = media_files[:limit] if limit > 0 else media_files
    print(f"Processing {len(files)} files with model {model}...", file=sys.stderr)
    for i, media_path in enumerate(files):
        print(f" [{i + 1}/{len(files)}] {media_path.name}...", file=sys.stderr, end=" ", flush=True)
        if dry_run:
            print("(dry run)", file=sys.stderr)
            pairs.append({"source": str(media_path), "status": "dry-run"})
            continue  # NOTE: also skips the inter-file sleep below
        is_video = media_path.suffix.lower() in VIDEO_EXTS
        work_path = media_path
        if is_video:
            # Videos can't be sent to the vision model directly — describe a
            # single extracted frame instead (written next to the source).
            frame_path = media_path.with_suffix(".frame.jpg")
            if extract_video_frame(media_path, frame_path):
                work_path = frame_path
            else:
                print("SKIP (frame extraction failed)", file=sys.stderr)
                continue  # nothing to clean up yet; skips the sleep too
        try:
            if output_format == "sharegpt":
                # ShareGPT format for training pipeline
                description = describe_image_sharegpt(work_path, model, ollama_url)
                if description:
                    pair = {
                        "conversations": [
                            {"from": "human", "value": f"<image>\n{SHAREGPT_SCENE_PROMPT}"},
                            {"from": "gpt", "value": description}
                        ],
                        "source": str(media_path),
                        "media_type": "video" if is_video else "image",
                        "model": model,
                        "generated_at": datetime.now(timezone.utc).isoformat(),
                    }
                    pairs.append(pair)
                    print("OK", file=sys.stderr)
                else:
                    print("FAIL", file=sys.stderr)
            else:
                # Structured JSONL format
                description = describe_image(work_path, model, ollama_url)
                if description:
                    pair = {
                        "source": str(media_path),
                        "media_type": "video" if is_video else "image",
                        "description": description,
                        "model": model,
                        "generated_at": datetime.now(timezone.utc).isoformat(),
                    }
                    pairs.append(pair)
                    print("OK", file=sys.stderr)
                else:
                    print("FAIL", file=sys.stderr)
        finally:
            # FIX #6: Cleanup temp frame in try/finally — survives crashes
            if is_video and work_path != media_path:
                try:
                    work_path.unlink()
                except Exception:
                    # Best-effort cleanup; a stale .frame.jpg is harmless.
                    pass
        # Small delay between files (reduced from 0.5s — Ollama is local)
        time.sleep(0.1)
    return pairs
def main():
    """CLI entry point: parse args, pick a model, scan media, write JSONL.

    Exit codes: 0 on success (and on --check-model when the model is
    available); 1 on missing input, no media files found, or --check-model
    when no usable model is installed.
    """
    parser = argparse.ArgumentParser(
        description="Auto-generate scene descriptions from media assets using vision AI"
    )
    parser.add_argument("--input", "-i", default="", help="Input directory with media files")
    parser.add_argument("--output", "-o", default="training-data/scene-descriptions-auto.jsonl")
    parser.add_argument("--model", "-m", default=None, help="Ollama model name (auto-detects best available if omitted)")
    parser.add_argument("--ollama-url", default="http://localhost:11434")
    parser.add_argument("--limit", "-l", type=int, default=0, help="Max files to process (0=all)")
    parser.add_argument("--dry-run", action="store_true", help="List files without generating")
    parser.add_argument("--check-model", action="store_true", help="Check model availability and exit")
    parser.add_argument("--format", choices=["jsonl", "sharegpt"], default="jsonl",
                        help="Output format: jsonl (structured) or sharegpt (training pipeline)")
    args = parser.parse_args()

    # --check-model: report availability and exit.
    # FIX: exit non-zero when the model is NOT available so shell scripts can
    # branch on the result (previously this always exited 0).
    if args.check_model:
        if args.model:
            available = check_model_available(args.model, args.ollama_url)
            print(f"Model '{args.model}': {'✅ available' if available else '❌ not found'}")
            sys.exit(0 if available else 1)
        model = auto_detect_model(args.ollama_url)
        if model:
            print(f"✅ Best available: {model}")
            sys.exit(0)
        print("❌ No vision models found in Ollama — install one with: ollama pull gemma4:latest")
        sys.exit(1)

    # Auto-detect model if not specified
    model = args.model
    if not model:
        model = auto_detect_model(args.ollama_url)
        if not model:
            # Fall back to best default even if not installed — let Ollama
            # surface the error on the first request.
            model = "gemma4:latest"
            print(f"Warning: No vision models detected. Falling back to {model}", file=sys.stderr)

    # Validate input
    if not args.input:
        print("Error: --input is required (unless using --check-model)", file=sys.stderr)
        sys.exit(1)

    # Scan and process
    media_files = scan_media(args.input)
    print(f"Found {len(media_files)} media files", file=sys.stderr)
    if not media_files:
        print("No media files found.", file=sys.stderr)
        sys.exit(1)
    pairs = generate_training_pairs(
        media_files, model, args.ollama_url,
        args.limit, args.dry_run, args.format
    )

    # Write output (one JSON object per line)
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        for pair in pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")
    print(f"\nWrote {len(pairs)} pairs to {output_path}", file=sys.stderr)

    # Summary. FIX: dry-run stubs have neither "description" nor
    # "conversations", so the old count reported every dry-run file as failed.
    if not args.dry_run:
        success = len([p for p in pairs if "description" in p or "conversations" in p])
        failed = len(pairs) - success
        if failed > 0:
            print(f" ⚠️ {failed} files failed", file=sys.stderr)


if __name__ == "__main__":
    main()