#!/usr/bin/env python3
"""
Auto-generate scene descriptions from image/video assets.

Scans a directory for media files, generates scene descriptions using
a local vision model (Ollama), and outputs training pairs in JSONL format.

Usage:
    python scripts/generate_scene_descriptions.py --input ./assets --output training-data/scene-descriptions-auto.jsonl
    python scripts/generate_scene_descriptions.py --input ./assets --model gemma3:12b --limit 50
    python scripts/generate_scene_descriptions.py --input ./assets --dry-run  # List files without generating
"""

import argparse
import base64
import json
import os
import subprocess
import sys
import tempfile
import time
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Supported media extensions
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
VIDEO_EXTS = {".mp4", ".webm", ".mov", ".avi", ".mkv"}
ALL_EXTS = IMAGE_EXTS | VIDEO_EXTS

# Tuple form of ALL_EXTS so it can be handed straight to str.endswith().
_MEDIA_SUFFIXES = tuple(sorted(ALL_EXTS))

# Vision model prompt template
SCENE_PROMPT = """Describe this image for a visual scene database. Output JSON:
{
  "mood": "one of: calm, energetic, dark, warm, cool, chaotic, serene, tense, joyful, melancholic",
  "colors": ["dominant color 1", "dominant color 2", "dominant color 3"],
  "composition": "one of: close-up, wide-shot, medium-shot, low-angle, high-angle, bird-eye, profile, over-shoulder",
  "camera": "one of: static, slow-pan, tracking, handheld, crane, dolly, steady, locked-off",
  "lighting": "one of: natural, artificial, mixed, dramatic, soft, harsh, backlit",
  "description": "2-3 sentence visual description of the scene"
}

Be specific. Describe what you see, not what you imagine."""


def scan_media(input_dir: str) -> list[Path]:
    """Return every media file below *input_dir*, sorted by path.

    A file counts as media when its name ends with one of ``ALL_EXTS``,
    compared case-insensitively (so ``.jpg``, ``.JPG`` and ``.Jpg`` all
    match). Directories are never returned, even if their name ends with a
    media suffix.

    Args:
        input_dir: Directory to scan recursively.

    Returns:
        Sorted list of matching paths; empty when *input_dir* does not exist
        (an error line is printed to stderr in that case).
    """
    input_path = Path(input_dir)
    if not input_path.exists():
        print(f"Error: {input_dir} does not exist", file=sys.stderr)
        return []

    # One recursive walk; rglob("*") yields each path once, so no de-duplication
    # is needed.
    return sorted(
        path
        for path in input_path.rglob("*")
        if path.name.lower().endswith(_MEDIA_SUFFIXES) and path.is_file()
    )


def extract_video_frame(video_path: Path, output_path: Path) -> bool:
    """Extract the first frame of a video into *output_path* using ffmpeg.

    Args:
        video_path: Video file to read.
        output_path: Image file to write (overwritten if it already exists).

    Returns:
        True only when ffmpeg exited successfully and the output file exists;
        False when ffmpeg is missing, fails, or exceeds the 30 second timeout.
    """
    command = [
        "ffmpeg",
        "-nostdin",  # never wait for interactive input
        "-y",  # global option: must come before the output file to take effect
        "-i", str(video_path),
        "-vframes", "1",
        "-q:v", "2",
        str(output_path),
    ]
    try:
        result = subprocess.run(command, capture_output=True, timeout=30)
    except (OSError, subprocess.SubprocessError):
        # OSError: ffmpeg not installed / not executable.
        # SubprocessError: includes TimeoutExpired.
        return False
    # Check the exit status too, so a stale file left at output_path is not
    # mistaken for a freshly extracted frame.
    return result.returncode == 0 and output_path.exists()


def _parse_scene_response(response_text: str) -> dict:
    """Pull the first JSON object out of a model response.

    Tries to decode a JSON value at each ``{`` in turn and returns the first
    one that decodes to a dict; text after that object is ignored. When no
    object can be decoded, returns a fallback record holding the first 500
    characters of the raw response with every other field set to "unknown".
    """
    decoder = json.JSONDecoder()
    start = response_text.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(response_text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = response_text.find("{", start + 1)

    return {
        "description": response_text[:500],
        "mood": "unknown",
        "colors": [],
        "composition": "unknown",
        "camera": "unknown",
        "lighting": "unknown",
    }


def describe_image(image_path: Path, model: str = "gemma3:12b", ollama_url: str = "http://localhost:11434") -> Optional[dict]:
    """Generate a scene description for one image using an Ollama vision model.

    The image is base64-encoded and POSTed, together with ``SCENE_PROMPT``, to
    ``<ollama_url>/api/generate`` with streaming disabled.

    Args:
        image_path: Image file to describe.
        model: Ollama model name.
        ollama_url: Base URL of the Ollama server (a trailing slash is tolerated).

    Returns:
        The JSON object found in the model response, or a fallback record built
        from the raw text when the response holds no decodable JSON object.
        None when the file cannot be read or the request fails; the error is
        printed to stderr.
    """
    try:
        with open(image_path, "rb") as f:
            image_b64 = base64.b64encode(f.read()).decode("ascii")

        payload = {
            "model": model,
            "prompt": SCENE_PROMPT,
            "images": [image_b64],
            "stream": False,
            "options": {"temperature": 0.3, "num_predict": 512},
        }
        req = urllib.request.Request(
            f"{ollama_url.rstrip('/')}/api/generate",
            data=json.dumps(payload).encode("utf-8"),
            headers={"Content-Type": "application/json"},
        )
        # The context manager closes the HTTP response even if reading fails.
        with urllib.request.urlopen(req, timeout=120) as resp:
            data = json.loads(resp.read())

        return _parse_scene_response(data.get("response", ""))
    except Exception as e:
        # Deliberately broad: this is the per-file boundary, and one bad file
        # or a flaky request must not abort the whole batch.
        print(f"  Error describing {Path(image_path).name}: {e}", file=sys.stderr)
        return None


def generate_training_pairs(media_files: list[Path], model: str, ollama_url: str, limit: int = 0, dry_run: bool = False) -> list[dict]:
    """Generate training pairs from media files.

    Images are described directly. For videos, one frame is extracted into a
    temporary directory (never next to the source file) and described instead.
    Progress is printed to stderr.

    Args:
        media_files: Media paths to process, in order.
        model: Ollama model name, passed to ``describe_image``.
        ollama_url: Base URL of the Ollama server.
        limit: Process only the first *limit* files; 0 or less means all.
        dry_run: When True, call no model and emit one
            ``{"source", "status": "dry-run"}`` stub per file.

    Returns:
        One dict per successfully described file (or per file in dry-run mode).
        Files whose frame extraction or description failed are omitted.
    """
    files = media_files[:limit] if limit > 0 else media_files
    total = len(files)
    print(f"Processing {total} files...", file=sys.stderr)

    pairs = []
    # Extracted frames live in a private temp directory, so a real asset named
    # "<video>.frame.jpg" can never be overwritten or deleted, and read-only
    # asset folders still work.
    with tempfile.TemporaryDirectory(prefix="scene-frames-") as frames_dir:
        for index, media_path in enumerate(files, start=1):
            print(f"  [{index}/{total}] {media_path.name}...", file=sys.stderr, end=" ")

            if dry_run:
                print("(dry run)", file=sys.stderr)
                pairs.append({"source": str(media_path), "status": "dry-run"})
                continue

            is_video = media_path.suffix.lower() in VIDEO_EXTS
            work_path = media_path
            frame_path = None
            try:
                if is_video:
                    # Extract frame for video
                    frame_path = Path(frames_dir) / f"{media_path.stem}.frame.jpg"
                    if not extract_video_frame(media_path, frame_path):
                        print("SKIP (frame extraction failed)", file=sys.stderr)
                        continue
                    work_path = frame_path

                description = describe_image(work_path, model, ollama_url)
                if description:
                    pairs.append({
                        "source": str(media_path),
                        "media_type": "video" if is_video else "image",
                        "description": description,
                        "model": model,
                        "generated_at": datetime.now(timezone.utc).isoformat(),
                    })
                    print("OK", file=sys.stderr)
                else:
                    print("FAIL", file=sys.stderr)
            finally:
                # Remove the frame right away so disk use stays flat over a
                # long run; the directory itself is removed on exit.
                if frame_path is not None:
                    frame_path.unlink(missing_ok=True)

    return pairs


def main():
    """Parse command-line arguments, describe the media and write the JSONL file.

    Exits with status 1 when no media files are found under ``--input``.
    """
    parser = argparse.ArgumentParser(description="Auto-generate scene descriptions from media")
    parser.add_argument("--input", "-i", required=True, help="Input directory with media files")
    parser.add_argument("--output", "-o", default="training-data/scene-descriptions-auto.jsonl")
    parser.add_argument("--model", "-m", default="gemma3:12b", help="Ollama model name")
    parser.add_argument("--ollama-url", default="http://localhost:11434")
    parser.add_argument("--limit", "-l", type=int, default=0, help="Max files to process (0=all)")
    parser.add_argument("--dry-run", action="store_true", help="List files without generating")
    args = parser.parse_args()

    media_files = scan_media(args.input)
    print(f"Found {len(media_files)} media files", file=sys.stderr)

    if not media_files:
        print("No media files found.", file=sys.stderr)
        sys.exit(1)

    pairs = generate_training_pairs(media_files, args.model, args.ollama_url, args.limit, args.dry_run)

    # Write output
    # NOTE(review): in --dry-run mode this still (over)writes --output with the
    # dry-run stub rows; confirm that is intended before pointing a dry run at
    # a real training file.
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: rows are serialized with ensure_ascii=False, so the
    # platform default encoding could fail on non-ASCII descriptions.
    with open(output_path, "w", encoding="utf-8") as f:
        for pair in pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

    print(f"\nWrote {len(pairs)} pairs to {output_path}", file=sys.stderr)


if __name__ == "__main__":
    main()