diff --git a/scripts/generate_scene_descriptions.py b/scripts/generate_scene_descriptions.py
new file mode 100644
index 00000000..56ce86aa
--- /dev/null
+++ b/scripts/generate_scene_descriptions.py
@@ -0,0 +1,359 @@
+#!/usr/bin/env python3
+"""
+Auto-generate scene descriptions from image/video assets.
+
+Scans a directory for media files, generates scene descriptions using
+a local vision model (Ollama), and outputs training pairs in JSONL format.
+
+Supports Gemma 4 multimodal vision via Ollama. Falls back gracefully when
+models are unavailable.
+
+Usage:
+    python scripts/generate_scene_descriptions.py --input ./assets --output training-data/scene-descriptions-auto.jsonl
+    python scripts/generate_scene_descriptions.py --input ./assets --model gemma4:latest --limit 50
+    python scripts/generate_scene_descriptions.py --input ./assets --format sharegpt
+    python scripts/generate_scene_descriptions.py --input ./assets --dry-run      # List files without generating
+    python scripts/generate_scene_descriptions.py --input ./assets --check-model  # Verify model availability
+
+Ref: timmy-config#689
+"""
+
+import argparse
+import base64
+import json
+import re
+import subprocess
+import sys
+import time
+import urllib.error
+import urllib.request
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+# Supported media extensions
+IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
+VIDEO_EXTS = {".mp4", ".webm", ".mov", ".avi", ".mkv"}
+ALL_EXTS = IMAGE_EXTS | VIDEO_EXTS
+
+# Vision models in preference order (best first)
+VISION_MODELS = [
+    "gemma4:latest",      # Gemma 4 — multimodal vision (8B, Q4_K_M)
+    "gemma3:12b",         # Gemma 3 — fallback vision
+    "llava:latest",       # LLaVA — generic vision
+    "llava-phi3:latest",  # LLaVA-Phi3 — lightweight vision
+]
+
+# Vision model prompt template (structured JSON output)
+SCENE_PROMPT = """Describe this image for a visual scene database. Output ONLY valid JSON (no markdown, no explanation):
+{
+  "mood": "one of: calm, energetic, dark, warm, cool, chaotic, serene, tense, joyful, melancholic",
+  "colors": ["dominant color 1", "dominant color 2", "dominant color 3"],
+  "composition": "one of: close-up, wide-shot, medium-shot, low-angle, high-angle, birds-eye, profile, over-shoulder",
+  "camera": "one of: static, slow-pan, tracking, handheld, crane, dolly, steady, locked-off",
+  "lighting": "one of: natural, artificial, mixed, dramatic, soft, harsh, backlit",
+  "description": "2-3 sentence visual description of the scene"
+}
+
+Be specific. Describe what you see, not what you imagine."""
+
+# ShareGPT format prompt (for training pipeline integration)
+SHAREGPT_SCENE_PROMPT = """Analyze this image and describe the visual scene.
+Include mood, dominant colors, composition, camera angle, lighting, and a vivid 2-3 sentence description."""
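+
+# For reference, one default-format JSONL record has roughly this shape
+# (values are illustrative, not from a real run):
+#   {"source": "assets/clip01.mp4", "media_type": "video",
+#    "description": {"mood": "calm", "colors": ["teal", "sand", "white"],
+#                    "composition": "wide-shot", "camera": "static",
+#                    "lighting": "natural", "description": "..."},
+#    "model": "gemma4:latest", "generated_at": "2025-01-01T00:00:00+00:00"}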
+
+
+def check_model_available(model: str, ollama_url: str = "http://localhost:11434") -> bool:
+    """Check if a model is available in Ollama."""
+    try:
+        req = urllib.request.Request(f"{ollama_url}/api/tags")
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            data = json.loads(resp.read())
+        available = [m["name"] for m in data.get("models", [])]
+        return model in available
+    except Exception:
+        return False
+
+
+def auto_detect_model(ollama_url: str = "http://localhost:11434") -> Optional[str]:
+    """Auto-detect the best available vision model."""
+    for model in VISION_MODELS:
+        if check_model_available(model, ollama_url):
+            print(f"Auto-detected vision model: {model}", file=sys.stderr)
+            return model
+    return None
+
+
+def scan_media(input_dir: str) -> list[Path]:
+    """Scan directory for media files recursively."""
+    media_files = []
+    input_path = Path(input_dir)
+    if not input_path.exists():
+        print(f"Error: {input_dir} does not exist", file=sys.stderr)
+        return media_files
+
+    for ext in sorted(ALL_EXTS):
+        media_files.extend(input_path.rglob(f"*{ext}"))
+        media_files.extend(input_path.rglob(f"*{ext.upper()}"))
+
+    return sorted(set(media_files))
+
+
+def extract_video_frame(video_path: Path, output_path: Path) -> bool:
+    """Extract a representative frame from a video using ffmpeg."""
+    try:
+        result = subprocess.run(
+            ["ffmpeg", "-i", str(video_path), "-vframes", "1",
+             "-q:v", "2", str(output_path), "-y"],
+            capture_output=True, timeout=30,
+        )
+        return result.returncode == 0 and output_path.exists() and output_path.stat().st_size > 0
+    except FileNotFoundError:
+        print(" ffmpeg not found — skipping video frame extraction", file=sys.stderr)
+        return False
+    except Exception as e:
+        print(f" ffmpeg error: {e}", file=sys.stderr)
+        return False
+
+
+def describe_image(
+    image_path: Path,
+    model: str = "gemma4:latest",
+    ollama_url: str = "http://localhost:11434",
+    max_retries: int = 2,
+) -> Optional[dict]:
+    """Generate scene description using Ollama vision model with retry."""
+    for attempt in range(max_retries + 1):
+        try:
+            with open(image_path, "rb") as f:
+                image_b64 = base64.b64encode(f.read()).decode()
+
+            req = urllib.request.Request(
+                f"{ollama_url}/api/generate",
+                data=json.dumps({
+                    "model": model,
+                    "prompt": SCENE_PROMPT,
+                    "images": [image_b64],
+                    "stream": False,
+                    "options": {"temperature": 0.3, "num_predict": 512}
+                }).encode(),
+                headers={"Content-Type": "application/json"},
+            )
+            with urllib.request.urlopen(req, timeout=120) as resp:
+                data = json.loads(resp.read())
+            response_text = data.get("response", "")
+
+            # Parse JSON from response; if the model emitted invalid JSON,
+            # fall through to the raw-text fallback instead of failing
+            json_match = re.search(r"\{[\s\S]*\}", response_text)
+            if json_match:
+                try:
+                    parsed = json.loads(json_match.group())
+                except json.JSONDecodeError:
+                    parsed = None
+                # Validate required fields
+                required = ["mood", "colors", "composition", "camera", "description"]
+                if parsed and all(k in parsed for k in required):
+                    return parsed
+
+            # Fallback: return raw description
+            return {
+                "description": response_text[:500],
+                "mood": "unknown",
+                "colors": [],
+                "composition": "unknown",
+                "camera": "unknown",
+                "lighting": "unknown"
+            }
+
+        except (urllib.error.URLError, TimeoutError) as e:
+            if attempt < max_retries:
+                wait = 2 ** attempt
+                print(f" Retry {attempt + 1}/{max_retries} after {wait}s: {e}", file=sys.stderr)
+                time.sleep(wait)
+            else:
+                print(f" Error describing {image_path.name}: {e}", file=sys.stderr)
+                return None
+        except Exception as e:
+            print(f" Error describing {image_path.name}: {e}", file=sys.stderr)
+            return None
+
+
+def describe_image_sharegpt(
+    image_path: Path,
+    model: str = "gemma4:latest",
+    ollama_url: str = "http://localhost:11434",
+    max_retries: int = 2,
+) -> Optional[str]:
+    """Generate scene description in natural language for ShareGPT format."""
+    for attempt in range(max_retries + 1):
+        try:
+            with open(image_path, "rb") as f:
+                image_b64 = base64.b64encode(f.read()).decode()
+
+            req = urllib.request.Request(
+                f"{ollama_url}/api/generate",
+                data=json.dumps({
+                    "model": model,
+                    "prompt": SHAREGPT_SCENE_PROMPT,
+                    "images": [image_b64],
+                    "stream": False,
+                    "options": {"temperature": 0.5, "num_predict": 256}
+                }).encode(),
+                headers={"Content-Type": "application/json"},
+            )
+            with urllib.request.urlopen(req, timeout=120) as resp:
+                data = json.loads(resp.read())
+            return data.get("response", "").strip()
+
+        except (urllib.error.URLError, TimeoutError):
+            if attempt < max_retries:
+                time.sleep(2 ** attempt)
+            else:
+                return None
+        except Exception:
+            return None
+
+
+def generate_training_pairs(
+    media_files: list[Path],
+    model: str,
+    ollama_url: str,
+    limit: int = 0,
+    dry_run: bool = False,
+    output_format: str = "jsonl",
+) -> list[dict]:
+    """Generate training pairs from media files."""
+    pairs = []
+    files = media_files[:limit] if limit > 0 else media_files
+
+    print(f"Processing {len(files)} files with model {model}...", file=sys.stderr)
+
+    for i, media_path in enumerate(files):
+        print(f" [{i + 1}/{len(files)}] {media_path.name}...", file=sys.stderr, end=" ", flush=True)
+
+        if dry_run:
+            print("(dry run)", file=sys.stderr)
+            pairs.append({"source": str(media_path), "status": "dry-run"})
+            continue
+
+        is_video = media_path.suffix.lower() in VIDEO_EXTS
+        work_path = media_path
+
+        if is_video:
+            frame_path = media_path.with_suffix(".frame.jpg")
+            if extract_video_frame(media_path, frame_path):
+                work_path = frame_path
+            else:
+                print("SKIP (frame extraction failed)", file=sys.stderr)
+                continue
+
+        if output_format == "sharegpt":
+            # ShareGPT format for training pipeline
+            description = describe_image_sharegpt(work_path, model, ollama_url)
+            if description:
+                pair = {
+                    "conversations": [
+                        {"from": "human", "value": f"<image>\n{SHAREGPT_SCENE_PROMPT}"},
+                        {"from": "gpt", "value": description}
+                    ],
+                    "source": str(media_path),
+                    "media_type": "video" if is_video else "image",
+                    "model": model,
+                    "generated_at": datetime.now(timezone.utc).isoformat(),
+                }
+                pairs.append(pair)
+                print("OK", file=sys.stderr)
+            else:
+                print("FAIL", file=sys.stderr)
+        else:
+            # Structured JSONL format
+            description = describe_image(work_path, model, ollama_url)
+            if description:
+                pair = {
+                    "source": str(media_path),
+                    "media_type": "video" if is_video else "image",
+                    "description": description,
+                    "model": model,
+                    "generated_at": datetime.now(timezone.utc).isoformat(),
+                }
+                pairs.append(pair)
+                print("OK", file=sys.stderr)
+            else:
+                print("FAIL", file=sys.stderr)
+
+        # Cleanup temp frame
+        if is_video and work_path != media_path:
+            try:
+                work_path.unlink()
+            except Exception:
+                pass
+
+        # Small delay to avoid overwhelming Ollama
+        time.sleep(0.5)
+
+    return pairs
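+
+
+# For reference, one ShareGPT-format record has roughly this shape (values
+# are illustrative; "<image>" marks where the training pipeline attaches
+# the media):
+#   {"conversations": [
+#        {"from": "human", "value": "<image>\nAnalyze this image and describe the visual scene. ..."},
+#        {"from": "gpt", "value": "A calm, wide coastal scene at dusk. ..."}],
+#    "source": "assets/beach.jpg", "media_type": "image",
+#    "model": "gemma4:latest", "generated_at": "2025-01-01T00:00:00+00:00"}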
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Auto-generate scene descriptions from media assets using vision AI"
+    )
+    parser.add_argument("--input", "-i", required=True, help="Input directory with media files")
+    parser.add_argument("--output", "-o", default="training-data/scene-descriptions-auto.jsonl")
+    parser.add_argument("--model", "-m", default="", help="Ollama model name (auto-detects if empty)")
+    parser.add_argument("--ollama-url", default="http://localhost:11434")
+    parser.add_argument("--limit", "-l", type=int, default=0, help="Max files to process (0=all)")
+    parser.add_argument("--dry-run", action="store_true", help="List files without generating")
+    parser.add_argument("--check-model", action="store_true", help="Check model availability and exit")
+    parser.add_argument("--format", choices=["jsonl", "sharegpt"], default="jsonl",
+                        help="Output format: jsonl (structured) or sharegpt (training pipeline)")
+    args = parser.parse_args()
+
+    # Model availability check (prints a status and exits)
+    if args.check_model:
+        if args.model:
+            available = check_model_available(args.model, args.ollama_url)
+            print(f"Model '{args.model}': {'✅ available' if available else '❌ not found'}")
+        else:
+            model = auto_detect_model(args.ollama_url)
+            if model:
+                print(f"✅ Best available: {model}")
+            else:
+                print("❌ No vision models found in Ollama")
+        sys.exit(0)
+
+    # Auto-detect model if not specified
+    model = args.model
+    if not model:
+        model = auto_detect_model(args.ollama_url)
+        if not model:
+            print("Error: No vision model available. Install one with: ollama pull gemma4:latest",
+                  file=sys.stderr)
+            sys.exit(1)
+
+    # Scan and process
+    media_files = scan_media(args.input)
+    print(f"Found {len(media_files)} media files", file=sys.stderr)
+
+    if not media_files:
+        print("No media files found.", file=sys.stderr)
+        sys.exit(1)
+
+    pairs = generate_training_pairs(
+        media_files, model, args.ollama_url,
+        args.limit, args.dry_run, args.format
+    )
+
+    # Write output
+    output_path = Path(args.output)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(output_path, "w") as f:
+        for pair in pairs:
+            f.write(json.dumps(pair, ensure_ascii=False) + "\n")
+
+    print(f"\nWrote {len(pairs)} pairs to {output_path}", file=sys.stderr)
+
+    # Summary: failed files are never appended to pairs, so compare against
+    # the number of files actually attempted (dry-run entries always "succeed")
+    if not args.dry_run:
+        attempted = len(media_files[:args.limit]) if args.limit > 0 else len(media_files)
+        failed = attempted - len(pairs)
+        if failed > 0:
+            print(f" ⚠️ {failed} files failed", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()