#!/usr/bin/env python3
"""
generate_scenes_from_media.py — Auto-generate scene descriptions from image/video assets.

Scans a directory for images/videos, generates scene descriptions using a
vision model, and outputs training pairs in JSONL format.

Usage:
    python3 scripts/generate_scenes_from_media.py --assets ~/assets/ --output training-data/media-scenes.jsonl
    python3 scripts/generate_scenes_from_media.py --assets ~/assets/ --model llava --dry-run
    python3 scripts/generate_scenes_from_media.py --assets ~/assets/ --max 10 --json
"""

import argparse
import base64
import hashlib
import json
import os
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Optional

# Supported media formats
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"}
VIDEO_EXTENSIONS = {".mp4", ".mov", ".avi", ".mkv", ".webm", ".flv"}
ALL_EXTENSIONS = IMAGE_EXTENSIONS | VIDEO_EXTENSIONS


def find_media_files(assets_dir: str, max_files: int = 0) -> List[Path]:
    """Recursively scan a directory for media files."""
    assets_path = Path(assets_dir).expanduser()  # tolerate an unexpanded "~"
    if not assets_path.exists():
        print(f"ERROR: Directory not found: {assets_dir}", file=sys.stderr)
        return []

    media_files = []
    for ext in sorted(ALL_EXTENSIONS):
        media_files.extend(assets_path.rglob(f"*{ext}"))
        media_files.extend(assets_path.rglob(f"*{ext.upper()}"))

    # Deduplicate (case-insensitive filesystems can match the same file twice)
    media_files = sorted(set(media_files))
    if max_files > 0:
        media_files = media_files[:max_files]
    return media_files


def file_hash(filepath: Path) -> str:
    """Short, stable hash of the file *path*, used as a dedup key in output records."""
    return hashlib.sha256(str(filepath).encode()).hexdigest()[:16]


def generate_description_prompt(filepath: Path) -> str:
    """Build the prompt sent to the vision model."""
    if filepath.suffix.lower() in IMAGE_EXTENSIONS:
        return (
            "Describe this image as a visual scene for a training dataset. "
            "Include: mood, dominant colors (2-3), composition type, camera angle, "
            "and a vivid 1-2 sentence description. Format as JSON with keys: "
            "mood, colors, composition, camera, description."
        )
    return (
        "Describe this video frame as a visual scene for a training dataset. "
        "Include: mood, dominant colors (2-3), composition type, camera movement, "
        "and a vivid 1-2 sentence description. Format as JSON with keys: "
        "mood, colors, composition, camera, description."
    )
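
# For reference, the prompt above asks the model for a flat JSON object shaped
# like the following (illustrative values only; real model output varies and
# is normalized by parse_description below):
#
#   {
#       "mood": "serene",
#       "colors": ["teal", "amber"],
#       "composition": "rule of thirds",
#       "camera": "low angle",
#       "description": "A lone kayak drifts across a glassy lake at dawn."
#   }
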
def call_vision_model(filepath: Path, model: str = "llava") -> Optional[dict]:
    """
    Call a vision model to generate a scene description.

    Supports:
      - llava (local via Ollama)
      - gpt-4-vision (OpenAI API)
      - claude-vision (Anthropic API)

    Note: video files are sent as-is; extracting a representative frame
    (e.g. with ffmpeg) is left to the caller.
    """
    prompt = generate_description_prompt(filepath)

    try:
        if model.startswith("llava") or model == "ollama":
            # Local Ollama with LLaVA. The /api/generate endpoint expects
            # base64-encoded image data in "images", not file paths.
            with open(filepath, "rb") as f:
                image_data = base64.b64encode(f.read()).decode()
            result = subprocess.run(
                ["curl", "-s", "http://localhost:11434/api/generate",
                 "-d", json.dumps({
                     "model": "llava",
                     "prompt": prompt,
                     "images": [image_data],
                     "stream": False,
                 })],
                capture_output=True, text=True, timeout=60
            )
            if result.returncode == 0:
                response = json.loads(result.stdout)
                return parse_description(response.get("response", ""))

        elif model.startswith("gpt-4"):
            # OpenAI GPT-4 Vision (requires API key)
            api_key = os.environ.get("OPENAI_API_KEY")
            if not api_key:
                print("ERROR: OPENAI_API_KEY not set", file=sys.stderr)
                return None
            with open(filepath, "rb") as f:
                image_data = base64.b64encode(f.read()).decode()
            result = subprocess.run(
                ["curl", "-s", "https://api.openai.com/v1/chat/completions",
                 "-H", f"Authorization: Bearer {api_key}",
                 "-H", "Content-Type: application/json",
                 "-d", json.dumps({
                     "model": "gpt-4-vision-preview",
                     "messages": [{
                         "role": "user",
                         "content": [
                             {"type": "text", "text": prompt},
                             {"type": "image_url",
                              "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}},
                         ],
                     }],
                     "max_tokens": 500,
                 })],
                capture_output=True, text=True, timeout=60
            )
            if result.returncode == 0:
                response = json.loads(result.stdout)
                content = response["choices"][0]["message"]["content"]
                return parse_description(content)

        elif model.startswith("claude"):
            # Anthropic Claude Vision (requires API key)
            api_key = os.environ.get("ANTHROPIC_API_KEY")
            if not api_key:
                print("ERROR: ANTHROPIC_API_KEY not set", file=sys.stderr)
                return None
            with open(filepath, "rb") as f:
                image_data = base64.b64encode(f.read()).decode()
            # Map the suffix to a media type Claude accepts
            media_type = {
                ".jpg": "image/jpeg", ".jpeg": "image/jpeg",
                ".png": "image/png", ".gif": "image/gif", ".webp": "image/webp",
            }.get(filepath.suffix.lower(), "image/png")
            result = subprocess.run(
                ["curl", "-s", "https://api.anthropic.com/v1/messages",
                 "-H", f"x-api-key: {api_key}",
                 "-H", "anthropic-version: 2023-06-01",
                 "-H", "Content-Type: application/json",
                 "-d", json.dumps({
                     "model": "claude-3-opus-20240229",
                     "max_tokens": 500,
                     "messages": [{
                         "role": "user",
                         "content": [
                             {"type": "image",
                              "source": {"type": "base64",
                                         "media_type": media_type,
                                         "data": image_data}},
                             {"type": "text", "text": prompt},
                         ],
                     }],
                 })],
                capture_output=True, text=True, timeout=60
            )
            if result.returncode == 0:
                response = json.loads(result.stdout)
                content = response["content"][0]["text"]
                return parse_description(content)

        else:
            print(f"ERROR: Unsupported model: {model}", file=sys.stderr)

    except (subprocess.TimeoutExpired, json.JSONDecodeError, KeyError) as e:
        print(f"ERROR calling vision model: {e}", file=sys.stderr)

    return None


def parse_description(text: str) -> dict:
    """Parse a model response into a structured scene description."""
    # Try to extract a flat JSON object from the response
    json_match = re.search(r"\{[^{}]+\}", text)
    if json_match:
        try:
            return json.loads(json_match.group())
        except json.JSONDecodeError:
            pass

    # Fallback: scrape fields out of free-form text
    desc = {
        "mood": "unknown",
        "colors": [],
        "composition": "unknown",
        "camera": "unknown",
        "description": text[:500],
    }
    mood_match = re.search(r'mood["\s:]+(\w+)', text, re.IGNORECASE)
    if mood_match:
        desc["mood"] = mood_match.group(1).lower()
    color_match = re.search(r'colors?["\s:]+\[([^\]]+)\]', text, re.IGNORECASE)
    if color_match:
        desc["colors"] = [c.strip().strip('"').strip("'")
                          for c in color_match.group(1).split(",")]
    return desc
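
# Illustrative only: if the model replies in prose rather than JSON, e.g.
#   'The mood: melancholic. Colors: ["slate", "rust"]. A foggy pier at dusk.'
# the fallback path above recovers
#   {"mood": "melancholic", "colors": ["slate", "rust"], ...}
# with the raw text (truncated to 500 chars) kept as the description.
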
def generate_training_pair(filepath: Path, description: dict, model: str) -> dict:
    """Build a training pair from a media file and its scene description."""
    return {
        "source_file": str(filepath),
        "source_hash": file_hash(filepath),
        "source_type": "media_asset",
        "media_type": "image" if filepath.suffix.lower() in IMAGE_EXTENSIONS else "video",
        "model": model,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "source_session_id": f"media-gen-{int(time.time())}",
        "prompt": f"Describe the visual scene in {filepath.name}",
        "response": description.get("description", ""),
        "scene": {
            "mood": description.get("mood", "unknown"),
            "colors": description.get("colors", []),
            "composition": description.get("composition", "unknown"),
            "camera": description.get("camera", "unknown"),
            "description": description.get("description", ""),
        },
    }


def main():
    parser = argparse.ArgumentParser(description="Generate scene descriptions from media")
    parser.add_argument("--assets", required=True, help="Assets directory to scan")
    parser.add_argument("--output", help="Output JSONL file path")
    parser.add_argument("--model", default="llava", help="Vision model (llava/gpt-4/claude)")
    parser.add_argument("--max", type=int, default=0, help="Max files to process (0=all)")
    parser.add_argument("--dry-run", action="store_true", help="Don't call vision model")
    parser.add_argument("--json", action="store_true", help="JSON output")
    args = parser.parse_args()

    media_files = find_media_files(args.assets, args.max)
    if not media_files:
        print("No media files found.", file=sys.stderr)
        sys.exit(1)

    # Progress goes to stderr so that --json leaves stdout machine-readable
    print(f"Found {len(media_files)} media files in {args.assets}", file=sys.stderr)

    if args.dry_run:
        assets_root = Path(args.assets).expanduser()
        print("\nDry run — files to process:", file=sys.stderr)
        for f in media_files[:20]:
            print(f"  {f.relative_to(assets_root)}", file=sys.stderr)
        if len(media_files) > 20:
            print(f"  ... and {len(media_files) - 20} more", file=sys.stderr)
        sys.exit(0)

    pairs = []
    errors = 0
    for i, filepath in enumerate(media_files, 1):
        print(f"[{i}/{len(media_files)}] Processing {filepath.name}...",
              end=" ", flush=True, file=sys.stderr)
        description = call_vision_model(filepath, args.model)
        if description:
            pair = generate_training_pair(filepath, description, args.model)
            pairs.append(pair)
            print(f"OK (mood: {pair['scene']['mood']})", file=sys.stderr)
        else:
            errors += 1
            print("ERROR", file=sys.stderr)

    # Output
    output_path = args.output or "training-data/media-scene-descriptions.jsonl"
    if args.json:
        print(json.dumps({"pairs": pairs, "total": len(pairs), "errors": errors}, indent=2))
    else:
        # Ensure the output directory exists before writing
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, "w") as f:
            for pair in pairs:
                f.write(json.dumps(pair, ensure_ascii=False) + "\n")
        print(f"\nGenerated {len(pairs)} scene descriptions ({errors} errors)", file=sys.stderr)
        print(f"Output: {output_path}", file=sys.stderr)


if __name__ == "__main__":
    main()
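
# A sample output record, one JSON object per line (illustrative values only):
#
#   {"source_file": "/home/user/assets/dawn.jpg",
#    "source_hash": "a1b2c3d4e5f60718",
#    "source_type": "media_asset",
#    "media_type": "image",
#    "model": "llava",
#    "timestamp": "2024-05-01T12:00:00+00:00",
#    "source_session_id": "media-gen-1714564800",
#    "prompt": "Describe the visual scene in dawn.jpg",
#    "response": "A lone kayak drifts across a glassy lake at dawn.",
#    "scene": {"mood": "serene", "colors": ["teal", "amber"],
#              "composition": "rule of thirds", "camera": "low angle",
#              "description": "A lone kayak drifts across a glassy lake at dawn."}}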