#!/usr/bin/env python3
"""
generate_scene_descriptions.py — Auto-generate scene descriptions from image/video.

Scans an assets directory for images, generates rich scene descriptions using a
local vision model, and outputs them as training pairs.

Usage:
    python3 training/scripts/generate_scene_descriptions.py --input assets/
    python3 training/scripts/generate_scene_descriptions.py --input assets/ --model gemma3:27b
    python3 training/scripts/generate_scene_descriptions.py --input assets/ --dry-run
"""
import base64
import glob
import json
import mimetypes
import os
import urllib.request
from datetime import datetime, timezone

# OpenAI-compatible endpoint; defaults target a local Ollama server.
ENDPOINT = os.environ.get("OPENAI_BASE_URL", "http://localhost:11434/v1")
API_KEY = os.environ.get("OPENAI_API_KEY", "ollama")

DESCRIPTION_PROMPT = """Describe this image as a rich visual scene for a training dataset.

Output a JSON object with these fields:
- mood: primary emotional mood (1-2 words)
- colors: array of 3-5 dominant colors
- composition: framing type (wide shot, close-up, etc.)
- camera: camera movement suggestion
- description: rich 2-3 sentence scene description

Focus on: lighting, atmosphere, emotion, visual composition.
Be specific, not generic. Return ONLY valid JSON, no markdown fences."""


def encode_image(image_path: str) -> str:
    """Encode an image file to a base64 string."""
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode()


def generate_description(image_path: str, model: str = "gemma3:27b") -> dict:
    """Generate a scene description for an image using a vision model."""
    b64 = encode_image(image_path)
    # Derive the MIME type from the file extension; fall back to JPEG.
    mime, _ = mimetypes.guess_type(image_path)
    mime = mime or "image/jpeg"
    data = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": DESCRIPTION_PROMPT},
                    {"type": "image_url",
                     "image_url": {"url": f"data:{mime};base64,{b64}"}},
                ],
            }
        ],
        "max_tokens": 512,
        "temperature": 0.3,
    }
    url = f"{ENDPOINT}/chat/completions"
    try:
        req = urllib.request.Request(
            url,
            json.dumps(data).encode(),
            {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {API_KEY}",
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=60) as resp:
            result = json.loads(resp.read())
        content = result["choices"][0]["message"]["content"].strip()
        # Models sometimes wrap the JSON in markdown fences despite the prompt;
        # strip a leading ``` block before parsing.
        if content.startswith("```"):
            content = content.split("\n", 1)[1].rsplit("```", 1)[0].strip()
        return json.loads(content)
    except Exception as e:
        return {
            "error": str(e),
            "description": "",
            "mood": "unknown",
            "colors": [],
            "composition": "",
            "camera": "",
        }


def scan_assets(directory: str) -> list:
    """Find all image/video files in a directory (recursive).

    Note: video files are matched here for inventory purposes, but
    generate_description() sends each file as a single inline image;
    videos would need a frame-extraction step before they can be described.
    """
    extensions = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp",
                  ".mp4", ".mov", ".avi"}
    files = []
    for ext in extensions:
        files.extend(glob.glob(os.path.join(directory, f"**/*{ext}"), recursive=True))
    return sorted(files)


def generate_training_pairs(assets_dir: str, output_path: str,
                            model: str = "gemma3:27b",
                            dry_run: bool = False, limit: int = None):
    """Generate training pairs from image assets."""
    files = scan_assets(assets_dir)
    if limit:
        files = files[:limit]

    print(f"Assets directory: {assets_dir}")
    print(f"Found: {len(files)} image/video files")
    print(f"Model: {model}")
    print(f"Mode: {'DRY RUN' if dry_run else 'GENERATE'}")
    print()

    pairs = []
    errors = 0
    for i, filepath in enumerate(files):
        filename = os.path.basename(filepath)
        print(f"  [{i+1}/{len(files)}] {filename}...", end=" ", flush=True)

        if dry_run:
            print("SKIP")
            continue

        scene = generate_description(filepath, model)
        if "error" in scene:
            print(f"ERROR: {scene['error'][:50]}")
            errors += 1
            continue

        pair = {
            "terse": f"[image: {filename}]",
            "rich": scene.get("description", ""),
            "domain": "scene_descriptions",
            "source_asset": filepath,
            "mood": scene.get("mood", ""),
            "colors": scene.get("colors", []),
            "composition": scene.get("composition", ""),
            "camera": scene.get("camera", ""),
            "model": model,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
        pairs.append(pair)
        print(f"OK ({scene.get('mood', '?')})")

    # Write output as JSONL, creating the output directory if needed.
    if pairs and not dry_run:
        os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
        with open(output_path, "w") as f:
            for p in pairs:
                f.write(json.dumps(p, ensure_ascii=False) + "\n")
        print(f"\nGenerated: {len(pairs)} pairs, {errors} errors")
        print(f"Output: {output_path}")
    elif dry_run:
        print(f"\nDry run: would generate {len(files)} pairs")

    return pairs


def main():
    import argparse
    parser = argparse.ArgumentParser(description="Generate scene descriptions from images")
    parser.add_argument("--input", required=True, help="Assets directory")
    parser.add_argument("--output",
                        default="training/data/scene-descriptions/auto-generated-scenes.jsonl")
    parser.add_argument("--model", default="gemma3:27b")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()

    generate_training_pairs(args.input, args.output, args.model,
                            args.dry_run, args.limit)


if __name__ == "__main__":
    main()