#!/usr/bin/env python3
"""
generate_scene_descriptions.py — Auto-generate scene descriptions from image/video.

Scans an assets directory for images, generates rich scene descriptions
using a local vision model, and outputs as training pairs.

Usage:
    python3 training/scripts/generate_scene_descriptions.py --input assets/
    python3 training/scripts/generate_scene_descriptions.py --input assets/ --model gemma3:27b
    python3 training/scripts/generate_scene_descriptions.py --input assets/ --dry-run
"""

import base64
import glob
import json
import mimetypes
import os
import urllib.request
from datetime import datetime, timezone

# OpenAI-compatible endpoint (defaults to a local Ollama server).
ENDPOINT = os.environ.get("OPENAI_BASE_URL", "http://localhost:11434/v1")
API_KEY = os.environ.get("OPENAI_API_KEY", "ollama")

DESCRIPTION_PROMPT = """Describe this image as a rich visual scene for a training dataset.

Output a JSON object with these fields:
- mood: primary emotional mood (1-2 words)
- colors: array of 3-5 dominant colors
- composition: framing type (wide shot, close-up, etc.)
- camera: camera movement suggestion
- description: rich 2-3 sentence scene description

Focus on: lighting, atmosphere, emotion, visual composition. Be specific, not generic.

Return ONLY valid JSON, no markdown fences."""


def encode_image(image_path: str) -> str:
    """Return the file's contents base64-encoded as an ASCII string."""
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode()


def generate_description(image_path: str, model: str = "gemma3:27b") -> dict:
    """Generate a scene description for one image using a vision model.

    Sends the image as a data URL to an OpenAI-compatible /chat/completions
    endpoint and parses the model's JSON reply.

    Returns:
        The parsed scene dict on success. On any failure (network, HTTP,
        bad JSON) returns a placeholder dict containing an "error" key so
        callers can log and continue instead of crashing mid-batch.
    """
    b64 = encode_image(image_path)
    # Use the asset's real MIME type; fall back to image/jpeg for unknowns
    # (the previous hardcoded value) so behavior only improves.
    mime, _ = mimetypes.guess_type(image_path)
    mime = mime or "image/jpeg"

    data = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": DESCRIPTION_PROMPT},
                    {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}},
                ],
            }
        ],
        "max_tokens": 512,
        "temperature": 0.3,  # low temperature: we want consistent, parseable JSON
    }

    url = f"{ENDPOINT}/chat/completions"
    try:
        req = urllib.request.Request(url, json.dumps(data).encode(), {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {API_KEY}",
        }, method="POST")
        with urllib.request.urlopen(req, timeout=60) as resp:
            result = json.loads(resp.read())
        content = result["choices"][0]["message"]["content"]

        # Models sometimes wrap the JSON in ``` fences despite the prompt;
        # strip the first fence line and the trailing fence before parsing.
        content = content.strip()
        if content.startswith("```"):
            content = content.split("\n", 1)[1].rsplit("```", 1)[0].strip()

        return json.loads(content)
    except Exception as e:
        # Broad by design: any per-image failure becomes an error record so
        # the batch keeps going; the caller counts and reports errors.
        return {"error": str(e), "description": "", "mood": "unknown",
                "colors": [], "composition": "", "camera": ""}


def scan_assets(directory: str) -> list:
    """Recursively find media files under *directory*, sorted by path.

    NOTE(review): video extensions (.mp4/.mov/.avi) are included here but
    generate_description() sends the raw file as a single image payload —
    most vision endpoints will reject that; confirm intent.
    NOTE(review): matching is case-sensitive on case-sensitive filesystems,
    so e.g. "PHOTO.JPG" is skipped on Linux.
    """
    extensions = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".mp4", ".mov", ".avi"}
    files = []
    for ext in extensions:
        files.extend(glob.glob(os.path.join(directory, f"**/*{ext}"), recursive=True))
    return sorted(files)


def generate_training_pairs(assets_dir: str, output_path: str, model: str = "gemma3:27b",
                            dry_run: bool = False, limit: int = None):
    """Generate terse→rich training pairs from image assets.

    Args:
        assets_dir: directory scanned recursively for media files.
        output_path: JSONL file to write (parent dirs created as needed).
        model: vision model name passed to the endpoint.
        dry_run: list files without calling the model or writing output.
        limit: process at most this many files (None = all).

    Returns:
        The list of generated pair dicts (empty on dry run).
    """
    files = scan_assets(assets_dir)
    # Bug fix: `if limit:` treated --limit 0 as "no limit"; test against None.
    if limit is not None:
        files = files[:limit]

    print(f"Assets directory: {assets_dir}")
    print(f"Found: {len(files)} image/video files")
    print(f"Model: {model}")
    print(f"Mode: {'DRY RUN' if dry_run else 'GENERATE'}")
    print()

    pairs = []
    errors = 0

    for i, filepath in enumerate(files):
        filename = os.path.basename(filepath)
        # Bug fix: progress line previously printed a literal placeholder
        # instead of the actual file name.
        print(f"  [{i+1}/{len(files)}] {filename}...", end=" ", flush=True)

        if dry_run:
            print("SKIP")
            continue

        scene = generate_description(filepath, model)

        if "error" in scene:
            print(f"ERROR: {scene['error'][:50]}")
            errors += 1
            continue

        pair = {
            # Bug fix: terse prompt previously contained a literal placeholder;
            # it should reference the asset's file name.
            "terse": f"[image: {filename}]",
            "rich": scene.get("description", ""),
            "domain": "scene_descriptions",
            "source_asset": filepath,
            "mood": scene.get("mood", ""),
            "colors": scene.get("colors", []),
            "composition": scene.get("composition", ""),
            "camera": scene.get("camera", ""),
            "model": model,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
        pairs.append(pair)
        print(f"OK ({scene.get('mood', '?')})")

    # Write output as JSONL, one pair per line.
    if pairs and not dry_run:
        # Ensure the (possibly deep, default) output directory exists;
        # previously open(..., "w") raised FileNotFoundError here.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(output_path, "w") as f:
            for p in pairs:
                f.write(json.dumps(p, ensure_ascii=False) + "\n")
        print(f"\nGenerated: {len(pairs)} pairs, {errors} errors")
        print(f"Output: {output_path}")
    elif dry_run:
        print(f"\nDry run: would generate {len(files)} pairs")

    return pairs


def main():
    """CLI entry point: parse arguments and run the generator."""
    import argparse
    parser = argparse.ArgumentParser(description="Generate scene descriptions from images")
    parser.add_argument("--input", required=True, help="Assets directory")
    parser.add_argument("--output", default="training/data/scene-descriptions/auto-generated-scenes.jsonl")
    parser.add_argument("--model", default="gemma3:27b")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()
    generate_training_pairs(args.input, args.output, args.model, args.dry_run, args.limit)


if __name__ == "__main__":
    main()