#!/usr/bin/env python3
"""
generate_scene_descriptions.py — Auto-generate scene descriptions from image/video.

Scans an assets directory for images, generates rich scene descriptions
using a local vision model, and outputs as training pairs.

Usage:
    python3 training/scripts/generate_scene_descriptions.py --input assets/
    python3 training/scripts/generate_scene_descriptions.py --input assets/ --model gemma3:27b
    python3 training/scripts/generate_scene_descriptions.py --input assets/ --dry-run
"""

import base64
import glob
import json
import mimetypes
import os
import urllib.request
from datetime import datetime, timezone

# OpenAI-compatible endpoint (defaults to a local Ollama server).
ENDPOINT = os.environ.get("OPENAI_BASE_URL", "http://localhost:11434/v1")
API_KEY = os.environ.get("OPENAI_API_KEY", "ollama")

DESCRIPTION_PROMPT = """Describe this image as a rich visual scene for a training dataset.

Output a JSON object with these fields:
- mood: primary emotional mood (1-2 words)
- colors: array of 3-5 dominant colors
- composition: framing type (wide shot, close-up, etc.)
- camera: camera movement suggestion
- description: rich 2-3 sentence scene description

Focus on: lighting, atmosphere, emotion, visual composition. Be specific, not generic.

Return ONLY valid JSON, no markdown fences."""


def encode_image(image_path: str) -> str:
    """Return the file's contents base64-encoded as an ASCII string."""
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode()


def generate_description(image_path: str, model: str = "gemma3:27b") -> dict:
    """Generate a scene description for one image using a vision model.

    Sends the image as a data URL to an OpenAI-compatible /chat/completions
    endpoint and parses the model's JSON reply.

    Returns:
        The parsed scene dict on success. On any failure (network, HTTP,
        bad JSON) returns a placeholder dict containing an "error" key so
        callers can log and continue instead of crashing mid-batch.
    """
    b64 = encode_image(image_path)
    # Use the asset's real MIME type; fall back to image/jpeg for unknowns
    # (the previous hardcoded value) so behavior only improves.
    mime, _ = mimetypes.guess_type(image_path)
    mime = mime or "image/jpeg"

    data = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": DESCRIPTION_PROMPT},
                    {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}},
                ],
            }
        ],
        "max_tokens": 512,
        "temperature": 0.3,  # low temperature: we want consistent, parseable JSON
    }

    url = f"{ENDPOINT}/chat/completions"
    try:
        req = urllib.request.Request(url, json.dumps(data).encode(), {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {API_KEY}",
        }, method="POST")
        with urllib.request.urlopen(req, timeout=60) as resp:
            result = json.loads(resp.read())
        content = result["choices"][0]["message"]["content"]

        # Models sometimes wrap the JSON in ``` fences despite the prompt;
        # strip the first fence line and the trailing fence before parsing.
        content = content.strip()
        if content.startswith("```"):
            content = content.split("\n", 1)[1].rsplit("```", 1)[0].strip()

        return json.loads(content)
    except Exception as e:
        # Broad by design: any per-image failure becomes an error record so
        # the batch keeps going; the caller counts and reports errors.
        return {"error": str(e), "description": "", "mood": "unknown",
                "colors": [], "composition": "", "camera": ""}


def scan_assets(directory: str) -> list:
    """Recursively find media files under *directory*, sorted by path.

    NOTE(review): video extensions (.mp4/.mov/.avi) are included here but
    generate_description() sends the raw file as a single image payload —
    most vision endpoints will reject that; confirm intent.
    NOTE(review): matching is case-sensitive on case-sensitive filesystems,
    so e.g. "PHOTO.JPG" is skipped on Linux.
    """
    extensions = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".mp4", ".mov", ".avi"}
    files = []
    for ext in extensions:
        files.extend(glob.glob(os.path.join(directory, f"**/*{ext}"), recursive=True))
    return sorted(files)


def generate_training_pairs(assets_dir: str, output_path: str, model: str = "gemma3:27b",
                            dry_run: bool = False, limit: int = None):
    """Generate terse→rich training pairs from image assets.

    Args:
        assets_dir: directory scanned recursively for media files.
        output_path: JSONL file to write (parent dirs created as needed).
        model: vision model name passed to the endpoint.
        dry_run: list files without calling the model or writing output.
        limit: process at most this many files (None = all).

    Returns:
        The list of generated pair dicts (empty on dry run).
    """
    files = scan_assets(assets_dir)
    # Bug fix: `if limit:` treated --limit 0 as "no limit"; test against None.
    if limit is not None:
        files = files[:limit]

    print(f"Assets directory: {assets_dir}")
    print(f"Found: {len(files)} image/video files")
    print(f"Model: {model}")
    print(f"Mode: {'DRY RUN' if dry_run else 'GENERATE'}")
    print()

    pairs = []
    errors = 0

    for i, filepath in enumerate(files):
        filename = os.path.basename(filepath)
        # Bug fix: progress line previously printed a literal placeholder
        # instead of the actual file name.
        print(f"  [{i+1}/{len(files)}] {filename}...", end=" ", flush=True)

        if dry_run:
            print("SKIP")
            continue

        scene = generate_description(filepath, model)

        if "error" in scene:
            print(f"ERROR: {scene['error'][:50]}")
            errors += 1
            continue

        pair = {
            # Bug fix: terse prompt previously contained a literal placeholder;
            # it should reference the asset's file name.
            "terse": f"[image: {filename}]",
            "rich": scene.get("description", ""),
            "domain": "scene_descriptions",
            "source_asset": filepath,
            "mood": scene.get("mood", ""),
            "colors": scene.get("colors", []),
            "composition": scene.get("composition", ""),
            "camera": scene.get("camera", ""),
            "model": model,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
        pairs.append(pair)
        print(f"OK ({scene.get('mood', '?')})")

    # Write output as JSONL, one pair per line.
    if pairs and not dry_run:
        # Ensure the (possibly deep, default) output directory exists;
        # previously open(..., "w") raised FileNotFoundError here.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(output_path, "w") as f:
            for p in pairs:
                f.write(json.dumps(p, ensure_ascii=False) + "\n")
        print(f"\nGenerated: {len(pairs)} pairs, {errors} errors")
        print(f"Output: {output_path}")
    elif dry_run:
        print(f"\nDry run: would generate {len(files)} pairs")

    return pairs


def main():
    """CLI entry point: parse arguments and run the generator."""
    import argparse
    parser = argparse.ArgumentParser(description="Generate scene descriptions from images")
    parser.add_argument("--input", required=True, help="Assets directory")
    parser.add_argument("--output", default="training/data/scene-descriptions/auto-generated-scenes.jsonl")
    parser.add_argument("--model", default="gemma3:27b")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--limit", type=int, default=None)
    args = parser.parse_args()
    generate_training_pairs(args.input, args.output, args.model, args.dry_run, args.limit)


if __name__ == "__main__":
    main()