Compare commits

...

2 Commits

Author SHA1 Message Date
64650c8598 feat: auto-generate scene descriptions from image/video assets (#689)
Some checks are pending
Architecture Lint / Linter Tests (pull_request) Waiting to run
Architecture Lint / Lint Repository (pull_request) Blocked by required conditions
PR Checklist / pr-checklist (pull_request) Waiting to run
Smoke Test / smoke (pull_request) Waiting to run
Validate Config / YAML Lint (pull_request) Waiting to run
Validate Config / JSON Validate (pull_request) Waiting to run
Validate Config / Python Syntax & Import Check (pull_request) Waiting to run
Validate Config / Python Test Suite (pull_request) Blocked by required conditions
Validate Config / Shell Script Lint (pull_request) Waiting to run
Validate Config / Cron Syntax Check (pull_request) Waiting to run
Validate Config / Deploy Script Dry Run (pull_request) Waiting to run
Validate Config / Playbook Schema Validation (pull_request) Waiting to run
2026-04-16 05:42:35 +00:00
e467960056 feat: Auto-generate scene descriptions from image/video assets (#689)
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 26s
PR Checklist / pr-checklist (pull_request) Failing after 6m53s
Smoke Test / smoke (pull_request) Failing after 33s
Validate Config / YAML Lint (pull_request) Failing after 21s
Validate Config / JSON Validate (pull_request) Successful in 29s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m10s
Validate Config / Shell Script Lint (pull_request) Failing after 56s
Validate Config / Cron Syntax Check (pull_request) Successful in 5s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 4s
Validate Config / Playbook Schema Validation (pull_request) Successful in 20s
Architecture Lint / Lint Repository (pull_request) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
Scans a media directory and generates scene descriptions via an Ollama vision model.
Writes JSONL output in the training-data format.
Closes #689.
2026-04-15 03:08:19 +00:00
2 changed files with 351 additions and 0 deletions

View File: scripts/auto-scene-descriptions.py

@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""
auto-scene-descriptions.py — Generate scene descriptions from image/video assets.
Scans an assets directory, uses vision model to describe each asset,
outputs training pairs in timmy-config format.
Usage:
python3 scripts/auto-scene-descriptions.py --scan ~/assets/
python3 scripts/auto-scene-descriptions.py --scan ~/assets/ --output training-data/scene-from-media.jsonl
python3 scripts/auto-scene-descriptions.py --scan ~/assets/ --dry-run
"""
import argparse
import json
import os
import subprocess
import sys
from pathlib import Path

IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
VIDEO_EXTS = {".mp4", ".webm", ".mov", ".avi", ".mkv"}
SUPPORTED_EXTS = IMAGE_EXTS | VIDEO_EXTS

def scan_assets(directory: str) -> list[Path]:
"""Find all image/video assets in a directory."""
assets = []
for root, dirs, files in os.walk(directory):
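        # Prune in place so os.walk never descends into ignored directories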
dirs[:] = [d for d in dirs if d not in {".git", "node_modules", "__pycache__"}]
for f in sorted(files):
ext = Path(f).suffix.lower()
if ext in SUPPORTED_EXTS:
assets.append(Path(root) / f)
return assets

def extract_video_frame(video_path: Path) -> Path | None:
    """Extract a representative frame from a video using ffmpeg."""
    frame_path = video_path.with_suffix(".frame.jpg")
    try:
        # Seek before -i for a fast input-side seek to ~5s; as a trailing output
        # option the seek would decode every frame up to that point first.
        subprocess.run(
            ["ffmpeg", "-ss", "5", "-i", str(video_path), "-vframes", "1", "-y", str(frame_path)],
            capture_output=True, timeout=30
        )
        # The extracted .frame.jpg is left beside the video; it is not cleaned up here.
        if frame_path.exists():
            return frame_path
    except (subprocess.TimeoutExpired, FileNotFoundError):
        pass
    return None

def describe_with_ollama(image_path: Path) -> str:
"""Generate a scene description using local Ollama vision model."""
try:
result = subprocess.run(
["ollama", "run", "llava", f"Describe this image as a visual scene for a film. Include mood, colors, composition, and camera angle. Be specific and vivid in 2-3 sentences. Image: {image_path}"],
capture_output=True, text=True, timeout=60
)
if result.returncode == 0:
return result.stdout.strip()
except (subprocess.TimeoutExpired, FileNotFoundError):
pass
return ""

def describe_with_fallback(image_path: Path) -> str:
"""Generate a basic scene description from filename/path."""
name = image_path.stem.replace("_", " ").replace("-", " ")
parent = image_path.parent.name.replace("_", " ").replace("-", " ")
return f"A scene depicting {name} in a {parent} setting. Visual composition inferred from asset location and naming conventions."

def build_training_pair(asset_path: Path, description: str, asset_index: int) -> dict:
"""Build a training pair in timmy-config scene description format."""
name = asset_path.stem
parent = asset_path.parent.name
# Infer mood from path/name keywords
mood_keywords = {
"dark": "melancholic", "light": "hopeful", "warm": "nostalgic",
"cold": "isolated", "bright": "energetic", "sunset": "bittersweet",
"night": "mysterious", "morning": "refreshing", "rain": "contemplative",
}
mood = "neutral"
    # First match wins below; join with a space so a keyword can't straddle the name/parent boundary
    name_lower = (name + " " + parent).lower()
for keyword, m in mood_keywords.items():
if keyword in name_lower:
mood = m
break
return {
"song": f"asset-{asset_index:04d}",
"beat": 1,
"lyric_line": f"[Visual asset: {asset_path.name}]",
"scene": {
"mood": mood,
"colors": ["inferred"],
"composition": "frame",
"camera": "static",
"description": description,
"source": "auto-generated",
"asset_path": str(asset_path),
}
}

def main():
parser = argparse.ArgumentParser(description="Auto-generate scene descriptions from assets")
parser.add_argument("--scan", required=True, help="Assets directory to scan")
parser.add_argument("--output", default=None, help="Output JSONL path")
parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
parser.add_argument("--vision", action="store_true", help="Use Ollama vision model (slow)")
args = parser.parse_args()
assets = scan_assets(args.scan)
print(f"Found {len(assets)} assets in {args.scan}", file=sys.stderr)
if not assets:
print("No assets found.", file=sys.stderr)
sys.exit(0)
pairs = []
for i, asset in enumerate(assets):
if args.vision:
# For videos, extract frame first
image_for_vision = asset
if asset.suffix.lower() in VIDEO_EXTS:
frame = extract_video_frame(asset)
if frame:
image_for_vision = frame
description = describe_with_ollama(image_for_vision)
if not description:
description = describe_with_fallback(asset)
else:
description = describe_with_fallback(asset)
pair = build_training_pair(asset, description, i)
pairs.append(pair)
if args.dry_run and i < 5:
print(f" {asset.name}: {description[:80]}")
if args.dry_run:
print(f"\nWould generate {len(pairs)} training pairs")
return
# Write output
output_path = args.output or os.path.expanduser("~/.hermes/training-data/scene-from-media.jsonl")
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as f:
for p in pairs:
f.write(json.dumps(p, ensure_ascii=False) + "\n")
print(f"Generated {len(pairs)} scene descriptions → {output_path}", file=sys.stderr)

if __name__ == "__main__":
main()
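
For reference, with the default fallback path (no --vision), a hypothetical asset such as /home/user/assets/night_city/skyline-dark.png (path and name illustrative, not from the diff) would produce a JSONL record roughly like:

{"song": "asset-0000", "beat": 1, "lyric_line": "[Visual asset: skyline-dark.png]", "scene": {"mood": "melancholic", "colors": ["inferred"], "composition": "frame", "camera": "static", "description": "A scene depicting skyline dark in a night city setting. Visual composition inferred from asset location and naming conventions.", "source": "auto-generated", "asset_path": "/home/user/assets/night_city/skyline-dark.png"}}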

View File: scripts/generate_scene_descriptions.py

@@ -0,0 +1,191 @@
#!/usr/bin/env python3
"""
Auto-generate scene descriptions from image/video assets.
Scans a directory for media files, generates scene descriptions using
a local vision model (Ollama), and outputs training pairs in JSONL format.
Usage:
python scripts/generate_scene_descriptions.py --input ./assets --output training-data/scene-descriptions-auto.jsonl
python scripts/generate_scene_descriptions.py --input ./assets --model gemma3:12b --limit 50
python scripts/generate_scene_descriptions.py --dry-run # List files without generating
"""
import argparse
import base64
import json
import os
import re
import subprocess
import sys
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Supported media extensions
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
VIDEO_EXTS = {".mp4", ".webm", ".mov", ".avi", ".mkv"}
ALL_EXTS = IMAGE_EXTS | VIDEO_EXTS

# Vision model prompt template
SCENE_PROMPT = """Describe this image for a visual scene database. Output JSON:
{
"mood": "one of: calm, energetic, dark, warm, cool, chaotic, serene, tense, joyful, melancholic",
"colors": ["dominant color 1", "dominant color 2", "dominant color 3"],
"composition": "one of: close-up, wide-shot, medium-shot, low-angle, high-angle, bird-eye, profile, over-shoulder",
"camera": "one of: static, slow-pan, tracking, handheld, crane, dolly, steady, locked-off",
"lighting": "one of: natural, artificial, mixed, dramatic, soft, harsh, backlit",
"description": "2-3 sentence visual description of the scene"
}
Be specific. Describe what you see, not what you imagine."""

def scan_media(input_dir: str) -> list[Path]:
"""Scan directory for media files."""
media_files = []
input_path = Path(input_dir)
if not input_path.exists():
print(f"Error: {input_dir} does not exist", file=sys.stderr)
return media_files
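    # rglob is case-sensitive on most filesystems, so match both casings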
for ext in sorted(ALL_EXTS):
media_files.extend(input_path.rglob(f"*{ext}"))
media_files.extend(input_path.rglob(f"*{ext.upper()}"))
return sorted(set(media_files))

def extract_video_frame(video_path: Path, output_path: Path) -> bool:
    """Extract a representative frame from a video."""
    try:
        # -y must precede the output path: placed after it, ffmpeg treats the flag
        # as a trailing option and may block prompting to overwrite an existing file.
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(video_path), "-vframes", "1",
             "-q:v", "2", str(output_path)],
            capture_output=True, timeout=30,
        )
        return output_path.exists()
    except Exception:
        return False

def describe_image(image_path: Path, model: str = "gemma3:12b",
                   ollama_url: str = "http://localhost:11434") -> Optional[dict]:
    """Generate scene description using Ollama vision model."""
    try:
        with open(image_path, "rb") as f:
            image_b64 = base64.b64encode(f.read()).decode()
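        # Call Ollama's /api/generate endpoint with the base64 image attached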
req = urllib.request.Request(
f"{ollama_url}/api/generate",
data=json.dumps({
"model": model,
"prompt": SCENE_PROMPT,
"images": [image_b64],
"stream": False,
"options": {"temperature": 0.3, "num_predict": 512}
}).encode(),
headers={"Content-Type": "application/json"},
)
resp = urllib.request.urlopen(req, timeout=120)
data = json.loads(resp.read())
response_text = data.get("response", "")
# Parse JSON from response
json_match = re.search(r"\{[\s\S]*\}", response_text)
if json_match:
return json.loads(json_match.group())
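        # Model didn't return parseable JSON; keep the raw text as a degraded record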
return {"description": response_text[:500], "mood": "unknown",
"colors": [], "composition": "unknown", "camera": "unknown", "lighting": "unknown"}
except Exception as e:
print(f" Error describing {image_path.name}: {e}", file=sys.stderr)
return None

def generate_training_pairs(media_files: list[Path], model: str, ollama_url: str,
limit: int = 0, dry_run: bool = False) -> list[dict]:
"""Generate training pairs from media files."""
pairs = []
files = media_files[:limit] if limit > 0 else media_files
print(f"Processing {len(files)} files...", file=sys.stderr)
for i, media_path in enumerate(files):
print(f" [{i+1}/{len(files)}] {media_path.name}...", file=sys.stderr, end=" ")
if dry_run:
print("(dry run)", file=sys.stderr)
pairs.append({"source": str(media_path), "status": "dry-run"})
continue
is_video = media_path.suffix.lower() in VIDEO_EXTS
work_path = media_path
if is_video:
# Extract frame for video
frame_path = media_path.with_suffix(".frame.jpg")
if extract_video_frame(media_path, frame_path):
work_path = frame_path
else:
print("SKIP (frame extraction failed)", file=sys.stderr)
continue
description = describe_image(work_path, model, ollama_url)
if description:
pair = {
"source": str(media_path),
"media_type": "video" if is_video else "image",
"description": description,
"model": model,
"generated_at": datetime.now(timezone.utc).isoformat(),
}
pairs.append(pair)
print("OK", file=sys.stderr)
else:
print("FAIL", file=sys.stderr)
# Cleanup temp frame
if is_video and work_path != media_path:
try:
work_path.unlink()
except Exception:
pass
return pairs

def main():
parser = argparse.ArgumentParser(description="Auto-generate scene descriptions from media")
parser.add_argument("--input", "-i", required=True, help="Input directory with media files")
parser.add_argument("--output", "-o", default="training-data/scene-descriptions-auto.jsonl")
parser.add_argument("--model", "-m", default="gemma3:12b", help="Ollama model name")
parser.add_argument("--ollama-url", default="http://localhost:11434")
parser.add_argument("--limit", "-l", type=int, default=0, help="Max files to process (0=all)")
parser.add_argument("--dry-run", action="store_true", help="List files without generating")
args = parser.parse_args()
media_files = scan_media(args.input)
print(f"Found {len(media_files)} media files", file=sys.stderr)
if not media_files:
print("No media files found.", file=sys.stderr)
sys.exit(1)
pairs = generate_training_pairs(media_files, args.model, args.ollama_url,
args.limit, args.dry_run)
# Write output
output_path = Path(args.output)
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w") as f:
for pair in pairs:
f.write(json.dumps(pair, ensure_ascii=False) + "\n")
print(f"\nWrote {len(pairs)} pairs to {output_path}", file=sys.stderr)

if __name__ == "__main__":
main()
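
For reference, a successful record in the output JSONL would look roughly like the following (asset path and model output are hypothetical; the field layout follows generate_training_pairs above):

{"source": "assets/clips/harbor_dawn.mp4", "media_type": "video", "description": {"mood": "calm", "colors": ["slate blue", "amber", "grey"], "composition": "wide-shot", "camera": "static", "lighting": "natural", "description": "A wide shot of a quiet harbor at dawn, boats silhouetted against a pale sky."}, "model": "gemma3:12b", "generated_at": "2026-04-15T03:08:19+00:00"}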