From 64650c8598f846bc755a2e8d87d32878b409153d Mon Sep 17 00:00:00 2001
From: Alexander Whitestone
Date: Thu, 16 Apr 2026 05:42:35 +0000
Subject: [PATCH] feat: auto-generate scene descriptions from image/video assets (#689)

---
 scripts/auto-scene-descriptions.py | 160 +++++++++++++++++++++++++++++
 1 file changed, 160 insertions(+)
 create mode 100644 scripts/auto-scene-descriptions.py

diff --git a/scripts/auto-scene-descriptions.py b/scripts/auto-scene-descriptions.py
new file mode 100644
index 00000000..1d03509a
--- /dev/null
+++ b/scripts/auto-scene-descriptions.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+"""
+auto-scene-descriptions.py — Generate scene descriptions from image/video assets.
+
+Scans an assets directory, describes each asset with a local vision model
+(--vision) or a filename-based fallback, and outputs training pairs in
+timmy-config format.
+
+Usage:
+    python3 scripts/auto-scene-descriptions.py --scan ~/assets/
+    python3 scripts/auto-scene-descriptions.py --scan ~/assets/ --output training-data/scene-from-media.jsonl
+    python3 scripts/auto-scene-descriptions.py --scan ~/assets/ --vision
+    python3 scripts/auto-scene-descriptions.py --scan ~/assets/ --dry-run
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
+VIDEO_EXTS = {".mp4", ".webm", ".mov", ".avi", ".mkv"}
+SUPPORTED_EXTS = IMAGE_EXTS | VIDEO_EXTS
+
+
+def scan_assets(directory: str) -> list[Path]:
+    """Find all image/video assets in a directory."""
+    assets = []
+    for root, dirs, files in os.walk(directory):
+        dirs[:] = [d for d in dirs if d not in {".git", "node_modules", "__pycache__"}]
+        for f in sorted(files):
+            # Skip frames left behind by extract_video_frame so a re-run
+            # doesn't describe our own temporary output.
+            if f.endswith(".frame.jpg"):
+                continue
+            ext = Path(f).suffix.lower()
+            if ext in SUPPORTED_EXTS:
+                assets.append(Path(root) / f)
+    return assets
+
+
+def extract_video_frame(video_path: Path) -> Path | None:
+    """Extract a representative frame from a video using ffmpeg."""
+    frame_path = video_path.with_suffix(".frame.jpg")
+    try:
+        # -ss before -i seeks on the input, which is fast; for videos shorter
+        # than 5s no frame is written and we return None below.
+        subprocess.run(
+            ["ffmpeg", "-ss", "5", "-i", str(video_path), "-vframes", "1", "-y", str(frame_path)],
+            capture_output=True, timeout=30
+        )
+        if frame_path.exists():
+            return frame_path
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        pass
+    return None
+
+
+def describe_with_ollama(image_path: Path) -> str:
+    """Generate a scene description using a local Ollama vision model."""
+    # The ollama CLI passes file paths embedded in the prompt through to
+    # multimodal models such as llava.
+    prompt = (
+        "Describe this image as a visual scene for a film. Include mood, "
+        "colors, composition, and camera angle. Be specific and vivid in "
+        f"2-3 sentences. Image: {image_path}"
+    )
+    try:
+        result = subprocess.run(
+            ["ollama", "run", "llava", prompt],
+            capture_output=True, text=True, timeout=60
+        )
+        if result.returncode == 0:
+            return result.stdout.strip()
+    except (subprocess.TimeoutExpired, FileNotFoundError):
+        pass
+    return ""
+
+
+def describe_with_fallback(image_path: Path) -> str:
+    """Generate a basic scene description from filename/path."""
+    name = image_path.stem.replace("_", " ").replace("-", " ")
+    parent = image_path.parent.name.replace("_", " ").replace("-", " ")
+    return f"A scene depicting {name} in a {parent} setting. Visual composition inferred from asset location and naming conventions."
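+
+# Example (hypothetical path): the fallback turns
+# ~/assets/warm_interiors/golden-hour_kitchen.png into "A scene depicting
+# golden hour kitchen in a warm interiors setting. Visual composition
+# inferred from asset location and naming conventions."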
+
+
+def build_training_pair(asset_path: Path, description: str, asset_index: int) -> dict:
+    """Build a training pair in timmy-config scene description format."""
+    name = asset_path.stem
+    parent = asset_path.parent.name
+
+    # Infer mood from path/name keywords (first match wins, in dict order)
+    mood_keywords = {
+        "dark": "melancholic", "light": "hopeful", "warm": "nostalgic",
+        "cold": "isolated", "bright": "energetic", "sunset": "bittersweet",
+        "night": "mysterious", "morning": "refreshing", "rain": "contemplative",
+    }
+    mood = "neutral"
+    # Join with a space so a keyword can't match across the name/parent boundary
+    name_lower = f"{name} {parent}".lower()
+    for keyword, m in mood_keywords.items():
+        if keyword in name_lower:
+            mood = m
+            break
+
+    return {
+        "song": f"asset-{asset_index:04d}",
+        "beat": 1,
+        "lyric_line": f"[Visual asset: {asset_path.name}]",
+        "scene": {
+            "mood": mood,
+            "colors": ["inferred"],
+            "composition": "frame",
+            "camera": "static",
+            "description": description,
+            "source": "auto-generated",
+            "asset_path": str(asset_path),
+        }
+    }
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Auto-generate scene descriptions from assets")
+    parser.add_argument("--scan", required=True, help="Assets directory to scan")
+    parser.add_argument("--output", default=None, help="Output JSONL path")
+    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
+    parser.add_argument("--vision", action="store_true", help="Use Ollama vision model (slow)")
+    args = parser.parse_args()
+
+    assets = scan_assets(args.scan)
+    print(f"Found {len(assets)} assets in {args.scan}", file=sys.stderr)
+
+    if not assets:
+        print("No assets found.", file=sys.stderr)
+        sys.exit(0)
+
+    pairs = []
+    for i, asset in enumerate(assets):
+        if args.vision:
+            # For videos, extract a representative frame first
+            image_for_vision = asset
+            frame = None
+            if asset.suffix.lower() in VIDEO_EXTS:
+                frame = extract_video_frame(asset)
+                if frame:
+                    image_for_vision = frame
+
+            description = describe_with_ollama(image_for_vision)
+            if frame:
+                # Remove the temporary frame so it doesn't linger in the
+                # assets directory
+                frame.unlink(missing_ok=True)
+            if not description:
+                description = describe_with_fallback(asset)
+        else:
+            description = describe_with_fallback(asset)
+
+        pair = build_training_pair(asset, description, i)
+        pairs.append(pair)
+
+        if args.dry_run and i < 5:
+            print(f"  {asset.name}: {description[:80]}")
+
+    if args.dry_run:
+        print(f"\nWould generate {len(pairs)} training pairs")
+        return
+
+    # Write output
+    output_path = args.output or os.path.expanduser("~/.hermes/training-data/scene-from-media.jsonl")
+    # os.path.dirname is empty for a bare filename; makedirs("") would raise
+    out_dir = os.path.dirname(output_path)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+    with open(output_path, "w") as f:
+        for p in pairs:
+            f.write(json.dumps(p, ensure_ascii=False) + "\n")
+
+    print(f"Generated {len(pairs)} scene descriptions → {output_path}", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
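
For reference, one record as it lands in the JSONL output, pretty-printed here
(the real file holds one object per line). The asset path, index, and inferred
mood assume a hypothetical file /home/user/assets/night_city/rain-alley.mp4
processed without --vision; with --vision the "description" field would hold
the llava output instead:

{
  "song": "asset-0000",
  "beat": 1,
  "lyric_line": "[Visual asset: rain-alley.mp4]",
  "scene": {
    "mood": "mysterious",
    "colors": ["inferred"],
    "composition": "frame",
    "camera": "static",
    "description": "A scene depicting rain alley in a night city setting. Visual composition inferred from asset location and naming conventions.",
    "source": "auto-generated",
    "asset_path": "/home/user/assets/night_city/rain-alley.mp4"
  }
}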