#!/usr/bin/env python3
"""
auto-scene-descriptions.py — Generate scene descriptions from image/video assets.

Scans an assets directory, uses a vision model to describe each asset, and
outputs training pairs in the timmy-config format.

Usage:
    python3 scripts/auto-scene-descriptions.py --scan ~/assets/
    python3 scripts/auto-scene-descriptions.py --scan ~/assets/ --output training-data/scene-from-media.jsonl
    python3 scripts/auto-scene-descriptions.py --scan ~/assets/ --dry-run
"""
import argparse
import json
import os
import subprocess
import sys
from pathlib import Path

IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
VIDEO_EXTS = {".mp4", ".webm", ".mov", ".avi", ".mkv"}
SUPPORTED_EXTS = IMAGE_EXTS | VIDEO_EXTS


def scan_assets(directory: str) -> list[Path]:
    """Find all image/video assets in a directory."""
    assets = []
    for root, dirs, files in os.walk(directory):
        # Prune in place so os.walk never descends into these directories.
        dirs[:] = [d for d in dirs if d not in {".git", "node_modules", "__pycache__"}]
        for f in sorted(files):
            # Skip frames written by extract_video_frame() so reruns do not
            # pick them up as new assets.
            if f.endswith(".frame.jpg"):
                continue
            ext = Path(f).suffix.lower()
            if ext in SUPPORTED_EXTS:
                assets.append(Path(root) / f)
    return assets


def extract_video_frame(video_path: Path) -> Path | None:
    """Extract a representative frame from a video using ffmpeg."""
    # e.g. clip.mp4 -> clip.frame.jpg, written next to the source video.
    frame_path = video_path.with_suffix(".frame.jpg")
    try:
        # -ss before -i seeks on the input (fast); grab a single frame at
        # the 5-second mark as a representative still.
        subprocess.run(
            ["ffmpeg", "-ss", "5", "-i", str(video_path), "-vframes", "1", "-y", str(frame_path)],
            capture_output=True, timeout=30,
        )
        if frame_path.exists():
            return frame_path
    except (subprocess.TimeoutExpired, FileNotFoundError):
        # ffmpeg missing or hung; caller falls back to the filename heuristic.
        pass
    return None
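# Note: extracted frames are left on disk next to their videos (useful as a
# cache); scan_assets() deliberately skips them on later runs.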


def describe_with_ollama(image_path: Path) -> str:
    """Generate a scene description using a local Ollama vision model."""
    try:
        # The ollama CLI attaches image files whose paths appear in the prompt
        # when the model (here llava) accepts vision input.
        result = subprocess.run(
            ["ollama", "run", "llava",
             "Describe this image as a visual scene for a film. Include mood, "
             "colors, composition, and camera angle. Be specific and vivid in "
             f"2-3 sentences. Image: {image_path}"],
            capture_output=True, text=True, timeout=60,
        )
        if result.returncode == 0:
            return result.stdout.strip()
    except (subprocess.TimeoutExpired, FileNotFoundError):
        # Ollama missing or too slow; caller falls back to the filename heuristic.
        pass
    return ""


def describe_with_fallback(image_path: Path) -> str:
    """Generate a basic scene description from the filename/path."""
    name = image_path.stem.replace("_", " ").replace("-", " ")
    parent = image_path.parent.name.replace("_", " ").replace("-", " ")
    return (
        f"A scene depicting {name} in a {parent} setting. "
        "Visual composition inferred from asset location and naming conventions."
    )


def build_training_pair(asset_path: Path, description: str, asset_index: int) -> dict:
    """Build a training pair in the timmy-config scene description format."""
    name = asset_path.stem
    parent = asset_path.parent.name
    # Infer mood from keywords in the file name or parent directory.
    mood_keywords = {
        "dark": "melancholic", "light": "hopeful", "warm": "nostalgic",
        "cold": "isolated", "bright": "energetic", "sunset": "bittersweet",
        "night": "mysterious", "morning": "refreshing", "rain": "contemplative",
    }
    mood = "neutral"
    # Join with a space so a keyword cannot match across the name/parent boundary.
    name_lower = f"{name} {parent}".lower()
    for keyword, m in mood_keywords.items():
        if keyword in name_lower:
            mood = m
            break
    return {
        "song": f"asset-{asset_index:04d}",
        "beat": 1,
        "lyric_line": f"[Visual asset: {asset_path.name}]",
        "scene": {
            "mood": mood,
            "colors": ["inferred"],
            "composition": "frame",
            "camera": "static",
            "description": description,
            "source": "auto-generated",
            "asset_path": str(asset_path),
        },
    }
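# A record for, say, assets/pier/sunset_pier.jpg looks roughly like this
# (illustrative example; the description depends on the backend used):
# {"song": "asset-0000", "beat": 1, "lyric_line": "[Visual asset: sunset_pier.jpg]",
#  "scene": {"mood": "bittersweet", "colors": ["inferred"], "composition": "frame",
#            "camera": "static", "description": "...", "source": "auto-generated",
#            "asset_path": "assets/pier/sunset_pier.jpg"}}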


def main():
    parser = argparse.ArgumentParser(description="Auto-generate scene descriptions from assets")
    parser.add_argument("--scan", required=True, help="Assets directory to scan")
    parser.add_argument("--output", default=None, help="Output JSONL path")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    parser.add_argument("--vision", action="store_true", help="Use Ollama vision model (slow)")
    args = parser.parse_args()

    assets = scan_assets(args.scan)
    print(f"Found {len(assets)} assets in {args.scan}", file=sys.stderr)
    if not assets:
        print("No assets found.", file=sys.stderr)
        sys.exit(0)

    pairs = []
    for i, asset in enumerate(assets):
        if args.vision:
            # For videos, extract a representative frame first.
            image_for_vision = asset
            if asset.suffix.lower() in VIDEO_EXTS:
                frame = extract_video_frame(asset)
                if frame:
                    image_for_vision = frame
            description = describe_with_ollama(image_for_vision)
            if not description:
                description = describe_with_fallback(asset)
        else:
            description = describe_with_fallback(asset)
        pair = build_training_pair(asset, description, i)
        pairs.append(pair)
        if args.dry_run and i < 5:
            # Preview only the first five descriptions to keep output short.
            print(f"  {asset.name}: {description[:80]}")

    if args.dry_run:
        print(f"\nWould generate {len(pairs)} training pairs")
        return

    # Write output as JSONL, one training pair per line.
    output_path = args.output or os.path.expanduser("~/.hermes/training-data/scene-from-media.jsonl")
    # os.path.dirname() is empty for bare filenames; only create directories when present.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_path, "w") as f:
        for p in pairs:
            f.write(json.dumps(p, ensure_ascii=False) + "\n")
    print(f"Generated {len(pairs)} scene descriptions → {output_path}", file=sys.stderr)


if __name__ == "__main__":
    main()
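
# Quick sanity check of the output (assuming the default --output location):
#   head -n 1 ~/.hermes/training-data/scene-from-media.jsonl | python3 -m json.tool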