Some checks failed
Architecture Lint / Linter Tests (pull_request) Has been cancelled
Architecture Lint / Lint Repository (pull_request) Has been cancelled
PR Checklist / pr-checklist (pull_request) Has been cancelled
Smoke Test / smoke (pull_request) Has been cancelled
Validate Config / YAML Lint (pull_request) Has been cancelled
Validate Config / JSON Validate (pull_request) Has been cancelled
Validate Config / Python Syntax & Import Check (pull_request) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
Validate Config / Shell Script Lint (pull_request) Has been cancelled
Validate Config / Cron Syntax Check (pull_request) Has been cancelled
Validate Config / Deploy Script Dry Run (pull_request) Has been cancelled
Validate Config / Playbook Schema Validation (pull_request) Has been cancelled
161 lines
5.7 KiB
Python
161 lines
5.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
auto-scene-descriptions.py — Generate scene descriptions from image/video assets.
|
|
|
|
Scans an assets directory, describes each asset (via a vision model when --vision is set),
|
|
outputs training pairs in timmy-config format.
|
|
|
|
Usage:
|
|
python3 scripts/auto-scene-descriptions.py --scan ~/assets/
|
|
python3 scripts/auto-scene-descriptions.py --scan ~/assets/ --output training-data/scene-from-media.jsonl
|
|
python3 scripts/auto-scene-descriptions.py --scan ~/assets/ --dry-run
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Lowercase, dot-prefixed extensions recognised as still images.
IMAGE_EXTS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".webp",
    ".bmp",
}

# Lowercase, dot-prefixed extensions recognised as video clips.
VIDEO_EXTS = {
    ".mp4",
    ".webm",
    ".mov",
    ".avi",
    ".mkv",
}

# Everything the scanner accepts: images and videos together.
SUPPORTED_EXTS = IMAGE_EXTS.union(VIDEO_EXTS)
|
|
|
|
|
|
def scan_assets(directory: str) -> list[Path]:
    """Find all image/video assets under a directory, in a stable order.

    Walks ``directory`` recursively, never descending into ``.git``,
    ``node_modules`` or ``__pycache__``, and collects every file whose
    lowercased extension is in ``SUPPORTED_EXTS``.

    Args:
        directory: Root directory to scan.

    Returns:
        Asset paths, each formed by joining the walked directory with the
        file name. Sub-directories are visited in sorted order and files are
        sorted within each directory, so the result (and any index derived
        from a file's position in it) does not depend on the order in which
        the filesystem happens to list entries.
    """
    skipped_dirs = {".git", "node_modules", "__pycache__"}
    assets: list[Path] = []
    for root, dirs, files in os.walk(directory):
        # Assigning into dirs[:] steers the top-down walk: pruned trees are
        # never entered and the remaining ones are visited alphabetically.
        dirs[:] = sorted(d for d in dirs if d not in skipped_dirs)
        root_path = Path(root)
        assets.extend(
            root_path / name
            for name in sorted(files)
            if Path(name).suffix.lower() in SUPPORTED_EXTS
        )
    return assets
|
|
|
|
|
|
def extract_video_frame(video_path: Path) -> Path | None:
|
|
"""Extract a representative frame from a video using ffmpeg."""
|
|
frame_path = video_path.with_suffix(".frame.jpg")
|
|
try:
|
|
subprocess.run(
|
|
["ffmpeg", "-i", str(video_path), "-vframes", "1", "-ss", "5", "-y", str(frame_path)],
|
|
capture_output=True, timeout=30
|
|
)
|
|
if frame_path.exists():
|
|
return frame_path
|
|
except (subprocess.TimeoutExpired, FileNotFoundError):
|
|
pass
|
|
return None
|
|
|
|
|
|
def describe_with_ollama(image_path: Path) -> str:
    """Generate a scene description using the local Ollama ``llava`` model.

    Runs ``ollama run llava`` with a fixed prompt that ends in the image's
    absolute path.

    Args:
        image_path: Path to the image file to describe.

    Returns:
        The model's output with surrounding whitespace stripped, or ``""``
        when the file does not exist, ``ollama`` is not installed, the call
        takes longer than 60 seconds, or the command exits non-zero. Callers
        can treat the empty string as "no description available".
    """
    if not image_path.is_file():
        # Without a real file the model would only see text and invent a scene.
        return ""

    # NOTE(review): the ollama CLI appears to attach an image only when the
    # prompt contains something it recognises as a file path (starting with
    # "/" or "./"); a bare relative path such as "assets/a.jpg" would be read
    # as plain text. Resolving to an absolute path avoids that ambiguity —
    # confirm against the installed ollama version.
    prompt = (
        "Describe this image as a visual scene for a film. "
        "Include mood, colors, composition, and camera angle. "
        "Be specific and vivid in 2-3 sentences. "
        f"Image: {image_path.resolve()}"
    )
    try:
        result = subprocess.run(
            ["ollama", "run", "llava", prompt],
            capture_output=True,
            text=True,
            # Decode explicitly so the result does not depend on the locale.
            encoding="utf-8",
            errors="replace",
            timeout=60,
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        return ""

    if result.returncode != 0:
        return ""
    return result.stdout.strip()
|
|
|
|
|
|
def describe_with_fallback(image_path: Path) -> str:
    """Compose a placeholder description from the asset's file and folder names.

    Underscores and hyphens in the file stem and in the parent directory's
    name are replaced by spaces, and both are slotted into a fixed sentence.
    The file itself is never opened.
    """
    to_spaces = str.maketrans("_-", "  ")
    subject = image_path.stem.translate(to_spaces)
    setting = image_path.parent.name.translate(to_spaces)
    return (
        f"A scene depicting {subject} in a {setting} setting. "
        "Visual composition inferred from asset location and naming conventions."
    )
|
|
|
|
|
|
def build_training_pair(asset_path: Path, description: str, asset_index: int) -> dict:
    """Build a training pair in timmy-config scene description format.

    The mood is guessed from keywords in the asset's file stem and parent
    directory name (case-insensitive substring match); every other scene
    field except the description and asset path is a fixed placeholder.

    Args:
        asset_path: Path of the asset the pair describes.
        description: Scene description text to store.
        asset_index: Position of the asset, used for the zero-padded
            ``asset-NNNN`` identifier stored under ``"song"``.

    Returns:
        A dict with ``song``, ``beat``, ``lyric_line`` and a nested ``scene``
        dict holding the mood, placeholders, description and asset path.
    """
    # Checked in insertion order; the first keyword found decides the mood.
    mood_keywords = {
        "dark": "melancholic",
        "light": "hopeful",
        "warm": "nostalgic",
        "cold": "isolated",
        "bright": "energetic",
        "sunset": "bittersweet",
        "night": "mysterious",
        "morning": "refreshing",
        "rain": "contemplative",
    }

    # Keep a separator between the two names: gluing them together would let
    # a keyword match across the boundary (stem "war" + folder "mist" would
    # read as "warm").
    searchable = f"{asset_path.stem} {asset_path.parent.name}".lower()
    mood = next(
        (label for keyword, label in mood_keywords.items() if keyword in searchable),
        "neutral",
    )

    return {
        "song": f"asset-{asset_index:04d}",
        "beat": 1,
        "lyric_line": f"[Visual asset: {asset_path.name}]",
        "scene": {
            "mood": mood,
            "colors": ["inferred"],
            "composition": "frame",
            "camera": "static",
            "description": description,
            "source": "auto-generated",
            "asset_path": str(asset_path),
        },
    }
|
|
|
|
|
|
def _describe_asset(asset: Path, use_vision: bool) -> str:
    """Return a description for one asset, using the vision model if enabled."""
    if not use_vision:
        return describe_with_fallback(asset)

    frame = None
    image_for_vision = asset
    if asset.suffix.lower() in VIDEO_EXTS:
        frame = extract_video_frame(asset)
        if frame is None:
            # No still frame to show the model, and the video file itself is
            # not an image, so use the name-based description instead.
            return describe_with_fallback(asset)
        image_for_vision = frame

    description = describe_with_ollama(image_for_vision)

    if frame is not None:
        # The frame was written into the assets directory; remove it so the
        # next scan does not pick it up as an asset of its own.
        try:
            frame.unlink(missing_ok=True)
        except OSError as exc:
            print(f"Warning: could not remove temporary frame {frame}: {exc}", file=sys.stderr)

    return description or describe_with_fallback(asset)


def main():
    """Command-line entry point: scan assets, describe them, write JSONL.

    Parses ``--scan``, ``--output``, ``--dry-run`` and ``--vision``; scans
    the directory; builds one training pair per asset; then either prints a
    preview (dry run: the first five assets plus a total) or writes one JSON
    object per line to the output file. Progress messages go to stderr.
    Exits with status 0 when no assets are found.
    """
    parser = argparse.ArgumentParser(description="Auto-generate scene descriptions from assets")
    parser.add_argument("--scan", required=True, help="Assets directory to scan")
    parser.add_argument("--output", default=None, help="Output JSONL path")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    parser.add_argument("--vision", action="store_true", help="Use Ollama vision model (slow)")
    args = parser.parse_args()

    assets = scan_assets(args.scan)
    print(f"Found {len(assets)} assets in {args.scan}", file=sys.stderr)

    if not assets:
        print("No assets found.", file=sys.stderr)
        sys.exit(0)

    pairs = []
    for i, asset in enumerate(assets):
        description = _describe_asset(asset, args.vision)
        pairs.append(build_training_pair(asset, description, i))

        if args.dry_run and i < 5:
            print(f" {asset.name}: {description[:80]}")

    if args.dry_run:
        print(f"\nWould generate {len(pairs)} training pairs")
        return

    # Write output
    output_path = Path(
        os.path.expanduser(args.output or "~/.hermes/training-data/scene-from-media.jsonl")
    )
    # Path.parent is "." for a bare file name, so this also works for
    # "--output out.jsonl", where os.path.dirname() would return "".
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII text, so the file encoding must
    # not be left to the platform default.
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(p, ensure_ascii=False) + "\n" for p in pairs)

    print(f"Generated {len(pairs)} scene descriptions → {output_path}", file=sys.stderr)
|
|
|
|
|
|
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|