Files
timmy-config/scripts/generate_scenes_from_media.py
Alexander Whitestone a2e61f6def
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 21s
Smoke Test / smoke (pull_request) Failing after 15s
Validate Config / YAML Lint (pull_request) Failing after 18s
Validate Config / JSON Validate (pull_request) Successful in 21s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m3s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 1m11s
Validate Config / Cron Syntax Check (pull_request) Successful in 15s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 14s
Validate Config / Playbook Schema Validation (pull_request) Successful in 27s
PR Checklist / pr-checklist (pull_request) Failing after 12m35s
Architecture Lint / Lint Repository (pull_request) Failing after 22s
feat: auto-generate scene descriptions from image/video assets (#689)
scripts/generate_scenes_from_media.py:
  Scans assets dir for images/videos (jpg/png/mp4/mov/etc)
  Calls vision model (llava/gpt-4/claude) to describe scenes
  Outputs training pairs: image_path -> scene description
  Includes provenance: model, timestamp, source_session_id
  --assets dir, --output file, --model, --max, --dry-run
  JSON parsing with fallback for plain text responses
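
  A sample output record (illustrative values only; field names follow
  generate_training_pair):
    {"source_file": "~/assets/sunset.jpg", "source_hash": "a1b2c3d4e5f60718",
     "source_type": "media_asset", "media_type": "image", "model": "llava",
     "timestamp": "2026-04-21T11:22:28+00:00", "source_session_id": "media-gen-1776778948",
     "prompt": "Describe the visual scene in sunset.jpg",
     "response": "A low sun sinks behind silhouetted hills over still water.",
     "scene": {"mood": "calm", "colors": ["orange", "violet"], "composition": "rule of thirds",
      "camera": "wide shot", "description": "A low sun sinks behind silhouetted hills over still water."}}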

tests/test_generate_scenes_from_media.py: 12 tests
  find_media_files: images, videos, max limit, missing dir
  file_hash: consistent, different files
  generate_prompt: image vs video
  parse_description: JSON, plain text
  generate_training_pair: structure, video type
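
  A minimal sketch of two of these tests (pytest style; the import path and
  exact test names are assumptions, the real suite may differ):

    from scripts.generate_scenes_from_media import find_media_files, parse_description

    def test_find_media_files_respects_max(tmp_path):
        # three of five fake images should be returned when max_files is 3
        for i in range(5):
            (tmp_path / f"img{i}.jpg").write_bytes(b"fake")
        assert len(find_media_files(str(tmp_path), max_files=3)) == 3

    def test_parse_description_plain_text_fallback():
        # no JSON in the response, so the fallback regexes kick in
        desc = parse_description("A misty harbor at dawn, mood: calm")
        assert desc["mood"] == "calm"
        assert desc["description"].startswith("A misty harbor")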

Usage:
  python3 scripts/generate_scenes_from_media.py --assets ~/assets/
  python3 scripts/generate_scenes_from_media.py --assets ~/assets/ --model gpt-4
  python3 scripts/generate_scenes_from_media.py --assets ~/assets/ --dry-run
2026-04-21 07:22:28 -04:00

287 lines
11 KiB
Python

#!/usr/bin/env python3
"""
generate_scenes_from_media.py — Auto-generate scene descriptions from image/video assets.

Scans a directory for images/videos, generates scene descriptions using
a vision model, and outputs them as training pairs in JSONL format.

Usage:
    python3 scripts/generate_scenes_from_media.py --assets ~/assets/ --output training-data/media-scenes.jsonl
    python3 scripts/generate_scenes_from_media.py --assets ~/assets/ --model llava --dry-run
    python3 scripts/generate_scenes_from_media.py --assets ~/assets/ --max 10 --json
"""
import argparse
import base64
import hashlib
import json
import os
import re
import subprocess
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Optional

# Supported media formats
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp", ".tiff"}
VIDEO_EXTENSIONS = {".mp4", ".mov", ".avi", ".mkv", ".webm", ".flv"}
ALL_EXTENSIONS = IMAGE_EXTENSIONS | VIDEO_EXTENSIONS


def find_media_files(assets_dir: str, max_files: int = 0) -> List[Path]:
    """Scan directory for media files."""
    assets_path = Path(assets_dir)
    if not assets_path.exists():
        print(f"ERROR: Directory not found: {assets_dir}", file=sys.stderr)
        return []
    media_files = []
    for ext in sorted(ALL_EXTENSIONS):
        media_files.extend(assets_path.rglob(f"*{ext}"))
        media_files.extend(assets_path.rglob(f"*{ext.upper()}"))
    # Deduplicate
    media_files = sorted(set(media_files))
    if max_files > 0:
        media_files = media_files[:max_files]
    return media_files


def file_hash(filepath: Path) -> str:
    """Short hash of the file path, used as a stable identifier for deduplication."""
    return hashlib.sha256(str(filepath).encode()).hexdigest()[:16]


def generate_description_prompt(filepath: Path) -> str:
    """Generate the prompt for the vision model."""
    if filepath.suffix.lower() in IMAGE_EXTENSIONS:
        return (
            "Describe this image as a visual scene for a training dataset. "
            "Include: mood, dominant colors (2-3), composition type, camera angle, "
            "and a vivid 1-2 sentence description. Format as JSON with keys: "
            "mood, colors, composition, camera, description."
        )
    else:
        return (
            "Describe this video frame as a visual scene for a training dataset. "
            "Include: mood, dominant colors (2-3), composition type, camera movement, "
            "and a vivid 1-2 sentence description. Format as JSON with keys: "
            "mood, colors, composition, camera, description."
        )


def call_vision_model(filepath: Path, model: str = "llava") -> Optional[dict]:
    """
    Call a vision model to generate a scene description.
    Supports:
    - llava (local via ollama)
    - gpt-4-vision (OpenAI API)
    - claude-vision (Anthropic API)
    """
    prompt = generate_description_prompt(filepath)
    try:
        if model.startswith("llava") or model == "ollama":
            # Local Ollama with LLaVA; the API expects base64-encoded image data,
            # not file paths
            with open(filepath, "rb") as f:
                image_data = base64.b64encode(f.read()).decode()
            result = subprocess.run(
                ["curl", "-s", "http://localhost:11434/api/generate", "-d",
                 json.dumps({
                     "model": "llava",
                     "prompt": prompt,
                     "images": [image_data],
                     "stream": False,
                 })],
                capture_output=True, text=True, timeout=60
            )
            if result.returncode == 0:
                response = json.loads(result.stdout)
                return parse_description(response.get("response", ""))
        elif model.startswith("gpt-4"):
            # OpenAI GPT-4 Vision (requires API key)
            with open(filepath, "rb") as f:
                image_data = base64.b64encode(f.read()).decode()
            api_key = os.environ.get("OPENAI_API_KEY")
            if not api_key:
                print("ERROR: OPENAI_API_KEY not set", file=sys.stderr)
                return None
            result = subprocess.run(
                ["curl", "-s", "https://api.openai.com/v1/chat/completions",
                 "-H", f"Authorization: Bearer {api_key}",
                 "-H", "Content-Type: application/json",
                 "-d", json.dumps({
                     "model": "gpt-4-vision-preview",
                     "messages": [{
                         "role": "user",
                         "content": [
                             {"type": "text", "text": prompt},
                             {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
                         ]
                     }],
                     "max_tokens": 500
                 })],
                capture_output=True, text=True, timeout=60
            )
            if result.returncode == 0:
                response = json.loads(result.stdout)
                content = response["choices"][0]["message"]["content"]
                return parse_description(content)
        elif model.startswith("claude"):
            # Anthropic Claude Vision (requires API key)
            with open(filepath, "rb") as f:
                image_data = base64.b64encode(f.read()).decode()
            api_key = os.environ.get("ANTHROPIC_API_KEY")
            if not api_key:
                print("ERROR: ANTHROPIC_API_KEY not set", file=sys.stderr)
                return None
            media_type = "image/jpeg" if filepath.suffix.lower() in {".jpg", ".jpeg"} else "image/png"
            result = subprocess.run(
                ["curl", "-s", "https://api.anthropic.com/v1/messages",
                 "-H", f"x-api-key: {api_key}",
                 "-H", "anthropic-version: 2023-06-01",
                 "-H", "Content-Type: application/json",
                 "-d", json.dumps({
                     "model": "claude-3-opus-20240229",
                     "max_tokens": 500,
                     "messages": [{
                         "role": "user",
                         "content": [
                             {"type": "image", "source": {"type": "base64", "media_type": media_type, "data": image_data}},
                             {"type": "text", "text": prompt}
                         ]
                     }]
                 })],
                capture_output=True, text=True, timeout=60
            )
            if result.returncode == 0:
                response = json.loads(result.stdout)
                content = response["content"][0]["text"]
                return parse_description(content)
    except (subprocess.TimeoutExpired, json.JSONDecodeError, KeyError, OSError) as e:
        print(f"ERROR calling vision model: {e}", file=sys.stderr)
    return None


def parse_description(text: str) -> dict:
    """Parse model response into structured description."""
    # Try to extract JSON from the response
    json_match = re.search(r'\{[^}]+\}', text, re.DOTALL)
    if json_match:
        try:
            return json.loads(json_match.group())
        except json.JSONDecodeError:
            pass
    # Fallback: parse manually
    desc = {
        "mood": "unknown",
        "colors": [],
        "composition": "unknown",
        "camera": "unknown",
        "description": text[:500],
    }
    # Try to extract mood
    mood_match = re.search(r'mood["\s:]+(\w+)', text, re.IGNORECASE)
    if mood_match:
        desc["mood"] = mood_match.group(1).lower()
    # Try to extract colors
    color_match = re.search(r'colors?["\s:]+\[([^\]]+)\]', text, re.IGNORECASE)
    if color_match:
        desc["colors"] = [c.strip().strip('"').strip("'") for c in color_match.group(1).split(",")]
    return desc


def generate_training_pair(filepath: Path, description: dict, model: str) -> dict:
    """Generate a training pair from media file and description."""
    return {
        "source_file": str(filepath),
        "source_hash": file_hash(filepath),
        "source_type": "media_asset",
        "media_type": "image" if filepath.suffix.lower() in IMAGE_EXTENSIONS else "video",
        "model": model,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "source_session_id": f"media-gen-{int(time.time())}",
        "prompt": f"Describe the visual scene in {filepath.name}",
        "response": description.get("description", ""),
        "scene": {
            "mood": description.get("mood", "unknown"),
            "colors": description.get("colors", []),
            "composition": description.get("composition", "unknown"),
            "camera": description.get("camera", "unknown"),
            "description": description.get("description", ""),
        },
    }


def main():
    parser = argparse.ArgumentParser(description="Generate scene descriptions from media")
    parser.add_argument("--assets", required=True, help="Assets directory to scan")
    parser.add_argument("--output", help="Output JSONL file path")
    parser.add_argument("--model", default="llava", help="Vision model (llava/gpt-4/claude)")
    parser.add_argument("--max", type=int, default=0, help="Max files to process (0=all)")
    parser.add_argument("--dry-run", action="store_true", help="Don't call vision model")
    parser.add_argument("--json", action="store_true", help="JSON output")
    args = parser.parse_args()

    media_files = find_media_files(args.assets, args.max)
    if not media_files:
        print("No media files found.", file=sys.stderr)
        sys.exit(1)
    print(f"Found {len(media_files)} media files in {args.assets}")

    if args.dry_run:
        print("\nDry run — files to process:")
        for f in media_files[:20]:
            print(f"  {f.relative_to(args.assets)}")
        if len(media_files) > 20:
            print(f"  ... and {len(media_files) - 20} more")
        sys.exit(0)

    pairs = []
    errors = 0
    for i, filepath in enumerate(media_files, 1):
        print(f"[{i}/{len(media_files)}] Processing {filepath.name}...", end=" ", flush=True)
        description = call_vision_model(filepath, args.model)
        if description:
            pair = generate_training_pair(filepath, description, args.model)
            pairs.append(pair)
            print(f"OK (mood: {pair['scene']['mood']})")
        else:
            errors += 1
            print("ERROR")

    # Output
    output_path = args.output or "training-data/media-scene-descriptions.jsonl"
    if args.json:
        print(json.dumps({"pairs": pairs, "total": len(pairs), "errors": errors}, indent=2))
    else:
        # Make sure the output directory exists before writing
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w') as f:
            for pair in pairs:
                f.write(json.dumps(pair, ensure_ascii=False) + '\n')
        print(f"\nGenerated {len(pairs)} scene descriptions ({errors} errors)")
        print(f"Output: {output_path}")


if __name__ == "__main__":
    main()