Files
timmy-config/training/scripts/generate_scene_descriptions.py
Alexander Whitestone 9f2a76fc3e
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 31s
PR Checklist / pr-checklist (pull_request) Failing after 13m48s
Smoke Test / smoke (pull_request) Failing after 13m22s
Validate Config / YAML Lint (pull_request) Failing after 5s
Validate Config / JSON Validate (pull_request) Successful in 4s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 21s
Validate Config / Shell Script Lint (pull_request) Failing after 22s
Validate Config / Cron Syntax Check (pull_request) Successful in 8s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 8s
Validate Training Data / validate (pull_request) Successful in 10s
Validate Config / Playbook Schema Validation (pull_request) Successful in 15s
Architecture Lint / Lint Repository (pull_request) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
feat: auto-generate scene descriptions from image/video (#689)
2026-04-17 01:58:05 -04:00

167 lines
5.6 KiB
Python
Executable File

#!/usr/bin/env python3
"""
generate_scene_descriptions.py — Auto-generate scene descriptions from image/video.
Scans an assets directory for images, generates rich scene descriptions
using a local vision model, and outputs as training pairs.
Usage:
python3 training/scripts/generate_scene_descriptions.py --input assets/
python3 training/scripts/generate_scene_descriptions.py --input assets/ --model gemma3:27b
python3 training/scripts/generate_scene_descriptions.py --input assets/ --dry-run
"""
import json
import os
import sys
import base64
import glob
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
# Base URL of an OpenAI-compatible chat-completions API.
# Defaults to a local Ollama server; override via OPENAI_BASE_URL.
ENDPOINT = os.environ.get("OPENAI_BASE_URL", "http://localhost:11434/v1")
# Bearer token sent in the Authorization header; Ollama accepts any non-empty value.
API_KEY = os.environ.get("OPENAI_API_KEY", "ollama")
# Instruction sent with each image; asks the model for a bare JSON object
# (no markdown fences) so the response can be json.loads()'d directly.
DESCRIPTION_PROMPT = """Describe this image as a rich visual scene for a training dataset.
Output a JSON object with these fields:
- mood: primary emotional mood (1-2 words)
- colors: array of 3-5 dominant colors
- composition: framing type (wide shot, close-up, etc.)
- camera: camera movement suggestion
- description: rich 2-3 sentence scene description
Focus on: lighting, atmosphere, emotion, visual composition. Be specific, not generic.
Return ONLY valid JSON, no markdown fences."""
def encode_image(image_path: str) -> str:
    """Return the file at *image_path* as a base64-encoded ASCII string."""
    raw_bytes = Path(image_path).read_bytes()
    return base64.b64encode(raw_bytes).decode()
def generate_description(image_path: str, model: str = "gemma3:27b") -> dict:
    """Describe one image via the vision model behind ENDPOINT.

    Sends the image (base64 data URL) plus DESCRIPTION_PROMPT to the
    chat-completions API and parses the JSON the model returns.  On any
    failure (network, HTTP, bad JSON) a placeholder dict containing an
    "error" key is returned instead of raising, so batch runs continue.
    """
    payload = json.dumps({
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": DESCRIPTION_PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{encode_image(image_path)}"
                        },
                    },
                ],
            }
        ],
        "max_tokens": 512,
        "temperature": 0.3,
    }).encode()
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}",
    }
    try:
        request = urllib.request.Request(
            f"{ENDPOINT}/chat/completions", payload, headers, method="POST"
        )
        with urllib.request.urlopen(request, timeout=60) as response:
            reply = json.loads(response.read())
        text = reply["choices"][0]["message"]["content"].strip()
        # Models sometimes wrap output in ``` fences despite the prompt; strip them.
        if text.startswith("```"):
            text = text.split("\n", 1)[1].rsplit("```", 1)[0].strip()
        return json.loads(text)
    except Exception as e:
        # Best-effort: never abort the batch; callers check for the "error" key.
        return {"error": str(e), "description": "", "mood": "unknown", "colors": [], "composition": "", "camera": ""}
def scan_assets(directory: str) -> list:
    """Recursively find image/video files under *directory*.

    Fix over the glob-based version: extension matching is now
    case-insensitive, so assets like ``PHOTO.JPG`` or ``CLIP.MOV`` (common
    from cameras) are no longer silently skipped on case-sensitive
    filesystems, and directories whose names merely end in an extension
    are excluded via ``is_file()``.  Returns sorted path strings.
    NOTE(review): unlike glob, rglob also visits dot-prefixed (hidden)
    entries — assumed harmless for an assets tree; confirm if relevant.
    """
    extensions = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".mp4", ".mov", ".avi"}
    matches = {
        str(path)
        for path in Path(directory).rglob("*")
        if path.is_file() and path.suffix.lower() in extensions
    }
    return sorted(matches)
def generate_training_pairs(assets_dir: str, output_path: str, model: str = "gemma3:27b",
                            dry_run: bool = False, limit: int = None):
    """Generate terse->rich training pairs for every asset under *assets_dir*.

    Args:
        assets_dir: directory scanned recursively for image/video files.
        output_path: JSONL file written with one pair per line (not written
            on dry run or when nothing was generated).
        model: vision model name passed to the chat-completions endpoint.
        dry_run: if True, list the files but make no model calls and write nothing.
        limit: optional cap on how many files are processed.

    Returns:
        The list of generated pair dicts (empty on dry run).
    """
    files = scan_assets(assets_dir)
    if limit:
        files = files[:limit]
    print(f"Assets directory: {assets_dir}")
    print(f"Found: {len(files)} image/video files")
    print(f"Model: {model}")
    print(f"Mode: {'DRY RUN' if dry_run else 'GENERATE'}")
    print()
    pairs = []
    errors = 0
    for i, filepath in enumerate(files):
        filename = os.path.basename(filepath)
        # BUG FIX: the progress line and the "terse" field previously printed the
        # literal placeholder "(unknown)" — `filename` was computed but never used,
        # so every training pair lost its image identifier.
        print(f" [{i+1}/{len(files)}] {filename}...", end=" ", flush=True)
        if dry_run:
            print("SKIP")
            continue
        scene = generate_description(filepath, model)
        if "error" in scene:
            print(f"ERROR: {scene['error'][:50]}")
            errors += 1
            continue
        pairs.append({
            "terse": f"[image: {filename}]",
            "rich": scene.get("description", ""),
            "domain": "scene_descriptions",
            "source_asset": filepath,
            "mood": scene.get("mood", ""),
            "colors": scene.get("colors", []),
            "composition": scene.get("composition", ""),
            "camera": scene.get("camera", ""),
            "model": model,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        })
        print(f"OK ({scene.get('mood', '?')})")
    # Write output as JSONL, one pair per line.
    if pairs and not dry_run:
        with open(output_path, "w") as f:
            for p in pairs:
                f.write(json.dumps(p, ensure_ascii=False) + "\n")
        print(f"\nGenerated: {len(pairs)} pairs, {errors} errors")
        print(f"Output: {output_path}")
    elif dry_run:
        print(f"\nDry run: would generate {len(files)} pairs")
    return pairs
def main():
    """CLI entry point: parse command-line options and run the generator."""
    import argparse

    ap = argparse.ArgumentParser(description="Generate scene descriptions from images")
    ap.add_argument("--input", required=True, help="Assets directory")
    ap.add_argument("--output", default="training/data/scene-descriptions/auto-generated-scenes.jsonl")
    ap.add_argument("--model", default="gemma3:27b")
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument("--limit", type=int, default=None)
    opts = ap.parse_args()
    generate_training_pairs(opts.input, opts.output, opts.model, opts.dry_run, opts.limit)


if __name__ == "__main__":
    main()