Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 31s
PR Checklist / pr-checklist (pull_request) Failing after 13m48s
Smoke Test / smoke (pull_request) Failing after 13m22s
Validate Config / YAML Lint (pull_request) Failing after 5s
Validate Config / JSON Validate (pull_request) Successful in 4s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 21s
Validate Config / Shell Script Lint (pull_request) Failing after 22s
Validate Config / Cron Syntax Check (pull_request) Successful in 8s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 8s
Validate Training Data / validate (pull_request) Successful in 10s
Validate Config / Playbook Schema Validation (pull_request) Successful in 15s
Architecture Lint / Lint Repository (pull_request) Has been cancelled
Validate Config / Python Test Suite (pull_request) Has been cancelled
167 lines
5.6 KiB
Python
Executable File
167 lines
5.6 KiB
Python
Executable File
#!/usr/bin/env python3
"""
generate_scene_descriptions.py — Auto-generate scene descriptions from image/video assets.

Scans an assets directory for image (and video) files, generates a rich scene
description for each one using a local vision model, and writes the results
out as training pairs.

Usage:
    python3 training/scripts/generate_scene_descriptions.py --input assets/
    python3 training/scripts/generate_scene_descriptions.py --input assets/ --model gemma3:27b
    python3 training/scripts/generate_scene_descriptions.py --input assets/ --dry-run
"""
|
|
|
|
import json
|
|
import os
|
|
import sys
|
|
import base64
|
|
import glob
|
|
import urllib.request
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# Base URL of the OpenAI-compatible API; overridable through OPENAI_BASE_URL,
# otherwise a server on localhost:11434 is assumed.
ENDPOINT = os.getenv("OPENAI_BASE_URL", "http://localhost:11434/v1")
# Bearer token sent with every request; overridable through OPENAI_API_KEY.
API_KEY = os.getenv("OPENAI_API_KEY", "ollama")
|
|
|
|
# Instruction text sent alongside each image. It asks the model for a bare JSON
# object with the keys mood, colors, composition, camera and description.
DESCRIPTION_PROMPT = "\n".join((
    "Describe this image as a rich visual scene for a training dataset.",
    "",
    "Output a JSON object with these fields:",
    "- mood: primary emotional mood (1-2 words)",
    "- colors: array of 3-5 dominant colors",
    "- composition: framing type (wide shot, close-up, etc.)",
    "- camera: camera movement suggestion",
    "- description: rich 2-3 sentence scene description",
    "",
    "Focus on: lighting, atmosphere, emotion, visual composition. Be specific, not generic.",
    "",
    "Return ONLY valid JSON, no markdown fences.",
))
|
|
|
|
|
|
def encode_image(image_path: str) -> str:
    """Return the contents of the file at *image_path* as base64 text.

    The file is opened in binary mode and read in a single call, so the whole
    payload is held in memory. Errors raised by ``open`` (for example a
    missing file) propagate to the caller.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode()
|
|
|
|
|
|
def generate_description(image_path: str, model: str = "gemma3:27b") -> dict:
    """Generate a scene description for one image using a vision model.

    The file is base64-encoded, embedded as a data URL next to
    ``DESCRIPTION_PROMPT`` and POSTed to ``{ENDPOINT}/chat/completions``.
    The text of the first choice in the reply is parsed as JSON.

    Args:
        image_path: Path of the file to describe.
        model: Model name placed in the request payload.

    Returns:
        The JSON object parsed from the model's reply. On any failure
        (unreadable file, HTTP/network error, malformed or non-object reply)
        a placeholder dict carrying an ``"error"`` message and empty
        description fields is returned instead of raising.
    """
    try:
        # Encoding happens inside the try so an unreadable file produces the
        # same error dict as a failed request instead of aborting the caller.
        b64 = encode_image(image_path)

        payload = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": DESCRIPTION_PROMPT},
                        # Every file is labelled image/jpeg regardless of its
                        # real type. NOTE(review): presumably the server only
                        # inspects the decoded bytes — confirm before changing.
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}},
                    ],
                }
            ],
            "max_tokens": 512,
            "temperature": 0.3,
        }

        # Tolerate a trailing slash in OPENAI_BASE_URL ("…/v1/") so the path
        # does not become "…/v1//chat/completions".
        url = f"{ENDPOINT.rstrip('/')}/chat/completions"
        req = urllib.request.Request(
            url,
            data=json.dumps(payload).encode(),
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {API_KEY}",
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=60) as resp:
            result = json.loads(resp.read())

        content = result["choices"][0]["message"]["content"].strip()

        # The prompt asks for bare JSON, but models sometimes wrap it in a
        # markdown fence anyway: drop the opening fence line and anything from
        # the last closing fence onwards.
        if content.startswith("```"):
            _, newline, remainder = content.partition("\n")
            # A fence with no newline at all: just peel off the backticks.
            fenced = remainder if newline else content[3:]
            content = fenced.rsplit("```", 1)[0].strip()

        scene = json.loads(content)
        if not isinstance(scene, dict):
            # Valid JSON that is not an object (list, string, number) cannot
            # supply the expected fields; report it as a failure.
            raise ValueError(f"expected a JSON object, got {type(scene).__name__}")
        return scene
    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a batch job,
        # and one bad image or reply must not stop the remaining files.
        return {"error": str(e), "description": "", "mood": "unknown", "colors": [], "composition": "", "camera": ""}
|
|
|
|
|
|
def scan_assets(directory: str) -> list:
    """Return the sorted paths of all image and video files under *directory*.

    The tree is searched recursively in a single pass. A path is kept when its
    name ends with one of the known media extensions, compared
    case-insensitively so ``PHOTO.JPG`` is found as well as ``photo.jpg``.
    Only regular files are returned, which drops directories and broken
    symlinks that merely carry a media extension. Names starting with a dot
    are not visited, following the ``glob`` module's wildcard rules.

    Args:
        directory: Root directory to search.

    Returns:
        A sorted list of matching paths, each prefixed with *directory*.
        Empty when the directory does not exist or holds no media files.
    """
    # A tuple, because str.endswith accepts a tuple of alternatives.
    extensions = (".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".mp4", ".mov", ".avi")
    candidates = glob.glob(os.path.join(directory, "**", "*"), recursive=True)
    return sorted(
        path
        for path in candidates
        if path.lower().endswith(extensions) and os.path.isfile(path)
    )
|
|
|
|
|
|
def generate_training_pairs(assets_dir: str, output_path: str, model: str = "gemma3:27b",
                            dry_run: bool = False, limit: int = None):
    """Generate training pairs from the media files under *assets_dir*.

    Each file found by ``scan_assets`` is passed to ``generate_description``.
    Successful results become one record each and are written to
    *output_path* as JSON Lines (UTF-8). Progress and a summary are printed
    to stdout.

    Args:
        assets_dir: Directory scanned for assets.
        output_path: Destination JSONL file; its parent directory is created
            if needed. The file is only written when at least one pair exists.
        model: Model name forwarded to ``generate_description`` and stored in
            every record.
        dry_run: When true, only list the files; nothing is generated or
            written.
        limit: Maximum number of files to process. ``None`` means no limit;
            ``0`` or a negative value processes no files.

    Returns:
        The list of generated pair dicts (empty for a dry run).
    """
    files = scan_assets(assets_dir)
    if limit is not None:
        # "is not None" so that an explicit limit of 0 is honoured instead of
        # being treated as "no limit"; negatives are clamped to 0.
        files = files[:max(limit, 0)]

    print(f"Assets directory: {assets_dir}")
    print(f"Found: {len(files)} image/video files")
    print(f"Model: {model}")
    print(f"Mode: {'DRY RUN' if dry_run else 'GENERATE'}")
    print()

    pairs = []
    errors = 0

    for i, filepath in enumerate(files):
        filename = os.path.basename(filepath)
        print(f" [{i+1}/{len(files)}] {filename}...", end=" ", flush=True)

        if dry_run:
            print("SKIP")
            continue

        scene = generate_description(filepath, model)

        # Anything that is not a dict, or that carries an "error" key, counts
        # as a failure; str() keeps the slice safe for non-string errors.
        if not isinstance(scene, dict):
            failure = f"unexpected response type {type(scene).__name__}"
        elif "error" in scene:
            failure = str(scene["error"])
        else:
            failure = None

        if failure is not None:
            print(f"ERROR: {failure[:50]}")
            errors += 1
            continue

        pair = {
            "terse": f"[image: {filename}]",
            "rich": scene.get("description", ""),
            "domain": "scene_descriptions",
            "source_asset": filepath,
            "mood": scene.get("mood", ""),
            "colors": scene.get("colors", []),
            "composition": scene.get("composition", ""),
            "camera": scene.get("camera", ""),
            "model": model,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }
        pairs.append(pair)
        print(f"OK ({scene.get('mood', '?')})")

    # Write output
    if dry_run:
        print(f"\nDry run: would generate {len(files)} pairs")
    elif pairs:
        # Create the destination directory first so the generated results are
        # not lost to a FileNotFoundError at write time.
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        # ensure_ascii=False emits raw non-ASCII text, so the encoding is
        # pinned instead of relying on the platform default.
        with open(output_path, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(p, ensure_ascii=False) + "\n" for p in pairs)
        print(f"\nGenerated: {len(pairs)} pairs, {errors} errors")
        print(f"Output: {output_path}")
    else:
        # Nothing succeeded (or nothing was found): say so instead of exiting
        # silently, and leave any existing output file untouched.
        print(f"\nGenerated: 0 pairs, {errors} errors (nothing written)")

    return pairs
|
|
|
|
|
|
def main():
    """Parse the command-line options and run the training-pair generation."""
    import argparse

    cli = argparse.ArgumentParser(description="Generate scene descriptions from images")

    # One (flag, keyword-arguments) entry per supported option.
    option_specs = (
        ("--input", {"required": True, "help": "Assets directory"}),
        ("--output", {"default": "training/data/scene-descriptions/auto-generated-scenes.jsonl"}),
        ("--model", {"default": "gemma3:27b"}),
        ("--dry-run", {"action": "store_true"}),
        ("--limit", {"type": int, "default": None}),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    opts = cli.parse_args()
    generate_training_pairs(
        opts.input,
        opts.output,
        opts.model,
        opts.dry_run,
        opts.limit,
    )
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|