|
|
|
|
@@ -1,191 +0,0 @@
|
|
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
"""
|
|
|
|
|
Auto-generate scene descriptions from image/video assets.
|
|
|
|
|
|
|
|
|
|
Scans a directory for media files, generates scene descriptions using
|
|
|
|
|
a local vision model (Ollama), and outputs training pairs in JSONL format.
|
|
|
|
|
|
|
|
|
|
Usage:
|
|
|
|
|
python scripts/generate_scene_descriptions.py --input ./assets --output training-data/scene-descriptions-auto.jsonl
|
|
|
|
|
python scripts/generate_scene_descriptions.py --input ./assets --model gemma3:12b --limit 50
|
|
|
|
|
python scripts/generate_scene_descriptions.py --input ./assets --dry-run  # List files without generating
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
|
import json
|
|
|
|
|
import os
|
|
|
|
|
import subprocess
|
|
|
|
|
import sys
|
|
|
|
|
import time
|
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from typing import Optional
|
|
|
|
|
|
|
|
|
|
# Recognised media suffixes: lowercase, leading dot included, grouped by kind.
IMAGE_EXTS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".gif",
    ".webp",
    ".bmp",
}
VIDEO_EXTS = {
    ".mp4",
    ".webm",
    ".mov",
    ".avi",
    ".mkv",
}
# Union of both groups, for code that does not care about the media kind.
ALL_EXTS = IMAGE_EXTS.union(VIDEO_EXTS)
|
|
|
|
|
|
|
|
|
|
# Instruction text sent along with each image. It asks the vision model to
# answer with a JSON object carrying the keys mood, colors, composition,
# camera, lighting and description.
SCENE_PROMPT = """Describe this image for a visual scene database. Output JSON:
{
"mood": "one of: calm, energetic, dark, warm, cool, chaotic, serene, tense, joyful, melancholic",
"colors": ["dominant color 1", "dominant color 2", "dominant color 3"],
"composition": "one of: close-up, wide-shot, medium-shot, low-angle, high-angle, bird-eye, profile, over-shoulder",
"camera": "one of: static, slow-pan, tracking, handheld, crane, dolly, steady, locked-off",
"lighting": "one of: natural, artificial, mixed, dramatic, soft, harsh, backlit",
"description": "2-3 sentence visual description of the scene"
}

Be specific. Describe what you see, not what you imagine."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def scan_media(input_dir: str) -> list[Path]:
    """Recursively collect the media files found under a directory.

    A path qualifies when it is a regular file whose suffix, compared
    case-insensitively, is in ``ALL_EXTS``. The tree is walked once.

    Args:
        input_dir: Directory to search, including all sub-directories.

    Returns:
        The matching paths in sorted order. An empty list when
        ``input_dir`` does not exist (an error is printed to stderr).
    """
    input_path = Path(input_dir)
    if not input_path.exists():
        print(f"Error: {input_dir} does not exist", file=sys.stderr)
        return []

    # Lower-casing the suffix also catches mixed-case names such as
    # "photo.Jpg"; is_file() drops directories that merely look like media.
    return sorted(
        candidate
        for candidate in input_path.rglob("*")
        if candidate.suffix.lower() in ALL_EXTS and candidate.is_file()
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_video_frame(video_path: Path, output_path: Path) -> bool:
    """Write the first frame of a video to ``output_path`` using ffmpeg.

    Args:
        video_path: Video file to read.
        output_path: Image file to create; overwritten if it already exists.

    Returns:
        True only when ffmpeg exits successfully and ``output_path`` exists
        afterwards. False when ffmpeg cannot be started, times out after
        30 seconds, or reports an error.
    """
    command = [
        "ffmpeg",
        "-y",  # overwrite output_path without asking
        "-i", str(video_path),
        "-vframes", "1",
        "-q:v", "2",
        str(output_path),
    ]
    try:
        result = subprocess.run(
            command,
            # Keep ffmpeg from reading (and swallowing) the caller's stdin.
            stdin=subprocess.DEVNULL,
            capture_output=True,
            timeout=30,
        )
    except (OSError, subprocess.SubprocessError):
        # ffmpeg is missing or not executable, or the timeout expired.
        return False

    # Checking the exit status as well means a leftover file from an earlier
    # run cannot make a failed extraction look successful.
    return result.returncode == 0 and output_path.exists()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def describe_image(image_path: Path, model: str = "gemma3:12b",
                   ollama_url: str = "http://localhost:11434") -> Optional[dict]:
    """Ask an Ollama vision model for a structured description of one image.

    The image is base64-encoded and POSTed to ``{ollama_url}/api/generate``
    together with ``SCENE_PROMPT``; the JSON object in the model's reply is
    returned.

    Args:
        image_path: Image file to describe.
        model: Ollama model name to query.
        ollama_url: Base URL of the Ollama server.

    Returns:
        The JSON object parsed from the reply. When the reply contains no
        ``{...}`` span, a placeholder dict with the first 500 characters of
        the raw text under ``"description"`` and ``"unknown"``/empty values
        for the remaining keys. ``None`` when anything fails (unreadable
        file, request error, malformed JSON); the error is printed to stderr.
    """
    # Imported locally; only this function needs them.
    import base64
    import urllib.request

    try:
        with open(image_path, "rb") as image_file:
            image_b64 = base64.b64encode(image_file.read()).decode()

        payload = {
            "model": model,
            "prompt": SCENE_PROMPT,
            "images": [image_b64],
            "stream": False,
            "options": {"temperature": 0.3, "num_predict": 512},
        }
        req = urllib.request.Request(
            f"{ollama_url}/api/generate",
            data=json.dumps(payload).encode(),
            headers={"Content-Type": "application/json"},
        )
        # The context manager closes the HTTP response even if reading fails.
        with urllib.request.urlopen(req, timeout=120) as resp:
            data = json.loads(resp.read())
        response_text = data.get("response", "")

        start = response_text.find("{")
        if start != -1 and response_text.rfind("}") > start:
            # raw_decode stops at the end of the first complete object, so
            # commentary after the JSON (even with braces) is ignored.
            parsed, _ = json.JSONDecoder().raw_decode(response_text[start:])
            return parsed

        # No JSON object in the reply: keep the raw text as the description.
        return {"description": response_text[:500], "mood": "unknown",
                "colors": [], "composition": "unknown", "camera": "unknown", "lighting": "unknown"}
    except Exception as e:
        # Per-file boundary: report the failure and let the caller move on.
        print(f" Error describing {image_path.name}: {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_training_pairs(media_files: list[Path], model: str, ollama_url: str,
                            limit: int = 0, dry_run: bool = False) -> list[dict]:
    """Describe each media file and collect the results as training records.

    Images are passed to ``describe_image`` directly. For a video, one frame
    is first extracted into a throwaway temporary directory, so nothing is
    written into the asset tree and no frame file is left behind.

    Args:
        media_files: Paths to process, in order.
        model: Ollama model name, forwarded to ``describe_image`` and stored
            in every record.
        ollama_url: Base URL of the Ollama server.
        limit: Process at most this many files; 0 or less means all.
        dry_run: When true, call neither ffmpeg nor the model and emit a
            ``{"source", "status": "dry-run"}`` stub per file instead.

    Returns:
        One record per successfully described file (or per file in dry-run
        mode). Files whose frame extraction or description fails are skipped.
    """
    import tempfile  # imported locally; only the video branch needs it

    files = media_files[:limit] if limit > 0 else media_files
    total = len(files)
    print(f"Processing {total} files...", file=sys.stderr)

    pairs = []
    for index, media_path in enumerate(files, start=1):
        # Flush so the file name is visible before the model call returns.
        print(f" [{index}/{total}] {media_path.name}...",
              file=sys.stderr, end=" ", flush=True)

        if dry_run:
            print("(dry run)", file=sys.stderr)
            pairs.append({"source": str(media_path), "status": "dry-run"})
            continue

        is_video = media_path.suffix.lower() in VIDEO_EXTS
        if is_video:
            # The frame lives in its own temp dir, removed when the block
            # exits on any path, so existing assets are never overwritten.
            with tempfile.TemporaryDirectory(prefix="scene-frame-") as frame_dir:
                frame_path = Path(frame_dir) / f"{media_path.stem}.frame.jpg"
                if not extract_video_frame(media_path, frame_path):
                    print("SKIP (frame extraction failed)", file=sys.stderr)
                    continue
                description = describe_image(frame_path, model, ollama_url)
        else:
            description = describe_image(media_path, model, ollama_url)

        if not description:
            print("FAIL", file=sys.stderr)
            continue

        pairs.append({
            "source": str(media_path),
            "media_type": "video" if is_video else "image",
            "description": description,
            "model": model,
            "generated_at": datetime.now(timezone.utc).isoformat(),
        })
        print("OK", file=sys.stderr)

    return pairs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Command-line entry point: scan for media, describe it, write JSONL.

    Parses the command line, collects media files under ``--input``, builds
    one record per file via ``generate_training_pairs`` and writes the
    records to ``--output``, one JSON object per line. Exits with status 1
    when no media files are found.
    """
    parser = argparse.ArgumentParser(description="Auto-generate scene descriptions from media")
    parser.add_argument("--input", "-i", required=True, help="Input directory with media files")
    parser.add_argument("--output", "-o", default="training-data/scene-descriptions-auto.jsonl",
                        help="Output JSONL file")
    parser.add_argument("--model", "-m", default="gemma3:12b", help="Ollama model name")
    parser.add_argument("--ollama-url", default="http://localhost:11434",
                        help="Base URL of the Ollama server")
    parser.add_argument("--limit", "-l", type=int, default=0, help="Max files to process (0=all)")
    parser.add_argument("--dry-run", action="store_true", help="List files without generating")
    args = parser.parse_args()

    media_files = scan_media(args.input)
    print(f"Found {len(media_files)} media files", file=sys.stderr)

    if not media_files:
        print("No media files found.", file=sys.stderr)
        sys.exit(1)

    pairs = generate_training_pairs(media_files, args.model, args.ollama_url,
                                    args.limit, args.dry_run)

    # Write output
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: records are dumped with ensure_ascii=False, so the
    # platform's default encoding could fail on non-ASCII text.
    with open(output_path, "w", encoding="utf-8") as f:
        for pair in pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

    print(f"\nWrote {len(pairs)} pairs to {output_path}", file=sys.stderr)


if __name__ == "__main__":
    main()
|