Merge PR #729: scripts/generate_scene_descriptions.py (added)

This commit is contained in:
Merge Bot
2026-04-16 05:03:55 +00:00
parent e8b5d5edb4
commit 17adc703f8

View File

@@ -0,0 +1,409 @@
#!/usr/bin/env python3
"""
Auto-generate scene descriptions from image/video assets.
Scans a directory for media files, generates scene descriptions using
a local vision model (Ollama), and outputs training pairs in JSONL format.
Supports Gemma 4 multimodal vision via Ollama. Falls back gracefully when
models are unavailable.
Usage:
python scripts/generate_scene_descriptions.py --input ./assets --output training-data/scene-descriptions-auto.jsonl
python scripts/generate_scene_descriptions.py --input ./assets --model gemma4:latest --limit 50
python scripts/generate_scene_descriptions.py --input ./assets --format sharegpt
python scripts/generate_scene_descriptions.py --dry-run # List files without generating
python scripts/generate_scene_descriptions.py --input ./assets --check-model # Verify model availability
Ref: timmy-config#689
"""
import argparse
import base64
import json
import os
import re
import subprocess
import sys
import time
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# Supported media extensions
IMAGE_EXTS: set[str] = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
VIDEO_EXTS: set[str] = {".mp4", ".webm", ".mov", ".avi", ".mkv"}
# Union of everything scan_media() will pick up.
ALL_EXTS: set[str] = IMAGE_EXTS | VIDEO_EXTS
# File size limit (50MB) — prevents unbounded memory usage on large images
# (images are base64-encoded fully in memory before being sent to Ollama).
MAX_FILE_SIZE: int = 50 * 1024 * 1024
# Vision models in preference order (best first); auto_detect_model() walks
# this list and uses the first one installed in the local Ollama instance.
VISION_MODELS: list[str] = [
    "gemma4:latest",  # Gemma 4 — multimodal vision (8B, Q4_K_M)
    "gemma3:12b",  # Gemma 3 — fallback vision
    "llava:latest",  # LLaVA — generic vision
    "llava-phi3:latest",  # LLaVA-Phi3 — lightweight vision
]
# Vision model prompt template (structured JSON output); describe_image()
# parses (and if necessary repairs) the model's JSON reply against this schema.
SCENE_PROMPT: str = """Describe this image for a visual scene database. Output ONLY valid JSON (no markdown, no explanation):
{
"mood": "one of: calm, energetic, dark, warm, cool, chaotic, serene, tense, joyful, melancholic",
"colors": ["dominant color 1", "dominant color 2", "dominant color 3"],
"composition": "one of: close-up, wide-shot, medium-shot, low-angle, high-angle, bird-eye, profile, over-shoulder",
"camera": "one of: static, slow-pan, tracking, handheld, crane, dolly, steady, locked-off",
"lighting": "one of: natural, artificial, mixed, dramatic, soft, harsh, backlit",
"description": "2-3 sentence visual description of the scene"
}
Be specific. Describe what you see, not what you imagine."""
# ShareGPT format prompt (for training pipeline integration) — free-form
# natural-language reply, no JSON parsing involved.
SHAREGPT_SCENE_PROMPT: str = """Analyze this image and describe the visual scene. Include mood, dominant colors, composition, camera angle, lighting, and a vivid 2-3 sentence description."""
def check_model_available(model: str, ollama_url: str = "http://localhost:11434") -> bool:
    """Return True if *model* is installed in the Ollama instance at *ollama_url*.

    Queries the ``/api/tags`` endpoint and matches against the listed model
    names. A bare model name (e.g. ``"gemma4"``) also matches its implicit
    ``":latest"`` tag, mirroring how the Ollama CLI resolves names.
    Any network or parse failure is treated as "not available".
    """
    try:
        req = urllib.request.Request(f"{ollama_url}/api/tags")
        resp = urllib.request.urlopen(req, timeout=10)
        data = json.loads(resp.read())
        available = [m["name"] for m in data.get("models", [])]
        # FIX: Ollama lists tagged names ("gemma4:latest"); an untagged query
        # like "gemma4" previously never matched even when installed.
        return model in available or (
            ":" not in model and f"{model}:latest" in available
        )
    except Exception:
        # Server down, unreachable, or malformed response — report unavailable.
        return False
def auto_detect_model(ollama_url: str = "http://localhost:11434") -> Optional[str]:
    """Return the first installed model from VISION_MODELS, or None if none are.

    Walks the preference-ordered VISION_MODELS list and probes each one
    against the Ollama instance; logs the winner to stderr.
    """
    detected = next(
        (candidate for candidate in VISION_MODELS
         if check_model_available(candidate, ollama_url)),
        None,
    )
    if detected is not None:
        print(f"Auto-detected vision model: {detected}", file=sys.stderr)
    return detected
def scan_media(input_dir: str) -> list[Path]:
    """Recursively collect supported media files under *input_dir*, sorted.

    Extensions are matched case-insensitively. This generalizes the previous
    per-extension lower/upper double scan, which missed mixed-case suffixes
    like ".Jpg" and could also match directories whose names end in an
    extension. Returns an empty list (with a stderr message) when the
    directory does not exist.
    """
    input_path = Path(input_dir)
    if not input_path.exists():
        print(f"Error: {input_dir} does not exist", file=sys.stderr)
        return []
    # Single walk + suffix check: no duplicate hits, so no dedup set needed.
    return sorted(
        p for p in input_path.rglob("*")
        if p.is_file() and p.suffix.lower() in ALL_EXTS
    )
def extract_video_frame(video_path: Path, output_path: Path) -> bool:
    """Grab one representative frame from *video_path* into *output_path*.

    Seeks 2 seconds into the video before capturing so black/title frames
    are skipped. Returns True only when ffmpeg produced a non-empty output
    file; missing ffmpeg or any other failure yields False.
    """
    command = [
        "ffmpeg",
        "-ss", "2",           # seek past intro/black frames
        "-i", str(video_path),
        "-vframes", "1",      # capture a single frame
        "-q:v", "2",          # high JPEG quality
        str(output_path),
        "-y",                 # overwrite existing output without prompting
    ]
    try:
        proc = subprocess.run(command, capture_output=True, timeout=30)
    except FileNotFoundError:
        print(" ffmpeg not found — skipping video frame extraction", file=sys.stderr)
        return False
    except Exception as exc:
        print(f" ffmpeg error: {exc}", file=sys.stderr)
        return False
    if proc.returncode != 0 and proc.stderr:
        print(f" ffmpeg stderr: {proc.stderr.decode(errors='replace')[:200]}", file=sys.stderr)
    return output_path.exists() and output_path.stat().st_size > 0
def describe_image(
    image_path: Path,
    model: str = "gemma4:latest",
    ollama_url: str = "http://localhost:11434",
    max_retries: int = 2,
) -> Optional[dict]:
    """Generate a structured scene description for *image_path* via Ollama.

    Returns a dict with mood/colors/composition/camera/lighting/description
    keys (strict JSON parse, field-by-field repair of truncated JSON, or a
    cleaned natural-language fallback — in that order), or None on failure.
    Transient network errors are retried with exponential backoff; oversized
    files are skipped up front so the in-memory base64 payload stays bounded.
    """
    # FIX #1: Check file size before reading into memory
    if image_path.stat().st_size > MAX_FILE_SIZE:
        print(f" Skipping {image_path.name}: exceeds {MAX_FILE_SIZE // (1024*1024)}MB limit", file=sys.stderr)
        return None
    for attempt in range(max_retries + 1):
        try:
            with open(image_path, "rb") as f:
                image_b64 = base64.b64encode(f.read()).decode()
            req = urllib.request.Request(
                f"{ollama_url}/api/generate",
                data=json.dumps({
                    "model": model,
                    "prompt": SCENE_PROMPT,
                    "images": [image_b64],
                    "stream": False,
                    # Low temperature for consistent structured output.
                    "options": {"temperature": 0.3, "num_predict": 1024}
                }).encode(),
                headers={"Content-Type": "application/json"},
            )
            resp = urllib.request.urlopen(req, timeout=120)
            data = json.loads(resp.read())
            response_text = data.get("response", "")
            # Locate the JSON payload. Prefer a complete {...} object; for
            # truncated output fall back to everything from the first "{".
            raw_json = None
            full_match = re.search(r"\{[\s\S]*\}", response_text)
            if full_match:
                raw_json = full_match.group()
            else:
                brace = re.search(r"\{", response_text)
                if brace:
                    # BUGFIX: previously only the "{" itself was extracted
                    # here (Match.group() of a single-char pattern), so
                    # truncated-but-parseable JSON always failed strict
                    # parsing. Take the full tail from the brace instead.
                    raw_json = response_text[brace.start():]
            if raw_json is not None:
                # Try strict parse first
                try:
                    parsed = json.loads(raw_json)
                    # "lighting" is intentionally not required — some models
                    # omit it; the repair path below fills it in when absent.
                    required = ["mood", "colors", "composition", "camera", "description"]
                    if all(k in parsed for k in required) and parsed.get("description"):
                        return parsed
                except json.JSONDecodeError:
                    # Attempt repair: scrape fields out of truncated JSON.
                    repaired = {}
                    for field in ["mood", "colors", "composition", "camera", "lighting", "description"]:
                        pat = rf'"\s*{field}"\s*:\s*"([^"]*)"'
                        m = re.search(pat, response_text)
                        if m:
                            repaired[field] = m.group(1)
                        elif field == "colors":
                            # colors is a list, not a quoted scalar.
                            colors_match = re.search(r'"colors"\s*:\s*\[([^\]]*)\]', response_text)
                            if colors_match:
                                repaired[field] = [c.strip().strip('"') for c in colors_match.group(1).split(",") if c.strip()]
                            else:
                                repaired[field] = []
                        else:
                            repaired[field] = "unknown"
                    # Only accept the repair if it recovered something real.
                    if repaired.get("description") or repaired.get("mood") != "unknown":
                        return repaired
            # Final fallback: natural language response, markdown stripped.
            clean = re.sub(r"[*_`#]", "", response_text).strip()
            clean = re.sub(r"\n{3,}", "\n\n", clean)
            return {
                "description": clean[:500] if clean else response_text[:500],
                "mood": "unknown",
                "colors": [],
                "composition": "unknown",
                "camera": "unknown",
                "lighting": "unknown"
            }
        except (urllib.error.URLError, TimeoutError) as e:
            # Transient network failure — exponential backoff, then retry.
            if attempt < max_retries:
                wait = 2 ** attempt
                print(f" Retry {attempt + 1}/{max_retries} after {wait}s: {e}", file=sys.stderr)
                time.sleep(wait)
            else:
                print(f" Error describing {image_path.name}: {e}", file=sys.stderr)
                return None
        except Exception as e:
            # Best-effort pipeline: any other failure (bad server JSON, I/O)
            # is logged and treated as "no description" — do not retry.
            print(f" Error describing {image_path.name}: {e}", file=sys.stderr)
            return None
def describe_image_sharegpt(
    image_path: Path,
    model: str = "gemma4:latest",
    ollama_url: str = "http://localhost:11434",
    max_retries: int = 2,
) -> Optional[str]:
    """Generate scene description in natural language for ShareGPT format.

    Sends the base64-encoded image to Ollama's /api/generate with the
    ShareGPT prompt and returns the stripped reply text. Network errors
    are retried with exponential backoff; any other failure returns None.
    """
    # FIX #1: Check file size before reading into memory
    if image_path.stat().st_size > MAX_FILE_SIZE:
        print(f" Skipping {image_path.name}: exceeds {MAX_FILE_SIZE // (1024*1024)}MB limit", file=sys.stderr)
        return None
    attempt = 0
    while attempt <= max_retries:
        try:
            image_b64 = base64.b64encode(image_path.read_bytes()).decode()
            body = json.dumps({
                "model": model,
                "prompt": SHAREGPT_SCENE_PROMPT,
                "images": [image_b64],
                "stream": False,
                "options": {"temperature": 0.5, "num_predict": 256},
            }).encode()
            request = urllib.request.Request(
                f"{ollama_url}/api/generate",
                data=body,
                headers={"Content-Type": "application/json"},
            )
            with urllib.request.urlopen(request, timeout=120) as resp:
                reply = json.loads(resp.read())
            return reply.get("response", "").strip()
        except (urllib.error.URLError, TimeoutError):
            if attempt >= max_retries:
                return None
            time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, ...
        except Exception:
            # Best-effort: silently give up on non-network failures.
            return None
        attempt += 1
def generate_training_pairs(
    media_files: list[Path],
    model: str,
    ollama_url: str,
    limit: int = 0,
    dry_run: bool = False,
    output_format: str = "jsonl",
) -> list[dict]:
    """Generate training pairs from media files.

    For each file: videos first get a single representative frame extracted
    (the file is skipped entirely if extraction fails); the image/frame is
    then described by the vision model and packaged as either a structured
    JSONL record or a ShareGPT conversation. Temporary frame files are
    always removed via try/finally, even if description raises.

    Args:
        media_files: Candidate image/video paths (e.g. from scan_media).
        model: Ollama model name passed to the describe_* helpers.
        ollama_url: Base URL of the Ollama server.
        limit: Process at most this many files (0 = no limit).
        dry_run: If True, emit {"source", "status": "dry-run"} stubs only.
        output_format: "jsonl" (structured) or "sharegpt" (conversation).

    Returns:
        List of generated pair dicts; files that failed are omitted.
    """
    pairs = []
    files = media_files[:limit] if limit > 0 else media_files
    print(f"Processing {len(files)} files with model {model}...", file=sys.stderr)
    for i, media_path in enumerate(files):
        print(f" [{i + 1}/{len(files)}] {media_path.name}...", file=sys.stderr, end=" ", flush=True)
        if dry_run:
            print("(dry run)", file=sys.stderr)
            pairs.append({"source": str(media_path), "status": "dry-run"})
            continue  # NOTE: also skips the inter-file sleep below
        is_video = media_path.suffix.lower() in VIDEO_EXTS
        work_path = media_path
        if is_video:
            # Videos can't be sent to the vision model directly — describe a
            # single extracted frame instead (written next to the source).
            frame_path = media_path.with_suffix(".frame.jpg")
            if extract_video_frame(media_path, frame_path):
                work_path = frame_path
            else:
                print("SKIP (frame extraction failed)", file=sys.stderr)
                continue  # nothing to clean up yet; skips the sleep too
        try:
            if output_format == "sharegpt":
                # ShareGPT format for training pipeline
                description = describe_image_sharegpt(work_path, model, ollama_url)
                if description:
                    pair = {
                        "conversations": [
                            {"from": "human", "value": f"<image>\n{SHAREGPT_SCENE_PROMPT}"},
                            {"from": "gpt", "value": description}
                        ],
                        "source": str(media_path),
                        "media_type": "video" if is_video else "image",
                        "model": model,
                        "generated_at": datetime.now(timezone.utc).isoformat(),
                    }
                    pairs.append(pair)
                    print("OK", file=sys.stderr)
                else:
                    print("FAIL", file=sys.stderr)
            else:
                # Structured JSONL format
                description = describe_image(work_path, model, ollama_url)
                if description:
                    pair = {
                        "source": str(media_path),
                        "media_type": "video" if is_video else "image",
                        "description": description,
                        "model": model,
                        "generated_at": datetime.now(timezone.utc).isoformat(),
                    }
                    pairs.append(pair)
                    print("OK", file=sys.stderr)
                else:
                    print("FAIL", file=sys.stderr)
        finally:
            # FIX #6: Cleanup temp frame in try/finally — survives crashes
            if is_video and work_path != media_path:
                try:
                    work_path.unlink()
                except Exception:
                    # Best-effort cleanup; a stale .frame.jpg is harmless.
                    pass
        # Small delay between files (reduced from 0.5s — Ollama is local)
        time.sleep(0.1)
    return pairs
def main():
    """CLI entry point: parse args, pick a model, scan media, write JSONL.

    Exit codes: 0 on success (and on --check-model when the model is
    available); 1 on missing input, no media files found, or --check-model
    when no usable model is installed.
    """
    parser = argparse.ArgumentParser(
        description="Auto-generate scene descriptions from media assets using vision AI"
    )
    parser.add_argument("--input", "-i", default="", help="Input directory with media files")
    parser.add_argument("--output", "-o", default="training-data/scene-descriptions-auto.jsonl")
    parser.add_argument("--model", "-m", default=None, help="Ollama model name (auto-detects best available if omitted)")
    parser.add_argument("--ollama-url", default="http://localhost:11434")
    parser.add_argument("--limit", "-l", type=int, default=0, help="Max files to process (0=all)")
    parser.add_argument("--dry-run", action="store_true", help="List files without generating")
    parser.add_argument("--check-model", action="store_true", help="Check model availability and exit")
    parser.add_argument("--format", choices=["jsonl", "sharegpt"], default="jsonl",
                        help="Output format: jsonl (structured) or sharegpt (training pipeline)")
    args = parser.parse_args()

    # --check-model: report availability and exit.
    # FIX: exit non-zero when the model is NOT available so shell scripts can
    # branch on the result (previously this always exited 0).
    if args.check_model:
        if args.model:
            available = check_model_available(args.model, args.ollama_url)
            print(f"Model '{args.model}': {'✅ available' if available else '❌ not found'}")
            sys.exit(0 if available else 1)
        model = auto_detect_model(args.ollama_url)
        if model:
            print(f"✅ Best available: {model}")
            sys.exit(0)
        print("❌ No vision models found in Ollama — install one with: ollama pull gemma4:latest")
        sys.exit(1)

    # Auto-detect model if not specified
    model = args.model
    if not model:
        model = auto_detect_model(args.ollama_url)
        if not model:
            # Fall back to best default even if not installed — let Ollama
            # surface the error on the first request.
            model = "gemma4:latest"
            print(f"Warning: No vision models detected. Falling back to {model}", file=sys.stderr)

    # Validate input
    if not args.input:
        print("Error: --input is required (unless using --check-model)", file=sys.stderr)
        sys.exit(1)

    # Scan and process
    media_files = scan_media(args.input)
    print(f"Found {len(media_files)} media files", file=sys.stderr)
    if not media_files:
        print("No media files found.", file=sys.stderr)
        sys.exit(1)
    pairs = generate_training_pairs(
        media_files, model, args.ollama_url,
        args.limit, args.dry_run, args.format
    )

    # Write output (one JSON object per line)
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        for pair in pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")
    print(f"\nWrote {len(pairs)} pairs to {output_path}", file=sys.stderr)

    # Summary. FIX: dry-run stubs have neither "description" nor
    # "conversations", so the old count reported every dry-run file as failed.
    if not args.dry_run:
        success = len([p for p in pairs if "description" in p or "conversations" in p])
        failed = len(pairs) - success
        if failed > 0:
            print(f" ⚠️ {failed} files failed", file=sys.stderr)


if __name__ == "__main__":
    main()