#!/usr/bin/env python3
"""
Auto-generate scene descriptions from image/video assets.

Scans a directory for media files, generates scene descriptions using
a local vision model (Ollama), and outputs training pairs in JSONL format.

Supports Gemma 4 multimodal vision via Ollama. Falls back gracefully when
models are unavailable.

Usage:
    python scripts/generate_scene_descriptions.py --input ./assets --output training-data/scene-descriptions-auto.jsonl
    python scripts/generate_scene_descriptions.py --input ./assets --model gemma4:latest --limit 50
    python scripts/generate_scene_descriptions.py --input ./assets --format sharegpt
    python scripts/generate_scene_descriptions.py --dry-run            # List files without generating
    python scripts/generate_scene_descriptions.py --input ./assets --check-model  # Verify model availability

Ref: timmy-config #689
"""
import argparse
import base64
import json
import os
import re
import subprocess
import sys
import time
import urllib.error
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# Supported media extensions (lowercase; matching is case-insensitive at scan time)
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
VIDEO_EXTS = {".mp4", ".webm", ".mov", ".avi", ".mkv"}
ALL_EXTS = IMAGE_EXTS | VIDEO_EXTS

# Vision models in preference order (best first)
VISION_MODELS = [
    "gemma4:latest",      # Gemma 4 — multimodal vision (8B, Q4_K_M)
    "gemma3:12b",         # Gemma 3 — fallback vision
    "llava:latest",       # LLaVA — generic vision
    "llava-phi3:latest",  # LLaVA-Phi3 — lightweight vision
]
# Vision model prompt template (structured JSON output)
SCENE_PROMPT = """Describe this image for a visual scene database. Output ONLY valid JSON (no markdown, no explanation):
{
  "mood": "one of: calm, energetic, dark, warm, cool, chaotic, serene, tense, joyful, melancholic",
  "colors": ["dominant color 1", "dominant color 2", "dominant color 3"],
  "composition": "one of: close-up, wide-shot, medium-shot, low-angle, high-angle, bird-eye, profile, over-shoulder",
  "camera": "one of: static, slow-pan, tracking, handheld, crane, dolly, steady, locked-off",
  "lighting": "one of: natural, artificial, mixed, dramatic, soft, harsh, backlit",
  "description": "2-3 sentence visual description of the scene"
}
Be specific. Describe what you see, not what you imagine."""

# ShareGPT format prompt (for training pipeline integration)
SHAREGPT_SCENE_PROMPT = """Analyze this image and describe the visual scene. Include mood, dominant colors, composition, camera angle, lighting, and a vivid 2-3 sentence description."""
def check_model_available(model: str, ollama_url: str = "http://localhost:11434") -> bool:
    """Return True if *model* is listed by Ollama's /api/tags endpoint.

    Any failure (server down, timeout, malformed JSON) is treated as
    "not available" rather than raised — callers use this as a probe.
    """
    try:
        req = urllib.request.Request(f"{ollama_url}/api/tags")
        # Context manager ensures the HTTP response/socket is closed.
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
        return model in {m["name"] for m in data.get("models", [])}
    except Exception:
        return False
def auto_detect_model(ollama_url: str = "http://localhost:11434") -> Optional[str]:
    """Return the first available model from VISION_MODELS (best first), or None."""
    for model in VISION_MODELS:
        if check_model_available(model, ollama_url):
            print(f"Auto-detected vision model: {model}", file=sys.stderr)
            return model
    return None
def scan_media(input_dir: str) -> list[Path]:
    """Recursively collect media files under *input_dir*.

    Extension matching is fully case-insensitive (handles .JPG, .Mp4, ...),
    generalizing the earlier lower/UPPER-only glob pairs, and walks the
    tree once instead of once per extension variant.
    Returns a sorted, de-duplicated list; empty list if the directory
    does not exist.
    """
    input_path = Path(input_dir)
    if not input_path.exists():
        print(f"Error: {input_dir} does not exist", file=sys.stderr)
        return []
    media_files = [
        p for p in input_path.rglob("*")
        if p.is_file() and p.suffix.lower() in ALL_EXTS
    ]
    return sorted(set(media_files))
def extract_video_frame(video_path: Path, output_path: Path) -> bool:
    """Extract a single representative frame from *video_path* using ffmpeg.

    Returns True only when a non-empty frame file exists afterwards.
    """
    try:
        # "-y" (overwrite) must come BEFORE the output path: ffmpeg ignores
        # trailing options, and would then prompt on overwrite and hang
        # until the timeout with stdin captured.
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(video_path),
             "-vframes", "1", "-q:v", "2", str(output_path)],
            capture_output=True, timeout=30,
        )
        # Trust the filesystem, not the exit code: some builds return
        # non-zero even after writing a usable frame.
        return output_path.exists() and output_path.stat().st_size > 0
    except FileNotFoundError:
        print("ffmpeg not found — skipping video frame extraction", file=sys.stderr)
        return False
    except Exception as e:
        # Includes subprocess.TimeoutExpired.
        print(f"ffmpeg error: {e}", file=sys.stderr)
        return False
def describe_image (
image_path : Path ,
model : str = " gemma4:latest " ,
ollama_url : str = " http://localhost:11434 " ,
max_retries : int = 2 ,
) - > Optional [ dict ] :
""" Generate scene description using Ollama vision model with retry. """
for attempt in range ( max_retries + 1 ) :
try :
with open ( image_path , " rb " ) as f :
image_b64 = base64 . b64encode ( f . read ( ) ) . decode ( )
req = urllib . request . Request (
f " { ollama_url } /api/generate " ,
data = json . dumps ( {
" model " : model ,
" prompt " : SCENE_PROMPT ,
" images " : [ image_b64 ] ,
" stream " : False ,
" options " : { " temperature " : 0.3 , " num_predict " : 512 }
} ) . encode ( ) ,
headers = { " Content-Type " : " application/json " } ,
)
resp = urllib . request . urlopen ( req , timeout = 120 )
data = json . loads ( resp . read ( ) )
response_text = data . get ( " response " , " " )
# Parse JSON from response
json_match = re . search ( r " \ { [ \ s \ S]* \ } " , response_text )
if json_match :
2026-04-15 09:42:27 +00:00
try :
parsed = json . loads ( json_match . group ( ) )
# Validate required fields
required = [ " mood " , " colors " , " composition " , " camera " , " description " ]
if all ( k in parsed for k in required ) :
if parsed [ " description " ] : # Ensure non-empty description
return parsed
except json . JSONDecodeError :
pass
# Fallback: model returned natural language — wrap it
# Clean up markdown formatting
clean = re . sub ( r " [*_`#] " , " " , response_text ) . strip ( )
clean = re . sub ( r " \ n { 3,} " , " \n \n " , clean )
2026-04-15 09:37:25 +00:00
return {
2026-04-15 09:42:27 +00:00
" description " : clean [ : 500 ] if clean else response_text [ : 500 ] ,
2026-04-15 09:37:25 +00:00
" mood " : " unknown " ,
" colors " : [ ] ,
" composition " : " unknown " ,
" camera " : " unknown " ,
" lighting " : " unknown "
}
except ( urllib . error . URLError , TimeoutError ) as e :
if attempt < max_retries :
wait = 2 * * attempt
print ( f " Retry { attempt + 1 } / { max_retries } after { wait } s: { e } " , file = sys . stderr )
time . sleep ( wait )
else :
print ( f " Error describing { image_path . name } : { e } " , file = sys . stderr )
return None
except Exception as e :
print ( f " Error describing { image_path . name } : { e } " , file = sys . stderr )
return None
def describe_image_sharegpt(
    image_path: Path,
    model: str = "gemma4:latest",
    ollama_url: str = "http://localhost:11434",
    max_retries: int = 2,
) -> Optional[str]:
    """Generate a natural-language scene description for ShareGPT pairs.

    Mirrors describe_image() but returns plain text and uses a slightly
    higher temperature for more varied prose. Returns None on failure.
    """
    for attempt in range(max_retries + 1):
        try:
            with open(image_path, "rb") as f:
                image_b64 = base64.b64encode(f.read()).decode()
            req = urllib.request.Request(
                f"{ollama_url}/api/generate",
                data=json.dumps({
                    "model": model,
                    "prompt": SHAREGPT_SCENE_PROMPT,
                    "images": [image_b64],
                    "stream": False,
                    "options": {"temperature": 0.5, "num_predict": 256},
                }).encode(),
                headers={"Content-Type": "application/json"},
            )
            with urllib.request.urlopen(req, timeout=120) as resp:
                data = json.loads(resp.read())
            return data.get("response", "").strip()
        except (urllib.error.URLError, TimeoutError) as e:
            if attempt < max_retries:
                time.sleep(2 ** attempt)
            else:
                # Log instead of failing silently — consistent with describe_image.
                print(f"Error describing {image_path.name}: {e}", file=sys.stderr)
                return None
        except Exception as e:
            print(f"Error describing {image_path.name}: {e}", file=sys.stderr)
            return None
    return None
def generate_training_pairs(
    media_files: list[Path],
    model: str,
    ollama_url: str,
    limit: int = 0,
    dry_run: bool = False,
    output_format: str = "jsonl",
) -> list[dict]:
    """Generate training pairs from media files.

    Videos get a representative frame extracted first; the temp frame is
    always removed afterwards (try/finally), even if description fails.
    limit=0 means "process everything". dry_run lists files without
    calling any model. output_format selects "jsonl" (structured dict)
    or "sharegpt" (conversation pairs).
    """
    pairs: list[dict] = []
    files = media_files[:limit] if limit > 0 else media_files
    print(f"Processing {len(files)} files with model {model}...", file=sys.stderr)
    for i, media_path in enumerate(files):
        print(f"[{i + 1}/{len(files)}] {media_path.name}... ",
              file=sys.stderr, end="", flush=True)
        if dry_run:
            print("(dry run)", file=sys.stderr)
            pairs.append({"source": str(media_path), "status": "dry-run"})
            continue
        is_video = media_path.suffix.lower() in VIDEO_EXTS
        work_path = media_path
        if is_video:
            # e.g. clip.mp4 -> clip.frame.jpg, written next to the source.
            frame_path = media_path.with_suffix(".frame.jpg")
            if extract_video_frame(media_path, frame_path):
                work_path = frame_path
            else:
                print("SKIP (frame extraction failed)", file=sys.stderr)
                continue
        try:
            if output_format == "sharegpt":
                # ShareGPT format for training pipeline
                description = describe_image_sharegpt(work_path, model, ollama_url)
                if description:
                    pairs.append({
                        "conversations": [
                            {"from": "human", "value": f"<image>\n{SHAREGPT_SCENE_PROMPT}"},
                            {"from": "gpt", "value": description},
                        ],
                        "source": str(media_path),
                        "media_type": "video" if is_video else "image",
                        "model": model,
                        "generated_at": datetime.now(timezone.utc).isoformat(),
                    })
                    print("OK", file=sys.stderr)
                else:
                    print("FAIL", file=sys.stderr)
            else:
                # Structured JSONL format
                description = describe_image(work_path, model, ollama_url)
                if description:
                    pairs.append({
                        "source": str(media_path),
                        "media_type": "video" if is_video else "image",
                        "description": description,
                        "model": model,
                        "generated_at": datetime.now(timezone.utc).isoformat(),
                    })
                    print("OK", file=sys.stderr)
                else:
                    print("FAIL", file=sys.stderr)
        finally:
            # Cleanup temp frame even if description raised.
            if is_video and work_path != media_path:
                try:
                    work_path.unlink()
                except OSError:
                    pass
        # Small delay to avoid overwhelming Ollama
        time.sleep(0.5)
    return pairs
def main():
    """CLI entry point: parse args, resolve a model, scan media, generate, write JSONL."""
    parser = argparse.ArgumentParser(
        description="Auto-generate scene descriptions from media assets using vision AI"
    )
    parser.add_argument("--input", "-i", required=True, help="Input directory with media files")
    parser.add_argument("--output", "-o", default="training-data/scene-descriptions-auto.jsonl")
    # Empty default so auto-detection actually runs, as the help text states;
    # auto-detection prefers gemma4:latest first anyway (see VISION_MODELS).
    parser.add_argument("--model", "-m", default="",
                        help="Ollama model name (auto-detects if empty)")
    parser.add_argument("--ollama-url", default="http://localhost:11434")
    parser.add_argument("--limit", "-l", type=int, default=0, help="Max files to process (0=all)")
    parser.add_argument("--dry-run", action="store_true", help="List files without generating")
    parser.add_argument("--check-model", action="store_true", help="Check model availability and exit")
    parser.add_argument("--format", choices=["jsonl", "sharegpt"], default="jsonl",
                        help="Output format: jsonl (structured) or sharegpt (training pipeline)")
    args = parser.parse_args()

    # --check-model: report availability and exit without processing.
    if args.check_model:
        if args.model:
            available = check_model_available(args.model, args.ollama_url)
            print(f"Model '{args.model}': {'✅ available' if available else '❌ not found'}")
        else:
            model = auto_detect_model(args.ollama_url)
            if model:
                print(f"✅ Best available: {model}")
            else:
                print("❌ No vision models found in Ollama")
        sys.exit(0)

    # Auto-detect model if not specified
    model = args.model
    if not model:
        model = auto_detect_model(args.ollama_url)
    if not model:
        print("Error: No vision model available. Install one with: ollama pull gemma4:latest",
              file=sys.stderr)
        sys.exit(1)

    # Scan and process
    media_files = scan_media(args.input)
    print(f"Found {len(media_files)} media files", file=sys.stderr)
    if not media_files:
        print("No media files found.", file=sys.stderr)
        sys.exit(1)
    pairs = generate_training_pairs(
        media_files, model, args.ollama_url,
        args.limit, args.dry_run, args.format,
    )

    # Write output — explicit UTF-8 because ensure_ascii=False can emit
    # non-ASCII text regardless of the platform's default locale encoding.
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for pair in pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")
    print(f"\nWrote {len(pairs)} pairs to {output_path}", file=sys.stderr)

    # Summary: only entries carrying generated content count as successes
    # (dry-run stubs have neither key).
    success = sum(1 for p in pairs if "description" in p or "conversations" in p)
    failed = len(pairs) - success
    if failed > 0:
        print(f"⚠️ {failed} files failed", file=sys.stderr)


if __name__ == "__main__":
    main()