#!/usr/bin/env python3
"""
Auto-generate scene descriptions from image/video assets.

Scans a directory for media files, generates scene descriptions using a
local vision model (Ollama), and outputs training pairs in JSONL format.

Supports Gemma 4 multimodal vision via Ollama. Falls back gracefully when
models are unavailable.

Usage:
    python scripts/generate_scene_descriptions.py --input ./assets --output training-data/scene-descriptions-auto.jsonl
    python scripts/generate_scene_descriptions.py --input ./assets --model gemma4:latest --limit 50
    python scripts/generate_scene_descriptions.py --input ./assets --format sharegpt
    python scripts/generate_scene_descriptions.py --dry-run                       # List files without generating
    python scripts/generate_scene_descriptions.py --input ./assets --check-model  # Verify model availability

Ref: timmy-config#689
"""

import argparse
import base64
import json
import os
import re
import subprocess
import sys
import time
import urllib.error  # FIX: was only available as a side effect of importing urllib.request
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional

# Supported media extensions
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
VIDEO_EXTS = {".mp4", ".webm", ".mov", ".avi", ".mkv"}
ALL_EXTS = IMAGE_EXTS | VIDEO_EXTS

# File size limit (50MB) — prevents unbounded memory usage on large images
MAX_FILE_SIZE = 50 * 1024 * 1024

# Vision models in preference order (best first)
VISION_MODELS = [
    "gemma4:latest",      # Gemma 4 — multimodal vision (8B, Q4_K_M)
    "gemma3:12b",         # Gemma 3 — fallback vision
    "llava:latest",       # LLaVA — generic vision
    "llava-phi3:latest",  # LLaVA-Phi3 — lightweight vision
]

# Vision model prompt template (structured JSON output)
SCENE_PROMPT = """Describe this image for a visual scene database.
Output ONLY valid JSON (no markdown, no explanation):
{
  "mood": "one of: calm, energetic, dark, warm, cool, chaotic, serene, tense, joyful, melancholic",
  "colors": ["dominant color 1", "dominant color 2", "dominant color 3"],
  "composition": "one of: close-up, wide-shot, medium-shot, low-angle, high-angle, bird-eye, profile, over-shoulder",
  "camera": "one of: static, slow-pan, tracking, handheld, crane, dolly, steady, locked-off",
  "lighting": "one of: natural, artificial, mixed, dramatic, soft, harsh, backlit",
  "description": "2-3 sentence visual description of the scene"
}
Be specific. Describe what you see, not what you imagine."""

# ShareGPT format prompt (for training pipeline integration)
SHAREGPT_SCENE_PROMPT = """Analyze this image and describe the visual scene. Include mood, dominant colors, composition, camera angle, lighting, and a vivid 2-3 sentence description."""


def check_model_available(model: str, ollama_url: str = "http://localhost:11434") -> bool:
    """Check if *model* is listed by the Ollama server's /api/tags endpoint.

    Returns False on any failure (server down, bad response, timeout) rather
    than raising — callers treat "can't reach Ollama" as "model unavailable".
    """
    try:
        req = urllib.request.Request(f"{ollama_url}/api/tags")
        # FIX: close the HTTP response — was leaked before
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
        available = [m["name"] for m in data.get("models", [])]
        return model in available
    except Exception:
        return False


def auto_detect_model(ollama_url: str = "http://localhost:11434") -> Optional[str]:
    """Return the first model from VISION_MODELS available in Ollama, or None."""
    for model in VISION_MODELS:
        if check_model_available(model, ollama_url):
            print(f"Auto-detected vision model: {model}", file=sys.stderr)
            return model
    return None


def scan_media(input_dir: str) -> list[Path]:
    """Scan *input_dir* recursively for media files (IMAGE_EXTS | VIDEO_EXTS).

    Returns a sorted, de-duplicated list of paths; empty list (with a message
    on stderr) if the directory does not exist. Both lower- and upper-case
    extensions are matched; set() dedups on case-insensitive filesystems.
    """
    media_files: list[Path] = []
    input_path = Path(input_dir)
    if not input_path.exists():
        print(f"Error: {input_dir} does not exist", file=sys.stderr)
        return media_files
    for ext in sorted(ALL_EXTS):
        media_files.extend(input_path.rglob(f"*{ext}"))
        media_files.extend(input_path.rglob(f"*{ext.upper()}"))
    return sorted(set(media_files))


def extract_video_frame(video_path: Path, output_path: Path) -> bool:
    """Extract a representative frame from a video using ffmpeg.

    Writes a single JPEG to *output_path*. Returns True only if the file was
    created and is non-empty. Missing ffmpeg or any error is reported to
    stderr and returns False so the caller can skip the video.
    """
    try:
        result = subprocess.run(
            # FIX #3: Seek 2s in before grabbing frame — avoids black/title frames
            ["ffmpeg", "-ss", "2", "-i", str(video_path),
             "-vframes", "1", "-q:v", "2", str(output_path), "-y"],
            capture_output=True,
            timeout=30,
        )
        if result.returncode != 0 and result.stderr:
            print(f"  ffmpeg stderr: {result.stderr.decode(errors='replace')[:200]}", file=sys.stderr)
        return output_path.exists() and output_path.stat().st_size > 0
    except FileNotFoundError:
        print("  ffmpeg not found — skipping video frame extraction", file=sys.stderr)
        return False
    except Exception as e:
        print(f"  ffmpeg error: {e}", file=sys.stderr)
        return False


def describe_image(
    image_path: Path,
    model: str = "gemma4:latest",
    ollama_url: str = "http://localhost:11434",
    max_retries: int = 2,
) -> Optional[dict]:
    """Generate a structured scene description via an Ollama vision model.

    Returns a dict with keys mood/colors/composition/camera/lighting/description.
    Degrades in three tiers: strict JSON parse → field-by-field repair of
    truncated JSON → raw natural-language fallback. Network errors are retried
    with exponential backoff; returns None after exhausting retries.
    """
    # FIX #1: Check file size before reading into memory
    if image_path.stat().st_size > MAX_FILE_SIZE:
        print(f"  Skipping {image_path.name}: exceeds {MAX_FILE_SIZE // (1024*1024)}MB limit", file=sys.stderr)
        return None

    for attempt in range(max_retries + 1):
        try:
            with open(image_path, "rb") as f:
                image_b64 = base64.b64encode(f.read()).decode()
            req = urllib.request.Request(
                f"{ollama_url}/api/generate",
                data=json.dumps({
                    "model": model,
                    "prompt": SCENE_PROMPT,
                    "images": [image_b64],
                    "stream": False,
                    "options": {"temperature": 0.3, "num_predict": 1024}
                }).encode(),
                headers={"Content-Type": "application/json"},
            )
            # FIX: close the HTTP response — was leaked before
            with urllib.request.urlopen(req, timeout=120) as resp:
                data = json.loads(resp.read())
            response_text = data.get("response", "")

            # Parse JSON from response (handle both complete and truncated JSON)
            json_match = re.search(r"\{[\s\S]*\}", response_text)
            if not json_match:
                # Try to find opening brace for truncated JSON
                brace_match = re.search(r"\{", response_text)
                if brace_match:
                    json_match = brace_match

            if json_match:
                raw_json = json_match.group() if hasattr(json_match, 'group') else response_text[json_match.start():]
                # Try strict parse first
                try:
                    parsed = json.loads(raw_json)
                    required = ["mood", "colors", "composition", "camera", "description"]
                    if all(k in parsed for k in required) and parsed.get("description"):
                        return parsed
                except json.JSONDecodeError:
                    # Attempt repair: extract fields from truncated JSON
                    repaired = {}
                    for field in ["mood", "colors", "composition", "camera", "lighting", "description"]:
                        pat = rf'"\s*{field}"\s*:\s*"([^"]*)"'
                        m = re.search(pat, response_text)
                        if m:
                            repaired[field] = m.group(1)
                        elif field == "colors":
                            # colors is a JSON array, not a string — needs its own pattern
                            colors_match = re.search(r'"colors"\s*:\s*\[([^\]]*)\]', response_text)
                            if colors_match:
                                repaired[field] = [c.strip().strip('"') for c in colors_match.group(1).split(",") if c.strip()]
                            else:
                                repaired[field] = []
                        else:
                            repaired[field] = "unknown"
                    # Only accept the repair if it recovered something meaningful
                    if repaired.get("description") or repaired.get("mood") != "unknown":
                        return repaired

            # Final fallback: natural language response
            clean = re.sub(r"[*_`#]", "", response_text).strip()
            clean = re.sub(r"\n{3,}", "\n\n", clean)
            return {
                "description": clean[:500] if clean else response_text[:500],
                "mood": "unknown",
                "colors": [],
                "composition": "unknown",
                "camera": "unknown",
                "lighting": "unknown"
            }
        except (urllib.error.URLError, TimeoutError) as e:
            if attempt < max_retries:
                wait = 2 ** attempt  # exponential backoff: 1s, 2s, ...
                print(f"  Retry {attempt + 1}/{max_retries} after {wait}s: {e}", file=sys.stderr)
                time.sleep(wait)
            else:
                print(f"  Error describing {image_path.name}: {e}", file=sys.stderr)
                return None
        except Exception as e:
            print(f"  Error describing {image_path.name}: {e}", file=sys.stderr)
            return None


def describe_image_sharegpt(
    image_path: Path,
    model: str = "gemma4:latest",
    ollama_url: str = "http://localhost:11434",
    max_retries: int = 2,
) -> Optional[str]:
    """Generate a scene description in natural language for ShareGPT format.

    Returns the model's raw text response (stripped), or None on failure.
    Network errors are retried with exponential backoff.
    """
    # FIX #1: Check file size before reading into memory
    if image_path.stat().st_size > MAX_FILE_SIZE:
        print(f"  Skipping {image_path.name}: exceeds {MAX_FILE_SIZE // (1024*1024)}MB limit", file=sys.stderr)
        return None

    for attempt in range(max_retries + 1):
        try:
            with open(image_path, "rb") as f:
                image_b64 = base64.b64encode(f.read()).decode()
            req = urllib.request.Request(
                f"{ollama_url}/api/generate",
                data=json.dumps({
                    "model": model,
                    "prompt": SHAREGPT_SCENE_PROMPT,
                    "images": [image_b64],
                    "stream": False,
                    "options": {"temperature": 0.5, "num_predict": 256}
                }).encode(),
                headers={"Content-Type": "application/json"},
            )
            # FIX: close the HTTP response — was leaked before
            with urllib.request.urlopen(req, timeout=120) as resp:
                data = json.loads(resp.read())
            return data.get("response", "").strip()
        except (urllib.error.URLError, TimeoutError):
            if attempt < max_retries:
                time.sleep(2 ** attempt)
            else:
                return None
        except Exception:
            return None


def generate_training_pairs(
    media_files: list[Path],
    model: str,
    ollama_url: str,
    limit: int = 0,
    dry_run: bool = False,
    output_format: str = "jsonl",
) -> list[dict]:
    """Generate training pairs from media files.

    Videos get a frame extracted first (skipped on failure). *limit* of 0
    means process everything. In dry-run mode each file yields a stub pair
    with "status": "dry-run" and no model call is made.
    """
    pairs: list[dict] = []
    files = media_files[:limit] if limit > 0 else media_files
    print(f"Processing {len(files)} files with model {model}...", file=sys.stderr)

    for i, media_path in enumerate(files):
        print(f"  [{i + 1}/{len(files)}] {media_path.name}...", file=sys.stderr, end=" ", flush=True)

        if dry_run:
            print("(dry run)", file=sys.stderr)
            pairs.append({"source": str(media_path), "status": "dry-run"})
            continue

        is_video = media_path.suffix.lower() in VIDEO_EXTS
        work_path = media_path
        if is_video:
            frame_path = media_path.with_suffix(".frame.jpg")
            if extract_video_frame(media_path, frame_path):
                work_path = frame_path
            else:
                print("SKIP (frame extraction failed)", file=sys.stderr)
                continue

        try:
            if output_format == "sharegpt":
                # ShareGPT format for training pipeline
                description = describe_image_sharegpt(work_path, model, ollama_url)
                if description:
                    pair = {
                        "conversations": [
                            {"from": "human", "value": f"\n{SHAREGPT_SCENE_PROMPT}"},
                            {"from": "gpt", "value": description}
                        ],
                        "source": str(media_path),
                        "media_type": "video" if is_video else "image",
                        "model": model,
                        "generated_at": datetime.now(timezone.utc).isoformat(),
                    }
                    pairs.append(pair)
                    print("OK", file=sys.stderr)
                else:
                    print("FAIL", file=sys.stderr)
            else:
                # Structured JSONL format
                description = describe_image(work_path, model, ollama_url)
                if description:
                    pair = {
                        "source": str(media_path),
                        "media_type": "video" if is_video else "image",
                        "description": description,
                        "model": model,
                        "generated_at": datetime.now(timezone.utc).isoformat(),
                    }
                    pairs.append(pair)
                    print("OK", file=sys.stderr)
                else:
                    print("FAIL", file=sys.stderr)
        finally:
            # FIX #6: Cleanup temp frame in try/finally — survives crashes
            if is_video and work_path != media_path:
                try:
                    work_path.unlink()
                except Exception:
                    pass

        # Small delay between files (reduced from 0.5s — Ollama is local)
        time.sleep(0.1)

    return pairs


def main():
    """CLI entry point: parse args, detect model, scan media, write JSONL."""
    parser = argparse.ArgumentParser(
        description="Auto-generate scene descriptions from media assets using vision AI"
    )
    parser.add_argument("--input", "-i", default="", help="Input directory with media files")
    parser.add_argument("--output", "-o", default="training-data/scene-descriptions-auto.jsonl")
    parser.add_argument("--model", "-m", default=None,
                        help="Ollama model name (auto-detects best available if omitted)")
    parser.add_argument("--ollama-url", default="http://localhost:11434")
    parser.add_argument("--limit", "-l", type=int, default=0, help="Max files to process (0=all)")
    parser.add_argument("--dry-run", action="store_true", help="List files without generating")
    parser.add_argument("--check-model", action="store_true", help="Check model availability and exit")
    parser.add_argument("--format", choices=["jsonl", "sharegpt"], default="jsonl",
                        help="Output format: jsonl (structured) or sharegpt (training pipeline)")
    args = parser.parse_args()

    # Model detection
    if args.check_model:
        if args.model:
            available = check_model_available(args.model, args.ollama_url)
            print(f"Model '{args.model}': {'✅ available' if available else '❌ not found'}")
        else:
            model = auto_detect_model(args.ollama_url)
            if model:
                print(f"✅ Best available: {model}")
            else:
                print("❌ No vision models found in Ollama — install one with: ollama pull gemma4:latest")
        sys.exit(0)

    # Auto-detect model if not specified
    model = args.model
    if not model:
        model = auto_detect_model(args.ollama_url)
        if not model:
            # Fall back to best default even if not installed — let Ollama handle the error
            model = "gemma4:latest"
            print(f"Warning: No vision models detected. Falling back to {model}", file=sys.stderr)

    # Validate input
    if not args.input:
        print("Error: --input is required (unless using --check-model)", file=sys.stderr)
        sys.exit(1)

    # Scan and process
    media_files = scan_media(args.input)
    print(f"Found {len(media_files)} media files", file=sys.stderr)
    if not media_files:
        print("No media files found.", file=sys.stderr)
        sys.exit(1)

    pairs = generate_training_pairs(
        media_files, model, args.ollama_url, args.limit, args.dry_run, args.format
    )

    # Write output
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w") as f:
        for pair in pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")
    print(f"\nWrote {len(pairs)} pairs to {output_path}", file=sys.stderr)

    # Summary
    # FIX: dry-run stubs have neither "description" nor "conversations" and were
    # previously all counted as failures — count them as successes too.
    success = len([
        p for p in pairs
        if "description" in p or "conversations" in p or p.get("status") == "dry-run"
    ])
    failed = len(pairs) - success
    if failed > 0:
        print(f"  ⚠️ {failed} files failed", file=sys.stderr)


if __name__ == "__main__":
    main()