#!/usr/bin/env python3
"""
Auto-generate scene descriptions from image/video assets.

Scans a directory for media files, generates scene descriptions using
a local vision model (Ollama), and outputs training pairs in JSONL format.

Supports Gemma 4 multimodal vision via Ollama. Falls back gracefully when
models are unavailable.

Usage:
    python scripts/generate_scene_descriptions.py --input ./assets --output training-data/scene-descriptions-auto.jsonl
    python scripts/generate_scene_descriptions.py --input ./assets --model gemma4:latest --limit 50
    python scripts/generate_scene_descriptions.py --input ./assets --format sharegpt
    python scripts/generate_scene_descriptions.py --dry-run            # List files without generating
    python scripts/generate_scene_descriptions.py --input ./assets --check-model  # Verify model availability

Ref: timmy-config #689
"""
import argparse
import base64
import json
import os
import re
import subprocess
import sys
import time
import urllib.error
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# Supported media extensions (lowercase; matching is case-insensitive at scan time)
IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"}
VIDEO_EXTS = {".mp4", ".webm", ".mov", ".avi", ".mkv"}
ALL_EXTS = IMAGE_EXTS | VIDEO_EXTS

# Vision models in preference order (best first)
VISION_MODELS = [
    "gemma4:latest",      # Gemma 4 — multimodal vision (8B, Q4_K_M)
    "gemma3:12b",         # Gemma 3 — fallback vision
    "llava:latest",       # LLaVA — generic vision
    "llava-phi3:latest",  # LLaVA-Phi3 — lightweight vision
]
# Vision model prompt template (structured JSON output)
SCENE_PROMPT = """Describe this image for a visual scene database. Output ONLY valid JSON (no markdown, no explanation):
{
  "mood": "one of: calm, energetic, dark, warm, cool, chaotic, serene, tense, joyful, melancholic",
  "colors": ["dominant color 1", "dominant color 2", "dominant color 3"],
  "composition": "one of: close-up, wide-shot, medium-shot, low-angle, high-angle, bird-eye, profile, over-shoulder",
  "camera": "one of: static, slow-pan, tracking, handheld, crane, dolly, steady, locked-off",
  "lighting": "one of: natural, artificial, mixed, dramatic, soft, harsh, backlit",
  "description": "2-3 sentence visual description of the scene"
}
Be specific. Describe what you see, not what you imagine."""

# ShareGPT format prompt (for training pipeline integration)
SHAREGPT_SCENE_PROMPT = """Analyze this image and describe the visual scene. Include mood, dominant colors, composition, camera angle, lighting, and a vivid 2-3 sentence description."""
def check_model_available(model: str, ollama_url: str = "http://localhost:11434") -> bool:
    """Return True if *model* is listed by Ollama's /api/tags endpoint.

    Any failure (server down, timeout, malformed JSON) is treated as
    "not available" rather than raised — callers use this as a probe.
    """
    try:
        req = urllib.request.Request(f"{ollama_url}/api/tags")
        # Context manager ensures the HTTP response/socket is closed.
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
        return model in {m["name"] for m in data.get("models", [])}
    except Exception:
        return False
def auto_detect_model(ollama_url: str = "http://localhost:11434") -> Optional[str]:
    """Return the first available model from VISION_MODELS (best first), or None."""
    for model in VISION_MODELS:
        if check_model_available(model, ollama_url):
            print(f"Auto-detected vision model: {model}", file=sys.stderr)
            return model
    return None
def scan_media(input_dir: str) -> list[Path]:
    """Recursively collect media files under *input_dir*.

    Extension matching is fully case-insensitive (handles .JPG, .Mp4, ...),
    generalizing the earlier lower/UPPER-only glob pairs, and walks the
    tree once instead of once per extension variant.
    Returns a sorted, de-duplicated list; empty list if the directory
    does not exist.
    """
    input_path = Path(input_dir)
    if not input_path.exists():
        print(f"Error: {input_dir} does not exist", file=sys.stderr)
        return []
    media_files = [
        p for p in input_path.rglob("*")
        if p.is_file() and p.suffix.lower() in ALL_EXTS
    ]
    return sorted(set(media_files))
def extract_video_frame(video_path: Path, output_path: Path) -> bool:
    """Extract a single representative frame from *video_path* using ffmpeg.

    Returns True only when a non-empty frame file exists afterwards.
    """
    try:
        # "-y" (overwrite) must come BEFORE the output path: ffmpeg ignores
        # trailing options, and would then prompt on overwrite and hang
        # until the timeout with stdin captured.
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(video_path),
             "-vframes", "1", "-q:v", "2", str(output_path)],
            capture_output=True, timeout=30,
        )
        # Trust the filesystem, not the exit code: some builds return
        # non-zero even after writing a usable frame.
        return output_path.exists() and output_path.stat().st_size > 0
    except FileNotFoundError:
        print("ffmpeg not found — skipping video frame extraction", file=sys.stderr)
        return False
    except Exception as e:
        # Includes subprocess.TimeoutExpired.
        print(f"ffmpeg error: {e}", file=sys.stderr)
        return False
def describe_image (
image_path : Path ,
model : str = " gemma4:latest " ,
ollama_url : str = " http://localhost:11434 " ,
max_retries : int = 2 ,
) - > Optional [ dict ] :
""" Generate scene description using Ollama vision model with retry. """
for attempt in range ( max_retries + 1 ) :
try :
with open ( image_path , " rb " ) as f :
image_b64 = base64 . b64encode ( f . read ( ) ) . decode ( )
req = urllib . request . Request (
f " { ollama_url } /api/generate " ,
data = json . dumps ( {
" model " : model ,
" prompt " : SCENE_PROMPT ,
" images " : [ image_b64 ] ,
" stream " : False ,
" options " : { " temperature " : 0.3 , " num_predict " : 512 }
} ) . encode ( ) ,
headers = { " Content-Type " : " application/json " } ,
)
resp = urllib . request . urlopen ( req , timeout = 120 )
data = json . loads ( resp . read ( ) )
response_text = data . get ( " response " , " " )
# Parse JSON from response
json_match = re . search ( r " \ { [ \ s \ S]* \ } " , response_text )
if json_match :
2026-04-15 09:42:27 +00:00
try :
parsed = json . loads ( json_match . group ( ) )
# Validate required fields
required = [ " mood " , " colors " , " composition " , " camera " , " description " ]
if all ( k in parsed for k in required ) :
if parsed [ " description " ] : # Ensure non-empty description
return parsed
except json . JSONDecodeError :
pass
# Fallback: model returned natural language — wrap it
# Clean up markdown formatting
clean = re . sub ( r " [*_`#] " , " " , response_text ) . strip ( )
clean = re . sub ( r " \ n { 3,} " , " \n \n " , clean )
2026-04-15 09:37:25 +00:00
return {
2026-04-15 09:42:27 +00:00
" description " : clean [ : 500 ] if clean else response_text [ : 500 ] ,
2026-04-15 09:37:25 +00:00
" mood " : " unknown " ,
" colors " : [ ] ,
" composition " : " unknown " ,
" camera " : " unknown " ,
" lighting " : " unknown "
}
except ( urllib . error . URLError , TimeoutError ) as e :
if attempt < max_retries :
wait = 2 * * attempt
print ( f " Retry { attempt + 1 } / { max_retries } after { wait } s: { e } " , file = sys . stderr )
time . sleep ( wait )
else :
print ( f " Error describing { image_path . name } : { e } " , file = sys . stderr )
return None
except Exception as e :
print ( f " Error describing { image_path . name } : { e } " , file = sys . stderr )
return None
def describe_image_sharegpt(
    image_path: Path,
    model: str = "gemma4:latest",
    ollama_url: str = "http://localhost:11434",
    max_retries: int = 2,
) -> Optional[str]:
    """Generate a natural-language scene description for ShareGPT pairs.

    Mirrors describe_image() but returns plain text and uses a slightly
    higher temperature for more varied prose. Returns None on failure.
    """
    for attempt in range(max_retries + 1):
        try:
            with open(image_path, "rb") as f:
                image_b64 = base64.b64encode(f.read()).decode()
            req = urllib.request.Request(
                f"{ollama_url}/api/generate",
                data=json.dumps({
                    "model": model,
                    "prompt": SHAREGPT_SCENE_PROMPT,
                    "images": [image_b64],
                    "stream": False,
                    "options": {"temperature": 0.5, "num_predict": 256},
                }).encode(),
                headers={"Content-Type": "application/json"},
            )
            with urllib.request.urlopen(req, timeout=120) as resp:
                data = json.loads(resp.read())
            return data.get("response", "").strip()
        except (urllib.error.URLError, TimeoutError) as e:
            if attempt < max_retries:
                time.sleep(2 ** attempt)
            else:
                # Log instead of failing silently — consistent with describe_image.
                print(f"Error describing {image_path.name}: {e}", file=sys.stderr)
                return None
        except Exception as e:
            print(f"Error describing {image_path.name}: {e}", file=sys.stderr)
            return None
    return None
def generate_training_pairs(
    media_files: list[Path],
    model: str,
    ollama_url: str,
    limit: int = 0,
    dry_run: bool = False,
    output_format: str = "jsonl",
) -> list[dict]:
    """Generate training pairs from media files.

    Videos get a representative frame extracted first; the temp frame is
    always removed afterwards (try/finally), even if description fails.
    limit=0 means "process everything". dry_run lists files without
    calling any model. output_format selects "jsonl" (structured dict)
    or "sharegpt" (conversation pairs).
    """
    pairs: list[dict] = []
    files = media_files[:limit] if limit > 0 else media_files
    print(f"Processing {len(files)} files with model {model}...", file=sys.stderr)
    for i, media_path in enumerate(files):
        print(f"[{i + 1}/{len(files)}] {media_path.name}... ",
              file=sys.stderr, end="", flush=True)
        if dry_run:
            print("(dry run)", file=sys.stderr)
            pairs.append({"source": str(media_path), "status": "dry-run"})
            continue
        is_video = media_path.suffix.lower() in VIDEO_EXTS
        work_path = media_path
        if is_video:
            # e.g. clip.mp4 -> clip.frame.jpg, written next to the source.
            frame_path = media_path.with_suffix(".frame.jpg")
            if extract_video_frame(media_path, frame_path):
                work_path = frame_path
            else:
                print("SKIP (frame extraction failed)", file=sys.stderr)
                continue
        try:
            if output_format == "sharegpt":
                # ShareGPT format for training pipeline
                description = describe_image_sharegpt(work_path, model, ollama_url)
                if description:
                    pairs.append({
                        "conversations": [
                            {"from": "human", "value": f"<image>\n{SHAREGPT_SCENE_PROMPT}"},
                            {"from": "gpt", "value": description},
                        ],
                        "source": str(media_path),
                        "media_type": "video" if is_video else "image",
                        "model": model,
                        "generated_at": datetime.now(timezone.utc).isoformat(),
                    })
                    print("OK", file=sys.stderr)
                else:
                    print("FAIL", file=sys.stderr)
            else:
                # Structured JSONL format
                description = describe_image(work_path, model, ollama_url)
                if description:
                    pairs.append({
                        "source": str(media_path),
                        "media_type": "video" if is_video else "image",
                        "description": description,
                        "model": model,
                        "generated_at": datetime.now(timezone.utc).isoformat(),
                    })
                    print("OK", file=sys.stderr)
                else:
                    print("FAIL", file=sys.stderr)
        finally:
            # Cleanup temp frame even if description raised.
            if is_video and work_path != media_path:
                try:
                    work_path.unlink()
                except OSError:
                    pass
        # Small delay to avoid overwhelming Ollama
        time.sleep(0.5)
    return pairs
def main():
    """CLI entry point: parse args, resolve a model, scan media, generate, write JSONL."""
    parser = argparse.ArgumentParser(
        description="Auto-generate scene descriptions from media assets using vision AI"
    )
    parser.add_argument("--input", "-i", required=True, help="Input directory with media files")
    parser.add_argument("--output", "-o", default="training-data/scene-descriptions-auto.jsonl")
    # Empty default so auto-detection actually runs, as the help text states;
    # auto-detection prefers gemma4:latest first anyway (see VISION_MODELS).
    parser.add_argument("--model", "-m", default="",
                        help="Ollama model name (auto-detects if empty)")
    parser.add_argument("--ollama-url", default="http://localhost:11434")
    parser.add_argument("--limit", "-l", type=int, default=0, help="Max files to process (0=all)")
    parser.add_argument("--dry-run", action="store_true", help="List files without generating")
    parser.add_argument("--check-model", action="store_true", help="Check model availability and exit")
    parser.add_argument("--format", choices=["jsonl", "sharegpt"], default="jsonl",
                        help="Output format: jsonl (structured) or sharegpt (training pipeline)")
    args = parser.parse_args()

    # --check-model: report availability and exit without processing.
    if args.check_model:
        if args.model:
            available = check_model_available(args.model, args.ollama_url)
            print(f"Model '{args.model}': {'✅ available' if available else '❌ not found'}")
        else:
            model = auto_detect_model(args.ollama_url)
            if model:
                print(f"✅ Best available: {model}")
            else:
                print("❌ No vision models found in Ollama")
        sys.exit(0)

    # Auto-detect model if not specified
    model = args.model
    if not model:
        model = auto_detect_model(args.ollama_url)
    if not model:
        print("Error: No vision model available. Install one with: ollama pull gemma4:latest",
              file=sys.stderr)
        sys.exit(1)

    # Scan and process
    media_files = scan_media(args.input)
    print(f"Found {len(media_files)} media files", file=sys.stderr)
    if not media_files:
        print("No media files found.", file=sys.stderr)
        sys.exit(1)
    pairs = generate_training_pairs(
        media_files, model, args.ollama_url,
        args.limit, args.dry_run, args.format,
    )

    # Write output — explicit UTF-8 because ensure_ascii=False can emit
    # non-ASCII text regardless of the platform's default locale encoding.
    output_path = Path(args.output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        for pair in pairs:
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")
    print(f"\nWrote {len(pairs)} pairs to {output_path}", file=sys.stderr)

    # Summary: only entries carrying generated content count as successes
    # (dry-run stubs have neither key).
    success = sum(1 for p in pairs if "description" in p or "conversations" in p)
    failed = len(pairs) - success
    if failed > 0:
        print(f"⚠️ {failed} files failed", file=sys.stderr)


if __name__ == "__main__":
    main()