24 KiB
Optimization Reference
See also: architecture.md · composition.md · scenes.md · shaders.md · inputs.md · troubleshooting.md
Hardware Detection
Detect the user's hardware at script startup and adapt rendering parameters automatically. Never hardcode worker counts or resolution.
CPU and Memory Detection
import multiprocessing
import platform
import shutil
import os
def detect_hardware():
"""Detect hardware capabilities and return render config."""
cpu_count = multiprocessing.cpu_count()
# Leave 1-2 cores free for OS + ffmpeg encoding
if cpu_count >= 16:
workers = cpu_count - 2
elif cpu_count >= 8:
workers = cpu_count - 1
elif cpu_count >= 4:
workers = cpu_count - 1
else:
workers = max(1, cpu_count)
# Memory detection (platform-specific)
try:
if platform.system() == "Darwin":
import subprocess
mem_bytes = int(subprocess.check_output(["sysctl", "-n", "hw.memsize"]).strip())
elif platform.system() == "Linux":
with open("/proc/meminfo") as f:
for line in f:
if line.startswith("MemTotal"):
mem_bytes = int(line.split()[1]) * 1024
break
else:
mem_bytes = 8 * 1024**3 # assume 8GB on unknown
except Exception:
mem_bytes = 8 * 1024**3
mem_gb = mem_bytes / (1024**3)
# Each worker uses ~50-150MB depending on grid sizes
# Cap workers if memory is tight
mem_per_worker_mb = 150
max_workers_by_mem = int(mem_gb * 1024 * 0.6 / mem_per_worker_mb) # use 60% of RAM
workers = min(workers, max_workers_by_mem)
# ffmpeg availability and codec support
has_ffmpeg = shutil.which("ffmpeg") is not None
return {
"cpu_count": cpu_count,
"workers": workers,
"mem_gb": mem_gb,
"platform": platform.system(),
"arch": platform.machine(),
"has_ffmpeg": has_ffmpeg,
}
Adaptive Quality Profiles
Scale resolution, FPS, CRF, and grid density based on hardware:
def quality_profile(hw, target_duration_s, user_preference="auto"):
"""
Returns render settings adapted to hardware.
user_preference: "auto", "draft", "preview", "production", "max"
"""
if user_preference == "draft":
return {"vw": 960, "vh": 540, "fps": 12, "crf": 28, "workers": min(4, hw["workers"]),
"grid_scale": 0.5, "shaders": "minimal", "particles_max": 200}
if user_preference == "preview":
return {"vw": 1280, "vh": 720, "fps": 15, "crf": 25, "workers": hw["workers"],
"grid_scale": 0.75, "shaders": "standard", "particles_max": 500}
if user_preference == "max":
return {"vw": 3840, "vh": 2160, "fps": 30, "crf": 15, "workers": hw["workers"],
"grid_scale": 2.0, "shaders": "full", "particles_max": 3000}
# "production" or "auto"
# Auto-detect: estimate render time, downgrade if it would take too long
n_frames = int(target_duration_s * 24)
est_seconds_per_frame = 0.18 # ~180ms at 1080p
est_total_s = n_frames * est_seconds_per_frame / max(1, hw["workers"])
if hw["mem_gb"] < 4 or hw["cpu_count"] <= 2:
# Low-end: 720p, 15fps
return {"vw": 1280, "vh": 720, "fps": 15, "crf": 23, "workers": hw["workers"],
"grid_scale": 0.75, "shaders": "standard", "particles_max": 500}
if est_total_s > 3600: # would take over an hour
# Downgrade to 720p to speed up
return {"vw": 1280, "vh": 720, "fps": 24, "crf": 20, "workers": hw["workers"],
"grid_scale": 0.75, "shaders": "standard", "particles_max": 800}
# Standard production: 1080p 24fps
return {"vw": 1920, "vh": 1080, "fps": 24, "crf": 20, "workers": hw["workers"],
"grid_scale": 1.0, "shaders": "full", "particles_max": 1200}
def apply_quality_profile(profile):
"""Set globals from quality profile."""
global VW, VH, FPS, N_WORKERS
VW = profile["vw"]
VH = profile["vh"]
FPS = profile["fps"]
N_WORKERS = profile["workers"]
# Grid sizes scale with resolution
# CRF passed to ffmpeg encoder
# Shader set determines which post-processing is active
CLI Integration
parser = argparse.ArgumentParser()
parser.add_argument("--quality", choices=["draft", "preview", "production", "max", "auto"],
default="auto", help="Render quality preset")
parser.add_argument("--aspect", choices=["landscape", "portrait", "square"],
default="landscape", help="Aspect ratio preset")
parser.add_argument("--workers", type=int, default=0, help="Override worker count (0=auto)")
parser.add_argument("--resolution", type=str, default="", help="Override resolution e.g. 1280x720")
args = parser.parse_args()
hw = detect_hardware()
if args.workers > 0:
hw["workers"] = args.workers
profile = quality_profile(hw, target_duration, args.quality)
# Apply aspect ratio preset (before manual resolution override)
ASPECT_PRESETS = {
"landscape": (1920, 1080),
"portrait": (1080, 1920),
"square": (1080, 1080),
}
if args.aspect != "landscape" and not args.resolution:
profile["vw"], profile["vh"] = ASPECT_PRESETS[args.aspect]
if args.resolution:
w, h = args.resolution.split("x")
profile["vw"], profile["vh"] = int(w), int(h)
apply_quality_profile(profile)
log(f"Hardware: {hw['cpu_count']} cores, {hw['mem_gb']:.1f}GB RAM, {hw['platform']}")
log(f"Render: {profile['vw']}x{profile['vh']} @{profile['fps']}fps, "
f"CRF {profile['crf']}, {profile['workers']} workers")
Portrait Mode Considerations
Portrait (1080x1920) has the same pixel count as landscape 1080p, so performance is equivalent. But composition patterns differ:
| Concern | Landscape | Portrait |
|---|---|---|
Grid cols at lg |
160 | 90 |
Grid rows at lg |
45 | 80 |
| Max text line chars | ~50 centered | ~25-30 centered |
| Vertical rain | Short travel | Long, dramatic travel |
| Horizontal spectrum | Full width | Needs rotation or compression |
| Radial effects | Natural circles | Tall ellipses (aspect correction handles this) |
| Particle explosions | Wide spread | Tall spread |
| Text stacking | 3-4 lines comfortable | 8-10 lines comfortable |
| Quote layout | 2-3 wide lines | 5-6 short lines |
Portrait-optimized patterns:
- Vertical rain/matrix effects are naturally enhanced — longer column travel
- Fire columns rise through more screen space
- Rising embers/particles have more vertical runway
- Text can be stacked more aggressively with more lines
- Radial effects work if aspect correction is applied (GridLayer handles this automatically)
- Spectrum bars can be rotated 90 degrees (vertical bars from bottom)
Portrait text layout:
def layout_text_portrait(text, max_chars_per_line=25, grid=None):
"""Break text into short lines for portrait display."""
words = text.split()
lines = []; current = ""
for w in words:
if len(current) + len(w) + 1 > max_chars_per_line:
lines.append(current.strip())
current = w + " "
else:
current += w + " "
if current.strip():
lines.append(current.strip())
return lines
Performance Budget
Target: 100-200ms per frame (5-10 fps single-threaded, 40-80 fps across 8 workers).
| Component | Time | Notes |
|---|---|---|
| Feature extraction | 1-5ms | Pre-computed for all frames before render |
| Effect function | 2-15ms | Vectorized numpy, avoid Python loops |
| Character render | 80-150ms | Bottleneck -- per-cell Python loop |
| Shader pipeline | 5-25ms | Depends on active shaders |
| ffmpeg encode | ~5ms | Amortized by pipe buffering |
Bitmap Pre-Rasterization
Rasterize every character at init, not per-frame:
# At init time -- done once
for c in all_characters:
img = Image.new("L", (cell_w, cell_h), 0)
ImageDraw.Draw(img).text((0, 0), c, fill=255, font=font)
bitmaps[c] = np.array(img, dtype=np.float32) / 255.0 # float32 for fast multiply
# At render time -- fast lookup
bitmap = bitmaps[char]
canvas[y:y+ch, x:x+cw] = np.maximum(canvas[y:y+ch, x:x+cw],
(bitmap[:,:,None] * color).astype(np.uint8))
Collect all characters from all palettes + overlay text into the init set. Lazy-init for any missed characters.
Pre-Rendered Background Textures
Alternative to _render_vf() for backgrounds where characters don't need to change every frame. Pre-bake a static ASCII texture once at init, then multiply by a per-cell color field each frame. One matrix multiply vs thousands of bitmap blits.
Use when: background layer uses a fixed character palette and only color/brightness varies per frame. NOT suitable for layers where character selection depends on a changing value field.
Init: Bake the Texture
# In GridLayer.__init__:
self._bg_row_idx = np.clip(
(np.arange(VH) - self.oy) // self.ch, 0, self.rows - 1
)
self._bg_col_idx = np.clip(
(np.arange(VW) - self.ox) // self.cw, 0, self.cols - 1
)
self._bg_textures = {}
def make_bg_texture(self, palette):
"""Pre-render a static ASCII texture (grayscale float32) once."""
if palette not in self._bg_textures:
texture = np.zeros((VH, VW), dtype=np.float32)
rng = random.Random(12345)
ch_list = [c for c in palette if c != " " and c in self.bm]
if not ch_list:
ch_list = list(self.bm.keys())[:5]
for row in range(self.rows):
y = self.oy + row * self.ch
if y + self.ch > VH:
break
for col in range(self.cols):
x = self.ox + col * self.cw
if x + self.cw > VW:
break
bm = self.bm[rng.choice(ch_list)]
texture[y:y+self.ch, x:x+self.cw] = bm
self._bg_textures[palette] = texture
return self._bg_textures[palette]
Render: Color Field x Cached Texture
def render_bg(self, color_field, palette=PAL_CIRCUIT):
"""Fast background: pre-rendered ASCII texture * per-cell color field.
color_field: (rows, cols, 3) uint8. Returns (VH, VW, 3) uint8."""
texture = self.make_bg_texture(palette)
# Expand cell colors to pixel coords via pre-computed index maps
color_px = color_field[
self._bg_row_idx[:, None], self._bg_col_idx[None, :]
].astype(np.float32)
return (texture[:, :, None] * color_px).astype(np.uint8)
Usage in a Scene
# Build per-cell color from effect fields (cheap — rows*cols, not VH*VW)
hue = ((t * 0.05 + val * 0.2) % 1.0).astype(np.float32)
R, G, B = hsv2rgb(hue, np.full_like(val, 0.5), val)
color_field = mkc(R, G, B, g.rows, g.cols) # (rows, cols, 3) uint8
# Render background — single matrix multiply, no per-cell loop
canvas_bg = g.render_bg(color_field, PAL_DENSE)
The texture init loop runs once and is cached per palette. Per-frame cost is one fancy-index lookup + one broadcast multiply — orders of magnitude faster than the per-cell bitmap blit loop in render() for dense backgrounds.
Coordinate Array Caching
Pre-compute all grid-relative coordinate arrays at init, not per-frame:
# These are O(rows*cols) and used in every effect
self.rr = np.arange(rows)[:, None] # row indices
self.cc = np.arange(cols)[None, :] # col indices
self.dist = np.sqrt(dx**2 + dy**2) # distance from center
self.angle = np.arctan2(dy, dx) # angle from center
self.dist_n = ... # normalized distance
Vectorized Effect Patterns
Avoid Per-Cell Python Loops in Effects
The render loop (compositing bitmaps) is unavoidably per-cell. But effect functions must be fully vectorized numpy -- never iterate over rows/cols in Python.
Bad (O(rows*cols) Python loop):
for r in range(rows):
for c in range(cols):
val[r, c] = math.sin(c * 0.1 + t) * math.cos(r * 0.1 - t)
Good (vectorized):
val = np.sin(g.cc * 0.1 + t) * np.cos(g.rr * 0.1 - t)
Vectorized Matrix Rain
The naive per-column per-trail-pixel loop is the second biggest bottleneck after the render loop. Use numpy fancy indexing:
# Instead of nested Python loops over columns and trail pixels:
# Build row index arrays for all active trail pixels at once
all_rows = []
all_cols = []
all_fades = []
for c in range(cols):
head = int(S["ry"][c])
trail_len = S["rln"][c]
for i in range(trail_len):
row = head - i
if 0 <= row < rows:
all_rows.append(row)
all_cols.append(c)
all_fades.append(1.0 - i / trail_len)
# Vectorized assignment
ar = np.array(all_rows)
ac = np.array(all_cols)
af = np.array(all_fades, dtype=np.float32)
# Assign chars and colors in bulk using fancy indexing
ch[ar, ac] = ... # vectorized char assignment
co[ar, ac, 1] = (af * bri * 255).astype(np.uint8) # green channel
Vectorized Fire Columns
Same pattern -- accumulate index arrays, assign in bulk:
fire_val = np.zeros((rows, cols), dtype=np.float32)
for fi in range(n_cols):
fx_c = int((fi * cols / n_cols + np.sin(t * 2 + fi * 0.7) * 3) % cols)
height = int(energy * rows * 0.7)
dy = np.arange(min(height, rows))
fr = rows - 1 - dy
frac = dy / max(height, 1)
# Width spread: base columns wider at bottom
for dx in range(-1, 2): # 3-wide columns
c = fx_c + dx
if 0 <= c < cols:
fire_val[fr, c] = np.maximum(fire_val[fr, c],
(1 - frac * 0.6) * (0.5 + rms * 0.5))
# Now map fire_val to chars and colors in one vectorized pass
PIL String Rendering for Text-Heavy Scenes
Alternative to per-cell bitmap blitting when rendering many long text strings (scrolling tickers, typewriter sequences, idea floods). Uses PIL's native ImageDraw.text() which renders an entire string in one C call, vs one Python-loop bitmap blit per character.
Typical win: a scene with 56 ticker rows renders 56 PIL text() calls instead of ~10K individual bitmap blits.
Use when: scene renders many rows of readable text strings. NOT suitable for sparse or spatially-scattered single characters (use normal render() for those).
from PIL import Image, ImageDraw
def render_text_layer(grid, rows_data, font):
"""Render dense text rows via PIL instead of per-cell bitmap blitting.
Args:
grid: GridLayer instance (for oy, ch, ox, font metrics)
rows_data: list of (row_index, text_string, rgb_tuple) — one per row
font: PIL ImageFont instance (grid.font)
Returns:
uint8 array (VH, VW, 3) — canvas with rendered text
"""
img = Image.new("RGB", (VW, VH), (0, 0, 0))
draw = ImageDraw.Draw(img)
for row_idx, text, color in rows_data:
y = grid.oy + row_idx * grid.ch
if y + grid.ch > VH:
break
draw.text((grid.ox, y), text, fill=color, font=font)
return np.array(img)
Usage in a Ticker Scene
# Build ticker data (text + color per row)
rows_data = []
for row in range(n_tickers):
text = build_ticker_text(row, t) # scrolling substring
color = hsv2rgb_scalar(hue, 0.85, bri) # (R, G, B) tuple
rows_data.append((row, text, color))
# One PIL pass instead of thousands of bitmap blits
canvas_tickers = render_text_layer(g_md, rows_data, g_md.font)
# Blend with other layers normally
result = blend_canvas(canvas_bg, canvas_tickers, "screen", 0.9)
This is purely a rendering optimization — same visual output, fewer draw calls. The grid's render() method is still needed for sparse character fields where characters are placed individually based on value fields.
Bloom Optimization
Do NOT use scipy.ndimage.uniform_filter -- measured at 424ms/frame.
Use 4x downsample + manual box blur instead -- 84ms/frame (5x faster):
sm = canvas[::4, ::4].astype(np.float32) # 4x downsample
br = np.where(sm > threshold, sm, 0)
for _ in range(3): # 3-pass manual box blur
p = np.pad(br, ((1,1),(1,1),(0,0)), mode='edge')
br = (p[:-2,:-2] + p[:-2,1:-1] + p[:-2,2:] +
p[1:-1,:-2] + p[1:-1,1:-1] + p[1:-1,2:] +
p[2:,:-2] + p[2:,1:-1] + p[2:,2:]) / 9.0
bl = np.repeat(np.repeat(br, 4, axis=0), 4, axis=1)[:H, :W]
Vignette Caching
Distance field is resolution- and strength-dependent, never changes per frame:
_vig_cache = {}
def sh_vignette(canvas, strength):
key = (canvas.shape[0], canvas.shape[1], round(strength, 2))
if key not in _vig_cache:
Y = np.linspace(-1, 1, H)[:, None]
X = np.linspace(-1, 1, W)[None, :]
_vig_cache[key] = np.clip(1.0 - np.sqrt(X**2+Y**2) * strength, 0.15, 1).astype(np.float32)
return np.clip(canvas * _vig_cache[key][:,:,None], 0, 255).astype(np.uint8)
Same pattern for CRT barrel distortion (cache remap coordinates).
Film Grain Optimization
Generate noise at half resolution, tile up:
noise = np.random.randint(-amt, amt+1, (H//2, W//2, 1), dtype=np.int16)
noise = np.repeat(np.repeat(noise, 2, axis=0), 2, axis=1)[:H, :W]
2x blocky grain looks like film grain and costs 1/4 the random generation.
Parallel Rendering
Worker Architecture
hw = detect_hardware()
N_WORKERS = hw["workers"]
# Batch splitting (for non-clip architectures)
batch_size = (n_frames + N_WORKERS - 1) // N_WORKERS
batches = [(i, i*batch_size, min((i+1)*batch_size, n_frames), features, seg_path) ...]
with multiprocessing.Pool(N_WORKERS) as pool:
segments = pool.starmap(render_batch, batches)
Per-Clip Parallelism (Preferred for Segmented Videos)
from concurrent.futures import ProcessPoolExecutor, as_completed
with ProcessPoolExecutor(max_workers=N_WORKERS) as pool:
futures = {pool.submit(render_clip, seg, features, path): seg["id"]
for seg, path in clip_args}
for fut in as_completed(futures):
clip_id = futures[fut]
try:
fut.result()
log(f" {clip_id} done")
except Exception as e:
log(f" {clip_id} FAILED: {e}")
Worker Isolation
Each worker:
- Creates its own
Rendererinstance (with full grid + bitmap init) - Opens its own ffmpeg subprocess
- Has independent random seed (
random.seed(batch_id * 10000)) - Writes to its own segment file and stderr log
ffmpeg Pipe Safety
CRITICAL: Never stderr=subprocess.PIPE with long-running ffmpeg. The stderr buffer fills at ~64KB and deadlocks:
# WRONG -- will deadlock
pipe = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
# RIGHT -- stderr to file
stderr_fh = open(err_path, "w")
pipe = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=stderr_fh)
# ... write all frames ...
pipe.stdin.close()
pipe.wait()
stderr_fh.close()
Concatenation
with open(concat_file, "w") as cf:
for seg in segments:
cf.write(f"file '{seg}'\n")
cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", concat_file]
if audio_path:
cmd += ["-i", audio_path, "-c:v", "copy", "-c:a", "aac", "-b:a", "192k", "-shortest"]
else:
cmd += ["-c:v", "copy"]
cmd.append(output_path)
subprocess.run(cmd, capture_output=True, check=True)
Particle System Performance
Cap particle counts based on quality profile:
| System | Low | Standard | High |
|---|---|---|---|
| Explosion | 300 | 1000 | 2500 |
| Embers | 500 | 1500 | 3000 |
| Starfield | 300 | 800 | 1500 |
| Dissolve | 200 | 600 | 1200 |
Cull by truncating lists:
MAX_PARTICLES = profile.get("particles_max", 1200)
if len(S["px"]) > MAX_PARTICLES:
for k in ("px", "py", "vx", "vy", "life", "char"):
S[k] = S[k][-MAX_PARTICLES:] # keep newest
Memory Management
- Feature arrays: pre-computed for all frames, shared across workers via fork semantics (COW)
- Canvas: allocated once per worker, reused (
np.zeros(...)) - Character arrays: allocated per frame (cheap -- rows*cols U1 strings)
- Bitmap cache: ~500KB per grid size, initialized once per worker
Total memory per worker: ~50-150MB. Total: ~400-800MB for 8 workers.
For low-memory systems (< 4GB), reduce worker count and use smaller grids.
Brightness Verification
After render, spot-check brightness at sample timestamps:
for t in [2, 30, 60, 120, 180]:
cmd = ["ffmpeg", "-ss", str(t), "-i", output_path,
"-frames:v", "1", "-f", "rawvideo", "-pix_fmt", "rgb24", "-"]
r = subprocess.run(cmd, capture_output=True)
arr = np.frombuffer(r.stdout, dtype=np.uint8)
print(f"t={t}s mean={arr.mean():.1f} max={arr.max()}")
Target: mean > 5 for quiet sections, mean > 15 for active sections. If consistently below, increase brightness floor in effects and/or global boost multiplier.
Render Time Estimates
Scale with hardware. Baseline: 1080p, 24fps, ~180ms/frame/worker.
| Duration | Frames | 4 workers | 8 workers | 16 workers |
|---|---|---|---|---|
| 30s | 720 | ~3 min | ~2 min | ~1 min |
| 2 min | 2,880 | ~13 min | ~7 min | ~4 min |
| 3.5 min | 5,040 | ~23 min | ~12 min | ~6 min |
| 5 min | 7,200 | ~33 min | ~17 min | ~9 min |
| 10 min | 14,400 | ~65 min | ~33 min | ~17 min |
At 720p: multiply times by ~0.5. At 4K: multiply by ~4.
Heavier effects (many particles, dense grids, extra shader passes) add ~20-50%.
Temp File Cleanup
Rendering generates intermediate files that accumulate across runs. Clean up after the final concat/mux step.
Files to Clean
| File type | Source | Location |
|---|---|---|
| WAV extracts | ffmpeg -i input.mp3 ... tmp.wav |
tempfile.mktemp() or project dir |
| Segment clips | render_clip() output |
segments/seg_00.mp4 etc. |
| Concat list | ffmpeg concat demuxer input | segments/concat.txt |
| ffmpeg stderr logs | piped to file for debugging | *.log in project dir |
| Feature cache | pickled numpy arrays | *.pkl or *.npz |
Cleanup Function
import glob
import tempfile
import shutil
def cleanup_render_artifacts(segments_dir="segments", keep_final=True):
"""Remove intermediate files after successful render.
Call this AFTER verifying the final output exists and plays correctly.
Args:
segments_dir: directory containing segment clips and concat list
keep_final: if True, only delete intermediates (not the final output)
"""
removed = []
# 1. Segment clips
if os.path.isdir(segments_dir):
shutil.rmtree(segments_dir)
removed.append(f"directory: {segments_dir}")
# 2. Temporary WAV files
for wav in glob.glob("*.wav"):
if wav.startswith("tmp") or wav.startswith("extracted_"):
os.remove(wav)
removed.append(wav)
# 3. ffmpeg stderr logs
for log in glob.glob("ffmpeg_*.log"):
os.remove(log)
removed.append(log)
# 4. Feature cache (optional — useful to keep for re-renders)
# for cache in glob.glob("features_*.npz"):
# os.remove(cache)
# removed.append(cache)
print(f"Cleaned {len(removed)} artifacts: {removed}")
return removed
Integration with Render Pipeline
Call cleanup at the end of the main render script, after the final output is verified:
# At end of main()
if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
cleanup_render_artifacts(segments_dir="segments")
print(f"Done. Output: {output_path}")
else:
print("WARNING: final output missing or empty — skipping cleanup")
Temp File Best Practices
- Use
tempfile.mkdtemp()for segment directories — avoids polluting the project dir - Name WAV extracts with
tempfile.mktemp(suffix=".wav")so they're in the OS temp dir - For debugging, set
KEEP_INTERMEDIATES=1env var to skip cleanup - Feature caches (
.npz) are cheap to store and expensive to recompute — default to keeping them