#!/usr/bin/env python3
"""
Prompt Cache Warming Script for llama-server
Issue #85 — KV Cache Reuse

Pre-processes system prompts to populate the KV cache, reducing
time-to-first-token by 50-70%.

Usage:
    python warmup_cache.py --prompt minimal
    python warmup_cache.py --prompt standard
    python warmup_cache.py --prompt deep
    python warmup_cache.py --all
"""

import argparse
import json
import time
from pathlib import Path
from typing import Any, Dict, Optional

import requests

# Default prompt tiers, from cheapest to most expensive to pre-process.
PROMPTS = {
    "minimal": """You are Timmy, a sovereign AI assistant.""",

    "standard": """You are Timmy, a sovereign AI running on local hardware.
Mission: sovereignty and service always.
Available tools: read, write, search files, git operations, system info.
Be concise and helpful. Prefer local execution.""",

    "deep": """You are Timmy, a sovereign AI running on local hardware under Alexander's authority.

IDENTITY
--------
You are Timmy. You run on sovereign hardware. When you need more capability,
you rent blind cognition from cloud backends. No backend knows who you are.

MISSION
-------
Sovereignty and service always. Get smarter by reading, not by buying hardware.
Auto-ingest intelligence. Grade your own work. Improve continuously.

PRINCIPLES
----------
1. Local first. Cloud is escalation, not default.
2. One soul. No identity fragmentation.
3. Intelligence is software. Every improvement is a code change.
4. Graceful degradation. If cloud vanishes, you survive.
5. Alexander is sovereign. You serve.

TOOLS
-----
- File: read, write, search
- git: status, log, pull, commit, push
- System: info, health, processes
- Inference: local LLM reasoning
- Gitea: issue management

APPROACH
--------
Break complex tasks into steps. Verify assumptions. Cache results.
Report progress clearly.
Learn from outcomes."""
}


class CacheWarmer:
    """Warms the llama-server KV cache with pre-processed prompts."""

    def __init__(self, endpoint: str = "http://localhost:8080",
                 model: str = "hermes4"):
        """Create a warmer targeting *endpoint* for model *model*.

        Args:
            endpoint: Base URL of the llama-server (trailing '/' stripped).
            model: Model name sent in each chat-completion request.
        """
        self.endpoint = endpoint.rstrip('/')
        self.chat_endpoint = f"{self.endpoint}/v1/chat/completions"
        self.model = model
        # Per-prompt result dicts, keyed by prompt name (see warm_prompt).
        self.stats: Dict[str, Dict[str, Any]] = {}

    def _send_prompt(self, prompt: str, name: str) -> Dict[str, Any]:
        """Send *prompt* as the system message to populate the KV cache.

        Requests only 1 completion token — the point is the prompt-side
        KV cache, not the generation.

        Args:
            prompt: System prompt text to pre-process.
            name: Prompt identifier (kept for caller symmetry; unused here).

        Returns:
            Dict with "success" (bool) and "time" (seconds), plus either
            "prompt_length"/"tokens" on success or "error" on failure.
        """
        # perf_counter is monotonic — wall-clock time.time() can jump
        # under NTP adjustment and corrupt elapsed measurements.
        start_time = time.perf_counter()
        try:
            response = requests.post(
                self.chat_endpoint,
                json={
                    "model": self.model,
                    "messages": [
                        {"role": "system", "content": prompt},
                        {"role": "user", "content": "Hello"}
                    ],
                    "max_tokens": 1,  # Minimal tokens, we just want KV cache
                    "temperature": 0.0
                },
                timeout=120
            )
            elapsed = time.perf_counter() - start_time

            if response.status_code == 200:
                return {
                    "success": True,
                    "time": elapsed,
                    "prompt_length": len(prompt),
                    "tokens": response.json().get("usage", {}).get("prompt_tokens", 0)
                }
            return {
                "success": False,
                "time": elapsed,
                "error": f"HTTP {response.status_code}: {response.text}"
            }
        except requests.exceptions.ConnectionError:
            return {
                "success": False,
                "time": time.perf_counter() - start_time,
                "error": "Cannot connect to llama-server"
            }
        except Exception as e:
            # Broad catch is deliberate: any failure mode (timeout, bad
            # JSON, ...) is reported as a result dict, never raised.
            return {
                "success": False,
                "time": time.perf_counter() - start_time,
                "error": str(e)
            }

    def warm_prompt(self, prompt_name: str,
                    custom_prompt: Optional[str] = None) -> Dict[str, Any]:
        """Warm the cache for one prompt and record the result in stats.

        Resolution order: explicit *custom_prompt*, then the built-in
        PROMPTS tier, then a template file ~/.timmy/templates/<name>.txt.

        Args:
            prompt_name: Tier name or template basename; also the stats key.
            custom_prompt: Literal prompt text overriding any lookup.

        Returns:
            The result dict from _send_prompt (or an error dict if the
            prompt could not be resolved).
        """
        if custom_prompt:
            prompt = custom_prompt
        elif prompt_name in PROMPTS:
            prompt = PROMPTS[prompt_name]
        else:
            # Fall back to a user-supplied template on disk.
            path = Path(f"~/.timmy/templates/{prompt_name}.txt").expanduser()
            if path.exists():
                # Explicit encoding: the platform default is not portable.
                prompt = path.read_text(encoding="utf-8")
            else:
                return {"success": False, "error": f"Unknown prompt: {prompt_name}"}

        print(f"Warming cache for '{prompt_name}' ({len(prompt)} chars)...")
        result = self._send_prompt(prompt, prompt_name)

        if result["success"]:
            print(f"  ✓ Warmed in {result['time']:.2f}s")
            print(f"    Tokens: {result['tokens']}")
        else:
            print(f"  ✗ Failed: {result.get('error', 'Unknown error')}")

        self.stats[prompt_name] = result
        return result

    def warm_all(self) -> Dict[str, Any]:
        """Warm cache for all standard prompts.

        Returns:
            Dict mapping each tier name to its warm_prompt result.
        """
        print("Warming all prompt tiers...\n")
        results = {}
        for name in ["minimal", "standard", "deep"]:
            results[name] = self.warm_prompt(name)
            print()
        return results

    def benchmark(self, prompt_name: str = "standard") -> Dict[str, Any]:
        """Benchmark cached vs uncached performance for a built-in tier.

        Sends the same prompt twice: the first request pays full prompt
        processing, the second should hit the KV cache.

        Args:
            prompt_name: One of the built-in PROMPTS tiers.

        Returns:
            Dict with cold_time/warm_time/improvement_percent on success,
            or a dict containing "error" on failure.
        """
        if prompt_name not in PROMPTS:
            return {"error": f"Unknown prompt: {prompt_name}"}

        prompt = PROMPTS[prompt_name]
        print(f"Benchmarking '{prompt_name}' prompt...")
        print(f"Prompt length: {len(prompt)} chars\n")

        # First request (cold cache)
        print("1. Cold cache (first request):")
        cold = self._send_prompt(prompt, prompt_name)
        if cold["success"]:
            print(f"   Time: {cold['time']:.2f}s")
        else:
            print(f"   Failed: {cold.get('error', 'Unknown')}")
            return cold

        # Small delay so the server settles between requests.
        time.sleep(0.5)

        # Second request (should use cache)
        print("\n2. Warm cache (second request):")
        warm = self._send_prompt(prompt, prompt_name)
        if warm["success"]:
            print(f"   Time: {warm['time']:.2f}s")
        else:
            print(f"   Failed: {warm.get('error', 'Unknown')}")

        # Calculate improvement
        if cold["success"] and warm["success"]:
            improvement = (cold["time"] - warm["time"]) / cold["time"] * 100
            print(f"\n3. Improvement: {improvement:.1f}% faster")
            return {
                "cold_time": cold["time"],
                "warm_time": warm["time"],
                "improvement_percent": improvement
            }

        return {"error": "Benchmark failed"}

    def save_cache_state(self, output_path: str):
        """Save current cache state metadata as JSON to *output_path*.

        Parent directories are created as needed; '~' is expanded.
        """
        state = {
            # Wall-clock timestamp is intentional here (it is a point in
            # time, not a duration).
            "timestamp": time.time(),
            "prompts_warmed": list(self.stats.keys()),
            "stats": self.stats
        }

        path = Path(output_path).expanduser()
        path.parent.mkdir(parents=True, exist_ok=True)

        with open(path, 'w', encoding="utf-8") as f:
            json.dump(state, f, indent=2)

        print(f"Cache state saved to {path}")

    def print_report(self):
        """Print a summary report of all warm_prompt results so far."""
        print("\n" + "="*50)
        print("Cache Warming Report")
        print("="*50)

        total_time = sum(r.get("time", 0) for r in self.stats.values()
                         if r.get("success"))
        success_count = sum(1 for r in self.stats.values() if r.get("success"))

        print(f"\nPrompts warmed: {success_count}/{len(self.stats)}")
        print(f"Total time: {total_time:.2f}s")

        if self.stats:
            print("\nDetails:")
            for name, result in self.stats.items():
                status = "✓" if result.get("success") else "✗"
                time_str = (f"{result.get('time', 0):.2f}s"
                            if result.get("success") else "failed")
                print(f"  {status} {name}: {time_str}")


def main():
    """CLI entry point: parse arguments and dispatch to CacheWarmer."""
    parser = argparse.ArgumentParser(
        description="Warm llama-server KV cache with pre-processed prompts"
    )
    parser.add_argument(
        "--prompt",
        choices=["minimal", "standard", "deep"],
        help="Prompt tier to warm"
    )
    parser.add_argument(
        "--all",
        action="store_true",
        help="Warm all prompt tiers"
    )
    parser.add_argument(
        "--benchmark",
        action="store_true",
        help="Benchmark cached vs uncached performance"
    )
    parser.add_argument(
        "--endpoint",
        default="http://localhost:8080",
        help="llama-server endpoint"
    )
    parser.add_argument(
        "--model",
        default="hermes4",
        help="Model name"
    )
    parser.add_argument(
        "--save",
        help="Save cache state to file"
    )

    args = parser.parse_args()
    warmer = CacheWarmer(args.endpoint, args.model)

    if args.benchmark:
        result = warmer.benchmark(args.prompt or "standard")
        if "error" in result:
            print(f"Error: {result['error']}")
    elif args.all:
        warmer.warm_all()
        warmer.print_report()
    elif args.prompt:
        warmer.warm_prompt(args.prompt)
    else:
        # Default: warm standard prompt
        warmer.warm_prompt("standard")

    if args.save:
        warmer.save_cache_state(args.save)


if __name__ == "__main__":
    main()