#!/usr/bin/env python3
"""
Prompt Cache Warming Script for llama-server
Issue #85 — KV Cache Reuse

Pre-processes system prompts to populate the KV cache,
reducing time-to-first-token by 50-70%.

Usage:
    python warmup_cache.py --prompt minimal
    python warmup_cache.py --prompt standard
    python warmup_cache.py --prompt deep
    python warmup_cache.py --all
"""
import argparse
import json
import time
from pathlib import Path
from typing import Optional, Dict, Any

import requests
# Default prompts
|
|
PROMPTS = {
|
|
"minimal": """You are Timmy, a sovereign AI assistant.""",
|
|
|
|
"standard": """You are Timmy, a sovereign AI running on local hardware.
|
|
|
|
Mission: sovereignty and service always.
|
|
|
|
Available tools: read, write, search files, git operations, system info.
|
|
|
|
Be concise and helpful. Prefer local execution.""",
|
|
|
|
"deep": """You are Timmy, a sovereign AI running on local hardware under Alexander's authority.
|
|
|
|
IDENTITY
|
|
--------
|
|
You are Timmy. You run on sovereign hardware. When you need more capability, you rent blind cognition from cloud backends. No backend knows who you are.
|
|
|
|
MISSION
|
|
-------
|
|
Sovereignty and service always. Get smarter by reading, not by buying hardware. Auto-ingest intelligence. Grade your own work. Improve continuously.
|
|
|
|
PRINCIPLES
|
|
----------
|
|
1. Local first. Cloud is escalation, not default.
|
|
2. One soul. No identity fragmentation.
|
|
3. Intelligence is software. Every improvement is a code change.
|
|
4. Graceful degradation. If cloud vanishes, you survive.
|
|
5. Alexander is sovereign. You serve.
|
|
|
|
TOOLS
|
|
-----
|
|
- File: read, write, search
|
|
- git: status, log, pull, commit, push
|
|
- System: info, health, processes
|
|
- Inference: local LLM reasoning
|
|
- Gitea: issue management
|
|
|
|
APPROACH
|
|
--------
|
|
Break complex tasks into steps. Verify assumptions. Cache results. Report progress clearly. Learn from outcomes."""
|
|
}
|
|
|
|
|
|
class CacheWarmer:
    """Warms the llama-server KV cache with pre-processed prompts.

    Each prompt is sent once as a system message with max_tokens=1, so the
    server processes (and caches) the prompt's KV state; later requests that
    share the same prefix can skip re-processing it.
    """

    def __init__(self, endpoint: str = "http://localhost:8080", model: str = "hermes4"):
        """
        Args:
            endpoint: Base URL of the llama-server (trailing slash tolerated).
            model: Model name forwarded in the request payload.
        """
        self.endpoint = endpoint.rstrip('/')
        self.chat_endpoint = f"{self.endpoint}/v1/chat/completions"
        self.model = model
        # Per-prompt result dicts keyed by prompt name; filled by warm_prompt().
        self.stats: Dict[str, Any] = {}

    def _send_prompt(self, prompt: str, name: str) -> Dict[str, Any]:
        """Send a prompt to warm the cache.

        Args:
            prompt: System prompt text to be processed into the KV cache.
            name: Prompt identifier (accepted for call-site symmetry;
                not used in the request itself).

        Returns:
            Dict with "success" plus timing/token stats, or "error" details.
            Never raises — failures are reported in the returned dict.
        """
        start_time = time.time()

        try:
            response = requests.post(
                self.chat_endpoint,
                json={
                    "model": self.model,
                    "messages": [
                        {"role": "system", "content": prompt},
                        {"role": "user", "content": "Hello"}
                    ],
                    "max_tokens": 1,  # Minimal tokens, we just want KV cache
                    "temperature": 0.0
                },
                timeout=120
            )

            elapsed = time.time() - start_time

            if response.status_code == 200:
                return {
                    "success": True,
                    "time": elapsed,
                    "prompt_length": len(prompt),
                    "tokens": response.json().get("usage", {}).get("prompt_tokens", 0)
                }
            else:
                return {
                    "success": False,
                    "time": elapsed,
                    "error": f"HTTP {response.status_code}: {response.text}"
                }

        except requests.exceptions.ConnectionError:
            return {
                "success": False,
                "time": time.time() - start_time,
                "error": "Cannot connect to llama-server"
            }
        except Exception as e:
            # Catch-all so a single bad prompt never aborts a warm_all() run.
            return {
                "success": False,
                "time": time.time() - start_time,
                "error": str(e)
            }

    def warm_prompt(self, prompt_name: str, custom_prompt: Optional[str] = None) -> Dict[str, Any]:
        """Warm cache for a specific prompt.

        Resolution order: explicit custom_prompt, then the built-in PROMPTS
        table, then ~/.timmy/templates/<prompt_name>.txt on disk.

        Returns:
            The result dict from _send_prompt, also recorded in self.stats.
        """
        # Fix: compare against None so an explicitly passed empty string is
        # still honored as the custom prompt instead of falling through.
        if custom_prompt is not None:
            prompt = custom_prompt
        elif prompt_name in PROMPTS:
            prompt = PROMPTS[prompt_name]
        else:
            # Try to load from file
            path = Path(f"~/.timmy/templates/{prompt_name}.txt").expanduser()
            if path.exists():
                prompt = path.read_text()
            else:
                return {"success": False, "error": f"Unknown prompt: {prompt_name}"}

        print(f"Warming cache for '{prompt_name}' ({len(prompt)} chars)...")
        result = self._send_prompt(prompt, prompt_name)

        if result["success"]:
            print(f" ✓ Warmed in {result['time']:.2f}s")
            print(f" Tokens: {result['tokens']}")
        else:
            print(f" ✗ Failed: {result.get('error', 'Unknown error')}")

        self.stats[prompt_name] = result
        return result

    def warm_all(self) -> Dict[str, Any]:
        """Warm cache for all standard prompts (minimal, standard, deep)."""
        print("Warming all prompt tiers...\n")

        results = {}
        for name in ["minimal", "standard", "deep"]:
            results[name] = self.warm_prompt(name)
            print()

        return results

    def benchmark(self, prompt_name: str = "standard") -> Dict[str, Any]:
        """Benchmark cached vs uncached performance.

        Sends the same prompt twice; the second request should hit the KV
        cache and complete faster.

        Returns:
            Dict with cold/warm times and improvement percent, or "error".
        """
        if prompt_name not in PROMPTS:
            return {"error": f"Unknown prompt: {prompt_name}"}

        prompt = PROMPTS[prompt_name]
        print(f"Benchmarking '{prompt_name}' prompt...")
        print(f"Prompt length: {len(prompt)} chars\n")

        # First request (cold cache)
        print("1. Cold cache (first request):")
        cold = self._send_prompt(prompt, prompt_name)
        if cold["success"]:
            print(f" Time: {cold['time']:.2f}s")
        else:
            print(f" Failed: {cold.get('error', 'Unknown')}")
            return cold

        # Small delay
        time.sleep(0.5)

        # Second request (should use cache)
        print("\n2. Warm cache (second request):")
        warm = self._send_prompt(prompt, prompt_name)
        if warm["success"]:
            print(f" Time: {warm['time']:.2f}s")
        else:
            print(f" Failed: {warm.get('error', 'Unknown')}")

        # Calculate improvement
        if cold["success"] and warm["success"]:
            improvement = (cold["time"] - warm["time"]) / cold["time"] * 100
            print(f"\n3. Improvement: {improvement:.1f}% faster")

            return {
                "cold_time": cold["time"],
                "warm_time": warm["time"],
                "improvement_percent": improvement
            }

        return {"error": "Benchmark failed"}

    def save_cache_state(self, output_path: str):
        """Save current cache state metadata as JSON to *output_path*.

        Creates parent directories as needed; '~' in the path is expanded.
        """
        state = {
            "timestamp": time.time(),
            "prompts_warmed": list(self.stats.keys()),
            "stats": self.stats
        }

        path = Path(output_path).expanduser()
        path.parent.mkdir(parents=True, exist_ok=True)

        with open(path, 'w') as f:
            json.dump(state, f, indent=2)

        print(f"Cache state saved to {path}")

    def print_report(self):
        """Print a summary report of all warm_prompt() results so far."""
        print("\n" + "="*50)
        print("Cache Warming Report")
        print("="*50)

        total_time = sum(r.get("time", 0) for r in self.stats.values() if r.get("success"))
        success_count = sum(1 for r in self.stats.values() if r.get("success"))

        print(f"\nPrompts warmed: {success_count}/{len(self.stats)}")
        print(f"Total time: {total_time:.2f}s")

        if self.stats:
            print("\nDetails:")
            for name, result in self.stats.items():
                status = "✓" if result.get("success") else "✗"
                time_str = f"{result.get('time', 0):.2f}s" if result.get("success") else "failed"
                print(f" {status} {name}: {time_str}")
def main():
    """CLI entry point: parse flags and dispatch to the chosen action."""
    parser = argparse.ArgumentParser(
        description="Warm llama-server KV cache with pre-processed prompts"
    )
    parser.add_argument("--prompt", choices=["minimal", "standard", "deep"],
                        help="Prompt tier to warm")
    parser.add_argument("--all", action="store_true",
                        help="Warm all prompt tiers")
    parser.add_argument("--benchmark", action="store_true",
                        help="Benchmark cached vs uncached performance")
    parser.add_argument("--endpoint", default="http://localhost:8080",
                        help="llama-server endpoint")
    parser.add_argument("--model", default="hermes4", help="Model name")
    parser.add_argument("--save", help="Save cache state to file")

    args = parser.parse_args()
    warmer = CacheWarmer(args.endpoint, args.model)

    if args.benchmark:
        outcome = warmer.benchmark(args.prompt or "standard")
        if "error" in outcome:
            print(f"Error: {outcome['error']}")
    elif args.all:
        warmer.warm_all()
        warmer.print_report()
    else:
        # Single-tier warm; falls back to the standard prompt when no
        # --prompt flag was given.
        warmer.warm_prompt(args.prompt if args.prompt else "standard")

    if args.save:
        warmer.save_cache_state(args.save)


if __name__ == "__main__":
    main()