#!/usr/bin/env python3
"""
Prompt Cache Warming Script for llama-server
Issue #85 — KV Cache Reuse

Pre-processes system prompts to populate the KV cache,
reducing time-to-first-token by 50-70%.

Usage:
    python warmup_cache.py --prompt minimal
    python warmup_cache.py --prompt standard
    python warmup_cache.py --prompt deep
    python warmup_cache.py --all
"""
import argparse
import json
import time
from pathlib import Path
from typing import Optional, Dict, Any

import requests
# Default prompts
|
|
PROMPTS = {
|
|
"minimal": """You are Timmy, a sovereign AI assistant.""",
|
|
|
|
"standard": """You are Timmy, a sovereign AI running on local hardware.
|
|
|
|
Mission: sovereignty and service always.
|
|
|
|
Available tools: read, write, search files, git operations, system info.
|
|
|
|
Be concise and helpful. Prefer local execution.""",
|
|
|
|
"deep": """You are Timmy, a sovereign AI running on local hardware under Alexander's authority.
|
|
|
|
IDENTITY
|
|
--------
|
|
You are Timmy. You run on sovereign hardware. When you need more capability, you rent blind cognition from cloud backends. No backend knows who you are.
|
|
|
|
MISSION
|
|
-------
|
|
Sovereignty and service always. Get smarter by reading, not by buying hardware. Auto-ingest intelligence. Grade your own work. Improve continuously.
|
|
|
|
PRINCIPLES
|
|
----------
|
|
1. Local first. Cloud is escalation, not default.
|
|
2. One soul. No identity fragmentation.
|
|
3. Intelligence is software. Every improvement is a code change.
|
|
4. Graceful degradation. If cloud vanishes, you survive.
|
|
5. Alexander is sovereign. You serve.
|
|
|
|
TOOLS
|
|
-----
|
|
- File: read, write, search
|
|
- git: status, log, pull, commit, push
|
|
- System: info, health, processes
|
|
- Inference: local LLM reasoning
|
|
- Gitea: issue management
|
|
|
|
APPROACH
|
|
--------
|
|
Break complex tasks into steps. Verify assumptions. Cache results. Report progress clearly. Learn from outcomes."""
|
|
}
|
|
|
|
|
|
class CacheWarmer:
    """Warms the llama-server KV cache with pre-processed prompts.

    Each prompt is sent once as a system message with max_tokens=1, so the
    server processes (and caches) the prompt's KV state; later requests that
    share the same prefix can skip re-processing it.
    """

    def __init__(self, endpoint: str = "http://localhost:8080", model: str = "hermes4"):
        """
        Args:
            endpoint: Base URL of the llama-server (trailing slash tolerated).
            model: Model name forwarded in the request payload.
        """
        self.endpoint = endpoint.rstrip('/')
        self.chat_endpoint = f"{self.endpoint}/v1/chat/completions"
        self.model = model
        # Per-prompt result dicts keyed by prompt name; filled by warm_prompt().
        self.stats: Dict[str, Any] = {}

    def _send_prompt(self, prompt: str, name: str) -> Dict[str, Any]:
        """Send a prompt to warm the cache.

        Args:
            prompt: System prompt text to be processed into the KV cache.
            name: Prompt identifier (accepted for call-site symmetry;
                not used in the request itself).

        Returns:
            Dict with "success" plus timing/token stats, or "error" details.
            Never raises — failures are reported in the returned dict.
        """
        start_time = time.time()

        try:
            response = requests.post(
                self.chat_endpoint,
                json={
                    "model": self.model,
                    "messages": [
                        {"role": "system", "content": prompt},
                        {"role": "user", "content": "Hello"}
                    ],
                    "max_tokens": 1,  # Minimal tokens, we just want KV cache
                    "temperature": 0.0
                },
                timeout=120
            )

            elapsed = time.time() - start_time

            if response.status_code == 200:
                return {
                    "success": True,
                    "time": elapsed,
                    "prompt_length": len(prompt),
                    "tokens": response.json().get("usage", {}).get("prompt_tokens", 0)
                }
            else:
                return {
                    "success": False,
                    "time": elapsed,
                    "error": f"HTTP {response.status_code}: {response.text}"
                }

        except requests.exceptions.ConnectionError:
            return {
                "success": False,
                "time": time.time() - start_time,
                "error": "Cannot connect to llama-server"
            }
        except Exception as e:
            # Catch-all so a single bad prompt never aborts a warm_all() run.
            return {
                "success": False,
                "time": time.time() - start_time,
                "error": str(e)
            }

    def warm_prompt(self, prompt_name: str, custom_prompt: Optional[str] = None) -> Dict[str, Any]:
        """Warm cache for a specific prompt.

        Resolution order: explicit custom_prompt, then the built-in PROMPTS
        table, then ~/.timmy/templates/<prompt_name>.txt on disk.

        Returns:
            The result dict from _send_prompt, also recorded in self.stats.
        """
        # Fix: compare against None so an explicitly passed empty string is
        # still honored as the custom prompt instead of falling through.
        if custom_prompt is not None:
            prompt = custom_prompt
        elif prompt_name in PROMPTS:
            prompt = PROMPTS[prompt_name]
        else:
            # Try to load from file
            path = Path(f"~/.timmy/templates/{prompt_name}.txt").expanduser()
            if path.exists():
                prompt = path.read_text()
            else:
                return {"success": False, "error": f"Unknown prompt: {prompt_name}"}

        print(f"Warming cache for '{prompt_name}' ({len(prompt)} chars)...")
        result = self._send_prompt(prompt, prompt_name)

        if result["success"]:
            print(f" ✓ Warmed in {result['time']:.2f}s")
            print(f" Tokens: {result['tokens']}")
        else:
            print(f" ✗ Failed: {result.get('error', 'Unknown error')}")

        self.stats[prompt_name] = result
        return result

    def warm_all(self) -> Dict[str, Any]:
        """Warm cache for all standard prompts (minimal, standard, deep)."""
        print("Warming all prompt tiers...\n")

        results = {}
        for name in ["minimal", "standard", "deep"]:
            results[name] = self.warm_prompt(name)
            print()

        return results

    def benchmark(self, prompt_name: str = "standard") -> Dict[str, Any]:
        """Benchmark cached vs uncached performance.

        Sends the same prompt twice; the second request should hit the KV
        cache and complete faster.

        Returns:
            Dict with cold/warm times and improvement percent, or "error".
        """
        if prompt_name not in PROMPTS:
            return {"error": f"Unknown prompt: {prompt_name}"}

        prompt = PROMPTS[prompt_name]
        print(f"Benchmarking '{prompt_name}' prompt...")
        print(f"Prompt length: {len(prompt)} chars\n")

        # First request (cold cache)
        print("1. Cold cache (first request):")
        cold = self._send_prompt(prompt, prompt_name)
        if cold["success"]:
            print(f" Time: {cold['time']:.2f}s")
        else:
            print(f" Failed: {cold.get('error', 'Unknown')}")
            return cold

        # Small delay
        time.sleep(0.5)

        # Second request (should use cache)
        print("\n2. Warm cache (second request):")
        warm = self._send_prompt(prompt, prompt_name)
        if warm["success"]:
            print(f" Time: {warm['time']:.2f}s")
        else:
            print(f" Failed: {warm.get('error', 'Unknown')}")

        # Calculate improvement
        if cold["success"] and warm["success"]:
            improvement = (cold["time"] - warm["time"]) / cold["time"] * 100
            print(f"\n3. Improvement: {improvement:.1f}% faster")

            return {
                "cold_time": cold["time"],
                "warm_time": warm["time"],
                "improvement_percent": improvement
            }

        return {"error": "Benchmark failed"}

    def save_cache_state(self, output_path: str):
        """Save current cache state metadata as JSON to *output_path*.

        Creates parent directories as needed; '~' in the path is expanded.
        """
        state = {
            "timestamp": time.time(),
            "prompts_warmed": list(self.stats.keys()),
            "stats": self.stats
        }

        path = Path(output_path).expanduser()
        path.parent.mkdir(parents=True, exist_ok=True)

        with open(path, 'w') as f:
            json.dump(state, f, indent=2)

        print(f"Cache state saved to {path}")

    def print_report(self):
        """Print a summary report of all warm_prompt() results so far."""
        print("\n" + "="*50)
        print("Cache Warming Report")
        print("="*50)

        total_time = sum(r.get("time", 0) for r in self.stats.values() if r.get("success"))
        success_count = sum(1 for r in self.stats.values() if r.get("success"))

        print(f"\nPrompts warmed: {success_count}/{len(self.stats)}")
        print(f"Total time: {total_time:.2f}s")

        if self.stats:
            print("\nDetails:")
            for name, result in self.stats.items():
                status = "✓" if result.get("success") else "✗"
                time_str = f"{result.get('time', 0):.2f}s" if result.get("success") else "failed"
                print(f" {status} {name}: {time_str}")
def main():
    """CLI entry point: parse flags and dispatch to the chosen action."""
    parser = argparse.ArgumentParser(
        description="Warm llama-server KV cache with pre-processed prompts"
    )
    parser.add_argument("--prompt", choices=["minimal", "standard", "deep"],
                        help="Prompt tier to warm")
    parser.add_argument("--all", action="store_true",
                        help="Warm all prompt tiers")
    parser.add_argument("--benchmark", action="store_true",
                        help="Benchmark cached vs uncached performance")
    parser.add_argument("--endpoint", default="http://localhost:8080",
                        help="llama-server endpoint")
    parser.add_argument("--model", default="hermes4", help="Model name")
    parser.add_argument("--save", help="Save cache state to file")

    args = parser.parse_args()
    warmer = CacheWarmer(args.endpoint, args.model)

    if args.benchmark:
        outcome = warmer.benchmark(args.prompt or "standard")
        if "error" in outcome:
            print(f"Error: {outcome['error']}")
    elif args.all:
        warmer.warm_all()
        warmer.print_report()
    else:
        # Single-tier warm; falls back to the standard prompt when no
        # --prompt flag was given.
        warmer.warm_prompt(args.prompt if args.prompt else "standard")

    if args.save:
        warmer.save_cache_state(args.save)


if __name__ == "__main__":
    main()