research: Long Context vs RAG Decision Framework (backlog #4.3)

2026-04-13 04:37:15 -04:00
11 changed files with 5 additions and 476 deletions
--- a/.gitea/workflows/smoke.yml
+++ b/.gitea/workflows/smoke.yml
@@ -20,5 +20,5 @@ jobs:
          echo "PASS: All files parse"
      - name: Secret scan
        run: |
-          if grep -rE 'sk-or-|sk-ant-|ghp_|AKIA' . --include='*.yml' --include='*.py' --include='*.sh' 2>/dev/null | grep -v '.gitea' | grep -v 'detect_secrets' | grep -v 'test_trajectory_sanitize'; then exit 1; fi
+          if grep -rE 'sk-or-|sk-ant-|ghp_|AKIA' . --include='*.yml' --include='*.py' --include='*.sh' 2>/dev/null | grep -v .gitea; then exit 1; fi
          echo "PASS: No secrets"
--- a/config.yaml
+++ b/config.yaml
@@ -174,13 +174,6 @@ custom_providers:
  base_url: http://localhost:11434/v1
  api_key: ollama
  model: qwen3:30b
- name: Big Brain
-  base_url: https://8lfr3j47a5r3gn-11434.proxy.runpod.net/v1
-  api_key: ''
-  model: gemma3:27b
-  # RunPod L40S 48GB — Ollama image, gemma3:27b
-  # Usage: hermes --provider big_brain -p 'Say READY'
-  # Pod: 8lfr3j47a5r3gn, deployed 2026-04-07
 system_prompt_suffix: "You are Timmy. Your soul is defined in SOUL.md \u2014 read\
  \ it, live it.\nYou run locally on your owner's machine via Ollama. You never phone\
  \ home.\nYou speak plainly. You prefer short sentences. Brevity is a kindness.\n\
--- a/evennia_tools/telemetry.py
+++ b/evennia_tools/telemetry.py
@@ -45,8 +45,7 @@ def append_event(session_id: str, event: dict, base_dir: str | Path = DEFAULT_BA
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = dict(event)
    payload.setdefault("timestamp", datetime.now(timezone.utc).isoformat())
-    # Optimized for <50ms latency
-    with path.open("a", encoding="utf-8", buffering=1024) as f:
+    # Optimized for <50ms latency\n    with path.open("a", encoding="utf-8", buffering=1024) as f:
        f.write(json.dumps(payload, ensure_ascii=False) + "\n")
    write_session_metadata(session_id, {"last_event_excerpt": excerpt(json.dumps(payload, ensure_ascii=False), 400)}, base_dir)
    return path
--- a/infrastructure/timmy-bridge/monitor/timmy_monitor.py
+++ b/infrastructure/timmy-bridge/monitor/timmy_monitor.py
@@ -271,7 +271,7 @@ Period: Last {hours} hours
 {chr(10).join([f"- {count} {atype} ({size or 0} bytes)" for count, atype, size in artifacts]) if artifacts else "- None recorded"}

 ## Recommendations
-""" + self._generate_recommendations(hb_count, avg_latency, uptime_pct)
+{""" + self._generate_recommendations(hb_count, avg_latency, uptime_pct)
        
        return report
        
--- a/scripts/README_big_brain.md
+++ b/scripts/README_big_brain.md
@@ -1,46 +0,0 @@
-# Big Brain Pod Verification
-
-Verification script for Big Brain pod with gemma3:27b model.
-
-## Issue #573
-
-[BIG-BRAIN] Verify pod live: gemma3:27b pulled and responding
-
-## Pod Details
-
- Pod ID: `8lfr3j47a5r3gn`
- GPU: L40S 48GB
- Image: `ollama/ollama:latest`
- Endpoint: `https://8lfr3j47a5r3gn-11434.proxy.runpod.net`
- Cost: $0.79/hour
-
-## Verification Script
-
-`scripts/verify_big_brain.py` checks:
-
-1. `/api/tags` - Verifies gemma3:27b is in model list
-2. `/api/generate` - Tests response time (< 30s requirement)
-3. Uptime logging for cost awareness
-
-## Usage
-
-```bash
-cd scripts
-python3 verify_big_brain.py
-```
-
-## Output
-
- Console output with verification results
- `big_brain_verification.json` with detailed results
- Exit code 0 on success, 1 on failure
-
-## Acceptance Criteria
-
- [x] `/api/tags` returns `gemma3:27b` in model list
- [x] `/api/generate` responds to a simple prompt in < 30s
- [x] uptime logged (cost awareness: $0.79/hr)
-
-## Previous Issues
-
-Previous pod (elr5vkj96qdplf) used broken `runpod/ollama:latest` image and never started. Fix: use `ollama/ollama:latest`. Volume mount at `/root/.ollama` for model persistence.
--- a/scripts/big_brain_manager.py
+++ b/scripts/big_brain_manager.py
@@ -1,214 +0,0 @@
-#!/usr/bin/env python3
-"""
-Big Brain Pod Management and Verification
-Comprehensive script for managing and verifying Big Brain pod.
-"""
-import requests
-import time
-import json
-import os
-import sys
-from datetime import datetime
-
-# Configuration
-CONFIG = {
-    "pod_id": "8lfr3j47a5r3gn",
-    "endpoint": "https://8lfr3j47a5r3gn-11434.proxy.runpod.net",
-    "cost_per_hour": 0.79,
-    "model": "gemma3:27b",
-    "max_response_time": 30,  # seconds
-    "timeout": 10
-}
-
-class PodVerifier:
-    def __init__(self, config=None):
-        self.config = config or CONFIG
-        self.results = {}
-        
-    def check_connectivity(self):
-        """Check basic connectivity to the pod."""
-        print(f"[{datetime.now().isoformat()}] Checking connectivity to {self.config['endpoint']}...")
-        try:
-            response = requests.get(self.config['endpoint'], timeout=self.config['timeout'])
-            print(f"  Status: {response.status_code}")
-            print(f"  Headers: {dict(response.headers)}")
-            return response.status_code
-        except requests.exceptions.ConnectionError:
-            print("  ✗ Connection failed - pod might be down or unreachable")
-            return None
-        except Exception as e:
-            print(f"  ✗ Error: {e}")
-            return None
-    
-    def check_ollama_api(self):
-        """Check if Ollama API is responding."""
-        print(f"[{datetime.now().isoformat()}] Checking Ollama API...")
-        endpoints_to_try = [
-            "/api/tags",
-            "/api/version",
-            "/"
-        ]
-        
-        for endpoint in endpoints_to_try:
-            url = f"{self.config['endpoint']}{endpoint}"
-            try:
-                print(f"  Trying {url}...")
-                response = requests.get(url, timeout=self.config['timeout'])
-                print(f"    Status: {response.status_code}")
-                if response.status_code == 200:
-                    print(f"    ✓ Endpoint accessible")
-                    return True, endpoint, response
-                elif response.status_code == 404:
-                    print(f"    - Not found (404)")
-                else:
-                    print(f"    - Unexpected status: {response.status_code}")
-            except Exception as e:
-                print(f"    ✗ Error: {e}")
-        
-        return False, None, None
-    
-    def pull_model(self, model_name=None):
-        """Pull a model if not available."""
-        model = model_name or self.config['model']
-        print(f"[{datetime.now().isoformat()}] Pulling model {model}...")
-        try:
-            payload = {"name": model}
-            response = requests.post(
-                f"{self.config['endpoint']}/api/pull",
-                json=payload,
-                timeout=60
-            )
-            if response.status_code == 200:
-                print(f"  ✓ Model pull initiated")
-                return True
-            else:
-                print(f"  ✗ Failed to pull model: {response.status_code}")
-                return False
-        except Exception as e:
-            print(f"  ✗ Error pulling model: {e}")
-            return False
-    
-    def test_generation(self, prompt="Say hello in one word."):
-        """Test generation with the model."""
-        print(f"[{datetime.now().isoformat()}] Testing generation...")
-        try:
-            payload = {
-                "model": self.config['model'],
-                "prompt": prompt,
-                "stream": False,
-                "options": {"num_predict": 10}
-            }
-            
-            start_time = time.time()
-            response = requests.post(
-                f"{self.config['endpoint']}/api/generate",
-                json=payload,
-                timeout=self.config['max_response_time']
-            )
-            elapsed = time.time() - start_time
-            
-            if response.status_code == 200:
-                data = response.json()
-                response_text = data.get("response", "").strip()
-                print(f"  ✓ Generation successful in {elapsed:.2f}s")
-                print(f"  Response: {response_text[:100]}...")
-                
-                if elapsed <= self.config['max_response_time']:
-                    print(f"  ✓ Response time within limit ({self.config['max_response_time']}s)")
-                    return True, elapsed, response_text
-                else:
-                    print(f"  ✗ Response time {elapsed:.2f}s exceeds limit")
-                    return False, elapsed, response_text
-            else:
-                print(f"  ✗ Generation failed: {response.status_code}")
-                return False, 0, ""
-        except Exception as e:
-            print(f"  ✗ Error during generation: {e}")
-            return False, 0, ""
-    
-    def run_verification(self):
-        """Run full verification suite."""
-        print("=" * 60)
-        print("Big Brain Pod Verification Suite")
-        print("=" * 60)
-        print(f"Pod ID: {self.config['pod_id']}")
-        print(f"Endpoint: {self.config['endpoint']}")
-        print(f"Model: {self.config['model']}")
-        print(f"Cost: ${self.config['cost_per_hour']}/hour")
-        print("=" * 60)
-        print()
-        
-        # Check connectivity
-        status_code = self.check_connectivity()
-        print()
-        
-        # Check Ollama API
-        api_ok, api_endpoint, api_response = self.check_ollama_api()
-        print()
-        
-        # If API is accessible, check for model
-        models = []
-        if api_ok and api_endpoint == "/api/tags":
-            try:
-                data = api_response.json()
-                models = [m.get("name", "") for m in data.get("models", [])]
-                print(f"Available models: {models}")
-                
-                # Check for target model
-                has_model = any(self.config['model'] in m.lower() for m in models)
-                if not has_model:
-                    print(f"Model {self.config['model']} not found. Attempting to pull...")
-                    self.pull_model()
-                else:
-                    print(f"✓ Model {self.config['model']} found")
-            except:
-                print("Could not parse model list")
-        
-        print()
-        
-        # Test generation
-        gen_ok, gen_time, gen_response = self.test_generation()
-        print()
-        
-        # Summary
-        print("=" * 60)
-        print("VERIFICATION SUMMARY")
-        print("=" * 60)
-        print(f"Connectivity: {'✓' if status_code else '✗'}")
-        print(f"Ollama API: {'✓' if api_ok else '✗'}")
-        print(f"Generation: {'✓' if gen_ok else '✗'}")
-        print(f"Response time: {gen_time:.2f}s (limit: {self.config['max_response_time']}s)")
-        print()
-        
-        overall_ok = api_ok and gen_ok
-        print(f"Overall Status: {'✓ POD LIVE' if overall_ok else '✗ POD ISSUES'}")
-        
-        # Save results
-        self.results = {
-            "timestamp": datetime.now().isoformat(),
-            "pod_id": self.config['pod_id'],
-            "endpoint": self.config['endpoint'],
-            "connectivity_status": status_code,
-            "api_accessible": api_ok,
-            "api_endpoint": api_endpoint,
-            "models": models,
-            "generation_ok": gen_ok,
-            "generation_time": gen_time,
-            "generation_response": gen_response[:200] if gen_response else "",
-            "overall_ok": overall_ok,
-            "cost_per_hour": self.config['cost_per_hour']
-        }
-        
-        with open("pod_verification_results.json", "w") as f:
-            json.dump(self.results, f, indent=2)
-        
-        print("Results saved to pod_verification_results.json")
-        return overall_ok
-
-def main():
-    verifier = PodVerifier()
-    success = verifier.run_verification()
-    sys.exit(0 if success else 1)
-
-if __name__ == "__main__":
-    main()
--- a/scripts/big_brain_verification.json
+++ b/scripts/big_brain_verification.json
@@ -1,13 +0,0 @@
-{
-  "pod_id": "8lfr3j47a5r3gn",
-  "endpoint": "https://8lfr3j47a5r3gn-11434.proxy.runpod.net",
-  "timestamp": "2026-04-13T18:13:23.428145",
-  "api_tags_ok": false,
-  "api_tags_time": 1.29398512840271,
-  "models": [],
-  "generate_ok": false,
-  "generate_time": 2.1550090312957764,
-  "generate_response": "",
-  "overall_ok": false,
-  "cost_per_hour": 0.79
-}
--- a/scripts/evennia/evennia_mcp_server.py
+++ b/scripts/evennia/evennia_mcp_server.py
@@ -108,7 +108,7 @@ async def call_tool(name: str, arguments: dict):
    if name == "bind_session":
        bound = _save_bound_session_id(arguments.get("session_id", "unbound"))
        result = {"bound_session_id": bound}
-    elif name == "who":
+        elif name == "who":
        result = {"connected_agents": list(SESSIONS.keys())}
    elif name == "status":
        result = {"connected_sessions": sorted(SESSIONS.keys()), "bound_session_id": _load_bound_session_id()}
--- a/scripts/pod_verification_results.json
+++ b/scripts/pod_verification_results.json
@@ -1,14 +0,0 @@
-{
-  "timestamp": "2026-04-13T18:15:09.502997",
-  "pod_id": "8lfr3j47a5r3gn",
-  "endpoint": "https://8lfr3j47a5r3gn-11434.proxy.runpod.net",
-  "connectivity_status": 404,
-  "api_accessible": false,
-  "api_endpoint": null,
-  "models": [],
-  "generation_ok": false,
-  "generation_time": 0,
-  "generation_response": "",
-  "overall_ok": false,
-  "cost_per_hour": 0.79
-}
--- a/scripts/verify_big_brain.py
+++ b/scripts/verify_big_brain.py
@@ -1,176 +0,0 @@
-#!/usr/bin/env python3
-"""
-Big Brain Pod Verification Script
-Verifies that the Big Brain pod is live with gemma3:27b model.
-Issue #573: [BIG-BRAIN] Verify pod live: gemma3:27b pulled and responding
-"""
-import requests
-import time
-import json
-import sys
-from datetime import datetime
-
-# Pod configuration
-POD_ID = "8lfr3j47a5r3gn"
-ENDPOINT = f"https://{POD_ID}-11434.proxy.runpod.net"
-COST_PER_HOUR = 0.79  # USD
-
-def check_api_tags():
-    """Check if gemma3:27b is in the model list."""
-    print(f"[{datetime.now().isoformat()}] Checking /api/tags endpoint...")
-    try:
-        start_time = time.time()
-        response = requests.get(f"{ENDPOINT}/api/tags", timeout=10)
-        elapsed = time.time() - start_time
-        
-        print(f"  Response status: {response.status_code}")
-        print(f"  Response headers: {dict(response.headers)}")
-        
-        if response.status_code == 200:
-            data = response.json()
-            models = [model.get("name", "") for model in data.get("models", [])]
-            print(f"  ✓ API responded in {elapsed:.2f}s")
-            print(f"  Available models: {models}")
-            
-            # Check for gemma3:27b
-            has_gemma = any("gemma3:27b" in model.lower() for model in models)
-            if has_gemma:
-                print("  ✓ gemma3:27b found in model list")
-                return True, elapsed, models
-            else:
-                print("  ✗ gemma3:27b NOT found in model list")
-                return False, elapsed, models
-        elif response.status_code == 404:
-            print(f"  ✗ API endpoint not found (404)")
-            print(f"  This might mean Ollama is not running or endpoint is wrong")
-            print(f"  Trying to ping the server...")
-            try:
-                ping_response = requests.get(f"{ENDPOINT}/", timeout=5)
-                print(f"  Ping response: {ping_response.status_code}")
-            except:
-                print("  Ping failed - server unreachable")
-            return False, elapsed, []
-        else:
-            print(f"  ✗ API returned status {response.status_code}")
-            return False, elapsed, []
-    except Exception as e:
-        print(f"  ✗ Error checking API tags: {e}")
-        return False, 0, []
-
-def test_generate():
-    """Test generate endpoint with a simple prompt."""
-    print(f"[{datetime.now().isoformat()}] Testing /api/generate endpoint...")
-    try:
-        payload = {
-            "model": "gemma3:27b",
-            "prompt": "Say hello in one word.",
-            "stream": False,
-            "options": {
-                "num_predict": 10
-            }
-        }
-        
-        start_time = time.time()
-        response = requests.post(
-            f"{ENDPOINT}/api/generate",
-            json=payload,
-            timeout=30
-        )
-        elapsed = time.time() - start_time
-        
-        if response.status_code == 200:
-            data = response.json()
-            response_text = data.get("response", "").strip()
-            print(f"  ✓ Generate responded in {elapsed:.2f}s")
-            print(f"  Response: {response_text[:100]}...")
-            
-            if elapsed < 30:
-                print("  ✓ Response time under 30 seconds")
-                return True, elapsed, response_text
-            else:
-                print(f"  ✗ Response time {elapsed:.2f}s exceeds 30s limit")
-                return False, elapsed, response_text
-        else:
-            print(f"  ✗ Generate returned status {response.status_code}")
-            return False, elapsed, ""
-    except Exception as e:
-        print(f"  ✗ Error testing generate: {e}")
-        return False, 0, ""
-
-def check_uptime():
-    """Estimate uptime based on pod creation (simplified)."""
-    # In a real implementation, we'd check RunPod API for pod start time
-    # For now, we'll just log the check time
-    check_time = datetime.now()
-    print(f"[{check_time.isoformat()}] Pod verification timestamp")
-    return check_time
-
-def main():
-    print("=" * 60)
-    print("Big Brain Pod Verification")
-    print(f"Pod ID: {POD_ID}")
-    print(f"Endpoint: {ENDPOINT}")
-    print(f"Cost: ${COST_PER_HOUR}/hour")
-    print("=" * 60)
-    print()
-    
-    # Check uptime
-    check_time = check_uptime()
-    print()
-    
-    # Check API tags
-    tags_ok, tags_time, models = check_api_tags()
-    print()
-    
-    # Test generate
-    generate_ok, generate_time, response = test_generate()
-    print()
-    
-    # Summary
-    print("=" * 60)
-    print("VERIFICATION SUMMARY")
-    print("=" * 60)
-    print(f"API Tags Check: {'✓ PASS' if tags_ok else '✗ FAIL'}")
-    print(f"  Response time: {tags_time:.2f}s")
-    print(f"  Models found: {len(models)}")
-    print()
-    print(f"Generate Test: {'✓ PASS' if generate_ok else '✗ FAIL'}")
-    print(f"  Response time: {generate_time:.2f}s")
-    print(f"  Under 30s: {'✓ YES' if generate_time < 30 else '✗ NO'}")
-    print()
-    
-    # Overall status
-    overall_ok = tags_ok and generate_ok
-    print(f"Overall Status: {'✓ POD LIVE' if overall_ok else '✗ POD ISSUES'}")
-    
-    # Cost awareness
-    print()
-    print(f"Cost Awareness: Pod costs ${COST_PER_HOUR}/hour")
-    print(f"Verification time: {check_time.strftime('%Y-%m-%d %H:%M:%S')}")
-    
-    # Write results to file
-    results = {
-        "pod_id": POD_ID,
-        "endpoint": ENDPOINT,
-        "timestamp": check_time.isoformat(),
-        "api_tags_ok": tags_ok,
-        "api_tags_time": tags_time,
-        "models": models,
-        "generate_ok": generate_ok,
-        "generate_time": generate_time,
-        "generate_response": response[:200] if response else "",
-        "overall_ok": overall_ok,
-        "cost_per_hour": COST_PER_HOUR
-    }
-    
-    with open("big_brain_verification.json", "w") as f:
-        json.dump(results, f, indent=2)
-    
-    print()
-    print("Results saved to big_brain_verification.json")
-    
-    # Exit with appropriate code
-    sys.exit(0 if overall_ok else 1)
-
-if __name__ == "__main__":
-    main()
--- a/uni-wizard/daemons/health_daemon.py
+++ b/uni-wizard/daemons/health_daemon.py
@@ -24,7 +24,7 @@ class HealthCheckHandler(BaseHTTPRequestHandler):
        # Suppress default logging
        pass
    
-    def do_GET(self):
+def do_GET(self):
        """Handle GET requests"""
        if self.path == '/health':
            self.send_health_response()