feat(ansible): Canonical IaC playbook for fleet management

Implements the Ansible Infrastructure as Code story from KT 2026-04-08. One canonical Ansible playbook defines: - Deadman switch (snapshot good config on health, rollback+restart on death) - Golden state config deployment (Anthropic BANNED, Kimi→Gemini→Ollama) - Cron schedule (source-controlled, no manual crontab edits) - Agent startup sequence (pull→validate→start→verify) - request_log telemetry table (every inference call logged) - Thin config pattern (immutable local pointer to upstream) - Gitea webhook handler (deploy on merge) - Config validator (rejects banned providers) Fleet inventory: Timmy (Mac), Allegro (VPS), Bezalel (VPS), Ezra (VPS) Roles: wizard_base, golden_state, deadman_switch, request_log, cron_manager Addresses: timmy-config #442, #443, #444, #445, #446 References: KT Final 2026-04-08 P2, KT Bezalel 2026-04-08 #1-#5
2026-04-09 22:25:31 +00:00
parent a6fded436f
commit 7ec45642eb
28 changed files with 1571 additions and 0 deletions
--- a/ansible/roles/request_log/files/request_log_schema.sql
+++ b/ansible/roles/request_log/files/request_log_schema.sql
@@ -0,0 +1,64 @@
+-- =============================================================================
+-- request_log — Inference Telemetry Table
+-- =============================================================================
+-- Every agent writes to this table BEFORE and AFTER every inference call.
+-- No exceptions. No summarizing. No describing what you would log.
+-- Actually write the row.
+--
+-- Source: KT Bezalel Architecture Session 2026-04-08
+-- =============================================================================
+
+CREATE TABLE IF NOT EXISTS request_log (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    timestamp TEXT NOT NULL DEFAULT (datetime('now')),
+    agent_name TEXT NOT NULL,
+    provider TEXT NOT NULL,
+    model TEXT NOT NULL,
+    endpoint TEXT NOT NULL,
+    tokens_in INTEGER,
+    tokens_out INTEGER,
+    latency_ms INTEGER,
+    status TEXT NOT NULL,  -- 'success', 'error', 'timeout', 'fallback'
+    error_message TEXT
+);
+
+-- Index for common queries
+CREATE INDEX IF NOT EXISTS idx_request_log_agent
+    ON request_log (agent_name, timestamp);
+
+CREATE INDEX IF NOT EXISTS idx_request_log_provider
+    ON request_log (provider, timestamp);
+
+CREATE INDEX IF NOT EXISTS idx_request_log_status
+    ON request_log (status, timestamp);
+
+-- View: recent activity per agent (last hour)
+CREATE VIEW IF NOT EXISTS v_recent_activity AS
+    SELECT
+        agent_name,
+        provider,
+        model,
+        status,
+        COUNT(*) as call_count,
+        AVG(latency_ms) as avg_latency_ms,
+        SUM(tokens_in) as total_tokens_in,
+        SUM(tokens_out) as total_tokens_out
+    FROM request_log
+    WHERE timestamp > datetime('now', '-1 hour')
+    GROUP BY agent_name, provider, model, status;
+
+-- View: provider reliability (last 24 hours)
+CREATE VIEW IF NOT EXISTS v_provider_reliability AS
+    SELECT
+        provider,
+        model,
+        COUNT(*) as total_calls,
+        SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) as successes,
+        SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) as errors,
+        SUM(CASE WHEN status = 'timeout' THEN 1 ELSE 0 END) as timeouts,
+        SUM(CASE WHEN status = 'fallback' THEN 1 ELSE 0 END) as fallbacks,
+        ROUND(100.0 * SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) / COUNT(*), 1) as success_rate,
+        AVG(latency_ms) as avg_latency_ms
+    FROM request_log
+    WHERE timestamp > datetime('now', '-24 hours')
+    GROUP BY provider, model;