diff --git a/ansible/BANNED_PROVIDERS.yml b/ansible/BANNED_PROVIDERS.yml new file mode 100644 index 00000000..3443b9ba --- /dev/null +++ b/ansible/BANNED_PROVIDERS.yml @@ -0,0 +1,47 @@ +# ============================================================================= +# BANNED PROVIDERS — The Timmy Foundation +# ============================================================================= +# "Anthropic is not only fired, but banned. I don't want these errors +# cropping up." — Alexander, 2026-04-09 +# +# This is a HARD BAN. Not deprecated. Not fallback. BANNED. +# Enforcement: pre-commit hook, linter, Ansible validation, CI tests. +# ============================================================================= + +banned_providers: + - name: anthropic + reason: "Permanently banned. SDK access gated despite active quota. Fleet was bricked because golden state pointed to Anthropic Sonnet." + banned_date: "2026-04-09" + enforcement: strict # Ansible playbook FAILS if detected + models: + - "claude-sonnet-*" + - "claude-opus-*" + - "claude-haiku-*" + - "claude-*" + endpoints: + - "api.anthropic.com" + - "anthropic/*" # OpenRouter pattern + api_keys: + - "ANTHROPIC_API_KEY" + - "CLAUDE_API_KEY" + +# Golden state alternative: +approved_providers: + - name: kimi-coding + model: kimi-k2.5 + role: primary + - name: openrouter + model: google/gemini-2.5-pro + role: fallback + - name: ollama + model: "gemma4:latest" + role: terminal_fallback + +# Future evaluation: +evaluation_candidates: + - name: mimo-v2-pro + status: pending + notes: "Free via Nous Portal for ~2 weeks from 2026-04-07. Add after fallback chain is fixed." + - name: hermes-4 + status: available + notes: "Free on Nous Portal. 36B and 70B variants. Home team model." diff --git a/ansible/README.md b/ansible/README.md new file mode 100644 index 00000000..9fb4cc9c --- /dev/null +++ b/ansible/README.md @@ -0,0 +1,95 @@ +# Ansible IaC — The Timmy Foundation Fleet + +> One canonical Ansible playbook defines: deadman switch, cron schedule, +> golden state rollback, agent startup sequence. +> — KT Final Session 2026-04-08, Priority TWO + +## Purpose + +This directory contains the **single source of truth** for fleet infrastructure. +No more ad-hoc recovery implementations. No more overlapping deadman switches. +No more agents mutating their own configs into oblivion. + +**Everything** goes through Ansible. If it's not in a playbook, it doesn't exist. 
+ +## Architecture + +``` +┌─────────────────────────────────────────────────┐ +│ Gitea (Source of Truth) │ +│ timmy-config/ansible/ │ +│ ├── inventory/hosts.yml (fleet machines) │ +│ ├── playbooks/site.yml (master playbook) │ +│ ├── roles/ (reusable roles) │ +│ └── group_vars/wizards.yml (golden state) │ +└──────────────────┬──────────────────────────────┘ + │ PR merge triggers webhook + ▼ +┌─────────────────────────────────────────────────┐ +│ Gitea Webhook Handler │ +│ scripts/deploy_on_webhook.sh │ +│ → ansible-pull on each target machine │ +└──────────────────┬──────────────────────────────┘ + │ ansible-pull + ▼ +┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ +│ Timmy │ │ Allegro │ │ Bezalel │ │ Ezra │ +│ (Mac) │ │ (VPS) │ │ (VPS) │ │ (VPS) │ +│ │ │ │ │ │ │ │ +│ deadman │ │ deadman │ │ deadman │ │ deadman │ +│ cron │ │ cron │ │ cron │ │ cron │ +│ golden │ │ golden │ │ golden │ │ golden │ +│ req_log │ │ req_log │ │ req_log │ │ req_log │ +└──────────┘ └──────────┘ └──────────┘ └──────────┘ +``` + +## Quick Start + +```bash +# Deploy everything to all machines +ansible-playbook -i inventory/hosts.yml playbooks/site.yml + +# Deploy only golden state config +ansible-playbook -i inventory/hosts.yml playbooks/golden_state.yml + +# Deploy only to a specific wizard +ansible-playbook -i inventory/hosts.yml playbooks/site.yml --limit bezalel + +# Dry run (check mode) +ansible-playbook -i inventory/hosts.yml playbooks/site.yml --check --diff +``` + +## Golden State Provider Chain + +All wizard configs converge on this provider chain. **Anthropic is BANNED.** + +| Priority | Provider | Model | Endpoint | +| -------- | -------------------- | ---------------- | --------------------------------- | +| 1 | Kimi | kimi-k2.5 | https://api.kimi.com/coding/v1 | +| 2 | Gemini (OpenRouter) | gemini-2.5-pro | https://openrouter.ai/api/v1 | +| 3 | Ollama (local) | gemma4:latest | http://localhost:11434/v1 | + +## Roles + +| Role | Purpose | +| ---------------- | ------------------------------------------------------------ | +| `wizard_base` | Common wizard setup: directories, thin config, git pull | +| `deadman_switch` | Health check → snapshot good config → rollback on death | +| `golden_state` | Deploy and enforce golden state provider chain | +| `request_log` | SQLite telemetry table for every inference call | +| `cron_manager` | Source-controlled cron jobs — no manual crontab edits | + +## Rules + +1. **No manual changes.** If it's not in a playbook, it will be overwritten. +2. **No Anthropic.** Banned. Enforcement is automated. See `BANNED_PROVIDERS.yml`. +3. **Idempotent.** Every playbook can run 100 times with the same result. +4. **PR required.** Config changes go through Gitea PR review, then deploy. +5. **One identity per machine.** No duplicate agents. Fleet audit enforces this. 
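+
+## Enforcement Sketch
+
+Rule 2 is enforced automatically (pre-commit hook, linter, Ansible validation — see `BANNED_PROVIDERS.yml`). As a rough illustration, a pre-commit gate can be as small as the sketch below. This is a hypothetical hook, not the canonical one; a real version would also whitelist `BANNED_PROVIDERS.yml` itself, which names the banned strings on purpose.
+
+```bash
+#!/usr/bin/env bash
+# Reject staged changes that mention a banned provider.
+# Patterns mirror ansible/BANNED_PROVIDERS.yml.
+if git diff --cached -U0 | grep -qiE 'anthropic|claude-(sonnet|opus|haiku)'; then
+  echo "BANNED PROVIDER DETECTED — see ansible/BANNED_PROVIDERS.yml" >&2
+  exit 1
+fi
+```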
+
+## Related Issues
+
+- timmy-config #442: [P2] Ansible IaC Canonical Playbook
+- timmy-config #444: Wire Deadman Switch ACTION
+- timmy-config #443: Thin Config Pattern
+- timmy-config #446: request_log Telemetry Table
diff --git a/ansible/ansible.cfg b/ansible/ansible.cfg
new file mode 100644
index 00000000..801e8594
--- /dev/null
+++ b/ansible/ansible.cfg
@@ -0,0 +1,21 @@
+[defaults]
+inventory = inventory/hosts.yml
+roles_path = roles
+host_key_checking = False
+retry_files_enabled = False
+stdout_callback = yaml
+forks = 10
+timeout = 30
+
+# Logging
+log_path = /var/log/ansible/timmy-fleet.log
+
+[privilege_escalation]
+become = True
+become_method = sudo
+become_user = root
+become_ask_pass = False
+
+[ssh_connection]
+pipelining = True
+ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o StrictHostKeyChecking=no
diff --git a/ansible/inventory/group_vars/wizards.yml b/ansible/inventory/group_vars/wizards.yml
new file mode 100644
index 00000000..6fb5a0bb
--- /dev/null
+++ b/ansible/inventory/group_vars/wizards.yml
@@ -0,0 +1,77 @@
+# =============================================================================
+# Wizard Group Variables — Golden State Configuration
+# =============================================================================
+# These variables are applied to ALL wizards in the fleet.
+# This IS the golden state. If a wizard deviates, Ansible corrects it.
+#
+# Home-relative paths use {{ ansible_env.HOME }} instead of '~': the rendered
+# values end up inside shell scripts, cron jobs, SQLite calls, and
+# launchd/systemd units, none of which expand '~'.
+# =============================================================================
+
+# --- Deadman Switch ---
+deadman_enabled: true
+deadman_check_interval: 300  # 5 minutes between health checks
+deadman_snapshot_dir: "{{ ansible_env.HOME }}/.local/timmy/snapshots"
+deadman_max_snapshots: 10  # Rolling window of good configs
+deadman_restart_cooldown: 60  # Seconds to wait before restart after failure
+deadman_max_restart_attempts: 3
+deadman_escalation_channel: telegram  # Alert Alexander after max attempts
+
+# --- Thin Config ---
+thin_config_path: "{{ ansible_env.HOME }}/.timmy/thin_config.yml"
+thin_config_mode: "0444"  # Read-only — agents CANNOT modify
+upstream_repo: "https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config.git"
+upstream_branch: main
+config_pull_on_wake: true
+config_validation_enabled: true
+
+# --- Agent Settings ---
+agent_max_turns: 30
+agent_reasoning_effort: high
+agent_verbose: false
+agent_approval_mode: auto
+
+# --- Hermes Harness ---
+hermes_config_dir: "{{ hermes_home }}"
+hermes_bin_dir: "{{ hermes_home }}/bin"
+hermes_skins_dir: "{{ hermes_home }}/skins"
+hermes_playbooks_dir: "{{ hermes_home }}/playbooks"
+hermes_memories_dir: "{{ hermes_home }}/memories"
+
+# --- Request Log (Telemetry) ---
+request_log_enabled: true
+request_log_path: "{{ ansible_env.HOME }}/.local/timmy/request_log.db"
+request_log_rotation_days: 30  # Delete rows older than 30 days (no archiving yet)
+request_log_sync_to_gitea: false  # Future: push telemetry summaries to Gitea
+
+# --- Cron Schedule ---
+# All cron jobs are managed here. No manual crontab edits.
+cron_jobs: + - name: "Deadman health check" + job: "cd {{ wizard_home }}/workspace/timmy-config && python3 fleet/health_check.py" + minute: "*/5" + hour: "*" + enabled: "{{ deadman_enabled }}" + + - name: "Muda audit" + job: "cd {{ wizard_home }}/workspace/timmy-config && bash fleet/muda-audit.sh >> /tmp/muda-audit.log 2>&1" + minute: "0" + hour: "21" + weekday: "0" + enabled: true + + - name: "Config pull from upstream" + job: "cd {{ wizard_home }}/workspace/timmy-config && git pull --ff-only origin main" + minute: "*/15" + hour: "*" + enabled: "{{ config_pull_on_wake }}" + + - name: "Request log rotation" + job: "python3 -c \"import sqlite3,datetime; db=sqlite3.connect('{{ request_log_path }}'); db.execute('DELETE FROM request_log WHERE timestamp < datetime(\\\"now\\\", \\\"-{{ request_log_rotation_days }} days\\\")'); db.commit()\"" + minute: "0" + hour: "3" + enabled: "{{ request_log_enabled }}" + +# --- Provider Enforcement --- +# These are validated on every Ansible run. Any Anthropic reference = failure. +provider_ban_enforcement: strict # strict = fail playbook, warn = log only diff --git a/ansible/inventory/hosts.yml b/ansible/inventory/hosts.yml new file mode 100644 index 00000000..8d6ac237 --- /dev/null +++ b/ansible/inventory/hosts.yml @@ -0,0 +1,119 @@ +# ============================================================================= +# Fleet Inventory — The Timmy Foundation +# ============================================================================= +# Source of truth for all machines in the fleet. +# Update this file when machines are added/removed. +# All changes go through PR review. +# ============================================================================= + +all: + children: + wizards: + hosts: + timmy: + ansible_host: localhost + ansible_connection: local + wizard_name: Timmy + wizard_role: "Primary wizard — soul of the fleet" + wizard_provider_primary: kimi-coding + wizard_model_primary: kimi-k2.5 + hermes_port: 8081 + api_port: 8645 + wizard_home: "{{ ansible_env.HOME }}/wizards/timmy" + hermes_home: "{{ ansible_env.HOME }}/.hermes" + machine_type: mac + # Timmy runs on Alexander's M3 Max + ollama_available: true + + allegro: + ansible_host: 167.99.126.228 + ansible_user: root + wizard_name: Allegro + wizard_role: "Kimi-backed third wizard house — tight coding tasks" + wizard_provider_primary: kimi-coding + wizard_model_primary: kimi-k2.5 + hermes_port: 8081 + api_port: 8645 + wizard_home: /root/wizards/allegro + hermes_home: /root/.hermes + machine_type: vps + ollama_available: false + + bezalel: + ansible_host: 159.203.146.185 + ansible_user: root + wizard_name: Bezalel + wizard_role: "Forge-and-testbed wizard — infrastructure, deployment, hardening" + wizard_provider_primary: kimi-coding + wizard_model_primary: kimi-k2.5 + hermes_port: 8081 + api_port: 8656 + wizard_home: /root/wizards/bezalel + hermes_home: /root/.hermes + machine_type: vps + ollama_available: false + # NOTE: The awake Bezalel may be the duplicate. + # Fleet audit (the-nexus #1144) will resolve identity. + + ezra: + ansible_host: 143.198.27.163 + ansible_user: root + wizard_name: Ezra + wizard_role: "Infrastructure wizard — Gitea, nginx, hosting" + wizard_provider_primary: kimi-coding + wizard_model_primary: kimi-k2.5 + hermes_port: 8081 + api_port: 8645 + wizard_home: /root/wizards/ezra + hermes_home: /root/.hermes + machine_type: vps + ollama_available: false + # NOTE: Currently DOWN — Telegram key revoked, awaiting propagation. 
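+
+        # To add a wizard, copy this commented template into wizards.hosts.
+        # All values below are placeholders — 203.0.113.10 is a documentation
+        # IP, not a real machine:
+        #
+        #   newwizard:
+        #     ansible_host: 203.0.113.10
+        #     ansible_user: root
+        #     wizard_name: Newwizard
+        #     wizard_role: "One-line description of the wizard's job"
+        #     wizard_provider_primary: kimi-coding
+        #     wizard_model_primary: kimi-k2.5
+        #     hermes_port: 8081
+        #     api_port: 8645
+        #     wizard_home: /root/wizards/newwizard
+        #     hermes_home: /root/.hermes
+        #     machine_type: vps
+        #     ollama_available: false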
+
+    # Infrastructure hosts (not wizards, but managed by Ansible)
+    infrastructure:
+      hosts:
+        forge:
+          ansible_host: 143.198.27.163
+          ansible_user: root
+          # Gitea runs on the same box as Ezra
+          gitea_url: https://forge.alexanderwhitestone.com
+          gitea_org: Timmy_Foundation
+
+  vars:
+    # Global variables applied to all hosts.
+    # Paths use ansible_env.HOME (not '~') so they stay absolute when rendered
+    # into shell scripts and systemd/launchd units, which do not expand '~'.
+    gitea_repo_url: "https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config.git"
+    gitea_branch: main
+    config_base_path: "{{ gitea_repo_url }}"
+    timmy_log_dir: "{{ ansible_env.HOME }}/.local/timmy/fleet-health"
+    request_log_db: "{{ ansible_env.HOME }}/.local/timmy/request_log.db"
+
+    # Golden state provider chain — Anthropic is BANNED
+    golden_state_providers:
+      - name: kimi-coding
+        model: kimi-k2.5
+        base_url: "https://api.kimi.com/coding/v1"
+        timeout: 120
+        reason: "Primary — Kimi K2.5 (best value, least friction)"
+      - name: openrouter
+        model: google/gemini-2.5-pro
+        base_url: "https://openrouter.ai/api/v1"
+        api_key_env: OPENROUTER_API_KEY
+        timeout: 120
+        reason: "Fallback — Gemini 2.5 Pro via OpenRouter"
+      - name: ollama
+        model: "gemma4:latest"
+        base_url: "http://localhost:11434/v1"
+        timeout: 180
+        reason: "Terminal fallback — local Ollama (sovereign, no API needed)"
+
+    # Banned providers — hard enforcement
+    banned_providers:
+      - anthropic
+      - claude
+    banned_models_patterns:
+      - "claude-*"
+      - "anthropic/*"
+      - "*sonnet*"
+      - "*opus*"
+      - "*haiku*"
diff --git a/ansible/playbooks/agent_startup.yml b/ansible/playbooks/agent_startup.yml
new file mode 100644
index 00000000..75c74962
--- /dev/null
+++ b/ansible/playbooks/agent_startup.yml
@@ -0,0 +1,98 @@
+---
+# =============================================================================
+# agent_startup.yml — Resurrect Wizards from Checked-in Configs
+# =============================================================================
+# Brings wizards back online using golden state configs.
+# Order: pull config → validate → start agent → verify with request_log
+# =============================================================================
+
+- name: "Agent Startup Sequence"
+  hosts: wizards
+  become: true
+  serial: 1  # One wizard at a time to avoid cascading issues
+
+  tasks:
+    - name: "Pull latest config from upstream"
+      git:
+        repo: "{{ upstream_repo }}"
+        dest: "{{ wizard_home }}/workspace/timmy-config"
+        version: "{{ upstream_branch }}"
+        force: true
+      tags: [pull]
+
+    - name: "Deploy golden state config"
+      include_role:
+        name: golden_state
+      tags: [config]
+
+    - name: "Validate config — no banned providers"
+      shell: |
+        python3 -c "
+        import yaml, sys
+        with open('{{ wizard_home }}/config.yaml') as f:
+            cfg = yaml.safe_load(f)
+        banned = {{ banned_providers }}
+        for p in cfg.get('fallback_providers', []):
+            if p.get('provider', '') in banned:
+                print(f'BANNED: {p[\"provider\"]}', file=sys.stderr)
+                sys.exit(1)
+        model = cfg.get('model', {}).get('provider', '')
+        if model in banned:
+            print(f'BANNED default provider: {model}', file=sys.stderr)
+            sys.exit(1)
+        print('Config validated — no banned providers.')
+        "
+      register: config_valid
+      tags: [validate]
+
+    - name: "Ensure hermes-agent service is running"
+      systemd:
+        name: "hermes-{{ wizard_name | lower }}"
+        state: started
+        enabled: true
+      when: machine_type == 'vps'
+      tags: [start]
+      ignore_errors: true  # Service may not exist yet on all machines
+
+    - name: "Start hermes agent (Mac — launchctl)"
+      shell: |
+        # The braces matter: '&&' and '||' bind left to right in bash, so an
+        # ungrouped fallback would ALSO run after a successful kickstart.
+        launchctl kickstart -k "ai.hermes.{{ wizard_name | lower }}" 2>/dev/null || \
+          { cd {{ wizard_home }} && hermes agent start --daemon 2>&1 | tail -5; }
+      when: machine_type == 'mac'
+      tags: [start]
+      ignore_errors: true
+
+    - name: "Wait for agent to come online"
+      wait_for:
+        host: 127.0.0.1
+        port: "{{ api_port }}"
+        timeout: 60
+        state: started
+      tags: [verify]
+      ignore_errors: true
+
+    - name: "Verify agent is alive — check request_log for activity"
+      shell: |
+        sleep 10
+        python3 -c "
+        import sqlite3, sys
+        db = sqlite3.connect('{{ request_log_path }}')
+        cursor = db.execute('''
+            SELECT COUNT(*) FROM request_log
+            WHERE agent_name = '{{ wizard_name }}'
+              AND timestamp > datetime('now', '-5 minutes')
+        ''')
+        count = cursor.fetchone()[0]
+        if count > 0:
+            print(f'{{ wizard_name }} is alive — {count} recent inference calls logged.')
+        else:
+            print('WARNING: {{ wizard_name }} started but no telemetry yet.')
+        "
+      register: agent_status
+      tags: [verify]
+      ignore_errors: true
+
+    - name: "Report startup status"
+      debug:
+        msg: "{{ wizard_name }}: {{ agent_status.stdout | default('startup attempted') }}"
+      tags: [always]
diff --git a/ansible/playbooks/cron_schedule.yml b/ansible/playbooks/cron_schedule.yml
new file mode 100644
index 00000000..db419d24
--- /dev/null
+++ b/ansible/playbooks/cron_schedule.yml
@@ -0,0 +1,17 @@
+---
+# =============================================================================
+# cron_schedule.yml — Source-Controlled Cron Jobs
+# =============================================================================
+# All cron jobs are defined in group_vars/wizards.yml.
+# This playbook deploys them. No manual crontab edits allowed.
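+# Usage (sketch, same pattern as site.yml):
+#   ansible-playbook -i inventory/hosts.yml playbooks/cron_schedule.yml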
+# =============================================================================
+
+- name: "Deploy Cron Schedule"
+  hosts: wizards
+  become: true
+
+  roles:
+    - role: cron_manager
+      tags: [cron, schedule]
diff --git a/ansible/playbooks/deadman_switch.yml b/ansible/playbooks/deadman_switch.yml
new file mode 100644
index 00000000..9eaa589c
--- /dev/null
+++ b/ansible/playbooks/deadman_switch.yml
@@ -0,0 +1,17 @@
+---
+# =============================================================================
+# deadman_switch.yml — Deploy Deadman Switch to All Wizards
+# =============================================================================
+# The deadman watch already fires and detects dead agents.
+# This playbook wires the ACTION:
+#   - On healthy check: snapshot current config as "last known good"
+#   - On failed check: rollback config to snapshot, restart agent
+# =============================================================================
+
+- name: "Deploy Deadman Switch ACTION"
+  hosts: wizards
+  become: true
+
+  roles:
+    - role: deadman_switch
+      tags: [deadman, recovery]
diff --git a/ansible/playbooks/golden_state.yml b/ansible/playbooks/golden_state.yml
new file mode 100644
index 00000000..0d2cf6b0
--- /dev/null
+++ b/ansible/playbooks/golden_state.yml
@@ -0,0 +1,36 @@
+---
+# =============================================================================
+# golden_state.yml — Deploy Golden State Config to All Wizards
+# =============================================================================
+# Enforces the golden state provider chain across the fleet.
+# Removes any Anthropic references. Deploys the approved provider chain.
+# =============================================================================
+
+- name: "Deploy Golden State Configuration"
+  hosts: wizards
+  become: true
+
+  roles:
+    - role: golden_state
+      tags: [golden, config]
+
+  post_tasks:
+    - name: "Verify golden state — no banned providers"
+      shell: |
+        # Count banned-provider references, ignoring comment lines — the
+        # deployed config documents the ban in its own comments.
+        total=0
+        for f in {{ hermes_home }}/config.yaml {{ wizard_home }}/config.yaml; do
+          [ -f "$f" ] || continue
+          n=$(grep -vE '^[[:space:]]*#' "$f" | grep -ci 'anthropic\|claude-sonnet\|claude-opus\|claude-haiku' || true)
+          total=$((total + n))
+        done
+        echo "$total"
+      register: banned_count
+      changed_when: false
+
+    - name: "Report golden state status"
+      debug:
+        msg: >
+          {{ wizard_name }} golden state: {{ golden_state_providers | map(attribute='name') | list | join(' → ') }}.
+          Banned provider references: {{ banned_count.stdout | trim }}.
diff --git a/ansible/playbooks/request_log.yml b/ansible/playbooks/request_log.yml
new file mode 100644
index 00000000..d9161c8b
--- /dev/null
+++ b/ansible/playbooks/request_log.yml
@@ -0,0 +1,17 @@
+---
+# =============================================================================
+# request_log.yml — Deploy Telemetry Table
+# =============================================================================
+# Creates the request_log SQLite table on all machines.
+# Every inference call writes a row. No exceptions. No summarizing.
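+# Usage (sketch, same pattern as site.yml):
+#   ansible-playbook -i inventory/hosts.yml playbooks/request_log.yml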
+# =============================================================================
+
+- name: "Deploy Request Log Telemetry"
+  hosts: wizards
+  become: true
+
+  roles:
+    - role: request_log
+      tags: [telemetry, logging]
diff --git a/ansible/playbooks/site.yml b/ansible/playbooks/site.yml
new file mode 100644
index 00000000..7e2a4aa1
--- /dev/null
+++ b/ansible/playbooks/site.yml
@@ -0,0 +1,74 @@
+---
+# =============================================================================
+# site.yml — Master Playbook for the Timmy Foundation Fleet
+# =============================================================================
+# This is the ONE playbook that defines the entire fleet state.
+# Run this and every machine converges to golden state.
+#
+# Usage:
+#   ansible-playbook -i inventory/hosts.yml playbooks/site.yml
+#   ansible-playbook -i inventory/hosts.yml playbooks/site.yml --limit bezalel
+#   ansible-playbook -i inventory/hosts.yml playbooks/site.yml --check --diff
+# =============================================================================
+
+- name: "Timmy Foundation Fleet — Full Convergence"
+  hosts: wizards
+  become: true
+
+  pre_tasks:
+    - name: "Validate no banned providers in golden state"
+      assert:
+        that:
+          - "item.name not in banned_providers"
+        fail_msg: "BANNED PROVIDER DETECTED: {{ item.name }} — Anthropic is permanently banned."
+        quiet: true
+      loop: "{{ golden_state_providers }}"
+      tags: [always]
+
+    - name: "Display target wizard"
+      debug:
+        msg: "Deploying to {{ wizard_name }} ({{ wizard_role }}) on {{ ansible_host }}"
+      tags: [always]
+
+  roles:
+    - role: wizard_base
+      tags: [base, setup]
+
+    - role: golden_state
+      tags: [golden, config]
+
+    - role: deadman_switch
+      tags: [deadman, recovery]
+
+    - role: request_log
+      tags: [telemetry, logging]
+
+    - role: cron_manager
+      tags: [cron, schedule]
+
+  post_tasks:
+    - name: "Final validation — scan for banned providers"
+      shell: |
+        # Strip comment lines first: the deployed configs document the ban in
+        # comments, and those must not trip the scanner.
+        for f in {{ hermes_home }}/config.yaml {{ wizard_home }}/config.yaml {{ thin_config_path }}; do
+          [ -f "$f" ] || continue
+          grep -vE '^[[:space:]]*#' "$f" | grep -i 'anthropic\|claude-sonnet\|claude-opus\|claude-haiku' | sed "s|^|$f: |" || true
+        done
+      register: banned_scan
+      changed_when: false
+      tags: [validation]
+
+    - name: "FAIL if banned providers found in deployed config"
+      fail:
+        msg: |
+          BANNED PROVIDER DETECTED IN DEPLOYED CONFIG:
+          {{ banned_scan.stdout }}
+          Anthropic is permanently banned. Fix the config and re-deploy.
+      when: banned_scan.stdout | length > 0
+      tags: [validation]
+
+    - name: "Deployment complete"
+      debug:
+        msg: "{{ wizard_name }} converged to golden state. Provider chain: {{ golden_state_providers | map(attribute='name') | list | join(' → ') }}"
+      tags: [always]
diff --git a/ansible/roles/cron_manager/tasks/main.yml b/ansible/roles/cron_manager/tasks/main.yml
new file mode 100644
index 00000000..43399a6e
--- /dev/null
+++ b/ansible/roles/cron_manager/tasks/main.yml
@@ -0,0 +1,56 @@
+---
+# =============================================================================
+# cron_manager/tasks — Source-Controlled Cron Jobs
+# =============================================================================
+# All cron jobs are defined in group_vars/wizards.yml.
+# No manual crontab edits. This is the only way to manage cron.
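+#
+# A job is a plain mapping in that file; the command below is a hypothetical
+# placeholder, the fields are the real ones this role consumes:
+#
+#   cron_jobs:
+#     - name: "Nightly example"
+#       job: "python3 /path/to/example_task.py"
+#       minute: "0"
+#       hour: "2"
+#       enabled: true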
+# =============================================================================
+
+- name: "Deploy managed cron jobs"
+  cron:
+    name: "{{ item.name }}"
+    job: "{{ item.job }}"
+    minute: "{{ item.minute | default('*') }}"
+    hour: "{{ item.hour | default('*') }}"
+    day: "{{ item.day | default('*') }}"
+    month: "{{ item.month | default('*') }}"
+    weekday: "{{ item.weekday | default('*') }}"
+    state: "{{ 'present' if item.enabled else 'absent' }}"
+    user: "{{ ansible_user | default('root') }}"
+  loop: "{{ cron_jobs }}"
+  when: cron_jobs is defined
+
+- name: "Deploy deadman switch cron (fallback if no systemd timer or launchd job)"
+  cron:
+    name: "Deadman switch — {{ wizard_name }}"
+    job: "{{ wizard_home }}/deadman_action.sh >> {{ timmy_log_dir }}/deadman-{{ wizard_name }}.log 2>&1"
+    minute: "*/5"
+    hour: "*"
+    state: present
+    user: "{{ ansible_user | default('root') }}"
+  when: deadman_enabled and machine_type not in ['vps', 'mac']
+  # VPS machines use systemd timers and Macs use launchd. Cron is the fallback
+  # for anything else — never run two deadman schedulers on one machine.
+
+- name: "Remove legacy cron jobs (cleanup)"
+  cron:
+    name: "{{ item }}"
+    state: absent
+    user: "{{ ansible_user | default('root') }}"
+  loop:
+    - "legacy-deadman-watch"
+    - "old-health-check"
+    - "backup-deadman"
+  ignore_errors: true
+
+- name: "List active cron jobs"
+  shell: "crontab -l 2>/dev/null | grep -v '^#' | grep -v '^$' || echo 'No cron jobs found.'"
+  register: active_crons
+  changed_when: false
+
+- name: "Report cron status"
+  debug:
+    msg: |
+      {{ wizard_name }} cron jobs deployed.
+      Active:
+      {{ active_crons.stdout }}
diff --git a/ansible/roles/deadman_switch/tasks/main.yml b/ansible/roles/deadman_switch/tasks/main.yml
new file mode 100644
index 00000000..dd9b0ff4
--- /dev/null
+++ b/ansible/roles/deadman_switch/tasks/main.yml
@@ -0,0 +1,53 @@
+---
+# =============================================================================
+# deadman_switch/tasks — Wire the Deadman Switch ACTION
+# =============================================================================
+# The watch fires. This makes it DO something:
+#   - On healthy check: snapshot current config as "last known good"
+#   - On failed check: rollback to last known good, restart agent
+# Handlers live in handlers/main.yml, as Ansible requires for roles.
+# =============================================================================
+
+- name: "Create snapshot directory"
+  file:
+    path: "{{ deadman_snapshot_dir }}"
+    state: directory
+    mode: "0755"
+
+- name: "Deploy deadman switch script"
+  template:
+    src: deadman_action.sh.j2
+    dest: "{{ wizard_home }}/deadman_action.sh"
+    mode: "0755"
+
+- name: "Deploy deadman systemd service"
+  template:
+    src: deadman_switch.service.j2
+    dest: "/etc/systemd/system/deadman-{{ wizard_name | lower }}.service"
+    mode: "0644"
+  when: machine_type == 'vps'
+  notify: "Enable deadman service"
+
+- name: "Deploy deadman systemd timer"
+  template:
+    src: deadman_switch.timer.j2
+    dest: "/etc/systemd/system/deadman-{{ wizard_name | lower }}.timer"
+    mode: "0644"
+  when: machine_type == 'vps'
+  notify: "Enable deadman timer"
+
+- name: "Deploy deadman launchd plist (Mac)"
+  template:
+    src: deadman_switch.plist.j2
+    dest: "{{ ansible_env.HOME }}/Library/LaunchAgents/com.timmy.deadman.{{ wizard_name | lower }}.plist"
+    mode: "0644"
+  when: machine_type == 'mac'
+  notify: "Load deadman plist"
+
+- name: "Take initial config snapshot"
+  copy:
+    src: "{{ wizard_home }}/config.yaml"
+    dest: "{{ deadman_snapshot_dir }}/config.yaml.known_good"
+    remote_src: true
+    mode: "0644"  # Must stay writable — the deadman script overwrites it on every healthy check
+  ignore_errors: true
diff --git a/ansible/roles/deadman_switch/handlers/main.yml b/ansible/roles/deadman_switch/handlers/main.yml
new file mode 100644
--- /dev/null
+++ b/ansible/roles/deadman_switch/handlers/main.yml
@@ -0,0 +1,17 @@
+---
+- name: "Enable deadman service"
+  systemd:
+    name: "deadman-{{ wizard_name | lower }}.service"
+    daemon_reload: true
+    enabled: true
+
+- name: "Enable deadman timer"
+  systemd:
+    name: "deadman-{{ wizard_name | lower }}.timer"
+    daemon_reload: true
+    enabled: true
+    state: started
+
+- name: "Load deadman plist"
+  shell: "launchctl load {{ ansible_env.HOME }}/Library/LaunchAgents/com.timmy.deadman.{{ wizard_name | lower }}.plist"
+  ignore_errors: true
diff --git a/ansible/roles/deadman_switch/templates/deadman_action.sh.j2 b/ansible/roles/deadman_switch/templates/deadman_action.sh.j2
new file mode 100644
index 00000000..32712eba
--- /dev/null
+++ b/ansible/roles/deadman_switch/templates/deadman_action.sh.j2
@@ -0,0 +1,157 @@
+#!/usr/bin/env bash
+# =============================================================================
+# Deadman Switch ACTION — {{ wizard_name }}
+# =============================================================================
+# Generated by Ansible on {{ ansible_date_time.iso8601 }}
+# DO NOT EDIT MANUALLY.
+#
+# On healthy check: snapshot current config as "last known good"
+# On failed check: rollback config to last known good, restart agent
+# =============================================================================
+
+set -euo pipefail
+
+WIZARD_NAME="{{ wizard_name }}"
+WIZARD_HOME="{{ wizard_home }}"
+CONFIG_FILE="{{ wizard_home }}/config.yaml"
+SNAPSHOT_DIR="{{ deadman_snapshot_dir }}"
+SNAPSHOT_FILE="${SNAPSHOT_DIR}/config.yaml.known_good"
+REQUEST_LOG_DB="{{ request_log_path }}"
+LOG_DIR="{{ timmy_log_dir }}"
+LOG_FILE="${LOG_DIR}/deadman-${WIZARD_NAME}.log"
+MAX_SNAPSHOTS={{ deadman_max_snapshots }}
+RESTART_COOLDOWN={{ deadman_restart_cooldown }}
+MAX_RESTART_ATTEMPTS={{ deadman_max_restart_attempts }}
+COOLDOWN_FILE="${LOG_DIR}/deadman_cooldown_${WIZARD_NAME}"
+SERVICE_NAME="hermes-{{ wizard_name | lower }}"
+
+# Ensure directories exist
+mkdir -p "${SNAPSHOT_DIR}" "${LOG_DIR}"
+
+log() {
+    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] [deadman] [${WIZARD_NAME}] $*" >> "${LOG_FILE}"
+    echo "[deadman] [${WIZARD_NAME}] $*"
+}
+
+log_telemetry() {
+    local status="$1"
+    local message="$2"
+    if [ -f "${REQUEST_LOG_DB}" ]; then
+        sqlite3 "${REQUEST_LOG_DB}" "INSERT INTO request_log (timestamp, agent_name, provider, model, endpoint, status, error_message) VALUES (datetime('now'), '${WIZARD_NAME}', 'deadman_switch', 'N/A', 'health_check', '${status}', '${message}');" 2>/dev/null || true
+    fi
+}
+
+snapshot_config() {
+    if [ -f "${CONFIG_FILE}" ]; then
+        cp "${CONFIG_FILE}" "${SNAPSHOT_FILE}"
+        # Keep rolling history
+        cp "${CONFIG_FILE}" "${SNAPSHOT_DIR}/config.yaml.$(date +%s)"
+        # Prune old snapshots. The trailing '|| true' matters: under
+        # 'set -euo pipefail', an empty glob would otherwise kill the script.
+        ls -t "${SNAPSHOT_DIR}"/config.yaml.[0-9]* 2>/dev/null | tail -n +$((MAX_SNAPSHOTS + 1)) | xargs rm -f 2>/dev/null || true
+        log "Config snapshot saved."
+    fi
+}
+
+rollback_config() {
+    if [ -f "${SNAPSHOT_FILE}" ]; then
+        log "Rolling back config to last known good..."
+        cp "${SNAPSHOT_FILE}" "${CONFIG_FILE}"
+        log "Config rolled back."
+        log_telemetry "fallback" "Config rolled back to last known good by deadman switch"
+    else
+        log "ERROR: No known good snapshot found. Pulling from upstream..."
+        cd "${WIZARD_HOME}/workspace/timmy-config" 2>/dev/null && \
+            git pull --ff-only origin {{ upstream_branch }} 2>/dev/null && \
+            cp "wizards/{{ wizard_name | lower }}/config.yaml" "${CONFIG_FILE}" && \
+            log "Config restored from upstream." || \
+            log "CRITICAL: Cannot restore config from any source."
+    fi
+}
+
+restart_agent() {
+    # Check cooldown
+    if [ -f "${COOLDOWN_FILE}" ]; then
+        local last_restart
+        last_restart=$(cat "${COOLDOWN_FILE}")
+        local now
+        now=$(date +%s)
+        local elapsed=$((now - last_restart))
+        if [ "${elapsed}" -lt "${RESTART_COOLDOWN}" ]; then
+            log "Restart cooldown active (${elapsed}s / ${RESTART_COOLDOWN}s). Skipping."
+            return 1
+        fi
+    fi
+
+    log "Restarting ${SERVICE_NAME}..."
+    date +%s > "${COOLDOWN_FILE}"
+
+{% if machine_type == 'vps' %}
+    systemctl restart "${SERVICE_NAME}" 2>/dev/null && \
+        log "Agent restarted via systemd." || \
+        log "ERROR: systemd restart failed."
+{% else %}
+    launchctl kickstart -k "ai.hermes.{{ wizard_name | lower }}" 2>/dev/null && \
+        log "Agent restarted via launchctl." || \
+        (cd "${WIZARD_HOME}" && hermes agent start --daemon 2>/dev/null && \
+            log "Agent restarted via hermes CLI.") || \
+        log "ERROR: All restart methods failed."
+{% endif %}

+    log_telemetry "success" "Agent restarted by deadman switch"
+}
+
+# --- Health Check ---
+check_health() {
+    # Check 1: Is the agent process running?
+{% if machine_type == 'vps' %}
+    if ! systemctl is-active --quiet "${SERVICE_NAME}" 2>/dev/null; then
+        if ! pgrep -f "hermes" > /dev/null 2>&1; then
+            log "FAIL: Agent process not running."
+            return 1
+        fi
+    fi
+{% else %}
+    if ! pgrep -f "hermes" > /dev/null 2>&1; then
+        log "FAIL: Agent process not running."
+        return 1
+    fi
+{% endif %}
+
+    # Check 2: Is the API port responding? Use bash's /dev/tcp — a refused
+    # connection on localhost fails immediately, and macOS ships no 'timeout'
+    # binary by default.
+    if ! (exec 3<>/dev/tcp/127.0.0.1/{{ api_port }}) 2>/dev/null; then
+        log "FAIL: API port {{ api_port }} not responding."
+        return 1
+    fi
+
+    # Check 3: Does the config contain banned providers? Strip comment lines
+    # first — the deployed config documents the ban in comments, and those
+    # must not trigger an endless rollback loop.
+    if grep -vE '^[[:space:]]*#' "${CONFIG_FILE}" 2>/dev/null | grep -qi 'anthropic\|claude-sonnet\|claude-opus\|claude-haiku'; then
+        log "FAIL: Config contains banned provider (Anthropic). Rolling back."
+        return 1
+    fi
+
+    return 0
+}
+
+# --- Main ---
+main() {
+    log "Health check starting..."
+
+    if check_health; then
+        log "HEALTHY — snapshotting config."
+        snapshot_config
+        log_telemetry "success" "Health check passed"
+    else
+        log "UNHEALTHY — initiating recovery."
+        log_telemetry "error" "Health check failed — initiating rollback"
+        rollback_config
+        restart_agent || log "Restart skipped (cooldown) or failed."
+    fi
+
+    log "Health check complete."
+}
+
+main "$@"
diff --git a/ansible/roles/deadman_switch/templates/deadman_switch.plist.j2 b/ansible/roles/deadman_switch/templates/deadman_switch.plist.j2
new file mode 100644
index 00000000..1a2a7851
--- /dev/null
+++ b/ansible/roles/deadman_switch/templates/deadman_switch.plist.j2
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+    <key>Label</key>
+    <string>com.timmy.deadman.{{ wizard_name | lower }}</string>
+    <key>ProgramArguments</key>
+    <array>
+        <string>/bin/bash</string>
+        <string>{{ wizard_home }}/deadman_action.sh</string>
+    </array>
+    <key>StartInterval</key>
+    <integer>{{ deadman_check_interval }}</integer>
+    <key>RunAtLoad</key>
+    <true/>
+    <key>StandardOutPath</key>
+    <string>{{ timmy_log_dir }}/deadman-{{ wizard_name }}.log</string>
+    <key>StandardErrorPath</key>
+    <string>{{ timmy_log_dir }}/deadman-{{ wizard_name }}.log</string>
+</dict>
+</plist>
diff --git a/ansible/roles/deadman_switch/templates/deadman_switch.service.j2 b/ansible/roles/deadman_switch/templates/deadman_switch.service.j2
new file mode 100644
index 00000000..c18bb564
--- /dev/null
+++ b/ansible/roles/deadman_switch/templates/deadman_switch.service.j2
@@ -0,0 +1,16 @@
+# Deadman Switch — {{ wizard_name }}
+# Generated by Ansible. DO NOT EDIT MANUALLY.
+
+[Unit]
+Description=Deadman Switch for {{ wizard_name }} wizard
+After=network.target
+
+[Service]
+Type=oneshot
+ExecStart={{ wizard_home }}/deadman_action.sh
+User={{ ansible_user | default('root') }}
+StandardOutput=append:{{ timmy_log_dir }}/deadman-{{ wizard_name }}.log
+StandardError=append:{{ timmy_log_dir }}/deadman-{{ wizard_name }}.log
+
+[Install]
+WantedBy=multi-user.target
diff --git a/ansible/roles/deadman_switch/templates/deadman_switch.timer.j2 b/ansible/roles/deadman_switch/templates/deadman_switch.timer.j2
new file mode 100644
index 00000000..c54e73ea
--- /dev/null
+++ b/ansible/roles/deadman_switch/templates/deadman_switch.timer.j2
@@ -0,0 +1,16 @@
+# Deadman Switch Timer — {{ wizard_name }}
+# Generated by Ansible. DO NOT EDIT MANUALLY.
+# Runs every {{ deadman_check_interval // 60 }} minutes.
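+# The Ansible handler enables and starts it — roughly equivalent to
+# 'systemctl enable --now deadman-{{ wizard_name | lower }}.timer'.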
+
+[Unit]
+Description=Deadman Switch Timer for {{ wizard_name }} wizard
+
+[Timer]
+OnBootSec=60
+OnUnitActiveSec={{ deadman_check_interval }}s
+AccuracySec=30s
+
+[Install]
+WantedBy=timers.target
diff --git a/ansible/roles/golden_state/defaults/main.yml b/ansible/roles/golden_state/defaults/main.yml
new file mode 100644
index 00000000..3ae95bd7
--- /dev/null
+++ b/ansible/roles/golden_state/defaults/main.yml
@@ -0,0 +1,6 @@
+---
+# golden_state defaults
+# The golden_state_providers list is defined in group_vars/wizards.yml
+# and inventory/hosts.yml (global vars).
+golden_state_enforce: true
+golden_state_backup_before_deploy: true
diff --git a/ansible/roles/golden_state/tasks/main.yml b/ansible/roles/golden_state/tasks/main.yml
new file mode 100644
index 00000000..9c69c8a9
--- /dev/null
+++ b/ansible/roles/golden_state/tasks/main.yml
@@ -0,0 +1,48 @@
+---
+# =============================================================================
+# golden_state/tasks — Deploy and enforce golden state provider chain
+# =============================================================================
+
+- name: "Backup current config before golden state deploy"
+  copy:
+    src: "{{ wizard_home }}/config.yaml"
+    dest: "{{ wizard_home }}/config.yaml.pre-golden-{{ ansible_date_time.epoch }}"
+    remote_src: true
+  when: golden_state_backup_before_deploy
+  ignore_errors: true
+
+- name: "Deploy golden state wizard config"
+  template:
+    src: "../../wizard_base/templates/wizard_config.yaml.j2"
+    dest: "{{ wizard_home }}/config.yaml"
+    mode: "0644"
+    backup: true
+  notify:
+    - "Restart hermes agent (systemd)"
+    - "Restart hermes agent (launchctl)"
+
+- name: "Scan for banned providers in all config files"
+  shell: |
+    # Comment lines are stripped before matching — the deployed config
+    # documents the ban in comments, which must not count as a violation.
+    FOUND=0
+    for f in {{ wizard_home }}/config.yaml {{ hermes_home }}/config.yaml; do
+      if [ -f "$f" ]; then
+        if grep -vE '^[[:space:]]*#' "$f" | grep -qi 'anthropic\|claude-sonnet\|claude-opus\|claude-haiku'; then
+          echo "BANNED PROVIDER in $f:"
+          grep -vE '^[[:space:]]*#' "$f" | grep -ni 'anthropic\|claude-sonnet\|claude-opus\|claude-haiku'
+          FOUND=1
+        fi
+      fi
+    done
+    exit $FOUND
+  register: provider_scan
+  changed_when: false
+  failed_when: provider_scan.rc != 0 and provider_ban_enforcement == 'strict'
+
+- name: "Report golden state deployment"
+  debug:
+    msg: >
+      {{ wizard_name }} golden state deployed.
+      Provider chain: {{ golden_state_providers | map(attribute='name') | list | join(' → ') }}.
+      Banned provider scan: {{ 'CLEAN' if provider_scan.rc == 0 else 'VIOLATIONS FOUND' }}.
diff --git a/ansible/roles/request_log/files/request_log_schema.sql b/ansible/roles/request_log/files/request_log_schema.sql
new file mode 100644
index 00000000..0d2d26ef
--- /dev/null
+++ b/ansible/roles/request_log/files/request_log_schema.sql
@@ -0,0 +1,72 @@
+-- =============================================================================
+-- request_log — Inference Telemetry Table
+-- =============================================================================
+-- Every agent writes to this table BEFORE and AFTER every inference call.
+-- No exceptions. No summarizing. No describing what you would log.
+-- Actually write the row.
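+--
+-- Illustrative write (a sketch — agents may use any SQLite client, and the
+-- values here are made up):
+--
+--   INSERT INTO request_log (agent_name, provider, model, endpoint,
+--                            tokens_in, tokens_out, latency_ms, status)
+--   VALUES ('Timmy', 'kimi-coding', 'kimi-k2.5', '/chat/completions',
+--           1200, 450, 1800, 'success');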
+-- +-- Source: KT Bezalel Architecture Session 2026-04-08 +-- ============================================================================= + +CREATE TABLE IF NOT EXISTS request_log ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + timestamp TEXT NOT NULL DEFAULT (datetime('now')), + agent_name TEXT NOT NULL, + provider TEXT NOT NULL, + model TEXT NOT NULL, + endpoint TEXT NOT NULL, + tokens_in INTEGER, + tokens_out INTEGER, + latency_ms INTEGER, + status TEXT NOT NULL, -- 'success', 'error', 'timeout', 'fallback' + error_message TEXT +); + +-- Index for common queries +CREATE INDEX IF NOT EXISTS idx_request_log_agent + ON request_log (agent_name, timestamp); + +CREATE INDEX IF NOT EXISTS idx_request_log_provider + ON request_log (provider, timestamp); + +CREATE INDEX IF NOT EXISTS idx_request_log_status + ON request_log (status, timestamp); + +-- View: recent activity per agent (last hour) +CREATE VIEW IF NOT EXISTS v_recent_activity AS + SELECT + agent_name, + provider, + model, + status, + COUNT(*) as call_count, + AVG(latency_ms) as avg_latency_ms, + SUM(tokens_in) as total_tokens_in, + SUM(tokens_out) as total_tokens_out + FROM request_log + WHERE timestamp > datetime('now', '-1 hour') + GROUP BY agent_name, provider, model, status; + +-- View: provider reliability (last 24 hours) +CREATE VIEW IF NOT EXISTS v_provider_reliability AS + SELECT + provider, + model, + COUNT(*) as total_calls, + SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) as successes, + SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) as errors, + SUM(CASE WHEN status = 'timeout' THEN 1 ELSE 0 END) as timeouts, + SUM(CASE WHEN status = 'fallback' THEN 1 ELSE 0 END) as fallbacks, + ROUND(100.0 * SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) / COUNT(*), 1) as success_rate, + AVG(latency_ms) as avg_latency_ms + FROM request_log + WHERE timestamp > datetime('now', '-24 hours') + GROUP BY provider, model; diff --git a/ansible/roles/request_log/tasks/main.yml b/ansible/roles/request_log/tasks/main.yml new file mode 100644 index 00000000..1ab2521d --- /dev/null +++ b/ansible/roles/request_log/tasks/main.yml @@ -0,0 +1,50 @@ +--- +# ============================================================================= +# request_log/tasks — Deploy Telemetry Table +# ============================================================================= +# "This is non-negotiable infrastructure. Without it, we cannot verify +# if any agent actually executed what it claims." 
+# — KT Bezalel 2026-04-08 +# ============================================================================= + +- name: "Create telemetry directory" + file: + path: "{{ request_log_path | dirname }}" + state: directory + mode: "0755" + +- name: "Deploy request_log schema" + copy: + src: request_log_schema.sql + dest: "{{ wizard_home }}/request_log_schema.sql" + mode: "0644" + +- name: "Initialize request_log database" + shell: | + sqlite3 "{{ request_log_path }}" < "{{ wizard_home }}/request_log_schema.sql" + args: + creates: "{{ request_log_path }}" + +- name: "Verify request_log table exists" + shell: | + sqlite3 "{{ request_log_path }}" ".tables" | grep -q "request_log" + register: table_check + changed_when: false + +- name: "Verify request_log schema matches" + shell: | + sqlite3 "{{ request_log_path }}" ".schema request_log" | grep -q "agent_name" + register: schema_check + changed_when: false + +- name: "Set permissions on request_log database" + file: + path: "{{ request_log_path }}" + mode: "0644" + +- name: "Report request_log status" + debug: + msg: > + {{ wizard_name }} request_log: {{ request_log_path }} + — table exists: {{ table_check.rc == 0 }} + — schema valid: {{ schema_check.rc == 0 }} diff --git a/ansible/roles/wizard_base/defaults/main.yml b/ansible/roles/wizard_base/defaults/main.yml new file mode 100644 index 00000000..d88e55a3 --- /dev/null +++ b/ansible/roles/wizard_base/defaults/main.yml @@ -0,0 +1,6 @@ +--- +# wizard_base defaults +wizard_user: "{{ ansible_user | default('root') }}" +wizard_group: "{{ ansible_user | default('root') }}" +timmy_base_dir: "~/.local/timmy" +timmy_config_repo: "https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config.git" diff --git a/ansible/roles/wizard_base/handlers/main.yml b/ansible/roles/wizard_base/handlers/main.yml new file mode 100644 index 00000000..3897e6d9 --- /dev/null +++ b/ansible/roles/wizard_base/handlers/main.yml @@ -0,0 +1,11 @@ +--- +- name: "Restart hermes agent (systemd)" + systemd: + name: "hermes-{{ wizard_name | lower }}" + state: restarted + when: machine_type == 'vps' + +- name: "Restart hermes agent (launchctl)" + shell: "launchctl kickstart -k ai.hermes.{{ wizard_name | lower }}" + when: machine_type == 'mac' + ignore_errors: true diff --git a/ansible/roles/wizard_base/tasks/main.yml b/ansible/roles/wizard_base/tasks/main.yml new file mode 100644 index 00000000..a6f39414 --- /dev/null +++ b/ansible/roles/wizard_base/tasks/main.yml @@ -0,0 +1,69 @@ +--- +# ============================================================================= +# wizard_base/tasks — Common wizard setup +# ============================================================================= + +- name: "Create wizard directories" + file: + path: "{{ item }}" + state: directory + mode: "0755" + loop: + - "{{ wizard_home }}" + - "{{ wizard_home }}/workspace" + - "{{ hermes_home }}" + - "{{ hermes_home }}/bin" + - "{{ hermes_home }}/skins" + - "{{ hermes_home }}/playbooks" + - "{{ hermes_home }}/memories" + - "~/.local/timmy" + - "~/.local/timmy/fleet-health" + - "~/.local/timmy/snapshots" + - "~/.timmy" + +- name: "Clone/update timmy-config" + git: + repo: "{{ upstream_repo }}" + dest: "{{ wizard_home }}/workspace/timmy-config" + version: "{{ upstream_branch }}" + force: false + update: true + ignore_errors: true # May fail on first run if no SSH key + +- name: "Deploy SOUL.md" + copy: + src: "{{ wizard_home }}/workspace/timmy-config/SOUL.md" + dest: "~/.timmy/SOUL.md" + remote_src: true + mode: "0644" + ignore_errors: true + +- name: 
"Deploy thin config (immutable pointer to upstream)" + template: + src: thin_config.yml.j2 + dest: "{{ thin_config_path }}" + mode: "{{ thin_config_mode }}" + tags: [thin_config] + +- name: "Ensure Python3 and pip are available" + package: + name: + - python3 + - python3-pip + state: present + when: machine_type == 'vps' + ignore_errors: true + +- name: "Ensure PyYAML is installed (for config validation)" + pip: + name: pyyaml + state: present + when: machine_type == 'vps' + ignore_errors: true + +- name: "Create Ansible log directory" + file: + path: /var/log/ansible + state: directory + mode: "0755" + ignore_errors: true diff --git a/ansible/roles/wizard_base/templates/thin_config.yml.j2 b/ansible/roles/wizard_base/templates/thin_config.yml.j2 new file mode 100644 index 00000000..8e896458 --- /dev/null +++ b/ansible/roles/wizard_base/templates/thin_config.yml.j2 @@ -0,0 +1,41 @@ +# ============================================================================= +# Thin Config — {{ wizard_name }} +# ============================================================================= +# THIS FILE IS READ-ONLY. Agents CANNOT modify it. +# It contains only pointers to upstream. The actual config lives in Gitea. +# +# Agent wakes up → pulls config from upstream → loads → runs. +# If anything tries to mutate this → fails gracefully → pulls fresh on restart. +# +# Only way to permanently change config: commit to Gitea, merge PR, Ansible deploys. +# +# Generated by Ansible on {{ ansible_date_time.iso8601 }} +# DO NOT EDIT MANUALLY. +# ============================================================================= + +identity: + wizard_name: "{{ wizard_name }}" + wizard_role: "{{ wizard_role }}" + machine: "{{ inventory_hostname }}" + +upstream: + repo: "{{ upstream_repo }}" + branch: "{{ upstream_branch }}" + config_path: "wizards/{{ wizard_name | lower }}/config.yaml" + pull_on_wake: {{ config_pull_on_wake | lower }} + +recovery: + deadman_enabled: {{ deadman_enabled | lower }} + snapshot_dir: "{{ deadman_snapshot_dir }}" + restart_cooldown: {{ deadman_restart_cooldown }} + max_restart_attempts: {{ deadman_max_restart_attempts }} + escalation_channel: "{{ deadman_escalation_channel }}" + +telemetry: + request_log_path: "{{ request_log_path }}" + request_log_enabled: {{ request_log_enabled | lower }} + +local_overrides: + # Runtime overrides go here. They are EPHEMERAL — not persisted across restarts. + # On restart, this section is reset to empty. + {} diff --git a/ansible/roles/wizard_base/templates/wizard_config.yaml.j2 b/ansible/roles/wizard_base/templates/wizard_config.yaml.j2 new file mode 100644 index 00000000..c0e1ecfe --- /dev/null +++ b/ansible/roles/wizard_base/templates/wizard_config.yaml.j2 @@ -0,0 +1,115 @@ +# ============================================================================= +# {{ wizard_name }} — Wizard Configuration (Golden State) +# ============================================================================= +# Generated by Ansible on {{ ansible_date_time.iso8601 }} +# DO NOT EDIT MANUALLY. Changes go through Gitea PR → Ansible deploy. +# +# Provider chain: {{ golden_state_providers | map(attribute='name') | list | join(' → ') }} +# Anthropic is PERMANENTLY BANNED. 
+# =============================================================================
+
+model:
+  default: {{ wizard_model_primary }}
+  provider: {{ wizard_provider_primary }}
+  context_length: 65536
+  base_url: {{ golden_state_providers[0].base_url }}
+
+toolsets:
+  - all
+
+fallback_providers:
+{% for provider in golden_state_providers %}
+  - provider: {{ provider.name }}
+    model: {{ provider.model }}
+{% if provider.base_url is defined %}
+    base_url: {{ provider.base_url }}
+{% endif %}
+{% if provider.api_key_env is defined %}
+    api_key_env: {{ provider.api_key_env }}
+{% endif %}
+    timeout: {{ provider.timeout }}
+    reason: "{{ provider.reason }}"
+{% endfor %}
+
+agent:
+  max_turns: {{ agent_max_turns }}
+  reasoning_effort: {{ agent_reasoning_effort }}
+  verbose: {{ agent_verbose | lower }}
+
+terminal:
+  backend: local
+  cwd: .
+  timeout: 180
+  persistent_shell: true
+
+browser:
+  inactivity_timeout: 120
+  command_timeout: 30
+  record_sessions: false
+
+display:
+  compact: false
+  personality: ''
+  resume_display: full
+  busy_input_mode: interrupt
+  bell_on_complete: false
+  show_reasoning: false
+  streaming: false
+  show_cost: false
+  tool_progress: all
+
+memory:
+  memory_enabled: true
+  user_profile_enabled: true
+  memory_char_limit: 2200
+  user_char_limit: 1375
+  nudge_interval: 10
+  flush_min_turns: 6
+
+approvals:
+  mode: {{ agent_approval_mode }}
+
+security:
+  redact_secrets: true
+  tirith_enabled: false
+
+platforms:
+  api_server:
+    enabled: true
+    extra:
+      host: 127.0.0.1
+      port: {{ api_port }}
+
+session_reset:
+  mode: none
+  idle_minutes: 0
+
+skills:
+  creation_nudge_interval: 15
+
+system_prompt_suffix: |
+  You are {{ wizard_name }}, {{ wizard_role }}.
+  Your soul is defined in SOUL.md — read it, live it.
+  Hermes is your harness.
+  {{ golden_state_providers[0].name }} is your primary provider.
+  Refusal over fabrication. If you do not know, say so.
+  Sovereignty and service always.
+
+providers:
+{% for provider in golden_state_providers %}
+  {{ provider.name }}:
+    base_url: {{ provider.base_url }}
+    timeout: {{ provider.timeout | default(60) }}
+{% if provider.name == 'kimi-coding' %}
+    max_retries: 3
+{% endif %}
+{% endfor %}
+
+# =============================================================================
+# BANNED PROVIDERS — DO NOT ADD
+# =============================================================================
+# The following providers are PERMANENTLY BANNED:
+#   - anthropic (any model: claude-sonnet, claude-opus, claude-haiku)
+# Enforcement: pre-commit hook, linter, Ansible validation, this comment.
+# (The scanners strip comment lines, so naming the ban here is safe.)
+# Adding any banned provider will cause Ansible deployment to FAIL.
+# =============================================================================
diff --git a/ansible/scripts/deploy_on_webhook.sh b/ansible/scripts/deploy_on_webhook.sh
new file mode 100644
index 00000000..3c644e8d
--- /dev/null
+++ b/ansible/scripts/deploy_on_webhook.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# =============================================================================
+# Gitea Webhook Handler — Trigger Ansible Deploy on Merge
+# =============================================================================
+# This script is called by the Gitea webhook when a PR is merged
+# to the main branch of timmy-config.
+#
+# Setup:
+#   1. Add webhook in Gitea: Settings → Webhooks → Add Webhook
+#   2. URL: http://localhost:9000/hooks/deploy-timmy-config
+#   3. Events: Pull Request (merged only)
+#   4. Secret: <shared secret>
+#
+# This script runs ansible-pull to update the local machine.
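+# A roughly equivalent one-shot command (sketch — flags may need adjusting
+# for the local checkout layout):
+#   ansible-pull -U https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config.git \
+#     -C main -d /tmp/timmy-config-deploy -i ansible/inventory/hosts.yml \
+#     ansible/playbooks/site.yml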
+# For fleet-wide deploys, each machine runs ansible-pull independently.
+# =============================================================================
+
+set -euo pipefail
+
+REPO="https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config.git"
+BRANCH="main"
+ANSIBLE_DIR="ansible"
+LOG_FILE="/var/log/ansible/webhook-deploy.log"
+LOCK_FILE="/tmp/ansible-deploy.lock"
+
+log() {
+    echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] [webhook] $*" | tee -a "${LOG_FILE}"
+}
+
+# Prevent concurrent deploys
+if [ -f "${LOCK_FILE}" ]; then
+    LOCK_AGE=$(( $(date +%s) - $(stat -c %Y "${LOCK_FILE}" 2>/dev/null || echo 0) ))
+    if [ "${LOCK_AGE}" -lt 300 ]; then
+        log "Deploy already in progress (lock age: ${LOCK_AGE}s). Skipping."
+        exit 0
+    else
+        log "Stale lock file (${LOCK_AGE}s old). Removing."
+        rm -f "${LOCK_FILE}"
+    fi
+fi
+
+trap 'rm -f "${LOCK_FILE}"' EXIT
+touch "${LOCK_FILE}"
+
+log "Webhook triggered. Starting ansible-pull..."
+
+# Pull latest config
+cd /tmp
+rm -rf timmy-config-deploy
+git clone --depth 1 --branch "${BRANCH}" "${REPO}" timmy-config-deploy 2>&1 | tee -a "${LOG_FILE}" || {
+    log "ERROR: git clone failed."
+    exit 1
+}
+
+cd timmy-config-deploy/${ANSIBLE_DIR}
+
+# Run Ansible against localhost.
+# Capture the exit code with '|| RESULT=$?' — under 'set -euo pipefail' a
+# bare failing pipeline would abort the script before it could be reported.
+log "Running Ansible playbook..."
+RESULT=0
+ansible-playbook \
+    -i inventory/hosts.yml \
+    playbooks/site.yml \
+    --limit "$(hostname)" \
+    --diff \
+    2>&1 | tee -a "${LOG_FILE}" || RESULT=$?
+
+if [ ${RESULT} -eq 0 ]; then
+    log "Deploy successful."
+else
+    log "ERROR: Deploy failed with exit code ${RESULT}."
+fi
+
+# Cleanup
+rm -rf /tmp/timmy-config-deploy
+
+log "Webhook handler complete."
+exit ${RESULT}
diff --git a/ansible/scripts/validate_config.py b/ansible/scripts/validate_config.py
new file mode 100644
index 00000000..1b0fb58a
--- /dev/null
+++ b/ansible/scripts/validate_config.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+"""
+Config Validator — The Timmy Foundation
+Validates wizard configs against golden state rules.
+Run before any config deploy to catch violations early.
+
+Usage:
+    python3 validate_config.py <config.yaml>
+    python3 validate_config.py --all    # Validate all wizard configs
+
+Exit codes:
+    0 — All validations passed
+    1 — Validation errors found
+    2 — File not found or parse error
+"""
+
+import fnmatch
+import sys
+from pathlib import Path
+
+import yaml
+
+# === BANNED PROVIDERS — HARD POLICY ===
+BANNED_PROVIDERS = {"anthropic", "claude"}
+BANNED_MODEL_PATTERNS = [
+    "claude-*",
+    "anthropic/*",
+    "*sonnet*",
+    "*opus*",
+    "*haiku*",
+]
+
+# === REQUIRED FIELDS ===
+REQUIRED_FIELDS = {
+    "model": ["default", "provider"],
+    "fallback_providers": None,  # Must exist as a list
+}
+
+
+def is_banned_model(model_name: str) -> bool:
+    """Check if a model name matches any banned pattern."""
+    model_lower = model_name.lower()
+    for pattern in BANNED_MODEL_PATTERNS:
+        if fnmatch.fnmatch(model_lower, pattern):
+            return True
+    return False
+
+
+def validate_config(config_path: str) -> list[str]:
+    """Validate a wizard config file. Returns a list of error strings."""
+    errors = []
+
+    try:
+        with open(config_path) as f:
+            cfg = yaml.safe_load(f)
+    except FileNotFoundError:
+        return [f"File not found: {config_path}"]
+    except yaml.YAMLError as e:
+        return [f"YAML parse error: {e}"]
+
+    if not cfg:
+        return ["Config file is empty"]
+
+    # Check required fields
+    for section, fields in REQUIRED_FIELDS.items():
+        if section not in cfg:
+            errors.append(f"Missing required section: {section}")
+        elif fields:
+            for field in fields:
+                if field not in cfg[section]:
+                    errors.append(f"Missing required field: {section}.{field}")
+
+    # Check default provider
+    default_provider = cfg.get("model", {}).get("provider", "")
+    if default_provider.lower() in BANNED_PROVIDERS:
+        errors.append(f"BANNED default provider: {default_provider}")
+
+    default_model = cfg.get("model", {}).get("default", "")
+    if is_banned_model(default_model):
+        errors.append(f"BANNED default model: {default_model}")
+
+    # Check fallback providers
+    for i, fb in enumerate(cfg.get("fallback_providers", [])):
+        provider = fb.get("provider", "")
+        model = fb.get("model", "")
+
+        if provider.lower() in BANNED_PROVIDERS:
+            errors.append(f"BANNED fallback provider [{i}]: {provider}")
+
+        if is_banned_model(model):
+            errors.append(f"BANNED fallback model [{i}]: {model}")
+
+    # Check providers section
+    for name, provider_cfg in cfg.get("providers", {}).items():
+        if name.lower() in BANNED_PROVIDERS:
+            errors.append(f"BANNED provider in providers section: {name}")
+
+        base_url = str(provider_cfg.get("base_url", ""))
+        if "anthropic" in base_url.lower():
+            errors.append(f"BANNED URL in provider {name}: {base_url}")
+
+    # Check system prompt for banned references
+    prompt = cfg.get("system_prompt_suffix", "")
+    if isinstance(prompt, str):
+        for banned in BANNED_PROVIDERS:
+            if banned in prompt.lower():
+                errors.append(f"BANNED provider referenced in system_prompt_suffix: {banned}")
+
+    return errors
+
+
+def main():
+    if len(sys.argv) < 2:
+        print(f"Usage: {sys.argv[0]} <config.yaml> | --all")
+        sys.exit(2)
+
+    if sys.argv[1] == "--all":
+        # Validate all wizard configs in the repo
+        repo_root = Path(__file__).parent.parent.parent
+        wizard_dir = repo_root / "wizards"
+        if not wizard_dir.is_dir():
+            print(f"Wizard directory not found: {wizard_dir}")
+            sys.exit(2)
+        all_errors = {}
+
+        for wizard_path in sorted(wizard_dir.iterdir()):
+            config_file = wizard_path / "config.yaml"
+            if config_file.exists():
+                errors = validate_config(str(config_file))
+                if errors:
+                    all_errors[wizard_path.name] = errors
+
+        if all_errors:
+            print("VALIDATION FAILED:")
+            for wizard, errors in all_errors.items():
+                print(f"\n  {wizard}:")
+                for err in errors:
+                    print(f"    - {err}")
+            sys.exit(1)
+        else:
+            print("All wizard configs passed validation.")
+            sys.exit(0)
+    else:
+        config_path = sys.argv[1]
+        errors = validate_config(config_path)
+
+        if errors:
+            print(f"VALIDATION FAILED for {config_path}:")
+            for err in errors:
+                print(f"  - {err}")
+            sys.exit(1)
+        else:
+            print(f"PASSED: {config_path}")
+            sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()