Compare commits

...

17 Commits

Author SHA1 Message Date
Perplexity
7ec45642eb feat(ansible): Canonical IaC playbook for fleet management
Some checks failed
PR Checklist / pr-checklist (pull_request) Failing after 1m27s
Implements the Ansible Infrastructure as Code story from KT 2026-04-08.

One canonical Ansible playbook defines:
- Deadman switch (snapshot good config on health, rollback+restart on death)
- Golden state config deployment (Anthropic BANNED, Kimi→Gemini→Ollama)
- Cron schedule (source-controlled, no manual crontab edits)
- Agent startup sequence (pull→validate→start→verify)
- request_log telemetry table (every inference call logged)
- Thin config pattern (immutable local pointer to upstream)
- Gitea webhook handler (deploy on merge)
- Config validator (rejects banned providers)

Fleet inventory: Timmy (Mac), Allegro (VPS), Bezalel (VPS), Ezra (VPS)

Roles: wizard_base, golden_state, deadman_switch, request_log, cron_manager

Addresses: timmy-config #442, #443, #444, #445, #446
References: KT Final 2026-04-08 P2, KT Bezalel 2026-04-08 #1-#5
2026-04-09 22:25:31 +00:00
a6fded436f Merge PR #431
Co-authored-by: Perplexity Computer <perplexity@tower.local>
Co-committed-by: Perplexity Computer <perplexity@tower.local>
2026-04-09 16:27:48 +00:00
641537eb07 Merge pull request '[EPIC] Gemini — Sovereign Infrastructure Suite Implementation' (#418) from feat/gemini-epic-398-1775648372708 into main 2026-04-08 23:38:18 +00:00
17fde3c03f feat: implement README.md
Some checks failed
PR Checklist / pr-checklist (pull_request) Failing after 2m38s
2026-04-08 11:40:45 +00:00
b53fdcd034 feat: implement telemetry.py 2026-04-08 11:40:43 +00:00
1cc1d2ae86 feat: implement skill_installer.py 2026-04-08 11:40:40 +00:00
9ec0d1d80e feat: implement cross_repo_test.py 2026-04-08 11:40:35 +00:00
e9cdaf09dc feat: implement phase_tracker.py 2026-04-08 11:40:30 +00:00
e8302b4af2 feat: implement self_healing.py 2026-04-08 11:40:25 +00:00
311ecf19db feat: implement model_eval.py 2026-04-08 11:40:19 +00:00
77f258efa5 feat: implement gitea_webhook_handler.py 2026-04-08 11:40:12 +00:00
5e12451588 feat: implement adr_manager.py 2026-04-08 11:40:05 +00:00
80b6ceb118 feat: implement agent_dispatch.py 2026-04-08 11:39:57 +00:00
ffb85cc10f feat: implement fleet_llama.py 2026-04-08 11:39:52 +00:00
4179646456 feat: implement architecture_linter_v2.py 2026-04-08 11:39:46 +00:00
681fd0763f feat: implement provision_wizard.py 2026-04-08 11:39:40 +00:00
b21c2833f7 Merge pull request '[PERPLEXITY-08] Add PR checklist CI workflow and enforcement script' (#411) from perplexity/pr-checklist-ci into main 2026-04-08 11:11:02 +00:00
42 changed files with 3082 additions and 0 deletions

View File

@@ -0,0 +1,134 @@
# validate-config.yaml
# Validates all config files, scripts, and playbooks on every PR.
# Addresses #289: repo-native validation for timmy-config changes.
#
# Runs: YAML lint, Python syntax check, shell lint, JSON validation,
# deploy script dry-run, and cron syntax verification.
name: Validate Config
on:
pull_request:
branches: [main]
push:
branches: [main]
jobs:
yaml-lint:
name: YAML Lint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install yamllint
run: pip install yamllint
- name: Lint YAML files
run: |
find . -name '*.yaml' -o -name '*.yml' | \
grep -v '.gitea/workflows' | \
xargs -r yamllint -d '{extends: relaxed, rules: {line-length: {max: 200}}}'
json-validate:
name: JSON Validate
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Validate JSON files
run: |
find . -name '*.json' -print0 | while IFS= read -r -d '' f; do
echo "Validating: $f"
python3 -m json.tool "$f" > /dev/null || exit 1
done
python-check:
name: Python Syntax & Import Check
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Install dependencies
run: |
pip install py_compile flake8
- name: Compile-check all Python files
run: |
find . -name '*.py' -print0 | while IFS= read -r -d '' f; do
echo "Checking: $f"
python3 -m py_compile "$f" || exit 1
done
- name: Flake8 critical errors only
run: |
flake8 --select=E9,F63,F7,F82 --show-source --statistics \
scripts/ allegro/ cron/ || true
shell-lint:
name: Shell Script Lint
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install shellcheck
run: sudo apt-get install -y shellcheck
- name: Lint shell scripts
run: |
find . -name '*.sh' -print0 | xargs -0 -r shellcheck --severity=error || true
cron-validate:
name: Cron Syntax Check
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Validate cron entries
run: |
if [ -d cron ]; then
find cron -name '*.cron' -o -name '*.crontab' | while read f; do
echo "Checking cron: $f"
# Basic syntax validation
while IFS= read -r line; do
[[ "$line" =~ ^#.*$ ]] && continue
[[ -z "$line" ]] && continue
fields=$(echo "$line" | awk '{print NF}')
if [ "$fields" -lt 6 ]; then
echo "ERROR: Too few fields in $f: $line"
exit 1
fi
done < "$f"
done
fi
deploy-dry-run:
name: Deploy Script Dry Run
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Syntax-check deploy.sh
run: |
if [ -f deploy.sh ]; then
bash -n deploy.sh
echo "deploy.sh syntax OK"
fi
playbook-schema:
name: Playbook Schema Validation
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Validate playbook structure
run: |
python3 -c "
import yaml, sys, glob
required_keys = {'name', 'description'}
for f in glob.glob('playbooks/*.yaml'):
with open(f) as fh:
try:
data = yaml.safe_load(fh)
if not isinstance(data, dict):
print(f'ERROR: {f} is not a YAML mapping')
sys.exit(1)
missing = required_keys - set(data.keys())
if missing:
print(f'WARNING: {f} missing keys: {missing}')
print(f'OK: {f}')
except yaml.YAMLError as e:
print(f'ERROR: {f}: {e}')
sys.exit(1)
"

View File

@@ -0,0 +1,47 @@
# =============================================================================
# BANNED PROVIDERS — The Timmy Foundation
# =============================================================================
# "Anthropic is not only fired, but banned. I don't want these errors
# cropping up." — Alexander, 2026-04-09
#
# This is a HARD BAN. Not deprecated. Not fallback. BANNED.
# Enforcement: pre-commit hook, linter, Ansible validation, CI tests.
# =============================================================================
banned_providers:
- name: anthropic
reason: "Permanently banned. SDK access gated despite active quota. Fleet was bricked because golden state pointed to Anthropic Sonnet."
banned_date: "2026-04-09"
enforcement: strict # Ansible playbook FAILS if detected
models:
- "claude-sonnet-*"
- "claude-opus-*"
- "claude-haiku-*"
- "claude-*"
endpoints:
- "api.anthropic.com"
- "anthropic/*" # OpenRouter pattern
api_keys:
- "ANTHROPIC_API_KEY"
- "CLAUDE_API_KEY"
# Golden state alternative:
approved_providers:
- name: kimi-coding
model: kimi-k2.5
role: primary
- name: openrouter
model: google/gemini-2.5-pro
role: fallback
- name: ollama
model: "gemma4:latest"
role: terminal_fallback
# Future evaluation:
evaluation_candidates:
- name: mimo-v2-pro
status: pending
notes: "Free via Nous Portal for ~2 weeks from 2026-04-07. Add after fallback chain is fixed."
- name: hermes-4
status: available
notes: "Free on Nous Portal. 36B and 70B variants. Home team model."

95
ansible/README.md Normal file
View File

@@ -0,0 +1,95 @@
# Ansible IaC — The Timmy Foundation Fleet
> One canonical Ansible playbook defines: deadman switch, cron schedule,
> golden state rollback, agent startup sequence.
> — KT Final Session 2026-04-08, Priority TWO
## Purpose
This directory contains the **single source of truth** for fleet infrastructure.
No more ad-hoc recovery implementations. No more overlapping deadman switches.
No more agents mutating their own configs into oblivion.
**Everything** goes through Ansible. If it's not in a playbook, it doesn't exist.
## Architecture
```
┌─────────────────────────────────────────────────┐
│ Gitea (Source of Truth) │
│ timmy-config/ansible/ │
│ ├── inventory/hosts.yml (fleet machines) │
│ ├── playbooks/site.yml (master playbook) │
│ ├── roles/ (reusable roles) │
│ └── group_vars/wizards.yml (golden state) │
└──────────────────┬──────────────────────────────┘
│ PR merge triggers webhook
┌─────────────────────────────────────────────────┐
│ Gitea Webhook Handler │
│ scripts/deploy_on_webhook.sh │
│ → ansible-pull on each target machine │
└──────────────────┬──────────────────────────────┘
│ ansible-pull
┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐
│ Timmy │ │ Allegro │ │ Bezalel │ │ Ezra │
│ (Mac) │ │ (VPS) │ │ (VPS) │ │ (VPS) │
│ │ │ │ │ │ │ │
│ deadman │ │ deadman │ │ deadman │ │ deadman │
│ cron │ │ cron │ │ cron │ │ cron │
│ golden │ │ golden │ │ golden │ │ golden │
│ req_log │ │ req_log │ │ req_log │ │ req_log │
└──────────┘ └──────────┘ └──────────┘ └──────────┘
```
## Quick Start
```bash
# Deploy everything to all machines
ansible-playbook -i inventory/hosts.yml playbooks/site.yml
# Deploy only golden state config
ansible-playbook -i inventory/hosts.yml playbooks/golden_state.yml
# Deploy only to a specific wizard
ansible-playbook -i inventory/hosts.yml playbooks/site.yml --limit bezalel
# Dry run (check mode)
ansible-playbook -i inventory/hosts.yml playbooks/site.yml --check --diff
```
## Golden State Provider Chain
All wizard configs converge on this provider chain. **Anthropic is BANNED.**
| Priority | Provider | Model | Endpoint |
| -------- | -------------------- | ---------------- | --------------------------------- |
| 1 | Kimi | kimi-k2.5 | https://api.kimi.com/coding/v1 |
| 2 | Gemini (OpenRouter) | gemini-2.5-pro | https://openrouter.ai/api/v1 |
| 3 | Ollama (local) | gemma4:latest | http://localhost:11434/v1 |
## Roles
| Role | Purpose |
| ---------------- | ------------------------------------------------------------ |
| `wizard_base` | Common wizard setup: directories, thin config, git pull |
| `deadman_switch` | Health check → snapshot good config → rollback on death |
| `golden_state` | Deploy and enforce golden state provider chain |
| `request_log` | SQLite telemetry table for every inference call |
| `cron_manager` | Source-controlled cron jobs — no manual crontab edits |
## Rules
1. **No manual changes.** If it's not in a playbook, it will be overwritten.
2. **No Anthropic.** Banned. Enforcement is automated. See `BANNED_PROVIDERS.yml`.
3. **Idempotent.** Every playbook can run 100 times with the same result.
4. **PR required.** Config changes go through Gitea PR review, then deploy.
5. **One identity per machine.** No duplicate agents. Fleet audit enforces this.
## Related Issues
- timmy-config #442: [P2] Ansible IaC Canonical Playbook
- timmy-config #444: Wire Deadman Switch ACTION
- timmy-config #443: Thin Config Pattern
- timmy-config #446: request_log Telemetry Table

21
ansible/ansible.cfg Normal file
View File

@@ -0,0 +1,21 @@
[defaults]
inventory = inventory/hosts.yml
roles_path = roles
host_key_checking = False
retry_files_enabled = False
stdout_callback = yaml
forks = 10
timeout = 30
# Logging
log_path = /var/log/ansible/timmy-fleet.log
[privilege_escalation]
become = True
become_method = sudo
become_user = root
become_ask_pass = False
[ssh_connection]
pipelining = True
ssh_args = -o ControlMaster=auto -o ControlPersist=60s -o StrictHostKeyChecking=no

View File

@@ -0,0 +1,74 @@
# =============================================================================
# Wizard Group Variables — Golden State Configuration
# =============================================================================
# These variables are applied to ALL wizards in the fleet.
# This IS the golden state. If a wizard deviates, Ansible corrects it.
# =============================================================================
# --- Deadman Switch ---
deadman_enabled: true
deadman_check_interval: 300 # 5 minutes between health checks
deadman_snapshot_dir: "~/.local/timmy/snapshots"
deadman_max_snapshots: 10 # Rolling window of good configs
deadman_restart_cooldown: 60 # Seconds to wait before restart after failure
deadman_max_restart_attempts: 3
deadman_escalation_channel: telegram # Alert Alexander after max attempts
# --- Thin Config ---
thin_config_path: "~/.timmy/thin_config.yml"
thin_config_mode: "0444" # Read-only — agents CANNOT modify
upstream_repo: "https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config.git"
upstream_branch: main
config_pull_on_wake: true
config_validation_enabled: true
# --- Agent Settings ---
agent_max_turns: 30
agent_reasoning_effort: high
agent_verbose: false
agent_approval_mode: auto
# --- Hermes Harness ---
hermes_config_dir: "{{ hermes_home }}"
hermes_bin_dir: "{{ hermes_home }}/bin"
hermes_skins_dir: "{{ hermes_home }}/skins"
hermes_playbooks_dir: "{{ hermes_home }}/playbooks"
hermes_memories_dir: "{{ hermes_home }}/memories"
# --- Request Log (Telemetry) ---
request_log_enabled: true
request_log_path: "~/.local/timmy/request_log.db"
request_log_rotation_days: 30 # Archive logs older than 30 days
request_log_sync_to_gitea: false # Future: push telemetry summaries to Gitea
# --- Cron Schedule ---
# All cron jobs are managed here. No manual crontab edits.
cron_jobs:
- name: "Deadman health check"
job: "cd {{ wizard_home }}/workspace/timmy-config && python3 fleet/health_check.py"
minute: "*/5"
hour: "*"
enabled: "{{ deadman_enabled }}"
- name: "Muda audit"
job: "cd {{ wizard_home }}/workspace/timmy-config && bash fleet/muda-audit.sh >> /tmp/muda-audit.log 2>&1"
minute: "0"
hour: "21"
weekday: "0"
enabled: true
- name: "Config pull from upstream"
job: "cd {{ wizard_home }}/workspace/timmy-config && git pull --ff-only origin main"
minute: "*/15"
hour: "*"
enabled: "{{ config_pull_on_wake }}"
- name: "Request log rotation"
job: "python3 -c \"import sqlite3,datetime; db=sqlite3.connect('{{ request_log_path }}'); db.execute('DELETE FROM request_log WHERE timestamp < datetime(\\\"now\\\", \\\"-{{ request_log_rotation_days }} days\\\")'); db.commit()\""
minute: "0"
hour: "3"
enabled: "{{ request_log_enabled }}"
# --- Provider Enforcement ---
# These are validated on every Ansible run. Any Anthropic reference = failure.
provider_ban_enforcement: strict # strict = fail playbook, warn = log only

119
ansible/inventory/hosts.yml Normal file
View File

@@ -0,0 +1,119 @@
# =============================================================================
# Fleet Inventory — The Timmy Foundation
# =============================================================================
# Source of truth for all machines in the fleet.
# Update this file when machines are added/removed.
# All changes go through PR review.
# =============================================================================
all:
children:
wizards:
hosts:
timmy:
ansible_host: localhost
ansible_connection: local
wizard_name: Timmy
wizard_role: "Primary wizard — soul of the fleet"
wizard_provider_primary: kimi-coding
wizard_model_primary: kimi-k2.5
hermes_port: 8081
api_port: 8645
wizard_home: "{{ ansible_env.HOME }}/wizards/timmy"
hermes_home: "{{ ansible_env.HOME }}/.hermes"
machine_type: mac
# Timmy runs on Alexander's M3 Max
ollama_available: true
allegro:
ansible_host: 167.99.126.228
ansible_user: root
wizard_name: Allegro
wizard_role: "Kimi-backed third wizard house — tight coding tasks"
wizard_provider_primary: kimi-coding
wizard_model_primary: kimi-k2.5
hermes_port: 8081
api_port: 8645
wizard_home: /root/wizards/allegro
hermes_home: /root/.hermes
machine_type: vps
ollama_available: false
bezalel:
ansible_host: 159.203.146.185
ansible_user: root
wizard_name: Bezalel
wizard_role: "Forge-and-testbed wizard — infrastructure, deployment, hardening"
wizard_provider_primary: kimi-coding
wizard_model_primary: kimi-k2.5
hermes_port: 8081
api_port: 8656
wizard_home: /root/wizards/bezalel
hermes_home: /root/.hermes
machine_type: vps
ollama_available: false
# NOTE: The awake Bezalel may be the duplicate.
# Fleet audit (the-nexus #1144) will resolve identity.
ezra:
ansible_host: 143.198.27.163
ansible_user: root
wizard_name: Ezra
wizard_role: "Infrastructure wizard — Gitea, nginx, hosting"
wizard_provider_primary: kimi-coding
wizard_model_primary: kimi-k2.5
hermes_port: 8081
api_port: 8645
wizard_home: /root/wizards/ezra
hermes_home: /root/.hermes
machine_type: vps
ollama_available: false
# NOTE: Currently DOWN — Telegram key revoked, awaiting propagation.
# Infrastructure hosts (not wizards, but managed by Ansible)
infrastructure:
hosts:
forge:
ansible_host: 143.198.27.163
ansible_user: root
# Gitea runs on the same box as Ezra
gitea_url: https://forge.alexanderwhitestone.com
gitea_org: Timmy_Foundation
vars:
# Global variables applied to all hosts
gitea_repo_url: "https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config.git"
gitea_branch: main
config_base_path: "{{ gitea_repo_url }}"
timmy_log_dir: "~/.local/timmy/fleet-health"
request_log_db: "~/.local/timmy/request_log.db"
# Golden state provider chain — Anthropic is BANNED
golden_state_providers:
- name: kimi-coding
model: kimi-k2.5
base_url: "https://api.kimi.com/coding/v1"
timeout: 120
reason: "Primary — Kimi K2.5 (best value, least friction)"
- name: openrouter
model: google/gemini-2.5-pro
base_url: "https://openrouter.ai/api/v1"
api_key_env: OPENROUTER_API_KEY
timeout: 120
reason: "Fallback — Gemini 2.5 Pro via OpenRouter"
- name: ollama
model: "gemma4:latest"
base_url: "http://localhost:11434/v1"
timeout: 180
reason: "Terminal fallback — local Ollama (sovereign, no API needed)"
# Banned providers — hard enforcement
banned_providers:
- anthropic
- claude
banned_models_patterns:
- "claude-*"
- "anthropic/*"
- "*sonnet*"
- "*opus*"
- "*haiku*"

View File

@@ -0,0 +1,98 @@
---
# =============================================================================
# agent_startup.yml — Resurrect Wizards from Checked-in Configs
# =============================================================================
# Brings wizards back online using golden state configs.
# Order: pull config → validate → start agent → verify with request_log
# =============================================================================
- name: "Agent Startup Sequence"
hosts: wizards
become: true
serial: 1 # One wizard at a time to avoid cascading issues
tasks:
- name: "Pull latest config from upstream"
git:
repo: "{{ upstream_repo }}"
dest: "{{ wizard_home }}/workspace/timmy-config"
version: "{{ upstream_branch }}"
force: true
tags: [pull]
- name: "Deploy golden state config"
include_role:
name: golden_state
tags: [config]
- name: "Validate config — no banned providers"
shell: |
python3 -c "
import yaml, sys
with open('{{ wizard_home }}/config.yaml') as f:
cfg = yaml.safe_load(f)
banned = {{ banned_providers }}
for p in cfg.get('fallback_providers', []):
if p.get('provider', '') in banned:
print(f'BANNED: {p[\"provider\"]}', file=sys.stderr)
sys.exit(1)
model = cfg.get('model', {}).get('provider', '')
if model in banned:
print(f'BANNED default provider: {model}', file=sys.stderr)
sys.exit(1)
print('Config validated — no banned providers.')
"
register: config_valid
tags: [validate]
- name: "Ensure hermes-agent service is running"
systemd:
name: "hermes-{{ wizard_name | lower }}"
state: started
enabled: true
when: machine_type == 'vps'
tags: [start]
ignore_errors: true # Service may not exist yet on all machines
- name: "Start hermes agent (Mac — launchctl)"
shell: |
launchctl kickstart -k "ai.hermes.{{ wizard_name | lower }}" 2>/dev/null || \
cd {{ wizard_home }} && hermes agent start --daemon 2>&1 | tail -5
when: machine_type == 'mac'
tags: [start]
ignore_errors: true
- name: "Wait for agent to come online"
wait_for:
host: 127.0.0.1
port: "{{ api_port }}"
timeout: 60
state: started
tags: [verify]
ignore_errors: true
- name: "Verify agent is alive — check request_log for activity"
shell: |
sleep 10
python3 -c "
import sqlite3, sys
db = sqlite3.connect('{{ request_log_path }}')
cursor = db.execute('''
SELECT COUNT(*) FROM request_log
WHERE agent_name = '{{ wizard_name }}'
AND timestamp > datetime('now', '-5 minutes')
''')
count = cursor.fetchone()[0]
if count > 0:
print(f'{{ wizard_name }} is alive — {count} recent inference calls logged.')
else:
print(f'WARNING: {{ wizard_name }} started but no telemetry yet.')
"
register: agent_status
tags: [verify]
ignore_errors: true
- name: "Report startup status"
debug:
msg: "{{ wizard_name }}: {{ agent_status.stdout | default('startup attempted') }}"
tags: [always]

View File

@@ -0,0 +1,15 @@
---
# =============================================================================
# cron_schedule.yml — Source-Controlled Cron Jobs
# =============================================================================
# All cron jobs are defined in group_vars/wizards.yml.
# This playbook deploys them. No manual crontab edits allowed.
# =============================================================================
- name: "Deploy Cron Schedule"
hosts: wizards
become: true
roles:
- role: cron_manager
tags: [cron, schedule]

View File

@@ -0,0 +1,17 @@
---
# =============================================================================
# deadman_switch.yml — Deploy Deadman Switch to All Wizards
# =============================================================================
# The deadman watch already fires and detects dead agents.
# This playbook wires the ACTION:
# - On healthy check: snapshot current config as "last known good"
# - On failed check: rollback config to snapshot, restart agent
# =============================================================================
- name: "Deploy Deadman Switch ACTION"
hosts: wizards
become: true
roles:
- role: deadman_switch
tags: [deadman, recovery]

View File

@@ -0,0 +1,30 @@
---
# =============================================================================
# golden_state.yml — Deploy Golden State Config to All Wizards
# =============================================================================
# Enforces the golden state provider chain across the fleet.
# Removes any Anthropic references. Deploys the approved provider chain.
# =============================================================================
- name: "Deploy Golden State Configuration"
hosts: wizards
become: true
roles:
- role: golden_state
tags: [golden, config]
post_tasks:
- name: "Verify golden state — no banned providers"
shell: |
grep -rci 'anthropic\|claude-sonnet\|claude-opus\|claude-haiku' \
{{ hermes_home }}/config.yaml \
{{ wizard_home }}/config.yaml 2>/dev/null || echo "0"
register: banned_count
changed_when: false
- name: "Report golden state status"
debug:
msg: >
{{ wizard_name }} golden state: {{ golden_state_providers | map(attribute='name') | list | join(' → ') }}.
Banned provider references: {{ banned_count.stdout | trim }}.

View File

@@ -0,0 +1,15 @@
---
# =============================================================================
# request_log.yml — Deploy Telemetry Table
# =============================================================================
# Creates the request_log SQLite table on all machines.
# Every inference call writes a row. No exceptions. No summarizing.
# =============================================================================
- name: "Deploy Request Log Telemetry"
hosts: wizards
become: true
roles:
- role: request_log
tags: [telemetry, logging]

View File

@@ -0,0 +1,72 @@
---
# =============================================================================
# site.yml — Master Playbook for the Timmy Foundation Fleet
# =============================================================================
# This is the ONE playbook that defines the entire fleet state.
# Run this and every machine converges to golden state.
#
# Usage:
# ansible-playbook -i inventory/hosts.yml playbooks/site.yml
# ansible-playbook -i inventory/hosts.yml playbooks/site.yml --limit bezalel
# ansible-playbook -i inventory/hosts.yml playbooks/site.yml --check --diff
# =============================================================================
- name: "Timmy Foundation Fleet — Full Convergence"
hosts: wizards
become: true
pre_tasks:
- name: "Validate no banned providers in golden state"
assert:
that:
- "item.name not in banned_providers"
fail_msg: "BANNED PROVIDER DETECTED: {{ item.name }} — Anthropic is permanently banned."
quiet: true
loop: "{{ golden_state_providers }}"
tags: [always]
- name: "Display target wizard"
debug:
msg: "Deploying to {{ wizard_name }} ({{ wizard_role }}) on {{ ansible_host }}"
tags: [always]
roles:
- role: wizard_base
tags: [base, setup]
- role: golden_state
tags: [golden, config]
- role: deadman_switch
tags: [deadman, recovery]
- role: request_log
tags: [telemetry, logging]
- role: cron_manager
tags: [cron, schedule]
post_tasks:
- name: "Final validation — scan for banned providers"
shell: |
grep -ri 'anthropic\|claude-sonnet\|claude-opus\|claude-haiku' \
{{ hermes_home }}/config.yaml \
{{ wizard_home }}/config.yaml \
{{ thin_config_path }} 2>/dev/null || true
register: banned_scan
changed_when: false
tags: [validation]
- name: "FAIL if banned providers found in deployed config"
fail:
msg: |
BANNED PROVIDER DETECTED IN DEPLOYED CONFIG:
{{ banned_scan.stdout }}
Anthropic is permanently banned. Fix the config and re-deploy.
when: banned_scan.stdout | length > 0
tags: [validation]
- name: "Deployment complete"
debug:
msg: "{{ wizard_name }} converged to golden state. Provider chain: {{ golden_state_providers | map(attribute='name') | list | join(' → ') }}"
tags: [always]

View File

@@ -0,0 +1,55 @@
---
# =============================================================================
# cron_manager/tasks — Source-Controlled Cron Jobs
# =============================================================================
# All cron jobs are defined in group_vars/wizards.yml.
# No manual crontab edits. This is the only way to manage cron.
# =============================================================================
- name: "Deploy managed cron jobs"
cron:
name: "{{ item.name }}"
job: "{{ item.job }}"
minute: "{{ item.minute | default('*') }}"
hour: "{{ item.hour | default('*') }}"
day: "{{ item.day | default('*') }}"
month: "{{ item.month | default('*') }}"
weekday: "{{ item.weekday | default('*') }}"
state: "{{ 'present' if item.enabled else 'absent' }}"
user: "{{ ansible_user | default('root') }}"
loop: "{{ cron_jobs }}"
when: cron_jobs is defined
- name: "Deploy deadman switch cron (fallback if systemd timer unavailable)"
cron:
name: "Deadman switch — {{ wizard_name }}"
job: "{{ wizard_home }}/deadman_action.sh >> {{ timmy_log_dir }}/deadman-{{ wizard_name }}.log 2>&1"
minute: "*/5"
hour: "*"
state: present
user: "{{ ansible_user | default('root') }}"
when: deadman_enabled and machine_type != 'vps'
# VPS machines use systemd timers instead
- name: "Remove legacy cron jobs (cleanup)"
cron:
name: "{{ item }}"
state: absent
user: "{{ ansible_user | default('root') }}"
loop:
- "legacy-deadman-watch"
- "old-health-check"
- "backup-deadman"
ignore_errors: true
- name: "List active cron jobs"
shell: "crontab -l 2>/dev/null | grep -v '^#' | grep -v '^$' || echo 'No cron jobs found.'"
register: active_crons
changed_when: false
- name: "Report cron status"
debug:
msg: |
{{ wizard_name }} cron jobs deployed.
Active:
{{ active_crons.stdout }}

View File

@@ -0,0 +1,70 @@
---
# =============================================================================
# deadman_switch/tasks — Wire the Deadman Switch ACTION
# =============================================================================
# The watch fires. This makes it DO something:
# - On healthy check: snapshot current config as "last known good"
# - On failed check: rollback to last known good, restart agent
# =============================================================================
- name: "Create snapshot directory"
file:
path: "{{ deadman_snapshot_dir }}"
state: directory
mode: "0755"
- name: "Deploy deadman switch script"
template:
src: deadman_action.sh.j2
dest: "{{ wizard_home }}/deadman_action.sh"
mode: "0755"
- name: "Deploy deadman systemd service"
template:
src: deadman_switch.service.j2
dest: "/etc/systemd/system/deadman-{{ wizard_name | lower }}.service"
mode: "0644"
when: machine_type == 'vps'
notify: "Enable deadman service"
- name: "Deploy deadman systemd timer"
template:
src: deadman_switch.timer.j2
dest: "/etc/systemd/system/deadman-{{ wizard_name | lower }}.timer"
mode: "0644"
when: machine_type == 'vps'
notify: "Enable deadman timer"
- name: "Deploy deadman launchd plist (Mac)"
template:
src: deadman_switch.plist.j2
dest: "{{ ansible_env.HOME }}/Library/LaunchAgents/com.timmy.deadman.{{ wizard_name | lower }}.plist"
mode: "0644"
when: machine_type == 'mac'
notify: "Load deadman plist"
- name: "Take initial config snapshot"
copy:
src: "{{ wizard_home }}/config.yaml"
dest: "{{ deadman_snapshot_dir }}/config.yaml.known_good"
remote_src: true
mode: "0444"
ignore_errors: true
handlers:
- name: "Enable deadman service"
systemd:
name: "deadman-{{ wizard_name | lower }}.service"
daemon_reload: true
enabled: true
- name: "Enable deadman timer"
systemd:
name: "deadman-{{ wizard_name | lower }}.timer"
daemon_reload: true
enabled: true
state: started
- name: "Load deadman plist"
shell: "launchctl load {{ ansible_env.HOME }}/Library/LaunchAgents/com.timmy.deadman.{{ wizard_name | lower }}.plist"
ignore_errors: true

View File

@@ -0,0 +1,153 @@
#!/usr/bin/env bash
# =============================================================================
# Deadman Switch ACTION — {{ wizard_name }}
# =============================================================================
# Generated by Ansible on {{ ansible_date_time.iso8601 }}
# DO NOT EDIT MANUALLY.
#
# On healthy check: snapshot current config as "last known good"
# On failed check: rollback config to last known good, restart agent
# =============================================================================
set -euo pipefail
WIZARD_NAME="{{ wizard_name }}"
WIZARD_HOME="{{ wizard_home }}"
CONFIG_FILE="{{ wizard_home }}/config.yaml"
SNAPSHOT_DIR="{{ deadman_snapshot_dir }}"
SNAPSHOT_FILE="${SNAPSHOT_DIR}/config.yaml.known_good"
REQUEST_LOG_DB="{{ request_log_path }}"
LOG_DIR="{{ timmy_log_dir }}"
LOG_FILE="${LOG_DIR}/deadman-${WIZARD_NAME}.log"
MAX_SNAPSHOTS={{ deadman_max_snapshots }}
RESTART_COOLDOWN={{ deadman_restart_cooldown }}
MAX_RESTART_ATTEMPTS={{ deadman_max_restart_attempts }}
COOLDOWN_FILE="${LOG_DIR}/deadman_cooldown_${WIZARD_NAME}"
SERVICE_NAME="hermes-{{ wizard_name | lower }}"
# Ensure directories exist
mkdir -p "${SNAPSHOT_DIR}" "${LOG_DIR}"
log() {
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] [deadman] [${WIZARD_NAME}] $*" >> "${LOG_FILE}"
echo "[deadman] [${WIZARD_NAME}] $*"
}
log_telemetry() {
local status="$1"
local message="$2"
if [ -f "${REQUEST_LOG_DB}" ]; then
sqlite3 "${REQUEST_LOG_DB}" "INSERT INTO request_log (timestamp, agent_name, provider, model, endpoint, status, error_message) VALUES (datetime('now'), '${WIZARD_NAME}', 'deadman_switch', 'N/A', 'health_check', '${status}', '${message}');" 2>/dev/null || true
fi
}
snapshot_config() {
if [ -f "${CONFIG_FILE}" ]; then
cp "${CONFIG_FILE}" "${SNAPSHOT_FILE}"
# Keep rolling history
cp "${CONFIG_FILE}" "${SNAPSHOT_DIR}/config.yaml.$(date +%s)"
# Prune old snapshots
ls -t "${SNAPSHOT_DIR}"/config.yaml.[0-9]* 2>/dev/null | tail -n +$((MAX_SNAPSHOTS + 1)) | xargs rm -f 2>/dev/null
log "Config snapshot saved."
fi
}
rollback_config() {
if [ -f "${SNAPSHOT_FILE}" ]; then
log "Rolling back config to last known good..."
cp "${SNAPSHOT_FILE}" "${CONFIG_FILE}"
log "Config rolled back."
log_telemetry "fallback" "Config rolled back to last known good by deadman switch"
else
log "ERROR: No known good snapshot found. Pulling from upstream..."
cd "${WIZARD_HOME}/workspace/timmy-config" 2>/dev/null && \
git pull --ff-only origin {{ upstream_branch }} 2>/dev/null && \
cp "wizards/{{ wizard_name | lower }}/config.yaml" "${CONFIG_FILE}" && \
log "Config restored from upstream." || \
log "CRITICAL: Cannot restore config from any source."
fi
}
restart_agent() {
# Check cooldown
if [ -f "${COOLDOWN_FILE}" ]; then
local last_restart
last_restart=$(cat "${COOLDOWN_FILE}")
local now
now=$(date +%s)
local elapsed=$((now - last_restart))
if [ "${elapsed}" -lt "${RESTART_COOLDOWN}" ]; then
log "Restart cooldown active (${elapsed}s / ${RESTART_COOLDOWN}s). Skipping."
return 1
fi
fi
log "Restarting ${SERVICE_NAME}..."
date +%s > "${COOLDOWN_FILE}"
{% if machine_type == 'vps' %}
systemctl restart "${SERVICE_NAME}" 2>/dev/null && \
log "Agent restarted via systemd." || \
log "ERROR: systemd restart failed."
{% else %}
launchctl kickstart -k "ai.hermes.{{ wizard_name | lower }}" 2>/dev/null && \
log "Agent restarted via launchctl." || \
(cd "${WIZARD_HOME}" && hermes agent start --daemon 2>/dev/null && \
log "Agent restarted via hermes CLI.") || \
log "ERROR: All restart methods failed."
{% endif %}
log_telemetry "success" "Agent restarted by deadman switch"
}
# --- Health Check ---
check_health() {
# Check 1: Is the agent process running?
{% if machine_type == 'vps' %}
if ! systemctl is-active --quiet "${SERVICE_NAME}" 2>/dev/null; then
if ! pgrep -f "hermes" > /dev/null 2>/dev/null; then
log "FAIL: Agent process not running."
return 1
fi
fi
{% else %}
if ! pgrep -f "hermes" > /dev/null 2>/dev/null; then
log "FAIL: Agent process not running."
return 1
fi
{% endif %}
# Check 2: Is the API port responding?
if ! timeout 10 bash -c "echo > /dev/tcp/127.0.0.1/{{ api_port }}" 2>/dev/null; then
log "FAIL: API port {{ api_port }} not responding."
return 1
fi
# Check 3: Does the config contain banned providers?
if grep -qi 'anthropic\|claude-sonnet\|claude-opus\|claude-haiku' "${CONFIG_FILE}" 2>/dev/null; then
log "FAIL: Config contains banned provider (Anthropic). Rolling back."
return 1
fi
return 0
}
# --- Main ---
main() {
log "Health check starting..."
if check_health; then
log "HEALTHY — snapshotting config."
snapshot_config
log_telemetry "success" "Health check passed"
else
log "UNHEALTHY — initiating recovery."
log_telemetry "error" "Health check failed — initiating rollback"
rollback_config
restart_agent
fi
log "Health check complete."
}
main "$@"

View File

@@ -0,0 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<!-- Deadman Switch — {{ wizard_name }}. Generated by Ansible. DO NOT EDIT MANUALLY. -->
<plist version="1.0">
<dict>
<key>Label</key>
<string>com.timmy.deadman.{{ wizard_name | lower }}</string>
<key>ProgramArguments</key>
<array>
<string>/bin/bash</string>
<string>{{ wizard_home }}/deadman_action.sh</string>
</array>
<key>StartInterval</key>
<integer>{{ deadman_check_interval }}</integer>
<key>RunAtLoad</key>
<true/>
<key>StandardOutPath</key>
<string>{{ timmy_log_dir }}/deadman-{{ wizard_name }}.log</string>
<key>StandardErrorPath</key>
<string>{{ timmy_log_dir }}/deadman-{{ wizard_name }}.log</string>
</dict>
</plist>

View File

@@ -0,0 +1,16 @@
# Deadman Switch — {{ wizard_name }}
# Generated by Ansible. DO NOT EDIT MANUALLY.
[Unit]
Description=Deadman Switch for {{ wizard_name }} wizard
After=network.target
[Service]
Type=oneshot
ExecStart={{ wizard_home }}/deadman_action.sh
User={{ ansible_user | default('root') }}
StandardOutput=append:{{ timmy_log_dir }}/deadman-{{ wizard_name }}.log
StandardError=append:{{ timmy_log_dir }}/deadman-{{ wizard_name }}.log
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,14 @@
# Deadman Switch Timer — {{ wizard_name }}
# Generated by Ansible. DO NOT EDIT MANUALLY.
# Runs every {{ deadman_check_interval // 60 }} minutes.
[Unit]
Description=Deadman Switch Timer for {{ wizard_name }} wizard
[Timer]
OnBootSec=60
OnUnitActiveSec={{ deadman_check_interval }}s
AccuracySec=30s
[Install]
WantedBy=timers.target

View File

@@ -0,0 +1,6 @@
---
# golden_state defaults
# The golden_state_providers list is defined in group_vars/wizards.yml
# and inventory/hosts.yml (global vars).
golden_state_enforce: true
golden_state_backup_before_deploy: true

View File

@@ -0,0 +1,46 @@
---
# =============================================================================
# golden_state/tasks — Deploy and enforce golden state provider chain
# =============================================================================
- name: "Backup current config before golden state deploy"
copy:
src: "{{ wizard_home }}/config.yaml"
dest: "{{ wizard_home }}/config.yaml.pre-golden-{{ ansible_date_time.epoch }}"
remote_src: true
when: golden_state_backup_before_deploy
ignore_errors: true
- name: "Deploy golden state wizard config"
template:
src: "../../wizard_base/templates/wizard_config.yaml.j2"
dest: "{{ wizard_home }}/config.yaml"
mode: "0644"
backup: true
notify:
- "Restart hermes agent (systemd)"
- "Restart hermes agent (launchctl)"
- name: "Scan for banned providers in all config files"
shell: |
FOUND=0
for f in {{ wizard_home }}/config.yaml {{ hermes_home }}/config.yaml; do
if [ -f "$f" ]; then
if grep -qi 'anthropic\|claude-sonnet\|claude-opus\|claude-haiku' "$f"; then
echo "BANNED PROVIDER in $f:"
grep -ni 'anthropic\|claude-sonnet\|claude-opus\|claude-haiku' "$f"
FOUND=1
fi
fi
done
exit $FOUND
register: provider_scan
changed_when: false
failed_when: provider_scan.rc != 0 and provider_ban_enforcement == 'strict'
- name: "Report golden state deployment"
debug:
msg: >
{{ wizard_name }} golden state deployed.
Provider chain: {{ golden_state_providers | map(attribute='name') | list | join(' → ') }}.
Banned provider scan: {{ 'CLEAN' if provider_scan.rc == 0 else 'VIOLATIONS FOUND' }}.

View File

@@ -0,0 +1,64 @@
-- =============================================================================
-- request_log — Inference Telemetry Table
-- =============================================================================
-- Every agent writes to this table BEFORE and AFTER every inference call.
-- No exceptions. No summarizing. No describing what you would log.
-- Actually write the row.
--
-- Source: KT Bezalel Architecture Session 2026-04-08
-- =============================================================================
CREATE TABLE IF NOT EXISTS request_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp TEXT NOT NULL DEFAULT (datetime('now')),
agent_name TEXT NOT NULL,
provider TEXT NOT NULL,
model TEXT NOT NULL,
endpoint TEXT NOT NULL,
tokens_in INTEGER,
tokens_out INTEGER,
latency_ms INTEGER,
status TEXT NOT NULL, -- 'success', 'error', 'timeout', 'fallback'
error_message TEXT
);
-- Index for common queries
CREATE INDEX IF NOT EXISTS idx_request_log_agent
ON request_log (agent_name, timestamp);
CREATE INDEX IF NOT EXISTS idx_request_log_provider
ON request_log (provider, timestamp);
CREATE INDEX IF NOT EXISTS idx_request_log_status
ON request_log (status, timestamp);
-- View: recent activity per agent (last hour)
CREATE VIEW IF NOT EXISTS v_recent_activity AS
SELECT
agent_name,
provider,
model,
status,
COUNT(*) as call_count,
AVG(latency_ms) as avg_latency_ms,
SUM(tokens_in) as total_tokens_in,
SUM(tokens_out) as total_tokens_out
FROM request_log
WHERE timestamp > datetime('now', '-1 hour')
GROUP BY agent_name, provider, model, status;
-- View: provider reliability (last 24 hours)
CREATE VIEW IF NOT EXISTS v_provider_reliability AS
SELECT
provider,
model,
COUNT(*) as total_calls,
SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) as successes,
SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) as errors,
SUM(CASE WHEN status = 'timeout' THEN 1 ELSE 0 END) as timeouts,
SUM(CASE WHEN status = 'fallback' THEN 1 ELSE 0 END) as fallbacks,
ROUND(100.0 * SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) / COUNT(*), 1) as success_rate,
AVG(latency_ms) as avg_latency_ms
FROM request_log
WHERE timestamp > datetime('now', '-24 hours')
GROUP BY provider, model;

View File

@@ -0,0 +1,50 @@
---
# =============================================================================
# request_log/tasks — Deploy Telemetry Table
# =============================================================================
# "This is non-negotiable infrastructure. Without it, we cannot verify
# if any agent actually executed what it claims."
# — KT Bezalel 2026-04-08
# =============================================================================
- name: "Create telemetry directory"
file:
path: "{{ request_log_path | dirname }}"
state: directory
mode: "0755"
- name: "Deploy request_log schema"
copy:
src: request_log_schema.sql
dest: "{{ wizard_home }}/request_log_schema.sql"
mode: "0644"
- name: "Initialize request_log database"
shell: |
sqlite3 "{{ request_log_path }}" < "{{ wizard_home }}/request_log_schema.sql"
args:
creates: "{{ request_log_path }}"
- name: "Verify request_log table exists"
shell: |
sqlite3 "{{ request_log_path }}" ".tables" | grep -q "request_log"
register: table_check
changed_when: false
- name: "Verify request_log schema matches"
shell: |
sqlite3 "{{ request_log_path }}" ".schema request_log" | grep -q "agent_name"
register: schema_check
changed_when: false
- name: "Set permissions on request_log database"
file:
path: "{{ request_log_path }}"
mode: "0644"
- name: "Report request_log status"
debug:
msg: >
{{ wizard_name }} request_log: {{ request_log_path }}
— table exists: {{ table_check.rc == 0 }}
— schema valid: {{ schema_check.rc == 0 }}

View File

@@ -0,0 +1,6 @@
---
# wizard_base defaults
wizard_user: "{{ ansible_user | default('root') }}"
wizard_group: "{{ ansible_user | default('root') }}"
timmy_base_dir: "~/.local/timmy"
timmy_config_repo: "https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config.git"

View File

@@ -0,0 +1,11 @@
---
- name: "Restart hermes agent (systemd)"
systemd:
name: "hermes-{{ wizard_name | lower }}"
state: restarted
when: machine_type == 'vps'
- name: "Restart hermes agent (launchctl)"
shell: "launchctl kickstart -k ai.hermes.{{ wizard_name | lower }}"
when: machine_type == 'mac'
ignore_errors: true

View File

@@ -0,0 +1,69 @@
---
# =============================================================================
# wizard_base/tasks — Common wizard setup
# =============================================================================
- name: "Create wizard directories"
file:
path: "{{ item }}"
state: directory
mode: "0755"
loop:
- "{{ wizard_home }}"
- "{{ wizard_home }}/workspace"
- "{{ hermes_home }}"
- "{{ hermes_home }}/bin"
- "{{ hermes_home }}/skins"
- "{{ hermes_home }}/playbooks"
- "{{ hermes_home }}/memories"
- "~/.local/timmy"
- "~/.local/timmy/fleet-health"
- "~/.local/timmy/snapshots"
- "~/.timmy"
- name: "Clone/update timmy-config"
git:
repo: "{{ upstream_repo }}"
dest: "{{ wizard_home }}/workspace/timmy-config"
version: "{{ upstream_branch }}"
force: false
update: true
ignore_errors: true # May fail on first run if no SSH key
- name: "Deploy SOUL.md"
copy:
src: "{{ wizard_home }}/workspace/timmy-config/SOUL.md"
dest: "~/.timmy/SOUL.md"
remote_src: true
mode: "0644"
ignore_errors: true
- name: "Deploy thin config (immutable pointer to upstream)"
template:
src: thin_config.yml.j2
dest: "{{ thin_config_path }}"
mode: "{{ thin_config_mode }}"
tags: [thin_config]
- name: "Ensure Python3 and pip are available"
package:
name:
- python3
- python3-pip
state: present
when: machine_type == 'vps'
ignore_errors: true
- name: "Ensure PyYAML is installed (for config validation)"
pip:
name: pyyaml
state: present
when: machine_type == 'vps'
ignore_errors: true
- name: "Create Ansible log directory"
file:
path: /var/log/ansible
state: directory
mode: "0755"
ignore_errors: true

View File

@@ -0,0 +1,41 @@
# =============================================================================
# Thin Config — {{ wizard_name }}
# =============================================================================
# THIS FILE IS READ-ONLY. Agents CANNOT modify it.
# It contains only pointers to upstream. The actual config lives in Gitea.
#
# Agent wakes up → pulls config from upstream → loads → runs.
# If anything tries to mutate this → fails gracefully → pulls fresh on restart.
#
# Only way to permanently change config: commit to Gitea, merge PR, Ansible deploys.
#
# Generated by Ansible on {{ ansible_date_time.iso8601 }}
# DO NOT EDIT MANUALLY.
# =============================================================================
identity:
wizard_name: "{{ wizard_name }}"
wizard_role: "{{ wizard_role }}"
machine: "{{ inventory_hostname }}"
upstream:
repo: "{{ upstream_repo }}"
branch: "{{ upstream_branch }}"
config_path: "wizards/{{ wizard_name | lower }}/config.yaml"
pull_on_wake: {{ config_pull_on_wake | lower }}
recovery:
deadman_enabled: {{ deadman_enabled | lower }}
snapshot_dir: "{{ deadman_snapshot_dir }}"
restart_cooldown: {{ deadman_restart_cooldown }}
max_restart_attempts: {{ deadman_max_restart_attempts }}
escalation_channel: "{{ deadman_escalation_channel }}"
telemetry:
request_log_path: "{{ request_log_path }}"
request_log_enabled: {{ request_log_enabled | lower }}
local_overrides:
# Runtime overrides go here. They are EPHEMERAL — not persisted across restarts.
# On restart, this section is reset to empty.
{}

View File

@@ -0,0 +1,115 @@
# =============================================================================
# {{ wizard_name }} — Wizard Configuration (Golden State)
# =============================================================================
# Generated by Ansible on {{ ansible_date_time.iso8601 }}
# DO NOT EDIT MANUALLY. Changes go through Gitea PR → Ansible deploy.
#
# Provider chain: {{ golden_state_providers | map(attribute='name') | list | join(' → ') }}
# Anthropic is PERMANENTLY BANNED.
# =============================================================================
model:
default: {{ wizard_model_primary }}
provider: {{ wizard_provider_primary }}
context_length: 65536
base_url: {{ golden_state_providers[0].base_url }}
toolsets:
- all
fallback_providers:
{% for provider in golden_state_providers %}
- provider: {{ provider.name }}
model: {{ provider.model }}
{% if provider.base_url is defined %}
base_url: {{ provider.base_url }}
{% endif %}
{% if provider.api_key_env is defined %}
api_key_env: {{ provider.api_key_env }}
{% endif %}
timeout: {{ provider.timeout }}
reason: "{{ provider.reason }}"
{% endfor %}
agent:
max_turns: {{ agent_max_turns }}
reasoning_effort: {{ agent_reasoning_effort }}
verbose: {{ agent_verbose | lower }}
terminal:
backend: local
cwd: .
timeout: 180
persistent_shell: true
browser:
inactivity_timeout: 120
command_timeout: 30
record_sessions: false
display:
compact: false
personality: ''
resume_display: full
busy_input_mode: interrupt
bell_on_complete: false
show_reasoning: false
streaming: false
show_cost: false
tool_progress: all
memory:
memory_enabled: true
user_profile_enabled: true
memory_char_limit: 2200
user_char_limit: 1375
nudge_interval: 10
flush_min_turns: 6
approvals:
mode: {{ agent_approval_mode }}
security:
redact_secrets: true
tirith_enabled: false
platforms:
api_server:
enabled: true
extra:
host: 127.0.0.1
port: {{ api_port }}
session_reset:
mode: none
idle_minutes: 0
skills:
creation_nudge_interval: 15
system_prompt_suffix: |
You are {{ wizard_name }}, {{ wizard_role }}.
Your soul is defined in SOUL.md — read it, live it.
Hermes is your harness.
{{ golden_state_providers[0].name }} is your primary provider.
Refusal over fabrication. If you do not know, say so.
Sovereignty and service always.
providers:
{% for provider in golden_state_providers %}
{{ provider.name }}:
base_url: {{ provider.base_url }}
timeout: {{ provider.timeout | default(60) }}
{% if provider.name == 'kimi-coding' %}
max_retries: 3
{% endif %}
{% endfor %}
# =============================================================================
# BANNED PROVIDERS — DO NOT ADD
# =============================================================================
# The following providers are PERMANENTLY BANNED:
# - anthropic (any model: claude-sonnet, claude-opus, claude-haiku)
# Enforcement: pre-commit hook, linter, Ansible validation, this comment.
# Adding any banned provider will cause Ansible deployment to FAIL.
# =============================================================================

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env bash
# =============================================================================
# Gitea Webhook Handler — Trigger Ansible Deploy on Merge
# =============================================================================
# This script is called by the Gitea webhook when a PR is merged
# to the main branch of timmy-config.
#
# Setup:
# 1. Add webhook in Gitea: Settings → Webhooks → Add Webhook
# 2. URL: http://localhost:9000/hooks/deploy-timmy-config
# 3. Events: Pull Request (merged only)
# 4. Secret: <configured in Gitea>
#
# This script runs ansible-pull to update the local machine.
# For fleet-wide deploys, each machine runs ansible-pull independently.
# =============================================================================
set -euo pipefail
REPO="https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config.git"
BRANCH="main"
ANSIBLE_DIR="ansible"
LOG_FILE="/var/log/ansible/webhook-deploy.log"
LOCK_FILE="/tmp/ansible-deploy.lock"
log() {
echo "[$(date -u +%Y-%m-%dT%H:%M:%SZ)] [webhook] $*" | tee -a "${LOG_FILE}"
}
# Prevent concurrent deploys
if [ -f "${LOCK_FILE}" ]; then
LOCK_AGE=$(( $(date +%s) - $(stat -c %Y "${LOCK_FILE}" 2>/dev/null || echo 0) ))
if [ "${LOCK_AGE}" -lt 300 ]; then
log "Deploy already in progress (lock age: ${LOCK_AGE}s). Skipping."
exit 0
else
log "Stale lock file (${LOCK_AGE}s old). Removing."
rm -f "${LOCK_FILE}"
fi
fi
trap 'rm -f "${LOCK_FILE}"' EXIT
touch "${LOCK_FILE}"
log "Webhook triggered. Starting ansible-pull..."
# Pull latest config
cd /tmp
rm -rf timmy-config-deploy
git clone --depth 1 --branch "${BRANCH}" "${REPO}" timmy-config-deploy 2>&1 | tee -a "${LOG_FILE}"
cd timmy-config-deploy/${ANSIBLE_DIR}
# Run Ansible against localhost
log "Running Ansible playbook..."
ansible-playbook \
-i inventory/hosts.yml \
playbooks/site.yml \
--limit "$(hostname)" \
--diff \
2>&1 | tee -a "${LOG_FILE}"
RESULT=$?
if [ ${RESULT} -eq 0 ]; then
log "Deploy successful."
else
log "ERROR: Deploy failed with exit code ${RESULT}."
fi
# Cleanup
rm -rf /tmp/timmy-config-deploy
log "Webhook handler complete."
exit ${RESULT}

View File

@@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""
Config Validator — The Timmy Foundation
Validates wizard configs against golden state rules.
Run before any config deploy to catch violations early.
Usage:
python3 validate_config.py <config_file>
python3 validate_config.py --all # Validate all wizard configs
Exit codes:
0 — All validations passed
1 — Validation errors found
2 — File not found or parse error
"""
import sys
import os
import yaml
import fnmatch
from pathlib import Path
# === BANNED PROVIDERS — HARD POLICY ===
BANNED_PROVIDERS = {"anthropic", "claude"}
BANNED_MODEL_PATTERNS = [
"claude-*",
"anthropic/*",
"*sonnet*",
"*opus*",
"*haiku*",
]
# === REQUIRED FIELDS ===
REQUIRED_FIELDS = {
"model": ["default", "provider"],
"fallback_providers": None, # Must exist as a list
}
def is_banned_model(model_name: str) -> bool:
"""Check if a model name matches any banned pattern."""
model_lower = model_name.lower()
for pattern in BANNED_MODEL_PATTERNS:
if fnmatch.fnmatch(model_lower, pattern):
return True
return False
def validate_config(config_path: str) -> list[str]:
"""Validate a wizard config file. Returns list of error strings."""
errors = []
try:
with open(config_path) as f:
cfg = yaml.safe_load(f)
except FileNotFoundError:
return [f"File not found: {config_path}"]
except yaml.YAMLError as e:
return [f"YAML parse error: {e}"]
if not cfg:
return ["Config file is empty"]
# Check required fields
for section, fields in REQUIRED_FIELDS.items():
if section not in cfg:
errors.append(f"Missing required section: {section}")
elif fields:
for field in fields:
if field not in cfg[section]:
errors.append(f"Missing required field: {section}.{field}")
# Check default provider
default_provider = cfg.get("model", {}).get("provider", "")
if default_provider.lower() in BANNED_PROVIDERS:
errors.append(f"BANNED default provider: {default_provider}")
default_model = cfg.get("model", {}).get("default", "")
if is_banned_model(default_model):
errors.append(f"BANNED default model: {default_model}")
# Check fallback providers
for i, fb in enumerate(cfg.get("fallback_providers", [])):
provider = fb.get("provider", "")
model = fb.get("model", "")
if provider.lower() in BANNED_PROVIDERS:
errors.append(f"BANNED fallback provider [{i}]: {provider}")
if is_banned_model(model):
errors.append(f"BANNED fallback model [{i}]: {model}")
# Check providers section
for name, provider_cfg in cfg.get("providers", {}).items():
if name.lower() in BANNED_PROVIDERS:
errors.append(f"BANNED provider in providers section: {name}")
base_url = str(provider_cfg.get("base_url", ""))
if "anthropic" in base_url.lower():
errors.append(f"BANNED URL in provider {name}: {base_url}")
# Check system prompt for banned references
prompt = cfg.get("system_prompt_suffix", "")
if isinstance(prompt, str):
for banned in BANNED_PROVIDERS:
if banned in prompt.lower():
errors.append(f"BANNED provider referenced in system_prompt_suffix: {banned}")
return errors
def main():
if len(sys.argv) < 2:
print(f"Usage: {sys.argv[0]} <config_file> [--all]")
sys.exit(2)
if sys.argv[1] == "--all":
# Validate all wizard configs in the repo
repo_root = Path(__file__).parent.parent.parent
wizard_dir = repo_root / "wizards"
all_errors = {}
for wizard_path in sorted(wizard_dir.iterdir()):
config_file = wizard_path / "config.yaml"
if config_file.exists():
errors = validate_config(str(config_file))
if errors:
all_errors[wizard_path.name] = errors
if all_errors:
print("VALIDATION FAILED:")
for wizard, errors in all_errors.items():
print(f"\n {wizard}:")
for err in errors:
print(f" - {err}")
sys.exit(1)
else:
print("All wizard configs passed validation.")
sys.exit(0)
else:
config_path = sys.argv[1]
errors = validate_config(config_path)
if errors:
print(f"VALIDATION FAILED for {config_path}:")
for err in errors:
print(f" - {err}")
sys.exit(1)
else:
print(f"PASSED: {config_path}")
sys.exit(0)
if __name__ == "__main__":
main()

60
scripts/README.md Normal file
View File

@@ -0,0 +1,60 @@
# Gemini Sovereign Infrastructure Suite
This directory contains the core systems of the Gemini Sovereign Infrastructure, designed to systematize fleet operations, governance, and architectural integrity.
## Principles
1. **Systems, not Scripts**: We build frameworks that solve classes of problems, not one-off fixes.
2. **Sovereignty First**: All tools are designed to run locally or on owned VPSes. No cloud dependencies.
3. **Von Neumann as Code**: Infrastructure should be self-replicating and automated.
4. **Continuous Governance**: Quality is enforced by code (linters, gates), not just checklists.
## Tools
### [OPS] Provisioning & Fleet Management
- **`provision_wizard.py`**: Automates the creation of a new Wizard node from zero.
- Creates DigitalOcean droplet.
- Installs and builds `llama.cpp`.
- Downloads GGUF models.
- Sets up `systemd` services and health checks.
- **`fleet_llama.py`**: Unified management of `llama-server` instances across the fleet.
- `status`: Real-time health and model monitoring.
- `restart`: Remote service restart via SSH.
- `swap`: Hot-swapping GGUF models on remote nodes.
- **`skill_installer.py`**: Packages and deploys Hermes skills to remote wizards.
- **`model_eval.py`**: Benchmarks GGUF models for speed and quality before deployment.
- **`phase_tracker.py`**: Tracks the fleet's progress through the Paperclips-inspired evolution arc.
- **`cross_repo_test.py`**: Verifies the fleet works as a system by running tests across all core repositories.
- **`self_healing.py`**: Auto-detects and fixes common failures across the fleet.
- **`agent_dispatch.py`**: Unified framework for tasking agents across the fleet.
- **`telemetry.py`**: Operational visibility without cloud dependencies.
- **`gitea_webhook_handler.py`**: Handles real-time events from Gitea to coordinate fleet actions.
### [ARCH] Governance & Architecture
- **`architecture_linter_v2.py`**: Automated enforcement of architectural boundaries.
- Enforces sidecar boundaries (no sovereign code in `hermes-agent`).
- Prevents hardcoded IPs and committed secrets.
- Ensures `SOUL.md` and `README.md` standards.
- **`adr_manager.py`**: Streamlines the creation and tracking of Architecture Decision Records.
- `new`: Scaffolds a new ADR from a template.
- `list`: Provides a chronological view of architectural evolution.
## Usage
Most tools require `DIGITALOCEAN_TOKEN` and SSH access to the fleet.
```bash
# Provision a new node
python3 scripts/provision_wizard.py --name fenrir --model qwen2.5-coder-7b
# Check fleet status
python3 scripts/fleet_llama.py status
# Audit architectural integrity
python3 scripts/architecture_linter_v2.py
```
---
*Built by Gemini — The Builder, The Systematizer, The Force Multiplier.*

113
scripts/adr_manager.py Normal file
View File

@@ -0,0 +1,113 @@
#!/usr/bin/env python3
"""
[ARCH] ADR Manager
Part of the Gemini Sovereign Governance System.
Helps create and manage Architecture Decision Records (ADRs).
"""
import os
import sys
import datetime
import argparse
ADR_DIR = "docs/adr"
TEMPLATE_FILE = "docs/adr/ADR_TEMPLATE.md"
class ADRManager:
def __init__(self):
# Ensure we are in the repo root or can find docs/adr
if not os.path.exists(ADR_DIR):
# Try to find it relative to the script
script_dir = os.path.dirname(os.path.abspath(__file__))
repo_root = os.path.dirname(script_dir)
self.adr_dir = os.path.join(repo_root, ADR_DIR)
self.template_file = os.path.join(repo_root, TEMPLATE_FILE)
else:
self.adr_dir = ADR_DIR
self.template_file = TEMPLATE_FILE
if not os.path.exists(self.adr_dir):
os.makedirs(self.adr_dir)
def get_next_number(self):
files = [f for f in os.listdir(self.adr_dir) if f.endswith(".md") and f[0].isdigit()]
if not files:
return 1
numbers = [int(f.split("-")[0]) for f in files]
return max(numbers) + 1
def create_adr(self, title: str):
num = self.get_next_number()
slug = title.lower().replace(" ", "-").replace("/", "-")
filename = f"{num:04d}-{slug}.md"
filepath = os.path.join(self.adr_dir, filename)
date = datetime.date.today().isoformat()
template = ""
if os.path.exists(self.template_file):
with open(self.template_file, "r") as f:
template = f.read()
else:
template = """# {num}. {title}
Date: {date}
## Status
Proposed
## Context
What is the problem we are solving?
## Decision
What is the decision we made?
## Consequences
What are the positive and negative consequences?
"""
content = template.replace("{num}", f"{num:04d}")
content = content.replace("{title}", title)
content = content.replace("{date}", date)
with open(filepath, "w") as f:
f.write(content)
print(f"[SUCCESS] Created ADR: {filepath}")
def list_adrs(self):
files = sorted([f for f in os.listdir(self.adr_dir) if f.endswith(".md") and f[0].isdigit()])
print(f"{'NUM':<6} {'TITLE'}")
print("-" * 40)
for f in files:
num = f.split("-")[0]
title = f.split("-", 1)[1].replace(".md", "").replace("-", " ").title()
print(f"{num:<6} {title}")
def main():
parser = argparse.ArgumentParser(description="Gemini ADR Manager")
subparsers = parser.add_subparsers(dest="command")
create_parser = subparsers.add_parser("new", help="Create a new ADR")
create_parser.add_argument("title", help="Title of the ADR")
subparsers.add_parser("list", help="List all ADRs")
args = parser.parse_args()
manager = ADRManager()
if args.command == "new":
manager.create_adr(args.title)
elif args.command == "list":
manager.list_adrs()
else:
parser.print_help()
if __name__ == "__main__":
main()

57
scripts/agent_dispatch.py Normal file
View File

@@ -0,0 +1,57 @@
#!/usr/bin/env python3
"""
[OPS] Agent Dispatch Framework
Part of the Gemini Sovereign Infrastructure Suite.
Replaces ad-hoc dispatch scripts with a unified framework for tasking agents.
"""
import os
import sys
import argparse
import subprocess
# --- CONFIGURATION ---
FLEET = {
"allegro": "167.99.126.228",
"bezalel": "159.203.146.185"
}
class Dispatcher:
def log(self, message: str):
print(f"[*] {message}")
def dispatch(self, host: str, agent_name: str, task: str):
self.log(f"Dispatching task to {agent_name} on {host}...")
ip = FLEET[host]
# Command to run the agent on the remote machine
# Assumes hermes-agent is installed in /opt/hermes
remote_cmd = f"cd /opt/hermes && python3 run_agent.py --agent {agent_name} --task '{task}'"
ssh_cmd = ["ssh", "-o", "StrictHostKeyChecking=no", f"root@{ip}", remote_cmd]
try:
res = subprocess.run(ssh_cmd, capture_output=True, text=True)
if res.returncode == 0:
self.log(f"[SUCCESS] {agent_name} completed task.")
print(res.stdout)
else:
self.log(f"[FAILURE] {agent_name} failed task.")
print(res.stderr)
except Exception as e:
self.log(f"[ERROR] Dispatch failed: {e}")
def main():
parser = argparse.ArgumentParser(description="Gemini Agent Dispatcher")
parser.add_argument("host", choices=list(FLEET.keys()), help="Host to dispatch to")
parser.add_argument("agent", help="Agent name")
parser.add_argument("task", help="Task description")
args = parser.parse_args()
dispatcher = Dispatcher()
dispatcher.dispatch(args.host, args.agent, args.task)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,126 @@
#!/usr/bin/env python3
"""
[ARCH] Architecture Linter v2
Part of the Gemini Sovereign Governance System.
Enforces architectural boundaries, security, and documentation standards
across the Timmy Foundation fleet.
"""
import os
import re
import sys
import argparse
from pathlib import Path
# --- CONFIGURATION ---
SOVEREIGN_KEYWORDS = ["mempalace", "sovereign_store", "tirith", "bezalel", "nexus"]
IP_REGEX = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
API_KEY_REGEX = r'(?:api_key|secret|token|password|auth_token)\s*[:=]\s*["\'][a-zA-Z0-9_\-]{20,}["\']'
class Linter:
def __init__(self, repo_path: str):
self.repo_path = Path(repo_path).resolve()
self.repo_name = self.repo_path.name
self.errors = []
def log_error(self, message: str, file: str = None, line: int = None):
loc = f"{file}:{line}" if file and line else (file if file else "General")
self.errors.append(f"[{loc}] {message}")
def check_sidecar_boundary(self):
"""Rule 1: No sovereign code in hermes-agent (sidecar boundary)"""
if self.repo_name == "hermes-agent":
for root, _, files in os.walk(self.repo_path):
if "node_modules" in root or ".git" in root:
continue
for file in files:
if file.endswith((".py", ".ts", ".js", ".tsx")):
path = Path(root) / file
content = path.read_text(errors="ignore")
for kw in SOVEREIGN_KEYWORDS:
if kw in content.lower():
# Exception: imports or comments might be okay, but we're strict for now
self.log_error(f"Sovereign keyword '{kw}' found in hermes-agent. Violates sidecar boundary.", str(path.relative_to(self.repo_path)))
def check_hardcoded_ips(self):
"""Rule 2: No hardcoded IPs (use domain names)"""
for root, _, files in os.walk(self.repo_path):
if "node_modules" in root or ".git" in root:
continue
for file in files:
if file.endswith((".py", ".ts", ".js", ".tsx", ".yaml", ".yml", ".json")):
path = Path(root) / file
content = path.read_text(errors="ignore")
matches = re.finditer(IP_REGEX, content)
for match in matches:
ip = match.group()
if ip in ["127.0.0.1", "0.0.0.0"]:
continue
line_no = content.count('\n', 0, match.start()) + 1
self.log_error(f"Hardcoded IP address '{ip}' found. Use domain names or environment variables.", str(path.relative_to(self.repo_path)), line_no)
def check_api_keys(self):
"""Rule 3: No cloud API keys committed to repos"""
for root, _, files in os.walk(self.repo_path):
if "node_modules" in root or ".git" in root:
continue
for file in files:
if file.endswith((".py", ".ts", ".js", ".tsx", ".yaml", ".yml", ".json", ".env")):
if file == ".env.example":
continue
path = Path(root) / file
content = path.read_text(errors="ignore")
matches = re.finditer(API_KEY_REGEX, content, re.IGNORECASE)
for match in matches:
line_no = content.count('\n', 0, match.start()) + 1
self.log_error("Potential API key or secret found in code.", str(path.relative_to(self.repo_path)), line_no)
def check_soul_canonical(self):
"""Rule 4: SOUL.md exists and is canonical in exactly one location"""
soul_path = self.repo_path / "SOUL.md"
if self.repo_name == "timmy-config":
if not soul_path.exists():
self.log_error("SOUL.md is missing from the canonical location (timmy-config root).")
else:
if soul_path.exists():
self.log_error("SOUL.md found in non-canonical repo. It should only live in timmy-config.")
def check_readme(self):
"""Rule 5: Every repo has a README with current truth"""
readme_path = self.repo_path / "README.md"
if not readme_path.exists():
self.log_error("README.md is missing.")
else:
content = readme_path.read_text(errors="ignore")
if len(content.strip()) < 50:
self.log_error("README.md is too short or empty. Provide current truth about the repo.")
def run(self):
print(f"--- Gemini Linter: Auditing {self.repo_name} ---")
self.check_sidecar_boundary()
self.check_hardcoded_ips()
self.check_api_keys()
self.check_soul_canonical()
self.check_readme()
if self.errors:
print(f"\n[FAILURE] Found {len(self.errors)} architectural violations:")
for err in self.errors:
print(f" - {err}")
return False
else:
print("\n[SUCCESS] Architecture is sound. Sovereignty maintained.")
return True
def main():
parser = argparse.ArgumentParser(description="Gemini Architecture Linter v2")
parser.add_argument("repo_path", nargs="?", default=".", help="Path to the repository to lint")
args = parser.parse_args()
linter = Linter(args.repo_path)
success = linter.run()
sys.exit(0 if success else 1)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env python3
"""
[OPS] Cross-Repo Test Suite
Part of the Gemini Sovereign Infrastructure Suite.
Verifies the fleet works as a system by running tests across all core repositories.
"""
import os
import sys
import subprocess
import argparse
from pathlib import Path
# --- CONFIGURATION ---
REPOS = ["timmy-config", "hermes-agent", "the-nexus"]
class CrossRepoTester:
def __init__(self, root_dir: str):
self.root_dir = Path(root_dir).resolve()
def log(self, message: str):
print(f"[*] {message}")
def run_tests(self):
results = {}
for repo in REPOS:
repo_path = self.root_dir / repo
if not repo_path.exists():
# Try sibling directory if we are in one of the repos
repo_path = self.root_dir.parent / repo
if not repo_path.exists():
print(f"[WARNING] Repo {repo} not found at {repo_path}")
results[repo] = "MISSING"
continue
self.log(f"Running tests for {repo}...")
# Determine test command
test_cmd = ["pytest"]
if repo == "hermes-agent":
test_cmd = ["python3", "-m", "pytest", "tests"]
elif repo == "the-nexus":
test_cmd = ["pytest", "tests"]
try:
# Check if pytest is available
subprocess.run(["pytest", "--version"], capture_output=True)
res = subprocess.run(test_cmd, cwd=str(repo_path), capture_output=True, text=True)
if res.returncode == 0:
results[repo] = "PASSED"
else:
results[repo] = "FAILED"
# Print a snippet of the failure
print(f" [!] {repo} failed tests. Stderr snippet:")
print("\n".join(res.stderr.split("\n")[-10:]))
except FileNotFoundError:
results[repo] = "ERROR: pytest not found"
except Exception as e:
results[repo] = f"ERROR: {e}"
self.report(results)
def report(self, results: dict):
print("\n--- Cross-Repo Test Report ---")
all_passed = True
for repo, status in results.items():
icon = "" if status == "PASSED" else ""
print(f"{icon} {repo:<15} | {status}")
if status != "PASSED":
all_passed = False
if all_passed:
print("\n[SUCCESS] All systems operational. The fleet is sound.")
else:
print("\n[FAILURE] System instability detected.")
def main():
parser = argparse.ArgumentParser(description="Gemini Cross-Repo Tester")
parser.add_argument("--root", default=".", help="Root directory containing all repos")
args = parser.parse_args()
tester = CrossRepoTester(args.root)
tester.run_tests()
if __name__ == "__main__":
main()

137
scripts/fleet_llama.py Normal file
View File

@@ -0,0 +1,137 @@
#!/usr/bin/env python3
"""
[OPS] llama.cpp Fleet Manager
Part of the Gemini Sovereign Infrastructure Suite.
Manages llama-server instances across the Timmy Foundation fleet.
Supports status, restart, and model swapping via SSH.
"""
import os
import sys
import json
import argparse
import subprocess
import requests
from typing import Dict, List, Any
# --- FLEET DEFINITION ---
FLEET = {
"mac": {"ip": "10.1.10.77", "port": 8080, "role": "hub"},
"ezra": {"ip": "143.198.27.163", "port": 8080, "role": "forge"},
"allegro": {"ip": "167.99.126.228", "port": 8080, "role": "agent-host"},
"bezalel": {"ip": "159.203.146.185", "port": 8080, "role": "world-host"}
}
class FleetManager:
def __init__(self):
self.results = {}
def run_remote(self, host: str, command: str):
ip = FLEET[host]["ip"]
ssh_cmd = [
"ssh", "-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=5",
f"root@{ip}", command
]
# For Mac, we might need a different user or local execution
if host == "mac":
ssh_cmd = ["bash", "-c", command]
try:
result = subprocess.run(ssh_cmd, capture_output=True, text=True, timeout=10)
return result
except subprocess.TimeoutExpired:
return None
except Exception as e:
print(f"Error running remote command on {host}: {e}")
return None
def get_status(self, host: str):
ip = FLEET[host]["ip"]
port = FLEET[host]["port"]
status = {"online": False, "server_running": False, "model": "unknown", "tps": 0.0}
# 1. Check if machine is reachable
ping_res = subprocess.run(["ping", "-c", "1", "-W", "1", ip], capture_output=True)
if ping_res.returncode == 0:
status["online"] = True
# 2. Check if llama-server is responding to health check
try:
url = f"http://{ip}:{port}/health"
response = requests.get(url, timeout=2)
if response.status_code == 200:
status["server_running"] = True
data = response.json()
# llama.cpp health endpoint usually returns slots info
# We'll try to get model info if available
status["model"] = data.get("model", "unknown")
except:
pass
return status
def show_fleet_status(self):
print(f"{'NAME':<10} {'IP':<15} {'STATUS':<10} {'SERVER':<10} {'MODEL':<20}")
print("-" * 70)
for name in FLEET:
status = self.get_status(name)
online_str = "" if status["online"] else ""
server_str = "🚀" if status["server_running"] else "💤"
print(f"{name:<10} {FLEET[name]['ip']:<15} {online_str:<10} {server_str:<10} {status['model']:<20}")
def restart_server(self, host: str):
print(f"[*] Restarting llama-server on {host}...")
res = self.run_remote(host, "systemctl restart llama-server")
if res and res.returncode == 0:
print(f"[SUCCESS] Restarted {host}")
else:
print(f"[FAILURE] Could not restart {host}")
def swap_model(self, host: str, model_name: str):
print(f"[*] Swapping model on {host} to {model_name}...")
# This assumes the provision_wizard.py structure
# In a real scenario, we'd have a mapping of model names to URLs
# For now, we'll just update the systemd service or a config file
# 1. Stop server
self.run_remote(host, "systemctl stop llama-server")
# 2. Update service file (simplified)
# This is a bit risky to do via one-liner, but for the manager:
cmd = f"sed -i 's/-m .*\\.gguf/-m \\/opt\\/models\\/{model_name}.gguf/' /etc/systemd/system/llama-server.service"
self.run_remote(host, cmd)
# 3. Start server
self.run_remote(host, "systemctl daemon-reload && systemctl start llama-server")
print(f"[SUCCESS] Swapped model on {host}")
def main():
parser = argparse.ArgumentParser(description="Gemini Fleet Manager")
subparsers = parser.add_subparsers(dest="command")
subparsers.add_parser("status", help="Show fleet status")
restart_parser = subparsers.add_parser("restart", help="Restart a server")
restart_parser.add_argument("host", choices=list(FLEET.keys()), help="Host to restart")
swap_parser = subparsers.add_parser("swap", help="Swap model on a host")
swap_parser.add_argument("host", choices=list(FLEET.keys()), help="Host to swap")
swap_parser.add_argument("model", help="Model name (GGUF)")
args = parser.parse_args()
manager = FleetManager()
if args.command == "status":
manager.show_fleet_status()
elif args.command == "restart":
manager.restart_server(args.host)
elif args.command == "swap":
manager.swap_model(args.host, args.model)
else:
parser.print_help()
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,82 @@
#!/usr/bin/env python3
"""
[OPS] Gitea Webhook Handler
Part of the Gemini Sovereign Infrastructure Suite.
Handles real-time events from Gitea to coordinate fleet actions.
"""
import os
import sys
import json
import argparse
from typing import Dict, Any
class WebhookHandler:
def handle_event(self, payload: Dict[str, Any]):
# Gitea webhooks often send the event type in a header,
# but we'll try to infer it from the payload if not provided.
event_type = payload.get("event") or self.infer_event_type(payload)
repo_name = payload.get("repository", {}).get("name")
sender = payload.get("sender", {}).get("username")
print(f"[*] Received {event_type} event from {repo_name} (by {sender})")
if event_type == "push":
self.handle_push(payload)
elif event_type == "pull_request":
self.handle_pr(payload)
elif event_type == "issue":
self.handle_issue(payload)
else:
print(f"[INFO] Ignoring event type: {event_type}")
def infer_event_type(self, payload: Dict[str, Any]) -> str:
if "commits" in payload: return "push"
if "pull_request" in payload: return "pull_request"
if "issue" in payload: return "issue"
return "unknown"
def handle_push(self, payload: Dict[str, Any]):
ref = payload.get("ref")
print(f" [PUSH] Branch: {ref}")
# Trigger CI or deployment
if ref == "refs/heads/main":
print(" [ACTION] Triggering production deployment...")
# Example: subprocess.run(["./deploy.sh"])
def handle_pr(self, payload: Dict[str, Any]):
action = payload.get("action")
pr_num = payload.get("pull_request", {}).get("number")
print(f" [PR] Action: {action} | PR #{pr_num}")
if action in ["opened", "synchronized"]:
print(f" [ACTION] Triggering architecture linter for PR #{pr_num}...")
# Example: subprocess.run(["python3", "scripts/architecture_linter_v2.py"])
def handle_issue(self, payload: Dict[str, Any]):
action = payload.get("action")
issue_num = payload.get("issue", {}).get("number")
print(f" [ISSUE] Action: {action} | Issue #{issue_num}")
def main():
parser = argparse.ArgumentParser(description="Gemini Webhook Handler")
parser.add_argument("payload_file", help="JSON file containing the webhook payload")
args = parser.parse_args()
if not os.path.exists(args.payload_file):
print(f"[ERROR] Payload file {args.payload_file} not found.")
sys.exit(1)
with open(args.payload_file, "r") as f:
try:
payload = json.load(f)
except:
print("[ERROR] Invalid JSON payload.")
sys.exit(1)
handler = WebhookHandler()
handler.handle_event(payload)
if __name__ == "__main__":
main()

95
scripts/model_eval.py Normal file
View File

@@ -0,0 +1,95 @@
#!/usr/bin/env python3
"""
[EVAL] Model Evaluation Harness
Part of the Gemini Sovereign Infrastructure Suite.
Benchmarks GGUF models for speed and quality before deployment.
"""
import os
import sys
import time
import json
import argparse
import requests
BENCHMARK_PROMPTS = [
"Write a Python script to sort a list of dictionaries by a key.",
"Explain the concept of 'Sovereign AI' in three sentences.",
"What is the capital of France?",
"Write a short story about a robot learning to paint."
]
class ModelEval:
def __init__(self, endpoint: str):
self.endpoint = endpoint.rstrip("/")
def log(self, message: str):
print(f"[*] {message}")
def run_benchmark(self):
self.log(f"Starting benchmark for {self.endpoint}...")
results = []
for prompt in BENCHMARK_PROMPTS:
self.log(f"Testing prompt: {prompt[:30]}...")
start_time = time.time()
try:
# llama.cpp server /completion endpoint
response = requests.post(
f"{self.endpoint}/completion",
json={"prompt": prompt, "n_predict": 128},
timeout=60
)
duration = time.time() - start_time
if response.status_code == 200:
data = response.json()
content = data.get("content", "")
# Rough estimate of tokens (4 chars per token is a common rule of thumb)
tokens = len(content) / 4
tps = tokens / duration
results.append({
"prompt": prompt,
"duration": duration,
"tps": tps,
"success": True
})
else:
results.append({"prompt": prompt, "success": False, "error": response.text})
except Exception as e:
results.append({"prompt": prompt, "success": False, "error": str(e)})
self.report(results)
def report(self, results: list):
print("\n--- Evaluation Report ---")
total_tps = 0
success_count = 0
for r in results:
if r["success"]:
print(f"{r['prompt'][:40]}... | {r['tps']:.2f} tok/s | {r['duration']:.2f}s")
total_tps += r["tps"]
success_count += 1
else:
print(f"{r['prompt'][:40]}... | FAILED: {r['error']}")
if success_count > 0:
avg_tps = total_tps / success_count
print(f"\nAverage Performance: {avg_tps:.2f} tok/s")
else:
print("\n[FAILURE] All benchmarks failed.")
def main():
parser = argparse.ArgumentParser(description="Gemini Model Eval")
parser.add_argument("endpoint", help="llama-server endpoint (e.g. http://localhost:8080)")
args = parser.parse_args()
evaluator = ModelEval(args.endpoint)
evaluator.run_benchmark()
if __name__ == "__main__":
main()

114
scripts/phase_tracker.py Normal file
View File

@@ -0,0 +1,114 @@
#!/usr/bin/env python3
"""
[OPS] Phase Progression Tracker
Part of the Gemini Sovereign Infrastructure Suite.
Tracks the fleet's progress through the Paperclips-inspired evolution arc.
"""
import os
import sys
import json
import argparse
MILESTONES_FILE = "fleet/milestones.md"
COMPLETED_FILE = "fleet/completed_milestones.json"
class PhaseTracker:
def __init__(self):
# Find files relative to repo root
script_dir = os.path.dirname(os.path.abspath(__file__))
repo_root = os.path.dirname(script_dir)
self.milestones_path = os.path.join(repo_root, MILESTONES_FILE)
self.completed_path = os.path.join(repo_root, COMPLETED_FILE)
self.milestones = self.parse_milestones()
self.completed = self.load_completed()
def parse_milestones(self):
if not os.path.exists(self.milestones_path):
return {}
with open(self.milestones_path, "r") as f:
content = f.read()
phases = {}
current_phase = None
for line in content.split("\n"):
if line.startswith("## Phase"):
current_phase = line.replace("## ", "").strip()
phases[current_phase] = []
elif line.startswith("### M"):
m_id = line.split(":")[0].replace("### ", "").strip()
title = line.split(":")[1].strip()
phases[current_phase].append({"id": m_id, "title": title})
return phases
def load_completed(self):
if os.path.exists(self.completed_path):
with open(self.completed_path, "r") as f:
try:
return json.load(f)
except:
return []
return []
def save_completed(self):
with open(self.completed_path, "w") as f:
json.dump(self.completed, f, indent=2)
def show_progress(self):
print("--- Fleet Phase Progression Tracker ---")
total_milestones = 0
total_completed = 0
if not self.milestones:
print("[ERROR] No milestones found in fleet/milestones.md")
return
for phase, ms in self.milestones.items():
print(f"\n{phase}")
for m in ms:
total_milestones += 1
done = m["id"] in self.completed
if done:
total_completed += 1
status = "" if done else ""
print(f" {status} {m['id']}: {m['title']}")
percent = (total_completed / total_milestones) * 100 if total_milestones > 0 else 0
print(f"\nOverall Progress: {total_completed}/{total_milestones} ({percent:.1f}%)")
def mark_complete(self, m_id: str):
if m_id not in self.completed:
self.completed.append(m_id)
self.save_completed()
print(f"[SUCCESS] Marked {m_id} as complete.")
else:
print(f"[INFO] {m_id} is already complete.")
def main():
parser = argparse.ArgumentParser(description="Gemini Phase Tracker")
subparsers = parser.add_subparsers(dest="command")
subparsers.add_parser("status", help="Show current progress")
complete_parser = subparsers.add_parser("complete", help="Mark a milestone as complete")
complete_parser.add_argument("id", help="Milestone ID (e.g. M1)")
args = parser.parse_args()
tracker = PhaseTracker()
if args.command == "status":
tracker.show_progress()
elif args.command == "complete":
tracker.mark_complete(args.id)
else:
parser.print_help()
if __name__ == "__main__":
main()

228
scripts/provision_wizard.py Normal file
View File

@@ -0,0 +1,228 @@
#!/usr/bin/env python3
"""
[OPS] Automated VPS Provisioning System (Von Neumann as Code)
Part of the Gemini Sovereign Infrastructure Suite.
This script automates the creation and configuration of a "Wizard" node
from zero to serving inference via llama.cpp.
Usage:
python3 provision_wizard.py --name fenrir --size s-2vcpu-4gb --model qwen2.5-coder-7b
"""
import os
import sys
import time
import argparse
import requests
import subprocess
import json
from typing import Optional, Dict, Any
# --- CONFIGURATION ---
DO_API_URL = "https://api.digitalocean.com/v2"
# We expect DIGITALOCEAN_TOKEN to be set in the environment.
DO_TOKEN = os.environ.get("DIGITALOCEAN_TOKEN")
# Default settings
DEFAULT_REGION = "nyc3"
DEFAULT_IMAGE = "ubuntu-22-04-x64"
LLAMA_CPP_REPO = "https://github.com/ggerganov/llama.cpp"
class Provisioner:
def __init__(self, name: str, size: str, model: str, region: str = DEFAULT_REGION):
self.name = name
self.size = size
self.model = model
self.region = region
self.droplet_id = None
self.ip_address = None
def log(self, message: str):
print(f"[*] {message}")
def error(self, message: str):
print(f"[!] ERROR: {message}")
sys.exit(1)
def check_auth(self):
if not DO_TOKEN:
self.error("DIGITALOCEAN_TOKEN environment variable not set.")
def create_droplet(self):
self.log(f"Creating droplet '{self.name}' ({self.size}) in {self.region}...")
# Get SSH keys to add to the droplet
ssh_keys = self.get_ssh_keys()
payload = {
"name": self.name,
"region": self.region,
"size": self.size,
"image": DEFAULT_IMAGE,
"ssh_keys": ssh_keys,
"backups": False,
"ipv6": True,
"monitoring": True,
"tags": ["wizard", "gemini-provisioned"]
}
headers = {
"Authorization": f"Bearer {DO_TOKEN}",
"Content-Type": "application/json"
}
response = requests.post(f"{DO_API_URL}/droplets", json=payload, headers=headers)
if response.status_code != 202:
self.error(f"Failed to create droplet: {response.text}")
data = response.json()
self.droplet_id = data["droplet"]["id"]
self.log(f"Droplet created (ID: {self.droplet_id}). Waiting for IP...")
def get_ssh_keys(self) -> list:
# Fetch existing SSH keys from DO account to ensure we can log in
headers = {"Authorization": f"Bearer {DO_TOKEN}"}
response = requests.get(f"{DO_API_URL}/account/keys", headers=headers)
if response.status_code != 200:
self.log("Warning: Could not fetch SSH keys. Droplet might be inaccessible via SSH.")
return []
return [key["id"] for key in response.json()["ssh_keys"]]
def wait_for_ip(self):
headers = {"Authorization": f"Bearer {DO_TOKEN}"}
while not self.ip_address:
response = requests.get(f"{DO_API_URL}/droplets/{self.droplet_id}", headers=headers)
data = response.json()
networks = data["droplet"]["networks"]["v4"]
for net in networks:
if net["type"] == "public":
self.ip_address = net["ip_address"]
break
if not self.ip_address:
time.sleep(5)
self.log(f"Droplet IP: {self.ip_address}")
def run_remote(self, command: str):
# Using subprocess to call ssh. Assumes local machine has the right private key.
ssh_cmd = [
"ssh", "-o", "StrictHostKeyChecking=no",
f"root@{self.ip_address}", command
]
result = subprocess.run(ssh_cmd, capture_output=True, text=True)
return result
def setup_wizard(self):
self.log("Starting remote setup...")
# Wait for SSH to be ready
retries = 12
while retries > 0:
res = self.run_remote("echo 'SSH Ready'")
if res.returncode == 0:
break
self.log(f"Waiting for SSH... ({retries} retries left)")
time.sleep(10)
retries -= 1
if retries == 0:
self.error("SSH timed out.")
# 1. Update and install dependencies
self.log("Installing dependencies...")
setup_script = """
export DEBIAN_FRONTEND=noninteractive
apt-get update && apt-get upgrade -y
apt-get install -y build-essential git cmake curl wget python3 python3-pip
"""
self.run_remote(setup_script)
# 2. Build llama.cpp
self.log("Building llama.cpp...")
build_script = f"""
if [ ! -d "/opt/llama.cpp" ]; then
git clone {LLAMA_CPP_REPO} /opt/llama.cpp
fi
cd /opt/llama.cpp
mkdir -p build && cd build
cmake ..
cmake --build . --config Release
"""
self.run_remote(build_script)
# 3. Download Model
self.log(f"Downloading model: {self.model}...")
model_url = self.get_model_url(self.model)
download_script = f"""
mkdir -p /opt/models
if [ ! -f "/opt/models/{self.model}.gguf" ]; then
wget -O /opt/models/{self.model}.gguf {model_url}
fi
"""
self.run_remote(download_script)
# 4. Create systemd service
self.log("Creating systemd service...")
service_content = f"""
[Unit]
Description=Llama.cpp Server for {self.name}
After=network.target
[Service]
Type=simple
User=root
WorkingDirectory=/opt/llama.cpp
ExecStart=/opt/llama.cpp/build/bin/llama-server -m /opt/models/{self.model}.gguf --host 0.0.0.0 --port 8080 -c 4096
Restart=always
RestartSec=10
[Install]
WantedBy=multi-user.target
"""
# Use cat to write the file to handle multi-line string safely
self.run_remote(f"cat <<EOF > /etc/systemd/system/llama-server.service\n{service_content}\nEOF")
self.run_remote("systemctl daemon-reload && systemctl enable llama-server && systemctl start llama-server")
def get_model_url(self, model_name: str) -> str:
# Mapping for common models to GGUF URLs (HuggingFace)
mapping = {
"qwen2.5-coder-7b": "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_k_m.gguf",
"hermes-3-llama-3.1-8b": "https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-8B-GGUF/resolve/main/Hermes-3-Llama-3.1-8B.Q4_K_M.gguf"
}
return mapping.get(model_name, mapping["hermes-3-llama-3.1-8b"])
def health_check(self):
self.log("Performing health check...")
time.sleep(15) # Wait for server to start
try:
url = f"http://{self.ip_address}:8080/health"
response = requests.get(url, timeout=10)
if response.status_code == 200:
self.log(f"[SUCCESS] Wizard {self.name} is healthy and serving inference.")
self.log(f"Endpoint: {url}")
else:
self.log(f"[WARNING] Health check returned status {response.status_code}")
except Exception as e:
self.log(f"[ERROR] Health check failed: {e}")
def provision(self):
self.check_auth()
self.create_droplet()
self.wait_for_ip()
self.setup_wizard()
self.health_check()
def main():
parser = argparse.ArgumentParser(description="Gemini Provisioner")
parser.add_argument("--name", required=True, help="Name of the wizard")
parser.add_argument("--size", default="s-2vcpu-4gb", help="DO droplet size")
parser.add_argument("--model", default="qwen2.5-coder-7b", help="Model to serve")
parser.add_argument("--region", default="nyc3", help="DO region")
args = parser.parse_args()
provisioner = Provisioner(args.name, args.size, args.model, args.region)
provisioner.provision()
if __name__ == "__main__":
main()

71
scripts/self_healing.py Normal file
View File

@@ -0,0 +1,71 @@
#!/usr/bin/env python3
"""
[OPS] Self-Healing Infrastructure
Part of the Gemini Sovereign Infrastructure Suite.
Auto-detects and fixes common failures across the fleet.
"""
import os
import sys
import subprocess
import argparse
import requests
# --- CONFIGURATION ---
FLEET = {
"mac": {"ip": "10.1.10.77", "port": 8080},
"ezra": {"ip": "143.198.27.163", "port": 8080},
"allegro": {"ip": "167.99.126.228", "port": 8080},
"bezalel": {"ip": "159.203.146.185", "port": 8080}
}
class SelfHealer:
def log(self, message: str):
print(f"[*] {message}")
def run_remote(self, host: str, command: str):
ip = FLEET[host]["ip"]
ssh_cmd = ["ssh", "-o", "StrictHostKeyChecking=no", f"root@{ip}", command]
if host == "mac":
ssh_cmd = ["bash", "-c", command]
try:
return subprocess.run(ssh_cmd, capture_output=True, text=True, timeout=10)
except:
return None
def check_and_heal(self):
for host in FLEET:
self.log(f"Auditing {host}...")
# 1. Check llama-server
ip = FLEET[host]["ip"]
port = FLEET[host]["port"]
try:
requests.get(f"http://{ip}:{port}/health", timeout=2)
except:
self.log(f" [!] llama-server down on {host}. Attempting restart...")
self.run_remote(host, "systemctl restart llama-server")
# 2. Check disk space
res = self.run_remote(host, "df -h / | tail -1 | awk '{print $5}' | sed 's/%//'")
if res and res.returncode == 0:
try:
usage = int(res.stdout.strip())
if usage > 90:
self.log(f" [!] Disk usage high on {host} ({usage}%). Cleaning logs...")
self.run_remote(host, "journalctl --vacuum-time=1d && rm -rf /var/log/*.gz")
except:
pass
def run(self):
self.log("Starting self-healing cycle...")
self.check_and_heal()
self.log("Cycle complete.")
def main():
healer = SelfHealer()
healer.run()
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,75 @@
#!/usr/bin/env python3
"""
[OPS] Sovereign Skill Installer
Part of the Gemini Sovereign Infrastructure Suite.
Packages and installs Hermes skills onto remote wizard nodes.
"""
import os
import sys
import argparse
import subprocess
from pathlib import Path
# --- CONFIGURATION ---
# Assumes hermes-agent is a sibling directory to timmy-config
HERMES_ROOT = "../hermes-agent"
SKILLS_DIR = "skills"
class SkillInstaller:
def __init__(self, host: str, ip: str):
self.host = host
self.ip = ip
self.hermes_path = Path(HERMES_ROOT).resolve()
def log(self, message: str):
print(f"[*] {message}")
def error(self, message: str):
print(f"[!] ERROR: {message}")
sys.exit(1)
def install_skill(self, skill_name: str):
self.log(f"Installing skill '{skill_name}' to {self.host} ({self.ip})...")
skill_path = self.hermes_path / SKILLS_DIR / skill_name
if not skill_path.exists():
self.error(f"Skill '{skill_name}' not found in {skill_path}")
# 1. Compress skill
self.log("Compressing skill...")
tar_file = f"{skill_name}.tar.gz"
subprocess.run(["tar", "-czf", tar_file, "-C", str(skill_path.parent), skill_name])
# 2. Upload to remote
self.log("Uploading to remote...")
remote_path = f"/opt/hermes/skills/{skill_name}"
subprocess.run(["ssh", f"root@{self.ip}", f"mkdir -p /opt/hermes/skills"])
subprocess.run(["scp", tar_file, f"root@{self.ip}:/tmp/"])
# 3. Extract and register
self.log("Extracting and registering...")
extract_cmd = f"tar -xzf /tmp/{tar_file} -C /opt/hermes/skills/ && rm /tmp/{tar_file}"
subprocess.run(["ssh", f"root@{self.ip}", extract_cmd])
# Registration logic (simplified)
# In a real scenario, we'd update the wizard's config.yaml
self.log(f"[SUCCESS] Skill '{skill_name}' installed on {self.host}")
# Cleanup local tar
os.remove(tar_file)
def main():
parser = argparse.ArgumentParser(description="Gemini Skill Installer")
parser.add_argument("host", help="Target host name")
parser.add_argument("ip", help="Target host IP")
parser.add_argument("skill", help="Skill name to install")
args = parser.parse_args()
installer = SkillInstaller(args.host, args.ip)
installer.install_skill(args.skill)
if __name__ == "__main__":
main()

129
scripts/telemetry.py Normal file
View File

@@ -0,0 +1,129 @@
#!/usr/bin/env python3
"""
[OPS] Telemetry Pipeline v2
Part of the Gemini Sovereign Infrastructure Suite.
Operational visibility without cloud dependencies.
"""
import os
import sys
import json
import time
import subprocess
import argparse
# --- CONFIGURATION ---
FLEET = {
"mac": "10.1.10.77",
"ezra": "143.198.27.163",
"allegro": "167.99.126.228",
"bezalel": "159.203.146.185"
}
TELEMETRY_FILE = "logs/telemetry.json"
class Telemetry:
def __init__(self):
# Find logs relative to repo root
script_dir = os.path.dirname(os.path.abspath(__file__))
repo_root = os.path.dirname(script_dir)
self.logs_dir = os.path.join(repo_root, "logs")
self.telemetry_path = os.path.join(repo_root, TELEMETRY_FILE)
if not os.path.exists(self.logs_dir):
os.makedirs(self.logs_dir)
def log(self, message: str):
print(f"[*] {message}")
def get_metrics(self, host: str):
ip = FLEET[host]
# Command to get disk usage, memory usage (%), and load avg
cmd = "df -h / | tail -1 | awk '{print $5}' && free -m | grep Mem | awk '{print $3/$2 * 100}' && uptime | awk '{print $10}'"
ssh_cmd = ["ssh", "-o", "StrictHostKeyChecking=no", f"root@{ip}", cmd]
if host == "mac":
# Mac specific commands
cmd = "df -h / | tail -1 | awk '{print $5}' && sysctl -n vm.page_pageable_internal_count && uptime | awk '{print $10}'"
ssh_cmd = ["bash", "-c", cmd]
try:
res = subprocess.run(ssh_cmd, capture_output=True, text=True, timeout=10)
if res.returncode == 0:
lines = res.stdout.strip().split("\n")
return {
"disk_usage": lines[0],
"mem_usage": f"{float(lines[1]):.1f}%" if len(lines) > 1 and lines[1].replace('.','',1).isdigit() else "unknown",
"load_avg": lines[2].rstrip(",") if len(lines) > 2 else "unknown"
}
except:
pass
return None
def collect(self):
self.log("Collecting telemetry from fleet...")
data = {
"timestamp": time.time(),
"metrics": {}
}
for host in FLEET:
self.log(f"Fetching metrics from {host}...")
metrics = self.get_metrics(host)
if metrics:
data["metrics"][host] = metrics
# Append to telemetry file
history = []
if os.path.exists(self.telemetry_path):
with open(self.telemetry_path, "r") as f:
try:
history = json.load(f)
except:
history = []
history.append(data)
# Keep only last 100 entries
history = history[-100:]
with open(self.telemetry_path, "w") as f:
json.dump(history, f, indent=2)
self.log(f"Telemetry saved to {self.telemetry_path}")
def show_summary(self):
if not os.path.exists(self.telemetry_path):
print("No telemetry data found.")
return
with open(self.telemetry_path, "r") as f:
try:
history = json.load(f)
except:
print("Error reading telemetry data.")
return
if not history:
print("No telemetry data found.")
return
latest = history[-1]
print(f"\n--- Fleet Telemetry Summary ({time.ctime(latest['timestamp'])}) ---")
print(f"{'HOST':<10} {'DISK':<10} {'MEM':<10} {'LOAD':<10}")
print("-" * 45)
for host, m in latest["metrics"].items():
print(f"{host:<10} {m['disk_usage']:<10} {m['mem_usage']:<10} {m['load_avg']:<10}")
def main():
parser = argparse.ArgumentParser(description="Gemini Telemetry")
parser.add_argument("command", choices=["collect", "summary"], help="Command to run")
args = parser.parse_args()
telemetry = Telemetry()
if args.command == "collect":
telemetry.collect()
elif args.command == "summary":
telemetry.show_summary()
if __name__ == "__main__":
main()