ops: source-control nightly burn automation

This commit is contained in:
Alexander Whitestone
2026-04-06 09:47:35 -04:00
parent ffea2964c4
commit 8d4591dec9
7 changed files with 425 additions and 266577 deletions

4
.gitignore vendored
View File

@@ -8,3 +8,7 @@
*.db-wal
*.db-shm
__pycache__/
# Logs and runtime churn
logs/
*.log

52
bin/burn-cycle-deadman.sh Executable file
View File

@@ -0,0 +1,52 @@
#!/usr/bin/env bash
# burn-cycle-deadman.sh — Dead-man switch for burn mode cron jobs.
# Run after each burn cycle to detect silent failures:
# alert if cron ran but no log/heartbeat was produced.
#
# Exit status: 0 = proof is fresh, 1 = no proof or proof too old.
# Machine-readable state is written to $STATUS_FILE ("OK:<mins>",
# "ALERT:<mins>", or "DEAD"); human-readable alerts append to $ALERT_FILE.
set -euo pipefail

LOG_DIR="$HOME/.hermes/burn-logs"
ALERT_FILE="${LOG_DIR}/ALERT.log"
STATUS_FILE="${LOG_DIR}/deadman-status.log"
MAIN_LOG="${LOG_DIR}/timmy.log"
HEARTBEAT_FILE="${LOG_DIR}/bounded-burn-heartbeat.txt"

# Bound the allowed silence. The overnight burn runs every 15m.
MAX_SILENT_MINS=120

mkdir -p "$LOG_DIR"

# mtime of each proof file; 0 when missing.
# stat -f %m is BSD/macOS, stat -c %Y is GNU — try both so the script
# works on either platform.
last_log_mod=0
last_heartbeat_mod=0
if [ -f "$MAIN_LOG" ]; then
  last_log_mod=$(stat -f %m "$MAIN_LOG" 2>/dev/null || stat -c %Y "$MAIN_LOG" 2>/dev/null || echo "0")
fi
if [ -f "$HEARTBEAT_FILE" ]; then
  last_heartbeat_mod=$(stat -f %m "$HEARTBEAT_FILE" 2>/dev/null || stat -c %Y "$HEARTBEAT_FILE" 2>/dev/null || echo "0")
fi

# The freshest of the two proof files is what counts.
latest_mod=$last_log_mod
if [ "$last_heartbeat_mod" -gt "$latest_mod" ]; then
  latest_mod=$last_heartbeat_mod
fi

# No proof file has ever been written: the lane is dead.
if [ "$latest_mod" -eq 0 ]; then
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] ALERT: no burn proof files exist (timmy.log or bounded-burn-heartbeat.txt)" >> "$ALERT_FILE"
  echo "DEAD" > "$STATUS_FILE"
  exit 1
fi

now=$(date +%s)
gap_secs=$((now - latest_mod))
gap_mins=$((gap_secs / 60))

if [ "$gap_secs" -gt "$((MAX_SILENT_MINS * 60))" ]; then
  # FIX: 'date -r <epoch>' is BSD/macOS only; GNU date treats the arg as a
  # filename and fails. Add the GNU 'date -d @<epoch>' fallback before the
  # last-resort "now" so Linux hosts report the real last-update time.
  last_update=$(date -r "$latest_mod" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -d "@${latest_mod}" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] ALERT: No burn proof output for ${gap_mins}m (threshold: ${MAX_SILENT_MINS}m). Last update: ${last_update}" >> "$ALERT_FILE"
  echo "ALERT:${gap_mins}" > "$STATUS_FILE"
  exit 1
fi

# FIX: the verbose OK line used to be appended to $STATUS_FILE and then
# immediately destroyed by the truncating write below, so it was always
# lost. Emit it to stdout (visible in the cron log) and keep $STATUS_FILE
# as the single machine-readable status line.
echo "[$(date '+%Y-%m-%d %H:%M:%S')] OK: Burn proof updated ${gap_mins}m ago (threshold: ${MAX_SILENT_MINS}m)"
echo "OK:${gap_mins}" > "$STATUS_FILE"
exit 0

177
bin/morning-report-compiler.py Executable file
View File

@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""morning-report-compiler.py — Aggregate burn-logs into a raw overnight brief.
Runs at 6 AM via cron / manual trigger.
Note: this compiler writes the raw cycle brief.
The delivery cron can reformat that artifact into a phone-readable report.
"""
import re
import sys
from datetime import datetime, timedelta
from pathlib import Path
# Root of all local burn-mode state.
HERMES_HOME = Path.home() / ".hermes"
# Directory containing per-lane burn logs and generated reports.
BURN_LOGS = HERMES_HOME / "burn-logs"
# Alert lines appended by burn-cycle-deadman.sh, each prefixed "[YYYY-mm-dd HH:MM:SS]".
ALERT_LOG = BURN_LOGS / "ALERT.log"
# Single machine-readable status line written by the deadman script
# ("OK:<mins>", "ALERT:<mins>", or "DEAD").
ALERT_STATUS_FILE = BURN_LOGS / "deadman-status.log"
# Matches cycle headers like "=== BURN CYCLE: 2026-04-06 02:15 ===";
# the colon is optional and group 1 captures the header payload.
CYCLE_HEADER_RE = re.compile(r"=== BURN CYCLE:?\s*(.+?)\s*===")
def parse_cycle_timestamp(raw: str) -> datetime | None:
match = re.search(r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2})", raw)
if not match:
return None
return datetime.strptime(match.group(1), "%Y-%m-%d %H:%M")
def extract_repo_lines(block: str) -> list[dict]:
    """Parse the ``REPOS SURVEYED:`` bullet list out of one cycle block.

    Each ``- name: status`` bullet becomes ``{"name": ..., "status": ...}``;
    bullets without a colon get an empty status. Returns an empty list when
    the section is absent.
    """
    section = re.search(r"REPOS SURVEYED:\s*\n((?:\s*-\s+.*\n?)*)", block)
    if section is None:
        return []
    repos: list[dict] = []
    for raw in section.group(1).splitlines():
        entry = raw.strip()
        if not entry.startswith("-"):
            continue
        body = entry.lstrip("- ").strip()
        name, sep, status = body.partition(":")
        if sep:
            repos.append({"name": name.strip(), "status": status.strip()})
        else:
            repos.append({"name": body, "status": ""})
    return repos
def extract_next_tasks(block: str) -> list[str]:
    """Collect the bullets under the NEXT / NEXT CYCLE TARGET heading.

    Matching is case-insensitive and tolerates the singular/plural heading
    variants seen in the burn logs. Returns an empty list when no such
    section exists.
    """
    found = re.search(
        r"NEXT(?: CYCLE TARGET| TARGET| cycle targets)?\s*:\s*\n((?:\s*-\s+.*\n?)*)",
        block,
        re.IGNORECASE,
    )
    if found is None:
        return []
    return [
        entry.strip().lstrip("- ").strip()
        for entry in found.group(1).splitlines()
        if entry.strip().startswith("-")
    ]
def find_cycles(log_path: Path) -> list[dict]:
    """Parse every burn-cycle block out of one log file.

    A cycle block runs from one ``=== BURN CYCLE ... ===`` header to the
    next header (or end of file). Headers whose timestamp cannot be
    parsed are skipped. Returns a list of dicts with the raw header
    timestamp, its parsed datetime, the surveyed repos, and the
    next-cycle targets; empty when the file does not exist.
    """
    if not log_path.exists():
        return []
    text = log_path.read_text()
    matches = list(CYCLE_HEADER_RE.finditer(text))
    results: list[dict] = []
    for pos, header in enumerate(matches):
        raw = header.group(1).strip()
        stamp = parse_cycle_timestamp(raw)
        if stamp is None:
            continue
        # Block boundary: the next header's start, or EOF for the last cycle.
        tail = matches[pos + 1].start() if pos + 1 < len(matches) else len(text)
        segment = text[header.start():tail]
        results.append(
            {
                "timestamp": raw,
                "parsed_at": stamp,
                "repos": extract_repo_lines(segment),
                "next_tasks": extract_next_tasks(segment),
            }
        )
    return results
def get_alerts(hours: int = 12) -> list[str]:
    """Return ALERT.log lines whose bracketed timestamp is within *hours*.

    Only lines of the form ``[YYYY-mm-dd HH:MM:SS] ...`` are considered;
    lines with no bracket prefix or a malformed timestamp are skipped
    rather than crashing the report. Empty when the alert log is absent.
    """
    if not ALERT_LOG.exists():
        return []
    cutoff = datetime.now() - timedelta(hours=hours)
    recent: list[str] = []
    for entry in ALERT_LOG.read_text().splitlines():
        if not entry.startswith("["):
            continue
        stamped = re.match(r"\[([^\]]+)\]", entry)
        if stamped is None:
            continue
        try:
            when = datetime.strptime(stamped.group(1), "%Y-%m-%d %H:%M:%S")
        except ValueError:
            # Bracketed prefix that is not a timestamp — ignore the line.
            continue
        if when >= cutoff:
            recent.append(entry)
    return recent
def build_report(cycles: list[dict], alerts: list[str], hours: int = 12) -> str:
    """Render the aggregated cycles and alerts as a markdown brief.

    Shows at most the 5 most recent alerts, the first 12 cycles given,
    and up to 5 next-cycle targets per cycle. Returns the full report
    as one newline-joined string.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    out: list[str] = [
        f"# Burn Mode Daily Brief — {stamp}",
        f"## Period: Last {hours} hours",
        "",
        "## Overview",
        f"- Total cycles: {len(cycles)}",
        f"- Alerts raised: {len(alerts)}",
        "",
    ]
    if alerts:
        out.append("## Alerts")
        out.extend(f"⚠️ {entry}" for entry in alerts[-5:])
        out.append("")
    if cycles:
        out.append("## Cycles")
        out.append("")
        for cycle in cycles[:12]:
            out.append(f"### Cycle: {cycle['timestamp']}")
            for repo in cycle["repos"]:
                suffix = f": {repo['status']}" if repo["status"] else ""
                out.append(f"- **{repo['name']}**{suffix}")
            if cycle["next_tasks"]:
                out.append("")
                out.append("**Next cycle targets:**")
                out.extend(f"- {task}" for task in cycle["next_tasks"][:5])
            out.append("")
    else:
        out.append("No parseable burn-cycle activity in the reporting period.")
        out.append("")
    out.append("---")
    out.append(f"*Generated by morning-report-compiler.py at {stamp}*")
    return "\n".join(out)
def main() -> int:
    """Compile the last N hours of burn logs into a markdown morning report.

    Optional argv[1] is the look-back window in hours (default 12).
    Writes ``morning-report-<timestamp>.md`` into the burn-log directory
    and prints a short summary to stdout. Always returns 0.
    """
    hours = int(sys.argv[1]) if len(sys.argv) > 1 else 12
    cutoff = datetime.now() - timedelta(hours=hours)

    # Scan every *.log in the burn-log dir, newest files first.
    all_cycles: list[dict] = []
    for log_file in sorted(BURN_LOGS.glob("*.log"), key=lambda p: p.stat().st_mtime, reverse=True):
        all_cycles.extend(find_cycles(log_file))

    # Keep only cycles inside the window, most recent first.
    recent = sorted(
        (cycle for cycle in all_cycles if cycle["parsed_at"] >= cutoff),
        key=lambda cycle: cycle["parsed_at"],
        reverse=True,
    )

    alerts = get_alerts(hours)
    deadman_status = ALERT_STATUS_FILE.read_text().strip() if ALERT_STATUS_FILE.exists() else ""

    report = build_report(recent, alerts, hours)
    report_path = BURN_LOGS / f"morning-report-{datetime.now().strftime('%Y-%m-%d-%H%M')}.md"
    report_path.write_text(report)

    print(f"Morning report saved: {report_path}")
    print(f"Cycles found: {len(recent)}")
    print(f"Alerts found: {len(alerts)}")
    print(f"Dead-man status: {deadman_status}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -39,7 +39,8 @@ Rule:
### A. launchd-loaded automations
These are loaded right now according to `launchctl list`.
These are loaded right now according to `launchctl list` after the 2026-04-04 phase-2 cleanup.
The only Timmy-specific launchd jobs still loaded are the ones below.
#### 1. ai.hermes.gateway
- Plist: ~/Library/LaunchAgents/ai.hermes.gateway.plist
@@ -91,6 +92,25 @@ launchctl bootstrap gui/$(id -u) ~/Library/LaunchAgents/ai.hermes.gateway.plist
- Old-state risk:
- long-lived gateway survives toolchain assumptions and keeps accepting work even if upstream routing changed
#### 3a. ai.hermes.gateway-bezalel
- Plist: ~/Library/LaunchAgents/ai.hermes.gateway-bezalel.plist
- Command: Hermes gateway under the Bezalel profile
- HERMES_HOME: `~/.hermes/profiles/bezalel`
- Logs:
- `~/.hermes/profiles/bezalel/logs/gateway.log`
- `~/.hermes/profiles/bezalel/logs/gateway.error.log`
- KeepAlive: yes
- RunAtLoad: yes
- Old-state risk:
- Bezalel can keep reviving a broken provider/auth chain unless the profile itself is repaired
#### 3b. ai.timmy.codeclaw-qwen-heartbeat
- Plist: ~/Library/LaunchAgents/ai.timmy.codeclaw-qwen-heartbeat.plist
- Purpose: monitor/revive the CodeClaw Qwen lane
- Old-state risk:
- can resurrect a side lane whose model/provider assumptions no longer match current truth
- should be audited any time CodeClaw routing changes
#### 4. ai.timmy.kimi-heartbeat
- Plist: ~/Library/LaunchAgents/ai.timmy.kimi-heartbeat.plist
- Command: `/bin/bash ~/.timmy/uniwizard/kimi-heartbeat.sh`
@@ -137,31 +157,41 @@ rm -f /tmp/kimi-heartbeat.lock
- any watchdog can resurrect a loop you meant to leave dead
- this is the first place to check when a loop "comes back"
### B. quarantined legacy launch agents
These were moved out of `~/Library/LaunchAgents` on 2026-04-04 to:
`~/Library/LaunchAgents.quarantine/timmy-legacy-20260404/`
#### 6. com.timmy.dashboard-backend
- Plist: ~/Library/LaunchAgents/com.timmy.dashboard-backend.plist
- Command: uvicorn `dashboard.app:app`
- Working directory: `~/worktrees/kimi-repo`
- Port: 8100
- Logs: `~/.hermes/logs/dashboard-backend.log`
- KeepAlive: yes
- RunAtLoad: yes
- Old-state risk:
- this serves code from a specific worktree, not from current repo truth in the abstract
- if `~/worktrees/kimi-repo` is stale, launchd will faithfully keep serving stale code
- Former plist: `com.timmy.dashboard-backend.plist`
- Former command: uvicorn `dashboard.app:app`
- Former working directory: `~/worktrees/kimi-repo`
- Quarantine reason:
- served code from a specific stale worktree
- could revive old backend state by launchd KeepAlive alone
#### 7. com.timmy.matrix-frontend
- Plist: ~/Library/LaunchAgents/com.timmy.matrix-frontend.plist
- Command: `npx vite --host`
- Working directory: `~/worktrees/the-matrix`
- Logs: `~/.hermes/logs/matrix-frontend.log`
- KeepAlive: yes
- RunAtLoad: yes
- Old-state risk:
- HIGH
- this still points at `~/worktrees/the-matrix`, even though the live 3D world work moved to `Timmy_Foundation/the-nexus`
- if this is left loaded, it can revive the old frontend lineage
- Former plist: `com.timmy.matrix-frontend.plist`
- Former command: `npx vite --host`
- Former working directory: `~/worktrees/the-matrix`
- Quarantine reason:
- pointed at the old `the-matrix` lineage instead of current nexus truth
- could revive a stale frontend every login
### B. running now but NOT launchd-managed
#### 8. ai.hermes.startup
- Former plist: `ai.hermes.startup.plist`
- Former command: `~/.hermes/bin/hermes-startup.sh`
- Quarantine reason:
- startup path still expected missing `timmy-tmux.sh`
- could recreate old webhook/tmux assumptions at login
#### 9. com.timmy.tick
- Former plist: `com.timmy.tick.plist`
- Former command: `/Users/apayne/Timmy-time-dashboard/deploy/timmy-tick-mac.sh`
- Quarantine reason:
- pure dashboard-era legacy path
### C. running now but NOT launchd-managed
These are live processes, but not currently represented by a loaded launchd plist.
They can still persist because they were started with `nohup` or by other parent scripts.
@@ -207,7 +237,7 @@ printf '{}\n' > ~/.hermes/logs/gemini-active.json
- can repopulate agent queues even after you thought they were cleared
- not represented in timmy-config/bin yet as of this audit
### C. Hermes cron automations
### D. Hermes cron automations
Current cron inventory from `cronjob(list, include_disabled=true)`:
@@ -224,27 +254,11 @@ Old-state risk:
- paused crons are not dead forever; they are resumable state
- LLM-wrapped crons can revive old routing/model assumptions if resumed blindly
### D. file exists but NOT currently loaded
### E. file exists but NOT currently loaded
These are the ones most likely to surprise us later because they still exist and point at old realities.
#### 10. ai.hermes.startup
- Plist: `~/Library/LaunchAgents/ai.hermes.startup.plist`
- Points to: `~/.hermes/bin/hermes-startup.sh`
- Not loaded in launchctl at audit time
- High-risk notes:
- startup script still expects `~/.hermes/bin/timmy-tmux.sh`
- that file is MISSING at audit time
- script also tries to start webhook listener and the old `timmy-loop` tmux world
- This is a dormant old-state resurrection path
#### 11. com.timmy.tick
- Plist: `~/Library/LaunchAgents/com.timmy.tick.plist`
- Points to: `/Users/apayne/Timmy-time-dashboard/deploy/timmy-tick-mac.sh`
- Not loaded at audit time
- Definitely legacy dashboard-era automation
#### 12. com.tower.pr-automerge
#### 10. com.tower.pr-automerge
- Plist: `~/Library/LaunchAgents/com.tower.pr-automerge.plist`
- Points to: `/Users/apayne/hermes-config/bin/pr-automerge.sh`
- Not loaded at audit time
@@ -317,8 +331,6 @@ launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/ai.timmy.claudemax-watchdo
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/ai.hermes.gateway.plist || true
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/ai.hermes.gateway-fenrir.plist || true
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/ai.openclaw.gateway.plist || true
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/com.timmy.dashboard-backend.plist || true
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/com.timmy.matrix-frontend.plist || true
```
2. Kill manual loops
@@ -349,10 +361,9 @@ cp ~/.hermes/sessions/sessions.json ~/.hermes/sessions/sessions.json.bak.$(date
## Current contradictions to fix later
1. README still describes `bin/` as "NOT deprecated loops" but live runtime still contains revived loop scripts.
2. `DEPRECATED.md` says claude-loop/gemini-loop/timmy-orchestrator/claudemax-watchdog were removed, but reality disagrees.
3. `com.timmy.matrix-frontend` still points at `~/worktrees/the-matrix` rather than the nexus lineage.
4. `ai.hermes.startup` still points at a startup path that expects missing `timmy-tmux.sh`.
5. `gemini-loop.sh` and `timmy-orchestrator.sh` are live but not yet mirrored into timmy-config/bin/.
1. README and DEPRECATED were corrected on 2026-04-04, but older local clones may still have stale prose.
2. The quarantined launch agents now live under `~/Library/LaunchAgents.quarantine/timmy-legacy-20260404/`; if someone moves them back, the old state can return.
3. `gemini-loop.sh` and `timmy-orchestrator.sh` are live but not yet mirrored into timmy-config/bin/.
4. The open docs PR must be kept clean: do not mix operational script recovery and documentation history on the same branch.
Until those are reconciled, trust this inventory over older prose.

133
docs/nightly-burn-mode.md Normal file
View File

@@ -0,0 +1,133 @@
# Nightly Burn Mode — Canonical Lineup
Status: active pattern as of 2026-04-06
Owner: Timmy
Scope: overnight burn automation for the local Mac
## Canonical overnight lineup
Keep the overnight stack small, bounded, and proof-bearing.
1. one bounded burn job
2. one dead-man job
3. one morning-report job
4. one health monitor
Do not run duplicate burn jobs on the same lane.
Do not leave structurally broken jobs enabled overnight.
## Canonical jobs
### 1. Burn Mode — Timmy Orchestrator
Purpose: one bounded overnight burn cycle every 15 minutes.
Rules:
- no repo cloning
- no rebases
- no deep repairs
- at most one tangible action per cycle
- if there is no clear quick win, leave proof of a healthy no-op and stay silent
- do not step into Evennia automation from this lane
Quick-win priority:
1. merge one obviously safe PR
2. answer one unresolved human comment on a Timmy-touched issue
3. leave one stale/unblocked proof-first comment
4. otherwise write a healthy no-op heartbeat
Required proof per non-silent cycle:
- what was touched
- evidence link(s)
- next target
### 2. Burn Deadman
Purpose: detect when the burn lane has gone silent.
Command:
```bash
bash ~/.hermes/bin/burn-cycle-deadman.sh
```
Signal source:
- `~/.hermes/burn-logs/timmy.log`
- `~/.hermes/burn-logs/bounded-burn-heartbeat.txt`
A healthy no-op cycle must still update one of those proof files, otherwise deadman will false-alert.
### 3. Morning Report — Burn Mode
Purpose: compile the overnight burn into a raw structured brief, then let the delivery cron reformat it into a phone-readable morning report.
Command:
```bash
python3 ~/.hermes/bin/morning-report-compiler.py 12
```
Delivery shape (applied by the cron prompt that reads the generated markdown):
- Shipped
- Failed
- Fleet Status
- Stakes Cleared
- Next 3
### 4. Health Monitor
Purpose: basic local machine health.
Checks:
- Ollama reachability
- disk
- memory
- process count
## Jobs to keep paused unless repaired
### Duplicate burn loops
If two 15-minute burns point at the same lane, pause one.
### velocity-engine
Pause overnight if it shows any of:
- 0 claimed
- 0 created
- repeated HTTP 422 self-generation failures
- KeyError `total_claimed`
### wolf-eval-cycle
Pause overnight if it times out and does not directly help the burn lane.
## Source-of-truth rule
Do not leave this pattern as live-only cron state.
Repo-truth must include:
- the canonical overnight lineup
- the deadman script
- the morning report compiler
- the bounded-burn prompt/rules
## No-op heartbeat rule
Bounded overnight burns often find no safe merge and no fresh human comment.
That is fine.
What is not fine:
- returning silent with no proof of liveness
- letting deadman conclude the lane died when the lane merely had no quick win
Healthy no-op cycles should update:
- `~/.hermes/burn-logs/bounded-burn-heartbeat.txt`
Recommended contents:
- UTC timestamp
- repos polled
- blocker proof links
- note that no low-risk quick win existed
## Why this pattern won
Compared with the old sprawling burn loop, the bounded pattern produced:
- one real merge when available
- multiple proof-first human-comment wins
- useful stale/unblocked nudges
- much better morning visibility
- less falsework
The goal is not drama.
The goal is consistent, bounded, sovereign overnight work.

File diff suppressed because it is too large Load Diff

View File