ops: source-control nightly burn automation

This commit is contained in:
Alexander Whitestone
2026-04-06 09:47:35 -04:00
parent ffea2964c4
commit 8d4591dec9
7 changed files with 425 additions and 266577 deletions

4
.gitignore vendored
View File

@@ -8,3 +8,7 @@
*.db-wal
*.db-shm
__pycache__/
# Logs and runtime churn
logs/
*.log

52
bin/burn-cycle-deadman.sh Executable file
View File

@@ -0,0 +1,52 @@
#!/usr/bin/env bash
# burn-cycle-deadman.sh — Dead-man switch for burn mode cron jobs.
# Run after each burn cycle to detect silent failures:
# alert if cron ran but no log/heartbeat was produced.
#
# Exit status: 0 = proof is fresh, 1 = no proof or proof too old.
# Machine-readable state is written to $STATUS_FILE ("OK:<mins>",
# "ALERT:<mins>", or "DEAD"); human-readable alerts append to $ALERT_FILE.
set -euo pipefail

LOG_DIR="$HOME/.hermes/burn-logs"
ALERT_FILE="${LOG_DIR}/ALERT.log"
STATUS_FILE="${LOG_DIR}/deadman-status.log"
MAIN_LOG="${LOG_DIR}/timmy.log"
HEARTBEAT_FILE="${LOG_DIR}/bounded-burn-heartbeat.txt"

# Bound the allowed silence. The overnight burn runs every 15m.
MAX_SILENT_MINS=120

mkdir -p "$LOG_DIR"

# mtime of each proof file; 0 when missing.
# stat -f %m is BSD/macOS, stat -c %Y is GNU — try both so the script
# works on either platform.
last_log_mod=0
last_heartbeat_mod=0
if [ -f "$MAIN_LOG" ]; then
  last_log_mod=$(stat -f %m "$MAIN_LOG" 2>/dev/null || stat -c %Y "$MAIN_LOG" 2>/dev/null || echo "0")
fi
if [ -f "$HEARTBEAT_FILE" ]; then
  last_heartbeat_mod=$(stat -f %m "$HEARTBEAT_FILE" 2>/dev/null || stat -c %Y "$HEARTBEAT_FILE" 2>/dev/null || echo "0")
fi

# The freshest of the two proof files is what counts.
latest_mod=$last_log_mod
if [ "$last_heartbeat_mod" -gt "$latest_mod" ]; then
  latest_mod=$last_heartbeat_mod
fi

# No proof file has ever been written: the lane is dead.
if [ "$latest_mod" -eq 0 ]; then
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] ALERT: no burn proof files exist (timmy.log or bounded-burn-heartbeat.txt)" >> "$ALERT_FILE"
  echo "DEAD" > "$STATUS_FILE"
  exit 1
fi

now=$(date +%s)
gap_secs=$((now - latest_mod))
gap_mins=$((gap_secs / 60))

if [ "$gap_secs" -gt "$((MAX_SILENT_MINS * 60))" ]; then
  # FIX: 'date -r <epoch>' is BSD/macOS only; GNU date treats the arg as a
  # filename and fails. Add the GNU 'date -d @<epoch>' fallback before the
  # last-resort "now" so Linux hosts report the real last-update time.
  last_update=$(date -r "$latest_mod" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date -d "@${latest_mod}" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || date '+%Y-%m-%d %H:%M:%S')
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] ALERT: No burn proof output for ${gap_mins}m (threshold: ${MAX_SILENT_MINS}m). Last update: ${last_update}" >> "$ALERT_FILE"
  echo "ALERT:${gap_mins}" > "$STATUS_FILE"
  exit 1
fi

# FIX: the verbose OK line used to be appended to $STATUS_FILE and then
# immediately destroyed by the truncating write below, so it was always
# lost. Emit it to stdout (visible in the cron log) and keep $STATUS_FILE
# as the single machine-readable status line.
echo "[$(date '+%Y-%m-%d %H:%M:%S')] OK: Burn proof updated ${gap_mins}m ago (threshold: ${MAX_SILENT_MINS}m)"
echo "OK:${gap_mins}" > "$STATUS_FILE"
exit 0

177
bin/morning-report-compiler.py Executable file
View File

@@ -0,0 +1,177 @@
#!/usr/bin/env python3
"""morning-report-compiler.py — Aggregate burn-logs into a raw overnight brief.
Runs at 6 AM via cron / manual trigger.
Note: this compiler writes the raw cycle brief.
The delivery cron can reformat that artifact into a phone-readable report.
"""
import re
import sys
from datetime import datetime, timedelta
from pathlib import Path
# Root of all local burn-mode state.
HERMES_HOME = Path.home() / ".hermes"
# Directory containing per-lane burn logs and generated reports.
BURN_LOGS = HERMES_HOME / "burn-logs"
# Alert lines appended by burn-cycle-deadman.sh, each prefixed "[YYYY-mm-dd HH:MM:SS]".
ALERT_LOG = BURN_LOGS / "ALERT.log"
# Single machine-readable status line written by the deadman script
# ("OK:<mins>", "ALERT:<mins>", or "DEAD").
ALERT_STATUS_FILE = BURN_LOGS / "deadman-status.log"
# Matches cycle headers like "=== BURN CYCLE: 2026-04-06 02:15 ===";
# the colon is optional and group 1 captures the header payload.
CYCLE_HEADER_RE = re.compile(r"=== BURN CYCLE:?\s*(.+?)\s*===")
def parse_cycle_timestamp(raw: str) -> datetime | None:
match = re.search(r"(\d{4}-\d{2}-\d{2} \d{2}:\d{2})", raw)
if not match:
return None
return datetime.strptime(match.group(1), "%Y-%m-%d %H:%M")
def extract_repo_lines(block: str) -> list[dict]:
    """Parse the ``REPOS SURVEYED:`` bullet list out of one cycle block.

    Each ``- name: status`` bullet becomes ``{"name": ..., "status": ...}``;
    bullets without a colon get an empty status. Returns an empty list when
    the section is absent.
    """
    section = re.search(r"REPOS SURVEYED:\s*\n((?:\s*-\s+.*\n?)*)", block)
    if section is None:
        return []
    repos: list[dict] = []
    for raw in section.group(1).splitlines():
        entry = raw.strip()
        if not entry.startswith("-"):
            continue
        body = entry.lstrip("- ").strip()
        name, sep, status = body.partition(":")
        if sep:
            repos.append({"name": name.strip(), "status": status.strip()})
        else:
            repos.append({"name": body, "status": ""})
    return repos
def extract_next_tasks(block: str) -> list[str]:
    """Collect the bullets under the NEXT / NEXT CYCLE TARGET heading.

    Matching is case-insensitive and tolerates the singular/plural heading
    variants seen in the burn logs. Returns an empty list when no such
    section exists.
    """
    found = re.search(
        r"NEXT(?: CYCLE TARGET| TARGET| cycle targets)?\s*:\s*\n((?:\s*-\s+.*\n?)*)",
        block,
        re.IGNORECASE,
    )
    if found is None:
        return []
    return [
        entry.strip().lstrip("- ").strip()
        for entry in found.group(1).splitlines()
        if entry.strip().startswith("-")
    ]
def find_cycles(log_path: Path) -> list[dict]:
    """Parse every burn-cycle block out of one log file.

    A cycle block runs from one ``=== BURN CYCLE ... ===`` header to the
    next header (or end of file). Headers whose timestamp cannot be
    parsed are skipped. Returns a list of dicts with the raw header
    timestamp, its parsed datetime, the surveyed repos, and the
    next-cycle targets; empty when the file does not exist.
    """
    if not log_path.exists():
        return []
    text = log_path.read_text()
    matches = list(CYCLE_HEADER_RE.finditer(text))
    results: list[dict] = []
    for pos, header in enumerate(matches):
        raw = header.group(1).strip()
        stamp = parse_cycle_timestamp(raw)
        if stamp is None:
            continue
        # Block boundary: the next header's start, or EOF for the last cycle.
        tail = matches[pos + 1].start() if pos + 1 < len(matches) else len(text)
        segment = text[header.start():tail]
        results.append(
            {
                "timestamp": raw,
                "parsed_at": stamp,
                "repos": extract_repo_lines(segment),
                "next_tasks": extract_next_tasks(segment),
            }
        )
    return results
def get_alerts(hours: int = 12) -> list[str]:
    """Return ALERT.log lines whose bracketed timestamp is within *hours*.

    Only lines of the form ``[YYYY-mm-dd HH:MM:SS] ...`` are considered;
    lines with no bracket prefix or a malformed timestamp are skipped
    rather than crashing the report. Empty when the alert log is absent.
    """
    if not ALERT_LOG.exists():
        return []
    cutoff = datetime.now() - timedelta(hours=hours)
    recent: list[str] = []
    for entry in ALERT_LOG.read_text().splitlines():
        if not entry.startswith("["):
            continue
        stamped = re.match(r"\[([^\]]+)\]", entry)
        if stamped is None:
            continue
        try:
            when = datetime.strptime(stamped.group(1), "%Y-%m-%d %H:%M:%S")
        except ValueError:
            # Bracketed prefix that is not a timestamp — ignore the line.
            continue
        if when >= cutoff:
            recent.append(entry)
    return recent
def build_report(cycles: list[dict], alerts: list[str], hours: int = 12) -> str:
    """Render the aggregated cycles and alerts as a markdown brief.

    Shows at most the 5 most recent alerts, the first 12 cycles given,
    and up to 5 next-cycle targets per cycle. Returns the full report
    as one newline-joined string.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    out: list[str] = [
        f"# Burn Mode Daily Brief — {stamp}",
        f"## Period: Last {hours} hours",
        "",
        "## Overview",
        f"- Total cycles: {len(cycles)}",
        f"- Alerts raised: {len(alerts)}",
        "",
    ]
    if alerts:
        out.append("## Alerts")
        out.extend(f"⚠️ {entry}" for entry in alerts[-5:])
        out.append("")
    if cycles:
        out.append("## Cycles")
        out.append("")
        for cycle in cycles[:12]:
            out.append(f"### Cycle: {cycle['timestamp']}")
            for repo in cycle["repos"]:
                suffix = f": {repo['status']}" if repo["status"] else ""
                out.append(f"- **{repo['name']}**{suffix}")
            if cycle["next_tasks"]:
                out.append("")
                out.append("**Next cycle targets:**")
                out.extend(f"- {task}" for task in cycle["next_tasks"][:5])
            out.append("")
    else:
        out.append("No parseable burn-cycle activity in the reporting period.")
        out.append("")
    out.append("---")
    out.append(f"*Generated by morning-report-compiler.py at {stamp}*")
    return "\n".join(out)
def main() -> int:
    """Compile the last N hours of burn logs into a markdown morning report.

    Optional argv[1] is the look-back window in hours (default 12).
    Writes ``morning-report-<timestamp>.md`` into the burn-log directory
    and prints a short summary to stdout. Always returns 0.
    """
    hours = int(sys.argv[1]) if len(sys.argv) > 1 else 12
    cutoff = datetime.now() - timedelta(hours=hours)

    # Scan every *.log in the burn-log dir, newest files first.
    all_cycles: list[dict] = []
    for log_file in sorted(BURN_LOGS.glob("*.log"), key=lambda p: p.stat().st_mtime, reverse=True):
        all_cycles.extend(find_cycles(log_file))

    # Keep only cycles inside the window, most recent first.
    recent = sorted(
        (cycle for cycle in all_cycles if cycle["parsed_at"] >= cutoff),
        key=lambda cycle: cycle["parsed_at"],
        reverse=True,
    )

    alerts = get_alerts(hours)
    deadman_status = ALERT_STATUS_FILE.read_text().strip() if ALERT_STATUS_FILE.exists() else ""

    report = build_report(recent, alerts, hours)
    report_path = BURN_LOGS / f"morning-report-{datetime.now().strftime('%Y-%m-%d-%H%M')}.md"
    report_path.write_text(report)

    print(f"Morning report saved: {report_path}")
    print(f"Cycles found: {len(recent)}")
    print(f"Alerts found: {len(alerts)}")
    print(f"Dead-man status: {deadman_status}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -39,7 +39,8 @@ Rule:
### A. launchd-loaded automations
These are loaded right now according to `launchctl list`.
These are loaded right now according to `launchctl list` after the 2026-04-04 phase-2 cleanup.
The only Timmy-specific launchd jobs still loaded are the ones below.
#### 1. ai.hermes.gateway
- Plist: ~/Library/LaunchAgents/ai.hermes.gateway.plist
@@ -91,6 +92,25 @@ launchctl bootstrap gui/$(id -u) ~/Library/LaunchAgents/ai.hermes.gateway.plist
- Old-state risk:
- long-lived gateway survives toolchain assumptions and keeps accepting work even if upstream routing changed
#### 3a. ai.hermes.gateway-bezalel
- Plist: ~/Library/LaunchAgents/ai.hermes.gateway-bezalel.plist
- Command: Hermes gateway under the Bezalel profile
- HERMES_HOME: `~/.hermes/profiles/bezalel`
- Logs:
- `~/.hermes/profiles/bezalel/logs/gateway.log`
- `~/.hermes/profiles/bezalel/logs/gateway.error.log`
- KeepAlive: yes
- RunAtLoad: yes
- Old-state risk:
- Bezalel can keep reviving a broken provider/auth chain unless the profile itself is repaired
#### 3b. ai.timmy.codeclaw-qwen-heartbeat
- Plist: ~/Library/LaunchAgents/ai.timmy.codeclaw-qwen-heartbeat.plist
- Purpose: monitor/revive the CodeClaw Qwen lane
- Old-state risk:
- can resurrect a side lane whose model/provider assumptions no longer match current truth
- should be audited any time CodeClaw routing changes
#### 4. ai.timmy.kimi-heartbeat
- Plist: ~/Library/LaunchAgents/ai.timmy.kimi-heartbeat.plist
- Command: `/bin/bash ~/.timmy/uniwizard/kimi-heartbeat.sh`
@@ -137,31 +157,41 @@ rm -f /tmp/kimi-heartbeat.lock
- any watchdog can resurrect a loop you meant to leave dead
- this is the first place to check when a loop "comes back"
### B. quarantined legacy launch agents
These were moved out of `~/Library/LaunchAgents` on 2026-04-04 to:
`~/Library/LaunchAgents.quarantine/timmy-legacy-20260404/`
#### 6. com.timmy.dashboard-backend
- Plist: ~/Library/LaunchAgents/com.timmy.dashboard-backend.plist
- Command: uvicorn `dashboard.app:app`
- Working directory: `~/worktrees/kimi-repo`
- Port: 8100
- Logs: `~/.hermes/logs/dashboard-backend.log`
- KeepAlive: yes
- RunAtLoad: yes
- Old-state risk:
- this serves code from a specific worktree, not from current repo truth in the abstract
- if `~/worktrees/kimi-repo` is stale, launchd will faithfully keep serving stale code
- Former plist: `com.timmy.dashboard-backend.plist`
- Former command: uvicorn `dashboard.app:app`
- Former working directory: `~/worktrees/kimi-repo`
- Quarantine reason:
- served code from a specific stale worktree
- could revive old backend state by launchd KeepAlive alone
#### 7. com.timmy.matrix-frontend
- Plist: ~/Library/LaunchAgents/com.timmy.matrix-frontend.plist
- Command: `npx vite --host`
- Working directory: `~/worktrees/the-matrix`
- Logs: `~/.hermes/logs/matrix-frontend.log`
- KeepAlive: yes
- RunAtLoad: yes
- Old-state risk:
- HIGH
- this still points at `~/worktrees/the-matrix`, even though the live 3D world work moved to `Timmy_Foundation/the-nexus`
- if this is left loaded, it can revive the old frontend lineage
- Former plist: `com.timmy.matrix-frontend.plist`
- Former command: `npx vite --host`
- Former working directory: `~/worktrees/the-matrix`
- Quarantine reason:
- pointed at the old `the-matrix` lineage instead of current nexus truth
- could revive a stale frontend every login
### B. running now but NOT launchd-managed
#### 8. ai.hermes.startup
- Former plist: `ai.hermes.startup.plist`
- Former command: `~/.hermes/bin/hermes-startup.sh`
- Quarantine reason:
- startup path still expected missing `timmy-tmux.sh`
- could recreate old webhook/tmux assumptions at login
#### 9. com.timmy.tick
- Former plist: `com.timmy.tick.plist`
- Former command: `/Users/apayne/Timmy-time-dashboard/deploy/timmy-tick-mac.sh`
- Quarantine reason:
- pure dashboard-era legacy path
### C. running now but NOT launchd-managed
These are live processes, but not currently represented by a loaded launchd plist.
They can still persist because they were started with `nohup` or by other parent scripts.
@@ -207,7 +237,7 @@ printf '{}\n' > ~/.hermes/logs/gemini-active.json
- can repopulate agent queues even after you thought they were cleared
- not represented in timmy-config/bin yet as of this audit
### C. Hermes cron automations
### D. Hermes cron automations
Current cron inventory from `cronjob(list, include_disabled=true)`:
@@ -224,27 +254,11 @@ Old-state risk:
- paused crons are not dead forever; they are resumable state
- LLM-wrapped crons can revive old routing/model assumptions if resumed blindly
### D. file exists but NOT currently loaded
### E. file exists but NOT currently loaded
These are the ones most likely to surprise us later because they still exist and point at old realities.
#### 10. ai.hermes.startup
- Plist: `~/Library/LaunchAgents/ai.hermes.startup.plist`
- Points to: `~/.hermes/bin/hermes-startup.sh`
- Not loaded in launchctl at audit time
- High-risk notes:
- startup script still expects `~/.hermes/bin/timmy-tmux.sh`
- that file is MISSING at audit time
- script also tries to start webhook listener and the old `timmy-loop` tmux world
- This is a dormant old-state resurrection path
#### 11. com.timmy.tick
- Plist: `~/Library/LaunchAgents/com.timmy.tick.plist`
- Points to: `/Users/apayne/Timmy-time-dashboard/deploy/timmy-tick-mac.sh`
- Not loaded at audit time
- Definitely legacy dashboard-era automation
#### 12. com.tower.pr-automerge
#### 10. com.tower.pr-automerge
- Plist: `~/Library/LaunchAgents/com.tower.pr-automerge.plist`
- Points to: `/Users/apayne/hermes-config/bin/pr-automerge.sh`
- Not loaded at audit time
@@ -317,8 +331,6 @@ launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/ai.timmy.claudemax-watchdo
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/ai.hermes.gateway.plist || true
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/ai.hermes.gateway-fenrir.plist || true
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/ai.openclaw.gateway.plist || true
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/com.timmy.dashboard-backend.plist || true
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/com.timmy.matrix-frontend.plist || true
```
2. Kill manual loops
@@ -349,10 +361,9 @@ cp ~/.hermes/sessions/sessions.json ~/.hermes/sessions/sessions.json.bak.$(date
## Current contradictions to fix later
1. README still describes `bin/` as "NOT deprecated loops" but live runtime still contains revived loop scripts.
2. `DEPRECATED.md` says claude-loop/gemini-loop/timmy-orchestrator/claudemax-watchdog were removed, but reality disagrees.
3. `com.timmy.matrix-frontend` still points at `~/worktrees/the-matrix` rather than the nexus lineage.
4. `ai.hermes.startup` still points at a startup path that expects missing `timmy-tmux.sh`.
5. `gemini-loop.sh` and `timmy-orchestrator.sh` are live but not yet mirrored into timmy-config/bin/.
1. README and DEPRECATED were corrected on 2026-04-04, but older local clones may still have stale prose.
2. The quarantined launch agents now live under `~/Library/LaunchAgents.quarantine/timmy-legacy-20260404/`; if someone moves them back, the old state can return.
3. `gemini-loop.sh` and `timmy-orchestrator.sh` are live but not yet mirrored into timmy-config/bin/.
4. The open docs PR must be kept clean: do not mix operational script recovery and documentation history on the same branch.
Until those are reconciled, trust this inventory over older prose.

133
docs/nightly-burn-mode.md Normal file
View File

@@ -0,0 +1,133 @@
# Nightly Burn Mode — Canonical Lineup
Status: active pattern as of 2026-04-06
Owner: Timmy
Scope: overnight burn automation for the local Mac
## Canonical overnight lineup
Keep the overnight stack small, bounded, and proof-bearing.
1. one bounded burn job
2. one dead-man job
3. one morning-report job
4. one health monitor
Do not run duplicate burn jobs on the same lane.
Do not leave structurally broken jobs enabled overnight.
## Canonical jobs
### 1. Burn Mode — Timmy Orchestrator
Purpose: one bounded overnight burn cycle every 15 minutes.
Rules:
- no repo cloning
- no rebases
- no deep repairs
- at most one tangible action per cycle
- if there is no clear quick win, leave proof of a healthy no-op and stay silent
- do not step into Evennia automation from this lane
Quick-win priority:
1. merge one obviously safe PR
2. answer one unresolved human comment on a Timmy-touched issue
3. leave one stale/unblocked proof-first comment
4. otherwise write a healthy no-op heartbeat
Required proof per non-silent cycle:
- what was touched
- evidence link(s)
- next target
### 2. Burn Deadman
Purpose: detect when the burn lane has gone silent.
Command:
```bash
bash ~/.hermes/bin/burn-cycle-deadman.sh
```
Signal source:
- `~/.hermes/burn-logs/timmy.log`
- `~/.hermes/burn-logs/bounded-burn-heartbeat.txt`
A healthy no-op cycle must still update one of those proof files, otherwise deadman will false-alert.
### 3. Morning Report — Burn Mode
Purpose: compile the overnight burn into a raw structured brief, then let the delivery cron reformat it into a phone-readable morning report.
Command:
```bash
python3 ~/.hermes/bin/morning-report-compiler.py 12
```
Delivery shape (applied by the cron prompt that reads the generated markdown):
- Shipped
- Failed
- Fleet Status
- Stakes Cleared
- Next 3
### 4. Health Monitor
Purpose: basic local machine health.
Checks:
- Ollama reachability
- disk
- memory
- process count
## Jobs to keep paused unless repaired
### Duplicate burn loops
If two 15-minute burns point at the same lane, pause one.
### velocity-engine
Pause overnight if it shows any of:
- 0 claimed
- 0 created
- repeated HTTP 422 self-generation failures
- KeyError `total_claimed`
### wolf-eval-cycle
Pause overnight if it times out and does not directly help the burn lane.
## Source-of-truth rule
Do not leave this pattern as live-only cron state.
Repo-truth must include:
- the canonical overnight lineup
- the deadman script
- the morning report compiler
- the bounded-burn prompt/rules
## No-op heartbeat rule
Bounded overnight burns often find no safe merge and no fresh human comment.
That is fine.
What is not fine:
- returning silent with no proof of liveness
- letting deadman conclude the lane died when the lane merely had no quick win
Healthy no-op cycles should update:
- `~/.hermes/burn-logs/bounded-burn-heartbeat.txt`
Recommended contents:
- UTC timestamp
- repos polled
- blocker proof links
- note that no low-risk quick win existed
## Why this pattern won
Compared with the old sprawling burn loop, the bounded pattern produced:
- one real merge when available
- multiple proof-first human-comment wins
- useful stale/unblocked nudges
- much better morning visibility
- less falsework
The goal is not drama.
The goal is consistent, bounded, sovereign overnight work.

File diff suppressed because it is too large Load Diff

View File