Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Whitestone
d96d1ce2ea feat: wire llama health into night watch (#1123)
Some checks failed
CI / test (pull_request) Failing after 1m34s
Review Approval Gate / verify-review (pull_request) Successful in 12s
CI / validate (pull_request) Failing after 1m6s
2026-04-15 02:21:01 -04:00
4 changed files with 98 additions and 142 deletions

View File

@@ -37,6 +37,8 @@ import shutil
import subprocess
import sys
import time
import urllib.error
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
@@ -119,8 +121,6 @@ def _check_memory(threshold_pct: int = 90) -> tuple[str, str]:
def _check_gitea_reachability(gitea_url: str = "https://forge.alexanderwhitestone.com") -> tuple[str, str]:
"""Return (status, detail) for Gitea HTTPS reachability."""
import urllib.request
import urllib.error
try:
with urllib.request.urlopen(gitea_url, timeout=10) as resp:
code = resp.status
@@ -131,6 +131,21 @@ def _check_gitea_reachability(gitea_url: str = "https://forge.alexanderwhiteston
return "WARN", f"Gitea unreachable: {exc}"
def _check_llama_server(endpoint: str = "http://127.0.0.1:11435") -> tuple[str, str]:
"""Return (status, detail) for the local llama.cpp server health endpoint."""
health_url = f"{endpoint.rstrip('/')}/health"
try:
req = urllib.request.Request(health_url, headers={"Accept": "application/json"})
with urllib.request.urlopen(req, timeout=10) as resp:
data = json.loads(resp.read().decode())
if data.get("status") == "ok":
model_name = Path(str(data.get("model_path", ""))).name or data.get("model", "unknown-model")
return "OK", f"llama-server healthy at {endpoint} ({model_name})"
return "WARN", f"llama-server unhealthy at {endpoint}: {data}"
except Exception as exc:
return "WARN", f"llama-server unreachable at {endpoint}: {exc}"
def _check_world_readable_secrets() -> tuple[str, str]:
"""Return (status, detail) for world-readable sensitive files."""
sensitive_patterns = ["*.key", "*.pem", "*.secret", ".env", "*.token"]
@@ -172,6 +187,9 @@ def generate_report(date_str: str, checker_mod) -> str:
gitea_status, gitea_detail = _check_gitea_reachability()
rows.append(("Alpha VPS", gitea_status, gitea_detail))
llama_status, llama_detail = _check_llama_server()
rows.append(("Local LLM", llama_status, llama_detail))
sec_status, sec_detail = _check_world_readable_secrets()
rows.append(("Security", sec_status, sec_detail))

View File

@@ -40,6 +40,9 @@ Standardizes local LLM inference across the fleet using llama.cpp.
curl -sf http://localhost:11435/health
curl -s http://localhost:11435/v1/models
Night Watch integration:
- `bin/night_watch.py` probes the local llama.cpp `/health` endpoint and surfaces failures in the nightly report.
## Troubleshooting
- If the server won't start, try a smaller model or a lower quantization level.

View File

@@ -1,140 +0,0 @@
# timmy-home Backlog Triage Report
**Generated:** 2026-04-15
**Issue:** the-nexus #1459
**Source:** Timmy_Foundation/timmy-home
---
## Summary
| Metric | Count |
|--------|-------|
| Total open items | 231 |
| Open issues | 228 |
| Open PRs | 3 |
| Issues older than 30 days | 0 |
The backlog has grown from 220 (per #1127 triage) to 228. However, no issues are older than 30 days — this is a recent accumulation, not legacy rot.
---
## Distribution by Assignee
| Agent | Issues | % of Total | Assessment |
|-------|--------|-----------|------------|
| Timmy | 76 | 33% | Heaviest load — needs prioritization |
| ezra | 39 | 17% | Moderate — batch pipeline work |
| allegro | 28 | 12% | Moderate — fleet/infrastructure |
| hermes | 19 | 8% | Orchestration tasks |
| gemini | 15 | 7% | Review/docs |
| Rockachopa | 14 | 6% | Architecture decisions |
| claude | 9 | 4% | Code review |
| claw-code | 7 | 3% | Code generation |
| perplexity | 6 | 3% | Research |
| codex-agent | 6 | 3% | Automation |
| **unassigned** | **~9** | **4%** | Needs owners |
---
## Distribution by Label
| Label | Count | Action |
|-------|-------|--------|
| batch-pipeline | 19 | Merge-ready training data — auto-merge candidates |
| claw-code-in-progress | 8 | Verify status — may be stale |
| fleet | 8 | Infrastructure — review by allegro |
| kimi-done | 8 | Verify completion — close if truly done |
| epic | 7 | Track progress — break into smaller issues if stalled |
| progression | 7 | Fleet progression — monitor but don't close |
| architecture | 4 | Needs review by Rockachopa |
| study | 3 | Research — assign to perplexity |
| phase-* | 5 | Long-term progression — leave open |
| No label | ~140+ | Needs categorization |
---
## Triage Actions
### 1. Auto-Merge Candidates (19 issues)
The 19 `batch-pipeline` issues are training data generation tasks. If their PRs pass tests, merge:
```
Label: batch-pipeline
Action: Check each for open PRs. Merge if green.
Risk: Low — data-only changes
```
### 2. Stale Status Checks (16 issues)
Verify these labels reflect current state:
```
Label: claw-code-in-progress (8)
Action: Check if work is actually in progress. Close stale ones.
Label: kimi-done (8)
Action: Verify completion. Close if truly done or re-assign if not.
```
### 3. Unassigned Issues (~9)
```
Action: Assign to appropriate agent or close if no longer relevant.
Priority: High — unassigned issues accumulate fastest.
```
### 4. Epic Tracking (7 issues)
```
Label: epic
Action: Review progress. Break stalled epics into smaller actionable items.
```
### 5. No-Label Issues (~140+)
```
Action: Apply labels for categorization.
Priority: Medium — improves searchability and routing.
```
---
## Recommendations
### Immediate (this week)
1. **Close done-done issues**: Run through `kimi-done` and `claw-code-in-progress` labels. Close anything completed.
2. **Assign unassigned**: Route ~9 unassigned issues to agents.
3. **Auto-merge training data**: The 19 `batch-pipeline` PRs are low-risk merges.
### Short-term (this month)
4. **Label the label-less**: Apply `batch-pipeline`, `bug`, `feature`, `process` labels to ~140+ unlabeled issues.
5. **Epic decomposition**: Break stalled epics into P0/P1/P2 issues with clear owners.
6. **Stale PR cleanup**: The 3 open PRs should be reviewed or closed.
### Long-term
7. **Backlog cap**: Set a soft cap (e.g., 150 open issues). When exceeded, mandatory triage before new issues.
8. **Triage cadence**: Weekly automated triage via cron job.
9. **Agent load balancing**: Timmy has 76 issues (33% of total). Redistribute.
---
## Health Assessment
| Factor | Score | Notes |
|--------|-------|-------|
| Freshness | Good | No issues older than 30 days |
| Labeling | Poor | ~60% of issues have no labels |
| Assignment | Fair | 96% assigned, but Timmy is overloaded |
| Staleness | Good | Nothing confirmed stale yet, though `claw-code-in-progress` labels still need verification |
| Velocity | Unknown | Need merge-rate data |
**Overall: Yellow.** The backlog is fresh but growing. Label hygiene and load balancing are the biggest gaps.
---
*Generated by backlog triage. Ref: the-nexus #1459.*

View File

@@ -0,0 +1,75 @@
from __future__ import annotations
import json
import sys
from pathlib import Path
from unittest.mock import patch
# Make the repository root importable so `import bin.night_watch` resolves
# when pytest runs from the tests/ directory.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
class _FakeResponse:
def __init__(self, payload: dict):
self.payload = json.dumps(payload).encode()
def read(self):
return self.payload
def __enter__(self):
return self
def __exit__(self, exc_type, exc, tb):
return False
class _FakeHeartbeatReport:
def to_panel_markdown(self):
return "## Heartbeat Panel\n\nAll jobs healthy."
class _FakeChecker:
    """Checker double whose build_report() always yields a canned heartbeat report."""

    @staticmethod
    def build_report():
        return _FakeHeartbeatReport()
def test_check_llama_server_reports_healthy_model():
    """A healthy /health payload yields OK with the endpoint and model basename."""
    import bin.night_watch as nw

    fake = _FakeResponse(
        {"status": "ok", "model_path": "/opt/models/llama/Qwen2.5-7B-Instruct-Q4_K_M.gguf"}
    )
    with patch("bin.night_watch.urllib.request.urlopen", return_value=fake):
        status, detail = nw._check_llama_server("http://127.0.0.1:11435")
    assert status == "OK"
    assert "127.0.0.1:11435" in detail
    assert "Qwen2.5-7B-Instruct-Q4_K_M.gguf" in detail
def test_check_llama_server_reports_warning_on_failure():
    """Connection errors degrade to WARN and surface the exception text."""
    import bin.night_watch as nw

    boom = OSError("connection refused")
    with patch("bin.night_watch.urllib.request.urlopen", side_effect=boom):
        status, detail = nw._check_llama_server("http://127.0.0.1:11435")
    assert status == "WARN"
    assert "connection refused" in detail
def test_generate_report_includes_local_llm_row():
    """generate_report must surface the llama probe as a 'Local LLM' table row."""
    import bin.night_watch as nw

    llama_detail = "llama-server healthy at http://127.0.0.1:11435 (Qwen2.5-7B-Instruct-Q4_K_M.gguf)"
    # Stub every probe so the report renders deterministically on any host.
    with patch("bin.night_watch._check_service", return_value=("OK", "hermes-bezalel is active")), \
         patch("bin.night_watch._check_disk", return_value=("OK", "disk usage 23%")), \
         patch("bin.night_watch._check_memory", return_value=("OK", "memory usage 30%")), \
         patch("bin.night_watch._check_gitea_reachability", return_value=("OK", "Gitea HTTPS is responding (200)")), \
         patch("bin.night_watch._check_world_readable_secrets", return_value=("OK", "no sensitive recently-modified world-readable files found")), \
         patch("bin.night_watch._check_llama_server", return_value=("OK", llama_detail)):
        rendered = nw.generate_report("2026-04-15", _FakeChecker())
    assert f"| Local LLM | OK | {llama_detail} |" in rendered
    assert "## Heartbeat Panel" in rendered