fix: closes #865

2026-04-12 19:28:17 -04:00
3 changed files with 16 additions and 41 deletions
--- a/bin/nexus_watchdog.py
+++ b/bin/nexus_watchdog.py
@@ -586,8 +586,8 @@ def alert_on_failure(report: HealthReport, dry_run: bool = False) -> None:
                logger.info("Created alert issue #%d", result["number"])


-def run_once(args: argparse.Namespace) -> bool:
-    """Run one health check cycle. Returns True if healthy."""
+def run_once(args: argparse.Namespace) -> tuple:
+    """Run one health check cycle. Returns (healthy, report)."""
    report = run_health_checks(
        ws_host=args.ws_host,
        ws_port=args.ws_port,
@@ -615,7 +615,7 @@ def run_once(args: argparse.Namespace) -> bool:
        except Exception:
            pass  # never crash the watchdog over its own heartbeat

-    return report.overall_healthy
+    return report.overall_healthy, report


 def main():
@@ -678,21 +678,15 @@ def main():
        signal.signal(signal.SIGINT, _handle_sigterm)

        while _running:
-            run_once(args)
+            run_once(args)  # (healthy, report) — not needed in watch mode
            for _ in range(args.interval):
                if not _running:
                    break
                time.sleep(1)
    else:
-        healthy = run_once(args)
+        healthy, report = run_once(args)

        if args.output_json:
-            report = run_health_checks(
-                ws_host=args.ws_host,
-                ws_port=args.ws_port,
-                heartbeat_path=Path(args.heartbeat_path),
-                stale_threshold=args.stale_threshold,
-            )
            print(json.dumps({
                "healthy": report.overall_healthy,
                "timestamp": report.timestamp,
--- a/mimo-swarm/scripts/mimo-dispatcher.py
+++ b/mimo-swarm/scripts/mimo-dispatcher.py
@@ -7,7 +7,6 @@ routes to lanes, and spawns one-shot mimo-v2-pro workers.
 No new issues created. No duplicate claims. No bloat.
 """

-import glob
 import json
 import os
 import sys
@@ -39,7 +38,6 @@ else:

 CLAIM_TIMEOUT_MINUTES = 30
 CLAIM_LABEL = "mimo-claimed"
-MAX_QUEUE_DEPTH = 10  # Don't dispatch if queue already has this many prompts
 CLAIM_COMMENT = "/claim"
 DONE_COMMENT = "/done"
 ABANDON_COMMENT = "/abandon"
@@ -453,13 +451,6 @@ def dispatch(token):
    prefetch_pr_refs(target_repo, token)
    log(f"  Prefetched {len(_PR_REFS)} PR references")

-    # Check queue depth — don't pile up if workers haven't caught up
-    pending_prompts = len(glob.glob(os.path.join(STATE_DIR, "prompt-*.txt")))
-    if pending_prompts >= MAX_QUEUE_DEPTH:
-        log(f"  QUEUE THROTTLE: {pending_prompts} prompts pending (max {MAX_QUEUE_DEPTH}) — skipping dispatch")
-        save_state(state)
-        return 0
-
    # FOCUS MODE: scan only the focus repo. FIREHOSE: scan all.
    if FOCUS_MODE:
        ordered = [FOCUS_REPO]
--- a/mimo-swarm/scripts/worker-runner.py
+++ b/mimo-swarm/scripts/worker-runner.py
@@ -24,23 +24,6 @@ def log(msg):
        f.write(f"[{ts}] {msg}\n")


-def write_result(worker_id, status, repo=None, issue=None, branch=None, pr=None, error=None):
-    """Write a result file — always, even on failure."""
-    result_file = os.path.join(STATE_DIR, f"result-{worker_id}.json")
-    data = {
-        "status": status,
-        "worker": worker_id,
-        "timestamp": datetime.now(timezone.utc).isoformat(),
-    }
-    if repo: data["repo"] = repo
-    if issue: data["issue"] = int(issue) if str(issue).isdigit() else issue
-    if branch: data["branch"] = branch
-    if pr: data["pr"] = pr
-    if error: data["error"] = error
-    with open(result_file, "w") as f:
-        json.dump(data, f)
-
-
 def get_oldest_prompt():
    """Get the oldest prompt file with file locking (atomic rename)."""
    prompts = sorted(glob.glob(os.path.join(STATE_DIR, "prompt-*.txt")))
@@ -80,7 +63,6 @@ def run_worker(prompt_file):

    if not repo or not issue:
        log(f"  SKIPPING: couldn't parse repo/issue from prompt")
-        write_result(worker_id, "parse_error", error="could not parse repo/issue from prompt")
        os.remove(prompt_file)
        return False

@@ -97,7 +79,6 @@ def run_worker(prompt_file):
    )
    if result.returncode != 0:
        log(f"  CLONE FAILED: {result.stderr[:200]}")
-        write_result(worker_id, "clone_failed", repo=repo, issue=issue, error=result.stderr[:200])
        os.remove(prompt_file)
        return False

@@ -145,7 +126,6 @@ def run_worker(prompt_file):
                urllib.request.urlopen(req, timeout=10)
            except:
                pass
-            write_result(worker_id, "abandoned", repo=repo, issue=issue, error="no changes produced")
            if os.path.exists(prompt_file):
                os.remove(prompt_file)
            return False
@@ -213,7 +193,17 @@ def run_worker(prompt_file):
        pr_num = "?"

    # Write result
-    write_result(worker_id, "completed", repo=repo, issue=issue, branch=branch, pr=pr_num)
+    result_file = os.path.join(STATE_DIR, f"result-{worker_id}.json")
+    with open(result_file, "w") as f:
+        json.dump({
+            "status": "completed",
+            "worker": worker_id,
+            "repo": repo,
+            "issue": int(issue) if issue.isdigit() else issue,
+            "branch": branch,
+            "pr": pr_num,
+            "timestamp": datetime.now(timezone.utc).isoformat()
+        }, f)

    # Remove prompt
    # Remove prompt file (handles .processing extension)