Merge pull request '[orchestration] Harden the nervous system — full repo coverage, destructive PR guard, dedup' (#102) from gemini/orchestration-hardening into main
This commit was merged in pull request #102.
This commit is contained in:
77
tasks.py
77
tasks.py
@@ -22,8 +22,15 @@ METRICS_DIR = TIMMY_HOME / "metrics"
|
||||
REPOS = [
|
||||
"Timmy_Foundation/the-nexus",
|
||||
"Timmy_Foundation/timmy-config",
|
||||
"Timmy_Foundation/timmy-home",
|
||||
"Timmy_Foundation/the-door",
|
||||
"Timmy_Foundation/turboquant",
|
||||
"Timmy_Foundation/hermes-agent",
|
||||
"Timmy_Foundation/.profile",
|
||||
]
|
||||
NET_LINE_LIMIT = 10
|
||||
NET_LINE_LIMIT = 500
|
||||
# Flag PRs where any single file loses >50% of its lines
|
||||
DESTRUCTIVE_DELETION_THRESHOLD = 0.5
|
||||
|
||||
# ── Local Model Inference via Hermes Harness ─────────────────────────
|
||||
|
||||
@@ -1180,22 +1187,66 @@ def triage_issues():
|
||||
|
||||
@huey.periodic_task(crontab(minute="*/30"))
|
||||
def review_prs():
|
||||
"""Review open PRs: check net diff, reject violations."""
|
||||
"""Review open PRs: check net diff, flag destructive deletions, reject violations.
|
||||
|
||||
Improvements over v1:
|
||||
- Checks for destructive PRs (any file losing >50% of its lines)
|
||||
- Deduplicates: skips PRs that already have a bot review comment
|
||||
- Reports file list in rejection comments for actionability
|
||||
"""
|
||||
g = GiteaClient()
|
||||
reviewed, rejected = 0, 0
|
||||
reviewed, rejected, flagged = 0, 0, 0
|
||||
for repo in REPOS:
|
||||
for pr in g.list_pulls(repo, state="open", limit=20):
|
||||
reviewed += 1
|
||||
|
||||
# Skip if we already reviewed this PR (prevents comment spam)
|
||||
try:
|
||||
comments = g.list_comments(repo, pr.number)
|
||||
already_reviewed = any(
|
||||
c.body and ("❌ Net +" in c.body or "🚨 DESTRUCTIVE" in c.body)
|
||||
for c in comments
|
||||
)
|
||||
if already_reviewed:
|
||||
continue
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
files = g.get_pull_files(repo, pr.number)
|
||||
net = sum(f.additions - f.deletions for f in files)
|
||||
file_list = ", ".join(f.filename for f in files[:10])
|
||||
|
||||
# Check for destructive deletions (the PR #788 scenario)
|
||||
destructive_files = []
|
||||
for f in files:
|
||||
if f.status == "modified" and f.deletions > 0:
|
||||
total_lines = f.additions + f.deletions # rough proxy
|
||||
if total_lines > 0 and f.deletions / total_lines > DESTRUCTIVE_DELETION_THRESHOLD:
|
||||
if f.deletions > 20: # ignore trivial files
|
||||
destructive_files.append(
|
||||
f"{f.filename} (-{f.deletions}/+{f.additions})"
|
||||
)
|
||||
|
||||
if destructive_files:
|
||||
flagged += 1
|
||||
g.create_comment(
|
||||
repo, pr.number,
|
||||
f"🚨 **DESTRUCTIVE PR DETECTED** — {len(destructive_files)} file(s) "
|
||||
f"lose >50% of their content:\n\n"
|
||||
+ "\n".join(f"- `{df}`" for df in destructive_files[:10])
|
||||
+ "\n\n⚠️ This PR may be a workspace sync that would destroy working code. "
|
||||
f"Please verify before merging. See CONTRIBUTING.md."
|
||||
)
|
||||
|
||||
if net > NET_LINE_LIMIT:
|
||||
rejected += 1
|
||||
g.create_comment(
|
||||
repo, pr.number,
|
||||
f"❌ Net +{net} lines exceeds the {NET_LINE_LIMIT}-line limit. "
|
||||
f"Files: {file_list}. "
|
||||
f"Find {net - NET_LINE_LIMIT} lines to cut. See CONTRIBUTING.md."
|
||||
)
|
||||
return {"reviewed": reviewed, "rejected": rejected}
|
||||
return {"reviewed": reviewed, "rejected": rejected, "destructive_flagged": flagged}
|
||||
|
||||
|
||||
@huey.periodic_task(crontab(minute="*/10"))
|
||||
@@ -1413,17 +1464,23 @@ def heartbeat_tick():
|
||||
except Exception:
|
||||
perception["model_health"] = "unreadable"
|
||||
|
||||
# Open issue/PR counts
|
||||
# Open issue/PR counts — use limit=50 for real counts, not limit=1
|
||||
if perception.get("gitea_alive"):
|
||||
try:
|
||||
g = GiteaClient()
|
||||
total_issues = 0
|
||||
total_prs = 0
|
||||
for repo in REPOS:
|
||||
issues = g.list_issues(repo, state="open", limit=1)
|
||||
pulls = g.list_pulls(repo, state="open", limit=1)
|
||||
issues = g.list_issues(repo, state="open", limit=50)
|
||||
pulls = g.list_pulls(repo, state="open", limit=50)
|
||||
perception[repo] = {
|
||||
"open_issues": len(issues),
|
||||
"open_prs": len(pulls),
|
||||
}
|
||||
total_issues += len(issues)
|
||||
total_prs += len(pulls)
|
||||
perception["total_open_issues"] = total_issues
|
||||
perception["total_open_prs"] = total_prs
|
||||
except Exception as e:
|
||||
perception["gitea_error"] = str(e)
|
||||
|
||||
@@ -1539,7 +1596,8 @@ def memory_compress():
|
||||
inference_down_count = 0
|
||||
|
||||
for t in ticks:
|
||||
for action in t.get("actions", []):
|
||||
decision = t.get("decision", {})
|
||||
for action in decision.get("actions", []):
|
||||
alerts.append(f"[{t['tick_id']}] {action}")
|
||||
p = t.get("perception", {})
|
||||
if not p.get("gitea_alive"):
|
||||
@@ -1584,8 +1642,9 @@ def good_morning_report():
|
||||
# --- GATHER OVERNIGHT DATA ---
|
||||
|
||||
# Heartbeat ticks from last night
|
||||
from datetime import timedelta as _td
|
||||
tick_dir = TIMMY_HOME / "heartbeat"
|
||||
yesterday = now.strftime("%Y%m%d")
|
||||
yesterday = (now - _td(days=1)).strftime("%Y%m%d")
|
||||
tick_log = tick_dir / f"ticks_{yesterday}.jsonl"
|
||||
tick_count = 0
|
||||
alerts = []
|
||||
|
||||
238
tests/test_orchestration_hardening.py
Normal file
238
tests/test_orchestration_hardening.py
Normal file
@@ -0,0 +1,238 @@
|
||||
"""Tests for orchestration hardening (2026-03-30 deep audit pass 3).
|
||||
|
||||
Covers:
|
||||
- REPOS expanded from 2 → 7 (all Foundation repos monitored)
|
||||
- Destructive PR detection via DESTRUCTIVE_DELETION_THRESHOLD
|
||||
- review_prs deduplication (no repeat comment spam)
|
||||
- heartbeat_tick uses limit=50 for real counts
|
||||
- All PR #101 fixes carried forward (NET_LINE_LIMIT, memory_compress, morning report)
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# ── Helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
def _read_tasks():
|
||||
return (Path(__file__).resolve().parent.parent / "tasks.py").read_text()
|
||||
|
||||
|
||||
def _find_global(text, name):
|
||||
"""Extract a top-level assignment value from tasks.py source."""
|
||||
for line in text.splitlines():
|
||||
stripped = line.strip()
|
||||
if stripped.startswith(name) and "=" in stripped:
|
||||
_, _, value = stripped.partition("=")
|
||||
return value.strip()
|
||||
return None
|
||||
|
||||
|
||||
def _extract_function_body(text, func_name):
|
||||
"""Extract the body of a function from source code."""
|
||||
lines = text.splitlines()
|
||||
in_func = False
|
||||
indent = None
|
||||
body = []
|
||||
for line in lines:
|
||||
if f"def {func_name}" in line:
|
||||
in_func = True
|
||||
indent = len(line) - len(line.lstrip())
|
||||
body.append(line)
|
||||
continue
|
||||
if in_func:
|
||||
if line.strip() == "":
|
||||
body.append(line)
|
||||
elif len(line) - len(line.lstrip()) > indent or line.strip().startswith("#") or line.strip().startswith("\"\"\"") or line.strip().startswith("'"):
|
||||
body.append(line)
|
||||
elif line.strip().startswith("@"):
|
||||
break
|
||||
elif len(line) - len(line.lstrip()) <= indent and line.strip().startswith("def "):
|
||||
break
|
||||
else:
|
||||
body.append(line)
|
||||
return "\n".join(body)
|
||||
|
||||
|
||||
# ── Test: REPOS covers all Foundation repos ──────────────────────────
|
||||
|
||||
def test_repos_covers_all_foundation_repos():
|
||||
"""REPOS must include all 7 Timmy_Foundation repos.
|
||||
|
||||
Previously only the-nexus and timmy-config were monitored,
|
||||
meaning 5 repos were completely invisible to triage, review,
|
||||
heartbeat, and watchdog tasks.
|
||||
"""
|
||||
text = _read_tasks()
|
||||
required_repos = [
|
||||
"Timmy_Foundation/the-nexus",
|
||||
"Timmy_Foundation/timmy-config",
|
||||
"Timmy_Foundation/timmy-home",
|
||||
"Timmy_Foundation/the-door",
|
||||
"Timmy_Foundation/turboquant",
|
||||
"Timmy_Foundation/hermes-agent",
|
||||
]
|
||||
for repo in required_repos:
|
||||
assert f'"{repo}"' in text, (
|
||||
f"REPOS missing {repo}. All Foundation repos must be monitored."
|
||||
)
|
||||
|
||||
|
||||
def test_repos_has_at_least_six_entries():
|
||||
"""Sanity check: REPOS should have at least 6 repos."""
|
||||
text = _read_tasks()
|
||||
count = text.count("Timmy_Foundation/")
|
||||
# Each repo appears once in REPOS, plus possibly in agent_config or comments
|
||||
assert count >= 6, (
|
||||
f"Found only {count} references to Timmy_Foundation repos. "
|
||||
"REPOS should have at least 6 real repos."
|
||||
)
|
||||
|
||||
|
||||
# ── Test: Destructive PR detection ───────────────────────────────────
|
||||
|
||||
def test_destructive_deletion_threshold_exists():
|
||||
"""DESTRUCTIVE_DELETION_THRESHOLD must be defined.
|
||||
|
||||
This constant controls the deletion ratio above which a PR file
|
||||
is flagged as destructive (e.g., the PR #788 scenario).
|
||||
"""
|
||||
text = _read_tasks()
|
||||
value = _find_global(text, "DESTRUCTIVE_DELETION_THRESHOLD")
|
||||
assert value is not None, "DESTRUCTIVE_DELETION_THRESHOLD not found in tasks.py"
|
||||
threshold = float(value)
|
||||
assert 0.3 <= threshold <= 0.8, (
|
||||
f"DESTRUCTIVE_DELETION_THRESHOLD = {threshold} is out of sane range [0.3, 0.8]. "
|
||||
"0.5 means 'more than half the file is deleted'."
|
||||
)
|
||||
|
||||
|
||||
def test_review_prs_checks_for_destructive_prs():
|
||||
"""review_prs must detect destructive PRs (files losing >50% of content).
|
||||
|
||||
This is the primary defense against PR #788-style disasters where
|
||||
an automated workspace sync deletes the majority of working code.
|
||||
"""
|
||||
text = _read_tasks()
|
||||
body = _extract_function_body(text, "review_prs")
|
||||
assert "destructive" in body.lower(), (
|
||||
"review_prs does not contain destructive PR detection logic. "
|
||||
"Must flag PRs where files lose >50% of content."
|
||||
)
|
||||
assert "DESTRUCTIVE_DELETION_THRESHOLD" in body, (
|
||||
"review_prs must use DESTRUCTIVE_DELETION_THRESHOLD constant."
|
||||
)
|
||||
|
||||
|
||||
# ── Test: review_prs deduplication ───────────────────────────────────
|
||||
|
||||
def test_review_prs_deduplicates_comments():
|
||||
"""review_prs must skip PRs it has already commented on.
|
||||
|
||||
Without deduplication, the bot posts the SAME rejection comment
|
||||
every 30 minutes on the same PR, creating unbounded comment spam.
|
||||
"""
|
||||
text = _read_tasks()
|
||||
body = _extract_function_body(text, "review_prs")
|
||||
assert "already_reviewed" in body or "already reviewed" in body.lower(), (
|
||||
"review_prs does not check for already-reviewed PRs. "
|
||||
"Must skip PRs where bot has already posted a review comment."
|
||||
)
|
||||
assert "list_comments" in body, (
|
||||
"review_prs must call list_comments to check for existing reviews."
|
||||
)
|
||||
|
||||
|
||||
def test_review_prs_returns_destructive_count():
|
||||
"""review_prs return value must include destructive_flagged count."""
|
||||
text = _read_tasks()
|
||||
body = _extract_function_body(text, "review_prs")
|
||||
assert "destructive_flagged" in body, (
|
||||
"review_prs must return destructive_flagged count in its output dict."
|
||||
)
|
||||
|
||||
|
||||
# ── Test: heartbeat_tick uses real counts ────────────────────────────
|
||||
|
||||
def test_heartbeat_tick_uses_realistic_limit():
|
||||
"""heartbeat_tick must use limit >= 20 for issue/PR counts.
|
||||
|
||||
Previously used limit=1 which meant len() always returned 0 or 1.
|
||||
This made the heartbeat perception useless for tracking backlog growth.
|
||||
"""
|
||||
text = _read_tasks()
|
||||
body = _extract_function_body(text, "heartbeat_tick")
|
||||
# Check there's no limit=1 in actual code calls (not docstrings)
|
||||
for line in body.splitlines():
|
||||
stripped = line.strip()
|
||||
if stripped.startswith("#") or stripped.startswith("\"\"\"") or stripped.startswith("'"):
|
||||
continue
|
||||
if "limit=1" in stripped and ("list_issues" in stripped or "list_pulls" in stripped):
|
||||
raise AssertionError(
|
||||
"heartbeat_tick still uses limit=1 for issue/PR counts. "
|
||||
"This always returns 0 or 1, making counts meaningless."
|
||||
)
|
||||
# Check it aggregates totals
|
||||
assert "total_open_issues" in body or "total_issues" in body, (
|
||||
"heartbeat_tick should aggregate total issue counts across all repos."
|
||||
)
|
||||
|
||||
|
||||
# ── Test: NET_LINE_LIMIT sanity (carried from PR #101) ───────────────
|
||||
|
||||
def test_net_line_limit_is_sane():
|
||||
"""NET_LINE_LIMIT = 10 caused every real PR to be spam-rejected."""
|
||||
text = _read_tasks()
|
||||
value = _find_global(text, "NET_LINE_LIMIT")
|
||||
assert value is not None, "NET_LINE_LIMIT not found"
|
||||
limit = int(value)
|
||||
assert 200 <= limit <= 2000, (
|
||||
f"NET_LINE_LIMIT = {limit} is outside sane range [200, 2000]."
|
||||
)
|
||||
|
||||
|
||||
# ── Test: memory_compress reads correct action path ──────────────────
|
||||
|
||||
def test_memory_compress_reads_decision_actions():
|
||||
"""Actions live in tick_record['decision']['actions'], not tick_record['actions']."""
|
||||
text = _read_tasks()
|
||||
body = _extract_function_body(text, "memory_compress")
|
||||
assert 'decision' in body and 't.get(' in body, (
|
||||
"memory_compress does not read from t['decision']. "
|
||||
"Actions are nested under the decision dict."
|
||||
)
|
||||
# The OLD bug pattern
|
||||
for line in body.splitlines():
|
||||
stripped = line.strip()
|
||||
if 't.get("actions"' in stripped and 'decision' not in stripped:
|
||||
raise AssertionError(
|
||||
"Bug: memory_compress still reads t.get('actions') directly."
|
||||
)
|
||||
|
||||
|
||||
# ── Test: good_morning_report reads yesterday's ticks ────────────────
|
||||
|
||||
def test_good_morning_report_reads_yesterday_ticks():
|
||||
"""At 6 AM, the morning report should read yesterday's tick log, not today's."""
|
||||
text = _read_tasks()
|
||||
body = _extract_function_body(text, "good_morning_report")
|
||||
assert "timedelta" in body, (
|
||||
"good_morning_report does not use timedelta to compute yesterday."
|
||||
)
|
||||
# Ensure the old bug pattern is gone
|
||||
for line in body.splitlines():
|
||||
stripped = line.strip()
|
||||
if "yesterday = now.strftime" in stripped and "timedelta" not in stripped:
|
||||
raise AssertionError(
|
||||
"Bug: good_morning_report still sets yesterday = now.strftime()."
|
||||
)
|
||||
|
||||
|
||||
# ── Test: review_prs includes file list in rejection ─────────────────
|
||||
|
||||
def test_review_prs_rejection_includes_file_list():
|
||||
"""Rejection comments should include file names for actionability."""
|
||||
text = _read_tasks()
|
||||
body = _extract_function_body(text, "review_prs")
|
||||
assert "file_list" in body and "filename" in body, (
|
||||
"review_prs rejection comment should include a file_list."
|
||||
)
|
||||
Reference in New Issue
Block a user