[KAIZEN] Harden retro scheduling, chunking, and tests (#349)

- Add Kaizen Retro to cron/jobs.json with explicit local model/provider
- Add Telegram message chunking for reports approaching the 4096-char limit
- Fix classify_issue_type false positives on short substrings (e.g. "ci" matching inside "cleanup")
- Add 28 unit tests covering classification, max-attempts detection, suggestion generation, report formatting, and Telegram chunking
2026-04-07 15:54:15 +00:00
parent f18955ea90
commit 2e64b160b5
3 changed files with 355 additions and 8 deletions
--- a/cron/jobs.json
+++ b/cron/jobs.json
@@ -137,7 +137,38 @@
      "paused_reason": null,
      "skills": [],
      "skill": null
+    },
+    {
+      "id": "kaizen-retro-349",
+      "name": "Kaizen Retro",
+      "prompt": "Run the automated burn-cycle retrospective. Execute: cd /root/wizards/ezra/workspace/timmy-config && ./bin/kaizen-retro.sh",
+      "model": "hermes3:latest",
+      "provider": "ollama",
+      "base_url": "http://localhost:11434/v1",
+      "schedule": {
+        "kind": "interval",
+        "minutes": 1440,
+        "display": "every 1440m"
+      },
+      "schedule_display": "daily at 07:30",
+      "repeat": {
+        "times": null,
+        "completed": 0
+      },
+      "enabled": true,
+      "created_at": "2026-04-07T15:30:00.000000Z",
+      "next_run_at": "2026-04-08T07:30:00.000000Z",
+      "last_run_at": null,
+      "last_status": null,
+      "last_error": null,
+      "deliver": "local",
+      "origin": null,
+      "state": "scheduled",
+      "paused_at": null,
+      "paused_reason": null,
+      "skills": [],
+      "skill": null
    }
  ],
  "updated_at": "2026-04-07T15:00:00+00:00"
-}
+}
--- a/scripts/kaizen_retro.py
+++ b/scripts/kaizen_retro.py
@@ -51,6 +51,7 @@ MORNING_REPORT_REPO = "Timmy_Foundation/timmy-config"

 TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN")
 TELEGRAM_CHAT_ID = os.environ.get("TELEGRAM_HOME_CHANNEL", "-1003664764329")
+TELEGRAM_MAX_LEN = 4000  # leave headroom below the 4096 hard limit

 STALE_DAYS = 7
 MAX_ATTEMPT_COMMENT_THRESHOLD = 5
@@ -86,14 +87,24 @@ def classify_issue_type(issue: dict) -> str:
    body = (issue.get("body", "") or "").lower()
    labels = [l.get("name", "").lower() for l in issue.get("labels", []) or []]
    text = f"{title} {body} {' '.join(labels)}"
+    words = set(text.split())

    best = "other"
    best_score = 0
    for kind, keywords in ISSUE_TYPE_KEYWORDS.items():
-        score = sum(1 for kw in keywords if kw in text)
+        # Short keywords (<=3 chars) require whole-word match to avoid false positives like
+        # "ci" inside "cleanup" or "cd" inside "abcde".
+        score = sum(
+            1 for kw in keywords
+            if (len(kw) <= 3 and kw in words) or (len(kw) > 3 and kw in text)
+        )
        # label match is stronger
        for label in labels:
-            if any(kw in label for kw in keywords):
+            label_words = set(label.split())
+            if any(
+                (len(kw) <= 3 and kw in label_words) or (len(kw) > 3 and kw in label)
+                for kw in keywords
+            ):
                score += 3
        if score > best_score:
            best_score = score
@@ -119,12 +130,34 @@ def is_max_attempts_candidate(issue: dict) -> bool:
    return False


-def telegram_send(text: str, bot_token: str, chat_id: str) -> dict:
+def telegram_send(text: str, bot_token: str, chat_id: str) -> list[dict]:
+    """Post text to Telegram, chunking if it exceeds the message limit."""
    url = f"https://api.telegram.org/bot{bot_token}/sendMessage"
-    data = json.dumps({"chat_id": chat_id, "text": text, "parse_mode": "Markdown"}).encode()
-    req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"})
-    with urllib.request.urlopen(req, timeout=30) as resp:
-        return json.loads(resp.read().decode())
+    chunks = []
+    if len(text) <= TELEGRAM_MAX_LEN:
+        chunks = [text]
+    else:
+        # Split on newlines to preserve readability
+        lines = text.splitlines(keepends=True)
+        current = ""
+        for line in lines:
+            if len(current) + len(line) > TELEGRAM_MAX_LEN:
+                if current:
+                    chunks.append(current)
+                current = line
+            else:
+                current += line
+        if current:
+            chunks.append(current)
+    results = []
+    for i, chunk in enumerate(chunks):
+        prefix = f"*(part {i + 1}/{len(chunks)})*\n" if len(chunks) > 1 else ""
+        payload = {"chat_id": chat_id, "text": prefix + chunk, "parse_mode": "Markdown"}
+        data = json.dumps(payload).encode()
+        req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"})
+        with urllib.request.urlopen(req, timeout=30) as resp:
+            results.append(json.loads(resp.read().decode()))
+    return results


 def find_latest_morning_report_issue(client: GiteaClient) -> Optional[int]:
--- a/tests/test_kaizen_retro.py
+++ b/tests/test_kaizen_retro.py
@@ -0,0 +1,283 @@
+"""Tests for the Kaizen Retro burn-cycle retrospective script."""
+
+from __future__ import annotations
+
+import importlib.util
+import json
+import sys
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+REPO_ROOT = Path(__file__).parent.parent
+
+# Load kaizen_retro.py as a module (it lives in scripts/, not a package)
+spec = importlib.util.spec_from_file_location("kaizen_retro", REPO_ROOT / "scripts" / "kaizen_retro.py")
+kr = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(kr)
+
+
+# ── classify_issue_type ───────────────────────────────────────────────────
+
+class TestClassifyIssueType:
+    def test_classifies_bug_from_title(self):
+        issue = {"title": "Fix crash on startup", "body": "", "labels": []}
+        assert kr.classify_issue_type(issue) == "bug"
+
+    def test_classifies_feature_from_label(self):
+        issue = {"title": "Add dark mode", "body": "", "labels": [{"name": "enhancement"}]}
+        # label "enhancement" doesn't match any keyword directly; the title has "add" (not "feature"), which is assumed to be a feature keyword
+        assert kr.classify_issue_type(issue) == "feature"
+
+    def test_classifies_docs_from_label(self):
+        issue = {"title": "Update guide", "body": "", "labels": [{"name": "documentation"}]}
+        assert kr.classify_issue_type(issue) == "docs"
+
+    def test_label_match_stronger_than_title(self):
+        issue = {"title": "Something random", "body": "", "labels": [{"name": "bug"}]}
+        assert kr.classify_issue_type(issue) == "bug"
+
+    def test_kaizen_takes_precedence_with_both_labels(self):
+        issue = {"title": "Process improvement", "body": "", "labels": [{"name": "kaizen"}, {"name": "bug"}]}
+        # kaizen and bug labels each add +3; ties keep the first kind iterated (strict ">"), so this assumes "kaizen" precedes "bug" in ISSUE_TYPE_KEYWORDS
+        assert kr.classify_issue_type(issue) == "kaizen"
+
+    def test_defaults_to_other(self):
+        issue = {"title": "Tidy up naming", "body": "No user-facing change", "labels": [{"name": "cleanup"}]}
+        assert kr.classify_issue_type(issue) == "other"
+
+
+# ── is_max_attempts_candidate ─────────────────────────────────────────────
+
+class TestIsMaxAttemptsCandidate:
+    def test_blocker_label_returns_true(self):
+        issue = {"labels": [{"name": "blocked"}], "comments": 0, "created_at": "2026-04-07T00:00:00Z"}
+        assert kr.is_max_attempts_candidate(issue) is True
+
+    def test_timeout_label_returns_true(self):
+        issue = {"labels": [{"name": "timeout"}], "comments": 0, "created_at": "2026-04-07T00:00:00Z"}
+        assert kr.is_max_attempts_candidate(issue) is True
+
+    def test_high_comment_count_returns_true(self):
+        issue = {"labels": [], "comments": 5, "created_at": "2026-04-07T00:00:00Z"}
+        assert kr.is_max_attempts_candidate(issue) is True
+
+    def test_fresh_issue_with_low_comments_returns_false(self):
+        now = datetime.now(timezone.utc)
+        issue = {"labels": [], "comments": 2, "created_at": now.isoformat()}
+        assert kr.is_max_attempts_candidate(issue) is False
+
+    def test_stale_age_returns_true(self):
+        old = datetime.now(timezone.utc) - timedelta(days=10)
+        issue = {"labels": [], "comments": 0, "created_at": old.isoformat()}
+        assert kr.is_max_attempts_candidate(issue) is True
+
+
+# ── fmt_pct ───────────────────────────────────────────────────────────────
+
+class TestFmtPct:
+    def test_basic_percentage(self):
+        assert kr.fmt_pct(3, 4) == "75%"
+
+    def test_zero_denominator(self):
+        assert kr.fmt_pct(0, 0) == "N/A"
+
+    def test_perfect_rate(self):
+        assert kr.fmt_pct(10, 10) == "100%"
+
+
+# ── generate_suggestion ───────────────────────────────────────────────────
+
+class TestGenerateSuggestion:
+    def test_agent_zero_success_rate(self):
+        metrics = {
+            "by_agent": {
+                "groq": {"successes": 0, "failures": 5, "closed": 0, "repos": ["timmy-home"]},
+            },
+            "by_repo": {},
+            "by_type": {},
+            "max_attempts_issues": [],
+            "closed_issues": [],
+            "merged_prs": [],
+            "closed_prs": [],
+        }
+        suggestion = kr.generate_suggestion(metrics, [])
+        assert "groq" in suggestion
+        assert "0%" in suggestion or "verify rate" in suggestion
+
+    def test_repo_with_most_failures(self):
+        metrics = {
+            "by_agent": {},
+            "by_repo": {
+                "the-nexus": {"successes": 2, "failures": 5, "closed": 2, "open": 3},
+            },
+            "by_type": {},
+            "max_attempts_issues": [],
+            "closed_issues": [],
+            "merged_prs": [],
+            "closed_prs": [],
+        }
+        suggestion = kr.generate_suggestion(metrics, [])
+        assert "the-nexus" in suggestion
+        assert "friction" in suggestion
+
+    def test_max_attempts_pattern(self):
+        metrics = {
+            "by_agent": {},
+            "by_repo": {},
+            "by_type": {},
+            "max_attempts_issues": [
+                {"type": "devops"}, {"type": "devops"}, {"type": "feature"}
+            ],
+            "closed_issues": [],
+            "merged_prs": [],
+            "closed_prs": [],
+        }
+        suggestion = kr.generate_suggestion(metrics, [])
+        assert "devops" in suggestion
+        assert "max-attempts" in suggestion.lower() or "stale" in suggestion.lower()
+
+    def test_idle_agents(self):
+        metrics = {
+            "by_agent": {},
+            "by_repo": {},
+            "by_type": {},
+            "max_attempts_issues": [],
+            "closed_issues": [],
+            "merged_prs": [],
+            "closed_prs": [],
+        }
+        fleet = [{"name": "allegro", "active": True}, {"name": "ezra", "active": True}]
+        suggestion = kr.generate_suggestion(metrics, fleet)
+        assert "idle" in suggestion.lower() or "no assignments" in suggestion.lower()
+
+    def test_fallback_celebration(self):
+        metrics = {
+            "by_agent": {},
+            "by_repo": {},
+            "by_type": {},
+            "max_attempts_issues": [],
+            "closed_issues": [{}, {}, {}, {}, {}],
+            "merged_prs": [{}, {}, {}],
+            "closed_prs": [],
+        }
+        suggestion = kr.generate_suggestion(metrics, [])
+        assert "Strong cycle" in suggestion
+
+    def test_fallback_low_activity(self):
+        metrics = {
+            "by_agent": {},
+            "by_repo": {},
+            "by_type": {},
+            "max_attempts_issues": [],
+            "closed_issues": [],
+            "merged_prs": [],
+            "closed_prs": [],
+        }
+        suggestion = kr.generate_suggestion(metrics, [])
+        assert "Low activity" in suggestion or "idle" in suggestion.lower()
+
+
+# ── build_report ──────────────────────────────────────────────────────────
+
+class TestBuildReport:
+    def test_report_contains_numbers_section(self):
+        metrics = {
+            "closed_issues": [{}, {}],
+            "merged_prs": [{}],
+            "closed_prs": [],
+            "max_attempts_issues": [],
+            "by_agent": {"ezra": {"successes": 2, "failures": 0, "repos": ["timmy-config"]}},
+            "by_repo": {"timmy-config": {"successes": 2, "failures": 0, "open": 1}},
+            "by_type": {"feature": {"successes": 2, "failures": 0, "total": 2}},
+        }
+        report = kr.build_report(metrics, "Do better.", "2026-04-06T00:00:00+00:00")
+        assert "## Numbers" in report
+        assert "Issues closed:** 2" in report
+        assert "PRs merged:** 1" in report
+        assert "## By Agent" in report
+        assert "## By Repo" in report
+        assert "## By Issue Type" in report
+        assert "Do better." in report
+
+    def test_report_skips_empty_repos(self):
+        metrics = {
+            "closed_issues": [],
+            "merged_prs": [],
+            "closed_prs": [],
+            "max_attempts_issues": [],
+            "by_agent": {},
+            "by_repo": {"unused-repo": {"successes": 0, "failures": 0, "open": 0}},
+            "by_type": {},
+        }
+        report = kr.build_report(metrics, "Nudge.", "2026-04-06T00:00:00+00:00")
+        assert "unused-repo" not in report
+
+    def test_report_truncates_max_attempts(self):
+        metrics = {
+            "closed_issues": [],
+            "merged_prs": [],
+            "closed_prs": [],
+            "max_attempts_issues": [{"repo": "r", "number": i, "type": "bug", "assignee": "a", "title": f"T{i}"} for i in range(15)],
+            "by_agent": {},
+            "by_repo": {},
+            "by_type": {},
+        }
+        report = kr.build_report(metrics, "Fix it.", "2026-04-06T00:00:00+00:00")
+        assert "and 5 more" in report
+
+
+# ── telegram_send ─────────────────────────────────────────────────────────
+
+class TestTelegramSend:
+    def test_short_message_sent_in_one_piece(self):
+        with patch("urllib.request.urlopen") as mock_urlopen:
+            mock_resp = MagicMock()
+            mock_resp.read.return_value = b'{"ok": true}'
+            mock_urlopen.return_value.__enter__.return_value = mock_resp
+
+            results = kr.telegram_send("Hello", "fake-token", "123")
+            assert len(results) == 1
+            assert results[0]["ok"] is True
+            # Verify payload
+            call_args = mock_urlopen.call_args
+            req = call_args[0][0]
+            payload = json.loads(req.data.decode())
+            assert payload["text"] == "Hello"
+            assert payload["chat_id"] == "123"
+
+    def test_long_message_chunked(self):
+        big_text = "Line\n" * 2000  # ~10k chars
+        with patch("urllib.request.urlopen") as mock_urlopen:
+            mock_resp = MagicMock()
+            mock_resp.read.return_value = b'{"ok": true}'
+            mock_urlopen.return_value.__enter__.return_value = mock_resp
+
+            results = kr.telegram_send(big_text, "fake-token", "123")
+            assert len(results) >= 2
+            # First chunk should have a part prefix
+            req = mock_urlopen.call_args_list[0][0][0]
+            payload = json.loads(req.data.decode())
+            assert "(part 1" in payload["text"]
+
+
+# ── load helpers ──────────────────────────────────────────────────────────
+
+class TestLoadHelpers:
+    def test_load_json_missing_returns_none(self, tmp_path):
+        missing = tmp_path / "does_not_exist.json"
+        assert kr.load_json(missing) is None
+
+    def test_load_json_valid(self, tmp_path):
+        p = tmp_path / "data.json"
+        p.write_text('{"a": 1}')
+        assert kr.load_json(p) == {"a": 1}
+
+    def test_iso_day_ago_format(self):
+        s = kr.iso_day_ago(1)
+        # Should be a valid ISO timestamp string
+        dt = datetime.fromisoformat(s)
+        now = datetime.now(timezone.utc)
+        assert now - dt < timedelta(days=2)