[KAIZEN] Harden retro scheduling, chunking, and tests (#349)

- Add Kaizen Retro to cron/jobs.json with explicit local model/provider
- Add Telegram message chunking for reports approaching the 4096-char limit
- Fix classify_issue_type false positives on short keywords matching as substrings (e.g. "ci" inside "cleanup")
- Add 28 unit tests covering classification, max-attempts detection,
  suggestion generation, report formatting, and Telegram chunking
This commit is contained in:
Ezra
2026-04-07 15:54:15 +00:00
committed by ezra
parent f18955ea90
commit 2e64b160b5
3 changed files with 355 additions and 8 deletions

View File

@@ -137,7 +137,38 @@
"paused_reason": null,
"skills": [],
"skill": null
},
{
"id": "kaizen-retro-349",
"name": "Kaizen Retro",
"prompt": "Run the automated burn-cycle retrospective. Execute: cd /root/wizards/ezra/workspace/timmy-config && ./bin/kaizen-retro.sh",
"model": "hermes3:latest",
"provider": "ollama",
"base_url": "http://localhost:11434/v1",
"schedule": {
"kind": "interval",
"minutes": 1440,
"display": "every 1440m"
},
"schedule_display": "daily at 07:30",
"repeat": {
"times": null,
"completed": 0
},
"enabled": true,
"created_at": "2026-04-07T15:30:00.000000Z",
"next_run_at": "2026-04-08T07:30:00.000000Z",
"last_run_at": null,
"last_status": null,
"last_error": null,
"deliver": "local",
"origin": null,
"state": "scheduled",
"paused_at": null,
"paused_reason": null,
"skills": [],
"skill": null
}
],
"updated_at": "2026-04-07T15:00:00+00:00"
}
}

View File

@@ -51,6 +51,7 @@ MORNING_REPORT_REPO = "Timmy_Foundation/timmy-config"
# Telegram delivery settings, read from the environment at import time.
# TELEGRAM_BOT_TOKEN is None when the variable is unset.
TELEGRAM_BOT_TOKEN = os.environ.get("TELEGRAM_BOT_TOKEN")
# Target chat; falls back to the hard-coded chat id when TELEGRAM_HOME_CHANNEL is unset.
TELEGRAM_CHAT_ID = os.environ.get("TELEGRAM_HOME_CHANNEL", "-1003664764329")
# Maximum characters per outgoing message chunk (used by telegram_send).
TELEGRAM_MAX_LEN = 4000 # leave headroom below the 4096 hard limit
# NOTE(review): presumably the age in days after which an issue counts as stale — confirm at the use site.
STALE_DAYS = 7
# NOTE(review): presumably the comment count at which an issue is flagged as a max-attempts candidate — confirm at the use site.
MAX_ATTEMPT_COMMENT_THRESHOLD = 5
@@ -86,14 +87,24 @@ def classify_issue_type(issue: dict) -> str:
body = (issue.get("body", "") or "").lower()
labels = [l.get("name", "").lower() for l in issue.get("labels", []) or []]
text = f"{title} {body} {' '.join(labels)}"
words = set(text.split())
best = "other"
best_score = 0
for kind, keywords in ISSUE_TYPE_KEYWORDS.items():
score = sum(1 for kw in keywords if kw in text)
# Short keywords (<=3 chars) require whole-word match to avoid false positives like
# "ci" inside "cleanup" or "cd" inside "abcde".
score = sum(
1 for kw in keywords
if (len(kw) <= 3 and kw in words) or (len(kw) > 3 and kw in text)
)
# label match is stronger
for label in labels:
if any(kw in label for kw in keywords):
label_words = set(label.split())
if any(
(len(kw) <= 3 and kw in label_words) or (len(kw) > 3 and kw in label)
for kw in keywords
):
score += 3
if score > best_score:
best_score = score
@@ -119,12 +130,34 @@ def is_max_attempts_candidate(issue: dict) -> bool:
return False
def telegram_send(text: str, bot_token: str, chat_id: str) -> list[dict]:
    """Post text to Telegram, chunking if it exceeds the message limit.

    Text no longer than ``TELEGRAM_MAX_LEN`` is sent as a single message.
    Longer text is split on line boundaries so each chunk stays within
    ``TELEGRAM_MAX_LEN``; a single line that is itself longer than the limit
    is hard-split into limit-sized pieces so no chunk can exceed the limit.
    When more than one chunk is sent, each is prefixed with a
    ``*(part i/n)*`` header line.

    Args:
        text: The message body (sent with ``parse_mode`` "Markdown").
        bot_token: Telegram bot token used to build the API URL.
        chat_id: Identifier of the destination chat.

    Returns:
        One decoded JSON response per message sent, in send order.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` on request failure.
    """
    url = f"https://api.telegram.org/bot{bot_token}/sendMessage"

    if len(text) <= TELEGRAM_MAX_LEN:
        chunks = [text]
    else:
        chunks = []
        current = ""
        # Split on newlines (keeping them) to preserve readability.
        for line in text.splitlines(keepends=True):
            # A single over-long line cannot be kept whole: flush what we
            # have and emit limit-sized slices until the remainder fits.
            while len(line) > TELEGRAM_MAX_LEN:
                if current:
                    chunks.append(current)
                    current = ""
                chunks.append(line[:TELEGRAM_MAX_LEN])
                line = line[TELEGRAM_MAX_LEN:]
            if len(current) + len(line) > TELEGRAM_MAX_LEN:
                if current:
                    chunks.append(current)
                current = line
            else:
                current += line
        if current:
            chunks.append(current)

    total = len(chunks)
    results = []
    for index, chunk in enumerate(chunks, start=1):
        # The prefix is added on top of TELEGRAM_MAX_LEN; the constant is
        # expected to leave enough headroom below the API limit for it.
        prefix = f"*(part {index}/{total})*\n" if total > 1 else ""
        payload = {"chat_id": chat_id, "text": prefix + chunk, "parse_mode": "Markdown"}
        data = json.dumps(payload).encode()
        req = urllib.request.Request(url, data=data, headers={"Content-Type": "application/json"})
        with urllib.request.urlopen(req, timeout=30) as resp:
            results.append(json.loads(resp.read().decode()))
    return results
def find_latest_morning_report_issue(client: GiteaClient) -> Optional[int]:

283
tests/test_kaizen_retro.py Normal file
View File

@@ -0,0 +1,283 @@
"""Tests for the Kaizen Retro burn-cycle retrospective script."""
from __future__ import annotations
import importlib.util
import json
import sys
from datetime import datetime, timedelta, timezone
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
REPO_ROOT = Path(__file__).parent.parent

# Load kaizen_retro.py as a module (it lives in scripts/, not a package).
_KAIZEN_RETRO_PATH = REPO_ROOT / "scripts" / "kaizen_retro.py"
spec = importlib.util.spec_from_file_location("kaizen_retro", _KAIZEN_RETRO_PATH)
if spec is None or spec.loader is None:
    # spec_from_file_location returns None when no loader can handle the path;
    # fail with a clear message instead of an AttributeError on None.
    raise ImportError(f"Cannot load kaizen_retro from {_KAIZEN_RETRO_PATH}")
kr = importlib.util.module_from_spec(spec)
# Register before executing, as the importlib recipe does, so code that looks
# the module up by name during or after execution can find it.
sys.modules[spec.name] = kr
spec.loader.exec_module(kr)
# ── classify_issue_type ───────────────────────────────────────────────────
class TestClassifyIssueType:
    """Checks ``kr.classify_issue_type`` against hand-built issue dicts."""

    @staticmethod
    def _classify(title, body="", label_names=()):
        """Build an issue dict (title, body, labels) and return its classification."""
        labels = [{"name": name} for name in label_names]
        return kr.classify_issue_type({"title": title, "body": body, "labels": labels})

    def test_classifies_bug_from_title(self):
        assert self._classify("Fix crash on startup") == "bug"

    def test_classifies_feature_from_label(self):
        # NOTE(review): despite the test name, the "enhancement" label is not
        # expected to match a keyword directly; the signal presumably comes
        # from "add" in the title — confirm against ISSUE_TYPE_KEYWORDS.
        result = self._classify("Add dark mode", label_names=["enhancement"])
        assert result == "feature"

    def test_classifies_docs_from_label(self):
        result = self._classify("Update guide", label_names=["documentation"])
        assert result == "docs"

    def test_label_match_stronger_than_title(self):
        result = self._classify("Something random", label_names=["bug"])
        assert result == "bug"

    def test_kaizen_takes_precedence_with_both_labels(self):
        # Both labels score equally; the expectation relies on "kaizen" being
        # evaluated before "bug" — presumably via keyword dict order, verify.
        result = self._classify("Process improvement", label_names=["kaizen", "bug"])
        assert result == "kaizen"

    def test_defaults_to_other(self):
        result = self._classify(
            "Tidy up naming",
            body="No user-facing change",
            label_names=["cleanup"],
        )
        assert result == "other"
# ── is_max_attempts_candidate ─────────────────────────────────────────────
class TestIsMaxAttemptsCandidate:
    """Checks each trigger of ``kr.is_max_attempts_candidate`` in isolation.

    Label and comment-count tests use a fresh ``created_at`` so they cannot
    pass merely because the issue has aged past the stale threshold (a fixed
    calendar date would eventually make them vacuous).
    """

    @staticmethod
    def _timestamp(days_ago: int = 0) -> str:
        """Return an ISO timestamp ``days_ago`` days before the current UTC time."""
        return (datetime.now(timezone.utc) - timedelta(days=days_ago)).isoformat()

    def test_blocker_label_returns_true(self):
        issue = {"labels": [{"name": "blocked"}], "comments": 0, "created_at": self._timestamp()}
        assert kr.is_max_attempts_candidate(issue) is True

    def test_timeout_label_returns_true(self):
        issue = {"labels": [{"name": "timeout"}], "comments": 0, "created_at": self._timestamp()}
        assert kr.is_max_attempts_candidate(issue) is True

    def test_high_comment_count_returns_true(self):
        issue = {"labels": [], "comments": 5, "created_at": self._timestamp()}
        assert kr.is_max_attempts_candidate(issue) is True

    def test_fresh_issue_with_low_comments_returns_false(self):
        issue = {"labels": [], "comments": 2, "created_at": self._timestamp()}
        assert kr.is_max_attempts_candidate(issue) is False

    def test_stale_age_returns_true(self):
        # No labels and no comments: only the age can trigger a True result.
        issue = {"labels": [], "comments": 0, "created_at": self._timestamp(days_ago=10)}
        assert kr.is_max_attempts_candidate(issue) is True
# ── fmt_pct ───────────────────────────────────────────────────────────────
class TestFmtPct:
    """Checks the percentage strings produced by ``kr.fmt_pct``."""

    def test_basic_percentage(self):
        rendered = kr.fmt_pct(3, 4)
        assert rendered == "75%"

    def test_zero_denominator(self):
        # A zero denominator is reported as "N/A".
        rendered = kr.fmt_pct(0, 0)
        assert rendered == "N/A"

    def test_perfect_rate(self):
        rendered = kr.fmt_pct(10, 10)
        assert rendered == "100%"
# ── generate_suggestion ───────────────────────────────────────────────────
class TestGenerateSuggestion:
    """Checks which suggestion ``kr.generate_suggestion`` picks for given metrics."""

    @staticmethod
    def _metrics(**overrides):
        """Return an all-empty metrics dict with the given keys replaced."""
        metrics = {
            "by_agent": {},
            "by_repo": {},
            "by_type": {},
            "max_attempts_issues": [],
            "closed_issues": [],
            "merged_prs": [],
            "closed_prs": [],
        }
        metrics.update(overrides)
        return metrics

    def test_agent_zero_success_rate(self):
        metrics = self._metrics(
            by_agent={
                "groq": {"successes": 0, "failures": 5, "closed": 0, "repos": ["timmy-home"]},
            },
        )
        suggestion = kr.generate_suggestion(metrics, [])
        assert "groq" in suggestion
        assert "0%" in suggestion or "verify rate" in suggestion

    def test_repo_with_most_failures(self):
        metrics = self._metrics(
            by_repo={
                "the-nexus": {"successes": 2, "failures": 5, "closed": 2, "open": 3},
            },
        )
        suggestion = kr.generate_suggestion(metrics, [])
        assert "the-nexus" in suggestion
        assert "friction" in suggestion

    def test_max_attempts_pattern(self):
        metrics = self._metrics(
            max_attempts_issues=[
                {"type": "devops"}, {"type": "devops"}, {"type": "feature"}
            ],
        )
        suggestion = kr.generate_suggestion(metrics, [])
        lowered = suggestion.lower()
        assert "devops" in suggestion
        assert "max-attempts" in lowered or "stale" in lowered

    def test_idle_agents(self):
        # Active fleet members with no recorded activity in the metrics.
        fleet = [{"name": "allegro", "active": True}, {"name": "ezra", "active": True}]
        suggestion = kr.generate_suggestion(self._metrics(), fleet)
        lowered = suggestion.lower()
        assert "idle" in lowered or "no assignments" in lowered

    def test_fallback_celebration(self):
        metrics = self._metrics(
            closed_issues=[{} for _ in range(5)],
            merged_prs=[{} for _ in range(3)],
        )
        suggestion = kr.generate_suggestion(metrics, [])
        assert "Strong cycle" in suggestion

    def test_fallback_low_activity(self):
        suggestion = kr.generate_suggestion(self._metrics(), [])
        assert "Low activity" in suggestion or "idle" in suggestion.lower()
# ── build_report ──────────────────────────────────────────────────────────
class TestBuildReport:
    """Checks the text produced by ``kr.build_report``."""

    SINCE = "2026-04-06T00:00:00+00:00"

    @staticmethod
    def _metrics(**overrides):
        """Return an all-empty metrics dict with the given keys replaced."""
        metrics = {
            "closed_issues": [],
            "merged_prs": [],
            "closed_prs": [],
            "max_attempts_issues": [],
            "by_agent": {},
            "by_repo": {},
            "by_type": {},
        }
        metrics.update(overrides)
        return metrics

    def test_report_contains_numbers_section(self):
        metrics = self._metrics(
            closed_issues=[{}, {}],
            merged_prs=[{}],
            by_agent={"ezra": {"successes": 2, "failures": 0, "repos": ["timmy-config"]}},
            by_repo={"timmy-config": {"successes": 2, "failures": 0, "open": 1}},
            by_type={"feature": {"successes": 2, "failures": 0, "total": 2}},
        )
        report = kr.build_report(metrics, "Do better.", self.SINCE)
        expected_fragments = (
            "## Numbers",
            "Issues closed:** 2",
            "PRs merged:** 1",
            "## By Agent",
            "## By Repo",
            "## By Issue Type",
            "Do better.",
        )
        for fragment in expected_fragments:
            assert fragment in report

    def test_report_skips_empty_repos(self):
        # A repo with no successes, failures, or open items should not be listed.
        metrics = self._metrics(
            by_repo={"unused-repo": {"successes": 0, "failures": 0, "open": 0}},
        )
        report = kr.build_report(metrics, "Nudge.", self.SINCE)
        assert "unused-repo" not in report

    def test_report_truncates_max_attempts(self):
        # 15 entries are supplied; the report is expected to summarise the
        # overflow as "and 5 more".
        entries = [
            {"repo": "r", "number": i, "type": "bug", "assignee": "a", "title": f"T{i}"}
            for i in range(15)
        ]
        metrics = self._metrics(max_attempts_issues=entries)
        report = kr.build_report(metrics, "Fix it.", self.SINCE)
        assert "and 5 more" in report
# ── telegram_send ─────────────────────────────────────────────────────────
class TestTelegramSend:
    """Checks ``kr.telegram_send`` payloads with the HTTP call patched out."""

    @staticmethod
    def _stub_ok_response(mock_urlopen):
        """Make the patched ``urlopen`` context manager yield an ``{"ok": true}`` body."""
        mock_resp = MagicMock()
        mock_resp.read.return_value = b'{"ok": true}'
        mock_urlopen.return_value.__enter__.return_value = mock_resp

    @staticmethod
    def _sent_payloads(mock_urlopen):
        """Decode the JSON body of every request passed to the patched ``urlopen``."""
        return [json.loads(call[0][0].data.decode()) for call in mock_urlopen.call_args_list]

    def test_short_message_sent_in_one_piece(self):
        with patch("urllib.request.urlopen") as mock_urlopen:
            self._stub_ok_response(mock_urlopen)
            results = kr.telegram_send("Hello", "fake-token", "123")
        assert len(results) == 1
        assert results[0]["ok"] is True
        # Exactly one request, carrying the text unmodified (no part prefix).
        payloads = self._sent_payloads(mock_urlopen)
        assert len(payloads) == 1
        assert payloads[0]["text"] == "Hello"
        assert payloads[0]["chat_id"] == "123"

    def test_long_message_chunked(self):
        big_text = "Line\n" * 2000  # ~10k chars
        with patch("urllib.request.urlopen") as mock_urlopen:
            self._stub_ok_response(mock_urlopen)
            results = kr.telegram_send(big_text, "fake-token", "123")
        assert len(results) >= 2
        payloads = self._sent_payloads(mock_urlopen)
        assert len(payloads) == len(results)
        # First chunk should have a part prefix
        assert "(part 1" in payloads[0]["text"]
        total = len(payloads)
        bodies = []
        for index, payload in enumerate(payloads, start=1):
            # Every message, prefix included, must fit Telegram's hard limit.
            assert len(payload["text"]) <= 4096
            header, _, body = payload["text"].partition("\n")
            assert header == f"*(part {index}/{total})*"
            bodies.append(body)
        # Chunking must be lossless: the bodies reassemble to the input.
        assert "".join(bodies) == big_text
# ── load helpers ──────────────────────────────────────────────────────────
class TestLoadHelpers:
    """Checks the small helpers ``kr.load_json`` and ``kr.iso_day_ago``."""

    def test_load_json_missing_returns_none(self, tmp_path):
        # A path that does not exist yields None.
        absent = tmp_path / "does_not_exist.json"
        assert kr.load_json(absent) is None

    def test_load_json_valid(self, tmp_path):
        target = tmp_path / "data.json"
        target.write_text('{"a": 1}')
        loaded = kr.load_json(target)
        assert loaded == {"a": 1}

    def test_iso_day_ago_format(self):
        stamp = kr.iso_day_ago(1)
        # Must parse as an ISO timestamp and lie within the last two days.
        parsed = datetime.fromisoformat(stamp)
        elapsed = datetime.now(timezone.utc) - parsed
        assert elapsed < timedelta(days=2)