1
0
This repository has been archived on 2026-03-24. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
Timmy-time-dashboard/src/timmy/vassal/orchestration_loop.py

322 lines
11 KiB
Python

"""Vassal Protocol — main orchestration loop.
Ties the backlog, dispatch, agent health, and house health modules together
into a single ``VassalOrchestrator`` that can run as a background service.
Each cycle:
1. Fetch open Gitea issues
2. Triage: score priority + route to agent
3. Dispatch: apply labels / post routing comments
4. Check agent health: nudge stuck agents
5. Check house health: log warnings, trigger cleanup if needed
6. Return a VassalCycleRecord summarising the cycle
Usage::
from timmy.vassal import vassal_orchestrator
record = await vassal_orchestrator.run_cycle()
status = vassal_orchestrator.get_status()
"""
from __future__ import annotations
import asyncio
import logging
import time
from dataclasses import dataclass, field
from datetime import UTC, datetime
from typing import Any
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Cycle record
# ---------------------------------------------------------------------------
@dataclass
class VassalCycleRecord:
"""Summary of one orchestration cycle."""
cycle_id: int
started_at: str
finished_at: str = ""
duration_ms: int = 0
issues_fetched: int = 0
issues_dispatched: int = 0
dispatched_to_claude: int = 0
dispatched_to_kimi: int = 0
dispatched_to_timmy: int = 0
stuck_agents: list[str] = field(default_factory=list)
nudges_sent: int = 0
house_warnings: list[str] = field(default_factory=list)
cleanup_deleted: int = 0
errors: list[str] = field(default_factory=list)
@property
def healthy(self) -> bool:
return not self.errors and not self.house_warnings
# ---------------------------------------------------------------------------
# Orchestrator
# ---------------------------------------------------------------------------
class VassalOrchestrator:
"""Timmy's autonomous orchestration engine.
Runs observe → triage → dispatch → monitor → house-check cycles on a
configurable interval.
Parameters
----------
cycle_interval:
Seconds between cycles. Defaults to ``settings.vassal_cycle_interval``
when available, otherwise 300 s (5 min).
max_dispatch_per_cycle:
Cap on new dispatches per cycle to avoid spamming agents.
"""
def __init__(
self,
cycle_interval: float | None = None,
max_dispatch_per_cycle: int = 10,
) -> None:
self._cycle_count = 0
self._running = False
self._task: asyncio.Task | None = None
self._max_dispatch = max_dispatch_per_cycle
self._history: list[VassalCycleRecord] = []
# Resolve interval — lazy to avoid import-time settings read
self._cycle_interval = cycle_interval
# -- public API --------------------------------------------------------
@property
def cycle_count(self) -> int:
return self._cycle_count
@property
def is_running(self) -> bool:
return self._running
@property
def history(self) -> list[VassalCycleRecord]:
return list(self._history)
def get_status(self) -> dict[str, Any]:
"""Return a JSON-serialisable status dict."""
last = self._history[-1] if self._history else None
return {
"running": self._running,
"cycle_count": self._cycle_count,
"last_cycle": {
"cycle_id": last.cycle_id,
"started_at": last.started_at,
"issues_fetched": last.issues_fetched,
"issues_dispatched": last.issues_dispatched,
"stuck_agents": last.stuck_agents,
"house_warnings": last.house_warnings,
"healthy": last.healthy,
}
if last
else None,
}
# -- single cycle ------------------------------------------------------
async def run_cycle(self) -> VassalCycleRecord:
"""Execute one full orchestration cycle.
Gracefully degrades at each step — a failure in one sub-task does
not abort the rest of the cycle.
Returns:
VassalCycleRecord summarising what happened.
"""
self._cycle_count += 1
start = time.monotonic()
record = VassalCycleRecord(
cycle_id=self._cycle_count,
started_at=datetime.now(UTC).isoformat(),
)
# 1 + 2: Fetch & triage
await self._step_backlog(record)
# 3: Agent health
await self._step_agent_health(record)
# 4: House health
await self._step_house_health(record)
# Finalise record
record.finished_at = datetime.now(UTC).isoformat()
record.duration_ms = int((time.monotonic() - start) * 1000)
self._history.append(record)
# Broadcast via WebSocket (best-effort)
await self._broadcast(record)
logger.info(
"VassalOrchestrator cycle #%d complete (%d ms): "
"fetched=%d dispatched=%d stuck=%s house_ok=%s",
record.cycle_id,
record.duration_ms,
record.issues_fetched,
record.issues_dispatched,
record.stuck_agents or "none",
not record.house_warnings,
)
return record
# -- background loop ---------------------------------------------------
async def start(self) -> None:
"""Start the recurring orchestration loop as a background task."""
if self._running:
logger.warning("VassalOrchestrator already running")
return
self._running = True
self._task = asyncio.ensure_future(self._loop())
def stop(self) -> None:
"""Signal the loop to stop after the current cycle."""
self._running = False
if self._task and not self._task.done():
self._task.cancel()
logger.info("VassalOrchestrator stop requested")
async def _loop(self) -> None:
interval = self._resolve_interval()
logger.info("VassalOrchestrator loop started (interval=%.0fs)", interval)
while self._running:
try:
await self.run_cycle()
except Exception:
logger.exception("VassalOrchestrator cycle failed")
await asyncio.sleep(interval)
# -- step: backlog -------------------------------------------------------
async def _step_backlog(self, record: VassalCycleRecord) -> None:
from timmy.vassal.backlog import fetch_open_issues, triage_issues
from timmy.vassal.dispatch import dispatch_issue, get_dispatch_registry
try:
raw_issues = await fetch_open_issues(
limit=50,
exclude_labels=["wip", "blocked", "needs-info"],
)
record.issues_fetched = len(raw_issues)
if not raw_issues:
return
triaged = triage_issues(raw_issues)
registry = get_dispatch_registry()
dispatched = 0
for issue in triaged:
if dispatched >= self._max_dispatch:
break
# Skip already-dispatched issues
if issue.number in registry:
continue
await dispatch_issue(issue)
dispatched += 1
from timmy.vassal.backlog import AgentTarget
if issue.agent_target == AgentTarget.CLAUDE:
record.dispatched_to_claude += 1
elif issue.agent_target == AgentTarget.KIMI:
record.dispatched_to_kimi += 1
else:
record.dispatched_to_timmy += 1
record.issues_dispatched = dispatched
except Exception as exc:
logger.exception("_step_backlog failed")
record.errors.append(f"backlog: {exc}")
# -- step: agent health -------------------------------------------------
async def _step_agent_health(self, record: VassalCycleRecord) -> None:
from config import settings
from timmy.vassal.agent_health import get_full_health_report, nudge_stuck_agent
try:
threshold = getattr(settings, "vassal_stuck_threshold_minutes", 120)
report = await get_full_health_report(stuck_threshold_minutes=threshold)
for agent_status in report.agents:
if agent_status.is_stuck:
record.stuck_agents.append(agent_status.agent)
for issue_num in agent_status.stuck_issue_numbers:
ok = await nudge_stuck_agent(agent_status.agent, issue_num)
if ok:
record.nudges_sent += 1
except Exception as exc:
logger.exception("_step_agent_health failed")
record.errors.append(f"agent_health: {exc}")
# -- step: house health -------------------------------------------------
async def _step_house_health(self, record: VassalCycleRecord) -> None:
from timmy.vassal.house_health import cleanup_stale_files, get_system_snapshot
try:
snapshot = await get_system_snapshot()
record.house_warnings = snapshot.warnings
# Auto-cleanup temp files when disk is getting tight
if snapshot.disk.percent_used >= 80.0:
result = await cleanup_stale_files(max_age_days=3)
record.cleanup_deleted = result.get("deleted_count", 0)
except Exception as exc:
logger.exception("_step_house_health failed")
record.errors.append(f"house_health: {exc}")
# -- helpers ------------------------------------------------------------
def _resolve_interval(self) -> float:
if self._cycle_interval is not None:
return self._cycle_interval
try:
from config import settings
return float(getattr(settings, "vassal_cycle_interval", 300))
except Exception:
return 300.0
async def _broadcast(self, record: VassalCycleRecord) -> None:
try:
from infrastructure.ws_manager.handler import ws_manager
await ws_manager.broadcast(
"vassal.cycle",
{
"cycle_id": record.cycle_id,
"started_at": record.started_at,
"issues_fetched": record.issues_fetched,
"issues_dispatched": record.issues_dispatched,
"stuck_agents": record.stuck_agents,
"house_warnings": record.house_warnings,
"duration_ms": record.duration_ms,
"healthy": record.healthy,
},
)
except Exception as exc:
logger.debug("VassalOrchestrator broadcast skipped: %s", exc)