# Forked from Rockachopa/Timmy-time-dashboard (322 lines, 11 KiB, Python).
"""Vassal Protocol — main orchestration loop.

Ties the backlog, dispatch, agent health, and house health modules together
into a single ``VassalOrchestrator`` that can run as a background service.

Each cycle:
1. Fetch open Gitea issues
2. Triage: score priority + route to agent
3. Dispatch: apply labels / post routing comments
4. Check agent health: nudge stuck agents
5. Check house health: log warnings, trigger cleanup if needed
6. Return a VassalCycleRecord summarising the cycle

Usage::

    from timmy.vassal import vassal_orchestrator

    record = await vassal_orchestrator.run_cycle()
    status = vassal_orchestrator.get_status()
"""

from __future__ import annotations
|
|
|
|
import asyncio
|
|
import logging
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from datetime import UTC, datetime
|
|
from typing import Any
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
# Cycle record
# ---------------------------------------------------------------------------


@dataclass
class VassalCycleRecord:
    """Summary of one orchestration cycle.

    Only ``cycle_id`` and ``started_at`` are required; every other field
    starts at a zero/empty default and is filled in as the cycle progresses.
    """

    # Identity and timing of the cycle.
    cycle_id: int
    started_at: str
    finished_at: str = ""
    duration_ms: int = 0

    # Backlog step: how many issues were fetched and where they were routed.
    issues_fetched: int = 0
    issues_dispatched: int = 0
    dispatched_to_claude: int = 0
    dispatched_to_kimi: int = 0
    dispatched_to_timmy: int = 0

    # Agent-health step: agents found stuck and nudges that succeeded.
    stuck_agents: list[str] = field(default_factory=list)
    nudges_sent: int = 0

    # House-health step: system warnings and number of files cleaned up.
    house_warnings: list[str] = field(default_factory=list)
    cleanup_deleted: int = 0

    # One message per step that failed during the cycle.
    errors: list[str] = field(default_factory=list)

    @property
    def healthy(self) -> bool:
        """Return ``True`` when the cycle has no errors and no house warnings."""
        return not self.errors and not self.house_warnings


# ---------------------------------------------------------------------------
# Orchestrator
# ---------------------------------------------------------------------------


class VassalOrchestrator:
|
|
"""Timmy's autonomous orchestration engine.
|
|
|
|
Runs observe → triage → dispatch → monitor → house-check cycles on a
|
|
configurable interval.
|
|
|
|
Parameters
|
|
----------
|
|
cycle_interval:
|
|
Seconds between cycles. Defaults to ``settings.vassal_cycle_interval``
|
|
when available, otherwise 300 s (5 min).
|
|
max_dispatch_per_cycle:
|
|
Cap on new dispatches per cycle to avoid spamming agents.
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
cycle_interval: float | None = None,
|
|
max_dispatch_per_cycle: int = 10,
|
|
) -> None:
|
|
self._cycle_count = 0
|
|
self._running = False
|
|
self._task: asyncio.Task | None = None
|
|
self._max_dispatch = max_dispatch_per_cycle
|
|
self._history: list[VassalCycleRecord] = []
|
|
|
|
# Resolve interval — lazy to avoid import-time settings read
|
|
self._cycle_interval = cycle_interval
|
|
|
|
# -- public API --------------------------------------------------------
|
|
|
|
@property
|
|
def cycle_count(self) -> int:
|
|
return self._cycle_count
|
|
|
|
@property
|
|
def is_running(self) -> bool:
|
|
return self._running
|
|
|
|
@property
|
|
def history(self) -> list[VassalCycleRecord]:
|
|
return list(self._history)
|
|
|
|
def get_status(self) -> dict[str, Any]:
|
|
"""Return a JSON-serialisable status dict."""
|
|
last = self._history[-1] if self._history else None
|
|
return {
|
|
"running": self._running,
|
|
"cycle_count": self._cycle_count,
|
|
"last_cycle": {
|
|
"cycle_id": last.cycle_id,
|
|
"started_at": last.started_at,
|
|
"issues_fetched": last.issues_fetched,
|
|
"issues_dispatched": last.issues_dispatched,
|
|
"stuck_agents": last.stuck_agents,
|
|
"house_warnings": last.house_warnings,
|
|
"healthy": last.healthy,
|
|
}
|
|
if last
|
|
else None,
|
|
}
|
|
|
|
# -- single cycle ------------------------------------------------------
|
|
|
|
async def run_cycle(self) -> VassalCycleRecord:
|
|
"""Execute one full orchestration cycle.
|
|
|
|
Gracefully degrades at each step — a failure in one sub-task does
|
|
not abort the rest of the cycle.
|
|
|
|
Returns:
|
|
VassalCycleRecord summarising what happened.
|
|
"""
|
|
self._cycle_count += 1
|
|
start = time.monotonic()
|
|
record = VassalCycleRecord(
|
|
cycle_id=self._cycle_count,
|
|
started_at=datetime.now(UTC).isoformat(),
|
|
)
|
|
|
|
# 1 + 2: Fetch & triage
|
|
await self._step_backlog(record)
|
|
|
|
# 3: Agent health
|
|
await self._step_agent_health(record)
|
|
|
|
# 4: House health
|
|
await self._step_house_health(record)
|
|
|
|
# Finalise record
|
|
record.finished_at = datetime.now(UTC).isoformat()
|
|
record.duration_ms = int((time.monotonic() - start) * 1000)
|
|
self._history.append(record)
|
|
|
|
# Broadcast via WebSocket (best-effort)
|
|
await self._broadcast(record)
|
|
|
|
logger.info(
|
|
"VassalOrchestrator cycle #%d complete (%d ms): "
|
|
"fetched=%d dispatched=%d stuck=%s house_ok=%s",
|
|
record.cycle_id,
|
|
record.duration_ms,
|
|
record.issues_fetched,
|
|
record.issues_dispatched,
|
|
record.stuck_agents or "none",
|
|
not record.house_warnings,
|
|
)
|
|
return record
|
|
|
|
# -- background loop ---------------------------------------------------
|
|
|
|
async def start(self) -> None:
|
|
"""Start the recurring orchestration loop as a background task."""
|
|
if self._running:
|
|
logger.warning("VassalOrchestrator already running")
|
|
return
|
|
self._running = True
|
|
self._task = asyncio.ensure_future(self._loop())
|
|
|
|
def stop(self) -> None:
|
|
"""Signal the loop to stop after the current cycle."""
|
|
self._running = False
|
|
if self._task and not self._task.done():
|
|
self._task.cancel()
|
|
logger.info("VassalOrchestrator stop requested")
|
|
|
|
async def _loop(self) -> None:
|
|
interval = self._resolve_interval()
|
|
logger.info("VassalOrchestrator loop started (interval=%.0fs)", interval)
|
|
while self._running:
|
|
try:
|
|
await self.run_cycle()
|
|
except Exception:
|
|
logger.exception("VassalOrchestrator cycle failed")
|
|
await asyncio.sleep(interval)
|
|
|
|
# -- step: backlog -------------------------------------------------------
|
|
|
|
async def _step_backlog(self, record: VassalCycleRecord) -> None:
|
|
from timmy.vassal.backlog import fetch_open_issues, triage_issues
|
|
from timmy.vassal.dispatch import dispatch_issue, get_dispatch_registry
|
|
|
|
try:
|
|
raw_issues = await fetch_open_issues(
|
|
limit=50,
|
|
exclude_labels=["wip", "blocked", "needs-info"],
|
|
)
|
|
record.issues_fetched = len(raw_issues)
|
|
|
|
if not raw_issues:
|
|
return
|
|
|
|
triaged = triage_issues(raw_issues)
|
|
registry = get_dispatch_registry()
|
|
|
|
dispatched = 0
|
|
for issue in triaged:
|
|
if dispatched >= self._max_dispatch:
|
|
break
|
|
# Skip already-dispatched issues
|
|
if issue.number in registry:
|
|
continue
|
|
await dispatch_issue(issue)
|
|
dispatched += 1
|
|
|
|
from timmy.vassal.backlog import AgentTarget
|
|
|
|
if issue.agent_target == AgentTarget.CLAUDE:
|
|
record.dispatched_to_claude += 1
|
|
elif issue.agent_target == AgentTarget.KIMI:
|
|
record.dispatched_to_kimi += 1
|
|
else:
|
|
record.dispatched_to_timmy += 1
|
|
|
|
record.issues_dispatched = dispatched
|
|
|
|
except Exception as exc:
|
|
logger.exception("_step_backlog failed")
|
|
record.errors.append(f"backlog: {exc}")
|
|
|
|
# -- step: agent health -------------------------------------------------
|
|
|
|
async def _step_agent_health(self, record: VassalCycleRecord) -> None:
|
|
from config import settings
|
|
from timmy.vassal.agent_health import get_full_health_report, nudge_stuck_agent
|
|
|
|
try:
|
|
threshold = getattr(settings, "vassal_stuck_threshold_minutes", 120)
|
|
report = await get_full_health_report(stuck_threshold_minutes=threshold)
|
|
|
|
for agent_status in report.agents:
|
|
if agent_status.is_stuck:
|
|
record.stuck_agents.append(agent_status.agent)
|
|
for issue_num in agent_status.stuck_issue_numbers:
|
|
ok = await nudge_stuck_agent(agent_status.agent, issue_num)
|
|
if ok:
|
|
record.nudges_sent += 1
|
|
|
|
except Exception as exc:
|
|
logger.exception("_step_agent_health failed")
|
|
record.errors.append(f"agent_health: {exc}")
|
|
|
|
# -- step: house health -------------------------------------------------
|
|
|
|
async def _step_house_health(self, record: VassalCycleRecord) -> None:
|
|
from timmy.vassal.house_health import cleanup_stale_files, get_system_snapshot
|
|
|
|
try:
|
|
snapshot = await get_system_snapshot()
|
|
record.house_warnings = snapshot.warnings
|
|
|
|
# Auto-cleanup temp files when disk is getting tight
|
|
if snapshot.disk.percent_used >= 80.0:
|
|
result = await cleanup_stale_files(max_age_days=3)
|
|
record.cleanup_deleted = result.get("deleted_count", 0)
|
|
|
|
except Exception as exc:
|
|
logger.exception("_step_house_health failed")
|
|
record.errors.append(f"house_health: {exc}")
|
|
|
|
# -- helpers ------------------------------------------------------------
|
|
|
|
def _resolve_interval(self) -> float:
|
|
if self._cycle_interval is not None:
|
|
return self._cycle_interval
|
|
try:
|
|
from config import settings
|
|
|
|
return float(getattr(settings, "vassal_cycle_interval", 300))
|
|
except Exception:
|
|
return 300.0
|
|
|
|
async def _broadcast(self, record: VassalCycleRecord) -> None:
|
|
try:
|
|
from infrastructure.ws_manager.handler import ws_manager
|
|
|
|
await ws_manager.broadcast(
|
|
"vassal.cycle",
|
|
{
|
|
"cycle_id": record.cycle_id,
|
|
"started_at": record.started_at,
|
|
"issues_fetched": record.issues_fetched,
|
|
"issues_dispatched": record.issues_dispatched,
|
|
"stuck_agents": record.stuck_agents,
|
|
"house_warnings": record.house_warnings,
|
|
"duration_ms": record.duration_ms,
|
|
"healthy": record.healthy,
|
|
},
|
|
)
|
|
except Exception as exc:
|
|
logger.debug("VassalOrchestrator broadcast skipped: %s", exc)
|