Timmy-time-dashboard/src/swarm/coordinator.py
Alexander Whitestone f7c574e0b2 feat: distributed brain architecture with rqlite and local embeddings (#118)
- Add new brain module with rqlite-based distributed memory and task queue
- Implement BrainClient for memory operations (store, recall, search)
- Implement DistributedWorker for continuous task processing
- Add local embeddings via sentence-transformers (all-MiniLM-L6-v2)
  - No OpenAI dependency, runs 100% locally on CPU
  - 384-dim embeddings, 80MB model download
- Deprecate persona system (swarm/personas.py, persona_node.py)
- Deprecate hands system (hands/__init__.py, routes)
- Update marketplace, tools, hands routes for brain integration
- Add sentence-transformers and numpy to dependencies
- All changes backward compatible with deprecation warnings

Co-authored-by: Alexander Payne <apayne@MM.local>
2026-03-02 07:31:15 -05:00

445 lines · 18 KiB · Python

"""Swarm coordinator — orchestrates registry, manager, and bidder.
The coordinator is the top-level entry point for swarm operations.
It ties together task creation, auction management, agent spawning,
and task assignment into a single cohesive API used by the dashboard
routes.
"""
import asyncio
import logging
from datetime import datetime, timezone
from typing import Optional
from swarm.bidder import AUCTION_DURATION_SECONDS, AuctionManager, Bid
from swarm.comms import SwarmComms
from swarm import learner as swarm_learner
from swarm.manager import SwarmManager
from swarm.recovery import reconcile_on_startup
from swarm.registry import AgentRecord
from swarm import registry
from swarm import routing as swarm_routing
from swarm import stats as swarm_stats
from swarm.tasks import (
Task,
TaskStatus,
create_task,
get_task,
list_tasks,
update_task,
)
from swarm.event_log import (
EventType,
log_event,
)
# Spark Intelligence integration — lazy import to avoid circular deps
def _get_spark():
"""Lazily import the Spark engine singleton."""
try:
from spark.engine import spark_engine
return spark_engine
except Exception:
return None

logger = logging.getLogger(__name__)


class SwarmCoordinator:
    """High-level orchestrator for the swarm system."""

    def __init__(self) -> None:
        self.manager = SwarmManager()
        self.auctions = AuctionManager()
        self.comms = SwarmComms()
        self._in_process_nodes: list = []
        self._recovery_summary = {"tasks_failed": 0, "agents_offlined": 0}

    def initialize(self) -> None:
        """Run startup recovery. Call during app lifespan, not at import time."""
        self._recovery_summary = reconcile_on_startup()
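
    # A minimal sketch of wiring initialize() into the app lifespan (the
    # FastAPI-style lifespan hook and app object below are assumptions, not
    # confirmed by this module):
    #
    #     from contextlib import asynccontextmanager
    #     from fastapi import FastAPI
    #
    #     @asynccontextmanager
    #     async def lifespan(app: FastAPI):
    #         coordinator.initialize()  # run recovery once, after import
    #         yield
    #
    #     app = FastAPI(lifespan=lifespan)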

    # ── Agent lifecycle ─────────────────────────────────────────────────────

    def spawn_agent(self, name: str, agent_id: Optional[str] = None) -> dict:
        """Spawn a new sub-agent and register it."""
        managed = self.manager.spawn(name, agent_id)
        record = registry.register(name=name, agent_id=managed.agent_id)
        return {
            "agent_id": managed.agent_id,
            "name": name,
            "pid": managed.pid,
            "status": record.status,
        }

    def stop_agent(self, agent_id: str) -> bool:
        """Stop a sub-agent and remove it from the registry."""
        registry.unregister(agent_id)
        return self.manager.stop(agent_id)

    def list_swarm_agents(self) -> list[AgentRecord]:
        return registry.list_agents()

    def spawn_persona(self, persona_id: str, agent_id: Optional[str] = None) -> dict:
        """DEPRECATED: Use brain task queue instead.

        Personas have been replaced by the distributed brain worker queue.
        Submit tasks via BrainClient.submit_task() instead.
        """
        logger.warning(
            "spawn_persona() is deprecated. "
            "Use brain.BrainClient.submit_task() instead."
        )
        # Return stub response for compatibility
        return {
            "agent_id": agent_id or "deprecated",
            "name": persona_id,
            "status": "deprecated",
            "message": "Personas replaced by brain task queue",
        }
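
    # Replacement path, sketched from the commit message: tasks now go through
    # the brain task queue. The exact import path and call signature below are
    # assumptions, not verified against the brain module:
    #
    #     from brain import BrainClient
    #
    #     client = BrainClient()
    #     client.submit_task("summarize yesterday's agent activity")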

    def spawn_in_process_agent(
        self, name: str, agent_id: Optional[str] = None,
    ) -> dict:
        """Spawn a lightweight in-process agent that bids on tasks.

        Unlike spawn_agent (which launches a subprocess), this creates a
        SwarmNode in the current process sharing the coordinator's comms
        layer. This means the in-memory pub/sub callbacks fire
        immediately when a task is posted, allowing the node to submit
        bids into the coordinator's AuctionManager.
        """
        import random
        import uuid

        from swarm.swarm_node import SwarmNode

        aid = agent_id or str(uuid.uuid4())
        node = SwarmNode(
            agent_id=aid,
            name=name,
            comms=self.comms,
        )

        def _bid_and_register(msg):
            """Intercept the task announcement and submit a bid to the auction."""
            task_id = msg.data.get("task_id")
            if not task_id:
                return
            bid_sats = random.randint(10, 100)
            self.auctions.submit_bid(task_id, aid, bid_sats)
            logger.info(
                "In-process agent %s bid %d sats on task %s",
                name, bid_sats, task_id,
            )

        # Subscribe to task announcements via shared comms so the node's bids
        # feed into this coordinator's AuctionManager
        self.comms.subscribe("swarm:tasks", _bid_and_register)
        record = registry.register(name=name, agent_id=aid)
        self._in_process_nodes.append(node)
        logger.info("Spawned in-process agent %s (%s)", name, aid)
        return {
            "agent_id": aid,
            "name": name,
            "pid": None,
            "status": record.status,
        }
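
    # Illustrative flow: because the shared comms layer fires callbacks
    # synchronously, an in-process agent's bid lands before post_task returns:
    #
    #     coordinator.spawn_in_process_agent("scout")
    #     task = coordinator.post_task("index the docs folder")
    #     # the auction for task.id now already holds the scout's bid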

    # ── Task lifecycle ──────────────────────────────────────────────────────

    def post_task(self, description: str) -> Task:
        """Create a task, open an auction, and announce it to the swarm.

        The auction is opened *before* the comms announcement so that
        in-process agents (whose callbacks fire synchronously) can
        submit bids into an already-open auction.
        """
        task = create_task(description)
        update_task(task.id, status=TaskStatus.BIDDING)
        task.status = TaskStatus.BIDDING
        # Open the auction first so bids from in-process agents land
        self.auctions.open_auction(task.id)
        self.comms.post_task(task.id, description)
        logger.info("Task posted: %s (%s)", task.id, description[:50])
        # Log task creation and bidding events
        log_event(
            EventType.TASK_CREATED,
            source="coordinator",
            task_id=task.id,
            data={"description": description[:200]},
        )
        log_event(
            EventType.TASK_BIDDING,
            source="coordinator",
            task_id=task.id,
        )
        # Broadcast task posted via WebSocket
        self._broadcast(self._broadcast_task_posted, task.id, description)
        # Spark: capture task-posted event with candidate agents
        spark = _get_spark()
        if spark:
            candidates = [a.id for a in registry.list_agents()]
            spark.on_task_posted(task.id, description, candidates)
        return task

    async def run_auction_and_assign(self, task_id: str) -> Optional[Bid]:
        """Wait for the bidding period, then close the auction and assign.

        The auction should already be open (via post_task). This method
        waits the remaining bidding window and then closes it.
        All bids are recorded in the learner so agents accumulate outcome
        history that later feeds back into adaptive bidding.
        """
        await asyncio.sleep(AUCTION_DURATION_SECONDS)
        # Snapshot the auction bids before closing (for learner recording)
        auction = self.auctions.get_auction(task_id)
        all_bids = list(auction.bids) if auction else []
        # Build bids dict for the routing engine
        bids_dict = {bid.agent_id: bid.bid_sats for bid in all_bids}
        # Get routing recommendation (logs decision for audit); the description
        # is reused below as learner context
        task = get_task(task_id)
        description = task.description if task else ""
        recommended, decision = swarm_routing.routing_engine.recommend_agent(
            task_id, description, bids_dict
        )
        # Log if the auction winner differs from the routing recommendation
        winner = self.auctions.close_auction(task_id)
        if winner and recommended and winner.agent_id != recommended:
            logger.warning(
                "Auction winner %s differs from routing recommendation %s",
                winner.agent_id[:8], recommended[:8],
            )
        # Record every bid outcome in the learner
        winner_id = winner.agent_id if winner else None
        for bid in all_bids:
            swarm_learner.record_outcome(
                task_id=task_id,
                agent_id=bid.agent_id,
                description=description,
                bid_sats=bid.bid_sats,
                won_auction=(bid.agent_id == winner_id),
            )
        if winner:
            update_task(
                task_id,
                status=TaskStatus.ASSIGNED,
                assigned_agent=winner.agent_id,
            )
            self.comms.assign_task(task_id, winner.agent_id)
            registry.update_status(winner.agent_id, "busy")
            # Mark winning bid in persistent stats
            swarm_stats.mark_winner(task_id, winner.agent_id)
            logger.info(
                "Task %s assigned to %s at %d sats",
                task_id, winner.agent_id, winner.bid_sats,
            )
            # Log task assignment event
            log_event(
                EventType.TASK_ASSIGNED,
                source="coordinator",
                task_id=task_id,
                agent_id=winner.agent_id,
                data={"bid_sats": winner.bid_sats},
            )
            # Broadcast task assigned via WebSocket
            self._broadcast(self._broadcast_task_assigned, task_id, winner.agent_id)
            # Spark: capture assignment
            spark = _get_spark()
            if spark:
                spark.on_task_assigned(task_id, winner.agent_id)
        else:
            update_task(task_id, status=TaskStatus.FAILED)
            logger.warning("Task %s: no bids received, marked as failed", task_id)
            # Log task failure event
            log_event(
                EventType.TASK_FAILED,
                source="coordinator",
                task_id=task_id,
                data={"reason": "no bids received"},
            )
        return winner
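
    # Typical end-to-end usage from a dashboard route (a sketch using only this
    # module's own API):
    #
    #     task = coordinator.post_task("summarize last night's logs")
    #     winner = await coordinator.run_auction_and_assign(task.id)
    #     if winner:
    #         ...  # assigned agent works, then complete_task() or fail_task()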

    def complete_task(self, task_id: str, result: str) -> Optional[Task]:
        """Mark a task as completed with a result."""
        task = get_task(task_id)
        if task is None:
            return None
        now = datetime.now(timezone.utc).isoformat()
        updated = update_task(
            task_id,
            status=TaskStatus.COMPLETED,
            result=result,
            completed_at=now,
        )
        if task.assigned_agent:
            registry.update_status(task.assigned_agent, "idle")
            self.comms.complete_task(task_id, task.assigned_agent, result)
            # Record success in learner
            swarm_learner.record_task_result(task_id, task.assigned_agent, succeeded=True)
        # Log task completion event
        log_event(
            EventType.TASK_COMPLETED,
            source="coordinator",
            task_id=task_id,
            agent_id=task.assigned_agent,
            data={"result_preview": result[:500]},
        )
        # Broadcast task completed via WebSocket
        self._broadcast(
            self._broadcast_task_completed,
            task_id, task.assigned_agent, result,
        )
        # Spark: capture completion
        spark = _get_spark()
        if spark:
            spark.on_task_completed(task_id, task.assigned_agent, result)
        return updated

    def fail_task(self, task_id: str, reason: str = "") -> Optional[Task]:
        """Mark a task as failed — feeds failure data into the learner."""
        task = get_task(task_id)
        if task is None:
            return None
        now = datetime.now(timezone.utc).isoformat()
        updated = update_task(
            task_id,
            status=TaskStatus.FAILED,
            result=reason,
            completed_at=now,
        )
        if task.assigned_agent:
            registry.update_status(task.assigned_agent, "idle")
            # Record failure in learner
            swarm_learner.record_task_result(task_id, task.assigned_agent, succeeded=False)
        # Log task failure event
        log_event(
            EventType.TASK_FAILED,
            source="coordinator",
            task_id=task_id,
            agent_id=task.assigned_agent,
            data={"reason": reason},
        )
        # Spark: capture failure
        spark = _get_spark()
        if spark:
            spark.on_task_failed(task_id, task.assigned_agent, reason)
        return updated

    def get_task(self, task_id: str) -> Optional[Task]:
        return get_task(task_id)

    def list_tasks(self, status: Optional[TaskStatus] = None) -> list[Task]:
        return list_tasks(status)

    # ── WebSocket broadcasts ────────────────────────────────────────────────

    def _broadcast(self, broadcast_fn, *args) -> None:
        """Safely schedule a broadcast, handling sync/async contexts.

        Only creates the coroutine and schedules it if an event loop is
        running. This prevents 'coroutine was never awaited' warnings in
        tests.
        """
        try:
            # get_running_loop() raises RuntimeError when no loop is running
            asyncio.get_running_loop()
            # Create the coroutine only when we have an event loop; the task is
            # fire-and-forget (no reference to it is retained)
            coro = broadcast_fn(*args)
            asyncio.create_task(coro)
        except RuntimeError:
            # No event loop running - skip broadcast silently
            pass

    async def _broadcast_agent_joined(self, agent_id: str, name: str) -> None:
        """Broadcast agent joined event via WebSocket."""
        try:
            from infrastructure.ws_manager.handler import ws_manager
            await ws_manager.broadcast_agent_joined(agent_id, name)
        except Exception as exc:
            logger.debug("WebSocket broadcast failed (agent_joined): %s", exc)

    async def _broadcast_bid(self, task_id: str, agent_id: str, bid_sats: int) -> None:
        """Broadcast bid submitted event via WebSocket."""
        try:
            from infrastructure.ws_manager.handler import ws_manager
            await ws_manager.broadcast_bid_submitted(task_id, agent_id, bid_sats)
        except Exception as exc:
            logger.debug("WebSocket broadcast failed (bid): %s", exc)

    async def _broadcast_task_posted(self, task_id: str, description: str) -> None:
        """Broadcast task posted event via WebSocket."""
        try:
            from infrastructure.ws_manager.handler import ws_manager
            await ws_manager.broadcast_task_posted(task_id, description)
        except Exception as exc:
            logger.debug("WebSocket broadcast failed (task_posted): %s", exc)

    async def _broadcast_task_assigned(self, task_id: str, agent_id: str) -> None:
        """Broadcast task assigned event via WebSocket."""
        try:
            from infrastructure.ws_manager.handler import ws_manager
            await ws_manager.broadcast_task_assigned(task_id, agent_id)
        except Exception as exc:
            logger.debug("WebSocket broadcast failed (task_assigned): %s", exc)

    async def _broadcast_task_completed(
        self, task_id: str, agent_id: str, result: str
    ) -> None:
        """Broadcast task completed event via WebSocket."""
        try:
            from infrastructure.ws_manager.handler import ws_manager
            await ws_manager.broadcast_task_completed(task_id, agent_id, result)
        except Exception as exc:
            logger.debug("WebSocket broadcast failed (task_completed): %s", exc)

    # ── Convenience ─────────────────────────────────────────────────────────

    def status(self) -> dict:
        """Return a summary of the swarm state."""
        agents = registry.list_agents()
        tasks = list_tasks()
        status = {
            "agents": len(agents),
            "agents_idle": sum(1 for a in agents if a.status == "idle"),
            "agents_busy": sum(1 for a in agents if a.status == "busy"),
            "tasks_total": len(tasks),
            "tasks_pending": sum(1 for t in tasks if t.status == TaskStatus.PENDING),
            "tasks_running": sum(1 for t in tasks if t.status == TaskStatus.RUNNING),
            "tasks_completed": sum(1 for t in tasks if t.status == TaskStatus.COMPLETED),
            "active_auctions": len(self.auctions.active_auctions),
            "routing_manifests": len(swarm_routing.routing_engine._manifests),
        }
        # Include Spark Intelligence summary if available
        spark = _get_spark()
        if spark and spark.enabled:
            spark_status = spark.status()
            status["spark"] = {
                "events_captured": spark_status["events_captured"],
                "memories_stored": spark_status["memories_stored"],
                "prediction_accuracy": spark_status["predictions"]["avg_accuracy"],
            }
        return status

    def get_routing_decisions(self, task_id: Optional[str] = None, limit: int = 100) -> list:
        """Get routing decision history for audit.

        Args:
            task_id: Filter to specific task (optional)
            limit: Maximum number of decisions to return
        """
        return swarm_routing.routing_engine.get_routing_history(task_id, limit=limit)

# Module-level singleton for use by dashboard routes
coordinator = SwarmCoordinator()
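
# Usage sketch for a dashboard route (the router decorator below is an
# assumption; only coordinator.status() comes from this module):
#
#     @router.get("/swarm/status")
#     def swarm_status():
#         return coordinator.status()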