# Timmy-time-dashboard/src/swarm/docker_runner.py

"""Docker-backed agent runner — spawn swarm agents as isolated containers.

Drop-in complement to SwarmManager.  Instead of Python subprocesses,
DockerAgentRunner launches each agent as a Docker container that shares
the data volume and communicates with the coordinator over HTTP.

Requirements
------------
- Docker Engine running on the host (``docker`` CLI in PATH)
- The ``timmy-time:latest`` image already built (``make docker-build``)
- ``data/`` directory exists and is mounted at ``/app/data`` in each container

Communication
-------------
Container agents use the coordinator's internal HTTP API rather than the
in-memory SwarmComms channel::

    GET  /internal/tasks   → poll for tasks open for bidding
    POST /internal/bids    → submit a bid

The ``COORDINATOR_URL`` env var tells agents where to reach the coordinator.
Inside the docker-compose network this is ``http://dashboard:8000``.
From the host it is typically ``http://localhost:8000``.

Usage
-----
::

    from swarm.docker_runner import DockerAgentRunner

    runner = DockerAgentRunner()
    info   = runner.spawn("Echo", capabilities="summarise,translate")
    print(info)
    # {"container_id": "...", "agent_id": "...", "name": "Echo",
    #  "image": "timmy-time:latest", "capabilities": "summarise,translate"}

    runner.stop(info["container_id"])
    runner.stop_all()
"""

import logging
import subprocess
import uuid
from dataclasses import dataclass, field
from typing import Optional

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Image used for agent containers when the caller does not pass one.
DEFAULT_IMAGE = "timmy-time:latest"
# Coordinator address handed to agents through the COORDINATOR_URL env var
# when the caller does not pass one.
DEFAULT_COORDINATOR_URL = "http://dashboard:8000"


@dataclass
class ManagedContainer:
    """Record of one agent container started by ``DockerAgentRunner.spawn``."""

    # Stripped stdout of ``docker run --detach``; also the key under which the
    # runner tracks this record.
    container_id: str
    # ID handed to the agent as ``--agent-id`` and the ``AGENT_ID`` env var.
    agent_id: str
    # Agent name handed to the agent as ``--name`` and ``AGENT_NAME``.
    name: str
    # Image the container was started from.
    image: str
    # Capabilities string exported as ``AGENT_CAPABILITIES``
    # (e.g. "summarise,translate").
    capabilities: str = ""


class DockerAgentRunner:
    """Spawn and manage swarm agents as Docker containers.

    Every operation shells out to the ``docker`` CLI.  The runner keeps an
    in-memory registry of the containers it started itself, keyed by
    container ID; containers started elsewhere are never tracked.
    """

    # Defaults for the network and data volume each agent container joins.
    # NOTE(review): the ``timmy-time_`` prefix looks like a docker-compose
    # project prefix — override via the constructor if the project name differs.
    DEFAULT_NETWORK = "timmy-time_swarm-net"
    DEFAULT_DATA_VOLUME = "timmy-time_timmy-data"
    # Path inside the container where the data volume is mounted.
    DATA_MOUNT_POINT = "/app/data"

    def __init__(
        self,
        image: str = DEFAULT_IMAGE,
        coordinator_url: str = DEFAULT_COORDINATOR_URL,
        extra_env: Optional[dict] = None,
        *,
        network: str = DEFAULT_NETWORK,
        data_volume: str = DEFAULT_DATA_VOLUME,
    ) -> None:
        """Configure the runner; no Docker command is executed here.

        Args:
            image: Image used by ``spawn`` unless overridden per call.
            coordinator_url: Value exported to agents as ``COORDINATOR_URL``.
            extra_env: Additional environment variables for every container.
                Entries here override the built-in ``AGENT_*`` /
                ``COORDINATOR_URL`` variables on key collision.
            network: Docker network the containers are attached to.
            data_volume: Docker volume mounted at ``/app/data``.
        """
        self.image = image
        self.coordinator_url = coordinator_url
        self.extra_env = extra_env or {}
        self.network = network
        self.data_volume = data_volume
        self._containers: dict[str, ManagedContainer] = {}

    # ── Public API ────────────────────────────────────────────────────────────

    def spawn(
        self,
        name: str,
        agent_id: Optional[str] = None,
        capabilities: str = "",
        image: Optional[str] = None,
    ) -> dict:
        """Spawn a new agent container and return its info dict.

        The container runs ``python -m swarm.agent_runner`` and communicates
        with the coordinator over HTTP via ``COORDINATOR_URL``.

        Args:
            name: Agent name, passed to the agent and stored in the registry.
            agent_id: Agent ID; a random UUID4 string is generated when omitted.
            capabilities: Capabilities string exported as ``AGENT_CAPABILITIES``.
            image: Image to run; defaults to the runner's configured image.

        Returns:
            A dict with the keys ``container_id``, ``agent_id``, ``name``,
            ``image`` and ``capabilities``.

        Raises:
            RuntimeError: If the ``docker`` CLI is missing, ``docker run`` exits
                non-zero, or it prints no container ID.
            subprocess.TimeoutExpired: If ``docker run`` takes longer than
                15 seconds.
        """
        aid = agent_id or str(uuid.uuid4())
        img = image or self.image
        container_name = f"timmy-agent-{aid[:8]}"

        cmd = [
            "docker", "run",
            "--detach",
            "--name", container_name,
            "--network", self.network,
            "--volume", f"{self.data_volume}:{self.DATA_MOUNT_POINT}",
            # ``docker run`` spells this flag ``--add-host``; ``extra_hosts``
            # is the docker-compose key and is rejected by the CLI.
            "--add-host", "host.docker.internal:host-gateway",
            *self._build_env_flags(aid, name, capabilities),
            img,
            "python", "-m", "swarm.agent_runner",
            "--agent-id", aid,
            "--name", name,
        ]

        # Only the subprocess call can raise FileNotFoundError, so keep the
        # try body to that single statement.
        try:
            result = subprocess.run(
                cmd, capture_output=True, text=True, timeout=15
            )
        except FileNotFoundError as exc:
            raise RuntimeError(
                "Docker CLI not found.  Is Docker Desktop running?"
            ) from exc

        if result.returncode != 0:
            # Fall back to the exit code so the error is never an empty string.
            raise RuntimeError(
                result.stderr.strip()
                or f"docker run exited with code {result.returncode}"
            )

        container_id = result.stdout.strip()
        if not container_id:
            # Without an ID the container could not be tracked or stopped.
            raise RuntimeError("docker run did not report a container ID")

        self._containers[container_id] = ManagedContainer(
            container_id=container_id,
            agent_id=aid,
            name=name,
            image=img,
            capabilities=capabilities,
        )
        logger.info(
            "Docker agent %s (%s) started — container %s",
            name, aid, container_id[:12],
        )
        return {
            "container_id": container_id,
            "agent_id": aid,
            "name": name,
            "image": img,
            "capabilities": capabilities,
        }

    def stop(self, container_id: str) -> bool:
        """Force-remove a container and drop it from the registry.

        Returns:
            True if ``docker rm -f`` succeeded or the container was already
            gone; False if the command could not be run or failed for any
            other reason (the container then stays in the registry).
        """
        try:
            result = subprocess.run(
                ["docker", "rm", "-f", container_id],
                capture_output=True, text=True, timeout=10,
            )
        except Exception as exc:
            # Best-effort contract: report failure instead of raising.
            logger.error("Failed to stop container %s: %s", container_id[:12], exc)
            return False

        if result.returncode != 0:
            detail = (result.stderr or "").strip()
            # A container that no longer exists is already in the desired end
            # state; anything else is a real failure and must stay tracked.
            if "no such container" not in detail.lower():
                logger.error(
                    "Failed to stop container %s: %s",
                    container_id[:12],
                    detail or f"docker rm exited with code {result.returncode}",
                )
                return False

        self._containers.pop(container_id, None)
        logger.info("Docker agent container %s stopped", container_id[:12])
        return True

    def stop_all(self) -> int:
        """Stop all containers managed by this runner.

        Returns:
            The number of containers for which ``stop`` returned True.
        """
        # Snapshot the keys: ``stop`` mutates the registry while we iterate.
        tracked_ids = list(self._containers)
        return sum(1 for cid in tracked_ids if self.stop(cid))

    def list_containers(self) -> list[ManagedContainer]:
        """Return a new list of the records for all tracked containers."""
        return list(self._containers.values())

    def is_running(self, container_id: str) -> bool:
        """Return True if the container is currently running.

        Any failure to query Docker (missing CLI, timeout, unknown container)
        yields False rather than an exception.
        """
        try:
            result = subprocess.run(
                ["docker", "inspect", "--format", "{{.State.Running}}", container_id],
                capture_output=True, text=True, timeout=5,
            )
        except Exception:
            # Best-effort probe: an unreachable Docker counts as "not running".
            return False
        return result.stdout.strip() == "true"

    # ── Internal ──────────────────────────────────────────────────────────────

    def _build_env_flags(self, agent_id: str, name: str, capabilities: str) -> list[str]:
        """Build the ``--env KEY=VALUE`` arguments for ``docker run``.

        ``extra_env`` is merged last, so its entries win over the built-in
        variables when keys collide.
        """
        env = {
            "COORDINATOR_URL": self.coordinator_url,
            "AGENT_NAME": name,
            "AGENT_ID": agent_id,
            "AGENT_CAPABILITIES": capabilities,
            **self.extra_env,
        }
        flags: list[str] = []
        for key, value in env.items():
            flags.extend(("--env", f"{key}={value}"))
        return flags