From dd60bcbfb7e628176855c79ec437a4b03088af87 Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Tue, 17 Mar 2026 10:44:37 -0700 Subject: [PATCH] feat: OpenAI-compatible API server + WhatsApp configurable reply prefix (#1756) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: OpenAI-compatible API server platform adapter Salvaged from PR #956, updated for current main. Adds an HTTP API server as a gateway platform adapter that exposes hermes-agent via the OpenAI Chat Completions and Responses APIs. Any OpenAI-compatible frontend (Open WebUI, LobeChat, LibreChat, AnythingLLM, NextChat, ChatBox, etc.) can connect by pointing at http://localhost:8642/v1. Endpoints: - POST /v1/chat/completions — stateless Chat Completions API - POST /v1/responses — stateful Responses API with chaining - GET /v1/responses/{id} — retrieve stored response - DELETE /v1/responses/{id} — delete stored response - GET /v1/models — list hermes-agent as available model - GET /health — health check Features: - Real SSE streaming via stream_delta_callback (uses main's streaming) - In-memory LRU response store for Responses API conversation chaining - Named conversations via 'conversation' parameter - Bearer token auth (optional, via API_SERVER_KEY) - CORS support for browser-based frontends - System prompt layering (frontend system messages on top of core) - Real token usage tracking in responses Integration points: - Platform.API_SERVER in gateway/config.py - _create_adapter() branch in gateway/run.py - API_SERVER_* env vars in hermes_cli/config.py - Env var overrides in gateway/config.py _apply_env_overrides() Changes vs original PR #956: - Removed streaming infrastructure (already on main via stream_consumer.py) - Removed Telegram reply_to_mode (separate feature, not included) - Updated _resolve_model() -> _resolve_gateway_model() - Updated stream_callback -> stream_delta_callback - Updated 
connect()/disconnect() to use _mark_connected()/_mark_disconnected() - Adapted to current Platform enum (includes MATTERMOST, MATRIX, DINGTALK) Tests: 72 new tests, all passing Docs: API server guide, Open WebUI integration guide, env var reference * feat(whatsapp): make reply prefix configurable via config.yaml Reworked from PR #1764 (ifrederico) to use config.yaml instead of .env. The WhatsApp bridge prepends a header to every outgoing message. This was hardcoded to '⚕ *Hermes Agent*'. Users can now customize or disable it via config.yaml: whatsapp: reply_prefix: '' # disable header reply_prefix: '🤖 *My Bot*\n───\n' # custom prefix How it works: - load_gateway_config() reads whatsapp.reply_prefix from config.yaml and stores it in PlatformConfig.extra['reply_prefix'] - WhatsAppAdapter reads it from config.extra at init - When spawning bridge.js, the adapter passes it as WHATSAPP_REPLY_PREFIX in the subprocess environment - bridge.js handles undefined (default), empty (no header), or custom values with \\n escape support - Self-chat echo suppression uses the configured prefix Also fixes _config_version: was 9 but ENV_VARS_BY_VERSION had a key 10 (TAVILY_API_KEY), so existing users at v9 would never be prompted for Tavily. Bumped to 10 to close the gap. Added a regression test to prevent this from happening again. Credit: ifrederico (PR #1764) for the bridge.js implementation and the config version gap discovery. 
--------- Co-authored-by: Test --- gateway/config.py | 30 + gateway/platforms/api_server.py | 790 ++++++++++ gateway/platforms/whatsapp.py | 10 + gateway/run.py | 7 + hermes_cli/config.py | 42 +- scripts/whatsapp-bridge/bridge.js | 18 +- tests/gateway/test_api_server.py | 1299 +++++++++++++++++ tests/gateway/test_whatsapp_connect.py | 1 + tests/gateway/test_whatsapp_reply_prefix.py | 121 ++ .../docs/reference/environment-variables.md | 4 + .../docs/user-guide/features/api-server.md | 223 +++ website/docs/user-guide/messaging/index.md | 6 +- .../docs/user-guide/messaging/open-webui.md | 213 +++ website/docs/user-guide/messaging/whatsapp.md | 9 +- website/sidebars.ts | 2 + 15 files changed, 2765 insertions(+), 10 deletions(-) create mode 100644 gateway/platforms/api_server.py create mode 100644 tests/gateway/test_api_server.py create mode 100644 tests/gateway/test_whatsapp_reply_prefix.py create mode 100644 website/docs/user-guide/features/api-server.md create mode 100644 website/docs/user-guide/messaging/open-webui.md diff --git a/gateway/config.py b/gateway/config.py index e43af65a..c99756c3 100644 --- a/gateway/config.py +++ b/gateway/config.py @@ -46,6 +46,7 @@ class Platform(Enum): EMAIL = "email" SMS = "sms" DINGTALK = "dingtalk" + API_SERVER = "api_server" @dataclass @@ -238,6 +239,9 @@ class GatewayConfig: # SMS uses api_key (Twilio auth token) — SID checked via env elif platform == Platform.SMS and os.getenv("TWILIO_ACCOUNT_SID"): connected.append(platform) + # API Server uses enabled flag only (no token needed) + elif platform == Platform.API_SERVER: + connected.append(platform) return connected def get_home_channel(self, platform: Platform) -> Optional[HomeChannel]: @@ -416,6 +420,13 @@ def load_gateway_config() -> GatewayConfig: os.environ["DISCORD_FREE_RESPONSE_CHANNELS"] = str(frc) if "auto_thread" in discord_cfg and not os.getenv("DISCORD_AUTO_THREAD"): os.environ["DISCORD_AUTO_THREAD"] = str(discord_cfg["auto_thread"]).lower() + + # Bridge whatsapp 
settings from config.yaml into platform config + whatsapp_cfg = yaml_cfg.get("whatsapp", {}) + if isinstance(whatsapp_cfg, dict) and "reply_prefix" in whatsapp_cfg: + if Platform.WHATSAPP not in config.platforms: + config.platforms[Platform.WHATSAPP] = PlatformConfig() + config.platforms[Platform.WHATSAPP].extra["reply_prefix"] = whatsapp_cfg["reply_prefix"] except Exception: pass @@ -634,6 +645,25 @@ def _apply_env_overrides(config: GatewayConfig) -> None: name=os.getenv("SMS_HOME_CHANNEL_NAME", "Home"), ) + # API Server + api_server_enabled = os.getenv("API_SERVER_ENABLED", "").lower() in ("true", "1", "yes") + api_server_key = os.getenv("API_SERVER_KEY", "") + api_server_port = os.getenv("API_SERVER_PORT") + api_server_host = os.getenv("API_SERVER_HOST") + if api_server_enabled or api_server_key: + if Platform.API_SERVER not in config.platforms: + config.platforms[Platform.API_SERVER] = PlatformConfig() + config.platforms[Platform.API_SERVER].enabled = True + if api_server_key: + config.platforms[Platform.API_SERVER].extra["key"] = api_server_key + if api_server_port: + try: + config.platforms[Platform.API_SERVER].extra["port"] = int(api_server_port) + except ValueError: + pass + if api_server_host: + config.platforms[Platform.API_SERVER].extra["host"] = api_server_host + # Session settings idle_minutes = os.getenv("SESSION_IDLE_MINUTES") if idle_minutes: diff --git a/gateway/platforms/api_server.py b/gateway/platforms/api_server.py new file mode 100644 index 00000000..bbe9f77f --- /dev/null +++ b/gateway/platforms/api_server.py @@ -0,0 +1,790 @@ +""" +OpenAI-compatible API server platform adapter. 
+ +Exposes an HTTP server with endpoints: +- POST /v1/chat/completions — OpenAI Chat Completions format (stateless) +- POST /v1/responses — OpenAI Responses API format (stateful via previous_response_id) +- GET /v1/responses/{response_id} — Retrieve a stored response +- DELETE /v1/responses/{response_id} — Delete a stored response +- GET /v1/models — lists hermes-agent as an available model +- GET /health — health check + +Any OpenAI-compatible frontend (Open WebUI, LobeChat, LibreChat, +AnythingLLM, NextChat, ChatBox, etc.) can connect to hermes-agent +through this adapter by pointing at http://localhost:8642/v1. + +Requires: +- aiohttp (already available in the gateway) +""" + +import asyncio +import collections +import json +import logging +import os +import time +import uuid +from typing import Any, Dict, List, Optional + +try: + from aiohttp import web + AIOHTTP_AVAILABLE = True +except ImportError: + AIOHTTP_AVAILABLE = False + web = None # type: ignore[assignment] + +from gateway.config import Platform, PlatformConfig +from gateway.platforms.base import ( + BasePlatformAdapter, + SendResult, +) + +logger = logging.getLogger(__name__) + +# Default settings +DEFAULT_HOST = "127.0.0.1" +DEFAULT_PORT = 8642 +MAX_STORED_RESPONSES = 100 + + +def check_api_server_requirements() -> bool: + """Check if API server dependencies are available.""" + return AIOHTTP_AVAILABLE + + +class ResponseStore: + """ + In-memory LRU store for Responses API state. + + Each stored response includes the full internal conversation history + (with tool calls and results) so it can be reconstructed on subsequent + requests via previous_response_id. 
+ """ + + def __init__(self, max_size: int = MAX_STORED_RESPONSES): + self._store: collections.OrderedDict[str, Dict[str, Any]] = collections.OrderedDict() + self._max_size = max_size + + def get(self, response_id: str) -> Optional[Dict[str, Any]]: + """Retrieve a stored response by ID (moves to end for LRU).""" + if response_id in self._store: + self._store.move_to_end(response_id) + return self._store[response_id] + return None + + def put(self, response_id: str, data: Dict[str, Any]) -> None: + """Store a response, evicting the oldest if at capacity.""" + if response_id in self._store: + self._store.move_to_end(response_id) + self._store[response_id] = data + while len(self._store) > self._max_size: + self._store.popitem(last=False) + + def delete(self, response_id: str) -> bool: + """Remove a response from the store. Returns True if found and deleted.""" + if response_id in self._store: + del self._store[response_id] + return True + return False + + def __len__(self) -> int: + return len(self._store) + + +# --------------------------------------------------------------------------- +# CORS middleware +# --------------------------------------------------------------------------- + +_CORS_HEADERS = { + "Access-Control-Allow-Origin": "*", + "Access-Control-Allow-Methods": "GET, POST, DELETE, OPTIONS", + "Access-Control-Allow-Headers": "Authorization, Content-Type", +} + + +if AIOHTTP_AVAILABLE: + @web.middleware + async def cors_middleware(request, handler): + """Add CORS headers to every response; handle OPTIONS preflight.""" + if request.method == "OPTIONS": + return web.Response(status=200, headers=_CORS_HEADERS) + response = await handler(request) + response.headers.update(_CORS_HEADERS) + return response +else: + cors_middleware = None # type: ignore[assignment] + + +class APIServerAdapter(BasePlatformAdapter): + """ + OpenAI-compatible HTTP API server adapter. 
+ + Runs an aiohttp web server that accepts OpenAI-format requests + and routes them through hermes-agent's AIAgent. + """ + + def __init__(self, config: PlatformConfig): + super().__init__(config, Platform.API_SERVER) + extra = config.extra or {} + self._host: str = extra.get("host", os.getenv("API_SERVER_HOST", DEFAULT_HOST)) + self._port: int = int(extra.get("port", os.getenv("API_SERVER_PORT", str(DEFAULT_PORT)))) + self._api_key: str = extra.get("key", os.getenv("API_SERVER_KEY", "")) + self._app: Optional["web.Application"] = None + self._runner: Optional["web.AppRunner"] = None + self._site: Optional["web.TCPSite"] = None + self._response_store = ResponseStore() + # Conversation name → latest response_id mapping + self._conversations: Dict[str, str] = {} + + # ------------------------------------------------------------------ + # Auth helper + # ------------------------------------------------------------------ + + def _check_auth(self, request: "web.Request") -> Optional["web.Response"]: + """ + Validate Bearer token from Authorization header. + + Returns None if auth is OK, or a 401 web.Response on failure. + If no API key is configured, all requests are allowed. 
+ """ + if not self._api_key: + return None # No key configured — allow all (local-only use) + + auth_header = request.headers.get("Authorization", "") + if auth_header.startswith("Bearer "): + token = auth_header[7:].strip() + if token == self._api_key: + return None # Auth OK + + return web.json_response( + {"error": {"message": "Invalid API key", "type": "invalid_request_error", "code": "invalid_api_key"}}, + status=401, + ) + + # ------------------------------------------------------------------ + # Agent creation helper + # ------------------------------------------------------------------ + + def _create_agent( + self, + ephemeral_system_prompt: Optional[str] = None, + session_id: Optional[str] = None, + stream_delta_callback=None, + ) -> Any: + """ + Create an AIAgent instance using the gateway's runtime config. + + Uses _resolve_runtime_agent_kwargs() to pick up model, api_key, + base_url, etc. from config.yaml / env vars. + """ + from run_agent import AIAgent + from gateway.run import _resolve_runtime_agent_kwargs, _resolve_gateway_model + + runtime_kwargs = _resolve_runtime_agent_kwargs() + model = _resolve_gateway_model() + + max_iterations = int(os.getenv("HERMES_MAX_ITERATIONS", "90")) + + agent = AIAgent( + model=model, + **runtime_kwargs, + max_iterations=max_iterations, + quiet_mode=True, + verbose_logging=False, + ephemeral_system_prompt=ephemeral_system_prompt or None, + session_id=session_id, + platform="api_server", + stream_delta_callback=stream_delta_callback, + ) + return agent + + # ------------------------------------------------------------------ + # HTTP Handlers + # ------------------------------------------------------------------ + + async def _handle_health(self, request: "web.Request") -> "web.Response": + """GET /health — simple health check.""" + return web.json_response({"status": "ok", "platform": "hermes-agent"}) + + async def _handle_models(self, request: "web.Request") -> "web.Response": + """GET /v1/models — return 
hermes-agent as an available model.""" + auth_err = self._check_auth(request) + if auth_err: + return auth_err + + return web.json_response({ + "object": "list", + "data": [ + { + "id": "hermes-agent", + "object": "model", + "created": int(time.time()), + "owned_by": "hermes", + "permission": [], + "root": "hermes-agent", + "parent": None, + } + ], + }) + + async def _handle_chat_completions(self, request: "web.Request") -> "web.Response": + """POST /v1/chat/completions — OpenAI Chat Completions format.""" + auth_err = self._check_auth(request) + if auth_err: + return auth_err + + # Parse request body + try: + body = await request.json() + except (json.JSONDecodeError, Exception): + return web.json_response( + {"error": {"message": "Invalid JSON in request body", "type": "invalid_request_error"}}, + status=400, + ) + + messages = body.get("messages") + if not messages or not isinstance(messages, list): + return web.json_response( + {"error": {"message": "Missing or invalid 'messages' field", "type": "invalid_request_error"}}, + status=400, + ) + + stream = body.get("stream", False) + + # Extract system message (becomes ephemeral system prompt layered ON TOP of core) + system_prompt = None + conversation_messages: List[Dict[str, str]] = [] + + for msg in messages: + role = msg.get("role", "") + content = msg.get("content", "") + if role == "system": + # Accumulate system messages + if system_prompt is None: + system_prompt = content + else: + system_prompt = system_prompt + "\n" + content + elif role in ("user", "assistant"): + conversation_messages.append({"role": role, "content": content}) + + # Extract the last user message as the primary input + user_message = "" + history = [] + if conversation_messages: + user_message = conversation_messages[-1].get("content", "") + history = conversation_messages[:-1] + + if not user_message: + return web.json_response( + {"error": {"message": "No user message found in messages", "type": "invalid_request_error"}}, + 
status=400, + ) + + session_id = str(uuid.uuid4()) + completion_id = f"chatcmpl-{uuid.uuid4().hex[:29]}" + model_name = body.get("model", "hermes-agent") + created = int(time.time()) + + if stream: + import queue as _q + _stream_q: _q.Queue = _q.Queue() + + def _on_delta(delta): + _stream_q.put(delta) + + # Start agent in background + agent_task = asyncio.ensure_future(self._run_agent( + user_message=user_message, + conversation_history=history, + ephemeral_system_prompt=system_prompt, + session_id=session_id, + stream_delta_callback=_on_delta, + )) + + return await self._write_sse_chat_completion( + request, completion_id, model_name, created, _stream_q, agent_task + ) + + # Non-streaming: run the agent and return full response + try: + result, usage = await self._run_agent( + user_message=user_message, + conversation_history=history, + ephemeral_system_prompt=system_prompt, + session_id=session_id, + ) + except Exception as e: + logger.error("Error running agent for chat completions: %s", e, exc_info=True) + return web.json_response( + {"error": {"message": f"Internal server error: {e}", "type": "server_error"}}, + status=500, + ) + + final_response = result.get("final_response", "") + if not final_response: + final_response = result.get("error", "(No response generated)") + + response_data = { + "id": completion_id, + "object": "chat.completion", + "created": created, + "model": model_name, + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": final_response, + }, + "finish_reason": "stop", + } + ], + "usage": { + "prompt_tokens": usage.get("input_tokens", 0), + "completion_tokens": usage.get("output_tokens", 0), + "total_tokens": usage.get("total_tokens", 0), + }, + } + + return web.json_response(response_data) + + async def _write_sse_chat_completion( + self, request: "web.Request", completion_id: str, model: str, + created: int, stream_q, agent_task, + ) -> "web.StreamResponse": + """Write real streaming SSE from agent's 
stream_delta_callback queue.""" + import queue as _q + + response = web.StreamResponse( + status=200, + headers={"Content-Type": "text/event-stream", "Cache-Control": "no-cache"}, + ) + await response.prepare(request) + + # Role chunk + role_chunk = { + "id": completion_id, "object": "chat.completion.chunk", + "created": created, "model": model, + "choices": [{"index": 0, "delta": {"role": "assistant"}, "finish_reason": None}], + } + await response.write(f"data: {json.dumps(role_chunk)}\n\n".encode()) + + # Stream content chunks as they arrive from the agent + loop = asyncio.get_event_loop() + while True: + try: + delta = await loop.run_in_executor(None, lambda: stream_q.get(timeout=0.5)) + except _q.Empty: + if agent_task.done(): + # Drain any remaining items + while True: + try: + delta = stream_q.get_nowait() + if delta is None: + break + content_chunk = { + "id": completion_id, "object": "chat.completion.chunk", + "created": created, "model": model, + "choices": [{"index": 0, "delta": {"content": delta}, "finish_reason": None}], + } + await response.write(f"data: {json.dumps(content_chunk)}\n\n".encode()) + except _q.Empty: + break + break + continue + + if delta is None: # End of stream sentinel + break + + content_chunk = { + "id": completion_id, "object": "chat.completion.chunk", + "created": created, "model": model, + "choices": [{"index": 0, "delta": {"content": delta}, "finish_reason": None}], + } + await response.write(f"data: {json.dumps(content_chunk)}\n\n".encode()) + + # Get usage from completed agent + usage = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0} + try: + result, agent_usage = await agent_task + usage = agent_usage or usage + except Exception: + pass + + # Finish chunk + finish_chunk = { + "id": completion_id, "object": "chat.completion.chunk", + "created": created, "model": model, + "choices": [{"index": 0, "delta": {}, "finish_reason": "stop"}], + "usage": { + "prompt_tokens": usage.get("input_tokens", 0), + 
"completion_tokens": usage.get("output_tokens", 0), + "total_tokens": usage.get("total_tokens", 0), + }, + } + await response.write(f"data: {json.dumps(finish_chunk)}\n\n".encode()) + await response.write(b"data: [DONE]\n\n") + + return response + + async def _handle_responses(self, request: "web.Request") -> "web.Response": + """POST /v1/responses — OpenAI Responses API format.""" + auth_err = self._check_auth(request) + if auth_err: + return auth_err + + # Parse request body + try: + body = await request.json() + except (json.JSONDecodeError, Exception): + return web.json_response( + {"error": {"message": "Invalid JSON in request body", "type": "invalid_request_error"}}, + status=400, + ) + + raw_input = body.get("input") + if raw_input is None: + return web.json_response( + {"error": {"message": "Missing 'input' field", "type": "invalid_request_error"}}, + status=400, + ) + + instructions = body.get("instructions") + previous_response_id = body.get("previous_response_id") + conversation = body.get("conversation") + store = body.get("store", True) + + # conversation and previous_response_id are mutually exclusive + if conversation and previous_response_id: + return web.json_response( + {"error": {"message": "Cannot use both 'conversation' and 'previous_response_id'", "type": "invalid_request_error"}}, + status=400, + ) + + # Resolve conversation name to latest response_id + if conversation: + previous_response_id = self._conversations.get(conversation) + # No error if conversation doesn't exist yet — it's a new conversation + + # Normalize input to message list + input_messages: List[Dict[str, str]] = [] + if isinstance(raw_input, str): + input_messages = [{"role": "user", "content": raw_input}] + elif isinstance(raw_input, list): + for item in raw_input: + if isinstance(item, str): + input_messages.append({"role": "user", "content": item}) + elif isinstance(item, dict): + role = item.get("role", "user") + content = item.get("content", "") + # Handle content that 
may be a list of content parts + if isinstance(content, list): + text_parts = [] + for part in content: + if isinstance(part, dict) and part.get("type") == "input_text": + text_parts.append(part.get("text", "")) + elif isinstance(part, dict) and part.get("type") == "output_text": + text_parts.append(part.get("text", "")) + elif isinstance(part, str): + text_parts.append(part) + content = "\n".join(text_parts) + input_messages.append({"role": role, "content": content}) + else: + return web.json_response( + {"error": {"message": "'input' must be a string or array", "type": "invalid_request_error"}}, + status=400, + ) + + # Reconstruct conversation history from previous_response_id + conversation_history: List[Dict[str, str]] = [] + if previous_response_id: + stored = self._response_store.get(previous_response_id) + if stored is None: + return web.json_response( + {"error": {"message": f"Previous response not found: {previous_response_id}", "type": "invalid_request_error"}}, + status=404, + ) + conversation_history = list(stored.get("conversation_history", [])) + # If no instructions provided, carry forward from previous + if instructions is None: + instructions = stored.get("instructions") + + # Append new input messages to history (all but the last become history) + for msg in input_messages[:-1]: + conversation_history.append(msg) + + # Last input message is the user_message + user_message = input_messages[-1].get("content", "") if input_messages else "" + if not user_message: + return web.json_response( + {"error": {"message": "No user message found in input", "type": "invalid_request_error"}}, + status=400, + ) + + # Truncation support + if body.get("truncation") == "auto" and len(conversation_history) > 100: + conversation_history = conversation_history[-100:] + + # Run the agent + session_id = str(uuid.uuid4()) + try: + result, usage = await self._run_agent( + user_message=user_message, + conversation_history=conversation_history, + 
ephemeral_system_prompt=instructions, + session_id=session_id, + ) + except Exception as e: + logger.error("Error running agent for responses: %s", e, exc_info=True) + return web.json_response( + {"error": {"message": f"Internal server error: {e}", "type": "server_error"}}, + status=500, + ) + + final_response = result.get("final_response", "") + if not final_response: + final_response = result.get("error", "(No response generated)") + + response_id = f"resp_{uuid.uuid4().hex[:28]}" + created_at = int(time.time()) + + # Build the full conversation history for storage + # (includes tool calls from the agent run) + full_history = list(conversation_history) + full_history.append({"role": "user", "content": user_message}) + # Add agent's internal messages if available + agent_messages = result.get("messages", []) + if agent_messages: + full_history.extend(agent_messages) + else: + full_history.append({"role": "assistant", "content": final_response}) + + # Build output items (includes tool calls + final message) + output_items = self._extract_output_items(result) + + response_data = { + "id": response_id, + "object": "response", + "status": "completed", + "created_at": created_at, + "model": body.get("model", "hermes-agent"), + "output": output_items, + "usage": { + "input_tokens": usage.get("input_tokens", 0), + "output_tokens": usage.get("output_tokens", 0), + "total_tokens": usage.get("total_tokens", 0), + }, + } + + # Store the complete response object for future chaining / GET retrieval + if store: + self._response_store.put(response_id, { + "response": response_data, + "conversation_history": full_history, + "instructions": instructions, + }) + # Update conversation mapping so the next request with the same + # conversation name automatically chains to this response + if conversation: + self._conversations[conversation] = response_id + + return web.json_response(response_data) + + # ------------------------------------------------------------------ + # GET / 
DELETE response endpoints + # ------------------------------------------------------------------ + + async def _handle_get_response(self, request: "web.Request") -> "web.Response": + """GET /v1/responses/{response_id} — retrieve a stored response.""" + auth_err = self._check_auth(request) + if auth_err: + return auth_err + + response_id = request.match_info["response_id"] + stored = self._response_store.get(response_id) + if stored is None: + return web.json_response( + {"error": {"message": f"Response not found: {response_id}", "type": "invalid_request_error"}}, + status=404, + ) + + return web.json_response(stored["response"]) + + async def _handle_delete_response(self, request: "web.Request") -> "web.Response": + """DELETE /v1/responses/{response_id} — delete a stored response.""" + auth_err = self._check_auth(request) + if auth_err: + return auth_err + + response_id = request.match_info["response_id"] + deleted = self._response_store.delete(response_id) + if not deleted: + return web.json_response( + {"error": {"message": f"Response not found: {response_id}", "type": "invalid_request_error"}}, + status=404, + ) + + return web.json_response({ + "id": response_id, + "object": "response", + "deleted": True, + }) + + # ------------------------------------------------------------------ + # Output extraction helper + # ------------------------------------------------------------------ + + @staticmethod + def _extract_output_items(result: Dict[str, Any]) -> List[Dict[str, Any]]: + """ + Build the full output item array from the agent's messages. 
+ + Walks *result["messages"]* and emits: + - ``function_call`` items for each tool_call on assistant messages + - ``function_call_output`` items for each tool-role message + - a final ``message`` item with the assistant's text reply + """ + items: List[Dict[str, Any]] = [] + messages = result.get("messages", []) + + for msg in messages: + role = msg.get("role") + if role == "assistant" and msg.get("tool_calls"): + for tc in msg["tool_calls"]: + func = tc.get("function", {}) + items.append({ + "type": "function_call", + "name": func.get("name", ""), + "arguments": func.get("arguments", ""), + "call_id": tc.get("id", ""), + }) + elif role == "tool": + items.append({ + "type": "function_call_output", + "call_id": msg.get("tool_call_id", ""), + "output": msg.get("content", ""), + }) + + # Final assistant message + final = result.get("final_response", "") + if not final: + final = result.get("error", "(No response generated)") + + items.append({ + "type": "message", + "role": "assistant", + "content": [ + { + "type": "output_text", + "text": final, + } + ], + }) + return items + + # ------------------------------------------------------------------ + # Agent execution + # ------------------------------------------------------------------ + + async def _run_agent( + self, + user_message: str, + conversation_history: List[Dict[str, str]], + ephemeral_system_prompt: Optional[str] = None, + session_id: Optional[str] = None, + stream_delta_callback=None, + ) -> tuple: + """ + Create an agent and run a conversation in a thread executor. + + Returns ``(result_dict, usage_dict)`` where *usage_dict* contains + ``input_tokens``, ``output_tokens`` and ``total_tokens``. 
+ """ + loop = asyncio.get_event_loop() + + def _run(): + agent = self._create_agent( + ephemeral_system_prompt=ephemeral_system_prompt, + session_id=session_id, + stream_delta_callback=stream_delta_callback, + ) + result = agent.run_conversation( + user_message=user_message, + conversation_history=conversation_history, + ) + usage = { + "input_tokens": getattr(agent, "session_prompt_tokens", 0) or 0, + "output_tokens": getattr(agent, "session_completion_tokens", 0) or 0, + "total_tokens": getattr(agent, "session_total_tokens", 0) or 0, + } + return result, usage + + return await loop.run_in_executor(None, _run) + + # ------------------------------------------------------------------ + # BasePlatformAdapter interface + # ------------------------------------------------------------------ + + async def connect(self) -> bool: + """Start the aiohttp web server.""" + if not AIOHTTP_AVAILABLE: + logger.warning("[%s] aiohttp not installed", self.name) + return False + + try: + self._app = web.Application(middlewares=[cors_middleware]) + self._app.router.add_get("/health", self._handle_health) + self._app.router.add_get("/v1/models", self._handle_models) + self._app.router.add_post("/v1/chat/completions", self._handle_chat_completions) + self._app.router.add_post("/v1/responses", self._handle_responses) + self._app.router.add_get("/v1/responses/{response_id}", self._handle_get_response) + self._app.router.add_delete("/v1/responses/{response_id}", self._handle_delete_response) + + self._runner = web.AppRunner(self._app) + await self._runner.setup() + self._site = web.TCPSite(self._runner, self._host, self._port) + await self._site.start() + + self._mark_connected() + logger.info( + "[%s] API server listening on http://%s:%d", + self.name, self._host, self._port, + ) + return True + + except Exception as e: + logger.error("[%s] Failed to start API server: %s", self.name, e) + return False + + async def disconnect(self) -> None: + """Stop the aiohttp web server.""" + 
self._mark_disconnected() + if self._site: + await self._site.stop() + self._site = None + if self._runner: + await self._runner.cleanup() + self._runner = None + self._app = None + logger.info("[%s] API server stopped", self.name) + + async def send( + self, + chat_id: str, + content: str, + reply_to: Optional[str] = None, + metadata: Optional[Dict[str, Any]] = None, + ) -> SendResult: + """ + Not used — HTTP request/response cycle handles delivery directly. + """ + return SendResult(success=False, error="API server uses HTTP request/response, not send()") + + async def get_chat_info(self, chat_id: str) -> Dict[str, Any]: + """Return basic info about the API server.""" + return { + "name": "API Server", + "type": "api", + "host": self._host, + "port": self._port, + } diff --git a/gateway/platforms/whatsapp.py b/gateway/platforms/whatsapp.py index 2464a433..76019636 100644 --- a/gateway/platforms/whatsapp.py +++ b/gateway/platforms/whatsapp.py @@ -136,6 +136,7 @@ class WhatsAppAdapter(BasePlatformAdapter): "session_path", get_hermes_home() / "whatsapp" / "session" )) + self._reply_prefix: Optional[str] = config.extra.get("reply_prefix") self._message_queue: asyncio.Queue = asyncio.Queue() self._bridge_log_fh = None self._bridge_log: Optional[Path] = None @@ -193,6 +194,14 @@ class WhatsAppAdapter(BasePlatformAdapter): self._bridge_log = self._session_path.parent / "bridge.log" bridge_log_fh = open(self._bridge_log, "a") self._bridge_log_fh = bridge_log_fh + + # Build bridge subprocess environment. + # Pass WHATSAPP_REPLY_PREFIX from config.yaml so the Node bridge + # can use it without the user needing to set a separate env var. 
+ bridge_env = os.environ.copy() + if self._reply_prefix is not None: + bridge_env["WHATSAPP_REPLY_PREFIX"] = self._reply_prefix + self._bridge_process = subprocess.Popen( [ "node", @@ -204,6 +213,7 @@ class WhatsAppAdapter(BasePlatformAdapter): stdout=bridge_log_fh, stderr=bridge_log_fh, preexec_fn=None if _IS_WINDOWS else os.setsid, + env=bridge_env, ) # Wait for the bridge to connect to WhatsApp. diff --git a/gateway/run.py b/gateway/run.py index 47142c75..59b172af 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -1162,6 +1162,13 @@ class GatewayRunner: return None return MatrixAdapter(config) + elif platform == Platform.API_SERVER: + from gateway.platforms.api_server import APIServerAdapter, check_api_server_requirements + if not check_api_server_requirements(): + logger.warning("API Server: aiohttp not installed") + return None + return APIServerAdapter(config) + return None def _is_user_authorized(self, source: SessionSource) -> bool: diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 8da98100..d2a7693a 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -332,6 +332,14 @@ DEFAULT_CONFIG = { "auto_thread": True, # Auto-create threads on @mention in channels (like Slack) }, + # WhatsApp platform settings (gateway mode) + "whatsapp": { + # Reply prefix prepended to every outgoing WhatsApp message. + # Default (None) uses the built-in "⚕ *Hermes Agent*" header. + # Set to "" (empty string) to disable the header entirely. + # Supports \n for newlines, e.g. 
"🤖 *My Bot*\n──────\n" + }, + # Approval mode for dangerous commands: # manual — always prompt the user (default) # smart — use auxiliary LLM to auto-approve low-risk commands, prompt for high-risk @@ -364,7 +372,7 @@ DEFAULT_CONFIG = { }, # Config schema version - bump this when adding new required fields - "_config_version": 9, + "_config_version": 10, } # ============================================================================= @@ -767,6 +775,38 @@ OPTIONAL_ENV_VARS = { "category": "messaging", "advanced": True, }, + "API_SERVER_ENABLED": { + "description": "Enable the OpenAI-compatible API server (true/false). Allows frontends like Open WebUI, LobeChat, etc. to connect.", + "prompt": "Enable API server (true/false)", + "url": None, + "password": False, + "category": "messaging", + "advanced": True, + }, + "API_SERVER_KEY": { + "description": "Bearer token for API server authentication. If empty, all requests are allowed (local use only).", + "prompt": "API server auth key (optional)", + "url": None, + "password": True, + "category": "messaging", + "advanced": True, + }, + "API_SERVER_PORT": { + "description": "Port for the API server (default: 8642).", + "prompt": "API server port", + "url": None, + "password": False, + "category": "messaging", + "advanced": True, + }, + "API_SERVER_HOST": { + "description": "Host/bind address for the API server (default: 127.0.0.1). 
Use 0.0.0.0 for network access — requires API_SERVER_KEY for security.", + "prompt": "API server host", + "url": None, + "password": False, + "category": "messaging", + "advanced": True, + }, # ── Agent settings ── "MESSAGING_CWD": { diff --git a/scripts/whatsapp-bridge/bridge.js b/scripts/whatsapp-bridge/bridge.js index 1f326ba0..cbc18e24 100644 --- a/scripts/whatsapp-bridge/bridge.js +++ b/scripts/whatsapp-bridge/bridge.js @@ -44,6 +44,14 @@ const SESSION_DIR = getArg('session', path.join(process.env.HOME || '~', '.herme const PAIR_ONLY = args.includes('--pair-only'); const WHATSAPP_MODE = getArg('mode', process.env.WHATSAPP_MODE || 'self-chat'); // "bot" or "self-chat" const ALLOWED_USERS = (process.env.WHATSAPP_ALLOWED_USERS || '').split(',').map(s => s.trim()).filter(Boolean); +const DEFAULT_REPLY_PREFIX = '⚕ *Hermes Agent*\n────────────\n'; +const REPLY_PREFIX = process.env.WHATSAPP_REPLY_PREFIX === undefined + ? DEFAULT_REPLY_PREFIX + : process.env.WHATSAPP_REPLY_PREFIX.replace(/\\n/g, '\n'); + +function formatOutgoingMessage(message) { + return REPLY_PREFIX ? `${REPLY_PREFIX}${message}` : message; +} mkdirSync(SESSION_DIR, { recursive: true }); @@ -188,7 +196,7 @@ async function startSocket() { } // Ignore Hermes' own reply messages in self-chat mode to avoid loops. - if (msg.key.fromMe && (body.startsWith('⚕ *Hermes Agent*') || recentlySentIds.has(msg.key.id))) { + if (msg.key.fromMe && ((REPLY_PREFIX && body.startsWith(REPLY_PREFIX)) || recentlySentIds.has(msg.key.id))) { if (WHATSAPP_DEBUG) { try { console.log(JSON.stringify({ event: 'ignored', reason: 'agent_echo', chatId, messageId: msg.key.id })); } catch {} } @@ -251,10 +259,7 @@ app.post('/send', async (req, res) => { } try { - // Prefix responses so the user can distinguish agent replies from their - // own messages (especially in self-chat / "Message Yourself"). 
- const prefixed = `⚕ *Hermes Agent*\n────────────\n${message}`; - const sent = await sock.sendMessage(chatId, { text: prefixed }); + const sent = await sock.sendMessage(chatId, { text: formatOutgoingMessage(message) }); // Track sent message ID to prevent echo-back loops if (sent?.key?.id) { @@ -282,9 +287,8 @@ app.post('/edit', async (req, res) => { } try { - const prefixed = `⚕ *Hermes Agent*\n────────────\n${message}`; const key = { id: messageId, fromMe: true, remoteJid: chatId }; - await sock.sendMessage(chatId, { text: prefixed, edit: key }); + await sock.sendMessage(chatId, { text: formatOutgoingMessage(message), edit: key }); res.json({ success: true }); } catch (err) { res.status(500).json({ error: err.message }); diff --git a/tests/gateway/test_api_server.py b/tests/gateway/test_api_server.py new file mode 100644 index 00000000..2a30e3c7 --- /dev/null +++ b/tests/gateway/test_api_server.py @@ -0,0 +1,1299 @@ +""" +Tests for the OpenAI-compatible API server gateway adapter. + +Tests cover: +- Chat Completions endpoint (request parsing, response format) +- Responses API endpoint (request parsing, response format) +- previous_response_id chaining (store/retrieve) +- Auth (valid key, invalid key, no key configured) +- /v1/models endpoint +- /health endpoint +- System prompt extraction +- Error handling (invalid JSON, missing fields) +""" + +import json +import time +import uuid +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest +from aiohttp import web +from aiohttp.test_utils import AioHTTPTestCase, TestClient, TestServer + +from gateway.config import GatewayConfig, Platform, PlatformConfig +from gateway.platforms.api_server import ( + APIServerAdapter, + ResponseStore, + _CORS_HEADERS, + check_api_server_requirements, + cors_middleware, +) + + +# --------------------------------------------------------------------------- +# check_api_server_requirements +# --------------------------------------------------------------------------- + + 
+class TestCheckRequirements: + def test_returns_true_when_aiohttp_available(self): + assert check_api_server_requirements() is True + + @patch("gateway.platforms.api_server.AIOHTTP_AVAILABLE", False) + def test_returns_false_without_aiohttp(self): + assert check_api_server_requirements() is False + + +# --------------------------------------------------------------------------- +# ResponseStore +# --------------------------------------------------------------------------- + + +class TestResponseStore: + def test_put_and_get(self): + store = ResponseStore(max_size=10) + store.put("resp_1", {"output": "hello"}) + assert store.get("resp_1") == {"output": "hello"} + + def test_get_missing_returns_none(self): + store = ResponseStore(max_size=10) + assert store.get("resp_missing") is None + + def test_lru_eviction(self): + store = ResponseStore(max_size=3) + store.put("resp_1", {"output": "one"}) + store.put("resp_2", {"output": "two"}) + store.put("resp_3", {"output": "three"}) + # Adding a 4th should evict resp_1 + store.put("resp_4", {"output": "four"}) + assert store.get("resp_1") is None + assert store.get("resp_2") is not None + assert len(store) == 3 + + def test_access_refreshes_lru(self): + store = ResponseStore(max_size=3) + store.put("resp_1", {"output": "one"}) + store.put("resp_2", {"output": "two"}) + store.put("resp_3", {"output": "three"}) + # Access resp_1 to move it to end + store.get("resp_1") + # Now resp_2 is the oldest — adding a 4th should evict resp_2 + store.put("resp_4", {"output": "four"}) + assert store.get("resp_2") is None + assert store.get("resp_1") is not None + + def test_update_existing_key(self): + store = ResponseStore(max_size=10) + store.put("resp_1", {"output": "v1"}) + store.put("resp_1", {"output": "v2"}) + assert store.get("resp_1") == {"output": "v2"} + assert len(store) == 1 + + def test_delete_existing(self): + store = ResponseStore(max_size=10) + store.put("resp_1", {"output": "hello"}) + assert store.delete("resp_1") is 
True + assert store.get("resp_1") is None + assert len(store) == 0 + + def test_delete_missing(self): + store = ResponseStore(max_size=10) + assert store.delete("resp_missing") is False + + +# --------------------------------------------------------------------------- +# Adapter initialization +# --------------------------------------------------------------------------- + + +class TestAdapterInit: + def test_default_config(self): + config = PlatformConfig(enabled=True) + adapter = APIServerAdapter(config) + assert adapter._host == "127.0.0.1" + assert adapter._port == 8642 + assert adapter._api_key == "" + assert adapter.platform == Platform.API_SERVER + + def test_custom_config_from_extra(self): + config = PlatformConfig( + enabled=True, + extra={"host": "0.0.0.0", "port": 9999, "key": "sk-test"}, + ) + adapter = APIServerAdapter(config) + assert adapter._host == "0.0.0.0" + assert adapter._port == 9999 + assert adapter._api_key == "sk-test" + + def test_config_from_env(self, monkeypatch): + monkeypatch.setenv("API_SERVER_HOST", "10.0.0.1") + monkeypatch.setenv("API_SERVER_PORT", "7777") + monkeypatch.setenv("API_SERVER_KEY", "sk-env") + config = PlatformConfig(enabled=True) + adapter = APIServerAdapter(config) + assert adapter._host == "10.0.0.1" + assert adapter._port == 7777 + assert adapter._api_key == "sk-env" + + +# --------------------------------------------------------------------------- +# Auth checking +# --------------------------------------------------------------------------- + + +class TestAuth: + def test_no_key_configured_allows_all(self): + config = PlatformConfig(enabled=True) + adapter = APIServerAdapter(config) + mock_request = MagicMock() + mock_request.headers = {} + assert adapter._check_auth(mock_request) is None + + def test_valid_key_passes(self): + config = PlatformConfig(enabled=True, extra={"key": "sk-test123"}) + adapter = APIServerAdapter(config) + mock_request = MagicMock() + mock_request.headers = {"Authorization": "Bearer 
sk-test123"} + assert adapter._check_auth(mock_request) is None + + def test_invalid_key_returns_401(self): + config = PlatformConfig(enabled=True, extra={"key": "sk-test123"}) + adapter = APIServerAdapter(config) + mock_request = MagicMock() + mock_request.headers = {"Authorization": "Bearer wrong-key"} + result = adapter._check_auth(mock_request) + assert result is not None + assert result.status == 401 + + def test_missing_auth_header_returns_401(self): + config = PlatformConfig(enabled=True, extra={"key": "sk-test123"}) + adapter = APIServerAdapter(config) + mock_request = MagicMock() + mock_request.headers = {} + result = adapter._check_auth(mock_request) + assert result is not None + assert result.status == 401 + + def test_malformed_auth_header_returns_401(self): + config = PlatformConfig(enabled=True, extra={"key": "sk-test123"}) + adapter = APIServerAdapter(config) + mock_request = MagicMock() + mock_request.headers = {"Authorization": "Basic dXNlcjpwYXNz"} + result = adapter._check_auth(mock_request) + assert result is not None + assert result.status == 401 + + +# --------------------------------------------------------------------------- +# Helpers for HTTP tests +# --------------------------------------------------------------------------- + + +def _make_adapter(api_key: str = "") -> APIServerAdapter: + """Create an adapter with optional API key.""" + extra = {} + if api_key: + extra["key"] = api_key + config = PlatformConfig(enabled=True, extra=extra) + return APIServerAdapter(config) + + +def _create_app(adapter: APIServerAdapter) -> web.Application: + """Create the aiohttp app from the adapter (without starting the full server).""" + app = web.Application(middlewares=[cors_middleware]) + app.router.add_get("/health", adapter._handle_health) + app.router.add_get("/v1/models", adapter._handle_models) + app.router.add_post("/v1/chat/completions", adapter._handle_chat_completions) + app.router.add_post("/v1/responses", adapter._handle_responses) + 
app.router.add_get("/v1/responses/{response_id}", adapter._handle_get_response) + app.router.add_delete("/v1/responses/{response_id}", adapter._handle_delete_response) + return app + + +@pytest.fixture +def adapter(): + return _make_adapter() + + +@pytest.fixture +def auth_adapter(): + return _make_adapter(api_key="sk-secret") + + +# --------------------------------------------------------------------------- +# /health endpoint +# --------------------------------------------------------------------------- + + +class TestHealthEndpoint: + @pytest.mark.asyncio + async def test_health_returns_ok(self, adapter): + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.get("/health") + assert resp.status == 200 + data = await resp.json() + assert data["status"] == "ok" + assert data["platform"] == "hermes-agent" + + +# --------------------------------------------------------------------------- +# /v1/models endpoint +# --------------------------------------------------------------------------- + + +class TestModelsEndpoint: + @pytest.mark.asyncio + async def test_models_returns_hermes_agent(self, adapter): + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.get("/v1/models") + assert resp.status == 200 + data = await resp.json() + assert data["object"] == "list" + assert len(data["data"]) == 1 + assert data["data"][0]["id"] == "hermes-agent" + assert data["data"][0]["owned_by"] == "hermes" + + @pytest.mark.asyncio + async def test_models_requires_auth(self, auth_adapter): + app = _create_app(auth_adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.get("/v1/models") + assert resp.status == 401 + + @pytest.mark.asyncio + async def test_models_with_valid_auth(self, auth_adapter): + app = _create_app(auth_adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.get( + "/v1/models", + headers={"Authorization": "Bearer sk-secret"}, + ) + assert 
resp.status == 200 + + +# --------------------------------------------------------------------------- +# /v1/chat/completions endpoint +# --------------------------------------------------------------------------- + + +class TestChatCompletionsEndpoint: + @pytest.mark.asyncio + async def test_invalid_json_returns_400(self, adapter): + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.post( + "/v1/chat/completions", + data="not json", + headers={"Content-Type": "application/json"}, + ) + assert resp.status == 400 + data = await resp.json() + assert "Invalid JSON" in data["error"]["message"] + + @pytest.mark.asyncio + async def test_missing_messages_returns_400(self, adapter): + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.post("/v1/chat/completions", json={"model": "test"}) + assert resp.status == 400 + data = await resp.json() + assert "messages" in data["error"]["message"] + + @pytest.mark.asyncio + async def test_empty_messages_returns_400(self, adapter): + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.post("/v1/chat/completions", json={"model": "test", "messages": []}) + assert resp.status == 400 + + @pytest.mark.asyncio + async def test_stream_true_returns_sse(self, adapter): + """stream=true returns SSE format with the full response.""" + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + async def _mock_run_agent(**kwargs): + # Simulate streaming: invoke stream_delta_callback with tokens + cb = kwargs.get("stream_delta_callback") + if cb: + cb("Hello!") + cb(None) # End signal + return ( + {"final_response": "Hello!", "messages": [], "api_calls": 1}, + {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}, + ) + + with patch.object(adapter, "_run_agent", side_effect=_mock_run_agent) as mock_run: + resp = await cli.post( + "/v1/chat/completions", + json={ + "model": "test", + 
"messages": [{"role": "user", "content": "hi"}], + "stream": True, + }, + ) + assert resp.status == 200 + assert "text/event-stream" in resp.headers.get("Content-Type", "") + body = await resp.text() + assert "data: " in body + assert "[DONE]" in body + assert "Hello!" in body + + @pytest.mark.asyncio + async def test_no_user_message_returns_400(self, adapter): + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.post( + "/v1/chat/completions", + json={ + "model": "test", + "messages": [{"role": "system", "content": "You are helpful."}], + }, + ) + assert resp.status == 400 + + @pytest.mark.asyncio + async def test_successful_completion(self, adapter): + """Test a successful chat completion with mocked agent.""" + mock_result = { + "final_response": "Hello! How can I help you today?", + "messages": [], + "api_calls": 1, + } + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/chat/completions", + json={ + "model": "hermes-agent", + "messages": [{"role": "user", "content": "Hello"}], + }, + ) + + assert resp.status == 200 + data = await resp.json() + assert data["object"] == "chat.completion" + assert data["id"].startswith("chatcmpl-") + assert data["model"] == "hermes-agent" + assert len(data["choices"]) == 1 + assert data["choices"][0]["message"]["role"] == "assistant" + assert data["choices"][0]["message"]["content"] == "Hello! How can I help you today?" + assert data["choices"][0]["finish_reason"] == "stop" + assert "usage" in data + + @pytest.mark.asyncio + async def test_system_prompt_extracted(self, adapter): + """System messages from the client are passed as ephemeral_system_prompt.""" + mock_result = { + "final_response": "I am a pirate! 
Arrr!", + "messages": [], + "api_calls": 1, + } + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/chat/completions", + json={ + "model": "hermes-agent", + "messages": [ + {"role": "system", "content": "You are a pirate."}, + {"role": "user", "content": "Hello"}, + ], + }, + ) + + assert resp.status == 200 + # Check that _run_agent was called with the system prompt + call_kwargs = mock_run.call_args + assert call_kwargs.kwargs.get("ephemeral_system_prompt") == "You are a pirate." + assert call_kwargs.kwargs.get("user_message") == "Hello" + + @pytest.mark.asyncio + async def test_conversation_history_passed(self, adapter): + """Previous user/assistant messages become conversation_history.""" + mock_result = {"final_response": "3", "messages": [], "api_calls": 1} + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/chat/completions", + json={ + "model": "hermes-agent", + "messages": [ + {"role": "user", "content": "1+1=?"}, + {"role": "assistant", "content": "2"}, + {"role": "user", "content": "Now add 1 more"}, + ], + }, + ) + + assert resp.status == 200 + call_kwargs = mock_run.call_args.kwargs + assert call_kwargs["user_message"] == "Now add 1 more" + assert len(call_kwargs["conversation_history"]) == 2 + assert call_kwargs["conversation_history"][0] == {"role": "user", "content": "1+1=?"} + assert call_kwargs["conversation_history"][1] == {"role": "assistant", "content": "2"} + + @pytest.mark.asyncio + async def test_agent_error_returns_500(self, adapter): + """Agent exception returns 500.""" 
+ app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.side_effect = RuntimeError("Provider failed") + resp = await cli.post( + "/v1/chat/completions", + json={ + "model": "hermes-agent", + "messages": [{"role": "user", "content": "Hello"}], + }, + ) + + assert resp.status == 500 + data = await resp.json() + assert "Provider failed" in data["error"]["message"] + + +# --------------------------------------------------------------------------- +# /v1/responses endpoint +# --------------------------------------------------------------------------- + + +class TestResponsesEndpoint: + @pytest.mark.asyncio + async def test_missing_input_returns_400(self, adapter): + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.post("/v1/responses", json={"model": "test"}) + assert resp.status == 400 + data = await resp.json() + assert "input" in data["error"]["message"] + + @pytest.mark.asyncio + async def test_invalid_json_returns_400(self, adapter): + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.post( + "/v1/responses", + data="not json", + headers={"Content-Type": "application/json"}, + ) + assert resp.status == 400 + + @pytest.mark.asyncio + async def test_successful_response_with_string_input(self, adapter): + """String input is wrapped in a user message.""" + mock_result = { + "final_response": "Paris is the capital of France.", + "messages": [], + "api_calls": 1, + } + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/responses", + json={ + "model": "hermes-agent", + "input": "What is the capital of France?", + }, + ) + + assert 
resp.status == 200 + data = await resp.json() + assert data["object"] == "response" + assert data["id"].startswith("resp_") + assert data["status"] == "completed" + assert len(data["output"]) == 1 + assert data["output"][0]["type"] == "message" + assert data["output"][0]["content"][0]["type"] == "output_text" + assert data["output"][0]["content"][0]["text"] == "Paris is the capital of France." + + @pytest.mark.asyncio + async def test_successful_response_with_array_input(self, adapter): + """Array input with role/content objects.""" + mock_result = {"final_response": "Done", "messages": [], "api_calls": 1} + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/responses", + json={ + "model": "hermes-agent", + "input": [ + {"role": "user", "content": "Hello"}, + {"role": "user", "content": "What is 2+2?"}, + ], + }, + ) + + assert resp.status == 200 + call_kwargs = mock_run.call_args.kwargs + # Last message is user_message, rest are history + assert call_kwargs["user_message"] == "What is 2+2?" 
+ assert len(call_kwargs["conversation_history"]) == 1 + + @pytest.mark.asyncio + async def test_instructions_as_ephemeral_prompt(self, adapter): + """The instructions field maps to ephemeral_system_prompt.""" + mock_result = {"final_response": "Ahoy!", "messages": [], "api_calls": 1} + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/responses", + json={ + "model": "hermes-agent", + "input": "Hello", + "instructions": "Talk like a pirate.", + }, + ) + + assert resp.status == 200 + call_kwargs = mock_run.call_args.kwargs + assert call_kwargs["ephemeral_system_prompt"] == "Talk like a pirate." + + @pytest.mark.asyncio + async def test_previous_response_id_chaining(self, adapter): + """Test that responses can be chained via previous_response_id.""" + mock_result_1 = { + "final_response": "2", + "messages": [{"role": "assistant", "content": "2"}], + "api_calls": 1, + } + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + # First request + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result_1, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp1 = await cli.post( + "/v1/responses", + json={"model": "hermes-agent", "input": "What is 1+1?"}, + ) + + assert resp1.status == 200 + data1 = await resp1.json() + response_id = data1["id"] + + # Second request chaining from the first + mock_result_2 = { + "final_response": "3", + "messages": [{"role": "assistant", "content": "3"}], + "api_calls": 1, + } + + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result_2, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp2 = await cli.post( + "/v1/responses", + 
json={ + "model": "hermes-agent", + "input": "Now add 1 more", + "previous_response_id": response_id, + }, + ) + + assert resp2.status == 200 + # The conversation_history should contain the full history from the first response + call_kwargs = mock_run.call_args.kwargs + assert len(call_kwargs["conversation_history"]) > 0 + assert call_kwargs["user_message"] == "Now add 1 more" + + @pytest.mark.asyncio + async def test_invalid_previous_response_id_returns_404(self, adapter): + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.post( + "/v1/responses", + json={ + "model": "hermes-agent", + "input": "follow up", + "previous_response_id": "resp_nonexistent", + }, + ) + assert resp.status == 404 + + @pytest.mark.asyncio + async def test_store_false_does_not_store(self, adapter): + """When store=false, the response is NOT stored.""" + mock_result = {"final_response": "OK", "messages": [], "api_calls": 1} + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/responses", + json={ + "model": "hermes-agent", + "input": "Hello", + "store": False, + }, + ) + + assert resp.status == 200 + data = await resp.json() + # The response has an ID but it shouldn't be retrievable + assert adapter._response_store.get(data["id"]) is None + + @pytest.mark.asyncio + async def test_instructions_inherited_from_previous(self, adapter): + """If no instructions provided, carry forward from previous response.""" + mock_result = {"final_response": "Ahoy!", "messages": [], "api_calls": 1} + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + # First request with instructions + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, 
{"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp1 = await cli.post( + "/v1/responses", + json={ + "model": "hermes-agent", + "input": "Hello", + "instructions": "Be a pirate", + }, + ) + + data1 = await resp1.json() + resp_id = data1["id"] + + # Second request without instructions + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp2 = await cli.post( + "/v1/responses", + json={ + "model": "hermes-agent", + "input": "Tell me more", + "previous_response_id": resp_id, + }, + ) + + assert resp2.status == 200 + call_kwargs = mock_run.call_args.kwargs + assert call_kwargs["ephemeral_system_prompt"] == "Be a pirate" + + @pytest.mark.asyncio + async def test_agent_error_returns_500(self, adapter): + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.side_effect = RuntimeError("Boom") + resp = await cli.post( + "/v1/responses", + json={"model": "hermes-agent", "input": "Hello"}, + ) + + assert resp.status == 500 + + @pytest.mark.asyncio + async def test_invalid_input_type_returns_400(self, adapter): + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.post( + "/v1/responses", + json={"model": "hermes-agent", "input": 42}, + ) + assert resp.status == 400 + + +# --------------------------------------------------------------------------- +# Auth on endpoints +# --------------------------------------------------------------------------- + + +class TestEndpointAuth: + @pytest.mark.asyncio + async def test_chat_completions_requires_auth(self, auth_adapter): + app = _create_app(auth_adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.post( + "/v1/chat/completions", + json={"model": "test", "messages": [{"role": "user", "content": "hi"}]}, + ) + 
assert resp.status == 401 + + @pytest.mark.asyncio + async def test_responses_requires_auth(self, auth_adapter): + app = _create_app(auth_adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.post( + "/v1/responses", + json={"model": "test", "input": "hi"}, + ) + assert resp.status == 401 + + @pytest.mark.asyncio + async def test_models_requires_auth(self, auth_adapter): + app = _create_app(auth_adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.get("/v1/models") + assert resp.status == 401 + + @pytest.mark.asyncio + async def test_health_does_not_require_auth(self, auth_adapter): + app = _create_app(auth_adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.get("/health") + assert resp.status == 200 + + +# --------------------------------------------------------------------------- +# Config integration +# --------------------------------------------------------------------------- + + +class TestConfigIntegration: + def test_platform_enum_has_api_server(self): + assert Platform.API_SERVER.value == "api_server" + + def test_env_override_enables_api_server(self, monkeypatch): + monkeypatch.setenv("API_SERVER_ENABLED", "true") + from gateway.config import load_gateway_config + config = load_gateway_config() + assert Platform.API_SERVER in config.platforms + assert config.platforms[Platform.API_SERVER].enabled is True + + def test_env_override_with_key(self, monkeypatch): + monkeypatch.setenv("API_SERVER_KEY", "sk-mykey") + from gateway.config import load_gateway_config + config = load_gateway_config() + assert Platform.API_SERVER in config.platforms + assert config.platforms[Platform.API_SERVER].extra.get("key") == "sk-mykey" + + def test_env_override_port_and_host(self, monkeypatch): + monkeypatch.setenv("API_SERVER_ENABLED", "true") + monkeypatch.setenv("API_SERVER_PORT", "9999") + monkeypatch.setenv("API_SERVER_HOST", "0.0.0.0") + from gateway.config import load_gateway_config + config = 
load_gateway_config() + assert config.platforms[Platform.API_SERVER].extra.get("port") == 9999 + assert config.platforms[Platform.API_SERVER].extra.get("host") == "0.0.0.0" + + def test_api_server_in_connected_platforms(self): + config = GatewayConfig() + config.platforms[Platform.API_SERVER] = PlatformConfig(enabled=True) + connected = config.get_connected_platforms() + assert Platform.API_SERVER in connected + + def test_api_server_not_in_connected_when_disabled(self): + config = GatewayConfig() + config.platforms[Platform.API_SERVER] = PlatformConfig(enabled=False) + connected = config.get_connected_platforms() + assert Platform.API_SERVER not in connected + + +# --------------------------------------------------------------------------- +# Multiple system messages +# --------------------------------------------------------------------------- + + +class TestMultipleSystemMessages: + @pytest.mark.asyncio + async def test_multiple_system_messages_concatenated(self, adapter): + mock_result = {"final_response": "OK", "messages": [], "api_calls": 1} + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/chat/completions", + json={ + "model": "hermes-agent", + "messages": [ + {"role": "system", "content": "You are helpful."}, + {"role": "system", "content": "Be concise."}, + {"role": "user", "content": "Hello"}, + ], + }, + ) + + assert resp.status == 200 + call_kwargs = mock_run.call_args.kwargs + prompt = call_kwargs["ephemeral_system_prompt"] + assert "You are helpful." in prompt + assert "Be concise." 
in prompt + + +# --------------------------------------------------------------------------- +# send() method (not used but required by base) +# --------------------------------------------------------------------------- + + +class TestSendMethod: + @pytest.mark.asyncio + async def test_send_returns_not_supported(self): + config = PlatformConfig(enabled=True) + adapter = APIServerAdapter(config) + result = await adapter.send("chat1", "hello") + assert result.success is False + assert "HTTP request/response" in result.error + + +# --------------------------------------------------------------------------- +# GET /v1/responses/{response_id} +# --------------------------------------------------------------------------- + + +class TestGetResponse: + @pytest.mark.asyncio + async def test_get_stored_response(self, adapter): + """GET returns a previously stored response.""" + mock_result = {"final_response": "Hello!", "messages": [], "api_calls": 1} + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + # Create a response first + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}) + resp = await cli.post( + "/v1/responses", + json={"model": "hermes-agent", "input": "Hi"}, + ) + + assert resp.status == 200 + data = await resp.json() + response_id = data["id"] + + # Now GET it + resp2 = await cli.get(f"/v1/responses/{response_id}") + assert resp2.status == 200 + data2 = await resp2.json() + assert data2["id"] == response_id + assert data2["object"] == "response" + assert data2["status"] == "completed" + + @pytest.mark.asyncio + async def test_get_not_found(self, adapter): + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.get("/v1/responses/resp_nonexistent") + assert resp.status == 404 + + @pytest.mark.asyncio + async def test_get_requires_auth(self, auth_adapter): + app = 
_create_app(auth_adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.get("/v1/responses/resp_any") + assert resp.status == 401 + + +# --------------------------------------------------------------------------- +# DELETE /v1/responses/{response_id} +# --------------------------------------------------------------------------- + + +class TestDeleteResponse: + @pytest.mark.asyncio + async def test_delete_stored_response(self, adapter): + """DELETE removes a stored response and returns confirmation.""" + mock_result = {"final_response": "Hello!", "messages": [], "api_calls": 1} + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/responses", + json={"model": "hermes-agent", "input": "Hi"}, + ) + + data = await resp.json() + response_id = data["id"] + + # Delete it + resp2 = await cli.delete(f"/v1/responses/{response_id}") + assert resp2.status == 200 + data2 = await resp2.json() + assert data2["id"] == response_id + assert data2["object"] == "response" + assert data2["deleted"] is True + + # Verify it's gone + resp3 = await cli.get(f"/v1/responses/{response_id}") + assert resp3.status == 404 + + @pytest.mark.asyncio + async def test_delete_not_found(self, adapter): + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.delete("/v1/responses/resp_nonexistent") + assert resp.status == 404 + + @pytest.mark.asyncio + async def test_delete_requires_auth(self, auth_adapter): + app = _create_app(auth_adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.delete("/v1/responses/resp_any") + assert resp.status == 401 + + +# --------------------------------------------------------------------------- +# Tool calls in output +# 
--------------------------------------------------------------------------- + + +class TestToolCallsInOutput: + @pytest.mark.asyncio + async def test_tool_calls_in_output(self, adapter): + """When agent returns tool calls, they appear as function_call items.""" + mock_result = { + "final_response": "The result is 42.", + "messages": [ + { + "role": "assistant", + "content": None, + "tool_calls": [ + { + "id": "call_abc123", + "function": { + "name": "calculator", + "arguments": '{"expression": "6*7"}', + }, + } + ], + }, + { + "role": "tool", + "tool_call_id": "call_abc123", + "content": "42", + }, + { + "role": "assistant", + "content": "The result is 42.", + }, + ], + "api_calls": 2, + } + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/responses", + json={"model": "hermes-agent", "input": "What is 6*7?"}, + ) + + assert resp.status == 200 + data = await resp.json() + output = data["output"] + + # Should have: function_call, function_call_output, message + assert len(output) == 3 + assert output[0]["type"] == "function_call" + assert output[0]["name"] == "calculator" + assert output[0]["arguments"] == '{"expression": "6*7"}' + assert output[0]["call_id"] == "call_abc123" + assert output[1]["type"] == "function_call_output" + assert output[1]["call_id"] == "call_abc123" + assert output[1]["output"] == "42" + assert output[2]["type"] == "message" + assert output[2]["content"][0]["text"] == "The result is 42." 
+ + @pytest.mark.asyncio + async def test_no_tool_calls_still_works(self, adapter): + """Without tool calls, output is just a message.""" + mock_result = {"final_response": "Hello!", "messages": [], "api_calls": 1} + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/responses", + json={"model": "hermes-agent", "input": "Hello"}, + ) + + assert resp.status == 200 + data = await resp.json() + assert len(data["output"]) == 1 + assert data["output"][0]["type"] == "message" + + +# --------------------------------------------------------------------------- +# Usage / token counting +# --------------------------------------------------------------------------- + + +class TestUsageCounting: + @pytest.mark.asyncio + async def test_responses_usage(self, adapter): + """Responses API returns real token counts.""" + mock_result = {"final_response": "Done", "messages": [], "api_calls": 1} + usage = {"input_tokens": 100, "output_tokens": 50, "total_tokens": 150} + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, usage) + resp = await cli.post( + "/v1/responses", + json={"model": "hermes-agent", "input": "Hi"}, + ) + + assert resp.status == 200 + data = await resp.json() + assert data["usage"]["input_tokens"] == 100 + assert data["usage"]["output_tokens"] == 50 + assert data["usage"]["total_tokens"] == 150 + + @pytest.mark.asyncio + async def test_chat_completions_usage(self, adapter): + """Chat completions returns real token counts.""" + mock_result = {"final_response": "Done", "messages": [], "api_calls": 1} + usage = {"input_tokens": 200, "output_tokens": 80, "total_tokens": 280} + + app = 
_create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, usage) + resp = await cli.post( + "/v1/chat/completions", + json={ + "model": "hermes-agent", + "messages": [{"role": "user", "content": "Hi"}], + }, + ) + + assert resp.status == 200 + data = await resp.json() + assert data["usage"]["prompt_tokens"] == 200 + assert data["usage"]["completion_tokens"] == 80 + assert data["usage"]["total_tokens"] == 280 + + +# --------------------------------------------------------------------------- +# Truncation +# --------------------------------------------------------------------------- + + +class TestTruncation: + @pytest.mark.asyncio + async def test_truncation_auto_limits_history(self, adapter): + """With truncation=auto, history over 100 messages is trimmed.""" + mock_result = {"final_response": "OK", "messages": [], "api_calls": 1} + + # Pre-seed a stored response with a long history + long_history = [{"role": "user", "content": f"msg {i}"} for i in range(150)] + adapter._response_store.put("resp_prev", { + "response": {"id": "resp_prev", "object": "response"}, + "conversation_history": long_history, + "instructions": None, + }) + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/responses", + json={ + "model": "hermes-agent", + "input": "follow up", + "previous_response_id": "resp_prev", + "truncation": "auto", + }, + ) + + assert resp.status == 200 + call_kwargs = mock_run.call_args.kwargs + # History should be truncated to 100 + assert len(call_kwargs["conversation_history"]) <= 100 + + @pytest.mark.asyncio + async def test_no_truncation_keeps_full_history(self, adapter): + """Without 
truncation=auto, long history is passed as-is.""" + mock_result = {"final_response": "OK", "messages": [], "api_calls": 1} + + long_history = [{"role": "user", "content": f"msg {i}"} for i in range(150)] + adapter._response_store.put("resp_prev2", { + "response": {"id": "resp_prev2", "object": "response"}, + "conversation_history": long_history, + "instructions": None, + }) + + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = (mock_result, {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}) + resp = await cli.post( + "/v1/responses", + json={ + "model": "hermes-agent", + "input": "follow up", + "previous_response_id": "resp_prev2", + }, + ) + + assert resp.status == 200 + call_kwargs = mock_run.call_args.kwargs + assert len(call_kwargs["conversation_history"]) == 150 + + +# --------------------------------------------------------------------------- +# CORS +# --------------------------------------------------------------------------- + + +class TestCORS: + @pytest.mark.asyncio + async def test_cors_headers_on_get(self, adapter): + """CORS headers present on normal responses.""" + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.get("/health") + assert resp.status == 200 + assert resp.headers.get("Access-Control-Allow-Origin") == "*" + assert "POST" in resp.headers.get("Access-Control-Allow-Methods", "") + assert "DELETE" in resp.headers.get("Access-Control-Allow-Methods", "") + + @pytest.mark.asyncio + async def test_cors_options_preflight(self, adapter): + """OPTIONS preflight request returns CORS headers.""" + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + # OPTIONS to a known path — aiohttp will route through middleware + resp = await cli.options("/health") + assert resp.status == 200 + assert resp.headers.get("Access-Control-Allow-Origin") == "*" 
+ assert "Authorization" in resp.headers.get("Access-Control-Allow-Headers", "") + + +# --------------------------------------------------------------------------- +# Conversation parameter +# --------------------------------------------------------------------------- + + +class TestConversationParameter: + @pytest.mark.asyncio + async def test_conversation_creates_new(self, adapter): + """First request with a conversation name works (new conversation).""" + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = ( + {"final_response": "Hello!", "messages": [], "api_calls": 1}, + {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}, + ) + resp = await cli.post("/v1/responses", json={ + "input": "hi", + "conversation": "my-chat", + }) + assert resp.status == 200 + data = await resp.json() + assert data["status"] == "completed" + # Conversation mapping should be set + assert "my-chat" in adapter._conversations + + @pytest.mark.asyncio + async def test_conversation_chains_automatically(self, adapter): + """Second request with same conversation name chains to first.""" + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = ( + {"final_response": "First response", "messages": [], "api_calls": 1}, + {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}, + ) + # First request + resp1 = await cli.post("/v1/responses", json={ + "input": "hello", + "conversation": "test-conv", + }) + assert resp1.status == 200 + data1 = await resp1.json() + resp1_id = data1["id"] + + # Second request — should chain + mock_run.return_value = ( + {"final_response": "Second response", "messages": [], "api_calls": 1}, + {"input_tokens": 20, "output_tokens": 10, "total_tokens": 30}, + ) + resp2 = await cli.post("/v1/responses", 
json={ + "input": "follow up", + "conversation": "test-conv", + }) + assert resp2.status == 200 + + # The second call should have received conversation history from the first + assert mock_run.call_count == 2 + second_call_kwargs = mock_run.call_args_list[1] + history = second_call_kwargs.kwargs.get("conversation_history", + second_call_kwargs[1].get("conversation_history", []) if len(second_call_kwargs) > 1 else []) + # History should be non-empty (contains messages from first response) + assert len(history) > 0 + + @pytest.mark.asyncio + async def test_conversation_and_previous_response_id_conflict(self, adapter): + """Cannot use both conversation and previous_response_id.""" + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + resp = await cli.post("/v1/responses", json={ + "input": "hi", + "conversation": "my-chat", + "previous_response_id": "resp_abc123", + }) + assert resp.status == 400 + data = await resp.json() + assert "Cannot use both" in data["error"]["message"] + + @pytest.mark.asyncio + async def test_separate_conversations_are_isolated(self, adapter): + """Different conversation names have independent histories.""" + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = ( + {"final_response": "Response A", "messages": [], "api_calls": 1}, + {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}, + ) + # Conversation A + await cli.post("/v1/responses", json={"input": "conv-a msg", "conversation": "conv-a"}) + # Conversation B + mock_run.return_value = ( + {"final_response": "Response B", "messages": [], "api_calls": 1}, + {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}, + ) + await cli.post("/v1/responses", json={"input": "conv-b msg", "conversation": "conv-b"}) + + # They should have different response IDs in the mapping + assert adapter._conversations["conv-a"] != 
adapter._conversations["conv-b"] + + @pytest.mark.asyncio + async def test_conversation_store_false_no_mapping(self, adapter): + """If store=false, conversation mapping is not updated.""" + app = _create_app(adapter) + async with TestClient(TestServer(app)) as cli: + with patch.object(adapter, "_run_agent", new_callable=AsyncMock) as mock_run: + mock_run.return_value = ( + {"final_response": "Ephemeral", "messages": [], "api_calls": 1}, + {"input_tokens": 10, "output_tokens": 5, "total_tokens": 15}, + ) + resp = await cli.post("/v1/responses", json={ + "input": "hi", + "conversation": "ephemeral-chat", + "store": False, + }) + assert resp.status == 200 + # Conversation mapping should NOT be set since store=false + assert "ephemeral-chat" not in adapter._conversations diff --git a/tests/gateway/test_whatsapp_connect.py b/tests/gateway/test_whatsapp_connect.py index 3f6c5e49..37a1f950 100644 --- a/tests/gateway/test_whatsapp_connect.py +++ b/tests/gateway/test_whatsapp_connect.py @@ -51,6 +51,7 @@ def _make_adapter(): adapter._bridge_log_fh = None adapter._bridge_log = None adapter._bridge_process = None + adapter._reply_prefix = None adapter._running = False adapter._message_queue = asyncio.Queue() return adapter diff --git a/tests/gateway/test_whatsapp_reply_prefix.py b/tests/gateway/test_whatsapp_reply_prefix.py new file mode 100644 index 00000000..bf7a45c3 --- /dev/null +++ b/tests/gateway/test_whatsapp_reply_prefix.py @@ -0,0 +1,121 @@ +"""Tests for WhatsApp reply_prefix config.yaml support. 
+ +Covers: +- config.yaml whatsapp.reply_prefix bridging into PlatformConfig.extra +- WhatsAppAdapter reading reply_prefix from config.extra +- Bridge subprocess receiving WHATSAPP_REPLY_PREFIX env var +- Config version covers all ENV_VARS_BY_VERSION keys (regression guard) +""" + +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from gateway.config import GatewayConfig, Platform, PlatformConfig + + +# --------------------------------------------------------------------------- +# Config bridging from config.yaml +# --------------------------------------------------------------------------- + + +class TestConfigYamlBridging: + """Test that whatsapp.reply_prefix in config.yaml flows into PlatformConfig.""" + + def test_reply_prefix_bridged_from_yaml(self, tmp_path): + """whatsapp.reply_prefix in config.yaml sets PlatformConfig.extra.""" + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text('whatsapp:\n reply_prefix: "Custom Bot"\n') + + with patch("gateway.config.get_hermes_home", return_value=tmp_path): + from gateway.config import load_gateway_config + # Need to also patch WHATSAPP_ENABLED so the platform exists + with patch.dict("os.environ", {"WHATSAPP_ENABLED": "true"}, clear=False): + config = load_gateway_config() + + wa_config = config.platforms.get(Platform.WHATSAPP) + assert wa_config is not None + assert wa_config.extra.get("reply_prefix") == "Custom Bot" + + def test_empty_reply_prefix_bridged(self, tmp_path): + """Empty string reply_prefix disables the header.""" + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text('whatsapp:\n reply_prefix: ""\n') + + with patch("gateway.config.get_hermes_home", return_value=tmp_path): + from gateway.config import load_gateway_config + with patch.dict("os.environ", {"WHATSAPP_ENABLED": "true"}, clear=False): + config = load_gateway_config() + + wa_config = config.platforms.get(Platform.WHATSAPP) + assert wa_config is not None + assert 
wa_config.extra.get("reply_prefix") == "" + + def test_no_whatsapp_section_no_extra(self, tmp_path): + """Without whatsapp section, no reply_prefix is set.""" + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text("timezone: UTC\n") + + with patch("gateway.config.get_hermes_home", return_value=tmp_path): + from gateway.config import load_gateway_config + with patch.dict("os.environ", {"WHATSAPP_ENABLED": "true"}, clear=False): + config = load_gateway_config() + + wa_config = config.platforms.get(Platform.WHATSAPP) + assert wa_config is not None + assert "reply_prefix" not in wa_config.extra + + def test_whatsapp_section_without_reply_prefix(self, tmp_path): + """whatsapp section present but without reply_prefix key.""" + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text("whatsapp:\n other_setting: true\n") + + with patch("gateway.config.get_hermes_home", return_value=tmp_path): + from gateway.config import load_gateway_config + with patch.dict("os.environ", {"WHATSAPP_ENABLED": "true"}, clear=False): + config = load_gateway_config() + + wa_config = config.platforms.get(Platform.WHATSAPP) + assert "reply_prefix" not in wa_config.extra + + +# --------------------------------------------------------------------------- +# WhatsAppAdapter __init__ +# --------------------------------------------------------------------------- + + +class TestAdapterInit: + """Test that WhatsAppAdapter reads reply_prefix from config.extra.""" + + def test_reply_prefix_from_extra(self): + from gateway.platforms.whatsapp import WhatsAppAdapter + config = PlatformConfig(enabled=True, extra={"reply_prefix": "Bot\\n"}) + adapter = WhatsAppAdapter(config) + assert adapter._reply_prefix == "Bot\\n" + + def test_reply_prefix_default_none(self): + from gateway.platforms.whatsapp import WhatsAppAdapter + config = PlatformConfig(enabled=True) + adapter = WhatsAppAdapter(config) + assert adapter._reply_prefix is None + + def test_reply_prefix_empty_string(self): + from 
gateway.platforms.whatsapp import WhatsAppAdapter + config = PlatformConfig(enabled=True, extra={"reply_prefix": ""}) + adapter = WhatsAppAdapter(config) + assert adapter._reply_prefix == "" + + +# --------------------------------------------------------------------------- +# Config version regression guard +# --------------------------------------------------------------------------- + + +class TestConfigVersionCoverage: + """Ensure _config_version covers all ENV_VARS_BY_VERSION keys.""" + + def test_default_config_version_covers_env_var_versions(self): + """_config_version must be >= the highest ENV_VARS_BY_VERSION key.""" + from hermes_cli.config import DEFAULT_CONFIG, ENV_VARS_BY_VERSION + assert DEFAULT_CONFIG["_config_version"] >= max(ENV_VARS_BY_VERSION) diff --git a/website/docs/reference/environment-variables.md b/website/docs/reference/environment-variables.md index b4d6c3f4..c7ddfd1f 100644 --- a/website/docs/reference/environment-variables.md +++ b/website/docs/reference/environment-variables.md @@ -192,6 +192,10 @@ For native Anthropic auth, Hermes prefers Claude Code's own credential files whe | `MATRIX_ENCRYPTION` | Enable end-to-end encryption (`true`/`false`, default: `false`) | | `HASS_TOKEN` | Home Assistant Long-Lived Access Token (enables HA platform + tools) | | `HASS_URL` | Home Assistant URL (default: `http://homeassistant.local:8123`) | +| `API_SERVER_ENABLED` | Enable the OpenAI-compatible API server (`true`/`false`). Runs alongside other platforms. | +| `API_SERVER_KEY` | Bearer token for API server authentication. If empty, all requests are allowed (local-only use). | +| `API_SERVER_PORT` | Port for the API server (default: `8642`) | +| `API_SERVER_HOST` | Host/bind address for the API server (default: `127.0.0.1`). Use `0.0.0.0` for network access — set `API_SERVER_KEY` for security. 
| | `MESSAGING_CWD` | Working directory for terminal commands in messaging mode (default: `~`) | | `GATEWAY_ALLOWED_USERS` | Comma-separated user IDs allowed across all platforms | | `GATEWAY_ALLOW_ALL_USERS` | Allow all users without allowlists (`true`/`false`, default: `false`) | diff --git a/website/docs/user-guide/features/api-server.md b/website/docs/user-guide/features/api-server.md new file mode 100644 index 00000000..cf488bb9 --- /dev/null +++ b/website/docs/user-guide/features/api-server.md @@ -0,0 +1,223 @@ +--- +sidebar_position: 14 +title: "API Server" +description: "Expose hermes-agent as an OpenAI-compatible API for any frontend" +--- + +# API Server + +The API server exposes hermes-agent as an OpenAI-compatible HTTP endpoint. Any frontend that speaks the OpenAI format — Open WebUI, LobeChat, LibreChat, NextChat, ChatBox, and hundreds more — can connect to hermes-agent and use it as a backend. + +Your agent handles requests with its full toolset (terminal, file operations, web search, memory, skills) and returns the final response. Tool calls execute invisibly server-side. + +## Quick Start + +### 1. Enable the API server + +Add to `~/.hermes/.env`: + +```bash +API_SERVER_ENABLED=true +``` + +### 2. Start the gateway + +```bash +hermes gateway +``` + +You'll see: + +``` +[API Server] API server listening on http://127.0.0.1:8642 +``` + +### 3. Connect a frontend + +Point any OpenAI-compatible client at `http://localhost:8642/v1`: + +```bash +# Test with curl +curl http://localhost:8642/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model": "hermes-agent", "messages": [{"role": "user", "content": "Hello!"}]}' +``` + +Or connect Open WebUI, LobeChat, or any other frontend — see the [Open WebUI integration guide](/docs/user-guide/messaging/open-webui) for step-by-step instructions. + +## Endpoints + +### POST /v1/chat/completions + +Standard OpenAI Chat Completions format. 
Stateless — the full conversation is included in each request via the `messages` array. + +**Request:** +```json +{ + "model": "hermes-agent", + "messages": [ + {"role": "system", "content": "You are a Python expert."}, + {"role": "user", "content": "Write a fibonacci function"} + ], + "stream": false +} +``` + +**Response:** +```json +{ + "id": "chatcmpl-abc123", + "object": "chat.completion", + "created": 1710000000, + "model": "hermes-agent", + "choices": [{ + "index": 0, + "message": {"role": "assistant", "content": "Here's a fibonacci function..."}, + "finish_reason": "stop" + }], + "usage": {"prompt_tokens": 50, "completion_tokens": 200, "total_tokens": 250} +} +``` + +**Streaming** (`"stream": true`): Returns Server-Sent Events (SSE) with token-by-token response chunks. When streaming is enabled in config, tokens are emitted live as the LLM generates them. When disabled, the full response is sent as a single SSE chunk. + +### POST /v1/responses + +OpenAI Responses API format. Supports server-side conversation state via `previous_response_id` — the server stores full conversation history (including tool calls and results) so multi-turn context is preserved without the client managing it. 
+ +**Request:** +```json +{ + "model": "hermes-agent", + "input": "What files are in my project?", + "instructions": "You are a helpful coding assistant.", + "store": true +} +``` + +**Response:** +```json +{ + "id": "resp_abc123", + "object": "response", + "status": "completed", + "model": "hermes-agent", + "output": [ + {"type": "function_call", "name": "terminal", "arguments": "{\"command\": \"ls\"}", "call_id": "call_1"}, + {"type": "function_call_output", "call_id": "call_1", "output": "README.md src/ tests/"}, + {"type": "message", "role": "assistant", "content": [{"type": "output_text", "text": "Your project has..."}]} + ], + "usage": {"input_tokens": 50, "output_tokens": 200, "total_tokens": 250} +} +``` + +#### Multi-turn with previous_response_id + +Chain responses to maintain full context (including tool calls) across turns: + +```json +{ + "input": "Now show me the README", + "previous_response_id": "resp_abc123" +} +``` + +The server reconstructs the full conversation from the stored response chain — all previous tool calls and results are preserved. + +#### Named conversations + +Use the `conversation` parameter instead of tracking response IDs: + +```json +{"input": "Hello", "conversation": "my-project"} +{"input": "What's in src/?", "conversation": "my-project"} +{"input": "Run the tests", "conversation": "my-project"} +``` + +The server automatically chains to the latest response in that conversation. Like the `/title` command for gateway sessions. + +### GET /v1/responses/{id} + +Retrieve a previously stored response by ID. + +### DELETE /v1/responses/{id} + +Delete a stored response. + +### GET /v1/models + +Lists `hermes-agent` as an available model. Required by most frontends for model discovery. + +### GET /health + +Health check. Returns `{"status": "ok"}`. 
+ +## System Prompt Handling + +When a frontend sends a `system` message (Chat Completions) or `instructions` field (Responses API), hermes-agent **layers it on top** of its core system prompt. Your agent keeps all its tools, memory, and skills — the frontend's system prompt adds extra instructions. + +This means you can customize behavior per-frontend without losing capabilities: +- Open WebUI system prompt: "You are a Python expert. Always include type hints." +- The agent still has terminal, file tools, web search, memory, etc. + +## Authentication + +Bearer token auth via the `Authorization` header: + +``` +Authorization: Bearer *** +``` + +Configure the key via `API_SERVER_KEY` env var. If no key is set, all requests are allowed (for local-only use). + +:::warning Security +The API server gives full access to hermes-agent's toolset, **including terminal commands**. If you change the bind address to `0.0.0.0` (network-accessible), **always set `API_SERVER_KEY`** — without it, anyone on your network can execute arbitrary commands on your machine. + +The default bind address (`127.0.0.1`) is safe for local-only use. +::: + +## Configuration + +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `API_SERVER_ENABLED` | `false` | Enable the API server | +| `API_SERVER_PORT` | `8642` | HTTP server port | +| `API_SERVER_HOST` | `127.0.0.1` | Bind address (localhost only by default) | +| `API_SERVER_KEY` | _(none)_ | Bearer token for auth | + +### config.yaml + +```yaml +# Not yet supported — use environment variables. +# config.yaml support coming in a future release. +``` + +## CORS + +The API server includes CORS headers on all responses (`Access-Control-Allow-Origin: *`), so browser-based frontends can connect directly. + +## Compatible Frontends + +Any frontend that supports the OpenAI API format works. 
Tested/documented integrations: + +| Frontend | Stars | Connection | +|----------|-------|------------| +| [Open WebUI](/docs/user-guide/messaging/open-webui) | 126k | Full guide available | +| LobeChat | 73k | Custom provider endpoint | +| LibreChat | 34k | Custom endpoint in librechat.yaml | +| AnythingLLM | 56k | Generic OpenAI provider | +| NextChat | 87k | BASE_URL env var | +| ChatBox | 39k | API Host setting | +| Jan | 26k | Remote model config | +| HF Chat-UI | 8k | OPENAI_BASE_URL | +| big-AGI | 7k | Custom endpoint | +| OpenAI Python SDK | — | `OpenAI(base_url="http://localhost:8642/v1")` | +| curl | — | Direct HTTP requests | + +## Limitations + +- **Response storage is in-memory** — stored responses (for `previous_response_id`) are lost on gateway restart. Max 100 stored responses (LRU eviction). +- **No file upload** — vision/document analysis via uploaded files is not yet supported through the API. +- **Model field is cosmetic** — the `model` field in requests is accepted but the actual LLM model used is configured server-side in config.yaml. diff --git a/website/docs/user-guide/messaging/index.md b/website/docs/user-guide/messaging/index.md index c969b451..227694da 100644 --- a/website/docs/user-guide/messaging/index.md +++ b/website/docs/user-guide/messaging/index.md @@ -1,7 +1,7 @@ --- sidebar_position: 1 title: "Messaging Gateway" -description: "Chat with Hermes from Telegram, Discord, Slack, WhatsApp, Signal, SMS, Email, Home Assistant, Mattermost, Matrix, DingTalk, or your browser — architecture and setup overview" +description: "Chat with Hermes from Telegram, Discord, Slack, WhatsApp, Signal, SMS, Email, Home Assistant, Mattermost, Matrix, DingTalk, or any OpenAI-compatible frontend via the API server — architecture and setup overview" --- # Messaging Gateway @@ -27,6 +27,7 @@ flowchart TB mm[Mattermost] mx[Matrix] dt[DingTalk] + api["API Server
(OpenAI-compatible)"] end store["Session store
per chat"] @@ -45,6 +46,7 @@ flowchart TB mm --> store mx --> store dt --> store + api --> store store --> agent cron --> store ``` @@ -306,6 +308,7 @@ Each platform has its own toolset: | Mattermost | `hermes-mattermost` | Full tools including terminal | | Matrix | `hermes-matrix` | Full tools including terminal | | DingTalk | `hermes-dingtalk` | Full tools including terminal | +| API Server | `hermes` (default) | Full tools including terminal | ## Next Steps @@ -320,3 +323,4 @@ Each platform has its own toolset: - [Mattermost Setup](mattermost.md) - [Matrix Setup](matrix.md) - [DingTalk Setup](dingtalk.md) +- [Open WebUI + API Server](open-webui.md) diff --git a/website/docs/user-guide/messaging/open-webui.md b/website/docs/user-guide/messaging/open-webui.md new file mode 100644 index 00000000..7f90941e --- /dev/null +++ b/website/docs/user-guide/messaging/open-webui.md @@ -0,0 +1,213 @@ +--- +sidebar_position: 8 +title: "Open WebUI" +description: "Connect Open WebUI to Hermes Agent via the OpenAI-compatible API server" +--- + +# Open WebUI Integration + +[Open WebUI](https://github.com/open-webui/open-webui) (126k★) is the most popular self-hosted chat interface for AI. With Hermes Agent's built-in API server, you can use Open WebUI as a polished web frontend for your agent — complete with conversation management, user accounts, and a modern chat interface. + +## Architecture + +``` +┌──────────────────┐ POST /v1/chat/completions ┌──────────────────────┐ +│ Open WebUI │ ──────────────────────────────► │ hermes-agent │ +│ (browser UI) │ SSE streaming response │ gateway API server │ +│ port 3000 │ ◄────────────────────────────── │ port 8642 │ +└──────────────────┘ └──────────────────────┘ +``` + +Open WebUI connects to Hermes Agent's API server just like it would connect to OpenAI. Your agent handles the requests with its full toolset — terminal, file operations, web search, memory, skills — and returns the final response. + +## Quick Setup + +### 1. 
Enable the API server + +Add to `~/.hermes/.env`: + +```bash +API_SERVER_ENABLED=true +# Optional: set a key for auth (recommended if accessible beyond localhost) +# API_SERVER_KEY=your-secret-key +``` + +### 2. Start Hermes Agent gateway + +```bash +hermes gateway +``` + +You should see: + +``` +[API Server] API server listening on http://127.0.0.1:8642 +``` + +### 3. Start Open WebUI + +```bash +docker run -d -p 3000:8080 \ + -e OPENAI_API_BASE_URL=http://host.docker.internal:8642/v1 \ + -e OPENAI_API_KEY=not-needed \ + --add-host=host.docker.internal:host-gateway \ + -v open-webui:/app/backend/data \ + --name open-webui \ + --restart always \ + ghcr.io/open-webui/open-webui:main +``` + +If you set an `API_SERVER_KEY`, use it instead of `not-needed`: + +```bash +-e OPENAI_API_KEY=your-secret-key +``` + +### 4. Open the UI + +Go to **http://localhost:3000**. Create your admin account (the first user becomes admin). You should see **hermes-agent** in the model dropdown. Start chatting! + +## Docker Compose Setup + +For a more permanent setup, create a `docker-compose.yml`: + +```yaml +services: + open-webui: + image: ghcr.io/open-webui/open-webui:main + ports: + - "3000:8080" + volumes: + - open-webui:/app/backend/data + environment: + - OPENAI_API_BASE_URL=http://host.docker.internal:8642/v1 + - OPENAI_API_KEY=not-needed + extra_hosts: + - "host.docker.internal:host-gateway" + restart: always + +volumes: + open-webui: +``` + +Then: + +```bash +docker compose up -d +``` + +## Configuring via the Admin UI + +If you prefer to configure the connection through the UI instead of environment variables: + +1. Log in to Open WebUI at **http://localhost:3000** +2. Click your **profile avatar** → **Admin Settings** +3. Go to **Connections** +4. Under **OpenAI API**, click the **wrench icon** (Manage) +5. Click **+ Add New Connection** +6. Enter: + - **URL**: `http://host.docker.internal:8642/v1` + - **API Key**: your key or any non-empty value (e.g., `not-needed`) +7. 
Click the **checkmark** to verify the connection +8. **Save** + +The **hermes-agent** model should now appear in the model dropdown. + +:::warning +Environment variables only take effect on Open WebUI's **first launch**. After that, connection settings are stored in its internal database. To change them later, use the Admin UI or delete the Docker volume and start fresh. +::: + +## API Type: Chat Completions vs Responses + +Open WebUI supports two API modes when connecting to a backend: + +| Mode | Format | When to use | +|------|--------|-------------| +| **Chat Completions** (default) | `/v1/chat/completions` | Recommended. Works out of the box. | +| **Responses** (experimental) | `/v1/responses` | For server-side conversation state via `previous_response_id`. | + +### Using Chat Completions (recommended) + +This is the default and requires no extra configuration. Open WebUI sends standard OpenAI-format requests and Hermes Agent responds accordingly. Each request includes the full conversation history. + +### Using Responses API + +To use the Responses API mode: + +1. Go to **Admin Settings** → **Connections** → **OpenAI** → **Manage** +2. Edit your hermes-agent connection +3. Change **API Type** from "Chat Completions" to **"Responses (Experimental)"** +4. Save + +With the Responses API, Open WebUI sends requests in the Responses format (`input` array + `instructions`), and Hermes Agent can preserve full tool call history across turns via `previous_response_id`. + +:::note +Open WebUI currently manages conversation history client-side even in Responses mode — it sends the full message history in each request rather than using `previous_response_id`. The Responses API mode is mainly useful for future compatibility as frontends evolve. +::: + +## How It Works + +When you send a message in Open WebUI: + +1. Open WebUI sends a `POST /v1/chat/completions` request with your message and conversation history +2. 
Hermes Agent creates an AIAgent instance with its full toolset +3. The agent processes your request — it may call tools (terminal, file operations, web search, etc.) +4. Tool calls happen invisibly server-side +5. The agent's final text response is returned to Open WebUI +6. Open WebUI displays the response in its chat interface + +Your agent has access to all the same tools and capabilities as when using the CLI or Telegram — the only difference is the frontend. + +## Configuration Reference + +### Hermes Agent (API server) + +| Variable | Default | Description | +|----------|---------|-------------| +| `API_SERVER_ENABLED` | `false` | Enable the API server | +| `API_SERVER_PORT` | `8642` | HTTP server port | +| `API_SERVER_HOST` | `127.0.0.1` | Bind address | +| `API_SERVER_KEY` | _(none)_ | Bearer token for auth. No key = allow all. | + +### Open WebUI + +| Variable | Description | +|----------|-------------| +| `OPENAI_API_BASE_URL` | Hermes Agent's API URL (include `/v1`) | +| `OPENAI_API_KEY` | Must be non-empty. Match your `API_SERVER_KEY`. | + +## Troubleshooting + +### No models appear in the dropdown + +- **Check the URL has `/v1` suffix**: `http://host.docker.internal:8642/v1` (not just `:8642`) +- **Verify the gateway is running**: `curl http://localhost:8642/health` should return `{"status": "ok"}` +- **Check model listing**: `curl http://localhost:8642/v1/models` should return a list with `hermes-agent` +- **Docker networking**: From inside Docker, `localhost` means the container, not your host. Use `host.docker.internal` or `--network=host`. + +### Connection test passes but no models load + +This is almost always the missing `/v1` suffix. Open WebUI's connection test is a basic connectivity check — it doesn't verify model listing works. + +### Response takes a long time + +Hermes Agent may be executing multiple tool calls (reading files, running commands, searching the web) before producing its final response. This is normal for complex queries. 
Because responses stream over SSE, nothing is shown while tools run — text begins streaming once the agent starts writing its final answer.

### "Invalid API key" errors

Make sure your `OPENAI_API_KEY` in Open WebUI matches the `API_SERVER_KEY` in Hermes Agent. If no key is configured on the Hermes side, any non-empty value works.

## Linux Docker (no Docker Desktop)

On Linux without Docker Desktop, `host.docker.internal` doesn't resolve by default. Options:

```bash
# Option 1: Add host mapping
docker run --add-host=host.docker.internal:host-gateway ...

# Option 2: Use host networking
docker run --network=host -e OPENAI_API_BASE_URL=http://localhost:8642/v1 ...

# Option 3: Use Docker bridge IP
docker run -e OPENAI_API_BASE_URL=http://172.17.0.1:8642/v1 ...
```
diff --git a/website/docs/user-guide/messaging/whatsapp.md b/website/docs/user-guide/messaging/whatsapp.md
index eb741467..f754c9c2 100644
--- a/website/docs/user-guide/messaging/whatsapp.md
+++ b/website/docs/user-guide/messaging/whatsapp.md
@@ -140,7 +140,14 @@ Hermes supports voice on WhatsApp:
 
 - **Incoming:** Voice messages (`.ogg` opus) are automatically transcribed using the configured STT provider: local `faster-whisper`, Groq Whisper (`GROQ_API_KEY`), or OpenAI Whisper (`VOICE_TOOLS_OPENAI_KEY`)
 - **Outgoing:** TTS responses are sent as MP3 audio file attachments
-- Agent responses are prefixed with "⚕ **Hermes Agent**" for easy identification
+- Agent responses are prefixed with "⚕ **Hermes Agent**" by default.
You can customize or disable this in `config.yaml`: + +```yaml +# ~/.hermes/config.yaml +whatsapp: + reply_prefix: "" # Empty string disables the header + # reply_prefix: "🤖 *My Bot*\n──────\n" # Custom prefix (supports \n for newlines) +``` --- diff --git a/website/sidebars.ts b/website/sidebars.ts index 935cdaff..a25aa105 100644 --- a/website/sidebars.ts +++ b/website/sidebars.ts @@ -51,6 +51,7 @@ const sidebars: SidebarsConfig = { 'user-guide/messaging/mattermost', 'user-guide/messaging/matrix', 'user-guide/messaging/dingtalk', + 'user-guide/messaging/open-webui', ], }, { @@ -90,6 +91,7 @@ const sidebars: SidebarsConfig = { type: 'category', label: 'Integrations', items: [ + 'user-guide/features/api-server', 'user-guide/features/acp', 'user-guide/features/mcp', 'user-guide/features/honcho',