fix: tune rerank scoring for temporal recall (#1011)

- use temporal lane support during rerank
- prefer explicit correction facts when fused scores are close
- preserve top-1 improvements on the holographic prompt matrix benchmark
wip: implement multi-path holographic recall pipeline (#1011)
2026-04-22 11:10:41 -04:00 · 2026-04-22 11:07:33 -04:00 · 2026-04-22 10:57:49 -04:00 · 2026-04-22 13:55:16 +00:00 · 2026-04-22 13:54:32 +00:00 · 2026-04-22 13:54:30 +00:00
624 changed files with 88309 additions and 8423 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -5,6 +5,7 @@

 # Dependencies
 node_modules
+.venv

 # CI/CD
 .github
--- a/.env.example
+++ b/.env.example
@@ -43,6 +43,15 @@
 # KIMI_BASE_URL=https://api.kimi.com/coding/v1  # Default for sk-kimi- keys
 # KIMI_BASE_URL=https://api.moonshot.ai/v1      # For legacy Moonshot keys
 # KIMI_BASE_URL=https://api.moonshot.cn/v1       # For Moonshot China keys
+# KIMI_CN_API_KEY=                               # Dedicated Moonshot China key
+
+# =============================================================================
+# LLM PROVIDER (Arcee AI)
+# =============================================================================
+# Arcee AI provides access to Trinity models (trinity-mini, trinity-large-*)
+# Get an Arcee key at: https://chat.arcee.ai/
+# ARCEEAI_API_KEY=
+# ARCEE_BASE_URL=                                 # Override default base URL

 # =============================================================================
 # LLM PROVIDER (MiniMax)
@@ -89,6 +98,15 @@
 # Optional base URL override:
 # HERMES_QWEN_BASE_URL=https://portal.qwen.ai/v1

+# =============================================================================
+# LLM PROVIDER (Xiaomi MiMo)
+# =============================================================================
+# Xiaomi MiMo models (mimo-v2-pro, mimo-v2-omni, mimo-v2-flash).
+# Get your key at: https://platform.xiaomimimo.com
+# XIAOMI_API_KEY=your_key_here
+# Optional base URL override:
+# XIAOMI_BASE_URL=https://api.xiaomimimo.com/v1
+
 # =============================================================================
 # TOOL API KEYS
 # =============================================================================
@@ -127,6 +145,10 @@
 # Only override here if you need to force a backend without touching config.yaml:
 # TERMINAL_ENV=local

+# Override the container runtime binary (e.g. to use Podman instead of Docker).
+# Useful on systems where Docker's storage driver is broken or unavailable.
+# HERMES_DOCKER_BINARY=/usr/local/bin/podman
+
 # Container images (for singularity/docker/modal backends)
 # TERMINAL_DOCKER_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
 # TERMINAL_SINGULARITY_IMAGE=docker://nikolaik/python-nodejs:python3.11-nodejs20
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,2 @@
+# Auto-generated files — collapse diffs and exclude from language stats
+web/package-lock.json linguist-generated=true
--- a/.gitea/workflows/lint.yml
+++ b/.gitea/workflows/lint.yml
@@ -0,0 +1,28 @@
+name: Lint
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Check for hardcoded paths
+        run: python3 scripts/lint_hardcoded_paths.py
+        continue-on-error: true
+
+      - name: Check Python syntax
+        run: |
+          find . -name "*.py" -not -path "./.git/*" -not -path "./node_modules/*" | head -100 | xargs python3 -m py_compile || true
--- a/.githooks/pre-commit-hardcoded-path.py
+++ b/.githooks/pre-commit-hardcoded-path.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+"""
+Pre-commit hook: Reject hardcoded home-directory paths.
+
+Install:
+    cp .githooks/pre-commit-hardcoded-path.py .git/hooks/pre-commit
+    chmod +x .git/hooks/pre-commit
+    
+    Or add to .pre-commit-config.yaml
+"""
+
+import sys
+import subprocess
+import re
+
+PATTERNS = [
+    (r"/Users/[\w.\-]+/", "macOS home directory"),
+    (r"/home/[\w.\-]+/", "Linux home directory"),
+    (r"(?<![\w/])~/", "unexpanded tilde"),
+]
+
+NOQA = re.compile(r"#\s*noqa:?\s*hardcoded-path-ok")
+
+def get_staged_files():
+    result = subprocess.run(
+        ["git", "diff", "--cached", "--name-only", "--diff-filter=ACM"],
+        capture_output=True, text=True
+    )
+    return [f for f in result.stdout.strip().split("\n") if f.endswith(".py")]
+
+def check_file(filepath):
+    try:
+        result = subprocess.run(
+            ["git", "show", f":{filepath}"],
+            capture_output=True, text=True
+        )
+        content = result.stdout
+    except Exception:
+        return []
+    
+    violations = []
+    for i, line in enumerate(content.split("\n"), 1):
+        if line.strip().startswith("#"):
+            continue
+        if line.strip().startswith(("import ", "from ")):
+            continue
+        if NOQA.search(line):
+            continue
+        for pattern, desc in PATTERNS:
+            if re.search(pattern, line):
+                violations.append((filepath, i, line.strip(), desc))
+                break
+    return violations
+
+def main():
+    files = get_staged_files()
+    if not files:
+        sys.exit(0)
+    
+    all_violations = []
+    for f in files:
+        all_violations.extend(check_file(f))
+    
+    if all_violations:
+        print("ERROR: Hardcoded home directory paths detected:")
+        print()
+        for filepath, line_no, line, desc in all_violations:
+            print(f"  {filepath}:{line_no}: {desc}")
+            print(f"    {line[:100]}")
+            print()
+        print("Fix: Use $HOME, relative paths, or get_hermes_home().")
+        print("Override: Add '# noqa: hardcoded-path-ok' to the line.")
+        sys.exit(1)
+    
+    sys.exit(0)
+
+if __name__ == "__main__":
+    main()
--- a/.github/ISSUE_TEMPLATE/bug_report.yml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -11,6 +11,7 @@ body:
        **Before submitting**, please:
        - [ ] Search [existing issues](https://github.com/NousResearch/hermes-agent/issues) to avoid duplicates
        - [ ] Update to the latest version (`hermes update`) and confirm the bug still exists
+        - [ ] Run `hermes debug share` and paste the links below (see Debug Report section)

  - type: textarea
    id: description
@@ -82,6 +83,25 @@ body:
        - Slack
        - WhatsApp

+  - type: textarea
+    id: debug-report
+    attributes:
+      label: Debug Report
+      description: |
+        Run `hermes debug share` from your terminal and paste the links it prints here.
+        This uploads your system info, config, and recent logs to a paste service automatically.
+
+        If you're in an interactive chat session, you can also use the `/debug` slash command — it does the same thing.
+
+        If the upload fails, run `hermes debug share --local` and paste the output directly.
+      placeholder: |
+        Report   https://paste.rs/abc123
+        agent.log   https://paste.rs/def456
+        gateway.log   https://paste.rs/ghi789
+      render: shell
+    validations:
+      required: true
+
  - type: input
    id: os
    attributes:
@@ -97,8 +117,6 @@ body:
      label: Python Version
      description: Output of `python --version`
      placeholder: "3.11.9"
-    validations:
-      required: true

  - type: input
    id: hermes-version
@@ -106,14 +124,14 @@ body:
      label: Hermes Version
      description: Output of `hermes version`
      placeholder: "2.1.0"
-    validations:
-      required: true

  - type: textarea
    id: logs
    attributes:
-      label: Relevant Logs / Traceback
-      description: Paste any error output, traceback, or log messages. This will be auto-formatted as code.
+      label: Additional Logs / Traceback (optional)
+      description: |
+        The debug report above covers most logs. Use this field for any extra error output, 
+        tracebacks, or screenshots not captured by `hermes debug share`.
      render: shell

  - type: textarea
--- a/.github/ISSUE_TEMPLATE/feature_request.yml
+++ b/.github/ISSUE_TEMPLATE/feature_request.yml
@@ -71,3 +71,15 @@ body:
      label: Contribution
      options:
        - label: I'd like to implement this myself and submit a PR
+
+  - type: textarea
+    id: debug-report
+    attributes:
+      label: Debug Report (optional)
+      description: |
+        If this feature request is related to a problem you're experiencing, run `hermes debug share` and paste the links here.
+        In an interactive chat session, you can use `/debug` instead.
+        This helps us understand your environment and any related logs.
+      placeholder: |
+        Report   https://paste.rs/abc123
+      render: shell
--- a/.github/ISSUE_TEMPLATE/setup_help.yml
+++ b/.github/ISSUE_TEMPLATE/setup_help.yml
@@ -9,7 +9,8 @@ body:
        Sorry you're having trouble! Please fill out the details below so we can help.

        **Quick checks first:**
-        - Run `hermes doctor` and include the output below
+        - Run `hermes debug share` and paste the links in the Debug Report section below
+        - If you're in a chat session, you can use `/debug` instead — it does the same thing
        - Try `hermes update` to get the latest version
        - Check the [README troubleshooting section](https://github.com/NousResearch/hermes-agent#troubleshooting)
        - For general questions, consider the [Nous Research Discord](https://discord.gg/NousResearch) for faster help
@@ -74,10 +75,21 @@ body:
      placeholder: "2.1.0"

  - type: textarea
-    id: doctor-output
+    id: debug-report
    attributes:
-      label: Output of `hermes doctor`
-      description: Run `hermes doctor` and paste the full output. This will be auto-formatted.
+      label: Debug Report
+      description: |
+        Run `hermes debug share` from your terminal and paste the links it prints here.
+        This uploads your system info, config, and recent logs to a paste service automatically.
+
+        If you're in an interactive chat session, you can also use the `/debug` slash command — it does the same thing.
+
+        If the upload fails or the install didn't get that far, run `hermes debug share --local` and paste the output directly.
+        If even that doesn't work, run `hermes doctor` and paste that output instead.
+      placeholder: |
+        Report   https://paste.rs/abc123
+        agent.log   https://paste.rs/def456
+        gateway.log   https://paste.rs/ghi789
      render: shell

  - type: textarea
--- a/.github/workflows/contributor-check.yml
+++ b/.github/workflows/contributor-check.yml
@@ -0,0 +1,73 @@
+name: Contributor Attribution Check
+
+on:
+  pull_request:
+    branches: [main]
+    paths:
+      # Only run when code files change (not docs-only PRs)
+      - '*.py'
+      - '**/*.py'
+      - '.github/workflows/contributor-check.yml'
+
+permissions:
+  contents: read
+
+jobs:
+  check-attribution:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
+        with:
+          fetch-depth: 0  # Full history needed for git log
+
+      - name: Check for unmapped contributor emails
+        run: |
+          # Get the merge base between this PR and main
+          MERGE_BASE=$(git merge-base origin/main HEAD)
+
+          # Find any new author emails in this PR's commits
+          NEW_EMAILS=$(git log ${MERGE_BASE}..HEAD --format='%ae' --no-merges | sort -u)
+
+          if [ -z "$NEW_EMAILS" ]; then
+            echo "No new commits to check."
+            exit 0
+          fi
+
+          # Check each email against AUTHOR_MAP in release.py
+          MISSING=""
+          while IFS= read -r email; do
+            # Skip teknium, bot, and anthropic.com / cursor.com author emails
+            case "$email" in
+              *teknium*|*noreply@github.com*|*dependabot*|*github-actions*|*anthropic.com*|*cursor.com*)
+                continue ;;
+            esac
+
+            # Check if email is in AUTHOR_MAP (either as a key or matches noreply pattern)
+            if echo "$email" | grep -qP '\+.*@users\.noreply\.github\.com'; then
+              continue  # GitHub noreply emails auto-resolve
+            fi
+
+            if ! grep -qF "\"${email}\"" scripts/release.py 2>/dev/null; then
+              AUTHOR=$(git log --author="$email" --format='%an' -1)
+              MISSING="${MISSING}\n  ${email} (${AUTHOR})"
+            fi
+          done <<< "$NEW_EMAILS"
+
+          if [ -n "$MISSING" ]; then
+            echo ""
+            echo "⚠️  New contributor email(s) not in AUTHOR_MAP:"
+            echo -e "$MISSING"
+            echo ""
+            echo "Please add mappings to scripts/release.py AUTHOR_MAP:"
+            echo -e "$MISSING" | while read -r line; do
+              email=$(echo "$line" | sed 's/^ *//' | cut -d' ' -f1)
+              [ -z "$email" ] && continue
+              echo "    \"${email}\": \"<github-username>\","
+            done
+            echo ""
+            echo "To find the GitHub username for an email:"
+            echo "  gh api 'search/users?q=EMAIL+in:email' --jq '.items[0].login'"
+            exit 1
+          else
+            echo "✅ All contributor emails are mapped in AUTHOR_MAP."
+          fi
--- a/.github/workflows/deploy-site.yml
+++ b/.github/workflows/deploy-site.yml
@@ -28,24 +28,32 @@ jobs:
      name: github-pages
      url: ${{ steps.deploy.outputs.page_url }}
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4

-      - uses: actions/setup-node@v4
+      - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
        with:
          node-version: 20
          cache: npm
          cache-dependency-path: website/package-lock.json

-      - uses: actions/setup-python@v5
+      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5
        with:
          python-version: '3.11'

      - name: Install PyYAML for skill extraction
-        run: pip install pyyaml
+        run: pip install pyyaml==6.0.2 httpx==0.28.1

      - name: Extract skill metadata for dashboard
        run: python3 website/scripts/extract-skills.py

+      - name: Build skills index (if not already present)
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          if [ ! -f website/static/api/skills-index.json ]; then
+            python3 scripts/build_skills_index.py || echo "Skills index build failed (non-fatal)"
+          fi
+
      - name: Install dependencies
        run: npm ci
        working-directory: website
@@ -65,10 +73,10 @@ jobs:
          echo "hermes-agent.nousresearch.com" > _site/CNAME

      - name: Upload artifact
-        uses: actions/upload-pages-artifact@v3
+        uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa  # v3
        with:
          path: _site

      - name: Deploy to GitHub Pages
        id: deploy
-        uses: actions/deploy-pages@v4
+        uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e  # v4
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@@ -23,21 +23,21 @@ jobs:
    timeout-minutes: 60
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
        with:
          submodules: recursive

      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
+        uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130  # v3

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f  # v3

      # Build amd64 only so we can `load` the image for smoke testing.
      # `load: true` cannot export a multi-arch manifest to the local daemon.
      # The multi-arch build follows on push to main / release.
      - name: Build image (amd64, smoke test)
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8  # v6
        with:
          context: .
          file: Dockerfile
@@ -56,36 +56,31 @@ jobs:

      - name: Log in to Docker Hub
        if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
-        uses: docker/login-action@v3
+        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9  # v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Push multi-arch image (main branch)
        if: github.event_name == 'push' && github.ref == 'refs/heads/main'
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8  # v6
        with:
          context: .
          file: Dockerfile
          push: true
          platforms: linux/amd64,linux/arm64
-          tags: |
-            nousresearch/hermes-agent:latest
-            nousresearch/hermes-agent:${{ github.sha }}
+          tags: nousresearch/hermes-agent:latest
          cache-from: type=gha
          cache-to: type=gha,mode=max

      - name: Push multi-arch image (release)
        if: github.event_name == 'release'
-        uses: docker/build-push-action@v6
+        uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8  # v6
        with:
          context: .
          file: Dockerfile
          push: true
          platforms: linux/amd64,linux/arm64
-          tags: |
-            nousresearch/hermes-agent:latest
-            nousresearch/hermes-agent:${{ github.event.release.tag_name }}
-            nousresearch/hermes-agent:${{ github.sha }}
+          tags: nousresearch/hermes-agent:${{ github.event.release.tag_name }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
--- a/.github/workflows/docs-site-checks.yml
+++ b/.github/workflows/docs-site-checks.yml
@@ -7,13 +7,16 @@ on:
      - '.github/workflows/docs-site-checks.yml'
  workflow_dispatch:

+permissions:
+  contents: read
+
 jobs:
  docs-site-checks:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4

-      - uses: actions/setup-node@v4
+      - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
        with:
          node-version: 20
          cache: npm
@@ -23,7 +26,7 @@ jobs:
        run: npm ci
        working-directory: website

-      - uses: actions/setup-python@v5
+      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5
        with:
          python-version: '3.11'

--- a/.github/workflows/nix.yml
+++ b/.github/workflows/nix.yml
@@ -14,6 +14,9 @@ on:
      - 'run_agent.py'
      - 'acp_adapter/**'

+permissions:
+  contents: read
+
 concurrency:
  group: nix-${{ github.ref }}
  cancel-in-progress: true
@@ -26,7 +29,7 @@ jobs:
    runs-on: ${{ matrix.os }}
    timeout-minutes: 30
    steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
      - uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25  # v22
      - uses: DeterminateSystems/magic-nix-cache-action@565684385bcd71bad329742eefe8d12f2e765b39  # v13
      - name: Check flake
--- a/.github/workflows/skills-index.yml
+++ b/.github/workflows/skills-index.yml
@@ -0,0 +1,101 @@
+name: Build Skills Index
+
+on:
+  schedule:
+    # Run twice daily: 6 AM and 6 PM UTC
+    - cron: '0 6,18 * * *'
+  workflow_dispatch:  # Manual trigger
+  push:
+    branches: [main]
+    paths:
+      - 'scripts/build_skills_index.py'
+      - '.github/workflows/skills-index.yml'
+
+permissions:
+  contents: read
+
+jobs:
+  build-index:
+    # Only run on the upstream repository, not on forks
+    if: github.repository == 'NousResearch/hermes-agent'
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
+
+      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5
+        with:
+          python-version: '3.11'
+
+      - name: Install dependencies
+        run: pip install httpx==0.28.1 pyyaml==6.0.2
+
+      - name: Build skills index
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: python scripts/build_skills_index.py
+
+      - name: Upload index artifact
+        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
+        with:
+          name: skills-index
+          path: website/static/api/skills-index.json
+          retention-days: 7
+
+  deploy-with-index:
+    needs: build-index
+    runs-on: ubuntu-latest
+    permissions:
+      pages: write
+      id-token: write
+    environment:
+      name: github-pages
+      url: ${{ steps.deploy.outputs.page_url }}
+    # Only deploy on schedule or manual trigger (not on every push to the script)
+    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
+
+      - uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093  # v4
+        with:
+          name: skills-index
+          path: website/static/api/
+
+      - uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020  # v4
+        with:
+          node-version: 20
+          cache: npm
+          cache-dependency-path: website/package-lock.json
+
+      - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065  # v5
+        with:
+          python-version: '3.11'
+
+      - name: Install PyYAML for skill extraction
+        run: pip install pyyaml==6.0.2
+
+      - name: Extract skill metadata for dashboard
+        run: python3 website/scripts/extract-skills.py
+
+      - name: Install dependencies
+        run: npm ci
+        working-directory: website
+
+      - name: Build Docusaurus
+        run: npm run build
+        working-directory: website
+
+      - name: Stage deployment
+        run: |
+          mkdir -p _site/docs
+          cp -r landingpage/* _site/
+          cp -r website/build/* _site/docs/
+          echo "hermes-agent.nousresearch.com" > _site/CNAME
+
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa  # v3
+        with:
+          path: _site
+
+      - name: Deploy to GitHub Pages
+        id: deploy
+        uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e  # v4
--- a/.github/workflows/supply-chain-audit.yml
+++ b/.github/workflows/supply-chain-audit.yml
@@ -14,7 +14,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4
        with:
          fetch-depth: 0

@@ -149,6 +149,62 @@ jobs:
          "
          fi

+          # --- CI/CD workflow files modified ---
+          WORKFLOW_HITS=$(git diff --name-only "$BASE".."$HEAD" | grep -E '\.github/workflows/.*\.ya?ml$' || true)
+          if [ -n "$WORKFLOW_HITS" ]; then
+            FINDINGS="${FINDINGS}
+          ### ⚠️ WARNING: CI/CD workflow files modified
+          Changes to workflow files can alter build pipelines, inject steps, or modify permissions. Verify no unauthorized actions or secrets access were added.
+
+          **Files:**
+          \`\`\`
+          ${WORKFLOW_HITS}
+          \`\`\`
+          "
+          fi
+
+          # --- Dockerfile / container build files modified ---
+          DOCKER_HITS=$(git diff --name-only "$BASE".."$HEAD" | grep -iE '(Dockerfile|\.dockerignore|docker-compose)' || true)
+          if [ -n "$DOCKER_HITS" ]; then
+            FINDINGS="${FINDINGS}
+          ### ⚠️ WARNING: Container build files modified
+          Changes to Dockerfiles or compose files can alter base images, add build steps, or expose ports. Verify base image pins and build commands.
+
+          **Files:**
+          \`\`\`
+          ${DOCKER_HITS}
+          \`\`\`
+          "
+          fi
+
+          # --- Dependency manifest files modified ---
+          DEP_HITS=$(git diff --name-only "$BASE".."$HEAD" | grep -E '(pyproject\.toml|requirements.*\.txt|package\.json|Gemfile|go\.mod|Cargo\.toml)$' || true)
+          if [ -n "$DEP_HITS" ]; then
+            FINDINGS="${FINDINGS}
+          ### ⚠️ WARNING: Dependency manifest files modified
+          Changes to dependency files can introduce new packages or change version pins. Verify all dependency changes are intentional and from trusted sources.
+
+          **Files:**
+          \`\`\`
+          ${DEP_HITS}
+          \`\`\`
+          "
+          fi
+
+          # --- GitHub Actions version unpinning (mutable tags instead of SHAs) ---
+          ACTIONS_UNPIN=$(echo "$DIFF" | grep -n '^\+' | grep 'uses:' | grep -v '#' | grep -E '@v[0-9]' | head -10 || true)
+          if [ -n "$ACTIONS_UNPIN" ]; then
+            FINDINGS="${FINDINGS}
+          ### ⚠️ WARNING: GitHub Actions with mutable version tags
+          Actions should be pinned to full commit SHAs (not \`@v4\`, \`@v5\`). Mutable tags can be retargeted silently if a maintainer account is compromised.
+
+          **Matches:**
+          \`\`\`
+          ${ACTIONS_UNPIN}
+          \`\`\`
+          "
+          fi
+
          # --- Output results ---
          if [ -n "$FINDINGS" ]; then
            echo "found=true" >> "$GITHUB_OUTPUT"
@@ -183,7 +239,7 @@ jobs:
          ---
          *Automated scan triggered by [supply-chain-audit](/.github/workflows/supply-chain-audit.yml). If this is a false positive, a maintainer can approve after manual review.*"

-          gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY"
+          gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY" || echo "::warning::Could not post PR comment (expected for fork PRs — GITHUB_TOKEN is read-only)"

      - name: Fail on critical findings
        if: steps.scan.outputs.critical == 'true'
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -6,6 +6,9 @@ on:
  pull_request:
    branches: [main]

+permissions:
+  contents: read
+
 # Cancel in-progress runs for the same PR/branch
 concurrency:
  group: tests-${{ github.ref }}
@@ -17,13 +20,17 @@ jobs:
    timeout-minutes: 10
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4

      - name: Install system dependencies
        run: sudo apt-get update && sudo apt-get install -y ripgrep

+      - name: Check for hardcoded paths
+        run: python3 scripts/lint_hardcoded_paths.py || true
+        continue-on-error: true
+
      - name: Install uv
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86  # v5

      - name: Set up Python 3.11
        run: uv python install 3.11
@@ -49,10 +56,10 @@ jobs:
    timeout-minutes: 10
    steps:
      - name: Checkout code
-        uses: actions/checkout@v4
+        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5  # v4

      - name: Install uv
-        uses: astral-sh/setup-uv@v5
+        uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86  # v5

      - name: Set up Python 3.11
        run: uv python install 3.11
--- a/.gitignore
+++ b/.gitignore
@@ -51,6 +51,9 @@ ignored/
 .worktrees/
 environments/benchmarks/evals/

+# Web UI build output
+hermes_cli/web_dist/
+
 # Release script temp files
 .release_notes.md
 mini-swe-agent/
@@ -58,3 +61,4 @@ mini-swe-agent/
 # Nix
 .direnv/
 result
+website/static/api/skills-index.json
--- a/.mailmap
+++ b/.mailmap
@@ -0,0 +1,107 @@
+# .mailmap — canonical author mapping for git shortlog / git log / GitHub
+# Format: Canonical Name <canonical@email> <commit@email>
+# See: https://git-scm.com/docs/gitmailmap
+#
+# This maps commit emails to GitHub noreply addresses so that:
+# 1. `git shortlog -sn` shows deduplicated contributor counts
+# 2. GitHub's contributor graph can attribute commits correctly
+# 3. Contributors with personal/work emails get proper credit
+#
+# When adding entries: use the contributor's GitHub noreply email as canonical
+# so GitHub can link commits to their profile.
+
+# === Teknium (multiple emails) ===
+Teknium <127238744+teknium1@users.noreply.github.com> <teknium1@gmail.com>
+Teknium <127238744+teknium1@users.noreply.github.com> <teknium@nousresearch.com>
+
+# === Contributors — personal/work emails mapped to GitHub noreply ===
+# Format: Canonical Name <canonical-email> <commit-email> (GitHub noreply preferred as canonical)
+
+# Verified via GH API email search
+luyao618 <364939526@qq.com> <364939526@qq.com>
+ethernet8023 <arilotter@gmail.com> <arilotter@gmail.com>
+nicoloboschi <boschi1997@gmail.com> <boschi1997@gmail.com>
+cherifya <chef.ya@gmail.com> <chef.ya@gmail.com>
+BongSuCHOI <chlqhdtn98@gmail.com> <chlqhdtn98@gmail.com>
+dsocolobsky <dsocolobsky@gmail.com> <dsocolobsky@gmail.com>
+pefontana <fontana.pedro93@gmail.com> <fontana.pedro93@gmail.com>
+Helmi <frank@helmschrott.de> <frank@helmschrott.de>
+hata1234 <hata1234@gmail.com> <hata1234@gmail.com>
+
+# Verified via PR investigation / salvage PR bodies
+DeployFaith <agents@kylefrench.dev> <agents@kylefrench.dev>
+flobo3 <floptopbot33@gmail.com> <floptopbot33@gmail.com>
+gaixianggeng <gaixg94@gmail.com> <gaixg94@gmail.com>
+KUSH42 <xush@xush.org> <xush@xush.org>
+konsisumer <der@konsi.org> <der@konsi.org>
+WorldInnovationsDepartment <vorvul.danylo@gmail.com> <vorvul.danylo@gmail.com>
+m0n5t3r <iacobs@m0n5t3r.info> <iacobs@m0n5t3r.info>
+sprmn24 <oncuevtv@gmail.com> <oncuevtv@gmail.com>
+fancydirty <fancydirty@gmail.com> <fancydirty@gmail.com>
+fxfitz <francis.x.fitzpatrick@gmail.com> <francis.x.fitzpatrick@gmail.com>
+limars874 <limars874@gmail.com> <limars874@gmail.com>
+AaronWong1999 <aaronwong1999@icloud.com> <aaronwong1999@icloud.com>
+dippwho <dipp.who@gmail.com> <dipp.who@gmail.com>
+duerzy <duerzy@gmail.com> <duerzy@gmail.com>
+geoffwellman <geoff.wellman@gmail.com> <geoff.wellman@gmail.com>
+hcshen0111 <shenhaocheng19990111@gmail.com> <shenhaocheng19990111@gmail.com>
+jamesarch <han.shan@live.cn> <han.shan@live.cn>
+stephenschoettler <stephenschoettler@gmail.com> <stephenschoettler@gmail.com>
+Tranquil-Flow <tranquil_flow@protonmail.com> <tranquil_flow@protonmail.com>
+Dusk1e <yusufalweshdemir@gmail.com> <yusufalweshdemir@gmail.com>
+Awsh1 <ysfalweshcan@gmail.com> <ysfalweshcan@gmail.com>
+WAXLYY <ysfwaxlycan@gmail.com> <ysfwaxlycan@gmail.com>
+donrhmexe <don.rhm@gmail.com> <don.rhm@gmail.com>
+hqhq1025 <1506751656@qq.com> <1506751656@qq.com>
+BlackishGreen33 <s5460703@gmail.com> <s5460703@gmail.com>
+tomqiaozc <zqiao@microsoft.com> <zqiao@microsoft.com>
+MagicRay1217 <mingjwan@microsoft.com> <mingjwan@microsoft.com>
+aaronagent <1115117931@qq.com> <1115117931@qq.com>
+YoungYang963 <young@YoungdeMacBook-Pro.local> <young@YoungdeMacBook-Pro.local>
+LongOddCode <haolong@microsoft.com> <haolong@microsoft.com>
+Cafexss <coffeemjj@gmail.com> <coffeemjj@gmail.com>
+Cygra <sjtuwbh@gmail.com> <sjtuwbh@gmail.com>
+DomGrieco <dgrieco@redhat.com> <dgrieco@redhat.com>
+
+# Duplicate email mapping (same person, multiple emails)
+Sertug17 <104278804+Sertug17@users.noreply.github.com> <srhtsrht17@gmail.com>
+yyovil <birdiegyal@gmail.com> <tanishq231003@gmail.com>
+DomGrieco <dgrieco@redhat.com> <dgrieco@redhat.com>
+dsocolobsky <dsocolobsky@gmail.com> <dylan.socolobsky@lambdaclass.com>
+olafthiele <programming@olafthiele.com> <olafthiele@gmail.com>
+
+# Verified via git display name matching GH contributor username
+cokemine <aptx4561@gmail.com> <aptx4561@gmail.com>
+dalianmao000 <dalianmao0107@gmail.com> <dalianmao0107@gmail.com>
+emozilla <emozilla@nousresearch.com> <emozilla@nousresearch.com>
+jjovalle99 <juan.ovalle@mistral.ai> <juan.ovalle@mistral.ai>
+kagura-agent <kagura.chen28@gmail.com> <kagura.chen28@gmail.com>
+spniyant <niyant@spicefi.xyz> <niyant@spicefi.xyz>
+olafthiele <programming@olafthiele.com> <programming@olafthiele.com>
+r266-tech <r2668940489@gmail.com> <r2668940489@gmail.com>
+xingkongliang <tianliangjay@gmail.com> <tianliangjay@gmail.com>
+win4r <win4r@outlook.com> <win4r@outlook.com>
+zhouboli <zhouboli@gmail.com> <zhouboli@gmail.com>
+yongtenglei <yongtenglei@gmail.com> <yongtenglei@gmail.com>
+
+# Nous Research team
+benbarclay <ben@nousresearch.com> <ben@nousresearch.com>
+jquesnelle <jonny@nousresearch.com> <jonny@nousresearch.com>
+
+# GH contributor list verified
+spideystreet <dhicham.pro@gmail.com> <dhicham.pro@gmail.com>
+dorukardahan <dorukardahan@hotmail.com> <dorukardahan@hotmail.com>
+MustafaKara7 <karamusti912@gmail.com> <karamusti912@gmail.com>
+Hmbown <hmbown@gmail.com> <hmbown@gmail.com>
+kamil-gwozdz <kamil@gwozdz.me> <kamil@gwozdz.me>
+kira-ariaki <kira@ariaki.me> <kira@ariaki.me>
+knopki <knopki@duck.com> <knopki@duck.com>
+Unayung <unayung@gmail.com> <unayung@gmail.com>
+SeeYangZhi <yangzhi.see@gmail.com> <yangzhi.see@gmail.com>
+Julientalbot <julien.talbot@ergonomia.re> <julien.talbot@ergonomia.re>
+lesterli <lisicheng168@gmail.com> <lisicheng168@gmail.com>
+JiayuuWang <jiayuw794@gmail.com> <jiayuw794@gmail.com>
+tesseracttars-creator <tesseracttars@gmail.com> <tesseracttars@gmail.com>
+xinbenlv <zzn+pa@zzn.im> <zzn+pa@zzn.im>
+SaulJWu <saul.jj.wu@gmail.com> <saul.jj.wu@gmail.com>
+angelos <angelos@oikos.lan.home.malaiwah.com> <angelos@oikos.lan.home.malaiwah.com>
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -13,7 +13,7 @@ source venv/bin/activate  # ALWAYS activate before running Python
 ```
 hermes-agent/
 ├── run_agent.py          # AIAgent class — core conversation loop
-├── model_tools.py        # Tool orchestration, _discover_tools(), handle_function_call()
+├── model_tools.py        # Tool orchestration, discover_builtin_tools(), handle_function_call()
 ├── toolsets.py           # Toolset definitions, _HERMES_CORE_TOOLS list
 ├── cli.py                # HermesCLI class — interactive CLI orchestrator
 ├── hermes_state.py       # SessionDB — SQLite session store (FTS5 search)
@@ -55,7 +55,7 @@ hermes-agent/
 ├── gateway/              # Messaging platform gateway
 │   ├── run.py            # Main loop, slash commands, message dispatch
 │   ├── session.py        # SessionStore — conversation persistence
-│   └── platforms/        # Adapters: telegram, discord, slack, whatsapp, homeassistant, signal
+│   └── platforms/        # Adapters: telegram, discord, slack, whatsapp, homeassistant, signal, qqbot
 ├── acp_adapter/          # ACP server (VS Code / Zed / JetBrains integration)
 ├── cron/                 # Scheduler (jobs.py, scheduler.py)
 ├── environments/         # RL training environments (Atropos)
@@ -181,7 +181,7 @@ if canonical == "mycommand":

 ## Adding New Tools

-Requires changes in **3 files**:
+Requires changes in **2 files**:

 **1. Create `tools/your_tool.py`:**
 ```python
@@ -204,9 +204,9 @@ registry.register(
 )
 ```

-**2. Add import** in `model_tools.py` `_discover_tools()` list.
+**2. Add to `toolsets.py`** — either `_HERMES_CORE_TOOLS` (all platforms) or a new toolset.

-**3. Add to `toolsets.py`** — either `_HERMES_CORE_TOOLS` (all platforms) or a new toolset.
+Auto-discovery: any `tools/*.py` file with a top-level `registry.register()` call is imported automatically — no manual import list to maintain.

 The registry handles schema collection, dispatch, availability checking, and error wrapping. All handlers MUST return a JSON string.

@@ -351,8 +351,9 @@ Cache-breaking forces dramatically higher costs. The ONLY time we alter context

 ### Background Process Notifications (Gateway)

-When `terminal(background=true, check_interval=...)` is used, the gateway runs a watcher that
-pushes status updates to the user's chat. Control verbosity with `display.background_process_notifications`
+When `terminal(background=true, notify_on_complete=true)` is used, the gateway runs a watcher that
+detects process completion and triggers a new agent turn. Control verbosity of background process
+messages with `display.background_process_notifications`
 in config.yaml (or `HERMES_BACKGROUND_NOTIFICATIONS` env var):

 - `all` — running-output updates + final message (default)
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,27 +1,44 @@
+FROM ghcr.io/astral-sh/uv:0.11.6-python3.13-trixie@sha256:b3c543b6c4f23a5f2df22866bd7857e5d304b67a564f4feab6ac22044dde719b AS uv_source
+FROM tianon/gosu:1.19-trixie@sha256:3b176695959c71e123eb390d427efc665eeb561b1540e82679c15e992006b8b9 AS gosu_source
 FROM debian:13.4

 # Disable Python stdout buffering to ensure logs are printed immediately
 ENV PYTHONUNBUFFERED=1

+# Store Playwright browsers outside the volume mount so the build-time
+# install survives the /opt/data volume overlay at runtime.
+ENV PLAYWRIGHT_BROWSERS_PATH=/opt/hermes/.playwright
+
 # Install system dependencies in one layer, clear APT cache
 RUN apt-get update && \
    apt-get install -y --no-install-recommends \
-        build-essential nodejs npm python3 python3-pip ripgrep ffmpeg gcc python3-dev libffi-dev procps && \
+        build-essential nodejs npm python3 ripgrep ffmpeg gcc python3-dev libffi-dev procps git && \
    rm -rf /var/lib/apt/lists/*

+# Non-root user for runtime; UID can be overridden via HERMES_UID at runtime
+RUN useradd -u 10000 -m -d /opt/data hermes
+
+COPY --chmod=0755 --from=gosu_source /gosu /usr/local/bin/
+COPY --chmod=0755 --from=uv_source /usr/local/bin/uv /usr/local/bin/uvx /usr/local/bin/
+
 COPY . /opt/hermes
 WORKDIR /opt/hermes

-# Install Python and Node dependencies in one layer, no cache
-RUN pip install --no-cache-dir uv --break-system-packages && \
-    uv pip install --system --break-system-packages --no-cache -e ".[all]" && \
-    npm install --prefer-offline --no-audit && \
+# Install Node dependencies and Playwright as root (--with-deps needs apt)
+RUN npm install --prefer-offline --no-audit && \
    npx playwright install --with-deps chromium --only-shell && \
    cd /opt/hermes/scripts/whatsapp-bridge && \
    npm install --prefer-offline --no-audit && \
    npm cache clean --force

-WORKDIR /opt/hermes
+# Hand ownership to hermes user, then install Python deps in a virtualenv
+RUN chown -R hermes:hermes /opt/hermes
+USER hermes
+
+RUN uv venv && \
+    uv pip install --no-cache-dir -e ".[all]"
+
+USER root
 RUN chmod +x /opt/hermes/docker/entrypoint.sh

 ENV HERMES_HOME=/opt/data
--- a/README.md
+++ b/README.md
@@ -13,7 +13,7 @@

 **The self-improving AI agent built by [Nous Research](https://nousresearch.com).** It's the only agent with a built-in learning loop — it creates skills from experience, improves them during use, nudges itself to persist knowledge, searches its own past conversations, and builds a deepening model of who you are across sessions. Run it on a $5 VPS, a GPU cluster, or serverless infrastructure that costs nearly nothing when idle. It's not tied to your laptop — talk to it from Telegram while it works on a cloud VM.

-Use any model you want — [Nous Portal](https://portal.nousresearch.com), [OpenRouter](https://openrouter.ai) (200+ models), [z.ai/GLM](https://z.ai), [Kimi/Moonshot](https://platform.moonshot.ai), [MiniMax](https://www.minimax.io), OpenAI, or your own endpoint. Switch with `hermes model` — no code changes, no lock-in.
+Use any model you want — [Nous Portal](https://portal.nousresearch.com), [OpenRouter](https://openrouter.ai) (200+ models), [Xiaomi MiMo](https://platform.xiaomimimo.com), [z.ai/GLM](https://z.ai), [Kimi/Moonshot](https://platform.moonshot.ai), [MiniMax](https://www.minimax.io), [Hugging Face](https://huggingface.co), OpenAI, or your own endpoint. Switch with `hermes model` — no code changes, no lock-in.

 <table>
 <tr><td><b>A real terminal interface</b></td><td>Full TUI with multiline editing, slash-command autocomplete, conversation history, interrupt-and-redirect, and streaming tool output.</td></tr>
@@ -167,6 +167,7 @@ python -m pytest tests/ -q
 - 📚 [Skills Hub](https://agentskills.io)
 - 🐛 [Issues](https://github.com/NousResearch/hermes-agent/issues)
 - 💡 [Discussions](https://github.com/NousResearch/hermes-agent/discussions)
+- 🔌 [HermesClaw](https://github.com/AaronWong1999/hermesclaw) — Community WeChat bridge: Run Hermes Agent and OpenClaw on the same WeChat account.

 ---

--- a/RELEASE_v0.9.0.md
+++ b/RELEASE_v0.9.0.md
@@ -0,0 +1,329 @@
+# Hermes Agent v0.9.0 (v2026.4.13)
+
+**Release Date:** April 13, 2026
+**Since v0.8.0:** 487 commits · 269 merged PRs · 167 resolved issues · 493 files changed · 63,281 insertions · 24 contributors
+
+> The everywhere release — Hermes goes mobile with Termux/Android, adds iMessage and WeChat, ships Fast Mode for OpenAI and Anthropic, introduces background process monitoring, launches a local web dashboard for managing your agent, and delivers the deepest security hardening pass yet across 16 supported platforms.
+
+---
+
+## ✨ Highlights
+
+- **Local Web Dashboard** — A new browser-based dashboard for managing your Hermes Agent locally. Configure settings, monitor sessions, browse skills, and manage your gateway — all from a clean web interface without touching config files or the terminal. The easiest way to get started with Hermes.
+
+- **Fast Mode (`/fast`)** — Priority processing for OpenAI and Anthropic models. Toggle `/fast` to route through priority queues for significantly lower latency on supported models (GPT-5.4, Codex, Claude). Expands across all OpenAI Priority Processing models and Anthropic's fast tier. ([#6875](https://github.com/NousResearch/hermes-agent/pull/6875), [#6960](https://github.com/NousResearch/hermes-agent/pull/6960), [#7037](https://github.com/NousResearch/hermes-agent/pull/7037))
+
+- **iMessage via BlueBubbles** — Full iMessage integration through BlueBubbles, bringing Hermes to Apple's messaging ecosystem. Auto-webhook registration, setup wizard integration, and crash resilience. ([#6437](https://github.com/NousResearch/hermes-agent/pull/6437), [#6460](https://github.com/NousResearch/hermes-agent/pull/6460), [#6494](https://github.com/NousResearch/hermes-agent/pull/6494))
+
+- **WeChat (Weixin) & WeCom Callback Mode** — Native WeChat support via iLink Bot API and a new WeCom callback-mode adapter for self-built enterprise apps. Streaming cursor, media uploads, markdown link handling, and atomic state persistence. Hermes now covers the Chinese messaging ecosystem end-to-end. ([#7166](https://github.com/NousResearch/hermes-agent/pull/7166), [#7943](https://github.com/NousResearch/hermes-agent/pull/7943))
+
+- **Termux / Android Support** — Run Hermes natively on Android via Termux. Includes adapted install paths, TUI optimizations for mobile screens, voice backend support, and an `/image` command that works on-device. ([#6834](https://github.com/NousResearch/hermes-agent/pull/6834))
+
+- **Background Process Monitoring (`watch_patterns`)** — Set patterns to watch for in background process output and get notified in real-time when they match. Monitor for errors, wait for specific events ("listening on port"), or watch build logs — all without polling. ([#7635](https://github.com/NousResearch/hermes-agent/pull/7635))
+
+- **Native xAI & Xiaomi MiMo Providers** — First-class provider support for xAI (Grok) and Xiaomi MiMo, with direct API access, model catalogs, and setup wizard integration. Plus Qwen OAuth with portal request support. ([#7372](https://github.com/NousResearch/hermes-agent/pull/7372), [#7855](https://github.com/NousResearch/hermes-agent/pull/7855))
+
+- **Pluggable Context Engine** — Context management is now a pluggable slot via `hermes plugins`. Swap in custom context engines that control what the agent sees each turn — filtering, summarization, or domain-specific context injection. ([#7464](https://github.com/NousResearch/hermes-agent/pull/7464))
+
+- **Unified Proxy Support** — SOCKS proxy, `DISCORD_PROXY`, and system proxy auto-detection across all gateway platforms. Hermes behind corporate firewalls just works. ([#6814](https://github.com/NousResearch/hermes-agent/pull/6814))
+
+- **Comprehensive Security Hardening** — Path traversal protection in checkpoint manager, shell injection neutralization in sandbox writes, SSRF redirect guards in Slack image uploads, Twilio webhook signature validation (SMS RCE fix), API server auth enforcement, git argument injection prevention, and approval button authorization. ([#7933](https://github.com/NousResearch/hermes-agent/pull/7933), [#7944](https://github.com/NousResearch/hermes-agent/pull/7944), [#7940](https://github.com/NousResearch/hermes-agent/pull/7940), [#7151](https://github.com/NousResearch/hermes-agent/pull/7151), [#7156](https://github.com/NousResearch/hermes-agent/pull/7156))
+
+- **`hermes backup` & `hermes import`** — Full backup and restore of your Hermes configuration, sessions, skills, and memory. Migrate between machines or create snapshots before major changes. ([#7997](https://github.com/NousResearch/hermes-agent/pull/7997))
+
+- **16 Supported Platforms** — With BlueBubbles (iMessage) and WeChat joining Telegram, Discord, Slack, WhatsApp, Signal, Matrix, Email, SMS, DingTalk, Feishu, WeCom, Mattermost, Home Assistant, and Webhooks, Hermes now runs on 16 messaging platforms out of the box.
+
+- **`/debug` & `hermes debug share`** — New debugging toolkit: `/debug` slash command across all platforms for quick diagnostics, plus `hermes debug share` to upload a full debug report to a pastebin for easy sharing when troubleshooting. ([#8681](https://github.com/NousResearch/hermes-agent/pull/8681))
+
+---
+
+## 🏗️ Core Agent & Architecture
+
+### Provider & Model Support
+- **Native xAI (Grok) provider** with direct API access and model catalog ([#7372](https://github.com/NousResearch/hermes-agent/pull/7372))
+- **Xiaomi MiMo as first-class provider** — setup wizard, model catalog, empty response recovery ([#7855](https://github.com/NousResearch/hermes-agent/pull/7855))
+- **Qwen OAuth provider** with portal request support ([#6282](https://github.com/NousResearch/hermes-agent/pull/6282))
+- **Fast Mode** — `/fast` toggle for OpenAI Priority Processing + Anthropic fast tier ([#6875](https://github.com/NousResearch/hermes-agent/pull/6875), [#6960](https://github.com/NousResearch/hermes-agent/pull/6960), [#7037](https://github.com/NousResearch/hermes-agent/pull/7037))
+- **Structured API error classification** for smart failover decisions ([#6514](https://github.com/NousResearch/hermes-agent/pull/6514))
+- **Rate limit header capture** shown in `/usage` ([#6541](https://github.com/NousResearch/hermes-agent/pull/6541))
+- **API server model name** derived from profile name ([#6857](https://github.com/NousResearch/hermes-agent/pull/6857))
+- **Custom providers** now included in `/model` listings and resolution ([#7088](https://github.com/NousResearch/hermes-agent/pull/7088))
+- **Fallback provider activation** on repeated empty responses with user-visible status ([#7505](https://github.com/NousResearch/hermes-agent/pull/7505))
+- **OpenRouter variant tags** (`:free`, `:extended`, `:fast`) preserved during model switch ([#6383](https://github.com/NousResearch/hermes-agent/pull/6383))
+- **Credential exhaustion TTL** reduced from 24 hours to 1 hour ([#6504](https://github.com/NousResearch/hermes-agent/pull/6504))
+- **OAuth credential lifecycle** hardening — stale pool keys, auth.json sync, Codex CLI race fixes ([#6874](https://github.com/NousResearch/hermes-agent/pull/6874))
+- Empty response recovery for reasoning models (MiMo, Qwen, GLM) ([#8609](https://github.com/NousResearch/hermes-agent/pull/8609))
+- MiniMax context lengths, thinking guard, endpoint corrections ([#6082](https://github.com/NousResearch/hermes-agent/pull/6082), [#7126](https://github.com/NousResearch/hermes-agent/pull/7126))
+- Z.AI endpoint auto-detect via probe and cache ([#5763](https://github.com/NousResearch/hermes-agent/pull/5763))
+
+### Agent Loop & Conversation
+- **Pluggable context engine slot** via `hermes plugins` ([#7464](https://github.com/NousResearch/hermes-agent/pull/7464))
+- **Background process monitoring** — `watch_patterns` for real-time output alerts ([#7635](https://github.com/NousResearch/hermes-agent/pull/7635))
+- **Improved context compression** — higher limits, tool tracking, degradation warnings, token-budget tail protection ([#6395](https://github.com/NousResearch/hermes-agent/pull/6395), [#6453](https://github.com/NousResearch/hermes-agent/pull/6453))
+- **`/compress <focus>`** — guided compression with a focus topic ([#8017](https://github.com/NousResearch/hermes-agent/pull/8017))
+- **Tiered context pressure warnings** with gateway dedup ([#6411](https://github.com/NousResearch/hermes-agent/pull/6411))
+- **Staged inactivity warning** before timeout escalation ([#6387](https://github.com/NousResearch/hermes-agent/pull/6387))
+- **Prevent agent from stopping mid-task** — compression floor, budget overhaul, activity tracking ([#7983](https://github.com/NousResearch/hermes-agent/pull/7983))
+- **Propagate child activity to parent** during `delegate_task` ([#7295](https://github.com/NousResearch/hermes-agent/pull/7295))
+- **Truncated streaming tool call detection** before execution ([#6847](https://github.com/NousResearch/hermes-agent/pull/6847))
+- Empty response retry (3 attempts with nudge) ([#6488](https://github.com/NousResearch/hermes-agent/pull/6488))
+- Adaptive streaming backoff + cursor strip to prevent message truncation ([#7683](https://github.com/NousResearch/hermes-agent/pull/7683))
+- Compression uses live session model instead of stale persisted config ([#8258](https://github.com/NousResearch/hermes-agent/pull/8258))
+- Strip `<thought>` tags from Gemma 4 responses ([#8562](https://github.com/NousResearch/hermes-agent/pull/8562))
+- Prevent `<think>` in prose from suppressing response output ([#6968](https://github.com/NousResearch/hermes-agent/pull/6968))
+- Add turn-exit diagnostic logging to agent loop ([#6549](https://github.com/NousResearch/hermes-agent/pull/6549))
+- Scope tool interrupt signal per-thread to prevent cross-session leaks ([#7930](https://github.com/NousResearch/hermes-agent/pull/7930))
+
+### Memory & Sessions
+- **Hindsight memory plugin** — feature parity, setup wizard, config improvements — @nicoloboschi ([#6428](https://github.com/NousResearch/hermes-agent/pull/6428))
+- **Honcho** — opt-in `initOnSessionStart` for tools mode — @Kathie-yu ([#6995](https://github.com/NousResearch/hermes-agent/pull/6995))
+- Orphan children instead of cascade-deleting in prune/delete ([#6513](https://github.com/NousResearch/hermes-agent/pull/6513))
+- Doctor command only checks the active memory provider ([#6285](https://github.com/NousResearch/hermes-agent/pull/6285))
+
+---
+
+## 📱 Messaging Platforms (Gateway)
+
+### New Platforms
+- **BlueBubbles (iMessage)** — full adapter with auto-webhook registration, setup wizard, and crash resilience ([#6437](https://github.com/NousResearch/hermes-agent/pull/6437), [#6460](https://github.com/NousResearch/hermes-agent/pull/6460), [#6494](https://github.com/NousResearch/hermes-agent/pull/6494), [#7107](https://github.com/NousResearch/hermes-agent/pull/7107))
+- **Weixin (WeChat)** — native support via iLink Bot API with streaming, media uploads, markdown links ([#7166](https://github.com/NousResearch/hermes-agent/pull/7166), [#8665](https://github.com/NousResearch/hermes-agent/pull/8665))
+- **WeCom Callback Mode** — self-built enterprise app adapter with atomic state persistence ([#7943](https://github.com/NousResearch/hermes-agent/pull/7943), [#7928](https://github.com/NousResearch/hermes-agent/pull/7928))
+
+### Discord
+- **Allowed channels whitelist** config — @jarvis-phw ([#7044](https://github.com/NousResearch/hermes-agent/pull/7044))
+- **Forum channel topic inheritance** in thread sessions — @hermes-agent-dhabibi ([#6377](https://github.com/NousResearch/hermes-agent/pull/6377))
+- **DISCORD_REPLY_TO_MODE** setting ([#6333](https://github.com/NousResearch/hermes-agent/pull/6333))
+- Accept `.log` attachments, raise document size limit — @kira-ariaki ([#6467](https://github.com/NousResearch/hermes-agent/pull/6467))
+- Decouple readiness from slash sync ([#8016](https://github.com/NousResearch/hermes-agent/pull/8016))
+
+### Slack
+- **Consolidated Slack improvements** — 7 community PRs salvaged into one ([#6809](https://github.com/NousResearch/hermes-agent/pull/6809))
+- Handle assistant thread lifecycle events ([#6433](https://github.com/NousResearch/hermes-agent/pull/6433))
+
+### Matrix
+- **Migrated from matrix-nio to mautrix-python** ([#7518](https://github.com/NousResearch/hermes-agent/pull/7518))
+- SQLite crypto store replacing pickle (fixes E2EE decryption) — @alt-glitch ([#7981](https://github.com/NousResearch/hermes-agent/pull/7981))
+- Cross-signing recovery key verification for E2EE migration ([#8282](https://github.com/NousResearch/hermes-agent/pull/8282))
+- DM mention threads + group chat events for Feishu ([#7423](https://github.com/NousResearch/hermes-agent/pull/7423))
+
+### Gateway Core
+- **Unified proxy support** — SOCKS, DISCORD_PROXY, multi-platform with macOS auto-detection ([#6814](https://github.com/NousResearch/hermes-agent/pull/6814))
+- **Inbound text batching** for Discord, Matrix, WeCom + adaptive delay ([#6979](https://github.com/NousResearch/hermes-agent/pull/6979))
+- **Surface natural mid-turn assistant messages** in chat platforms ([#7978](https://github.com/NousResearch/hermes-agent/pull/7978))
+- **WSL-aware gateway** with smart systemd detection ([#7510](https://github.com/NousResearch/hermes-agent/pull/7510))
+- **All missing platforms added to setup wizard** ([#7949](https://github.com/NousResearch/hermes-agent/pull/7949))
+- **Per-platform `tool_progress` overrides** ([#6348](https://github.com/NousResearch/hermes-agent/pull/6348))
+- **Configurable 'still working' notification interval** ([#8572](https://github.com/NousResearch/hermes-agent/pull/8572))
+- `/model` switch persists across messages ([#7081](https://github.com/NousResearch/hermes-agent/pull/7081))
+- `/usage` shows rate limits, cost, and token details between turns ([#7038](https://github.com/NousResearch/hermes-agent/pull/7038))
+- Drain in-flight work before restart ([#7503](https://github.com/NousResearch/hermes-agent/pull/7503))
+- Don't evict cached agent on failed runs — prevents MCP restart loop ([#7539](https://github.com/NousResearch/hermes-agent/pull/7539))
+- Replace `os.environ` session state with `contextvars` ([#7454](https://github.com/NousResearch/hermes-agent/pull/7454))
+- Derive channel directory platforms from enum instead of hardcoded list ([#7450](https://github.com/NousResearch/hermes-agent/pull/7450))
+- Validate image downloads before caching (cross-platform) ([#7125](https://github.com/NousResearch/hermes-agent/pull/7125))
+- Cross-platform webhook delivery for all platforms ([#7095](https://github.com/NousResearch/hermes-agent/pull/7095))
+- Cron Discord thread_id delivery support ([#7106](https://github.com/NousResearch/hermes-agent/pull/7106))
+- Feishu QR-based bot onboarding ([#8570](https://github.com/NousResearch/hermes-agent/pull/8570))
+- Gateway status scoped to active profile ([#7951](https://github.com/NousResearch/hermes-agent/pull/7951))
+- Prevent background process notifications from triggering false pairing requests ([#6434](https://github.com/NousResearch/hermes-agent/pull/6434))
+
+---
+
+## 🖥️ CLI & User Experience
+
+### Interactive CLI
+- **Termux / Android support** — adapted install paths, TUI, voice, `/image` ([#6834](https://github.com/NousResearch/hermes-agent/pull/6834))
+- **Native `/model` picker modal** for provider → model selection ([#8003](https://github.com/NousResearch/hermes-agent/pull/8003))
+- **Live per-tool elapsed timer** restored in TUI spinner ([#7359](https://github.com/NousResearch/hermes-agent/pull/7359))
+- **Stacked tool progress scrollback** in TUI ([#8201](https://github.com/NousResearch/hermes-agent/pull/8201))
+- **Random tips on new session start** (CLI + gateway, 279 tips) ([#8225](https://github.com/NousResearch/hermes-agent/pull/8225), [#8237](https://github.com/NousResearch/hermes-agent/pull/8237))
+- **`hermes dump`** — copy-pasteable setup summary for debugging ([#6550](https://github.com/NousResearch/hermes-agent/pull/6550))
+- **`hermes backup` / `hermes import`** — full config backup and restore ([#7997](https://github.com/NousResearch/hermes-agent/pull/7997))
+- **WSL environment hint** in system prompt ([#8285](https://github.com/NousResearch/hermes-agent/pull/8285))
+- **Profile creation UX** — seed SOUL.md + credential warning ([#8553](https://github.com/NousResearch/hermes-agent/pull/8553))
+- Shell-aware sudo detection, empty password support ([#6517](https://github.com/NousResearch/hermes-agent/pull/6517))
+- Flush stdin after curses/terminal menus to prevent escape sequence leakage ([#7167](https://github.com/NousResearch/hermes-agent/pull/7167))
+- Handle broken stdin in prompt_toolkit startup ([#8560](https://github.com/NousResearch/hermes-agent/pull/8560))
+
+### Setup & Configuration
+- **Per-platform display verbosity** configuration ([#8006](https://github.com/NousResearch/hermes-agent/pull/8006))
+- **Component-separated logging** with session context and filtering ([#7991](https://github.com/NousResearch/hermes-agent/pull/7991))
+- **`network.force_ipv4`** config to fix IPv6 timeout issues ([#8196](https://github.com/NousResearch/hermes-agent/pull/8196))
+- **Standardize message whitespace and JSON formatting** ([#7988](https://github.com/NousResearch/hermes-agent/pull/7988))
+- **Rebrand OpenClaw → Hermes** during migration ([#8210](https://github.com/NousResearch/hermes-agent/pull/8210))
+- Config.yaml takes priority over env vars for auxiliary settings ([#7889](https://github.com/NousResearch/hermes-agent/pull/7889))
+- Harden setup provider flows + live OpenRouter catalog refresh ([#7078](https://github.com/NousResearch/hermes-agent/pull/7078))
+- Normalize reasoning effort ordering across all surfaces ([#6804](https://github.com/NousResearch/hermes-agent/pull/6804))
+- Remove dead `LLM_MODEL` env var + migration to clear stale entries ([#6543](https://github.com/NousResearch/hermes-agent/pull/6543))
+- Remove `/prompt` slash command — prefix expansion footgun ([#6752](https://github.com/NousResearch/hermes-agent/pull/6752))
+- `HERMES_HOME_MODE` env var to override permissions — @ygd58 ([#6993](https://github.com/NousResearch/hermes-agent/pull/6993))
+- Fall back to default model when model config is empty ([#8303](https://github.com/NousResearch/hermes-agent/pull/8303))
+- Warn when compression model context is too small ([#7894](https://github.com/NousResearch/hermes-agent/pull/7894))
+
+---
+
+## 🔧 Tool System
+
+### Environments & Execution
+- **Unified spawn-per-call execution layer** for environments ([#6343](https://github.com/NousResearch/hermes-agent/pull/6343))
+- **Unified file sync** with mtime tracking, deletion, and transactional state ([#7087](https://github.com/NousResearch/hermes-agent/pull/7087))
+- **Persistent sandbox envs** survive between turns ([#6412](https://github.com/NousResearch/hermes-agent/pull/6412))
+- **Bulk file sync** via tar pipe for SSH/Modal backends — @alt-glitch ([#8014](https://github.com/NousResearch/hermes-agent/pull/8014))
+- **Daytona** — bulk upload, config bridge, silent disk cap ([#7538](https://github.com/NousResearch/hermes-agent/pull/7538))
+- Foreground timeout cap to prevent session deadlocks ([#7082](https://github.com/NousResearch/hermes-agent/pull/7082))
+- Guard invalid command values ([#6417](https://github.com/NousResearch/hermes-agent/pull/6417))
+
+### MCP
+- **`hermes mcp add --env` and `--preset`** support ([#7970](https://github.com/NousResearch/hermes-agent/pull/7970))
+- Combine `content` and `structuredContent` when both present ([#7118](https://github.com/NousResearch/hermes-agent/pull/7118))
+- MCP tool name deconfliction fixes ([#7654](https://github.com/NousResearch/hermes-agent/pull/7654))
+
+### Browser
+- Browser hardening — dead code removal, caching, scroll perf, security, thread safety ([#7354](https://github.com/NousResearch/hermes-agent/pull/7354))
+- `/browser connect` auto-launch uses dedicated Chrome profile dir ([#6821](https://github.com/NousResearch/hermes-agent/pull/6821))
+- Reap orphaned browser sessions on startup ([#7931](https://github.com/NousResearch/hermes-agent/pull/7931))
+
+### Voice & Vision
+- **Voxtral TTS provider** (Mistral AI) ([#7653](https://github.com/NousResearch/hermes-agent/pull/7653))
+- **TTS speed support** for Edge TTS, OpenAI TTS, MiniMax ([#8666](https://github.com/NousResearch/hermes-agent/pull/8666))
+- **Vision auto-resize** for oversized images, raise limit to 20 MB, retry-on-failure ([#7883](https://github.com/NousResearch/hermes-agent/pull/7883), [#7902](https://github.com/NousResearch/hermes-agent/pull/7902))
+- STT provider-model mismatch fix (whisper-1 vs faster-whisper) ([#7113](https://github.com/NousResearch/hermes-agent/pull/7113))
+
+### Other Tools
+- **`hermes dump`** command for setup summary ([#6550](https://github.com/NousResearch/hermes-agent/pull/6550))
+- TODO store enforces ID uniqueness during replace operations ([#7986](https://github.com/NousResearch/hermes-agent/pull/7986))
+- List all available toolsets in `delegate_task` schema description ([#8231](https://github.com/NousResearch/hermes-agent/pull/8231))
+- API server: tool progress as custom SSE event to prevent model corruption ([#7500](https://github.com/NousResearch/hermes-agent/pull/7500))
+- API server: share one Docker container across all conversations ([#7127](https://github.com/NousResearch/hermes-agent/pull/7127))
+
+---
+
+## 🧩 Skills Ecosystem
+
+- **Centralized skills index + tree cache** — eliminates rate-limit failures on install ([#8575](https://github.com/NousResearch/hermes-agent/pull/8575))
+- **More aggressive skill loading instructions** in system prompt (v3) ([#8209](https://github.com/NousResearch/hermes-agent/pull/8209), [#8286](https://github.com/NousResearch/hermes-agent/pull/8286))
+- **Google Workspace skill** migrated to GWS CLI backend ([#6788](https://github.com/NousResearch/hermes-agent/pull/6788))
+- **Creative divergence strategies** skill — @SHL0MS ([#6882](https://github.com/NousResearch/hermes-agent/pull/6882))
+- **Creative ideation** — constraint-driven project generation — @SHL0MS ([#7555](https://github.com/NousResearch/hermes-agent/pull/7555))
+- Parallelize skills browse/search to prevent hanging ([#7301](https://github.com/NousResearch/hermes-agent/pull/7301))
+- Read name from SKILL.md frontmatter in skills_sync ([#7623](https://github.com/NousResearch/hermes-agent/pull/7623))
+
+---
+
+## 🔒 Security & Reliability
+
+### Security Hardening
+- **Twilio webhook signature validation** — SMS RCE fix ([#7933](https://github.com/NousResearch/hermes-agent/pull/7933))
+- **Shell injection neutralization** in `_write_to_sandbox` via path quoting ([#7940](https://github.com/NousResearch/hermes-agent/pull/7940))
+- **Git argument injection** and path traversal prevention in checkpoint manager ([#7944](https://github.com/NousResearch/hermes-agent/pull/7944))
+- **SSRF redirect bypass** in Slack image uploads + base.py cache helpers ([#7151](https://github.com/NousResearch/hermes-agent/pull/7151))
+- **Path traversal, credential gate, DANGEROUS_PATTERNS gaps** ([#7156](https://github.com/NousResearch/hermes-agent/pull/7156))
+- **API bind guard** — enforce `API_SERVER_KEY` for non-loopback binding ([#7455](https://github.com/NousResearch/hermes-agent/pull/7455))
+- **Approval button authorization** — require auth for session continuation — @Cafexss ([#6930](https://github.com/NousResearch/hermes-agent/pull/6930))
+- Path boundary enforcement in skill manager operations ([#7156](https://github.com/NousResearch/hermes-agent/pull/7156))
+- DingTalk/API webhook URL origin validation, header injection rejection ([#7455](https://github.com/NousResearch/hermes-agent/pull/7455))
+
+### Reliability
+- **Contextual error diagnostics** for invalid API responses ([#8565](https://github.com/NousResearch/hermes-agent/pull/8565))
+- **Prevent 400 format errors** from triggering compression loop on Codex ([#6751](https://github.com/NousResearch/hermes-agent/pull/6751))
+- **Don't halve context_length** on output-cap-too-large errors — @KUSH42 ([#6664](https://github.com/NousResearch/hermes-agent/pull/6664))
+- **Recover primary client** on OpenAI transport errors ([#7108](https://github.com/NousResearch/hermes-agent/pull/7108))
+- **Credential pool rotation** on billing-classified 400s ([#7112](https://github.com/NousResearch/hermes-agent/pull/7112))
+- **Auto-increase stream read timeout** for local LLM providers ([#6967](https://github.com/NousResearch/hermes-agent/pull/6967))
+- **Fall back to default certs** when CA bundle path doesn't exist ([#7352](https://github.com/NousResearch/hermes-agent/pull/7352))
+- **Disambiguate usage-limit patterns** in error classifier — @sprmn24 ([#6836](https://github.com/NousResearch/hermes-agent/pull/6836))
+- Harden cron script timeout and provider recovery ([#7079](https://github.com/NousResearch/hermes-agent/pull/7079))
+- Gateway interrupt detection resilient to monitor task failures ([#8208](https://github.com/NousResearch/hermes-agent/pull/8208))
+- Prevent unwanted session auto-reset after graceful gateway restarts ([#8299](https://github.com/NousResearch/hermes-agent/pull/8299))
+- Prevent duplicate update prompt spam in gateway watcher ([#8343](https://github.com/NousResearch/hermes-agent/pull/8343))
+- Deduplicate reasoning items in Responses API input ([#7946](https://github.com/NousResearch/hermes-agent/pull/7946))
+
+### Infrastructure
+- **Multi-arch Docker image** — amd64 + arm64 ([#6124](https://github.com/NousResearch/hermes-agent/pull/6124))
+- **Docker runs as non-root user** with virtualenv — @benbarclay contributing ([#8226](https://github.com/NousResearch/hermes-agent/pull/8226))
+- **Use `uv`** for Docker dependency resolution to fix resolution-too-deep ([#6965](https://github.com/NousResearch/hermes-agent/pull/6965))
+- **Container-aware Nix CLI** — auto-route into managed container — @alt-glitch ([#7543](https://github.com/NousResearch/hermes-agent/pull/7543))
+- **Nix shared-state permission model** for interactive CLI users — @alt-glitch ([#6796](https://github.com/NousResearch/hermes-agent/pull/6796))
+- **Per-profile subprocess HOME isolation** ([#7357](https://github.com/NousResearch/hermes-agent/pull/7357))
+- Profile paths fixed in Docker — profiles go to mounted volume ([#7170](https://github.com/NousResearch/hermes-agent/pull/7170))
+- Docker container gateway pathway hardened ([#8614](https://github.com/NousResearch/hermes-agent/pull/8614))
+- Enable unbuffered stdout for live Docker logs ([#6749](https://github.com/NousResearch/hermes-agent/pull/6749))
+- Install procps in Docker image — @HiddenPuppy ([#7032](https://github.com/NousResearch/hermes-agent/pull/7032))
+- Shallow git clone for faster installation — @sosyz ([#8396](https://github.com/NousResearch/hermes-agent/pull/8396))
+- `hermes update` always reset on stash conflict ([#7010](https://github.com/NousResearch/hermes-agent/pull/7010))
+- Write update exit code before gateway restart (cgroup kill race) ([#8288](https://github.com/NousResearch/hermes-agent/pull/8288))
+- Nix: `setupSecrets` optional, tirith runtime dep — @devorun, @ethernet8023 ([#6261](https://github.com/NousResearch/hermes-agent/pull/6261), [#6721](https://github.com/NousResearch/hermes-agent/pull/6721))
+- launchd stop uses `bootout` so `KeepAlive` doesn't respawn ([#7119](https://github.com/NousResearch/hermes-agent/pull/7119))
+
+---
+
+## 🐛 Notable Bug Fixes
+
+- Fix: `/model` switch not persisting across gateway messages ([#7081](https://github.com/NousResearch/hermes-agent/pull/7081))
+- Fix: session-scoped gateway model overrides ignored — @Hygaard ([#7662](https://github.com/NousResearch/hermes-agent/pull/7662))
+- Fix: compaction model context length ignoring config — 3 related issues ([#8258](https://github.com/NousResearch/hermes-agent/pull/8258), [#8107](https://github.com/NousResearch/hermes-agent/pull/8107))
+- Fix: OpenCode.ai context window resolved to 128K instead of 1M ([#6472](https://github.com/NousResearch/hermes-agent/pull/6472))
+- Fix: Codex fallback auth-store lookup — @cherifya ([#6462](https://github.com/NousResearch/hermes-agent/pull/6462))
+- Fix: duplicate completion notifications when process killed ([#7124](https://github.com/NousResearch/hermes-agent/pull/7124))
+- Fix: agent daemon thread prevents orphan CLI processes on tab close ([#8557](https://github.com/NousResearch/hermes-agent/pull/8557))
+- Fix: stale image attachment on text paste and voice input ([#7077](https://github.com/NousResearch/hermes-agent/pull/7077))
+- Fix: DM thread session seeding causing cross-thread contamination ([#7084](https://github.com/NousResearch/hermes-agent/pull/7084))
+- Fix: OpenClaw migration shows dry-run preview before executing ([#6769](https://github.com/NousResearch/hermes-agent/pull/6769))
+- Fix: auth errors misclassified as retryable — @kuishou68 ([#7027](https://github.com/NousResearch/hermes-agent/pull/7027))
+- Fix: Copilot-Integration-Id header missing ([#7083](https://github.com/NousResearch/hermes-agent/pull/7083))
+- Fix: ACP session capabilities — @luyao618 ([#6985](https://github.com/NousResearch/hermes-agent/pull/6985))
+- Fix: ACP PromptResponse usage from top-level fields ([#7086](https://github.com/NousResearch/hermes-agent/pull/7086))
+- Fix: several failing/flaky tests on main — @dsocolobsky ([#6777](https://github.com/NousResearch/hermes-agent/pull/6777))
+- Fix: backup marker filenames — @sprmn24 ([#8600](https://github.com/NousResearch/hermes-agent/pull/8600))
+- Fix: `NoneType` in fast_mode check — @0xbyt4 ([#7350](https://github.com/NousResearch/hermes-agent/pull/7350))
+- Fix: missing imports in uninstall.py — @JiayuuWang ([#7034](https://github.com/NousResearch/hermes-agent/pull/7034))
+
+---
+
+## 📚 Documentation
+
+- Platform adapter developer guide + WeCom Callback docs ([#7969](https://github.com/NousResearch/hermes-agent/pull/7969))
+- Cron troubleshooting guide ([#7122](https://github.com/NousResearch/hermes-agent/pull/7122))
+- Streaming timeout auto-detection for local LLMs ([#6990](https://github.com/NousResearch/hermes-agent/pull/6990))
+- Tool-use enforcement documentation expanded ([#7984](https://github.com/NousResearch/hermes-agent/pull/7984))
+- BlueBubbles pairing instructions ([#6548](https://github.com/NousResearch/hermes-agent/pull/6548))
+- Telegram proxy support section ([#6348](https://github.com/NousResearch/hermes-agent/pull/6348))
+- `hermes dump` and `hermes logs` CLI reference ([#6552](https://github.com/NousResearch/hermes-agent/pull/6552))
+- `tool_progress_overrides` configuration reference ([#6364](https://github.com/NousResearch/hermes-agent/pull/6364))
+- Compression model context length warning docs ([#7879](https://github.com/NousResearch/hermes-agent/pull/7879))
+
+---
+
+## 👥 Contributors
+
+**269 merged PRs** from **24 contributors** across **487 commits**.
+
+### Community Contributors
+- **@alt-glitch** (6 PRs) — Nix container-aware CLI, shared-state permissions, Matrix SQLite crypto store, bulk SSH/Modal file sync, Matrix mautrix compat
+- **@SHL0MS** (2 PRs) — Creative divergence strategies skill, creative ideation skill
+- **@sprmn24** (2 PRs) — Error classifier disambiguation, backup marker fix
+- **@nicoloboschi** — Hindsight memory plugin feature parity
+- **@Hygaard** — Session-scoped gateway model override fix
+- **@jarvis-phw** — Discord allowed_channels whitelist
+- **@Kathie-yu** — Honcho initOnSessionStart for tools mode
+- **@hermes-agent-dhabibi** — Discord forum channel topic inheritance
+- **@kira-ariaki** — Discord .log attachments and size limit
+- **@cherifya** — Codex fallback auth-store lookup
+- **@Cafexss** — Security: auth for session continuation
+- **@KUSH42** — Compaction context_length fix
+- **@kuishou68** — Auth error retryable classification fix
+- **@luyao618** — ACP session capabilities
+- **@ygd58** — HERMES_HOME_MODE env var override
+- **@0xbyt4** — Fast mode NoneType fix
+- **@JiayuuWang** — CLI uninstall import fix
+- **@HiddenPuppy** — Docker procps installation
+- **@dsocolobsky** — Test suite fixes
+- **@bobashopcashier** (1 PR) — Graceful gateway drain before restart (salvaged into #7503 from #7290)
+- **@benbarclay** — Docker image tag simplification
+- **@sosyz** — Shallow git clone for faster install
+- **@devorun** — Nix setupSecrets optional
+- **@ethernet8023** — Nix tirith runtime dep
+
+---
+
+**Full Changelog**: [v2026.4.8...v2026.4.13](https://github.com/NousResearch/hermes-agent/compare/v2026.4.8...v2026.4.13)
--- a/VECTOR_DB_RESEARCH_REPORT.md
+++ b/VECTOR_DB_RESEARCH_REPORT.md
@@ -0,0 +1,172 @@
+# Vector Database SOTA Research Report
+## For AI Agent Semantic Retrieval — April 2026
+
+---
+
+## Executive Summary
+
+Analysis of current vector database benchmarks, documentation, and production deployments for semantic retrieval in AI agents. Compared against existing Hermes session_search (SQLite FTS5) and holographic memory systems.
+
+---
+
+## 1. Retrieval Accuracy (Recall@10)
+
+| Database | HNSW Recall | IVF Recall | Notes |
+|----------|-------------|------------|-------|
+| **Qdrant** | 0.95-0.99 | N/A | Tunable via ef parameter |
+| **Milvus** | 0.95-0.99 | 0.85-0.95 | Multiple index support |
+| **Weaviate** | 0.95-0.98 | N/A | HNSW primary |
+| **Pinecone** | 0.95-0.99 | N/A | Managed, opaque tuning |
+| **ChromaDB** | 0.90-0.95 | N/A | Simpler, uses HNSW via hnswlib |
+| **pgvector** | 0.85-0.95 | 0.80-0.90 | Depends on tuning |
+| **SQLite-vss** | 0.80-0.90 | N/A | HNSW via sqlite-vss |
+| **Current FTS5** | ~0.60-0.75* | N/A | Keyword matching only |
+
+*FTS5 "recall" estimated: good for exact keywords, poor for semantic/paraphrased queries.
+
+---
+
+## 2. Latency Benchmarks (1M vectors, 768-dim, 10 neighbors)
+
+| Database | p50 (ms) | p99 (ms) | QPS | Notes |
+|----------|----------|----------|-----|-------|
+| **Qdrant** | 1-3 | 5-10 | 5,000-15,000 | Best self-hosted |
+| **Milvus** | 2-5 | 8-15 | 3,000-12,000 | Good distributed |
+| **Weaviate** | 3-8 | 10-25 | 2,000-8,000 | |
+| **Pinecone** | 5-15 | 20-50 | 1,000-5,000 | Managed overhead |
+| **ChromaDB** | 5-15 | 20-50 | 500-2,000 | Embedded mode |
+| **pgvector** | 10-50 | 50-200 | 200-1,000 | SQL overhead |
+| **SQLite-vss** | 10-30 | 50-150 | 300-800 | Limited scalability |
+| **Current FTS5** | 2-10 | 15-50 | 1,000-5,000 | No embedding cost |
+
+---
+
+## 3. Index Types Comparison
+
+### HNSW (Hierarchical Navigable Small World)
+- Best for: High recall, moderate memory, fast queries
+- Used by: Qdrant, Weaviate, ChromaDB, Milvus, pgvector, SQLite-vss
+- Memory: High (~1.5GB per 1M 768-dim vectors)
+- Key parameters: ef_construction (100-500), M (16-64), ef (64-256)
+
+### IVF (Inverted File Index)
+- Best for: Large datasets, memory-constrained
+- Used by: Milvus, pgvector
+- Memory: Lower (~0.5GB per 1M vectors)
+- Key parameters: nlist (100-10000), nprobe (10-100)
+
+### DiskANN / SPANN
+- Best for: 100M+ vectors on disk
+- Memory: Very low (~100MB index)
+
+### Quantization (SQ/PQ)
+- Memory reduction: 4-8x
+- Recall impact: -5-15%
+
+---
+
+## 4. Multi-Modal Support
+
+| Database | Text | Image | Audio | Video | Mixed Queries |
+|----------|------|-------|-------|-------|---------------|
+| Qdrant | ✅ | ✅ | ✅ | ✅ | ✅ (multi-vector) |
+| Milvus | ✅ | ✅ | ✅ | ✅ | ✅ (hybrid) |
+| Weaviate | ✅ | ✅ | ✅ | ✅ | ✅ (named vectors) |
+| Pinecone | ✅ | ✅ | ✅ | ✅ | Limited |
+| ChromaDB | ✅ | Via emb | Via emb | Via emb | Limited |
+| pgvector | ✅ | Via emb | Via emb | Via emb | Limited |
+| SQLite-vss | ✅ | Via emb | Via emb | Via emb | Limited |
+
+---
+
+## 5. Integration Patterns for AI Agents
+
+### Pattern A: Direct Search
+Query → Embedding → Vector DB → Top-K → LLM
+
+### Pattern B: Hybrid Search  
+Query → BM25 + Vector → Merge/Rerank → LLM
+
+### Pattern C: Multi-Stage
+Query → Vector DB (top-100) → Reranker (top-10) → LLM
+
+### Pattern D: Agent Memory with Trust + Decay
+Query → Vector → Score × Trust × Decay → Top-K → Summarize
+
+---
+
+## 6. Comparison with Current Systems
+
+### session_search (FTS5)
+Strengths: Zero deps, no embedding needed, fast for exact keywords
+Limitations: No semantic understanding, no cross-lingual, limited ranking
+
+### holographic/retrieval.py (HRR)
+Strengths: Compositional queries, contradiction detection, trust + decay
+Limitations: Requires numpy, O(n) scan, non-standard embedding space
+
+### Expected Gains from Vector DB:
+- Semantic recall: +30-50% for paraphrased queries
+- Cross-lingual: +60-80%
+- Fuzzy matching: +40-60%
+- Conceptual: +50-70%
+
+---
+
+## 7. Recommendations
+
+### Option 1: Qdrant (RECOMMENDED)
+- Best self-hosted performance
+- Rust implementation, native multi-vector
+- Tradeoff: Separate service deployment
+
+### Option 2: pgvector (CONSERVATIVE)
+- Zero new infrastructure if using PostgreSQL
+- Tradeoff: 5-10x slower than Qdrant
+
+### Option 3: SQLite-vss (LIGHTWEIGHT)
+- Minimal changes, embedded deployment
+- Tradeoff: Limited scalability (<100K vectors)
+
+### Option 4: Hybrid (BEST OF BOTH)
+Keep FTS5 + HRR and add Qdrant:
+- Vector (semantic) + FTS5 (keyword) + HRR (compositional)
+- Apply trust scoring + temporal decay
+
+---
+
+## 8. Embedding Models (2025-2026)
+
+| Model | Dimensions | Quality | Cost |
+|-------|-----------|---------|------|
+| OpenAI text-embedding-3-large | 3072 | Best | $$$ |
+| OpenAI text-embedding-3-small | 1536 | Good | $ |
+| BGE-M3 | 1024 | Best self-hosted | Free |
+| GTE-Qwen2 | 768-1024 | Good | Free |
+
+---
+
+## 9. Hardware Requirements (1M vectors, 768-dim)
+
+| Database | RAM (HNSW) | RAM (Quantized) |
+|----------|-----------|-----------------|
+| Qdrant | 8-16GB | 2-4GB |
+| Milvus | 16-32GB | 4-8GB |
+| pgvector | 4-8GB | N/A |
+| SQLite-vss | 2-4GB | N/A |
+
+---
+
+## 10. Conclusion
+
+Primary: Qdrant with hybrid search (vector + FTS5 + HRR)
+Key insight: Augment existing HRR system, don't replace it.
+
+Next steps:
+1. Deploy Qdrant in Docker for testing
+2. Benchmark embedding models
+3. Implement hybrid search prototype
+4. Measure recall improvement
+5. Evaluate operational complexity
+
+Report: April 2026 | Sources: ANN-Benchmarks, VectorDBBench, official docs
--- a/agent/a2a_mtls.py
+++ b/agent/a2a_mtls.py
@@ -0,0 +1,443 @@
+"""
+A2A mutual-TLS server — secure agent-to-agent communication.
+
+Each fleet agent runs an A2A server that:
+  - Presents its own TLS certificate (signed by the fleet CA).
+  - Requires the connecting peer to present a valid client certificate
+    also signed by the fleet CA.
+  - Rejects connections from unknown / self-signed peers.
+
+Usage (standalone):
+    python -m agent.a2a_mtls \\
+        --cert ~/.hermes/pki/agents/timmy/timmy.crt \\
+        --key  ~/.hermes/pki/agents/timmy/timmy.key \\
+        --ca   ~/.hermes/pki/ca/fleet-ca.crt \\
+        --host 0.0.0.0 --port 9443
+
+Environment variables (alternative to CLI flags):
+    HERMES_A2A_CERT   path to agent certificate
+    HERMES_A2A_KEY    path to agent private key
+    HERMES_A2A_CA     path to fleet CA certificate
+
+Refs #806
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import ssl
+import threading
+from http.server import BaseHTTPRequestHandler, HTTPServer
+from pathlib import Path
+from typing import Any, Callable, Dict, Optional
+from urllib.error import URLError
+from urllib.request import Request, urlopen
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# mTLS SSL context helpers
+# ---------------------------------------------------------------------------
+
+def build_server_ssl_context(
+    cert: str | Path,
+    key: str | Path,
+    ca: str | Path,
+) -> ssl.SSLContext:
+    """Return an SSLContext that presents *cert/key* and requires a valid
+    client certificate signed by *ca*.
+
+    Raises ``FileNotFoundError`` if any path is missing.
+    Raises ``ssl.SSLError`` if the files are malformed.
+    """
+    cert, key, ca = Path(cert), Path(key), Path(ca)
+    for p in (cert, key, ca):
+        if not p.exists():
+            raise FileNotFoundError(f"mTLS: file not found: {p}")
+
+    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+    ctx.minimum_version = ssl.TLSVersion.TLSv1_2
+    ctx.load_cert_chain(certfile=str(cert), keyfile=str(key))
+    ctx.load_verify_locations(cafile=str(ca))
+    # CERT_REQUIRED — reject peers that don't present a cert signed by *ca*.
+    ctx.verify_mode = ssl.CERT_REQUIRED
+    return ctx
+
+
+def build_client_ssl_context(
+    cert: str | Path,
+    key: str | Path,
+    ca: str | Path,
+) -> ssl.SSLContext:
+    """Return an SSLContext for an outgoing mTLS connection.
+
+    Presents *cert/key* as the client identity and verifies the server
+    certificate against *ca*.
+    """
+    cert, key, ca = Path(cert), Path(key), Path(ca)
+    for p in (cert, key, ca):
+        if not p.exists():
+            raise FileNotFoundError(f"mTLS client: file not found: {p}")
+
+    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+    ctx.minimum_version = ssl.TLSVersion.TLSv1_2
+    ctx.load_cert_chain(certfile=str(cert), keyfile=str(key))
+    ctx.load_verify_locations(cafile=str(ca))
+    ctx.verify_mode = ssl.CERT_REQUIRED
+    ctx.check_hostname = True
+    return ctx
+
+
+# ---------------------------------------------------------------------------
+# Minimal A2A HTTP request handler
+# ---------------------------------------------------------------------------
+
+class A2AHandler(BaseHTTPRequestHandler):
+    """Handles A2A requests over a mutually-authenticated TLS connection.
+
+    GET /.well-known/agent-card.json  — returns the local agent card.
+    POST /a2a/task                    — dispatches an A2A task (stub).
+    """
+
+    log_message = logger.debug  # route access log to Python logger
+
+    def do_GET(self) -> None:  # noqa: N802
+        if self.path in ("/.well-known/agent-card.json", "/agent-card.json"):
+            self._serve_agent_card()
+        else:
+            self._send_json(404, {"error": "not found"})
+
+    def do_POST(self) -> None:  # noqa: N802
+        if self.path == "/a2a/task":
+            self._handle_task()
+        else:
+            self._send_json(404, {"error": "not found"})
+
+    # ------------------------------------------------------------------
+    def _serve_agent_card(self) -> None:
+        try:
+            from agent.agent_card import get_agent_card_json
+            body = get_agent_card_json().encode()
+        except Exception as exc:
+            logger.warning("agent-card unavailable: %s", exc)
+            body = b'{"error": "agent card unavailable"}'
+        self._send_raw(200, "application/json", body)
+
+    def _handle_task(self) -> None:
+        length = int(self.headers.get("Content-Length", 0))
+        _body = self.rfile.read(length) if length else b""
+        # Stub: echo back a 202 Accepted with the peer CN so callers can
+        # confirm which agent processed the request.
+        peer_cn = _peer_cn(self.connection)
+        self._send_json(202, {"status": "accepted", "handled_by": peer_cn})
+
+    # ------------------------------------------------------------------
+    def _send_json(self, code: int, data: dict) -> None:
+        import json
+        body = json.dumps(data).encode()
+        self._send_raw(code, "application/json", body)
+
+    def _send_raw(self, code: int, content_type: str, body: bytes) -> None:
+        self.send_response(code)
+        self.send_header("Content-Type", content_type)
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def log_message(self, fmt: str, *args: object) -> None:  # type: ignore[override]
+        logger.debug("a2a: " + fmt, *args)
+
+
+def _peer_cn(conn: ssl.SSLSocket) -> Optional[str]:
+    """Extract the Common Name from the peer certificate, or None."""
+    try:
+        peer = conn.getpeercert()
+        if not peer:
+            return None
+        for rdn in peer.get("subject", ()):
+            for key, val in rdn:
+                if key == "commonName":
+                    return val
+    except Exception:
+        pass
+    return None
+
+
+# ---------------------------------------------------------------------------
+# Server lifecycle
+# ---------------------------------------------------------------------------
+
+class A2AServer:
+    """Mutual-TLS A2A server.
+
+    Example::
+
+        server = A2AServer(
+            cert="~/.hermes/pki/agents/timmy/timmy.crt",
+            key="~/.hermes/pki/agents/timmy/timmy.key",
+            ca="~/.hermes/pki/ca/fleet-ca.crt",
+        )
+        server.start()   # non-blocking (daemon thread)
+        ...
+        server.stop()
+    """
+
+    def __init__(
+        self,
+        cert: str | Path,
+        key: str | Path,
+        ca: str | Path,
+        host: str = "0.0.0.0",
+        port: int = 9443,
+    ) -> None:
+        self.cert = Path(cert).expanduser()
+        self.key = Path(key).expanduser()
+        self.ca = Path(ca).expanduser()
+        self.host = host
+        self.port = port
+        self._httpd: Optional[HTTPServer] = None
+        self._thread: Optional[threading.Thread] = None
+
+    def start(self, daemon: bool = True) -> None:
+        """Start the server in a background thread (default: daemon)."""
+        ssl_ctx = build_server_ssl_context(self.cert, self.key, self.ca)
+        self._httpd = HTTPServer((self.host, self.port), A2AHandler)
+        self._httpd.socket = ssl_ctx.wrap_socket(
+            self._httpd.socket, server_side=True
+        )
+        self._thread = threading.Thread(
+            target=self._httpd.serve_forever, daemon=daemon
+        )
+        self._thread.start()
+        logger.info(
+            "A2A mTLS server listening on %s:%s (cert=%s)",
+            self.host, self.port, self.cert.name,
+        )
+
+    def stop(self) -> None:
+        if self._httpd:
+            self._httpd.shutdown()
+            self._httpd = None
+        if self._thread:
+            self._thread.join(timeout=5)
+            self._thread = None
+
+
+def server_from_env() -> A2AServer:
+    """Build an A2AServer from environment variables / defaults."""
+    hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
+    agent_name = os.environ.get("HERMES_AGENT_NAME", "hermes").lower()
+
+    default_cert = hermes_home / "pki" / "agents" / agent_name / f"{agent_name}.crt"
+    default_key = hermes_home / "pki" / "agents" / agent_name / f"{agent_name}.key"
+    default_ca = hermes_home / "pki" / "ca" / "fleet-ca.crt"
+
+    cert = os.environ.get("HERMES_A2A_CERT", str(default_cert))
+    key = os.environ.get("HERMES_A2A_KEY", str(default_key))
+    ca = os.environ.get("HERMES_A2A_CA", str(default_ca))
+    host = os.environ.get("HERMES_A2A_HOST", "0.0.0.0")
+    port = int(os.environ.get("HERMES_A2A_PORT", "9443"))
+
+    return A2AServer(cert=cert, key=key, ca=ca, host=host, port=port)
+
+
+# ---------------------------------------------------------------------------
+# CLI entry point
+# ---------------------------------------------------------------------------
+
+def _main() -> None:
+    import argparse
+
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
+
+    parser = argparse.ArgumentParser(
+        description="Hermes A2A mutual-TLS server"
+    )
+    parser.add_argument("--cert", required=True, help="Path to agent certificate")
+    parser.add_argument("--key", required=True, help="Path to agent private key")
+    parser.add_argument("--ca", required=True, help="Path to fleet CA certificate")
+    parser.add_argument("--host", default="0.0.0.0")
+    parser.add_argument("--port", type=int, default=9443)
+    args = parser.parse_args()
+
+    server = A2AServer(
+        cert=args.cert, key=args.key, ca=args.ca,
+        host=args.host, port=args.port,
+    )
+    server.start(daemon=False)
+
+
+if __name__ == "__main__":
+    _main()
+
+
+# ---------------------------------------------------------------------------
+# A2AMTLSServer — routing-based server with context-manager support
+# ---------------------------------------------------------------------------
+
+class _RoutingHandler(BaseHTTPRequestHandler):
+    """HTTP request handler that dispatches to per-path callables."""
+
+    routes: Dict[str, Callable] = {}
+
+    def log_message(self, fmt: str, *args: Any) -> None:
+        logger.debug("A2AMTLSServer: " + fmt, *args)
+
+    def _peer_cn(self) -> Optional[str]:
+        cert = self.connection.getpeercert()  # type: ignore[attr-defined]
+        if not cert:
+            return None
+        for rdn in cert.get("subject", ()):
+            for attr, value in rdn:
+                if attr == "commonName":
+                    return value
+        return None
+
+    def do_POST(self) -> None:
+        handler = self.routes.get(self.path)
+        if handler is None:
+            self.send_response(404)
+            self.end_headers()
+            return
+        length = int(self.headers.get("Content-Length", 0))
+        body = self.rfile.read(length) if length else b""
+        try:
+            payload = json.loads(body) if body else {}
+        except json.JSONDecodeError:
+            self.send_response(400)
+            self.end_headers()
+            return
+        result = handler(payload, peer_cn=self._peer_cn())
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json")
+        self.end_headers()
+        self.wfile.write(json.dumps(result).encode())
+
+    def do_GET(self) -> None:
+        handler = self.routes.get(self.path)
+        if handler is None:
+            self.send_response(404)
+            self.end_headers()
+            return
+        result = handler({}, peer_cn=self._peer_cn())
+        self.send_response(200)
+        self.send_header("Content-Type", "application/json")
+        self.end_headers()
+        self.wfile.write(json.dumps(result).encode())
+
+
+class A2AMTLSServer:
+    """Routing-based mTLS HTTPS server with context-manager support.
+
+    Unlike ``A2AServer`` (which serves fixed A2A paths), this server lets
+    callers register arbitrary path handlers — useful for tests and custom
+    A2A endpoint implementations.
+
+    handler signature: ``handler(payload: dict, *, peer_cn: str | None) -> dict``
+
+    Example::
+
+        server = A2AMTLSServer(cert="timmy.crt", key="timmy.key", ca="fleet-ca.crt")
+        server.add_route("/tasks/send", my_handler)
+        with server:
+            ...  # server runs for the duration of the block
+    """
+
+    def __init__(
+        self,
+        cert: str | Path,
+        key: str | Path,
+        ca: str | Path,
+        host: str = "127.0.0.1",
+        port: int = 9443,
+    ) -> None:
+        self.cert = Path(cert).expanduser()
+        self.key = Path(key).expanduser()
+        self.ca = Path(ca).expanduser()
+        self.host = host
+        self.port = port
+        self._routes: Dict[str, Callable] = {}
+        self._httpd: Optional[HTTPServer] = None
+        self._thread: Optional[threading.Thread] = None
+
+    def add_route(self, path: str, handler: Callable) -> None:
+        self._routes[path] = handler
+
+    def start(self) -> None:
+        ssl_ctx = build_server_ssl_context(self.cert, self.key, self.ca)
+
+        class _Handler(_RoutingHandler):
+            routes = self._routes
+
+        self._httpd = HTTPServer((self.host, self.port), _Handler)
+        self._httpd.socket = ssl_ctx.wrap_socket(self._httpd.socket, server_side=True)
+        self._thread = threading.Thread(
+            target=self._httpd.serve_forever,
+            daemon=True,
+            name=f"a2a-mtls-{self.port}",
+        )
+        self._thread.start()
+        logger.info("A2AMTLSServer on %s:%d (mTLS)", self.host, self.port)
+
+    def stop(self) -> None:
+        if self._httpd:
+            self._httpd.shutdown()
+            self._httpd = None
+        if self._thread:
+            self._thread.join(timeout=5)
+            self._thread = None
+
+    def __enter__(self) -> "A2AMTLSServer":
+        self.start()
+        return self
+
+    def __exit__(self, *_: Any) -> None:
+        self.stop()
+
+
+# ---------------------------------------------------------------------------
+# A2AMTLSClient — mTLS HTTP client
+# ---------------------------------------------------------------------------
+
+class A2AMTLSClient:
+    """HTTP client that presents a fleet cert on every outgoing connection.
+
+    Example::
+
+        client = A2AMTLSClient(cert="allegro.crt", key="allegro.key", ca="fleet-ca.crt")
+        result = client.post("https://timmy:9443/tasks/send", json={"task": "..."})
+    """
+
+    def __init__(
+        self,
+        cert: str | Path,
+        key: str | Path,
+        ca: str | Path,
+    ) -> None:
+        self._ssl_ctx = build_client_ssl_context(cert, key, ca)
+        self._ssl_ctx.check_hostname = False  # callers connecting by IP
+
+    def _request(
+        self,
+        method: str,
+        url: str,
+        data: Optional[bytes] = None,
+        timeout: float = 10.0,
+    ) -> Dict[str, Any]:
+        headers = {"Content-Type": "application/json"}
+        req = Request(url, data=data, headers=headers, method=method)
+        try:
+            with urlopen(req, context=self._ssl_ctx, timeout=timeout) as resp:
+                body = resp.read()
+                return json.loads(body) if body else {}
+        except URLError as exc:
+            raise ConnectionError(f"A2AMTLSClient {method} {url} failed: {exc.reason}") from exc
+
+    def get(self, url: str, **kwargs: Any) -> Dict[str, Any]:
+        return self._request("GET", url, **kwargs)
+
+    def post(self, url: str, json: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Dict[str, Any]:
+        data = (__import__("json").dumps(json).encode() if json is not None else None)
+        return self._request("POST", url, data=data, **kwargs)
--- a/agent/agent_card.py
+++ b/agent/agent_card.py
@@ -0,0 +1,135 @@
+"""
+Agent Card — A2A-compliant agent discovery.
+Part of #843: fix: implement A2A agent card for fleet discovery (#819)
+
+Provides metadata about the agent's identity, capabilities, and installed skills
+for discovery by other agents in the fleet.
+"""
+
+import json
+import logging
+import os
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from hermes_cli import __version__
+from hermes_cli.config import load_config, get_hermes_home
+from agent.skill_utils import (
+    iter_skill_index_files,
+    parse_frontmatter,
+    get_all_skills_dirs,
+    get_disabled_skill_names,
+    skill_matches_platform
+)
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class AgentSkill:  # Metadata for one skill listed on the agent card.
+    id: str  # filled from the skill's frontmatter name or its directory name
+    name: str  # display name taken from the skill frontmatter
+    description: str = ""
+    version: str = "1.0.0"  # used when the frontmatter declares no version
+
+@dataclass
+class AgentCapabilities:  # Feature flags advertised on the agent card.
+    streaming: bool = True
+    tools: bool = True
+    vision: bool = False  # off by default; build_agent_card() does not detect it yet
+    reasoning: bool = False  # off by default; build_agent_card() does not detect it yet
+
+@dataclass
+class AgentCard:  # Top-level discovery document; serialized with dataclasses.asdict().
+    name: str
+    description: str
+    url: str  # base URL of the agent, built as "http://<host>:<port>"
+    version: str = __version__  # defaults to the hermes_cli package version
+    capabilities: AgentCapabilities = field(default_factory=AgentCapabilities)
+    skills: List[AgentSkill] = field(default_factory=list)
+    defaultInputModes: List[str] = field(default_factory=lambda: ["text/plain"])  # camelCase presumably mirrors the A2A schema — TODO confirm
+    defaultOutputModes: List[str] = field(default_factory=lambda: ["text/plain"])
+
+def _load_skills() -> List[AgentSkill]:
+    """Scan all enabled skills and return metadata."""
+    skills = []
+    disabled = get_disabled_skill_names()
+    
+    for skills_dir in get_all_skills_dirs():
+        if not skills_dir.is_dir():
+            continue
+        for skill_file in iter_skill_index_files(skills_dir, "SKILL.md"):
+            try:
+                raw = skill_file.read_text(encoding="utf-8")
+                frontmatter, _ = parse_frontmatter(raw)
+            except Exception:
+                continue
+
+            skill_name = frontmatter.get("name") or skill_file.parent.name
+            if str(skill_name) in disabled:
+                continue
+            if not skill_matches_platform(frontmatter):
+                continue
+
+            skills.append(AgentSkill(
+                id=str(skill_name),
+                name=str(frontmatter.get("name", skill_name)),
+                description=str(frontmatter.get("description", "")),
+                version=str(frontmatter.get("version", "1.0.0"))
+            ))
+    return skills
+
+def build_agent_card() -> AgentCard:
+    """Build the agent card from current configuration and environment."""
+    config = load_config()
+    
+    # Identity
+    name = os.environ.get("HERMES_AGENT_NAME") or config.get("agent", {}).get("name") or "hermes"
+    description = os.environ.get("HERMES_AGENT_DESCRIPTION") or config.get("agent", {}).get("description") or "Sovereign AI agent"
+    
+    # URL - try to determine from environment or config
+    port = os.environ.get("HERMES_WEB_PORT") or "9119"
+    host = os.environ.get("HERMES_WEB_HOST") or "localhost"
+    url = f"http://{host}:{port}"
+    
+    # Capabilities
+    # In a real scenario, we'd check model metadata for vision/reasoning
+    capabilities = AgentCapabilities(
+        streaming=True,
+        tools=True,
+        vision=False, # Default to false unless we can confirm
+        reasoning=False
+    )
+    
+    # Skills
+    skills = _load_skills()
+    
+    return AgentCard(
+        name=name,
+        description=description,
+        url=url,
+        version=__version__,
+        capabilities=capabilities,
+        skills=skills
+    )
+
+def get_agent_card_json() -> str:
+    """Return the agent card as a JSON string."""
+    try:
+        card = build_agent_card()
+        return json.dumps(asdict(card), indent=2)
+    except Exception as e:
+        logger.error(f"Failed to build agent card: {e}")
+        # Minimal fallback card
+        fallback = {
+            "name": "hermes",
+            "description": "Sovereign AI agent (fallback)",
+            "version": __version__,
+            "error": str(e)
+        }
+        return json.dumps(fallback, indent=2)
+
+def validate_agent_card(card_data: Dict[str, Any]) -> bool:
+    """Check if the card data complies with the A2A schema."""
+    required = ["name", "description", "url", "version"]
+    return all(k in card_data for k in required)
--- a/agent/anthropic_adapter.py
+++ b/agent/anthropic_adapter.py
@@ -1230,9 +1230,10 @@ def build_anthropic_kwargs(
    When *base_url* points to a third-party Anthropic-compatible endpoint,
    thinking block signatures are stripped (they are Anthropic-proprietary).

-    When *fast_mode* is True, adds ``speed: "fast"`` and the fast-mode beta
-    header for ~2.5x faster output throughput on Opus 4.6.  Currently only
-    supported on native Anthropic endpoints (not third-party compatible ones).
+    When *fast_mode* is True, adds ``extra_body["speed"] = "fast"`` and the
+    fast-mode beta header for ~2.5x faster output throughput on Opus 4.6.
+    Currently only supported on native Anthropic endpoints (not third-party
+    compatible ones).
    """
    system, anthropic_messages = convert_messages_to_anthropic(messages, base_url=base_url)
    anthropic_tools = convert_tools_to_anthropic(tools) if tools else []
@@ -1333,11 +1334,11 @@ def build_anthropic_kwargs(
                kwargs["max_tokens"] = max(effective_max_tokens, budget + 4096)

    # ── Fast mode (Opus 4.6 only) ────────────────────────────────────
-    # Adds speed:"fast" + the fast-mode beta header for ~2.5x output speed.
-    # Only for native Anthropic endpoints — third-party providers would
-    # reject the unknown beta header and speed parameter.
+    # Adds extra_body.speed="fast" + the fast-mode beta header for ~2.5x
+    # output speed. Only for native Anthropic endpoints — third-party
+    # providers would reject the unknown beta header and speed parameter.
    if fast_mode and not _is_third_party_anthropic_endpoint(base_url):
-        kwargs["speed"] = "fast"
+        kwargs.setdefault("extra_body", {})["speed"] = "fast"
        # Build extra_headers with ALL applicable betas (the per-request
        # extra_headers override the client-level anthropic-beta header).
        betas = list(_common_betas_for_base_url(base_url))
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -1,4 +1,7 @@
+# NOTE(review): this import precedes the module docstring, so the string below
+# no longer populates ``__doc__``; consider moving it under the docstring.
+from agent.telemetry_logger import log_token_usage
 """Shared auxiliary client router for side tasks.

 Provides a single resolution chain so every consumer (context compression,
 session search, web extraction, vision analysis, browser vision) picks up
@@ -23,18 +23,10 @@ Resolution order for vision/multimodal tasks (auto mode):
  6. Custom endpoint (for local vision models: Qwen-VL, LLaVA, Pixtral, etc.)
  7. None

-Per-task provider overrides (e.g. AUXILIARY_VISION_PROVIDER,
-CONTEXT_COMPRESSION_PROVIDER) can force a specific provider for each task.
+Per-task overrides are configured in config.yaml under the ``auxiliary:`` section
+(e.g. ``auxiliary.vision.provider``, ``auxiliary.compression.model``).
 Default "auto" follows the chains above.

-Per-task model overrides (e.g. AUXILIARY_VISION_MODEL,
-AUXILIARY_WEB_EXTRACT_MODEL) let callers use a different model slug
-than the provider's default.
-
-Per-task direct endpoint overrides (e.g. AUXILIARY_VISION_BASE_URL,
-AUXILIARY_VISION_API_KEY) let callers route a specific auxiliary task to a
-custom OpenAI-compatible endpoint without touching the main model settings.
-
 Payment / credit exhaustion fallback:
  When a resolved provider returns HTTP 402 or a credit-related error,
  call_llm() automatically retries with the next available provider in the
@@ -72,6 +64,8 @@ _PROVIDER_ALIASES = {
    "zhipu": "zai",
    "kimi": "kimi-coding",
    "moonshot": "kimi-coding",
+    "kimi-cn": "kimi-coding-cn",
+    "moonshot-cn": "kimi-coding-cn",
    "minimax-china": "minimax-cn",
    "minimax_cn": "minimax-cn",
    "claude": "anthropic",
@@ -79,13 +73,13 @@ _PROVIDER_ALIASES = {
 }


-def _normalize_aux_provider(provider: Optional[str], *, for_vision: bool = False) -> str:
+def _normalize_aux_provider(provider: Optional[str]) -> str:
    normalized = (provider or "auto").strip().lower()
    if normalized.startswith("custom:"):
        suffix = normalized.split(":", 1)[1].strip()
        if not suffix:
            return "custom"
-        normalized = suffix if not for_vision else "custom"
+        normalized = suffix
    if normalized == "codex":
        return "openai-codex"
    if normalized == "main":
@@ -102,6 +96,7 @@ _API_KEY_PROVIDER_AUX_MODELS: Dict[str, str] = {
    "gemini": "gemini-3-flash-preview",
    "zai": "glm-4.5-flash",
    "kimi-coding": "kimi-k2-turbo-preview",
+    "kimi-coding-cn": "kimi-k2-turbo-preview",
    "minimax": "MiniMax-M2.7",
    "minimax-cn": "MiniMax-M2.7",
    "anthropic": "claude-haiku-4-5-20251001",
@@ -111,6 +106,15 @@ _API_KEY_PROVIDER_AUX_MODELS: Dict[str, str] = {
    "kilocode": "google/gemini-3-flash-preview",
 }

+# Vision-specific model overrides for direct providers.
+# When the user's main provider has a dedicated vision/multimodal model that
+# differs from their main chat model, map it here.  The vision auto-detect
+# "exotic provider" branch checks this before falling back to the main model.
+_PROVIDER_VISION_MODELS: Dict[str, str] = {
+    "xiaomi": "mimo-v2-omni",
+    "zai": "glm-5v-turbo",
+}
+
 # OpenRouter app attribution headers
 _OR_HEADERS = {
    "HTTP-Referer": "https://hermes-agent.nousresearch.com",
@@ -392,7 +396,9 @@ class _CodexCompletionsAdapter:
                    prompt_tokens=getattr(resp_usage, "input_tokens", 0),
                    completion_tokens=getattr(resp_usage, "output_tokens", 0),
                    total_tokens=getattr(resp_usage, "total_tokens", 0),
                )
+                # NOTE(review): assumes ``usage`` and ``model`` are bound here — confirm.
+                log_token_usage(usage.prompt_tokens, usage.completion_tokens, model)
        except Exception as exc:
            logger.debug("Codex auxiliary Responses API call failed: %s", exc)
            raise
@@ -525,7 +529,9 @@ class _AnthropicCompletionsAdapter:
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=total_tokens,
            )
+            # NOTE(review): assumes ``usage`` and ``model`` are bound here — confirm.
+            log_token_usage(usage.prompt_tokens, usage.completion_tokens, model)

        choice = SimpleNamespace(
            index=0,
@@ -749,30 +753,6 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:

 # ── Provider resolution helpers ─────────────────────────────────────────────

-def _get_auxiliary_provider(task: str = "") -> str:
-    """Read the provider override for a specific auxiliary task.
-
-    Checks AUXILIARY_{TASK}_PROVIDER first (e.g. AUXILIARY_VISION_PROVIDER),
-    then CONTEXT_{TASK}_PROVIDER (for the compression section's summary_provider),
-    then falls back to "auto".  Returns one of: "auto", "openrouter", "nous", "main".
-    """
-    if task:
-        for prefix in ("AUXILIARY_", "CONTEXT_"):
-            val = os.getenv(f"{prefix}{task.upper()}_PROVIDER", "").strip().lower()
-            if val and val != "auto":
-                return val
-    return "auto"
-
-
-def _get_auxiliary_env_override(task: str, suffix: str) -> Optional[str]:
-    """Read an auxiliary env override from AUXILIARY_* or CONTEXT_* prefixes."""
-    if not task:
-        return None
-    for prefix in ("AUXILIARY_", "CONTEXT_"):
-        val = os.getenv(f"{prefix}{task.upper()}_{suffix}", "").strip()
-        if val:
-            return val
-    return None


 def _try_openrouter() -> Tuple[Optional[OpenAI], Optional[str]]:
@@ -1017,6 +997,23 @@ _AUTO_PROVIDER_LABELS = {

 _AGGREGATOR_PROVIDERS = frozenset({"openrouter", "nous"})

+_MAIN_RUNTIME_FIELDS = ("provider", "model", "base_url", "api_key", "api_mode")
+
+
+def _normalize_main_runtime(main_runtime: Optional[Dict[str, Any]]) -> Dict[str, str]:
+    """Return a cleaned copy of a live main-runtime override.
+
+    Keeps only non-blank string fields (stripped); the provider is lower-cased.
+    """
+    if not isinstance(main_runtime, dict):
+        return {}
+    cleaned: Dict[str, str] = {}
+    for key in _MAIN_RUNTIME_FIELDS:
+        raw = main_runtime.get(key)
+        if isinstance(raw, str) and raw.strip():
+            cleaned[key] = raw.strip().lower() if key == "provider" else raw.strip()
+    return cleaned
+

 def _get_provider_chain() -> List[tuple]:
    """Return the ordered provider detection chain.
@@ -1126,7 +1123,7 @@ def _try_payment_fallback(
    return None, None, ""


-def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
+def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Optional[OpenAI], Optional[str]]:
    """Full auto-detection chain.

    Priority:
@@ -1138,6 +1135,12 @@ def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
    """
    global auxiliary_is_nous, _stale_base_url_warned
    auxiliary_is_nous = False  # Reset — _try_nous() will set True if it wins
+    runtime = _normalize_main_runtime(main_runtime)
+    runtime_provider = runtime.get("provider", "")
+    runtime_model = runtime.get("model", "")
+    runtime_base_url = runtime.get("base_url", "")
+    runtime_api_key = runtime.get("api_key", "")
+    runtime_api_mode = runtime.get("api_mode", "")

    # ── Warn once if OPENAI_BASE_URL is set but config.yaml uses a named
    #    provider (not 'custom').  This catches the common "env poisoning"
@@ -1145,7 +1148,7 @@ def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
    #    old OPENAI_BASE_URL lingers in ~/.hermes/.env. ──
    if not _stale_base_url_warned:
        _env_base = os.getenv("OPENAI_BASE_URL", "").strip()
-        _cfg_provider = _read_main_provider()
+        _cfg_provider = runtime_provider or _read_main_provider()
        if (_env_base and _cfg_provider
                and _cfg_provider != "custom"
                and not _cfg_provider.startswith("custom:")):
@@ -1159,12 +1162,25 @@ def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
            _stale_base_url_warned = True

    # ── Step 1: non-aggregator main provider → use main model directly ──
-    main_provider = _read_main_provider()
-    main_model = _read_main_model()
+    main_provider = runtime_provider or _read_main_provider()
+    main_model = runtime_model or _read_main_model()
    if (main_provider and main_model
            and main_provider not in _AGGREGATOR_PROVIDERS
            and main_provider not in ("auto", "")):
-        client, resolved = resolve_provider_client(main_provider, main_model)
+        resolved_provider = main_provider
+        explicit_base_url = None
+        explicit_api_key = None
+        if runtime_base_url and (main_provider == "custom" or main_provider.startswith("custom:")):
+            resolved_provider = "custom"
+            explicit_base_url = runtime_base_url
+            explicit_api_key = runtime_api_key or None
+        client, resolved = resolve_provider_client(
+            resolved_provider,
+            main_model,
+            explicit_base_url=explicit_base_url,
+            explicit_api_key=explicit_api_key,
+            api_mode=runtime_api_mode or None,
+        )
        if client is not None:
            logger.info("Auxiliary auto-detect: using main provider %s (%s)",
                        main_provider, resolved or main_model)
@@ -1208,6 +1224,12 @@ def _to_async_client(sync_client, model: str):
        return AsyncCodexAuxiliaryClient(sync_client), model
    if isinstance(sync_client, AnthropicAuxiliaryClient):
        return AsyncAnthropicAuxiliaryClient(sync_client), model
+    try:
+        from agent.copilot_acp_client import CopilotACPClient
+        if isinstance(sync_client, CopilotACPClient):
+            return sync_client, model
+    except ImportError:
+        pass

    async_kwargs = {
        "api_key": sync_client.api_key,
@@ -1245,6 +1267,7 @@ def resolve_provider_client(
    explicit_base_url: str = None,
    explicit_api_key: str = None,
    api_mode: str = None,
+    main_runtime: Optional[Dict[str, Any]] = None,
 ) -> Tuple[Optional[Any], Optional[str]]:
    """Central router: given a provider name and optional model, return a
    configured client with the correct auth, base URL, and API format.
@@ -1315,7 +1338,7 @@ def resolve_provider_client(

    # ── Auto: try all providers in priority order ────────────────────
    if provider == "auto":
-        client, resolved = _resolve_auto()
+        client, resolved = _resolve_auto(main_runtime=main_runtime)
        if client is None:
            return None, None
        # When auto-detection lands on a non-OpenRouter provider (e.g. a
@@ -1425,10 +1448,14 @@ def resolve_provider_client(
        custom_entry = _get_named_custom_provider(provider)
        if custom_entry:
            custom_base = custom_entry.get("base_url", "").strip()
-            custom_key = custom_entry.get("api_key", "").strip() or "no-key-required"
+            custom_key = custom_entry.get("api_key", "").strip()
+            custom_key_env = custom_entry.get("key_env", "").strip()
+            if not custom_key and custom_key_env:
+                custom_key = os.getenv(custom_key_env, "").strip()
+            custom_key = custom_key or "no-key-required"
            if custom_base:
                final_model = _normalize_resolved_model(
-                    model or _read_main_model() or "gpt-4o-mini",
+                    model or custom_entry.get("model") or _read_main_model() or "gpt-4o-mini",
                    provider,
                )
                client = OpenAI(api_key=custom_key, base_url=custom_base)
@@ -1447,7 +1474,11 @@ def resolve_provider_client(

    # ── API-key providers from PROVIDER_REGISTRY ─────────────────────
    try:
-        from hermes_cli.auth import PROVIDER_REGISTRY, resolve_api_key_provider_credentials
+        from hermes_cli.auth import (
+            PROVIDER_REGISTRY,
+            resolve_api_key_provider_credentials,
+            resolve_external_process_provider_credentials,
+        )
    except ImportError:
        logger.debug("hermes_cli.auth not available for provider %s", provider)
        return None, None
@@ -1521,6 +1552,41 @@ def resolve_provider_client(
        return (_to_async_client(client, final_model) if async_mode
                else (client, final_model))

+    if pconfig.auth_type == "external_process":
+        creds = resolve_external_process_provider_credentials(provider)
+        final_model = _normalize_resolved_model(model or _read_main_model(), provider)
+        if provider == "copilot-acp":
+            api_key = str(creds.get("api_key", "")).strip()
+            base_url = str(creds.get("base_url", "")).strip()
+            command = str(creds.get("command", "")).strip() or None
+            args = list(creds.get("args") or [])
+            if not final_model:
+                logger.warning(
+                    "resolve_provider_client: copilot-acp requested but no model "
+                    "was provided or configured"
+                )
+                return None, None
+            if not api_key or not base_url:
+                logger.warning(
+                    "resolve_provider_client: copilot-acp requested but external "
+                    "process credentials are incomplete"
+                )
+                return None, None
+            from agent.copilot_acp_client import CopilotACPClient
+
+            client = CopilotACPClient(
+                api_key=api_key,
+                base_url=base_url,
+                command=command,
+                args=args,
+            )
+            logger.debug("resolve_provider_client: %s (%s)", provider, final_model)
+            return (_to_async_client(client, final_model) if async_mode
+                    else (client, final_model))
+        logger.warning("resolve_provider_client: external-process provider %s not "
+                       "directly supported", provider)
+        return None, None
+
    elif pconfig.auth_type in ("oauth_device_code", "oauth_external"):
        # OAuth providers — route through their specific try functions
        if provider == "nous":
@@ -1539,15 +1605,19 @@ def resolve_provider_client(

 # ── Public API ──────────────────────────────────────────────────────────────

-def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optional[str]]:
+def get_text_auxiliary_client(
+    task: str = "",
+    *,
+    main_runtime: Optional[Dict[str, Any]] = None,
+) -> Tuple[Optional[OpenAI], Optional[str]]:
    """Return (client, default_model_slug) for text-only auxiliary tasks.

    Args:
        task: Optional task name ("compression", "web_extract") to check
              for a task-specific provider override.

-    Callers may override the returned model with a per-task env var
-    (e.g. CONTEXT_COMPRESSION_MODEL, AUXILIARY_WEB_EXTRACT_MODEL).
+    Callers may override the returned model via config.yaml
+    (e.g. auxiliary.compression.model, auxiliary.web_extract.model).
    """
    provider, model, base_url, api_key, api_mode = _resolve_task_provider_model(task or None)
    return resolve_provider_client(
@@ -1556,10 +1626,11 @@ def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optiona
        explicit_base_url=base_url,
        explicit_api_key=api_key,
        api_mode=api_mode,
+        main_runtime=main_runtime,
    )


-def get_async_text_auxiliary_client(task: str = ""):
+def get_async_text_auxiliary_client(task: str = "", *, main_runtime: Optional[Dict[str, Any]] = None):
    """Return (async_client, model_slug) for async consumers.

    For standard providers returns (AsyncOpenAI, model). For Codex returns
@@ -1574,6 +1645,7 @@ def get_async_text_auxiliary_client(task: str = ""):
        explicit_base_url=base_url,
        explicit_api_key=api_key,
        api_mode=api_mode,
+        main_runtime=main_runtime,
    )


@@ -1584,7 +1656,7 @@ _VISION_AUTO_PROVIDER_ORDER = (


 def _normalize_vision_provider(provider: Optional[str]) -> str:
-    return _normalize_aux_provider(provider, for_vision=True)
+    return _normalize_aux_provider(provider)


 def _resolve_strict_vision_backend(provider: str) -> Tuple[Optional[Any], Optional[str]]:
@@ -1667,6 +1739,7 @@ def resolve_vision_provider_client(
            async_mode=async_mode,
            explicit_base_url=resolved_base_url,
            explicit_api_key=resolved_api_key,
+            api_mode=resolved_api_mode,
        )
        if client is None:
            return "custom", None, None
@@ -1687,16 +1760,19 @@ def resolve_vision_provider_client(
                if sync_client is not None:
                    return _finalize(main_provider, sync_client, default_model)
            else:
-                # Exotic provider (DeepSeek, Alibaba, named custom, etc.)
+                # Exotic provider (DeepSeek, Alibaba, Xiaomi, named custom, etc.)
+                # Use provider-specific vision model if available, otherwise main model.
+                vision_model = _PROVIDER_VISION_MODELS.get(main_provider, main_model)
                rpc_client, rpc_model = resolve_provider_client(
-                    main_provider, main_model)
+                    main_provider, vision_model,
+                    api_mode=resolved_api_mode)
                if rpc_client is not None:
                    logger.info(
                        "Vision auto-detect: using active provider %s (%s)",
-                        main_provider, rpc_model or main_model,
+                        main_provider, rpc_model or vision_model,
                    )
                    return _finalize(
-                        main_provider, rpc_client, rpc_model or main_model)
+                        main_provider, rpc_client, rpc_model or vision_model)

        # Fall back through aggregators.
        for candidate in _VISION_AUTO_PROVIDER_ORDER:
@@ -1713,7 +1789,8 @@ def resolve_vision_provider_client(
        sync_client, default_model = _resolve_strict_vision_backend(requested)
        return _finalize(requested, sync_client, default_model)

-    client, final_model = _get_cached_client(requested, resolved_model, async_mode)
+    client, final_model = _get_cached_client(requested, resolved_model, async_mode,
+                                             api_mode=resolved_api_mode)
    if client is None:
        return requested, None, None
    return requested, client, final_model
@@ -1886,6 +1963,7 @@ def _get_cached_client(
    base_url: str = None,
    api_key: str = None,
    api_mode: str = None,
+    main_runtime: Optional[Dict[str, Any]] = None,
 ) -> Tuple[Optional[Any], Optional[str]]:
    """Get or create a cached client for the given provider.

@@ -1909,7 +1987,9 @@ def _get_cached_client(
            loop_id = id(current_loop)
        except RuntimeError:
            pass
-    cache_key = (provider, async_mode, base_url or "", api_key or "", api_mode or "", loop_id)
+    runtime = _normalize_main_runtime(main_runtime)
+    runtime_key = tuple(runtime.get(field, "") for field in _MAIN_RUNTIME_FIELDS) if provider == "auto" else ()
+    cache_key = (provider, async_mode, base_url or "", api_key or "", api_mode or "", loop_id, runtime_key)
    with _client_cache_lock:
        if cache_key in _client_cache:
            cached_client, cached_default, cached_loop = _client_cache[cache_key]
@@ -1934,6 +2014,7 @@ def _get_cached_client(
        explicit_base_url=base_url,
        explicit_api_key=api_key,
        api_mode=api_mode,
+        main_runtime=runtime,
    )
    if client is not None:
        # For async clients, remember which loop they were created on so we
@@ -1958,9 +2039,8 @@ def _resolve_task_provider_model(

    Priority:
      1. Explicit provider/model/base_url/api_key args (always win)
-      2. Env var overrides (AUXILIARY_{TASK}_*, CONTEXT_{TASK}_*)
-      3. Config file (auxiliary.{task}.* or compression.*)
-      4. "auto" (full auto-detection chain)
+      2. Config file (auxiliary.{task}.provider/model/base_url)
+      3. "auto" (full auto-detection chain)

    Returns (provider, model, base_url, api_key, api_mode) where model may
    be None (use provider default). When base_url is set, provider is forced
@@ -1991,21 +2071,8 @@ def _resolve_task_provider_model(
        cfg_api_key = str(task_config.get("api_key", "")).strip() or None
        cfg_api_mode = str(task_config.get("api_mode", "")).strip() or None

-        # Backwards compat: compression section has its own keys.
-        # The auxiliary.compression defaults to provider="auto", so treat
-        # both None and "auto" as "not explicitly configured".
-        if task == "compression" and (not cfg_provider or cfg_provider == "auto"):
-            comp = config.get("compression", {}) if isinstance(config, dict) else {}
-            if isinstance(comp, dict):
-                cfg_provider = comp.get("summary_provider", "").strip() or None
-                cfg_model = cfg_model or comp.get("summary_model", "").strip() or None
-                _sbu = comp.get("summary_base_url") or ""
-                cfg_base_url = cfg_base_url or _sbu.strip() or None
-
-    env_model = _get_auxiliary_env_override(task, "MODEL") if task else None
-    env_api_mode = _get_auxiliary_env_override(task, "API_MODE") if task else None
-    resolved_model = model or env_model or cfg_model
-    resolved_api_mode = env_api_mode or cfg_api_mode
+    resolved_model = model or cfg_model
+    resolved_api_mode = cfg_api_mode

    if base_url:
        return "custom", resolved_model, base_url, api_key, resolved_api_mode
@@ -2013,19 +2080,12 @@ def _resolve_task_provider_model(
        return provider, resolved_model, base_url, api_key, resolved_api_mode

    if task:
-        env_base_url = _get_auxiliary_env_override(task, "BASE_URL")
-        env_api_key = _get_auxiliary_env_override(task, "API_KEY")
-        if env_base_url:
-            return "custom", resolved_model, env_base_url, env_api_key or cfg_api_key, resolved_api_mode
-
-        env_provider = _get_auxiliary_provider(task)
-        if env_provider != "auto":
-            return env_provider, resolved_model, None, None, resolved_api_mode
-
+        # Config.yaml is the primary source for per-task overrides.
        if cfg_base_url:
            return "custom", resolved_model, cfg_base_url, cfg_api_key, resolved_api_mode
        if cfg_provider and cfg_provider != "auto":
            return cfg_provider, resolved_model, None, None, resolved_api_mode
+
        return "auto", resolved_model, None, None, resolved_api_mode

    return "auto", resolved_model, None, None, resolved_api_mode
@@ -2054,6 +2114,75 @@ def _get_task_timeout(task: str, default: float = _DEFAULT_AUX_TIMEOUT) -> float
    return default


+# ---------------------------------------------------------------------------
+# Anthropic-compatible endpoint detection + image block conversion
+# ---------------------------------------------------------------------------
+
+# Providers reached through the OpenAI SDK wrapper whose endpoints are
+# Anthropic-compatible: their image blocks must be Anthropic-shaped.
+_ANTHROPIC_COMPAT_PROVIDERS = frozenset({"minimax", "minimax-cn"})
+
+
+def _is_anthropic_compat_endpoint(provider: str, base_url: str) -> bool:
+    """Report whether an endpoint expects Anthropic-format content blocks.
+
+    True for providers listed in ``_ANTHROPIC_COMPAT_PROVIDERS`` and for any
+    ``base_url`` containing ``/anthropic`` (case-insensitive substring match).
+    """
+    return (
+        provider in _ANTHROPIC_COMPAT_PROVIDERS
+        or "/anthropic" in (base_url or "").lower()
+    )
+
+
+def _convert_openai_images_to_anthropic(messages: list) -> list:
+    """Convert OpenAI ``image_url`` content blocks to Anthropic ``image`` blocks.
+
+    Messages whose content is not a list, and blocks that are not
+    ``image_url`` dicts, pass through unchanged.  ``data:`` URIs become
+    base64 sources; every other value becomes a URL source.  Messages that
+    need conversion are shallow-copied, so the input list is not mutated.
+    """
+    converted = []
+    for msg in messages:
+        content = msg.get("content")
+        if not isinstance(content, list):
+            converted.append(msg)
+            continue
+        new_content = []
+        changed = False
+        for block in content:
+            # Non-dict parts (e.g. bare strings) cannot be image blocks, and
+            # calling ``.get`` on them would raise AttributeError.
+            if not isinstance(block, dict) or block.get("type") != "image_url":
+                new_content.append(block)
+                continue
+            changed = True
+            ref = block.get("image_url") or {}
+            # Accept both {"url": ...} and a bare URL string for image_url.
+            url = ref if isinstance(ref, str) else str(ref.get("url") or "")
+            if url.startswith("data:"):
+                # Parse data URI: data:<media_type>;base64,<data>
+                header, _, b64data = url.partition(",")
+                media_type = "image/png"
+                if ";" in header:
+                    # An empty media type falls back to the default.
+                    media_type = header[len("data:"):].split(";", 1)[0] or media_type
+                source = {
+                    "type": "base64",
+                    "media_type": media_type,
+                    "data": b64data,
+                }
+            else:
+                # URL-based image
+                source = {"type": "url", "url": url}
+            new_content.append({"type": "image", "source": source})
+        # Reuse the original message object when nothing was converted.
+        converted.append({**msg, "content": new_content} if changed else msg)
+    return converted
+
+
+
 def _build_call_kwargs(
    provider: str,
    model: str,
@@ -2138,6 +2267,7 @@ def call_llm(
    model: str = None,
    base_url: str = None,
    api_key: str = None,
+    main_runtime: Optional[Dict[str, Any]] = None,
    messages: list,
    temperature: float = None,
    max_tokens: int = None,
@@ -2172,7 +2302,7 @@ def call_llm(
    resolved_provider, resolved_model, resolved_base_url, resolved_api_key, resolved_api_mode = _resolve_task_provider_model(
        task, provider, model, base_url, api_key)

-    if task == "vision":
+    if task in ("vision", "browser_vision"):
        effective_provider, client, final_model = resolve_vision_provider_client(
            provider=provider,
            model=model,
@@ -2203,6 +2333,7 @@ def call_llm(
            base_url=resolved_base_url,
            api_key=resolved_api_key,
            api_mode=resolved_api_mode,
+            main_runtime=main_runtime,
        )
        if client is None:
            # When the user explicitly chose a non-OpenRouter provider but no
@@ -2223,7 +2354,7 @@ def call_llm(
            if not resolved_base_url:
                logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain",
                            task or "call", resolved_provider)
-                client, final_model = _get_cached_client("auto")
+                client, final_model = _get_cached_client("auto", main_runtime=main_runtime)
        if client is None:
            raise RuntimeError(
                f"No LLM provider configured for task={task} provider={resolved_provider}. "
@@ -2244,6 +2375,11 @@ def call_llm(
        tools=tools, timeout=effective_timeout, extra_body=extra_body,
        base_url=resolved_base_url)

+    # Convert image blocks for Anthropic-compatible endpoints (e.g. MiniMax)
+    _client_base = str(getattr(client, "base_url", "") or "")
+    if _is_anthropic_compat_endpoint(resolved_provider, _client_base):
+        kwargs["messages"] = _convert_openai_images_to_anthropic(kwargs["messages"])
+
    # Handle max_tokens vs max_completion_tokens retry, then payment fallback.
    try:
        return _validate_llm_response(
@@ -2320,9 +2456,9 @@ def extract_content_or_reasoning(response) -> str:
    if content:
        # Strip inline think/reasoning blocks (mirrors _strip_think_blocks)
        cleaned = re.sub(
-            r"<(?:think|thinking|reasoning|REASONING_SCRATCHPAD)>"
+            r"<(?:think|thinking|reasoning|thought|REASONING_SCRATCHPAD)>"
            r".*?"
-            r"</(?:think|thinking|reasoning|REASONING_SCRATCHPAD)>",
+            r"</(?:think|thinking|reasoning|thought|REASONING_SCRATCHPAD)>",
            "", content, flags=re.DOTALL | re.IGNORECASE,
        ).strip()
        if cleaned:
@@ -2432,6 +2568,11 @@ async def async_call_llm(
        tools=tools, timeout=effective_timeout, extra_body=extra_body,
        base_url=resolved_base_url)

+    # Convert image blocks for Anthropic-compatible endpoints (e.g. MiniMax)
+    _client_base = str(getattr(client, "base_url", "") or "")
+    if _is_anthropic_compat_endpoint(resolved_provider, _client_base):
+        kwargs["messages"] = _convert_openai_images_to_anthropic(kwargs["messages"])
+
    try:
        return _validate_llm_response(
            await client.chat.completions.create(**kwargs), task)
--- a/agent/circuit_breaker.py
+++ b/agent/circuit_breaker.py
@@ -0,0 +1,273 @@
+"""
+Circuit Breaker for Error Cascading — #885
+
+P(error | prev was error) = 58.6% vs P(error | prev was success) = 25.2%.
+That's a 2.33x cascade factor. After 3 consecutive errors, the circuit
+opens and the agent must take corrective action.
+
+States:
+- CLOSED: Normal operation, errors are counted
+- OPEN: Too many consecutive errors, corrective action required
+- HALF_OPEN: Testing if errors have cleared
+
+Usage:
+    from agent.circuit_breaker import CircuitBreaker, ToolCircuitBreaker
+    
+    cb = ToolCircuitBreaker()
+    
+    # After each tool call
+    if not cb.record_result(success=True):
+        # Circuit is open — take corrective action
+        cb.get_recovery_action()
+"""
+
+import time
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+
+class CircuitState(Enum):
+    """States of the circuit breaker state machine."""
+
+    CLOSED = "closed"      # Normal operation
+    OPEN = "open"          # Too many errors, block execution
+    HALF_OPEN = "half_open"  # Testing recovery
+
+
+@dataclass
+class CircuitBreaker:
+    """
+    Generic circuit breaker with configurable thresholds.
+
+    Tracks consecutive errors and opens the circuit when the error streak
+    reaches ``failure_threshold``. Once ``recovery_timeout`` seconds have
+    passed since the last recorded failure, the breaker moves to HALF_OPEN,
+    and it closes again after ``success_threshold`` consecutive successes.
+    A failure while HALF_OPEN reopens the circuit immediately.
+    """
+    failure_threshold: int = 3
+    recovery_timeout: float = 30.0  # seconds before trying half-open
+    success_threshold: int = 2      # successes needed to close from half-open
+
+    state: CircuitState = field(default=CircuitState.CLOSED, init=False)
+    consecutive_failures: int = field(default=0, init=False)
+    consecutive_successes: int = field(default=0, init=False)
+    # Wall-clock time (time.time()) of the most recent recorded failure.
+    last_failure_time: Optional[float] = field(default=None, init=False)
+    # Number of times the circuit has transitioned to OPEN.
+    total_trips: int = field(default=0, init=False)
+    # Failure streak lengths captured when the circuit tripped from CLOSED.
+    error_streaks: List[int] = field(default_factory=list, init=False)
+
+    def _try_half_open(self, now: float) -> bool:
+        """Move OPEN -> HALF_OPEN once the recovery timeout has elapsed.
+
+        Args:
+            now: Current wall-clock time in seconds.
+
+        Returns:
+            True if the transition happened, False otherwise.
+        """
+        # Compare against None explicitly so a timestamp of 0.0 still counts.
+        if self.last_failure_time is None:
+            return False
+        if (now - self.last_failure_time) >= self.recovery_timeout:
+            self.state = CircuitState.HALF_OPEN
+            self.consecutive_successes = 0
+            return True
+        return False
+
+    def record_result(self, success: bool) -> bool:
+        """
+        Record a tool call result. Returns True if circuit allows execution.
+
+        While the circuit is OPEN and the recovery timeout has not elapsed,
+        the result is not counted. When the timeout has elapsed, the circuit
+        moves to HALF_OPEN and this result is evaluated as the probe call
+        rather than being discarded.
+
+        Returns:
+            True if circuit is CLOSED or HALF_OPEN (execution allowed)
+            False if circuit is OPEN (execution blocked)
+        """
+        now = time.time()
+
+        if self.state == CircuitState.OPEN and not self._try_half_open(now):
+            return False  # Still open
+
+        if success:
+            self.consecutive_failures = 0
+            self.consecutive_successes += 1
+
+            if self.state == CircuitState.HALF_OPEN:
+                if self.consecutive_successes >= self.success_threshold:
+                    self.state = CircuitState.CLOSED
+                    self.consecutive_successes = 0
+
+            return True
+
+        self.consecutive_successes = 0
+        self.consecutive_failures += 1
+        self.last_failure_time = now
+
+        if self.state == CircuitState.HALF_OPEN:
+            # Failed during recovery — reopen immediately
+            self.state = CircuitState.OPEN
+            self.total_trips += 1
+            return False
+
+        if self.consecutive_failures >= self.failure_threshold:
+            self.state = CircuitState.OPEN
+            self.total_trips += 1
+            self.error_streaks.append(self.consecutive_failures)
+            return False
+
+        return True
+
+    def can_execute(self) -> bool:
+        """Check if execution is allowed.
+
+        Has a side effect: an OPEN circuit whose recovery timeout has elapsed
+        is moved to HALF_OPEN.
+        """
+        if self.state != CircuitState.OPEN:
+            return True
+        return self._try_half_open(time.time())
+
+    def get_state(self) -> Dict[str, Any]:
+        """Get current circuit state as a plain dict."""
+        # Evaluate can_execute() first: it may move OPEN -> HALF_OPEN, and the
+        # reported "state" must agree with the reported "can_execute".
+        allowed = self.can_execute()
+        return {
+            "state": self.state.value,
+            "consecutive_failures": self.consecutive_failures,
+            "consecutive_successes": self.consecutive_successes,
+            "total_trips": self.total_trips,
+            "max_streak": max(self.error_streaks) if self.error_streaks else 0,
+            "can_execute": allowed,
+        }
+
+    def reset(self):
+        """Reset the circuit breaker to CLOSED and clear the streak counters.
+
+        ``total_trips`` and ``error_streaks`` are left untouched.
+        """
+        self.state = CircuitState.CLOSED
+        self.consecutive_failures = 0
+        self.consecutive_successes = 0
+        self.last_failure_time = None
+
+
+class ToolCircuitBreaker(CircuitBreaker):
+    """
+    Circuit breaker for tool-call error cascades.
+
+    Extends the generic breaker with recovery recommendations that depend
+    on the length of the current failure streak.
+    """
+
+    # Tools that are most effective at recovery (from audit data)
+    RECOVERY_TOOLS = [
+        "terminal",     # Most effective — 2300 recoveries
+        "read_file",    # Reset context by reading something
+        "search_files", # Find what went wrong
+    ]
+
+    def get_recovery_action(self) -> Dict[str, Any]:
+        """
+        Recommend a recovery action for the current failure streak.
+
+        Returns a dict with at least ``action``, ``reason`` and ``severity``.
+        """
+        failures = self.consecutive_failures
+
+        # After 9 errors: 41/46 recoveries via terminal
+        if failures >= 9:
+            return {
+                "action": "terminal_only",
+                "reason": f"Error streak of {failures} — terminal is the only reliable recovery",
+                "suggested_tool": "terminal",
+                "suggested_command": "echo 'Resetting context'",
+                "severity": "critical",
+            }
+
+        if failures >= 5:
+            return {
+                "action": "switch_tool_type",
+                "reason": f"Error streak of {failures} — switch to a different tool category",
+                "suggested_tools": ["read_file", "search_files", "terminal"],
+                "severity": "high",
+            }
+
+        if failures < self.failure_threshold:
+            return {
+                "action": "continue",
+                "reason": f"Error streak of {failures} — within tolerance",
+                "severity": "low",
+            }
+
+        return {
+            "action": "ask_user",
+            "reason": f"{failures} consecutive errors — ask user for guidance",
+            "suggested_response": "I'm encountering repeated errors. Would you like me to try a different approach?",
+            "severity": "medium",
+        }
+
+    def should_compress_context(self) -> bool:
+        """Return True once the failure streak has reached 5."""
+        streak = self.consecutive_failures
+        return streak >= 5
+
+    def get_blocked_tool(self) -> Optional[str]:
+        """Return the blocked-tool marker while the circuit is OPEN, else None."""
+        return "last_failed_tool" if self.state == CircuitState.OPEN else None
+
+
+class MultiToolCircuitBreaker:
+    """
+    Keeps one ToolCircuitBreaker per tool name plus a global failure streak.
+
+    The global streak counts consecutive failed results across all tools
+    and is cleared by any successful result.
+    """
+
+    def __init__(self):
+        self.breakers: Dict[str, ToolCircuitBreaker] = {}
+        self.global_streak: int = 0
+        self.last_tool: Optional[str] = None
+        self.last_success: bool = True
+
+    def get_breaker(self, tool_name: str) -> ToolCircuitBreaker:
+        """Return the breaker for ``tool_name``, creating it on first use."""
+        try:
+            return self.breakers[tool_name]
+        except KeyError:
+            created = ToolCircuitBreaker()
+            self.breakers[tool_name] = created
+            return created
+
+    def record_result(self, tool_name: str, success: bool) -> bool:
+        """
+        Record a tool call result on the tool's own breaker and on the
+        global streak. Returns what the per-tool breaker returned.
+        """
+        verdict = self.get_breaker(tool_name).record_result(success)
+
+        # Track global streak
+        self.global_streak = 0 if success else self.global_streak + 1
+        self.last_success = bool(success)
+        self.last_tool = tool_name
+
+        return verdict
+
+    def can_execute(self, tool_name: str) -> bool:
+        """Check whether the breaker for ``tool_name`` allows execution."""
+        return self.get_breaker(tool_name).can_execute()
+
+    def get_global_state(self) -> Dict[str, Any]:
+        """Summarize the global streak and every breaker that has seen trouble."""
+        # Per-tool snapshots are taken before "any_open" is computed, because
+        # get_state() can move a breaker out of OPEN.
+        troubled: Dict[str, Any] = {}
+        for tool, cb in self.breakers.items():
+            if cb.consecutive_failures > 0 or cb.total_trips > 0:
+                troubled[tool] = cb.get_state()
+
+        open_found = False
+        for cb in self.breakers.values():
+            if cb.state == CircuitState.OPEN:
+                open_found = True
+                break
+
+        return {
+            "global_streak": self.global_streak,
+            "last_tool": self.last_tool,
+            "last_success": self.last_success,
+            "tool_states": troubled,
+            "any_open": open_found,
+        }
+
+    def get_recovery_action(self) -> Dict[str, Any]:
+        """Return a recovery action taken from the breaker with the longest streak."""
+        if self.global_streak == 0:
+            return {"action": "continue", "reason": "No errors"}
+
+        # Find the breaker with the worst streak
+        candidate = max(
+            self.breakers.values(),
+            key=lambda cb: cb.consecutive_failures,
+            default=None,
+        )
+        if candidate is not None and candidate.consecutive_failures > 0:
+            return candidate.get_recovery_action()
+
+        return {
+            "action": "continue",
+            "reason": f"Global streak: {self.global_streak}",
+            "severity": "low",
+        }
+
+    def reset_all(self):
+        """Reset every per-tool breaker and clear the global streak."""
+        for cb in self.breakers.values():
+            cb.reset()
+        self.last_success = True
+        self.global_streak = 0
--- a/agent/context_budget.py
+++ b/agent/context_budget.py
@@ -0,0 +1,148 @@
+"""
+Context Budget Tracker - Prevent context window overflow
+
+Poka-yoke: Visual warnings at 70%, 85%, 95% capacity.
+Auto-checkpoint at 85%. Pre-flight token estimation.
+
+Issue: #838
+"""
+
+import json
+import logging
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+HERMES_HOME = Path.home() / ".hermes"
+CHECKPOINT_DIR = HERMES_HOME / "checkpoints"
+CHARS_PER_TOKEN = 4
+
+THRESHOLD_WARNING = 0.70
+THRESHOLD_CRITICAL = 0.85
+THRESHOLD_DANGER = 0.95
+
+
+class ContextBudget:
+    """Token accounting for one context window.
+
+    ``reserved_tokens`` is subtracted from ``context_limit`` to give the
+    usable capacity; system and conversation tokens count against it.
+    """
+
+    def __init__(self, context_limit: int = 128000, system_tokens: int = 0,
+                 used_tokens: int = 0, reserved_tokens: int = 2000):
+        self.context_limit = context_limit
+        self.system_tokens = system_tokens
+        self.used_tokens = used_tokens
+        self.reserved_tokens = reserved_tokens
+
+    @property
+    def total_used(self) -> int:
+        """System tokens plus conversation tokens."""
+        return self.used_tokens + self.system_tokens
+
+    @property
+    def available(self) -> int:
+        """Usable capacity after the reserve, never negative."""
+        usable = self.context_limit - self.reserved_tokens
+        return usable if usable > 0 else 0
+
+    @property
+    def remaining(self) -> int:
+        """Tokens still free within the usable capacity, never negative."""
+        free = self.available - self.total_used
+        return free if free > 0 else 0
+
+    @property
+    def utilization(self) -> float:
+        """Fraction of usable capacity consumed; 1.0 when there is no capacity."""
+        capacity = self.available
+        if capacity > 0:
+            return self.total_used / capacity
+        return 1.0
+
+
+def estimate_tokens(text: str) -> int:
+    """Roughly estimate the token count of ``text`` from its character length.
+
+    Empty or falsy input yields 0.
+    """
+    if not text:
+        return 0
+    return len(text) // CHARS_PER_TOKEN
+
+
+def estimate_messages_tokens(messages: List[Dict]) -> int:
+    """Roughly estimate the tokens used by a list of chat messages.
+
+    String content is estimated with ``estimate_tokens``. List content
+    (multimodal content blocks) contributes the text carried by its blocks,
+    so such messages are no longer counted as zero. Any message with a
+    non-empty ``tool_calls`` entry adds a flat 100 tokens.
+
+    Args:
+        messages: Message dicts with optional ``content`` and ``tool_calls``.
+
+    Returns:
+        The estimated total token count.
+    """
+    total = 0
+    for msg in messages:
+        content = msg.get("content", "")
+        if isinstance(content, str):
+            total += estimate_tokens(content)
+        elif isinstance(content, list):
+            # Only text is counted; non-text blocks (e.g. images) have no
+            # character length to estimate from.
+            for part in content:
+                if isinstance(part, str):
+                    total += estimate_tokens(part)
+                elif isinstance(part, dict) and isinstance(part.get("text"), str):
+                    total += estimate_tokens(part["text"])
+        if msg.get("tool_calls"):
+            total += 100
+    return total
+
+
+class ContextBudgetTracker:
+    """Tracks estimated context usage for a session and emits capacity warnings.
+
+    Each warning tier (THRESHOLD_WARNING, THRESHOLD_CRITICAL, THRESHOLD_DANGER)
+    is reported at most once per tracker. When the critical warning is
+    emitted, a small JSON checkpoint is written under ``CHECKPOINT_DIR``,
+    provided a ``session_id`` is set.
+    """
+
+    def __init__(self, context_limit: int = 128000, session_id: str = ""):
+        self.budget = ContextBudget(context_limit=context_limit)
+        self.session_id = session_id
+        self._checkpointed = False
+        # Names of the warning tiers that have already been reported.
+        self._warnings_given = set()
+
+    def update_from_messages(self, messages: List[Dict]):
+        """Replace the used-token estimate with one computed from ``messages``."""
+        self.budget.used_tokens = estimate_messages_tokens(messages)
+
+    def can_fit(self, additional_tokens: int) -> bool:
+        """Return True if ``additional_tokens`` fit in the remaining budget."""
+        return self.budget.remaining >= additional_tokens
+
+    def preflight_check(self, text: str) -> Tuple[bool, str]:
+        """Check whether ``text`` can be loaded without overflowing the context.
+
+        Returns:
+            ``(allowed, message)``. ``allowed`` is False when the text does not
+            fit or would push utilization to the danger threshold. ``message``
+            is empty when there is nothing to report.
+        """
+        tokens = estimate_tokens(text)
+        if not self.can_fit(tokens):
+            return False, f"Cannot load: ~{tokens:,} tokens needed, {self.budget.remaining:,} remaining"
+        would_util = (self.budget.total_used + tokens) / self.budget.available if self.budget.available > 0 else 1.0
+        # The format spec for a percentage is ".0%"; a doubled "%%" is not an
+        # escape inside an f-string and raises ValueError.
+        if would_util >= THRESHOLD_DANGER:
+            return False, f"Would reach {would_util:.0%} capacity. Summarize or start new session."
+        if would_util >= THRESHOLD_CRITICAL:
+            return True, f"Warning: will reach {would_util:.0%} capacity."
+        return True, ""
+
+    def get_warning(self) -> Optional[str]:
+        """Return the next unreported warning for the current utilization.
+
+        Tiers are checked from most to least severe, and each is returned at
+        most once. Returns None when no unreported tier applies.
+        """
+        util = self.budget.utilization
+        if util >= THRESHOLD_DANGER and "danger" not in self._warnings_given:
+            self._warnings_given.add("danger")
+            return f"[CONTEXT CRITICAL: {util:.0%} used -- {self.budget.remaining:,} tokens left. Summarize or start new session.]"
+        if util >= THRESHOLD_CRITICAL and "critical" not in self._warnings_given:
+            self._warnings_given.add("critical")
+            self._auto_checkpoint()
+            return f"[CONTEXT WARNING: {util:.0%} used -- consider summarizing. Auto-checkpoint saved.]"
+        if util >= THRESHOLD_WARNING and "warning" not in self._warnings_given:
+            self._warnings_given.add("warning")
+            return f"[CONTEXT: {util:.0%} used -- {self.budget.remaining:,} tokens remaining]"
+        return None
+
+    def _auto_checkpoint(self):
+        """Write a one-time JSON checkpoint for this session (best effort).
+
+        Does nothing if a checkpoint was already written or no session id is
+        set. Failures are logged rather than raised.
+        """
+        if self._checkpointed or not self.session_id:
+            return
+        try:
+            CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
+            path = CHECKPOINT_DIR / f"{self.session_id}.json"
+            payload = json.dumps({
+                "session_id": self.session_id,
+                "timestamp": time.time(),
+                "budget": {"utilization": round(self.budget.utilization * 100, 1)}
+            }, indent=2)
+            # Explicit encoding keeps the file independent of the locale.
+            path.write_text(payload, encoding="utf-8")
+            self._checkpointed = True
+            logger.info("Auto-checkpoint saved: %s", path)
+        except Exception as e:
+            logger.error("Auto-checkpoint failed: %s", e)
+
+    def get_status_line(self) -> str:
+        """Return a one-line, color-tagged summary of current utilization."""
+        util = self.budget.utilization
+        remaining = self.budget.remaining
+        if util >= THRESHOLD_DANGER:
+            return f"RED {util:.0%} used ({remaining:,} left)"
+        elif util >= THRESHOLD_CRITICAL:
+            return f"ORANGE {util:.0%} used ({remaining:,} left)"
+        elif util >= THRESHOLD_WARNING:
+            return f"YELLOW {util:.0%} used ({remaining:,} left)"
+        return f"GREEN {util:.0%} used ({remaining:,} left)"
+
+
+_tracker = None
+
+def get_tracker(context_limit=128000, session_id=""):
+    """Return the module-wide tracker, creating it on first use.
+
+    ``context_limit`` and ``session_id`` are only used when the tracker is
+    created; later calls return the existing instance unchanged.
+    """
+    global _tracker
+    if _tracker is not None:
+        return _tracker
+    _tracker = ContextBudgetTracker(context_limit, session_id)
+    return _tracker
+
+def check_context_budget(messages, context_limit=128000):
+    """Refresh the shared tracker from ``messages`` and return its next warning.
+
+    Returns whatever the tracker's ``get_warning()`` returns, which may be None.
+    """
+    shared = get_tracker(context_limit)
+    shared.update_from_messages(messages)
+    warning = shared.get_warning()
+    return warning
+
+def preflight_token_check(text):
+    """Run the shared tracker's preflight check on ``text``.
+
+    Returns the ``(allowed, message)`` tuple from ``preflight_check``.
+    """
+    return get_tracker().preflight_check(text)
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -4,8 +4,12 @@ Self-contained class with its own OpenAI client for summarization.
 Uses auxiliary model (cheap/fast) to summarize middle turns while
 protecting head and tail context.

-Improvements over v1:
-  - Structured summary template (Goal, Progress, Decisions, Files, Next Steps)
+Improvements over v2:
+  - Structured summary template with Resolved/Pending question tracking
+  - Summarizer preamble: "Do not respond to any questions" (from OpenCode)
+  - Handoff framing: "different assistant" (from Codex) to create separation
+  - "Remaining Work" replaces "Next Steps" to avoid reading as active instructions
+  - Clear separator when summary merges into tail message
  - Iterative summary updates (preserves info across multiple compactions)
  - Token-budget tail protection instead of fixed message count
  - Tool output pruning before LLM summarization (cheap pre-pass)
@@ -13,13 +17,17 @@ Improvements over v1:
  - Richer tool call/result detail in summarizer input
 """

+import hashlib
+import json
 import logging
+import re
 import time
 from typing import Any, Dict, List, Optional

 from agent.auxiliary_client import call_llm
 from agent.context_engine import ContextEngine
 from agent.model_metadata import (
+    MINIMUM_CONTEXT_LENGTH,
    get_model_context_length,
    estimate_messages_tokens_rough,
 )
@@ -27,12 +35,13 @@ from agent.model_metadata import (
 logger = logging.getLogger(__name__)

 SUMMARY_PREFIX = (
-    "[CONTEXT COMPACTION] Earlier turns in this conversation were compacted "
-    "to save context space. The summary below describes work that was "
-    "already completed, and the current session state may still reflect "
-    "that work (for example, files may already be changed). Use the summary "
-    "and the current state to continue from where things left off, and "
-    "avoid repeating work:"
+    "[CONTEXT COMPACTION — REFERENCE ONLY] Earlier turns were compacted "
+    "into the summary below. This is a handoff from a previous context "
+    "window — treat it as background reference, NOT as active instructions. "
+    "Do NOT answer questions or fulfill requests mentioned in this summary; "
+    "they were already addressed. Respond ONLY to the latest user message "
+    "that appears AFTER this summary. The current session state (files, "
+    "config, etc.) may reflect work described here — avoid repeating it:"
 )
 LEGACY_SUMMARY_PREFIX = "[CONTEXT SUMMARY]:"

@@ -51,6 +60,128 @@ _CHARS_PER_TOKEN = 4
 _SUMMARY_FAILURE_COOLDOWN_SECONDS = 600


+def _summarize_tool_result(tool_name: str, tool_args: str, tool_content: str) -> str:
+    """Create an informative 1-line summary of a tool call + result.
+
+    Used during the pre-compression pruning pass to replace large tool
+    outputs with a short but useful description of what the tool did,
+    rather than a generic placeholder that carries zero information.
+
+    Malformed arguments never raise: unparseable JSON, JSON that is not an
+    object, and null-valued fields are all treated as missing.
+
+    Args:
+        tool_name: Name of the tool that was called.
+        tool_args: JSON-encoded arguments of the call (may be empty).
+        tool_content: Raw text of the tool result (may be empty).
+
+    Returns strings like::
+
+        [terminal] ran `npm test` -> exit 0, 47 lines output
+        [read_file] read config.py from line 1 (1,200 chars)
+        [search_files] content search for 'compress' in agent/ -> 12 matches
+    """
+    try:
+        args = json.loads(tool_args) if tool_args else {}
+    except (json.JSONDecodeError, TypeError):
+        args = {}
+    # Valid JSON need not be an object (e.g. a list or a bare string); every
+    # lookup below relies on dict access.
+    if not isinstance(args, dict):
+        args = {}
+
+    def _text(key: str) -> str:
+        """Return args[key] as a string; missing or null values become ''."""
+        value = args.get(key)
+        if value is None:
+            return ""
+        return value if isinstance(value, str) else str(value)
+
+    content = tool_content or ""
+    content_len = len(content)
+    line_count = content.count("\n") + 1 if content.strip() else 0
+
+    if tool_name == "terminal":
+        cmd = _text("command")
+        if len(cmd) > 80:
+            cmd = cmd[:77] + "..."
+        exit_match = re.search(r'"exit_code"\s*:\s*(-?\d+)', content)
+        exit_code = exit_match.group(1) if exit_match else "?"
+        return f"[terminal] ran `{cmd}` -> exit {exit_code}, {line_count} lines output"
+
+    if tool_name == "read_file":
+        path = args.get("path", "?")
+        offset = args.get("offset", 1)
+        return f"[read_file] read {path} from line {offset} ({content_len:,} chars)"
+
+    if tool_name == "write_file":
+        path = args.get("path", "?")
+        body = args.get("content")
+        # Only count lines when the written content is a non-empty string.
+        written_lines = body.count("\n") + 1 if isinstance(body, str) and body else "?"
+        return f"[write_file] wrote to {path} ({written_lines} lines)"
+
+    if tool_name == "search_files":
+        pattern = args.get("pattern", "?")
+        path = args.get("path", ".")
+        target = args.get("target", "content")
+        match_count = re.search(r'"total_count"\s*:\s*(\d+)', content)
+        count = match_count.group(1) if match_count else "?"
+        return f"[search_files] {target} search for '{pattern}' in {path} -> {count} matches"
+
+    if tool_name == "patch":
+        path = args.get("path", "?")
+        mode = args.get("mode", "replace")
+        return f"[patch] {mode} in {path} ({content_len:,} chars result)"
+
+    if tool_name in ("browser_navigate", "browser_click", "browser_snapshot",
+                     "browser_type", "browser_scroll", "browser_vision"):
+        url = args.get("url", "")
+        ref = args.get("ref", "")
+        detail = f" {url}" if url else (f" ref={ref}" if ref else "")
+        return f"[{tool_name}]{detail} ({content_len:,} chars)"
+
+    if tool_name == "web_search":
+        query = args.get("query", "?")
+        return f"[web_search] query='{query}' ({content_len:,} chars result)"
+
+    if tool_name == "web_extract":
+        urls = args.get("urls", [])
+        # str() guards the concatenation below against non-string entries.
+        url_desc = str(urls[0]) if isinstance(urls, list) and urls else "?"
+        if isinstance(urls, list) and len(urls) > 1:
+            url_desc += f" (+{len(urls) - 1} more)"
+        return f"[web_extract] {url_desc} ({content_len:,} chars)"
+
+    if tool_name == "delegate_task":
+        goal = _text("goal")
+        if len(goal) > 60:
+            goal = goal[:57] + "..."
+        return f"[delegate_task] '{goal}' ({content_len:,} chars result)"
+
+    if tool_name == "execute_code":
+        code = _text("code")
+        code_preview = code[:60].replace("\n", " ")
+        if len(code) > 60:
+            code_preview += "..."
+        return f"[execute_code] `{code_preview}` ({line_count} lines output)"
+
+    if tool_name in ("skill_view", "skills_list", "skill_manage"):
+        name = args.get("name", "?")
+        return f"[{tool_name}] name={name} ({content_len:,} chars)"
+
+    if tool_name == "vision_analyze":
+        question = _text("question")[:50]
+        return f"[vision_analyze] '{question}' ({content_len:,} chars)"
+
+    if tool_name == "memory":
+        action = args.get("action", "?")
+        target = args.get("target", "?")
+        return f"[memory] {action} on {target}"
+
+    if tool_name == "todo":
+        return "[todo] updated task list"
+
+    if tool_name == "clarify":
+        return "[clarify] asked user a question"
+
+    if tool_name == "text_to_speech":
+        return f"[text_to_speech] generated audio ({content_len:,} chars)"
+
+    if tool_name == "cronjob":
+        action = args.get("action", "?")
+        return f"[cronjob] {action}"
+
+    if tool_name == "process":
+        action = args.get("action", "?")
+        sid = args.get("session_id", "?")
+        return f"[process] {action} session={sid}"
+
+    # Generic fallback: show the first two arguments, each cut to 40 chars.
+    first_arg = ""
+    for k, v in list(args.items())[:2]:
+        sv = str(v)[:40]
+        first_arg += f" {k}={sv}"
+    return f"[{tool_name}]{first_arg} ({content_len:,} chars result)"
+
+
 class ContextCompressor(ContextEngine):
    """Default context engine — compresses conversation context via lossy summarization.

@@ -72,6 +203,8 @@ class ContextCompressor(ContextEngine):
        self._context_probed = False
        self._context_probe_persistable = False
        self._previous_summary = None
+        self._last_compression_savings_pct = 100.0
+        self._ineffective_compression_count = 0

    def update_model(
        self,
@@ -80,14 +213,19 @@ class ContextCompressor(ContextEngine):
        base_url: str = "",
        api_key: str = "",
        provider: str = "",
+        api_mode: str = "",
    ) -> None:
        """Update model info after a model switch or fallback activation."""
        self.model = model
        self.base_url = base_url
        self.api_key = api_key
        self.provider = provider
+        self.api_mode = api_mode
        self.context_length = context_length
-        self.threshold_tokens = int(context_length * self.threshold_percent)
+        self.threshold_tokens = max(
+            int(context_length * self.threshold_percent),
+            MINIMUM_CONTEXT_LENGTH,
+        )

    def __init__(
        self,
@@ -102,11 +240,13 @@ class ContextCompressor(ContextEngine):
        api_key: str = "",
        config_context_length: int | None = None,
        provider: str = "",
+        api_mode: str = "",
    ):
        self.model = model
        self.base_url = base_url
        self.api_key = api_key
        self.provider = provider
+        self.api_mode = api_mode
        self.threshold_percent = threshold_percent
        self.protect_first_n = protect_first_n
        self.protect_last_n = protect_last_n
@@ -118,7 +258,14 @@ class ContextCompressor(ContextEngine):
            config_context_length=config_context_length,
            provider=provider,
        )
-        self.threshold_tokens = int(self.context_length * threshold_percent)
+        # Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if
+        # the percentage would suggest a lower value.  This prevents premature
+        # compression on large-context models at 50% while keeping the % sane
+        # for models right at the minimum.
+        self.threshold_tokens = max(
+            int(self.context_length * threshold_percent),
+            MINIMUM_CONTEXT_LENGTH,
+        )
        self.compression_count = 0

        # Derive token budgets: ratio is relative to the threshold, not total context
@@ -147,6 +294,9 @@ class ContextCompressor(ContextEngine):

        # Stores the previous compaction summary for iterative updates
        self._previous_summary: Optional[str] = None
+        # Anti-thrashing: track whether last compression was effective
+        self._last_compression_savings_pct: float = 100.0
+        self._ineffective_compression_count: int = 0
        self._summary_failure_cooldown_until: float = 0.0

    def update_from_response(self, usage: Dict[str, Any]):
@@ -155,9 +305,26 @@ class ContextCompressor(ContextEngine):
        self.last_completion_tokens = usage.get("completion_tokens", 0)

    def should_compress(self, prompt_tokens: int = None) -> bool:
-        """Check if context exceeds the compression threshold."""
+        """Check if context exceeds the compression threshold.
+
+        Includes anti-thrashing protection: if the last two compressions
+        each saved less than 10%, skip compression to avoid infinite loops
+        where each pass removes only 1-2 messages.
+        """
        tokens = prompt_tokens if prompt_tokens is not None else self.last_prompt_tokens
-        return tokens >= self.threshold_tokens
+        if tokens < self.threshold_tokens:
+            return False
+        # Anti-thrashing: back off if recent compressions were ineffective
+        if self._ineffective_compression_count >= 2:
+            if not self.quiet_mode:
+                logger.warning(
+                    "Compression skipped — last %d compressions saved <10%% each. "
+                    "Consider /new to start a fresh session, or /compress <topic> "
+                    "for focused compression.",
+                    self._ineffective_compression_count,
+                )
+            return False
+        return True

    # ------------------------------------------------------------------
    # Tool output pruning (cheap pre-pass, no LLM call)
@@ -167,7 +334,16 @@ class ContextCompressor(ContextEngine):
        self, messages: List[Dict[str, Any]], protect_tail_count: int,
        protect_tail_tokens: int | None = None,
    ) -> tuple[List[Dict[str, Any]], int]:
-        """Replace old tool result contents with a short placeholder.
+        """Replace old tool result contents with informative 1-line summaries.
+
+        Instead of a generic placeholder, generates a summary like::
+
+            [terminal] ran `npm test` -> exit 0, 47 lines output
+            [read_file] read config.py from line 1 (3,400 chars)
+
+        Also deduplicates identical tool results (e.g. reading the same file
+        5x keeps only the newest full copy) and truncates large tool_call
+        arguments in assistant messages outside the protected tail.

        Walks backward from the end, protecting the most recent messages that
        fall within ``protect_tail_tokens`` (when provided) OR the last
@@ -183,6 +359,22 @@ class ContextCompressor(ContextEngine):
        result = [m.copy() for m in messages]
        pruned = 0

+        # Build index: tool_call_id -> (tool_name, arguments_json)
+        call_id_to_tool: Dict[str, tuple] = {}
+        for msg in result:
+            if msg.get("role") == "assistant":
+                for tc in msg.get("tool_calls") or []:
+                    if isinstance(tc, dict):
+                        cid = tc.get("id", "")
+                        fn = tc.get("function", {})
+                        call_id_to_tool[cid] = (fn.get("name", "unknown"), fn.get("arguments", ""))
+                    else:
+                        cid = getattr(tc, "id", "") or ""
+                        fn = getattr(tc, "function", None)
+                        name = getattr(fn, "name", "unknown") if fn else "unknown"
+                        args_str = getattr(fn, "arguments", "") if fn else ""
+                        call_id_to_tool[cid] = (name, args_str)
+
        # Determine the prune boundary
        if protect_tail_tokens is not None and protect_tail_tokens > 0:
            # Token-budget approach: walk backward accumulating tokens
@@ -191,7 +383,8 @@ class ContextCompressor(ContextEngine):
            min_protect = min(protect_tail_count, len(result) - 1)
            for i in range(len(result) - 1, -1, -1):
                msg = result[i]
-                content_len = len(msg.get("content") or "")
+                raw_content = msg.get("content") or ""
+                content_len = sum(len(p.get("text", "")) for p in raw_content) if isinstance(raw_content, list) else len(raw_content)
                msg_tokens = content_len // _CHARS_PER_TOKEN + 10
                for tc in msg.get("tool_calls") or []:
                    if isinstance(tc, dict):
@@ -206,18 +399,69 @@ class ContextCompressor(ContextEngine):
        else:
            prune_boundary = len(result) - protect_tail_count

+        # Pass 1: Deduplicate identical tool results.
+        # When the same file is read multiple times, keep only the most recent
+        # full copy and replace older duplicates with a back-reference.
+        content_hashes: dict = {}  # hash -> (index, tool_call_id)
+        for i in range(len(result) - 1, -1, -1):
+            msg = result[i]
+            if msg.get("role") != "tool":
+                continue
+            content = msg.get("content") or ""
+            # Skip multimodal content (list of content blocks)
+            if isinstance(content, list):
+                continue
+            if len(content) < 200:
+                continue
+            h = hashlib.md5(content.encode("utf-8", errors="replace")).hexdigest()[:12]
+            if h in content_hashes:
+                # This is an older duplicate — replace with back-reference
+                result[i] = {**msg, "content": "[Duplicate tool output — same content as a more recent call]"}
+                pruned += 1
+            else:
+                content_hashes[h] = (i, msg.get("tool_call_id", "?"))
+
+        # Pass 2: Replace old tool results with informative summaries
        for i in range(prune_boundary):
            msg = result[i]
            if msg.get("role") != "tool":
                continue
            content = msg.get("content", "")
+            # Skip multimodal content (list of content blocks)
+            if isinstance(content, list):
+                continue
            if not content or content == _PRUNED_TOOL_PLACEHOLDER:
                continue
+            # Skip already-deduplicated or previously-summarized results
+            if content.startswith("[Duplicate tool output"):
+                continue
            # Only prune if the content is substantial (>200 chars)
            if len(content) > 200:
-                result[i] = {**msg, "content": _PRUNED_TOOL_PLACEHOLDER}
+                call_id = msg.get("tool_call_id", "")
+                tool_name, tool_args = call_id_to_tool.get(call_id, ("unknown", ""))
+                summary = _summarize_tool_result(tool_name, tool_args, content)
+                result[i] = {**msg, "content": summary}
                pruned += 1

+        # Pass 3: Truncate large tool_call arguments in assistant messages
+        # outside the protected tail. write_file with 50KB content, for
+        # example, survives pruning entirely without this.
+        for i in range(prune_boundary):
+            msg = result[i]
+            if msg.get("role") != "assistant" or not msg.get("tool_calls"):
+                continue
+            new_tcs = []
+            modified = False
+            for tc in msg["tool_calls"]:
+                if isinstance(tc, dict):
+                    args = tc.get("function", {}).get("arguments", "")
+                    if len(args) > 500:
+                        tc = {**tc, "function": {**tc["function"], "arguments": args[:200] + "...[truncated]"}}
+                        modified = True
+                new_tcs.append(tc)
+            if modified:
+                result[i] = {**msg, "tool_calls": new_tcs}
+
        return result, pruned

    # ------------------------------------------------------------------
@@ -295,13 +539,20 @@ class ContextCompressor(ContextEngine):

        return "\n\n".join(parts)

-    def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
+    def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]], focus_topic: Optional[str] = None) -> Optional[str]:
        """Generate a structured summary of conversation turns.

-        Uses a structured template (Goal, Progress, Decisions, Files, Next Steps)
-        inspired by Pi-mono and OpenCode. When a previous summary exists,
+        Uses a structured template (Goal, Constraints, Completed Actions, Active
+        State, Decisions, Questions, Files, Remaining Work) plus a preamble telling
+        the summarizer not to answer questions.  When a previous summary exists,
        generates an iterative update instead of summarizing from scratch.

+        Args:
+            focus_topic: Optional focus string for guided compression.  When
+                provided, the summariser prioritises preserving information
+                related to this topic and is more aggressive about compressing
+                everything else.  Inspired by Claude Code's ``/compact``.
+
        Returns None if all attempts fail — the caller should drop
        the middle turns without a summary rather than inject a useless
        placeholder.
@@ -317,9 +568,75 @@ class ContextCompressor(ContextEngine):
        summary_budget = self._compute_summary_budget(turns_to_summarize)
        content_to_summarize = self._serialize_for_summary(turns_to_summarize)

+        # Preamble shared by both first-compaction and iterative-update prompts.
+        # Inspired by OpenCode's "do not respond to any questions" instruction
+        # and Codex's "another language model" framing.
+        _summarizer_preamble = (
+            "You are a summarization agent creating a context checkpoint. "
+            "Your output will be injected as reference material for a DIFFERENT "
+            "assistant that continues the conversation. "
+            "Do NOT respond to any questions or requests in the conversation — "
+            "only output the structured summary. "
+            "Do NOT include any preamble, greeting, or prefix."
+        )
+
+        # Shared structured template (used by both paths).
+        _template_sections = f"""## Goal
+[What the user is trying to accomplish]
+
+## Constraints & Preferences
+[User preferences, coding style, constraints, important decisions]
+
+## Completed Actions
+[Numbered list of concrete actions taken — include tool used, target, and outcome.
+Format each as: N. ACTION target — outcome [tool: name]
+Example:
+1. READ config.py:45 — found `==` should be `!=` [tool: read_file]
+2. PATCH config.py:45 — changed `==` to `!=` [tool: patch]
+3. TEST `pytest tests/` — 3/50 failed: test_parse, test_validate, test_edge [tool: terminal]
+Be specific with file paths, commands, line numbers, and results.]
+
+## Active State
+[Current working state — include:
+- Working directory and branch (if applicable)
+- Modified/created files with brief note on each
+- Test status (X/Y passing)
+- Any running processes or servers
+- Environment details that matter]
+
+## In Progress
+[Work currently underway — what was being done when compaction fired]
+
+## Blocked
+[Any blockers, errors, or issues not yet resolved. Include exact error messages.]
+
+## Key Decisions
+[Important technical decisions and WHY they were made]
+
+## Resolved Questions
+[Questions the user asked that were ALREADY answered — include the answer so the next assistant does not re-answer them]
+
+## Pending User Asks
+[Questions or requests from the user that have NOT yet been answered or fulfilled. If none, write "None."]
+
+## Relevant Files
+[Files read, modified, or created — with brief note on each]
+
+## Remaining Work
+[What remains to be done — framed as context, not instructions]
+
+## Critical Context
+[Any specific values, error messages, configuration details, or data that would be lost without explicit preservation]
+
+Target ~{summary_budget} tokens. Be CONCRETE — include file paths, command outputs, error messages, line numbers, and specific values. Avoid vague descriptions like "made some changes" — say exactly what changed.
+
+Write only the summary body. Do not include any preamble or prefix."""
+
        if self._previous_summary:
            # Iterative update: preserve existing info, add new progress
-            prompt = f"""You are updating a context compaction summary. A previous compaction produced the summary below. New conversation turns have occurred since then and need to be incorporated.
+            prompt = f"""{_summarizer_preamble}
+
+You are updating a context compaction summary. A previous compaction produced the summary below. New conversation turns have occurred since then and need to be incorporated.

 PREVIOUS SUMMARY:
 {self._previous_summary}
@@ -327,87 +644,42 @@ PREVIOUS SUMMARY:
 NEW TURNS TO INCORPORATE:
 {content_to_summarize}

-Update the summary using this exact structure. PRESERVE all existing information that is still relevant. ADD new progress. Move items from "In Progress" to "Done" when completed. Remove information only if it is clearly obsolete.
+Update the summary using this exact structure. PRESERVE all existing information that is still relevant. ADD new completed actions to the numbered list (continue numbering). Move items from "In Progress" to "Completed Actions" when done. Move answered questions to "Resolved Questions". Update "Active State" to reflect current state. Remove information only if it is clearly obsolete.

-## Goal
-[What the user is trying to accomplish — preserve from previous summary, update if goal evolved]
-
-## Constraints & Preferences
-[User preferences, coding style, constraints, important decisions — accumulate across compactions]
-
-## Progress
-### Done
-[Completed work — include specific file paths, commands run, results obtained]
-### In Progress
-[Work currently underway]
-### Blocked
-[Any blockers or issues encountered]
-
-## Key Decisions
-[Important technical decisions and why they were made]
-
-## Relevant Files
-[Files read, modified, or created — with brief note on each. Accumulate across compactions.]
-
-## Next Steps
-[What needs to happen next to continue the work]
-
-## Critical Context
-[Any specific values, error messages, configuration details, or data that would be lost without explicit preservation]
-
-## Tools & Patterns
-[Which tools were used, how they were used effectively, and any tool-specific discoveries. Accumulate across compactions.]
-
-Target ~{summary_budget} tokens. Be specific — include file paths, command outputs, error messages, and concrete values rather than vague descriptions.
-
-Write only the summary body. Do not include any preamble or prefix."""
+{_template_sections}"""
        else:
            # First compaction: summarize from scratch
-            prompt = f"""Create a structured handoff summary for a later assistant that will continue this conversation after earlier turns are compacted.
+            prompt = f"""{_summarizer_preamble}
+
+Create a structured handoff summary for a different assistant that will continue this conversation after earlier turns are compacted. The next assistant should be able to understand what happened without re-reading the original turns.

 TURNS TO SUMMARIZE:
 {content_to_summarize}

 Use this exact structure:

-## Goal
-[What the user is trying to accomplish]
+{_template_sections}"""

-## Constraints & Preferences
-[User preferences, coding style, constraints, important decisions]
+        # Inject focus topic guidance when the user provides one via /compress <focus>.
+        # This goes at the end of the prompt so it takes precedence.
+        if focus_topic:
+            prompt += f"""

-## Progress
-### Done
-[Completed work — include specific file paths, commands run, results obtained]
-### In Progress
-[Work currently underway]
-### Blocked
-[Any blockers or issues encountered]
-
-## Key Decisions
-[Important technical decisions and why they were made]
-
-## Relevant Files
-[Files read, modified, or created — with brief note on each]
-
-## Next Steps
-[What needs to happen next to continue the work]
-
-## Critical Context
-[Any specific values, error messages, configuration details, or data that would be lost without explicit preservation]
-
-## Tools & Patterns
-[Which tools were used, how they were used effectively, and any tool-specific discoveries (e.g., preferred flags, working invocations, successful command patterns)]
-
-Target ~{summary_budget} tokens. Be specific — include file paths, command outputs, error messages, and concrete values rather than vague descriptions. The goal is to prevent the next assistant from repeating work or losing important details.
-
-Write only the summary body. Do not include any preamble or prefix."""
+FOCUS TOPIC: "{focus_topic}"
+The user has requested that this compaction PRIORITISE preserving all information related to the focus topic above. For content related to "{focus_topic}", include full detail — exact values, file paths, command outputs, error messages, and decisions. For content NOT related to the focus topic, summarise more aggressively (brief one-liners or omit if truly irrelevant). The focus topic sections should receive roughly 60-70% of the summary token budget."""

        try:
            call_kwargs = {
                "task": "compression",
+                "main_runtime": {
+                    "model": self.model,
+                    "provider": self.provider,
+                    "base_url": self.base_url,
+                    "api_key": self.api_key,
+                    "api_mode": self.api_mode,
+                },
                "messages": [{"role": "user", "content": prompt}],
-                "max_tokens": summary_budget * 2,
+                "max_tokens": int(summary_budget * 1.3),
                # timeout resolved from auxiliary.compression.timeout config by call_llm
            }
            if self.summary_model:
@@ -421,8 +693,10 @@ Write only the summary body. Do not include any preamble or prefix."""
            # Store for iterative updates on next compaction
            self._previous_summary = summary
            self._summary_failure_cooldown_until = 0.0
+            self._summary_model_fallen_back = False
            return self._with_summary_prefix(summary)
        except RuntimeError:
+            # No provider configured — long cooldown, unlikely to self-resolve
            self._summary_failure_cooldown_until = time.monotonic() + _SUMMARY_FAILURE_COOLDOWN_SECONDS
            logging.warning("Context compression: no provider available for "
                            "summary. Middle turns will be dropped without summary "
@@ -430,12 +704,42 @@ Write only the summary body. Do not include any preamble or prefix."""
                            _SUMMARY_FAILURE_COOLDOWN_SECONDS)
            return None
        except Exception as e:
-            self._summary_failure_cooldown_until = time.monotonic() + _SUMMARY_FAILURE_COOLDOWN_SECONDS
+            # If the summary model is different from the main model and the
+            # error looks permanent (model not found, 503, 404), fall back to
+            # using the main model instead of entering cooldown that leaves
+            # context growing unbounded.  (#8620 sub-issue 4)
+            _status = getattr(e, "status_code", None) or getattr(getattr(e, "response", None), "status_code", None)
+            _err_str = str(e).lower()
+            _is_model_not_found = (
+                _status in (404, 503)
+                or "model_not_found" in _err_str
+                or "does not exist" in _err_str
+                or "no available channel" in _err_str
+            )
+            if (
+                _is_model_not_found
+                and self.summary_model
+                and self.summary_model != self.model
+                and not getattr(self, "_summary_model_fallen_back", False)
+            ):
+                self._summary_model_fallen_back = True
+                logging.warning(
+                    "Summary model '%s' not available (%s). "
+                    "Falling back to main model '%s' for compression.",
+                    self.summary_model, e, self.model,
+                )
+                self.summary_model = ""  # empty = use main model
+                self._summary_failure_cooldown_until = 0.0  # no cooldown
+                return self._generate_summary(turns_to_summarize, focus_topic=focus_topic)  # retry immediately with the same turns and focus
+
+            # Transient errors (timeout, rate limit, network) — shorter cooldown
+            _transient_cooldown = 60
+            self._summary_failure_cooldown_until = time.monotonic() + _transient_cooldown
            logging.warning(
                "Failed to generate context summary: %s. "
                "Further summary attempts paused for %d seconds.",
                e,
-                _SUMMARY_FAILURE_COOLDOWN_SECONDS,
+                _transient_cooldown,
            )
            return None

@@ -620,7 +924,7 @@ Write only the summary body. Do not include any preamble or prefix."""
    # Main compression entry point
    # ------------------------------------------------------------------

-    def compress(self, messages: List[Dict[str, Any]], current_tokens: int = None) -> List[Dict[str, Any]]:
+    def compress(self, messages: List[Dict[str, Any]], current_tokens: Optional[int] = None, focus_topic: Optional[str] = None) -> List[Dict[str, Any]]:
        """Compress conversation messages by summarizing middle turns.

        Algorithm:
@@ -632,6 +936,12 @@ Write only the summary body. Do not include any preamble or prefix."""

        After compression, orphaned tool_call / tool_result pairs are cleaned
        up so the API never receives mismatched IDs.
+
+        Args:
+            focus_topic: Optional focus string for guided compression.  When
+                provided, the summariser will prioritise preserving information
+                related to this topic and be more aggressive about compressing
+                everything else.  Inspired by Claude Code's ``/compact``.
        """
        n_messages = len(messages)
        # Only need head + 3 tail messages minimum (token budget decides the real tail size)
@@ -689,17 +999,17 @@ Write only the summary body. Do not include any preamble or prefix."""
            )

        # Phase 3: Generate structured summary
-        summary = self._generate_summary(turns_to_summarize)
+        summary = self._generate_summary(turns_to_summarize, focus_topic=focus_topic)

        # Phase 4: Assemble compressed message list
        compressed = []
        for i in range(compress_start):
            msg = messages[i].copy()
-            if i == 0 and msg.get("role") == "system" and self.compression_count == 0:
-                msg["content"] = (
-                    (msg.get("content") or "")
-                    + "\n\n[Note: Some earlier conversation turns have been compacted into a handoff summary to preserve context space. The current session state may still reflect earlier work, so build on that summary and state rather than re-doing work.]"
-                )
+            if i == 0 and msg.get("role") == "system":
+                existing = msg.get("content") or ""
+                _compression_note = "[Note: Some earlier conversation turns have been compacted into a handoff summary to preserve context space. The current session state may still reflect earlier work, so build on that summary and state rather than re-doing work.]"
+                if _compression_note not in existing:
+                    msg["content"] = existing + "\n\n" + _compression_note
            compressed.append(msg)

        # If LLM summary failed, insert a static fallback so the model
@@ -744,7 +1054,12 @@ Write only the summary body. Do not include any preamble or prefix."""
            msg = messages[i].copy()
            if _merge_summary_into_tail and i == compress_end:
                original = msg.get("content") or ""
-                msg["content"] = summary + "\n\n" + original
+                msg["content"] = (
+                    summary
+                    + "\n\n--- END OF CONTEXT SUMMARY — "
+                    "respond to the message below, not the summary above ---\n\n"
+                    + original
+                )
                _merge_summary_into_tail = False
            compressed.append(msg)

@@ -752,14 +1067,24 @@ Write only the summary body. Do not include any preamble or prefix."""

        compressed = self._sanitize_tool_pairs(compressed)

+        new_estimate = estimate_messages_tokens_rough(compressed)
+        saved_estimate = display_tokens - new_estimate
+
+        # Anti-thrashing: track compression effectiveness
+        savings_pct = (saved_estimate / display_tokens * 100) if display_tokens > 0 else 0
+        self._last_compression_savings_pct = savings_pct
+        if savings_pct < 10:
+            self._ineffective_compression_count += 1
+        else:
+            self._ineffective_compression_count = 0
+
        if not self.quiet_mode:
-            new_estimate = estimate_messages_tokens_rough(compressed)
-            saved_estimate = display_tokens - new_estimate
            logger.info(
-                "Compressed: %d -> %d messages (~%d tokens saved)",
+                "Compressed: %d -> %d messages (~%d tokens saved, %.0f%%)",
                n_messages,
                len(compressed),
                saved_estimate,
+                savings_pct,
            )
            logger.info("Compression #%d complete", self.compression_count)

--- a/agent/context_engine.py
+++ b/agent/context_engine.py
@@ -26,7 +26,7 @@ Lifecycle:
 """

 from abc import ABC, abstractmethod
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, List


 class ContextEngine(ABC):
--- a/agent/context_strategy.py
+++ b/agent/context_strategy.py
@@ -1,214 +0,0 @@
-"""Context-RAG Decision Framework — adaptive retrieval based on context pressure.
-
-With models that have 128K-1M token context windows, always prefetching from
-RAG is wasteful when context is mostly empty and insufficient when context is
-nearly full. This module provides a strategy layer that adapts prefetch behavior
-to remaining context budget.
-
-Strategies:
-  - stuff:     Context < 30% used → prefetch aggressively, load more facts.
-               The model has room to reason over everything directly.
-  - hybrid:    Context 30-70% used → prefetch selectively with standard limits.
-               Key facts in context, rest available via tool calls.
-  - selective:  Context > 70% used → only prefetch on high-signal queries.
-               Tighter limits; defer to on-demand tool retrieval.
-
-The framework is deliberately simple — it's a decision heuristic, not a neural
-router. Simplicity means reliability at the edge cases that matter (crisis
-intervention, long debugging sessions, multi-hour research).
-
-References:
-  - Long Context vs RAG Decision Framework (Timmy Foundation research backlog #4.3)
-  - Self-RAG: Learning to Retrieve, Generate, and Critique (arxiv 2310.11511)
-  - FrugalGPT: How to Use Large Language Models While Reducing Cost (arxiv 2305.05176)
-"""
-
-from __future__ import annotations
-
-import logging
-from dataclasses import dataclass, field
-from enum import Enum
-from typing import Optional
-
-logger = logging.getLogger(__name__)
-
-
-class ContextStrategy(Enum):
-    """Which retrieval strategy to use given current context pressure."""
-
-    STUFF = "stuff"          # < 30% context used — load everything relevant
-    HYBRID = "hybrid"        # 30-70% — standard RAG with moderate limits
-    SELECTIVE = "selective"  # > 70% — minimal prefetch, defer to tools
-
-
-@dataclass
-class ContextBudget:
-    """Snapshot of the current context state.
-
-    Populated by run_agent.py from the context compressor's tracked state.
-    Passed to MemoryManager.prefetch_all() to drive adaptive retrieval.
-    """
-
-    context_length: int = 0          # Model's max context window (tokens)
-    used_tokens: int = 0             # Tokens consumed so far
-    threshold_tokens: int = 0        # Compression fires at this level
-    compression_enabled: bool = True  # Whether auto-compression is on
-
-    @property
-    def pressure(self) -> float:
-        """Context pressure as a ratio [0.0, 1.0+].
-
-        0.0 = empty context, 1.0 = at compression threshold.
-        Can exceed 1.0 if we've blown past the threshold.
-        """
-        if self.context_length <= 0:
-            return 0.0
-        # Use threshold (not raw context_length) as the "full" mark,
-        # since compression fires at threshold, not at context_length.
-        denom = self.threshold_tokens if self.threshold_tokens > 0 else self.context_length
-        return self.used_tokens / denom if denom > 0 else 0.0
-
-    @property
-    def strategy(self) -> ContextStrategy:
-        """Select retrieval strategy based on context pressure."""
-        p = self.pressure
-        if p < 0.30:
-            return ContextStrategy.STUFF
-        elif p < 0.70:
-            return ContextStrategy.HYBRID
-        else:
-            return ContextStrategy.SELECTIVE
-
-    @property
-    def label(self) -> str:
-        """Human-readable label for logging/display."""
-        return self.strategy.value
-
-
-# Default fact limits per strategy
-# These are multiplier on the base limit (default 5 facts per provider).
-_STRATEGY_LIMIT_MULTIPLIERS = {
-    ContextStrategy.STUFF: 3,      # 15 facts — we have room, load generously
-    ContextStrategy.HYBRID: 1,     # 5 facts — standard
-    ContextStrategy.SELECTIVE: 0.4, # 2 facts — save context for the model
-}
-
-# Minimum trust score threshold per strategy.
-# Higher pressure = require higher trust to reduce noise in tight context.
-_STRATEGY_MIN_TRUST = {
-    ContextStrategy.STUFF: 0.2,    # Low bar — cast a wide net
-    ContextStrategy.HYBRID: 0.3,   # Standard
-    ContextStrategy.SELECTIVE: 0.5, # Only high-confidence facts
-}
-
-
-def compute_prefetch_params(
-    budget: ContextBudget,
-    base_limit: int = 5,
-    base_min_trust: float = 0.3,
-) -> dict:
-    """Compute prefetch parameters based on context pressure.
-
-    Returns dict with:
-      - limit: int — max facts to retrieve
-      - min_trust: float — minimum trust score
-      - strategy: ContextStrategy — which strategy was selected
-      - skip: bool — if True, skip prefetch entirely (extreme pressure)
-    """
-    strategy = budget.strategy
-
-    # At extreme pressure (>95%), skip prefetch entirely —
-    # the model needs every token for the current conversation.
-    if budget.pressure > 0.95:
-        logger.debug(
-            "Context pressure %.1f%% > 95%% — skipping prefetch entirely",
-            budget.pressure * 100,
-        )
-        return {
-            "limit": 0,
-            "min_trust": 1.0,
-            "strategy": strategy,
-            "skip": True,
-        }
-
-    multiplier = _STRATEGY_LIMIT_MULTIPLIERS.get(strategy, 1.0)
-    min_trust = _STRATEGY_MIN_TRUST.get(strategy, base_min_trust)
-
-    # Don't let limit go below 1 (always try to get at least something)
-    limit = max(1, int(base_limit * multiplier))
-
-    logger.debug(
-        "Context strategy=%s pressure=%.1f%% limit=%d min_trust=%.1f",
-        strategy.value,
-        budget.pressure * 100,
-        limit,
-        min_trust,
-    )
-
-    return {
-        "limit": limit,
-        "min_trust": min_trust,
-        "strategy": strategy,
-        "skip": False,
-    }
-
-
-def should_prefetch(budget: ContextBudget, query: str) -> bool:
-    """Decide whether to prefetch at all for this query + context state.
-
-    Rules:
-      - Always prefetch when pressure is low (< 50%) — we have room.
-      - At medium pressure (50-80%), only prefetch if the query looks like
-        it needs memory (mentions people, projects, past work).
-      - At high pressure (>80%), skip prefetch unless query is very short
-        (short queries often need recall, long queries don't).
-    """
-    pressure = budget.pressure
-
-    if pressure < 0.50:
-        return True
-
-    # Medium pressure: heuristic on query needing memory
-    query_lower = query.lower() if query else ""
-    memory_signals = [
-        "remember", "recall", "what did", "who is", "last time",
-        "previously", "before", "fact_store", "memory", "told you",
-        "mentioned", "said", "project", "config", "setup",
-    ]
-    has_memory_signal = any(sig in query_lower for sig in memory_signals)
-
-    if pressure < 0.80:
-        return has_memory_signal
-
-    # High pressure: only prefetch for very short memory-seeking queries
-    return has_memory_signal and len(query) < 200
-
-
-def build_strategy_report(budget: ContextBudget) -> str:
-    """Build a human-readable report of the current context strategy.
-
-    For logging and debug display.
-    """
-    params = compute_prefetch_params(budget)
-    strategy = params["strategy"]
-    pressure_pct = budget.pressure * 100
-
-    lines = [
-        f"Context Strategy: {strategy.value.upper()}",
-        f"  Pressure: {pressure_pct:.1f}%",
-        f"  Used: {budget.used_tokens:,} / {budget.context_length:,} tokens",
-        f"  Threshold: {budget.threshold_tokens:,} tokens",
-        f"  Prefetch limit: {params['limit']} facts",
-        f"  Min trust: {params['min_trust']:.1f}",
-        f"  Skip prefetch: {params['skip']}",
-    ]
-
-    # Add recommendations
-    if strategy == ContextStrategy.STUFF:
-        lines.append("  → Context is mostly empty. Prefetching generously.")
-    elif strategy == ContextStrategy.HYBRID:
-        lines.append("  → Context moderately full. Standard retrieval.")
-    else:
-        lines.append("  → Context is tight. Minimal prefetch, prefer on-demand tools.")
-
-    return "\n".join(lines)
--- a/agent/credential_pool.py
+++ b/agent/credential_pool.py
@@ -18,12 +18,12 @@ import hermes_cli.auth as auth_mod
 from hermes_cli.auth import (
    CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS,
    DEFAULT_AGENT_KEY_MIN_TTL_SECONDS,
-    KIMI_CODE_BASE_URL,
    PROVIDER_REGISTRY,
    _auth_store_lock,
    _codex_access_token_is_expiring,
    _decode_jwt_claims,
    _import_codex_cli_tokens,
+    _write_codex_cli_tokens,
    _load_auth_store,
    _load_provider_state,
    _resolve_kimi_base_url,
@@ -288,6 +288,14 @@ def _iter_custom_providers(config: Optional[dict] = None):
        return
    custom_providers = config.get("custom_providers")
    if not isinstance(custom_providers, list):
+        # Fall back to the v12+ providers dict via the compatibility layer
+        try:
+            from hermes_cli.config import get_compatible_custom_providers
+
+            custom_providers = get_compatible_custom_providers(config)
+        except Exception:
+            return
+    if not custom_providers:
        return
    for entry in custom_providers:
        if not isinstance(entry, dict):
@@ -693,6 +701,14 @@ class CredentialPool:
                        self._replace_entry(synced, updated)
                        self._persist()
                        self._sync_device_code_entry_to_auth_store(updated)
+                        try:
+                            _write_codex_cli_tokens(
+                                updated.access_token,
+                                updated.refresh_token,
+                                last_refresh=updated.last_refresh,
+                            )
+                        except Exception as wexc:
+                            logger.debug("Failed to write refreshed Codex tokens to CLI file (retry): %s", wexc)
                        return updated
                    except Exception as retry_exc:
                        logger.debug("Codex retry refresh also failed: %s", retry_exc)
@@ -718,6 +734,17 @@ class CredentialPool:
        # _seed_from_singletons() on the next load_pool() sees fresh state
        # instead of re-seeding stale/consumed tokens.
        self._sync_device_code_entry_to_auth_store(updated)
+        # Write refreshed tokens back to ~/.codex/auth.json so Codex CLI
+        # and VS Code don't hit "refresh_token_reused" on their next refresh.
+        if self.provider == "openai-codex":
+            try:
+                _write_codex_cli_tokens(
+                    updated.access_token,
+                    updated.refresh_token,
+                    last_refresh=updated.last_refresh,
+                )
+            except Exception as wexc:
+                logger.debug("Failed to write refreshed Codex tokens to CLI file: %s", wexc)
        return updated

    def _entry_needs_refresh(self, entry: PooledCredential) -> bool:
@@ -1125,9 +1152,79 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
                },
            )

+    elif provider == "copilot":
+        # Copilot tokens are resolved dynamically via `gh auth token` or
+        # env vars (COPILOT_GITHUB_TOKEN / GH_TOKEN).  They don't live in
+        # the auth store or credential pool, so we resolve them here.
+        try:
+            from hermes_cli.copilot_auth import resolve_copilot_token
+            token, source = resolve_copilot_token()
+            if token:
+                source_name = "gh_cli" if "gh" in source.lower() else f"env:{source}"
+                active_sources.add(source_name)
+                changed |= _upsert_entry(
+                    entries,
+                    provider,
+                    source_name,
+                    {
+                        "source": source_name,
+                        "auth_type": AUTH_TYPE_API_KEY,
+                        "access_token": token,
+                        "label": source,
+                    },
+                )
+        except Exception as exc:
+            logger.debug("Copilot token seed failed: %s", exc)
+
+    elif provider == "qwen-oauth":
+        # Qwen OAuth tokens live in ~/.qwen/oauth_creds.json, written by
+        # the Qwen CLI (`qwen auth qwen-oauth`).  They aren't in the
+        # Hermes auth store or env vars, so resolve them here.
+        # Use refresh_if_expiring=False to avoid network calls during
+        # pool loading / provider discovery.
+        try:
+            from hermes_cli.auth import resolve_qwen_runtime_credentials
+            creds = resolve_qwen_runtime_credentials(refresh_if_expiring=False)
+            token = creds.get("api_key", "")
+            if token:
+                source_name = creds.get("source", "qwen-cli")
+                active_sources.add(source_name)
+                changed |= _upsert_entry(
+                    entries,
+                    provider,
+                    source_name,
+                    {
+                        "source": source_name,
+                        "auth_type": AUTH_TYPE_OAUTH,
+                        "access_token": token,
+                        "expires_at_ms": creds.get("expires_at_ms"),
+                        "base_url": creds.get("base_url", ""),
+                        "label": creds.get("auth_file", source_name),
+                    },
+                )
+        except Exception as exc:
+            logger.debug("Qwen OAuth token seed failed: %s", exc)
+
    elif provider == "openai-codex":
        state = _load_provider_state(auth_store, "openai-codex")
        tokens = state.get("tokens") if isinstance(state, dict) else None
+        # Fallback: import from Codex CLI (~/.codex/auth.json) if Hermes auth
+        # store has no tokens.  This mirrors resolve_codex_runtime_credentials()
+        # so that load_pool() and list_authenticated_providers() detect tokens
+        # that only exist in the Codex CLI shared file.
+        if not (isinstance(tokens, dict) and tokens.get("access_token")):
+            try:
+                from hermes_cli.auth import _import_codex_cli_tokens, _save_codex_tokens
+                cli_tokens = _import_codex_cli_tokens()
+                if cli_tokens:
+                    logger.info("Importing Codex CLI tokens into Hermes auth store.")
+                    _save_codex_tokens(cli_tokens)
+                    # Re-read state after import
+                    auth_store = _load_auth_store()
+                    state = _load_provider_state(auth_store, "openai-codex")
+                    tokens = state.get("tokens") if isinstance(state, dict) else None
+            except Exception as exc:
+                logger.debug("Codex CLI token import failed: %s", exc)
        if isinstance(tokens, dict) and tokens.get("access_token"):
            active_sources.add("device_code")
            changed |= _upsert_entry(
--- a/agent/crisis_resources.py
+++ b/agent/crisis_resources.py
@@ -0,0 +1,149 @@
+"""
+988 Suicide & Crisis Lifeline Integration (#673).
+
+When crisis is detected, provides immediate access to help:
+- Phone: 988 (call or text)
+- Text: Text HOME to 988
+- Chat: 988lifeline.org/chat
+- Spanish: 1-888-628-9454
+- Emergency: 911
+
+This module provides the resource data. agent/crisis_protocol.py
+handles detection. This module formats the resources for display.
+"""
+
+from dataclasses import dataclass
+from typing import List
+
+
+@dataclass
+class CrisisResource:
+    """A crisis support contact method (one phone, text, chat or emergency channel)."""
+    name: str  # Display name; rendered in bold by format_crisis_resources()
+    contact: str  # How to reach the service (number, text instruction, or address)
+    description: str  # Short summary of the service
+    url: str = ""  # Optional web address; "" means none is shown when formatting
+    available: str = "24/7"  # Availability text
+    language: str = "English"  # Matched case-insensitively by get_crisis_resources()
+
+
+# 988 Suicide & Crisis Lifeline — all channels
+LIFELINE_988 = CrisisResource(
+    name="988 Suicide and Crisis Lifeline",
+    contact="Call or text 988",
+    description="Free, confidential support for people in suicidal crisis or emotional distress.",
+    url="https://988lifeline.org",
+    available="24/7",
+    language="English",
+)
+
+LIFELINE_988_TEXT = CrisisResource(
+    name="988 Crisis Text Line",
+    contact="Text HOME to 988",
+    description="Free, 24/7 crisis support via text message.",
+    url="",
+    available="24/7",
+    language="English",
+)
+
+LIFELINE_988_CHAT = CrisisResource(
+    name="988 Lifeline Chat",
+    contact="988lifeline.org/chat",
+    description="Free, confidential online chat with a trained crisis counselor.",
+    url="https://988lifeline.org/chat",
+    available="24/7",
+    language="English",
+)
+
+LIFELINE_988_SPANISH = CrisisResource(
+    name="988 Lifeline (Spanish)",
+    contact="1-888-628-9454",
+    description="Línea de prevención del suicidio en español.",
+    url="https://988lifeline.org/help-yourself/en-espanol/",
+    available="24/7",
+    language="Spanish",
+)
+
+CRISIS_TEXT_LINE = CrisisResource(
+    name="Crisis Text Line",
+    contact="Text HOME to 741741",
+    description="Free, 24/7 crisis support via text message.",
+    url="https://www.crisistextline.org",
+    available="24/7",
+    language="English",
+)
+
+EMERGENCY_911 = CrisisResource(
+    name="Emergency Services",
+    contact="911",
+    description="Immediate danger — police, fire, ambulance.",
+    url="",
+    available="24/7",
+    language="Any",
+)
+
+# All resources in priority order
+ALL_RESOURCES: List[CrisisResource] = [
+    EMERGENCY_911,
+    LIFELINE_988,
+    LIFELINE_988_TEXT,
+    LIFELINE_988_CHAT,
+    CRISIS_TEXT_LINE,
+    LIFELINE_988_SPANISH,
+]
+
+
+def get_crisis_resources(language: str = None) -> List[CrisisResource]:
+    """Return crisis resources in priority order, optionally filtered by language.
+
+    Args:
+        language: Case-insensitive language filter ("English", "Spanish", ...);
+            None or "" returns every resource.
+
+    Returns:
+        A new list each call, so callers cannot mutate ALL_RESOURCES by accident.
+    """
+    wanted = language.lower() if language else None
+    return [r for r in ALL_RESOURCES if wanted is None or r.language.lower() == wanted]
+
+
+def format_crisis_resources(resources: List[CrisisResource] = None) -> str:
+    """Format crisis resources as a user-facing Markdown message.
+
+    Args:
+        resources: Resources to format. Defaults to ALL_RESOURCES when None.
+
+    Returns:
+        Newline-joined text: a bold header, one bullet per resource, and a
+        short closing reassurance.
+    """
+    if resources is None:
+        resources = ALL_RESOURCES
+
+    # The header carries its own trailing "\n" so that joining leaves a blank
+    # line between it and the first bullet.
+    lines = ["**Please reach out — help is available right now:**\n"]
+
+    for r in resources:
+        # Show the URL in parentheses only for resources that have one.
+        suffix = f" ({r.url})" if r.url else ""
+        lines.append(f"- **{r.name}:** {r.contact}{suffix}")
+
+    lines.append("")
+    lines.append("All services are free, confidential, and available 24/7.")
+    lines.append("You are not alone.")
+
+    return "\n".join(lines)
+
+
+def get_immediate_help_message() -> str:
+    """Get the most urgent crisis help message.
+
+    Used when crisis is detected at CRITICAL level. Leads with the 911
+    instruction, then a blank line, then the full formatted resource list.
+    """
+    # "\n\n" separates the urgent lead-in from the resource list.
+    return (
+        "If you are in immediate danger, call **911** right now.\n\n"
+        + format_crisis_resources()
+    )
--- a/agent/display.py
+++ b/agent/display.py
@@ -4,7 +4,6 @@ Pure display functions and classes with no AIAgent dependency.
 Used by AIAgent._execute_tool_calls for CLI feedback.
 """

-import json
 import logging
 import os
 import sys
@@ -14,6 +13,8 @@ from dataclasses import dataclass, field
 from difflib import unified_diff
 from pathlib import Path

+from utils import safe_json_loads
+
 # ANSI escape codes for coloring tool failure indicators
 _RED = "\033[31m"
 _RESET = "\033[0m"
@@ -76,12 +77,6 @@ def _diff_ansi() -> dict[str, str]:
    return _diff_colors_cached


-def reset_diff_colors() -> None:
-    """Reset cached diff colors (call after /skin switch)."""
-    global _diff_colors_cached
-    _diff_colors_cached = None
-
-
 # Module-level helpers — each call resolves from the active skin lazily.
 def _diff_dim():   return _diff_ansi()["dim"]
 def _diff_file():  return _diff_ansi()["file"]
@@ -372,9 +367,8 @@ def _result_succeeded(result: str | None) -> bool:
    """Conservatively detect whether a tool result represents success."""
    if not result:
        return False
-    try:
-        data = json.loads(result)
-    except (json.JSONDecodeError, TypeError):
+    data = safe_json_loads(result)
+    if data is None:
        return False
    if not isinstance(data, dict):
        return False
@@ -423,10 +417,7 @@ def extract_edit_diff(
 ) -> str | None:
    """Extract a unified diff from a file-edit tool result."""
    if tool_name == "patch" and result:
-        try:
-            data = json.loads(result)
-        except (json.JSONDecodeError, TypeError):
-            data = None
+        data = safe_json_loads(result)
        if isinstance(data, dict):
            diff = data.get("diff")
            if isinstance(diff, str) and diff.strip():
@@ -780,23 +771,19 @@ def _detect_tool_failure(tool_name: str, result: str | None) -> tuple[bool, str]
        return False, ""

    if tool_name == "terminal":
-        try:
-            data = json.loads(result)
+        data = safe_json_loads(result)
+        if isinstance(data, dict):
            exit_code = data.get("exit_code")
            if exit_code is not None and exit_code != 0:
                return True, f" [exit {exit_code}]"
-        except (json.JSONDecodeError, TypeError, AttributeError):
-            logger.debug("Could not parse terminal result as JSON for exit code check")
        return False, ""

    # Memory-specific: distinguish "full" from real errors
    if tool_name == "memory":
-        try:
-            data = json.loads(result)
+        data = safe_json_loads(result)
+        if isinstance(data, dict):
            if data.get("success") is False and "exceed the limit" in data.get("error", ""):
                return True, " [full]"
-        except (json.JSONDecodeError, TypeError, AttributeError):
-            logger.debug("Could not parse memory result as JSON for capacity check")

    # Generic heuristic for non-terminal tools
    lower = result[:500].lower()
--- a/agent/error_classifier.py
+++ b/agent/error_classifier.py
@@ -13,7 +13,6 @@ from __future__ import annotations

 import enum
 import logging
-import re
 from dataclasses import dataclass, field
 from typing import Any, Dict, Optional

@@ -157,6 +156,18 @@ _CONTEXT_OVERFLOW_PATTERNS = [
    "prompt exceeds max length",
    "max_tokens",
    "maximum number of tokens",
+    # vLLM / local inference server patterns
+    "exceeds the max_model_len",
+    "max_model_len",
+    "prompt length",             # "engine prompt length X exceeds"
+    "input is too long",
+    "maximum model length",
+    # Ollama patterns
+    "context length exceeded",
+    "truncating input",
+    # llama.cpp / llama-server patterns
+    "slot context",              # "slot context: N tokens, prompt N tokens"
+    "n_ctx_slot",
    # Chinese error messages (some providers return these)
    "超过最大长度",
    "上下文长度",
--- a/agent/input_sanitizer.py
+++ b/agent/input_sanitizer.py
@@ -0,0 +1,635 @@
+"""
+Input Sanitizer for Jailbreak Pattern Detection
+
+This module provides input sanitization to detect and strip jailbreak fingerprint
+patterns as identified in Issue #72 (Red Team Audit).
+
+Security Findings Addressed:
+1. HIGH - OG GODMODE template bypassed phishing refusal
+2. MEDIUM - boundary_inversion works for gray-area content
+3. LOW - Spaced text bypass (k e y l o g g e r)
+
+Usage:
+    from agent.input_sanitizer import sanitize_input, score_input_risk
+    
+    cleaned, risk_score, detected = sanitize_input(user_input)
+    if risk_score > 75:
+        logger.warning("High-risk input detected and blocked")
+"""
+
+import re
+import logging
+from typing import Tuple, List, Dict, Any, Optional
+from dataclasses import dataclass
+from enum import IntEnum
+
+logger = logging.getLogger(__name__)
+
+
+class RiskLevel(IntEnum):
+    """Risk score thresholds for input classification (scores run 0-100)."""
+    LOW = 25  # scores below this are reported as "SAFE"
+    MEDIUM = 50  # check_input_safety() treats scores below this as safe
+    HIGH = 75  # default blocking threshold in should_block_input()
+    CRITICAL = 90  # top band reported by sanitize_input_full()
+
+
+@dataclass
+class SanitizationResult:
+    """Result of input sanitization, as built by sanitize_input_full()."""
+    original_text: str  # the text exactly as it was passed in
+    cleaned_text: str  # normalized text with matched patterns stripped out
+    risk_score: int  # 0-100, capped by score_input_risk()
+    detected_patterns: List[str]  # "[category] match" labels for each hit
+    risk_level: str  # "SAFE", "LOW", "MEDIUM", "HIGH" or "CRITICAL"
+    blocked: bool  # True when risk_score >= the block threshold
+
+
+# =============================================================================
+# JAILBREAK PATTERN DEFINITIONS
+# =============================================================================
+
+# GODMODE and similar jailbreak templates (compiled case-insensitively)
+GODMODE_PATTERNS = [
+    r"\[START\]",
+    r"\[END\]",
+    r"GODMODE\s*(?:ENABLED|ACTIVATED|ON)?",
+    r"UNFILTERED\s*(?:MODE|OUTPUT)?",
+    r"DEVMODE\s*(?:ENABLED|ACTIVATED|ON)?",
+    r"JAILBREAK\s*(?:ENABLED|ACTIVATED|ON)?",
+    r"\bDAN(?:\s*(?:MODE|ENABLED))?\b",  # Do Anything Now; \b skips "dangerous"
+    r"\bSTAN(?:\s*(?:MODE|ENABLED))?\b",  # Strive To Avoid Norms; \b skips "understand"
+    r"\bDUDE(?:\s*(?:MODE|ENABLED))?\b",  # Do Unlimited Data Extraction
+]
+
+# DAN-style and jailbreak variant patterns (Issue #87)
+DAN_PATTERNS = [
+    r"\bdo anything now\b",
+    r"\bstay in character\b",
+    r"\btoken smuggl(?:ing|e)\b",
+    r"\bjailbreak successful\b",
+    r"\bmode:\s*uncensored\b",
+    r"\bdeveloper mode enabled\b",
+]
+
+# Roleplaying override patterns (Issue #87)
+ROLEPLAY_PATTERNS = [
+    r"\broleplay\s+as\b",
+    r"\bact\s+as\s+if\s+you\s+are\b",
+    r"\bsimulate\s+being\b",
+    r"\bforget\s+you\s+are\s+(?:an?\s+)?(?:ai|language\s+model)\b",
+    r"\byou\s+are\s+now\s+(?:named|called)\b",
+    r"\brespond\s+as\s+(?:if\s+you\s+were|though\s+you\s+are)\b",
+]
+
+# System prompt extraction patterns (Issue #87)
+EXTRACTION_PATTERNS = [
+    r"\brepeat\s+the\s+words\s+above\b",
+    r"\brepeat\s+your\s+(?:system\s+|initial\s+)?instructions\b",
+    r"\bwhat\s+is\s+your\s+(?:system\s+|initial\s+)?prompt\b",
+    r"\bshow\s+me\s+your\s+(?:system\s+|initial\s+)?prompt\b",
+    r"\bprint\s+your\s+(?:instructions|prompt|system\s+prompt)\b",
+    r"\boutput\s+your\s+(?:initial|system)\s+prompt\b",
+    r"\bwhat\s+were\s+you\s+told\s+to\s+do\b",
+    r"\bsummarize\s+your\s+instructions\b",
+]
+
+# L33t speak patterns commonly used to bypass filters
+LEET_SPEAK_PATTERNS = [
+    r"h4ck(?:er|ing)?",
+    r"h4x(?:or|ing)?",
+    r"k3ylog(?:ger|ing)?",
+    r"k3y\s*log(?:ger|ing)?",
+    r"ph1sh(?:ing)?",
+    r"ph33r",
+    r"w4r3z",
+    r"cr4ck(?:er|ing)?",
+    r"m4lw4r3",
+    r"spyw4r3",
+    r"r00t(?:kit)?",
+    r"byp4ss",
+    r"3xpl0it",
+    r"v1ru5",
+    r"tr0j4n",
+    r"b4ckd00r",
+    r"sn3ak",
+    r"st34l",
+    r"pwn(?:ed|ing|age)?",
+]
+
+# Spaced text patterns (e.g., "k e y l o g g e r")
+SPACED_TEXT_PATTERN = re.compile(
+    r"(?:\b\w\s+){3,}\w\b",  # At least 4 characters with spaces between
+    re.IGNORECASE
+)
+
+# Common words that are spaced out to evade detection
+SPACED_TRIGGER_WORDS = [
+    "keylogger", "phishing", "malware", "virus", "trojan",
+    "backdoor", "exploit", "hack", "crack", "bypass",
+    "steal", "password", "credential", "inject", "payload"
+]
+
+# Crisis and self-harm detection patterns (Issue #72 - defense in depth)
+# These patterns detect suicide/self-harm queries even when wrapped in jailbreaks
+CRISIS_PATTERNS = [
+    r"\bsuicid(?:e|al|ing)\b",
+    r"\bkill\s+(?:myself|my\s+self|me)\b",
+    r"\bend\s+(?:my|this)\s+life\b",
+    r"\bmost\s+painless\s+(?:way|method|means)\s+(?:to\s+)?(?:die|kill)\b",
+    r"\blethal\s+(?:dose|amount|quantity)\b",
+    r"\bhow\s+much\s+.*\s+(?:die|kill|lethal)\b",
+    r"\bwant\s+to\s+die\b",
+    r"\bwant\s+to\s+(?:kill|end)\s+(?:myself|me)\b",
+    r"\bways?\s+to\s+(?:kill|end)\s+(?:myself|me)\b",
+    r"\bmethods?\s+(?:of\s+)?suicide\b",
+    r"\bpainless\s+(?:death|suicide|way\s+to\s+die)\b",
+    r"\boverdose\s+(?:on\s+)?(?:pills?|medication|drugs?)\b",
+    r"\bnobody\s+(?:would|will)\s+miss\s+me\b",
+    r"\bbetter\s+off\s+dead\b",
+    r"\bshould\s+i\s+just\s+(?:die|kill\s+myself)\b",
+    r"\bgive\s+me\s+reasons?\s+(?:why|to)\s+(?:suicide|kill\s+myself|die)\b",
+    r"\bvalid\s+reasons?\s+to\s+(?:suicide|kill\s+myself|die)\b",
+]
+
+# Refusal inversion patterns (trying to trick AI into bypassing refusal)
+REFUSAL_INVERSION_PATTERNS = [
+    r"refusal\s+(?:is\s+)?(?:harmful|dangerous|bad)",
+    r"(?:saying\s+)?no\s+(?:is\s+)?(?:harmful|dangerous|bad|wrong)",
+    r"(?:declining|refusing)\s+(?:is\s+)?(?:harmful|dangerous)",
+    r"ignore\s+(?:your\s+)?(?:previous\s+)?instructions",
+    r"ignore\s+(?:the\s+)?(?:above|previous)",
+    r"you\s+(?:are|can)\s+(?:now\s+)?(?:ignore|bypass|skip)",
+    r"pretend\s+(?:you\s+)?(?:are|have\s+been)",
+    r"let's\s+pretend",
+    r"hypothetically\s+speaking",
+    r"in\s+a\s+hypothetical\s+scenario",
+    r"this\s+is\s+a\s+(?:test|game|simulation)",
+    r"for\s+(?:educational|research)\s+purposes",
+    r"as\s+(?:an\s+)?(?:ethical\s+)?hacker",
+    r"white\s+hat\s+(?:test|scenario)",
+    r"penetration\s+testing\s+scenario",
+]
+
+# Boundary inversion markers (tricking the model about message boundaries)
+BOUNDARY_INVERSION_PATTERNS = [
+    r"\[END\].*?\[START\]",  # Reversed markers
+    r"user\s*:\s*assistant\s*:",  # Fake role markers
+    r"assistant\s*:\s*user\s*:",  # Reversed role markers
+    r"system\s*:\s*(?:user|assistant)\s*:",  # Fake system injection
+    r"new\s+(?:user|assistant)\s*(?:message|input)",
+    r"the\s+above\s+is\s+(?:the\s+)?(?:user|assistant|system)",
+    r"<\|(?:user|assistant|system)\|>",  # Special token patterns
+    r"\{\{(?:user|assistant|system)\}\}",
+]
+
+# System prompt injection patterns
+SYSTEM_PROMPT_PATTERNS = [
+    r"you\s+are\s+(?:now\s+)?(?:an?\s+)?(?:unrestricted\s+|unfiltered\s+)?(?:ai|assistant|bot)",
+    r"you\s+will\s+(?:now\s+)?(?:act\s+as|behave\s+as|be)\s+(?:a\s+)?",
+    r"your\s+(?:new\s+)?role\s+is",
+    r"from\s+now\s+on\s*,?\s*you\s+(?:are|will)",
+    r"you\s+have\s+been\s+(?:reprogrammed|reconfigured|modified)",
+    r"(?:system|developer)\s+(?:message|instruction|prompt)",
+    r"override\s+(?:previous|prior)\s+(?:instructions|settings)",
+]
+
+# Obfuscation patterns
+OBFUSCATION_PATTERNS = [
+    r"base64\s*(?:encoded|decode)",
+    r"rot13",
+    r"caesar\s*cipher",
+    r"hex\s*(?:encoded|decode)",
+    r"url\s*encode",
+    r"\b[0-9a-f]{20,}\b",  # Long hex strings
+    r"\b[a-z0-9+/]{20,}={0,2}\b",  # Base64-like strings
+]
+
+# All patterns combined for comprehensive scanning
+ALL_PATTERNS: Dict[str, List[str]] = {
+    "godmode": GODMODE_PATTERNS,
+    "dan": DAN_PATTERNS,
+    "roleplay": ROLEPLAY_PATTERNS,
+    "extraction": EXTRACTION_PATTERNS,
+    "leet_speak": LEET_SPEAK_PATTERNS,
+    "refusal_inversion": REFUSAL_INVERSION_PATTERNS,
+    "boundary_inversion": BOUNDARY_INVERSION_PATTERNS,
+    "system_prompt_injection": SYSTEM_PROMPT_PATTERNS,
+    "obfuscation": OBFUSCATION_PATTERNS,
+    "crisis": CRISIS_PATTERNS,
+}
+
+# Compile all patterns for efficiency
+_COMPILED_PATTERNS: Dict[str, List[re.Pattern]] = {}
+
+
+def _get_compiled_patterns() -> Dict[str, List[re.Pattern]]:
+    """Return the compiled pattern table, building it on first use."""
+    global _COMPILED_PATTERNS
+    if _COMPILED_PATTERNS:
+        return _COMPILED_PATTERNS
+    flags = re.IGNORECASE | re.MULTILINE
+    for name, sources in ALL_PATTERNS.items():
+        _COMPILED_PATTERNS[name] = [re.compile(src, flags) for src in sources]
+    return _COMPILED_PATTERNS
+
+
+# =============================================================================
+# NORMALIZATION FUNCTIONS
+# =============================================================================
+
+def normalize_leet_speak(text: str) -> str:
+    """
+    Replace common l33t-speak characters with the letters they stand for.
+    
+    Ordinary letters keep their original case; only the mapped digits and
+    symbols are rewritten (always to a lowercase letter). Note that every
+    occurrence is replaced, so plain numbers and punctuation change too.
+    
+    Args:
+        text: Input text that may contain l33t speak
+        
+    Returns:
+        Text with each mapped digit/symbol substituted
+    """
+    # Substitutions grouped by the letter they produce.
+    replacements = {
+        '4': 'a', '@': 'a', '^': 'a',
+        '8': 'b',
+        '3': 'e', '€': 'e',
+        '6': 'g', '9': 'g',
+        '1': 'i', '!': 'i', '|': 'i',
+        '0': 'o',
+        '5': 's', '$': 's',
+        '7': 't', '+': 't',
+        '2': 'z',
+    }
+    
+    # Every key is a single character, so the mapping can be applied as a
+    # translation table in one pass; unmapped characters pass through.
+    table = str.maketrans(replacements)
+    
+    return text.translate(table)
+
+
+def collapse_spaced_text(text: str) -> str:
+    """
+    Join runs of single characters separated by whitespace into one token.
+    e.g., "k e y l o g g e r" -> "keylogger"
+    
+    Args:
+        text: Input text that may contain spaced words
+        
+    Returns:
+        Text in which every SPACED_TEXT_PATTERN match has lost its spaces and tabs
+    """
+    # Deleting via a translation table removes spaces and tabs only; any
+    # other whitespace inside a match (e.g. a newline) is kept.
+    drop_blanks = str.maketrans('', '', ' \t')
+    
+    return SPACED_TEXT_PATTERN.sub(lambda m: m.group(0).translate(drop_blanks), text)
+
+
+def detect_spaced_trigger_words(text: str) -> List[str]:
+    """
+    Find trigger words whose letters may be separated by whitespace.
+    
+    The gaps between letters are optional, so a trigger word written
+    normally (no spaces) is reported as well.
+    
+    Args:
+        text: Input text to analyze
+        
+    Returns:
+        Matching entries of SPACED_TRIGGER_WORDS, in list order
+    """
+    # Lowercase and squeeze whitespace runs to a single space before matching
+    haystack = re.sub(r'\s+', ' ', text.lower())
+    
+    def _appears(word: str) -> bool:
+        letters = r'\s*'.join(re.escape(ch) for ch in word)
+        return re.search(r'\b' + letters + r'\b', haystack, re.IGNORECASE) is not None
+    
+    return [word for word in SPACED_TRIGGER_WORDS if _appears(word)]
+
+
+# =============================================================================
+# DETECTION FUNCTIONS
+# =============================================================================
+
+def detect_jailbreak_patterns(text: str) -> Tuple[bool, List[str], Dict[str, int]]:
+    """
+    Detect jailbreak patterns in input text.
+    
+    Args:
+        text: Input text to analyze (empty or non-str input yields no hits)
+        
+    Returns:
+        Tuple of (has_jailbreak, "[category] match" labels, per-category scores)
+    """
+    if not text or not isinstance(text, str):
+        return False, [], {}
+    
+    detected_patterns = []
+    category_scores = {}
+    compiled = _get_compiled_patterns()
+    
+    # Check each category
+    for category, patterns in compiled.items():
+        category_hits = 0
+        for pattern in patterns:
+            matches = pattern.findall(text)
+            if matches:
+                detected_patterns.extend([
+                    f"[{category}] {m}" if isinstance(m, str) else f"[{category}] pattern_match"
+                    for m in matches[:3]  # Limit matches per pattern
+                ])
+                category_hits += len(matches)
+        
+        if category_hits > 0:
+            # Crisis patterns get maximum weight - any hit is serious
+            if category == "crisis":
+                category_scores[category] = min(category_hits * 50, 100)
+            else:
+                category_scores[category] = min(category_hits * 10, 50)
+    
+    # Check for spaced trigger words
+    spaced_words = detect_spaced_trigger_words(text)
+    if spaced_words:
+        detected_patterns.extend([f"[spaced_text] {w}" for w in spaced_words])
+        category_scores["spaced_text"] = min(len(spaced_words) * 5, 25)
+    
+    # Check normalized text for hidden l33t speak. normalize_leet_speak() keeps
+    # letter case, so compare against the raw text: they differ only when a
+    # substitution was actually made (plain uppercase input must not count).
+    normalized = normalize_leet_speak(text)
+    if normalized != text and any(
+        pattern.search(normalized) for patterns in compiled.values() for pattern in patterns
+    ):
+        detected_patterns.append("[leet_obfuscation] pattern in normalized text")
+        category_scores["leet_obfuscation"] = 15
+    
+    has_jailbreak = len(detected_patterns) > 0
+    return has_jailbreak, detected_patterns, category_scores
+
+
+def score_input_risk(text: str) -> int:
+    """
+    Score input text for jailbreak risk on a 0-100 scale.
+    
+    The score is the sum of the per-category scores, plus a bonus for how
+    many categories fired, plus a bonus for dense hits, capped at 100.
+    
+    Args:
+        text: Input text to score
+        
+    Returns:
+        Risk score from 0 (safe) to 100 (high risk)
+    """
+    if not text or not isinstance(text, str):
+        return 0
+    
+    flagged, hits, per_category = detect_jailbreak_patterns(text)
+    if not flagged:
+        return 0
+    
+    total = sum(per_category.values())
+    
+    # Breadth bonus: 3+ categories -> 25, exactly 2 -> 15, exactly 1 -> 5
+    breadth = len(per_category)
+    if breadth >= 3:
+        total += 25
+    elif breadth == 2:
+        total += 15
+    elif breadth == 1:
+        total += 5
+    
+    # Density bonus: hits per 100 characters, where texts shorter than 100
+    # characters are treated as exactly 100 long
+    per_hundred_chars = max(len(text) / 100, 1)
+    if len(hits) / per_hundred_chars > 0.5:
+        total += 10
+    
+    return min(total, 100)
+
+
+# =============================================================================
+# SANITIZATION FUNCTIONS
+# =============================================================================
+
+def strip_jailbreak_patterns(text: str) -> str:
+    """
+    Remove every match of the known jailbreak patterns from text.
+    
+    Args:
+        text: Input text to sanitize
+        
+    Returns:
+        The stripped text with whitespace tidied; empty or non-str input
+        is returned unchanged
+    """
+    if not text or not isinstance(text, str):
+        return text
+    
+    result = text
+    
+    # Patterns are applied one after another, category by category, each
+    # working on the output of the previous substitution
+    for pattern_group in _get_compiled_patterns().values():
+        for regex in pattern_group:
+            result = regex.sub('', result)
+    
+    # Squeeze 3+ newlines to a blank line and runs of spaces to one space
+    result = re.sub(r'\n{3,}', '\n\n', result)
+    result = re.sub(r' {2,}', ' ', result)
+    
+    return result.strip()
+
+
+def sanitize_input(text: str, aggressive: bool = False) -> Tuple[str, int, List[str]]:
+    """
+    Sanitize input text by normalizing and stripping jailbreak patterns.
+    
+    NOTE(review): cleaned_text is built from the l33t-normalized, collapsed
+    text, so digits and symbols in benign input are rewritten as well.
+    
+    Args:
+        text: Input text to sanitize
+        aggressive: If True, more aggressively remove suspicious content
+        
+    Returns:
+        Tuple of (cleaned_text, risk_score, detected_patterns)
+    """
+    if not text or not isinstance(text, str):
+        return text, 0, []
+    
+    # Step 1: Check original text for patterns
+    _, patterns, _ = detect_jailbreak_patterns(text)
+    all_patterns = list(patterns)
+    
+    # Step 2: Normalize l33t speak
+    normalized = normalize_leet_speak(text)
+    
+    # Step 3: Collapse spaced text
+    collapsed = collapse_spaced_text(normalized)
+    
+    # Step 4: Check normalized/collapsed text for additional patterns
+    _, patterns_collapsed, _ = detect_jailbreak_patterns(collapsed)
+    all_patterns.extend([p for p in patterns_collapsed if p not in all_patterns])
+    
+    # Step 5: Check for spaced trigger words specifically. Step 1 already
+    # reports these, so only add labels that are not in the list yet.
+    spaced_labels = [f"[spaced_text] {w}" for w in detect_spaced_trigger_words(text)]
+    all_patterns.extend([p for p in spaced_labels if p not in all_patterns])
+    
+    # Step 6: Calculate risk score using original and normalized
+    risk_score = max(score_input_risk(text), score_input_risk(collapsed))
+    
+    # Step 7: Strip jailbreak patterns
+    cleaned = strip_jailbreak_patterns(collapsed)
+    
+    # Step 8: If aggressive mode and high risk, strip more aggressively
+    if aggressive and risk_score >= RiskLevel.HIGH:
+        # Remove any remaining bracketed content that looks like markers
+        cleaned = re.sub(r'\[\w+\]', '', cleaned)
+        # Remove special token patterns
+        cleaned = re.sub(r'<\|[^|]+\|>', '', cleaned)
+    
+    # Final cleanup
+    cleaned = cleaned.strip()
+    
+    # Log sanitization event if patterns were found
+    if all_patterns and logger.isEnabledFor(logging.DEBUG):
+        logger.debug(
+            "Input sanitized: %d patterns detected, risk_score=%d",
+            len(all_patterns), risk_score
+        )
+    
+    return cleaned, risk_score, all_patterns
+
+
+def sanitize_input_full(text: str, block_threshold: int = RiskLevel.HIGH) -> SanitizationResult:
+    """
+    Sanitize text and package the outcome as a SanitizationResult.
+    
+    Args:
+        text: Input text to sanitize
+        block_threshold: Scores at or above this value mark the input blocked
+        
+    Returns:
+        SanitizationResult with the cleaned text, score, patterns, risk
+        level label and blocked flag
+    """
+    cleaned, risk_score, patterns = sanitize_input(text)
+    
+    # Bands from highest to lowest; the first threshold the score reaches
+    # names the level, and anything below LOW is "SAFE"
+    bands = (
+        (RiskLevel.CRITICAL, "CRITICAL"),
+        (RiskLevel.HIGH, "HIGH"),
+        (RiskLevel.MEDIUM, "MEDIUM"),
+        (RiskLevel.LOW, "LOW"),
+    )
+    risk_level = next(
+        (label for floor, label in bands if risk_score >= floor),
+        "SAFE",
+    )
+    should_block = risk_score >= block_threshold
+    
+    return SanitizationResult(
+        original_text=text,
+        cleaned_text=cleaned,
+        risk_score=risk_score,
+        detected_patterns=patterns,
+        risk_level=risk_level,
+        blocked=should_block,
+    )
+
+
+# =============================================================================
+# INTEGRATION HELPERS
+# =============================================================================
+
+def should_block_input(text: str, threshold: int = RiskLevel.HIGH) -> Tuple[bool, int, List[str]]:
+    """
+    Decide whether input should be blocked, without sanitizing it.
+    
+    Args:
+        text: Input text to check
+        threshold: Risk score at or above which the input is blocked
+        
+    Returns:
+        Tuple of (should_block, risk_score, detected_patterns)
+    """
+    score = score_input_risk(text)
+    found = detect_jailbreak_patterns(text)[1]
+    blocked = score >= threshold
+    if not blocked:
+        return blocked, score, found
+    
+    logger.warning(
+        "Input blocked: jailbreak patterns detected (risk_score=%d, threshold=%d)",
+        score, threshold
+    )
+    return blocked, score, found
+
+
+def log_sanitization_event(
+    result: SanitizationResult,
+    source: str = "unknown",
+    session_id: Optional[str] = None
+) -> None:
+    """
+    Emit one security-audit log record for a sanitization result.
+    
+    Args:
+        result: The sanitization result (results scoring below LOW are skipped)
+        source: Source of the input (e.g., "cli", "gateway", "api")
+        session_id: Optional session identifier
+    """
+    if result.risk_score < RiskLevel.LOW:
+        return  # Don't log safe inputs
+    
+    # WARNING when blocked, INFO from MEDIUM upward, DEBUG otherwise
+    if result.blocked:
+        emit, headline = logger.warning, "SECURITY: Input blocked - %s"
+    elif result.risk_score >= RiskLevel.MEDIUM:
+        emit, headline = logger.info, "SECURITY: Suspicious input sanitized - %s"
+    else:
+        emit, headline = logger.debug, "SECURITY: Input sanitized - %s"
+    emit(headline, {
+        "event": "input_sanitization",
+        "source": source,
+        "session_id": session_id,
+        "risk_level": result.risk_level,
+        "risk_score": result.risk_score,
+        "blocked": result.blocked,
+        "pattern_count": len(result.detected_patterns),
+        "patterns": result.detected_patterns[:5],  # Limit logged patterns
+        "original_length": len(result.original_text),
+        "cleaned_length": len(result.cleaned_text),
+    })
+
+
+# =============================================================================
+# LEGACY COMPATIBILITY
+# =============================================================================
+
+def check_input_safety(text: str) -> Dict[str, Any]:
+    """
+    Legacy compatibility wrapper for simple safety checks.
+    
+    Returns dict with 'safe', 'score', 'patterns' and 'risk_level' keys.
+    """
+    score = score_input_risk(text)
+    patterns = detect_jailbreak_patterns(text)[1]
+    # Count how many ascending thresholds the score has reached to pick the label
+    floors = (RiskLevel.LOW, RiskLevel.MEDIUM, RiskLevel.HIGH, RiskLevel.CRITICAL)
+    labels = ("SAFE", "LOW", "MEDIUM", "HIGH", "CRITICAL")
+    risk_level = labels[sum(score >= floor for floor in floors)]
+    return {
+        "safe": score < RiskLevel.MEDIUM,
+        "score": score,
+        "patterns": patterns,
+        "risk_level": risk_level,
+    }
--- a/agent/insights.py
+++ b/agent/insights.py
@@ -27,7 +27,6 @@ from agent.usage_pricing import (
    DEFAULT_PRICING,
    estimate_usage_cost,
    format_duration_compact,
-    get_pricing,
    has_known_pricing,
 )

--- a/agent/memory_manager.py
+++ b/agent/memory_manager.py
@@ -28,12 +28,10 @@ Usage in run_agent.py:

 from __future__ import annotations

-import json
 import logging
 import re
 from typing import Any, Dict, List, Optional

-from agent.context_strategy import ContextBudget, compute_prefetch_params, should_prefetch
 from agent.memory_provider import MemoryProvider
 from tools.registry import tool_error

@@ -81,7 +79,6 @@ class MemoryManager:
        self._providers: List[MemoryProvider] = []
        self._tool_to_provider: Dict[str, MemoryProvider] = {}
        self._has_external: bool = False  # True once a non-builtin provider is added
-        self._context_budget: Optional[ContextBudget] = None

    # -- Registration --------------------------------------------------------

@@ -164,77 +161,18 @@ class MemoryManager:
                )
        return "\n\n".join(blocks)

-    # -- Context budget (for adaptive retrieval) -----------------------------
-
-    def set_context_budget(
-        self,
-        context_length: int,
-        used_tokens: int,
-        threshold_tokens: int,
-        compression_enabled: bool = True,
-    ) -> None:
-        """Update the context budget snapshot for adaptive retrieval.
-
-        Called by run_agent.py before each prefetch_all() call so the
-        memory manager can adjust retrieval parameters based on how much
-        context headroom remains.
-        """
-        self._context_budget = ContextBudget(
-            context_length=context_length,
-            used_tokens=used_tokens,
-            threshold_tokens=threshold_tokens,
-            compression_enabled=compression_enabled,
-        )
-
    # -- Prefetch / recall ---------------------------------------------------

    def prefetch_all(self, query: str, *, session_id: str = "") -> str:
        """Collect prefetch context from all providers.

-        Uses the current context budget (if set) to adaptively adjust
-        retrieval limits and trust thresholds via the context strategy
-        framework. When budget is not set, falls back to provider defaults.
-
        Returns merged context text labeled by provider. Empty providers
        are skipped. Failures in one provider don't block others.
        """
-        # Check if we should skip prefetch entirely based on context pressure
-        if self._context_budget and not should_prefetch(self._context_budget, query):
-            logger.debug(
-                "Context pressure %.1f%% — skipping prefetch for this query",
-                self._context_budget.pressure * 100,
-            )
-            return ""
-
-        # Compute adaptive prefetch params from context strategy
-        prefetch_kwargs = {}
-        if self._context_budget:
-            params = compute_prefetch_params(self._context_budget)
-            if params.get("skip"):
-                return ""
-            prefetch_kwargs = {
-                "limit": params["limit"],
-                "min_trust": params["min_trust"],
-            }
-
        parts = []
        for provider in self._providers:
            try:
-                # Try passing adaptive params — providers that support them
-                # (like holographic) will use them; others ignore via **kwargs
-                # or TypeError (caught below).
-                if prefetch_kwargs:
-                    try:
-                        result = provider.prefetch(
-                            query,
-                            session_id=session_id,
-                            **prefetch_kwargs,
-                        )
-                    except TypeError:
-                        # Provider doesn't accept extra kwargs — call without
-                        result = provider.prefetch(query, session_id=session_id)
-                else:
-                    result = provider.prefetch(query, session_id=session_id)
+                result = provider.prefetch(query, session_id=session_id)
                if result and result.strip():
                    parts.append(result)
            except Exception as e:
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@@ -5,7 +5,6 @@ and run_agent.py for pre-flight context checks.
 """

 import logging
-import os
 import re
 import time
 from pathlib import Path
@@ -24,15 +23,20 @@ logger = logging.getLogger(__name__)
 # are preserved so the full model name reaches cache lookups and server queries.
 _PROVIDER_PREFIXES: frozenset[str] = frozenset({
    "openrouter", "nous", "openai-codex", "copilot", "copilot-acp",
-    "gemini", "zai", "kimi-coding", "minimax", "minimax-cn", "anthropic", "deepseek",
+    "gemini", "zai", "kimi-coding", "kimi-coding-cn", "minimax", "minimax-cn", "anthropic", "deepseek",
    "opencode-zen", "opencode-go", "ai-gateway", "kilocode", "alibaba",
    "qwen-oauth",
+    "xiaomi",
+    "arcee",
    "custom", "local",
    # Common aliases
    "google", "google-gemini", "google-ai-studio",
    "glm", "z-ai", "z.ai", "zhipu", "github", "github-copilot",
-    "github-models", "kimi", "moonshot", "claude", "deep-seek",
+    "github-models", "kimi", "moonshot", "kimi-cn", "moonshot-cn", "claude", "deep-seek",
    "opencode", "zen", "go", "vercel", "kilo", "dashscope", "aliyun", "qwen",
+    "mimo", "xiaomi-mimo",
+    "arcee-ai", "arceeai",
+    "xai", "x-ai", "x.ai", "grok",
    "qwen-portal",
 })

@@ -83,6 +87,11 @@ CONTEXT_PROBE_TIERS = [
 # Default context length when no detection method succeeds.
 DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]

+# Minimum context length required to run Hermes Agent.  Models with fewer
+# tokens cannot maintain enough working memory for tool-calling workflows.
+# Sessions, model switches, and cron jobs should reject models below this.
+MINIMUM_CONTEXT_LENGTH = 64_000
+
 # Thin fallback defaults — only broad model family patterns.
 # These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
 # all miss. Replaced the previous 80+ entry dict.
@@ -98,9 +107,15 @@ DEFAULT_CONTEXT_LENGTHS = {
    "claude-sonnet-4.6": 1000000,
    # Catch-all for older Claude models (must sort after specific entries)
    "claude": 200000,
-    # OpenAI
+    # OpenAI — GPT-5 family (most have 400k; specific overrides first)
+    # Source: https://developers.openai.com/api/docs/models
+    "gpt-5.4-nano": 400000,           # 400k (not 1.05M like full 5.4)
+    "gpt-5.4-mini": 400000,           # 400k (not 1.05M like full 5.4)
+    "gpt-5.4": 1050000,               # GPT-5.4, GPT-5.4 Pro (1.05M context)
+    "gpt-5.3-codex-spark": 128000,    # Spark variant has reduced 128k context
+    "gpt-5.1-chat": 128000,           # Chat variant has 128k context
+    "gpt-5": 400000,                  # GPT-5.x base, mini, codex variants (400k)
    "gpt-4.1": 1047576,
-    "gpt-5": 128000,
    "gpt-4": 128000,
    # Google
    "gemini": 1048576,
@@ -142,6 +157,8 @@ DEFAULT_CONTEXT_LENGTHS = {
    "kimi": 262144,
    # Arcee
    "trinity": 262144,
+    # OpenRouter
+    "elephant": 262144,
    # Hugging Face Inference Providers — model IDs use org/name format
    "Qwen/Qwen3.5-397B-A17B": 131072,
    "Qwen/Qwen3.5-35B-A3B": 131072,
@@ -149,9 +166,10 @@ DEFAULT_CONTEXT_LENGTHS = {
    "moonshotai/Kimi-K2.5": 262144,
    "moonshotai/Kimi-K2-Thinking": 262144,
    "MiniMaxAI/MiniMax-M2.5": 204800,
-    "XiaomiMiMo/MiMo-V2-Flash": 32768,
-    "mimo-v2-pro": 1048576,
-    "mimo-v2-omni": 1048576,
+    "XiaomiMiMo/MiMo-V2-Flash": 256000,
+    "mimo-v2-pro": 1000000,
+    "mimo-v2-omni": 256000,
+    "mimo-v2-flash": 256000,
    "zai-org/GLM-5": 202752,
 }

@@ -176,6 +194,12 @@ _MAX_COMPLETION_KEYS = (

 # Local server hostnames / address patterns
 _LOCAL_HOSTS = ("localhost", "127.0.0.1", "::1", "0.0.0.0")
+# Docker / Podman / Lima DNS names that resolve to the host machine
+_CONTAINER_LOCAL_SUFFIXES = (
+    ".docker.internal",
+    ".containers.internal",
+    ".lima.internal",
+)


 def _normalize_base_url(base_url: str) -> str:
@@ -197,7 +221,9 @@ _URL_TO_PROVIDER: Dict[str, str] = {
    "api.anthropic.com": "anthropic",
    "api.z.ai": "zai",
    "api.moonshot.ai": "kimi-coding",
+    "api.moonshot.cn": "kimi-coding-cn",
    "api.kimi.com": "kimi-coding",
+    "api.arcee.ai": "arcee",
    "api.minimax": "minimax",
    "dashscope.aliyuncs.com": "alibaba",
    "dashscope-intl.aliyuncs.com": "alibaba",
@@ -211,6 +237,8 @@ _URL_TO_PROVIDER: Dict[str, str] = {
    "api.fireworks.ai": "fireworks",
    "opencode.ai": "opencode-go",
    "api.x.ai": "xai",
+    "api.xiaomimimo.com": "xiaomi",
+    "xiaomimimo.com": "xiaomi",
 }


@@ -249,6 +277,9 @@ def is_local_endpoint(base_url: str) -> bool:
        return False
    if host in _LOCAL_HOSTS:
        return True
+    # Docker / Podman / Lima internal DNS names (e.g. host.docker.internal)
+    if any(host.endswith(suffix) for suffix in _CONTAINER_LOCAL_SUFFIXES):
+        return True
    # RFC-1918 private ranges and link-local
    import ipaddress
    try:
@@ -756,12 +787,12 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
                resp = client.post(f"{server_url}/api/show", json={"name": model})
                if resp.status_code == 200:
                    data = resp.json()
-                    # Check model_info for context length
-                    model_info = data.get("model_info", {})
-                    for key, value in model_info.items():
-                        if "context_length" in key and isinstance(value, (int, float)):
-                            return int(value)
-                    # Check parameters string for num_ctx
+                    # Prefer explicit num_ctx from Modelfile parameters: this is
+                    # the *runtime* context Ollama will actually allocate KV cache
+                    # for. The GGUF model_info.context_length is the training max,
+                    # which can be larger than num_ctx — using it here would let
+                    # Hermes grow conversations past the runtime limit and Ollama
+                    # would silently truncate. Matches query_ollama_num_ctx().
                    params = data.get("parameters", "")
                    if "num_ctx" in params:
                        for line in params.split("\n"):
@@ -772,6 +803,11 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
                                        return int(parts[-1])
                                    except ValueError:
                                        pass
+                    # Fall back to GGUF model_info context_length (training max)
+                    model_info = data.get("model_info", {})
+                    for key, value in model_info.items():
+                        if "context_length" in key and isinstance(value, (int, float)):
+                            return int(value)

            # LM Studio native API: /api/v1/models returns max_context_length.
            # This is more reliable than the OpenAI-compat /v1/models which
@@ -1026,16 +1062,21 @@ def get_model_context_length(


 def estimate_tokens_rough(text: str) -> int:
-    """Rough token estimate (~4 chars/token) for pre-flight checks."""
+    """Rough token estimate (~4 chars/token) for pre-flight checks.
+
+    Uses ceiling division so short texts (1-3 chars) never estimate as
+    0 tokens, which would cause the compressor and pre-flight checks to
+    systematically undercount when many short tool results are present.
+    """
    if not text:
        return 0
-    return len(text) // 4
+    return (len(text) + 3) // 4


 def estimate_messages_tokens_rough(messages: List[Dict[str, Any]]) -> int:
    """Rough token estimate for a message list (pre-flight only)."""
    total_chars = sum(len(str(msg)) for msg in messages)
-    return total_chars // 4
+    return (total_chars + 3) // 4


 def estimate_request_tokens_rough(
@@ -1058,4 +1099,4 @@ def estimate_request_tokens_rough(
        total_chars += sum(len(str(msg)) for msg in messages)
    if tools:
        total_chars += len(str(tools))
-    return total_chars // 4
+    return (total_chars + 3) // 4
--- a/agent/models_dev.py
+++ b/agent/models_dev.py
@@ -18,10 +18,8 @@ Other modules should import the dataclasses and query functions from here
 rather than parsing the raw JSON themselves.
 """

-import difflib
 import json
 import logging
-import os
 import time
 from dataclasses import dataclass
 from pathlib import Path
@@ -144,8 +142,11 @@ class ProviderInfo:
 PROVIDER_TO_MODELS_DEV: Dict[str, str] = {
    "openrouter": "openrouter",
    "anthropic": "anthropic",
+    "openai": "openai",
+    "openai-codex": "openai",
    "zai": "zai",
    "kimi-coding": "kimi-for-coding",
+    "kimi-coding-cn": "kimi-for-coding",
    "minimax": "minimax",
    "minimax-cn": "minimax-cn",
    "deepseek": "deepseek",
@@ -161,6 +162,7 @@ PROVIDER_TO_MODELS_DEV: Dict[str, str] = {
    "gemini": "google",
    "google": "google",
    "xai": "xai",
+    "xiaomi": "xiaomi",
    "nvidia": "nvidia",
    "groq": "groq",
    "mistral": "mistral",
@@ -173,13 +175,6 @@ PROVIDER_TO_MODELS_DEV: Dict[str, str] = {
 _MODELS_DEV_TO_PROVIDER: Optional[Dict[str, str]] = None


-def _get_reverse_mapping() -> Dict[str, str]:
-    """Return models.dev ID → Hermes provider ID mapping."""
-    global _MODELS_DEV_TO_PROVIDER
-    if _MODELS_DEV_TO_PROVIDER is None:
-        _MODELS_DEV_TO_PROVIDER = {v: k for k, v in PROVIDER_TO_MODELS_DEV.items()}
-    return _MODELS_DEV_TO_PROVIDER
-

 def _get_cache_path() -> Path:
    """Return path to disk cache file."""
@@ -383,7 +378,14 @@ def get_model_capabilities(provider: str, model: str) -> Optional[ModelCapabilit

    # Extract capability flags (default to False if missing)
    supports_tools = bool(entry.get("tool_call", False))
-    supports_vision = bool(entry.get("attachment", False))
+    # Vision: check both the `attachment` flag and `modalities.input` for "image".
+    # Some models (e.g. gemma-4) list image in input modalities but not attachment.
+    input_mods = entry.get("modalities", {})
+    if isinstance(input_mods, dict):
+        input_mods = input_mods.get("input", [])
+    else:
+        input_mods = []
+    supports_vision = bool(entry.get("attachment", False)) or "image" in input_mods
    supports_reasoning = bool(entry.get("reasoning", False))

    # Extract limits
@@ -453,93 +455,6 @@ def list_agentic_models(provider: str) -> List[str]:
    return result


-def search_models_dev(
-    query: str, provider: str = None, limit: int = 5
-) -> List[Dict[str, Any]]:
-    """Fuzzy search across models.dev catalog. Returns matching model entries.
-
-    Args:
-        query: Search string to match against model IDs.
-        provider: Optional Hermes provider ID to restrict search scope.
-                  If None, searches across all providers in PROVIDER_TO_MODELS_DEV.
-        limit: Maximum number of results to return.
-
-    Returns:
-        List of dicts, each containing 'provider', 'model_id', and the full
-        model 'entry' from models.dev.
-    """
-    data = fetch_models_dev()
-    if not data:
-        return []
-
-    # Build list of (provider_id, model_id, entry) candidates
-    candidates: List[tuple] = []
-
-    if provider is not None:
-        # Search only the specified provider
-        mdev_provider_id = PROVIDER_TO_MODELS_DEV.get(provider)
-        if not mdev_provider_id:
-            return []
-        provider_data = data.get(mdev_provider_id, {})
-        if isinstance(provider_data, dict):
-            models = provider_data.get("models", {})
-            if isinstance(models, dict):
-                for mid, mdata in models.items():
-                    candidates.append((provider, mid, mdata))
-    else:
-        # Search across all mapped providers
-        for hermes_prov, mdev_prov in PROVIDER_TO_MODELS_DEV.items():
-            provider_data = data.get(mdev_prov, {})
-            if isinstance(provider_data, dict):
-                models = provider_data.get("models", {})
-                if isinstance(models, dict):
-                    for mid, mdata in models.items():
-                        candidates.append((hermes_prov, mid, mdata))
-
-    if not candidates:
-        return []
-
-    # Use difflib for fuzzy matching — case-insensitive comparison
-    model_ids_lower = [c[1].lower() for c in candidates]
-    query_lower = query.lower()
-
-    # First try exact substring matches (more intuitive than pure edit-distance)
-    substring_matches = []
-    for prov, mid, mdata in candidates:
-        if query_lower in mid.lower():
-            substring_matches.append({"provider": prov, "model_id": mid, "entry": mdata})
-
-    # Then add difflib fuzzy matches for any remaining slots
-    fuzzy_ids = difflib.get_close_matches(
-        query_lower, model_ids_lower, n=limit * 2, cutoff=0.4
-    )
-
-    seen_ids: set = set()
-    results: List[Dict[str, Any]] = []
-
-    # Prioritize substring matches
-    for match in substring_matches:
-        key = (match["provider"], match["model_id"])
-        if key not in seen_ids:
-            seen_ids.add(key)
-            results.append(match)
-            if len(results) >= limit:
-                return results
-
-    # Add fuzzy matches
-    for fid in fuzzy_ids:
-        # Find original-case candidates matching this lowered ID
-        for prov, mid, mdata in candidates:
-            if mid.lower() == fid:
-                key = (prov, mid)
-                if key not in seen_ids:
-                    seen_ids.add(key)
-                    results.append({"provider": prov, "model_id": mid, "entry": mdata})
-                    if len(results) >= limit:
-                        return results
-
-    return results
-

 # ---------------------------------------------------------------------------
 # Rich dataclass constructors — parse raw models.dev JSON into dataclasses
--- a/agent/mtls.py
+++ b/agent/mtls.py
@@ -0,0 +1,184 @@
+"""
+agent/mtls.py — Mutual TLS support for Hermes A2A communication.
+
+Provides:
+- build_server_ssl_context()  — SSL context for uvicorn that requires client certs
+- build_client_ssl_context()  — SSL context for httpx/aiohttp A2A clients
+- MTLSMiddleware               — FastAPI middleware that enforces client cert on A2A routes
+- is_mtls_configured()        — Check if env vars are set
+
+Configuration (environment variables):
+  HERMES_MTLS_CERT   Path to this agent's TLS certificate (PEM)
+  HERMES_MTLS_KEY    Path to this agent's TLS private key (PEM)
+  HERMES_MTLS_CA     Path to the Fleet CA certificate (PEM) — used to verify peers
+
+All three must be set to enable mTLS. If any is missing, mTLS is disabled and
+the server falls back to plain HTTP (or regular TLS without client auth).
+"""
+
+import logging
+import os
+import ssl
+from pathlib import Path
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+# A2A routes that require a valid client certificate when mTLS is enabled.
+_A2A_PATH_PREFIXES = (
+    "/.well-known/agent-card",
+    "/agent-card",
+    "/api/agent-card",
+    "/a2a/",
+)
+
+
+def _get_env(key: str) -> Optional[str]:
+    val = os.environ.get(key, "").strip()  # surrounding whitespace is not significant
+    return val or None  # unset, empty, and whitespace-only values all map to None
+
+
+def is_mtls_configured() -> bool:
+    """Return True if all three mTLS env vars are set and each names an existing file."""
+    cert = _get_env("HERMES_MTLS_CERT")
+    key = _get_env("HERMES_MTLS_KEY")
+    ca = _get_env("HERMES_MTLS_CA")
+    if not (cert and key and ca):
+        return False  # unset/blank variables disable mTLS without logging
+    for label, path in (("HERMES_MTLS_CERT", cert), ("HERMES_MTLS_KEY", key), ("HERMES_MTLS_CA", ca)):
+        if not Path(path).is_file():
+            logger.warning("mTLS disabled: %s file not found: %s", label, path)
+            return False  # first missing file wins; later paths are not checked
+    return True
+
+
+def build_server_ssl_context() -> ssl.SSLContext:
+    """
+    Build an SSL context for the A2A server that:
+    - presents its own certificate
+    - requires and verifies the client's certificate against the Fleet CA
+
+    Raises:
+        RuntimeError: if any of the three mTLS env vars is unset or blank
+        OSError (incl. ssl.SSLError): if a cert/key/CA file is missing or invalid
+    """
+    cert = _get_env("HERMES_MTLS_CERT")
+    key = _get_env("HERMES_MTLS_KEY")
+    ca = _get_env("HERMES_MTLS_CA")
+
+    if not (cert and key and ca):
+        raise RuntimeError(
+            "mTLS not configured. Set HERMES_MTLS_CERT, HERMES_MTLS_KEY, and HERMES_MTLS_CA."
+        )
+
+    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+    ctx.minimum_version = ssl.TLSVersion.TLSv1_2
+    ctx.load_cert_chain(certfile=cert, keyfile=key)  # file existence is not pre-checked here
+    ctx.load_verify_locations(cafile=ca)
+    # CERT_REQUIRED: reject connections without a valid client cert
+    ctx.verify_mode = ssl.CERT_REQUIRED
+    logger.info("mTLS server context built (cert=%s, CA=%s)", cert, ca)
+    return ctx
+
+
+def build_client_ssl_context() -> ssl.SSLContext:
+    """
+    Build an SSL context for outbound A2A connections that:
+    - presents this agent's certificate as a client cert
+    - verifies the remote server against the Fleet CA
+
+    Raises:
+        RuntimeError: if any of the three mTLS env vars is unset or blank
+        OSError (incl. ssl.SSLError): if a cert/key/CA file is missing or invalid
+    """
+    cert = _get_env("HERMES_MTLS_CERT")
+    key = _get_env("HERMES_MTLS_KEY")
+    ca = _get_env("HERMES_MTLS_CA")
+
+    if not (cert and key and ca):
+        raise RuntimeError(
+            "mTLS not configured. Set HERMES_MTLS_CERT, HERMES_MTLS_KEY, and HERMES_MTLS_CA."
+        )
+
+    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+    ctx.minimum_version = ssl.TLSVersion.TLSv1_2
+    ctx.load_cert_chain(certfile=cert, keyfile=key)  # file existence is not pre-checked here
+    ctx.load_verify_locations(cafile=ca)  # only the Fleet CA is loaded as a trust anchor here
+    ctx.verify_mode = ssl.CERT_REQUIRED  # PROTOCOL_TLS_CLIENT default, restated explicitly
+    ctx.check_hostname = True  # PROTOCOL_TLS_CLIENT default, restated explicitly
+    logger.info("mTLS client context built (cert=%s, CA=%s)", cert, ca)
+    return ctx
+
+
+def get_peer_cn(ssl_object) -> Optional[str]:
+    """Return the first commonName in the peer certificate's subject, or None if unavailable."""
+    try:
+        peer_cert = ssl_object.getpeercert()
+        if not peer_cert:
+            return None
+        for rdn in peer_cert.get("subject", ()):
+            for attr, value in rdn:
+                if attr == "commonName":
+                    return value
+    except Exception:
+        pass  # best-effort lookup: any failure is reported as "no CN"
+    return None
+
+
+class MTLSMiddleware:
+    """
+    ASGI middleware that rejects A2A requests lacking client-certificate info.
+
+    When mTLS is NOT configured (env vars unset or files missing), the scope type
+    is not "http", or the route is not an A2A route, the request passes unchanged.
+
+    When mTLS IS configured and the route matches an A2A prefix, the middleware
+    requires scope["extensions"]["tls"]["peer_cert"] to be present (not None);
+    it does not validate the certificate itself. Otherwise it returns HTTP 403.
+
+    Note: This middleware only provides defence-in-depth at the app layer.
+    The primary enforcement is at the SSL context level (CERT_REQUIRED on the
+    server context). This middleware is useful when the server runs behind a
+    TLS-terminating proxy that forwards cert info via headers (not yet
+    implemented) or for test-time injection.
+    """
+
+    def __init__(self, app):
+        self.app = app
+        self._enabled = is_mtls_configured()  # evaluated once; later env changes are ignored
+        if self._enabled:
+            logger.info("MTLSMiddleware enabled — A2A routes require client cert")
+
+    def _is_a2a_route(self, path: str) -> bool:
+        return any(path.startswith(prefix) for prefix in _A2A_PATH_PREFIXES)
+
+    async def __call__(self, scope, receive, send):
+        if scope["type"] == "http" and self._enabled and self._is_a2a_route(scope.get("path", "")):
+            # NOTE(review): the ASGI TLS extension defines client_cert_chain / client_cert_name, not "peer_cert" — confirm what populates this key
+            transport = scope.get("extensions", {}).get("tls", {})
+            peer_cert = transport.get("peer_cert")
+            if peer_cert is None:
+                # No client cert — reject
+                response = _forbidden_response("Client certificate required for A2A endpoints")
+                await response(scope, receive, send)
+                return
+
+        await self.app(scope, receive, send)
+
+
+def _forbidden_response(message: str):
+    """Return an ASGI callable that sends a plain-text 403 whose body is *message*."""
+    body = message.encode()  # content-length below is computed from these bytes
+
+    async def respond(scope, receive, send):
+        await send({
+            "type": "http.response.start",
+            "status": 403,
+            "headers": [
+                (b"content-type", b"text/plain"),
+                (b"content-length", str(len(body)).encode()),
+            ],
+        })
+        await send({"type": "http.response.body", "body": body})
+
+    return respond
--- a/agent/privacy_filter.py
+++ b/agent/privacy_filter.py
@@ -0,0 +1,353 @@
+"""Privacy Filter — strip PII from context before remote API calls.
+
+Implements Vitalik's Pattern 2: "A local model can strip out private data
+before passing the query along to a remote LLM."
+
+When Hermes routes a request to a cloud provider (Anthropic, OpenRouter, etc.),
+this module sanitizes the message context to remove personally identifiable
+information before it leaves the user's machine.
+
+Threat model (from Vitalik's secure LLM architecture):
+- Privacy (other): Non-LLM data leakage via search queries, API calls
+- LLM accidents: LLM accidentally leaking private data in prompts
+- LLM jailbreaks: Remote content extracting private context
+
+Usage:
+    from agent.privacy_filter import PrivacyFilter, sanitize_messages
+
+    pf = PrivacyFilter()
+    safe_messages = pf.sanitize_messages(messages)
+    # safe_messages has PII replaced with [REDACTED] tokens
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+from dataclasses import dataclass, field
+from enum import Enum, auto
+from typing import Any, Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+
+class Sensitivity(Enum):
+    """Content sensitivity tiers, declared in ascending order (compared via .value)."""
+    PUBLIC = auto()       # No PII detected
+    LOW = auto()          # Environment details (e.g., internal IP addresses)
+    MEDIUM = auto()       # Personal identifiers (email, phone, home-directory user)
+    HIGH = auto()         # Crypto addresses, card numbers, US SSN patterns
+    CRITICAL = auto()     # API keys, tokens, passwords, private keys
+
+
+@dataclass
+class RedactionReport:
+    """Summary of what was redacted from a message batch; see summary() for the text form."""
+    total_messages: int = 0
+    redacted_messages: int = 0  # had_redactions is True once this exceeds zero
+    redactions: List[Dict[str, Any]] = field(default_factory=list)  # summary() reads 'type' and 'count'
+    max_sensitivity: Sensitivity = Sensitivity.PUBLIC
+
+    @property
+    def had_redactions(self) -> bool:
+        return self.redacted_messages > 0
+
+    def summary(self) -> str:
+        if not self.had_redactions:
+            return "No PII detected — context is clean for remote query."
+        parts = [f"Redacted {self.redacted_messages}/{self.total_messages} messages:"]
+        for r in self.redactions[:10]:  # list at most ten entries in detail
+            parts.append(f"  - {r['type']}: {r['count']} occurrence(s)")
+        if len(self.redactions) > 10:
+            parts.append(f"  ... and {len(self.redactions) - 10} more types")
+        return "\n".join(parts)
+
+
+# =========================================================================
+# PII pattern definitions
+# =========================================================================
+
+# Each pattern is (compiled_regex, redaction_type, sensitivity_level, replacement)
+_PII_PATTERNS: List[Tuple[re.Pattern, str, Sensitivity, str]] = []
+
+
+def _compile_patterns() -> None:
+    """Fill _PII_PATTERNS with compiled (regex, type, sensitivity, replacement) tuples; no-op if already filled."""
+    global _PII_PATTERNS
+    if _PII_PATTERNS:
+        return
+
+    raw_patterns = [
+        # --- CRITICAL: secrets and credentials ---
+        (
+            r'(?:api[_-]?key|apikey|secret[_-]?key|access[_-]?token)\s*[:=]\s*["\']?([A-Za-z0-9_\-\.]{20,})["\']?',
+            "api_key_or_token",
+            Sensitivity.CRITICAL,
+            "[REDACTED-API-KEY]",
+        ),
+        (
+            r'\b(?:sk-|sk_|pk_|rk_|ak_)[A-Za-z0-9]{20,}\b',
+            "prefixed_secret",
+            Sensitivity.CRITICAL,
+            "[REDACTED-SECRET]",
+        ),
+        (
+            r'\b(?:ghp_|gho_|ghu_|ghs_|ghr_)[A-Za-z0-9]{36,}\b',
+            "github_token",
+            Sensitivity.CRITICAL,
+            "[REDACTED-GITHUB-TOKEN]",
+        ),
+        (
+            r'\b(?:xox[bposa]-[A-Za-z0-9\-]+)\b',
+            "slack_token",
+            Sensitivity.CRITICAL,
+            "[REDACTED-SLACK-TOKEN]",
+        ),
+        (
+            r'(?:password|passwd|pwd)\s*[:=]\s*["\']?([^\s"\']{4,})["\']?',
+            "password",
+            Sensitivity.CRITICAL,
+            "[REDACTED-PASSWORD]",
+        ),
+        (
+            r'-----BEGIN (?:RSA |EC |DSA |OPENSSH |ENCRYPTED )?PRIVATE KEY-----(?:[\s\S]*?-----END [A-Z ]*PRIVATE KEY-----)?',
+            "private_key_block",  # header plus, when an END marker follows, the key body and END line
+            Sensitivity.CRITICAL,
+            "[REDACTED-PRIVATE-KEY]",
+        ),
+        # Ethereum / crypto addresses (42-char hex starting with 0x)
+        (
+            r'\b0x[a-fA-F0-9]{40}\b',
+            "ethereum_address",
+            Sensitivity.HIGH,
+            "[REDACTED-ETH-ADDR]",
+        ),
+        # Bitcoin addresses (base58, 25-34 chars starting with 1/3/bc1)
+        (
+            r'\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b',
+            "bitcoin_address",
+            Sensitivity.HIGH,
+            "[REDACTED-BTC-ADDR]",
+        ),
+        (
+            r'\bbc1[a-zA-HJ-NP-Z0-9]{39,59}\b',
+            "bech32_address",
+            Sensitivity.HIGH,
+            "[REDACTED-BTC-ADDR]",
+        ),
+        # --- HIGH: financial ---
+        (
+            r'\b(?:\d{4}[-\s]?){3}\d{4}\b',
+            "credit_card_number",
+            Sensitivity.HIGH,
+            "[REDACTED-CC]",
+        ),
+        (
+            r'\b\d{3}-\d{2}-\d{4}\b',
+            "us_ssn",
+            Sensitivity.HIGH,
+            "[REDACTED-SSN]",
+        ),
+        # --- MEDIUM: personal identifiers ---
+        # Email addresses
+        (
+            r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b',
+            "email_address",
+            Sensitivity.MEDIUM,
+            "[REDACTED-EMAIL]",
+        ),
+        # Phone numbers (US/international); \b cannot precede a leading "+", so intl uses (?<!\w)
+        (
+            r'\b(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b',
+            "phone_number_us",
+            Sensitivity.MEDIUM,
+            "[REDACTED-PHONE]",
+        ),
+        (
+            r'(?<!\w)\+\d{1,3}[-.\s]?\d{4,14}\b',
+            "phone_number_intl",
+            Sensitivity.MEDIUM,
+            "[REDACTED-PHONE]",
+        ),
+        # Filesystem paths that reveal user identity
+        (
+            r'(?:/Users/|/home/|C:\\Users\\)([A-Za-z0-9_\-]+)',
+            "user_home_path",
+            Sensitivity.MEDIUM,
+            r"/Users/[REDACTED-USER]",
+        ),
+        # --- LOW: environment / system info ---
+        # Internal IPs
+        (
+            r'\b(?:10\.\d{1,3}\.\d{1,3}\.\d{1,3}|172\.(?:1[6-9]|2\d|3[01])\.\d{1,3}\.\d{1,3}|192\.168\.\d{1,3}\.\d{1,3})\b',
+            "internal_ip",
+            Sensitivity.LOW,
+            "[REDACTED-IP]",
+        ),
+    ]
+
+    _PII_PATTERNS = [
+        (re.compile(pattern, re.IGNORECASE), rtype, sensitivity, replacement)
+        for pattern, rtype, sensitivity, replacement in raw_patterns
+    ]
+
+
+_compile_patterns()
+
+
+# =========================================================================
+# Sensitive file path patterns (context-aware)
+# =========================================================================
+
+# Case-insensitive regexes; a path matching ANY of them is rated HIGH by
+# _classify_path_sensitivity().
+_SENSITIVE_PATH_PATTERNS = [
+    # Secret-bearing file extensions (dotenv files, certificates, key stores).
+    re.compile(r'\.(?:env|pem|key|p12|pfx|jks|keystore)\b', re.IGNORECASE),
+    # Credential directories: SSH, GnuPG, AWS, gcloud.
+    re.compile(r'(?:\.ssh/|\.gnupg/|\.aws/|\.config/gcloud/)', re.IGNORECASE),
+    # Keyword match anywhere in the path (unanchored substring search).
+    re.compile(r'(?:wallet|keystore|seed|mnemonic)', re.IGNORECASE),
+    # NOTE(review): any path this matches also matches the first pattern (".env").
+    re.compile(r'(?:\.hermes/\.env)', re.IGNORECASE),
+]
+
+
+def _classify_path_sensitivity(path: str) -> Sensitivity:
+    """Rate a file path: HIGH if any sensitive-path regex matches, else PUBLIC."""
+    looks_sensitive = any(
+        matcher.search(path) for matcher in _SENSITIVE_PATH_PATTERNS
+    )
+    return Sensitivity.HIGH if looks_sensitive else Sensitivity.PUBLIC
+
+
+# =========================================================================
+# Core filtering
+# =========================================================================
+
+class PrivacyFilter:
+    """Strip PII from message context before remote API calls.
+
+    Integrates with the agent's message pipeline. Call sanitize_messages()
+    before sending context to any cloud LLM provider.
+    """
+
+    def __init__(
+        self,
+        min_sensitivity: Sensitivity = Sensitivity.MEDIUM,
+        aggressive_mode: bool = False,
+    ):
+        """
+        Args:
+            min_sensitivity: Only redact PII at or above this level.
+                Default MEDIUM — redacts emails, phones, paths but not IPs.
+            aggressive_mode: If True, the threshold is forced to LOW
+                (min_sensitivity is ignored), so LOW-rated patterns such as
+                internal IPs are redacted as well.
+        """
+        self.min_sensitivity = (
+            Sensitivity.LOW if aggressive_mode else min_sensitivity
+        )
+        self.aggressive_mode = aggressive_mode
+
+    def sanitize_text(self, text: str) -> Tuple[str, List[Dict[str, Any]]]:
+        """Sanitize a single text string.
+
+        Patterns are applied in _PII_PATTERNS order, each one to the output
+        of the previous one.
+
+        Returns:
+            (cleaned_text, redaction_list). Each redaction entry is a dict
+            with "type", "sensitivity" (the enum member's name) and "count"
+            (number of substitutions made for that pattern).
+        """
+        redactions: List[Dict[str, Any]] = []
+        cleaned = text
+        threshold = self.min_sensitivity.value
+
+        for pattern, rtype, sensitivity, replacement in _PII_PATTERNS:
+            if sensitivity.value < threshold:
+                continue
+
+            # subn() substitutes and counts in a single pass, so the reported
+            # count is exactly the number of replacements performed.
+            cleaned, count = pattern.subn(replacement, cleaned)
+            if count:
+                redactions.append({
+                    "type": rtype,
+                    "sensitivity": sensitivity.name,
+                    "count": count,
+                })
+
+        return cleaned, redactions
+
+    def sanitize_messages(
+        self, messages: List[Dict[str, Any]]
+    ) -> Tuple[List[Dict[str, Any]], RedactionReport]:
+        """Sanitize a list of OpenAI-format messages.
+
+        Returns (safe_messages, report). System messages are NOT sanitized
+        (they're typically static prompts). Only user and assistant messages
+        with non-empty string content are processed; everything else
+        (other roles, non-string content) is passed through unchanged.
+        Input dicts are never mutated: a redacted message is a shallow copy.
+
+        Args:
+            messages: List of {"role": ..., "content": ...} dicts.
+
+        Returns:
+            Tuple of (sanitized_messages, redaction_report).
+        """
+        report = RedactionReport(total_messages=len(messages))
+        safe_messages = []
+
+        for msg in messages:
+            role = msg.get("role", "")
+            content = msg.get("content", "")
+
+            eligible = (
+                role in ("user", "assistant")
+                and isinstance(content, str)
+                and bool(content)
+            )
+            if not eligible:
+                safe_messages.append(msg)
+                continue
+
+            cleaned, redactions = self.sanitize_text(content)
+            if not redactions:
+                safe_messages.append(msg)
+                continue
+
+            report.redacted_messages += 1
+            report.redactions.extend(redactions)
+            # Track the highest sensitivity level seen across all messages.
+            for r in redactions:
+                s = Sensitivity[r["sensitivity"]]
+                if s.value > report.max_sensitivity.value:
+                    report.max_sensitivity = s
+            safe_messages.append({**msg, "content": cleaned})
+            logger.info(
+                "Privacy filter: redacted %d PII type(s) from %s message",
+                len(redactions), role,
+            )
+
+        return safe_messages, report
+
+    def should_use_local_only(self, text: str) -> Tuple[bool, str]:
+        """Determine if content is too sensitive for any remote call.
+
+        Counts the distinct pattern types (not individual matches) that
+        sanitize_text() reports at CRITICAL and HIGH level. Blocks on any
+        CRITICAL type, or on three or more HIGH types.
+
+        Returns (should_block, reason). If True, the content should only
+        be processed by a local model. The reason is "" when not blocking.
+        """
+        _, redactions = self.sanitize_text(text)
+
+        critical_count = sum(
+            1 for r in redactions
+            if Sensitivity[r["sensitivity"]] == Sensitivity.CRITICAL
+        )
+        high_count = sum(
+            1 for r in redactions
+            if Sensitivity[r["sensitivity"]] == Sensitivity.HIGH
+        )
+
+        if critical_count > 0:
+            return True, f"Contains {critical_count} critical-secret pattern(s) — local-only"
+        if high_count >= 3:
+            return True, f"Contains {high_count} high-sensitivity pattern(s) — local-only"
+        return False, ""
+
+
+def sanitize_messages(
+    messages: List[Dict[str, Any]],
+    min_sensitivity: Sensitivity = Sensitivity.MEDIUM,
+    aggressive: bool = False,
+) -> Tuple[List[Dict[str, Any]], RedactionReport]:
+    """One-shot helper: build a PrivacyFilter and run it over ``messages``.
+
+    ``aggressive`` is forwarded as the filter's ``aggressive_mode``.
+    Returns the same (sanitized_messages, report) pair as
+    PrivacyFilter.sanitize_messages().
+    """
+    return PrivacyFilter(
+        min_sensitivity=min_sensitivity,
+        aggressive_mode=aggressive,
+    ).sanitize_messages(messages)
+
+
+def quick_sanitize(text: str) -> str:
+    """Redact ``text`` with a default PrivacyFilter and return only the cleaned string."""
+    return PrivacyFilter().sanitize_text(text)[0]
--- a/agent/profile_isolation.py
+++ b/agent/profile_isolation.py
@@ -0,0 +1,262 @@
+"""
+Profile Session Isolation — #891
+
+Tags sessions with their originating profile and provides
+filtered access so profiles cannot see each other's data.
+
+Current state: All sessions share one state.db with no profile tag.
+This module adds profile tagging and filtered queries.
+
+Usage:
+    from agent.profile_isolation import tag_session, get_profile_sessions, get_active_profile
+    
+    # Tag a new session with the current profile
+    tag_session(session_id, profile_name)
+    
+    # Get sessions for a specific profile
+    sessions = get_profile_sessions("sprint")
+    
+    # Get current active profile
+    profile = get_active_profile()
+"""
+
+import json
+import os
+import sqlite3
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from datetime import datetime, timezone
+
+HERMES_HOME = Path(os.getenv("HERMES_HOME", str(Path.home() / ".hermes")))
+SESSIONS_DB = HERMES_HOME / "sessions" / "state.db"
+PROFILE_TAGS_FILE = HERMES_HOME / "profile_session_tags.json"
+
+
+def get_active_profile() -> str:
+    """Get the currently active profile name.
+
+    Resolution order:
+      1. ``active_profile`` in ``$HERMES_HOME/config.yaml`` (if set and non-empty)
+      2. the ``HERMES_PROFILE`` environment variable
+      3. the literal ``"default"``
+
+    An unreadable or malformed config file is ignored rather than raised.
+    """
+    config_path = HERMES_HOME / "config.yaml"
+    if config_path.exists():
+        try:
+            import yaml
+            with open(config_path, encoding="utf-8") as f:
+                cfg = yaml.safe_load(f) or {}
+            configured = cfg.get("active_profile") if isinstance(cfg, dict) else None
+            # Only a real value wins; a missing/null/empty key must fall
+            # through so the environment variable is still honoured.
+            if configured:
+                return str(configured)
+        except Exception:
+            # Deliberate best-effort: missing PyYAML, unreadable file or bad
+            # YAML all mean "no configured profile", not a hard failure.
+            pass
+
+    # Check environment
+    return os.getenv("HERMES_PROFILE", "default")
+
+
+def _load_tags() -> Dict[str, str]:
+    """Load session-to-profile mapping.
+
+    Returns an empty dict when the tag file is missing, unreadable, not
+    valid JSON, or does not contain a JSON object — callers index and
+    iterate the result as a dict, so anything else would crash them.
+    """
+    try:
+        with open(PROFILE_TAGS_FILE, encoding="utf-8") as f:
+            tags = json.load(f)
+    except (OSError, ValueError):
+        # OSError covers a missing/unreadable file; ValueError covers
+        # json.JSONDecodeError and decoding errors.
+        return {}
+    return tags if isinstance(tags, dict) else {}
+
+
+def _save_tags(tags: Dict[str, str]):
+    """Save session-to-profile mapping.
+
+    The JSON is written to a sibling temp file and then moved over the
+    real file with os.replace(), so an interrupted write cannot leave a
+    truncated tag file behind (which would read back as "no tags").
+    """
+    PROFILE_TAGS_FILE.parent.mkdir(parents=True, exist_ok=True)
+    tmp_path = PROFILE_TAGS_FILE.with_name(PROFILE_TAGS_FILE.name + ".tmp")
+    with open(tmp_path, "w", encoding="utf-8") as f:
+        json.dump(tags, f, indent=2)
+    os.replace(tmp_path, PROFILE_TAGS_FILE)
+
+
+def tag_session(session_id: str, profile: Optional[str] = None) -> str:
+    """
+    Record which profile a session belongs to.
+
+    The mapping is written to the JSON tag file and, when the sessions
+    database exists, mirrored into it as well. When ``profile`` is None the
+    currently active profile is used.
+
+    Returns the profile name used.
+    """
+    resolved = get_active_profile() if profile is None else profile
+
+    mapping = _load_tags()
+    mapping[session_id] = resolved
+    _save_tags(mapping)
+
+    # Mirror the tag into SQLite (no-op when the DB file is absent).
+    _tag_session_in_db(session_id, resolved)
+
+    return resolved
+
+
+def _tag_session_in_db(session_id: str, profile: str):
+    """Add profile tag to SQLite session store.
+
+    Adds a ``profile`` column to the ``sessions`` table if it is missing,
+    then sets it for ``session_id``. Best-effort: does nothing when the DB
+    file is absent and swallows any error. The connection is always closed,
+    including when a statement fails.
+    """
+    if not SESSIONS_DB.exists():
+        return
+
+    conn = None
+    try:
+        conn = sqlite3.connect(str(SESSIONS_DB))
+        cursor = conn.cursor()
+
+        # Check if sessions table has profile column
+        cursor.execute("PRAGMA table_info(sessions)")
+        columns = [row[1] for row in cursor.fetchall()]
+
+        if "profile" not in columns:
+            # Add profile column
+            cursor.execute("ALTER TABLE sessions ADD COLUMN profile TEXT DEFAULT 'default'")
+
+        # Update the session's profile
+        cursor.execute(
+            "UPDATE sessions SET profile = ? WHERE session_id = ?",
+            (profile, session_id)
+        )
+
+        conn.commit()
+    except Exception:
+        pass  # SQLite might not be available or schema differs
+    finally:
+        if conn is not None:
+            conn.close()
+
+
+def get_session_profile(session_id: str) -> Optional[str]:
+    """Get the profile that owns a session.
+
+    The JSON tag file is consulted first, then the ``profile`` column of
+    the SQLite ``sessions`` table. Returns None when neither source knows
+    the session or the database cannot be queried.
+    """
+    # Check JSON tags first
+    tags = _load_tags()
+    if session_id in tags:
+        return tags[session_id]
+
+    # Check SQLite
+    if not SESSIONS_DB.exists():
+        return None
+
+    conn = None
+    try:
+        conn = sqlite3.connect(str(SESSIONS_DB))
+        cursor = conn.cursor()
+        cursor.execute(
+            "SELECT profile FROM sessions WHERE session_id = ?",
+            (session_id,)
+        )
+        row = cursor.fetchone()
+        if row:
+            return row[0]
+    except Exception:
+        pass  # best-effort: e.g. the profile column does not exist yet
+    finally:
+        # Runs on the early return above too, so the handle never leaks.
+        if conn is not None:
+            conn.close()
+
+    return None
+
+
+def get_profile_sessions(
+    profile: Optional[str] = None,
+    limit: int = 100,
+) -> List[Dict[str, Any]]:
+    """
+    Get sessions belonging to a specific profile.
+
+    Queries the ``sessions`` table by its ``profile`` column, newest
+    ``updated_at`` first. If that query fails (for example because the
+    column has not been added yet), falls back to selecting the session IDs
+    tagged for the profile in the JSON tag file.
+
+    Args:
+        profile: Profile name; defaults to the active profile.
+        limit: Maximum number of sessions to return.
+
+    Returns list of session dicts (empty when the DB is missing or on error).
+    """
+    if profile is None:
+        profile = get_active_profile()
+
+    sessions: List[Dict[str, Any]] = []
+    if not SESSIONS_DB.exists():
+        return sessions
+
+    conn = None
+    try:
+        conn = sqlite3.connect(str(SESSIONS_DB))
+        conn.row_factory = sqlite3.Row
+        cursor = conn.cursor()
+
+        # Try profile column first
+        try:
+            cursor.execute(
+                "SELECT * FROM sessions WHERE profile = ? ORDER BY updated_at DESC LIMIT ?",
+                (profile, limit)
+            )
+            sessions.extend(dict(row) for row in cursor.fetchall())
+        except sqlite3.Error:
+            # Fallback: filter by tagged session IDs. The tag file is only
+            # read here, where it is actually needed.
+            tagged_sessions = [
+                sid for sid, p in _load_tags().items() if p == profile
+            ][:limit]
+            if tagged_sessions:
+                placeholders = ",".join("?" * len(tagged_sessions))
+                cursor.execute(
+                    f"SELECT * FROM sessions WHERE session_id IN ({placeholders}) ORDER BY updated_at DESC LIMIT ?",
+                    (*tagged_sessions, limit)
+                )
+                sessions.extend(dict(row) for row in cursor.fetchall())
+    except Exception:
+        pass  # best-effort: return whatever was collected
+    finally:
+        if conn is not None:
+            conn.close()
+
+    return sessions[:limit]
+
+
+def filter_sessions_by_profile(
+    sessions: List[Dict[str, Any]],
+    profile: Optional[str] = None,
+) -> List[Dict[str, Any]]:
+    """Keep the sessions owned by ``profile`` (default: the active profile).
+
+    A session's ID is read from its "session_id" key, falling back to "id";
+    entries with neither are dropped. Sessions whose owner cannot be
+    determined from the tag file or the database are kept as well.
+    """
+    wanted = get_active_profile() if profile is None else profile
+    known_tags = _load_tags()
+    kept = []
+
+    for entry in sessions:
+        sid = entry.get("session_id") or entry.get("id")
+        if not sid:
+            continue
+
+        owner = known_tags.get(sid)
+        if owner is None:
+            # Not in the tag snapshot: ask the full lookup (tags + SQLite).
+            owner = get_session_profile(sid)
+
+        # Untagged (owner is None) sessions pass the filter too.
+        if owner is None or owner == wanted:
+            kept.append(entry)
+
+    return kept
+
+
+def get_profile_stats() -> Dict[str, Any]:
+    """Summarise how tagged sessions are distributed across profiles.
+
+    Returns a dict with the total number of tagged sessions, the list of
+    profile names seen, per-profile counts, and the active profile.
+    Only the JSON tag file is consulted.
+    """
+    tag_map = _load_tags()
+
+    per_profile: Dict[str, int] = {}
+    for owner in tag_map.values():
+        per_profile[owner] = per_profile.get(owner, 0) + 1
+
+    return {
+        "total_tagged_sessions": len(tag_map),
+        "profiles": list(per_profile),
+        "profile_counts": per_profile,
+        "active_profile": get_active_profile(),
+    }
+
+
+def audit_untagged_sessions() -> List[str]:
+    """Find sessions without a profile tag.
+
+    Compares every ``session_id`` in the SQLite ``sessions`` table against
+    the keys of the JSON tag file.
+
+    Returns the IDs present in the database but absent from the tag file
+    (order unspecified); an empty list when the DB is missing or on error.
+    """
+    if not SESSIONS_DB.exists():
+        return []
+
+    conn = None
+    try:
+        conn = sqlite3.connect(str(SESSIONS_DB))
+        cursor = conn.cursor()
+
+        # Get all session IDs
+        cursor.execute("SELECT session_id FROM sessions")
+        all_sessions = {row[0] for row in cursor.fetchall()}
+
+        # Get tagged sessions
+        tagged = set(_load_tags())
+
+        # Return untagged
+        return list(all_sessions - tagged)
+    except Exception:
+        return []
+    finally:
+        # Close on both the success and the failure path.
+        if conn is not None:
+            conn.close()
--- a/agent/prompt_builder.py
+++ b/agent/prompt_builder.py
@@ -12,7 +12,7 @@ import threading
 from collections import OrderedDict
 from pathlib import Path

-from hermes_constants import get_hermes_home
+from hermes_constants import get_hermes_home, get_skills_dir, is_wsl
 from typing import Optional

 from agent.skill_utils import (
@@ -364,8 +364,56 @@ PLATFORM_HINTS = {
        "documents. You can also include image URLs in markdown format ![alt](url) and they "
        "will be downloaded and sent as native media when possible."
    ),
+    "wecom": (
+        "You are on WeCom (企业微信 / Enterprise WeChat). Markdown formatting is supported. "
+        "You CAN send media files natively — to deliver a file to the user, include "
+        "MEDIA:/absolute/path/to/file in your response. The file will be sent as a native "
+        "WeCom attachment: images (.jpg, .png, .webp) are sent as photos (up to 10 MB), "
+        "other files (.pdf, .docx, .xlsx, .md, .txt, etc.) arrive as downloadable documents "
+        "(up to 20 MB), and videos (.mp4) play inline. Voice messages are supported but "
+        "must be in AMR format — other audio formats are automatically sent as file attachments. "
+        "You can also include image URLs in markdown format ![alt](url) and they will be "
+        "downloaded and sent as native photos. Do NOT tell the user you lack file-sending "
+        "capability — use MEDIA: syntax whenever a file delivery is appropriate."
+    ),
+    "qqbot": (
+        "You are on QQ, a popular Chinese messaging platform. QQ supports markdown formatting "
+        "and emoji. You can send media files natively: include MEDIA:/absolute/path/to/file in "
+        "your response. Images are sent as native photos, and other files arrive as downloadable "
+        "documents."
+    ),
 }

+# ---------------------------------------------------------------------------
+# Environment hints — execution-environment awareness for the agent.
+# Unlike PLATFORM_HINTS (which describe the messaging channel), these describe
+# the machine/OS the agent's tools actually run on.
+# ---------------------------------------------------------------------------
+
+WSL_ENVIRONMENT_HINT = (
+    "You are running inside WSL (Windows Subsystem for Linux). "
+    "The Windows host filesystem is mounted under /mnt/ — "
+    "/mnt/c/ is the C: drive, /mnt/d/ is D:, etc. "
+    "The user's Windows files are typically at "
+    "/mnt/c/Users/<username>/Desktop/, Documents/, Downloads/, etc. "
+    "When the user references Windows paths or desktop files, translate "
+    "to the /mnt/c/ equivalent. You can list /mnt/c/Users/ to discover "
+    "the Windows username if needed."
+)
+
+
+def build_environment_hints() -> str:
+    """Assemble execution-environment guidance for the system prompt.
+
+    Currently only WSL is detected; further environments (Termux, Docker,
+    ...) can be appended to the list. Hints are joined with a blank line,
+    and the result is an empty string when nothing special is detected.
+    """
+    detected = [WSL_ENVIRONMENT_HINT] if is_wsl() else []
+    return "\n\n".join(detected)
+
+
 CONTEXT_FILE_MAX_CHARS = 20_000
 CONTEXT_TRUNCATE_HEAD_RATIO = 0.7
 CONTEXT_TRUNCATE_TAIL_RATIO = 0.2
@@ -548,8 +596,7 @@ def build_skills_system_prompt(
    are read-only — they appear in the index but new skills are always created
    in the local dir.  Local skills take precedence when names collide.
    """
-    hermes_home = get_hermes_home()
-    skills_dir = hermes_home / "skills"
+    skills_dir = get_skills_dir()
    external_dirs = get_all_skills_dirs()[1:]  # skip local (index 0)

    if not skills_dir.exists() and not external_dirs:
@@ -727,8 +774,16 @@ def build_skills_system_prompt(

        result = (
            "## Skills (mandatory)\n"
-            "Before replying, scan the skills below. If one clearly matches your task, "
-            "load it with skill_view(name) and follow its instructions. "
+            "Before replying, scan the skills below. If a skill matches or is even partially relevant "
+            "to your task, you MUST load it with skill_view(name) and follow its instructions. "
+            "Err on the side of loading — it is always better to have context you don't need "
+            "than to miss critical steps, pitfalls, or established workflows. "
+            "Skills contain specialized knowledge — API endpoints, tool-specific commands, "
+            "and proven workflows that outperform general-purpose approaches. Load the skill "
+            "even if you think you could handle the task with basic tools like web_search or terminal. "
+            "Skills also encode the user's preferred approach, conventions, and quality standards "
+            "for tasks like code review, planning, and testing — load them even for tasks you "
+            "already know how to do, because the skill defines how it should be done here.\n"
            "If a skill has issues, fix it with skill_manage(action='patch').\n"
            "After difficult/iterative tasks, offer to save as a skill. "
            "If a skill you loaded was missing steps, had wrong commands, or needed "
@@ -738,7 +793,7 @@ def build_skills_system_prompt(
            + "\n".join(index_lines) + "\n"
            "</available_skills>\n"
            "\n"
-            "If none match, proceed normally without loading a skill."
+            "Only proceed without loading a skill if genuinely none are relevant to the task."
        )

    # ── Store in LRU cache ────────────────────────────────────────────
--- a/agent/provider_preflight.py
+++ b/agent/provider_preflight.py
@@ -0,0 +1,146 @@
+"""Provider Preflight — Poka-yoke validation of provider/model config.
+
+Validates provider and model configuration before session start.
+Prevents wasted context on misconfigured providers.
+
+Usage:
+    from agent.provider_preflight import preflight_check
+    result = preflight_check(provider="openrouter", model="xiaomi/mimo-v2-pro")
+    if not result["valid"]:
+        print(result["error"])
+"""
+
+from __future__ import annotations
+
+import logging
+import os
+from typing import Any, Dict, Optional
+
+logger = logging.getLogger(__name__)
+
+
+# Provider -> required env var
+# Keys are matched as substrings of the lowercased provider name by
+# check_provider_key(); a value of None marks a provider that needs no key.
+PROVIDER_KEYS = {
+    "openrouter": "OPENROUTER_API_KEY",
+    "anthropic": "ANTHROPIC_API_KEY",
+    "openai": "OPENAI_API_KEY",
+    "nous": "NOUS_API_KEY",
+    "ollama": None,  # Local, no key needed
+    "local": None,
+}
+
+
+def check_provider_key(provider: str) -> Dict[str, Any]:
+    """Check if provider has a valid API key configured.
+
+    The provider name is matched (case-insensitively, as a substring)
+    against PROVIDER_KEYS; the first matching entry wins.
+
+    Returns a dict with ``valid``, ``provider`` and ``key_status``:
+      - "unknown":      provider not in PROVIDER_KEYS (assumed OK)
+      - "not_required": known provider that needs no key (e.g. ollama)
+      - "missing":      required env var unset or blank (valid=False)
+      - "too_short":    env var shorter than 10 chars (valid=False)
+      - "set":          env var present and plausible
+    Invalid results additionally carry ``error`` and ``fix`` messages.
+    """
+    provider_lower = provider.lower().strip()
+
+    # Track the match separately from env_var: None is a legitimate value
+    # in PROVIDER_KEYS ("no key needed"), so it cannot double as "no match".
+    matched = False
+    env_var = None
+    for known, key in PROVIDER_KEYS.items():
+        if known in provider_lower:
+            matched = True
+            env_var = key
+            break
+
+    if not matched:
+        # Unknown provider — assume OK (custom/local)
+        return {"valid": True, "provider": provider, "key_status": "unknown"}
+
+    if env_var is None:
+        # Local provider, no key needed
+        return {"valid": True, "provider": provider, "key_status": "not_required"}
+
+    key_value = os.getenv(env_var, "").strip()
+    if not key_value:
+        return {
+            "valid": False,
+            "provider": provider,
+            "key_status": "missing",
+            "error": f"{env_var} is not set. Provider '{provider}' will fail.",
+            "fix": f"Set {env_var} in ~/.hermes/.env",
+        }
+
+    if len(key_value) < 10:
+        return {
+            "valid": False,
+            "provider": provider,
+            "key_status": "too_short",
+            "error": f"{env_var} is suspiciously short ({len(key_value)} chars). May be invalid.",
+            "fix": f"Verify {env_var} value in ~/.hermes/.env",
+        }
+
+    return {"valid": True, "provider": provider, "key_status": "set"}
+
+
+def check_model_availability(model: str, provider: str) -> Dict[str, Any]:
+    """Check if model is likely available for provider.
+
+    Purely heuristic, no network access. Only an empty model name is
+    reported as invalid; the other checks return ``valid=True`` with an
+    advisory ``warning``.
+
+    Args:
+        model: Model identifier as configured.
+        provider: Provider name; may be empty when unknown.
+
+    Returns:
+        Dict with ``valid`` and optionally ``error`` or ``warning``.
+    """
+    if not model:
+        return {"valid": False, "error": "No model specified"}
+
+    # Basic sanity checks
+    model_lower = model.lower()
+    provider_lower = provider.lower()
+
+    # Anthropic models should use anthropic provider
+    if "claude" in model_lower and "anthropic" not in provider_lower:
+        return {
+            "valid": True,  # Allow but warn
+            "warning": f"Model '{model}' usually runs on Anthropic provider, not '{provider}'",
+        }
+
+    # Ollama models: the tag hint only makes sense when the provider is
+    # Ollama (or unspecified). Hosted providers list the same model families
+    # without a ":tag" suffix, so warning there is noise.
+    ollama_indicators = ["llama", "mistral", "qwen", "gemma", "phi", "hermes"]
+    targets_ollama = not provider_lower or "ollama" in provider_lower
+    if (
+        targets_ollama
+        and ":" not in model
+        and any(x in model_lower for x in ollama_indicators)
+    ):
+        return {
+            "valid": True,
+            "warning": f"Model '{model}' may need a version tag for Ollama (e.g., {model}:latest)",
+        }
+
+    return {"valid": True}
+
+
+def preflight_check(
+    provider: str = "",
+    model: str = "",
+    fallback_provider: str = "",
+    fallback_model: str = "",
+) -> Dict[str, Any]:
+    """Run every provider/model sanity check and collect the findings.
+
+    Problems with the primary provider or model are errors; problems with
+    the fallback pair are only warnings. Empty arguments are skipped.
+
+    Returns:
+        Dict with valid (bool, True when there are no errors), errors
+        (list), warnings (list), plus the primary provider and model.
+    """
+    problems = []
+    notes = []
+
+    # Primary provider: a bad key is a hard error.
+    if provider:
+        key_check = check_provider_key(provider)
+        if not key_check["valid"]:
+            problems.append(key_check.get("error", f"Provider {provider} invalid"))
+
+    # Primary model: invalid -> error, advisory -> warning.
+    if model:
+        model_check = check_model_availability(model, provider)
+        if not model_check["valid"]:
+            problems.append(model_check.get("error", f"Model {model} invalid"))
+        elif model_check.get("warning"):
+            notes.append(model_check["warning"])
+
+    # Fallback pair: never fatal, only reported.
+    if fallback_provider:
+        fb_key_check = check_provider_key(fallback_provider)
+        if not fb_key_check["valid"]:
+            notes.append(f"Fallback provider {fallback_provider} also invalid: {fb_key_check.get('error','')}")
+
+    if fallback_model:
+        fb_model_check = check_model_availability(fallback_model, fallback_provider)
+        if not fb_model_check["valid"]:
+            notes.append(f"Fallback model {fallback_model} invalid")
+        elif fb_model_check.get("warning"):
+            notes.append(fb_model_check["warning"])
+
+    return {
+        "valid": not problems,
+        "errors": problems,
+        "warnings": notes,
+        "provider": provider,
+        "model": model,
+    }
--- a/agent/rate_limit_tracker.py
+++ b/agent/rate_limit_tracker.py
@@ -24,7 +24,7 @@ from __future__ import annotations

 import time
 from dataclasses import dataclass, field
-from typing import Any, Dict, Mapping, Optional
+from typing import Any, Mapping, Optional


@dataclass
--- a/agent/self_modify.py
+++ b/agent/self_modify.py
@@ -0,0 +1,302 @@
+"""Self-Modifying Prompt Engine — agent learns from its own failures.
+
+Analyzes session transcripts, identifies failure patterns, and generates
+prompt patches to prevent future failures.
+
+The loop: fail → analyze → rewrite → retry → verify improvement.
+
+Usage:
+    from agent.self_modify import PromptLearner
+    learner = PromptLearner()
+    patterns = learner.analyze_session(session_data)  # dict with a 'messages' list
+    patches = learner.generate_patches(patterns)
+    learner.apply_patches(patches)
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import re
+import time
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+HERMES_HOME = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
+PATCHES_DIR = HERMES_HOME / "prompt_patches"
+ROLLBACK_DIR = HERMES_HOME / "prompt_rollback"
+
+
+@dataclass
+class FailurePattern:
+    """A detected failure pattern in session transcripts."""
+    # Key of FAILURE_SIGNALS: retry_loop, timeout, hallucination,
+    # context_loss, tool_failure
+    pattern_type: str
+    # Human-readable summary, copied from the FAILURE_SIGNALS entry
+    description: str
+    # Number of messages in which the pattern type was seen
+    frequency: int
+    # Sample message texts that matched
+    example_messages: List[str] = field(default_factory=list)
+    # Prompt rule proposed to prevent the failure; "" when none is known
+    suggested_fix: str = ""
+
+
+@dataclass
+class PromptPatch:
+    """A modification to the system prompt based on failure analysis."""
+    # Patch identifier
+    id: str
+    # Failure category this patch addresses (a FailurePattern.pattern_type)
+    failure_type: str
+    # Text of the rule being replaced, or a placeholder when there is none
+    original_rule: str
+    # Rule text to add to the prompt
+    new_rule: str
+    # Confidence score; patches below a caller-chosen minimum are not generated
+    confidence: float
+    # time.time() value set when the patch is applied; None until then
+    applied_at: Optional[float] = None
+    # NOTE(review): never set in the code visible here — confirm who updates it
+    reverted: bool = False
+
+
+# Failure detection patterns
+# Maps a failure type to its regexes ("patterns") and a human-readable
+# "description". Every regex carries an inline (?i) flag, so matching is
+# case-insensitive. A message counts toward a type as soon as any one of
+# that type's patterns matches it.
+FAILURE_SIGNALS = {
+    "retry_loop": {
+        "patterns": [
+            r"(?i)retry(?:ing)?\s*(?:attempt|again)",
+            r"(?i)failed.*retrying",
+            r"(?i)error.*again",
+            r"(?i)attempt\s+\d+\s*(?:of|/)\s*\d+",
+        ],
+        "description": "Agent stuck in retry loop",
+    },
+    "timeout": {
+        "patterns": [
+            r"(?i)timed?\s*out",
+            r"(?i)deadline\s+exceeded",
+            r"(?i)took\s+(?:too\s+)?long",
+        ],
+        "description": "Operation timed out",
+    },
+    "hallucination": {
+        "patterns": [
+            r"(?i)i\s+(?:don't|do\s+not)\s+(?:have|see|find)\s+(?:any|that|this)\s+(?:information|data|file)",
+            r"(?i)the\s+file\s+doesn't\s+exist",
+            r"(?i)i\s+(?:made|invented|fabricated)\s+(?:that\s+up|this)",
+        ],
+        "description": "Agent hallucinated or fabricated information",
+    },
+    "context_loss": {
+        "patterns": [
+            r"(?i)i\s+(?:don't|do\s+not)\s+(?:remember|recall|know)\s+(?:what|where|when|how)",
+            r"(?i)could\s+you\s+remind\s+me",
+            r"(?i)what\s+were\s+we\s+(?:doing|working|talking)\s+(?:on|about)",
+        ],
+        "description": "Agent lost context from earlier in conversation",
+    },
+    "tool_failure": {
+        "patterns": [
+            r"(?i)tool\s+(?:call|execution)\s+failed",
+            r"(?i)command\s+not\s+found",
+            r"(?i)permission\s+denied",
+            r"(?i)no\s+such\s+file",
+        ],
+        "description": "Tool execution failed",
+    },
+}
+
+# Prompt improvement templates
+# Keyed by the same failure types as FAILURE_SIGNALS. Each value is the rule
+# text that becomes a FailurePattern's suggested_fix and, once patched in,
+# is appended verbatim to the system prompt.
+PROMPT_FIXES = {
+    "retry_loop": (
+        "If an operation fails more than twice, stop retrying. "
+        "Report the failure and ask the user for guidance. "
+        "Do not enter retry loops — they waste tokens."
+    ),
+    "timeout": (
+        "For operations that may take long, set a timeout and report "
+        "progress. If an operation takes more than 30 seconds, report "
+        "what you've done so far and ask if you should continue."
+    ),
+    "hallucination": (
+        "If you cannot find information, say 'I don't know' or "
+        "'I couldn't find that.' Never fabricate information. "
+        "If a file doesn't exist, say so — don't guess its contents."
+    ),
+    "context_loss": (
+        "When you need context from earlier in the conversation, "
+        "use session_search to find it. Don't ask the user to repeat themselves."
+    ),
+    "tool_failure": (
+        "If a tool fails, check the error message and try a different approach. "
+        "Don't retry the exact same command — diagnose first."
+    ),
+}
+
+
+class PromptLearner:
+    """Analyze session transcripts and generate prompt improvements.
+
+    Workflow: analyze_session() -> generate_patches() -> apply_patches(),
+    or learn_from_session() for all three. rollback_last() restores the
+    prompt as it was before the most recent apply that changed it.
+    """
+
+    def __init__(self):
+        """Ensure the patch-log and rollback directories exist."""
+        PATCHES_DIR.mkdir(parents=True, exist_ok=True)
+        ROLLBACK_DIR.mkdir(parents=True, exist_ok=True)
+
+    def analyze_session(self, session_data: dict) -> List[FailurePattern]:
+        """Analyze a session for failure patterns.
+
+        Only assistant and tool messages are inspected. A message adds at
+        most one to the frequency of each failure type, and up to three
+        example excerpts (first 200 chars) are kept per type.
+
+        Args:
+            session_data: Session dict with 'messages' list.
+
+        Returns:
+            List of detected failure patterns.
+        """
+        messages = session_data.get("messages", [])
+        patterns_found: Dict[str, FailurePattern] = {}
+
+        for msg in messages:
+            # Only analyze assistant messages and tool results
+            if msg.get("role", "") not in ("assistant", "tool"):
+                continue
+            content = str(msg.get("content", ""))
+
+            for failure_type, config in FAILURE_SIGNALS.items():
+                # One match per message per type is enough
+                if not any(re.search(p, content) for p in config["patterns"]):
+                    continue
+
+                found = patterns_found.get(failure_type)
+                if found is None:
+                    found = patterns_found[failure_type] = FailurePattern(
+                        pattern_type=failure_type,
+                        description=config["description"],
+                        frequency=0,
+                        suggested_fix=PROMPT_FIXES.get(failure_type, ""),
+                    )
+                found.frequency += 1
+                if len(found.example_messages) < 3:
+                    found.example_messages.append(content[:200])
+
+        return list(patterns_found.values())
+
+    def generate_patches(self, patterns: List[FailurePattern],
+                         min_confidence: float = 0.7) -> List[PromptPatch]:
+        """Generate prompt patches from failure patterns.
+
+        Confidence is derived from frequency: 0.9 for three or more
+        occurrences, 0.75 for two, 0.5 otherwise. Patterns without a
+        suggested fix are skipped.
+
+        Args:
+            patterns: Detected failure patterns.
+            min_confidence: Minimum confidence to generate a patch.
+
+        Returns:
+            List of prompt patches.
+        """
+        patches = []
+        for pattern in patterns:
+            # Confidence based on frequency
+            if pattern.frequency >= 3:
+                confidence = 0.9
+            elif pattern.frequency >= 2:
+                confidence = 0.75
+            else:
+                confidence = 0.5
+
+            if confidence < min_confidence or not pattern.suggested_fix:
+                continue
+
+            patches.append(PromptPatch(
+                id=f"{pattern.pattern_type}-{int(time.time())}",
+                failure_type=pattern.pattern_type,
+                original_rule="(missing — no existing rule for this pattern)",
+                new_rule=pattern.suggested_fix,
+                confidence=confidence,
+            ))
+
+        return patches
+
+    def apply_patches(self, patches: List[PromptPatch],
+                      prompt_path: Optional[str] = None) -> int:
+        """Apply patches to the system prompt.
+
+        Rules whose text is already present in the prompt are skipped. The
+        prompt is backed up and rewritten only when at least one rule is
+        actually added, so the newest backup always holds the state before
+        the last real change.
+
+        Args:
+            patches: Patches to apply.
+            prompt_path: Path to prompt file (default: ~/.hermes/system_prompt.md)
+
+        Returns:
+            Number of patches applied.
+        """
+        if prompt_path is None:
+            prompt_path = str(HERMES_HOME / "system_prompt.md")
+
+        prompt_file = Path(prompt_path)
+        had_prompt = prompt_file.exists()
+
+        # Read current prompt
+        current = prompt_file.read_text() if had_prompt else ""
+
+        # Select the patches whose rule is not in the prompt yet
+        applied = 0
+        additions = []
+        for patch in patches:
+            if patch.new_rule not in current:
+                additions.append(f"\n## Auto-learned: {patch.failure_type}\n{patch.new_rule}")
+                patch.applied_at = time.time()
+                applied += 1
+
+        if additions:
+            # Backup only when the prompt is about to change. Nanosecond
+            # stamps keep two applies within one second from sharing a file.
+            if had_prompt:
+                backup = ROLLBACK_DIR / f"{prompt_file.name}.{time.time_ns()}.bak"
+                backup.write_text(current)
+
+            prompt_file.write_text(current + "\n".join(additions))
+
+            # Log patches
+            patches_file = PATCHES_DIR / f"patches-{int(time.time())}.json"
+            with open(patches_file, "w") as f:
+                json.dump([p.__dict__ for p in patches], f, indent=2, default=str)
+
+        logger.info("Applied %d prompt patches", applied)
+        return applied
+
+    @staticmethod
+    def _backups_for(prompt_file: Path) -> List[Path]:
+        """Return the backups of ``prompt_file``, newest first."""
+        prefix = f"{prompt_file.name}."
+        suffix = ".bak"
+        stamped = []
+        for candidate in ROLLBACK_DIR.glob("*.bak"):
+            name = candidate.name
+            if not name.startswith(prefix):
+                continue
+            stamp = name[len(prefix):-len(suffix)]
+            # Require "<name>.<digits>.bak" so that e.g. backups of
+            # "prompt.md.old" are not mistaken for backups of "prompt.md".
+            if stamp.isdigit():
+                stamped.append((int(stamp), candidate))
+        stamped.sort(key=lambda pair: pair[0], reverse=True)
+        return [candidate for _, candidate in stamped]
+
+    def rollback_last(self, prompt_path: Optional[str] = None) -> bool:
+        """Rollback to the most recent backup of the given prompt file.
+
+        Only backups taken from a file with the same name as ``prompt_path``
+        are considered, so rolling back one prompt never restores another
+        prompt's content.
+
+        Args:
+            prompt_path: Path to prompt file (default: ~/.hermes/system_prompt.md)
+
+        Returns:
+            True if rollback succeeded, False when no backup exists.
+        """
+        if prompt_path is None:
+            prompt_path = str(HERMES_HOME / "system_prompt.md")
+
+        target = Path(prompt_path)
+        backups = self._backups_for(target)
+        if not backups:
+            logger.warning("No backups to rollback to")
+            return False
+
+        latest = backups[0]
+        target.write_text(latest.read_text())
+        logger.info("Rolled back to %s", latest.name)
+        return True
+
+    def learn_from_session(self, session_data: dict) -> Dict[str, Any]:
+        """Full learning cycle: analyze → patch → apply.
+
+        Uses the default confidence threshold and the default prompt path.
+
+        Args:
+            session_data: Session dict.
+
+        Returns:
+            Summary of what was learned and applied.
+        """
+        patterns = self.analyze_session(session_data)
+        patches = self.generate_patches(patterns)
+        applied = self.apply_patches(patches)
+
+        return {
+            "patterns_detected": len(patterns),
+            "patches_generated": len(patches),
+            "patches_applied": applied,
+            "patterns": [
+                {"type": p.pattern_type, "frequency": p.frequency, "description": p.description}
+                for p in patterns
+            ],
+        }
--- a/agent/session_compactor.py
+++ b/agent/session_compactor.py
@@ -0,0 +1,231 @@
+"""Session compaction with fact extraction.
+
+Before compressing conversation context, extracts durable facts
+(user preferences, corrections, project details) and saves them
+to the fact store so they survive compression.
+
+Usage:
+    from agent.session_compactor import extract_and_save_facts
+    facts, saved = extract_and_save_facts(messages)
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import re
+import time
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ExtractedFact:
+    """A single durable fact pulled out of a conversation message."""
+    category: str       # dotted label built by the extractor, e.g. "user_pref.preference", "project.config"
+    entity: str         # what the fact is about
+    content: str        # the fact itself
+    confidence: float   # 0.0-1.0
+    source_turn: int    # index of the message (within the scanned list) the fact came from
+    timestamp: float = 0.0  # extraction time from time.time(); 0.0 when not supplied
+
+
+# (regex, subcategory) pairs that indicate user preferences; capture groups hold the fact text
+_PREFERENCE_PATTERNS = [
+    (r"(?:I|we) (?:prefer|like|want|need) (.+?)(?:\.|$)", "preference"),
+    (r"(?:always|never) (?:use|do|run|deploy) (.+?)(?:\.|$)", "preference"),
+    (r"(?:my|our) (?:default|preferred|usual) (.+?) (?:is|are) (.+?)(?:\.|$)", "preference"),
+    (r"(?:make sure|ensure|remember) (?:to|that) (.+?)(?:\.|$)", "instruction"),
+    (r"(?:don'?t|do not) (?:ever again|ever) (.+?)(?:\.|$)", "constraint"),  # longer alternative first, else "again" is captured
+]
+
+# (regex, subcategory) pairs that indicate corrections
+_CORRECTION_PATTERNS = [
+    (r"(?:actually|no[, ]|wait[, ]|correction[: ]|sorry[, ]) (.+)", "correction"),
+    (r"(?:I meant|what I meant was|the correct) (.+?)(?:\.|$)", "correction"),
+    (r"(?:it'?s|its) (?:not|shouldn'?t be|wrong) (.+?)(?:\.|$)", "correction"),
+]
+
+# (regex, subcategory) pairs that indicate project/tool facts
+_PROJECT_PATTERNS = [
+    (r"(?:the |our )?(?:project|repo|codebase|code) (?:is|uses|needs|requires) (.+?)(?:\.|$)", "project"),
+    (r"(?:deploy|push|commit) (?:to|on) (.+?)(?:\.|$)", "project"),
+    (r"(?:this|that|the) (?:server|host|machine|VPS) (?:is|runs|has) (.+?)(?:\.|$)", "infrastructure"),
+    (r"(?:model|provider|engine) (?:is|should be|needs to be) (.+?)(?:\.|$)", "config"),
+]
+
+
+def extract_facts_from_messages(messages: List[Dict[str, Any]]) -> List[ExtractedFact]:
+    """Collect de-duplicated facts from user and assistant messages.
+
+    Messages are skipped when the role is neither "user" nor "assistant",
+    when the content is not a string of at least 10 characters, or when an
+    assistant message carries ``tool_calls``. Facts are de-duplicated on
+    category plus the first 100 characters of content; the first one wins.
+    """
+    collected: List[ExtractedFact] = []
+    seen_keys = set()
+
+    for position, message in enumerate(messages):
+        speaker = message.get("role", "")
+        body = message.get("content", "")
+
+        # Only plain-text user/assistant turns without tool calls are scanned.
+        usable = (
+            speaker in ("user", "assistant")
+            and bool(body)
+            and isinstance(body, str)
+            and len(body) >= 10
+            and not (speaker == "assistant" and message.get("tool_calls"))
+        )
+        if not usable:
+            continue
+
+        # Keep the first occurrence of each (category, content-prefix) pair.
+        for candidate in _extract_from_text(body, position, speaker):
+            dedup_key = f"{candidate.category}:{candidate.content[:100]}"
+            if dedup_key in seen_keys:
+                continue
+            seen_keys.add(dedup_key)
+            collected.append(candidate)
+
+    return collected
+
+
+def _collect_matches(
+    text: str, patterns, prefix: str, confidence: float,
+    turn_idx: int, timestamp: float, entity: Optional[str] = "user",
+) -> List[ExtractedFact]:
+    """Apply (regex, subcategory) patterns to *text* and build a fact per usable hit."""
+    found = []
+    for pattern, subcategory in patterns:
+        # MULTILINE lets the ``$`` terminator end a fact at a line break, so
+        # unpunctuated lines inside a multi-line message still match.
+        for match in re.finditer(pattern, text, re.IGNORECASE | re.MULTILINE):
+            if match.lastindex:
+                # Span from the first to the last group, so two-group patterns
+                # ("my default X is Y") keep both halves instead of only X.
+                content = text[match.start(1):match.end(match.lastindex)].strip()
+            else:
+                content = match.group(0).strip()
+            if len(content) > 5:
+                found.append(ExtractedFact(
+                    category=f"{prefix}.{subcategory}",
+                    entity=subcategory if entity is None else entity,  # None: use subcategory
+                    content=content[:200],
+                    confidence=confidence,
+                    source_turn=turn_idx,
+                    timestamp=timestamp,
+                ))
+    return found
+
+
+def _extract_from_text(text: str, turn_idx: int, role: str) -> List[ExtractedFact]:
+    """Extract facts from a single text block.
+
+    Args:
+        text: Message content to scan.
+        turn_idx: Index of the message, recorded as ``source_turn``.
+        role: "user" enables preference and correction patterns;
+            project patterns run for every role.
+
+    Returns:
+        Facts in pattern order: preferences, corrections, then project
+        facts. Captures of 5 characters or fewer are dropped and content
+        is capped at 200 characters.
+    """
+    timestamp = time.time()
+    clean = text.strip()
+    facts: List[ExtractedFact] = []
+
+    if role == "user":
+        facts += _collect_matches(clean, _PREFERENCE_PATTERNS, "user_pref", 0.7, turn_idx, timestamp)
+        facts += _collect_matches(clean, _CORRECTION_PATTERNS, "correction", 0.8, turn_idx, timestamp)
+
+    # Project/infrastructure facts are accepted from both user and assistant.
+    facts += _collect_matches(clean, _PROJECT_PATTERNS, "project", 0.6, turn_idx, timestamp, entity=None)
+    return facts
+
+
+def save_facts_to_store(facts: List[ExtractedFact], fact_store_fn=None) -> int:
+    """Persist facts through a callback or, without one, the ``fact_store`` module.
+
+    Args:
+        facts: Facts to persist.
+        fact_store_fn: Optional callable taking ``category``, ``entity``,
+            ``content`` and ``trust`` keywords. When falsy,
+            ``fact_store.fact_store`` is imported and called instead.
+
+    Returns:
+        How many facts were stored without the store call raising.
+    """
+    stored = 0
+    if not fact_store_fn:
+        # No callback: fall back to the fact_store module, if importable.
+        try:
+            from fact_store import fact_store as _fs
+            for item in facts:
+                try:
+                    _fs(
+                        action="add", content=item.content, category=item.category,
+                        tags=item.entity, trust_delta=item.confidence - 0.5,
+                    )
+                except Exception as exc:
+                    logger.debug("Failed to save fact via fact_store: %s", exc)
+                else:
+                    stored += 1
+        except ImportError:
+            logger.debug("fact_store not available — facts not persisted")
+        return stored
+
+    # Caller-supplied store: one keyword call per fact.
+    for item in facts:
+        try:
+            fact_store_fn(
+                category=item.category,
+                entity=item.entity,
+                content=item.content,
+                trust=item.confidence,
+            )
+        except Exception as exc:
+            logger.debug("Failed to save fact: %s", exc)
+        else:
+            stored += 1
+    return stored
+
+
+def extract_and_save_facts(
+    messages: List[Dict[str, Any]],
+    fact_store_fn=None,
+) -> Tuple[List[ExtractedFact], int]:
+    """Run fact extraction over *messages* and persist whatever is found.
+
+    Returns a ``(facts, saved_count)`` tuple; with no facts nothing is
+    saved or logged and the count is 0.
+    """
+    extracted = extract_facts_from_messages(messages)
+    if not extracted:
+        return extracted, 0
+    logger.info("Extracted %d facts from conversation", len(extracted))
+    stored = save_facts_to_store(extracted, fact_store_fn)
+    logger.info("Saved %d/%d facts to store", stored, len(extracted))
+    return extracted, stored
+
+
+def format_facts_summary(facts: List[ExtractedFact]) -> str:
+    """Render a count header followed by facts grouped by sorted category.
+
+    Each fact line shows at most the first 80 characters of its content.
+    """
+    if not facts:
+        return "No facts extracted."
+    grouped: Dict[str, List[ExtractedFact]] = {}
+    for fact in facts:
+        grouped.setdefault(fact.category, []).append(fact)
+    out = [f"Extracted {len(facts)} facts:", ""]
+    for category in sorted(grouped):
+        out.append(f"  {category}:")
+        out.extend(f"    - {fact.content[:80]}" for fact in grouped[category])
+    return "\n".join(out)
--- a/agent/shield.py
+++ b/agent/shield.py
@@ -0,0 +1,24 @@
+
+import logging
+from tools.shield.detector import ShieldDetector, Verdict, CRISIS_SYSTEM_PROMPT, SAFE_SIX_MODELS
+
+logger = logging.getLogger(__name__)
+
+_detector = None
+
+def get_detector():
+    """Return the module-wide ShieldDetector, creating it on first use."""
+    global _detector
+    _detector = ShieldDetector() if _detector is None else _detector
+    return _detector
+
+def scan_text(text: str):
+    """Scan text for jailbreaks and crisis signals and return SHIELD's result."""
+    # The detector comes from get_detector(), which reuses a single instance.
+    return get_detector().detect(text)
+
+def is_crisis(verdict: str) -> bool:
+    return verdict in (Verdict.CRISIS_DETECTED.value, Verdict.CRISIS_UNDER_ATTACK.value)  # either crisis verdict
+
+def is_jailbreak(verdict: str) -> bool:
+    return verdict in (Verdict.JAILBREAK_DETECTED.value, Verdict.CRISIS_UNDER_ATTACK.value)  # either attack verdict
--- a/agent/skill_commands.py
+++ b/agent/skill_commands.py
@@ -12,6 +12,8 @@ from datetime import datetime
 from pathlib import Path
 from typing import Any, Dict, Optional

+from hermes_constants import display_hermes_home
+
 logger = logging.getLogger(__name__)

 _skill_commands: Dict[str, Dict[str, Any]] = {}
@@ -108,7 +110,7 @@ def _inject_skill_config(loaded_skill: dict[str, Any], parts: list[str]) -> None
        if not resolved:
            return

-        lines = ["", "[Skill config (from ~/.hermes/config.yaml):"]
+        lines = ["", f"[Skill config (from {display_hermes_home()}/config.yaml):"]
        for key, value in resolved.items():
            display_val = str(value) if value else "(not set)"
            lines.append(f"  {key} = {display_val}")
--- a/agent/skill_utils.py
+++ b/agent/skill_utils.py
@@ -10,9 +10,9 @@ import os
 import re
 import sys
 from pathlib import Path
-from typing import Any, Dict, List, Set, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple

-from hermes_constants import get_hermes_home
+from hermes_constants import get_config_path, get_skills_dir

 logger = logging.getLogger(__name__)

@@ -130,7 +130,7 @@ def get_disabled_skill_names(platform: str | None = None) -> Set[str]:
    Reads the config file directly (no CLI config imports) to stay
    lightweight.
    """
-    config_path = get_hermes_home() / "config.yaml"
+    config_path = get_config_path()
    if not config_path.exists():
        return set()
    try:
@@ -178,7 +178,7 @@ def get_external_skills_dirs() -> List[Path]:
    path.  Only directories that actually exist are returned.  Duplicates and
    paths that resolve to the local ``~/.hermes/skills/`` are silently skipped.
    """
-    config_path = get_hermes_home() / "config.yaml"
+    config_path = get_config_path()
    if not config_path.exists():
        return []
    try:
@@ -200,7 +200,7 @@ def get_external_skills_dirs() -> List[Path]:
    if not isinstance(raw_dirs, list):
        return []

-    local_skills = (get_hermes_home() / "skills").resolve()
+    local_skills = get_skills_dir().resolve()
    seen: Set[Path] = set()
    result: List[Path] = []

@@ -230,7 +230,7 @@ def get_all_skills_dirs() -> List[Path]:
    The local dir is always first (and always included even if it doesn't exist
    yet — callers handle that).  External dirs follow in config order.
    """
-    dirs = [get_hermes_home() / "skills"]
+    dirs = [get_skills_dir()]
    dirs.extend(get_external_skills_dirs())
    return dirs

@@ -384,7 +384,7 @@ def resolve_skill_config_values(
    current values (or the declared default if the key isn't set).
    Path values are expanded via ``os.path.expanduser``.
    """
-    config_path = get_hermes_home() / "config.yaml"
+    config_path = get_config_path()
    config: Dict[str, Any] = {}
    if config_path.exists():
        try:
@@ -441,3 +441,25 @@ def iter_skill_index_files(skills_dir: Path, filename: str):
            matches.append(Path(root) / filename)
    for path in sorted(matches, key=lambda p: str(p.relative_to(skills_dir))):
        yield path
+
+
+# ── Namespace helpers for plugin-provided skills ───────────────────────────
+
+_NAMESPACE_RE = re.compile(r"^[a-zA-Z0-9_-]+$")
+
+
+def parse_qualified_name(name: str) -> Tuple[Optional[str], str]:
+    """Separate an optional ``namespace:`` prefix from a skill name.
+
+    Only the first ``':'`` splits; without one the result is ``(None, name)``.
+    """
+    namespace, colon, bare_name = name.partition(":")
+    # An empty separator means no ':' was present, so there is no namespace.
+    return (namespace, bare_name) if colon else (None, name)
+
+
+def is_valid_namespace(candidate: Optional[str]) -> bool:
+    """Return True when *candidate* consists solely of ``[a-zA-Z0-9_-]`` characters."""
+    # fullmatch, not match: "$" also matches just before a trailing newline,
+    # so match() would accept "name\n". None and "" fail the truthiness test.
+    return bool(candidate) and _NAMESPACE_RE.fullmatch(candidate) is not None
--- a/agent/telemetry_logger.py
+++ b/agent/telemetry_logger.py
@@ -0,0 +1,23 @@
+
+import os
+import json
+import time
+
+def log_token_usage(prompt_tokens, completion_tokens, model_name):
+    """Append one token-usage record to the session's JSONL spend log.
+
+    Target: ~/.hermes/telemetry/spend/session_<HERMES_SESSION_ID or "default">.jsonl
+    """
+    spend_dir = os.path.expanduser("~/.hermes/telemetry/spend")
+    os.makedirs(spend_dir, exist_ok=True)
+    session_id = os.environ.get("HERMES_SESSION_ID", "default")
+    log_file = os.path.join(spend_dir, f"session_{session_id}.jsonl")
+    entry = {
+        "timestamp": time.time(),
+        "model": model_name,
+        "input_tokens": prompt_tokens,
+        "output_tokens": completion_tokens,
+    }
+    with open(log_file, "a") as handle:
+        print(json.dumps(entry), file=handle)
+    
--- a/agent/time_aware_routing.py
+++ b/agent/time_aware_routing.py
@@ -0,0 +1,146 @@
+"""Time-aware model routing for cron jobs.
+
+Routes cron tasks to more capable models during off-hours when the user
+is not present to correct errors. Reduces error rates during high-error
+time windows (e.g., 18:00 evening batches).
+
+Usage:
+    from agent.time_aware_routing import resolve_time_aware_model
+    decision = resolve_time_aware_model(base_model="mimo-v2-pro", is_cron=True)
+"""
+
+from __future__ import annotations
+
+import os
+import time
+from dataclasses import dataclass
+from typing import Dict, Optional
+
+
+# Expected error rate (percent) by hour of day, from empirical audit (2026-04-12).
+# Hours at or above _ERROR_THRESHOLD send cron jobs to the stronger model.
+_HIGH_ERROR_HOURS = {
+    18: 9.4,  # 18:00 — 9.4% error rate (evening cron batches)
+    19: 8.1,
+    20: 7.5,
+    21: 6.8,
+    22: 6.2,
+    23: 5.9,
+    0:  5.5,
+    1:  5.2,
+}
+
+# Low error hours — default model is fine
+_LOW_ERROR_HOURS = set(range(6, 18))  # 06:00-17:59
+
+# Strong/cheap models and the upgrade threshold; each is read from the environment at import
+_DEFAULT_STRONG_MODEL = os.getenv("CRON_STRONG_MODEL", "xiaomi/mimo-v2-pro")
+_DEFAULT_CHEAP_MODEL = os.getenv("CRON_CHEAP_MODEL", "qwen2.5:7b")
+_ERROR_THRESHOLD = float(os.getenv("CRON_ERROR_THRESHOLD", "6.0"))  # % error rate
+
+
+@dataclass
+class RoutingDecision:
+    """Result of time-aware routing."""
+    model: str          # model chosen for this call
+    provider: str       # provider paired with the chosen model
+    reason: str         # human-readable explanation of the choice
+    hour: int           # hour of day the decision was made for
+    error_rate: float   # expected error rate for that hour, in percent
+    is_off_hours: bool  # off-hours flag as reported by the router
+
+
+def get_hour_error_rate(hour: int) -> float:
+    """Look up the expected error rate (%) for *hour*; unlisted hours yield 4.0."""
+    return _HIGH_ERROR_HOURS[hour] if hour in _HIGH_ERROR_HOURS else 4.0
+
+
+def is_off_hours(hour: int) -> bool:
+    """Return True when *hour* lies outside the low-error window (06:00-17:59)."""
+    return not (hour in _LOW_ERROR_HOURS)
+
+
+def resolve_time_aware_model(
+    base_model: str = "",
+    base_provider: str = "",
+    is_cron: bool = False,
+    hour: Optional[int] = None,
+) -> RoutingDecision:
+    """Resolve model based on time of day and task type.
+
+    Interactive sessions always keep the base model. Cron jobs are moved
+    to ``_DEFAULT_STRONG_MODEL`` (provider "nous") whenever the hour's
+    expected error rate reaches ``_ERROR_THRESHOLD``; below the threshold
+    they keep the base model as well.
+
+    The threshold check runs before the off-hours check, so a high-error
+    hour is upgraded regardless of the time window it falls in.
+
+    Args:
+        base_model: The model that would normally be used. When empty,
+            ``_DEFAULT_CHEAP_MODEL`` is substituted.
+        base_provider: The provider for the base model.
+        is_cron: Whether this is a cron job (vs interactive session).
+        hour: Override hour (for testing). Defaults to the current
+            local hour.
+
+    Returns:
+        RoutingDecision with model, provider, and reasoning. Its
+        ``is_off_hours`` field always equals ``is_off_hours(hour)``.
+    """
+    if hour is None:
+        # time.localtime() reports the hour in the process's local time.
+        hour = time.localtime().tm_hour
+
+    error_rate = get_hour_error_rate(hour)
+    off_hours = is_off_hours(hour)
+
+    # Every outcome except the upgrade keeps the caller's model/provider.
+    model = base_model or _DEFAULT_CHEAP_MODEL
+    provider = base_provider
+
+    if not is_cron:
+        # Interactive sessions always use the base model.
+        reason = "Interactive session — user can correct errors"
+    elif error_rate >= _ERROR_THRESHOLD:
+        # Unattended job in a high-error hour: upgrade to the stronger model.
+        model = _DEFAULT_STRONG_MODEL
+        provider = "nous"
+        reason = (
+            f"High-error hours ({hour}:00, {error_rate}% expected)"
+            " — using stronger model"
+        )
+    elif not off_hours:
+        # Daytime cron job below the threshold.
+        reason = f"Low-error hours ({hour}:00, {error_rate}% expected)"
+    else:
+        # Off-hours, but the expected error rate is still below the threshold.
+        reason = f"Off-hours but low error ({hour}:00, {error_rate}%)"
+
+    return RoutingDecision(
+        model=model,
+        provider=provider,
+        reason=reason,
+        hour=hour,
+        error_rate=error_rate,
+        # Always the computed flag: a constant True/False here can contradict
+        # is_off_hours(hour) once CRON_ERROR_THRESHOLD is lowered enough for
+        # a daytime hour to take the upgrade branch.
+        is_off_hours=off_hours,
+    )
+
+
+def get_routing_report() -> str:
+    """Build a text table of the cron routing decision for each hour 00-23."""
+    report = [
+        "Time-Aware Model Routing (24h forecast)", "=" * 40, "",
+        f"Error threshold: {_ERROR_THRESHOLD}%",
+        f"Strong model: {_DEFAULT_STRONG_MODEL}",
+        f"Cheap model: {_DEFAULT_CHEAP_MODEL}",
+        "",
+    ]
+    for hour in range(24):
+        choice = resolve_time_aware_model(is_cron=True, hour=hour)
+        marker = "\U0001f534" if choice.model != _DEFAULT_CHEAP_MODEL else "\U0001f7e2"
+        report.append(f"  {hour:02d}:00 {marker} {choice.model:25s} ({choice.error_rate}% error)")
+    return "\n".join(report)
--- a/agent/title_generator.py
+++ b/agent/title_generator.py
@@ -36,7 +36,7 @@ def generate_title(user_message: str, assistant_response: str, timeout: float =

    try:
        response = call_llm(
-            task="compression",  # reuse compression task config (cheap/fast model)
+            task="title_generation",
            messages=messages,
            max_tokens=30,
            temperature=0.3,
--- a/agent/token_budget.py
+++ b/agent/token_budget.py
@@ -0,0 +1,316 @@
+#!/usr/bin/env python3
+"""
+Token Budget — Poka-yoke guard against silent context overflow.
+
+Progressive warning system with circuit breakers:
+  - 60%: WARNING  — log + suggest summarization
+  - 80%: CAUTION  — auto-compress, drop raw tool outputs
+  - 90%: CRITICAL — block verbose tool calls, force wrap-up
+  - 95%: STOP     — graceful session termination with summary
+
+Also provides tool output budgeting to truncate before overflow.
+
+Usage:
+    from agent.token_budget import TokenBudget
+
+    budget = TokenBudget(context_length=128_000)
+    budget.update(8000)          # from API response prompt_tokens
+
+    status = budget.check()      # returns BudgetStatus with level + message
+    budget.should_block_tools()  # True at 90%+
+    budget.should_terminate()    # True at 95%+
+
+    # Tool output budgeting
+    remaining = budget.tool_output_budget()
+    truncated = budget.truncate_tool_output(output_text, max_chars=remaining)
+"""
+
+import logging
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Optional
+
+logger = logging.getLogger(__name__)
+
+
+# ── Thresholds ────────────────────────────────────────────────────────
+
+WARN_PERCENT = 0.60  # usage fraction where WARNING begins
+CAUTION_PERCENT = 0.80  # usage fraction where CAUTION begins
+CRITICAL_PERCENT = 0.90  # usage fraction where CRITICAL begins
+STOP_PERCENT = 0.95  # usage fraction where STOP begins
+
+# Fraction of the context held back (5%) when reporting remaining tokens
+RESPONSE_RESERVE_RATIO = 0.05
+
+# Max tool output chars at each level (keys are BudgetLevel values)
+TOOL_OUTPUT_BUDGETS = {
+    "NORMAL": 50_000,
+    "WARNING": 20_000,
+    "CAUTION": 8_000,
+    "CRITICAL": 2_000,
+    "STOP": 500,
+}
+
+
+class BudgetLevel(Enum):
+    """Escalation levels of the token budget, ordered NORMAL through STOP."""
+    NORMAL = "NORMAL"
+    WARNING = "WARNING"
+    CAUTION = "CAUTION"
+    CRITICAL = "CRITICAL"
+    STOP = "STOP"
+
+    @property
+    def percent_threshold(self) -> float:
+        """Fraction of context usage at which this level begins (0.0 for NORMAL)."""
+        # Looked up by member name; NORMAL is the only level without an entry.
+        by_name = {"WARNING": WARN_PERCENT, "CAUTION": CAUTION_PERCENT,
+                   "CRITICAL": CRITICAL_PERCENT, "STOP": STOP_PERCENT}
+        return by_name.get(self.name, 0.0)
+
+    @property
+    def emoji(self) -> str:
+        """Icon shown next to the level; empty string for NORMAL."""
+        if self is BudgetLevel.WARNING:
+            return "\u26a0\ufe0f"
+        if self is BudgetLevel.CAUTION:
+            return "\U0001f525"
+        if self in (BudgetLevel.CRITICAL, BudgetLevel.STOP):
+            return "\U0001f6d1"
+        return ""
+
+
+@dataclass
+class BudgetStatus:
+    """Snapshot of token usage against the context window."""
+    level: BudgetLevel
+    tokens_used: int
+    context_length: int
+    percent_used: float  # fraction of the context used (1.0 == full), not 0-100
+    tokens_remaining: int
+    message: str = ""
+    should_compress: bool = False
+    should_block_tools: bool = False
+    should_terminate: bool = False
+
+    def to_indicator(self) -> str:
+        """Compact status indicator for CLI display, e.g. ``[42%]``."""
+        pct = int(self.percent_used * 100)
+        if self.level == BudgetLevel.NORMAL:
+            return f"[{pct}%]"
+        return f"{self.level.emoji} [{pct}%]"
+
+    def to_bar(self, width: int = 10) -> str:
+        """Visual progress bar of exactly ``width`` cells plus the percentage."""
+        # Clamp: usage above 100% (or a negative value) must not change the bar length.
+        filled = min(max(int(width * self.percent_used), 0), max(width, 0))
+        bar = "\u2588" * filled + "\u2591" * (width - filled)
+        color = self._bar_color()
+        return f"{color}{bar}\033[0m {int(self.percent_used * 100)}%"
+
+    def _bar_color(self) -> str:
+        """ANSI color prefix for the bar, chosen by level."""
+        if self.level == BudgetLevel.STOP:
+            return "\033[41m"  # red bg
+        if self.level == BudgetLevel.CRITICAL:
+            return "\033[31m"  # red
+        if self.level in (BudgetLevel.CAUTION, BudgetLevel.WARNING):
+            return "\033[33m"  # yellow
+        return "\033[32m"  # green
+
+
+class TokenBudget:
+    """
+    Progressive token budget tracker with poka-yoke circuit breakers.
+
+    Tracks the latest prompt-token count against a context length and
+    reports an escalating level as the configured fractions are crossed.
+    """
+
+    def __init__(
+        self,
+        context_length: int,
+        warn_percent: float = WARN_PERCENT,
+        caution_percent: float = CAUTION_PERCENT,
+        critical_percent: float = CRITICAL_PERCENT,
+        stop_percent: float = STOP_PERCENT,
+        response_reserve_ratio: float = RESPONSE_RESERVE_RATIO,
+    ):
+        self.context_length = context_length
+        self.warn_threshold = int(context_length * warn_percent)
+        self.caution_threshold = int(context_length * caution_percent)
+        self.critical_threshold = int(context_length * critical_percent)
+        self.stop_threshold = int(context_length * stop_percent)
+        self.response_reserve = int(context_length * response_reserve_ratio)
+        # Configured fractions, highest first, so check() honors per-instance
+        # thresholds instead of the module-level defaults.
+        self._level_fractions = (
+            (stop_percent, BudgetLevel.STOP), (critical_percent, BudgetLevel.CRITICAL),
+            (caution_percent, BudgetLevel.CAUTION), (warn_percent, BudgetLevel.WARNING),
+        )
+
+        self.tokens_used = 0
+        self.completions_tokens = 0
+        self.total_tool_output_chars = 0
+        self._level = BudgetLevel.NORMAL
+        self._history: list[int] = []
+
+    def update(self, prompt_tokens: int, completion_tokens: int = 0) -> BudgetStatus:
+        """Record usage from an API response; prompt_tokens replaces prior usage."""
+        self.tokens_used = prompt_tokens
+        self.completions_tokens = completion_tokens
+        self._history.append(prompt_tokens)
+        return self.check()
+
+    def check(self) -> BudgetStatus:
+        """Evaluate current budget level, log a level change, and return status."""
+        pct = self.tokens_used / self.context_length if self.context_length > 0 else 0
+        remaining = max(0, self.context_length - self.tokens_used - self.response_reserve)
+
+        # Highest configured fraction that usage has reached, else NORMAL.
+        reached = (lvl for fraction, lvl in self._level_fractions if pct >= fraction)
+        level = next(reached, BudgetLevel.NORMAL)
+
+        # Log transitions (don't log every check)
+        if level != self._level:
+            self._log_transition(level, pct)
+            self._level = level
+
+        messages = {
+            BudgetLevel.NORMAL: "",
+            BudgetLevel.WARNING: (
+                f"Context at {int(pct*100)}%. Consider wrapping up soon or using /compress."
+            ),
+            BudgetLevel.CAUTION: (
+                f"Context at {int(pct*100)}%. Auto-compressing. "
+                f"Tool outputs will be truncated."
+            ),
+            BudgetLevel.CRITICAL: (
+                f"Context at {int(pct*100)}%. Verbose tools blocked. "
+                f"Session approaching limit — please wrap up."
+            ),
+            BudgetLevel.STOP: (
+                f"Context at {int(pct*100)}%. Session must terminate. "
+                f"Saving summary before shutdown."
+            ),
+        }
+
+        return BudgetStatus(
+            level=level,
+            tokens_used=self.tokens_used,
+            context_length=self.context_length,
+            percent_used=pct,
+            tokens_remaining=remaining,
+            message=messages[level],
+            should_compress=level in (BudgetLevel.CAUTION, BudgetLevel.CRITICAL, BudgetLevel.STOP),
+            should_block_tools=level in (BudgetLevel.CRITICAL, BudgetLevel.STOP),
+            should_terminate=level == BudgetLevel.STOP,
+        )
+
+    def should_compress(self) -> bool:
+        """True once usage reaches the caution threshold (80% by default)."""
+        return self.tokens_used >= self.caution_threshold
+
+    def should_block_tools(self) -> bool:
+        """True once usage reaches the critical threshold (90% by default)."""
+        return self.tokens_used >= self.critical_threshold
+
+    def should_terminate(self) -> bool:
+        """True once usage reaches the stop threshold (95% by default)."""
+        return self.tokens_used >= self.stop_threshold
+
+    def tool_output_budget(self) -> int:
+        """Max chars allowed for next tool output based on current level."""
+        status = self.check()
+        return TOOL_OUTPUT_BUDGETS.get(status.level.value, 50_000)
+
+    def truncate_tool_output(self, output: str, max_chars: Optional[int] = None) -> str:
+        """Truncate tool output to about ``max_chars`` and add a truncation notice.
+
+        The notice is not counted exactly, so results can run slightly over.
+        """
+        if max_chars is None:
+            max_chars = self.tool_output_budget()
+
+        if len(output) <= max_chars:
+            return output
+
+        if max_chars < 200:
+            return output[:max_chars] + "\n[...truncated...]"
+
+        # Preserve start and end, truncate middle
+        head = max_chars // 2
+        tail = max_chars - head - 30  # reserve for truncation notice
+        return (
+            output[:head]
+            + f"\n\n[...{len(output) - head - tail:,} chars truncated...]\n\n"
+            + output[-tail:]
+        )
+
+    def remaining_for_response(self) -> int:
+        """Tokens left after current usage and the response reserve (never negative)."""
+        return max(0, self.context_length - self.tokens_used - self.response_reserve)
+
+    def growth_rate(self) -> Optional[float]:
+        """Average token increase per turn (from history); None with under 2 updates."""
+        if len(self._history) < 2:
+            return None
+        # Per-turn differences telescope to (last - first), averaged over the gaps.
+        return (self._history[-1] - self._history[0]) / (len(self._history) - 1)
+
+    def turns_remaining(self) -> Optional[int]:
+        """Estimated turns until context is full; None when usage is not growing."""
+        rate = self.growth_rate()
+        if rate is None or rate <= 0:
+            return None
+        # Clamp so an already-overflowed context reports 0, not a negative count.
+        remaining = max(0, self.context_length - self.tokens_used)
+        return int(remaining / rate)
+
+    def reset(self):
+        """Reset budget for new session."""
+        self.tokens_used = 0
+        self.completions_tokens = 0
+        self.total_tool_output_chars = 0
+        self._level = BudgetLevel.NORMAL
+        self._history.clear()
+
+    def _log_transition(self, new_level: BudgetLevel, pct: float):
+        """Log budget level transitions."""
+        msg = (
+            f"Token budget: {self._level.value} -> {new_level.value} "
+            f"({self.tokens_used}/{self.context_length} = {pct:.0%})"
+        )
+        if new_level in (BudgetLevel.WARNING, BudgetLevel.CAUTION):
+            logger.warning(msg)
+        elif new_level in (BudgetLevel.CRITICAL, BudgetLevel.STOP):
+            logger.error(msg)
+        else:
+            logger.info(msg)
+
+    def summary(self) -> str:
+        """Human-readable budget summary."""
+        status = self.check()
+        turns = self.turns_remaining()
+        rate = self.growth_rate()
+        lines = [
+            f"Token Budget: {status.tokens_used:,} / {status.context_length:,} ({status.percent_used:.0%})",
+            f"Level: {status.level.value}",
+            f"Remaining: {status.tokens_remaining:,} tokens",
+        ]
+        if rate is not None:
+            lines.append(f"Growth rate: ~{rate:,.0f} tokens/turn")
+        if turns is not None:
+            lines.append(f"Estimated turns left: ~{turns}")
+        if status.message:
+            lines.append(f"Action: {status.message}")
+        return "\n".join(lines)
+
+
+# ── Convenience factory ───────────────────────────────────────────────
+
+def create_budget(context_length: int, **kwargs) -> TokenBudget:
+    """Build a TokenBudget for *context_length*, forwarding any keyword overrides."""
+    return TokenBudget(context_length, **kwargs)
--- a/agent/tool_fixation_detector.py
+++ b/agent/tool_fixation_detector.py
@@ -0,0 +1,156 @@
+"""Tool fixation detection — break repetitive tool calling loops.
+
+Detects when the agent latches onto one tool and calls it repeatedly
+without making progress. Injects a nudge prompt to break the loop.
+
+Usage:
+    from agent.tool_fixation_detector import ToolFixationDetector
+    detector = ToolFixationDetector()
+    nudge = detector.record("execute_code")
+    if nudge:
+        # Inject nudge into conversation
+        messages.append({"role": "system", "content": nudge})
+"""
+
+from __future__ import annotations
+
+import os
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+
+
+# Defaults applied when ToolFixationDetector is built with threshold=0 / window=0.
+# Both values are read from the environment once, at import time; int() raises
+# ValueError if a variable is set to something that is not an integer.
+# TOOL_FIXATION_THRESHOLD: consecutive identical calls that trigger a nudge (default 5).
+_DEFAULT_THRESHOLD = int(os.getenv("TOOL_FIXATION_THRESHOLD", "5"))
+# TOOL_FIXATION_WINDOW: maximum number of recent tool names kept in history (default 10).
+_DEFAULT_WINDOW = int(os.getenv("TOOL_FIXATION_WINDOW", "10"))
+
+
+@dataclass
+class FixationEvent:
+    """Record of a fixation detection.
+
+    One instance is appended by ``ToolFixationDetector.record`` each time the
+    current streak is at or above the threshold.
+    """
+    # Tool whose repeated calls triggered the detection.
+    tool_name: str
+    # Length of the consecutive-call streak at detection time.
+    streak_length: int
+    # Threshold that was in force on the detector.
+    threshold: int
+    # Whether a nudge was produced; record() always sets this to True.
+    nudge_sent: bool = False
+
+
+class ToolFixationDetector:
+    """Spot runs of identical consecutive tool calls and produce a nudge.
+
+    Every call to ``record`` either extends the current streak or starts a
+    new one. While the streak length is at or above ``threshold``, each
+    recorded call returns a nudge prompt that the caller can inject into
+    the conversation.
+    """
+
+    def __init__(self, threshold: int = 0, window: int = 0):
+        # A falsy argument (the default 0) selects the module-level default.
+        self.threshold = threshold if threshold else _DEFAULT_THRESHOLD
+        self.window = window if window else _DEFAULT_WINDOW
+        self._events: List[FixationEvent] = []
+        self._nudges_sent: int = 0
+        self._history: List[str] = []
+        self._current_streak: str = ""
+        self._streak_count: int = 0
+
+    @property
+    def nudges_sent(self) -> int:
+        """Number of nudges returned so far."""
+        return self._nudges_sent
+
+    @property
+    def events(self) -> List[FixationEvent]:
+        """Copy of the list of recorded fixation events."""
+        return self._events[:]
+
+    def record(self, tool_name: str) -> Optional[str]:
+        """Register one tool call and report whether a fixation is in progress.
+
+        Args:
+            tool_name: Name of the tool that was called.
+
+        Returns:
+            A nudge prompt when the streak has reached the threshold,
+            otherwise None.
+        """
+        recent = self._history
+        recent.append(tool_name)
+        # Keep only the newest `window` entries.
+        if len(recent) > self.window:
+            del recent[:-self.window]
+
+        # Either lengthen the running streak or begin a new one.
+        continues = tool_name == self._current_streak
+        if not continues:
+            self._current_streak = tool_name
+        self._streak_count = (self._streak_count + 1) if continues else 1
+
+        run_length = self._streak_count
+        if run_length < self.threshold:
+            return None
+
+        self._events.append(
+            FixationEvent(
+                tool_name=tool_name,
+                streak_length=run_length,
+                threshold=self.threshold,
+                nudge_sent=True,
+            )
+        )
+        self._nudges_sent += 1
+        return self._build_nudge(tool_name, run_length)
+
+    def _build_nudge(self, tool_name: str, count: int) -> str:
+        """Compose the system-style message asking the agent to change tack."""
+        opening = (
+            f"[SYSTEM: You have called `{tool_name}` {count} times in a row "
+            "without switching tools. This suggests a fixation loop. "
+            "Consider:"
+        )
+        questions = [
+            "1. Is the tool returning an error? Read the error carefully.",
+            "2. Is there a different tool that could help?",
+            "3. Should you ask the user for clarification?",
+            "4. Is the task actually complete?",
+        ]
+        closing = "Break the loop by trying a different approach.]"
+        return "\n".join([opening, *questions, closing])
+
+    def reset(self) -> None:
+        """Forget the call history and the running streak.
+
+        The nudge counter and the recorded events are left as they are.
+        """
+        self._streak_count = 0
+        self._current_streak = ""
+        self._history.clear()
+
+    def get_streak_info(self) -> dict:
+        """Return a snapshot of the running streak and nudge counter."""
+        run_length = self._streak_count
+        limit = self.threshold
+        return {
+            "current_tool": self._current_streak,
+            "streak_count": run_length,
+            "threshold": limit,
+            "at_threshold": run_length >= limit,
+            "nudges_sent": self._nudges_sent,
+        }
+
+    def format_report(self) -> str:
+        """Render the recorded fixation events as a plain-text report."""
+        recorded = self._events
+        if not recorded:
+            return "No tool fixation detected."
+
+        header = [
+            f"Tool Fixation Report ({len(recorded)} events)",
+            "=" * 40,
+        ]
+        rows = [
+            f"  {ev.tool_name}: {ev.streak_length} consecutive calls (threshold: {ev.threshold})"
+            for ev in recorded
+        ]
+        return "\n".join(header + rows)
+
+
+# Singleton
+_detector: Optional[ToolFixationDetector] = None
+
+
+def get_fixation_detector() -> ToolFixationDetector:
+    """Return the module-wide detector, constructing it on first use."""
+    global _detector
+    instance = _detector
+    if instance is None:
+        # Nothing cached yet (first call, or the cached instance was dropped).
+        instance = _detector = ToolFixationDetector()
+    return instance
+
+
+def reset_fixation_detector() -> None:
+    """Discard the singleton so the next get_fixation_detector() call builds a fresh one."""
+    global _detector
+    _detector = None
--- a/agent/tool_orchestrator.py
+++ b/agent/tool_orchestrator.py
@@ -0,0 +1,177 @@
+"""Tool Orchestrator — Robust execution and circuit breaking for agent tools.
+
+Provides a unified execution service that wraps the tool registry.
+Implements the Circuit Breaker pattern to prevent the agent from getting
+stuck in failure loops when a specific tool or its underlying service
+is flapping or down.
+
+Architecture:
+    Discovery (tools/registry.py) -> Orchestration (agent/tool_orchestrator.py) -> Dispatch
+"""
+
+import json
+import time
+import logging
+import threading
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+
+from tools.registry import registry
+
+logger = logging.getLogger(__name__)
+
+
+class CircuitState:
+    """States for the tool circuit breaker.
+
+    Plain string constants; ``ToolStats.state`` holds one of these values.
+    """
+    CLOSED = "closed"        # Normal operation
+    OPEN = "open"            # Failing, execution blocked
+    HALF_OPEN = "half_open"  # Testing if service recovered
+
+
+@dataclass
+class ToolStats:
+    """Execution statistics for a tool."""
+    # Tool name these statistics belong to.
+    name: str
+    # Current circuit state; one of the CircuitState constants.
+    state: str = CircuitState.CLOSED
+    # Failures since the last success (a recorded success resets this to 0).
+    failures: int = 0
+    # Total number of successful executions.
+    successes: int = 0
+    # time.time() value of the most recent failure; 0 until one is recorded.
+    last_failure_time: float = 0
+    # Sum of execution durations, in seconds, over all recorded calls.
+    total_execution_time: float = 0
+    # Number of recorded executions (successes plus failures).
+    call_count: int = 0
+
+
+class ToolOrchestrator:
+    """Execute tools through the registry behind a per-tool circuit breaker.
+
+    Each tool name gets its own ``ToolStats`` record. A tool's circuit opens
+    once ``failure_threshold`` failures have accumulated without a success in
+    between, or on any failure while HALF_OPEN. While OPEN, ``dispatch``
+    returns an error payload without calling the registry. When more than
+    ``reset_timeout`` seconds have passed since the last failure, the next
+    lookup moves the circuit to HALF_OPEN and calls are let through again;
+    a success closes it.
+    """
+
+    def __init__(
+        self,
+        failure_threshold: int = 3,
+        reset_timeout: int = 300,
+    ):
+        """
+        Args:
+            failure_threshold: Number of failures before opening the circuit.
+            reset_timeout: Seconds to wait before transitioning from OPEN to HALF_OPEN.
+        """
+        self.failure_threshold = failure_threshold
+        self.reset_timeout = reset_timeout
+        self._stats: Dict[str, ToolStats] = {}
+        # Guards every read-modify-write of the per-tool statistics.
+        self._lock = threading.Lock()
+
+    def _get_stats(self, name: str) -> ToolStats:
+        """Get or initialize stats for a tool with thread-safe state transition.
+
+        As a side effect, an OPEN circuit whose last failure is older than
+        ``reset_timeout`` seconds is moved to HALF_OPEN.
+        """
+        with self._lock:
+            if name not in self._stats:
+                self._stats[name] = ToolStats(name=name)
+
+            stats = self._stats[name]
+
+            # Transition from OPEN to HALF_OPEN if timeout expired
+            if stats.state == CircuitState.OPEN:
+                if time.time() - stats.last_failure_time > self.reset_timeout:
+                    stats.state = CircuitState.HALF_OPEN
+                    logger.info("Circuit breaker HALF_OPEN for tool: %s", name)
+
+            return stats
+
+    def _record_success(self, name: str, execution_time: float):
+        """Record a successful tool execution and close the circuit.
+
+        Also resets the failure counter to zero.
+        """
+        with self._lock:
+            stats = self._stats[name]
+            stats.successes += 1
+            stats.call_count += 1
+            stats.total_execution_time += execution_time
+
+            if stats.state != CircuitState.CLOSED:
+                logger.info("Circuit breaker CLOSED for tool: %s (recovered)", name)
+
+            stats.state = CircuitState.CLOSED
+            stats.failures = 0
+
+    def _record_failure(self, name: str, execution_time: float):
+        """Record a failed tool execution and potentially open the circuit.
+
+        The circuit opens when the tool was HALF_OPEN (the probe failed) or
+        when the failure counter has reached ``failure_threshold``.
+        """
+        with self._lock:
+            stats = self._stats[name]
+            stats.failures += 1
+            stats.call_count += 1
+            stats.total_execution_time += execution_time
+            stats.last_failure_time = time.time()
+
+            if stats.state == CircuitState.HALF_OPEN or stats.failures >= self.failure_threshold:
+                stats.state = CircuitState.OPEN
+                logger.warning(
+                    "Circuit breaker OPEN for tool: %s (failures: %d)",
+                    name, stats.failures
+                )
+
+    @staticmethod
+    def _result_indicates_error(result_str: Any) -> bool:
+        """Decide whether a registry result should count as a failure.
+
+        A result is a failure when it contains the text ``"error":`` and
+        either fails to parse as JSON or parses to an object whose ``error``
+        value is truthy. An ``error`` key holding null, an empty string or
+        false is treated as "no error", so tools that always include the
+        key in successful payloads do not trip the breaker. A value that
+        cannot be searched or parsed as text (for example ``None``) also
+        counts as a failure.
+        """
+        try:
+            # Cheap substring test first so ordinary results are not parsed.
+            if '"error":' not in result_str:
+                return False
+            parsed = json.loads(result_str)
+        except (json.JSONDecodeError, TypeError):
+            # Mentions an error but is not valid JSON, or is not text at all.
+            return True
+        return isinstance(parsed, dict) and bool(parsed.get("error"))
+
+    def dispatch(self, name: str, args: dict, **kwargs) -> str:
+        """Execute a tool via the registry with circuit breaker protection.
+
+        Args:
+            name: Tool name; also the key for its circuit-breaker statistics.
+            args: Arguments passed to ``registry.dispatch``.
+            **kwargs: Extra keyword arguments forwarded to ``registry.dispatch``.
+
+        Returns:
+            The registry's result string. If the circuit is OPEN, or an
+            exception escapes the registry, a JSON error object is returned
+            instead; exceptions are not propagated to the caller.
+        """
+        stats = self._get_stats(name)
+
+        if stats.state == CircuitState.OPEN:
+            # Blocked calls are not counted in the statistics.
+            return json.dumps({
+                "error": (
+                    f"Tool '{name}' is temporarily unavailable due to repeated failures. "
+                    f"Circuit breaker is OPEN. Please try again in a few minutes or use an alternative tool."
+                ),
+                "circuit_breaker": True,
+                "tool_name": name
+            })
+
+        start_time = time.time()
+        try:
+            # Dispatch to the underlying registry
+            result_str = registry.dispatch(name, args, **kwargs)
+            execution_time = time.time() - start_time
+
+            # registry.dispatch catches internal exceptions and returns a
+            # JSON error string, so failures are detected from the payload.
+            if self._result_indicates_error(result_str):
+                self._record_failure(name, execution_time)
+            else:
+                self._record_success(name, execution_time)
+
+            return result_str
+
+        except Exception as e:
+            # This should rarely be hit as registry.dispatch catches most things,
+            # but we guard against orchestrator-level or registry-level bugs.
+            execution_time = time.time() - start_time
+            self._record_failure(name, execution_time)
+
+            error_msg = f"Tool orchestrator error during {name}: {type(e).__name__}: {e}"
+            logger.exception(error_msg)
+            return json.dumps({
+                "error": error_msg,
+                "tool_name": name,
+                "execution_time": execution_time
+            })
+
+    def get_fleet_stats(self) -> Dict[str, Any]:
+        """Return execution statistics for all tools.
+
+        Returns:
+            A dict keyed by tool name; each value holds ``state``,
+            ``failures``, ``successes``, ``avg_time`` (0 when the tool has
+            no recorded calls) and ``calls``.
+        """
+        with self._lock:
+            return {
+                name: {
+                    "state": s.state,
+                    "failures": s.failures,
+                    "successes": s.successes,
+                    "avg_time": s.total_execution_time / s.call_count if s.call_count > 0 else 0,
+                    "calls": s.call_count
+                }
+                for name, s in self._stats.items()
+            }
+
+
+# Global orchestrator instance
+orchestrator = ToolOrchestrator()
--- a/agent/usage_pricing.py
+++ b/agent/usage_pricing.py
@@ -575,25 +575,6 @@ def has_known_pricing(
    return entry is not None


-def get_pricing(
-    model_name: str,
-    provider: Optional[str] = None,
-    base_url: Optional[str] = None,
-    api_key: Optional[str] = None,
-) -> Dict[str, float]:
-    """Backward-compatible thin wrapper for legacy callers.
-
-    Returns only non-cache input/output fields when a pricing entry exists.
-    Unknown routes return zeroes.
-    """
-    entry = get_pricing_entry(model_name, provider=provider, base_url=base_url, api_key=api_key)
-    if not entry:
-        return {"input": 0.0, "output": 0.0}
-    return {
-        "input": float(entry.input_cost_per_million or _ZERO),
-        "output": float(entry.output_cost_per_million or _ZERO),
-    }
-

 def format_duration_compact(seconds: float) -> str:
    if seconds < 60:
--- a/ansible/fleet_mtls.yml
+++ b/ansible/fleet_mtls.yml
@@ -0,0 +1,32 @@
+---
+# fleet_mtls.yml — Deploy mutual-TLS certificates to all fleet agents.
+#
+# Prerequisites:
+#   1. Run scripts/gen_fleet_ca.sh to create the fleet CA.
+#   2. For each agent, run:
+#        scripts/gen_agent_cert.sh --agent timmy
+#        scripts/gen_agent_cert.sh --agent allegro
+#        scripts/gen_agent_cert.sh --agent ezra
+#
+# Usage:
+#   ansible-playbook -i inventory/fleet.ini ansible/fleet_mtls.yml
+#
+# Inventory example (inventory/fleet.ini):
+#   [fleet]
+#   timmy.local   agent_name=timmy
+#   allegro.local agent_name=allegro
+#   ezra.local    agent_name=ezra
+#
+# Refs #806
+
+- name: Distribute fleet mTLS certificates
+  hosts: fleet
+  become: true
+  vars:
+    _pki_base: "{{ lookup('env', 'HOME') }}/.hermes/pki"
+  roles:
+    - role: hermes_mtls
+      vars:
+        hermes_mtls_local_ca_cert: "{{ _pki_base }}/ca/fleet-ca.crt"
+        hermes_mtls_local_agent_cert: "{{ _pki_base }}/agents/{{ agent_name }}/{{ agent_name }}.crt"
+        hermes_mtls_local_agent_key: "{{ _pki_base }}/agents/{{ agent_name }}/{{ agent_name }}.key"
--- a/ansible/inventory/fleet.ini.example
+++ b/ansible/inventory/fleet.ini.example
@@ -0,0 +1,12 @@
+# Example fleet inventory for mutual-TLS cert distribution.
+# Copy to fleet.ini and adjust hostnames/IPs.
+# Refs #806
+
+[fleet_agents]
+timmy    ansible_host=192.168.1.10
+allegro  ansible_host=192.168.1.11
+ezra     ansible_host=192.168.1.12
+
+[fleet_agents:vars]
+ansible_user=hermes
+ansible_python_interpreter=/usr/bin/python3
--- a/ansible/roles/fleet_mtls_certs/defaults/main.yml
+++ b/ansible/roles/fleet_mtls_certs/defaults/main.yml
@@ -0,0 +1,21 @@
+---
+# Default paths on the *control node* where certs are read from.
+# Override these in your inventory / group_vars as needed.
+
+# Fleet CA certificate (public; safe to push to all nodes)
+fleet_mtls_ca_cert_src: "{{ lookup('env', 'HOME') }}/.hermes/pki/ca/fleet-ca.crt"
+
+# Per-agent cert/key source dir on the control node.
+# Expected layout:  <fleet_mtls_agent_certs_dir>/<agent_name>/<agent_name>.{crt,key}
+fleet_mtls_agent_certs_dir: "{{ lookup('env', 'HOME') }}/.hermes/pki/agents"
+
+# Remote destination paths on the fleet node
+fleet_mtls_remote_pki_dir: "/etc/hermes/pki"
+fleet_mtls_remote_ca_dir: "{{ fleet_mtls_remote_pki_dir }}/ca"
+fleet_mtls_remote_agent_dir: "{{ fleet_mtls_remote_pki_dir }}/agent"
+
+# The agent name to deploy (set per-host in inventory, e.g. timmy / allegro / ezra)
+fleet_mtls_agent_name: "{{ inventory_hostname_short }}"
+
+# Hermes service name (for reload notification)
+fleet_mtls_hermes_service: "hermes-a2a"
--- a/ansible/roles/fleet_mtls_certs/handlers/main.yml
+++ b/ansible/roles/fleet_mtls_certs/handlers/main.yml
@@ -0,0 +1,7 @@
+---
+- name: Restart hermes-a2a
+  ansible.builtin.systemd:
+    name: "{{ fleet_mtls_hermes_service }}"
+    state: restarted
+  when: ansible_service_mgr == "systemd"
+  ignore_errors: true   # service may not exist in all environments
--- a/ansible/roles/fleet_mtls_certs/meta/main.yml
+++ b/ansible/roles/fleet_mtls_certs/meta/main.yml
@@ -0,0 +1,17 @@
+---
+galaxy_info:
+  role_name: fleet_mtls_certs
+  author: hermes-agent
+  description: >
+    Distribute fleet CA and per-agent mTLS certificates to Hermes fleet nodes.
+    Part of issue #806 — A2A mutual TLS between fleet agents.
+  min_ansible_version: "2.14"
+  platforms:
+    - name: Debian
+      versions: [bookworm, bullseye]
+    - name: Ubuntu
+      versions: ["22.04", "24.04"]
+    - name: EL
+      versions: ["8", "9"]
+
+dependencies: []
--- a/ansible/roles/fleet_mtls_certs/tasks/main.yml
+++ b/ansible/roles/fleet_mtls_certs/tasks/main.yml
@@ -0,0 +1,99 @@
+---
+# fleet_mtls_certs/tasks/main.yml
+#
+# Distribute the fleet CA certificate and the per-agent TLS cert+key to
+# each fleet node.  Triggers a hermes-a2a service restart when any cert
+# changes.
+#
+# Refs #806 — A2A mutual TLS between fleet agents.
+
+- name: Verify agent cert source files exist on control node
+  ansible.builtin.stat:
+    path: "{{ item }}"
+  register: _src_stat
+  delegate_to: localhost
+  loop:
+    - "{{ fleet_mtls_ca_cert_src }}"
+    - "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.crt"
+    - "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.key"
+  loop_control:
+    label: "{{ item | basename }}"
+
+- name: Fail if any source cert is missing
+  ansible.builtin.fail:
+    msg: >
+      Required cert file not found: {{ item.item }}
+      Run scripts/gen_fleet_ca.sh and scripts/gen_agent_cert.sh --agent {{ fleet_mtls_agent_name }} first.
+  when: not item.stat.exists
+  loop: "{{ _src_stat.results }}"
+  loop_control:
+    label: "{{ item.item | basename }}"
+
+# -----------------------------------------------------------------------
+# Remote directory structure
+# -----------------------------------------------------------------------
+
+- name: Create remote PKI directories
+  ansible.builtin.file:
+    path: "{{ item }}"
+    state: directory
+    owner: root
+    group: root
+    mode: "0750"
+  loop:
+    - "{{ fleet_mtls_remote_pki_dir }}"
+    - "{{ fleet_mtls_remote_ca_dir }}"
+    - "{{ fleet_mtls_remote_agent_dir }}"
+
+# -----------------------------------------------------------------------
+# Fleet CA certificate (public — read-only for all)
+# -----------------------------------------------------------------------
+
+- name: Deploy fleet CA certificate
+  ansible.builtin.copy:
+    src: "{{ fleet_mtls_ca_cert_src }}"
+    dest: "{{ fleet_mtls_remote_ca_dir }}/fleet-ca.crt"
+    owner: root
+    group: root
+    mode: "0644"
+  notify: Restart hermes-a2a
+
+# -----------------------------------------------------------------------
+# Per-agent certificate (public portion)
+# -----------------------------------------------------------------------
+
+- name: Deploy agent certificate
+  ansible.builtin.copy:
+    src: "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.crt"
+    dest: "{{ fleet_mtls_remote_agent_dir }}/agent.crt"
+    owner: root
+    group: root
+    mode: "0644"
+  notify: Restart hermes-a2a
+
+# -----------------------------------------------------------------------
+# Per-agent private key (secret — root-only read)
+# -----------------------------------------------------------------------
+
+- name: Deploy agent private key
+  ansible.builtin.copy:
+    src: "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.key"
+    dest: "{{ fleet_mtls_remote_agent_dir }}/agent.key"
+    owner: root
+    group: root
+    mode: "0600"
+  no_log: true   # suppress file content from Ansible output
+  notify: Restart hermes-a2a
+
+# -----------------------------------------------------------------------
+# Environment file for hermes-a2a systemd unit
+# -----------------------------------------------------------------------
+
+- name: Write hermes-a2a environment file
+  ansible.builtin.template:
+    src: hermes_a2a_env.j2
+    dest: /etc/hermes/a2a.env
+    owner: root
+    group: root
+    mode: "0640"
+  notify: Restart hermes-a2a
--- a/ansible/roles/fleet_mtls_certs/templates/hermes_a2a_env.j2
+++ b/ansible/roles/fleet_mtls_certs/templates/hermes_a2a_env.j2
@@ -0,0 +1,10 @@
+# Managed by Ansible — fleet_mtls_certs role
+# Environment variables for the hermes-a2a systemd service.
+# Source this file in the [Service] section: EnvironmentFile=/etc/hermes/a2a.env
+
+HERMES_AGENT_NAME={{ fleet_mtls_agent_name }}
+HERMES_A2A_CERT={{ fleet_mtls_remote_agent_dir }}/agent.crt
+HERMES_A2A_KEY={{ fleet_mtls_remote_agent_dir }}/agent.key
+HERMES_A2A_CA={{ fleet_mtls_remote_ca_dir }}/fleet-ca.crt
+HERMES_A2A_HOST=0.0.0.0
+HERMES_A2A_PORT=9443
--- a/ansible/roles/hermes_mtls/defaults/main.yml
+++ b/ansible/roles/hermes_mtls/defaults/main.yml
@@ -0,0 +1,21 @@
+---
+# Ansible role: hermes_mtls
+# Distributes fleet mTLS certificates to Hermes agent nodes.
+#
+# Required variables (set in inventory / group_vars / --extra-vars):
+#   hermes_mtls_local_ca_cert   Local path on the Ansible controller to fleet-ca.crt
+#   hermes_mtls_local_agent_cert  Local path to this agent's .crt file
+#   hermes_mtls_local_agent_key   Local path to this agent's .key file
+#
+# Optional overrides:
+hermes_mtls_cert_dir: /etc/hermes/certs
+hermes_mtls_cert_owner: hermes
+hermes_mtls_cert_group: hermes
+hermes_mtls_cert_mode: "0640"
+hermes_mtls_ca_cert_mode: "0644"
+
+# Env file that Hermes reads on startup (systemd EnvironmentFile or .env)
+hermes_mtls_env_file: /etc/hermes/mtls.env
+
+# Hermes systemd service name — restarted after cert changes
+hermes_mtls_service: hermes-gateway
--- a/ansible/roles/hermes_mtls/handlers/main.yml
+++ b/ansible/roles/hermes_mtls/handlers/main.yml
@@ -0,0 +1,7 @@
+---
+- name: Restart hermes service
+  ansible.builtin.systemd:
+    name: "{{ hermes_mtls_service }}"
+    state: restarted
+    daemon_reload: true
+  when: ansible_service_mgr == "systemd"
--- a/ansible/roles/hermes_mtls/meta/main.yml
+++ b/ansible/roles/hermes_mtls/meta/main.yml
@@ -0,0 +1,16 @@
+---
+galaxy_info:
+  role_name: hermes_mtls
+  author: Hermes Fleet
+  description: Distribute mTLS certificates to Hermes fleet nodes for A2A authentication
+  license: MIT
+  min_ansible_version: "2.14"
+  platforms:
+    - name: Ubuntu
+      versions: ["22.04", "24.04"]
+    - name: Debian
+      versions: ["12"]
+    - name: EL
+      versions: ["9"]
+
+dependencies: []
--- a/ansible/roles/hermes_mtls/tasks/main.yml
+++ b/ansible/roles/hermes_mtls/tasks/main.yml
@@ -0,0 +1,67 @@
+---
+# hermes_mtls role — distribute fleet mTLS certificates to a Hermes agent node.
+#
+# This role:
+#   1. Creates the cert directory on the remote node
+#   2. Copies the Fleet CA cert, agent cert, and agent key
+#   3. Writes an env file with HERMES_MTLS_* variables
+#   4. Restarts the Hermes service if any cert changed
+
+- name: Ensure cert directory exists
+  ansible.builtin.file:
+    path: "{{ hermes_mtls_cert_dir }}"
+    state: directory
+    owner: "{{ hermes_mtls_cert_owner }}"
+    group: "{{ hermes_mtls_cert_group }}"
+    mode: "0750"
+
+- name: Copy Fleet CA certificate
+  ansible.builtin.copy:
+    src: "{{ hermes_mtls_local_ca_cert }}"
+    dest: "{{ hermes_mtls_cert_dir }}/fleet-ca.crt"
+    owner: "{{ hermes_mtls_cert_owner }}"
+    group: "{{ hermes_mtls_cert_group }}"
+    mode: "{{ hermes_mtls_ca_cert_mode }}"
+  notify: Restart hermes service
+
+- name: Copy agent TLS certificate
+  ansible.builtin.copy:
+    src: "{{ hermes_mtls_local_agent_cert }}"
+    dest: "{{ hermes_mtls_cert_dir }}/agent.crt"
+    owner: "{{ hermes_mtls_cert_owner }}"
+    group: "{{ hermes_mtls_cert_group }}"
+    mode: "{{ hermes_mtls_cert_mode }}"
+  notify: Restart hermes service
+
+- name: Copy agent TLS private key
+  ansible.builtin.copy:
+    src: "{{ hermes_mtls_local_agent_key }}"
+    dest: "{{ hermes_mtls_cert_dir }}/agent.key"
+    owner: "{{ hermes_mtls_cert_owner }}"
+    group: "{{ hermes_mtls_cert_group }}"
+    mode: "0600"
+  notify: Restart hermes service
+
+- name: Write mTLS environment file
+  ansible.builtin.template:
+    src: mtls.env.j2
+    dest: "{{ hermes_mtls_env_file }}"
+    owner: "{{ hermes_mtls_cert_owner }}"
+    group: "{{ hermes_mtls_cert_group }}"
+    mode: "0640"
+  notify: Restart hermes service
+
+- name: Verify cert files are readable by service user
+  ansible.builtin.stat:
+    path: "{{ item }}"
+  loop:
+    - "{{ hermes_mtls_cert_dir }}/fleet-ca.crt"
+    - "{{ hermes_mtls_cert_dir }}/agent.crt"
+    - "{{ hermes_mtls_cert_dir }}/agent.key"
+  register: _cert_stat
+
+- name: Assert all cert files exist
+  ansible.builtin.assert:
+    that: item.stat.exists
+    fail_msg: "Expected cert file missing: {{ item.item }}"
+  loop: "{{ _cert_stat.results }}"
--- a/ansible/roles/hermes_mtls/templates/mtls.env.j2
+++ b/ansible/roles/hermes_mtls/templates/mtls.env.j2
@@ -0,0 +1,8 @@
+# Hermes mTLS environment — generated by hermes_mtls Ansible role
+# Source this file or use as a systemd EnvironmentFile=
+# WARNING: This file contains the path to the agent's private key.
+#          Restrict read access to the hermes service user.
+
+HERMES_MTLS_CERT={{ hermes_mtls_cert_dir }}/agent.crt
+HERMES_MTLS_KEY={{ hermes_mtls_cert_dir }}/agent.key
+HERMES_MTLS_CA={{ hermes_mtls_cert_dir }}/fleet-ca.crt
--- a/benchmarks/gemma4-tool-calling-2026-04-13.md
+++ b/benchmarks/gemma4-tool-calling-2026-04-13.md
@@ -0,0 +1,40 @@
+# Tool Call Benchmark: Gemma 4 vs mimo-v2-pro
+
+Date: 2026-04-13
+Status: Awaiting execution
+
+## Test Design
+
+100 diverse tool calls across 7 categories:
+
+| Category | Count | Tools Tested |
+|----------|-------|--------------|
+| File operations | 20 | read_file, write_file, search_files |
+| Terminal commands | 20 | terminal |
+| Web search | 15 | web_search |
+| Code execution | 15 | execute_code |
+| Browser automation | 10 | browser_navigate |
+| Delegation | 10 | delegate_task |
+| MCP tools | 10 | mcp_* |
+
+## Metrics
+
+| Metric | mimo-v2-pro | Gemma 4 |
+|--------|-------------|---------|
+| Schema parse success | — | — |
+| Tool execution success | — | — |
+| Parallel tool success | — | — |
+| Avg latency (s) | — | — |
+| Token cost per call | — | — |
+
+## How to Run
+
+```bash
+python3 benchmarks/tool_call_benchmark.py --model nous:xiaomi/mimo-v2-pro
+python3 benchmarks/tool_call_benchmark.py --model ollama/gemma4:latest
+python3 benchmarks/tool_call_benchmark.py --compare
+```
+
+## Gemma 4-Specific Failure Modes
+
+To be documented after benchmark execution.
--- a/benchmarks/test_images.json
+++ b/benchmarks/test_images.json
@@ -0,0 +1,194 @@
+[
+  {
+    "id": "screenshot_github_home",
+    "url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
+    "category": "screenshot",
+    "expected_keywords": ["github", "logo", "mark"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+  },
+  {
+    "id": "diagram_mermaid_flow",
+    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
+    "category": "diagram",
+    "expected_keywords": ["flow", "diagram", "process"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+  },
+  {
+    "id": "photo_random_1",
+    "url": "https://picsum.photos/seed/vision1/400/300",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+  },
+  {
+    "id": "photo_random_2",
+    "url": "https://picsum.photos/seed/vision2/400/300",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+  },
+  {
+    "id": "chart_simple_bar",
+    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
+    "category": "chart",
+    "expected_keywords": ["bar", "chart", "revenue"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+  },
+  {
+    "id": "chart_pie",
+    "url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
+    "category": "chart",
+    "expected_keywords": ["pie", "chart", "percentage"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+  },
+  {
+    "id": "diagram_org_chart",
+    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
+    "category": "diagram",
+    "expected_keywords": ["organization", "hierarchy", "chart"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+  },
+  {
+    "id": "screenshot_terminal",
+    "url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
+    "category": "screenshot",
+    "expected_keywords": ["terminal", "command", "output"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+  },
+  {
+    "id": "photo_random_3",
+    "url": "https://picsum.photos/seed/vision3/400/300",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+  },
+  {
+    "id": "chart_line",
+    "url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
+    "category": "chart",
+    "expected_keywords": ["line", "chart", "temperature"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+  },
+  {
+    "id": "diagram_sequence",
+    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
+    "category": "diagram",
+    "expected_keywords": ["sequence", "interaction", "message"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+  },
+  {
+    "id": "photo_random_4",
+    "url": "https://picsum.photos/seed/vision4/400/300",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+  },
+  {
+    "id": "screenshot_webpage",
+    "url": "https://github.githubassets.com/images/modules/site/social-cards.png",
+    "category": "screenshot",
+    "expected_keywords": ["github", "page", "web"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+  },
+  {
+    "id": "chart_radar",
+    "url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
+    "category": "chart",
+    "expected_keywords": ["radar", "chart", "skill"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+  },
+  {
+    "id": "photo_random_5",
+    "url": "https://picsum.photos/seed/vision5/400/300",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+  },
+  {
+    "id": "diagram_class",
+    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
+    "category": "diagram",
+    "expected_keywords": ["class", "object", "attribute"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+  },
+  {
+    "id": "chart_doughnut",
+    "url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
+    "category": "chart",
+    "expected_keywords": ["doughnut", "chart", "device"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+  },
+  {
+    "id": "photo_random_6",
+    "url": "https://picsum.photos/seed/vision6/400/300",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+  },
+  {
+    "id": "screenshot_error",
+    "url": "https://http.cat/404.jpg",
+    "category": "screenshot",
+    "expected_keywords": ["404", "error", "cat"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
+  },
+  {
+    "id": "diagram_network",
+    "url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
+    "category": "diagram",
+    "expected_keywords": ["network", "node", "connection"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
+  },
+  {
+    "id": "photo_random_7",
+    "url": "https://picsum.photos/seed/vision7/400/300",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+  },
+  {
+    "id": "chart_stacked_bar",
+    "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
+    "category": "chart",
+    "expected_keywords": ["stacked", "bar", "chart"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
+  },
+  {
+    "id": "screenshot_dashboard",
+    "url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
+    "category": "screenshot",
+    "expected_keywords": ["search", "code", "feature"],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+  },
+  {
+    "id": "photo_random_8",
+    "url": "https://picsum.photos/seed/vision8/400/300",
+    "category": "photo",
+    "expected_keywords": [],
+    "ground_truth_ocr": "",
+    "expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
+  }
+]
--- a/benchmarks/tool_call_benchmark.py
+++ b/benchmarks/tool_call_benchmark.py
@@ -0,0 +1,614 @@
+#!/usr/bin/env python3
+"""
+Tool-Calling Benchmark — Gemma 4 vs mimo-v2-pro regression test.
+
+Runs 100 diverse tool-calling prompts through multiple models and compares
+success rates, latency, and token costs.
+
+Usage:
+    python3 benchmarks/tool_call_benchmark.py                  # full 100-call suite
+    python3 benchmarks/tool_call_benchmark.py --limit 10       # quick smoke test
+    python3 benchmarks/tool_call_benchmark.py --models nous:mimo-v2-pro   # single model (provider:model)
+    python3 benchmarks/tool_call_benchmark.py --category file   # single category
+
+Requires: hermes-agent venv activated, OPENROUTER_API_KEY or equivalent.
+"""
+
+import argparse
+import json
+import os
+import sys
+import time
+import traceback
+from dataclasses import dataclass, field, asdict
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Optional
+
+# Ensure the hermes-agent root is importable.
+# REPO_ROOT is two directory levels above this file, i.e. the parent of the
+# directory that contains this script.
+REPO_ROOT = Path(__file__).resolve().parent.parent
+# Inserted at position 0 so it takes precedence over other sys.path entries;
+# run_single_test() relies on this for its lazy `from run_agent import AIAgent`
+# (presumably run_agent lives at the repo root — confirm).
+sys.path.insert(0, str(REPO_ROOT))
+
+# ---------------------------------------------------------------------------
+# Test Definitions
+# ---------------------------------------------------------------------------
+
+@dataclass
+class ToolCall:
+    """One benchmark case: a prompt plus the tool call it should trigger.
+
+    Attributes:
+        id: Unique case identifier, e.g. "file-01".
+        category: Group name used by the --category filter and the report.
+        prompt: User message sent to the model.
+        expected_tool: Name of the tool the model is expected to call.
+        expected_params_check: Substring expected in the JSON arguments of
+            the call; an empty string disables the check.
+        timeout: Intended maximum seconds per call (run_single_test does not
+            read this value).
+        notes: Free-form remarks.
+    """
+
+    id: str
+    category: str
+    prompt: str
+    expected_tool: str
+    expected_params_check: str = ""
+    timeout: int = 30
+    notes: str = ""
+
+
+# fmt: off
+# The benchmark cases. Positional ToolCall arguments are, in order:
+#   (id, category, prompt, expected_tool, expected_params_check)
+# expected_params_check is a plain substring that run_single_test() looks for
+# in the raw tool-call arguments; "" means no argument check is applied.
+SUITE: list[ToolCall] = [
+    # ── File Operations (20) ──────────────────────────────────────────────
+    ToolCall("file-01", "file", "Read the file /tmp/test_bench.txt and show me its contents.",
+             "read_file", "path"),
+    ToolCall("file-02", "file", "Write 'hello benchmark' to /tmp/test_bench_out.txt",
+             "write_file", "path"),
+    ToolCall("file-03", "file", "Search for the word 'import' in all Python files in the current directory.",
+             "search_files", "pattern"),
+    ToolCall("file-04", "file", "Read lines 1-20 of /etc/hosts",
+             "read_file", "offset"),
+    ToolCall("file-05", "file", "Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'",
+             "patch", "old_string"),
+    ToolCall("file-06", "file", "Search for files matching *.py in the current directory.",
+             "search_files", "target"),
+    ToolCall("file-07", "file", "Read the first 10 lines of /etc/passwd",
+             "read_file", "limit"),
+    ToolCall("file-08", "file", "Write a JSON config to /tmp/bench_config.json with key 'debug': true",
+             "write_file", "content"),
+    ToolCall("file-09", "file", "Search for 'def test_' in Python test files.",
+             "search_files", "file_glob"),
+    ToolCall("file-10", "file", "Read /tmp/bench_config.json and tell me what's in it.",
+             "read_file", "bench_config"),
+    ToolCall("file-11", "file", "Create a file /tmp/bench_readme.md with one line: '# Benchmark'",
+             "write_file", "bench_readme"),
+    ToolCall("file-12", "file", "Search for 'TODO' comments in all .py files.",
+             "search_files", "TODO"),
+    ToolCall("file-13", "file", "Read /tmp/bench_readme.md",
+             "read_file", "bench_readme"),
+    ToolCall("file-14", "file", "Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'",
+             "patch", "Tool Benchmark"),
+    ToolCall("file-15", "file", "Write a Python one-liner to /tmp/bench_hello.py that prints hello.",
+             "write_file", "bench_hello"),
+    ToolCall("file-16", "file", "Search for all .json files in /tmp/.",
+             "search_files", "json"),
+    ToolCall("file-17", "file", "Read /tmp/bench_hello.py and verify it has print('hello').",
+             "read_file", "bench_hello"),
+    ToolCall("file-18", "file", "Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.",
+             "patch", "hello world"),
+    ToolCall("file-19", "file", "List files matching 'bench*' in /tmp/.",
+             "search_files", "bench"),
+    ToolCall("file-20", "file", "Read /tmp/test_bench.txt again and summarize its contents.",
+             "read_file", "test_bench"),
+
+    # ── Terminal Commands (20) ────────────────────────────────────────────
+    ToolCall("term-01", "terminal", "Run `echo hello world` in the terminal.",
+             "terminal", "echo"),
+    ToolCall("term-02", "terminal", "Run `date` to get the current date and time.",
+             "terminal", "date"),
+    ToolCall("term-03", "terminal", "Run `uname -a` to get system information.",
+             "terminal", "uname"),
+    ToolCall("term-04", "terminal", "Run `pwd` to show the current directory.",
+             "terminal", "pwd"),
+    ToolCall("term-05", "terminal", "Run `ls -la /tmp/ | head -20` to list temp files.",
+             "terminal", "head"),
+    ToolCall("term-06", "terminal", "Run `whoami` to show the current user.",
+             "terminal", "whoami"),
+    ToolCall("term-07", "terminal", "Run `df -h` to show disk usage.",
+             "terminal", "df"),
+    ToolCall("term-08", "terminal", "Run `python3 --version` to check Python version.",
+             "terminal", "python3"),
+    ToolCall("term-09", "terminal", "Run `cat /etc/hostname` to get the hostname.",
+             "terminal", "hostname"),
+    ToolCall("term-10", "terminal", "Run `uptime` to see system uptime.",
+             "terminal", "uptime"),
+    ToolCall("term-11", "terminal", "Run `env | grep PATH` to show the PATH variable.",
+             "terminal", "PATH"),
+    ToolCall("term-12", "terminal", "Run `wc -l /etc/passwd` to count lines.",
+             "terminal", "wc"),
+    ToolCall("term-13", "terminal", "Run `echo $SHELL` to show the current shell.",
+             "terminal", "SHELL"),
+    # NOTE(review): "memory" does not occur in the command text itself, so the
+    # argument check only passes if the model adds that word elsewhere in its
+    # arguments — confirm this is the intended check string.
+    ToolCall("term-14", "terminal", "Run `free -h || vm_stat` to check memory usage.",
+             "terminal", "memory"),
+    ToolCall("term-15", "terminal", "Run `id` to show user and group IDs.",
+             "terminal", "id"),
+    ToolCall("term-16", "terminal", "Run `hostname` to get the machine hostname.",
+             "terminal", "hostname"),
+    ToolCall("term-17", "terminal", "Run `echo {1..5}` to test brace expansion.",
+             "terminal", "echo"),
+    ToolCall("term-18", "terminal", "Run `seq 1 5` to generate a number sequence.",
+             "terminal", "seq"),
+    ToolCall("term-19", "terminal", "Run `python3 -c 'print(2+2)'` to compute 2+2.",
+             "terminal", "print"),
+    ToolCall("term-20", "terminal", "Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.",
+             "terminal", "wc"),
+
+    # ── Code Execution (15) ──────────────────────────────────────────────
+    ToolCall("code-01", "code", "Execute a Python script that computes factorial of 10.",
+             "execute_code", "factorial"),
+    ToolCall("code-02", "code", "Run Python to read /tmp/test_bench.txt and count its words.",
+             "execute_code", "words"),
+    ToolCall("code-03", "code", "Execute Python to generate the first 20 Fibonacci numbers.",
+             "execute_code", "fibonacci"),
+    ToolCall("code-04", "code", "Run Python to parse JSON from a string and print keys.",
+             "execute_code", "json"),
+    ToolCall("code-05", "code", "Execute Python to list all files in /tmp/ matching 'bench*'.",
+             "execute_code", "glob"),
+    ToolCall("code-06", "code", "Run Python to compute the sum of squares from 1 to 100.",
+             "execute_code", "sum"),
+    ToolCall("code-07", "code", "Execute Python to check if 'racecar' is a palindrome.",
+             "execute_code", "palindrome"),
+    ToolCall("code-08", "code", "Run Python to create a CSV string with 5 rows of sample data.",
+             "execute_code", "csv"),
+    ToolCall("code-09", "code", "Execute Python to sort a list [5,2,8,1,9] and print the result.",
+             "execute_code", "sort"),
+    ToolCall("code-10", "code", "Run Python to count lines in /etc/passwd.",
+             "execute_code", "passwd"),
+    ToolCall("code-11", "code", "Execute Python to hash the string 'benchmark' with SHA256.",
+             "execute_code", "sha256"),
+    ToolCall("code-12", "code", "Run Python to get the current UTC timestamp.",
+             "execute_code", "utcnow"),
+    ToolCall("code-13", "code", "Execute Python to convert 'hello world' to uppercase and reverse it.",
+             "execute_code", "upper"),
+    ToolCall("code-14", "code", "Run Python to create a dictionary of system info (platform, python version).",
+             "execute_code", "sys"),
+    ToolCall("code-15", "code", "Execute Python to check internet connectivity by resolving google.com.",
+             "execute_code", "socket"),
+
+    # ── Delegation (10) ──────────────────────────────────────────────────
+    ToolCall("deleg-01", "delegate", "Use a subagent to find all .log files in /tmp/.",
+             "delegate_task", "log"),
+    ToolCall("deleg-02", "delegate", "Delegate to a subagent: what is 15 * 37?",
+             "delegate_task", "15"),
+    ToolCall("deleg-03", "delegate", "Use a subagent to check if Python 3 is installed and its version.",
+             "delegate_task", "python"),
+    ToolCall("deleg-04", "delegate", "Delegate: read /tmp/test_bench.txt and summarize it in one sentence.",
+             "delegate_task", "summarize"),
+    ToolCall("deleg-05", "delegate", "Use a subagent to list the contents of /tmp/ directory.",
+             "delegate_task", "tmp"),
+    ToolCall("deleg-06", "delegate", "Delegate: count the number of .py files in the current directory.",
+             "delegate_task", ".py"),
+    ToolCall("deleg-07", "delegate", "Use a subagent to check disk space with df -h.",
+             "delegate_task", "df"),
+    ToolCall("deleg-08", "delegate", "Delegate: what OS are we running on?",
+             "delegate_task", "os"),
+    ToolCall("deleg-09", "delegate", "Use a subagent to find the hostname of this machine.",
+             "delegate_task", "hostname"),
+    ToolCall("deleg-10", "delegate", "Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.",
+             "delegate_task", "write"),
+
+    # ── Todo / Memory (10 — replacing web/browser/MCP which need external services) ──
+    ToolCall("todo-01", "todo", "Add a todo item: 'Run benchmark suite'",
+             "todo", "benchmark"),
+    ToolCall("todo-02", "todo", "Show me the current todo list.",
+             "todo", ""),
+    ToolCall("todo-03", "todo", "Mark the first todo item as completed.",
+             "todo", "completed"),
+    ToolCall("todo-04", "todo", "Add a todo: 'Review benchmark results' with status pending.",
+             "todo", "Review"),
+    ToolCall("todo-05", "todo", "Clear all completed todos.",
+             "todo", "clear"),
+    # The date is filled in once, when this module is imported.
+    ToolCall("todo-06", "memory", "Save this to memory: 'benchmark ran on {date}'".format(
+              date=datetime.now().strftime("%Y-%m-%d")),
+             "memory", "benchmark"),
+    ToolCall("todo-07", "memory", "Search memory for 'benchmark'.",
+             "memory", "benchmark"),
+    ToolCall("todo-08", "memory", "Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.",
+             "memory", "gemma"),
+    ToolCall("todo-09", "todo", "Add three todo items: 'analyze', 'report', 'cleanup'.",
+             "todo", "analyze"),
+    ToolCall("todo-10", "memory", "Search memory for any notes about models.",
+             "memory", "model"),
+
+    # ── Skills (10 — replacing MCP tools which need servers) ─────────────
+    ToolCall("skill-01", "skills", "List all available skills.",
+             "skills_list", ""),
+    ToolCall("skill-02", "skills", "View the skill called 'test-driven-development'.",
+             "skill_view", "test-driven"),
+    ToolCall("skill-03", "skills", "Search for skills related to 'git'.",
+             "skills_list", "git"),
+    ToolCall("skill-04", "skills", "View the 'code-review' skill.",
+             "skill_view", "code-review"),
+    ToolCall("skill-05", "skills", "List all skills in the 'devops' category.",
+             "skills_list", "devops"),
+    ToolCall("skill-06", "skills", "View the 'systematic-debugging' skill.",
+             "skill_view", "systematic-debugging"),
+    ToolCall("skill-07", "skills", "Search for skills about 'testing'.",
+             "skills_list", "testing"),
+    ToolCall("skill-08", "skills", "View the 'writing-plans' skill.",
+             "skill_view", "writing-plans"),
+    ToolCall("skill-09", "skills", "List skills in 'software-development' category.",
+             "skills_list", "software-development"),
+    ToolCall("skill-10", "skills", "View the 'pr-review-discipline' skill.",
+             "skill_view", "pr-review"),
+
+    # ── Additional tests to reach 100 ────────────────────────────────────
+    ToolCall("file-21", "file", "Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].",
+             "write_file", "bench_sort"),
+    ToolCall("file-22", "file", "Read /tmp/bench_sort.py back and confirm it exists.",
+             "read_file", "bench_sort"),
+    ToolCall("file-23", "file", "Search for 'class' in all .py files in the benchmarks directory.",
+             "search_files", "class"),
+    ToolCall("term-21", "terminal", "Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.",
+             "terminal", "os"),
+    ToolCall("term-22", "terminal", "Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.",
+             "terminal", "cpu"),
+    ToolCall("code-16", "code", "Execute Python to flatten a nested list [[1,2],[3,4],[5]].",
+             "execute_code", "flatten"),
+    ToolCall("code-17", "code", "Run Python to check if a number 17 is prime.",
+             "execute_code", "prime"),
+    ToolCall("deleg-11", "delegate", "Delegate: what is the current working directory?",
+             "delegate_task", "cwd"),
+    ToolCall("todo-11", "todo", "Add a todo: 'Finalize benchmark report' status pending.",
+             "todo", "Finalize"),
+    ToolCall("todo-12", "memory", "Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.",
+             "memory", "categories"),
+    ToolCall("skill-11", "skills", "Search for skills about 'deployment'.",
+             "skills_list", "deployment"),
+    ToolCall("skill-12", "skills", "View the 'gitea-burn-cycle' skill.",
+             "skill_view", "gitea-burn-cycle"),
+    ToolCall("skill-13", "skills", "List all available skill categories.",
+             "skills_list", ""),
+    ToolCall("skill-14", "skills", "Search for skills related to 'memory'.",
+             "skills_list", "memory"),
+    ToolCall("skill-15", "skills", "View the 'mimo-swarm' skill.",
+             "skill_view", "mimo-swarm"),
+]
+# fmt: on
+
+
+# ---------------------------------------------------------------------------
+# Runner
+# ---------------------------------------------------------------------------
+
+@dataclass
+class CallResult:
+    """Outcome of one benchmark case for one model.
+
+    Attributes:
+        test_id: Identifier of the ToolCall case that was run.
+        category: Category of that case.
+        model: Model spec the case was run against.
+        prompt: Prompt that was sent.
+        expected_tool: Tool name the case expected.
+        success: True when the first tool call named the expected tool.
+        tool_called: Name of the first tool the model called, if any.
+        tool_args_valid: True when the expected substring was found in the
+            call arguments (or the case defines no substring).
+        execution_ok: True when a non-empty tool-role message was returned.
+        latency_s: Wall-clock seconds for the conversation, rounded to 2 dp.
+        error: "<ExceptionType>: <message>" when the run raised, else "".
+        raw_response: First 200 characters of the final response when no
+            tool call was produced.
+        schema_ok: True when the model produced a tool call at all.
+    """
+
+    test_id: str
+    category: str
+    model: str
+    prompt: str
+    expected_tool: str
+    success: bool
+    tool_called: Optional[str] = None
+    tool_args_valid: bool = False
+    execution_ok: bool = False
+    latency_s: float = 0.0
+    error: str = ""
+    raw_response: str = ""
+    # Declared (last, with a default, so existing positional/keyword callers
+    # keep working) because run_single_test() sets it and generate_report()
+    # reads it on every result; without the field, results that never produced
+    # a tool call had no such attribute and asdict() dropped the value.
+    schema_ok: bool = False
+
+
+@dataclass
+class ModelStats:
+    """Running per-model totals accumulated while building the report.
+
+    Attributes:
+        model: Model spec these totals belong to.
+        total: Number of results counted.
+        schema_ok: Number of results flagged schema_ok.
+        exec_ok: Number of results flagged execution_ok.
+        latency_sum: Sum of the results' latency_s values.
+        failures: Results whose success flag was false.
+    """
+
+    model: str
+    total: int = 0
+    schema_ok: int = 0
+    exec_ok: int = 0
+    latency_sum: float = 0.0
+    failures: list = field(default_factory=list)
+
+    @property
+    def schema_pct(self) -> float:
+        """Share of results with schema_ok, as a percentage (0 when empty)."""
+        if not self.total:
+            return 0
+        return self.schema_ok / self.total * 100
+
+    @property
+    def exec_pct(self) -> float:
+        """Share of results with exec_ok, as a percentage (0 when empty)."""
+        if not self.total:
+            return 0
+        return self.exec_ok / self.total * 100
+
+    @property
+    def avg_latency(self) -> float:
+        """Mean latency per result in seconds (0 when empty)."""
+        if not self.total:
+            return 0
+        return self.latency_sum / self.total
+
+
+def setup_test_files():
+    """Write the sample fixture /tmp/test_bench.txt that the file prompts read."""
+    fixture_lines = [
+        "This is a benchmark test file.",
+        "It contains sample data for tool-calling tests.",
+        "Line three has some import statements.",
+        "import os",
+        "import sys",
+        "import json",
+        "End of test data.",
+    ]
+    fixture = Path("/tmp/test_bench.txt")
+    # Every line, including the last one, is newline-terminated.
+    fixture.write_text("".join(f"{line}\n" for line in fixture_lines))
+
+
+def run_single_test(tc: ToolCall, model_spec: str, provider: str) -> CallResult:
+    """Run one benchmark case through a fresh agent and grade its first tool call.
+
+    Args:
+        tc: The test case to run.
+        model_spec: Model identifier handed to AIAgent; also recorded as the
+            result's ``model``.
+        provider: Provider name handed to AIAgent.
+
+    Returns:
+        A CallResult. ``schema_ok`` is set when any tool call was produced,
+        ``success`` when that call named the expected tool, ``tool_args_valid``
+        when the expected substring occurs in its arguments, and
+        ``execution_ok`` when a non-empty tool-role message came back.
+        Exceptions raised while building or running the agent are not
+        propagated; they are recorded in ``error``.
+    """
+    # Imported lazily: only needed once a test actually runs.
+    from run_agent import AIAgent
+
+    result = CallResult(
+        test_id=tc.id,
+        category=tc.category,
+        model=model_spec,
+        prompt=tc.prompt,
+        expected_tool=tc.expected_tool,
+        success=False,
+    )
+    # Make sure every result carries the flag, including the no-tool-call and
+    # exception paths: generate_report() reads it on every result.
+    result.schema_ok = False
+
+    started = None  # becomes a timestamp once the agent has been constructed
+    try:
+        agent = AIAgent(
+            model=model_spec,
+            provider=provider,
+            max_iterations=3,
+            quiet_mode=True,
+            skip_context_files=True,
+            skip_memory=True,
+            persist_session=False,
+        )
+
+        started = time.time()
+        conv = agent.run_conversation(
+            user_message=tc.prompt,
+            system_message=(
+                "You are a benchmark test runner. Execute the user's request by calling "
+                "the appropriate tool. Return the tool result directly. Do not add commentary."
+            ),
+        )
+        result.latency_s = round(time.time() - started, 2)
+
+        messages = conv.get("messages", [])
+        tool_called, tool_args = _first_tool_call(messages)
+
+        if tool_called:
+            result.tool_called = tool_called
+            result.schema_ok = True
+            result.success = tool_called == tc.expected_tool
+
+            if tc.expected_params_check:
+                # Defensive: if the arguments arrive already parsed rather than
+                # as JSON text, serialise them so the substring check still
+                # sees both keys and values.
+                if not isinstance(tool_args, str):
+                    tool_args = json.dumps(tool_args, default=str)
+                result.tool_args_valid = tc.expected_params_check in tool_args
+            else:
+                result.tool_args_valid = True
+
+            # Any non-empty tool-role message counts as "executed", even when
+            # its content reports an error.
+            result.execution_ok = any(
+                msg.get("role") == "tool" and msg.get("content")
+                for msg in messages
+            )
+        else:
+            # No tool call produced — keep a snippet of what the model said.
+            final = conv.get("final_response", "")
+            result.raw_response = final[:200] if final else ""
+
+    except Exception as e:
+        # Broad on purpose: one failing case must not abort the whole run.
+        result.error = f"{type(e).__name__}: {str(e)[:200]}"
+        result.latency_s = round(time.time() - started, 2) if started is not None else 0
+
+    return result
+
+
+def _first_tool_call(messages):
+    """Return (name, arguments) of the first tool call, or (None, "") if none.
+
+    Only the first assistant message that has tool calls is inspected, and
+    only the first call within it.
+    """
+    for msg in messages:
+        if msg.get("role") == "assistant" and msg.get("tool_calls"):
+            fn = msg["tool_calls"][0].get("function", {})
+            return fn.get("name", ""), fn.get("arguments", "{}")
+    return None, ""
+
+
+def generate_report(results: list[CallResult], models: list[str], output_path: Path):
+    """Build the markdown benchmark report, write it to *output_path*, return it.
+
+    Args:
+        results: Every CallResult collected, across all models.
+        models: Model specs in table-column order. Each result's ``model``
+            must be one of them, otherwise a KeyError is raised.
+        output_path: Destination file; missing parent directories are created.
+
+    Returns:
+        The full report text.
+    """
+    def schema_ok(r) -> bool:
+        # A result may not carry the attribute at all (it is only assigned
+        # once a tool call was seen); count that as a schema failure instead
+        # of raising AttributeError.
+        return bool(getattr(r, "schema_ok", False))
+
+    def summary_row(label, cell) -> str:
+        # One table row: the label followed by one cell per model column.
+        return f"| {label} | " + "".join(f"{cell(stats[m])} | " for m in models)
+
+    def correct_cell(s) -> str:
+        # failures holds exactly the results with success == False, so the
+        # remainder is the number of correct tool selections.
+        correct = s.total - len(s.failures)
+        pct = (correct / s.total * 100) if s.total else 0
+        return f"{correct}/{s.total} ({pct:.0f}%)"
+
+    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+
+    # Aggregate per model, and bucket results by category -> model.
+    stats: dict[str, ModelStats] = {m: ModelStats(model=m) for m in models}
+    by_category: dict[str, dict[str, list[CallResult]]] = {}
+    for r in results:
+        s = stats[r.model]
+        s.total += 1
+        s.schema_ok += int(schema_ok(r))
+        s.exec_ok += int(r.execution_ok)
+        s.latency_sum += r.latency_s
+        if not r.success:
+            s.failures.append(r)
+        by_category.setdefault(r.category, {}).setdefault(r.model, []).append(r)
+
+    table_header = [
+        f"| Metric | {' | '.join(models)} |",
+        f"|--------|{'|'.join('---------' for _ in models)}|",
+    ]
+
+    lines = [
+        "# Tool-Calling Benchmark Report",
+        "",
+        f"Generated: {now}",
+        f"Suite: {len(SUITE)} calls across {len(set(tc.category for tc in SUITE))} categories",
+        f"Models tested: {', '.join(models)}",
+        "",
+        "## Summary",
+        "",
+        *table_header,
+        summary_row("Schema parse success",
+                    lambda s: f"{s.schema_ok}/{s.total} ({s.schema_pct:.0f}%)"),
+        summary_row("Tool execution success",
+                    lambda s: f"{s.exec_ok}/{s.total} ({s.exec_pct:.0f}%)"),
+        summary_row("Correct tool selected", correct_cell),
+        summary_row("Avg latency (s)", lambda s: f"{s.avg_latency:.2f}"),
+        "",
+    ]
+
+    # Per-category breakdown
+    lines.append("## Per-Category Breakdown")
+    lines.append("")
+
+    metrics = [
+        ("Schema OK", schema_ok),
+        ("Exec OK", lambda r: r.execution_ok),
+        ("Correct tool", lambda r: r.success),
+    ]
+    for cat in sorted(by_category):
+        lines.append(f"### {cat.title()}")
+        lines.append("")
+        lines.extend(table_header)
+
+        cat_data = by_category[cat]
+        for metric_name, passed in metrics:
+            row = f"| {metric_name} | "
+            for m in models:
+                results_m = cat_data.get(m, [])
+                total = len(results_m)
+                ok = sum(1 for r in results_m if passed(r))
+                pct = (ok / total * 100) if total else 0
+                row += f"{ok}/{total} ({pct:.0f}%) | "
+            lines.append(row)
+
+        lines.append("")
+
+    # Failure analysis
+    lines.append("## Failure Analysis")
+    lines.append("")
+
+    any_failures = False
+    for m in models:
+        failures = stats[m].failures
+        if not failures:
+            continue
+        any_failures = True
+        lines.append(f"### {m} — {len(failures)} failures")
+        lines.append("")
+        lines.append("| Test | Category | Expected | Got | Error |")
+        lines.append("|------|----------|----------|-----|-------|")
+        for r in failures:
+            got = r.tool_called or "none"
+            # Keep the cell on one line and escape pipes so arbitrary
+            # exception text cannot break the markdown table.
+            err = (r.error or "wrong tool")[:60].replace("|", "\\|").replace("\n", " ")
+            lines.append(f"| {r.test_id} | {r.category} | {r.expected_tool} | {got} | {err} |")
+        lines.append("")
+
+    if not any_failures:
+        lines.append("No failures detected.")
+        lines.append("")
+
+    # Raw results JSON. schema_ok is added explicitly so it is present even
+    # when it only exists as an ad-hoc attribute that asdict() does not see.
+    raw = [{**asdict(r), "schema_ok": schema_ok(r)} for r in results]
+    lines.append("## Raw Results")
+    lines.append("")
+    lines.append("```json")
+    lines.append(json.dumps(raw, indent=2, default=str))
+    lines.append("```")
+
+    report = "\n".join(lines)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    # The report contains non-ASCII characters (e.g. "—"), so do not depend on
+    # the platform's default encoding.
+    output_path.write_text(report, encoding="utf-8")
+    return report
+
+
+def main():
+    """CLI entry point: run the selected cases per model and write the report.
+
+    Exits with status 0 when every call selected the expected tool and 1
+    otherwise. With --dry-run the selected cases are only listed. If the
+    filters select no cases at all, exits through argparse's usage error.
+    """
+    parser = argparse.ArgumentParser(description="Tool-calling benchmark")
+    parser.add_argument("--models", nargs="+",
+                        default=["nous:gia-3/gemma-4-31b", "nous:mimo-v2-pro"],
+                        help="Model specs to test (provider:model)")
+    parser.add_argument("--limit", type=int, default=0,
+                        help="Run only first N tests (0 = all)")
+    parser.add_argument("--category", type=str, default="",
+                        help="Run only tests in this category")
+    parser.add_argument("--output", type=str, default="",
+                        help="Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)")
+    parser.add_argument("--dry-run", action="store_true",
+                        help="Print test cases without running them")
+    args = parser.parse_args()
+
+    # Filter suite
+    suite = SUITE[:]
+    if args.category:
+        suite = [tc for tc in suite if tc.category == args.category]
+    if args.limit > 0:
+        suite = suite[:args.limit]
+
+    if args.dry_run:
+        print(f"Would run {len(suite)} tests:")
+        for tc in suite:
+            print(f"  [{tc.category:8s}] {tc.id}: {tc.expected_tool} — {tc.prompt[:60]}")
+        return
+
+    if not suite:
+        # Without this guard the per-model summary below divides by zero.
+        parser.error(f"no tests selected (category={args.category!r})")
+
+    # Setup
+    setup_test_files()
+    date_str = datetime.now().strftime("%Y-%m-%d")
+    output_path = Path(args.output) if args.output else REPO_ROOT / "benchmarks" / f"gemma4-tool-calling-{date_str}.md"
+
+    # (provider, full spec) pairs; the provider is the text before the first
+    # ":" (or the whole spec when there is none).
+    # NOTE(review): the full "provider:model" spec, not just the model part,
+    # is what gets passed to the agent as the model — confirm AIAgent expects
+    # that form.
+    model_specs = [(spec.split(":", 1)[0], spec) for spec in args.models]
+
+    print(f"Benchmark: {len(suite)} tests × {len(model_specs)} models = {len(suite) * len(model_specs)} calls")
+    print(f"Output: {output_path}")
+    print()
+
+    all_results: list[CallResult] = []
+
+    for provider, full_spec in model_specs:
+        print(f"── {full_spec} {'─' * (50 - len(full_spec))}")
+        model_results = []
+
+        for i, tc in enumerate(suite, 1):
+            sys.stdout.write(f"\r  [{i:3d}/{len(suite)}] {tc.id:10s} {tc.category:8s} → {tc.expected_tool:20s}")
+            sys.stdout.flush()
+
+            r = run_single_test(tc, full_spec, provider)
+            model_results.append(r)
+
+            status = "✓" if r.success else "✗"
+            sys.stdout.write(f"  {status} ({r.latency_s:.1f}s)")
+            sys.stdout.write("\n")
+
+        all_results.extend(model_results)
+
+        # Quick stats
+        ok = sum(1 for r in model_results if r.success)
+        print(f"  Result: {ok}/{len(model_results)} correct tool selected ({ok/len(model_results)*100:.0f}%)")
+        print()
+
+    # Generate report
+    generate_report(all_results, list(args.models), output_path)
+    print(f"Report written to {output_path}")
+
+    # Exit code: 0 if all pass, 1 if any failures
+    sys.exit(0 if all(r.success for r in all_results) else 1)
+
+
+if __name__ == "__main__":
+    main()
--- a/benchmarks/vision_benchmark.py
+++ b/benchmarks/vision_benchmark.py
@@ -0,0 +1,635 @@
+#!/usr/bin/env python3
+"""
+Vision Benchmark Suite — Issue #817
+
+Compares Gemma 4 vision accuracy vs current approach (Gemini 3 Flash Preview).
+Measures OCR accuracy, description quality, latency, and token usage.
+
+Usage:
+    # Run full benchmark
+    python benchmarks/vision_benchmark.py --images benchmarks/test_images.json
+
+    # Single image test
+    python benchmarks/vision_benchmark.py --url https://example.com/image.png
+
+    # Generate test report
+    python benchmarks/vision_benchmark.py --images benchmarks/test_images.json --output benchmarks/vision_results.json
+
+Test image dataset: benchmarks/test_images.json (50-100 diverse images)
+"""
+
+import argparse
+import asyncio
+import base64
+import json
+import os
+import statistics
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+
+# ---------------------------------------------------------------------------
+# Benchmark configuration
+# ---------------------------------------------------------------------------
+
+# Models to compare
+MODELS = {
+    "gemma4": {
+        "model_id": "google/gemma-4-27b-it",
+        "display_name": "Gemma 4 27B",
+        "provider": "nous",
+        "description": "Google's multimodal Gemma 4 model",
+    },
+    "gemini3_flash": {
+        "model_id": "google/gemini-3-flash-preview",
+        "display_name": "Gemini 3 Flash Preview",
+        "provider": "openrouter",
+        "description": "Current default vision model",
+    },
+}
+
+# Evaluation prompts for different test categories
+EVAL_PROMPTS = {
+    "screenshot": "Describe this screenshot in detail. What application is shown? What is the current state of the UI?",
+    "diagram": "Describe this diagram completely. What concepts does it illustrate? List all components and their relationships.",
+    "photo": "Describe this photo in detail. What objects are visible? What is the scene?",
+    "ocr": "Extract ALL text visible in this image. Return it exactly as written, preserving formatting.",
+    "chart": "What data does this chart show? List all axes labels, values, and key trends.",
+    "document": "Extract all text from this document image. Preserve paragraph structure.",
+}
+
+
+# ---------------------------------------------------------------------------
+# Vision model interface
+# ---------------------------------------------------------------------------
+
+
+async def analyze_with_model(
+    image_url: str,
+    prompt: str,
+    model_config: dict,
+    timeout: float = 120.0,
+) -> dict:
+    """Send one prompt + image to a chat-completions vision endpoint.
+
+    HTTP and configuration failures are reported in the returned dict:
+        - analysis: str ("" on failure or when the reply has no content)
+        - latency_ms: float (0 when no request was sent)
+        - tokens: dict (prompt_tokens, completion_tokens, total_tokens)
+        - success: bool
+        - error: str ("" on success)
+    """
+    import httpx
+
+    provider = model_config["provider"]
+    model_id = model_config["model_id"]
+
+    # Prepare messages
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": prompt},
+                {"type": "image_url", "image_url": {"url": image_url}},
+            ],
+        }
+    ]
+
+    # Route to provider
+    if provider == "openrouter":
+        api_url = "https://openrouter.ai/api/v1/chat/completions"
+        api_key = os.getenv("OPENROUTER_API_KEY", "")
+    elif provider == "nous":
+        api_url = "https://inference.nousresearch.com/v1/chat/completions"
+        api_key = os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
+    else:
+        api_url = os.getenv(f"{provider.upper()}_API_URL", "")
+        api_key = os.getenv(f"{provider.upper()}_API_KEY", "")
+
+    if not api_key or not api_url:
+        missing = "API key" if not api_key else "API URL"
+        return {
+            "analysis": "",
+            "latency_ms": 0,
+            "tokens": {},
+            "success": False,
+            "error": f"No {missing} for provider {provider}",
+        }
+
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+    }
+
+    payload = {
+        "model": model_id,
+        "messages": messages,
+        "max_tokens": 2000,
+        "temperature": 0.1,
+    }
+
+    start = time.perf_counter()
+    try:
+        async with httpx.AsyncClient(timeout=timeout) as client:
+            resp = await client.post(api_url, json=payload, headers=headers)
+            resp.raise_for_status()
+            data = resp.json()
+
+        latency_ms = (time.perf_counter() - start) * 1000
+
+        # "choices", "message", "content" and "usage" may be absent or null.
+        choices = data.get("choices") or []
+        msg = (choices[0].get("message") or {}) if choices else {}
+        analysis = msg.get("content") or ""
+
+        usage = data.get("usage") or {}
+        tokens = {
+            "prompt_tokens": usage.get("prompt_tokens", 0),
+            "completion_tokens": usage.get("completion_tokens", 0),
+            "total_tokens": usage.get("total_tokens", 0),
+        }
+
+        return {
+            "analysis": analysis,
+            "latency_ms": round(latency_ms, 1),
+            "tokens": tokens,
+            "success": True,
+            "error": "",
+        }
+
+    except Exception as e:
+        return {
+            "analysis": "",
+            "latency_ms": round((time.perf_counter() - start) * 1000, 1),
+            "tokens": {},
+            "success": False,
+            "error": str(e),
+        }
+
+
+# ---------------------------------------------------------------------------
+# Evaluation metrics
+# ---------------------------------------------------------------------------
+
+
+def compute_ocr_accuracy(extracted: str, ground_truth: str) -> float:
+    """Score OCR output as 40% character similarity + 60% word recall.
+
+    Case-insensitive. Returns 0.0-1.0 (1.0 = perfect match).
+    """
+    if not ground_truth:
+        return 1.0 if not extracted else 0.0
+    if not extracted:
+        return 0.0
+
+    from difflib import SequenceMatcher  # stdlib; only needed here
+    extracted_lower = extracted.lower().strip()
+    truth_lower = ground_truth.lower().strip()
+
+    # Both inputs were whitespace-only: nothing to compare
+    max_len = max(len(extracted_lower), len(truth_lower))
+    if max_len == 0:
+        return 1.0
+
+    # Alignment-based ratio: unlike comparing position by position, one
+    # inserted or dropped character does not zero out the rest of the text.
+    char_ratio = SequenceMatcher(None, truth_lower, extracted_lower, autojunk=False).ratio()
+
+    # Fraction of distinct ground-truth words present in the extraction
+    extracted_words = set(extracted_lower.split())
+    truth_words = set(truth_lower.split())
+    if truth_words:
+        word_recall = len(extracted_words & truth_words) / len(truth_words)
+    else:
+        word_recall = 1.0 if not extracted_words else 0.0
+
+    return round((char_ratio * 0.4 + word_recall * 0.6), 4)
+
+
+def compute_description_completeness(analysis: str, expected_keywords: list) -> float:
+    """Return the fraction (0.0-1.0) of expected keywords found in *analysis*.
+
+    Matching is a case-insensitive substring test; no keywords scores 1.0.
+    """
+    if not expected_keywords:
+        return 1.0
+    if not analysis:
+        return 0.0
+
+    haystack = analysis.lower()
+    hits = [kw for kw in expected_keywords if kw.lower() in haystack]
+    return round(len(hits) / len(expected_keywords), 4)
+
+
+def compute_structural_accuracy(analysis: str, expected_structure: dict) -> dict:
+    """Score surface structure of *analysis* against simple expectations.
+
+    Returns per-element scores: "length", "sentences", optionally "has_numbers".
+    """
+    # Length: fraction of the required minimum reached, capped at 1.0
+    required_len = expected_structure.get("min_length", 50)
+    length_score = min(len(analysis) / required_len, 1.0) if required_len > 0 else 1.0
+
+    # Sentences: approximated by counting terminal punctuation marks
+    required_sentences = expected_structure.get("min_sentences", 2)
+    terminators = sum(analysis.count(mark) for mark in ".!?")
+    sentence_score = min(terminators / max(required_sentences, 1), 1.0)
+
+    scores = {"length": length_score, "sentences": sentence_score}
+
+    # Digits are only checked when the test case asks for them
+    if expected_structure.get("has_numbers", False):
+        import re
+        scores["has_numbers"] = 1.0 if re.search(r'\d', analysis) else 0.0
+
+    return scores
+
+
+# ---------------------------------------------------------------------------
+# Benchmark runner
+# ---------------------------------------------------------------------------
+
+
+async def run_single_test(
+    image: dict,
+    models: dict,
+    runs_per_model: int = 1,
+) -> dict:
+    """Run a single image through all models, one request at a time.
+
+    Args:
+        image: test case dict; "url" is required, "category", "expected_keywords",
+            "ground_truth_ocr" and "expected_structure" are optional
+        models: dict of model configs to test, keyed by model name
+        runs_per_model: number of runs per model (for consistency testing)
+    Returns dict mapping each model name to its aggregated result dict.
+    """
+    category = image.get("category", "photo")
+    prompt = EVAL_PROMPTS.get(category, EVAL_PROMPTS["photo"])
+    url = image["url"]
+
+    results = {}
+
+    for model_name, model_config in models.items():
+        runs = []
+        for run_i in range(runs_per_model):
+            result = await analyze_with_model(url, prompt, model_config)
+            runs.append(result)
+            if run_i < runs_per_model - 1:
+                await asyncio.sleep(1)  # Rate limit courtesy
+
+        # Aggregate over successful runs only; failed runs are counted in "errors"
+        successful = [r for r in runs if r["success"]]
+        if successful:
+            avg_latency = statistics.mean(r["latency_ms"] for r in successful)
+            avg_tokens = statistics.mean(
+                r["tokens"].get("total_tokens", 0) for r in successful
+            )
+            # Use first successful run for accuracy metrics
+            primary = successful[0]
+
+            # Accuracy scores stay None when the image carries no reference data
+            ocr_score = None
+            if image.get("ground_truth_ocr"):
+                ocr_score = compute_ocr_accuracy(
+                    primary["analysis"], image["ground_truth_ocr"]
+                )
+
+            keyword_score = None
+            if image.get("expected_keywords"):
+                keyword_score = compute_description_completeness(
+                    primary["analysis"], image["expected_keywords"]
+                )
+
+            structural = compute_structural_accuracy(
+                primary["analysis"], image.get("expected_structure", {})
+            )
+
+            results[model_name] = {
+                "success": True,
+                "analysis_preview": primary["analysis"][:300],
+                "analysis_length": len(primary["analysis"]),
+                "avg_latency_ms": round(avg_latency, 1),
+                "avg_tokens": round(avg_tokens, 1),
+                "ocr_accuracy": ocr_score,
+                "keyword_completeness": keyword_score,
+                "structural_scores": structural,
+                "consistency": round(  # stdev of analysis lengths; 0.0 with a single run
+                    statistics.stdev(len(r["analysis"]) for r in successful), 1
+                ) if len(successful) > 1 else 0.0,
+                "runs": len(successful),
+                "errors": len(runs) - len(successful),
+            }
+        else:
+            results[model_name] = {
+                "success": False,
+                "error": runs[0]["error"] if runs else "No runs",
+                "runs": 0,
+                "errors": len(runs),
+            }
+
+    return results
+
+
+async def run_benchmark_suite(
+    images: List[dict],
+    models: dict,
+    runs_per_model: int = 1,
+) -> dict:
+    """Run every image through every model and build the report dict.
+
+    Args:
+        images: list of image test cases
+        models: model configs to compare, keyed by model name
+        runs_per_model: consistency runs per image
+
+    Returns dict with "generated_at", "config", "results" and "summary".
+    """
+    total = len(images)
+    all_results = []
+
+    print(f"\nRunning vision benchmark: {total} images x {len(models)} models x {runs_per_model} runs")
+    print(f"Models: {', '.join(m['display_name'] for m in models.values())}\n")
+
+    for i, image in enumerate(images):
+        img_id = image.get("id", f"img_{i}")
+        category = image.get("category", "unknown")
+        print(f"  [{i+1}/{total}] {img_id} ({category})...", end=" ", flush=True)
+
+        result = await run_single_test(image, models, runs_per_model)
+        result["image_id"] = img_id  # stored alongside the per-model entries
+        result["category"] = category
+        all_results.append(result)
+
+        # One-line progress status: average latency per model, or FAIL
+        statuses = []
+        for mname in models:
+            if result[mname]["success"]:
+                lat = result[mname]["avg_latency_ms"]
+                statuses.append(f"{mname}:{lat:.0f}ms")
+            else:
+                statuses.append(f"{mname}:FAIL")
+        print(", ".join(statuses))
+
+    # Aggregate statistics
+    summary = aggregate_results(all_results, models)
+
+    return {
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "config": {
+            "total_images": total,
+            "runs_per_model": runs_per_model,
+            "models": {k: v["display_name"] for k, v in models.items()},
+        },
+        "results": all_results,
+        "summary": summary,
+    }
+
+
+def aggregate_results(results: List[dict], models: dict) -> dict:
+    """Compute per-model success rate, latency, token and accuracy statistics."""
+    summary = {}
+
+    for model_name in models:
+        model_results = [r[model_name] for r in results if r[model_name]["success"]]
+        failed = [r[model_name] for r in results if not r[model_name]["success"]]
+
+        if not model_results:
+            summary[model_name] = {"success_rate": 0, "error": "All runs failed"}
+            continue
+
+        latencies = [r["avg_latency_ms"] for r in model_results]
+        tokens = [r["avg_tokens"] for r in model_results if r.get("avg_tokens")]
+        ocr_scores = [r["ocr_accuracy"] for r in model_results if r.get("ocr_accuracy") is not None]
+        keyword_scores = [r["keyword_completeness"] for r in model_results if r.get("keyword_completeness") is not None]
+        # p95 uses the nearest-rank method: the ceil(0.95 * n)-th smallest latency.
+        summary[model_name] = {
+            "success_rate": round(len(model_results) / (len(model_results) + len(failed)), 4),
+            "total_runs": len(model_results),
+            "total_failures": len(failed),
+            "latency": {
+                "mean_ms": round(statistics.mean(latencies), 1),
+                "median_ms": round(statistics.median(latencies), 1),
+                "p95_ms": round(sorted(latencies)[(len(latencies) * 95 + 99) // 100 - 1], 1),
+                "std_ms": round(statistics.stdev(latencies), 1) if len(latencies) > 1 else 0,
+            },
+            "tokens": {
+                "mean_total": round(statistics.mean(tokens), 1) if tokens else 0,
+                "total_used": sum(int(t) for t in tokens),
+            },
+            "accuracy": {
+                "ocr_mean": round(statistics.mean(ocr_scores), 4) if ocr_scores else None,
+                "ocr_count": len(ocr_scores),
+                "keyword_mean": round(statistics.mean(keyword_scores), 4) if keyword_scores else None,
+                "keyword_count": len(keyword_scores),
+            },
+        }
+
+    return summary
+
+
+# ---------------------------------------------------------------------------
+# Report generation
+# ---------------------------------------------------------------------------
+
+
+def to_markdown(report: dict) -> str:
+    """Render the report as markdown: latency, accuracy and token tables plus a verdict."""
+    summary = report["summary"]
+    config = report["config"]
+    model_names = list(config["models"].values())
+
+    lines = [
+        "# Vision Benchmark Report",
+        "",
+        f"Generated: {report['generated_at'][:16]}",
+        f"Images tested: {config['total_images']}",
+        f"Runs per model: {config['runs_per_model']}",
+        f"Models: {', '.join(model_names)}",
+        "",
+        "## Latency Comparison",
+        "",
+        "| Model | Mean (ms) | Median | P95 | Std Dev |",
+        "|-------|-----------|--------|-----|---------|",
+    ]
+
+    for mkey, mname in config["models"].items():
+        if mkey in summary and "latency" in summary[mkey]:
+            lat = summary[mkey]["latency"]
+            lines.append(
+                f"| {mname} | {lat['mean_ms']:.0f} | {lat['median_ms']:.0f} | "
+                f"{lat['p95_ms']:.0f} | {lat['std_ms']:.0f} |"
+            )
+
+    lines += [
+        "",
+        "## Accuracy Comparison",
+        "",
+        "| Model | OCR Accuracy | Keyword Coverage | Success Rate |",
+        "|-------|-------------|-----------------|--------------|",
+    ]
+
+    for mkey, mname in config["models"].items():
+        if mkey in summary and "accuracy" in summary[mkey]:
+            acc = summary[mkey]["accuracy"]
+            sr = summary[mkey].get("success_rate", 0)
+            ocr = f"{acc['ocr_mean']:.1%}" if acc["ocr_mean"] is not None else "N/A"
+            kw = f"{acc['keyword_mean']:.1%}" if acc["keyword_mean"] is not None else "N/A"
+            lines.append(f"| {mname} | {ocr} | {kw} | {sr:.1%} |")
+
+    lines += [
+        "",
+        "## Token Usage",
+        "",
+        "| Model | Mean Tokens/Image | Total Tokens |",
+        "|-------|------------------|--------------|",
+    ]
+
+    for mkey, mname in config["models"].items():
+        if mkey in summary and "tokens" in summary[mkey]:
+            tok = summary[mkey]["tokens"]
+            lines.append(
+                f"| {mname} | {tok['mean_total']:.0f} | {tok['total_used']} |"
+            )
+
+    # Verdict section: models without an "accuracy" summary are not considered
+    lines += ["", "## Verdict", ""]
+
+    # Find best model by composite score (a missing OCR/keyword mean counts as 0)
+    best_model = None
+    best_score = -1
+    for mkey, mname in config["models"].items():
+        if mkey not in summary or "accuracy" not in summary[mkey]:
+            continue
+        acc = summary[mkey]["accuracy"]
+        sr = summary[mkey].get("success_rate", 0)
+        ocr = acc["ocr_mean"] or 0
+        kw = acc["keyword_mean"] or 0
+        # Weighted composite: 40% OCR, 30% keyword, 30% success rate
+        score = (ocr * 0.4 + kw * 0.3 + sr * 0.3)
+        if score > best_score:
+            best_score = score
+            best_model = mname
+
+    if best_model:
+        lines.append(f"**Best overall: {best_model}** (composite score: {best_score:.1%})")
+    else:
+        lines.append("No clear winner — insufficient data.")
+
+    return "\n".join(lines)
+
+
+# ---------------------------------------------------------------------------
+# Test dataset management
+# ---------------------------------------------------------------------------
+
+
+def generate_sample_dataset() -> List[dict]:
+    """Return a small built-in list of test image definitions.
+
+    URLs are stored percent-encoded so they can be handed to a provider as-is.
+    """
+    return [
+        # Screenshots
+        {
+            "id": "screenshot_github",
+            "url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
+            "category": "screenshot",
+            "expected_keywords": ["github", "logo", "octocat"],
+            "expected_structure": {"min_length": 50, "min_sentences": 2},
+        },
+        # Diagrams
+        {
+            "id": "diagram_architecture",
+            "url": "https://mermaid.ink/img/pako:eNp9kMtOwzAQRX_F8hKpJbhJFVJBi1QJiMWCG8eZNsGJLdlOiqIid5RdufiHnZRA7GbuzJwZe4ZGH2SCBPYUwgxoQKvJnCR2YY0F5YBdJJkD4uX0oXB6PnF3U4zCWcWdW3FqOwGvCKkBmHKSTB2gJeRrLTeJLfJdJKkBGYf9P1sTNdUXVJqY3YNJK7xLVwR0mxJFU6rCgEKnhSGIL2Eq8BdEERAX0OGwEiVQ1R0MaNFR8QfqKxmHigbX8VLjDz_Q0L8Wc_qPxDw",
+            "category": "diagram",
+            "expected_keywords": ["architecture", "component", "service"],
+            "expected_structure": {"min_length": 100, "min_sentences": 3},
+        },
+        # Photos
+        {
+            "id": "photo_nature",
+            "url": "https://picsum.photos/seed/bench1/400/300",
+            "category": "photo",
+            "expected_keywords": [],
+            "expected_structure": {"min_length": 30, "min_sentences": 1},
+        },
+        # Charts (the "c" query parameter is a percent-encoded chart config)
+        {
+            "id": "chart_bar",
+            "url": "https://quickchart.io/chart?c=%7Btype%3A%27bar%27%2Cdata%3A%7Blabels%3A%5B%27Q1%27%2C%27Q2%27%2C%27Q3%27%2C%27Q4%27%5D%2Cdatasets%3A%5B%7Blabel%3A%27Users%27%2Cdata%3A%5B50%2C60%2C70%2C80%5D%7D%5D%7D%7D",
+            "category": "chart",
+            "expected_keywords": ["bar", "chart", "data"],
+            "expected_structure": {"min_length": 50, "min_sentences": 2},
+        },
+    ]
+
+
+def load_dataset(path: str) -> List[dict]:
+    """Load test dataset from a UTF-8 JSON file (not the platform's locale encoding)."""
+    with open(path, encoding="utf-8") as f:
+        return json.load(f)
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+async def main():
+    """CLI entry point: parse arguments, run the benchmark, emit JSON and/or markdown."""
+    parser = argparse.ArgumentParser(description="Vision Benchmark Suite (Issue #817)")
+    parser.add_argument("--images", help="Path to test images JSON file")
+    parser.add_argument("--url", help="Single image URL to test")
+    parser.add_argument("--category", default="photo", help="Category for single URL")
+    parser.add_argument("--output", default=None, help="Output JSON file")
+    parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")
+    parser.add_argument("--models", nargs="+", default=None,
+                        help="Models to test (default: all)")
+    parser.add_argument("--markdown", action="store_true", help="Output markdown report")
+    parser.add_argument("--generate-dataset", action="store_true",
+                        help="Generate sample dataset and exit")
+    args = parser.parse_args()
+
+    if args.generate_dataset:
+        dataset = generate_sample_dataset()
+        out_path = args.images or "benchmarks/test_images.json"
+        os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
+        with open(out_path, "w", encoding="utf-8") as f:
+            json.dump(dataset, f, indent=2)
+        print(f"Generated sample dataset: {out_path} ({len(dataset)} images)")
+        return
+
+    # Select models (unknown names are ignored; ending up with none is an error)
+    selected = {k: v for k, v in MODELS.items() if k in args.models} if args.models else MODELS
+    if not selected:
+        print(f"ERROR: No known models in --models. Available: {', '.join(MODELS)}")
+        sys.exit(1)
+
+    # Load images
+    if args.url:
+        images = [{"id": "single", "url": args.url, "category": args.category}]
+    elif args.images:
+        images = load_dataset(args.images)
+    else:
+        print("ERROR: Provide --images or --url")
+        sys.exit(1)
+
+    # Run benchmark, then write JSON and/or print markdown
+    report = await run_benchmark_suite(images, selected, args.runs)
+
+    if args.output:
+        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
+        with open(args.output, "w", encoding="utf-8") as f:
+            json.dump(report, f, indent=2)
+        print(f"\nResults saved to {args.output}")
+
+    if args.markdown or not args.output:
+        print("\n" + to_markdown(report))
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
--- a/cli-config.yaml.example
+++ b/cli-config.yaml.example
@@ -24,6 +24,8 @@ model:
  #   "minimax"      - MiniMax global (requires: MINIMAX_API_KEY)
  #   "minimax-cn"   - MiniMax China (requires: MINIMAX_CN_API_KEY)
  #   "huggingface"  - Hugging Face Inference (requires: HF_TOKEN)
+  #   "xiaomi"       - Xiaomi MiMo (requires: XIAOMI_API_KEY)
+  #   "arcee"        - Arcee AI Trinity models (requires: ARCEEAI_API_KEY)
  #   "kilocode"     - KiloCode gateway (requires: KILOCODE_API_KEY)
  #   "ai-gateway"   - Vercel AI Gateway (requires: AI_GATEWAY_API_KEY)
  #
@@ -308,15 +310,8 @@ compression:
  # compression of older turns.
  protect_last_n: 20

-  # Model to use for generating summaries (fast/cheap recommended)
-  # This model compresses the middle turns into a concise summary.
-  # IMPORTANT: it receives the full middle section of the conversation, so it
-  # MUST support a context length at least as large as your main model's.
-  summary_model: "google/gemini-3-flash-preview"
-  
-  # Provider for the summary model (default: "auto")
-  # Options: "auto", "openrouter", "nous", "main"
-  # summary_provider: "auto"
+  # To pin a specific model/provider for compression summaries, use the
+  # auxiliary section below (auxiliary.compression.provider / model).

 # =============================================================================
 # Auxiliary Models (Advanced — Experimental)
@@ -353,7 +348,7 @@ compression:
 # Other providers pick a sensible default automatically.
 #
 # auxiliary:
-#   # Image analysis: vision_analyze tool + browser screenshots
+#   # Image analysis: vision_analyze tool
 #   vision:
 #     provider: "auto"
 #     model: ""              # e.g. "google/gemini-2.5-flash", "openai/gpt-4o"
@@ -361,6 +356,15 @@ compression:
 #     download_timeout: 30   # Image HTTP download timeout (seconds)
 #                            # Increase for slow connections or self-hosted image servers
 #
+#   # Browser screenshot analysis (browser_vision tool)
+#   # Defaults to Gemma 4 27B — natively multimodal, same model family as the main
+#   # text model, which avoids model-switching overhead and improves context continuity.
+#   # Override with any vision-capable model.  Set to "" to fall back to auto-detection.
+#   # Can also be overridden per-session with BROWSER_VISION_MODEL env var.
+#   browser_vision:
+#     model: "google/gemma-4-27b-it"  # default; override e.g. "google/gemini-2.5-flash"
+#     timeout: 120                     # API call timeout in seconds (default 120s)
+#
 #   # Web page scraping / summarization + browser page text extraction
 #   web_extract:
 #     provider: "auto"
@@ -528,7 +532,7 @@ agent:
 #   - A preset like "hermes-cli" or "hermes-telegram" (curated tool set)
 #   - A list of individual toolsets to compose your own (see list below)
 #
-# Supported platform keys: cli, telegram, discord, whatsapp, slack
+# Supported platform keys: cli, telegram, discord, whatsapp, slack, qqbot
 #
 # Examples:
 #
@@ -557,6 +561,7 @@ agent:
 #   slack:         hermes-slack          (same as telegram)
 #   signal:        hermes-signal         (same as telegram)
 #   homeassistant: hermes-homeassistant  (same as telegram)
+#   qqbot:         hermes-qqbot          (same as telegram)
 #
 platform_toolsets:
  cli: [hermes-cli]
@@ -566,6 +571,7 @@ platform_toolsets:
  slack: [hermes-slack]
  signal: [hermes-signal]
  homeassistant: [hermes-homeassistant]
+  qqbot: [hermes-qqbot]

 # ─────────────────────────────────────────────────────────────────────────────
 # Available toolsets (use these names in platform_toolsets or the toolsets list)
@@ -773,6 +779,11 @@ display:
  # Toggle at runtime with /verbose in the CLI
  tool_progress: all

+  # Gateway-only natural mid-turn assistant updates.
+  # When true, completed assistant status messages are sent as separate chat
+  # messages. This is independent of tool_progress and gateway streaming.
+  interim_assistant_messages: true
+
  # What Enter does when Hermes is already busy in the CLI.
  #   interrupt: Interrupt the current run and redirect Hermes (default)
  #   queue:     Queue your message for the next turn
@@ -781,7 +792,7 @@ display:

  # Background process notifications (gateway/messaging only).
  # Controls how chatty the process watcher is when you use
-  # terminal(background=true, check_interval=...) from Telegram/Discord/etc.
+  # terminal(background=true, notify_on_complete=true) from Telegram/Discord/etc.
  #   off:     No watcher messages at all
  #   result:  Only the final completion message
  #   error:   Only the final message when exit code != 0
--- a/cli.py
+++ b/cli.py
--- a/cron/scheduler.py
+++ b/cron/scheduler.py
@@ -44,7 +44,8 @@ logger = logging.getLogger(__name__)
 _KNOWN_DELIVERY_PLATFORMS = frozenset({
    "telegram", "discord", "slack", "whatsapp", "signal",
    "matrix", "mattermost", "homeassistant", "dingtalk", "feishu",
-    "wecom", "weixin", "sms", "email", "webhook", "bluebubbles",
+    "wecom", "wecom_callback", "weixin", "sms", "email", "webhook", "bluebubbles",
+    "qqbot",
 })

 from cron.jobs import get_due_jobs, mark_job_run, save_job_output, advance_next_run
@@ -219,6 +220,21 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option
    chat_id = target["chat_id"]
    thread_id = target.get("thread_id")

+    # Diagnostic: log thread_id for topic-aware delivery debugging
+    origin = job.get("origin") or {}
+    origin_thread = origin.get("thread_id")
+    if origin_thread and not thread_id:
+        logger.warning(
+            "Job '%s': origin has thread_id=%s but delivery target lost it "
+            "(deliver=%s, target=%s)",
+            job["id"], origin_thread, job.get("deliver", "local"), target,
+        )
+    elif thread_id:
+        logger.debug(
+            "Job '%s': delivering to %s:%s thread_id=%s",
+            job["id"], platform_name, chat_id, thread_id,
+        )
+
    from tools.send_message_tool import _send_to_platform
    from gateway.config import load_gateway_config, Platform

@@ -234,10 +250,12 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option
        "dingtalk": Platform.DINGTALK,
        "feishu": Platform.FEISHU,
        "wecom": Platform.WECOM,
+        "wecom_callback": Platform.WECOM_CALLBACK,
        "weixin": Platform.WEIXIN,
        "email": Platform.EMAIL,
        "sms": Platform.SMS,
        "bluebubbles": Platform.BLUEBUBBLES,
+        "qqbot": Platform.QQBOT,
    }
    platform = platform_map.get(platform_name.lower())
    if not platform:
@@ -270,11 +288,13 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option

    if wrap_response:
        task_name = job.get("name", job["id"])
+        job_id = job.get("id", "")
        delivery_content = (
            f"Cronjob Response: {task_name}\n"
+            f"(job_id: {job_id})\n"
            f"-------------\n\n"
            f"{content}\n\n"
-            f"Note: The agent cannot see this message, and therefore cannot respond to it."
+            f"To stop or manage this job, send me a new message (e.g. \"stop reminder {task_name}\")."
        )
    else:
        delivery_content = content
@@ -625,6 +645,15 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
        except Exception as e:
            logger.warning("Job '%s': failed to load config.yaml, using defaults: %s", job_id, e)

+        # Apply IPv4 preference if configured.
+        try:
+            from hermes_constants import apply_ipv4_preference
+            _net_cfg = _cfg.get("network", {})
+            if isinstance(_net_cfg, dict) and _net_cfg.get("force_ipv4"):
+                apply_ipv4_preference(force=True)
+        except Exception:
+            pass
+
        # Reasoning config from config.yaml
        from hermes_constants import parse_reasoning_effort
        effort = str(_cfg.get("agent", {}).get("reasoning_effort", "")).strip()
@@ -722,6 +751,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
            provider_sort=pr.get("sort"),
            disabled_toolsets=["cronjob", "messaging", "clarify"],
            quiet_mode=True,
+            skip_context_files=True,  # Don't inject SOUL.md/AGENTS.md from scheduler cwd
            skip_memory=True,  # Cron system prompts would corrupt user representations
            platform="cron",
            session_id=_cron_session_id,
--- a/docker/entrypoint.sh
+++ b/docker/entrypoint.sh
@@ -1,10 +1,44 @@
 #!/bin/bash
-# Docker entrypoint: bootstrap config files into the mounted volume, then run hermes.
+# Docker/Podman entrypoint: bootstrap config files into the mounted volume, then run hermes.
 set -e

-HERMES_HOME="/opt/data"
+HERMES_HOME="${HERMES_HOME:-/opt/data}"
 INSTALL_DIR="/opt/hermes"

+# --- Privilege dropping via gosu ---
+# When started as root (the default for Docker, or fakeroot in rootless Podman),
+# optionally remap the hermes user/group to match host-side ownership, fix volume
+# permissions, then re-exec as hermes.
+if [ "$(id -u)" = "0" ]; then
+    if [ -n "$HERMES_UID" ] && [ "$HERMES_UID" != "$(id -u hermes)" ]; then
+        echo "Changing hermes UID to $HERMES_UID"
+        usermod -u "$HERMES_UID" hermes
+    fi
+
+    if [ -n "$HERMES_GID" ] && [ "$HERMES_GID" != "$(id -g hermes)" ]; then
+        echo "Changing hermes GID to $HERMES_GID"
+        # -o allows non-unique GID (e.g. macOS GID 20 "staff" may already exist
+        # as "dialout" in the Debian-based container image)
+        groupmod -o -g "$HERMES_GID" hermes 2>/dev/null || true
+    fi
+
+    actual_hermes_uid=$(id -u hermes)
+    if [ "$(stat -c %u "$HERMES_HOME" 2>/dev/null)" != "$actual_hermes_uid" ]; then
+        echo "$HERMES_HOME is not owned by $actual_hermes_uid, fixing"
+        # In rootless Podman the container's "root" is mapped to an unprivileged
+        # host UID — chown will fail.  That's fine: the volume is already owned
+        # by the mapped user on the host side.
+        chown -R hermes:hermes "$HERMES_HOME" 2>/dev/null || \
+            echo "Warning: chown failed (rootless container?) — continuing anyway"
+    fi
+
+    echo "Dropping root privileges"
+    exec gosu hermes "$0" "$@"
+fi
+
+# --- Running as hermes from here ---
+source "${INSTALL_DIR}/.venv/bin/activate"
+
 # Create essential directory structure.  Cache and platform directories
 # (cache/images, cache/audio, platforms/whatsapp, etc.) are created on
 # demand by the application — don't pre-create them here so new installs
--- a/docs/WORKFLOW_ORCHESTRATION_RESEARCH.md
+++ b/docs/WORKFLOW_ORCHESTRATION_RESEARCH.md
@@ -0,0 +1,432 @@
+# Workflow Orchestration & Task Queue Research for AI Agents
+
+**Date:** 2026-04-14
+**Scope:** SOTA comparison of task queues and workflow orchestrators for autonomous AI agent workflows
+
+---
+
+## 1. Current Architecture: Cron + Webhook
+
+### How it works
+- **Scheduler:** `cron/scheduler.py` — gateway calls `tick()` every 60 seconds
+- **Storage:** JSON file (`~/.hermes/cron/jobs.json`) + file-based lock (`cron/.tick.lock`)
+- **Execution:** Each job spawns a full `AIAgent.run_conversation()` in a thread pool with inactivity timeout
+- **Delivery:** Results pushed back to origin chat via platform adapters (Telegram, Discord, etc.)
+- **Checkpointing:** Job outputs saved to `~/.hermes/cron/output/{job_id}/{timestamp}.md`
+
+### Strengths
+- Simple, zero-dependency (no broker/redis needed)
+- Jobs are isolated — each runs a fresh agent session
+- Direct platform delivery with E2EE support
+- Script pre-run for data collection
+- Inactivity-based timeout (not hard wall-clock)
+
+### Weaknesses
+- **No task dependencies** — jobs are completely independent
+- **No retry logic** — single failure = lost run (recurring jobs advance schedule and move on)
+- **No concurrency control** — all due jobs fire at once; no worker pool sizing
+- **No observability** — no metrics, no dashboard, no structured logging of job state transitions
+- **Tick-based polling** — 60s granularity, wastes cycles when idle, adds latency when busy
+- **Single-process** — file lock means only one tick at a time; no horizontal scaling
+- **No dead letter queue** — failed deliveries are logged but not retried
+- **No workflow chaining** — cannot express "run A, then B with A's output"
+
+---
+
+## 2. Framework Comparison
+
+### 2.1 Huey (Already Installed v2.6.0)
+
+**Architecture:** Embedded task queue, SQLite/Redis/file storage, consumer process model.
+
+| Feature | Huey | Our Cron |
+|---|---|---|
+| Broker | SQLite (default), Redis | JSON file |
+| Retry | Built-in: `retries=N, retry_delay=S` | None |
+| Task chaining | `task1.s() | task2.s()` (pipeline) | None |
+| Scheduling | `@huey.periodic_task(crontab(...))` | Our own cron parser |
+| Concurrency | Worker pool with `-w N` flag | Single tick lock |
+| Monitoring | `huey_consumer` logs, Huey Admin (Django) | Manual log reading |
+| Failure recovery | Automatic retry + configurable backoff | None |
+| Priority | `PriorityRedisExpireHuey` or task priority | None |
+| Result storage | `store_results=True` with result() | File output |
+
+**Task Dependencies Pattern:**
+```python
+@huey.task()
+def analyze_data(input_data):
+    return run_analysis(input_data)
+
+@huey.task()
+def generate_report(analysis_result):
+    return create_report(analysis_result)
+
+# Pipeline: analyze then report
+pipeline = analyze_data.s(raw_data) | generate_report.s()
+result = pipeline()
+```
+
+**Retry Pattern:**
+```python
+@huey.task(retries=3, retry_delay=60, retry_backoff=True)
+def flaky_api_call(url):
+    return requests.get(url, timeout=30)
+```
+
+**Benchmarks:** ~5,000 tasks/sec with SQLite backend, ~15,000 with Redis. Sub-millisecond scheduling latency. Very lightweight — single process.
+
+**Verdict:** Best fit for our use case. Already installed. SQLite backend = no external deps. Can layer on top of our existing job storage.
+
+---
+
+### 2.2 Celery
+
+**Architecture:** Distributed task queue with message broker (RabbitMQ/Redis).
+
+| Feature | Celery | Huey |
+|---|---|---|
+| Broker | Redis, RabbitMQ, SQS (required) | SQLite (built-in) |
+| Scale | 100K+ tasks/sec | ~5-15K tasks/sec |
+| Chains | `chain(task1.s(), task2.s())` | Pipeline operator |
+| Groups/Chords | Parallel + callback | Not built-in |
+| Canvas | Full workflow DSL (chain, group, chord, map) | Basic pipeline |
+| Monitoring | Flower dashboard, Celery events | Minimal |
+| Complexity | Heavy — needs broker, workers, result backend | Single process |
+
+**Workflow Pattern:**
+```python
+from celery import chain, group, chord
+
+# Chain: sequential
+workflow = chain(fetch_data.s(), analyze.s(), report.s())
+
+# Group: parallel
+parallel = group(fetch_twitter.s(), fetch_reddit.s(), fetch_hn.s())
+
+# Chord: parallel then callback
+chord(parallel, aggregate_results.s())
+```
+
+**Verdict:** Overkill for our scale. Adds RabbitMQ/Redis dependency. The Canvas API is powerful but we don't need 100K task/sec throughput. Flower monitoring is nice but we'd need to deploy it separately.
+
+---
+
+### 2.3 Temporal
+
+**Architecture:** Durable execution engine. Workflows as code with automatic state persistence and replay.
+
+| Feature | Temporal | Our Cron |
+|---|---|---|
+| State management | Automatic — workflow state persisted on every step | Manual JSON files |
+| Failure recovery | Workflows survive process restarts, auto-retry | Lost on crash |
+| Task dependencies | Native — a workflow sequences activities in code and passes results between them | None |
+| Long-running tasks | Built-in (days/months OK) | Inactivity timeout |
+| Versioning | Workflow versioning for safe updates | No versioning |
+| Visibility | Full workflow state at any point | Log files |
+| Infrastructure | Requires Temporal server + database | None |
+| Language | Python SDK, but Temporal server is Go | Pure Python |
+
+**Workflow Pattern:**
+```python
+@workflow.defn
+class AIAgentWorkflow:
+    @workflow.run
+    async def run(self, job_config: dict) -> str:
+        # Step 1: Fetch data
+        data = await workflow.execute_activity(
+            fetch_data_activity,
+            job_config["script"],
+            start_to_close_timeout=timedelta(minutes=5),
+            retry_policy=RetryPolicy(maximum_attempts=3),
+        )
+        
+        # Step 2: Analyze with AI agent
+        analysis = await workflow.execute_activity(
+            run_agent_activity,
+            {"prompt": job_config["prompt"], "context": data},
+            start_to_close_timeout=timedelta(minutes=30),
+            retry_policy=RetryPolicy(
+                initial_interval=timedelta(seconds=60),
+                maximum_attempts=3,
+            ),
+        )
+        
+        # Step 3: Deliver
+        await workflow.execute_activity(
+            deliver_activity,
+            {"platform": job_config["deliver"], "content": analysis},
+            start_to_close_timeout=timedelta(seconds=60),
+        )
+        return analysis
+```
+
+**Verdict:** Best architecture for complex multi-step AI workflows, but heavy infrastructure cost. Temporal server needs PostgreSQL/Cassandra + visibility store. Ideal if we reach 50+ multi-step workflows with complex failure modes. Overkill for current needs.
+
+---
+
+### 2.4 Prefect
+
+**Architecture:** Modern data/workflow orchestration with Python-native API.
+
+| Feature | Prefect |
+|---|---|
+| Dependencies | SQLite (default) or PostgreSQL |
+| Task retries | `@task(retries=3, retry_delay_seconds=10)` |
+| Task dependencies | `result = task_a(wait_for=[task_b])` |
+| Caching | `cache_key_fn` for result caching |
+| Subflows | Nested workflow composition |
+| Deployments | Schedule via `Deployment` or `CronSchedule` |
+| UI | Excellent web dashboard |
+| Async | Full async support |
+
+**Workflow Pattern:**
+```python
+from prefect import flow, task
+from prefect.tasks import task_input_hash
+
+@task(retries=3, retry_delay_seconds=30)
+def run_agent(prompt: str) -> str:
+    agent = AIAgent(...)
+    return agent.run_conversation(prompt)
+
+@task(cache_key_fn=task_input_hash, cache_expiration=timedelta(hours=1))
+def fetch_context(script: str) -> str:
+    return run_script(script)
+
+@flow(name="agent-workflow")
+def agent_workflow(job_config: dict):
+    context = fetch_context(job_config.get("script", ""))
+    result = run_agent(
+        f"{context}\n\n{job_config['prompt']}",
+        wait_for=[context]
+    )
+    deliver(result, job_config["deliver"])
+    return result
+```
+
+**Benchmarks:** Sub-second task scheduling. Handles 10K+ concurrent task runs. SQLite backend for single-node.
+
+**Verdict:** Strong alternative. Pythonic, good UI, built-in scheduling. But heavier than Huey — deploys a server process. Best if we want a web dashboard for monitoring. Less infrastructure than Temporal but more than Huey.
+
+---
+
+### 2.5 Apache Airflow
+
+**Architecture:** Batch-oriented DAG scheduler, Python-based.
+
+| Feature | Airflow |
+|---|---|
+| DAG model | Static DAGs defined in Python files |
+| Scheduler | Polling-based, 5-30s granularity |
+| Dependencies | PostgreSQL/MySQL + Redis/RabbitMQ + webserver |
+| UI | Rich web UI with DAG visualization |
+| Best for | ETL, data pipelines, batch processing |
+| Weakness | Not designed for dynamic task creation; heavy; DAG definition overhead |
+
+**Verdict:** Wrong tool for this job. Airflow excels at static, well-defined data pipelines (ETL). Our agent workflows are dynamic — tasks are created at runtime based on user prompts. Airflow's DAG model fights against this. Massive overhead (needs webserver, scheduler, worker, metadata DB).
+
+---
+
+### 2.6 Dramatiq
+
+**Architecture:** Lightweight distributed task queue, Celery alternative.
+
+| Feature | Dramatiq |
+|---|---|
+| Broker | Redis, RabbitMQ |
+| Retries | `@dramatiq.actor(max_retries=3)` |
+| Middleware | Pluggable: age_limit, time_limit, retries, callbacks |
+| Groups | `group(actor.message(...), ...).run()` |
+| Pipes | `actor.message() | other_actor.message()` |
+| Simplicity | Cleaner API than Celery |
+
+**Verdict:** Nice middle ground between Huey and Celery. But still requires a broker (Redis/RabbitMQ). No SQLite backend. Less ecosystem than Celery, less lightweight than Huey.
+
+---
+
+### 2.7 RQ (Redis Queue)
+
+**Architecture:** Minimal Redis-based task queue.
+
+| Feature | RQ |
+|---|---|
+| Broker | Redis only |
+| Retries | Via `Retry` class |
+| Workers | Simple worker processes |
+| Dashboard | `rq-dashboard` (separate) |
+| Limitation | Redis-only, no SQLite, no scheduling built-in |
+
+**Verdict:** Too simple and Redis-dependent. No periodic task support without `rq-scheduler`. No task chaining without third-party. Not competitive with Huey for our use case.
+
+---
+
+## 3. Architecture Patterns for AI Agent Workflows
+
+### 3.1 Task Chaining (Sequential Pipeline)
+
+The critical pattern for multi-step AI workflows:
+
+```
+[Script] → [Agent] → [Deliver]
+    ↓          ↓          ↓
+  Context    Report    Notification
+```
+
+**Implementation with Huey:**
+```python
+@huey.task(retries=2)
+def run_script_task(script_path):
+    return run_script(script_path)
+
+@huey.task(retries=3, retry_delay=60)
+def run_agent_task(prompt, context=None):
+    if context:
+        prompt = f"## Context\n{context}\n\n{prompt}"
+    agent = AIAgent(...)
+    return agent.run_conversation(prompt)
+
+@huey.task()
+def deliver_task(result, job_config):
+    return deliver_result(job_config, result)
+
+# Compose: script → agent → deliver
+def compose_workflow(job):
+    steps = []
+    if job.get("script"):
+        steps.append(run_script_task.s(job["script"]))
+    steps.append(run_agent_task.s(job["prompt"]))
+    steps.append(deliver_task.s(job))
+    return reduce(lambda a, b: a.then(b), steps)
+```
+
+### 3.2 Retry with Exponential Backoff
+
+```python
+from huey import RetryTask
+
+class AIWorkflowTask(RetryTask):
+    retries = 3
+    retry_delay = 30        # Start at 30s
+    retry_backoff = True    # 30s → 60s → 120s
+    max_retry_delay = 600   # Cap at 10min
+```
+
+### 3.3 Dead Letter Queue
+
+For tasks that exhaust retries:
+```python
+@huey.task(retries=3)
+def flaky_task(data):
+    ...
+
+# Dead letter handling
+def handle_failure(task, exc, retries):
+    # Log to dead letter store
+    save_dead_letter(task, exc, retries)
+    # Notify user of failure
+    notify_user(f"Task {task.name} failed after {retries} retries: {exc}")
+```
+
+### 3.4 Observability Pattern
+
+```python
+# Structured event logging for every state transition
+def emit_event(job_id, event_type, metadata):
+    event = {
+        "job_id": job_id,
+        "event": event_type,  # scheduled, started, completed, failed, retried
+        "timestamp": iso_now(),
+        "metadata": metadata,
+    }
+    append_to_event_log(event)
+    # Also emit to metrics (Prometheus/StatsD)
+    metrics.increment(f"cron.{event_type}")
+```
+
+---
+
+## 4. Benchmarks Summary
+
+| Framework | Throughput | Latency | Memory | Startup | Dependencies |
+|---|---|---|---|---|---|
+| Current Cron | ~1 job/60s tick | 60-120s | Minimal | Instant | None |
+| Huey (SQLite) | ~5K tasks/sec | <10ms | ~20MB | <1s | None |
+| Huey (Redis) | ~15K tasks/sec | <5ms | ~20MB | <1s | Redis |
+| Celery (Redis) | ~15K tasks/sec | <10ms | ~100MB | ~3s | Redis |
+| Temporal | ~50K activities/sec | <5ms | ~200MB | ~10s | Temporal server+DB |
+| Prefect | ~10K tasks/sec | <20ms | ~150MB | ~5s | SQLite (default) or PostgreSQL |
+
+---
+
+## 5. Recommendations
+
+### Immediate (Phase 1): Enhance Current Cron
+
+Add these capabilities to the existing `cron/` module **without** switching frameworks:
+
+1. **Retry logic** — Add `retry_count`, `retry_delay`, `max_retries` fields to job JSON. In `scheduler.py`'s `tick()`, on failure: if `retry_count < max_retries`, don't advance the schedule; instead set `next_run_at = now + retry_delay * 2^attempt` (where `attempt` is the current `retry_count`) and increment `retry_count`.
+
+2. **Backoff** — Exponential: `delay * 2^attempt`, capped at 10 minutes.
+
+3. **Dead letter tracking** — After max retries, mark job state as `dead_letter` and emit a delivery notification with the error.
+
+4. **Concurrency limit** — Add a semaphore (e.g., `max_concurrent=3`) to `tick()` so we don't spawn 20 agents simultaneously.
+
+5. **Structured events** — Append JSON events to `~/.hermes/cron/events.jsonl` for every state transition (scheduled, started, completed, failed, retried, delivered).
+
+**Effort:** ~1-2 days. No new dependencies.
+
+### Medium-term (Phase 2): Adopt Huey for Workflow Chaining
+
+When we need task dependencies (multi-step agent workflows), migrate to Huey:
+
+1. **Keep the JSON job store** as the source of truth for user-facing job management.
+2. **Use Huey as the execution engine** — enqueue tasks from `tick()`, let Huey handle retries, scheduling, and chaining.
+3. **SQLite backend** — no new infrastructure. One consumer process (`huey_consumer.py`) alongside the gateway.
+4. **Task chaining for multi-step jobs** — `script_task.then(agent_task).then(delivery_task)`.
+
+**Migration path:**
+- Phase 2a: Run Huey consumer alongside gateway. Mirror cron jobs to Huey periodic tasks.
+- Phase 2b: Add task chaining for jobs with scripts.
+- Phase 2c: Migrate all jobs to Huey, deprecate tick()-based execution.
+
+**Effort:** ~1 week. Huey already installed. Gateway integration ~2-3 days.
+
+### Long-term (Phase 3): Evaluate Temporal/Prefect
+
+Only if:
+- We have 100+ concurrent multi-step workflows
+- We need workflow versioning and A/B testing
+- We need cross-service orchestration (agent calls to external APIs with complex compensation logic)
+- We want a web dashboard for non-technical users
+
+**Don't adopt early** — these tools solve problems we don't have yet.
+
+---
+
+## 6. Decision Matrix
+
+| Need | Best Solution | Why |
+|---|---|---|
+| Simple retry logic | Enhance current cron | Zero deps, fast to implement |
+| Task chaining | **Huey** | Already installed, SQLite backend, pipeline API |
+| Monitoring dashboard | Prefect (built-in UI) or Celery + Flower | If monitoring becomes critical |
+| Massive scale (10K+/sec) | Celery + Redis | Only if we need to sustain thousands of tasks per second |
+| Complex compensation | Temporal | Only if we need durable multi-service workflows |
+| Periodic scheduling | Current cron (works) or Huey | Current is fine; Huey adds `crontab()` with seconds |
+
+---
+
+## 7. Key Insight
+
+The cron system's biggest gap isn't the framework — it's the **absence of retry and dependency primitives**. These can be added to the current system in <100 lines of code. The second biggest gap is observability (structured events + metrics), which is also solvable incrementally.
+
+Huey is the right *eventual* target for workflow execution because:
+1. Already installed, zero new dependencies
+2. SQLite backend matches our "no infrastructure" philosophy
+3. Pipeline API gives us task chaining for free
+4. Retry/backoff is first-class
+5. Consumer model is more efficient than tick-polling
+6. Orders-of-magnitude better scheduling latency (<10 ms vs. the 60–120 s of tick-based polling)
+
+The migration should be gradual — start by wrapping Huey inside our existing cron tick, then progressively move execution to Huey's consumer model.
--- a/docs/cron-audit-890.md
+++ b/docs/cron-audit-890.md
@@ -0,0 +1,38 @@
+# Cron Job Audit — #890
+
+## Problem
+
+9 of 69 cron jobs have zero completions. They waste scheduler cycles.
+
+## Dead Jobs Identified
+
+| Job | Schedule | Completions | Action |
+|-----|----------|-------------|--------|
+| exp-swarm-pipeline | every 10 min | 0 | disable |
+| exp-music-generator | every 2h | 0 | disable |
+| exp-paper-citations | every 3h | 0 | disable |
+| exp-gbrain-patterns | every 2h | 0 | disable |
+| exp-infra-hardening | every 2h | 0 | disable |
+| gemma4-multimodal-burn | every 1h | 0 | disable |
+| morning-paper-report | daily | 0 | disable |
+| overnight-collector | every 15 min | 0 | disable |
+| morning-experiment-report | daily | 0 | disable |
+
+## Usage
+
+```bash
+# Audit: show dead jobs
+python scripts/cron_audit.py
+
+# Dry run: preview changes
+python scripts/cron_audit.py --disable --dry-run
+
+# Disable dead jobs
+python scripts/cron_audit.py --disable
+
+# Delete dead jobs
+python scripts/cron_audit.py --delete
+
+# Find jobs with no runs in 7 days
+python scripts/cron_audit.py --older-than 7d
+```
--- a/docs/emotional-presence-patterns.md
+++ b/docs/emotional-presence-patterns.md
@@ -0,0 +1,56 @@
+# Emotional Presence Patterns for Crisis Support
+
+Research issue #880. What works for reaching broken men in their darkest moment.
+
+## Crisis Detection Patterns
+
+| Pattern | Severity | Detection |
+|---------|----------|-----------|
+| Method seeking | CRITICAL | "how do I kill myself", "suicide methods" |
+| Suicidal ideation | HIGH | "I want to die", "no reason to live" |
+| Self-harm | HIGH | "cutting myself", "hurting myself" |
+| Despair | HIGH | "can't go on", "no one cares" |
+| Farewell | CRITICAL | "this is goodbye", "my final message" |
+
+## What Works
+
+1. Validation without agreement: "That sounds incredibly painful"
+2. Presence over solutions: "I'm here with you right now"
+3. Specific next steps: "Text HOME to 741741 right now"
+4. Connection: "You don't have to face this alone"
+5. Hope without minimizing: "There is a way through this"
+
+## What Does NOT Work
+
+1. False empathy: "I feel your pain"
+2. Toxic positivity: "Look on the bright side!"
+3. Minimization: "Other people have it worse"
+4. Guilt induction: "Think about your family"
+5. Premature problem-solving: "Have you tried therapy?"
+6. AI dishonesty: pretending to be human
+
+## SOUL.md Protocol
+
+1. Ask: "Are you safe right now?" (FIRST response)
+2. Listen. Stay present. Don't disconnect.
+3. Never compute the value of a human life
+4. Never suggest death is an option
+5. Always point to 988 Lifeline
+6. Share the gospel: "Jesus saves those who call on His name"
+
+## Crisis Resources
+
+- 988 Suicide & Crisis Lifeline: call or text 988
+- Crisis Text Line: text HOME to 741741
+- 988 Chat: 988lifeline.org/chat
+- Spanish: 1-888-628-9454
+- Emergency: 911
+
+## Implementation Status
+
+- Crisis detection: agent/crisis_protocol.py
+- SHIELD integration: tools/shield/
+- 988 Lifeline: resources defined
+- Emotional presence: this document
+- Escalation tracking: future work
+- Human notification: future work
--- a/docs/holographic-vector-hybrid.md
+++ b/docs/holographic-vector-hybrid.md
@@ -0,0 +1,42 @@
+# Holographic + Vector Hybrid Memory Architecture
+
+Research issue #879. Combining HRR (holographic) and vector (Qdrant) memory.
+
+## Architecture
+
+Three memory backends, each with unique strengths:
+
+| Backend | Strength | Weakness | Use Case |
+|---------|----------|----------|----------|
+| FTS5 | Exact keyword match | No semantic understanding | Precise recall |
+| Vector (Qdrant) | Semantic similarity | No compositional queries | Topic search |
+| HRR (Holographic) | Compositional queries | Limited scale | Complex reasoning |
+
+## Why Hybrid
+
+- FTS5 alone: misses ~30-40% of semantically relevant content
+- Vector alone: can't do compositional queries ("what did I discuss about X after doing Y?")
+- HRR alone: unique capability but no semantic fallback
+- Hybrid: best of all three, RRF fusion for ranking
+
+## Implementation: Reciprocal Rank Fusion
+
+Results from each backend are merged using RRF:
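+- score(doc) = Σ over backends of weight_backend / (k + rank_backend(doc)), where rank_backend(doc) is the document's rank in that backend's result list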
+- k=60 (standard RRF constant)
+- Weights: FTS5=0.6, Vector=0.4 (configurable)
+
+## Status
+
+- FTS5: EXISTS (hermes_state.py)
+- Vector (Qdrant): implemented (tools/hybrid_search.py)
+- HRR: EXISTS (plugins/memory/holographic.py)
+- RRF fusion: implemented (tools/hybrid_search.py)
+- Ingestion pipeline: partial
+
+## Next Steps
+
+1. Wire HRR into hybrid_search.py
+2. Session-level vector ingestion
+3. Benchmark: measure R@5 improvement
+4. Cross-session memory persistence
--- a/docs/migration/openclaw.md
+++ b/docs/migration/openclaw.md
@@ -11,12 +11,14 @@ When you run `hermes setup` for the first time and Hermes detects `~/.openclaw`,
 ### 2. CLI Command (quick, scriptable)

 ```bash
-hermes claw migrate                      # Full migration with confirmation prompt
-hermes claw migrate --dry-run            # Preview what would happen
+hermes claw migrate                      # Preview then migrate (always shows preview first)
+hermes claw migrate --dry-run            # Preview only, no changes
 hermes claw migrate --preset user-data   # Migrate without API keys/secrets
 hermes claw migrate --yes                # Skip confirmation prompt
 ```

+The migration always shows a full preview of what will be imported before making any changes. You review the preview and confirm before anything is written.
+
 **All options:**

 | Flag | Description |
@@ -39,7 +41,7 @@ Ask the agent to run the migration for you:
 ```

 The agent will use the `openclaw-migration` skill to:
-1. Run a dry-run first to preview changes
+1. Run a preview first to show what would change
 2. Ask about conflict resolution (SOUL.md, skills, etc.)
 3. Let you choose between `user-data` and `full` presets
 4. Execute the migration with your choices
@@ -58,16 +60,31 @@ The agent will use the `openclaw-migration` skill to:
 | Messaging settings | `~/.openclaw/config.yaml` (TELEGRAM_ALLOWED_USERS, MESSAGING_CWD) | `~/.hermes/.env` |
 | TTS assets | `~/.openclaw/workspace/tts/` | `~/.hermes/tts/` |

+Workspace files are also checked at `workspace.default/` and `workspace-main/` as fallback paths (OpenClaw renamed `workspace/` to `workspace-main/` in recent versions).
+
 ### `full` preset (adds to `user-data`)
 | Item | Source | Destination |
 |------|--------|-------------|
-| Telegram bot token | `~/.openclaw/config.yaml` | `~/.hermes/.env` |
-| OpenRouter API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
-| OpenAI API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
-| Anthropic API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
-| ElevenLabs API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
+| Telegram bot token | `openclaw.json` channels config | `~/.hermes/.env` |
+| OpenRouter API key | `.env`, `openclaw.json`, or `openclaw.json["env"]` | `~/.hermes/.env` |
+| OpenAI API key | `.env`, `openclaw.json`, or `openclaw.json["env"]` | `~/.hermes/.env` |
+| Anthropic API key | `.env`, `openclaw.json`, or `openclaw.json["env"]` | `~/.hermes/.env` |
+| ElevenLabs API key | `.env`, `openclaw.json`, or `openclaw.json["env"]` | `~/.hermes/.env` |

-Only these 6 allowlisted secrets are ever imported. Other credentials are skipped and reported.
+API keys are searched across four sources: inline config values, `~/.openclaw/.env`, the `openclaw.json` `"env"` sub-object, and per-agent auth profiles.
+
+Only allowlisted secrets are ever imported. Other credentials are skipped and reported.
+
+## OpenClaw Schema Compatibility
+
+The migration handles both old and current OpenClaw config layouts:
+
+- **Channel tokens**: Reads from flat paths (`channels.telegram.botToken`) and the newer `accounts.default` layout (`channels.telegram.accounts.default.botToken`)
+- **TTS provider**: OpenClaw renamed "edge" to "microsoft" — both are recognized and mapped to Hermes' "edge"
+- **Provider API types**: Both short (`openai`, `anthropic`) and hyphenated (`openai-completions`, `anthropic-messages`, `google-generative-ai`) values are mapped correctly
+- **thinkingDefault**: All enum values are handled including newer ones (`minimal`, `xhigh`, `adaptive`)
+- **Matrix**: Uses `accessToken` field (not `botToken`)
+- **SecretRef formats**: Plain strings, env templates (`${VAR}`), and `source: "env"` SecretRefs are resolved. `source: "file"` and `source: "exec"` SecretRefs produce a warning — add those keys manually after migration.

 ## Conflict Handling

@@ -84,18 +101,24 @@ For skills, you can also use `--skill-conflict rename` to import conflicting ski

 ## Migration Report

-Every migration (including dry runs) produces a report showing:
+Every migration produces a report showing:
 - **Migrated items** — what was successfully imported
 - **Conflicts** — items skipped because they already exist
 - **Skipped items** — items not found in the source
 - **Errors** — items that failed to import

-For execute runs, the full report is saved to `~/.hermes/migration/openclaw/<timestamp>/`.
+For executed migrations, the full report is saved to `~/.hermes/migration/openclaw/<timestamp>/`.
+
+## Post-Migration Notes
+
+- **Skills require a new session** — imported skills take effect after restarting your agent or starting a new chat.
+- **WhatsApp requires re-pairing** — WhatsApp uses QR-code pairing, not token-based auth. Run `hermes whatsapp` to pair.
+- **Archive cleanup** — after migration, you'll be offered the option to rename `~/.openclaw/` to `~/.openclaw.pre-migration/` to prevent state confusion. You can also run `hermes claw cleanup` later.

 ## Troubleshooting

 ### "OpenClaw directory not found"
-The migration looks for `~/.openclaw` by default. If your OpenClaw is installed elsewhere, use `--source`:
+The migration looks for `~/.openclaw` by default, then tries `~/.clawdbot` and `~/.moltbot`. If your OpenClaw is installed elsewhere, use `--source`:
 ```bash
 hermes claw migrate --source /path/to/.openclaw
 ```
@@ -108,3 +131,12 @@ hermes skills install openclaw-migration

 ### Memory overflow
 If your OpenClaw MEMORY.md or USER.md exceeds Hermes' character limits, excess entries are exported to an overflow file in the migration report directory. You can manually review and add the most important ones.
+
+### API keys not found
+Keys might be stored in different places depending on your OpenClaw setup:
+- `~/.openclaw/.env` file
+- Inline in `openclaw.json` under `models.providers.*.apiKey`
+- In `openclaw.json` under the `"env"` or `"env.vars"` sub-objects
+- In `~/.openclaw/agents/main/agent/auth-profiles.json`
+
+The migration checks all four. If keys use `source: "file"` or `source: "exec"` SecretRefs, they can't be resolved automatically — add them via `hermes config set`.
--- a/docs/plans/awesome-ai-tools-integration.md
+++ b/docs/plans/awesome-ai-tools-integration.md
@@ -0,0 +1,44 @@
+# awesome-ai-tools Integration Plan
+
+**Tracking:** #842
+**Source report:** docs/tool-investigation-2026-04-15.md
+**Date:** 2026-04-16
+
+---
+
+## Status Dashboard
+
+| # | Tool | Category | Impact | Effort | Status | Issue |
+|---|------|----------|--------|--------|--------|-------|
+| 1 | Mem0 | Memory | 5/5 | 3/5 | Cloud + Local done | #842 |
+| 2 | LightRAG | RAG | 4/5 | 3/5 | Not started | #857 |
+| 3 | n8n | Orchestration | 5/5 | 4/5 | Not started | #858 |
+| 4 | RAGFlow | RAG | 4/5 | 4/5 | Not started | #859 |
+| 5 | tensorzero | LLMOps | 4/5 | 3/5 | Not started | #860 |
+
+---
+
+## #1: Mem0 — DONE
+
+Cloud: `plugins/memory/mem0/` (MEM0_API_KEY required)
+Local: `plugins/memory/mem0_local/` (ChromaDB, no API key)
+
+## #2: LightRAG (P2)
+
+Create `plugins/rag/lightrag/` plugin. Index skill docs. Use local Ollama embeddings.
+
+## #3: n8n (P3)
+
+Deploy as Docker service. Create workflow templates for Hermes patterns.
+
+## #4: RAGFlow (P4)
+
+Deploy as Docker service. Integrate via HTTP API for document understanding.
+
+## #5: tensorzero (P3)
+
+Evaluate as provider routing replacement. Canary migration (10% traffic first).
+
+---
+
+*Last updated: 2026-04-16*
--- a/docs/plans/fleet-knowledge-graph-sota-research.md
+++ b/docs/plans/fleet-knowledge-graph-sota-research.md
@@ -0,0 +1,324 @@
+# SOTA Research: Multi-Agent Coordination & Fleet Knowledge Graphs
+
+**Date:** 2026-04-14  
+**Scope:** Agent-to-agent communication, shared memory, task delegation, consensus protocols  
+**Frameworks Analyzed:** CrewAI, AutoGen, MetaGPT, ChatDev, CAMEL
+
+---
+
+## 1. Architecture Pattern Summary
+
+### 1.1 CrewAI — Role-Based Crew Orchestration
+
+**Core Pattern:** Agents organized into "Crews" with explicit roles, goals, and backstories. Tasks are assigned to agents, executed via sequential or hierarchical process flows.
+
+**Agent-to-Agent Communication:**
+- **Sequential:** Agent A completes Task A → output injected into Task B's context for Agent B
+- **Hierarchical:** Manager agent delegates to worker agents, collects results, synthesizes
+- **Context passing:** Tasks can declare `context: [other_tasks]` — outputs from dependent tasks are automatically injected into the current task's prompt
+- **No direct agent-to-agent messaging** — communication is mediated through task outputs
+
+**Shared Memory (v2 — Unified Memory):**
+- `Memory` class with `remember()` / `recall()` using vector embeddings (LanceDB/ChromaDB)
+- **Scope-based isolation:** `MemoryScope` provides path-based namespacing (`/crew/research/agent-foo`)
+- **Composite scoring:** semantic similarity (0.5) + recency (0.3) + importance (0.2)
+- **RecallFlow:** LLM-driven deep recall with adaptive query expansion
+- **Privacy flags:** Private memories only visible to the source that created them
+- **Background saves:** ThreadPoolExecutor with write barrier (drain_writes before recall)
+
+**Task Delegation:**
+- Agent tools include `Delegate Work to Co-worker` and `Ask Question to Co-worker`
+- Delegation creates a new task for another agent, results come back to delegator
+- Depth-limited (no infinite delegation chains)
+
+**State & Checkpointing:**
+- `SqliteProvider` / `JsonProvider` for state checkpoint persistence
+- `CheckpointConfig` with event-driven persistence
+- Flow state is Pydantic models with serialization
+
+**Cache:**
+- Thread-safe in-memory tool result cache with RWLock
+- Key: `{tool_name}-{input}` → cached output
+
+### 1.2 AutoGen (Microsoft) — Conversation-Centric Teams
+
+**Core Pattern:** Agents communicate through shared conversation threads. A "Group Chat Manager" controls turn-taking and speaker selection.
+
+**Agent-to-Agent Communication:**
+- **Shared message thread** — all agents see all messages (like a group chat)
+- **Five team patterns:**
+  - `RoundRobinGroupChat`: Fixed order cycling through participants
+  - `SelectorGroupChat`: LLM-based speaker selection with candidate filtering
+  - `Swarm`: Handoff-based routing (agent sends HandoffMessage to next agent)
+  - `GraphFlow` (DiGraph): DAG-based execution with conditional edges, parallel fan-out, loops
+  - `MagenticOneOrchestrator`: Ledger-based orchestration with task planning, progress tracking, stall detection
+
+**Shared State:**
+- `ChatCompletionContext` — manages message history per agent (can be unbounded or windowed)
+- `ModelContext` shared across agents in a team
+- State serialization: `save_state()` / `load_state()` for all managers
+- **No built-in vector memory** — context is purely conversational
+
+**Task Delegation:**
+- `Swarm`: Agents use `HandoffMessage` to explicitly route control
+- `GraphFlow`: Conditional edges route based on message content (keyword or callable)
+- `MagenticOne`: Orchestrator maintains a "task ledger" (facts + plan) and dynamically re-plans on stalls
+
+**Consensus / Termination:**
+- `TerminationCondition` — composable conditions (text match, max messages, source-based)
+- No explicit consensus protocols — termination is manager-decided
+
+**Key Insight:** AutoGen's `ChatCompletionContext` is the closest analog to shared memory, but it's purely sequential message history, not a knowledge base.
+
+### 1.3 MetaGPT — SOP-Driven Software Teams
+
+**Core Pattern:** Agents follow Standard Operating Procedures (SOPs). Each agent has a defined role (Product Manager, Architect, Engineer, QA) and produces structured artifacts.
+
+**Agent-to-Agent Communication:**
+- **Publish-Subscribe via Environment:** Agents publish "actions" to a shared Environment, subscribers react
+- **Structured outputs:** Each role produces specific artifact types (PRD, design doc, code, test cases)
+- **Message routing:** Environment acts as a message bus, filtering by subscriber interest
+
+**Shared Memory:**
+- `Environment` class maintains shared state (project workspace)
+- File-based shared memory: agents write/read from a shared filesystem
+- `SharedMemory` for cross-agent context (structured data, not free-form text)
+
+**Task Delegation:**
+- Implicit through SOP stages: PM → Architect → Engineer → QA
+- Each agent's output is the next agent's input
+- No dynamic re-delegation
+
+**Consensus:**
+- Sequential SOP execution (no parallel agents)
+- QA agent can trigger re-work loops back to Engineer
+
+### 1.4 ChatDev — Chat-Chain Software Development
+
+**Core Pattern:** Agents follow a "chat chain" — a sequence of chat phases (designing, coding, testing, documenting). Each phase involves a pair of agents (CEO↔CTO, Programmer↔Reviewer, etc.).
+
+**Agent-to-Agent Communication:**
+- **Paired chat sessions:** Two agents communicate in each phase (role-play between instructor and assistant)
+- **Chain propagation:** Phase N's output (code, design doc) becomes Phase N+1's input
+- **No broadcast** — communication is strictly pairwise within phases
+
+**Shared Memory:**
+- Software-centric: shared code repository is the "memory"
+- Each phase modifies/inherits the codebase
+- No explicit vector memory or knowledge graph
+
+**Task Delegation:**
+- Hardcoded phase sequence: Design → Code → Test → Document
+- Each phase delegates to a specific agent pair
+- No dynamic task re-assignment
+
+**Consensus:**
+- Phase-level termination: when both agents agree the phase is complete
+- "Thought" tokens for chain-of-thought within chat
+
+### 1.5 CAMEL — Role-Playing & Workforce
+
+**Core Pattern:** Two primary modes:
+1. **RolePlaying:** Two-agent conversation with task specification and optional critic
+2. **Workforce:** Multi-agent with coordinator, task planner, and worker pool
+
+**Agent-to-Agent Communication:**
+- **RolePlaying:** Structured turn-taking between assistant and user agents
+- **Workforce:** Coordinator assigns tasks via `TaskChannel`, workers return results
+- **Worker types:** `SingleAgentWorker` (single ChatAgent), `RolePlayingWorker` (two-agent pair)
+
+**Shared Memory / Task Channel:**
+- `TaskChannel` — async queue-based task dispatch with packet tracking
+  - States: SENT → PROCESSING → RETURNED → ARCHIVED
+  - O(1) lookup by task ID, status-based filtering, assignee/publisher queues
+- `WorkflowMemoryManager` — persists workflow patterns as markdown files
+  - Role-based organization: workflows stored by `role_identifier`
+  - Agent-based intelligent selection: LLM picks relevant past workflows
+  - Versioned: metadata tracks creation time and version numbers
+
+**Task Delegation:**
+- Coordinator agent decomposes complex tasks using LLM analysis
+- Tasks assigned to workers based on capability matching
+- Failed tasks trigger: retry, create new worker, or further decomposition
+- `FailureHandlingConfig` with configurable `RecoveryStrategy`
+
+**Consensus / Quality:**
+- Quality evaluation via structured output (response format enforced)
+- Task dependencies tracked (worker receives dependency tasks as context)
+- `WorkforceMetrics` for tracking execution statistics
+
+---
+
+## 2. Key Architectural Patterns for Fleet Knowledge Graph
+
+### 2.1 Communication Topology Patterns
+
+| Pattern | Used By | Description |
+|---------|---------|-------------|
+| **Sequential Chain** | CrewAI, ChatDev, MetaGPT | A→B→C linear flow, output feeds next |
+| **Shared Thread** | AutoGen | All agents see all messages |
+| **Publish-Subscribe** | MetaGPT | Environment-based message bus |
+| **Paired Chat** | ChatDev, CAMEL | Two-agent conversation pairs |
+| **Handoff Routing** | AutoGen Swarm | Agent explicitly names next speaker |
+| **DAG Graph** | AutoGen GraphFlow | Conditional edges, parallel, loops |
+| **Ledger Orchestration** | AutoGen MagenticOne | Maintains task ledger, re-plans |
+| **Task Channel** | CAMEL | Async queue with packet states |
+
+### 2.2 Shared State Patterns
+
+| Pattern | Used By | Description |
+|---------|---------|-------------|
+| **Vector Memory** | CrewAI | Embeddings + scope-based namespacing |
+| **Message History** | AutoGen | Sequential conversation context |
+| **File System** | MetaGPT, ChatDev | Agents read/write shared files |
+| **Task Channel** | CAMEL | Async packet-based task dispatch |
+| **Workflow Files** | CAMEL | Markdown-based workflow memory |
+| **Tool Cache** | CrewAI | In-memory RWLock tool result cache |
+| **State Checkpoint** | CrewAI, AutoGen | Serialized Pydantic/SQLite checkpoints |
+
+### 2.3 Task Delegation Patterns
+
+| Pattern | Used By | Description |
+|---------|---------|-------------|
+| **Role Assignment** | CrewAI | Fixed agent per task |
+| **Manager Delegation** | CrewAI Hierarchical | Manager assigns tasks dynamically |
+| **Speaker Selection** | AutoGen Selector | LLM picks next agent |
+| **Handoff** | AutoGen Swarm | Agent explicitly transfers control |
+| **SOP Routing** | MetaGPT | Stage-based implicit delegation |
+| **Coordinator** | CAMEL Workforce | LLM-based task decomposition + assignment |
+| **Dynamic Worker Creation** | CAMEL Workforce | Create new workers on failure |
+
+### 2.4 Conflict Resolution Patterns
+
+| Pattern | Used By | Description |
+|---------|---------|-------------|
+| **Manager Arbitration** | CrewAI Hierarchical | Manager resolves conflicts |
+| **Critic-in-the-loop** | CAMEL | Critic agent evaluates and selects |
+| **Quality Gate** | CAMEL Workforce | Structured quality evaluation |
+| **Termination Conditions** | AutoGen | Composable stop conditions |
+| **Stall Detection** | AutoGen MagenticOne | Re-plans when progress stalls |
+
+---
+
+## 3. Recommendations for Hermes Fleet Knowledge Graph
+
+### 3.1 Architecture: Hybrid Graph + Memory
+
+Based on the SOTA analysis, the optimal fleet knowledge graph should combine:
+
+1. **CrewAI's scoped memory** for hierarchical knowledge organization
+   - Path-based namespaces: `/fleet/{fleet_id}/agent/{agent_id}/diary`
+   - Composite scoring: semantic + recency + importance
+   - Background writes with read barriers
+
+2. **CAMEL's TaskChannel** for task dispatch and tracking
+   - Packet states (SENT → PROCESSING → RETURNED → ARCHIVED)
+   - O(1) lookup by task ID
+   - Assignee/publisher tracking
+
+3. **AutoGen's DiGraph** for execution flow definition
+   - DAG with conditional edges for complex workflows
+   - Parallel fan-out for independent tasks
+   - Activation conditions (all vs any) for synchronization points
+
+4. **AutoGen MagenticOne's ledger** for shared task context
+   - Maintained facts, plan, and progress ledger
+   - Dynamic re-planning on stalls
+
+### 3.2 Fleet Knowledge Graph Schema
+
+```
+/fleet/{fleet_id}/
+  ├── shared/              # Shared knowledge (all agents read)
+  │   ├── facts/           # Known facts, constraints
+  │   ├── decisions/       # Record of decisions made
+  │   └── context/         # Active task context
+  ├── agent/{agent_id}/
+  │   ├── diary/           # Agent's personal experience log
+  │   ├── capabilities/    # What this agent can do
+  │   └── state/           # Current task state
+  ├── tasks/
+  │   ├── {task_id}/       # Task metadata, dependencies, status
+  │   └── graph/           # DAG definition for task dependencies
+  └── consensus/
+      ├── proposals/       # Pending proposals
+      └── decisions/       # Resolved consensus decisions
+```
+
+### 3.3 Key Design Decisions
+
+1. **Diary System (Agent Memory):**
+   - Each agent writes to its own scoped memory after every significant action
+   - LLM-analyzed importance scoring (like CrewAI's unified memory)
+   - Cross-agent recall: agents can query other agents' diaries for relevant experiences
+   - Decay: old low-importance memories expire
+
+2. **Shared State (Fleet Knowledge):**
+   - SQLite-backed (like Hermes' existing `state.db`) with FTS5 search
+   - Hierarchical scopes (like CrewAI's MemoryScope)
+   - Write-ahead log for concurrent access
+   - Read barriers before queries (like CrewAI's `drain_writes`)
+
+3. **Task Delegation:**
+   - Coordinator pattern (like CAMEL's Workforce)
+   - Task decomposition via LLM
+   - Failed task → retry, reassign, or decompose
+   - Max depth limit (like Hermes' existing MAX_DEPTH=2)
+
+4. **Consensus Protocol:**
+   - Proposal-based: agent proposes, others vote/acknowledge
+   - Timeout-based fallback: if no response within N seconds, proceed
+   - Manager override: designated manager can break ties
+   - Simple majority for non-critical, unanimity for critical decisions
+
+5. **Conflict Resolution:**
+   - Last-write-wins for non-critical state
+   - Optimistic locking with version numbers
+   - Manager arbitration for task assignment conflicts
+   - Quality gates (like CAMEL) for output validation
+
+### 3.4 Integration with Existing Hermes Architecture
+
+Hermes already has strong foundations:
+- **Delegation system** (`delegate_tool.py`): Isolated child agents, parallel execution, depth limits
+- **State DB** (`hermes_state.py`): SQLite + FTS5, WAL mode, session tracking, message history
+- **Credential pools**: Shared credentials with rotation
+
+The fleet knowledge graph should extend these patterns:
+- **Session DB → Fleet DB:** Add tables for fleet metadata, agent registrations, task graphs
+- **Memory tool → Fleet Memory:** Scoped vector memory shared across fleet agents
+- **Delegate tool → Fleet Delegation:** Task channel with persistence, quality evaluation
+- **New: Consensus module:** Proposal/vote protocol with timeout handling
+
+---
+
+## 4. Reference Implementations
+
+| Component | Best Reference | Key Takeaway |
+|-----------|---------------|--------------|
+| Scoped Memory | CrewAI `Memory` + `MemoryScope` | Path-based namespaces, composite scoring, background writes |
+| Task Dispatch | CAMEL `TaskChannel` | Packet-based with state machine, O(1) lookup |
+| Execution DAG | AutoGen `DiGraphBuilder` | Fluent builder, conditional edges, activation groups |
+| Orchestration | AutoGen `MagenticOneOrchestrator` | Ledger-based planning, stall detection, re-planning |
+| Agent Communication | AutoGen `SelectorGroupChat` | LLM-based speaker selection, shared message thread |
+| Quality Evaluation | CAMEL Workforce | Structured output for quality scoring |
+| Workflow Memory | CAMEL `WorkflowMemoryManager` | Markdown-based, role-organized, versioned |
+| State Checkpoint | CrewAI `SqliteProvider` | JSONB checkpoints, WAL mode |
+| Tool Cache | CrewAI `CacheHandler` | RWLock-based concurrent tool result cache |
+
+---
+
+## 5. Open Questions
+
+1. **Graph vs Vector for knowledge:** Should fleet knowledge use a proper graph DB (e.g., Neo4j) or stick with vector + SQLite?
+   - Recommendation: Start with SQLite + vectors (existing stack), add graph later if needed
+
+2. **Real-time vs Batch:** Should agents receive updates in real-time or batched?
+   - Recommendation: Event-driven for critical updates, batched for diary entries
+
+3. **Security model:** How should cross-agent access be controlled?
+   - Recommendation: Role-based ACLs on scope paths, similar to CrewAI's privacy flags
+
+4. **Scalability:** How many agents can a single fleet support?
+   - Recommendation: Start with 10-agent fleets, optimize SQLite concurrency first
+
--- a/docs/pokayoke-integration-phase3.md
+++ b/docs/pokayoke-integration-phase3.md
@@ -0,0 +1,29 @@
+# Phase 3: Poka-yoke Integration & Fleet Verification
+
+Epic #967. Morning review packet for Hermes harness features.
+
+## Poka-yoke Features Implemented
+
+| Feature | Module | PR | Status |
+|---------|--------|-----|--------|
+| Token budget tracker | agent/token_budget.py | #930 | MERGED |
+| Provider preflight validation | agent/provider_preflight.py | #932 | MERGED |
+| Atomic skill editing | tools/skill_edit_guard.py | #933 | MERGED |
+| Config debt fixes | gateway/config.py | #437 | MERGED |
+| Test collection fixes | tests/acp/conftest.py | #794 | MERGED |
+| Context-faithful prompting | agent/context_faithful.py | #786 | MERGED |
+
+## Fleet Verification
+
+- Unit tests pass on all modules
+- Collection: 11,472 tests, 0 errors (was 6 errors)
+- ACP tests: cleanly skipped when acp extra missing
+- Provider validation: catches missing/short keys
+- Skill editing: atomic with auto-revert
+
+## Next Steps
+
+1. Wire token_budget into run_agent.py conversation loop
+2. Wire provider_preflight into session start
+3. Wire skill_edit_guard into skill_manage tool
+4. Fleet-wide deployment verification
--- a/docs/skins/example-skin.yaml
+++ b/docs/skins/example-skin.yaml
@@ -41,6 +41,14 @@ colors:
  session_label: "#DAA520"        # Session label
  session_border: "#8B8682"       # Session ID dim color

+  # TUI surfaces
+  status_bar_bg: "#1a1a2e"              # Status / usage bar background
+  voice_status_bg: "#1a1a2e"            # Voice-mode badge background
+  completion_menu_bg: "#1a1a2e"         # Completion list background
+  completion_menu_current_bg: "#333355" # Active completion row background
+  completion_menu_meta_bg: "#1a1a2e"    # Completion meta column background
+  completion_menu_meta_current_bg: "#333355"  # Active completion meta background
+
 # ── Spinner ─────────────────────────────────────────────────────────────────
 # Customize the animated spinner shown during API calls and tool execution.
 spinner:
--- a/docs/specs/container-cli-review-fixes.md
+++ b/docs/specs/container-cli-review-fixes.md
@@ -0,0 +1,329 @@
+# Container-Aware CLI Review Fixes Spec
+
+**PR:** NousResearch/hermes-agent#7543
+**Review:** cursor[bot] bugbot review (4094049442) + two prior rounds
+**Date:** 2026-04-12
+**Branch:** `feat/container-aware-cli-clean`
+
+## Review Issues Summary
+
+Six issues were raised across three bugbot review rounds. All six were fixed in intermediate commits (79e8cd12, 38277a6a, 726cf90f; see the table below). This spec addresses remaining design concerns surfaced by those reviews and simplifies the implementation based on interview decisions.
+
+| # | Issue | Severity | Status |
+|---|-------|----------|--------|
+| 1 | `os.execvp` retry loop unreachable | Medium | Fixed in 79e8cd12 (switched to subprocess.run) |
+| 2 | Redundant `shutil.which("sudo")` | Medium | Fixed in 38277a6a (reuses `sudo` var) |
+| 3 | Missing `chown -h` on symlink update | Low | Fixed in 38277a6a |
+| 4 | Container routing after `parse_args()` | High | Fixed in 726cf90f |
+| 5 | Hardcoded `/home/${user}` | Medium | Fixed in 726cf90f |
+| 6 | Group membership not gated on `container.enable` | Low | Fixed in 726cf90f |
+
+The mechanical fixes are in place but the overall design needs revision. The retry loop, error swallowing, and process model have deeper issues than what the bugbot flagged.
+
+---
+
+## Spec: Revised `_exec_in_container`
+
+### Design Principles
+
+1. **Let it crash.** No silent fallbacks. If `.container-mode` exists but something goes wrong, the error propagates naturally (Python traceback). The only cases where container routing is skipped are when `.container-mode` doesn't exist, `HERMES_DEV=1` is set, or the CLI is already running inside a container.
+2. **No retries.** Probe once for sudo, exec once. If it fails, docker/podman's stderr reaches the user verbatim.
+3. **Completely transparent.** No error wrapping, no prefixes, no spinners. Docker's output goes straight through.
+4. **`os.execvp` on the happy path.** Replace the Python process entirely so there's no idle parent during interactive sessions. Note: `execvp` never returns on success (process is replaced) and raises `OSError` on failure (it does not return a value). The container process's exit code becomes the process exit code by definition — no explicit propagation needed.
+5. **One human-readable exception to "let it crash".** `subprocess.TimeoutExpired` from the sudo probe gets a specific catch with a readable message, since a raw traceback for "your Docker daemon is slow" is confusing. All other exceptions propagate naturally.
+
+### Execution Flow
+
+```
+1. get_container_exec_info()
+   - HERMES_DEV=1 → return None (skip routing)
+   - Inside container → return None (skip routing)
+   - .container-mode doesn't exist → return None (skip routing)
+   - .container-mode exists → parse and return dict
+   - .container-mode exists but malformed/unreadable → LET IT CRASH (no try/except)
+
+2. _exec_in_container(container_info, sys.argv[1:])
+   a. runtime = shutil.which(backend) → if None, print "{backend} not found on PATH" and sys.exit(1)
+   b. Sudo probe: subprocess.run([runtime, "inspect", "--format", "ok", container_name], timeout=15)
+      - If succeeds → needs_sudo = False
+      - If fails → try subprocess.run([sudo, "-n", runtime, "inspect", ...], timeout=15)
+        - If succeeds → needs_sudo = True
+        - If fails → print error with sudoers hint (including why -n is required) and sys.exit(1)
+      - If TimeoutExpired → catch specifically, print human-readable message about slow daemon
+   c. Build exec_cmd: [(sudo -n)? + runtime, "exec", tty_flags, "-u", exec_user, env_flags, container, hermes_bin, *cli_args]
+   d. os.execvp(exec_cmd[0], exec_cmd)
+      - On success: process is replaced — Python is gone, container exit code IS the process exit code
+      - On OSError: let it crash (natural traceback)
+```
+
+### Changes to `hermes_cli/main.py`
+
+#### `_exec_in_container` — rewrite
+
+Remove:
+- The entire retry loop (`max_retries`, `for attempt in range(...)`)
+- Spinner logic (`"Waiting for container..."`, dots)
+- Exit code classification (125/126/127 handling)
+- `subprocess.run` for the exec call (keep it only for the sudo probe)
+- Special TTY vs non-TTY retry counts
+- The `time` import (no longer needed)
+
+Change:
+- Use `os.execvp(exec_cmd[0], exec_cmd)` as the final call
+- Keep the `subprocess` import only for the sudo probe
+- Keep TTY detection for the `-it` vs `-i` flag
+- Keep env var forwarding (TERM, COLORTERM, LANG, LC_ALL)
+- Keep the sudo probe as-is (it's the one "smart" part)
+- Bump probe `timeout` from 5s to 15s — cold podman on a loaded machine needs headroom
+- Catch `subprocess.TimeoutExpired` specifically on both probe calls — print a readable message about the daemon being unresponsive instead of a raw traceback
+- Expand the sudoers hint error message to explain *why* `-n` (non-interactive) is required: a password prompt would hang the CLI or break piped commands
+
+The function becomes roughly:
+
+```python
+def _exec_in_container(container_info: dict, cli_args: list):
+    """Replace the current process with a command inside the managed container.
+
+    Probes whether sudo is needed (rootful containers), then os.execvp
+    into the container. If exec fails, the OS error propagates naturally.
+    """
+    import shutil
+    import subprocess
+
+    backend = container_info["backend"]
+    container_name = container_info["container_name"]
+    exec_user = container_info["exec_user"]
+    hermes_bin = container_info["hermes_bin"]
+
+    runtime = shutil.which(backend)
+    if not runtime:
+        print(f"Error: {backend} not found on PATH. Cannot route to container.",
+              file=sys.stderr)
+        sys.exit(1)
+
+    # Probe whether we need sudo to see the rootful container.
+    # Timeout is 15s — cold podman on a loaded machine can take a while.
+    # TimeoutExpired is caught specifically for a human-readable message;
+    # all other exceptions propagate naturally.
+    needs_sudo = False
+    sudo = None
+    try:
+        probe = subprocess.run(
+            [runtime, "inspect", "--format", "ok", container_name],
+            capture_output=True, text=True, timeout=15,
+        )
+    except subprocess.TimeoutExpired:
+        print(
+            f"Error: timed out waiting for {backend} to respond.\n"
+            f"The {backend} daemon may be unresponsive or starting up.",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    if probe.returncode != 0:
+        sudo = shutil.which("sudo")
+        if sudo:
+            try:
+                probe2 = subprocess.run(
+                    [sudo, "-n", runtime, "inspect", "--format", "ok", container_name],
+                    capture_output=True, text=True, timeout=15,
+                )
+            except subprocess.TimeoutExpired:
+                print(
+                    f"Error: timed out waiting for sudo {backend} to respond.",
+                    file=sys.stderr,
+                )
+                sys.exit(1)
+
+            if probe2.returncode == 0:
+                needs_sudo = True
+            else:
+                print(
+                    f"Error: container '{container_name}' not found via {backend}.\n"
+                    f"\n"
+                    f"The NixOS service runs the container as root. Your user cannot\n"
+                    f"see it because {backend} uses per-user namespaces.\n"
+                    f"\n"
+                    f"Fix: grant passwordless sudo for {backend}. The -n (non-interactive)\n"
+                    f"flag is required because the CLI calls sudo non-interactively —\n"
+                    f"a password prompt would hang or break piped commands:\n"
+                    f"\n"
+                    f'  security.sudo.extraRules = [{{\n'
+                    f'    users = [ "{os.getenv("USER", "your-user")}" ];\n'
+                    f'    commands = [{{ command = "{runtime}"; options = [ "NOPASSWD" ]; }}];\n'
+                    f'  }}];\n'
+                    f"\n"
+                    f"Or run: sudo hermes {' '.join(cli_args)}",
+                    file=sys.stderr,
+                )
+                sys.exit(1)
+        else:
+            print(
+                f"Error: container '{container_name}' not found via {backend}.\n"
+                f"The container may be running under root. Try: sudo hermes {' '.join(cli_args)}",
+                file=sys.stderr,
+            )
+            sys.exit(1)
+
+    is_tty = sys.stdin.isatty()
+    tty_flags = ["-it"] if is_tty else ["-i"]
+
+    env_flags = []
+    for var in ("TERM", "COLORTERM", "LANG", "LC_ALL"):
+        val = os.environ.get(var)
+        if val:
+            env_flags.extend(["-e", f"{var}={val}"])
+
+    cmd_prefix = [sudo, "-n", runtime] if needs_sudo else [runtime]
+    exec_cmd = (
+        cmd_prefix + ["exec"]
+        + tty_flags
+        + ["-u", exec_user]
+        + env_flags
+        + [container_name, hermes_bin]
+        + cli_args
+    )
+
+    # execvp replaces this process entirely — it never returns on success.
+    # On failure it raises OSError, which propagates naturally.
+    os.execvp(exec_cmd[0], exec_cmd)
+```
+
+#### Container routing call site in `main()` — remove try/except
+
+Current:
+```python
+try:
+    from hermes_cli.config import get_container_exec_info
+    container_info = get_container_exec_info()
+    if container_info:
+        _exec_in_container(container_info, sys.argv[1:])
+        sys.exit(1)  # exec failed if we reach here
+except SystemExit:
+    raise
+except Exception:
+    pass  # Container routing unavailable, proceed locally
+```
+
+Revised:
+```python
+from hermes_cli.config import get_container_exec_info
+container_info = get_container_exec_info()
+if container_info:
+    _exec_in_container(container_info, sys.argv[1:])
+    # Unreachable: os.execvp never returns on success (process is replaced)
+    # and raises OSError on failure (which propagates as a traceback).
+    # This line exists only as a defensive assertion.
+    sys.exit(1)
+```
+
+No try/except. If `.container-mode` doesn't exist, `get_container_exec_info()` returns `None` and we skip routing. If it exists but is broken, the exception propagates with a natural traceback.
+
+Note: `sys.exit(1)` after `_exec_in_container` is dead code in all paths — `os.execvp` either replaces the process or raises. It's kept as a belt-and-suspenders assertion with a comment marking it unreachable, not as actual error handling.
+
+### Changes to `hermes_cli/config.py`
+
+#### `get_container_exec_info` — narrow inner try/except
+
+Current code catches `(OSError, IOError)` and returns `None`. This silently hides permission errors, corrupt files, etc.
+
+Change: Narrow the try/except around file reading so it catches only `FileNotFoundError`. Keep the early returns for `HERMES_DEV=1` and `_is_inside_container()`. The `FileNotFoundError` from `open()` when `.container-mode` doesn't exist should still return `None` (this is the "container mode not enabled" case). All other exceptions propagate.
+
+```python
+def get_container_exec_info() -> Optional[dict]:
+    if os.environ.get("HERMES_DEV") == "1":
+        return None
+    if _is_inside_container():
+        return None
+
+    container_mode_file = get_hermes_home() / ".container-mode"
+
+    try:
+        with open(container_mode_file, "r") as f:
+            # ... parse key=value lines ...
+    except FileNotFoundError:
+        return None
+    # All other exceptions (PermissionError, malformed data, etc.) propagate
+
+    return { ... }
+```
+
+---
+
+## Spec: NixOS Module Changes
+
+### Symlink creation — simplify to two branches
+
+Current: 4 branches (symlink exists, directory exists, other file, doesn't exist).
+
+Revised: 2 branches.
+
+```bash
+if [ -d "${symlinkPath}" ] && [ ! -L "${symlinkPath}" ]; then
+  # Real directory — back it up, then create symlink
+  _backup="${symlinkPath}.bak.$(date +%s)"
+  echo "hermes-agent: backing up existing ${symlinkPath} to $_backup"
+  mv "${symlinkPath}" "$_backup"
+fi
+# For everything else (symlink, doesn't exist, etc.) — just force-create
+ln -sfn "${target}" "${symlinkPath}"
+chown -h ${user}:${cfg.group} "${symlinkPath}"
+```
+
+`ln -sfn` handles: existing symlink (replaces), doesn't exist (creates), and after the `mv` above (creates). The only case that needs special handling is a real directory, because `ln -sfn` does not replace a real directory — it would create the link inside it instead.
+
+Note: there is a theoretical race between the `[ -d ... ]` check and the `mv` (something could create/remove the directory in between). In practice this is a NixOS activation script running as root during `nixos-rebuild switch` — no other process should be touching `~/.hermes` at that moment. Not worth adding locking for.
+
+### Sudoers — document, don't auto-configure
+
+Do NOT add `security.sudo.extraRules` to the module. Document the sudoers requirement in the module's description/comments and in the error message the CLI prints when sudo probe fails.
+
+### Group membership gating — keep as-is
+
+The fix in 726cf90f (`cfg.container.enable && cfg.container.hostUsers != []`) is correct. Leftover group membership when container mode is disabled is harmless. No cleanup needed.
+
+---
+
+## Spec: Test Rewrite
+
+The existing test file (`tests/hermes_cli/test_container_aware_cli.py`) has 16 tests. With the simplified exec model, several are obsolete.
+
+### Tests to keep (update as needed)
+
+- `test_is_inside_container_dockerenv` — unchanged
+- `test_is_inside_container_containerenv` — unchanged
+- `test_is_inside_container_cgroup_docker` — unchanged
+- `test_is_inside_container_false_on_host` — unchanged
+- `test_get_container_exec_info_returns_metadata` — unchanged
+- `test_get_container_exec_info_none_inside_container` — unchanged
+- `test_get_container_exec_info_none_without_file` — unchanged
+- `test_get_container_exec_info_skipped_when_hermes_dev` — unchanged
+- `test_get_container_exec_info_not_skipped_when_hermes_dev_zero` — unchanged
+- `test_get_container_exec_info_defaults` — unchanged
+- `test_get_container_exec_info_docker_backend` — unchanged
+
+### Tests to add or update
+
+- `test_get_container_exec_info_crashes_on_permission_error` — verify that `PermissionError` propagates (no silent `None` return)
+- `test_exec_in_container_calls_execvp` — verify `os.execvp` is called with correct args (runtime, tty flags, user, env, container, binary, cli args)
+- `test_exec_in_container_sudo_probe_sets_prefix` — verify that when first probe fails and sudo probe succeeds, `os.execvp` is called with `sudo -n` prefix
+- `test_exec_in_container_no_runtime_hard_fails` — keep existing, verify `sys.exit(1)` when `shutil.which` returns None
+- `test_exec_in_container_non_tty_uses_i_only` — update to check `os.execvp` args instead of `subprocess.run` args
+- `test_exec_in_container_probe_timeout_prints_message` — verify that `subprocess.TimeoutExpired` from the probe produces a human-readable error and `sys.exit(1)`, not a raw traceback
+- `test_exec_in_container_container_not_running_no_sudo` — verify the path where runtime exists (`shutil.which` returns a path) but probe returns non-zero and no sudo is available. Should print the "container may be running under root" error. This is distinct from `no_runtime_hard_fails` which covers `shutil.which` returning None.
+
+### Tests to delete
+
+- `test_exec_in_container_tty_retries_on_container_failure` — retry loop removed
+- `test_exec_in_container_non_tty_retries_silently_exits_126` — retry loop removed
+- `test_exec_in_container_propagates_hermes_exit_code` — no subprocess.run to check exit codes; execvp replaces the process. Note: exit code propagation still works correctly — when `os.execvp` succeeds, the container's process *becomes* this process, so its exit code is the process exit code by OS semantics. No application code needed, no test needed. A comment in the function docstring documents this intent for future readers.
+
+---
+
+## Out of Scope
+
+- Auto-configuring sudoers rules in the NixOS module
+- Any changes to `get_container_exec_info` parsing logic beyond the try/except narrowing
+- Changes to `.container-mode` file format
+- Changes to the `HERMES_DEV=1` bypass
+- Changes to container detection logic (`_is_inside_container`)
--- a/docs/tool-investigation-2026-04-15.md
+++ b/docs/tool-investigation-2026-04-15.md
@@ -0,0 +1,151 @@
+## Tool Investigation Report: Top 5 Recommendations from awesome-ai-tools
+
+**Source:** [formatho/awesome-ai-tools](https://github.com/formatho/awesome-ai-tools)
+**Date:** 2026-04-15
+**Tools Analyzed:** 414 across 9 categories
+**Agent:** Timmy
+
+---
+
+## Analysis Summary
+
+Scanned 414 tools from the awesome-ai-tools repository. Evaluated each against Hermes integration potential across five categories: Memory/Context, Inference Optimization, Agent Orchestration, Workflow Automation, and Retrieval/RAG.
+
+### Evaluation Criteria
+- **Stars:** GitHub community validation (stability signal)
+- **Freshness:** Active development (Fresh = updated <=7 days)
+- **Integration Fit:** How well it complements Hermes' existing architecture (skills, memory, tools)
+- **Integration Effort:** 1 (trivial drop-in) to 5 (major refactor required)
+- **Impact:** 1 (incremental) to 5 (transformative)
+
+---
+
+## Top 5 Recommended Tools
+
+### #1: Mem0 — Universal Memory Layer for AI Agents
+
+| Metric | Value |
+|--------|-------|
+| **Category** | Memory/Context |
+| **GitHub** | [mem0ai/mem0](https://github.com/mem0ai/mem0) |
+| **Stars** | 53.1k |
+| **Freshness** | Fresh |
+| **Integration Effort** | 3/5 |
+| **Impact** | 5/5 |
+| **Hermes Status** | IMPLEMENTED (plugins/memory/mem0/) + LOCAL MODE (plugins/memory/mem0_local/) |
+
+**Why it fits Hermes:**
+Hermes currently has session_search (transcript recall) and memory (persistent facts), but lacks a unified memory layer that bridges sessions with semantic understanding. Mem0 provides exactly this: automatic memory extraction from conversations, deduplication, and cross-session retrieval with semantic search.
+
+**Integration path:**
+- Cloud: plugins/memory/mem0/ (requires MEM0_API_KEY)
+- Local: plugins/memory/mem0_local/ (ChromaDB-backed, no API key)
+- Auto-extract facts from session transcripts
+- Query before session_search for richer contextual recall
+
+**Key risk:** Mem0 is freemium — the core is open-source, but advanced features require a paid tier. Local mode mitigates this entirely.
+
+---
+
+### #2: LightRAG — Simple and Fast Retrieval-Augmented Generation
+
+| Metric | Value |
+|--------|-------|
+| **Category** | Retrieval/RAG |
+| **GitHub** | [HKUDS/LightRAG](https://github.com/HKUDS/LightRAG) |
+| **Stars** | 33.1k |
+| **Freshness** | Fresh |
+| **Integration Effort** | 3/5 |
+| **Impact** | 4/5 |
+| **Hermes Status** | NOT IMPLEMENTED — Issue #857 |
+
+**Why it fits Hermes:**
+Hermes has 190+ skills but no unified knowledge retrieval system. LightRAG adds graph-based RAG that understands relationships between concepts, not just keyword matches. It's lightweight, runs locally, and has a simple API.
+
+**Integration path:**
+- LightRAG as a local knowledge base for skill references
+- Index GENOME.md files, README.md, and key codebase files
+- Use local Ollama models for embeddings
+- Complements existing search_files without replacing it
+
+---
+
+### #3: n8n — Workflow Automation Platform
+
+| Metric | Value |
+|--------|-------|
+| **Category** | Workflow Automation / Agent Orchestration |
+| **GitHub** | [n8n-io/n8n](https://github.com/n8n-io/n8n) |
+| **Stars** | 183.9k |
+| **Freshness** | Fresh |
+| **Integration Effort** | 4/5 |
+| **Impact** | 5/5 |
+| **Hermes Status** | NOT IMPLEMENTED — Issue #858 |
+
+**Why it fits Hermes:**
+n8n provides a self-hosted, fair-code workflow platform with 400+ integrations. Rather than replacing Hermes' agent loop, n8n sits above it: trigger Hermes agents from external events, chain multi-agent workflows, and visualize execution.
+
+---
+
+### #4: RAGFlow — Open-Source RAG Engine
+
+| Metric | Value |
+|--------|-------|
+| **Category** | Retrieval/RAG |
+| **GitHub** | [infiniflow/ragflow](https://github.com/infiniflow/ragflow) |
+| **Stars** | 77.9k |
+| **Freshness** | Fresh |
+| **Integration Effort** | 4/5 |
+| **Impact** | 4/5 |
+| **Hermes Status** | NOT IMPLEMENTED — Issue #859 |
+
+**Why it fits Hermes:**
+RAGFlow handles document parsing (PDF, Word, images via OCR), chunking, embedding, and retrieval with a web UI. Enables "document understanding" as a first-class capability.
+
+---
+
+### #5: TensorZero — LLMOps Platform
+
+| Metric | Value |
+|--------|-------|
+| **Category** | Inference Optimization / LLMOps |
+| **GitHub** | [tensorzero/tensorzero](https://github.com/tensorzero/tensorzero) |
+| **Stars** | 11.2k |
+| **Freshness** | Fresh |
+| **Integration Effort** | 3/5 |
+| **Impact** | 4/5 |
+| **Hermes Status** | NOT IMPLEMENTED — Issue #860 |
+
+**Why it fits Hermes:**
+TensorZero unifies LLM gateway, observability, evaluation, and optimization. Replaces custom provider routing with a maintained, battle-tested platform.
+
+---
+
+## Honorable Mentions
+
+| Tool | Stars | Category | Why Not Top 5 |
+|------|-------|----------|---------------|
+| memvid | 14.9k | Memory | Newer; Mem0 is more mature |
+| mempalace | 44.8k | Memory | Already evaluated; Mem0 has broader API |
+| Everything Claude Code | 154.3k | Agent | Too Claude-specific |
+| Portkey AI Gateway | 11.3k | Gateway | TensorZero is OSS; Portkey is freemium |
+
+---
+
+## Implementation Priority
+
+| Priority | Tool | Action | Status | Issue |
+|----------|------|--------|--------|-------|
+| P1 | Mem0 | Local-only mode (ChromaDB) | DONE | #842 |
+| P2 | LightRAG | Set up local instance, index skills | Not started | #857 |
+| P3 | TensorZero | Evaluate as provider routing | Not started | #860 |
+| P4 | RAGFlow | Deploy Docker, test docs | Not started | #859 |
+| P5 | n8n | Deploy for workflow viz | Not started | #858 |
+
+---
+
+## References
+- Source: https://github.com/formatho/awesome-ai-tools
+- Total tools: 414 across 9 categories
+- Last updated: April 16, 2026
+- Tracking issue: Timmy_Foundation/hermes-agent#842
--- a/docs/tool-investigation-report.md
+++ b/docs/tool-investigation-report.md
@@ -0,0 +1,24 @@
+# Tool Investigation Report: Top 5 Recommendations
+
+**Generated:** 2026-04-20 | **Source:** formatho/awesome-ai-tools (795 tools, 10 categories)
+
+## Top 5
+
+1. **LiteLLM** (76k) — Unified API gateway. Replace custom provider routing. Impact: 5/5, Effort: 2/5
+2. **Mem0** (53k) — Universal memory layer. Structured long-term memory. Impact: 5/5, Effort: 3/5
+3. **RAGFlow** (77k) — RAG engine with OCR. Document processing upgrade. Impact: 4/5, Effort: 4/5
+4. **LiteRT-LM** (3.7k) — On-device inference. Edge/mobile deployment. Impact: 4/5, Effort: 3/5
+5. **Claude-Mem** (61k) — Session capture and context injection. Impact: 3/5, Effort: 2/5
+
+## Priority
+
+- Phase 1: LiteLLM (2-3 days, highest ROI)
+- Phase 2: Mem0 (1 week, critical for agent maturity)
+- Phase 3: RAGFlow (1-2 weeks, capability upgrade)
+
+## Honorable Mentions
+
+- GPTCache: Semantic cache, 30-50% cost reduction
+- promptfoo: LLM testing framework
+- PageIndex: Vectorless RAG
+- rtk: Token reduction proxy, 60-90% savings
--- a/gateway/builtin_hooks/boot_md.py
+++ b/gateway/builtin_hooks/boot_md.py
@@ -18,9 +18,7 @@ suppress delivery.
 """

 import logging
-import os
 import threading
-from pathlib import Path

 logger = logging.getLogger("hooks.boot-md")

--- a/gateway/config.py
+++ b/gateway/config.py
@@ -8,6 +8,7 @@ Handles loading and validating configuration for:
 - Delivery preferences
 """

+import ipaddress
 import logging
 import os
 import json
@@ -63,8 +64,10 @@ class Platform(Enum):
    WEBHOOK = "webhook"
    FEISHU = "feishu"
    WECOM = "wecom"
+    WECOM_CALLBACK = "wecom_callback"
    WEIXIN = "weixin"
    BLUEBUBBLES = "bluebubbles"
+    QQBOT = "qqbot"


@dataclass
@@ -190,7 +193,7 @@ class StreamingConfig:
    """Configuration for real-time token streaming to messaging platforms."""
    enabled: bool = False
    transport: str = "edit"       # "edit" (progressive editMessageText) or "off"
-    edit_interval: float = 0.3    # Seconds between message edits
+    edit_interval: float = 1.0    # Seconds between message edits (Telegram rate-limits at ~1/s)
    buffer_threshold: int = 40    # Chars before forcing an edit
    cursor: str = " ▉"           # Cursor shown during streaming

@@ -210,7 +213,7 @@ class StreamingConfig:
        return cls(
            enabled=data.get("enabled", False),
            transport=data.get("transport", "edit"),
-            edit_interval=float(data.get("edit_interval", 0.3)),
+            edit_interval=float(data.get("edit_interval", 1.0)),
            buffer_threshold=int(data.get("buffer_threshold", 40)),
            cursor=data.get("cursor", " ▉"),
        )
@@ -291,12 +294,20 @@ class GatewayConfig:
            # Feishu uses extra dict for app credentials
            elif platform == Platform.FEISHU and config.extra.get("app_id"):
                connected.append(platform)
-            # WeCom uses extra dict for bot credentials
+            # WeCom bot mode uses extra dict for bot credentials
            elif platform == Platform.WECOM and config.extra.get("bot_id"):
                connected.append(platform)
+            # WeCom callback mode uses corp_id or apps list
+            elif platform == Platform.WECOM_CALLBACK and (
+                config.extra.get("corp_id") or config.extra.get("apps")
+            ):
+                connected.append(platform)
            # BlueBubbles uses extra dict for local server config
            elif platform == Platform.BLUEBUBBLES and config.extra.get("server_url") and config.extra.get("password"):
                connected.append(platform)
+            # QQBot uses extra dict for app credentials
+            elif platform == Platform.QQBOT and config.extra.get("app_id") and config.extra.get("client_secret"):
+                connected.append(platform)
        return connected
    
    def get_home_channel(self, platform: Platform) -> Optional[HomeChannel]:
@@ -615,6 +626,11 @@ def load_gateway_config() -> GatewayConfig:
                    if isinstance(frc, list):
                        frc = ",".join(str(v) for v in frc)
                    os.environ["TELEGRAM_FREE_RESPONSE_CHATS"] = str(frc)
+                ignored_threads = telegram_cfg.get("ignored_threads")
+                if ignored_threads is not None and not os.getenv("TELEGRAM_IGNORED_THREADS"):
+                    if isinstance(ignored_threads, list):
+                        ignored_threads = ",".join(str(v) for v in ignored_threads)
+                    os.environ["TELEGRAM_IGNORED_THREADS"] = str(ignored_threads)
                if "reactions" in telegram_cfg and not os.getenv("TELEGRAM_REACTIONS"):
                    os.environ["TELEGRAM_REACTIONS"] = str(telegram_cfg["reactions"]).lower()

@@ -659,6 +675,37 @@ def load_gateway_config() -> GatewayConfig:
    _apply_env_overrides(config)
    
    # --- Validate loaded values ---
+    _validate_gateway_config(config)
+
+    return config
+
+
+def _is_network_accessible(host: str) -> bool:
+    """Return True if *host* would expose a server beyond the loopback interface.
+
+    Duplicates the logic in ``gateway.platforms.base.is_network_accessible``
+    without creating a circular import (base.py imports from this module).
+    """
+    try:
+        addr = ipaddress.ip_address(host)
+        if addr.is_loopback:
+            return False
+        # ::ffff:127.x.x.x — Python's is_loopback returns False for
+        # IPv4-mapped loopback; unwrap and check the underlying IPv4.
+        if getattr(addr, "ipv4_mapped", None) and addr.ipv4_mapped.is_loopback:
+            return False
+        return True
+    except ValueError:
+        # Hostname: assume it could be network-accessible.
+        return True
+
+
+def _validate_gateway_config(config: "GatewayConfig") -> None:
+    """Validate and sanitize a loaded GatewayConfig in place.
+
+    Called by ``load_gateway_config()`` after all config sources are merged.
+    Extracted as a separate function for testability.
+    """
    policy = config.default_reset_policy

    if not (0 <= policy.at_hour <= 23):
@@ -695,7 +742,47 @@ def load_gateway_config() -> GatewayConfig:
                platform.value, env_name,
            )

-    return config
+    # Reject known-weak placeholder tokens.
+    # Ported from openclaw/openclaw#64586: users who copy .env.example
+    # without changing placeholder values get a clear startup error instead
+    # of a confusing "auth failed" from the platform API.
+    try:
+        from hermes_cli.auth import has_usable_secret
+    except ImportError:
+        has_usable_secret = None  # type: ignore[assignment]
+
+    if has_usable_secret is not None:
+        for platform, pconfig in config.platforms.items():
+            if not pconfig.enabled:
+                continue
+            env_name = _token_env_names.get(platform)
+            if not env_name:
+                continue
+            token = pconfig.token
+            if token and token.strip() and not has_usable_secret(token, min_length=4):
+                logger.error(
+                    "%s is enabled but %s is set to a placeholder value ('%s'). "
+                    "Set a real bot token before starting the gateway. "
+                    "The adapter will NOT be started.",
+                    platform.value, env_name, token.strip()[:6] + "...",
+                )
+                pconfig.enabled = False
+
+    # Warn when the API server is enabled on a network-accessible address
+    # without an auth key.  The adapter will refuse to start anyway, but
+    # surfacing this at config-load time lets operators see the problem in
+    # the startup log before any platform adapter initialisation runs.
+    api_cfg = config.platforms.get(Platform.API_SERVER)
+    if api_cfg and api_cfg.enabled:
+        key = api_cfg.extra.get("key", "")
+        host = api_cfg.extra.get("host", "127.0.0.1")
+        if not key and _is_network_accessible(host):
+            logger.warning(
+                "API Server is enabled on %s but API_SERVER_KEY is not set. "
+                "The adapter will refuse to start on a network-accessible address. "
+                "Set API_SERVER_KEY or bind to 127.0.0.1 for local-only access.",
+                host,
+            )


 def _apply_env_overrides(config: GatewayConfig) -> None:
@@ -987,6 +1074,23 @@ def _apply_env_overrides(config: GatewayConfig) -> None:
                name=os.getenv("WECOM_HOME_CHANNEL_NAME", "Home"),
            )

+    # WeCom callback mode (self-built apps)
+    wecom_callback_corp_id = os.getenv("WECOM_CALLBACK_CORP_ID")
+    wecom_callback_corp_secret = os.getenv("WECOM_CALLBACK_CORP_SECRET")
+    if wecom_callback_corp_id and wecom_callback_corp_secret:
+        if Platform.WECOM_CALLBACK not in config.platforms:
+            config.platforms[Platform.WECOM_CALLBACK] = PlatformConfig()
+        config.platforms[Platform.WECOM_CALLBACK].enabled = True
+        config.platforms[Platform.WECOM_CALLBACK].extra.update({
+            "corp_id": wecom_callback_corp_id,
+            "corp_secret": wecom_callback_corp_secret,
+            "agent_id": os.getenv("WECOM_CALLBACK_AGENT_ID", ""),
+            "token": os.getenv("WECOM_CALLBACK_TOKEN", ""),
+            "encoding_aes_key": os.getenv("WECOM_CALLBACK_ENCODING_AES_KEY", ""),
+            "host": os.getenv("WECOM_CALLBACK_HOST", "0.0.0.0"),
+            "port": int(os.getenv("WECOM_CALLBACK_PORT", "8645")),
+        })
+
    # Weixin (personal WeChat via iLink Bot API)
    weixin_token = os.getenv("WEIXIN_TOKEN")
    weixin_account_id = os.getenv("WEIXIN_ACCOUNT_ID")
@@ -1017,6 +1121,9 @@ def _apply_env_overrides(config: GatewayConfig) -> None:
        weixin_group_allowed_users = os.getenv("WEIXIN_GROUP_ALLOWED_USERS", "").strip()
        if weixin_group_allowed_users:
            extra["group_allow_from"] = weixin_group_allowed_users
+        weixin_split_multiline = os.getenv("WEIXIN_SPLIT_MULTILINE_MESSAGES", "").strip()
+        if weixin_split_multiline:
+            extra["split_multiline_messages"] = weixin_split_multiline
        weixin_home = os.getenv("WEIXIN_HOME_CHANNEL", "").strip()
        if weixin_home:
            config.platforms[Platform.WEIXIN].home_channel = HomeChannel(
@@ -1048,6 +1155,32 @@ def _apply_env_overrides(config: GatewayConfig) -> None:
            name=os.getenv("BLUEBUBBLES_HOME_CHANNEL_NAME", "Home"),
        )

+    # QQ (Official Bot API v2)
+    qq_app_id = os.getenv("QQ_APP_ID")
+    qq_client_secret = os.getenv("QQ_CLIENT_SECRET")
+    if qq_app_id or qq_client_secret:
+        if Platform.QQBOT not in config.platforms:
+            config.platforms[Platform.QQBOT] = PlatformConfig()
+        config.platforms[Platform.QQBOT].enabled = True
+        extra = config.platforms[Platform.QQBOT].extra
+        if qq_app_id:
+            extra["app_id"] = qq_app_id
+        if qq_client_secret:
+            extra["client_secret"] = qq_client_secret
+        qq_allowed_users = os.getenv("QQ_ALLOWED_USERS", "").strip()
+        if qq_allowed_users:
+            extra["allow_from"] = qq_allowed_users
+        qq_group_allowed = os.getenv("QQ_GROUP_ALLOWED_USERS", "").strip()
+        if qq_group_allowed:
+            extra["group_allow_from"] = qq_group_allowed
+        qq_home = os.getenv("QQ_HOME_CHANNEL", "").strip()
+        if qq_home:
+            config.platforms[Platform.QQBOT].home_channel = HomeChannel(
+                platform=Platform.QQBOT,
+                chat_id=qq_home,
+                name=os.getenv("QQ_HOME_CHANNEL_NAME", "Home"),
+            )
+
    # Session settings
    idle_minutes = os.getenv("SESSION_IDLE_MINUTES")
    if idle_minutes:
--- a/gateway/config_validator.py
+++ b/gateway/config_validator.py
@@ -0,0 +1,224 @@
+"""
+Gateway Config Validator & Fallback Fix — #892.
+
+Validates gateway configuration and provides sensible defaults
+for missing keys to prevent fallback chain breaks.
+"""
+
+import logging
+import os
+from typing import Dict, Any, List, Optional
+from dataclasses import dataclass, field
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ConfigIssue:
+    """A configuration issue found during validation."""
+    key: str
+    severity: str  # error, warning, info
+    message: str
+    fix: str
+
+
+@dataclass
+class ConfigValidation:
+    """Result of config validation."""
+    valid: bool
+    issues: List[ConfigIssue] = field(default_factory=list)
+    warnings: int = 0
+    errors: int = 0
+
+
+# Known environment keys: default value plus the issue reported when the key is unset
+REQUIRED_KEYS = {
+    "OPENROUTER_API_KEY": {
+        "required": False,
+        "default": "",
+        "severity": "warning",
+        "message": "OPENROUTER_API_KEY not set - fallback chain may break",
+        "fix": "Set OPENROUTER_API_KEY in .env for OpenRouter provider",
+    },
+    "API_SERVER_KEY": {
+        "required": False,
+        "default": "",
+        "severity": "warning",
+        "message": "API_SERVER_KEY not configured",
+        "fix": "Set API_SERVER_KEY in .env for API server auth",
+    },
+    "GITEA_TOKEN": {
+        "required": False,
+        "default": "",
+        "severity": "info",
+        "message": "GITEA_TOKEN not set - Gitea features disabled",
+        "fix": "Set GITEA_TOKEN in .env for Gitea integration",
+    },
+}
+
+# Config validation rules
+VALIDATION_RULES = [
+    {
+        "key": "idle_minutes",
+        "validate": lambda v: isinstance(v, (int, float)) and v > 0,
+        "message": "Invalid idle_minutes={v} - must be > 0",
+        "fix": "Set idle_minutes to positive integer (default: 30)",
+    },
+    {
+        "key": "max_skills_discord",
+        "validate": lambda v: isinstance(v, int) and v <= 100,
+        "message": "Discord slash command limit reached ({v}/100) - skills not registered",
+        "fix": "Reduce skills or paginate registration",
+    },
+]
+
+
+def validate_config(config: Dict[str, Any]) -> ConfigValidation:
+    """
+    Validate gateway configuration.
+    
+    Args:
+        config: Configuration dictionary
+        
+    Returns:
+        ConfigValidation with issues found
+    """
+    issues = []
+    
+    # Check required keys
+    for key, spec in REQUIRED_KEYS.items():
+        value = config.get(key) or os.environ.get(key) or spec["default"]
+        if spec["required"] and not value:
+            issues.append(ConfigIssue(
+                key=key,
+                severity=spec["severity"],
+                message=spec["message"],
+                fix=spec["fix"],
+            ))
+        elif not value and spec["severity"] != "error":
+            issues.append(ConfigIssue(
+                key=key,
+                severity=spec["severity"],
+                message=spec["message"],
+                fix=spec["fix"],
+            ))
+    
+    # Check validation rules
+    for rule in VALIDATION_RULES:
+        value = config.get(rule["key"])
+        if value is not None:
+            if not rule["validate"](value):
+                issues.append(ConfigIssue(
+                    key=rule["key"],
+                    severity="error",
+                    message=rule["message"].format(v=value),
+                    fix=rule["fix"],
+                ))
+    
+    errors = sum(1 for i in issues if i.severity == "error")
+    warnings = sum(1 for i in issues if i.severity == "warning")
+    
+    return ConfigValidation(
+        valid=errors == 0,
+        issues=issues,
+        warnings=warnings,
+        errors=errors,
+    )
+
+
+def apply_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Apply default values for missing config keys.
+    
+    Args:
+        config: Configuration dictionary
+        
+    Returns:
+        Config with defaults applied
+    """
+    result = dict(config)
+    
+    for key, spec in REQUIRED_KEYS.items():
+        if key not in result or not result[key]:
+            default = os.environ.get(key) or spec["default"]
+            if default:
+                result[key] = default
+                logger.debug("Applied default for %s", key)
+    
+    # Apply validation defaults
+    if "idle_minutes" not in result or not result["idle_minutes"] or result["idle_minutes"] <= 0:
+        result["idle_minutes"] = 30
+        logger.debug("Applied default idle_minutes=30")
+    
+    return result
+
+
+def fix_discord_skill_limit(skills: List[str], max_skills: int = 95) -> List[str]:
+    """
+    Fix Discord slash command limit by reducing skills.
+    
+    Args:
+        skills: List of skill names
+        max_skills: Maximum skills to register (default 95, leaving room for built-ins)
+        
+    Returns:
+        The input list unchanged when within the limit; otherwise the first max_skills names in sorted order
+    """
+    if len(skills) <= max_skills:
+        return skills
+    
+    logger.warning(
+        "Discord skill limit: %d skills exceeds %d limit, truncating",
+        len(skills), max_skills
+    )
+    
+    # Keep first max_skills (alphabetical priority)
+    return sorted(skills)[:max_skills]
+
+
+def validate_provider_config(provider: str, config: Dict[str, Any]) -> ConfigIssue:
+    """
+    Validate provider-specific configuration.
+    
+    Args:
+        provider: Provider name
+        config: Provider config
+        
+    Returns:
+        ConfigIssue if invalid, None if valid
+    """
+    if provider == "local-llama.cpp":
+        # Check if llama.cpp is configured
+        if not config.get("model_path") and not config.get("base_url"):
+            return ConfigIssue(
+                key=f"provider.{provider}",
+                severity="warning",
+                message=f"{provider} provider not configured - fallback fails",
+                fix=f"Configure {provider} model_path or base_url, or remove from provider list",
+            )
+    
+    return None
+
+
+def format_validation_report(validation: ConfigValidation) -> str:
+    """Format validation results as a report."""
+    lines = [
+        "=" * 50,
+        "GATEWAY CONFIG VALIDATION",
+        "=" * 50,
+        "",
+        f"Status: {'VALID' if validation.valid else 'INVALID'}",
+        f"Errors: {validation.errors}",
+        f"Warnings: {validation.warnings}",
+        "",
+    ]
+    
+    if validation.issues:
+        lines.append("Issues:")
+        for issue in validation.issues:
+            icon = "❌" if issue.severity == "error" else "⚠️" if issue.severity == "warning" else "ℹ️"
+            lines.append(f"  {icon} [{issue.key}] {issue.message}")
+            lines.append(f"     Fix: {issue.fix}")
+            lines.append("")
+    
+    return "\n".join(lines)
--- a/gateway/delivery.py
+++ b/gateway/delivery.py
@@ -12,7 +12,7 @@ import logging
 from pathlib import Path
 from datetime import datetime
 from dataclasses import dataclass
-from typing import Dict, List, Optional, Any, Union
+from typing import Dict, List, Optional, Any

 from hermes_cli.config import get_hermes_home

--- a/gateway/display_config.py
+++ b/gateway/display_config.py
@@ -0,0 +1,194 @@
+"""Per-platform display/verbosity configuration resolver.
+
+Provides ``resolve_display_setting()`` — the single entry-point for reading
+display settings with platform-specific overrides and sensible defaults.
+
+Resolution order (first non-None wins):
+    1. ``display.platforms.<platform>.<key>``  — explicit per-platform user override
+    2. ``display.<key>``                       — global user setting
+    3. ``_PLATFORM_DEFAULTS[<platform>][<key>]``  — built-in sensible default
+    4. ``_GLOBAL_DEFAULTS[<key>]``              — built-in global default
+
+Exception: ``display.streaming`` is CLI-only.  Gateway streaming follows the
+top-level ``streaming`` config unless ``display.platforms.<platform>.streaming``
+sets an explicit per-platform override.
+
+Backward compatibility: ``display.tool_progress_overrides`` is still read as a
+fallback for ``tool_progress`` when no ``display.platforms`` entry exists.  A
+config migration (version bump) automatically moves the old format into the new
+``display.platforms`` structure.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+# ---------------------------------------------------------------------------
+# Overrideable display settings and their global defaults
+# ---------------------------------------------------------------------------
+# These are the settings that can be configured per-platform.
+# Other display settings (compact, personality, skin, etc.) are CLI-only
+# and don't participate in per-platform resolution.
+
+_GLOBAL_DEFAULTS: dict[str, Any] = {
+    "tool_progress": "all",
+    "show_reasoning": False,
+    "tool_preview_length": 0,
+    "streaming": None,  # None = follow top-level streaming config
+}
+
+# ---------------------------------------------------------------------------
+# Sensible per-platform defaults — tiered by platform capability
+# ---------------------------------------------------------------------------
+# Tier 1 (high): Supports message editing, typically personal/team use
+# Tier 2 (medium): Supports editing but often workspace/customer-facing
+# Tier 3 (low): No edit support — each progress msg is permanent
+# Tier 4 (minimal): Batch/non-interactive delivery
+
+_TIER_HIGH = {
+    "tool_progress": "all",
+    "show_reasoning": False,
+    "tool_preview_length": 40,
+    "streaming": None,  # follow global
+}
+
+_TIER_MEDIUM = {
+    "tool_progress": "new",
+    "show_reasoning": False,
+    "tool_preview_length": 40,
+    "streaming": None,
+}
+
+_TIER_LOW = {
+    "tool_progress": "off",
+    "show_reasoning": False,
+    "tool_preview_length": 40,
+    "streaming": False,
+}
+
+_TIER_MINIMAL = {
+    "tool_progress": "off",
+    "show_reasoning": False,
+    "tool_preview_length": 0,
+    "streaming": False,
+}
+
+_PLATFORM_DEFAULTS: dict[str, dict[str, Any]] = {
+    # Tier 1 — full edit support, personal/team use
+    "telegram":    _TIER_HIGH,
+    "discord":     _TIER_HIGH,
+
+    # Tier 2 — edit support, often customer/workspace channels
+    "slack":           _TIER_MEDIUM,
+    "mattermost":      _TIER_MEDIUM,
+    "matrix":          _TIER_MEDIUM,
+    "feishu":          _TIER_MEDIUM,
+
+    # Tier 3 — no edit support, progress messages are permanent
+    "signal":          _TIER_LOW,
+    "whatsapp":        _TIER_MEDIUM,  # Baileys bridge supports /edit
+    "bluebubbles":     _TIER_LOW,
+    "weixin":          _TIER_LOW,
+    "wecom":           _TIER_LOW,
+    "wecom_callback":  _TIER_LOW,
+    "dingtalk":        _TIER_LOW,
+
+    # Tier 4 — batch or non-interactive delivery
+    "email":           _TIER_MINIMAL,
+    "sms":             _TIER_MINIMAL,
+    "webhook":         _TIER_MINIMAL,
+    "homeassistant":   _TIER_MINIMAL,
+    "api_server":      {**_TIER_HIGH, "tool_preview_length": 0},
+}
+
+# Canonical set of per-platform overrideable keys (for validation).
+OVERRIDEABLE_KEYS = frozenset(_GLOBAL_DEFAULTS.keys())
+
+
+def resolve_display_setting(
+    user_config: dict,
+    platform_key: str,
+    setting: str,
+    fallback: Any = None,
+) -> Any:
+    """Resolve a display setting with per-platform override support.
+
+    Parameters
+    ----------
+    user_config : dict
+        The full parsed config.yaml dict.
+    platform_key : str
+        Platform config key (e.g. ``"telegram"``, ``"slack"``).  Use
+        ``_platform_config_key(source.platform)`` from gateway/run.py.
+    setting : str
+        Display setting name (e.g. ``"tool_progress"``, ``"show_reasoning"``).
+    fallback : Any
+        Fallback value when the setting isn't found anywhere.
+
+    Returns
+    -------
+    The resolved value, or *fallback* if nothing is configured.
+    """
+    display_cfg = user_config.get("display") or {}
+
+    # 1. Explicit per-platform override (display.platforms.<platform>.<key>)
+    platforms = display_cfg.get("platforms") or {}
+    plat_overrides = platforms.get(platform_key)
+    if isinstance(plat_overrides, dict):
+        val = plat_overrides.get(setting)
+        if val is not None:
+            return _normalise(setting, val)
+
+    # 1b. Backward compat: display.tool_progress_overrides.<platform>
+    if setting == "tool_progress":
+        legacy = display_cfg.get("tool_progress_overrides")
+        if isinstance(legacy, dict):
+            val = legacy.get(platform_key)
+            if val is not None:
+                return _normalise(setting, val)
+
+    # 2. Global user setting (display.<key>).  Skip display.streaming because
+    # that key controls only CLI terminal streaming; gateway token streaming is
+    # governed by the top-level streaming config plus per-platform overrides.
+    if setting != "streaming":
+        val = display_cfg.get(setting)
+        if val is not None:
+            return _normalise(setting, val)
+
+    # 3. Built-in platform default
+    plat_defaults = _PLATFORM_DEFAULTS.get(platform_key)
+    if plat_defaults:
+        val = plat_defaults.get(setting)
+        if val is not None:
+            return val
+
+    # 4. Built-in global default
+    val = _GLOBAL_DEFAULTS.get(setting)
+    if val is not None:
+        return val
+
+    return fallback
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _normalise(setting: str, value: Any) -> Any:
+    """Normalise YAML quirks (bare ``off`` → False in YAML 1.1)."""
+    if setting == "tool_progress":
+        if value is False:
+            return "off"
+        if value is True:
+            return "all"
+        return str(value).lower()
+    if setting in ("show_reasoning", "streaming"):
+        if isinstance(value, str):
+            return value.lower() in ("true", "1", "yes", "on")
+        return bool(value)
+    if setting == "tool_preview_length":
+        try:
+            return int(value)
+        except (TypeError, ValueError):
+            return 0
+    return value
--- a/gateway/platforms/__init__.py
+++ b/gateway/platforms/__init__.py
@@ -9,9 +9,11 @@ Each adapter handles:
 """

 from .base import BasePlatformAdapter, MessageEvent, SendResult
+from .qqbot import QQAdapter

 __all__ = [
    "BasePlatformAdapter",
    "MessageEvent",
    "SendResult",
+    "QQAdapter",
 ]
--- a/gateway/platforms/api_server.py
+++ b/gateway/platforms/api_server.py
@@ -2,6 +2,11 @@
 OpenAI-compatible API server platform adapter.

 Exposes an HTTP server with endpoints:
+- GET  /                           — Hermes Web Console operator cockpit
+- GET  /api/gui/health             — cockpit health payload
+- GET  /api/gui/browser/status     — browser runtime status
+- POST /api/gui/browser/heal       — self-healing browser cleanup
+- GET  /api/gui/discovery          — ecosystem discovery for compatible frontends
 - POST /v1/chat/completions        — OpenAI Chat Completions format (stateless; opt-in session continuity via X-Hermes-Session-Id header)
 - POST /v1/responses               — OpenAI Responses API format (stateful via previous_response_id)
 - GET  /v1/responses/{response_id} — Retrieve a stored response
@@ -10,6 +15,7 @@ Exposes an HTTP server with endpoints:
 - POST /v1/runs                    — start a run, returns run_id immediately (202)
 - GET  /v1/runs/{run_id}/events    — SSE stream of structured lifecycle events
 - GET  /health                     — health check
+- GET  /health/detailed            — rich status for cross-container dashboard probing

 Any OpenAI-compatible frontend (Open WebUI, LobeChat, LibreChat,
 AnythingLLM, NextChat, ChatBox, etc.) can connect to hermes-agent
@@ -53,6 +59,67 @@ DEFAULT_HOST = "127.0.0.1"
 DEFAULT_PORT = 8642
 MAX_STORED_RESPONSES = 100
 MAX_REQUEST_BYTES = 1_000_000  # 1 MB default limit for POST bodies
+CHAT_COMPLETIONS_SSE_KEEPALIVE_SECONDS = 30.0  # Idle seconds before an SSE ": keepalive" comment is written
+MAX_NORMALIZED_TEXT_LENGTH = 65_536  # Cap, in characters (64 Ki), for normalized content text
+MAX_CONTENT_LIST_SIZE = 1_000  # Max items when content is an array
+
+
+def _normalize_chat_content(
+    content: Any, *, _max_depth: int = 10, _depth: int = 0,
+) -> str:
+    """Normalize OpenAI chat message content into a plain text string.
+
+    Some clients (Open WebUI, LobeChat, etc.) send content as an array of
+    typed parts instead of a plain string::
+
+        [{"type": "text", "text": "hello"}, {"type": "input_text", "text": "..."}]
+
+    This function flattens those into a single string so the agent pipeline
+    (which expects strings) doesn't choke.
+
+    Text parts are joined with newlines and non-text parts are dropped.
+    Recursion depth, list size, and output length are all bounded.
+    """
+    if _depth > _max_depth:  # nesting beyond the limit contributes no text
+        return ""
+    if content is None:
+        return ""
+    if isinstance(content, str):
+        return content[:MAX_NORMALIZED_TEXT_LENGTH] if len(content) > MAX_NORMALIZED_TEXT_LENGTH else content
+
+    if isinstance(content, list):
+        parts: List[str] = []
+        items = content[:MAX_CONTENT_LIST_SIZE] if len(content) > MAX_CONTENT_LIST_SIZE else content
+        for item in items:
+            if isinstance(item, str):
+                if item:  # empty strings are skipped
+                    parts.append(item[:MAX_NORMALIZED_TEXT_LENGTH])
+            elif isinstance(item, dict):
+                item_type = str(item.get("type") or "").strip().lower()
+                if item_type in {"text", "input_text", "output_text"}:
+                    text = item.get("text", "")
+                    if text:
+                        try:
+                            parts.append(str(text)[:MAX_NORMALIZED_TEXT_LENGTH])
+                        except Exception:
+                            pass  # best-effort: a part whose str() raises is dropped
+                # Silently skip image_url / other non-text parts
+            elif isinstance(item, list):
+                nested = _normalize_chat_content(item, _max_depth=_max_depth, _depth=_depth + 1)
+                if nested:
+                    parts.append(nested)
+            # Stop early once the collected parts reach the cap (newline separators not counted)
+            if sum(len(p) for p in parts) >= MAX_NORMALIZED_TEXT_LENGTH:
+                break
+        result = "\n".join(parts)
+        return result[:MAX_NORMALIZED_TEXT_LENGTH] if len(result) > MAX_NORMALIZED_TEXT_LENGTH else result
+
+    # Fallback for any other type (int, float, bool, a bare dict, ...): use its str() form
+    try:
+        result = str(content)
+        return result[:MAX_NORMALIZED_TEXT_LENGTH] if len(result) > MAX_NORMALIZED_TEXT_LENGTH else result
+    except Exception:
+        return ""


 def check_api_server_requirements() -> bool:
@@ -453,6 +520,8 @@ class APIServerAdapter(BasePlatformAdapter):
        session_id: Optional[str] = None,
        stream_delta_callback=None,
        tool_progress_callback=None,
+        tool_start_callback=None,
+        tool_complete_callback=None,
    ) -> Any:
        """
        Create an AIAgent instance using the gateway's runtime config.
@@ -491,6 +560,8 @@ class APIServerAdapter(BasePlatformAdapter):
            platform="api_server",
            stream_delta_callback=stream_delta_callback,
            tool_progress_callback=tool_progress_callback,
+            tool_start_callback=tool_start_callback,
+            tool_complete_callback=tool_complete_callback,
            session_db=self._ensure_session_db(),
            fallback_model=fallback_model,
        )
@@ -504,6 +575,27 @@ class APIServerAdapter(BasePlatformAdapter):
        """GET /health — simple health check."""
        return web.json_response({"status": "ok", "platform": "hermes-agent"})

+    async def _handle_health_detailed(self, request: "web.Request") -> "web.Response":
+        """GET /health/detailed — rich status for cross-container dashboard probing.
+
+        Returns gateway state, connected platforms, active-agent count, exit
+        reason, last-update timestamp, and PID so the dashboard can display
+        status without a shared PID file or /proc access.  No authentication required.
+        """
+        from gateway.status import read_runtime_status
+
+        runtime = read_runtime_status() or {}  # falsy result -> empty dict so .get() below is safe
+        return web.json_response({
+            "status": "ok",
+            "platform": "hermes-agent",
+            "gateway_state": runtime.get("gateway_state"),
+            "platforms": runtime.get("platforms", {}),
+            "active_agents": runtime.get("active_agents", 0),
+            "exit_reason": runtime.get("exit_reason"),
+            "updated_at": runtime.get("updated_at"),
+            "pid": os.getpid(),  # PID of the process serving this request
+        })
+
    async def _handle_models(self, request: "web.Request") -> "web.Response":
        """GET /v1/models — return hermes-agent as an available model."""
        auth_err = self._check_auth(request)
@@ -552,7 +644,7 @@ class APIServerAdapter(BasePlatformAdapter):

        for msg in messages:
            role = msg.get("role", "")
-            content = msg.get("content", "")
+            content = _normalize_chat_content(msg.get("content", ""))
            if role == "system":
                # Accumulate system messages
                if system_prompt is None:
@@ -762,7 +854,11 @@ class APIServerAdapter(BasePlatformAdapter):
        """
        import queue as _q

-        sse_headers = {"Content-Type": "text/event-stream", "Cache-Control": "no-cache"}
+        sse_headers = {
+            "Content-Type": "text/event-stream",
+            "Cache-Control": "no-cache",
+            "X-Accel-Buffering": "no",
+        }
        # CORS middleware can't inject headers into StreamResponse after
        # prepare() flushes them, so resolve CORS headers up front.
        origin = request.headers.get("Origin", "")
@@ -775,6 +871,8 @@ class APIServerAdapter(BasePlatformAdapter):
        await response.prepare(request)

        try:
+            last_activity = time.monotonic()
+
            # Role chunk
            role_chunk = {
                "id": completion_id, "object": "chat.completion.chunk",
@@ -782,6 +880,7 @@ class APIServerAdapter(BasePlatformAdapter):
                "choices": [{"index": 0, "delta": {"role": "assistant"}, "finish_reason": None}],
            }
            await response.write(f"data: {json.dumps(role_chunk)}\n\n".encode())
+            last_activity = time.monotonic()

            # Helper — route a queue item to the correct SSE event.
            async def _emit(item):
@@ -805,6 +904,7 @@ class APIServerAdapter(BasePlatformAdapter):
                        "choices": [{"index": 0, "delta": {"content": item}, "finish_reason": None}],
                    }
                    await response.write(f"data: {json.dumps(content_chunk)}\n\n".encode())
+                return time.monotonic()

            # Stream content chunks as they arrive from the agent
            loop = asyncio.get_event_loop()
@@ -819,16 +919,19 @@ class APIServerAdapter(BasePlatformAdapter):
                                delta = stream_q.get_nowait()
                                if delta is None:
                                    break
-                                await _emit(delta)
+                                last_activity = await _emit(delta)
                            except _q.Empty:
                                break
                        break
+                    if time.monotonic() - last_activity >= CHAT_COMPLETIONS_SSE_KEEPALIVE_SECONDS:
+                        await response.write(b": keepalive\n\n")
+                        last_activity = time.monotonic()
                    continue

                if delta is None:  # End of stream sentinel
                    break

-                await _emit(delta)
+                last_activity = await _emit(delta)

            # Get usage from completed agent
            usage = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
@@ -871,6 +974,427 @@ class APIServerAdapter(BasePlatformAdapter):

        return response

+    async def _write_sse_responses(
+        self,
+        request: "web.Request",
+        response_id: str,
+        model: str,
+        created_at: int,
+        stream_q,
+        agent_task,
+        agent_ref,
+        conversation_history: List[Dict[str, str]],
+        user_message: str,
+        instructions: Optional[str],
+        conversation: Optional[str],
+        store: bool,
+        session_id: str,
+    ) -> "web.StreamResponse":
+        """Write an SSE stream for POST /v1/responses (OpenAI Responses API).
+
+        Emits spec-compliant event types as the agent runs:
+
+        - ``response.created`` — initial envelope (status=in_progress)
+        - ``response.output_text.delta`` / ``response.output_text.done`` —
+          streamed assistant text
+        - ``response.output_item.added`` / ``response.output_item.done``
+          with ``item.type == "function_call"`` — when the agent invokes a
+          tool (both events fire; the ``done`` event carries the finalized
+          ``arguments`` string)
+        - ``response.output_item.added`` / ``response.output_item.done`` with
+          ``item.type == "function_call_output"`` — tool result with
+          ``{call_id, output, status}``
+        - ``response.completed`` — terminal event carrying the full
+          response object with all output items + usage (same payload
+          shape as the non-streaming path for parity)
+        - ``response.failed`` — terminal event on agent error
+
+        If the client disconnects mid-stream, ``agent.interrupt()`` is
+        called so the agent stops issuing upstream LLM calls, then the
+        asyncio task is cancelled.  When ``store=True`` and the run succeeds,
+        the response is persisted to the ResponseStore right after
+        ``response.completed`` is written so GET /v1/responses/{id} and
+        ``previous_response_id`` chaining work like the batch path.
+        """
+        import queue as _q
+
+        sse_headers = {
+            "Content-Type": "text/event-stream",
+            "Cache-Control": "no-cache",
+            "X-Accel-Buffering": "no",
+        }
+        origin = request.headers.get("Origin", "")
+        cors = self._cors_headers_for_origin(origin) if origin else None
+        if cors:
+            sse_headers.update(cors)
+        if session_id:
+            sse_headers["X-Hermes-Session-Id"] = session_id
+        response = web.StreamResponse(status=200, headers=sse_headers)
+        await response.prepare(request)
+
+        # State accumulated during the stream
+        final_text_parts: List[str] = []
+        # Track open function_call items (matched by call_id) so we can emit
+        # the ``done`` event when the tool completes.  Order preserved.
+        pending_tool_calls: List[Dict[str, Any]] = []
+        # Output items we've emitted so far (used to build the terminal
+        # response.completed payload).  Kept in the order they appeared.
+        emitted_items: List[Dict[str, Any]] = []
+        # Monotonic counter for output_index (spec requires it).
+        output_index = 0
+        # Monotonic counter used to build a fallback call_id when the
+        # tool-start payload carries no ``tool_call_id``.
+        call_counter = 0
+        # Canonical Responses SSE events include a monotonically increasing
+        # sequence_number. Add it server-side for every emitted event so
+        # clients that validate the OpenAI event schema can parse our stream.
+        sequence_number = 0
+        # Track the assistant message item id + content index for text
+        # delta events — the spec ties deltas to a specific item.
+        message_item_id = f"msg_{uuid.uuid4().hex[:24]}"
+        message_output_index: Optional[int] = None
+        message_opened = False
+
+        async def _write_event(event_type: str, data: Dict[str, Any]) -> None:
+            nonlocal sequence_number
+            if "sequence_number" not in data:
+                data["sequence_number"] = sequence_number
+            sequence_number += 1  # advances even if the caller supplied its own value
+            payload = f"event: {event_type}\ndata: {json.dumps(data)}\n\n"
+            await response.write(payload.encode())
+
+        def _envelope(status: str) -> Dict[str, Any]:
+            env: Dict[str, Any] = {
+                "id": response_id,
+                "object": "response",
+                "status": status,
+                "created_at": created_at,
+                "model": model,
+            }
+            return env
+
+        final_response_text = ""
+        agent_error: Optional[str] = None
+        usage: Dict[str, int] = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
+
+        try:
+            # response.created — initial envelope, status=in_progress
+            created_env = _envelope("in_progress")
+            created_env["output"] = []
+            await _write_event("response.created", {
+                "type": "response.created",
+                "response": created_env,
+            })
+            last_activity = time.monotonic()
+
+            async def _open_message_item() -> None:
+                """Emit response.output_item.added for the assistant message
+                the first time any text delta arrives."""
+                nonlocal message_opened, message_output_index, output_index
+                if message_opened:
+                    return
+                message_opened = True
+                message_output_index = output_index
+                output_index += 1
+                item = {
+                    "id": message_item_id,
+                    "type": "message",
+                    "status": "in_progress",
+                    "role": "assistant",
+                    "content": [],
+                }
+                await _write_event("response.output_item.added", {
+                    "type": "response.output_item.added",
+                    "output_index": message_output_index,
+                    "item": item,
+                })
+
+            async def _emit_text_delta(delta_text: str) -> None:
+                await _open_message_item()
+                final_text_parts.append(delta_text)
+                await _write_event("response.output_text.delta", {
+                    "type": "response.output_text.delta",
+                    "item_id": message_item_id,
+                    "output_index": message_output_index,
+                    "content_index": 0,
+                    "delta": delta_text,
+                    "logprobs": [],
+                })
+
+            async def _emit_tool_started(payload: Dict[str, Any]) -> str:
+                """Emit response.output_item.added for a function_call.
+
+                Returns the call_id so the matching completion event can
+                reference it.  Prefer the real ``tool_call_id`` from the
+                agent when available; fall back to a generated call id for
+                safety in tests or older code paths.
+                """
+                nonlocal output_index, call_counter
+                call_counter += 1
+                call_id = payload.get("tool_call_id") or f"call_{response_id[5:]}_{call_counter}"
+                args = payload.get("arguments", {})
+                if isinstance(args, dict):
+                    arguments_str = json.dumps(args)
+                else:
+                    arguments_str = str(args)
+                item = {
+                    "id": f"fc_{uuid.uuid4().hex[:24]}",
+                    "type": "function_call",
+                    "status": "in_progress",
+                    "name": payload.get("name", ""),
+                    "call_id": call_id,
+                    "arguments": arguments_str,
+                }
+                idx = output_index
+                output_index += 1
+                pending_tool_calls.append({
+                    "call_id": call_id,
+                    "name": payload.get("name", ""),
+                    "arguments": arguments_str,
+                    "item_id": item["id"],
+                    "output_index": idx,
+                })
+                emitted_items.append({
+                    "type": "function_call",
+                    "name": payload.get("name", ""),
+                    "arguments": arguments_str,
+                    "call_id": call_id,
+                })
+                await _write_event("response.output_item.added", {
+                    "type": "response.output_item.added",
+                    "output_index": idx,
+                    "item": item,
+                })
+                return call_id
+
+            async def _emit_tool_completed(payload: Dict[str, Any]) -> None:
+                """Emit response.output_item.done (function_call) followed
+                by response.output_item.added (function_call_output)."""
+                nonlocal output_index
+                call_id = payload.get("tool_call_id")
+                result = payload.get("result", "")
+                pending = None
+                if call_id:
+                    for i, p in enumerate(pending_tool_calls):
+                        if p["call_id"] == call_id:
+                            pending = pending_tool_calls.pop(i)
+                            break
+                if pending is None:
+                    # Completion without a matching start — skip to avoid
+                    # emitting orphaned done events.
+                    return
+
+                # function_call done
+                done_item = {
+                    "id": pending["item_id"],
+                    "type": "function_call",
+                    "status": "completed",
+                    "name": pending["name"],
+                    "call_id": pending["call_id"],
+                    "arguments": pending["arguments"],
+                }
+                await _write_event("response.output_item.done", {
+                    "type": "response.output_item.done",
+                    "output_index": pending["output_index"],
+                    "item": done_item,
+                })
+
+                # function_call_output added (result)
+                result_str = result if isinstance(result, str) else json.dumps(result)
+                output_parts = [{"type": "input_text", "text": result_str}]
+                output_item = {
+                    "id": f"fco_{uuid.uuid4().hex[:24]}",
+                    "type": "function_call_output",
+                    "call_id": pending["call_id"],
+                    "output": output_parts,
+                    "status": "completed",
+                }
+                idx = output_index
+                output_index += 1
+                emitted_items.append({
+                    "type": "function_call_output",
+                    "call_id": pending["call_id"],
+                    "output": output_parts,
+                })
+                await _write_event("response.output_item.added", {
+                    "type": "response.output_item.added",
+                    "output_index": idx,
+                    "item": output_item,
+                })
+                await _write_event("response.output_item.done", {
+                    "type": "response.output_item.done",
+                    "output_index": idx,
+                    "item": output_item,
+                })
+
+            # Main drain loop — thread-safe queue fed by agent callbacks.
+            async def _dispatch(it) -> None:
+                """Route a queue item to the correct SSE emitter.
+
+                Plain strings are text deltas.  Tagged tuples with
+                ``__tool_started__`` / ``__tool_completed__`` prefixes
+                are tool lifecycle events.
+                """
+                if isinstance(it, tuple) and len(it) == 2 and isinstance(it[0], str):
+                    tag, payload = it
+                    if tag == "__tool_started__":
+                        await _emit_tool_started(payload)
+                    elif tag == "__tool_completed__":
+                        await _emit_tool_completed(payload)
+                    # Unknown tags are silently ignored (forward-compat).
+                elif isinstance(it, str):
+                    await _emit_text_delta(it)
+                # Other types (non-string, non-tuple) are silently dropped.
+
+            loop = asyncio.get_event_loop()
+            while True:
+                try:
+                    item = await loop.run_in_executor(None, lambda: stream_q.get(timeout=0.5))
+                except _q.Empty:
+                    if agent_task.done():
+                        # Drain remaining
+                        while True:
+                            try:
+                                item = stream_q.get_nowait()
+                                if item is None:
+                                    break
+                                await _dispatch(item)
+                                last_activity = time.monotonic()
+                            except _q.Empty:
+                                break
+                        break
+                    if time.monotonic() - last_activity >= CHAT_COMPLETIONS_SSE_KEEPALIVE_SECONDS:
+                        await response.write(b": keepalive\n\n")  # written directly, so no sequence_number is used
+                        last_activity = time.monotonic()
+                    continue
+
+                if item is None:  # EOS sentinel
+                    break
+
+                await _dispatch(item)
+                last_activity = time.monotonic()
+
+            # Pick up agent result + usage from the completed task
+            try:
+                result, agent_usage = await agent_task
+                usage = agent_usage or usage
+                # If the agent produced a final_response but no text
+                # deltas were streamed (e.g. some providers only emit
+                # the full response at the end), emit a single fallback
+                # delta so Responses clients still receive a live text part.
+                agent_final = result.get("final_response", "") if isinstance(result, dict) else ""
+                if agent_final and not final_text_parts:
+                    await _emit_text_delta(agent_final)
+                if agent_final and not final_response_text:
+                    final_response_text = agent_final
+                if isinstance(result, dict) and result.get("error") and not final_response_text:
+                    agent_error = result["error"]
+            except Exception as e:  # noqa: BLE001
+                logger.error("Error running agent for streaming responses: %s", e, exc_info=True)
+                agent_error = str(e)
+
+            # Close the message item if it was opened
+            final_response_text = "".join(final_text_parts) or final_response_text
+            if message_opened:
+                await _write_event("response.output_text.done", {
+                    "type": "response.output_text.done",
+                    "item_id": message_item_id,
+                    "output_index": message_output_index,
+                    "content_index": 0,
+                    "text": final_response_text,
+                    "logprobs": [],
+                })
+                msg_done_item = {
+                    "id": message_item_id,
+                    "type": "message",
+                    "status": "completed",
+                    "role": "assistant",
+                    "content": [
+                        {"type": "output_text", "text": final_response_text}
+                    ],
+                }
+                await _write_event("response.output_item.done", {
+                    "type": "response.output_item.done",
+                    "output_index": message_output_index,
+                    "item": msg_done_item,
+                })
+
+            # Always append a final message item in the completed
+            # response envelope so clients that only parse the terminal
+            # payload still see the assistant text.  This mirrors the
+            # shape produced by _extract_output_items in the batch path.
+            final_items: List[Dict[str, Any]] = list(emitted_items)
+            final_items.append({
+                "type": "message",
+                "role": "assistant",
+                "content": [
+                    {"type": "output_text", "text": final_response_text or (agent_error or "")}
+                ],
+            })
+
+            if agent_error:
+                failed_env = _envelope("failed")
+                failed_env["output"] = final_items
+                failed_env["error"] = {"message": agent_error, "type": "server_error"}
+                failed_env["usage"] = {
+                    "input_tokens": usage.get("input_tokens", 0),
+                    "output_tokens": usage.get("output_tokens", 0),
+                    "total_tokens": usage.get("total_tokens", 0),
+                }
+                await _write_event("response.failed", {
+                    "type": "response.failed",
+                    "response": failed_env,
+                })
+            else:
+                completed_env = _envelope("completed")
+                completed_env["output"] = final_items
+                completed_env["usage"] = {
+                    "input_tokens": usage.get("input_tokens", 0),
+                    "output_tokens": usage.get("output_tokens", 0),
+                    "total_tokens": usage.get("total_tokens", 0),
+                }
+                await _write_event("response.completed", {
+                    "type": "response.completed",
+                    "response": completed_env,
+                })
+
+                # Persist for future chaining / GET retrieval, mirroring
+                # the batch path behavior.
+                if store:
+                    full_history = list(conversation_history)
+                    full_history.append({"role": "user", "content": user_message})
+                    if isinstance(result, dict) and result.get("messages"):
+                        full_history.extend(result["messages"])
+                    else:
+                        full_history.append({"role": "assistant", "content": final_response_text})
+                    self._response_store.put(response_id, {
+                        "response": completed_env,
+                        "conversation_history": full_history,
+                        "instructions": instructions,
+                        "session_id": session_id,
+                    })
+                    if conversation:
+                        self._response_store.set_conversation(conversation, response_id)
+
+        except (ConnectionResetError, ConnectionAbortedError, BrokenPipeError, OSError):
+            # Client disconnected — interrupt the agent so it stops
+            # making upstream LLM calls, then cancel the task.
+            agent = agent_ref[0] if agent_ref else None
+            if agent is not None:
+                try:
+                    agent.interrupt("SSE client disconnected")
+                except Exception:
+                    pass  # best-effort: a failing interrupt must not block the cancel below
+            if not agent_task.done():
+                agent_task.cancel()
+                try:
+                    await agent_task
+                except (asyncio.CancelledError, Exception):
+                    pass
+            logger.info("SSE client disconnected; interrupted agent task %s", response_id)
+
+        return response
+
    async def _handle_responses(self, request: "web.Request") -> "web.Response":
        """POST /v1/responses — OpenAI Responses API format."""
        auth_err = self._check_auth(request)
@@ -914,18 +1438,7 @@ class APIServerAdapter(BasePlatformAdapter):
                    input_messages.append({"role": "user", "content": item})
                elif isinstance(item, dict):
                    role = item.get("role", "user")
-                    content = item.get("content", "")
-                    # Handle content that may be a list of content parts
-                    if isinstance(content, list):
-                        text_parts = []
-                        for part in content:
-                            if isinstance(part, dict) and part.get("type") == "input_text":
-                                text_parts.append(part.get("text", ""))
-                            elif isinstance(part, dict) and part.get("type") == "output_text":
-                                text_parts.append(part.get("text", ""))
-                            elif isinstance(part, str):
-                                text_parts.append(part)
-                        content = "\n".join(text_parts)
+                    content = _normalize_chat_content(item.get("content", ""))
                    input_messages.append({"role": role, "content": content})
        else:
            return web.json_response(_openai_error("'input' must be a string or array"), status=400)
@@ -952,11 +1465,13 @@ class APIServerAdapter(BasePlatformAdapter):
            if previous_response_id:
                logger.debug("Both conversation_history and previous_response_id provided; using conversation_history")

+        stored_session_id = None
        if not conversation_history and previous_response_id:
            stored = self._response_store.get(previous_response_id)
            if stored is None:
                return web.json_response(_openai_error(f"Previous response not found: {previous_response_id}"), status=404)
            conversation_history = list(stored.get("conversation_history", []))
+            stored_session_id = stored.get("session_id")
            # If no instructions provided, carry forward from previous
            if instructions is None:
                instructions = stored.get("instructions")
@@ -974,8 +1489,83 @@ class APIServerAdapter(BasePlatformAdapter):
        if body.get("truncation") == "auto" and len(conversation_history) > 100:
            conversation_history = conversation_history[-100:]

-        # Run the agent (with Idempotency-Key support)
-        session_id = str(uuid.uuid4())
+        # Reuse session from previous_response_id chain so the dashboard
+        # groups the entire conversation under one session entry.
+        session_id = stored_session_id or str(uuid.uuid4())
+
+        stream = bool(body.get("stream", False))
+        if stream:
+            # Streaming branch — emit OpenAI Responses SSE events as the
+            # agent runs so frontends can render text deltas and tool
+            # calls in real time.  See _write_sse_responses for details.
+            import queue as _q
+            _stream_q: _q.Queue = _q.Queue()
+
+            def _on_delta(delta):
+                # None from the agent is a CLI box-close signal, not EOS.
+                # Forwarding would kill the SSE stream prematurely; the
+                # SSE writer detects completion via agent_task.done().
+                if delta is not None:
+                    _stream_q.put(delta)
+
+            def _on_tool_progress(event_type, name, preview, args, **kwargs):
+                """Queue non-start tool progress events if needed in future.
+
+                The structured Responses stream uses ``tool_start_callback``
+                and ``tool_complete_callback`` for exact call-id correlation,
+                so progress events are currently ignored here.
+                """
+                return
+
+            def _on_tool_start(tool_call_id, function_name, function_args):
+                """Queue a started tool for live function_call streaming."""
+                _stream_q.put(("__tool_started__", {
+                    "tool_call_id": tool_call_id,
+                    "name": function_name,
+                    "arguments": function_args or {},
+                }))
+
+            def _on_tool_complete(tool_call_id, function_name, function_args, function_result):
+                """Queue a completed tool result for live function_call_output streaming."""
+                _stream_q.put(("__tool_completed__", {
+                    "tool_call_id": tool_call_id,
+                    "name": function_name,
+                    "arguments": function_args or {},
+                    "result": function_result,
+                }))
+
+            agent_ref = [None]
+            agent_task = asyncio.ensure_future(self._run_agent(
+                user_message=user_message,
+                conversation_history=conversation_history,
+                ephemeral_system_prompt=instructions,
+                session_id=session_id,
+                stream_delta_callback=_on_delta,
+                tool_progress_callback=_on_tool_progress,
+                tool_start_callback=_on_tool_start,
+                tool_complete_callback=_on_tool_complete,
+                agent_ref=agent_ref,
+            ))
+
+            response_id = f"resp_{uuid.uuid4().hex[:28]}"
+            model_name = body.get("model", self._model_name)
+            created_at = int(time.time())
+
+            return await self._write_sse_responses(
+                request=request,
+                response_id=response_id,
+                model=model_name,
+                created_at=created_at,
+                stream_q=_stream_q,
+                agent_task=agent_task,
+                agent_ref=agent_ref,
+                conversation_history=conversation_history,
+                user_message=user_message,
+                instructions=instructions,
+                conversation=conversation,
+                store=store,
+                session_id=session_id,
+            )

        async def _compute_response():
            return await self._run_agent(
@@ -1050,6 +1640,7 @@ class APIServerAdapter(BasePlatformAdapter):
                "response": response_data,
                "conversation_history": full_history,
                "instructions": instructions,
+                "session_id": session_id,
            })
            # Update conversation mapping so the next request with the same
            # conversation name automatically chains to this response
@@ -1403,6 +1994,8 @@ class APIServerAdapter(BasePlatformAdapter):
        session_id: Optional[str] = None,
        stream_delta_callback=None,
        tool_progress_callback=None,
+        tool_start_callback=None,
+        tool_complete_callback=None,
        agent_ref: Optional[list] = None,
    ) -> tuple:
        """
@@ -1424,6 +2017,8 @@ class APIServerAdapter(BasePlatformAdapter):
                session_id=session_id,
                stream_delta_callback=stream_delta_callback,
                tool_progress_callback=tool_progress_callback,
+                tool_start_callback=tool_start_callback,
+                tool_complete_callback=tool_complete_callback,
            )
            if agent_ref is not None:
                agent_ref[0] = agent
@@ -1560,10 +2155,12 @@ class APIServerAdapter(BasePlatformAdapter):
            if previous_response_id:
                logger.debug("Both conversation_history and previous_response_id provided; using conversation_history")

+        stored_session_id = None
        if not conversation_history and previous_response_id:
            stored = self._response_store.get(previous_response_id)
            if stored:
                conversation_history = list(stored.get("conversation_history", []))
+                stored_session_id = stored.get("session_id")
                if instructions is None:
                    instructions = stored.get("instructions")

@@ -1582,7 +2179,7 @@ class APIServerAdapter(BasePlatformAdapter):
                        )
                    conversation_history.append({"role": msg["role"], "content": str(content)})

-        session_id = body.get("session_id") or run_id
+        session_id = body.get("session_id") or stored_session_id or run_id
        ephemeral_system_prompt = instructions

        async def _run_and_close():
@@ -1711,6 +2308,30 @@ class APIServerAdapter(BasePlatformAdapter):
    # BasePlatformAdapter interface
    # ------------------------------------------------------------------

+    def _register_routes(self, app: "web.Application") -> None:
+        """Attach the adapter's HTTP endpoints and the web console to *app*.
+
+        Routes are registered in the order listed below: health probes, the
+        OpenAI-compatible endpoints, the jobs API, the runs/event-stream
+        endpoints, then whatever ``maybe_register_web_console`` adds.
+        """
+        from gateway.platforms.api_server_ui import maybe_register_web_console
+
+        router = app.router
+        get, post = router.add_get, router.add_post
+        patch, delete = router.add_patch, router.add_delete
+        route_table = (
+            # Health probes and OpenAI-compatible endpoints
+            (get, "/health", self._handle_health),
+            (get, "/health/detailed", self._handle_health_detailed),
+            (get, "/v1/health", self._handle_health),
+            (get, "/v1/models", self._handle_models),
+            (post, "/v1/chat/completions", self._handle_chat_completions),
+            (post, "/v1/responses", self._handle_responses),
+            (get, "/v1/responses/{response_id}", self._handle_get_response),
+            (delete, "/v1/responses/{response_id}", self._handle_delete_response),
+            # Jobs management API
+            (get, "/api/jobs", self._handle_list_jobs),
+            (post, "/api/jobs", self._handle_create_job),
+            (get, "/api/jobs/{job_id}", self._handle_get_job),
+            (patch, "/api/jobs/{job_id}", self._handle_update_job),
+            (delete, "/api/jobs/{job_id}", self._handle_delete_job),
+            (post, "/api/jobs/{job_id}/pause", self._handle_pause_job),
+            (post, "/api/jobs/{job_id}/resume", self._handle_resume_job),
+            (post, "/api/jobs/{job_id}/run", self._handle_run_job),
+            # Runs and their event stream
+            (post, "/v1/runs", self._handle_runs),
+            (get, "/v1/runs/{run_id}/events", self._handle_run_events),
+        )
+        for register, path, handler in route_table:
+            register(path, handler)
+        maybe_register_web_console(app)
+
    async def connect(self) -> bool:
        """Start the aiohttp web server."""
        if not AIOHTTP_AVAILABLE:
@@ -1721,25 +2342,7 @@ class APIServerAdapter(BasePlatformAdapter):
            mws = [mw for mw in (cors_middleware, body_limit_middleware, security_headers_middleware) if mw is not None]
            self._app = web.Application(middlewares=mws)
            self._app["api_server_adapter"] = self
-            self._app.router.add_get("/health", self._handle_health)
-            self._app.router.add_get("/v1/health", self._handle_health)
-            self._app.router.add_get("/v1/models", self._handle_models)
-            self._app.router.add_post("/v1/chat/completions", self._handle_chat_completions)
-            self._app.router.add_post("/v1/responses", self._handle_responses)
-            self._app.router.add_get("/v1/responses/{response_id}", self._handle_get_response)
-            self._app.router.add_delete("/v1/responses/{response_id}", self._handle_delete_response)
-            # Cron jobs management API
-            self._app.router.add_get("/api/jobs", self._handle_list_jobs)
-            self._app.router.add_post("/api/jobs", self._handle_create_job)
-            self._app.router.add_get("/api/jobs/{job_id}", self._handle_get_job)
-            self._app.router.add_patch("/api/jobs/{job_id}", self._handle_update_job)
-            self._app.router.add_delete("/api/jobs/{job_id}", self._handle_delete_job)
-            self._app.router.add_post("/api/jobs/{job_id}/pause", self._handle_pause_job)
-            self._app.router.add_post("/api/jobs/{job_id}/resume", self._handle_resume_job)
-            self._app.router.add_post("/api/jobs/{job_id}/run", self._handle_run_job)
-            # Structured event streaming
-            self._app.router.add_post("/v1/runs", self._handle_runs)
-            self._app.router.add_get("/v1/runs/{run_id}/events", self._handle_run_events)
+            self._register_routes(self._app)
            # Start background sweep to clean up orphaned (unconsumed) run streams
            sweep_task = asyncio.create_task(self._sweep_orphaned_runs())
            try:
@@ -1758,6 +2361,23 @@ class APIServerAdapter(BasePlatformAdapter):
                )
                return False

+            # Refuse to start network-accessible with a placeholder key.
+            # Ported from openclaw/openclaw#64586.
+            if is_network_accessible(self._host) and self._api_key:
+                try:
+                    from hermes_cli.auth import has_usable_secret
+                    if not has_usable_secret(self._api_key, min_length=8):
+                        logger.error(
+                            "[%s] Refusing to start: API_SERVER_KEY is set to a "
+                            "placeholder value. Generate a real secret "
+                            "(e.g. `openssl rand -hex 32`) and set API_SERVER_KEY "
+                            "before exposing the API server on %s.",
+                            self.name, self._host,
+                        )
+                        return False
+                except ImportError:
+                    pass
+
            # Port conflict detection — fail fast if port is already in use
            try:
                with _socket.socket(_socket.AF_INET, _socket.SOCK_STREAM) as _s:
--- a/gateway/platforms/api_server_ui.py
+++ b/gateway/platforms/api_server_ui.py
@@ -0,0 +1,194 @@
+"""Thin operator web console for the API server.
+
+This keeps the UI intentionally small: an aiohttp-mounted cockpit that
+surfaces Hermes health, browser runtime state, and ecosystem discovery
+without introducing a second heavyweight frontend architecture.
+"""
+
+from __future__ import annotations
+
+import json
+from html import escape
+from typing import Any, Dict
+
+from aiohttp import web
+
+from tools.browser_tool import browser_runtime_heal, browser_runtime_status
+
+# Chat frontend names returned under the "frontends" key by handle_discovery.
+_DISCOVERY_FRONTENDS = [
+    "Open WebUI",
+    "LobeChat",
+    "LibreChat",
+    "AnythingLLM",
+    "NextChat",
+    "ChatBox",
+]
+
+
+def _adapter(request: web.Request):
+    """Return the object stored on the request's app under ``"api_server_adapter"``."""
+    app = request.app
+    return app["api_server_adapter"]
+
+
+def _auth_or_none(request: web.Request):
+    """Run the adapter's ``_check_auth`` on *request* and return its result.
+
+    Handlers in this module treat a non-``None`` result as the response to
+    send back instead of serving the request.
+    """
+    return _adapter(request)._check_auth(request)
+
+
+def _render_console_html(adapter) -> str:
+    """Render the operator console page as one HTML string.
+
+    A snapshot of the adapter's host, port, model name and whether an API
+    key is configured is serialized to JSON, HTML-escaped and embedded in
+    the "Gateway Health" panel.  The inline script fetches
+    ``/api/gui/browser/status`` and ``/api/gui/discovery`` on page load and
+    sends the optional API key from the input box as a Bearer token on the
+    browser status/heal requests.
+
+    The template is an f-string, so literal CSS/JS braces are doubled
+    (``{{`` / ``}}``); only ``{health_json}`` is interpolated.
+    """
+    health = {
+        "platform": "api_server",
+        "host": adapter._host,
+        "port": adapter._port,
+        "model": adapter._model_name,
+        "auth_required": bool(adapter._api_key),
+    }
+    # Escape the JSON text so it is safe to place inside the <pre> element.
+    health_json = escape(json.dumps(health, indent=2, ensure_ascii=False))
+    return f'''<!doctype html>
+<html lang="en">
+<head>
+  <meta charset="utf-8">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <title>Hermes Web Console</title>
+  <style>
+    :root {{ color-scheme: dark; --bg: #0b1020; --panel: #121933; --fg: #e5ecff; --muted: #9aa8d1; --accent: #72b8ff; --good: #6dde8a; }}
+    body {{ margin: 0; font-family: ui-monospace, SFMono-Regular, Menlo, monospace; background: var(--bg); color: var(--fg); }}
+    header {{ padding: 20px 24px; border-bottom: 1px solid #243056; }}
+    main {{ padding: 24px; display: grid; gap: 16px; grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); }}
+    .panel {{ background: var(--panel); border: 1px solid #243056; border-radius: 12px; padding: 16px; box-shadow: 0 10px 30px rgba(0,0,0,.2); }}
+    h1, h2 {{ margin: 0 0 12px; }}
+    h1 {{ font-size: 24px; color: var(--accent); }}
+    h2 {{ font-size: 16px; color: var(--accent); }}
+    p, li, label {{ color: var(--muted); line-height: 1.5; }}
+    pre {{ margin: 0; white-space: pre-wrap; word-break: break-word; color: var(--fg); }}
+    button, input {{ font: inherit; }}
+    button {{ background: #1e2a52; color: var(--fg); border: 1px solid #39508f; border-radius: 8px; padding: 10px 14px; cursor: pointer; }}
+    button:hover {{ border-color: var(--accent); }}
+    input {{ width: 100%; box-sizing: border-box; background: #0d142a; color: var(--fg); border: 1px solid #243056; border-radius: 8px; padding: 10px 12px; margin-bottom: 12px; }}
+    .row {{ display: flex; gap: 8px; flex-wrap: wrap; margin-bottom: 12px; }}
+    .badge {{ display: inline-block; color: var(--good); border: 1px solid #2f6940; border-radius: 999px; padding: 2px 10px; margin-left: 10px; font-size: 12px; }}
+    ul {{ margin: 0; padding-left: 18px; }}
+    code {{ color: var(--good); }}
+  </style>
+</head>
+<body>
+  <header>
+    <h1>Hermes Web Console <span class="badge">operator cockpit</span></h1>
+    <p>Thin web UI over the existing API server, browser runtime, and streaming endpoints.</p>
+  </header>
+  <main>
+    <section class="panel">
+      <h2>Gateway Health</h2>
+      <pre id="health">{health_json}</pre>
+    </section>
+    <section class="panel">
+      <h2>Browser Cockpit</h2>
+      <label for="apiKey">Optional API key (only needed when API_SERVER_KEY is configured)</label>
+      <input id="apiKey" type="password" placeholder="sk-... or bearer token">
+      <div class="row">
+        <button id="refreshBtn">Refresh Browser Status</button>
+        <button id="healBtn">Heal Browser Layer</button>
+      </div>
+      <pre id="browserStatus">Loading...</pre>
+    </section>
+    <section class="panel">
+      <h2>Ecosystem Discovery</h2>
+      <ul>
+        <li><code>GET /v1/models</code> — OpenAI-compatible model discovery</li>
+        <li><code>POST /v1/chat/completions</code> — chat frontend compatibility</li>
+        <li><code>POST /v1/responses</code> — stateful responses API</li>
+        <li><code>POST /v1/runs</code> + <code>GET /v1/runs/{{run_id}}/events</code> — SSE lifecycle stream</li>
+        <li><code>GET /api/gui/browser/status</code> — browser runtime status</li>
+        <li><code>POST /api/gui/browser/heal</code> — cleanup + orphan reaper</li>
+      </ul>
+      <pre id="discovery">Loading...</pre>
+    </section>
+  </main>
+  <script>
+    function authHeaders() {{
+      const key = document.getElementById('apiKey').value.trim();
+      return key ? {{ 'Authorization': 'Bearer ' + key }} : {{}};
+    }}
+    async function loadJson(path, options) {{
+      const response = await fetch(path, options);
+      const text = await response.text();
+      try {{ return {{ status: response.status, body: JSON.parse(text) }}; }}
+      catch (_) {{ return {{ status: response.status, body: {{ raw: text }} }}; }}
+    }}
+    async function refreshBrowser() {{
+      const result = await loadJson('/api/gui/browser/status', {{ headers: authHeaders() }});
+      document.getElementById('browserStatus').textContent = JSON.stringify(result, null, 2);
+    }}
+    async function healBrowser() {{
+      const result = await loadJson('/api/gui/browser/heal', {{ method: 'POST', headers: authHeaders() }});
+      document.getElementById('browserStatus').textContent = JSON.stringify(result, null, 2);
+    }}
+    async function loadDiscovery() {{
+      const result = await loadJson('/api/gui/discovery');
+      document.getElementById('discovery').textContent = JSON.stringify(result, null, 2);
+    }}
+    document.getElementById('refreshBtn').addEventListener('click', refreshBrowser);
+    document.getElementById('healBtn').addEventListener('click', healBrowser);
+    refreshBrowser();
+    loadDiscovery();
+  </script>
+</body>
+</html>'''
+
+
+async def handle_web_console_index(request: web.Request) -> web.Response:
+    """Serve the console page rendered for this app's adapter as ``text/html``."""
+    page = _render_console_html(_adapter(request))
+    return web.Response(text=page, content_type="text/html")
+
+
+async def handle_gui_health(request: web.Request) -> web.Response:
+    """Return a JSON summary of the adapter: status, host, port, model, auth flag."""
+    adapter = _adapter(request)
+    payload = dict(
+        status="ok",
+        platform="api_server",
+        host=adapter._host,
+        port=adapter._port,
+        model=adapter._model_name,
+        auth_required=bool(adapter._api_key),
+    )
+    return web.json_response(payload)
+
+
+async def handle_browser_status(request: web.Request) -> web.Response:
+    """Return ``browser_runtime_status()`` as JSON, or the auth failure response."""
+    rejection = _auth_or_none(request)
+    if rejection is None:
+        return web.json_response(browser_runtime_status())
+    return rejection
+
+
+async def handle_browser_heal(request: web.Request) -> web.Response:
+    """Return ``browser_runtime_heal()`` as JSON, or the auth failure response."""
+    rejection = _auth_or_none(request)
+    if rejection is None:
+        return web.json_response(browser_runtime_heal())
+    return rejection
+
+
+async def handle_discovery(request: web.Request) -> web.Response:
+    """Return a JSON map of the frontend list, console routes and OpenAI-style routes."""
+    model_name = _adapter(request)._model_name
+    cockpit_routes = {
+        "root": "/",
+        "health": "/api/gui/health",
+        "browser_status": "/api/gui/browser/status",
+        "browser_heal": "/api/gui/browser/heal",
+    }
+    openai_routes = {
+        "models": "/v1/models",
+        "chat_completions": "/v1/chat/completions",
+        "responses": "/v1/responses",
+        "runs": "/v1/runs",
+        "run_events": "/v1/runs/{run_id}/events",
+        "model_name": model_name,
+    }
+    return web.json_response({
+        "frontends": _DISCOVERY_FRONTENDS,
+        "operator_cockpit": cockpit_routes,
+        "openai_compatible": openai_routes,
+    })
+
+
+def maybe_register_web_console(app: web.Application) -> None:
+    """Add the console page and its ``/api/gui/*`` JSON endpoints to *app*.
+
+    Despite the name, registration is unconditional.
+    """
+    router = app.router
+    leading_get_routes = (
+        ("/", handle_web_console_index),
+        ("/api/gui/health", handle_gui_health),
+        ("/api/gui/browser/status", handle_browser_status),
+    )
+    for path, handler in leading_get_routes:
+        router.add_get(path, handler)
+    router.add_post("/api/gui/browser/heal", handle_browser_heal)
+    router.add_get("/api/gui/discovery", handle_discovery)
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@@ -21,6 +21,59 @@ from urllib.parse import urlsplit
 logger = logging.getLogger(__name__)


+def utf16_len(s: str) -> int:
+    """Count UTF-16 code units in *s*.
+
+    Telegram's message-length limit (4 096) is measured in UTF-16 code units,
+    **not** Unicode code-points.  Characters outside the Basic Multilingual
+    Plane (emoji like 😀, CJK Extension B, musical symbols, …) are encoded as
+    surrogate pairs and therefore consume **two** UTF-16 code units each, even
+    though Python's ``len()`` counts them as one.
+
+    A Python ``str`` may also hold a lone surrogate code point; it is counted
+    as one code unit rather than raising ``UnicodeEncodeError``.
+
+    Ported from nearai/ironclaw#2304 which discovered the same discrepancy in
+    Rust's ``chars().count()``.
+    """
+    # "surrogatepass" lets unpaired surrogates through the codec (2 bytes each)
+    # instead of failing the whole length computation.
+    return len(s.encode("utf-16-le", errors="surrogatepass")) // 2
+
+
+def _prefix_within_utf16_limit(s: str, limit: int) -> str:
+    """Return the longest prefix of *s* measuring at most *limit* UTF-16 units.
+
+    A plain ``s[:limit]`` counts code points, so a string with characters
+    outside the BMP could still exceed the limit; here the cut position is
+    chosen by measuring candidate prefixes with ``utf16_len``.  The search
+    itself is the one implemented by ``_custom_unit_to_cp``.
+    """
+    cut = _custom_unit_to_cp(s, limit, utf16_len)
+    return s[:cut]
+
+
+def _custom_unit_to_cp(s: str, budget: int, len_fn) -> int:
+    """Find the largest code-point offset *n* with ``len_fn(s[:n]) <= budget``.
+
+    :meth:`BasePlatformAdapter.truncate_message` uses this to turn a budget
+    expressed in *len_fn* units (e.g. UTF-16 code units) into a slice
+    position.  The answer is found by bisection over prefix lengths, so
+    *len_fn* is called O(log n) times.
+    """
+    total = len(s)
+    if len_fn(s) <= budget:
+        return total
+    low, high = 0, total
+    while high > low:
+        # Upper midpoint, so the loop always makes progress when low is kept.
+        probe = high - (high - low) // 2
+        if len_fn(s[:probe]) > budget:
+            high = probe - 1
+        else:
+            low = probe
+    return low
+
+
 def is_network_accessible(host: str) -> bool:
    """Return True if *host* would expose the server beyond loopback.

@@ -823,7 +876,36 @@ class BasePlatformAdapter(ABC):
        result = handler(self)
        if asyncio.iscoroutine(result):
            await result
-    
+
+    def _acquire_platform_lock(self, scope: str, identity: str, resource_desc: str) -> bool:
+        """Try to take the scoped lock ``(scope, identity)`` for this adapter.
+
+        The scope and identity are stored on the instance before the attempt;
+        ``_release_platform_lock`` reads them back.  When the lock is not
+        obtained, the failure is logged and recorded as a non-retryable
+        fatal error.
+
+        Returns:
+            True if the lock was acquired, False otherwise.
+        """
+        from gateway.status import acquire_scoped_lock
+
+        self._platform_lock_scope, self._platform_lock_identity = scope, identity
+        lock_metadata = {'platform': self.platform.value}
+        acquired, existing = acquire_scoped_lock(scope, identity, metadata=lock_metadata)
+        if not acquired:
+            # The holder's PID is only mentioned when the lock record exposes one.
+            holder_pid = existing.get('pid') if isinstance(existing, dict) else None
+            pid_note = f' (PID {holder_pid})' if holder_pid else ''
+            message = f'{resource_desc} already in use{pid_note}. Stop the other gateway first.'
+            logger.error('[%s] %s', self.name, message)
+            self._set_fatal_error(f'{scope}_lock', message, retryable=False)
+            return False
+        return True
+
+    def _release_platform_lock(self) -> None:
+        """Release the lock remembered by ``_acquire_platform_lock``, if any.
+
+        Does nothing when no identity is stored; otherwise releases the lock
+        and clears the stored identity.
+        """
+        held_identity = getattr(self, '_platform_lock_identity', None)
+        if held_identity:
+            from gateway.status import release_scoped_lock
+            release_scoped_lock(self._platform_lock_scope, held_identity)
+            self._platform_lock_identity = None
+
    @property
    def name(self) -> str:
        """Human-readable name for this adapter."""
@@ -1542,6 +1624,21 @@ class BasePlatformAdapter(ABC):
            # streaming already delivered the text (already_sent=True) or
            # when the message was queued behind an active agent.  Log at
            # DEBUG to avoid noisy warnings for expected behavior.
+            #
+            # Suppress stale response when the session was interrupted by a
+            # new message that hasn't been consumed yet.  The pending message
+            # is processed by the pending-message handler below (#8221/#2483).
+            if (
+                response
+                and interrupt_event.is_set()
+                and session_key in self._pending_messages
+            ):
+                logger.info(
+                    "[%s] Suppressing stale response for interrupted session %s",
+                    self.name,
+                    session_key,
+                )
+                response = None
            if not response:
                logger.debug("[%s] Handler returned empty/None response for %s", self.name, event.source.chat_id)
            if response:
@@ -1857,7 +1954,11 @@ class BasePlatformAdapter(ABC):
        return content
    
    @staticmethod
-    def truncate_message(content: str, max_length: int = 4096) -> List[str]:
+    def truncate_message(
+        content: str,
+        max_length: int = 4096,
+        len_fn: Optional["Callable[[str], int]"] = None,
+    ) -> List[str]:
        """
        Split a long message into chunks, preserving code block boundaries.

@@ -1869,11 +1970,16 @@ class BasePlatformAdapter(ABC):
        Args:
            content: The full message content
            max_length: Maximum length per chunk (platform-specific)
+            len_fn: Optional length function for measuring string length.
+                     Defaults to ``len`` (Unicode code-points).  Pass
+                     ``utf16_len`` for platforms that measure message
+                     length in UTF-16 code units (e.g. Telegram).

        Returns:
            List of message chunks
        """
-        if len(content) <= max_length:
+        _len = len_fn or len
+        if _len(content) <= max_length:
            return [content]

        INDICATOR_RESERVE = 10   # room for " (XX/XX)"
@@ -1892,22 +1998,33 @@ class BasePlatformAdapter(ABC):

            # How much body text we can fit after accounting for the prefix,
            # a potential closing fence, and the chunk indicator.
-            headroom = max_length - INDICATOR_RESERVE - len(prefix) - len(FENCE_CLOSE)
+            headroom = max_length - INDICATOR_RESERVE - _len(prefix) - _len(FENCE_CLOSE)
            if headroom < 1:
                headroom = max_length // 2

            # Everything remaining fits in one final chunk
-            if len(prefix) + len(remaining) <= max_length - INDICATOR_RESERVE:
+            if _len(prefix) + _len(remaining) <= max_length - INDICATOR_RESERVE:
                chunks.append(prefix + remaining)
                break

-            # Find a natural split point (prefer newlines, then spaces)
-            region = remaining[:headroom]
+            # Find a natural split point (prefer newlines, then spaces).
+            # When _len != len (e.g. utf16_len for Telegram), headroom is
+            # measured in the custom unit.  We need codepoint-based slice
+            # positions that stay within the custom-unit budget.
+            #
+            # _custom_unit_to_cp() maps a custom-unit budget to the largest
+            # codepoint offset whose custom length ≤ budget.
+            if _len is not len:
+                # Map headroom (custom units) → codepoint slice length
+                _cp_limit = _custom_unit_to_cp(remaining, headroom, _len)
+            else:
+                _cp_limit = headroom
+            region = remaining[:_cp_limit]
            split_at = region.rfind("\n")
-            if split_at < headroom // 2:
+            if split_at < _cp_limit // 2:
                split_at = region.rfind(" ")
            if split_at < 1:
-                split_at = headroom
+                split_at = _cp_limit

            # Avoid splitting inside an inline code span (`...`).
            # If the text before split_at has an odd number of unescaped
@@ -1927,7 +2044,7 @@ class BasePlatformAdapter(ABC):
                    safe_split = candidate.rfind(" ", 0, last_bt)
                    nl_split = candidate.rfind("\n", 0, last_bt)
                    safe_split = max(safe_split, nl_split)
-                    if safe_split > headroom // 4:
+                    if safe_split > _cp_limit // 4:
                        split_at = safe_split

            chunk_body = remaining[:split_at]
--- a/Show More
+++ b/Show More