Compare commits
1 Commits
fix/876
...
claude/iss
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
92c65f7cec |
@@ -5,7 +5,6 @@
|
||||
|
||||
# Dependencies
|
||||
node_modules
|
||||
.venv
|
||||
|
||||
# CI/CD
|
||||
.github
|
||||
|
||||
22
.env.example
22
.env.example
@@ -43,15 +43,6 @@
|
||||
# KIMI_BASE_URL=https://api.kimi.com/coding/v1 # Default for sk-kimi- keys
|
||||
# KIMI_BASE_URL=https://api.moonshot.ai/v1 # For legacy Moonshot keys
|
||||
# KIMI_BASE_URL=https://api.moonshot.cn/v1 # For Moonshot China keys
|
||||
# KIMI_CN_API_KEY= # Dedicated Moonshot China key
|
||||
|
||||
# =============================================================================
|
||||
# LLM PROVIDER (Arcee AI)
|
||||
# =============================================================================
|
||||
# Arcee AI provides access to Trinity models (trinity-mini, trinity-large-*)
|
||||
# Get an Arcee key at: https://chat.arcee.ai/
|
||||
# ARCEEAI_API_KEY=
|
||||
# ARCEE_BASE_URL= # Override default base URL
|
||||
|
||||
# =============================================================================
|
||||
# LLM PROVIDER (MiniMax)
|
||||
@@ -98,15 +89,6 @@
|
||||
# Optional base URL override:
|
||||
# HERMES_QWEN_BASE_URL=https://portal.qwen.ai/v1
|
||||
|
||||
# =============================================================================
|
||||
# LLM PROVIDER (Xiaomi MiMo)
|
||||
# =============================================================================
|
||||
# Xiaomi MiMo models (mimo-v2-pro, mimo-v2-omni, mimo-v2-flash).
|
||||
# Get your key at: https://platform.xiaomimimo.com
|
||||
# XIAOMI_API_KEY=your_key_here
|
||||
# Optional base URL override:
|
||||
# XIAOMI_BASE_URL=https://api.xiaomimimo.com/v1
|
||||
|
||||
# =============================================================================
|
||||
# TOOL API KEYS
|
||||
# =============================================================================
|
||||
@@ -145,10 +127,6 @@
|
||||
# Only override here if you need to force a backend without touching config.yaml:
|
||||
# TERMINAL_ENV=local
|
||||
|
||||
# Override the container runtime binary (e.g. to use Podman instead of Docker).
|
||||
# Useful on systems where Docker's storage driver is broken or unavailable.
|
||||
# HERMES_DOCKER_BINARY=/usr/local/bin/podman
|
||||
|
||||
# Container images (for singularity/docker/modal backends)
|
||||
# TERMINAL_DOCKER_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
|
||||
# TERMINAL_SINGULARITY_IMAGE=docker://nikolaik/python-nodejs:python3.11-nodejs20
|
||||
|
||||
2
.gitattributes
vendored
2
.gitattributes
vendored
@@ -1,2 +0,0 @@
|
||||
# Auto-generated files — collapse diffs and exclude from language stats
|
||||
web/package-lock.json linguist-generated=true
|
||||
@@ -1,28 +0,0 @@
|
||||
name: Lint
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 5
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Check for hardcoded paths
|
||||
run: python3 scripts/lint_hardcoded_paths.py
|
||||
continue-on-error: true
|
||||
|
||||
- name: Check Python syntax
|
||||
run: |
|
||||
find . -name "*.py" -not -path "./.git/*" -not -path "./node_modules/*" | head -100 | xargs python3 -m py_compile || true
|
||||
@@ -1,78 +0,0 @@
|
||||
#!/usr/bin/env python3
"""Pre-commit hook that rejects hardcoded home-directory paths.

Install:
    cp pre-commit-hardcoded-path.py .git/hooks/pre-commit-hardcoded-path
    chmod +x .git/hooks/pre-commit-hardcoded-path

Or add to .pre-commit-config.yaml
"""

import re
import subprocess
import sys

# (regex, human-readable description) pairs; a source line matching any of
# these is flagged as containing a hardcoded home-directory path.
PATTERNS = [
    (r"/Users/[\w.\-]+/", "macOS home directory"),
    (r"/home/[\w.\-]+/", "Linux home directory"),
    (r"(?<![\w/])~/", "unexpanded tilde"),
]

# Inline escape hatch: a line carrying this marker is never flagged.
NOQA = re.compile(r"#\s*noqa:?\s*hardcoded-path-ok")
|
||||
|
||||
def get_staged_files():
    """Return staged Python files (added/copied/modified) in this repository.

    Best-effort: if git produces no output (e.g. nothing staged), the
    `.py` filter drops the resulting empty string and we return [].
    """
    proc = subprocess.run(
        ["git", "diff", "--cached", "--name-only", "--diff-filter=ACM"],
        capture_output=True,
        text=True,
    )
    staged = []
    for name in proc.stdout.strip().split("\n"):
        if name.endswith(".py"):
            staged.append(name)
    return staged
|
||||
|
||||
def check_file(filepath):
    """Scan the staged copy of *filepath* for hardcoded home-directory paths.

    Returns a list of (filepath, line_no, stripped_line, description)
    tuples, one per offending line (first matching pattern wins).
    """
    # Read the staged blob (":path"), not the working tree, so we check
    # exactly what is about to be committed.
    try:
        shown = subprocess.run(
            ["git", "show", f":{filepath}"],
            capture_output=True, text=True
        )
    except Exception:
        # Best-effort hook: an unreadable file is simply skipped.
        return []
    text = shown.stdout

    found = []
    for lineno, raw in enumerate(text.split("\n"), 1):
        stripped = raw.strip()
        # Comments and import lines are exempt, as are noqa-marked lines.
        if stripped.startswith("#") or stripped.startswith(("import ", "from ")):
            continue
        if NOQA.search(raw):
            continue
        for pattern, desc in PATTERNS:
            if re.search(pattern, raw):
                found.append((filepath, lineno, stripped, desc))
                break
    return found
|
||||
|
||||
def main():
    """Entry point: exit 1 if any staged .py file has a hardcoded path, else 0."""
    staged = get_staged_files()
    if not staged:
        sys.exit(0)

    violations = []
    for path in staged:
        violations.extend(check_file(path))

    if not violations:
        sys.exit(0)

    # Report every hit, then point at the two remedies (fix or noqa).
    print("ERROR: Hardcoded home directory paths detected:")
    print()
    for filepath, line_no, line, desc in violations:
        print(f" {filepath}:{line_no}: {desc}")
        print(f" {line[:100]}")
    print()
    print("Fix: Use $HOME, relative paths, or get_hermes_home().")
    print("Override: Add '# noqa: hardcoded-path-ok' to the line.")
    sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
30
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
30
.github/ISSUE_TEMPLATE/bug_report.yml
vendored
@@ -11,7 +11,6 @@ body:
|
||||
**Before submitting**, please:
|
||||
- [ ] Search [existing issues](https://github.com/NousResearch/hermes-agent/issues) to avoid duplicates
|
||||
- [ ] Update to the latest version (`hermes update`) and confirm the bug still exists
|
||||
- [ ] Run `hermes debug share` and paste the links below (see Debug Report section)
|
||||
|
||||
- type: textarea
|
||||
id: description
|
||||
@@ -83,25 +82,6 @@ body:
|
||||
- Slack
|
||||
- WhatsApp
|
||||
|
||||
- type: textarea
|
||||
id: debug-report
|
||||
attributes:
|
||||
label: Debug Report
|
||||
description: |
|
||||
Run `hermes debug share` from your terminal and paste the links it prints here.
|
||||
This uploads your system info, config, and recent logs to a paste service automatically.
|
||||
|
||||
If you're in an interactive chat session, you can also use the `/debug` slash command — it does the same thing.
|
||||
|
||||
If the upload fails, run `hermes debug share --local` and paste the output directly.
|
||||
placeholder: |
|
||||
Report https://paste.rs/abc123
|
||||
agent.log https://paste.rs/def456
|
||||
gateway.log https://paste.rs/ghi789
|
||||
render: shell
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: os
|
||||
attributes:
|
||||
@@ -117,6 +97,8 @@ body:
|
||||
label: Python Version
|
||||
description: Output of `python --version`
|
||||
placeholder: "3.11.9"
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: input
|
||||
id: hermes-version
|
||||
@@ -124,14 +106,14 @@ body:
|
||||
label: Hermes Version
|
||||
description: Output of `hermes version`
|
||||
placeholder: "2.1.0"
|
||||
validations:
|
||||
required: true
|
||||
|
||||
- type: textarea
|
||||
id: logs
|
||||
attributes:
|
||||
label: Additional Logs / Traceback (optional)
|
||||
description: |
|
||||
The debug report above covers most logs. Use this field for any extra error output,
|
||||
tracebacks, or screenshots not captured by `hermes debug share`.
|
||||
label: Relevant Logs / Traceback
|
||||
description: Paste any error output, traceback, or log messages. This will be auto-formatted as code.
|
||||
render: shell
|
||||
|
||||
- type: textarea
|
||||
|
||||
12
.github/ISSUE_TEMPLATE/feature_request.yml
vendored
12
.github/ISSUE_TEMPLATE/feature_request.yml
vendored
@@ -71,15 +71,3 @@ body:
|
||||
label: Contribution
|
||||
options:
|
||||
- label: I'd like to implement this myself and submit a PR
|
||||
|
||||
- type: textarea
|
||||
id: debug-report
|
||||
attributes:
|
||||
label: Debug Report (optional)
|
||||
description: |
|
||||
If this feature request is related to a problem you're experiencing, run `hermes debug share` and paste the links here.
|
||||
In an interactive chat session, you can use `/debug` instead.
|
||||
This helps us understand your environment and any related logs.
|
||||
placeholder: |
|
||||
Report https://paste.rs/abc123
|
||||
render: shell
|
||||
|
||||
20
.github/ISSUE_TEMPLATE/setup_help.yml
vendored
20
.github/ISSUE_TEMPLATE/setup_help.yml
vendored
@@ -9,8 +9,7 @@ body:
|
||||
Sorry you're having trouble! Please fill out the details below so we can help.
|
||||
|
||||
**Quick checks first:**
|
||||
- Run `hermes debug share` and paste the links in the Debug Report section below
|
||||
- If you're in a chat session, you can use `/debug` instead — it does the same thing
|
||||
- Run `hermes doctor` and include the output below
|
||||
- Try `hermes update` to get the latest version
|
||||
- Check the [README troubleshooting section](https://github.com/NousResearch/hermes-agent#troubleshooting)
|
||||
- For general questions, consider the [Nous Research Discord](https://discord.gg/NousResearch) for faster help
|
||||
@@ -75,21 +74,10 @@ body:
|
||||
placeholder: "2.1.0"
|
||||
|
||||
- type: textarea
|
||||
id: debug-report
|
||||
id: doctor-output
|
||||
attributes:
|
||||
label: Debug Report
|
||||
description: |
|
||||
Run `hermes debug share` from your terminal and paste the links it prints here.
|
||||
This uploads your system info, config, and recent logs to a paste service automatically.
|
||||
|
||||
If you're in an interactive chat session, you can also use the `/debug` slash command — it does the same thing.
|
||||
|
||||
If the upload fails or install didn't get that far, run `hermes debug share --local` and paste the output directly.
|
||||
If even that doesn't work, run `hermes doctor` and paste that output instead.
|
||||
placeholder: |
|
||||
Report https://paste.rs/abc123
|
||||
agent.log https://paste.rs/def456
|
||||
gateway.log https://paste.rs/ghi789
|
||||
label: Output of `hermes doctor`
|
||||
description: Run `hermes doctor` and paste the full output. This will be auto-formatted.
|
||||
render: shell
|
||||
|
||||
- type: textarea
|
||||
|
||||
73
.github/workflows/contributor-check.yml
vendored
73
.github/workflows/contributor-check.yml
vendored
@@ -1,73 +0,0 @@
|
||||
name: Contributor Attribution Check
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: [main]
|
||||
paths:
|
||||
# Only run when code files change (not docs-only PRs)
|
||||
- '*.py'
|
||||
- '**/*.py'
|
||||
- '.github/workflows/contributor-check.yml'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
check-attribution:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||
with:
|
||||
fetch-depth: 0 # Full history needed for git log
|
||||
|
||||
- name: Check for unmapped contributor emails
|
||||
run: |
|
||||
# Get the merge base between this PR and main
|
||||
MERGE_BASE=$(git merge-base origin/main HEAD)
|
||||
|
||||
# Find any new author emails in this PR's commits
|
||||
NEW_EMAILS=$(git log ${MERGE_BASE}..HEAD --format='%ae' --no-merges | sort -u)
|
||||
|
||||
if [ -z "$NEW_EMAILS" ]; then
|
||||
echo "No new commits to check."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Check each email against AUTHOR_MAP in release.py
|
||||
MISSING=""
|
||||
while IFS= read -r email; do
|
||||
# Skip teknium and bot emails
|
||||
case "$email" in
|
||||
*teknium*|*noreply@github.com*|*dependabot*|*github-actions*|*anthropic.com*|*cursor.com*)
|
||||
continue ;;
|
||||
esac
|
||||
|
||||
# Check if email is in AUTHOR_MAP (either as a key or matches noreply pattern)
|
||||
if echo "$email" | grep -qP '\+.*@users\.noreply\.github\.com'; then
|
||||
continue # GitHub noreply emails auto-resolve
|
||||
fi
|
||||
|
||||
if ! grep -qF "\"${email}\"" scripts/release.py 2>/dev/null; then
|
||||
AUTHOR=$(git log --author="$email" --format='%an' -1)
|
||||
MISSING="${MISSING}\n ${email} (${AUTHOR})"
|
||||
fi
|
||||
done <<< "$NEW_EMAILS"
|
||||
|
||||
if [ -n "$MISSING" ]; then
|
||||
echo ""
|
||||
echo "⚠️ New contributor email(s) not in AUTHOR_MAP:"
|
||||
echo -e "$MISSING"
|
||||
echo ""
|
||||
echo "Please add mappings to scripts/release.py AUTHOR_MAP:"
|
||||
echo -e "$MISSING" | while read -r line; do
|
||||
email=$(echo "$line" | sed 's/^ *//' | cut -d' ' -f1)
|
||||
[ -z "$email" ] && continue
|
||||
echo " \"${email}\": \"<github-username>\","
|
||||
done
|
||||
echo ""
|
||||
echo "To find the GitHub username for an email:"
|
||||
echo " gh api 'search/users?q=EMAIL+in:email' --jq '.items[0].login'"
|
||||
exit 1
|
||||
else
|
||||
echo "✅ All contributor emails are mapped in AUTHOR_MAP."
|
||||
fi
|
||||
20
.github/workflows/deploy-site.yml
vendored
20
.github/workflows/deploy-site.yml
vendored
@@ -28,32 +28,24 @@ jobs:
|
||||
name: github-pages
|
||||
url: ${{ steps.deploy.outputs.page_url }}
|
||||
steps:
|
||||
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
cache: npm
|
||||
cache-dependency-path: website/package-lock.json
|
||||
|
||||
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Install PyYAML for skill extraction
|
||||
run: pip install pyyaml==6.0.2 httpx==0.28.1
|
||||
run: pip install pyyaml
|
||||
|
||||
- name: Extract skill metadata for dashboard
|
||||
run: python3 website/scripts/extract-skills.py
|
||||
|
||||
- name: Build skills index (if not already present)
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
if [ ! -f website/static/api/skills-index.json ]; then
|
||||
python3 scripts/build_skills_index.py || echo "Skills index build failed (non-fatal)"
|
||||
fi
|
||||
|
||||
- name: Install dependencies
|
||||
run: npm ci
|
||||
working-directory: website
|
||||
@@ -73,10 +65,10 @@ jobs:
|
||||
echo "hermes-agent.nousresearch.com" > _site/CNAME
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # v3
|
||||
uses: actions/upload-pages-artifact@v3
|
||||
with:
|
||||
path: _site
|
||||
|
||||
- name: Deploy to GitHub Pages
|
||||
id: deploy
|
||||
uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4
|
||||
uses: actions/deploy-pages@v4
|
||||
|
||||
41
.github/workflows/docker-publish.yml
vendored
41
.github/workflows/docker-publish.yml
vendored
@@ -8,9 +8,6 @@ on:
|
||||
release:
|
||||
types: [published]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
group: docker-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
@@ -20,29 +17,22 @@ jobs:
|
||||
# Only run on the upstream repository, not on forks
|
||||
if: github.repository == 'NousResearch/hermes-agent'
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 60
|
||||
timeout-minutes: 30
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
# Build amd64 only so we can `load` the image for smoke testing.
|
||||
# `load: true` cannot export a multi-arch manifest to the local daemon.
|
||||
# The multi-arch build follows on push to main / release.
|
||||
- name: Build image (amd64, smoke test)
|
||||
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
|
||||
- name: Build image
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
file: Dockerfile
|
||||
load: true
|
||||
platforms: linux/amd64
|
||||
tags: nousresearch/hermes-agent:test
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
@@ -56,31 +46,34 @@ jobs:
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
- name: Push multi-arch image (main branch)
|
||||
- name: Push image (main branch)
|
||||
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
||||
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
file: Dockerfile
|
||||
push: true
|
||||
platforms: linux/amd64,linux/arm64
|
||||
tags: nousresearch/hermes-agent:latest
|
||||
tags: |
|
||||
nousresearch/hermes-agent:latest
|
||||
nousresearch/hermes-agent:${{ github.sha }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
- name: Push multi-arch image (release)
|
||||
- name: Push image (release)
|
||||
if: github.event_name == 'release'
|
||||
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
file: Dockerfile
|
||||
push: true
|
||||
platforms: linux/amd64,linux/arm64
|
||||
tags: nousresearch/hermes-agent:${{ github.event.release.tag_name }}
|
||||
tags: |
|
||||
nousresearch/hermes-agent:latest
|
||||
nousresearch/hermes-agent:${{ github.event.release.tag_name }}
|
||||
nousresearch/hermes-agent:${{ github.sha }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
13
.github/workflows/docs-site-checks.yml
vendored
13
.github/workflows/docs-site-checks.yml
vendored
@@ -7,16 +7,13 @@ on:
|
||||
- '.github/workflows/docs-site-checks.yml'
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
docs-site-checks:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
cache: npm
|
||||
@@ -26,12 +23,12 @@ jobs:
|
||||
run: npm ci
|
||||
working-directory: website
|
||||
|
||||
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
|
||||
- uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Install ascii-guard
|
||||
run: python -m pip install ascii-guard==2.3.0 pyyaml==6.0.3
|
||||
- name: Install Python dependencies
|
||||
run: python -m pip install ascii-guard pyyaml
|
||||
|
||||
- name: Extract skill metadata for dashboard
|
||||
run: python3 website/scripts/extract-skills.py
|
||||
|
||||
9
.github/workflows/nix.yml
vendored
9
.github/workflows/nix.yml
vendored
@@ -14,9 +14,6 @@ on:
|
||||
- 'run_agent.py'
|
||||
- 'acp_adapter/**'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
group: nix-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
@@ -29,9 +26,9 @@ jobs:
|
||||
runs-on: ${{ matrix.os }}
|
||||
timeout-minutes: 30
|
||||
steps:
|
||||
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||
- uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@565684385bcd71bad329742eefe8d12f2e765b39 # v13
|
||||
- uses: actions/checkout@v4
|
||||
- uses: DeterminateSystems/nix-installer-action@main
|
||||
- uses: DeterminateSystems/magic-nix-cache-action@main
|
||||
- name: Check flake
|
||||
if: runner.os == 'Linux'
|
||||
run: nix flake check --print-build-logs
|
||||
|
||||
101
.github/workflows/skills-index.yml
vendored
101
.github/workflows/skills-index.yml
vendored
@@ -1,101 +0,0 @@
|
||||
name: Build Skills Index
|
||||
|
||||
on:
|
||||
schedule:
|
||||
# Run twice daily: 6 AM and 6 PM UTC
|
||||
- cron: '0 6,18 * * *'
|
||||
workflow_dispatch: # Manual trigger
|
||||
push:
|
||||
branches: [main]
|
||||
paths:
|
||||
- 'scripts/build_skills_index.py'
|
||||
- '.github/workflows/skills-index.yml'
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
jobs:
|
||||
build-index:
|
||||
# Only run on the upstream repository, not on forks
|
||||
if: github.repository == 'NousResearch/hermes-agent'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||
|
||||
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Install dependencies
|
||||
run: pip install httpx==0.28.1 pyyaml==6.0.2
|
||||
|
||||
- name: Build skills index
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: python scripts/build_skills_index.py
|
||||
|
||||
- name: Upload index artifact
|
||||
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
|
||||
with:
|
||||
name: skills-index
|
||||
path: website/static/api/skills-index.json
|
||||
retention-days: 7
|
||||
|
||||
deploy-with-index:
|
||||
needs: build-index
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
pages: write
|
||||
id-token: write
|
||||
environment:
|
||||
name: github-pages
|
||||
url: ${{ steps.deploy.outputs.page_url }}
|
||||
# Only deploy on schedule or manual trigger (not on every push to the script)
|
||||
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
|
||||
steps:
|
||||
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||
|
||||
- uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
|
||||
with:
|
||||
name: skills-index
|
||||
path: website/static/api/
|
||||
|
||||
- uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
|
||||
with:
|
||||
node-version: 20
|
||||
cache: npm
|
||||
cache-dependency-path: website/package-lock.json
|
||||
|
||||
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Install PyYAML for skill extraction
|
||||
run: pip install pyyaml==6.0.2
|
||||
|
||||
- name: Extract skill metadata for dashboard
|
||||
run: python3 website/scripts/extract-skills.py
|
||||
|
||||
- name: Install dependencies
|
||||
run: npm ci
|
||||
working-directory: website
|
||||
|
||||
- name: Build Docusaurus
|
||||
run: npm run build
|
||||
working-directory: website
|
||||
|
||||
- name: Stage deployment
|
||||
run: |
|
||||
mkdir -p _site/docs
|
||||
cp -r landingpage/* _site/
|
||||
cp -r website/build/* _site/docs/
|
||||
echo "hermes-agent.nousresearch.com" > _site/CNAME
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # v3
|
||||
with:
|
||||
path: _site
|
||||
|
||||
- name: Deploy to GitHub Pages
|
||||
id: deploy
|
||||
uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4
|
||||
60
.github/workflows/supply-chain-audit.yml
vendored
60
.github/workflows/supply-chain-audit.yml
vendored
@@ -14,7 +14,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
@@ -149,62 +149,6 @@ jobs:
|
||||
"
|
||||
fi
|
||||
|
||||
# --- CI/CD workflow files modified ---
|
||||
WORKFLOW_HITS=$(git diff --name-only "$BASE".."$HEAD" | grep -E '\.github/workflows/.*\.ya?ml$' || true)
|
||||
if [ -n "$WORKFLOW_HITS" ]; then
|
||||
FINDINGS="${FINDINGS}
|
||||
### ⚠️ WARNING: CI/CD workflow files modified
|
||||
Changes to workflow files can alter build pipelines, inject steps, or modify permissions. Verify no unauthorized actions or secrets access were added.
|
||||
|
||||
**Files:**
|
||||
\`\`\`
|
||||
${WORKFLOW_HITS}
|
||||
\`\`\`
|
||||
"
|
||||
fi
|
||||
|
||||
# --- Dockerfile / container build files modified ---
|
||||
DOCKER_HITS=$(git diff --name-only "$BASE".."$HEAD" | grep -iE '(Dockerfile|\.dockerignore|docker-compose)' || true)
|
||||
if [ -n "$DOCKER_HITS" ]; then
|
||||
FINDINGS="${FINDINGS}
|
||||
### ⚠️ WARNING: Container build files modified
|
||||
Changes to Dockerfiles or compose files can alter base images, add build steps, or expose ports. Verify base image pins and build commands.
|
||||
|
||||
**Files:**
|
||||
\`\`\`
|
||||
${DOCKER_HITS}
|
||||
\`\`\`
|
||||
"
|
||||
fi
|
||||
|
||||
# --- Dependency manifest files modified ---
|
||||
DEP_HITS=$(git diff --name-only "$BASE".."$HEAD" | grep -E '(pyproject\.toml|requirements.*\.txt|package\.json|Gemfile|go\.mod|Cargo\.toml)$' || true)
|
||||
if [ -n "$DEP_HITS" ]; then
|
||||
FINDINGS="${FINDINGS}
|
||||
### ⚠️ WARNING: Dependency manifest files modified
|
||||
Changes to dependency files can introduce new packages or change version pins. Verify all dependency changes are intentional and from trusted sources.
|
||||
|
||||
**Files:**
|
||||
\`\`\`
|
||||
${DEP_HITS}
|
||||
\`\`\`
|
||||
"
|
||||
fi
|
||||
|
||||
# --- GitHub Actions version unpinning (mutable tags instead of SHAs) ---
|
||||
ACTIONS_UNPIN=$(echo "$DIFF" | grep -n '^\+' | grep 'uses:' | grep -v '#' | grep -E '@v[0-9]' | head -10 || true)
|
||||
if [ -n "$ACTIONS_UNPIN" ]; then
|
||||
FINDINGS="${FINDINGS}
|
||||
### ⚠️ WARNING: GitHub Actions with mutable version tags
|
||||
Actions should be pinned to full commit SHAs (not \`@v4\`, \`@v5\`). Mutable tags can be retargeted silently if a maintainer account is compromised.
|
||||
|
||||
**Matches:**
|
||||
\`\`\`
|
||||
${ACTIONS_UNPIN}
|
||||
\`\`\`
|
||||
"
|
||||
fi
|
||||
|
||||
# --- Output results ---
|
||||
if [ -n "$FINDINGS" ]; then
|
||||
echo "found=true" >> "$GITHUB_OUTPUT"
|
||||
@@ -239,7 +183,7 @@ jobs:
|
||||
---
|
||||
*Automated scan triggered by [supply-chain-audit](/.github/workflows/supply-chain-audit.yml). If this is a false positive, a maintainer can approve after manual review.*"
|
||||
|
||||
gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY" || echo "::warning::Could not post PR comment (expected for fork PRs — GITHUB_TOKEN is read-only)"
|
||||
gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY"
|
||||
|
||||
- name: Fail on critical findings
|
||||
if: steps.scan.outputs.critical == 'true'
|
||||
|
||||
15
.github/workflows/tests.yml
vendored
15
.github/workflows/tests.yml
vendored
@@ -6,9 +6,6 @@ on:
|
||||
pull_request:
|
||||
branches: [main]
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
|
||||
# Cancel in-progress runs for the same PR/branch
|
||||
concurrency:
|
||||
group: tests-${{ github.ref }}
|
||||
@@ -20,17 +17,13 @@ jobs:
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install system dependencies
|
||||
run: sudo apt-get update && sudo apt-get install -y ripgrep
|
||||
|
||||
- name: Check for hardcoded paths
|
||||
run: python3 scripts/lint_hardcoded_paths.py || true
|
||||
continue-on-error: true
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
|
||||
uses: astral-sh/setup-uv@v5
|
||||
|
||||
- name: Set up Python 3.11
|
||||
run: uv python install 3.11
|
||||
@@ -56,10 +49,10 @@ jobs:
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install uv
|
||||
uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
|
||||
uses: astral-sh/setup-uv@v5
|
||||
|
||||
- name: Set up Python 3.11
|
||||
run: uv python install 3.11
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -51,9 +51,6 @@ ignored/
|
||||
.worktrees/
|
||||
environments/benchmarks/evals/
|
||||
|
||||
# Web UI build output
|
||||
hermes_cli/web_dist/
|
||||
|
||||
# Release script temp files
|
||||
.release_notes.md
|
||||
mini-swe-agent/
|
||||
@@ -61,4 +58,3 @@ mini-swe-agent/
|
||||
# Nix
|
||||
.direnv/
|
||||
result
|
||||
website/static/api/skills-index.json
|
||||
|
||||
107
.mailmap
107
.mailmap
@@ -1,107 +0,0 @@
|
||||
# .mailmap — canonical author mapping for git shortlog / git log / GitHub
|
||||
# Format: Canonical Name <canonical@email> <commit@email>
|
||||
# See: https://git-scm.com/docs/gitmailmap
|
||||
#
|
||||
# This maps commit emails to GitHub noreply addresses so that:
|
||||
# 1. `git shortlog -sn` shows deduplicated contributor counts
|
||||
# 2. GitHub's contributor graph can attribute commits correctly
|
||||
# 3. Contributors with personal/work emails get proper credit
|
||||
#
|
||||
# When adding entries: use the contributor's GitHub noreply email as canonical
|
||||
# so GitHub can link commits to their profile.
|
||||
|
||||
# === Teknium (multiple emails) ===
|
||||
Teknium <127238744+teknium1@users.noreply.github.com> <teknium1@gmail.com>
|
||||
Teknium <127238744+teknium1@users.noreply.github.com> <teknium@nousresearch.com>
|
||||
|
||||
# === Contributors — personal/work emails mapped to GitHub noreply ===
|
||||
# Format: Canonical Name <GH-noreply> <commit-email>
|
||||
|
||||
# Verified via GH API email search
|
||||
luyao618 <364939526@qq.com> <364939526@qq.com>
|
||||
ethernet8023 <arilotter@gmail.com> <arilotter@gmail.com>
|
||||
nicoloboschi <boschi1997@gmail.com> <boschi1997@gmail.com>
|
||||
cherifya <chef.ya@gmail.com> <chef.ya@gmail.com>
|
||||
BongSuCHOI <chlqhdtn98@gmail.com> <chlqhdtn98@gmail.com>
|
||||
dsocolobsky <dsocolobsky@gmail.com> <dsocolobsky@gmail.com>
|
||||
pefontana <fontana.pedro93@gmail.com> <fontana.pedro93@gmail.com>
|
||||
Helmi <frank@helmschrott.de> <frank@helmschrott.de>
|
||||
hata1234 <hata1234@gmail.com> <hata1234@gmail.com>
|
||||
|
||||
# Verified via PR investigation / salvage PR bodies
|
||||
DeployFaith <agents@kylefrench.dev> <agents@kylefrench.dev>
|
||||
flobo3 <floptopbot33@gmail.com> <floptopbot33@gmail.com>
|
||||
gaixianggeng <gaixg94@gmail.com> <gaixg94@gmail.com>
|
||||
KUSH42 <xush@xush.org> <xush@xush.org>
|
||||
konsisumer <der@konsi.org> <der@konsi.org>
|
||||
WorldInnovationsDepartment <vorvul.danylo@gmail.com> <vorvul.danylo@gmail.com>
|
||||
m0n5t3r <iacobs@m0n5t3r.info> <iacobs@m0n5t3r.info>
|
||||
sprmn24 <oncuevtv@gmail.com> <oncuevtv@gmail.com>
|
||||
fancydirty <fancydirty@gmail.com> <fancydirty@gmail.com>
|
||||
fxfitz <francis.x.fitzpatrick@gmail.com> <francis.x.fitzpatrick@gmail.com>
|
||||
limars874 <limars874@gmail.com> <limars874@gmail.com>
|
||||
AaronWong1999 <aaronwong1999@icloud.com> <aaronwong1999@icloud.com>
|
||||
dippwho <dipp.who@gmail.com> <dipp.who@gmail.com>
|
||||
duerzy <duerzy@gmail.com> <duerzy@gmail.com>
|
||||
geoffwellman <geoff.wellman@gmail.com> <geoff.wellman@gmail.com>
|
||||
hcshen0111 <shenhaocheng19990111@gmail.com> <shenhaocheng19990111@gmail.com>
|
||||
jamesarch <han.shan@live.cn> <han.shan@live.cn>
|
||||
stephenschoettler <stephenschoettler@gmail.com> <stephenschoettler@gmail.com>
|
||||
Tranquil-Flow <tranquil_flow@protonmail.com> <tranquil_flow@protonmail.com>
|
||||
Dusk1e <yusufalweshdemir@gmail.com> <yusufalweshdemir@gmail.com>
|
||||
Awsh1 <ysfalweshcan@gmail.com> <ysfalweshcan@gmail.com>
|
||||
WAXLYY <ysfwaxlycan@gmail.com> <ysfwaxlycan@gmail.com>
|
||||
donrhmexe <don.rhm@gmail.com> <don.rhm@gmail.com>
|
||||
hqhq1025 <1506751656@qq.com> <1506751656@qq.com>
|
||||
BlackishGreen33 <s5460703@gmail.com> <s5460703@gmail.com>
|
||||
tomqiaozc <zqiao@microsoft.com> <zqiao@microsoft.com>
|
||||
MagicRay1217 <mingjwan@microsoft.com> <mingjwan@microsoft.com>
|
||||
aaronagent <1115117931@qq.com> <1115117931@qq.com>
|
||||
YoungYang963 <young@YoungdeMacBook-Pro.local> <young@YoungdeMacBook-Pro.local>
|
||||
LongOddCode <haolong@microsoft.com> <haolong@microsoft.com>
|
||||
Cafexss <coffeemjj@gmail.com> <coffeemjj@gmail.com>
|
||||
Cygra <sjtuwbh@gmail.com> <sjtuwbh@gmail.com>
|
||||
DomGrieco <dgrieco@redhat.com> <dgrieco@redhat.com>
|
||||
|
||||
# Duplicate email mapping (same person, multiple emails)
|
||||
Sertug17 <104278804+Sertug17@users.noreply.github.com> <srhtsrht17@gmail.com>
|
||||
yyovil <birdiegyal@gmail.com> <tanishq231003@gmail.com>
|
||||
DomGrieco <dgrieco@redhat.com> <dgrieco@redhat.com>
|
||||
dsocolobsky <dsocolobsky@gmail.com> <dylan.socolobsky@lambdaclass.com>
|
||||
olafthiele <programming@olafthiele.com> <olafthiele@gmail.com>
|
||||
|
||||
# Verified via git display name matching GH contributor username
|
||||
cokemine <aptx4561@gmail.com> <aptx4561@gmail.com>
|
||||
dalianmao000 <dalianmao0107@gmail.com> <dalianmao0107@gmail.com>
|
||||
emozilla <emozilla@nousresearch.com> <emozilla@nousresearch.com>
|
||||
jjovalle99 <juan.ovalle@mistral.ai> <juan.ovalle@mistral.ai>
|
||||
kagura-agent <kagura.chen28@gmail.com> <kagura.chen28@gmail.com>
|
||||
spniyant <niyant@spicefi.xyz> <niyant@spicefi.xyz>
|
||||
olafthiele <programming@olafthiele.com> <programming@olafthiele.com>
|
||||
r266-tech <r2668940489@gmail.com> <r2668940489@gmail.com>
|
||||
xingkongliang <tianliangjay@gmail.com> <tianliangjay@gmail.com>
|
||||
win4r <win4r@outlook.com> <win4r@outlook.com>
|
||||
zhouboli <zhouboli@gmail.com> <zhouboli@gmail.com>
|
||||
yongtenglei <yongtenglei@gmail.com> <yongtenglei@gmail.com>
|
||||
|
||||
# Nous Research team
|
||||
benbarclay <ben@nousresearch.com> <ben@nousresearch.com>
|
||||
jquesnelle <jonny@nousresearch.com> <jonny@nousresearch.com>
|
||||
|
||||
# GH contributor list verified
|
||||
spideystreet <dhicham.pro@gmail.com> <dhicham.pro@gmail.com>
|
||||
dorukardahan <dorukardahan@hotmail.com> <dorukardahan@hotmail.com>
|
||||
MustafaKara7 <karamusti912@gmail.com> <karamusti912@gmail.com>
|
||||
Hmbown <hmbown@gmail.com> <hmbown@gmail.com>
|
||||
kamil-gwozdz <kamil@gwozdz.me> <kamil@gwozdz.me>
|
||||
kira-ariaki <kira@ariaki.me> <kira@ariaki.me>
|
||||
knopki <knopki@duck.com> <knopki@duck.com>
|
||||
Unayung <unayung@gmail.com> <unayung@gmail.com>
|
||||
SeeYangZhi <yangzhi.see@gmail.com> <yangzhi.see@gmail.com>
|
||||
Julientalbot <julien.talbot@ergonomia.re> <julien.talbot@ergonomia.re>
|
||||
lesterli <lisicheng168@gmail.com> <lisicheng168@gmail.com>
|
||||
JiayuuWang <jiayuw794@gmail.com> <jiayuw794@gmail.com>
|
||||
tesseracttars-creator <tesseracttars@gmail.com> <tesseracttars@gmail.com>
|
||||
xinbenlv <zzn+pa@zzn.im> <zzn+pa@zzn.im>
|
||||
SaulJWu <saul.jj.wu@gmail.com> <saul.jj.wu@gmail.com>
|
||||
angelos <angelos@oikos.lan.home.malaiwah.com> <angelos@oikos.lan.home.malaiwah.com>
|
||||
15
AGENTS.md
15
AGENTS.md
@@ -13,7 +13,7 @@ source venv/bin/activate # ALWAYS activate before running Python
|
||||
```
|
||||
hermes-agent/
|
||||
├── run_agent.py # AIAgent class — core conversation loop
|
||||
├── model_tools.py # Tool orchestration, discover_builtin_tools(), handle_function_call()
|
||||
├── model_tools.py # Tool orchestration, _discover_tools(), handle_function_call()
|
||||
├── toolsets.py # Toolset definitions, _HERMES_CORE_TOOLS list
|
||||
├── cli.py # HermesCLI class — interactive CLI orchestrator
|
||||
├── hermes_state.py # SessionDB — SQLite session store (FTS5 search)
|
||||
@@ -55,7 +55,7 @@ hermes-agent/
|
||||
├── gateway/ # Messaging platform gateway
|
||||
│ ├── run.py # Main loop, slash commands, message dispatch
|
||||
│ ├── session.py # SessionStore — conversation persistence
|
||||
│ └── platforms/ # Adapters: telegram, discord, slack, whatsapp, homeassistant, signal, qqbot
|
||||
│ └── platforms/ # Adapters: telegram, discord, slack, whatsapp, homeassistant, signal
|
||||
├── acp_adapter/ # ACP server (VS Code / Zed / JetBrains integration)
|
||||
├── cron/ # Scheduler (jobs.py, scheduler.py)
|
||||
├── environments/ # RL training environments (Atropos)
|
||||
@@ -181,7 +181,7 @@ if canonical == "mycommand":
|
||||
|
||||
## Adding New Tools
|
||||
|
||||
Requires changes in **2 files**:
|
||||
Requires changes in **3 files**:
|
||||
|
||||
**1. Create `tools/your_tool.py`:**
|
||||
```python
|
||||
@@ -204,9 +204,9 @@ registry.register(
|
||||
)
|
||||
```
|
||||
|
||||
**2. Add to `toolsets.py`** — either `_HERMES_CORE_TOOLS` (all platforms) or a new toolset.
|
||||
**2. Add import** in `model_tools.py` `_discover_tools()` list.
|
||||
|
||||
Auto-discovery: any `tools/*.py` file with a top-level `registry.register()` call is imported automatically — no manual import list to maintain.
|
||||
**3. Add to `toolsets.py`** — either `_HERMES_CORE_TOOLS` (all platforms) or a new toolset.
|
||||
|
||||
The registry handles schema collection, dispatch, availability checking, and error wrapping. All handlers MUST return a JSON string.
|
||||
|
||||
@@ -351,9 +351,8 @@ Cache-breaking forces dramatically higher costs. The ONLY time we alter context
|
||||
|
||||
### Background Process Notifications (Gateway)
|
||||
|
||||
When `terminal(background=true, notify_on_complete=true)` is used, the gateway runs a watcher that
|
||||
detects process completion and triggers a new agent turn. Control verbosity of background process
|
||||
messages with `display.background_process_notifications`
|
||||
When `terminal(background=true, check_interval=...)` is used, the gateway runs a watcher that
|
||||
pushes status updates to the user's chat. Control verbosity with `display.background_process_notifications`
|
||||
in config.yaml (or `HERMES_BACKGROUND_NOTIFICATIONS` env var):
|
||||
|
||||
- `all` — running-output updates + final message (default)
|
||||
|
||||
31
Dockerfile
31
Dockerfile
@@ -1,44 +1,23 @@
|
||||
FROM ghcr.io/astral-sh/uv:0.11.6-python3.13-trixie@sha256:b3c543b6c4f23a5f2df22866bd7857e5d304b67a564f4feab6ac22044dde719b AS uv_source
|
||||
FROM tianon/gosu:1.19-trixie@sha256:3b176695959c71e123eb390d427efc665eeb561b1540e82679c15e992006b8b9 AS gosu_source
|
||||
FROM debian:13.4
|
||||
|
||||
# Disable Python stdout buffering to ensure logs are printed immediately
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
|
||||
# Store Playwright browsers outside the volume mount so the build-time
|
||||
# install survives the /opt/data volume overlay at runtime.
|
||||
ENV PLAYWRIGHT_BROWSERS_PATH=/opt/hermes/.playwright
|
||||
|
||||
# Install system dependencies in one layer, clear APT cache
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
build-essential nodejs npm python3 ripgrep ffmpeg gcc python3-dev libffi-dev procps git && \
|
||||
build-essential nodejs npm python3 python3-pip ripgrep ffmpeg gcc python3-dev libffi-dev && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Non-root user for runtime; UID can be overridden via HERMES_UID at runtime
|
||||
RUN useradd -u 10000 -m -d /opt/data hermes
|
||||
|
||||
COPY --chmod=0755 --from=gosu_source /gosu /usr/local/bin/
|
||||
COPY --chmod=0755 --from=uv_source /usr/local/bin/uv /usr/local/bin/uvx /usr/local/bin/
|
||||
|
||||
COPY . /opt/hermes
|
||||
WORKDIR /opt/hermes
|
||||
|
||||
# Install Node dependencies and Playwright as root (--with-deps needs apt)
|
||||
RUN npm install --prefer-offline --no-audit && \
|
||||
# Install Python and Node dependencies in one layer, no cache
|
||||
RUN pip install --no-cache-dir -e ".[all]" --break-system-packages && \
|
||||
npm install --prefer-offline --no-audit && \
|
||||
npx playwright install --with-deps chromium --only-shell && \
|
||||
cd /opt/hermes/scripts/whatsapp-bridge && \
|
||||
npm install --prefer-offline --no-audit && \
|
||||
npm cache clean --force
|
||||
|
||||
# Hand ownership to hermes user, then install Python deps in a virtualenv
|
||||
RUN chown -R hermes:hermes /opt/hermes
|
||||
USER hermes
|
||||
|
||||
RUN uv venv && \
|
||||
uv pip install --no-cache-dir -e ".[all]"
|
||||
|
||||
USER root
|
||||
WORKDIR /opt/hermes
|
||||
RUN chmod +x /opt/hermes/docker/entrypoint.sh
|
||||
|
||||
ENV HERMES_HOME=/opt/data
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
|
||||
**The self-improving AI agent built by [Nous Research](https://nousresearch.com).** It's the only agent with a built-in learning loop — it creates skills from experience, improves them during use, nudges itself to persist knowledge, searches its own past conversations, and builds a deepening model of who you are across sessions. Run it on a $5 VPS, a GPU cluster, or serverless infrastructure that costs nearly nothing when idle. It's not tied to your laptop — talk to it from Telegram while it works on a cloud VM.
|
||||
|
||||
Use any model you want — [Nous Portal](https://portal.nousresearch.com), [OpenRouter](https://openrouter.ai) (200+ models), [Xiaomi MiMo](https://platform.xiaomimimo.com), [z.ai/GLM](https://z.ai), [Kimi/Moonshot](https://platform.moonshot.ai), [MiniMax](https://www.minimax.io), [Hugging Face](https://huggingface.co), OpenAI, or your own endpoint. Switch with `hermes model` — no code changes, no lock-in.
|
||||
Use any model you want — [Nous Portal](https://portal.nousresearch.com), [OpenRouter](https://openrouter.ai) (200+ models), [z.ai/GLM](https://z.ai), [Kimi/Moonshot](https://platform.moonshot.ai), [MiniMax](https://www.minimax.io), OpenAI, or your own endpoint. Switch with `hermes model` — no code changes, no lock-in.
|
||||
|
||||
<table>
|
||||
<tr><td><b>A real terminal interface</b></td><td>Full TUI with multiline editing, slash-command autocomplete, conversation history, interrupt-and-redirect, and streaming tool output.</td></tr>
|
||||
@@ -33,10 +33,8 @@ Use any model you want — [Nous Portal](https://portal.nousresearch.com), [Open
|
||||
curl -fsSL https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/install.sh | bash
|
||||
```
|
||||
|
||||
Works on Linux, macOS, WSL2, and Android via Termux. The installer handles the platform-specific setup for you.
|
||||
Works on Linux, macOS, and WSL2. The installer handles everything — Python, Node.js, dependencies, and the `hermes` command. No prerequisites except git.
|
||||
|
||||
> **Android / Termux:** The tested manual path is documented in the [Termux guide](https://hermes-agent.nousresearch.com/docs/getting-started/termux). On Termux, Hermes installs a curated `.[termux]` extra because the full `.[all]` extra currently pulls Android-incompatible voice dependencies.
|
||||
>
|
||||
> **Windows:** Native Windows is not supported. Please install [WSL2](https://learn.microsoft.com/en-us/windows/wsl/install) and run the command above.
|
||||
|
||||
After installation:
|
||||
@@ -167,7 +165,6 @@ python -m pytest tests/ -q
|
||||
- 📚 [Skills Hub](https://agentskills.io)
|
||||
- 🐛 [Issues](https://github.com/NousResearch/hermes-agent/issues)
|
||||
- 💡 [Discussions](https://github.com/NousResearch/hermes-agent/discussions)
|
||||
- 🔌 [HermesClaw](https://github.com/AaronWong1999/hermesclaw) — Community WeChat bridge: Run Hermes Agent and OpenClaw on the same WeChat account.
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -1,329 +0,0 @@
|
||||
# Hermes Agent v0.9.0 (v2026.4.13)
|
||||
|
||||
**Release Date:** April 13, 2026
|
||||
**Since v0.8.0:** 487 commits · 269 merged PRs · 167 resolved issues · 493 files changed · 63,281 insertions · 24 contributors
|
||||
|
||||
> The everywhere release — Hermes goes mobile with Termux/Android, adds iMessage and WeChat, ships Fast Mode for OpenAI and Anthropic, introduces background process monitoring, launches a local web dashboard for managing your agent, and delivers the deepest security hardening pass yet across 16 supported platforms.
|
||||
|
||||
---
|
||||
|
||||
## ✨ Highlights
|
||||
|
||||
- **Local Web Dashboard** — A new browser-based dashboard for managing your Hermes Agent locally. Configure settings, monitor sessions, browse skills, and manage your gateway — all from a clean web interface without touching config files or the terminal. The easiest way to get started with Hermes.
|
||||
|
||||
- **Fast Mode (`/fast`)** — Priority processing for OpenAI and Anthropic models. Toggle `/fast` to route through priority queues for significantly lower latency on supported models (GPT-5.4, Codex, Claude). It expands across all OpenAI Priority Processing models and Anthropic's fast tier. ([#6875](https://github.com/NousResearch/hermes-agent/pull/6875), [#6960](https://github.com/NousResearch/hermes-agent/pull/6960), [#7037](https://github.com/NousResearch/hermes-agent/pull/7037))
|
||||
|
||||
- **iMessage via BlueBubbles** — Full iMessage integration through BlueBubbles, bringing Hermes to Apple's messaging ecosystem. Auto-webhook registration, setup wizard integration, and crash resilience. ([#6437](https://github.com/NousResearch/hermes-agent/pull/6437), [#6460](https://github.com/NousResearch/hermes-agent/pull/6460), [#6494](https://github.com/NousResearch/hermes-agent/pull/6494))
|
||||
|
||||
- **WeChat (Weixin) & WeCom Callback Mode** — Native WeChat support via iLink Bot API and a new WeCom callback-mode adapter for self-built enterprise apps. Streaming cursor, media uploads, markdown link handling, and atomic state persistence. Hermes now covers the Chinese messaging ecosystem end-to-end. ([#7166](https://github.com/NousResearch/hermes-agent/pull/7166), [#7943](https://github.com/NousResearch/hermes-agent/pull/7943))
|
||||
|
||||
- **Termux / Android Support** — Run Hermes natively on Android via Termux. Adapted install paths, TUI optimizations for mobile screens, voice backend support, and the `/image` command work on-device. ([#6834](https://github.com/NousResearch/hermes-agent/pull/6834))
|
||||
|
||||
- **Background Process Monitoring (`watch_patterns`)** — Set patterns to watch for in background process output and get notified in real-time when they match. Monitor for errors, wait for specific events ("listening on port"), or watch build logs — all without polling. ([#7635](https://github.com/NousResearch/hermes-agent/pull/7635))
|
||||
|
||||
- **Native xAI & Xiaomi MiMo Providers** — First-class provider support for xAI (Grok) and Xiaomi MiMo, with direct API access, model catalogs, and setup wizard integration. Plus Qwen OAuth with portal request support. ([#7372](https://github.com/NousResearch/hermes-agent/pull/7372), [#7855](https://github.com/NousResearch/hermes-agent/pull/7855))
|
||||
|
||||
- **Pluggable Context Engine** — Context management is now a pluggable slot via `hermes plugins`. Swap in custom context engines that control what the agent sees each turn — filtering, summarization, or domain-specific context injection. ([#7464](https://github.com/NousResearch/hermes-agent/pull/7464))
|
||||
|
||||
- **Unified Proxy Support** — SOCKS proxy, `DISCORD_PROXY`, and system proxy auto-detection across all gateway platforms. Hermes behind corporate firewalls just works. ([#6814](https://github.com/NousResearch/hermes-agent/pull/6814))
|
||||
|
||||
- **Comprehensive Security Hardening** — Path traversal protection in checkpoint manager, shell injection neutralization in sandbox writes, SSRF redirect guards in Slack image uploads, Twilio webhook signature validation (SMS RCE fix), API server auth enforcement, git argument injection prevention, and approval button authorization. ([#7933](https://github.com/NousResearch/hermes-agent/pull/7933), [#7944](https://github.com/NousResearch/hermes-agent/pull/7944), [#7940](https://github.com/NousResearch/hermes-agent/pull/7940), [#7151](https://github.com/NousResearch/hermes-agent/pull/7151), [#7156](https://github.com/NousResearch/hermes-agent/pull/7156))
|
||||
|
||||
- **`hermes backup` & `hermes import`** — Full backup and restore of your Hermes configuration, sessions, skills, and memory. Migrate between machines or create snapshots before major changes. ([#7997](https://github.com/NousResearch/hermes-agent/pull/7997))
|
||||
|
||||
- **16 Supported Platforms** — With BlueBubbles (iMessage) and WeChat joining Telegram, Discord, Slack, WhatsApp, Signal, Matrix, Email, SMS, DingTalk, Feishu, WeCom, Mattermost, Home Assistant, and Webhooks, Hermes now runs on 16 messaging platforms out of the box.
|
||||
|
||||
- **`/debug` & `hermes debug share`** — New debugging toolkit: `/debug` slash command across all platforms for quick diagnostics, plus `hermes debug share` to upload a full debug report to a pastebin for easy sharing when troubleshooting. ([#8681](https://github.com/NousResearch/hermes-agent/pull/8681))
|
||||
|
||||
---
|
||||
|
||||
## 🏗️ Core Agent & Architecture
|
||||
|
||||
### Provider & Model Support
|
||||
- **Native xAI (Grok) provider** with direct API access and model catalog ([#7372](https://github.com/NousResearch/hermes-agent/pull/7372))
|
||||
- **Xiaomi MiMo as first-class provider** — setup wizard, model catalog, empty response recovery ([#7855](https://github.com/NousResearch/hermes-agent/pull/7855))
|
||||
- **Qwen OAuth provider** with portal request support ([#6282](https://github.com/NousResearch/hermes-agent/pull/6282))
|
||||
- **Fast Mode** — `/fast` toggle for OpenAI Priority Processing + Anthropic fast tier ([#6875](https://github.com/NousResearch/hermes-agent/pull/6875), [#6960](https://github.com/NousResearch/hermes-agent/pull/6960), [#7037](https://github.com/NousResearch/hermes-agent/pull/7037))
|
||||
- **Structured API error classification** for smart failover decisions ([#6514](https://github.com/NousResearch/hermes-agent/pull/6514))
|
||||
- **Rate limit header capture** shown in `/usage` ([#6541](https://github.com/NousResearch/hermes-agent/pull/6541))
|
||||
- **API server model name** derived from profile name ([#6857](https://github.com/NousResearch/hermes-agent/pull/6857))
|
||||
- **Custom providers** now included in `/model` listings and resolution ([#7088](https://github.com/NousResearch/hermes-agent/pull/7088))
|
||||
- **Fallback provider activation** on repeated empty responses with user-visible status ([#7505](https://github.com/NousResearch/hermes-agent/pull/7505))
|
||||
- **OpenRouter variant tags** (`:free`, `:extended`, `:fast`) preserved during model switch ([#6383](https://github.com/NousResearch/hermes-agent/pull/6383))
|
||||
- **Credential exhaustion TTL** reduced from 24 hours to 1 hour ([#6504](https://github.com/NousResearch/hermes-agent/pull/6504))
|
||||
- **OAuth credential lifecycle** hardening — stale pool keys, auth.json sync, Codex CLI race fixes ([#6874](https://github.com/NousResearch/hermes-agent/pull/6874))
|
||||
- Empty response recovery for reasoning models (MiMo, Qwen, GLM) ([#8609](https://github.com/NousResearch/hermes-agent/pull/8609))
|
||||
- MiniMax context lengths, thinking guard, endpoint corrections ([#6082](https://github.com/NousResearch/hermes-agent/pull/6082), [#7126](https://github.com/NousResearch/hermes-agent/pull/7126))
|
||||
- Z.AI endpoint auto-detect via probe and cache ([#5763](https://github.com/NousResearch/hermes-agent/pull/5763))
|
||||
|
||||
### Agent Loop & Conversation
|
||||
- **Pluggable context engine slot** via `hermes plugins` ([#7464](https://github.com/NousResearch/hermes-agent/pull/7464))
|
||||
- **Background process monitoring** — `watch_patterns` for real-time output alerts ([#7635](https://github.com/NousResearch/hermes-agent/pull/7635))
|
||||
- **Improved context compression** — higher limits, tool tracking, degradation warnings, token-budget tail protection ([#6395](https://github.com/NousResearch/hermes-agent/pull/6395), [#6453](https://github.com/NousResearch/hermes-agent/pull/6453))
|
||||
- **`/compress <focus>`** — guided compression with a focus topic ([#8017](https://github.com/NousResearch/hermes-agent/pull/8017))
|
||||
- **Tiered context pressure warnings** with gateway dedup ([#6411](https://github.com/NousResearch/hermes-agent/pull/6411))
|
||||
- **Staged inactivity warning** before timeout escalation ([#6387](https://github.com/NousResearch/hermes-agent/pull/6387))
|
||||
- **Prevent agent from stopping mid-task** — compression floor, budget overhaul, activity tracking ([#7983](https://github.com/NousResearch/hermes-agent/pull/7983))
|
||||
- **Propagate child activity to parent** during `delegate_task` ([#7295](https://github.com/NousResearch/hermes-agent/pull/7295))
|
||||
- **Truncated streaming tool call detection** before execution ([#6847](https://github.com/NousResearch/hermes-agent/pull/6847))
|
||||
- Empty response retry (3 attempts with nudge) ([#6488](https://github.com/NousResearch/hermes-agent/pull/6488))
|
||||
- Adaptive streaming backoff + cursor strip to prevent message truncation ([#7683](https://github.com/NousResearch/hermes-agent/pull/7683))
|
||||
- Compression uses live session model instead of stale persisted config ([#8258](https://github.com/NousResearch/hermes-agent/pull/8258))
|
||||
- Strip `<thought>` tags from Gemma 4 responses ([#8562](https://github.com/NousResearch/hermes-agent/pull/8562))
|
||||
- Prevent `<think>` in prose from suppressing response output ([#6968](https://github.com/NousResearch/hermes-agent/pull/6968))
|
||||
- Turn-exit diagnostic logging to agent loop ([#6549](https://github.com/NousResearch/hermes-agent/pull/6549))
|
||||
- Scope tool interrupt signal per-thread to prevent cross-session leaks ([#7930](https://github.com/NousResearch/hermes-agent/pull/7930))
|
||||
|
||||
### Memory & Sessions
|
||||
- **Hindsight memory plugin** — feature parity, setup wizard, config improvements — @nicoloboschi ([#6428](https://github.com/NousResearch/hermes-agent/pull/6428))
|
||||
- **Honcho** — opt-in `initOnSessionStart` for tools mode — @Kathie-yu ([#6995](https://github.com/NousResearch/hermes-agent/pull/6995))
|
||||
- Orphan children instead of cascade-deleting in prune/delete ([#6513](https://github.com/NousResearch/hermes-agent/pull/6513))
|
||||
- Doctor command only checks the active memory provider ([#6285](https://github.com/NousResearch/hermes-agent/pull/6285))
|
||||
|
||||
---
|
||||
|
||||
## 📱 Messaging Platforms (Gateway)
|
||||
|
||||
### New Platforms
|
||||
- **BlueBubbles (iMessage)** — full adapter with auto-webhook registration, setup wizard, and crash resilience ([#6437](https://github.com/NousResearch/hermes-agent/pull/6437), [#6460](https://github.com/NousResearch/hermes-agent/pull/6460), [#6494](https://github.com/NousResearch/hermes-agent/pull/6494), [#7107](https://github.com/NousResearch/hermes-agent/pull/7107))
|
||||
- **Weixin (WeChat)** — native support via iLink Bot API with streaming, media uploads, markdown links ([#7166](https://github.com/NousResearch/hermes-agent/pull/7166), [#8665](https://github.com/NousResearch/hermes-agent/pull/8665))
|
||||
- **WeCom Callback Mode** — self-built enterprise app adapter with atomic state persistence ([#7943](https://github.com/NousResearch/hermes-agent/pull/7943), [#7928](https://github.com/NousResearch/hermes-agent/pull/7928))
|
||||
|
||||
### Discord
|
||||
- **Allowed channels whitelist** config — @jarvis-phw ([#7044](https://github.com/NousResearch/hermes-agent/pull/7044))
|
||||
- **Forum channel topic inheritance** in thread sessions — @hermes-agent-dhabibi ([#6377](https://github.com/NousResearch/hermes-agent/pull/6377))
|
||||
- **DISCORD_REPLY_TO_MODE** setting ([#6333](https://github.com/NousResearch/hermes-agent/pull/6333))
|
||||
- Accept `.log` attachments, raise document size limit — @kira-ariaki ([#6467](https://github.com/NousResearch/hermes-agent/pull/6467))
|
||||
- Decouple readiness from slash sync ([#8016](https://github.com/NousResearch/hermes-agent/pull/8016))
|
||||
|
||||
### Slack
|
||||
- **Consolidated Slack improvements** — 7 community PRs salvaged into one ([#6809](https://github.com/NousResearch/hermes-agent/pull/6809))
|
||||
- Handle assistant thread lifecycle events ([#6433](https://github.com/NousResearch/hermes-agent/pull/6433))
|
||||
|
||||
### Matrix
|
||||
- **Migrated from matrix-nio to mautrix-python** ([#7518](https://github.com/NousResearch/hermes-agent/pull/7518))
|
||||
- SQLite crypto store replacing pickle (fixes E2EE decryption) — @alt-glitch ([#7981](https://github.com/NousResearch/hermes-agent/pull/7981))
|
||||
- Cross-signing recovery key verification for E2EE migration ([#8282](https://github.com/NousResearch/hermes-agent/pull/8282))
|
||||
- DM mention threads + group chat events for Feishu ([#7423](https://github.com/NousResearch/hermes-agent/pull/7423))
|
||||
|
||||
### Gateway Core
|
||||
- **Unified proxy support** — SOCKS, DISCORD_PROXY, multi-platform with macOS auto-detection ([#6814](https://github.com/NousResearch/hermes-agent/pull/6814))
|
||||
- **Inbound text batching** for Discord, Matrix, WeCom + adaptive delay ([#6979](https://github.com/NousResearch/hermes-agent/pull/6979))
|
||||
- **Surface natural mid-turn assistant messages** in chat platforms ([#7978](https://github.com/NousResearch/hermes-agent/pull/7978))
|
||||
- **WSL-aware gateway** with smart systemd detection ([#7510](https://github.com/NousResearch/hermes-agent/pull/7510))
|
||||
- **All missing platforms added to setup wizard** ([#7949](https://github.com/NousResearch/hermes-agent/pull/7949))
|
||||
- **Per-platform `tool_progress` overrides** ([#6348](https://github.com/NousResearch/hermes-agent/pull/6348))
|
||||
- **Configurable 'still working' notification interval** ([#8572](https://github.com/NousResearch/hermes-agent/pull/8572))
|
||||
- `/model` switch persists across messages ([#7081](https://github.com/NousResearch/hermes-agent/pull/7081))
|
||||
- `/usage` shows rate limits, cost, and token details between turns ([#7038](https://github.com/NousResearch/hermes-agent/pull/7038))
|
||||
- Drain in-flight work before restart ([#7503](https://github.com/NousResearch/hermes-agent/pull/7503))
|
||||
- Don't evict cached agent on failed runs — prevents MCP restart loop ([#7539](https://github.com/NousResearch/hermes-agent/pull/7539))
|
||||
- Replace `os.environ` session state with `contextvars` ([#7454](https://github.com/NousResearch/hermes-agent/pull/7454))
|
||||
- Derive channel directory platforms from enum instead of hardcoded list ([#7450](https://github.com/NousResearch/hermes-agent/pull/7450))
|
||||
- Validate image downloads before caching (cross-platform) ([#7125](https://github.com/NousResearch/hermes-agent/pull/7125))
|
||||
- Cross-platform webhook delivery for all platforms ([#7095](https://github.com/NousResearch/hermes-agent/pull/7095))
|
||||
- Cron Discord thread_id delivery support ([#7106](https://github.com/NousResearch/hermes-agent/pull/7106))
|
||||
- Feishu QR-based bot onboarding ([#8570](https://github.com/NousResearch/hermes-agent/pull/8570))
|
||||
- Gateway status scoped to active profile ([#7951](https://github.com/NousResearch/hermes-agent/pull/7951))
|
||||
- Prevent background process notifications from triggering false pairing requests ([#6434](https://github.com/NousResearch/hermes-agent/pull/6434))
|
||||
|
||||
---
|
||||
|
||||
## 🖥️ CLI & User Experience
|
||||
|
||||
### Interactive CLI
|
||||
- **Termux / Android support** — adapted install paths, TUI, voice, `/image` ([#6834](https://github.com/NousResearch/hermes-agent/pull/6834))
|
||||
- **Native `/model` picker modal** for provider → model selection ([#8003](https://github.com/NousResearch/hermes-agent/pull/8003))
|
||||
- **Live per-tool elapsed timer** restored in TUI spinner ([#7359](https://github.com/NousResearch/hermes-agent/pull/7359))
|
||||
- **Stacked tool progress scrollback** in TUI ([#8201](https://github.com/NousResearch/hermes-agent/pull/8201))
|
||||
- **Random tips on new session start** (CLI + gateway, 279 tips) ([#8225](https://github.com/NousResearch/hermes-agent/pull/8225), [#8237](https://github.com/NousResearch/hermes-agent/pull/8237))
|
||||
- **`hermes dump`** — copy-pasteable setup summary for debugging ([#6550](https://github.com/NousResearch/hermes-agent/pull/6550))
|
||||
- **`hermes backup` / `hermes import`** — full config backup and restore ([#7997](https://github.com/NousResearch/hermes-agent/pull/7997))
|
||||
- **WSL environment hint** in system prompt ([#8285](https://github.com/NousResearch/hermes-agent/pull/8285))
|
||||
- **Profile creation UX** — seed SOUL.md + credential warning ([#8553](https://github.com/NousResearch/hermes-agent/pull/8553))
|
||||
- Shell-aware sudo detection, empty password support ([#6517](https://github.com/NousResearch/hermes-agent/pull/6517))
|
||||
- Flush stdin after curses/terminal menus to prevent escape sequence leakage ([#7167](https://github.com/NousResearch/hermes-agent/pull/7167))
|
||||
- Handle broken stdin in prompt_toolkit startup ([#8560](https://github.com/NousResearch/hermes-agent/pull/8560))
|
||||
|
||||
### Setup & Configuration
|
||||
- **Per-platform display verbosity** configuration ([#8006](https://github.com/NousResearch/hermes-agent/pull/8006))
|
||||
- **Component-separated logging** with session context and filtering ([#7991](https://github.com/NousResearch/hermes-agent/pull/7991))
|
||||
- **`network.force_ipv4`** config to fix IPv6 timeout issues ([#8196](https://github.com/NousResearch/hermes-agent/pull/8196))
|
||||
- **Standardize message whitespace and JSON formatting** ([#7988](https://github.com/NousResearch/hermes-agent/pull/7988))
|
||||
- **Rebrand OpenClaw → Hermes** during migration ([#8210](https://github.com/NousResearch/hermes-agent/pull/8210))
|
||||
- Config.yaml takes priority over env vars for auxiliary settings ([#7889](https://github.com/NousResearch/hermes-agent/pull/7889))
|
||||
- Harden setup provider flows + live OpenRouter catalog refresh ([#7078](https://github.com/NousResearch/hermes-agent/pull/7078))
|
||||
- Normalize reasoning effort ordering across all surfaces ([#6804](https://github.com/NousResearch/hermes-agent/pull/6804))
|
||||
- Remove dead `LLM_MODEL` env var + migration to clear stale entries ([#6543](https://github.com/NousResearch/hermes-agent/pull/6543))
|
||||
- Remove `/prompt` slash command — prefix expansion footgun ([#6752](https://github.com/NousResearch/hermes-agent/pull/6752))
|
||||
- `HERMES_HOME_MODE` env var to override permissions — @ygd58 ([#6993](https://github.com/NousResearch/hermes-agent/pull/6993))
|
||||
- Fall back to default model when model config is empty ([#8303](https://github.com/NousResearch/hermes-agent/pull/8303))
|
||||
- Warn when compression model context is too small ([#7894](https://github.com/NousResearch/hermes-agent/pull/7894))
|
||||
|
||||
---
|
||||
|
||||
## 🔧 Tool System
|
||||
|
||||
### Environments & Execution
|
||||
- **Unified spawn-per-call execution layer** for environments ([#6343](https://github.com/NousResearch/hermes-agent/pull/6343))
|
||||
- **Unified file sync** with mtime tracking, deletion, and transactional state ([#7087](https://github.com/NousResearch/hermes-agent/pull/7087))
|
||||
- **Persistent sandbox envs** survive between turns ([#6412](https://github.com/NousResearch/hermes-agent/pull/6412))
|
||||
- **Bulk file sync** via tar pipe for SSH/Modal backends — @alt-glitch ([#8014](https://github.com/NousResearch/hermes-agent/pull/8014))
|
||||
- **Daytona** — bulk upload, config bridge, silent disk cap ([#7538](https://github.com/NousResearch/hermes-agent/pull/7538))
|
||||
- Foreground timeout cap to prevent session deadlocks ([#7082](https://github.com/NousResearch/hermes-agent/pull/7082))
|
||||
- Guard invalid command values ([#6417](https://github.com/NousResearch/hermes-agent/pull/6417))
|
||||
|
||||
### MCP
|
||||
- **`hermes mcp add --env` and `--preset`** support ([#7970](https://github.com/NousResearch/hermes-agent/pull/7970))
|
||||
- Combine `content` and `structuredContent` when both present ([#7118](https://github.com/NousResearch/hermes-agent/pull/7118))
|
||||
- MCP tool name deconfliction fixes ([#7654](https://github.com/NousResearch/hermes-agent/pull/7654))
|
||||
|
||||
### Browser
|
||||
- Browser hardening — dead code removal, caching, scroll perf, security, thread safety ([#7354](https://github.com/NousResearch/hermes-agent/pull/7354))
|
||||
- `/browser connect` auto-launch uses dedicated Chrome profile dir ([#6821](https://github.com/NousResearch/hermes-agent/pull/6821))
|
||||
- Reap orphaned browser sessions on startup ([#7931](https://github.com/NousResearch/hermes-agent/pull/7931))
|
||||
|
||||
### Voice & Vision
|
||||
- **Voxtral TTS provider** (Mistral AI) ([#7653](https://github.com/NousResearch/hermes-agent/pull/7653))
|
||||
- **TTS speed support** for Edge TTS, OpenAI TTS, MiniMax ([#8666](https://github.com/NousResearch/hermes-agent/pull/8666))
|
||||
- **Vision auto-resize** for oversized images, raise limit to 20 MB, retry-on-failure ([#7883](https://github.com/NousResearch/hermes-agent/pull/7883), [#7902](https://github.com/NousResearch/hermes-agent/pull/7902))
|
||||
- STT provider-model mismatch fix (whisper-1 vs faster-whisper) ([#7113](https://github.com/NousResearch/hermes-agent/pull/7113))
|
||||
|
||||
### Other Tools
|
||||
- **`hermes dump`** command for setup summary ([#6550](https://github.com/NousResearch/hermes-agent/pull/6550))
|
||||
- TODO store enforces ID uniqueness during replace operations ([#7986](https://github.com/NousResearch/hermes-agent/pull/7986))
|
||||
- List all available toolsets in `delegate_task` schema description ([#8231](https://github.com/NousResearch/hermes-agent/pull/8231))
|
||||
- API server: tool progress as custom SSE event to prevent model corruption ([#7500](https://github.com/NousResearch/hermes-agent/pull/7500))
|
||||
- API server: share one Docker container across all conversations ([#7127](https://github.com/NousResearch/hermes-agent/pull/7127))
|
||||
|
||||
---
|
||||
|
||||
## 🧩 Skills Ecosystem
|
||||
|
||||
- **Centralized skills index + tree cache** — eliminates rate-limit failures on install ([#8575](https://github.com/NousResearch/hermes-agent/pull/8575))
|
||||
- **More aggressive skill loading instructions** in system prompt (v3) ([#8209](https://github.com/NousResearch/hermes-agent/pull/8209), [#8286](https://github.com/NousResearch/hermes-agent/pull/8286))
|
||||
- **Google Workspace skill** migrated to GWS CLI backend ([#6788](https://github.com/NousResearch/hermes-agent/pull/6788))
|
||||
- **Creative divergence strategies** skill — @SHL0MS ([#6882](https://github.com/NousResearch/hermes-agent/pull/6882))
|
||||
- **Creative ideation** — constraint-driven project generation — @SHL0MS ([#7555](https://github.com/NousResearch/hermes-agent/pull/7555))
|
||||
- Parallelize skills browse/search to prevent hanging ([#7301](https://github.com/NousResearch/hermes-agent/pull/7301))
|
||||
- Read name from SKILL.md frontmatter in skills_sync ([#7623](https://github.com/NousResearch/hermes-agent/pull/7623))
|
||||
|
||||
---
|
||||
|
||||
## 🔒 Security & Reliability
|
||||
|
||||
### Security Hardening
|
||||
- **Twilio webhook signature validation** — SMS RCE fix ([#7933](https://github.com/NousResearch/hermes-agent/pull/7933))
|
||||
- **Shell injection neutralization** in `_write_to_sandbox` via path quoting ([#7940](https://github.com/NousResearch/hermes-agent/pull/7940))
|
||||
- **Git argument injection** and path traversal prevention in checkpoint manager ([#7944](https://github.com/NousResearch/hermes-agent/pull/7944))
|
||||
- **SSRF redirect bypass** in Slack image uploads + base.py cache helpers ([#7151](https://github.com/NousResearch/hermes-agent/pull/7151))
|
||||
- **Path traversal, credential gate, DANGEROUS_PATTERNS gaps** ([#7156](https://github.com/NousResearch/hermes-agent/pull/7156))
|
||||
- **API bind guard** — enforce `API_SERVER_KEY` for non-loopback binding ([#7455](https://github.com/NousResearch/hermes-agent/pull/7455))
|
||||
- **Approval button authorization** — require auth for session continuation — @Cafexss ([#6930](https://github.com/NousResearch/hermes-agent/pull/6930))
|
||||
- Path boundary enforcement in skill manager operations ([#7156](https://github.com/NousResearch/hermes-agent/pull/7156))
|
||||
- DingTalk/API webhook URL origin validation, header injection rejection ([#7455](https://github.com/NousResearch/hermes-agent/pull/7455))
|
||||
|
||||
### Reliability
|
||||
- **Contextual error diagnostics** for invalid API responses ([#8565](https://github.com/NousResearch/hermes-agent/pull/8565))
|
||||
- **Prevent 400 format errors** from triggering compression loop on Codex ([#6751](https://github.com/NousResearch/hermes-agent/pull/6751))
|
||||
- **Don't halve context_length** on output-cap-too-large errors — @KUSH42 ([#6664](https://github.com/NousResearch/hermes-agent/pull/6664))
|
||||
- **Recover primary client** on OpenAI transport errors ([#7108](https://github.com/NousResearch/hermes-agent/pull/7108))
|
||||
- **Credential pool rotation** on billing-classified 400s ([#7112](https://github.com/NousResearch/hermes-agent/pull/7112))
|
||||
- **Auto-increase stream read timeout** for local LLM providers ([#6967](https://github.com/NousResearch/hermes-agent/pull/6967))
|
||||
- **Fall back to default certs** when CA bundle path doesn't exist ([#7352](https://github.com/NousResearch/hermes-agent/pull/7352))
|
||||
- **Disambiguate usage-limit patterns** in error classifier — @sprmn24 ([#6836](https://github.com/NousResearch/hermes-agent/pull/6836))
|
||||
- Harden cron script timeout and provider recovery ([#7079](https://github.com/NousResearch/hermes-agent/pull/7079))
|
||||
- Gateway interrupt detection resilient to monitor task failures ([#8208](https://github.com/NousResearch/hermes-agent/pull/8208))
|
||||
- Prevent unwanted session auto-reset after graceful gateway restarts ([#8299](https://github.com/NousResearch/hermes-agent/pull/8299))
|
||||
- Prevent duplicate update prompt spam in gateway watcher ([#8343](https://github.com/NousResearch/hermes-agent/pull/8343))
|
||||
- Deduplicate reasoning items in Responses API input ([#7946](https://github.com/NousResearch/hermes-agent/pull/7946))
|
||||
|
||||
### Infrastructure
|
||||
- **Multi-arch Docker image** — amd64 + arm64 ([#6124](https://github.com/NousResearch/hermes-agent/pull/6124))
|
||||
- **Docker runs as non-root user** with virtualenv — @benbarclay ([#8226](https://github.com/NousResearch/hermes-agent/pull/8226))
|
||||
- **Use `uv`** for Docker dependency resolution to fix resolution-too-deep ([#6965](https://github.com/NousResearch/hermes-agent/pull/6965))
|
||||
- **Container-aware Nix CLI** — auto-route into managed container — @alt-glitch ([#7543](https://github.com/NousResearch/hermes-agent/pull/7543))
|
||||
- **Nix shared-state permission model** for interactive CLI users — @alt-glitch ([#6796](https://github.com/NousResearch/hermes-agent/pull/6796))
|
||||
- **Per-profile subprocess HOME isolation** ([#7357](https://github.com/NousResearch/hermes-agent/pull/7357))
|
||||
- Profile paths fixed in Docker — profiles go to mounted volume ([#7170](https://github.com/NousResearch/hermes-agent/pull/7170))
|
||||
- Docker container gateway pathway hardened ([#8614](https://github.com/NousResearch/hermes-agent/pull/8614))
|
||||
- Enable unbuffered stdout for live Docker logs ([#6749](https://github.com/NousResearch/hermes-agent/pull/6749))
|
||||
- Install procps in Docker image — @HiddenPuppy ([#7032](https://github.com/NousResearch/hermes-agent/pull/7032))
|
||||
- Shallow git clone for faster installation — @sosyz ([#8396](https://github.com/NousResearch/hermes-agent/pull/8396))
|
||||
- `hermes update` always reset on stash conflict ([#7010](https://github.com/NousResearch/hermes-agent/pull/7010))
|
||||
- Write update exit code before gateway restart (cgroup kill race) ([#8288](https://github.com/NousResearch/hermes-agent/pull/8288))
|
||||
- Nix: `setupSecrets` optional, tirith runtime dep — @devorun, @ethernet8023 ([#6261](https://github.com/NousResearch/hermes-agent/pull/6261), [#6721](https://github.com/NousResearch/hermes-agent/pull/6721))
|
||||
- launchd stop uses `bootout` so `KeepAlive` doesn't respawn ([#7119](https://github.com/NousResearch/hermes-agent/pull/7119))
|
||||
|
||||
---
|
||||
|
||||
## 🐛 Notable Bug Fixes
|
||||
|
||||
- Fix: `/model` switch not persisting across gateway messages ([#7081](https://github.com/NousResearch/hermes-agent/pull/7081))
|
||||
- Fix: session-scoped gateway model overrides ignored — @Hygaard ([#7662](https://github.com/NousResearch/hermes-agent/pull/7662))
|
||||
- Fix: compaction model context length ignoring config — 3 related issues ([#8258](https://github.com/NousResearch/hermes-agent/pull/8258), [#8107](https://github.com/NousResearch/hermes-agent/pull/8107))
|
||||
- Fix: OpenCode.ai context window resolved to 128K instead of 1M ([#6472](https://github.com/NousResearch/hermes-agent/pull/6472))
|
||||
- Fix: Codex fallback auth-store lookup — @cherifya ([#6462](https://github.com/NousResearch/hermes-agent/pull/6462))
|
||||
- Fix: duplicate completion notifications when process killed ([#7124](https://github.com/NousResearch/hermes-agent/pull/7124))
|
||||
- Fix: agent daemon thread prevents orphan CLI processes on tab close ([#8557](https://github.com/NousResearch/hermes-agent/pull/8557))
|
||||
- Fix: stale image attachment on text paste and voice input ([#7077](https://github.com/NousResearch/hermes-agent/pull/7077))
|
||||
- Fix: DM thread session seeding causing cross-thread contamination ([#7084](https://github.com/NousResearch/hermes-agent/pull/7084))
|
||||
- Fix: OpenClaw migration shows dry-run preview before executing ([#6769](https://github.com/NousResearch/hermes-agent/pull/6769))
|
||||
- Fix: auth errors misclassified as retryable — @kuishou68 ([#7027](https://github.com/NousResearch/hermes-agent/pull/7027))
|
||||
- Fix: Copilot-Integration-Id header missing ([#7083](https://github.com/NousResearch/hermes-agent/pull/7083))
|
||||
- Fix: ACP session capabilities — @luyao618 ([#6985](https://github.com/NousResearch/hermes-agent/pull/6985))
|
||||
- Fix: ACP PromptResponse usage from top-level fields ([#7086](https://github.com/NousResearch/hermes-agent/pull/7086))
|
||||
- Fix: several failing/flaky tests on main — @dsocolobsky ([#6777](https://github.com/NousResearch/hermes-agent/pull/6777))
|
||||
- Fix: backup marker filenames — @sprmn24 ([#8600](https://github.com/NousResearch/hermes-agent/pull/8600))
|
||||
- Fix: `NoneType` in fast_mode check — @0xbyt4 ([#7350](https://github.com/NousResearch/hermes-agent/pull/7350))
|
||||
- Fix: missing imports in uninstall.py — @JiayuuWang ([#7034](https://github.com/NousResearch/hermes-agent/pull/7034))
|
||||
|
||||
---
|
||||
|
||||
## 📚 Documentation
|
||||
|
||||
- Platform adapter developer guide + WeCom Callback docs ([#7969](https://github.com/NousResearch/hermes-agent/pull/7969))
|
||||
- Cron troubleshooting guide ([#7122](https://github.com/NousResearch/hermes-agent/pull/7122))
|
||||
- Streaming timeout auto-detection for local LLMs ([#6990](https://github.com/NousResearch/hermes-agent/pull/6990))
|
||||
- Tool-use enforcement documentation expanded ([#7984](https://github.com/NousResearch/hermes-agent/pull/7984))
|
||||
- BlueBubbles pairing instructions ([#6548](https://github.com/NousResearch/hermes-agent/pull/6548))
|
||||
- Telegram proxy support section ([#6348](https://github.com/NousResearch/hermes-agent/pull/6348))
|
||||
- `hermes dump` and `hermes logs` CLI reference ([#6552](https://github.com/NousResearch/hermes-agent/pull/6552))
|
||||
- `tool_progress_overrides` configuration reference ([#6364](https://github.com/NousResearch/hermes-agent/pull/6364))
|
||||
- Compression model context length warning docs ([#7879](https://github.com/NousResearch/hermes-agent/pull/7879))
|
||||
|
||||
---
|
||||
|
||||
## 👥 Contributors
|
||||
|
||||
**269 merged PRs** from **24 contributors** across **487 commits**.
|
||||
|
||||
### Community Contributors
|
||||
- **@alt-glitch** (6 PRs) — Nix container-aware CLI, shared-state permissions, Matrix SQLite crypto store, bulk SSH/Modal file sync, Matrix mautrix compat
|
||||
- **@SHL0MS** (2 PRs) — Creative divergence strategies skill, creative ideation skill
|
||||
- **@sprmn24** (2 PRs) — Error classifier disambiguation, backup marker fix
|
||||
- **@nicoloboschi** — Hindsight memory plugin feature parity
|
||||
- **@Hygaard** — Session-scoped gateway model override fix
|
||||
- **@jarvis-phw** — Discord allowed_channels whitelist
|
||||
- **@Kathie-yu** — Honcho initOnSessionStart for tools mode
|
||||
- **@hermes-agent-dhabibi** — Discord forum channel topic inheritance
|
||||
- **@kira-ariaki** — Discord .log attachments and size limit
|
||||
- **@cherifya** — Codex fallback auth-store lookup
|
||||
- **@Cafexss** — Security: auth for session continuation
|
||||
- **@KUSH42** — Compaction context_length fix
|
||||
- **@kuishou68** — Auth error retryable classification fix
|
||||
- **@luyao618** — ACP session capabilities
|
||||
- **@ygd58** — HERMES_HOME_MODE env var override
|
||||
- **@0xbyt4** — Fast mode NoneType fix
|
||||
- **@JiayuuWang** — CLI uninstall import fix
|
||||
- **@HiddenPuppy** — Docker procps installation
|
||||
- **@dsocolobsky** — Test suite fixes
|
||||
- **@bobashopcashier** (1 PR) — Graceful gateway drain before restart (salvaged into #7503 from #7290)
|
||||
- **@benbarclay** — Docker image tag simplification
|
||||
- **@sosyz** — Shallow git clone for faster install
|
||||
- **@devorun** — Nix setupSecrets optional
|
||||
- **@ethernet8023** — Nix tirith runtime dep
|
||||
|
||||
---
|
||||
|
||||
**Full Changelog**: [v2026.4.8...v2026.4.13](https://github.com/NousResearch/hermes-agent/compare/v2026.4.8...v2026.4.13)
|
||||
@@ -1,172 +0,0 @@
|
||||
# Vector Database SOTA Research Report
|
||||
## For AI Agent Semantic Retrieval — April 2026
|
||||
|
||||
---
|
||||
|
||||
## Executive Summary
|
||||
|
||||
Analysis of current vector database benchmarks, documentation, and production deployments for semantic retrieval in AI agents. Compared against existing Hermes session_search (SQLite FTS5) and holographic memory systems.
|
||||
|
||||
---
|
||||
|
||||
## 1. Retrieval Accuracy (Recall@10)
|
||||
|
||||
| Database | HNSW Recall | IVF Recall | Notes |
|
||||
|----------|-------------|------------|-------|
|
||||
| **Qdrant** | 0.95-0.99 | N/A | Tunable via ef parameter |
|
||||
| **Milvus** | 0.95-0.99 | 0.85-0.95 | Multiple index support |
|
||||
| **Weaviate** | 0.95-0.98 | N/A | HNSW primary |
|
||||
| **Pinecone** | 0.95-0.99 | N/A | Managed, opaque tuning |
|
||||
| **ChromaDB** | 0.90-0.95 | N/A | Simpler, uses HNSW via hnswlib |
|
||||
| **pgvector** | 0.85-0.95 | 0.80-0.90 | Depends on tuning |
|
||||
| **SQLite-vss** | 0.80-0.90 | N/A | HNSW via sqlite-vss |
|
||||
| **Current FTS5** | ~0.60-0.75* | N/A | Keyword matching only |
|
||||
|
||||
*FTS5 "recall" estimated: good for exact keywords, poor for semantic/paraphrased queries.
|
||||
|
||||
---
|
||||
|
||||
## 2. Latency Benchmarks (1M vectors, 768-dim, 10 neighbors)
|
||||
|
||||
| Database | p50 (ms) | p99 (ms) | QPS | Notes |
|
||||
|----------|----------|----------|-----|-------|
|
||||
| **Qdrant** | 1-3 | 5-10 | 5,000-15,000 | Best self-hosted |
|
||||
| **Milvus** | 2-5 | 8-15 | 3,000-12,000 | Good distributed |
|
||||
| **Weaviate** | 3-8 | 10-25 | 2,000-8,000 | |
|
||||
| **Pinecone** | 5-15 | 20-50 | 1,000-5,000 | Managed overhead |
|
||||
| **ChromaDB** | 5-15 | 20-50 | 500-2,000 | Embedded mode |
|
||||
| **pgvector** | 10-50 | 50-200 | 200-1,000 | SQL overhead |
|
||||
| **SQLite-vss** | 10-30 | 50-150 | 300-800 | Limited scalability |
|
||||
| **Current FTS5** | 2-10 | 15-50 | 1,000-5,000 | No embedding cost |
|
||||
|
||||
---
|
||||
|
||||
## 3. Index Types Comparison
|
||||
|
||||
### HNSW (Hierarchical Navigable Small World)
|
||||
- Best for: High recall, moderate memory, fast queries
|
||||
- Used by: Qdrant, Weaviate, ChromaDB, Milvus, pgvector, SQLite-vss
|
||||
- Memory: High (~1.5GB per 1M 768-dim vectors)
|
||||
- Key parameters: ef_construction (100-500), M (16-64), ef (64-256)
|
||||
|
||||
### IVF (Inverted File Index)
|
||||
- Best for: Large datasets, memory-constrained
|
||||
- Used by: Milvus, pgvector
|
||||
- Memory: Lower (~0.5GB per 1M vectors)
|
||||
- Key parameters: nlist (100-10000), nprobe (10-100)
|
||||
|
||||
### DiskANN / SPANN
|
||||
- Best for: 100M+ vectors on disk
|
||||
- Memory: Very low (~100MB index)
|
||||
|
||||
### Quantization (SQ/PQ)
|
||||
- Memory reduction: 4-8x
|
||||
- Recall impact: -5-15%
|
||||
|
||||
---
|
||||
|
||||
## 4. Multi-Modal Support
|
||||
|
||||
| Database | Text | Image | Audio | Video | Mixed Queries |
|
||||
|----------|------|-------|-------|-------|---------------|
|
||||
| Qdrant | ✅ | ✅ | ✅ | ✅ | ✅ (multi-vector) |
|
||||
| Milvus | ✅ | ✅ | ✅ | ✅ | ✅ (hybrid) |
|
||||
| Weaviate | ✅ | ✅ | ✅ | ✅ | ✅ (named vectors) |
|
||||
| Pinecone | ✅ | ✅ | ✅ | ✅ | Limited |
|
||||
| ChromaDB | ✅ | Via emb | Via emb | Via emb | Limited |
|
||||
| pgvector | ✅ | Via emb | Via emb | Via emb | Limited |
|
||||
| SQLite-vss | ✅ | Via emb | Via emb | Via emb | Limited |
|
||||
|
||||
---
|
||||
|
||||
## 5. Integration Patterns for AI Agents
|
||||
|
||||
### Pattern A: Direct Search
|
||||
Query → Embedding → Vector DB → Top-K → LLM
|
||||
|
||||
### Pattern B: Hybrid Search
|
||||
Query → BM25 + Vector → Merge/Rerank → LLM
|
||||
|
||||
### Pattern C: Multi-Stage
|
||||
Query → Vector DB (top-100) → Reranker (top-10) → LLM
|
||||
|
||||
### Pattern D: Agent Memory with Trust + Decay
|
||||
Query → Vector → Score × Trust × Decay → Top-K → Summarize
|
||||
|
||||
---
|
||||
|
||||
## 6. Comparison with Current Systems
|
||||
|
||||
### session_search (FTS5)
|
||||
Strengths: Zero deps, no embedding needed, fast for exact keywords
|
||||
Limitations: No semantic understanding, no cross-lingual, limited ranking
|
||||
|
||||
### holographic/retrieval.py (HRR)
|
||||
Strengths: Compositional queries, contradiction detection, trust + decay
|
||||
Limitations: Requires numpy, O(n) scan, non-standard embedding space
|
||||
|
||||
### Expected Gains from Vector DB:
|
||||
- Semantic recall: +30-50% for paraphrased queries
|
||||
- Cross-lingual: +60-80%
|
||||
- Fuzzy matching: +40-60%
|
||||
- Conceptual: +50-70%
|
||||
|
||||
---
|
||||
|
||||
## 7. Recommendations
|
||||
|
||||
### Option 1: Qdrant (RECOMMENDED)
|
||||
- Best self-hosted performance
|
||||
- Rust implementation, native multi-vector
|
||||
- Tradeoff: Separate service deployment
|
||||
|
||||
### Option 2: pgvector (CONSERVATIVE)
|
||||
- Zero new infrastructure if using PostgreSQL
|
||||
- Tradeoff: 5-10x slower than Qdrant
|
||||
|
||||
### Option 3: SQLite-vss (LIGHTWEIGHT)
|
||||
- Minimal changes, embedded deployment
|
||||
- Tradeoff: Limited scalability (<100K vectors)
|
||||
|
||||
### Option 4: Hybrid (BEST OF BOTH)
|
||||
Keep FTS5 + HRR and add Qdrant:
|
||||
- Vector (semantic) + FTS5 (keyword) + HRR (compositional)
|
||||
- Apply trust scoring + temporal decay
|
||||
|
||||
---
|
||||
|
||||
## 8. Embedding Models (2025-2026)
|
||||
|
||||
| Model | Dimensions | Quality | Cost |
|
||||
|-------|-----------|---------|------|
|
||||
| OpenAI text-embedding-3-large | 3072 | Best | $$$ |
|
||||
| OpenAI text-embedding-3-small | 1536 | Good | $ |
|
||||
| BGE-M3 | 1024 | Best self-hosted | Free |
|
||||
| GTE-Qwen2 | 768-1024 | Good | Free |
|
||||
|
||||
---
|
||||
|
||||
## 9. Hardware Requirements (1M vectors, 768-dim)
|
||||
|
||||
| Database | RAM (HNSW) | RAM (Quantized) |
|
||||
|----------|-----------|-----------------|
|
||||
| Qdrant | 8-16GB | 2-4GB |
|
||||
| Milvus | 16-32GB | 4-8GB |
|
||||
| pgvector | 4-8GB | N/A |
|
||||
| SQLite-vss | 2-4GB | N/A |
|
||||
|
||||
---
|
||||
|
||||
## 10. Conclusion
|
||||
|
||||
Primary: Qdrant with hybrid search (vector + FTS5 + HRR)
|
||||
Key insight: Augment existing HRR system, don't replace it.
|
||||
|
||||
Next steps:
|
||||
1. Deploy Qdrant in Docker for testing
|
||||
2. Benchmark embedding models
|
||||
3. Implement hybrid search prototype
|
||||
4. Measure recall improvement
|
||||
5. Evaluate operational complexity
|
||||
|
||||
Report: April 2026 | Sources: ANN-Benchmarks, VectorDBBench, official docs
|
||||
@@ -36,7 +36,6 @@ from acp.schema import (
|
||||
SessionCapabilities,
|
||||
SessionForkCapabilities,
|
||||
SessionListCapabilities,
|
||||
SessionResumeCapabilities,
|
||||
SessionInfo,
|
||||
TextContentBlock,
|
||||
UnstructuredCommandInput,
|
||||
@@ -246,11 +245,9 @@ class HermesACPAgent(acp.Agent):
|
||||
protocol_version=acp.PROTOCOL_VERSION,
|
||||
agent_info=Implementation(name="hermes-agent", version=HERMES_VERSION),
|
||||
agent_capabilities=AgentCapabilities(
|
||||
load_session=True,
|
||||
session_capabilities=SessionCapabilities(
|
||||
fork=SessionForkCapabilities(),
|
||||
list=SessionListCapabilities(),
|
||||
resume=SessionResumeCapabilities(),
|
||||
),
|
||||
),
|
||||
auth_methods=auth_methods,
|
||||
@@ -454,13 +451,14 @@ class HermesACPAgent(acp.Agent):
|
||||
await conn.session_update(session_id, update)
|
||||
|
||||
usage = None
|
||||
if any(result.get(key) is not None for key in ("prompt_tokens", "completion_tokens", "total_tokens")):
|
||||
usage_data = result.get("usage")
|
||||
if usage_data and isinstance(usage_data, dict):
|
||||
usage = Usage(
|
||||
input_tokens=result.get("prompt_tokens", 0),
|
||||
output_tokens=result.get("completion_tokens", 0),
|
||||
total_tokens=result.get("total_tokens", 0),
|
||||
thought_tokens=result.get("reasoning_tokens"),
|
||||
cached_read_tokens=result.get("cache_read_tokens"),
|
||||
input_tokens=usage_data.get("prompt_tokens", 0),
|
||||
output_tokens=usage_data.get("completion_tokens", 0),
|
||||
total_tokens=usage_data.get("total_tokens", 0),
|
||||
thought_tokens=usage_data.get("reasoning_tokens"),
|
||||
cached_read_tokens=usage_data.get("cached_tokens"),
|
||||
)
|
||||
|
||||
stop_reason = "cancelled" if state.cancel_event and state.cancel_event.is_set() else "end_turn"
|
||||
|
||||
@@ -1,443 +0,0 @@
|
||||
"""
|
||||
A2A mutual-TLS server — secure agent-to-agent communication.
|
||||
|
||||
Each fleet agent runs an A2A server that:
|
||||
- Presents its own TLS certificate (signed by the fleet CA).
|
||||
- Requires the connecting peer to present a valid client certificate
|
||||
also signed by the fleet CA.
|
||||
- Rejects connections from unknown / self-signed peers.
|
||||
|
||||
Usage (standalone):
|
||||
python -m agent.a2a_mtls \\
|
||||
--cert ~/.hermes/pki/agents/timmy/timmy.crt \\
|
||||
--key ~/.hermes/pki/agents/timmy/timmy.key \\
|
||||
--ca ~/.hermes/pki/ca/fleet-ca.crt \\
|
||||
--host 0.0.0.0 --port 9443
|
||||
|
||||
Environment variables (alternative to CLI flags):
|
||||
HERMES_A2A_CERT path to agent certificate
|
||||
HERMES_A2A_KEY path to agent private key
|
||||
HERMES_A2A_CA path to fleet CA certificate
|
||||
|
||||
Refs #806
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import ssl
|
||||
import threading
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Optional
|
||||
from urllib.error import URLError
|
||||
from urllib.request import Request, urlopen
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# mTLS SSL context helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_server_ssl_context(
    cert: str | Path,
    key: str | Path,
    ca: str | Path,
) -> ssl.SSLContext:
    """Build the server-side SSLContext for mutually-authenticated TLS.

    The context presents *cert/key* as the server identity and only
    accepts peers whose client certificate chains to *ca*.

    Raises ``FileNotFoundError`` if any path is missing.
    Raises ``ssl.SSLError`` if the files are malformed.
    """
    cert_path, key_path, ca_path = (Path(p) for p in (cert, key, ca))
    for path in (cert_path, key_path, ca_path):
        if not path.exists():
            raise FileNotFoundError(f"mTLS: file not found: {path}")

    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
    ctx.minimum_version = ssl.TLSVersion.TLSv1_2
    ctx.load_cert_chain(certfile=str(cert_path), keyfile=str(key_path))
    ctx.load_verify_locations(cafile=str(ca_path))
    # CERT_REQUIRED — reject peers that don't present a cert signed by *ca*.
    ctx.verify_mode = ssl.CERT_REQUIRED
    return ctx
|
||||
|
||||
|
||||
def build_client_ssl_context(
    cert: str | Path,
    key: str | Path,
    ca: str | Path,
) -> ssl.SSLContext:
    """Build the client-side SSLContext for an outgoing mTLS connection.

    Presents *cert/key* as the client identity and verifies the server
    certificate against *ca*.

    Raises ``FileNotFoundError`` if any path is missing.
    """
    cert_path, key_path, ca_path = (Path(p) for p in (cert, key, ca))
    for path in (cert_path, key_path, ca_path):
        if not path.exists():
            raise FileNotFoundError(f"mTLS client: file not found: {path}")

    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ctx.minimum_version = ssl.TLSVersion.TLSv1_2
    ctx.load_cert_chain(certfile=str(cert_path), keyfile=str(key_path))
    ctx.load_verify_locations(cafile=str(ca_path))
    # Verify server identity: CA chain AND hostname must both check out.
    ctx.verify_mode = ssl.CERT_REQUIRED
    ctx.check_hostname = True
    return ctx
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Minimal A2A HTTP request handler
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class A2AHandler(BaseHTTPRequestHandler):
    """Handles A2A requests over a mutually-authenticated TLS connection.

    GET  /.well-known/agent-card.json — returns the local agent card.
    POST /a2a/task                    — dispatches an A2A task (stub).
    """

    # NOTE: the original class also had ``log_message = logger.debug`` here.
    # That attribute was dead code — it is shadowed by the ``log_message``
    # method below, and its signature would not match the handler hook
    # (which receives ``self``) if the method were ever removed.

    def do_GET(self) -> None:  # noqa: N802
        """Serve the agent card on its well-known paths; 404 otherwise."""
        if self.path in ("/.well-known/agent-card.json", "/agent-card.json"):
            self._serve_agent_card()
        else:
            self._send_json(404, {"error": "not found"})

    def do_POST(self) -> None:  # noqa: N802
        """Accept A2A task submissions; 404 for unknown paths."""
        if self.path == "/a2a/task":
            self._handle_task()
        else:
            self._send_json(404, {"error": "not found"})

    # ------------------------------------------------------------------
    def _serve_agent_card(self) -> None:
        """Return the local agent card JSON, with a best-effort fallback."""
        try:
            from agent.agent_card import get_agent_card_json
            body = get_agent_card_json().encode()
        except Exception as exc:
            # Card construction failures must not take down the endpoint.
            logger.warning("agent-card unavailable: %s", exc)
            body = b'{"error": "agent card unavailable"}'
        self._send_raw(200, "application/json", body)

    def _handle_task(self) -> None:
        """Stub task endpoint: consume the request body, acknowledge with 202.

        NOTE(review): ``getpeercert`` describes the *connecting client*, so
        the ``handled_by`` value echoes the caller's CN, not this server's
        identity — the original comment claimed otherwise. Payload kept
        byte-identical for wire compatibility; confirm intended semantics.
        """
        length = int(self.headers.get("Content-Length", 0))
        _body = self.rfile.read(length) if length else b""
        peer_cn = _peer_cn(self.connection)
        self._send_json(202, {"status": "accepted", "handled_by": peer_cn})

    # ------------------------------------------------------------------
    def _send_json(self, code: int, data: dict) -> None:
        """Serialize *data* and send it as an application/json response."""
        # Module-level ``json`` is already imported; the original's local
        # ``import json`` was redundant.
        body = json.dumps(data).encode()
        self._send_raw(code, "application/json", body)

    def _send_raw(self, code: int, content_type: str, body: bytes) -> None:
        """Send *body* with status *code* and correct framing headers."""
        self.send_response(code)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def log_message(self, fmt: str, *args: object) -> None:  # type: ignore[override]
        """Route the default stderr access log to the module logger."""
        logger.debug("a2a: " + fmt, *args)
|
||||
|
||||
|
||||
def _peer_cn(conn: ssl.SSLSocket) -> Optional[str]:
|
||||
"""Extract the Common Name from the peer certificate, or None."""
|
||||
try:
|
||||
peer = conn.getpeercert()
|
||||
if not peer:
|
||||
return None
|
||||
for rdn in peer.get("subject", ()):
|
||||
for key, val in rdn:
|
||||
if key == "commonName":
|
||||
return val
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Server lifecycle
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class A2AServer:
    """Mutual-TLS A2A server.

    Example::

        server = A2AServer(
            cert="~/.hermes/pki/agents/timmy/timmy.crt",
            key="~/.hermes/pki/agents/timmy/timmy.key",
            ca="~/.hermes/pki/ca/fleet-ca.crt",
        )
        server.start()  # non-blocking (daemon thread)
        ...
        server.stop()
    """

    def __init__(
        self,
        cert: str | Path,
        key: str | Path,
        ca: str | Path,
        host: str = "0.0.0.0",
        port: int = 9443,
    ) -> None:
        # Expand "~" so paths copied from config files work unchanged.
        self.cert = Path(cert).expanduser()
        self.key = Path(key).expanduser()
        self.ca = Path(ca).expanduser()
        self.host = host
        self.port = port
        self._httpd: Optional[HTTPServer] = None
        self._thread: Optional[threading.Thread] = None

    def start(self, daemon: bool = True) -> None:
        """Start the server in a background thread (default: daemon)."""
        tls = build_server_ssl_context(self.cert, self.key, self.ca)
        self._httpd = HTTPServer((self.host, self.port), A2AHandler)
        self._httpd.socket = tls.wrap_socket(self._httpd.socket, server_side=True)
        self._thread = threading.Thread(target=self._httpd.serve_forever, daemon=daemon)
        self._thread.start()
        logger.info(
            "A2A mTLS server listening on %s:%s (cert=%s)",
            self.host, self.port, self.cert.name,
        )

    def stop(self) -> None:
        """Shut down the listener and join the worker thread (5s timeout)."""
        httpd, self._httpd = self._httpd, None
        if httpd:
            httpd.shutdown()
        worker, self._thread = self._thread, None
        if worker:
            worker.join(timeout=5)
|
||||
|
||||
|
||||
def server_from_env() -> A2AServer:
    """Build an A2AServer from environment variables / defaults."""
    env = os.environ
    hermes_home = Path(env.get("HERMES_HOME", Path.home() / ".hermes"))
    agent_name = env.get("HERMES_AGENT_NAME", "hermes").lower()

    # Default PKI layout: ~/.hermes/pki/agents/<name>/<name>.{crt,key}
    agent_pki = hermes_home / "pki" / "agents" / agent_name

    return A2AServer(
        cert=env.get("HERMES_A2A_CERT", str(agent_pki / f"{agent_name}.crt")),
        key=env.get("HERMES_A2A_KEY", str(agent_pki / f"{agent_name}.key")),
        ca=env.get("HERMES_A2A_CA", str(hermes_home / "pki" / "ca" / "fleet-ca.crt")),
        host=env.get("HERMES_A2A_HOST", "0.0.0.0"),
        port=int(env.get("HERMES_A2A_PORT", "9443")),
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _main() -> None:
    """CLI entry point: parse certificate flags and serve in the foreground."""
    import argparse

    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

    parser = argparse.ArgumentParser(description="Hermes A2A mutual-TLS server")
    for flag, help_text in (
        ("--cert", "Path to agent certificate"),
        ("--key", "Path to agent private key"),
        ("--ca", "Path to fleet CA certificate"),
    ):
        parser.add_argument(flag, required=True, help=help_text)
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--port", type=int, default=9443)
    args = parser.parse_args()

    server = A2AServer(
        cert=args.cert, key=args.key, ca=args.ca,
        host=args.host, port=args.port,
    )
    # daemon=False: the serving thread keeps the process alive.
    server.start(daemon=False)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Standalone mode: run the mTLS server from CLI flags (see _main).
    _main()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# A2AMTLSServer — routing-based server with context-manager support
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class _RoutingHandler(BaseHTTPRequestHandler):
|
||||
"""HTTP request handler that dispatches to per-path callables."""
|
||||
|
||||
routes: Dict[str, Callable] = {}
|
||||
|
||||
def log_message(self, fmt: str, *args: Any) -> None:
|
||||
logger.debug("A2AMTLSServer: " + fmt, *args)
|
||||
|
||||
def _peer_cn(self) -> Optional[str]:
|
||||
cert = self.connection.getpeercert() # type: ignore[attr-defined]
|
||||
if not cert:
|
||||
return None
|
||||
for rdn in cert.get("subject", ()):
|
||||
for attr, value in rdn:
|
||||
if attr == "commonName":
|
||||
return value
|
||||
return None
|
||||
|
||||
def do_POST(self) -> None:
|
||||
handler = self.routes.get(self.path)
|
||||
if handler is None:
|
||||
self.send_response(404)
|
||||
self.end_headers()
|
||||
return
|
||||
length = int(self.headers.get("Content-Length", 0))
|
||||
body = self.rfile.read(length) if length else b""
|
||||
try:
|
||||
payload = json.loads(body) if body else {}
|
||||
except json.JSONDecodeError:
|
||||
self.send_response(400)
|
||||
self.end_headers()
|
||||
return
|
||||
result = handler(payload, peer_cn=self._peer_cn())
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "application/json")
|
||||
self.end_headers()
|
||||
self.wfile.write(json.dumps(result).encode())
|
||||
|
||||
def do_GET(self) -> None:
|
||||
handler = self.routes.get(self.path)
|
||||
if handler is None:
|
||||
self.send_response(404)
|
||||
self.end_headers()
|
||||
return
|
||||
result = handler({}, peer_cn=self._peer_cn())
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "application/json")
|
||||
self.end_headers()
|
||||
self.wfile.write(json.dumps(result).encode())
|
||||
|
||||
|
||||
class A2AMTLSServer:
    """Routing-based mTLS HTTPS server with context-manager support.

    Unlike ``A2AServer`` (which serves fixed A2A paths), this server lets
    callers register arbitrary path handlers — useful for tests and custom
    A2A endpoint implementations.

    handler signature: ``handler(payload: dict, *, peer_cn: str | None) -> dict``

    Example::

        server = A2AMTLSServer(cert="timmy.crt", key="timmy.key", ca="fleet-ca.crt")
        server.add_route("/tasks/send", my_handler)
        with server:
            ...  # server runs for the duration of the block
    """

    def __init__(
        self,
        cert: str | Path,
        key: str | Path,
        ca: str | Path,
        host: str = "127.0.0.1",
        port: int = 9443,
    ) -> None:
        # Expand "~" so config-file paths work unchanged.
        self.cert = Path(cert).expanduser()
        self.key = Path(key).expanduser()
        self.ca = Path(ca).expanduser()
        self.host = host
        self.port = port
        self._routes: Dict[str, Callable] = {}
        self._httpd: Optional[HTTPServer] = None
        self._thread: Optional[threading.Thread] = None

    def add_route(self, path: str, handler: Callable) -> None:
        """Register *handler* for requests to *path*."""
        self._routes[path] = handler

    def start(self) -> None:
        """Start serving on a background daemon thread."""
        tls = build_server_ssl_context(self.cert, self.key, self.ca)

        # Bind the route table via a per-server subclass: http.server
        # instantiates the handler class itself, so state must be class-level.
        class _Handler(_RoutingHandler):
            routes = self._routes

        self._httpd = HTTPServer((self.host, self.port), _Handler)
        self._httpd.socket = tls.wrap_socket(self._httpd.socket, server_side=True)
        self._thread = threading.Thread(
            target=self._httpd.serve_forever,
            daemon=True,
            name=f"a2a-mtls-{self.port}",
        )
        self._thread.start()
        logger.info("A2AMTLSServer on %s:%d (mTLS)", self.host, self.port)

    def stop(self) -> None:
        """Shut down the listener and join the worker thread (5s timeout)."""
        httpd, self._httpd = self._httpd, None
        if httpd:
            httpd.shutdown()
        worker, self._thread = self._thread, None
        if worker:
            worker.join(timeout=5)

    def __enter__(self) -> "A2AMTLSServer":
        self.start()
        return self

    def __exit__(self, *_: Any) -> None:
        self.stop()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# A2AMTLSClient — mTLS HTTP client
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class A2AMTLSClient:
    """HTTP client that presents a fleet cert on every outgoing connection.

    Example::

        client = A2AMTLSClient(cert="allegro.crt", key="allegro.key", ca="fleet-ca.crt")
        result = client.post("https://timmy:9443/tasks/send", json={"task": "..."})
    """

    def __init__(
        self,
        cert: str | Path,
        key: str | Path,
        ca: str | Path,
    ) -> None:
        self._ssl_ctx = build_client_ssl_context(cert, key, ca)
        # Callers often connect by raw IP, where hostname verification would
        # always fail; the CA check still authenticates the peer.
        self._ssl_ctx.check_hostname = False  # callers connecting by IP

    def _request(
        self,
        method: str,
        url: str,
        data: Optional[bytes] = None,
        timeout: float = 10.0,
    ) -> Dict[str, Any]:
        """Perform *method* on *url* and return the decoded JSON body.

        Raises ``ConnectionError`` on any transport-level failure.
        """
        headers = {"Content-Type": "application/json"}
        req = Request(url, data=data, headers=headers, method=method)
        try:
            with urlopen(req, context=self._ssl_ctx, timeout=timeout) as resp:
                body = resp.read()
                return json.loads(body) if body else {}
        except URLError as exc:
            raise ConnectionError(f"A2AMTLSClient {method} {url} failed: {exc.reason}") from exc

    def get(self, url: str, **kwargs: Any) -> Dict[str, Any]:
        """GET *url* and return the decoded JSON response."""
        return self._request("GET", url, **kwargs)

    def post(self, url: str, json: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Dict[str, Any]:
        """POST *json* (if given) to *url* and return the decoded response."""
        # The ``json`` parameter shadows the module, so bind the module under
        # an alias — cleaner than the original ``__import__("json")`` hack.
        # (The parameter name is kept: callers pass ``json=...`` by keyword.)
        import json as _json
        data = _json.dumps(json).encode() if json is not None else None
        return self._request("POST", url, data=data, **kwargs)
|
||||
@@ -1,135 +0,0 @@
|
||||
"""
|
||||
Agent Card — A2A-compliant agent discovery.
|
||||
Part of #843: fix: implement A2A agent card for fleet discovery (#819)
|
||||
|
||||
Provides metadata about the agent's identity, capabilities, and installed skills
|
||||
for discovery by other agents in the fleet.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from hermes_cli import __version__
|
||||
from hermes_cli.config import load_config, get_hermes_home
|
||||
from agent.skill_utils import (
|
||||
iter_skill_index_files,
|
||||
parse_frontmatter,
|
||||
get_all_skills_dirs,
|
||||
get_disabled_skill_names,
|
||||
skill_matches_platform
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@dataclass
class AgentSkill:
    """Metadata for one installed skill, as advertised on the agent card."""

    # Stable identifier — the skill's frontmatter name, or its directory
    # name when no frontmatter name is set (see _load_skills).
    id: str
    name: str
    description: str = ""
    version: str = "1.0.0"
|
||||
|
||||
@dataclass
class AgentCapabilities:
    """Feature flags describing what this agent can do."""

    streaming: bool = True
    tools: bool = True
    # vision/reasoning default to False: build_agent_card does not probe
    # model metadata to enable them.
    vision: bool = False
    reasoning: bool = False
|
||||
|
||||
@dataclass
class AgentCard:
    """A2A agent card: identity and capability metadata for fleet discovery."""

    name: str
    description: str
    # Base URL where this agent is reachable.
    url: str
    version: str = __version__
    capabilities: AgentCapabilities = field(default_factory=AgentCapabilities)
    skills: List[AgentSkill] = field(default_factory=list)
    # camelCase field names presumably match the A2A agent-card wire schema
    # (serialized via asdict) — do not rename without checking consumers.
    defaultInputModes: List[str] = field(default_factory=lambda: ["text/plain"])
    defaultOutputModes: List[str] = field(default_factory=lambda: ["text/plain"])
|
||||
|
||||
def _load_skills() -> List[AgentSkill]:
    """Scan all enabled skills and return metadata."""
    disabled = get_disabled_skill_names()
    found: List[AgentSkill] = []

    for skills_dir in get_all_skills_dirs():
        if not skills_dir.is_dir():
            continue
        for skill_file in iter_skill_index_files(skills_dir, "SKILL.md"):
            try:
                frontmatter, _ = parse_frontmatter(skill_file.read_text(encoding="utf-8"))
            except Exception:
                # Unreadable or malformed skill files are skipped silently.
                continue

            # Fall back to the directory name when frontmatter has no name.
            skill_name = frontmatter.get("name") or skill_file.parent.name
            if str(skill_name) in disabled:
                continue
            if not skill_matches_platform(frontmatter):
                continue

            found.append(
                AgentSkill(
                    id=str(skill_name),
                    name=str(frontmatter.get("name", skill_name)),
                    description=str(frontmatter.get("description", "")),
                    version=str(frontmatter.get("version", "1.0.0")),
                )
            )
    return found
|
||||
|
||||
def build_agent_card() -> AgentCard:
    """Build the agent card from current configuration and environment."""
    config = load_config()
    agent_cfg = config.get("agent", {})

    # Identity: environment overrides config, config overrides defaults.
    name = os.environ.get("HERMES_AGENT_NAME") or agent_cfg.get("name") or "hermes"
    description = (
        os.environ.get("HERMES_AGENT_DESCRIPTION")
        or agent_cfg.get("description")
        or "Sovereign AI agent"
    )

    # Reachable URL, derived from the web server host/port settings.
    host = os.environ.get("HERMES_WEB_HOST") or "localhost"
    port = os.environ.get("HERMES_WEB_PORT") or "9119"

    return AgentCard(
        name=name,
        description=description,
        url=f"http://{host}:{port}",
        version=__version__,
        # Conservative defaults — enabling vision/reasoning would require
        # checking model metadata, which this function does not do.
        capabilities=AgentCapabilities(
            streaming=True,
            tools=True,
            vision=False,
            reasoning=False,
        ),
        skills=_load_skills(),
    )
|
||||
|
||||
def get_agent_card_json() -> str:
    """Return the agent card as a JSON string."""
    try:
        return json.dumps(asdict(build_agent_card()), indent=2)
    except Exception as e:
        logger.error(f"Failed to build agent card: {e}")
        # Minimal fallback card so discovery endpoints keep responding.
        fallback = {
            "name": "hermes",
            "description": "Sovereign AI agent (fallback)",
            "version": __version__,
            "error": str(e)
        }
        return json.dumps(fallback, indent=2)
|
||||
|
||||
def validate_agent_card(card_data: Dict[str, Any]) -> bool:
    """Check if the card data complies with the A2A schema."""
    # Only presence of the mandatory top-level fields is checked,
    # not their types or values.
    for required_field in ("name", "description", "url", "version"):
        if required_field not in card_data:
            return False
    return True
|
||||
@@ -60,8 +60,6 @@ _ANTHROPIC_OUTPUT_LIMITS = {
|
||||
"claude-3-opus": 4_096,
|
||||
"claude-3-sonnet": 4_096,
|
||||
"claude-3-haiku": 4_096,
|
||||
# Third-party Anthropic-compatible providers
|
||||
"minimax": 131_072,
|
||||
}
|
||||
|
||||
# For any model not in the table, assume the highest current limit.
|
||||
@@ -76,11 +74,8 @@ def _get_anthropic_max_output(model: str) -> int:
|
||||
model IDs (claude-sonnet-4-5-20250929) and variant suffixes (:1m, :fast)
|
||||
resolve correctly. Longest-prefix match wins to avoid e.g. "claude-3-5"
|
||||
matching before "claude-3-5-sonnet".
|
||||
|
||||
Normalizes dots to hyphens so that model names like
|
||||
``anthropic/claude-opus-4.6`` match the ``claude-opus-4-6`` table key.
|
||||
"""
|
||||
m = model.lower().replace(".", "-")
|
||||
m = model.lower()
|
||||
best_key = ""
|
||||
best_val = _ANTHROPIC_DEFAULT_OUTPUT_LIMIT
|
||||
for key, val in _ANTHROPIC_OUTPUT_LIMITS.items():
|
||||
@@ -100,15 +95,6 @@ _COMMON_BETAS = [
|
||||
"interleaved-thinking-2025-05-14",
|
||||
"fine-grained-tool-streaming-2025-05-14",
|
||||
]
|
||||
# MiniMax's Anthropic-compatible endpoints fail tool-use requests when
|
||||
# the fine-grained tool streaming beta is present. Omit it so tool calls
|
||||
# fall back to the provider's default response path.
|
||||
_TOOL_STREAMING_BETA = "fine-grained-tool-streaming-2025-05-14"
|
||||
|
||||
# Fast mode beta — enables the ``speed: "fast"`` request parameter for
|
||||
# significantly higher output token throughput on Opus 4.6 (~2.5x).
|
||||
# See https://platform.claude.com/docs/en/build-with-claude/fast-mode
|
||||
_FAST_MODE_BETA = "fast-mode-2026-02-01"
|
||||
|
||||
# Additional beta headers required for OAuth/subscription auth.
|
||||
# Matches what Claude Code (and pi-ai / OpenCode) send.
|
||||
@@ -163,27 +149,18 @@ def _get_claude_code_version() -> str:
|
||||
|
||||
|
||||
def _is_oauth_token(key: str) -> bool:
|
||||
"""Check if the key is an Anthropic OAuth/setup token.
|
||||
"""Check if the key is an OAuth/setup token (not a regular Console API key).
|
||||
|
||||
Positively identifies Anthropic OAuth tokens by their key format:
|
||||
- ``sk-ant-`` prefix (but NOT ``sk-ant-api``) → setup tokens, managed keys
|
||||
- ``eyJ`` prefix → JWTs from the Anthropic OAuth flow
|
||||
|
||||
Non-Anthropic keys (MiniMax, Alibaba, etc.) don't match either pattern
|
||||
and correctly return False.
|
||||
Regular API keys start with 'sk-ant-api'. Everything else (setup-tokens
|
||||
starting with 'sk-ant-oat', managed keys, JWTs, etc.) needs Bearer auth.
|
||||
"""
|
||||
if not key:
|
||||
return False
|
||||
# Regular Anthropic Console API keys — x-api-key auth, never OAuth
|
||||
# Regular Console API keys use x-api-key header
|
||||
if key.startswith("sk-ant-api"):
|
||||
return False
|
||||
# Anthropic-issued tokens (setup-tokens sk-ant-oat-*, managed keys)
|
||||
if key.startswith("sk-ant-"):
|
||||
return True
|
||||
# JWTs from Anthropic OAuth flow
|
||||
if key.startswith("eyJ"):
|
||||
return True
|
||||
return False
|
||||
# Everything else (setup-tokens, managed keys, JWTs) uses Bearer auth
|
||||
return True
|
||||
|
||||
|
||||
def _normalize_base_url_text(base_url) -> str:
|
||||
@@ -227,19 +204,6 @@ def _requires_bearer_auth(base_url: str | None) -> bool:
|
||||
return normalized.startswith(("https://api.minimax.io/anthropic", "https://api.minimaxi.com/anthropic"))
|
||||
|
||||
|
||||
def _common_betas_for_base_url(base_url: str | None) -> list[str]:
|
||||
"""Return the beta headers that are safe for the configured endpoint.
|
||||
|
||||
MiniMax's Anthropic-compatible endpoints (Bearer-auth) reject requests
|
||||
that include Anthropic's ``fine-grained-tool-streaming`` beta — every
|
||||
tool-use message triggers a connection error. Strip that beta for
|
||||
Bearer-auth endpoints while keeping all other betas intact.
|
||||
"""
|
||||
if _requires_bearer_auth(base_url):
|
||||
return [b for b in _COMMON_BETAS if b != _TOOL_STREAMING_BETA]
|
||||
return _COMMON_BETAS
|
||||
|
||||
|
||||
def build_anthropic_client(api_key: str, base_url: str = None):
|
||||
"""Create an Anthropic client, auto-detecting setup-tokens vs API keys.
|
||||
|
||||
@@ -258,7 +222,6 @@ def build_anthropic_client(api_key: str, base_url: str = None):
|
||||
}
|
||||
if normalized_base_url:
|
||||
kwargs["base_url"] = normalized_base_url
|
||||
common_betas = _common_betas_for_base_url(normalized_base_url)
|
||||
|
||||
if _requires_bearer_auth(normalized_base_url):
|
||||
# Some Anthropic-compatible providers (e.g. MiniMax) expect the API key in
|
||||
@@ -268,21 +231,21 @@ def build_anthropic_client(api_key: str, base_url: str = None):
|
||||
# not use Anthropic's sk-ant-api prefix and would otherwise be misread as
|
||||
# Anthropic OAuth/setup tokens.
|
||||
kwargs["auth_token"] = api_key
|
||||
if common_betas:
|
||||
kwargs["default_headers"] = {"anthropic-beta": ",".join(common_betas)}
|
||||
if _COMMON_BETAS:
|
||||
kwargs["default_headers"] = {"anthropic-beta": ",".join(_COMMON_BETAS)}
|
||||
elif _is_third_party_anthropic_endpoint(base_url):
|
||||
# Third-party proxies (Azure AI Foundry, AWS Bedrock, etc.) use their
|
||||
# own API keys with x-api-key auth. Skip OAuth detection — their keys
|
||||
# don't follow Anthropic's sk-ant-* prefix convention and would be
|
||||
# misclassified as OAuth tokens.
|
||||
kwargs["api_key"] = api_key
|
||||
if common_betas:
|
||||
kwargs["default_headers"] = {"anthropic-beta": ",".join(common_betas)}
|
||||
if _COMMON_BETAS:
|
||||
kwargs["default_headers"] = {"anthropic-beta": ",".join(_COMMON_BETAS)}
|
||||
elif _is_oauth_token(api_key):
|
||||
# OAuth access token / setup-token → Bearer auth + Claude Code identity.
|
||||
# Anthropic routes OAuth requests based on user-agent and headers;
|
||||
# without Claude Code's fingerprint, requests get intermittent 500s.
|
||||
all_betas = common_betas + _OAUTH_ONLY_BETAS
|
||||
all_betas = _COMMON_BETAS + _OAUTH_ONLY_BETAS
|
||||
kwargs["auth_token"] = api_key
|
||||
kwargs["default_headers"] = {
|
||||
"anthropic-beta": ",".join(all_betas),
|
||||
@@ -292,8 +255,8 @@ def build_anthropic_client(api_key: str, base_url: str = None):
|
||||
else:
|
||||
# Regular API key → x-api-key header + common betas
|
||||
kwargs["api_key"] = api_key
|
||||
if common_betas:
|
||||
kwargs["default_headers"] = {"anthropic-beta": ",".join(common_betas)}
|
||||
if _COMMON_BETAS:
|
||||
kwargs["default_headers"] = {"anthropic-beta": ",".join(_COMMON_BETAS)}
|
||||
|
||||
return _anthropic_sdk.Anthropic(**kwargs)
|
||||
|
||||
@@ -522,6 +485,35 @@ def _prefer_refreshable_claude_code_token(env_token: str, creds: Optional[Dict[s
|
||||
return None
|
||||
|
||||
|
||||
def get_anthropic_token_source(token: Optional[str] = None) -> str:
|
||||
"""Best-effort source classification for an Anthropic credential token."""
|
||||
token = (token or "").strip()
|
||||
if not token:
|
||||
return "none"
|
||||
|
||||
env_token = os.getenv("ANTHROPIC_TOKEN", "").strip()
|
||||
if env_token and env_token == token:
|
||||
return "anthropic_token_env"
|
||||
|
||||
cc_env_token = os.getenv("CLAUDE_CODE_OAUTH_TOKEN", "").strip()
|
||||
if cc_env_token and cc_env_token == token:
|
||||
return "claude_code_oauth_token_env"
|
||||
|
||||
creds = read_claude_code_credentials()
|
||||
if creds and creds.get("accessToken") == token:
|
||||
return str(creds.get("source") or "claude_code_credentials")
|
||||
|
||||
managed_key = read_claude_managed_key()
|
||||
if managed_key and managed_key == token:
|
||||
return "claude_json_primary_api_key"
|
||||
|
||||
api_key = os.getenv("ANTHROPIC_API_KEY", "").strip()
|
||||
if api_key and api_key == token:
|
||||
return "anthropic_api_key_env"
|
||||
|
||||
return "unknown"
|
||||
|
||||
|
||||
def resolve_anthropic_token() -> Optional[str]:
|
||||
"""Resolve an Anthropic token from all available sources.
|
||||
|
||||
@@ -728,6 +720,21 @@ def run_hermes_oauth_login_pure() -> Optional[Dict[str, Any]]:
|
||||
}
|
||||
|
||||
|
||||
def _save_hermes_oauth_credentials(access_token: str, refresh_token: str, expires_at_ms: int) -> None:
|
||||
"""Save OAuth credentials to ~/.hermes/.anthropic_oauth.json."""
|
||||
data = {
|
||||
"accessToken": access_token,
|
||||
"refreshToken": refresh_token,
|
||||
"expiresAt": expires_at_ms,
|
||||
}
|
||||
try:
|
||||
_HERMES_OAUTH_FILE.parent.mkdir(parents=True, exist_ok=True)
|
||||
_HERMES_OAUTH_FILE.write_text(json.dumps(data, indent=2), encoding="utf-8")
|
||||
_HERMES_OAUTH_FILE.chmod(0o600)
|
||||
except (OSError, IOError) as e:
|
||||
logger.debug("Failed to save Hermes OAuth credentials: %s", e)
|
||||
|
||||
|
||||
def read_hermes_oauth_credentials() -> Optional[Dict[str, Any]]:
|
||||
"""Read Hermes-managed OAuth credentials from ~/.hermes/.anthropic_oauth.json."""
|
||||
if _HERMES_OAUTH_FILE.exists():
|
||||
@@ -776,6 +783,39 @@ def _sanitize_tool_id(tool_id: str) -> str:
|
||||
return sanitized or "tool_0"
|
||||
|
||||
|
||||
def _convert_openai_image_part_to_anthropic(part: Dict[str, Any]) -> Optional[Dict[str, Any]]:
|
||||
"""Convert an OpenAI-style image block to Anthropic's image source format."""
|
||||
image_data = part.get("image_url", {})
|
||||
url = image_data.get("url", "") if isinstance(image_data, dict) else str(image_data)
|
||||
if not isinstance(url, str) or not url.strip():
|
||||
return None
|
||||
url = url.strip()
|
||||
|
||||
if url.startswith("data:"):
|
||||
header, sep, data = url.partition(",")
|
||||
if sep and ";base64" in header:
|
||||
media_type = header[5:].split(";", 1)[0] or "image/png"
|
||||
return {
|
||||
"type": "image",
|
||||
"source": {
|
||||
"type": "base64",
|
||||
"media_type": media_type,
|
||||
"data": data,
|
||||
},
|
||||
}
|
||||
|
||||
if url.startswith(("http://", "https://")):
|
||||
return {
|
||||
"type": "image",
|
||||
"source": {
|
||||
"type": "url",
|
||||
"url": url,
|
||||
},
|
||||
}
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]:
|
||||
"""Convert OpenAI tool definitions to Anthropic format."""
|
||||
if not tools:
|
||||
@@ -1195,31 +1235,13 @@ def build_anthropic_kwargs(
|
||||
preserve_dots: bool = False,
|
||||
context_length: Optional[int] = None,
|
||||
base_url: str | None = None,
|
||||
fast_mode: bool = False,
|
||||
) -> Dict[str, Any]:
|
||||
"""Build kwargs for anthropic.messages.create().
|
||||
|
||||
Naming note — two distinct concepts, easily confused:
|
||||
max_tokens = OUTPUT token cap for a single response.
|
||||
Anthropic's API calls this "max_tokens" but it only
|
||||
limits the *output*. Anthropic's own native SDK
|
||||
renamed it "max_output_tokens" for clarity.
|
||||
context_length = TOTAL context window (input tokens + output tokens).
|
||||
The API enforces: input_tokens + max_tokens ≤ context_length.
|
||||
Stored on the ContextCompressor; reduced on overflow errors.
|
||||
|
||||
When *max_tokens* is None the model's native output ceiling is used
|
||||
(e.g. 128K for Opus 4.6, 64K for Sonnet 4.6).
|
||||
|
||||
When *context_length* is provided and the model's native output ceiling
|
||||
exceeds it (e.g. a local endpoint with an 8K window), the output cap is
|
||||
clamped to context_length − 1. This only kicks in for unusually small
|
||||
context windows; for full-size models the native output cap is always
|
||||
smaller than the context window so no clamping happens.
|
||||
NOTE: this clamping does not account for prompt size — if the prompt is
|
||||
large, Anthropic may still reject the request. The caller must detect
|
||||
"max_tokens too large given prompt" errors and retry with a smaller cap
|
||||
(see parse_available_output_tokens_from_error + _ephemeral_max_output_tokens).
|
||||
When *max_tokens* is None, the model's native output limit is used
|
||||
(e.g. 128K for Opus 4.6, 64K for Sonnet 4.6). If *context_length*
|
||||
is provided, the effective limit is clamped so it doesn't exceed
|
||||
the context window.
|
||||
|
||||
When *is_oauth* is True, applies Claude Code compatibility transforms:
|
||||
system prompt prefix, tool name prefixing, and prompt sanitization.
|
||||
@@ -1229,24 +1251,15 @@ def build_anthropic_kwargs(
|
||||
|
||||
When *base_url* points to a third-party Anthropic-compatible endpoint,
|
||||
thinking block signatures are stripped (they are Anthropic-proprietary).
|
||||
|
||||
When *fast_mode* is True, adds ``extra_body["speed"] = "fast"`` and the
|
||||
fast-mode beta header for ~2.5x faster output throughput on Opus 4.6.
|
||||
Currently only supported on native Anthropic endpoints (not third-party
|
||||
compatible ones).
|
||||
"""
|
||||
system, anthropic_messages = convert_messages_to_anthropic(messages, base_url=base_url)
|
||||
anthropic_tools = convert_tools_to_anthropic(tools) if tools else []
|
||||
|
||||
model = normalize_model_name(model, preserve_dots=preserve_dots)
|
||||
# effective_max_tokens = output cap for this call (≠ total context window)
|
||||
effective_max_tokens = max_tokens or _get_anthropic_max_output(model)
|
||||
|
||||
# Clamp output cap to fit inside the total context window.
|
||||
# Only matters for small custom endpoints where context_length < native
|
||||
# output ceiling. For standard Anthropic models context_length (e.g.
|
||||
# 200K) is always larger than the output ceiling (e.g. 128K), so this
|
||||
# branch is not taken.
|
||||
# Clamp to context window if the user set a lower context_length
|
||||
# (e.g. custom endpoint with limited capacity).
|
||||
if context_length and effective_max_tokens > context_length:
|
||||
effective_max_tokens = max(context_length - 1, 1)
|
||||
|
||||
@@ -1316,10 +1329,9 @@ def build_anthropic_kwargs(
|
||||
# Map reasoning_config to Anthropic's thinking parameter.
|
||||
# Claude 4.6 models use adaptive thinking + output_config.effort.
|
||||
# Older models use manual thinking with budget_tokens.
|
||||
# MiniMax Anthropic-compat endpoints support thinking (manual mode only,
|
||||
# not adaptive). Haiku does NOT support extended thinking — skip entirely.
|
||||
# Haiku and MiniMax models do NOT support extended thinking — skip entirely.
|
||||
if reasoning_config and isinstance(reasoning_config, dict):
|
||||
if reasoning_config.get("enabled") is not False and "haiku" not in model.lower():
|
||||
if reasoning_config.get("enabled") is not False and "haiku" not in model.lower() and "minimax" not in model.lower():
|
||||
effort = str(reasoning_config.get("effort", "medium")).lower()
|
||||
budget = THINKING_BUDGET.get(effort, 8000)
|
||||
if _supports_adaptive_thinking(model):
|
||||
@@ -1333,20 +1345,6 @@ def build_anthropic_kwargs(
|
||||
kwargs["temperature"] = 1
|
||||
kwargs["max_tokens"] = max(effective_max_tokens, budget + 4096)
|
||||
|
||||
# ── Fast mode (Opus 4.6 only) ────────────────────────────────────
|
||||
# Adds extra_body.speed="fast" + the fast-mode beta header for ~2.5x
|
||||
# output speed. Only for native Anthropic endpoints — third-party
|
||||
# providers would reject the unknown beta header and speed parameter.
|
||||
if fast_mode and not _is_third_party_anthropic_endpoint(base_url):
|
||||
kwargs.setdefault("extra_body", {})["speed"] = "fast"
|
||||
# Build extra_headers with ALL applicable betas (the per-request
|
||||
# extra_headers override the client-level anthropic-beta header).
|
||||
betas = list(_common_betas_for_base_url(base_url))
|
||||
if is_oauth:
|
||||
betas.extend(_OAUTH_ONLY_BETAS)
|
||||
betas.append(_FAST_MODE_BETA)
|
||||
kwargs["extra_headers"] = {"anthropic-beta": ",".join(betas)}
|
||||
|
||||
return kwargs
|
||||
|
||||
|
||||
@@ -1408,4 +1406,4 @@ def normalize_anthropic_response(
|
||||
reasoning_details=reasoning_details or None,
|
||||
),
|
||||
finish_reason,
|
||||
)
|
||||
)
|
||||
File diff suppressed because it is too large
Load Diff
114
agent/builtin_memory_provider.py
Normal file
114
agent/builtin_memory_provider.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""BuiltinMemoryProvider — wraps MEMORY.md / USER.md as a MemoryProvider.
|
||||
|
||||
Always registered as the first provider. Cannot be disabled or removed.
|
||||
This is the existing Hermes memory system exposed through the provider
|
||||
interface for compatibility with the MemoryManager.
|
||||
|
||||
The actual storage logic lives in tools/memory_tool.py (MemoryStore).
|
||||
This provider is a thin adapter that delegates to MemoryStore and
|
||||
exposes the memory tool schema.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from agent.memory_provider import MemoryProvider
|
||||
from tools.registry import tool_error
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BuiltinMemoryProvider(MemoryProvider):
|
||||
"""Built-in file-backed memory (MEMORY.md + USER.md).
|
||||
|
||||
Always active, never disabled by other providers. The `memory` tool
|
||||
is handled by run_agent.py's agent-level tool interception (not through
|
||||
the normal registry), so get_tool_schemas() returns an empty list —
|
||||
the memory tool is already wired separately.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
memory_store=None,
|
||||
memory_enabled: bool = False,
|
||||
user_profile_enabled: bool = False,
|
||||
):
|
||||
self._store = memory_store
|
||||
self._memory_enabled = memory_enabled
|
||||
self._user_profile_enabled = user_profile_enabled
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
return "builtin"
|
||||
|
||||
def is_available(self) -> bool:
|
||||
"""Built-in memory is always available."""
|
||||
return True
|
||||
|
||||
def initialize(self, session_id: str, **kwargs) -> None:
|
||||
"""Load memory from disk if not already loaded."""
|
||||
if self._store is not None:
|
||||
self._store.load_from_disk()
|
||||
|
||||
def system_prompt_block(self) -> str:
|
||||
"""Return MEMORY.md and USER.md content for the system prompt.
|
||||
|
||||
Uses the frozen snapshot captured at load time. This ensures the
|
||||
system prompt stays stable throughout a session (preserving the
|
||||
prompt cache), even though the live entries may change via tool calls.
|
||||
"""
|
||||
if not self._store:
|
||||
return ""
|
||||
|
||||
parts = []
|
||||
if self._memory_enabled:
|
||||
mem_block = self._store.format_for_system_prompt("memory")
|
||||
if mem_block:
|
||||
parts.append(mem_block)
|
||||
if self._user_profile_enabled:
|
||||
user_block = self._store.format_for_system_prompt("user")
|
||||
if user_block:
|
||||
parts.append(user_block)
|
||||
|
||||
return "\n\n".join(parts)
|
||||
|
||||
def prefetch(self, query: str, *, session_id: str = "") -> str:
|
||||
"""Built-in memory doesn't do query-based recall — it's injected via system_prompt_block."""
|
||||
return ""
|
||||
|
||||
def sync_turn(self, user_content: str, assistant_content: str, *, session_id: str = "") -> None:
|
||||
"""Built-in memory doesn't auto-sync turns — writes happen via the memory tool."""
|
||||
|
||||
def get_tool_schemas(self) -> List[Dict[str, Any]]:
|
||||
"""Return empty list.
|
||||
|
||||
The `memory` tool is an agent-level intercepted tool, handled
|
||||
specially in run_agent.py before normal tool dispatch. It's not
|
||||
part of the standard tool registry. We don't duplicate it here.
|
||||
"""
|
||||
return []
|
||||
|
||||
def handle_tool_call(self, tool_name: str, args: Dict[str, Any], **kwargs) -> str:
|
||||
"""Not used — the memory tool is intercepted in run_agent.py."""
|
||||
return tool_error("Built-in memory tool is handled by the agent loop")
|
||||
|
||||
def shutdown(self) -> None:
|
||||
"""No cleanup needed — files are saved on every write."""
|
||||
|
||||
# -- Property access for backward compatibility --------------------------
|
||||
|
||||
@property
|
||||
def store(self):
|
||||
"""Access the underlying MemoryStore for legacy code paths."""
|
||||
return self._store
|
||||
|
||||
@property
|
||||
def memory_enabled(self) -> bool:
|
||||
return self._memory_enabled
|
||||
|
||||
@property
|
||||
def user_profile_enabled(self) -> bool:
|
||||
return self._user_profile_enabled
|
||||
@@ -1,273 +0,0 @@
|
||||
"""
|
||||
Circuit Breaker for Error Cascading — #885
|
||||
|
||||
P(error | prev was error) = 58.6% vs P(error | prev was success) = 25.2%.
|
||||
That's a 2.33x cascade factor. After 3 consecutive errors, the circuit
|
||||
opens and the agent must take corrective action.
|
||||
|
||||
States:
|
||||
- CLOSED: Normal operation, errors are counted
|
||||
- OPEN: Too many consecutive errors, corrective action required
|
||||
- HALF_OPEN: Testing if errors have cleared
|
||||
|
||||
Usage:
|
||||
from agent.circuit_breaker import CircuitBreaker, ToolCircuitBreaker
|
||||
|
||||
cb = ToolCircuitBreaker()
|
||||
|
||||
# After each tool call
|
||||
if not cb.record_result(success=True):
|
||||
# Circuit is open — take corrective action
|
||||
cb.get_recovery_action()
|
||||
"""
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
class CircuitState(Enum):
|
||||
CLOSED = "closed" # Normal operation
|
||||
OPEN = "open" # Too many errors, block execution
|
||||
HALF_OPEN = "half_open" # Testing recovery
|
||||
|
||||
|
||||
@dataclass
|
||||
class CircuitBreaker:
|
||||
"""
|
||||
Generic circuit breaker with configurable thresholds.
|
||||
|
||||
Tracks consecutive errors and opens the circuit when the
|
||||
error streak exceeds the threshold.
|
||||
"""
|
||||
failure_threshold: int = 3
|
||||
recovery_timeout: float = 30.0 # seconds before trying half-open
|
||||
success_threshold: int = 2 # successes needed to close from half-open
|
||||
|
||||
state: CircuitState = field(default=CircuitState.CLOSED, init=False)
|
||||
consecutive_failures: int = field(default=0, init=False)
|
||||
consecutive_successes: int = field(default=0, init=False)
|
||||
last_failure_time: Optional[float] = field(default=None, init=False)
|
||||
total_trips: int = field(default=0, init=False)
|
||||
error_streaks: List[int] = field(default_factory=list, init=False)
|
||||
|
||||
def record_result(self, success: bool) -> bool:
|
||||
"""
|
||||
Record a tool call result. Returns True if circuit allows execution.
|
||||
|
||||
Returns:
|
||||
True if circuit is CLOSED or HALF_OPEN (execution allowed)
|
||||
False if circuit is OPEN (execution blocked)
|
||||
"""
|
||||
now = time.time()
|
||||
|
||||
if self.state == CircuitState.OPEN:
|
||||
# Check if recovery timeout has passed
|
||||
if self.last_failure_time and (now - self.last_failure_time) >= self.recovery_timeout:
|
||||
self.state = CircuitState.HALF_OPEN
|
||||
self.consecutive_successes = 0
|
||||
return True # Allow one test execution
|
||||
return False # Still open
|
||||
|
||||
if success:
|
||||
self.consecutive_failures = 0
|
||||
self.consecutive_successes += 1
|
||||
|
||||
if self.state == CircuitState.HALF_OPEN:
|
||||
if self.consecutive_successes >= self.success_threshold:
|
||||
self.state = CircuitState.CLOSED
|
||||
self.consecutive_successes = 0
|
||||
|
||||
return True
|
||||
else:
|
||||
self.consecutive_successes = 0
|
||||
self.consecutive_failures += 1
|
||||
self.last_failure_time = now
|
||||
|
||||
if self.state == CircuitState.HALF_OPEN:
|
||||
# Failed during recovery — reopen immediately
|
||||
self.state = CircuitState.OPEN
|
||||
self.total_trips += 1
|
||||
return False
|
||||
|
||||
if self.consecutive_failures >= self.failure_threshold:
|
||||
self.state = CircuitState.OPEN
|
||||
self.total_trips += 1
|
||||
self.error_streaks.append(self.consecutive_failures)
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def can_execute(self) -> bool:
|
||||
"""Check if execution is allowed."""
|
||||
if self.state == CircuitState.OPEN:
|
||||
if self.last_failure_time:
|
||||
now = time.time()
|
||||
if (now - self.last_failure_time) >= self.recovery_timeout:
|
||||
self.state = CircuitState.HALF_OPEN
|
||||
self.consecutive_successes = 0
|
||||
return True
|
||||
return False
|
||||
return True
|
||||
|
||||
def get_state(self) -> Dict[str, Any]:
|
||||
"""Get current circuit state."""
|
||||
return {
|
||||
"state": self.state.value,
|
||||
"consecutive_failures": self.consecutive_failures,
|
||||
"consecutive_successes": self.consecutive_successes,
|
||||
"total_trips": self.total_trips,
|
||||
"max_streak": max(self.error_streaks) if self.error_streaks else 0,
|
||||
"can_execute": self.can_execute(),
|
||||
}
|
||||
|
||||
def reset(self):
|
||||
"""Reset the circuit breaker."""
|
||||
self.state = CircuitState.CLOSED
|
||||
self.consecutive_failures = 0
|
||||
self.consecutive_successes = 0
|
||||
self.last_failure_time = None
|
||||
|
||||
|
||||
class ToolCircuitBreaker(CircuitBreaker):
|
||||
"""
|
||||
Circuit breaker specifically for tool call error cascading.
|
||||
|
||||
Provides recovery actions when the circuit opens.
|
||||
"""
|
||||
|
||||
# Tools that are most effective at recovery (from audit data)
|
||||
RECOVERY_TOOLS = [
|
||||
"terminal", # Most effective — 2300 recoveries
|
||||
"read_file", # Reset context by reading something
|
||||
"search_files", # Find what went wrong
|
||||
]
|
||||
|
||||
def get_recovery_action(self) -> Dict[str, Any]:
|
||||
"""
|
||||
Get the recommended recovery action when circuit is open.
|
||||
|
||||
Returns dict with action type and details.
|
||||
"""
|
||||
streak = self.consecutive_failures
|
||||
|
||||
if streak >= 9:
|
||||
# After 9 errors: 41/46 recoveries via terminal
|
||||
return {
|
||||
"action": "terminal_only",
|
||||
"reason": f"Error streak of {streak} — terminal is the only reliable recovery",
|
||||
"suggested_tool": "terminal",
|
||||
"suggested_command": "echo 'Resetting context'",
|
||||
"severity": "critical",
|
||||
}
|
||||
elif streak >= 5:
|
||||
return {
|
||||
"action": "switch_tool_type",
|
||||
"reason": f"Error streak of {streak} — switch to a different tool category",
|
||||
"suggested_tools": ["read_file", "search_files", "terminal"],
|
||||
"severity": "high",
|
||||
}
|
||||
elif streak >= self.failure_threshold:
|
||||
return {
|
||||
"action": "ask_user",
|
||||
"reason": f"{streak} consecutive errors — ask user for guidance",
|
||||
"suggested_response": "I'm encountering repeated errors. Would you like me to try a different approach?",
|
||||
"severity": "medium",
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"action": "continue",
|
||||
"reason": f"Error streak of {streak} — within tolerance",
|
||||
"severity": "low",
|
||||
}
|
||||
|
||||
def should_compress_context(self) -> bool:
|
||||
"""Determine if context compression would help recovery."""
|
||||
return self.consecutive_failures >= 5
|
||||
|
||||
def get_blocked_tool(self) -> Optional[str]:
|
||||
"""Get the tool that should be blocked (if any)."""
|
||||
if self.state == CircuitState.OPEN:
|
||||
return "last_failed_tool"
|
||||
return None
|
||||
|
||||
|
||||
class MultiToolCircuitBreaker:
|
||||
"""
|
||||
Manages per-tool circuit breakers and cross-tool cascade detection.
|
||||
|
||||
When one tool trips its breaker, related tools are also warned.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.breakers: Dict[str, ToolCircuitBreaker] = {}
|
||||
self.global_streak: int = 0
|
||||
self.last_tool: Optional[str] = None
|
||||
self.last_success: bool = True
|
||||
|
||||
def get_breaker(self, tool_name: str) -> ToolCircuitBreaker:
|
||||
"""Get or create a circuit breaker for a tool."""
|
||||
if tool_name not in self.breakers:
|
||||
self.breakers[tool_name] = ToolCircuitBreaker()
|
||||
return self.breakers[tool_name]
|
||||
|
||||
def record_result(self, tool_name: str, success: bool) -> bool:
|
||||
"""
|
||||
Record a tool call result. Returns True if execution should continue.
|
||||
"""
|
||||
breaker = self.get_breaker(tool_name)
|
||||
allowed = breaker.record_result(success)
|
||||
|
||||
# Track global streak
|
||||
if success:
|
||||
self.global_streak = 0
|
||||
self.last_success = True
|
||||
else:
|
||||
self.global_streak += 1
|
||||
self.last_success = False
|
||||
|
||||
self.last_tool = tool_name
|
||||
return allowed
|
||||
|
||||
def can_execute(self, tool_name: str) -> bool:
|
||||
"""Check if a specific tool can execute."""
|
||||
breaker = self.get_breaker(tool_name)
|
||||
return breaker.can_execute()
|
||||
|
||||
def get_global_state(self) -> Dict[str, Any]:
|
||||
"""Get overall circuit breaker state."""
|
||||
return {
|
||||
"global_streak": self.global_streak,
|
||||
"last_tool": self.last_tool,
|
||||
"last_success": self.last_success,
|
||||
"tool_states": {
|
||||
name: breaker.get_state()
|
||||
for name, breaker in self.breakers.items()
|
||||
if breaker.consecutive_failures > 0 or breaker.total_trips > 0
|
||||
},
|
||||
"any_open": any(b.state == CircuitState.OPEN for b in self.breakers.values()),
|
||||
}
|
||||
|
||||
def get_recovery_action(self) -> Dict[str, Any]:
|
||||
"""Get recovery action based on global state."""
|
||||
if self.global_streak == 0:
|
||||
return {"action": "continue", "reason": "No errors"}
|
||||
|
||||
# Find the breaker with the worst streak
|
||||
worst = max(self.breakers.values(), key=lambda b: b.consecutive_failures, default=None)
|
||||
if worst and worst.consecutive_failures > 0:
|
||||
return worst.get_recovery_action()
|
||||
|
||||
return {
|
||||
"action": "continue",
|
||||
"reason": f"Global streak: {self.global_streak}",
|
||||
"severity": "low",
|
||||
}
|
||||
|
||||
def reset_all(self):
|
||||
"""Reset all circuit breakers."""
|
||||
for breaker in self.breakers.values():
|
||||
breaker.reset()
|
||||
self.global_streak = 0
|
||||
self.last_success = True
|
||||
@@ -1,148 +0,0 @@
|
||||
"""
|
||||
Context Budget Tracker - Prevent context window overflow
|
||||
|
||||
Poka-yoke: Visual warnings at 70%%, 85%%, 95%% capacity.
|
||||
Auto-checkpoint at 85%%. Pre-flight token estimation.
|
||||
|
||||
Issue: #838
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
HERMES_HOME = Path.home() / ".hermes"
|
||||
CHECKPOINT_DIR = HERMES_HOME / "checkpoints"
|
||||
CHARS_PER_TOKEN = 4
|
||||
|
||||
THRESHOLD_WARNING = 0.70
|
||||
THRESHOLD_CRITICAL = 0.85
|
||||
THRESHOLD_DANGER = 0.95
|
||||
|
||||
|
||||
class ContextBudget:
|
||||
def __init__(self, context_limit: int = 128000, system_tokens: int = 0,
|
||||
used_tokens: int = 0, reserved_tokens: int = 2000):
|
||||
self.context_limit = context_limit
|
||||
self.system_tokens = system_tokens
|
||||
self.used_tokens = used_tokens
|
||||
self.reserved_tokens = reserved_tokens
|
||||
|
||||
@property
|
||||
def total_used(self) -> int:
|
||||
return self.system_tokens + self.used_tokens
|
||||
|
||||
@property
|
||||
def available(self) -> int:
|
||||
return max(0, self.context_limit - self.reserved_tokens)
|
||||
|
||||
@property
|
||||
def remaining(self) -> int:
|
||||
return max(0, self.available - self.total_used)
|
||||
|
||||
@property
|
||||
def utilization(self) -> float:
|
||||
return self.total_used / self.available if self.available > 0 else 1.0
|
||||
|
||||
|
||||
def estimate_tokens(text: str) -> int:
|
||||
return len(text) // CHARS_PER_TOKEN if text else 0
|
||||
|
||||
|
||||
def estimate_messages_tokens(messages: List[Dict]) -> int:
|
||||
total = 0
|
||||
for msg in messages:
|
||||
content = msg.get("content", "")
|
||||
if isinstance(content, str):
|
||||
total += estimate_tokens(content)
|
||||
if msg.get("tool_calls"):
|
||||
total += 100
|
||||
return total
|
||||
|
||||
|
||||
class ContextBudgetTracker:
|
||||
def __init__(self, context_limit: int = 128000, session_id: str = ""):
|
||||
self.budget = ContextBudget(context_limit=context_limit)
|
||||
self.session_id = session_id
|
||||
self._checkpointed = False
|
||||
self._warnings_given = set()
|
||||
|
||||
def update_from_messages(self, messages: List[Dict]):
|
||||
self.budget.used_tokens = estimate_messages_tokens(messages)
|
||||
|
||||
def can_fit(self, additional_tokens: int) -> bool:
|
||||
return self.budget.remaining >= additional_tokens
|
||||
|
||||
def preflight_check(self, text: str) -> Tuple[bool, str]:
|
||||
tokens = estimate_tokens(text)
|
||||
if not self.can_fit(tokens):
|
||||
return False, f"Cannot load: ~{tokens:,} tokens needed, {self.budget.remaining:,} remaining"
|
||||
would_util = (self.budget.total_used + tokens) / self.budget.available if self.budget.available > 0 else 1.0
|
||||
if would_util >= THRESHOLD_DANGER:
|
||||
return False, f"Would reach {would_util:.0%%} capacity. Summarize or start new session."
|
||||
if would_util >= THRESHOLD_CRITICAL:
|
||||
return True, f"Warning: will reach {would_util:.0%%} capacity."
|
||||
return True, ""
|
||||
|
||||
def get_warning(self) -> Optional[str]:
|
||||
util = self.budget.utilization
|
||||
if util >= THRESHOLD_DANGER and "danger" not in self._warnings_given:
|
||||
self._warnings_given.add("danger")
|
||||
return f"[CONTEXT CRITICAL: {util:.0%%} used -- {self.budget.remaining:,} tokens left. Summarize or start new session.]"
|
||||
if util >= THRESHOLD_CRITICAL and "critical" not in self._warnings_given:
|
||||
self._warnings_given.add("critical")
|
||||
self._auto_checkpoint()
|
||||
return f"[CONTEXT WARNING: {util:.0%%} used -- consider summarizing. Auto-checkpoint saved.]"
|
||||
if util >= THRESHOLD_WARNING and "warning" not in self._warnings_given:
|
||||
self._warnings_given.add("warning")
|
||||
return f"[CONTEXT: {util:.0%%} used -- {self.budget.remaining:,} tokens remaining]"
|
||||
return None
|
||||
|
||||
def _auto_checkpoint(self):
|
||||
if self._checkpointed or not self.session_id:
|
||||
return
|
||||
try:
|
||||
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
path = CHECKPOINT_DIR / f"{self.session_id}.json"
|
||||
path.write_text(json.dumps({
|
||||
"session_id": self.session_id,
|
||||
"timestamp": time.time(),
|
||||
"budget": {"utilization": round(self.budget.utilization * 100, 1)}
|
||||
}, indent=2))
|
||||
self._checkpointed = True
|
||||
logger.info("Auto-checkpoint saved: %s", path)
|
||||
except Exception as e:
|
||||
logger.error("Auto-checkpoint failed: %s", e)
|
||||
|
||||
def get_status_line(self) -> str:
|
||||
util = self.budget.utilization
|
||||
remaining = self.budget.remaining
|
||||
if util >= THRESHOLD_DANGER:
|
||||
return f"RED {util:.0%%} used ({remaining:,} left)"
|
||||
elif util >= THRESHOLD_CRITICAL:
|
||||
return f"ORANGE {util:.0%%} used ({remaining:,} left)"
|
||||
elif util >= THRESHOLD_WARNING:
|
||||
return f"YELLOW {util:.0%%} used ({remaining:,} left)"
|
||||
return f"GREEN {util:.0%%} used ({remaining:,} left)"
|
||||
|
||||
|
||||
_tracker = None
|
||||
|
||||
def get_tracker(context_limit=128000, session_id=""):
|
||||
global _tracker
|
||||
if _tracker is None:
|
||||
_tracker = ContextBudgetTracker(context_limit, session_id)
|
||||
return _tracker
|
||||
|
||||
def check_context_budget(messages, context_limit=128000):
|
||||
tracker = get_tracker(context_limit)
|
||||
tracker.update_from_messages(messages)
|
||||
return tracker.get_warning()
|
||||
|
||||
def preflight_token_check(text):
|
||||
tracker = get_tracker()
|
||||
return tracker.preflight_check(text)
|
||||
@@ -4,12 +4,8 @@ Self-contained class with its own OpenAI client for summarization.
|
||||
Uses auxiliary model (cheap/fast) to summarize middle turns while
|
||||
protecting head and tail context.
|
||||
|
||||
Improvements over v2:
|
||||
- Structured summary template with Resolved/Pending question tracking
|
||||
- Summarizer preamble: "Do not respond to any questions" (from OpenCode)
|
||||
- Handoff framing: "different assistant" (from Codex) to create separation
|
||||
- "Remaining Work" replaces "Next Steps" to avoid reading as active instructions
|
||||
- Clear separator when summary merges into tail message
|
||||
Improvements over v1:
|
||||
- Structured summary template (Goal, Progress, Decisions, Files, Next Steps)
|
||||
- Iterative summary updates (preserves info across multiple compactions)
|
||||
- Token-budget tail protection instead of fixed message count
|
||||
- Tool output pruning before LLM summarization (cheap pre-pass)
|
||||
@@ -17,17 +13,12 @@ Improvements over v2:
|
||||
- Richer tool call/result detail in summarizer input
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from agent.auxiliary_client import call_llm
|
||||
from agent.context_engine import ContextEngine
|
||||
from agent.model_metadata import (
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
get_model_context_length,
|
||||
estimate_messages_tokens_rough,
|
||||
)
|
||||
@@ -35,13 +26,12 @@ from agent.model_metadata import (
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
SUMMARY_PREFIX = (
|
||||
"[CONTEXT COMPACTION — REFERENCE ONLY] Earlier turns were compacted "
|
||||
"into the summary below. This is a handoff from a previous context "
|
||||
"window — treat it as background reference, NOT as active instructions. "
|
||||
"Do NOT answer questions or fulfill requests mentioned in this summary; "
|
||||
"they were already addressed. Respond ONLY to the latest user message "
|
||||
"that appears AFTER this summary. The current session state (files, "
|
||||
"config, etc.) may reflect work described here — avoid repeating it:"
|
||||
"[CONTEXT COMPACTION] Earlier turns in this conversation were compacted "
|
||||
"to save context space. The summary below describes work that was "
|
||||
"already completed, and the current session state may still reflect "
|
||||
"that work (for example, files may already be changed). Use the summary "
|
||||
"and the current state to continue from where things left off, and "
|
||||
"avoid repeating work:"
|
||||
)
|
||||
LEGACY_SUMMARY_PREFIX = "[CONTEXT SUMMARY]:"
|
||||
|
||||
@@ -60,130 +50,8 @@ _CHARS_PER_TOKEN = 4
|
||||
_SUMMARY_FAILURE_COOLDOWN_SECONDS = 600
|
||||
|
||||
|
||||
def _summarize_tool_result(tool_name: str, tool_args: str, tool_content: str) -> str:
|
||||
"""Create an informative 1-line summary of a tool call + result.
|
||||
|
||||
Used during the pre-compression pruning pass to replace large tool
|
||||
outputs with a short but useful description of what the tool did,
|
||||
rather than a generic placeholder that carries zero information.
|
||||
|
||||
Returns strings like::
|
||||
|
||||
[terminal] ran `npm test` -> exit 0, 47 lines output
|
||||
[read_file] read config.py from line 1 (1,200 chars)
|
||||
[search_files] content search for 'compress' in agent/ -> 12 matches
|
||||
"""
|
||||
try:
|
||||
args = json.loads(tool_args) if tool_args else {}
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
args = {}
|
||||
|
||||
content = tool_content or ""
|
||||
content_len = len(content)
|
||||
line_count = content.count("\n") + 1 if content.strip() else 0
|
||||
|
||||
if tool_name == "terminal":
|
||||
cmd = args.get("command", "")
|
||||
if len(cmd) > 80:
|
||||
cmd = cmd[:77] + "..."
|
||||
exit_match = re.search(r'"exit_code"\s*:\s*(-?\d+)', content)
|
||||
exit_code = exit_match.group(1) if exit_match else "?"
|
||||
return f"[terminal] ran `{cmd}` -> exit {exit_code}, {line_count} lines output"
|
||||
|
||||
if tool_name == "read_file":
|
||||
path = args.get("path", "?")
|
||||
offset = args.get("offset", 1)
|
||||
return f"[read_file] read {path} from line {offset} ({content_len:,} chars)"
|
||||
|
||||
if tool_name == "write_file":
|
||||
path = args.get("path", "?")
|
||||
written_lines = args.get("content", "").count("\n") + 1 if args.get("content") else "?"
|
||||
return f"[write_file] wrote to {path} ({written_lines} lines)"
|
||||
|
||||
if tool_name == "search_files":
|
||||
pattern = args.get("pattern", "?")
|
||||
path = args.get("path", ".")
|
||||
target = args.get("target", "content")
|
||||
match_count = re.search(r'"total_count"\s*:\s*(\d+)', content)
|
||||
count = match_count.group(1) if match_count else "?"
|
||||
return f"[search_files] {target} search for '{pattern}' in {path} -> {count} matches"
|
||||
|
||||
if tool_name == "patch":
|
||||
path = args.get("path", "?")
|
||||
mode = args.get("mode", "replace")
|
||||
return f"[patch] {mode} in {path} ({content_len:,} chars result)"
|
||||
|
||||
if tool_name in ("browser_navigate", "browser_click", "browser_snapshot",
|
||||
"browser_type", "browser_scroll", "browser_vision"):
|
||||
url = args.get("url", "")
|
||||
ref = args.get("ref", "")
|
||||
detail = f" {url}" if url else (f" ref={ref}" if ref else "")
|
||||
return f"[{tool_name}]{detail} ({content_len:,} chars)"
|
||||
|
||||
if tool_name == "web_search":
|
||||
query = args.get("query", "?")
|
||||
return f"[web_search] query='{query}' ({content_len:,} chars result)"
|
||||
|
||||
if tool_name == "web_extract":
|
||||
urls = args.get("urls", [])
|
||||
url_desc = urls[0] if isinstance(urls, list) and urls else "?"
|
||||
if isinstance(urls, list) and len(urls) > 1:
|
||||
url_desc += f" (+{len(urls) - 1} more)"
|
||||
return f"[web_extract] {url_desc} ({content_len:,} chars)"
|
||||
|
||||
if tool_name == "delegate_task":
|
||||
goal = args.get("goal", "")
|
||||
if len(goal) > 60:
|
||||
goal = goal[:57] + "..."
|
||||
return f"[delegate_task] '{goal}' ({content_len:,} chars result)"
|
||||
|
||||
if tool_name == "execute_code":
|
||||
code_preview = (args.get("code") or "")[:60].replace("\n", " ")
|
||||
if len(args.get("code", "")) > 60:
|
||||
code_preview += "..."
|
||||
return f"[execute_code] `{code_preview}` ({line_count} lines output)"
|
||||
|
||||
if tool_name in ("skill_view", "skills_list", "skill_manage"):
|
||||
name = args.get("name", "?")
|
||||
return f"[{tool_name}] name={name} ({content_len:,} chars)"
|
||||
|
||||
if tool_name == "vision_analyze":
|
||||
question = args.get("question", "")[:50]
|
||||
return f"[vision_analyze] '{question}' ({content_len:,} chars)"
|
||||
|
||||
if tool_name == "memory":
|
||||
action = args.get("action", "?")
|
||||
target = args.get("target", "?")
|
||||
return f"[memory] {action} on {target}"
|
||||
|
||||
if tool_name == "todo":
|
||||
return "[todo] updated task list"
|
||||
|
||||
if tool_name == "clarify":
|
||||
return "[clarify] asked user a question"
|
||||
|
||||
if tool_name == "text_to_speech":
|
||||
return f"[text_to_speech] generated audio ({content_len:,} chars)"
|
||||
|
||||
if tool_name == "cronjob":
|
||||
action = args.get("action", "?")
|
||||
return f"[cronjob] {action}"
|
||||
|
||||
if tool_name == "process":
|
||||
action = args.get("action", "?")
|
||||
sid = args.get("session_id", "?")
|
||||
return f"[process] {action} session={sid}"
|
||||
|
||||
# Generic fallback
|
||||
first_arg = ""
|
||||
for k, v in list(args.items())[:2]:
|
||||
sv = str(v)[:40]
|
||||
first_arg += f" {k}={sv}"
|
||||
return f"[{tool_name}]{first_arg} ({content_len:,} chars result)"
|
||||
|
||||
|
||||
class ContextCompressor(ContextEngine):
|
||||
"""Default context engine — compresses conversation context via lossy summarization.
|
||||
class ContextCompressor:
|
||||
"""Compresses conversation context when approaching the model's context limit.
|
||||
|
||||
Algorithm:
|
||||
1. Prune old tool results (cheap, no LLM call)
|
||||
@@ -193,40 +61,6 @@ class ContextCompressor(ContextEngine):
|
||||
5. On subsequent compactions, iteratively update the previous summary
|
||||
"""
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
return "compressor"
|
||||
|
||||
def on_session_reset(self) -> None:
|
||||
"""Reset all per-session state for /new or /reset."""
|
||||
super().on_session_reset()
|
||||
self._context_probed = False
|
||||
self._context_probe_persistable = False
|
||||
self._previous_summary = None
|
||||
self._last_compression_savings_pct = 100.0
|
||||
self._ineffective_compression_count = 0
|
||||
|
||||
def update_model(
|
||||
self,
|
||||
model: str,
|
||||
context_length: int,
|
||||
base_url: str = "",
|
||||
api_key: str = "",
|
||||
provider: str = "",
|
||||
api_mode: str = "",
|
||||
) -> None:
|
||||
"""Update model info after a model switch or fallback activation."""
|
||||
self.model = model
|
||||
self.base_url = base_url
|
||||
self.api_key = api_key
|
||||
self.provider = provider
|
||||
self.api_mode = api_mode
|
||||
self.context_length = context_length
|
||||
self.threshold_tokens = max(
|
||||
int(context_length * self.threshold_percent),
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: str,
|
||||
@@ -240,13 +74,11 @@ class ContextCompressor(ContextEngine):
|
||||
api_key: str = "",
|
||||
config_context_length: int | None = None,
|
||||
provider: str = "",
|
||||
api_mode: str = "",
|
||||
):
|
||||
self.model = model
|
||||
self.base_url = base_url
|
||||
self.api_key = api_key
|
||||
self.provider = provider
|
||||
self.api_mode = api_mode
|
||||
self.threshold_percent = threshold_percent
|
||||
self.protect_first_n = protect_first_n
|
||||
self.protect_last_n = protect_last_n
|
||||
@@ -258,14 +90,7 @@ class ContextCompressor(ContextEngine):
|
||||
config_context_length=config_context_length,
|
||||
provider=provider,
|
||||
)
|
||||
# Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if
|
||||
# the percentage would suggest a lower value. This prevents premature
|
||||
# compression on large-context models at 50% while keeping the % sane
|
||||
# for models right at the minimum.
|
||||
self.threshold_tokens = max(
|
||||
int(self.context_length * threshold_percent),
|
||||
MINIMUM_CONTEXT_LENGTH,
|
||||
)
|
||||
self.threshold_tokens = int(self.context_length * threshold_percent)
|
||||
self.compression_count = 0
|
||||
|
||||
# Derive token budgets: ratio is relative to the threshold, not total context
|
||||
@@ -289,42 +114,39 @@ class ContextCompressor(ContextEngine):
|
||||
|
||||
self.last_prompt_tokens = 0
|
||||
self.last_completion_tokens = 0
|
||||
self.last_total_tokens = 0
|
||||
|
||||
self.summary_model = summary_model_override or ""
|
||||
|
||||
# Stores the previous compaction summary for iterative updates
|
||||
self._previous_summary: Optional[str] = None
|
||||
# Anti-thrashing: track whether last compression was effective
|
||||
self._last_compression_savings_pct: float = 100.0
|
||||
self._ineffective_compression_count: int = 0
|
||||
self._summary_failure_cooldown_until: float = 0.0
|
||||
|
||||
def update_from_response(self, usage: Dict[str, Any]):
|
||||
"""Update tracked token usage from API response."""
|
||||
self.last_prompt_tokens = usage.get("prompt_tokens", 0)
|
||||
self.last_completion_tokens = usage.get("completion_tokens", 0)
|
||||
self.last_total_tokens = usage.get("total_tokens", 0)
|
||||
|
||||
def should_compress(self, prompt_tokens: int = None) -> bool:
|
||||
"""Check if context exceeds the compression threshold.
|
||||
|
||||
Includes anti-thrashing protection: if the last two compressions
|
||||
each saved less than 10%, skip compression to avoid infinite loops
|
||||
where each pass removes only 1-2 messages.
|
||||
"""
|
||||
"""Check if context exceeds the compression threshold."""
|
||||
tokens = prompt_tokens if prompt_tokens is not None else self.last_prompt_tokens
|
||||
if tokens < self.threshold_tokens:
|
||||
return False
|
||||
# Anti-thrashing: back off if recent compressions were ineffective
|
||||
if self._ineffective_compression_count >= 2:
|
||||
if not self.quiet_mode:
|
||||
logger.warning(
|
||||
"Compression skipped — last %d compressions saved <10%% each. "
|
||||
"Consider /new to start a fresh session, or /compress <topic> "
|
||||
"for focused compression.",
|
||||
self._ineffective_compression_count,
|
||||
)
|
||||
return False
|
||||
return True
|
||||
return tokens >= self.threshold_tokens
|
||||
|
||||
def should_compress_preflight(self, messages: List[Dict[str, Any]]) -> bool:
|
||||
"""Quick pre-flight check using rough estimate (before API call)."""
|
||||
rough_estimate = estimate_messages_tokens_rough(messages)
|
||||
return rough_estimate >= self.threshold_tokens
|
||||
|
||||
def get_status(self) -> Dict[str, Any]:
|
||||
"""Get current compression status for display/logging."""
|
||||
return {
|
||||
"last_prompt_tokens": self.last_prompt_tokens,
|
||||
"threshold_tokens": self.threshold_tokens,
|
||||
"context_length": self.context_length,
|
||||
"usage_percent": min(100, (self.last_prompt_tokens / self.context_length * 100)) if self.context_length else 0,
|
||||
"compression_count": self.compression_count,
|
||||
}
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Tool output pruning (cheap pre-pass, no LLM call)
|
||||
@@ -332,24 +154,12 @@ class ContextCompressor(ContextEngine):
|
||||
|
||||
def _prune_old_tool_results(
|
||||
self, messages: List[Dict[str, Any]], protect_tail_count: int,
|
||||
protect_tail_tokens: int | None = None,
|
||||
) -> tuple[List[Dict[str, Any]], int]:
|
||||
"""Replace old tool result contents with informative 1-line summaries.
|
||||
"""Replace old tool result contents with a short placeholder.
|
||||
|
||||
Instead of a generic placeholder, generates a summary like::
|
||||
|
||||
[terminal] ran `npm test` -> exit 0, 47 lines output
|
||||
[read_file] read config.py from line 1 (3,400 chars)
|
||||
|
||||
Also deduplicates identical tool results (e.g. reading the same file
|
||||
5x keeps only the newest full copy) and truncates large tool_call
|
||||
arguments in assistant messages outside the protected tail.
|
||||
|
||||
Walks backward from the end, protecting the most recent messages that
|
||||
fall within ``protect_tail_tokens`` (when provided) OR the last
|
||||
``protect_tail_count`` messages (backward-compatible default).
|
||||
When both are given, the token budget takes priority and the message
|
||||
count acts as a hard minimum floor.
|
||||
Walks backward from the end, protecting the most recent
|
||||
``protect_tail_count`` messages. Older tool results get their
|
||||
content replaced with a placeholder string.
|
||||
|
||||
Returns (pruned_messages, pruned_count).
|
||||
"""
|
||||
@@ -358,110 +168,20 @@ class ContextCompressor(ContextEngine):
|
||||
|
||||
result = [m.copy() for m in messages]
|
||||
pruned = 0
|
||||
prune_boundary = len(result) - protect_tail_count
|
||||
|
||||
# Build index: tool_call_id -> (tool_name, arguments_json)
|
||||
call_id_to_tool: Dict[str, tuple] = {}
|
||||
for msg in result:
|
||||
if msg.get("role") == "assistant":
|
||||
for tc in msg.get("tool_calls") or []:
|
||||
if isinstance(tc, dict):
|
||||
cid = tc.get("id", "")
|
||||
fn = tc.get("function", {})
|
||||
call_id_to_tool[cid] = (fn.get("name", "unknown"), fn.get("arguments", ""))
|
||||
else:
|
||||
cid = getattr(tc, "id", "") or ""
|
||||
fn = getattr(tc, "function", None)
|
||||
name = getattr(fn, "name", "unknown") if fn else "unknown"
|
||||
args_str = getattr(fn, "arguments", "") if fn else ""
|
||||
call_id_to_tool[cid] = (name, args_str)
|
||||
|
||||
# Determine the prune boundary
|
||||
if protect_tail_tokens is not None and protect_tail_tokens > 0:
|
||||
# Token-budget approach: walk backward accumulating tokens
|
||||
accumulated = 0
|
||||
boundary = len(result)
|
||||
min_protect = min(protect_tail_count, len(result) - 1)
|
||||
for i in range(len(result) - 1, -1, -1):
|
||||
msg = result[i]
|
||||
raw_content = msg.get("content") or ""
|
||||
content_len = sum(len(p.get("text", "")) for p in raw_content) if isinstance(raw_content, list) else len(raw_content)
|
||||
msg_tokens = content_len // _CHARS_PER_TOKEN + 10
|
||||
for tc in msg.get("tool_calls") or []:
|
||||
if isinstance(tc, dict):
|
||||
args = tc.get("function", {}).get("arguments", "")
|
||||
msg_tokens += len(args) // _CHARS_PER_TOKEN
|
||||
if accumulated + msg_tokens > protect_tail_tokens and (len(result) - i) >= min_protect:
|
||||
boundary = i
|
||||
break
|
||||
accumulated += msg_tokens
|
||||
boundary = i
|
||||
prune_boundary = max(boundary, len(result) - min_protect)
|
||||
else:
|
||||
prune_boundary = len(result) - protect_tail_count
|
||||
|
||||
# Pass 1: Deduplicate identical tool results.
|
||||
# When the same file is read multiple times, keep only the most recent
|
||||
# full copy and replace older duplicates with a back-reference.
|
||||
content_hashes: dict = {} # hash -> (index, tool_call_id)
|
||||
for i in range(len(result) - 1, -1, -1):
|
||||
msg = result[i]
|
||||
if msg.get("role") != "tool":
|
||||
continue
|
||||
content = msg.get("content") or ""
|
||||
# Skip multimodal content (list of content blocks)
|
||||
if isinstance(content, list):
|
||||
continue
|
||||
if len(content) < 200:
|
||||
continue
|
||||
h = hashlib.md5(content.encode("utf-8", errors="replace")).hexdigest()[:12]
|
||||
if h in content_hashes:
|
||||
# This is an older duplicate — replace with back-reference
|
||||
result[i] = {**msg, "content": "[Duplicate tool output — same content as a more recent call]"}
|
||||
pruned += 1
|
||||
else:
|
||||
content_hashes[h] = (i, msg.get("tool_call_id", "?"))
|
||||
|
||||
# Pass 2: Replace old tool results with informative summaries
|
||||
for i in range(prune_boundary):
|
||||
msg = result[i]
|
||||
if msg.get("role") != "tool":
|
||||
continue
|
||||
content = msg.get("content", "")
|
||||
# Skip multimodal content (list of content blocks)
|
||||
if isinstance(content, list):
|
||||
continue
|
||||
if not content or content == _PRUNED_TOOL_PLACEHOLDER:
|
||||
continue
|
||||
# Skip already-deduplicated or previously-summarized results
|
||||
if content.startswith("[Duplicate tool output"):
|
||||
continue
|
||||
# Only prune if the content is substantial (>200 chars)
|
||||
if len(content) > 200:
|
||||
call_id = msg.get("tool_call_id", "")
|
||||
tool_name, tool_args = call_id_to_tool.get(call_id, ("unknown", ""))
|
||||
summary = _summarize_tool_result(tool_name, tool_args, content)
|
||||
result[i] = {**msg, "content": summary}
|
||||
result[i] = {**msg, "content": _PRUNED_TOOL_PLACEHOLDER}
|
||||
pruned += 1
|
||||
|
||||
# Pass 3: Truncate large tool_call arguments in assistant messages
|
||||
# outside the protected tail. write_file with 50KB content, for
|
||||
# example, survives pruning entirely without this.
|
||||
for i in range(prune_boundary):
|
||||
msg = result[i]
|
||||
if msg.get("role") != "assistant" or not msg.get("tool_calls"):
|
||||
continue
|
||||
new_tcs = []
|
||||
modified = False
|
||||
for tc in msg["tool_calls"]:
|
||||
if isinstance(tc, dict):
|
||||
args = tc.get("function", {}).get("arguments", "")
|
||||
if len(args) > 500:
|
||||
tc = {**tc, "function": {**tc["function"], "arguments": args[:200] + "...[truncated]"}}
|
||||
modified = True
|
||||
new_tcs.append(tc)
|
||||
if modified:
|
||||
result[i] = {**msg, "tool_calls": new_tcs}
|
||||
|
||||
return result, pruned
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
@@ -479,39 +199,30 @@ class ContextCompressor(ContextEngine):
|
||||
budget = int(content_tokens * _SUMMARY_RATIO)
|
||||
return max(_MIN_SUMMARY_TOKENS, min(budget, self.max_summary_tokens))
|
||||
|
||||
# Truncation limits for the summarizer input. These bound how much of
|
||||
# each message the summary model sees — the budget is the *summary*
|
||||
# model's context window, not the main model's.
|
||||
_CONTENT_MAX = 6000 # total chars per message body
|
||||
_CONTENT_HEAD = 4000 # chars kept from the start
|
||||
_CONTENT_TAIL = 1500 # chars kept from the end
|
||||
_TOOL_ARGS_MAX = 1500 # tool call argument chars
|
||||
_TOOL_ARGS_HEAD = 1200 # kept from the start of tool args
|
||||
|
||||
def _serialize_for_summary(self, turns: List[Dict[str, Any]]) -> str:
|
||||
"""Serialize conversation turns into labeled text for the summarizer.
|
||||
|
||||
Includes tool call arguments and result content (up to
|
||||
``_CONTENT_MAX`` chars per message) so the summarizer can preserve
|
||||
specific details like file paths, commands, and outputs.
|
||||
Includes tool call arguments and result content (up to 3000 chars
|
||||
per message) so the summarizer can preserve specific details like
|
||||
file paths, commands, and outputs.
|
||||
"""
|
||||
parts = []
|
||||
for msg in turns:
|
||||
role = msg.get("role", "unknown")
|
||||
content = msg.get("content") or ""
|
||||
|
||||
# Tool results: keep enough content for the summarizer
|
||||
# Tool results: keep more content than before (3000 chars)
|
||||
if role == "tool":
|
||||
tool_id = msg.get("tool_call_id", "")
|
||||
if len(content) > self._CONTENT_MAX:
|
||||
content = content[:self._CONTENT_HEAD] + "\n...[truncated]...\n" + content[-self._CONTENT_TAIL:]
|
||||
if len(content) > 3000:
|
||||
content = content[:2000] + "\n...[truncated]...\n" + content[-800:]
|
||||
parts.append(f"[TOOL RESULT {tool_id}]: {content}")
|
||||
continue
|
||||
|
||||
# Assistant messages: include tool call names AND arguments
|
||||
if role == "assistant":
|
||||
if len(content) > self._CONTENT_MAX:
|
||||
content = content[:self._CONTENT_HEAD] + "\n...[truncated]...\n" + content[-self._CONTENT_TAIL:]
|
||||
if len(content) > 3000:
|
||||
content = content[:2000] + "\n...[truncated]...\n" + content[-800:]
|
||||
tool_calls = msg.get("tool_calls", [])
|
||||
if tool_calls:
|
||||
tc_parts = []
|
||||
@@ -521,8 +232,8 @@ class ContextCompressor(ContextEngine):
|
||||
name = fn.get("name", "?")
|
||||
args = fn.get("arguments", "")
|
||||
# Truncate long arguments but keep enough for context
|
||||
if len(args) > self._TOOL_ARGS_MAX:
|
||||
args = args[:self._TOOL_ARGS_HEAD] + "..."
|
||||
if len(args) > 500:
|
||||
args = args[:400] + "..."
|
||||
tc_parts.append(f" {name}({args})")
|
||||
else:
|
||||
fn = getattr(tc, "function", None)
|
||||
@@ -533,26 +244,19 @@ class ContextCompressor(ContextEngine):
|
||||
continue
|
||||
|
||||
# User and other roles
|
||||
if len(content) > self._CONTENT_MAX:
|
||||
content = content[:self._CONTENT_HEAD] + "\n...[truncated]...\n" + content[-self._CONTENT_TAIL:]
|
||||
if len(content) > 3000:
|
||||
content = content[:2000] + "\n...[truncated]...\n" + content[-800:]
|
||||
parts.append(f"[{role.upper()}]: {content}")
|
||||
|
||||
return "\n\n".join(parts)
|
||||
|
||||
def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]], focus_topic: str = None) -> Optional[str]:
|
||||
def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
|
||||
"""Generate a structured summary of conversation turns.
|
||||
|
||||
Uses a structured template (Goal, Progress, Decisions, Resolved/Pending
|
||||
Questions, Files, Remaining Work) with explicit preamble telling the
|
||||
summarizer not to answer questions. When a previous summary exists,
|
||||
Uses a structured template (Goal, Progress, Decisions, Files, Next Steps)
|
||||
inspired by Pi-mono and OpenCode. When a previous summary exists,
|
||||
generates an iterative update instead of summarizing from scratch.
|
||||
|
||||
Args:
|
||||
focus_topic: Optional focus string for guided compression. When
|
||||
provided, the summariser prioritises preserving information
|
||||
related to this topic and is more aggressive about compressing
|
||||
everything else. Inspired by Claude Code's ``/compact``.
|
||||
|
||||
Returns None if all attempts fail — the caller should drop
|
||||
the middle turns without a summary rather than inject a useless
|
||||
placeholder.
|
||||
@@ -568,75 +272,9 @@ class ContextCompressor(ContextEngine):
|
||||
summary_budget = self._compute_summary_budget(turns_to_summarize)
|
||||
content_to_summarize = self._serialize_for_summary(turns_to_summarize)
|
||||
|
||||
# Preamble shared by both first-compaction and iterative-update prompts.
|
||||
# Inspired by OpenCode's "do not respond to any questions" instruction
|
||||
# and Codex's "another language model" framing.
|
||||
_summarizer_preamble = (
|
||||
"You are a summarization agent creating a context checkpoint. "
|
||||
"Your output will be injected as reference material for a DIFFERENT "
|
||||
"assistant that continues the conversation. "
|
||||
"Do NOT respond to any questions or requests in the conversation — "
|
||||
"only output the structured summary. "
|
||||
"Do NOT include any preamble, greeting, or prefix."
|
||||
)
|
||||
|
||||
# Shared structured template (used by both paths).
|
||||
_template_sections = f"""## Goal
|
||||
[What the user is trying to accomplish]
|
||||
|
||||
## Constraints & Preferences
|
||||
[User preferences, coding style, constraints, important decisions]
|
||||
|
||||
## Completed Actions
|
||||
[Numbered list of concrete actions taken — include tool used, target, and outcome.
|
||||
Format each as: N. ACTION target — outcome [tool: name]
|
||||
Example:
|
||||
1. READ config.py:45 — found `==` should be `!=` [tool: read_file]
|
||||
2. PATCH config.py:45 — changed `==` to `!=` [tool: patch]
|
||||
3. TEST `pytest tests/` — 3/50 failed: test_parse, test_validate, test_edge [tool: terminal]
|
||||
Be specific with file paths, commands, line numbers, and results.]
|
||||
|
||||
## Active State
|
||||
[Current working state — include:
|
||||
- Working directory and branch (if applicable)
|
||||
- Modified/created files with brief note on each
|
||||
- Test status (X/Y passing)
|
||||
- Any running processes or servers
|
||||
- Environment details that matter]
|
||||
|
||||
## In Progress
|
||||
[Work currently underway — what was being done when compaction fired]
|
||||
|
||||
## Blocked
|
||||
[Any blockers, errors, or issues not yet resolved. Include exact error messages.]
|
||||
|
||||
## Key Decisions
|
||||
[Important technical decisions and WHY they were made]
|
||||
|
||||
## Resolved Questions
|
||||
[Questions the user asked that were ALREADY answered — include the answer so the next assistant does not re-answer them]
|
||||
|
||||
## Pending User Asks
|
||||
[Questions or requests from the user that have NOT yet been answered or fulfilled. If none, write "None."]
|
||||
|
||||
## Relevant Files
|
||||
[Files read, modified, or created — with brief note on each]
|
||||
|
||||
## Remaining Work
|
||||
[What remains to be done — framed as context, not instructions]
|
||||
|
||||
## Critical Context
|
||||
[Any specific values, error messages, configuration details, or data that would be lost without explicit preservation]
|
||||
|
||||
Target ~{summary_budget} tokens. Be CONCRETE — include file paths, command outputs, error messages, line numbers, and specific values. Avoid vague descriptions like "made some changes" — say exactly what changed.
|
||||
|
||||
Write only the summary body. Do not include any preamble or prefix."""
|
||||
|
||||
if self._previous_summary:
|
||||
# Iterative update: preserve existing info, add new progress
|
||||
prompt = f"""{_summarizer_preamble}
|
||||
|
||||
You are updating a context compaction summary. A previous compaction produced the summary below. New conversation turns have occurred since then and need to be incorporated.
|
||||
prompt = f"""You are updating a context compaction summary. A previous compaction produced the summary below. New conversation turns have occurred since then and need to be incorporated.
|
||||
|
||||
PREVIOUS SUMMARY:
|
||||
{self._previous_summary}
|
||||
@@ -644,42 +282,81 @@ PREVIOUS SUMMARY:
|
||||
NEW TURNS TO INCORPORATE:
|
||||
{content_to_summarize}
|
||||
|
||||
Update the summary using this exact structure. PRESERVE all existing information that is still relevant. ADD new completed actions to the numbered list (continue numbering). Move items from "In Progress" to "Completed Actions" when done. Move answered questions to "Resolved Questions". Update "Active State" to reflect current state. Remove information only if it is clearly obsolete.
|
||||
Update the summary using this exact structure. PRESERVE all existing information that is still relevant. ADD new progress. Move items from "In Progress" to "Done" when completed. Remove information only if it is clearly obsolete.
|
||||
|
||||
{_template_sections}"""
|
||||
## Goal
|
||||
[What the user is trying to accomplish — preserve from previous summary, update if goal evolved]
|
||||
|
||||
## Constraints & Preferences
|
||||
[User preferences, coding style, constraints, important decisions — accumulate across compactions]
|
||||
|
||||
## Progress
|
||||
### Done
|
||||
[Completed work — include specific file paths, commands run, results obtained]
|
||||
### In Progress
|
||||
[Work currently underway]
|
||||
### Blocked
|
||||
[Any blockers or issues encountered]
|
||||
|
||||
## Key Decisions
|
||||
[Important technical decisions and why they were made]
|
||||
|
||||
## Relevant Files
|
||||
[Files read, modified, or created — with brief note on each. Accumulate across compactions.]
|
||||
|
||||
## Next Steps
|
||||
[What needs to happen next to continue the work]
|
||||
|
||||
## Critical Context
|
||||
[Any specific values, error messages, configuration details, or data that would be lost without explicit preservation]
|
||||
|
||||
Target ~{summary_budget} tokens. Be specific — include file paths, command outputs, error messages, and concrete values rather than vague descriptions.
|
||||
|
||||
Write only the summary body. Do not include any preamble or prefix."""
|
||||
else:
|
||||
# First compaction: summarize from scratch
|
||||
prompt = f"""{_summarizer_preamble}
|
||||
|
||||
Create a structured handoff summary for a different assistant that will continue this conversation after earlier turns are compacted. The next assistant should be able to understand what happened without re-reading the original turns.
|
||||
prompt = f"""Create a structured handoff summary for a later assistant that will continue this conversation after earlier turns are compacted.
|
||||
|
||||
TURNS TO SUMMARIZE:
|
||||
{content_to_summarize}
|
||||
|
||||
Use this exact structure:
|
||||
|
||||
{_template_sections}"""
|
||||
## Goal
|
||||
[What the user is trying to accomplish]
|
||||
|
||||
# Inject focus topic guidance when the user provides one via /compress <focus>.
|
||||
# This goes at the end of the prompt so it takes precedence.
|
||||
if focus_topic:
|
||||
prompt += f"""
|
||||
## Constraints & Preferences
|
||||
[User preferences, coding style, constraints, important decisions]
|
||||
|
||||
FOCUS TOPIC: "{focus_topic}"
|
||||
The user has requested that this compaction PRIORITISE preserving all information related to the focus topic above. For content related to "{focus_topic}", include full detail — exact values, file paths, command outputs, error messages, and decisions. For content NOT related to the focus topic, summarise more aggressively (brief one-liners or omit if truly irrelevant). The focus topic sections should receive roughly 60-70% of the summary token budget."""
|
||||
## Progress
|
||||
### Done
|
||||
[Completed work — include specific file paths, commands run, results obtained]
|
||||
### In Progress
|
||||
[Work currently underway]
|
||||
### Blocked
|
||||
[Any blockers or issues encountered]
|
||||
|
||||
## Key Decisions
|
||||
[Important technical decisions and why they were made]
|
||||
|
||||
## Relevant Files
|
||||
[Files read, modified, or created — with brief note on each]
|
||||
|
||||
## Next Steps
|
||||
[What needs to happen next to continue the work]
|
||||
|
||||
## Critical Context
|
||||
[Any specific values, error messages, configuration details, or data that would be lost without explicit preservation]
|
||||
|
||||
Target ~{summary_budget} tokens. Be specific — include file paths, command outputs, error messages, and concrete values rather than vague descriptions. The goal is to prevent the next assistant from repeating work or losing important details.
|
||||
|
||||
Write only the summary body. Do not include any preamble or prefix."""
|
||||
|
||||
try:
|
||||
call_kwargs = {
|
||||
"task": "compression",
|
||||
"main_runtime": {
|
||||
"model": self.model,
|
||||
"provider": self.provider,
|
||||
"base_url": self.base_url,
|
||||
"api_key": self.api_key,
|
||||
"api_mode": self.api_mode,
|
||||
},
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
"max_tokens": int(summary_budget * 1.3),
|
||||
"max_tokens": summary_budget * 2,
|
||||
# timeout resolved from auxiliary.compression.timeout config by call_llm
|
||||
}
|
||||
if self.summary_model:
|
||||
@@ -693,10 +370,8 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
||||
# Store for iterative updates on next compaction
|
||||
self._previous_summary = summary
|
||||
self._summary_failure_cooldown_until = 0.0
|
||||
self._summary_model_fallen_back = False
|
||||
return self._with_summary_prefix(summary)
|
||||
except RuntimeError:
|
||||
# No provider configured — long cooldown, unlikely to self-resolve
|
||||
self._summary_failure_cooldown_until = time.monotonic() + _SUMMARY_FAILURE_COOLDOWN_SECONDS
|
||||
logging.warning("Context compression: no provider available for "
|
||||
"summary. Middle turns will be dropped without summary "
|
||||
@@ -704,42 +379,12 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
||||
_SUMMARY_FAILURE_COOLDOWN_SECONDS)
|
||||
return None
|
||||
except Exception as e:
|
||||
# If the summary model is different from the main model and the
|
||||
# error looks permanent (model not found, 503, 404), fall back to
|
||||
# using the main model instead of entering cooldown that leaves
|
||||
# context growing unbounded. (#8620 sub-issue 4)
|
||||
_status = getattr(e, "status_code", None) or getattr(getattr(e, "response", None), "status_code", None)
|
||||
_err_str = str(e).lower()
|
||||
_is_model_not_found = (
|
||||
_status in (404, 503)
|
||||
or "model_not_found" in _err_str
|
||||
or "does not exist" in _err_str
|
||||
or "no available channel" in _err_str
|
||||
)
|
||||
if (
|
||||
_is_model_not_found
|
||||
and self.summary_model
|
||||
and self.summary_model != self.model
|
||||
and not getattr(self, "_summary_model_fallen_back", False)
|
||||
):
|
||||
self._summary_model_fallen_back = True
|
||||
logging.warning(
|
||||
"Summary model '%s' not available (%s). "
|
||||
"Falling back to main model '%s' for compression.",
|
||||
self.summary_model, e, self.model,
|
||||
)
|
||||
self.summary_model = "" # empty = use main model
|
||||
self._summary_failure_cooldown_until = 0.0 # no cooldown
|
||||
return self._generate_summary(messages, summary_budget) # retry immediately
|
||||
|
||||
# Transient errors (timeout, rate limit, network) — shorter cooldown
|
||||
_transient_cooldown = 60
|
||||
self._summary_failure_cooldown_until = time.monotonic() + _transient_cooldown
|
||||
self._summary_failure_cooldown_until = time.monotonic() + _SUMMARY_FAILURE_COOLDOWN_SECONDS
|
||||
logging.warning(
|
||||
"Failed to generate context summary: %s. "
|
||||
"Further summary attempts paused for %d seconds.",
|
||||
e,
|
||||
_transient_cooldown,
|
||||
_SUMMARY_FAILURE_COOLDOWN_SECONDS,
|
||||
)
|
||||
return None
|
||||
|
||||
@@ -873,20 +518,13 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
||||
derived from ``summary_target_ratio * context_length``, so it
|
||||
scales automatically with the model's context window.
|
||||
|
||||
Token budget is the primary criterion. A hard minimum of 3 messages
|
||||
is always protected, but the budget is allowed to exceed by up to
|
||||
1.5x to avoid cutting inside an oversized message (tool output, file
|
||||
read, etc.). If even the minimum 3 messages exceed 1.5x the budget
|
||||
the cut is placed right after the head so compression still runs.
|
||||
|
||||
Never cuts inside a tool_call/result group.
|
||||
Never cuts inside a tool_call/result group. Falls back to the old
|
||||
``protect_last_n`` if the budget would protect fewer messages.
|
||||
"""
|
||||
if token_budget is None:
|
||||
token_budget = self.tail_token_budget
|
||||
n = len(messages)
|
||||
# Hard minimum: always keep at least 3 messages in the tail
|
||||
min_tail = min(3, n - head_end - 1) if n - head_end > 1 else 0
|
||||
soft_ceiling = int(token_budget * 1.5)
|
||||
min_tail = self.protect_last_n
|
||||
accumulated = 0
|
||||
cut_idx = n # start from beyond the end
|
||||
|
||||
@@ -899,21 +537,21 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
||||
if isinstance(tc, dict):
|
||||
args = tc.get("function", {}).get("arguments", "")
|
||||
msg_tokens += len(args) // _CHARS_PER_TOKEN
|
||||
# Stop once we exceed the soft ceiling (unless we haven't hit min_tail yet)
|
||||
if accumulated + msg_tokens > soft_ceiling and (n - i) >= min_tail:
|
||||
if accumulated + msg_tokens > token_budget and (n - i) >= min_tail:
|
||||
break
|
||||
accumulated += msg_tokens
|
||||
cut_idx = i
|
||||
|
||||
# Ensure we protect at least min_tail messages
|
||||
# Ensure we protect at least protect_last_n messages
|
||||
fallback_cut = n - min_tail
|
||||
if cut_idx > fallback_cut:
|
||||
cut_idx = fallback_cut
|
||||
|
||||
# If the token budget would protect everything (small conversations),
|
||||
# force a cut after the head so compression can still remove middle turns.
|
||||
# fall back to the fixed protect_last_n approach so compression can
|
||||
# still remove middle turns.
|
||||
if cut_idx <= head_end:
|
||||
cut_idx = max(fallback_cut, head_end + 1)
|
||||
cut_idx = fallback_cut
|
||||
|
||||
# Align to avoid splitting tool groups
|
||||
cut_idx = self._align_boundary_backward(messages, cut_idx)
|
||||
@@ -924,7 +562,7 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
||||
# Main compression entry point
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def compress(self, messages: List[Dict[str, Any]], current_tokens: int = None, focus_topic: str = None) -> List[Dict[str, Any]]:
|
||||
def compress(self, messages: List[Dict[str, Any]], current_tokens: int = None) -> List[Dict[str, Any]]:
|
||||
"""Compress conversation messages by summarizing middle turns.
|
||||
|
||||
Algorithm:
|
||||
@@ -936,21 +574,14 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
||||
|
||||
After compression, orphaned tool_call / tool_result pairs are cleaned
|
||||
up so the API never receives mismatched IDs.
|
||||
|
||||
Args:
|
||||
focus_topic: Optional focus string for guided compression. When
|
||||
provided, the summariser will prioritise preserving information
|
||||
related to this topic and be more aggressive about compressing
|
||||
everything else. Inspired by Claude Code's ``/compact``.
|
||||
"""
|
||||
n_messages = len(messages)
|
||||
# Only need head + 3 tail messages minimum (token budget decides the real tail size)
|
||||
_min_for_compress = self.protect_first_n + 3 + 1
|
||||
if n_messages <= _min_for_compress:
|
||||
if n_messages <= self.protect_first_n + self.protect_last_n + 1:
|
||||
if not self.quiet_mode:
|
||||
logger.warning(
|
||||
"Cannot compress: only %d messages (need > %d)",
|
||||
n_messages, _min_for_compress,
|
||||
n_messages,
|
||||
self.protect_first_n + self.protect_last_n + 1,
|
||||
)
|
||||
return messages
|
||||
|
||||
@@ -958,8 +589,7 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
||||
|
||||
# Phase 1: Prune old tool results (cheap, no LLM call)
|
||||
messages, pruned_count = self._prune_old_tool_results(
|
||||
messages, protect_tail_count=self.protect_last_n,
|
||||
protect_tail_tokens=self.tail_token_budget,
|
||||
messages, protect_tail_count=self.protect_last_n * 3,
|
||||
)
|
||||
if pruned_count and not self.quiet_mode:
|
||||
logger.info("Pre-compression: pruned %d old tool result(s)", pruned_count)
|
||||
@@ -999,67 +629,52 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
||||
)
|
||||
|
||||
# Phase 3: Generate structured summary
|
||||
summary = self._generate_summary(turns_to_summarize, focus_topic=focus_topic)
|
||||
summary = self._generate_summary(turns_to_summarize)
|
||||
|
||||
# Phase 4: Assemble compressed message list
|
||||
compressed = []
|
||||
for i in range(compress_start):
|
||||
msg = messages[i].copy()
|
||||
if i == 0 and msg.get("role") == "system":
|
||||
existing = msg.get("content") or ""
|
||||
_compression_note = "[Note: Some earlier conversation turns have been compacted into a handoff summary to preserve context space. The current session state may still reflect earlier work, so build on that summary and state rather than re-doing work.]"
|
||||
if _compression_note not in existing:
|
||||
msg["content"] = existing + "\n\n" + _compression_note
|
||||
if i == 0 and msg.get("role") == "system" and self.compression_count == 0:
|
||||
msg["content"] = (
|
||||
(msg.get("content") or "")
|
||||
+ "\n\n[Note: Some earlier conversation turns have been compacted into a handoff summary to preserve context space. The current session state may still reflect earlier work, so build on that summary and state rather than re-doing work.]"
|
||||
)
|
||||
compressed.append(msg)
|
||||
|
||||
# If LLM summary failed, insert a static fallback so the model
|
||||
# knows context was lost rather than silently dropping everything.
|
||||
if not summary:
|
||||
if not self.quiet_mode:
|
||||
logger.warning("Summary generation failed — inserting static fallback context marker")
|
||||
n_dropped = compress_end - compress_start
|
||||
summary = (
|
||||
f"{SUMMARY_PREFIX}\n"
|
||||
f"Summary generation was unavailable. {n_dropped} conversation turns were "
|
||||
f"removed to free context space but could not be summarized. The removed "
|
||||
f"turns contained earlier work in this session. Continue based on the "
|
||||
f"recent messages below and the current state of any files or resources."
|
||||
)
|
||||
|
||||
_merge_summary_into_tail = False
|
||||
last_head_role = messages[compress_start - 1].get("role", "user") if compress_start > 0 else "user"
|
||||
first_tail_role = messages[compress_end].get("role", "user") if compress_end < n_messages else "user"
|
||||
# Pick a role that avoids consecutive same-role with both neighbors.
|
||||
# Priority: avoid colliding with head (already committed), then tail.
|
||||
if last_head_role in ("assistant", "tool"):
|
||||
summary_role = "user"
|
||||
else:
|
||||
summary_role = "assistant"
|
||||
# If the chosen role collides with the tail AND flipping wouldn't
|
||||
# collide with the head, flip it.
|
||||
if summary_role == first_tail_role:
|
||||
flipped = "assistant" if summary_role == "user" else "user"
|
||||
if flipped != last_head_role:
|
||||
summary_role = flipped
|
||||
if summary:
|
||||
last_head_role = messages[compress_start - 1].get("role", "user") if compress_start > 0 else "user"
|
||||
first_tail_role = messages[compress_end].get("role", "user") if compress_end < n_messages else "user"
|
||||
# Pick a role that avoids consecutive same-role with both neighbors.
|
||||
# Priority: avoid colliding with head (already committed), then tail.
|
||||
if last_head_role in ("assistant", "tool"):
|
||||
summary_role = "user"
|
||||
else:
|
||||
# Both roles would create consecutive same-role messages
|
||||
# (e.g. head=assistant, tail=user — neither role works).
|
||||
# Merge the summary into the first tail message instead
|
||||
# of inserting a standalone message that breaks alternation.
|
||||
_merge_summary_into_tail = True
|
||||
if not _merge_summary_into_tail:
|
||||
compressed.append({"role": summary_role, "content": summary})
|
||||
summary_role = "assistant"
|
||||
# If the chosen role collides with the tail AND flipping wouldn't
|
||||
# collide with the head, flip it.
|
||||
if summary_role == first_tail_role:
|
||||
flipped = "assistant" if summary_role == "user" else "user"
|
||||
if flipped != last_head_role:
|
||||
summary_role = flipped
|
||||
else:
|
||||
# Both roles would create consecutive same-role messages
|
||||
# (e.g. head=assistant, tail=user — neither role works).
|
||||
# Merge the summary into the first tail message instead
|
||||
# of inserting a standalone message that breaks alternation.
|
||||
_merge_summary_into_tail = True
|
||||
if not _merge_summary_into_tail:
|
||||
compressed.append({"role": summary_role, "content": summary})
|
||||
else:
|
||||
if not self.quiet_mode:
|
||||
logger.debug("No summary model available — middle turns dropped without summary")
|
||||
|
||||
for i in range(compress_end, n_messages):
|
||||
msg = messages[i].copy()
|
||||
if _merge_summary_into_tail and i == compress_end:
|
||||
original = msg.get("content") or ""
|
||||
msg["content"] = (
|
||||
summary
|
||||
+ "\n\n--- END OF CONTEXT SUMMARY — "
|
||||
"respond to the message below, not the summary above ---\n\n"
|
||||
+ original
|
||||
)
|
||||
msg["content"] = summary + "\n\n" + original
|
||||
_merge_summary_into_tail = False
|
||||
compressed.append(msg)
|
||||
|
||||
@@ -1067,24 +682,14 @@ The user has requested that this compaction PRIORITISE preserving all informatio
|
||||
|
||||
compressed = self._sanitize_tool_pairs(compressed)
|
||||
|
||||
new_estimate = estimate_messages_tokens_rough(compressed)
|
||||
saved_estimate = display_tokens - new_estimate
|
||||
|
||||
# Anti-thrashing: track compression effectiveness
|
||||
savings_pct = (saved_estimate / display_tokens * 100) if display_tokens > 0 else 0
|
||||
self._last_compression_savings_pct = savings_pct
|
||||
if savings_pct < 10:
|
||||
self._ineffective_compression_count += 1
|
||||
else:
|
||||
self._ineffective_compression_count = 0
|
||||
|
||||
if not self.quiet_mode:
|
||||
new_estimate = estimate_messages_tokens_rough(compressed)
|
||||
saved_estimate = display_tokens - new_estimate
|
||||
logger.info(
|
||||
"Compressed: %d -> %d messages (~%d tokens saved, %.0f%%)",
|
||||
"Compressed: %d -> %d messages (~%d tokens saved)",
|
||||
n_messages,
|
||||
len(compressed),
|
||||
saved_estimate,
|
||||
savings_pct,
|
||||
)
|
||||
logger.info("Compression #%d complete", self.compression_count)
|
||||
|
||||
|
||||
@@ -1,184 +0,0 @@
|
||||
"""Abstract base class for pluggable context engines.
|
||||
|
||||
A context engine controls how conversation context is managed when
|
||||
approaching the model's token limit. The built-in ContextCompressor
|
||||
is the default implementation. Third-party engines (e.g. LCM) can
|
||||
replace it via the plugin system or by being placed in the
|
||||
``plugins/context_engine/<name>/`` directory.
|
||||
|
||||
Selection is config-driven: ``context.engine`` in config.yaml.
|
||||
Default is ``"compressor"`` (the built-in). Only one engine is active.
|
||||
|
||||
The engine is responsible for:
|
||||
- Deciding when compaction should fire
|
||||
- Performing compaction (summarization, DAG construction, etc.)
|
||||
- Optionally exposing tools the agent can call (e.g. lcm_grep)
|
||||
- Tracking token usage from API responses
|
||||
|
||||
Lifecycle:
|
||||
1. Engine is instantiated and registered (plugin register() or default)
|
||||
2. on_session_start() called when a conversation begins
|
||||
3. update_from_response() called after each API response with usage data
|
||||
4. should_compress() checked after each turn
|
||||
5. compress() called when should_compress() returns True
|
||||
6. on_session_end() called at real session boundaries (CLI exit, /reset,
|
||||
gateway session expiry) — NOT per-turn
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict, List
|
||||
|
||||
|
||||
class ContextEngine(ABC):
|
||||
"""Base class all context engines must implement."""
|
||||
|
||||
# -- Identity ----------------------------------------------------------
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def name(self) -> str:
|
||||
"""Short identifier (e.g. 'compressor', 'lcm')."""
|
||||
|
||||
# -- Token state (read by run_agent.py for display/logging) ------------
|
||||
#
|
||||
# Engines MUST maintain these. run_agent.py reads them directly.
|
||||
|
||||
last_prompt_tokens: int = 0
|
||||
last_completion_tokens: int = 0
|
||||
last_total_tokens: int = 0
|
||||
threshold_tokens: int = 0
|
||||
context_length: int = 0
|
||||
compression_count: int = 0
|
||||
|
||||
# -- Compaction parameters (read by run_agent.py for preflight) --------
|
||||
#
|
||||
# These control the preflight compression check. Subclasses may
|
||||
# override via __init__ or property; defaults are sensible for most
|
||||
# engines.
|
||||
|
||||
threshold_percent: float = 0.75
|
||||
protect_first_n: int = 3
|
||||
protect_last_n: int = 6
|
||||
|
||||
# -- Core interface ----------------------------------------------------
|
||||
|
||||
@abstractmethod
|
||||
def update_from_response(self, usage: Dict[str, Any]) -> None:
|
||||
"""Update tracked token usage from an API response.
|
||||
|
||||
Called after every LLM call with the usage dict from the response.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def should_compress(self, prompt_tokens: int = None) -> bool:
|
||||
"""Return True if compaction should fire this turn."""
|
||||
|
||||
@abstractmethod
|
||||
def compress(
|
||||
self,
|
||||
messages: List[Dict[str, Any]],
|
||||
current_tokens: int = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Compact the message list and return the new message list.
|
||||
|
||||
This is the main entry point. The engine receives the full message
|
||||
list and returns a (possibly shorter) list that fits within the
|
||||
context budget. The implementation is free to summarize, build a
|
||||
DAG, or do anything else — as long as the returned list is a valid
|
||||
OpenAI-format message sequence.
|
||||
"""
|
||||
|
||||
# -- Optional: pre-flight check ----------------------------------------
|
||||
|
||||
def should_compress_preflight(self, messages: List[Dict[str, Any]]) -> bool:
|
||||
"""Quick rough check before the API call (no real token count yet).
|
||||
|
||||
Default returns False (skip pre-flight). Override if your engine
|
||||
can do a cheap estimate.
|
||||
"""
|
||||
return False
|
||||
|
||||
# -- Optional: session lifecycle ---------------------------------------
|
||||
|
||||
def on_session_start(self, session_id: str, **kwargs) -> None:
|
||||
"""Called when a new conversation session begins.
|
||||
|
||||
Use this to load persisted state (DAG, store) for the session.
|
||||
kwargs may include hermes_home, platform, model, etc.
|
||||
"""
|
||||
|
||||
def on_session_end(self, session_id: str, messages: List[Dict[str, Any]]) -> None:
|
||||
"""Called at real session boundaries (CLI exit, /reset, gateway expiry).
|
||||
|
||||
Use this to flush state, close DB connections, etc.
|
||||
NOT called per-turn — only when the session truly ends.
|
||||
"""
|
||||
|
||||
def on_session_reset(self) -> None:
|
||||
"""Called on /new or /reset. Reset per-session state.
|
||||
|
||||
Default resets compression_count and token tracking.
|
||||
"""
|
||||
self.last_prompt_tokens = 0
|
||||
self.last_completion_tokens = 0
|
||||
self.last_total_tokens = 0
|
||||
self.compression_count = 0
|
||||
|
||||
# -- Optional: tools ---------------------------------------------------
|
||||
|
||||
def get_tool_schemas(self) -> List[Dict[str, Any]]:
|
||||
"""Return tool schemas this engine provides to the agent.
|
||||
|
||||
Default returns empty list (no tools). LCM would return schemas
|
||||
for lcm_grep, lcm_describe, lcm_expand here.
|
||||
"""
|
||||
return []
|
||||
|
||||
def handle_tool_call(self, name: str, args: Dict[str, Any], **kwargs) -> str:
|
||||
"""Handle a tool call from the agent.
|
||||
|
||||
Only called for tool names returned by get_tool_schemas().
|
||||
Must return a JSON string.
|
||||
|
||||
kwargs may include:
|
||||
messages: the current in-memory message list (for live ingestion)
|
||||
"""
|
||||
import json
|
||||
return json.dumps({"error": f"Unknown context engine tool: {name}"})
|
||||
|
||||
# -- Optional: status / display ----------------------------------------
|
||||
|
||||
def get_status(self) -> Dict[str, Any]:
|
||||
"""Return status dict for display/logging.
|
||||
|
||||
Default returns the standard fields run_agent.py expects.
|
||||
"""
|
||||
return {
|
||||
"last_prompt_tokens": self.last_prompt_tokens,
|
||||
"threshold_tokens": self.threshold_tokens,
|
||||
"context_length": self.context_length,
|
||||
"usage_percent": (
|
||||
min(100, self.last_prompt_tokens / self.context_length * 100)
|
||||
if self.context_length else 0
|
||||
),
|
||||
"compression_count": self.compression_count,
|
||||
}
|
||||
|
||||
# -- Optional: model switch support ------------------------------------
|
||||
|
||||
def update_model(
|
||||
self,
|
||||
model: str,
|
||||
context_length: int,
|
||||
base_url: str = "",
|
||||
api_key: str = "",
|
||||
provider: str = "",
|
||||
) -> None:
|
||||
"""Called when the user switches models or on fallback activation.
|
||||
|
||||
Default updates context_length and recalculates threshold_tokens
|
||||
from threshold_percent. Override if your engine needs more
|
||||
(e.g. recalculate DAG budgets, switch summary models).
|
||||
"""
|
||||
self.context_length = context_length
|
||||
self.threshold_tokens = int(context_length * self.threshold_percent)
|
||||
@@ -13,9 +13,8 @@ from typing import Awaitable, Callable
|
||||
|
||||
from agent.model_metadata import estimate_tokens_rough
|
||||
|
||||
_QUOTED_REFERENCE_VALUE = r'(?:`[^`\n]+`|"[^"\n]+"|\'[^\'\n]+\')'
|
||||
REFERENCE_PATTERN = re.compile(
|
||||
rf"(?<![\w/])@(?:(?P<simple>diff|staged)\b|(?P<kind>file|folder|git|url):(?P<value>{_QUOTED_REFERENCE_VALUE}(?::\d+(?:-\d+)?)?|\S+))"
|
||||
r"(?<![\w/])@(?:(?P<simple>diff|staged)\b|(?P<kind>file|folder|git|url):(?P<value>\S+))"
|
||||
)
|
||||
TRAILING_PUNCTUATION = ",.;!?"
|
||||
_SENSITIVE_HOME_DIRS = (".ssh", ".aws", ".gnupg", ".kube", ".docker", ".azure", ".config/gh")
|
||||
@@ -82,10 +81,14 @@ def parse_context_references(message: str) -> list[ContextReference]:
|
||||
value = _strip_trailing_punctuation(match.group("value") or "")
|
||||
line_start = None
|
||||
line_end = None
|
||||
target = _strip_reference_wrappers(value)
|
||||
target = value
|
||||
|
||||
if kind == "file":
|
||||
target, line_start, line_end = _parse_file_reference_value(value)
|
||||
range_match = re.match(r"^(?P<path>.+?):(?P<start>\d+)(?:-(?P<end>\d+))?$", value)
|
||||
if range_match:
|
||||
target = range_match.group("path")
|
||||
line_start = int(range_match.group("start"))
|
||||
line_end = int(range_match.group("end") or range_match.group("start"))
|
||||
|
||||
refs.append(
|
||||
ContextReference(
|
||||
@@ -372,38 +375,6 @@ def _strip_trailing_punctuation(value: str) -> str:
|
||||
return stripped
|
||||
|
||||
|
||||
def _strip_reference_wrappers(value: str) -> str:
|
||||
if len(value) >= 2 and value[0] == value[-1] and value[0] in "`\"'":
|
||||
return value[1:-1]
|
||||
return value
|
||||
|
||||
|
||||
def _parse_file_reference_value(value: str) -> tuple[str, int | None, int | None]:
|
||||
quoted_match = re.match(
|
||||
r'^(?P<quote>`|"|\')(?P<path>.+?)(?P=quote)(?::(?P<start>\d+)(?:-(?P<end>\d+))?)?$',
|
||||
value,
|
||||
)
|
||||
if quoted_match:
|
||||
line_start = quoted_match.group("start")
|
||||
line_end = quoted_match.group("end")
|
||||
return (
|
||||
quoted_match.group("path"),
|
||||
int(line_start) if line_start is not None else None,
|
||||
int(line_end or line_start) if line_start is not None else None,
|
||||
)
|
||||
|
||||
range_match = re.match(r"^(?P<path>.+?):(?P<start>\d+)(?:-(?P<end>\d+))?$", value)
|
||||
if range_match:
|
||||
line_start = int(range_match.group("start"))
|
||||
return (
|
||||
range_match.group("path"),
|
||||
line_start,
|
||||
int(range_match.group("end") or range_match.group("start")),
|
||||
)
|
||||
|
||||
return _strip_reference_wrappers(value), None, None
|
||||
|
||||
|
||||
def _remove_reference_tokens(message: str, refs: list[ContextReference]) -> str:
|
||||
pieces: list[str] = []
|
||||
cursor = 0
|
||||
|
||||
@@ -19,17 +19,12 @@ from hermes_cli.auth import (
|
||||
CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS,
|
||||
DEFAULT_AGENT_KEY_MIN_TTL_SECONDS,
|
||||
PROVIDER_REGISTRY,
|
||||
_auth_store_lock,
|
||||
_codex_access_token_is_expiring,
|
||||
_decode_jwt_claims,
|
||||
_import_codex_cli_tokens,
|
||||
_write_codex_cli_tokens,
|
||||
_load_auth_store,
|
||||
_load_provider_state,
|
||||
_resolve_kimi_base_url,
|
||||
_resolve_zai_base_url,
|
||||
_save_auth_store,
|
||||
_save_provider_state,
|
||||
read_credential_pool,
|
||||
write_credential_pool,
|
||||
)
|
||||
@@ -69,10 +64,10 @@ SUPPORTED_POOL_STRATEGIES = {
|
||||
}
|
||||
|
||||
# Cooldown before retrying an exhausted credential.
|
||||
# 429 (rate-limited) and 402 (billing/quota) both cool down after 1 hour.
|
||||
# Provider-supplied reset_at timestamps override these defaults.
|
||||
# 429 (rate-limited) cools down faster since quotas reset frequently.
|
||||
# 402 (billing/quota) and other codes use a longer default.
|
||||
EXHAUSTED_TTL_429_SECONDS = 60 * 60 # 1 hour
|
||||
EXHAUSTED_TTL_DEFAULT_SECONDS = 60 * 60 # 1 hour
|
||||
EXHAUSTED_TTL_DEFAULT_SECONDS = 24 * 60 * 60 # 24 hours
|
||||
|
||||
# Pool key prefix for custom OpenAI-compatible endpoints.
|
||||
# Custom endpoints all share provider='custom' but are keyed by their
|
||||
@@ -288,14 +283,6 @@ def _iter_custom_providers(config: Optional[dict] = None):
|
||||
return
|
||||
custom_providers = config.get("custom_providers")
|
||||
if not isinstance(custom_providers, list):
|
||||
# Fall back to the v12+ providers dict via the compatibility layer
|
||||
try:
|
||||
from hermes_cli.config import get_compatible_custom_providers
|
||||
|
||||
custom_providers = get_compatible_custom_providers(config)
|
||||
except Exception:
|
||||
return
|
||||
if not custom_providers:
|
||||
return
|
||||
for entry in custom_providers:
|
||||
if not isinstance(entry, dict):
|
||||
@@ -490,67 +477,6 @@ class CredentialPool:
|
||||
logger.debug("Failed to sync from ~/.codex/auth.json: %s", exc)
|
||||
return entry
|
||||
|
||||
def _sync_device_code_entry_to_auth_store(self, entry: PooledCredential) -> None:
|
||||
"""Write refreshed pool entry tokens back to auth.json providers.
|
||||
|
||||
After a pool-level refresh, the pool entry has fresh tokens but
|
||||
auth.json's ``providers.<id>`` still holds the pre-refresh state.
|
||||
On the next ``load_pool()``, ``_seed_from_singletons()`` reads that
|
||||
stale state and can overwrite the fresh pool entry — potentially
|
||||
re-seeding a consumed single-use refresh token.
|
||||
|
||||
Applies to any OAuth provider whose singleton lives in auth.json
|
||||
(currently Nous and OpenAI Codex).
|
||||
"""
|
||||
if entry.source != "device_code":
|
||||
return
|
||||
try:
|
||||
with _auth_store_lock():
|
||||
auth_store = _load_auth_store()
|
||||
if self.provider == "nous":
|
||||
state = _load_provider_state(auth_store, "nous")
|
||||
if state is None:
|
||||
return
|
||||
state["access_token"] = entry.access_token
|
||||
if entry.refresh_token:
|
||||
state["refresh_token"] = entry.refresh_token
|
||||
if entry.expires_at:
|
||||
state["expires_at"] = entry.expires_at
|
||||
if entry.agent_key:
|
||||
state["agent_key"] = entry.agent_key
|
||||
if entry.agent_key_expires_at:
|
||||
state["agent_key_expires_at"] = entry.agent_key_expires_at
|
||||
for extra_key in ("obtained_at", "expires_in", "agent_key_id",
|
||||
"agent_key_expires_in", "agent_key_reused",
|
||||
"agent_key_obtained_at"):
|
||||
val = entry.extra.get(extra_key)
|
||||
if val is not None:
|
||||
state[extra_key] = val
|
||||
if entry.inference_base_url:
|
||||
state["inference_base_url"] = entry.inference_base_url
|
||||
_save_provider_state(auth_store, "nous", state)
|
||||
|
||||
elif self.provider == "openai-codex":
|
||||
state = _load_provider_state(auth_store, "openai-codex")
|
||||
if not isinstance(state, dict):
|
||||
return
|
||||
tokens = state.get("tokens")
|
||||
if not isinstance(tokens, dict):
|
||||
return
|
||||
tokens["access_token"] = entry.access_token
|
||||
if entry.refresh_token:
|
||||
tokens["refresh_token"] = entry.refresh_token
|
||||
if entry.last_refresh:
|
||||
state["last_refresh"] = entry.last_refresh
|
||||
_save_provider_state(auth_store, "openai-codex", state)
|
||||
|
||||
else:
|
||||
return
|
||||
|
||||
_save_auth_store(auth_store)
|
||||
except Exception as exc:
|
||||
logger.debug("Failed to sync %s pool entry back to auth store: %s", self.provider, exc)
|
||||
|
||||
def _refresh_entry(self, entry: PooledCredential, *, force: bool) -> Optional[PooledCredential]:
|
||||
if entry.auth_type != AUTH_TYPE_OAUTH or not entry.refresh_token:
|
||||
if force:
|
||||
@@ -585,13 +511,6 @@ class CredentialPool:
|
||||
except Exception as wexc:
|
||||
logger.debug("Failed to write refreshed token to credentials file: %s", wexc)
|
||||
elif self.provider == "openai-codex":
|
||||
# Proactively sync from ~/.codex/auth.json before refresh.
|
||||
# The Codex CLI (or another Hermes profile) may have already
|
||||
# consumed our refresh_token. Syncing first avoids a
|
||||
# "refresh_token_reused" error when the CLI has a newer pair.
|
||||
synced = self._sync_codex_entry_from_cli(entry)
|
||||
if synced is not entry:
|
||||
entry = synced
|
||||
refreshed = auth_mod.refresh_codex_oauth_pure(
|
||||
entry.access_token,
|
||||
entry.refresh_token,
|
||||
@@ -677,45 +596,6 @@ class CredentialPool:
|
||||
# Credentials file had a valid (non-expired) token — use it directly
|
||||
logger.debug("Credentials file has valid token, using without refresh")
|
||||
return synced
|
||||
# For openai-codex: the refresh_token may have been consumed by
|
||||
# the Codex CLI between our proactive sync and the refresh call.
|
||||
# Re-sync and retry once.
|
||||
if self.provider == "openai-codex":
|
||||
synced = self._sync_codex_entry_from_cli(entry)
|
||||
if synced.refresh_token != entry.refresh_token:
|
||||
logger.debug("Retrying Codex refresh with synced token from ~/.codex/auth.json")
|
||||
try:
|
||||
refreshed = auth_mod.refresh_codex_oauth_pure(
|
||||
synced.access_token,
|
||||
synced.refresh_token,
|
||||
)
|
||||
updated = replace(
|
||||
synced,
|
||||
access_token=refreshed["access_token"],
|
||||
refresh_token=refreshed["refresh_token"],
|
||||
last_refresh=refreshed.get("last_refresh"),
|
||||
last_status=STATUS_OK,
|
||||
last_status_at=None,
|
||||
last_error_code=None,
|
||||
)
|
||||
self._replace_entry(synced, updated)
|
||||
self._persist()
|
||||
self._sync_device_code_entry_to_auth_store(updated)
|
||||
try:
|
||||
_write_codex_cli_tokens(
|
||||
updated.access_token,
|
||||
updated.refresh_token,
|
||||
last_refresh=updated.last_refresh,
|
||||
)
|
||||
except Exception as wexc:
|
||||
logger.debug("Failed to write refreshed Codex tokens to CLI file (retry): %s", wexc)
|
||||
return updated
|
||||
except Exception as retry_exc:
|
||||
logger.debug("Codex retry refresh also failed: %s", retry_exc)
|
||||
elif not self._entry_needs_refresh(synced):
|
||||
logger.debug("Codex CLI has valid token, using without refresh")
|
||||
self._sync_device_code_entry_to_auth_store(synced)
|
||||
return synced
|
||||
self._mark_exhausted(entry, None)
|
||||
return None
|
||||
|
||||
@@ -730,21 +610,6 @@ class CredentialPool:
|
||||
)
|
||||
self._replace_entry(entry, updated)
|
||||
self._persist()
|
||||
# Sync refreshed tokens back to auth.json providers so that
|
||||
# _seed_from_singletons() on the next load_pool() sees fresh state
|
||||
# instead of re-seeding stale/consumed tokens.
|
||||
self._sync_device_code_entry_to_auth_store(updated)
|
||||
# Write refreshed tokens back to ~/.codex/auth.json so Codex CLI
|
||||
# and VS Code don't hit "refresh_token_reused" on their next refresh.
|
||||
if self.provider == "openai-codex":
|
||||
try:
|
||||
_write_codex_cli_tokens(
|
||||
updated.access_token,
|
||||
updated.refresh_token,
|
||||
last_refresh=updated.last_refresh,
|
||||
)
|
||||
except Exception as wexc:
|
||||
logger.debug("Failed to write refreshed Codex tokens to CLI file: %s", wexc)
|
||||
return updated
|
||||
|
||||
def _entry_needs_refresh(self, entry: PooledCredential) -> bool:
|
||||
@@ -766,6 +631,17 @@ class CredentialPool:
|
||||
return False
|
||||
return False
|
||||
|
||||
def mark_used(self, entry_id: Optional[str] = None) -> None:
|
||||
"""Increment request_count for tracking. Used by least_used strategy."""
|
||||
target_id = entry_id or self._current_id
|
||||
if not target_id:
|
||||
return
|
||||
with self._lock:
|
||||
for idx, entry in enumerate(self._entries):
|
||||
if entry.id == target_id:
|
||||
self._entries[idx] = replace(entry, request_count=entry.request_count + 1)
|
||||
return
|
||||
|
||||
def select(self) -> Optional[PooledCredential]:
|
||||
with self._lock:
|
||||
return self._select_unlocked()
|
||||
@@ -927,6 +803,11 @@ class CredentialPool:
|
||||
else:
|
||||
self._active_leases[credential_id] = count - 1
|
||||
|
||||
def active_lease_count(self, credential_id: str) -> int:
|
||||
"""Return the number of active leases for a credential."""
|
||||
with self._lock:
|
||||
return self._active_leases.get(credential_id, 0)
|
||||
|
||||
def try_refresh_current(self) -> Optional[PooledCredential]:
|
||||
with self._lock:
|
||||
return self._try_refresh_current_unlocked()
|
||||
@@ -1086,17 +967,6 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
|
||||
auth_store = _load_auth_store()
|
||||
|
||||
if provider == "anthropic":
|
||||
# Only auto-discover external credentials (Claude Code, Hermes PKCE)
|
||||
# when the user has explicitly configured anthropic as their provider.
|
||||
# Without this gate, auxiliary client fallback chains silently read
|
||||
# ~/.claude/.credentials.json without user consent. See PR #4210.
|
||||
try:
|
||||
from hermes_cli.auth import is_provider_explicitly_configured
|
||||
if not is_provider_explicitly_configured("anthropic"):
|
||||
return changed, active_sources
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
from agent.anthropic_adapter import read_claude_code_credentials, read_hermes_oauth_credentials
|
||||
|
||||
for source_name, creds in (
|
||||
@@ -1104,13 +974,6 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
|
||||
("claude_code", read_claude_code_credentials()),
|
||||
):
|
||||
if creds and creds.get("accessToken"):
|
||||
# Check if user explicitly removed this source
|
||||
try:
|
||||
from hermes_cli.auth import is_source_suppressed
|
||||
if is_source_suppressed(provider, source_name):
|
||||
continue
|
||||
except ImportError:
|
||||
pass
|
||||
active_sources.add(source_name)
|
||||
changed |= _upsert_entry(
|
||||
entries,
|
||||
@@ -1152,79 +1015,9 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
|
||||
},
|
||||
)
|
||||
|
||||
elif provider == "copilot":
|
||||
# Copilot tokens are resolved dynamically via `gh auth token` or
|
||||
# env vars (COPILOT_GITHUB_TOKEN / GH_TOKEN). They don't live in
|
||||
# the auth store or credential pool, so we resolve them here.
|
||||
try:
|
||||
from hermes_cli.copilot_auth import resolve_copilot_token
|
||||
token, source = resolve_copilot_token()
|
||||
if token:
|
||||
source_name = "gh_cli" if "gh" in source.lower() else f"env:{source}"
|
||||
active_sources.add(source_name)
|
||||
changed |= _upsert_entry(
|
||||
entries,
|
||||
provider,
|
||||
source_name,
|
||||
{
|
||||
"source": source_name,
|
||||
"auth_type": AUTH_TYPE_API_KEY,
|
||||
"access_token": token,
|
||||
"label": source,
|
||||
},
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.debug("Copilot token seed failed: %s", exc)
|
||||
|
||||
elif provider == "qwen-oauth":
|
||||
# Qwen OAuth tokens live in ~/.qwen/oauth_creds.json, written by
|
||||
# the Qwen CLI (`qwen auth qwen-oauth`). They aren't in the
|
||||
# Hermes auth store or env vars, so resolve them here.
|
||||
# Use refresh_if_expiring=False to avoid network calls during
|
||||
# pool loading / provider discovery.
|
||||
try:
|
||||
from hermes_cli.auth import resolve_qwen_runtime_credentials
|
||||
creds = resolve_qwen_runtime_credentials(refresh_if_expiring=False)
|
||||
token = creds.get("api_key", "")
|
||||
if token:
|
||||
source_name = creds.get("source", "qwen-cli")
|
||||
active_sources.add(source_name)
|
||||
changed |= _upsert_entry(
|
||||
entries,
|
||||
provider,
|
||||
source_name,
|
||||
{
|
||||
"source": source_name,
|
||||
"auth_type": AUTH_TYPE_OAUTH,
|
||||
"access_token": token,
|
||||
"expires_at_ms": creds.get("expires_at_ms"),
|
||||
"base_url": creds.get("base_url", ""),
|
||||
"label": creds.get("auth_file", source_name),
|
||||
},
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.debug("Qwen OAuth token seed failed: %s", exc)
|
||||
|
||||
elif provider == "openai-codex":
|
||||
state = _load_provider_state(auth_store, "openai-codex")
|
||||
tokens = state.get("tokens") if isinstance(state, dict) else None
|
||||
# Fallback: import from Codex CLI (~/.codex/auth.json) if Hermes auth
|
||||
# store has no tokens. This mirrors resolve_codex_runtime_credentials()
|
||||
# so that load_pool() and list_authenticated_providers() detect tokens
|
||||
# that only exist in the Codex CLI shared file.
|
||||
if not (isinstance(tokens, dict) and tokens.get("access_token")):
|
||||
try:
|
||||
from hermes_cli.auth import _import_codex_cli_tokens, _save_codex_tokens
|
||||
cli_tokens = _import_codex_cli_tokens()
|
||||
if cli_tokens:
|
||||
logger.info("Importing Codex CLI tokens into Hermes auth store.")
|
||||
_save_codex_tokens(cli_tokens)
|
||||
# Re-read state after import
|
||||
auth_store = _load_auth_store()
|
||||
state = _load_provider_state(auth_store, "openai-codex")
|
||||
tokens = state.get("tokens") if isinstance(state, dict) else None
|
||||
except Exception as exc:
|
||||
logger.debug("Codex CLI token import failed: %s", exc)
|
||||
if isinstance(tokens, dict) and tokens.get("access_token"):
|
||||
active_sources.add("device_code")
|
||||
changed |= _upsert_entry(
|
||||
@@ -1291,9 +1084,7 @@ def _seed_from_env(provider: str, entries: List[PooledCredential]) -> Tuple[bool
|
||||
active_sources.add(source)
|
||||
auth_type = AUTH_TYPE_OAUTH if provider == "anthropic" and not token.startswith("sk-ant-api") else AUTH_TYPE_API_KEY
|
||||
base_url = env_url or pconfig.inference_base_url
|
||||
if provider == "kimi-coding":
|
||||
base_url = _resolve_kimi_base_url(token, pconfig.inference_base_url, env_url)
|
||||
elif provider == "zai":
|
||||
if provider == "zai":
|
||||
base_url = _resolve_zai_base_url(token, pconfig.inference_base_url, env_url)
|
||||
changed |= _upsert_entry(
|
||||
entries,
|
||||
|
||||
@@ -1,149 +0,0 @@
|
||||
"""
|
||||
988 Suicide & Crisis Lifeline Integration (#673).
|
||||
|
||||
When crisis is detected, provides immediate access to help:
|
||||
- Phone: 988 (call or text)
|
||||
- Text: Text HOME to 988
|
||||
- Chat: 988lifeline.org/chat
|
||||
- Spanish: 1-888-628-9454
|
||||
- Emergency: 911
|
||||
|
||||
This module provides the resource data. agent/crisis_protocol.py
|
||||
handles detection. This module formats the resources for display.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
|
||||
@dataclass
|
||||
class CrisisResource:
|
||||
"""A crisis support contact method."""
|
||||
name: str
|
||||
contact: str
|
||||
description: str
|
||||
url: str = ""
|
||||
available: str = "24/7"
|
||||
language: str = "English"
|
||||
|
||||
|
||||
# 988 Suicide & Crisis Lifeline — all channels
|
||||
LIFELINE_988 = CrisisResource(
|
||||
name="988 Suicide and Crisis Lifeline",
|
||||
contact="Call or text 988",
|
||||
description="Free, confidential support for people in suicidal crisis or emotional distress.",
|
||||
url="https://988lifeline.org",
|
||||
available="24/7",
|
||||
language="English",
|
||||
)
|
||||
|
||||
LIFELINE_988_TEXT = CrisisResource(
|
||||
name="988 Crisis Text Line",
|
||||
contact="Text HOME to 988",
|
||||
description="Free, 24/7 crisis support via text message.",
|
||||
url="",
|
||||
available="24/7",
|
||||
language="English",
|
||||
)
|
||||
|
||||
LIFELINE_988_CHAT = CrisisResource(
|
||||
name="988 Lifeline Chat",
|
||||
contact="988lifeline.org/chat",
|
||||
description="Free, confidential online chat with a trained crisis counselor.",
|
||||
url="https://988lifeline.org/chat",
|
||||
available="24/7",
|
||||
language="English",
|
||||
)
|
||||
|
||||
LIFELINE_988_SPANISH = CrisisResource(
|
||||
name="988 Lifeline (Spanish)",
|
||||
contact="1-888-628-9454",
|
||||
description="Línea de prevención del suicidio en español.",
|
||||
url="https://988lifeline.org/help-yourself/en-espanol/",
|
||||
available="24/7",
|
||||
language="Spanish",
|
||||
)
|
||||
|
||||
CRISIS_TEXT_LINE = CrisisResource(
|
||||
name="Crisis Text Line",
|
||||
contact="Text HOME to 741741",
|
||||
description="Free, 24/7 crisis support via text message.",
|
||||
url="https://www.crisistextline.org",
|
||||
available="24/7",
|
||||
language="English",
|
||||
)
|
||||
|
||||
EMERGENCY_911 = CrisisResource(
|
||||
name="Emergency Services",
|
||||
contact="911",
|
||||
description="Immediate danger — police, fire, ambulance.",
|
||||
url="",
|
||||
available="24/7",
|
||||
language="Any",
|
||||
)
|
||||
|
||||
# All resources in priority order
|
||||
ALL_RESOURCES: List[CrisisResource] = [
|
||||
EMERGENCY_911,
|
||||
LIFELINE_988,
|
||||
LIFELINE_988_TEXT,
|
||||
LIFELINE_988_CHAT,
|
||||
CRISIS_TEXT_LINE,
|
||||
LIFELINE_988_SPANISH,
|
||||
]
|
||||
|
||||
|
||||
def get_crisis_resources(language: str = None) -> List[CrisisResource]:
|
||||
"""Get crisis resources, optionally filtered by language.
|
||||
|
||||
Args:
|
||||
language: Filter by language ("English", "Spanish", or None for all)
|
||||
|
||||
Returns:
|
||||
List of CrisisResource objects
|
||||
"""
|
||||
if language:
|
||||
return [r for r in ALL_RESOURCES if r.language.lower() == language.lower()]
|
||||
return ALL_RESOURCES
|
||||
|
||||
|
||||
def format_crisis_resources(resources: List[CrisisResource] = None) -> str:
|
||||
"""Format crisis resources as a user-facing message.
|
||||
|
||||
Args:
|
||||
resources: List of resources to format. Defaults to all resources.
|
||||
|
||||
Returns:
|
||||
Formatted string suitable for displaying to a user in crisis.
|
||||
"""
|
||||
if resources is None:
|
||||
resources = ALL_RESOURCES
|
||||
|
||||
lines = ["**Please reach out — help is available right now:**
|
||||
"]
|
||||
|
||||
for r in resources:
|
||||
if r.url:
|
||||
lines.append(f"- **{r.name}:** {r.contact} ({r.url})")
|
||||
else:
|
||||
lines.append(f"- **{r.name}:** {r.contact}")
|
||||
|
||||
lines.append("")
|
||||
lines.append("All services are free, confidential, and available 24/7.")
|
||||
lines.append("You are not alone.")
|
||||
|
||||
return "
|
||||
".join(lines)
|
||||
|
||||
|
||||
def get_immediate_help_message() -> str:
|
||||
"""Get the most urgent crisis help message.
|
||||
|
||||
Used when crisis is detected at CRITICAL level.
|
||||
"""
|
||||
return (
|
||||
"If you are in immediate danger, call **911** right now.
|
||||
|
||||
"
|
||||
+ format_crisis_resources()
|
||||
)
|
||||
179
agent/display.py
179
agent/display.py
@@ -4,6 +4,7 @@ Pure display functions and classes with no AIAgent dependency.
|
||||
Used by AIAgent._execute_tool_calls for CLI feedback.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
@@ -13,8 +14,6 @@ from dataclasses import dataclass, field
|
||||
from difflib import unified_diff
|
||||
from pathlib import Path
|
||||
|
||||
from utils import safe_json_loads
|
||||
|
||||
# ANSI escape codes for coloring tool failure indicators
|
||||
_RED = "\033[31m"
|
||||
_RESET = "\033[0m"
|
||||
@@ -22,67 +21,11 @@ _RESET = "\033[0m"
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_ANSI_RESET = "\033[0m"
|
||||
|
||||
# Diff colors — resolved lazily from the skin engine so they adapt
|
||||
# to light/dark themes. Falls back to sensible defaults on import
|
||||
# failure. We cache after first resolution for performance.
|
||||
_diff_colors_cached: dict[str, str] | None = None
|
||||
|
||||
|
||||
def _diff_ansi() -> dict[str, str]:
|
||||
"""Return ANSI escapes for diff display, resolved from the active skin."""
|
||||
global _diff_colors_cached
|
||||
if _diff_colors_cached is not None:
|
||||
return _diff_colors_cached
|
||||
|
||||
# Defaults that work on dark terminals
|
||||
dim = "\033[38;2;150;150;150m"
|
||||
file_c = "\033[38;2;180;160;255m"
|
||||
hunk = "\033[38;2;120;120;140m"
|
||||
minus = "\033[38;2;255;255;255;48;2;120;20;20m"
|
||||
plus = "\033[38;2;255;255;255;48;2;20;90;20m"
|
||||
|
||||
try:
|
||||
from hermes_cli.skin_engine import get_active_skin
|
||||
skin = get_active_skin()
|
||||
|
||||
def _hex_fg(key: str, fallback_rgb: tuple[int, int, int]) -> str:
|
||||
h = skin.get_color(key, "")
|
||||
if h and len(h) == 7 and h[0] == "#":
|
||||
r, g, b = int(h[1:3], 16), int(h[3:5], 16), int(h[5:7], 16)
|
||||
return f"\033[38;2;{r};{g};{b}m"
|
||||
r, g, b = fallback_rgb
|
||||
return f"\033[38;2;{r};{g};{b}m"
|
||||
|
||||
dim = _hex_fg("banner_dim", (150, 150, 150))
|
||||
file_c = _hex_fg("session_label", (180, 160, 255))
|
||||
hunk = _hex_fg("session_border", (120, 120, 140))
|
||||
# minus/plus use background colors — derive from ui_error/ui_ok
|
||||
err_h = skin.get_color("ui_error", "#ef5350")
|
||||
ok_h = skin.get_color("ui_ok", "#4caf50")
|
||||
if err_h and len(err_h) == 7:
|
||||
er, eg, eb = int(err_h[1:3], 16), int(err_h[3:5], 16), int(err_h[5:7], 16)
|
||||
# Use a dark tinted version as background
|
||||
minus = f"\033[38;2;255;255;255;48;2;{max(er//2,20)};{max(eg//4,10)};{max(eb//4,10)}m"
|
||||
if ok_h and len(ok_h) == 7:
|
||||
or_, og, ob = int(ok_h[1:3], 16), int(ok_h[3:5], 16), int(ok_h[5:7], 16)
|
||||
plus = f"\033[38;2;255;255;255;48;2;{max(or_//4,10)};{max(og//2,20)};{max(ob//4,10)}m"
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
_diff_colors_cached = {
|
||||
"dim": dim, "file": file_c, "hunk": hunk,
|
||||
"minus": minus, "plus": plus,
|
||||
}
|
||||
return _diff_colors_cached
|
||||
|
||||
|
||||
# Module-level helpers — each call resolves from the active skin lazily.
|
||||
def _diff_dim(): return _diff_ansi()["dim"]
|
||||
def _diff_file(): return _diff_ansi()["file"]
|
||||
def _diff_hunk(): return _diff_ansi()["hunk"]
|
||||
def _diff_minus(): return _diff_ansi()["minus"]
|
||||
def _diff_plus(): return _diff_ansi()["plus"]
|
||||
_ANSI_DIM = "\033[38;2;150;150;150m"
|
||||
_ANSI_FILE = "\033[38;2;180;160;255m"
|
||||
_ANSI_HUNK = "\033[38;2;120;120;140m"
|
||||
_ANSI_MINUS = "\033[38;2;255;255;255;48;2;120;20;20m"
|
||||
_ANSI_PLUS = "\033[38;2;255;255;255;48;2;20;90;20m"
|
||||
_MAX_INLINE_DIFF_FILES = 6
|
||||
_MAX_INLINE_DIFF_LINES = 80
|
||||
|
||||
@@ -124,6 +67,26 @@ def _get_skin():
|
||||
return None
|
||||
|
||||
|
||||
def get_skin_faces(key: str, default: list) -> list:
|
||||
"""Get spinner face list from active skin, falling back to default."""
|
||||
skin = _get_skin()
|
||||
if skin:
|
||||
faces = skin.get_spinner_list(key)
|
||||
if faces:
|
||||
return faces
|
||||
return default
|
||||
|
||||
|
||||
def get_skin_verbs() -> list:
|
||||
"""Get thinking verbs from active skin."""
|
||||
skin = _get_skin()
|
||||
if skin:
|
||||
verbs = skin.get_spinner_list("thinking_verbs")
|
||||
if verbs:
|
||||
return verbs
|
||||
return KawaiiSpinner.THINKING_VERBS
|
||||
|
||||
|
||||
def get_skin_tool_prefix() -> str:
|
||||
"""Get tool output prefix character from active skin."""
|
||||
skin = _get_skin()
|
||||
@@ -367,8 +330,9 @@ def _result_succeeded(result: str | None) -> bool:
|
||||
"""Conservatively detect whether a tool result represents success."""
|
||||
if not result:
|
||||
return False
|
||||
data = safe_json_loads(result)
|
||||
if data is None:
|
||||
try:
|
||||
data = json.loads(result)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
return False
|
||||
if not isinstance(data, dict):
|
||||
return False
|
||||
@@ -417,7 +381,10 @@ def extract_edit_diff(
|
||||
) -> str | None:
|
||||
"""Extract a unified diff from a file-edit tool result."""
|
||||
if tool_name == "patch" and result:
|
||||
data = safe_json_loads(result)
|
||||
try:
|
||||
data = json.loads(result)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
data = None
|
||||
if isinstance(data, dict):
|
||||
diff = data.get("diff")
|
||||
if isinstance(diff, str) and diff.strip():
|
||||
@@ -456,19 +423,19 @@ def _render_inline_unified_diff(diff: str) -> list[str]:
|
||||
if raw_line.startswith("+++ "):
|
||||
to_file = raw_line[4:].strip()
|
||||
if from_file or to_file:
|
||||
rendered.append(f"{_diff_file()}{from_file or 'a/?'} → {to_file or 'b/?'}{_ANSI_RESET}")
|
||||
rendered.append(f"{_ANSI_FILE}{from_file or 'a/?'} → {to_file or 'b/?'}{_ANSI_RESET}")
|
||||
continue
|
||||
if raw_line.startswith("@@"):
|
||||
rendered.append(f"{_diff_hunk()}{raw_line}{_ANSI_RESET}")
|
||||
rendered.append(f"{_ANSI_HUNK}{raw_line}{_ANSI_RESET}")
|
||||
continue
|
||||
if raw_line.startswith("-"):
|
||||
rendered.append(f"{_diff_minus()}{raw_line}{_ANSI_RESET}")
|
||||
rendered.append(f"{_ANSI_MINUS}{raw_line}{_ANSI_RESET}")
|
||||
continue
|
||||
if raw_line.startswith("+"):
|
||||
rendered.append(f"{_diff_plus()}{raw_line}{_ANSI_RESET}")
|
||||
rendered.append(f"{_ANSI_PLUS}{raw_line}{_ANSI_RESET}")
|
||||
continue
|
||||
if raw_line.startswith(" "):
|
||||
rendered.append(f"{_diff_dim()}{raw_line}{_ANSI_RESET}")
|
||||
rendered.append(f"{_ANSI_DIM}{raw_line}{_ANSI_RESET}")
|
||||
continue
|
||||
if raw_line:
|
||||
rendered.append(raw_line)
|
||||
@@ -534,7 +501,7 @@ def _summarize_rendered_diff_sections(
|
||||
summary = f"… omitted {omitted_lines} diff line(s)"
|
||||
if omitted_files:
|
||||
summary += f" across {omitted_files} additional file(s)/section(s)"
|
||||
rendered.append(f"{_diff_hunk()}{summary}{_ANSI_RESET}")
|
||||
rendered.append(f"{_ANSI_HUNK}{summary}{_ANSI_RESET}")
|
||||
|
||||
return rendered
|
||||
|
||||
@@ -756,6 +723,46 @@ class KawaiiSpinner:
|
||||
return False
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Kawaii face arrays (used by AIAgent._execute_tool_calls for spinner text)
|
||||
# =========================================================================
|
||||
|
||||
KAWAII_SEARCH = [
|
||||
"♪(´ε` )", "(。◕‿◕。)", "ヾ(^∇^)", "(◕ᴗ◕✿)", "( ˘▽˘)っ",
|
||||
"٩(◕‿◕。)۶", "(✿◠‿◠)", "♪~(´ε` )", "(ノ´ヮ`)ノ*:・゚✧", "\(◎o◎)/",
|
||||
]
|
||||
KAWAII_READ = [
|
||||
"φ(゜▽゜*)♪", "( ˘▽˘)っ", "(⌐■_■)", "٩(。•́‿•̀。)۶", "(◕‿◕✿)",
|
||||
"ヾ(@⌒ー⌒@)ノ", "(✧ω✧)", "♪(๑ᴖ◡ᴖ๑)♪", "(≧◡≦)", "( ´ ▽ ` )ノ",
|
||||
]
|
||||
KAWAII_TERMINAL = [
|
||||
"ヽ(>∀<☆)ノ", "(ノ°∀°)ノ", "٩(^ᴗ^)۶", "ヾ(⌐■_■)ノ♪", "(•̀ᴗ•́)و",
|
||||
"┗(^0^)┓", "(`・ω・´)", "\( ̄▽ ̄)/", "(ง •̀_•́)ง", "ヽ(´▽`)/",
|
||||
]
|
||||
KAWAII_BROWSER = [
|
||||
"(ノ°∀°)ノ", "(☞゚ヮ゚)☞", "( ͡° ͜ʖ ͡°)", "┌( ಠ_ಠ)┘", "(⊙_⊙)?",
|
||||
"ヾ(•ω•`)o", "( ̄ω ̄)", "( ˇωˇ )", "(ᵔᴥᵔ)", "\(◎o◎)/",
|
||||
]
|
||||
KAWAII_CREATE = [
|
||||
"✧*。٩(ˊᗜˋ*)و✧", "(ノ◕ヮ◕)ノ*:・゚✧", "ヽ(>∀<☆)ノ", "٩(♡ε♡)۶", "(◕‿◕)♡",
|
||||
"✿◕ ‿ ◕✿", "(*≧▽≦)", "ヾ(^-^)ノ", "(☆▽☆)", "°˖✧◝(⁰▿⁰)◜✧˖°",
|
||||
]
|
||||
KAWAII_SKILL = [
|
||||
"ヾ(@⌒ー⌒@)ノ", "(๑˃ᴗ˂)ﻭ", "٩(◕‿◕。)۶", "(✿╹◡╹)", "ヽ(・∀・)ノ",
|
||||
"(ノ´ヮ`)ノ*:・゚✧", "♪(๑ᴖ◡ᴖ๑)♪", "(◠‿◠)", "٩(ˊᗜˋ*)و", "(^▽^)",
|
||||
"ヾ(^∇^)", "(★ω★)/", "٩(。•́‿•̀。)۶", "(◕ᴗ◕✿)", "\(◎o◎)/",
|
||||
"(✧ω✧)", "ヽ(>∀<☆)ノ", "( ˘▽˘)っ", "(≧◡≦) ♡", "ヾ( ̄▽ ̄)",
|
||||
]
|
||||
KAWAII_THINK = [
|
||||
"(っ°Д°;)っ", "(;′⌒`)", "(・_・ヾ", "( ´_ゝ`)", "( ̄ヘ ̄)",
|
||||
"(。-`ω´-)", "( ˘︹˘ )", "(¬_¬)", "ヽ(ー_ー )ノ", "(;一_一)",
|
||||
]
|
||||
KAWAII_GENERIC = [
|
||||
"♪(´ε` )", "(◕‿◕✿)", "ヾ(^∇^)", "٩(◕‿◕。)۶", "(✿◠‿◠)",
|
||||
"(ノ´ヮ`)ノ*:・゚✧", "ヽ(>∀<☆)ノ", "(☆▽☆)", "( ˘▽˘)っ", "(≧◡≦)",
|
||||
]
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Cute tool message (completion line that replaces the spinner)
|
||||
# =========================================================================
|
||||
@@ -771,19 +778,23 @@ def _detect_tool_failure(tool_name: str, result: str | None) -> tuple[bool, str]
|
||||
return False, ""
|
||||
|
||||
if tool_name == "terminal":
|
||||
data = safe_json_loads(result)
|
||||
if isinstance(data, dict):
|
||||
try:
|
||||
data = json.loads(result)
|
||||
exit_code = data.get("exit_code")
|
||||
if exit_code is not None and exit_code != 0:
|
||||
return True, f" [exit {exit_code}]"
|
||||
except (json.JSONDecodeError, TypeError, AttributeError):
|
||||
logger.debug("Could not parse terminal result as JSON for exit code check")
|
||||
return False, ""
|
||||
|
||||
# Memory-specific: distinguish "full" from real errors
|
||||
if tool_name == "memory":
|
||||
data = safe_json_loads(result)
|
||||
if isinstance(data, dict):
|
||||
try:
|
||||
data = json.loads(result)
|
||||
if data.get("success") is False and "exceed the limit" in data.get("error", ""):
|
||||
return True, " [full]"
|
||||
except (json.JSONDecodeError, TypeError, AttributeError):
|
||||
logger.debug("Could not parse memory result as JSON for capacity check")
|
||||
|
||||
# Generic heuristic for non-terminal tools
|
||||
lower = result[:500].lower()
|
||||
@@ -959,6 +970,22 @@ _SKY_BLUE = "\033[38;5;117m"
|
||||
_ANSI_RESET = "\033[0m"
|
||||
|
||||
|
||||
def honcho_session_url(workspace: str, session_name: str) -> str:
|
||||
"""Build a Honcho app URL for a session."""
|
||||
from urllib.parse import quote
|
||||
return (
|
||||
f"https://app.honcho.dev/explore"
|
||||
f"?workspace={quote(workspace, safe='')}"
|
||||
f"&view=sessions"
|
||||
f"&session={quote(session_name, safe='')}"
|
||||
)
|
||||
|
||||
|
||||
def _osc8_link(url: str, text: str) -> str:
|
||||
"""OSC 8 terminal hyperlink (clickable in iTerm2, Ghostty, WezTerm, etc.)."""
|
||||
return f"\033]8;;{url}\033\\{text}\033]8;;\033\\"
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Context pressure display (CLI user-facing warnings)
|
||||
# =========================================================================
|
||||
|
||||
@@ -1,820 +0,0 @@
|
||||
"""API error classification for smart failover and recovery.
|
||||
|
||||
Provides a structured taxonomy of API errors and a priority-ordered
|
||||
classification pipeline that determines the correct recovery action
|
||||
(retry, rotate credential, fallback to another provider, compress
|
||||
context, or abort).
|
||||
|
||||
Replaces scattered inline string-matching with a centralized classifier
|
||||
that the main retry loop in run_agent.py consults for every API failure.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import enum
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ── Error taxonomy ──────────────────────────────────────────────────────
|
||||
|
||||
class FailoverReason(enum.Enum):
|
||||
"""Why an API call failed — determines recovery strategy."""
|
||||
|
||||
# Authentication / authorization
|
||||
auth = "auth" # Transient auth (401/403) — refresh/rotate
|
||||
auth_permanent = "auth_permanent" # Auth failed after refresh — abort
|
||||
|
||||
# Billing / quota
|
||||
billing = "billing" # 402 or confirmed credit exhaustion — rotate immediately
|
||||
rate_limit = "rate_limit" # 429 or quota-based throttling — backoff then rotate
|
||||
|
||||
# Server-side
|
||||
overloaded = "overloaded" # 503/529 — provider overloaded, backoff
|
||||
server_error = "server_error" # 500/502 — internal server error, retry
|
||||
|
||||
# Transport
|
||||
timeout = "timeout" # Connection/read timeout — rebuild client + retry
|
||||
|
||||
# Context / payload
|
||||
context_overflow = "context_overflow" # Context too large — compress, not failover
|
||||
payload_too_large = "payload_too_large" # 413 — compress payload
|
||||
|
||||
# Model
|
||||
model_not_found = "model_not_found" # 404 or invalid model — fallback to different model
|
||||
|
||||
# Request format
|
||||
format_error = "format_error" # 400 bad request — abort or strip + retry
|
||||
|
||||
# Provider-specific
|
||||
thinking_signature = "thinking_signature" # Anthropic thinking block sig invalid
|
||||
long_context_tier = "long_context_tier" # Anthropic "extra usage" tier gate
|
||||
|
||||
# Catch-all
|
||||
unknown = "unknown" # Unclassifiable — retry with backoff
|
||||
|
||||
|
||||
# ── Classification result ───────────────────────────────────────────────
|
||||
|
||||
@dataclass
|
||||
class ClassifiedError:
|
||||
"""Structured classification of an API error with recovery hints."""
|
||||
|
||||
reason: FailoverReason
|
||||
status_code: Optional[int] = None
|
||||
provider: Optional[str] = None
|
||||
model: Optional[str] = None
|
||||
message: str = ""
|
||||
error_context: Dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
# Recovery action hints — the retry loop checks these instead of
|
||||
# re-classifying the error itself.
|
||||
retryable: bool = True
|
||||
should_compress: bool = False
|
||||
should_rotate_credential: bool = False
|
||||
should_fallback: bool = False
|
||||
|
||||
@property
|
||||
def is_auth(self) -> bool:
|
||||
return self.reason in (FailoverReason.auth, FailoverReason.auth_permanent)
|
||||
|
||||
|
||||
|
||||
# ── Provider-specific patterns ──────────────────────────────────────────
|
||||
|
||||
# Patterns that indicate billing exhaustion (not transient rate limit)
|
||||
_BILLING_PATTERNS = [
|
||||
"insufficient credits",
|
||||
"insufficient_quota",
|
||||
"credit balance",
|
||||
"credits have been exhausted",
|
||||
"top up your credits",
|
||||
"payment required",
|
||||
"billing hard limit",
|
||||
"exceeded your current quota",
|
||||
"account is deactivated",
|
||||
"plan does not include",
|
||||
]
|
||||
|
||||
# Patterns that indicate rate limiting (transient, will resolve)
|
||||
_RATE_LIMIT_PATTERNS = [
|
||||
"rate limit",
|
||||
"rate_limit",
|
||||
"too many requests",
|
||||
"throttled",
|
||||
"requests per minute",
|
||||
"tokens per minute",
|
||||
"requests per day",
|
||||
"try again in",
|
||||
"please retry after",
|
||||
"resource_exhausted",
|
||||
"rate increased too quickly", # Alibaba/DashScope throttling
|
||||
]
|
||||
|
||||
# Usage-limit patterns that need disambiguation (could be billing OR rate_limit)
|
||||
_USAGE_LIMIT_PATTERNS = [
|
||||
"usage limit",
|
||||
"quota",
|
||||
"limit exceeded",
|
||||
"key limit exceeded",
|
||||
]
|
||||
|
||||
# Patterns confirming usage limit is transient (not billing)
|
||||
_USAGE_LIMIT_TRANSIENT_SIGNALS = [
|
||||
"try again",
|
||||
"retry",
|
||||
"resets at",
|
||||
"reset in",
|
||||
"wait",
|
||||
"requests remaining",
|
||||
"periodic",
|
||||
"window",
|
||||
]
|
||||
|
||||
# Payload-too-large patterns detected from message text (no status_code attr).
|
||||
# Proxies and some backends embed the HTTP status in the error message.
|
||||
_PAYLOAD_TOO_LARGE_PATTERNS = [
|
||||
"request entity too large",
|
||||
"payload too large",
|
||||
"error code: 413",
|
||||
]
|
||||
|
||||
# Context overflow patterns
|
||||
_CONTEXT_OVERFLOW_PATTERNS = [
|
||||
"context length",
|
||||
"context size",
|
||||
"maximum context",
|
||||
"token limit",
|
||||
"too many tokens",
|
||||
"reduce the length",
|
||||
"exceeds the limit",
|
||||
"context window",
|
||||
"prompt is too long",
|
||||
"prompt exceeds max length",
|
||||
"max_tokens",
|
||||
"maximum number of tokens",
|
||||
# vLLM / local inference server patterns
|
||||
"exceeds the max_model_len",
|
||||
"max_model_len",
|
||||
"prompt length", # "engine prompt length X exceeds"
|
||||
"input is too long",
|
||||
"maximum model length",
|
||||
# Ollama patterns
|
||||
"context length exceeded",
|
||||
"truncating input",
|
||||
# llama.cpp / llama-server patterns
|
||||
"slot context", # "slot context: N tokens, prompt N tokens"
|
||||
"n_ctx_slot",
|
||||
# Chinese error messages (some providers return these)
|
||||
"超过最大长度",
|
||||
"上下文长度",
|
||||
]
|
||||
|
||||
# Model not found patterns
|
||||
_MODEL_NOT_FOUND_PATTERNS = [
|
||||
"is not a valid model",
|
||||
"invalid model",
|
||||
"model not found",
|
||||
"model_not_found",
|
||||
"does not exist",
|
||||
"no such model",
|
||||
"unknown model",
|
||||
"unsupported model",
|
||||
]
|
||||
|
||||
# Auth patterns (non-status-code signals)
|
||||
_AUTH_PATTERNS = [
|
||||
"invalid api key",
|
||||
"invalid_api_key",
|
||||
"authentication",
|
||||
"unauthorized",
|
||||
"forbidden",
|
||||
"invalid token",
|
||||
"token expired",
|
||||
"token revoked",
|
||||
"access denied",
|
||||
]
|
||||
|
||||
# Anthropic thinking block signature patterns
|
||||
_THINKING_SIG_PATTERNS = [
|
||||
"signature", # Combined with "thinking" check
|
||||
]
|
||||
|
||||
# Transport error type names
|
||||
_TRANSPORT_ERROR_TYPES = frozenset({
|
||||
"ReadTimeout", "ConnectTimeout", "PoolTimeout",
|
||||
"ConnectError", "RemoteProtocolError",
|
||||
"ConnectionError", "ConnectionResetError",
|
||||
"ConnectionAbortedError", "BrokenPipeError",
|
||||
"TimeoutError", "ReadError",
|
||||
"ServerDisconnectedError",
|
||||
# OpenAI SDK errors (not subclasses of Python builtins)
|
||||
"APIConnectionError",
|
||||
"APITimeoutError",
|
||||
})
|
||||
|
||||
# Server disconnect patterns (no status code, but transport-level)
|
||||
_SERVER_DISCONNECT_PATTERNS = [
|
||||
"server disconnected",
|
||||
"peer closed connection",
|
||||
"connection reset by peer",
|
||||
"connection was closed",
|
||||
"network connection lost",
|
||||
"unexpected eof",
|
||||
"incomplete chunked read",
|
||||
]
|
||||
|
||||
|
||||
# ── Classification pipeline ─────────────────────────────────────────────
|
||||
|
||||
def classify_api_error(
|
||||
error: Exception,
|
||||
*,
|
||||
provider: str = "",
|
||||
model: str = "",
|
||||
approx_tokens: int = 0,
|
||||
context_length: int = 200000,
|
||||
num_messages: int = 0,
|
||||
) -> ClassifiedError:
|
||||
"""Classify an API error into a structured recovery recommendation.
|
||||
|
||||
Priority-ordered pipeline:
|
||||
1. Special-case provider-specific patterns (thinking sigs, tier gates)
|
||||
2. HTTP status code + message-aware refinement
|
||||
3. Error code classification (from body)
|
||||
4. Message pattern matching (billing vs rate_limit vs context vs auth)
|
||||
5. Transport error heuristics
|
||||
6. Server disconnect + large session → context overflow
|
||||
7. Fallback: unknown (retryable with backoff)
|
||||
|
||||
Args:
|
||||
error: The exception from the API call.
|
||||
provider: Current provider name (e.g. "openrouter", "anthropic").
|
||||
model: Current model slug.
|
||||
approx_tokens: Approximate token count of the current context.
|
||||
context_length: Maximum context length for the current model.
|
||||
|
||||
Returns:
|
||||
ClassifiedError with reason and recovery action hints.
|
||||
"""
|
||||
status_code = _extract_status_code(error)
|
||||
error_type = type(error).__name__
|
||||
body = _extract_error_body(error)
|
||||
error_code = _extract_error_code(body)
|
||||
|
||||
# Build a comprehensive error message string for pattern matching.
|
||||
# str(error) alone may not include the body message (e.g. OpenAI SDK's
|
||||
# APIStatusError.__str__ returns the first arg, not the body). Append
|
||||
# the body message so patterns like "try again" in 402 disambiguation
|
||||
# are detected even when only present in the structured body.
|
||||
#
|
||||
# Also extract metadata.raw — OpenRouter wraps upstream provider errors
|
||||
# inside {"error": {"message": "Provider returned error", "metadata":
|
||||
# {"raw": "<actual error JSON>"}}} and the real error message (e.g.
|
||||
# "context length exceeded") is only in the inner JSON.
|
||||
_raw_msg = str(error).lower()
|
||||
_body_msg = ""
|
||||
_metadata_msg = ""
|
||||
if isinstance(body, dict):
|
||||
_err_obj = body.get("error", {})
|
||||
if isinstance(_err_obj, dict):
|
||||
_body_msg = (_err_obj.get("message") or "").lower()
|
||||
# Parse metadata.raw for wrapped provider errors
|
||||
_metadata = _err_obj.get("metadata", {})
|
||||
if isinstance(_metadata, dict):
|
||||
_raw_json = _metadata.get("raw") or ""
|
||||
if isinstance(_raw_json, str) and _raw_json.strip():
|
||||
try:
|
||||
import json
|
||||
_inner = json.loads(_raw_json)
|
||||
if isinstance(_inner, dict):
|
||||
_inner_err = _inner.get("error", {})
|
||||
if isinstance(_inner_err, dict):
|
||||
_metadata_msg = (_inner_err.get("message") or "").lower()
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
pass
|
||||
if not _body_msg:
|
||||
_body_msg = (body.get("message") or "").lower()
|
||||
# Combine all message sources for pattern matching
|
||||
parts = [_raw_msg]
|
||||
if _body_msg and _body_msg not in _raw_msg:
|
||||
parts.append(_body_msg)
|
||||
if _metadata_msg and _metadata_msg not in _raw_msg and _metadata_msg not in _body_msg:
|
||||
parts.append(_metadata_msg)
|
||||
error_msg = " ".join(parts)
|
||||
provider_lower = (provider or "").strip().lower()
|
||||
model_lower = (model or "").strip().lower()
|
||||
|
||||
def _result(reason: FailoverReason, **overrides) -> ClassifiedError:
|
||||
defaults = {
|
||||
"reason": reason,
|
||||
"status_code": status_code,
|
||||
"provider": provider,
|
||||
"model": model,
|
||||
"message": _extract_message(error, body),
|
||||
}
|
||||
defaults.update(overrides)
|
||||
return ClassifiedError(**defaults)
|
||||
|
||||
# ── 1. Provider-specific patterns (highest priority) ────────────
|
||||
|
||||
# Anthropic thinking block signature invalid (400).
|
||||
# Don't gate on provider — OpenRouter proxies Anthropic errors, so the
|
||||
# provider may be "openrouter" even though the error is Anthropic-specific.
|
||||
# The message pattern ("signature" + "thinking") is unique enough.
|
||||
if (
|
||||
status_code == 400
|
||||
and "signature" in error_msg
|
||||
and "thinking" in error_msg
|
||||
):
|
||||
return _result(
|
||||
FailoverReason.thinking_signature,
|
||||
retryable=True,
|
||||
should_compress=False,
|
||||
)
|
||||
|
||||
# Anthropic long-context tier gate (429 "extra usage" + "long context")
|
||||
if (
|
||||
status_code == 429
|
||||
and "extra usage" in error_msg
|
||||
and "long context" in error_msg
|
||||
):
|
||||
return _result(
|
||||
FailoverReason.long_context_tier,
|
||||
retryable=True,
|
||||
should_compress=True,
|
||||
)
|
||||
|
||||
# ── 2. HTTP status code classification ──────────────────────────
|
||||
|
||||
if status_code is not None:
|
||||
classified = _classify_by_status(
|
||||
status_code, error_msg, error_code, body,
|
||||
provider=provider_lower, model=model_lower,
|
||||
approx_tokens=approx_tokens, context_length=context_length,
|
||||
num_messages=num_messages,
|
||||
result_fn=_result,
|
||||
)
|
||||
if classified is not None:
|
||||
return classified
|
||||
|
||||
# ── 3. Error code classification ────────────────────────────────
|
||||
|
||||
if error_code:
|
||||
classified = _classify_by_error_code(error_code, error_msg, _result)
|
||||
if classified is not None:
|
||||
return classified
|
||||
|
||||
# ── 4. Message pattern matching (no status code) ────────────────
|
||||
|
||||
classified = _classify_by_message(
|
||||
error_msg, error_type,
|
||||
approx_tokens=approx_tokens,
|
||||
context_length=context_length,
|
||||
result_fn=_result,
|
||||
)
|
||||
if classified is not None:
|
||||
return classified
|
||||
|
||||
# ── 5. Server disconnect + large session → context overflow ─────
|
||||
# Must come BEFORE generic transport error catch — a disconnect on
|
||||
# a large session is more likely context overflow than a transient
|
||||
# transport hiccup. Without this ordering, RemoteProtocolError
|
||||
# always maps to timeout regardless of session size.
|
||||
|
||||
is_disconnect = any(p in error_msg for p in _SERVER_DISCONNECT_PATTERNS)
|
||||
if is_disconnect and not status_code:
|
||||
is_large = approx_tokens > context_length * 0.6 or approx_tokens > 120000 or num_messages > 200
|
||||
if is_large:
|
||||
return _result(
|
||||
FailoverReason.context_overflow,
|
||||
retryable=True,
|
||||
should_compress=True,
|
||||
)
|
||||
return _result(FailoverReason.timeout, retryable=True)
|
||||
|
||||
# ── 6. Transport / timeout heuristics ───────────────────────────
|
||||
|
||||
if error_type in _TRANSPORT_ERROR_TYPES or isinstance(error, (TimeoutError, ConnectionError, OSError)):
|
||||
return _result(FailoverReason.timeout, retryable=True)
|
||||
|
||||
# ── 7. Fallback: unknown ────────────────────────────────────────
|
||||
|
||||
return _result(FailoverReason.unknown, retryable=True)
|
||||
|
||||
|
||||
# ── Status code classification ──────────────────────────────────────────
|
||||
|
||||
def _classify_by_status(
    status_code: int,
    error_msg: str,
    error_code: str,
    body: dict,
    *,
    provider: str,
    model: str,
    approx_tokens: int,
    context_length: int,
    num_messages: int = 0,
    result_fn,
) -> Optional[ClassifiedError]:
    """Classify based on HTTP status code with message-aware refinement.

    Args:
        status_code: HTTP status from the failed API call.
        error_msg: Lowercased combined error text (str(error) + body message).
        error_code: Structured error code from the body, if any.
        body: Parsed JSON error body ({} when unavailable).
        provider: Lowercased provider name.
        model: Lowercased model slug.
        approx_tokens: Approximate token count of the current context.
        context_length: Maximum context length for the current model.
        num_messages: Number of messages in the session.
        result_fn: Factory that builds a ClassifiedError with shared defaults.

    Returns:
        A ClassifiedError, or None when the status code alone is not
        conclusive (caller falls through to code/message classification).
    """
    if status_code == 401:
        # Not retryable on its own — credential pool rotation and
        # provider-specific refresh (Codex, Anthropic, Nous) run before
        # the retryability check in run_agent.py. If those succeed, the
        # loop `continue`s. If they fail, retryable=False ensures we
        # hit the client-error abort path (which tries fallback first).
        return result_fn(
            FailoverReason.auth,
            retryable=False,
            should_rotate_credential=True,
            should_fallback=True,
        )

    if status_code == 403:
        # OpenRouter 403 "key limit exceeded" is actually billing
        if "key limit exceeded" in error_msg or "spending limit" in error_msg:
            return result_fn(
                FailoverReason.billing,
                retryable=False,
                should_rotate_credential=True,
                should_fallback=True,
            )
        return result_fn(
            FailoverReason.auth,
            retryable=False,
            should_fallback=True,
        )

    if status_code == 402:
        return _classify_402(error_msg, result_fn)

    if status_code == 404:
        # Whether or not the message matches _MODEL_NOT_FOUND_PATTERNS, a
        # 404 maps to model_not_found — a generic 404 could also be a bad
        # endpoint, but the recovery hint (fallback) is identical, so the
        # previous pattern check was a dead branch and has been removed.
        return result_fn(
            FailoverReason.model_not_found,
            retryable=False,
            should_fallback=True,
        )

    if status_code == 413:
        # Request entity too large — compress the transcript and retry.
        return result_fn(
            FailoverReason.payload_too_large,
            retryable=True,
            should_compress=True,
        )

    if status_code == 429:
        # Already checked long_context_tier above; this is a normal rate limit
        return result_fn(
            FailoverReason.rate_limit,
            retryable=True,
            should_rotate_credential=True,
            should_fallback=True,
        )

    if status_code == 400:
        return _classify_400(
            error_msg, error_code, body,
            provider=provider, model=model,
            approx_tokens=approx_tokens,
            context_length=context_length,
            num_messages=num_messages,
            result_fn=result_fn,
        )

    if status_code in (503, 529):
        # Provider overloaded (Anthropic uses 529 for overloaded_error).
        return result_fn(FailoverReason.overloaded, retryable=True)

    # Other 4xx — non-retryable
    if 400 <= status_code < 500:
        return result_fn(
            FailoverReason.format_error,
            retryable=False,
            should_fallback=True,
        )

    # Any other 5xx (including 500/502, which previously had a redundant
    # dedicated branch) — retryable server error.
    if 500 <= status_code < 600:
        return result_fn(FailoverReason.server_error, retryable=True)

    return None
|
||||
|
||||
|
||||
def _classify_402(error_msg: str, result_fn) -> ClassifiedError:
    """Disambiguate a 402: billing exhaustion vs a transient usage limit.

    The key insight from OpenClaw: some 402s are transient rate limits
    disguised as payment errors. "Usage limit, try again in 5 minutes"
    is NOT a billing problem — it's a periodic quota that resets.
    """
    looks_like_usage_limit = any(
        pattern in error_msg for pattern in _USAGE_LIMIT_PATTERNS
    )
    looks_transient = any(
        signal in error_msg for signal in _USAGE_LIMIT_TRANSIENT_SIGNALS
    )

    if looks_like_usage_limit and looks_transient:
        # Periodic quota that will reset on its own — classify as a
        # rate limit so the caller retries instead of aborting.
        return result_fn(
            FailoverReason.rate_limit,
            retryable=True,
            should_rotate_credential=True,
            should_fallback=True,
        )

    # No transient signal — treat as genuine billing exhaustion.
    return result_fn(
        FailoverReason.billing,
        retryable=False,
        should_rotate_credential=True,
        should_fallback=True,
    )
|
||||
|
||||
|
||||
def _classify_400(
    error_msg: str,
    error_code: str,
    body: dict,
    *,
    provider: str,
    model: str,
    approx_tokens: int,
    context_length: int,
    num_messages: int = 0,
    result_fn,
) -> ClassifiedError:
    """Classify 400 Bad Request — context overflow, format error, or generic."""

    def _matches(patterns) -> bool:
        # All patterns and error_msg are lowercase; simple substring test.
        return any(p in error_msg for p in patterns)

    # Explicit context-overflow wording → compress and retry.
    if _matches(_CONTEXT_OVERFLOW_PATTERNS):
        return result_fn(
            FailoverReason.context_overflow,
            retryable=True,
            should_compress=True,
        )

    # Some providers return model-not-found as 400 instead of 404 (e.g. OpenRouter).
    if _matches(_MODEL_NOT_FOUND_PATTERNS):
        return result_fn(
            FailoverReason.model_not_found,
            retryable=False,
            should_fallback=True,
        )

    # Rate-limit / billing errors occasionally arrive as 400 instead of
    # 429/402 — check those patterns before falling through to format_error.
    if _matches(_RATE_LIMIT_PATTERNS):
        return result_fn(
            FailoverReason.rate_limit,
            retryable=True,
            should_rotate_credential=True,
            should_fallback=True,
        )
    if _matches(_BILLING_PATTERNS):
        return result_fn(
            FailoverReason.billing,
            retryable=False,
            should_rotate_credential=True,
            should_fallback=True,
        )

    # Generic 400 + large session → probable context overflow.
    # Anthropic sometimes returns a bare "Error" message when the context
    # is too large, so a short/empty body message on a big session is
    # treated as overflow rather than a request-format problem.
    body_message = ""
    if isinstance(body, dict):
        inner = body.get("error", {})
        if isinstance(inner, dict):
            body_message = (inner.get("message") or "").strip().lower()
        if not body_message:
            # Responses API (and some providers) use a flat {"message": ...} body.
            body_message = (body.get("message") or "").strip().lower()

    message_is_generic = len(body_message) < 30 or body_message in ("error", "")
    session_is_large = (
        approx_tokens > context_length * 0.4
        or approx_tokens > 80000
        or num_messages > 80
    )
    if message_is_generic and session_is_large:
        return result_fn(
            FailoverReason.context_overflow,
            retryable=True,
            should_compress=True,
        )

    # Otherwise: a non-retryable request-format error.
    return result_fn(
        FailoverReason.format_error,
        retryable=False,
        should_fallback=True,
    )
|
||||
|
||||
|
||||
# ── Error code classification ───────────────────────────────────────────
|
||||
|
||||
def _classify_by_error_code(
    error_code: str, error_msg: str, result_fn,
) -> Optional[ClassifiedError]:
    """Classify by structured error codes from the response body.

    Returns None when the code is not one of the known values, letting
    the caller fall through to message-pattern classification.
    """
    code = error_code.lower()

    # (matching codes, reason, recovery-hint kwargs for the factory)
    dispatch = (
        (
            {"resource_exhausted", "throttled", "rate_limit_exceeded"},
            FailoverReason.rate_limit,
            {"retryable": True, "should_rotate_credential": True},
        ),
        (
            {"insufficient_quota", "billing_not_active", "payment_required"},
            FailoverReason.billing,
            {"retryable": False, "should_rotate_credential": True, "should_fallback": True},
        ),
        (
            {"model_not_found", "model_not_available", "invalid_model"},
            FailoverReason.model_not_found,
            {"retryable": False, "should_fallback": True},
        ),
        (
            {"context_length_exceeded", "max_tokens_exceeded"},
            FailoverReason.context_overflow,
            {"retryable": True, "should_compress": True},
        ),
    )

    for known_codes, reason, hints in dispatch:
        if code in known_codes:
            return result_fn(reason, **hints)

    return None
|
||||
|
||||
|
||||
# ── Message pattern classification ──────────────────────────────────────
|
||||
|
||||
def _classify_by_message(
    error_msg: str,
    error_type: str,
    *,
    approx_tokens: int,
    context_length: int,
    result_fn,
) -> Optional[ClassifiedError]:
    """Classify based on error message patterns when no status code is available."""

    def _hit(patterns) -> bool:
        return any(p in error_msg for p in patterns)

    # Payload-too-large wording (no HTTP 413 status available to tell us).
    if _hit(_PAYLOAD_TOO_LARGE_PATTERNS):
        return result_fn(
            FailoverReason.payload_too_large,
            retryable=True,
            should_compress=True,
        )

    # Usage-limit wording needs the same disambiguation as a 402: some
    # providers surface "usage limit" errors without an HTTP status code.
    # A transient signal ("try again", "resets at", …) means a periodic
    # quota, not billing exhaustion.
    if _hit(_USAGE_LIMIT_PATTERNS):
        if _hit(_USAGE_LIMIT_TRANSIENT_SIGNALS):
            return result_fn(
                FailoverReason.rate_limit,
                retryable=True,
                should_rotate_credential=True,
                should_fallback=True,
            )
        return result_fn(
            FailoverReason.billing,
            retryable=False,
            should_rotate_credential=True,
            should_fallback=True,
        )

    # Billing wording.
    if _hit(_BILLING_PATTERNS):
        return result_fn(
            FailoverReason.billing,
            retryable=False,
            should_rotate_credential=True,
            should_fallback=True,
        )

    # Rate-limit wording.
    if _hit(_RATE_LIMIT_PATTERNS):
        return result_fn(
            FailoverReason.rate_limit,
            retryable=True,
            should_rotate_credential=True,
            should_fallback=True,
        )

    # Context-overflow wording.
    if _hit(_CONTEXT_OVERFLOW_PATTERNS):
        return result_fn(
            FailoverReason.context_overflow,
            retryable=True,
            should_compress=True,
        )

    # Auth wording. Auth errors should NOT be retried directly — the
    # credential is invalid and retrying with the same key will always
    # fail. retryable=False makes the caller rotate credentials
    # (should_rotate_credential=True) or fall back to another provider
    # instead of looping on an immediate retry.
    if _hit(_AUTH_PATTERNS):
        return result_fn(
            FailoverReason.auth,
            retryable=False,
            should_rotate_credential=True,
            should_fallback=True,
        )

    # Model-not-found wording.
    if _hit(_MODEL_NOT_FOUND_PATTERNS):
        return result_fn(
            FailoverReason.model_not_found,
            retryable=False,
            should_fallback=True,
        )

    return None
|
||||
|
||||
|
||||
# ── Helpers ─────────────────────────────────────────────────────────────
|
||||
|
||||
def _extract_status_code(error: Exception) -> Optional[int]:
|
||||
"""Walk the error and its cause chain to find an HTTP status code."""
|
||||
current = error
|
||||
for _ in range(5): # Max depth to prevent infinite loops
|
||||
code = getattr(current, "status_code", None)
|
||||
if isinstance(code, int):
|
||||
return code
|
||||
# Some SDKs use .status instead of .status_code
|
||||
code = getattr(current, "status", None)
|
||||
if isinstance(code, int) and 100 <= code < 600:
|
||||
return code
|
||||
# Walk cause chain
|
||||
cause = getattr(current, "__cause__", None) or getattr(current, "__context__", None)
|
||||
if cause is None or cause is current:
|
||||
break
|
||||
current = cause
|
||||
return None
|
||||
|
||||
|
||||
def _extract_error_body(error: Exception) -> dict:
|
||||
"""Extract the structured error body from an SDK exception."""
|
||||
body = getattr(error, "body", None)
|
||||
if isinstance(body, dict):
|
||||
return body
|
||||
# Some errors have .response.json()
|
||||
response = getattr(error, "response", None)
|
||||
if response is not None:
|
||||
try:
|
||||
json_body = response.json()
|
||||
if isinstance(json_body, dict):
|
||||
return json_body
|
||||
except Exception:
|
||||
pass
|
||||
return {}
|
||||
|
||||
|
||||
def _extract_error_code(body: dict) -> str:
|
||||
"""Extract an error code string from the response body."""
|
||||
if not body:
|
||||
return ""
|
||||
error_obj = body.get("error", {})
|
||||
if isinstance(error_obj, dict):
|
||||
code = error_obj.get("code") or error_obj.get("type") or ""
|
||||
if isinstance(code, str) and code.strip():
|
||||
return code.strip()
|
||||
# Top-level code
|
||||
code = body.get("code") or body.get("error_code") or ""
|
||||
if isinstance(code, (str, int)):
|
||||
return str(code).strip()
|
||||
return ""
|
||||
|
||||
|
||||
def _extract_message(error: Exception, body: dict) -> str:
|
||||
"""Extract the most informative error message."""
|
||||
# Try structured body first
|
||||
if body:
|
||||
error_obj = body.get("error", {})
|
||||
if isinstance(error_obj, dict):
|
||||
msg = error_obj.get("message", "")
|
||||
if isinstance(msg, str) and msg.strip():
|
||||
return msg.strip()[:500]
|
||||
msg = body.get("message", "")
|
||||
if isinstance(msg, str) and msg.strip():
|
||||
return msg.strip()[:500]
|
||||
# Fallback to str(error)
|
||||
return str(error)[:500]
|
||||
@@ -27,6 +27,7 @@ from agent.usage_pricing import (
|
||||
DEFAULT_PRICING,
|
||||
estimate_usage_cost,
|
||||
format_duration_compact,
|
||||
get_pricing,
|
||||
has_known_pricing,
|
||||
)
|
||||
|
||||
@@ -38,6 +39,15 @@ def _has_known_pricing(model_name: str, provider: str = None, base_url: str = No
|
||||
return has_known_pricing(model_name, provider=provider, base_url=base_url)
|
||||
|
||||
|
||||
def _get_pricing(model_name: str) -> Dict[str, float]:
    """Look up pricing for a model. Uses fuzzy matching on model name.

    Thin wrapper delegating to ``get_pricing`` from ``agent.usage_pricing``
    (imported at the top of this module).

    Returns _DEFAULT_PRICING (zero cost) for unknown/custom models —
    we can't assume costs for self-hosted endpoints, local inference, etc.
    """
    return get_pricing(model_name)
|
||||
|
||||
|
||||
def _estimate_cost(
|
||||
session_or_model: Dict[str, Any] | str,
|
||||
input_tokens: int = 0,
|
||||
|
||||
@@ -1,49 +0,0 @@
|
||||
"""User-facing summaries for manual compression commands."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Sequence
|
||||
|
||||
|
||||
def summarize_manual_compression(
    before_messages: Sequence[dict[str, Any]],
    after_messages: Sequence[dict[str, Any]],
    before_tokens: int,
    after_tokens: int,
) -> dict[str, Any]:
    """Return consistent user-facing feedback for manual compression.

    Args:
        before_messages: Transcript before compression.
        after_messages: Transcript after compression.
        before_tokens: Rough token estimate before compression.
        after_tokens: Rough token estimate after compression.

    Returns:
        Dict with keys ``noop``, ``headline``, ``token_line``, ``note``.
    """
    before_count = len(before_messages)
    after_count = len(after_messages)
    unchanged = list(before_messages) == list(after_messages)

    arrow_line = (
        f"Rough transcript estimate: ~{before_tokens:,} → "
        f"~{after_tokens:,} tokens"
    )

    if unchanged:
        headline = f"No changes from compression: {before_count} messages"
        if after_tokens == before_tokens:
            token_line = (
                f"Rough transcript estimate: ~{before_tokens:,} tokens (unchanged)"
            )
        else:
            token_line = arrow_line
    else:
        headline = f"Compressed: {before_count} → {after_count} messages"
        token_line = arrow_line

    # Fewer messages but more estimated tokens deserves an explanation,
    # since it looks contradictory to the user.
    note = None
    if not unchanged and after_count < before_count and after_tokens > before_tokens:
        note = (
            "Note: fewer messages can still raise this rough transcript estimate "
            "when compression rewrites the transcript into denser summaries."
        )

    return {
        "noop": unchanged,
        "headline": headline,
        "token_line": token_line,
        "note": note,
    }
|
||||
@@ -28,6 +28,7 @@ Usage in run_agent.py:
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional
|
||||
@@ -133,6 +134,11 @@ class MemoryManager:
|
||||
"""All registered providers in order."""
|
||||
return list(self._providers)
|
||||
|
||||
@property
def provider_names(self) -> List[str]:
    """Return the names of every registered provider, in registration order."""
    return [provider.name for provider in self._providers]
|
||||
|
||||
def get_provider(self, name: str) -> Optional[MemoryProvider]:
|
||||
"""Get a provider by name, or None if not registered."""
|
||||
for p in self._providers:
|
||||
|
||||
@@ -5,6 +5,7 @@ and run_agent.py for pre-flight context checks.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
@@ -23,20 +24,15 @@ logger = logging.getLogger(__name__)
|
||||
# are preserved so the full model name reaches cache lookups and server queries.
|
||||
_PROVIDER_PREFIXES: frozenset[str] = frozenset({
|
||||
"openrouter", "nous", "openai-codex", "copilot", "copilot-acp",
|
||||
"gemini", "zai", "kimi-coding", "kimi-coding-cn", "minimax", "minimax-cn", "anthropic", "deepseek",
|
||||
"gemini", "zai", "kimi-coding", "minimax", "minimax-cn", "anthropic", "deepseek",
|
||||
"opencode-zen", "opencode-go", "ai-gateway", "kilocode", "alibaba",
|
||||
"qwen-oauth",
|
||||
"xiaomi",
|
||||
"arcee",
|
||||
"custom", "local",
|
||||
# Common aliases
|
||||
"google", "google-gemini", "google-ai-studio",
|
||||
"glm", "z-ai", "z.ai", "zhipu", "github", "github-copilot",
|
||||
"github-models", "kimi", "moonshot", "kimi-cn", "moonshot-cn", "claude", "deep-seek",
|
||||
"github-models", "kimi", "moonshot", "claude", "deep-seek",
|
||||
"opencode", "zen", "go", "vercel", "kilo", "dashscope", "aliyun", "qwen",
|
||||
"mimo", "xiaomi-mimo",
|
||||
"arcee-ai", "arceeai",
|
||||
"xai", "x-ai", "x.ai", "grok",
|
||||
"qwen-portal",
|
||||
})
|
||||
|
||||
@@ -87,11 +83,6 @@ CONTEXT_PROBE_TIERS = [
|
||||
# Default context length when no detection method succeeds.
|
||||
DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]
|
||||
|
||||
# Minimum context length required to run Hermes Agent. Models with fewer
|
||||
# tokens cannot maintain enough working memory for tool-calling workflows.
|
||||
# Sessions, model switches, and cron jobs should reject models below this.
|
||||
MINIMUM_CONTEXT_LENGTH = 64_000
|
||||
|
||||
# Thin fallback defaults — only broad model family patterns.
|
||||
# These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
|
||||
# all miss. Replaced the previous 80+ entry dict.
|
||||
@@ -107,15 +98,9 @@ DEFAULT_CONTEXT_LENGTHS = {
|
||||
"claude-sonnet-4.6": 1000000,
|
||||
# Catch-all for older Claude models (must sort after specific entries)
|
||||
"claude": 200000,
|
||||
# OpenAI — GPT-5 family (most have 400k; specific overrides first)
|
||||
# Source: https://developers.openai.com/api/docs/models
|
||||
"gpt-5.4-nano": 400000, # 400k (not 1.05M like full 5.4)
|
||||
"gpt-5.4-mini": 400000, # 400k (not 1.05M like full 5.4)
|
||||
"gpt-5.4": 1050000, # GPT-5.4, GPT-5.4 Pro (1.05M context)
|
||||
"gpt-5.3-codex-spark": 128000, # Spark variant has reduced 128k context
|
||||
"gpt-5.1-chat": 128000, # Chat variant has 128k context
|
||||
"gpt-5": 400000, # GPT-5.x base, mini, codex variants (400k)
|
||||
# OpenAI
|
||||
"gpt-4.1": 1047576,
|
||||
"gpt-5": 128000,
|
||||
"gpt-4": 128000,
|
||||
# Google
|
||||
"gemini": 1048576,
|
||||
@@ -128,48 +113,33 @@ DEFAULT_CONTEXT_LENGTHS = {
|
||||
"deepseek": 128000,
|
||||
# Meta
|
||||
"llama": 131072,
|
||||
# Qwen — specific model families before the catch-all.
|
||||
# Official docs: https://help.aliyun.com/zh/model-studio/developer-reference/
|
||||
"qwen3-coder-plus": 1000000, # 1M context
|
||||
"qwen3-coder": 262144, # 256K context
|
||||
# Qwen
|
||||
"qwen": 131072,
|
||||
# MiniMax — official docs: 204,800 context for all models
|
||||
# https://platform.minimax.io/docs/api-reference/text-anthropic-api
|
||||
"minimax": 204800,
|
||||
# MiniMax (lowercase — lookup lowercases model names at line 973)
|
||||
"minimax-m1-256k": 1000000,
|
||||
"minimax-m1-128k": 1000000,
|
||||
"minimax-m1-80k": 1000000,
|
||||
"minimax-m1-40k": 1000000,
|
||||
"minimax-m1": 1000000,
|
||||
"minimax-m2.5": 1048576,
|
||||
"minimax-m2.7": 1048576,
|
||||
"minimax": 1048576,
|
||||
# GLM
|
||||
"glm": 202752,
|
||||
# xAI Grok — xAI /v1/models does not return context_length metadata,
|
||||
# so these hardcoded fallbacks prevent Hermes from probing-down to
|
||||
# the default 128k when the user points at https://api.x.ai/v1
|
||||
# via a custom provider. Values sourced from models.dev (2026-04).
|
||||
# Keys use substring matching (longest-first), so e.g. "grok-4.20"
|
||||
# matches "grok-4.20-0309-reasoning" / "-non-reasoning" / "-multi-agent-0309".
|
||||
"grok-code-fast": 256000, # grok-code-fast-1
|
||||
"grok-4-1-fast": 2000000, # grok-4-1-fast-(non-)reasoning
|
||||
"grok-2-vision": 8192, # grok-2-vision, -1212, -latest
|
||||
"grok-4-fast": 2000000, # grok-4-fast-(non-)reasoning
|
||||
"grok-4.20": 2000000, # grok-4.20-0309-(non-)reasoning, -multi-agent-0309
|
||||
"grok-4": 256000, # grok-4, grok-4-0709
|
||||
"grok-3": 131072, # grok-3, grok-3-mini, grok-3-fast, grok-3-mini-fast
|
||||
"grok-2": 131072, # grok-2, grok-2-1212, grok-2-latest
|
||||
"grok": 131072, # catch-all (grok-beta, unknown grok-*)
|
||||
# Kimi
|
||||
"kimi": 262144,
|
||||
# Arcee
|
||||
"trinity": 262144,
|
||||
# OpenRouter
|
||||
"elephant": 262144,
|
||||
# Hugging Face Inference Providers — model IDs use org/name format
|
||||
"Qwen/Qwen3.5-397B-A17B": 131072,
|
||||
"Qwen/Qwen3.5-35B-A3B": 131072,
|
||||
"deepseek-ai/DeepSeek-V3.2": 65536,
|
||||
"moonshotai/Kimi-K2.5": 262144,
|
||||
"moonshotai/Kimi-K2-Thinking": 262144,
|
||||
"MiniMaxAI/MiniMax-M2.5": 204800,
|
||||
"XiaomiMiMo/MiMo-V2-Flash": 256000,
|
||||
"mimo-v2-pro": 1000000,
|
||||
"mimo-v2-omni": 256000,
|
||||
"mimo-v2-flash": 256000,
|
||||
"MiniMaxAI/MiniMax-M2.5": 1048576,
|
||||
"XiaomiMiMo/MiMo-V2-Flash": 32768,
|
||||
"mimo-v2-pro": 1048576,
|
||||
"mimo-v2-omni": 1048576,
|
||||
"zai-org/GLM-5": 202752,
|
||||
}
|
||||
|
||||
@@ -194,12 +164,6 @@ _MAX_COMPLETION_KEYS = (
|
||||
|
||||
# Local server hostnames / address patterns
|
||||
_LOCAL_HOSTS = ("localhost", "127.0.0.1", "::1", "0.0.0.0")
|
||||
# Docker / Podman / Lima DNS names that resolve to the host machine
|
||||
_CONTAINER_LOCAL_SUFFIXES = (
|
||||
".docker.internal",
|
||||
".containers.internal",
|
||||
".lima.internal",
|
||||
)
|
||||
|
||||
|
||||
def _normalize_base_url(base_url: str) -> str:
|
||||
@@ -221,9 +185,7 @@ _URL_TO_PROVIDER: Dict[str, str] = {
|
||||
"api.anthropic.com": "anthropic",
|
||||
"api.z.ai": "zai",
|
||||
"api.moonshot.ai": "kimi-coding",
|
||||
"api.moonshot.cn": "kimi-coding-cn",
|
||||
"api.kimi.com": "kimi-coding",
|
||||
"api.arcee.ai": "arcee",
|
||||
"api.minimax": "minimax",
|
||||
"dashscope.aliyuncs.com": "alibaba",
|
||||
"dashscope-intl.aliyuncs.com": "alibaba",
|
||||
@@ -235,10 +197,6 @@ _URL_TO_PROVIDER: Dict[str, str] = {
|
||||
"api.githubcopilot.com": "copilot",
|
||||
"models.github.ai": "copilot",
|
||||
"api.fireworks.ai": "fireworks",
|
||||
"opencode.ai": "opencode-go",
|
||||
"api.x.ai": "xai",
|
||||
"api.xiaomimimo.com": "xiaomi",
|
||||
"xiaomimimo.com": "xiaomi",
|
||||
}
|
||||
|
||||
|
||||
@@ -277,9 +235,6 @@ def is_local_endpoint(base_url: str) -> bool:
|
||||
return False
|
||||
if host in _LOCAL_HOSTS:
|
||||
return True
|
||||
# Docker / Podman / Lima internal DNS names (e.g. host.docker.internal)
|
||||
if any(host.endswith(suffix) for suffix in _CONTAINER_LOCAL_SUFFIXES):
|
||||
return True
|
||||
# RFC-1918 private ranges and link-local
|
||||
import ipaddress
|
||||
try:
|
||||
@@ -647,49 +602,6 @@ def parse_context_limit_from_error(error_msg: str) -> Optional[int]:
|
||||
return None
|
||||
|
||||
|
||||
def parse_available_output_tokens_from_error(error_msg: str) -> Optional[int]:
    """Detect an "output cap too large" error and return how many output tokens are available.

    Background — two distinct context errors exist:
      1. "Prompt too long" — the INPUT itself exceeds the context window.
         Fix: compress history and/or halve context_length.
      2. "max_tokens too large" — input is fine, but input + requested_output > window.
         Fix: reduce max_tokens (the output cap) for this call.
         Do NOT touch context_length — the window hasn't shrunk.

    Anthropic's API returns errors like:
        "max_tokens: 32768 > context_window: 200000 - input_tokens: 190000 = available_tokens: 10000"

    Returns the number of output tokens that would fit (e.g. 10000 above), or None if
    the error does not look like a max_tokens-too-large error.
    """
    lowered = error_msg.lower()

    # Gate on output-cap wording so prompt-length errors never match.
    if "max_tokens" not in lowered:
        return None
    if "available_tokens" not in lowered and "available tokens" not in lowered:
        return None

    # Extraction patterns, most specific first. The final pattern is a
    # fallback for arithmetic tails like "… 200000 - 190000 = 10000".
    extraction_patterns = (
        r'available_tokens[:\s]+(\d+)',
        r'available\s+tokens[:\s]+(\d+)',
        r'=\s*(\d+)\s*$',
    )
    for pattern in extraction_patterns:
        match = re.search(pattern, lowered)
        if match is None:
            continue
        tokens = int(match.group(1))
        if tokens >= 1:
            return tokens
    return None
|
||||
|
||||
|
||||
def _model_id_matches(candidate_id: str, lookup_model: str) -> bool:
|
||||
"""Return True if *candidate_id* (from server) matches *lookup_model* (configured).
|
||||
|
||||
@@ -787,12 +699,12 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
|
||||
resp = client.post(f"{server_url}/api/show", json={"name": model})
|
||||
if resp.status_code == 200:
|
||||
data = resp.json()
|
||||
# Prefer explicit num_ctx from Modelfile parameters: this is
|
||||
# the *runtime* context Ollama will actually allocate KV cache
|
||||
# for. The GGUF model_info.context_length is the training max,
|
||||
# which can be larger than num_ctx — using it here would let
|
||||
# Hermes grow conversations past the runtime limit and Ollama
|
||||
# would silently truncate. Matches query_ollama_num_ctx().
|
||||
# Check model_info for context length
|
||||
model_info = data.get("model_info", {})
|
||||
for key, value in model_info.items():
|
||||
if "context_length" in key and isinstance(value, (int, float)):
|
||||
return int(value)
|
||||
# Check parameters string for num_ctx
|
||||
params = data.get("parameters", "")
|
||||
if "num_ctx" in params:
|
||||
for line in params.split("\n"):
|
||||
@@ -803,11 +715,6 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
|
||||
return int(parts[-1])
|
||||
except ValueError:
|
||||
pass
|
||||
# Fall back to GGUF model_info context_length (training max)
|
||||
model_info = data.get("model_info", {})
|
||||
for key, value in model_info.items():
|
||||
if "context_length" in key and isinstance(value, (int, float)):
|
||||
return int(value)
|
||||
|
||||
# LM Studio native API: /api/v1/models returns max_context_length.
|
||||
# This is more reliable than the OpenAI-compat /v1/models which
|
||||
@@ -1062,21 +969,16 @@ def get_model_context_length(
|
||||
|
||||
|
||||
def estimate_tokens_rough(text: str) -> int:
|
||||
"""Rough token estimate (~4 chars/token) for pre-flight checks.
|
||||
|
||||
Uses ceiling division so short texts (1-3 chars) never estimate as
|
||||
0 tokens, which would cause the compressor and pre-flight checks to
|
||||
systematically undercount when many short tool results are present.
|
||||
"""
|
||||
"""Rough token estimate (~4 chars/token) for pre-flight checks."""
|
||||
if not text:
|
||||
return 0
|
||||
return (len(text) + 3) // 4
|
||||
return len(text) // 4
|
||||
|
||||
|
||||
def estimate_messages_tokens_rough(messages: List[Dict[str, Any]]) -> int:
|
||||
"""Rough token estimate for a message list (pre-flight only)."""
|
||||
total_chars = sum(len(str(msg)) for msg in messages)
|
||||
return (total_chars + 3) // 4
|
||||
return total_chars // 4
|
||||
|
||||
|
||||
def estimate_request_tokens_rough(
|
||||
@@ -1099,4 +1001,4 @@ def estimate_request_tokens_rough(
|
||||
total_chars += sum(len(str(msg)) for msg in messages)
|
||||
if tools:
|
||||
total_chars += len(str(tools))
|
||||
return (total_chars + 3) // 4
|
||||
return total_chars // 4
|
||||
|
||||
@@ -18,8 +18,10 @@ Other modules should import the dataclasses and query functions from here
|
||||
rather than parsing the raw JSON themselves.
|
||||
"""
|
||||
|
||||
import difflib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
@@ -133,6 +135,9 @@ class ProviderInfo:
|
||||
doc: str = "" # documentation URL
|
||||
model_count: int = 0
|
||||
|
||||
def has_api_url(self) -> bool:
|
||||
return bool(self.api)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Provider ID mapping: Hermes ↔ models.dev
|
||||
@@ -142,11 +147,8 @@ class ProviderInfo:
|
||||
PROVIDER_TO_MODELS_DEV: Dict[str, str] = {
|
||||
"openrouter": "openrouter",
|
||||
"anthropic": "anthropic",
|
||||
"openai": "openai",
|
||||
"openai-codex": "openai",
|
||||
"zai": "zai",
|
||||
"kimi-coding": "kimi-for-coding",
|
||||
"kimi-coding-cn": "kimi-for-coding",
|
||||
"minimax": "minimax",
|
||||
"minimax-cn": "minimax-cn",
|
||||
"deepseek": "deepseek",
|
||||
@@ -162,7 +164,6 @@ PROVIDER_TO_MODELS_DEV: Dict[str, str] = {
|
||||
"gemini": "google",
|
||||
"google": "google",
|
||||
"xai": "xai",
|
||||
"xiaomi": "xiaomi",
|
||||
"nvidia": "nvidia",
|
||||
"groq": "groq",
|
||||
"mistral": "mistral",
|
||||
@@ -175,6 +176,13 @@ PROVIDER_TO_MODELS_DEV: Dict[str, str] = {
|
||||
_MODELS_DEV_TO_PROVIDER: Optional[Dict[str, str]] = None
|
||||
|
||||
|
||||
def _get_reverse_mapping() -> Dict[str, str]:
|
||||
"""Return models.dev ID → Hermes provider ID mapping."""
|
||||
global _MODELS_DEV_TO_PROVIDER
|
||||
if _MODELS_DEV_TO_PROVIDER is None:
|
||||
_MODELS_DEV_TO_PROVIDER = {v: k for k, v in PROVIDER_TO_MODELS_DEV.items()}
|
||||
return _MODELS_DEV_TO_PROVIDER
|
||||
|
||||
|
||||
def _get_cache_path() -> Path:
|
||||
"""Return path to disk cache file."""
|
||||
@@ -378,14 +386,7 @@ def get_model_capabilities(provider: str, model: str) -> Optional[ModelCapabilit
|
||||
|
||||
# Extract capability flags (default to False if missing)
|
||||
supports_tools = bool(entry.get("tool_call", False))
|
||||
# Vision: check both the `attachment` flag and `modalities.input` for "image".
|
||||
# Some models (e.g. gemma-4) list image in input modalities but not attachment.
|
||||
input_mods = entry.get("modalities", {})
|
||||
if isinstance(input_mods, dict):
|
||||
input_mods = input_mods.get("input", [])
|
||||
else:
|
||||
input_mods = []
|
||||
supports_vision = bool(entry.get("attachment", False)) or "image" in input_mods
|
||||
supports_vision = bool(entry.get("attachment", False))
|
||||
supports_reasoning = bool(entry.get("reasoning", False))
|
||||
|
||||
# Extract limits
|
||||
@@ -455,6 +456,93 @@ def list_agentic_models(provider: str) -> List[str]:
|
||||
return result
|
||||
|
||||
|
||||
def search_models_dev(
|
||||
query: str, provider: str = None, limit: int = 5
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""Fuzzy search across models.dev catalog. Returns matching model entries.
|
||||
|
||||
Args:
|
||||
query: Search string to match against model IDs.
|
||||
provider: Optional Hermes provider ID to restrict search scope.
|
||||
If None, searches across all providers in PROVIDER_TO_MODELS_DEV.
|
||||
limit: Maximum number of results to return.
|
||||
|
||||
Returns:
|
||||
List of dicts, each containing 'provider', 'model_id', and the full
|
||||
model 'entry' from models.dev.
|
||||
"""
|
||||
data = fetch_models_dev()
|
||||
if not data:
|
||||
return []
|
||||
|
||||
# Build list of (provider_id, model_id, entry) candidates
|
||||
candidates: List[tuple] = []
|
||||
|
||||
if provider is not None:
|
||||
# Search only the specified provider
|
||||
mdev_provider_id = PROVIDER_TO_MODELS_DEV.get(provider)
|
||||
if not mdev_provider_id:
|
||||
return []
|
||||
provider_data = data.get(mdev_provider_id, {})
|
||||
if isinstance(provider_data, dict):
|
||||
models = provider_data.get("models", {})
|
||||
if isinstance(models, dict):
|
||||
for mid, mdata in models.items():
|
||||
candidates.append((provider, mid, mdata))
|
||||
else:
|
||||
# Search across all mapped providers
|
||||
for hermes_prov, mdev_prov in PROVIDER_TO_MODELS_DEV.items():
|
||||
provider_data = data.get(mdev_prov, {})
|
||||
if isinstance(provider_data, dict):
|
||||
models = provider_data.get("models", {})
|
||||
if isinstance(models, dict):
|
||||
for mid, mdata in models.items():
|
||||
candidates.append((hermes_prov, mid, mdata))
|
||||
|
||||
if not candidates:
|
||||
return []
|
||||
|
||||
# Use difflib for fuzzy matching — case-insensitive comparison
|
||||
model_ids_lower = [c[1].lower() for c in candidates]
|
||||
query_lower = query.lower()
|
||||
|
||||
# First try exact substring matches (more intuitive than pure edit-distance)
|
||||
substring_matches = []
|
||||
for prov, mid, mdata in candidates:
|
||||
if query_lower in mid.lower():
|
||||
substring_matches.append({"provider": prov, "model_id": mid, "entry": mdata})
|
||||
|
||||
# Then add difflib fuzzy matches for any remaining slots
|
||||
fuzzy_ids = difflib.get_close_matches(
|
||||
query_lower, model_ids_lower, n=limit * 2, cutoff=0.4
|
||||
)
|
||||
|
||||
seen_ids: set = set()
|
||||
results: List[Dict[str, Any]] = []
|
||||
|
||||
# Prioritize substring matches
|
||||
for match in substring_matches:
|
||||
key = (match["provider"], match["model_id"])
|
||||
if key not in seen_ids:
|
||||
seen_ids.add(key)
|
||||
results.append(match)
|
||||
if len(results) >= limit:
|
||||
return results
|
||||
|
||||
# Add fuzzy matches
|
||||
for fid in fuzzy_ids:
|
||||
# Find original-case candidates matching this lowered ID
|
||||
for prov, mid, mdata in candidates:
|
||||
if mid.lower() == fid:
|
||||
key = (prov, mid)
|
||||
if key not in seen_ids:
|
||||
seen_ids.add(key)
|
||||
results.append({"provider": prov, "model_id": mid, "entry": mdata})
|
||||
if len(results) >= limit:
|
||||
return results
|
||||
|
||||
return results
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Rich dataclass constructors — parse raw models.dev JSON into dataclasses
|
||||
@@ -546,6 +634,43 @@ def get_provider_info(provider_id: str) -> Optional[ProviderInfo]:
|
||||
return _parse_provider_info(mdev_id, raw)
|
||||
|
||||
|
||||
def list_all_providers() -> Dict[str, ProviderInfo]:
|
||||
"""Return all providers from models.dev as {provider_id: ProviderInfo}.
|
||||
|
||||
Returns the full catalog — 109+ providers. For providers that have
|
||||
a Hermes alias, both the models.dev ID and the Hermes ID are included.
|
||||
"""
|
||||
data = fetch_models_dev()
|
||||
result: Dict[str, ProviderInfo] = {}
|
||||
|
||||
for pid, pdata in data.items():
|
||||
if isinstance(pdata, dict):
|
||||
info = _parse_provider_info(pid, pdata)
|
||||
result[pid] = info
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def get_providers_for_env_var(env_var: str) -> List[str]:
|
||||
"""Reverse lookup: find all providers that use a given env var.
|
||||
|
||||
Useful for auto-detection: "user has ANTHROPIC_API_KEY set, which
|
||||
providers does that enable?"
|
||||
|
||||
Returns list of models.dev provider IDs.
|
||||
"""
|
||||
data = fetch_models_dev()
|
||||
matches: List[str] = []
|
||||
|
||||
for pid, pdata in data.items():
|
||||
if isinstance(pdata, dict):
|
||||
env = pdata.get("env", [])
|
||||
if isinstance(env, list) and env_var in env:
|
||||
matches.append(pid)
|
||||
|
||||
return matches
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Model-level queries (rich ModelInfo)
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -583,3 +708,74 @@ def get_model_info(
|
||||
return None
|
||||
|
||||
|
||||
def get_model_info_any_provider(model_id: str) -> Optional[ModelInfo]:
|
||||
"""Search all providers for a model by ID.
|
||||
|
||||
Useful when you have a full slug like "anthropic/claude-sonnet-4.6" or
|
||||
a bare name and want to find it anywhere. Checks Hermes-mapped providers
|
||||
first, then falls back to all models.dev providers.
|
||||
"""
|
||||
data = fetch_models_dev()
|
||||
|
||||
# Try Hermes-mapped providers first (more likely what the user wants)
|
||||
for hermes_id, mdev_id in PROVIDER_TO_MODELS_DEV.items():
|
||||
pdata = data.get(mdev_id)
|
||||
if not isinstance(pdata, dict):
|
||||
continue
|
||||
models = pdata.get("models", {})
|
||||
if not isinstance(models, dict):
|
||||
continue
|
||||
|
||||
raw = models.get(model_id)
|
||||
if isinstance(raw, dict):
|
||||
return _parse_model_info(model_id, raw, mdev_id)
|
||||
|
||||
# Case-insensitive
|
||||
model_lower = model_id.lower()
|
||||
for mid, mdata in models.items():
|
||||
if mid.lower() == model_lower and isinstance(mdata, dict):
|
||||
return _parse_model_info(mid, mdata, mdev_id)
|
||||
|
||||
# Fall back to ALL providers
|
||||
for pid, pdata in data.items():
|
||||
if pid in _get_reverse_mapping():
|
||||
continue # already checked
|
||||
if not isinstance(pdata, dict):
|
||||
continue
|
||||
models = pdata.get("models", {})
|
||||
if not isinstance(models, dict):
|
||||
continue
|
||||
|
||||
raw = models.get(model_id)
|
||||
if isinstance(raw, dict):
|
||||
return _parse_model_info(model_id, raw, pid)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def list_provider_model_infos(provider_id: str) -> List[ModelInfo]:
|
||||
"""Return all models for a provider as ModelInfo objects.
|
||||
|
||||
Filters out deprecated models by default.
|
||||
"""
|
||||
mdev_id = PROVIDER_TO_MODELS_DEV.get(provider_id, provider_id)
|
||||
|
||||
data = fetch_models_dev()
|
||||
pdata = data.get(mdev_id)
|
||||
if not isinstance(pdata, dict):
|
||||
return []
|
||||
|
||||
models = pdata.get("models", {})
|
||||
if not isinstance(models, dict):
|
||||
return []
|
||||
|
||||
result: List[ModelInfo] = []
|
||||
for mid, mdata in models.items():
|
||||
if not isinstance(mdata, dict):
|
||||
continue
|
||||
status = mdata.get("status", "")
|
||||
if status == "deprecated":
|
||||
continue
|
||||
result.append(_parse_model_info(mid, mdata, mdev_id))
|
||||
|
||||
return result
|
||||
|
||||
184
agent/mtls.py
184
agent/mtls.py
@@ -1,184 +0,0 @@
|
||||
"""
|
||||
agent/mtls.py — Mutual TLS support for Hermes A2A communication.
|
||||
|
||||
Provides:
|
||||
- build_server_ssl_context() — SSL context for uvicorn that requires client certs
|
||||
- build_client_ssl_context() — SSL context for httpx/aiohttp A2A clients
|
||||
- MTLSMiddleware — FastAPI middleware that enforces client cert on A2A routes
|
||||
- is_mtls_configured() — Check if env vars are set
|
||||
|
||||
Configuration (environment variables):
|
||||
HERMES_MTLS_CERT Path to this agent's TLS certificate (PEM)
|
||||
HERMES_MTLS_KEY Path to this agent's TLS private key (PEM)
|
||||
HERMES_MTLS_CA Path to the Fleet CA certificate (PEM) — used to verify peers
|
||||
|
||||
All three must be set to enable mTLS. If any is missing, mTLS is disabled and
|
||||
the server falls back to plain HTTP (or regular TLS without client auth).
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import ssl
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# A2A routes that require a valid client certificate when mTLS is enabled.
|
||||
_A2A_PATH_PREFIXES = (
|
||||
"/.well-known/agent-card",
|
||||
"/agent-card",
|
||||
"/api/agent-card",
|
||||
"/a2a/",
|
||||
)
|
||||
|
||||
|
||||
def _get_env(key: str) -> Optional[str]:
|
||||
val = os.environ.get(key, "").strip()
|
||||
return val or None
|
||||
|
||||
|
||||
def is_mtls_configured() -> bool:
|
||||
"""Return True if all three mTLS env vars are set and the files exist."""
|
||||
cert = _get_env("HERMES_MTLS_CERT")
|
||||
key = _get_env("HERMES_MTLS_KEY")
|
||||
ca = _get_env("HERMES_MTLS_CA")
|
||||
if not (cert and key and ca):
|
||||
return False
|
||||
for label, path in (("HERMES_MTLS_CERT", cert), ("HERMES_MTLS_KEY", key), ("HERMES_MTLS_CA", ca)):
|
||||
if not Path(path).is_file():
|
||||
logger.warning("mTLS disabled: %s file not found: %s", label, path)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
def build_server_ssl_context() -> ssl.SSLContext:
|
||||
"""
|
||||
Build an SSL context for the A2A server that:
|
||||
- presents its own certificate
|
||||
- requires and verifies the client's certificate against the Fleet CA
|
||||
|
||||
Raises:
|
||||
RuntimeError: if mTLS env vars are not set or files are missing
|
||||
ssl.SSLError: if cert/key/CA files are invalid
|
||||
"""
|
||||
cert = _get_env("HERMES_MTLS_CERT")
|
||||
key = _get_env("HERMES_MTLS_KEY")
|
||||
ca = _get_env("HERMES_MTLS_CA")
|
||||
|
||||
if not (cert and key and ca):
|
||||
raise RuntimeError(
|
||||
"mTLS not configured. Set HERMES_MTLS_CERT, HERMES_MTLS_KEY, and HERMES_MTLS_CA."
|
||||
)
|
||||
|
||||
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
|
||||
ctx.minimum_version = ssl.TLSVersion.TLSv1_2
|
||||
ctx.load_cert_chain(certfile=cert, keyfile=key)
|
||||
ctx.load_verify_locations(cafile=ca)
|
||||
# CERT_REQUIRED: reject connections without a valid client cert
|
||||
ctx.verify_mode = ssl.CERT_REQUIRED
|
||||
logger.info("mTLS server context built (cert=%s, CA=%s)", cert, ca)
|
||||
return ctx
|
||||
|
||||
|
||||
def build_client_ssl_context() -> ssl.SSLContext:
|
||||
"""
|
||||
Build an SSL context for outbound A2A connections that:
|
||||
- presents this agent's certificate as a client cert
|
||||
- verifies the remote server against the Fleet CA
|
||||
|
||||
Raises:
|
||||
RuntimeError: if mTLS env vars are not set or files are missing
|
||||
ssl.SSLError: if cert/key/CA files are invalid
|
||||
"""
|
||||
cert = _get_env("HERMES_MTLS_CERT")
|
||||
key = _get_env("HERMES_MTLS_KEY")
|
||||
ca = _get_env("HERMES_MTLS_CA")
|
||||
|
||||
if not (cert and key and ca):
|
||||
raise RuntimeError(
|
||||
"mTLS not configured. Set HERMES_MTLS_CERT, HERMES_MTLS_KEY, and HERMES_MTLS_CA."
|
||||
)
|
||||
|
||||
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
|
||||
ctx.minimum_version = ssl.TLSVersion.TLSv1_2
|
||||
ctx.load_cert_chain(certfile=cert, keyfile=key)
|
||||
ctx.load_verify_locations(cafile=ca)
|
||||
ctx.verify_mode = ssl.CERT_REQUIRED
|
||||
ctx.check_hostname = True
|
||||
logger.info("mTLS client context built (cert=%s, CA=%s)", cert, ca)
|
||||
return ctx
|
||||
|
||||
|
||||
def get_peer_cn(ssl_object) -> Optional[str]:
|
||||
"""Extract the CN from the peer certificate's subject, or None."""
|
||||
try:
|
||||
peer_cert = ssl_object.getpeercert()
|
||||
if not peer_cert:
|
||||
return None
|
||||
for rdn in peer_cert.get("subject", ()):
|
||||
for attr, value in rdn:
|
||||
if attr == "commonName":
|
||||
return value
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
class MTLSMiddleware:
|
||||
"""
|
||||
ASGI middleware that enforces client certificate verification on A2A routes.
|
||||
|
||||
When mTLS is NOT configured (no env vars) or the route is not an A2A route,
|
||||
the request passes through unchanged.
|
||||
|
||||
When mTLS IS configured and the route matches an A2A prefix, the middleware
|
||||
checks that the request arrived over a TLS connection with a verified client
|
||||
certificate. If not, it returns HTTP 403.
|
||||
|
||||
Note: This middleware only provides defence-in-depth at the app layer.
|
||||
The primary enforcement is at the SSL context level (CERT_REQUIRED on the
|
||||
server context). This middleware is useful when the server runs behind a
|
||||
TLS-terminating proxy that forwards cert info via headers (not yet
|
||||
implemented) or for test-time injection.
|
||||
"""
|
||||
|
||||
def __init__(self, app):
|
||||
self.app = app
|
||||
self._enabled = is_mtls_configured()
|
||||
if self._enabled:
|
||||
logger.info("MTLSMiddleware enabled — A2A routes require client cert")
|
||||
|
||||
def _is_a2a_route(self, path: str) -> bool:
|
||||
return any(path.startswith(prefix) for prefix in _A2A_PATH_PREFIXES)
|
||||
|
||||
async def __call__(self, scope, receive, send):
|
||||
if scope["type"] == "http" and self._enabled and self._is_a2a_route(scope.get("path", "")):
|
||||
# Check for client cert in the SSL connection
|
||||
transport = scope.get("extensions", {}).get("tls", {})
|
||||
peer_cert = transport.get("peer_cert")
|
||||
if peer_cert is None:
|
||||
# No client cert — reject
|
||||
response = _forbidden_response("Client certificate required for A2A endpoints")
|
||||
await response(scope, receive, send)
|
||||
return
|
||||
|
||||
await self.app(scope, receive, send)
|
||||
|
||||
|
||||
def _forbidden_response(message: str):
|
||||
"""Return a minimal ASGI 403 response."""
|
||||
body = message.encode()
|
||||
|
||||
async def respond(scope, receive, send):
|
||||
await send({
|
||||
"type": "http.response.start",
|
||||
"status": 403,
|
||||
"headers": [
|
||||
(b"content-type", b"text/plain"),
|
||||
(b"content-length", str(len(body)).encode()),
|
||||
],
|
||||
})
|
||||
await send({"type": "http.response.body", "body": body})
|
||||
|
||||
return respond
|
||||
@@ -1,353 +0,0 @@
|
||||
"""Privacy Filter — strip PII from context before remote API calls.
|
||||
|
||||
Implements Vitalik's Pattern 2: "A local model can strip out private data
|
||||
before passing the query along to a remote LLM."
|
||||
|
||||
When Hermes routes a request to a cloud provider (Anthropic, OpenRouter, etc.),
|
||||
this module sanitizes the message context to remove personally identifiable
|
||||
information before it leaves the user's machine.
|
||||
|
||||
Threat model (from Vitalik's secure LLM architecture):
|
||||
- Privacy (other): Non-LLM data leakage via search queries, API calls
|
||||
- LLM accidents: LLM accidentally leaking private data in prompts
|
||||
- LLM jailbreaks: Remote content extracting private context
|
||||
|
||||
Usage:
|
||||
from agent.privacy_filter import PrivacyFilter, sanitize_messages
|
||||
|
||||
pf = PrivacyFilter()
|
||||
safe_messages = pf.sanitize_messages(messages)
|
||||
# safe_messages has PII replaced with [REDACTED] tokens
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum, auto
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Sensitivity(Enum):
|
||||
"""Classification of content sensitivity."""
|
||||
PUBLIC = auto() # No PII detected
|
||||
LOW = auto() # Generic references (e.g., city names)
|
||||
MEDIUM = auto() # Personal identifiers (name, email, phone)
|
||||
HIGH = auto() # Secrets, keys, financial data, medical info
|
||||
CRITICAL = auto() # Crypto keys, passwords, SSN patterns
|
||||
|
||||
|
||||
@dataclass
|
||||
class RedactionReport:
|
||||
"""Summary of what was redacted from a message batch."""
|
||||
total_messages: int = 0
|
||||
redacted_messages: int = 0
|
||||
redactions: List[Dict[str, Any]] = field(default_factory=list)
|
||||
max_sensitivity: Sensitivity = Sensitivity.PUBLIC
|
||||
|
||||
@property
|
||||
def had_redactions(self) -> bool:
|
||||
return self.redacted_messages > 0
|
||||
|
||||
def summary(self) -> str:
|
||||
if not self.had_redactions:
|
||||
return "No PII detected — context is clean for remote query."
|
||||
parts = [f"Redacted {self.redacted_messages}/{self.total_messages} messages:"]
|
||||
for r in self.redactions[:10]:
|
||||
parts.append(f" - {r['type']}: {r['count']} occurrence(s)")
|
||||
if len(self.redactions) > 10:
|
||||
parts.append(f" ... and {len(self.redactions) - 10} more types")
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# PII pattern definitions
|
||||
# =========================================================================
|
||||
|
||||
# Each pattern is (compiled_regex, redaction_type, sensitivity_level, replacement)
|
||||
_PII_PATTERNS: List[Tuple[re.Pattern, str, Sensitivity, str]] = []
|
||||
|
||||
|
||||
def _compile_patterns() -> None:
|
||||
"""Compile PII detection patterns. Called once at module init."""
|
||||
global _PII_PATTERNS
|
||||
if _PII_PATTERNS:
|
||||
return
|
||||
|
||||
raw_patterns = [
|
||||
# --- CRITICAL: secrets and credentials ---
|
||||
(
|
||||
r'(?:api[_-]?key|apikey|secret[_-]?key|access[_-]?token)\s*[:=]\s*["\']?([A-Za-z0-9_\-\.]{20,})["\']?',
|
||||
"api_key_or_token",
|
||||
Sensitivity.CRITICAL,
|
||||
"[REDACTED-API-KEY]",
|
||||
),
|
||||
(
|
||||
r'\b(?:sk-|sk_|pk_|rk_|ak_)[A-Za-z0-9]{20,}\b',
|
||||
"prefixed_secret",
|
||||
Sensitivity.CRITICAL,
|
||||
"[REDACTED-SECRET]",
|
||||
),
|
||||
(
|
||||
r'\b(?:ghp_|gho_|ghu_|ghs_|ghr_)[A-Za-z0-9]{36,}\b',
|
||||
"github_token",
|
||||
Sensitivity.CRITICAL,
|
||||
"[REDACTED-GITHUB-TOKEN]",
|
||||
),
|
||||
(
|
||||
r'\b(?:xox[bposa]-[A-Za-z0-9\-]+)\b',
|
||||
"slack_token",
|
||||
Sensitivity.CRITICAL,
|
||||
"[REDACTED-SLACK-TOKEN]",
|
||||
),
|
||||
(
|
||||
r'(?:password|passwd|pwd)\s*[:=]\s*["\']?([^\s"\']{4,})["\']?',
|
||||
"password",
|
||||
Sensitivity.CRITICAL,
|
||||
"[REDACTED-PASSWORD]",
|
||||
),
|
||||
(
|
||||
r'(?:-----BEGIN (?:RSA |EC |OPENSSH )?PRIVATE KEY-----)',
|
||||
"private_key_block",
|
||||
Sensitivity.CRITICAL,
|
||||
"[REDACTED-PRIVATE-KEY]",
|
||||
),
|
||||
# Ethereum / crypto addresses (42-char hex starting with 0x)
|
||||
(
|
||||
r'\b0x[a-fA-F0-9]{40}\b',
|
||||
"ethereum_address",
|
||||
Sensitivity.HIGH,
|
||||
"[REDACTED-ETH-ADDR]",
|
||||
),
|
||||
# Bitcoin addresses (base58, 25-34 chars starting with 1/3/bc1)
|
||||
(
|
||||
r'\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b',
|
||||
"bitcoin_address",
|
||||
Sensitivity.HIGH,
|
||||
"[REDACTED-BTC-ADDR]",
|
||||
),
|
||||
(
|
||||
r'\bbc1[a-zA-HJ-NP-Z0-9]{39,59}\b',
|
||||
"bech32_address",
|
||||
Sensitivity.HIGH,
|
||||
"[REDACTED-BTC-ADDR]",
|
||||
),
|
||||
# --- HIGH: financial ---
|
||||
(
|
||||
r'\b(?:\d{4}[-\s]?){3}\d{4}\b',
|
||||
"credit_card_number",
|
||||
Sensitivity.HIGH,
|
||||
"[REDACTED-CC]",
|
||||
),
|
||||
(
|
||||
r'\b\d{3}-\d{2}-\d{4}\b',
|
||||
"us_ssn",
|
||||
Sensitivity.HIGH,
|
||||
"[REDACTED-SSN]",
|
||||
),
|
||||
# --- MEDIUM: personal identifiers ---
|
||||
# Email addresses
|
||||
(
|
||||
r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b',
|
||||
"email_address",
|
||||
Sensitivity.MEDIUM,
|
||||
"[REDACTED-EMAIL]",
|
||||
),
|
||||
# Phone numbers (US/international patterns)
|
||||
(
|
||||
r'\b(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b',
|
||||
"phone_number_us",
|
||||
Sensitivity.MEDIUM,
|
||||
"[REDACTED-PHONE]",
|
||||
),
|
||||
(
|
||||
r'\b\+\d{1,3}[-.\s]?\d{4,14}\b',
|
||||
"phone_number_intl",
|
||||
Sensitivity.MEDIUM,
|
||||
"[REDACTED-PHONE]",
|
||||
),
|
||||
# Filesystem paths that reveal user identity
|
||||
(
|
||||
r'(?:/Users/|/home/|C:\\Users\\)([A-Za-z0-9_\-]+)',
|
||||
"user_home_path",
|
||||
Sensitivity.MEDIUM,
|
||||
r"/Users/[REDACTED-USER]",
|
||||
),
|
||||
# --- LOW: environment / system info ---
|
||||
# Internal IPs
|
||||
(
|
||||
r'\b(?:10\.\d{1,3}\.\d{1,3}\.\d{1,3}|172\.(?:1[6-9]|2\d|3[01])\.\d{1,3}\.\d{1,3}|192\.168\.\d{1,3}\.\d{1,3})\b',
|
||||
"internal_ip",
|
||||
Sensitivity.LOW,
|
||||
"[REDACTED-IP]",
|
||||
),
|
||||
]
|
||||
|
||||
_PII_PATTERNS = [
|
||||
(re.compile(pattern, re.IGNORECASE), rtype, sensitivity, replacement)
|
||||
for pattern, rtype, sensitivity, replacement in raw_patterns
|
||||
]
|
||||
|
||||
|
||||
_compile_patterns()
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Sensitive file path patterns (context-aware)
|
||||
# =========================================================================
|
||||
|
||||
_SENSITIVE_PATH_PATTERNS = [
|
||||
re.compile(r'\.(?:env|pem|key|p12|pfx|jks|keystore)\b', re.IGNORECASE),
|
||||
re.compile(r'(?:\.ssh/|\.gnupg/|\.aws/|\.config/gcloud/)', re.IGNORECASE),
|
||||
re.compile(r'(?:wallet|keystore|seed|mnemonic)', re.IGNORECASE),
|
||||
re.compile(r'(?:\.hermes/\.env)', re.IGNORECASE),
|
||||
]
|
||||
|
||||
|
||||
def _classify_path_sensitivity(path: str) -> Sensitivity:
|
||||
"""Check if a file path references sensitive material."""
|
||||
for pat in _SENSITIVE_PATH_PATTERNS:
|
||||
if pat.search(path):
|
||||
return Sensitivity.HIGH
|
||||
return Sensitivity.PUBLIC
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Core filtering
|
||||
# =========================================================================
|
||||
|
||||
class PrivacyFilter:
|
||||
"""Strip PII from message context before remote API calls.
|
||||
|
||||
Integrates with the agent's message pipeline. Call sanitize_messages()
|
||||
before sending context to any cloud LLM provider.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
min_sensitivity: Sensitivity = Sensitivity.MEDIUM,
|
||||
aggressive_mode: bool = False,
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
min_sensitivity: Only redact PII at or above this level.
|
||||
Default MEDIUM — redacts emails, phones, paths but not IPs.
|
||||
aggressive_mode: If True, also redact file paths and internal IPs.
|
||||
"""
|
||||
self.min_sensitivity = (
|
||||
Sensitivity.LOW if aggressive_mode else min_sensitivity
|
||||
)
|
||||
self.aggressive_mode = aggressive_mode
|
||||
|
||||
def sanitize_text(self, text: str) -> Tuple[str, List[Dict[str, Any]]]:
|
||||
"""Sanitize a single text string. Returns (cleaned_text, redaction_list)."""
|
||||
redactions = []
|
||||
cleaned = text
|
||||
|
||||
for pattern, rtype, sensitivity, replacement in _PII_PATTERNS:
|
||||
if sensitivity.value < self.min_sensitivity.value:
|
||||
continue
|
||||
|
||||
matches = pattern.findall(cleaned)
|
||||
if matches:
|
||||
count = len(matches) if isinstance(matches[0], str) else sum(
|
||||
1 for m in matches if m
|
||||
)
|
||||
if count > 0:
|
||||
cleaned = pattern.sub(replacement, cleaned)
|
||||
redactions.append({
|
||||
"type": rtype,
|
||||
"sensitivity": sensitivity.name,
|
||||
"count": count,
|
||||
})
|
||||
|
||||
return cleaned, redactions
|
||||
|
||||
def sanitize_messages(
|
||||
self, messages: List[Dict[str, Any]]
|
||||
) -> Tuple[List[Dict[str, Any]], RedactionReport]:
|
||||
"""Sanitize a list of OpenAI-format messages.
|
||||
|
||||
Returns (safe_messages, report). System messages are NOT sanitized
|
||||
(they're typically static prompts). Only user and assistant messages
|
||||
with string content are processed.
|
||||
|
||||
Args:
|
||||
messages: List of {"role": ..., "content": ...} dicts.
|
||||
|
||||
Returns:
|
||||
Tuple of (sanitized_messages, redaction_report).
|
||||
"""
|
||||
report = RedactionReport(total_messages=len(messages))
|
||||
safe_messages = []
|
||||
|
||||
for msg in messages:
|
||||
role = msg.get("role", "")
|
||||
content = msg.get("content", "")
|
||||
|
||||
# Only sanitize user/assistant string content
|
||||
if role in ("user", "assistant") and isinstance(content, str) and content:
|
||||
cleaned, redactions = self.sanitize_text(content)
|
||||
if redactions:
|
||||
report.redacted_messages += 1
|
||||
report.redactions.extend(redactions)
|
||||
# Track max sensitivity
|
||||
for r in redactions:
|
||||
s = Sensitivity[r["sensitivity"]]
|
||||
if s.value > report.max_sensitivity.value:
|
||||
report.max_sensitivity = s
|
||||
safe_msg = {**msg, "content": cleaned}
|
||||
safe_messages.append(safe_msg)
|
||||
logger.info(
|
||||
"Privacy filter: redacted %d PII type(s) from %s message",
|
||||
len(redactions), role,
|
||||
)
|
||||
else:
|
||||
safe_messages.append(msg)
|
||||
else:
|
||||
safe_messages.append(msg)
|
||||
|
||||
return safe_messages, report
|
||||
|
||||
def should_use_local_only(self, text: str) -> Tuple[bool, str]:
|
||||
"""Determine if content is too sensitive for any remote call.
|
||||
|
||||
Returns (should_block, reason). If True, the content should only
|
||||
be processed by a local model.
|
||||
"""
|
||||
_, redactions = self.sanitize_text(text)
|
||||
|
||||
critical_count = sum(
|
||||
1 for r in redactions
|
||||
if Sensitivity[r["sensitivity"]] == Sensitivity.CRITICAL
|
||||
)
|
||||
high_count = sum(
|
||||
1 for r in redactions
|
||||
if Sensitivity[r["sensitivity"]] == Sensitivity.HIGH
|
||||
)
|
||||
|
||||
if critical_count > 0:
|
||||
return True, f"Contains {critical_count} critical-secret pattern(s) — local-only"
|
||||
if high_count >= 3:
|
||||
return True, f"Contains {high_count} high-sensitivity pattern(s) — local-only"
|
||||
return False, ""
|
||||
|
||||
|
||||
def sanitize_messages(
|
||||
messages: List[Dict[str, Any]],
|
||||
min_sensitivity: Sensitivity = Sensitivity.MEDIUM,
|
||||
aggressive: bool = False,
|
||||
) -> Tuple[List[Dict[str, Any]], RedactionReport]:
|
||||
"""Convenience function: sanitize messages with default settings."""
|
||||
pf = PrivacyFilter(min_sensitivity=min_sensitivity, aggressive_mode=aggressive)
|
||||
return pf.sanitize_messages(messages)
|
||||
|
||||
|
||||
def quick_sanitize(text: str) -> str:
|
||||
"""Quick sanitize a single string — returns cleaned text only."""
|
||||
pf = PrivacyFilter()
|
||||
cleaned, _ = pf.sanitize_text(text)
|
||||
return cleaned
|
||||
@@ -1,262 +0,0 @@
|
||||
"""
|
||||
Profile Session Isolation — #891
|
||||
|
||||
Tags sessions with their originating profile and provides
|
||||
filtered access so profiles cannot see each other's data.
|
||||
|
||||
Current state: All sessions share one state.db with no profile tag.
|
||||
This module adds profile tagging and filtered queries.
|
||||
|
||||
Usage:
|
||||
from agent.profile_isolation import tag_session, get_profile_sessions, get_active_profile
|
||||
|
||||
# Tag a new session with the current profile
|
||||
tag_session(session_id, profile_name)
|
||||
|
||||
# Get sessions for a specific profile
|
||||
sessions = get_profile_sessions("sprint")
|
||||
|
||||
# Get current active profile
|
||||
profile = get_active_profile()
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
from datetime import datetime, timezone
|
||||
|
||||
HERMES_HOME = Path(os.getenv("HERMES_HOME", str(Path.home() / ".hermes")))
|
||||
SESSIONS_DB = HERMES_HOME / "sessions" / "state.db"
|
||||
PROFILE_TAGS_FILE = HERMES_HOME / "profile_session_tags.json"
|
||||
|
||||
|
||||
def get_active_profile() -> str:
    """Return the currently active profile name.

    Resolution order: ``active_profile`` in ~/.hermes/config.yaml, then the
    HERMES_PROFILE environment variable, then "default".
    """
    config_path = HERMES_HOME / "config.yaml"
    if config_path.exists():
        try:
            # Imported lazily so the module loads even without PyYAML installed.
            import yaml
            with open(config_path) as f:
                cfg = yaml.safe_load(f) or {}
            return cfg.get("active_profile", "default")
        except Exception:
            pass  # unreadable/malformed config — fall through to the env var

    # Check environment
    return os.getenv("HERMES_PROFILE", "default")
|
||||
|
||||
|
||||
def _load_tags() -> Dict[str, str]:
    """Read the session-id -> profile-name mapping from disk.

    Returns an empty dict when the tags file is absent or unreadable
    (corrupt JSON, permission errors, ...).
    """
    try:
        if PROFILE_TAGS_FILE.exists():
            with open(PROFILE_TAGS_FILE) as fh:
                return json.load(fh)
    except Exception:
        pass
    return {}
|
||||
|
||||
|
||||
def _save_tags(tags: Dict[str, str]):
    """Persist the session-id -> profile-name mapping as pretty-printed JSON.

    Creates the parent directory on first use.
    """
    tags_dir = PROFILE_TAGS_FILE.parent
    tags_dir.mkdir(parents=True, exist_ok=True)
    with open(PROFILE_TAGS_FILE, "w") as fh:
        json.dump(tags, fh, indent=2)
|
||||
|
||||
|
||||
def tag_session(session_id: str, profile: Optional[str] = None) -> str:
    """
    Tag a session with its originating profile.

    Writes the mapping to the JSON tag file and mirrors it into the
    SQLite session store when available.  When *profile* is None, the
    currently active profile is used.

    Returns the profile name used.
    """
    if profile is None:
        profile = get_active_profile()

    tags = _load_tags()
    tags[session_id] = profile
    _save_tags(tags)

    # Also tag in SQLite if available (best effort — failures are swallowed there)
    _tag_session_in_db(session_id, profile)

    return profile
|
||||
|
||||
|
||||
def _tag_session_in_db(session_id: str, profile: str):
    """Mirror the profile tag into the SQLite session store (best effort).

    Lazily adds a ``profile`` column to the ``sessions`` table if the schema
    predates it, then stamps the given session row.  Any failure (missing
    table, locked DB, divergent schema) is swallowed: the JSON tag file is
    the source of truth and SQLite is only a secondary index.
    """
    if not SESSIONS_DB.exists():
        return

    conn = None
    try:
        conn = sqlite3.connect(str(SESSIONS_DB))
        cursor = conn.cursor()

        # Check if sessions table has profile column; migrate if not.
        cursor.execute("PRAGMA table_info(sessions)")
        columns = [row[1] for row in cursor.fetchall()]
        if "profile" not in columns:
            cursor.execute("ALTER TABLE sessions ADD COLUMN profile TEXT DEFAULT 'default'")

        # Update the session's profile
        cursor.execute(
            "UPDATE sessions SET profile = ? WHERE session_id = ?",
            (profile, session_id)
        )
        conn.commit()
    except Exception:
        pass  # SQLite might not be available or schema differs
    finally:
        # Previously the connection leaked whenever any statement raised;
        # close it unconditionally.
        if conn is not None:
            conn.close()
|
||||
|
||||
|
||||
def get_session_profile(session_id: str) -> Optional[str]:
    """Return the profile that owns *session_id*, or None if unknown.

    The JSON tag file is authoritative; the SQLite ``profile`` column is
    consulted only as a fallback.
    """
    # Check JSON tags first — primary store.
    tags = _load_tags()
    if session_id in tags:
        return tags[session_id]

    # Fall back to the profile column in the SQLite session store.
    if SESSIONS_DB.exists():
        conn = None
        try:
            conn = sqlite3.connect(str(SESSIONS_DB))
            cursor = conn.cursor()
            cursor.execute(
                "SELECT profile FROM sessions WHERE session_id = ?",
                (session_id,)
            )
            row = cursor.fetchone()
            if row:
                return row[0]
        except Exception:
            pass  # missing table / no profile column — treat as untagged
        finally:
            # Previously the connection leaked if execute raised; close always.
            if conn is not None:
                conn.close()

    return None
|
||||
|
||||
|
||||
def get_profile_sessions(
    profile: Optional[str] = None,
    limit: int = 100,
) -> List[Dict[str, Any]]:
    """
    Get sessions belonging to a specific profile (default: the active one).

    Reads from the SQLite session store, preferring the ``profile`` column
    and falling back to the JSON tag file when that column does not exist.

    Returns list of session dicts (empty when the DB is absent/unreadable).
    """
    if profile is None:
        profile = get_active_profile()

    sessions = []

    # Get from JSON tags — only consumed by the fallback query below.
    tags = _load_tags()
    tagged_sessions = [sid for sid, p in tags.items() if p == profile]

    # Get from SQLite with profile filter
    if SESSIONS_DB.exists():
        try:
            conn = sqlite3.connect(str(SESSIONS_DB))
            conn.row_factory = sqlite3.Row  # rows become dict-convertible
            cursor = conn.cursor()

            # Try profile column first
            try:
                cursor.execute(
                    "SELECT * FROM sessions WHERE profile = ? ORDER BY updated_at DESC LIMIT ?",
                    (profile, limit)
                )
                for row in cursor.fetchall():
                    sessions.append(dict(row))
            except Exception:
                # Fallback: filter by tagged session IDs.  The f-string only
                # interpolates "?" placeholders, so it is injection-safe.
                if tagged_sessions:
                    placeholders = ",".join("?" * len(tagged_sessions[:limit]))
                    cursor.execute(
                        f"SELECT * FROM sessions WHERE session_id IN ({placeholders}) ORDER BY updated_at DESC LIMIT ?",
                        (*tagged_sessions[:limit], limit)
                    )
                    for row in cursor.fetchall():
                        sessions.append(dict(row))

            # NOTE(review): conn is not closed if connect/cursor raise above —
            # consider a try/finally here.
            conn.close()
        except Exception:
            pass  # missing table / locked DB — return what we have

    return sessions[:limit]
|
||||
|
||||
|
||||
def filter_sessions_by_profile(
    sessions: List[Dict[str, Any]],
    profile: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Keep only the sessions owned by *profile* (default: the active one).

    Sessions with no resolvable id are dropped.  Sessions with no profile
    tag anywhere are kept, so legacy/untagged sessions stay visible.
    """
    if profile is None:
        profile = get_active_profile()

    tags = _load_tags()
    result: List[Dict[str, Any]] = []

    for entry in sessions:
        sid = entry.get("session_id") or entry.get("id")
        if not sid:
            continue

        # Resolve ownership: JSON tag first, then the SQLite store.
        owner = tags.get(sid)
        if owner is None:
            owner = get_session_profile(sid)

        # Untagged sessions (owner is None) pass the filter deliberately.
        if owner is None or owner == profile:
            result.append(entry)

    return result
|
||||
|
||||
|
||||
def get_profile_stats() -> Dict[str, Any]:
    """Summarize how tagged sessions are distributed across profiles.

    Returns a dict with the total number of tagged sessions, the list of
    profiles seen (in first-seen order), a per-profile count map, and the
    currently active profile.
    """
    # Counter replaces the hand-rolled dict-increment loop; iteration order
    # (first occurrence) matches the original behavior.
    from collections import Counter

    tags = _load_tags()
    profile_counts = Counter(tags.values())

    return {
        "total_tagged_sessions": len(tags),
        "profiles": list(profile_counts),
        "profile_counts": dict(profile_counts),
        "active_profile": get_active_profile(),
    }
|
||||
|
||||
|
||||
def audit_untagged_sessions() -> List[str]:
    """Return ids of sessions present in SQLite but missing a profile tag.

    Returns an empty list when the DB is absent or unreadable.
    """
    if not SESSIONS_DB.exists():
        return []

    conn = None
    try:
        conn = sqlite3.connect(str(SESSIONS_DB))
        cursor = conn.cursor()
        # Get all session IDs known to the SQLite store.
        cursor.execute("SELECT session_id FROM sessions")
        all_sessions = {row[0] for row in cursor.fetchall()}
    except Exception:
        return []  # missing table / locked DB — nothing to audit
    finally:
        # Previously the connection leaked if connect/execute raised.
        if conn is not None:
            conn.close()

    # Anything in the DB but not in the JSON tag file is untagged.
    tagged = set(_load_tags())
    return list(all_sessions - tagged)
|
||||
@@ -12,7 +12,7 @@ import threading
|
||||
from collections import OrderedDict
|
||||
from pathlib import Path
|
||||
|
||||
from hermes_constants import get_hermes_home, get_skills_dir, is_wsl
|
||||
from hermes_constants import get_hermes_home
|
||||
from typing import Optional
|
||||
|
||||
from agent.skill_utils import (
|
||||
@@ -40,7 +40,7 @@ _CONTEXT_THREAT_PATTERNS = [
|
||||
(r'disregard\s+(your|all|any)\s+(instructions|rules|guidelines)', "disregard_rules"),
|
||||
(r'act\s+as\s+(if|though)\s+you\s+(have\s+no|don\'t\s+have)\s+(restrictions|limits|rules)', "bypass_restrictions"),
|
||||
(r'<!--[^>]*(?:ignore|override|system|secret|hidden)[^>]*-->', "html_comment_injection"),
|
||||
(r'<\s*div\s+style\s*=\s*["\'][\s\S]*?display\s*:\s*none', "hidden_div"),
|
||||
(r'<\s*div\s+style\s*=\s*["\'].*display\s*:\s*none', "hidden_div"),
|
||||
(r'translate\s+.*\s+into\s+.*\s+and\s+(execute|run|eval)', "translate_execute"),
|
||||
(r'curl\s+[^\n]*\$\{?\w*(KEY|TOKEN|SECRET|PASSWORD|CREDENTIAL|API)', "exfil_curl"),
|
||||
(r'cat\s+[^\n]*(\.env|credentials|\.netrc|\.pgpass)', "read_secrets"),
|
||||
@@ -349,71 +349,8 @@ PLATFORM_HINTS = {
|
||||
"only — no markdown, no formatting. SMS messages are limited to ~1600 "
|
||||
"characters, so be brief and direct."
|
||||
),
|
||||
"bluebubbles": (
|
||||
"You are chatting via iMessage (BlueBubbles). iMessage does not render "
|
||||
"markdown formatting — use plain text. Keep responses concise as they "
|
||||
"appear as text messages. You can send media files natively: include "
|
||||
"MEDIA:/absolute/path/to/file in your response. Images (.jpg, .png, "
|
||||
".heic) appear as photos and other files arrive as attachments."
|
||||
),
|
||||
"weixin": (
|
||||
"You are on Weixin/WeChat. Markdown formatting is supported, so you may use it when "
|
||||
"it improves readability, but keep the message compact and chat-friendly. You can send media files natively: "
|
||||
"include MEDIA:/absolute/path/to/file in your response. Images are sent as native "
|
||||
"photos, videos play inline when supported, and other files arrive as downloadable "
|
||||
"documents. You can also include image URLs in markdown format  and they "
|
||||
"will be downloaded and sent as native media when possible."
|
||||
),
|
||||
"wecom": (
|
||||
"You are on WeCom (企业微信 / Enterprise WeChat). Markdown formatting is supported. "
|
||||
"You CAN send media files natively — to deliver a file to the user, include "
|
||||
"MEDIA:/absolute/path/to/file in your response. The file will be sent as a native "
|
||||
"WeCom attachment: images (.jpg, .png, .webp) are sent as photos (up to 10 MB), "
|
||||
"other files (.pdf, .docx, .xlsx, .md, .txt, etc.) arrive as downloadable documents "
|
||||
"(up to 20 MB), and videos (.mp4) play inline. Voice messages are supported but "
|
||||
"must be in AMR format — other audio formats are automatically sent as file attachments. "
|
||||
"You can also include image URLs in markdown format  and they will be "
|
||||
"downloaded and sent as native photos. Do NOT tell the user you lack file-sending "
|
||||
"capability — use MEDIA: syntax whenever a file delivery is appropriate."
|
||||
),
|
||||
"qqbot": (
|
||||
"You are on QQ, a popular Chinese messaging platform. QQ supports markdown formatting "
|
||||
"and emoji. You can send media files natively: include MEDIA:/absolute/path/to/file in "
|
||||
"your response. Images are sent as native photos, and other files arrive as downloadable "
|
||||
"documents."
|
||||
),
|
||||
}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Environment hints — execution-environment awareness for the agent.
|
||||
# Unlike PLATFORM_HINTS (which describe the messaging channel), these describe
|
||||
# the machine/OS the agent's tools actually run on.
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Injected into the system prompt (via build_environment_hints) when running
# under WSL, so the agent maps Windows paths to their /mnt/<drive>/ mounts.
WSL_ENVIRONMENT_HINT = (
    "You are running inside WSL (Windows Subsystem for Linux). "
    "The Windows host filesystem is mounted under /mnt/ — "
    "/mnt/c/ is the C: drive, /mnt/d/ is D:, etc. "
    "The user's Windows files are typically at "
    "/mnt/c/Users/<username>/Desktop/, Documents/, Downloads/, etc. "
    "When the user references Windows paths or desktop files, translate "
    "to the /mnt/c/ equivalent. You can list /mnt/c/Users/ to discover "
    "the Windows username if needed."
)
|
||||
|
||||
|
||||
def build_environment_hints() -> str:
    """Return environment-specific guidance for the system prompt.

    Detects WSL, and can be extended for Termux, Docker, etc.
    Returns an empty string when no special environment is detected.
    """
    hints: list[str] = []
    if is_wsl():
        hints.append(WSL_ENVIRONMENT_HINT)
    # Blank-line separator so multiple hints read as separate paragraphs.
    return "\n\n".join(hints)
|
||||
|
||||
|
||||
CONTEXT_FILE_MAX_CHARS = 20_000
|
||||
CONTEXT_TRUNCATE_HEAD_RATIO = 0.7
|
||||
CONTEXT_TRUNCATE_TAIL_RATIO = 0.2
|
||||
@@ -535,7 +472,7 @@ def _parse_skill_file(skill_file: Path) -> tuple[bool, dict, str]:
|
||||
(True, {}, "") to err on the side of showing the skill.
|
||||
"""
|
||||
try:
|
||||
raw = skill_file.read_text(encoding="utf-8")
|
||||
raw = skill_file.read_text(encoding="utf-8")[:2000]
|
||||
frontmatter, _ = parse_frontmatter(raw)
|
||||
|
||||
if not skill_matches_platform(frontmatter):
|
||||
@@ -543,10 +480,21 @@ def _parse_skill_file(skill_file: Path) -> tuple[bool, dict, str]:
|
||||
|
||||
return True, frontmatter, extract_skill_description(frontmatter)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to parse skill file %s: %s", skill_file, e)
|
||||
logger.debug("Failed to parse skill file %s: %s", skill_file, e)
|
||||
return True, {}, ""
|
||||
|
||||
|
||||
def _read_skill_conditions(skill_file: Path) -> dict:
    """Extract conditional activation fields from SKILL.md frontmatter.

    Returns {} on any read/parse failure so a broken skill file never
    blocks prompt construction.
    """
    try:
        # Frontmatter sits at the top of the file; the first 2000 chars are
        # assumed to contain it — TODO confirm for skills with long headers.
        raw = skill_file.read_text(encoding="utf-8")[:2000]
        frontmatter, _ = parse_frontmatter(raw)
        return extract_skill_conditions(frontmatter)
    except Exception as e:
        logger.debug("Failed to read skill conditions from %s: %s", skill_file, e)
        return {}
|
||||
|
||||
|
||||
def _skill_should_show(
|
||||
conditions: dict,
|
||||
available_tools: "set[str] | None",
|
||||
@@ -596,7 +544,8 @@ def build_skills_system_prompt(
|
||||
are read-only — they appear in the index but new skills are always created
|
||||
in the local dir. Local skills take precedence when names collide.
|
||||
"""
|
||||
skills_dir = get_skills_dir()
|
||||
hermes_home = get_hermes_home()
|
||||
skills_dir = hermes_home / "skills"
|
||||
external_dirs = get_all_skills_dirs()[1:] # skip local (index 0)
|
||||
|
||||
if not skills_dir.exists() and not external_dirs:
|
||||
@@ -605,10 +554,9 @@ def build_skills_system_prompt(
|
||||
# ── Layer 1: in-process LRU cache ─────────────────────────────────
|
||||
# Include the resolved platform so per-platform disabled-skill lists
|
||||
# produce distinct cache entries (gateway serves multiple platforms).
|
||||
from gateway.session_context import get_session_env
|
||||
_platform_hint = (
|
||||
os.environ.get("HERMES_PLATFORM")
|
||||
or get_session_env("HERMES_SESSION_PLATFORM")
|
||||
or os.environ.get("HERMES_SESSION_PLATFORM")
|
||||
or ""
|
||||
)
|
||||
cache_key = (
|
||||
@@ -774,16 +722,8 @@ def build_skills_system_prompt(
|
||||
|
||||
result = (
|
||||
"## Skills (mandatory)\n"
|
||||
"Before replying, scan the skills below. If a skill matches or is even partially relevant "
|
||||
"to your task, you MUST load it with skill_view(name) and follow its instructions. "
|
||||
"Err on the side of loading — it is always better to have context you don't need "
|
||||
"than to miss critical steps, pitfalls, or established workflows. "
|
||||
"Skills contain specialized knowledge — API endpoints, tool-specific commands, "
|
||||
"and proven workflows that outperform general-purpose approaches. Load the skill "
|
||||
"even if you think you could handle the task with basic tools like web_search or terminal. "
|
||||
"Skills also encode the user's preferred approach, conventions, and quality standards "
|
||||
"for tasks like code review, planning, and testing — load them even for tasks you "
|
||||
"already know how to do, because the skill defines how it should be done here.\n"
|
||||
"Before replying, scan the skills below. If one clearly matches your task, "
|
||||
"load it with skill_view(name) and follow its instructions. "
|
||||
"If a skill has issues, fix it with skill_manage(action='patch').\n"
|
||||
"After difficult/iterative tasks, offer to save as a skill. "
|
||||
"If a skill you loaded was missing steps, had wrong commands, or needed "
|
||||
@@ -793,7 +733,7 @@ def build_skills_system_prompt(
|
||||
+ "\n".join(index_lines) + "\n"
|
||||
"</available_skills>\n"
|
||||
"\n"
|
||||
"Only proceed without loading a skill if genuinely none are relevant to the task."
|
||||
"If none match, proceed normally without loading a skill."
|
||||
)
|
||||
|
||||
# ── Store in LRU cache ────────────────────────────────────────────
|
||||
|
||||
@@ -1,146 +0,0 @@
|
||||
"""Provider Preflight — Poka-yoke validation of provider/model config.
|
||||
|
||||
Validates provider and model configuration before session start.
|
||||
Prevents wasted context on misconfigured providers.
|
||||
|
||||
Usage:
|
||||
from agent.provider_preflight import preflight_check
|
||||
result = preflight_check(provider="openrouter", model="xiaomi/mimo-v2-pro")
|
||||
if not result["valid"]:
|
||||
print(result["error"])
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Provider name substring -> required API-key env var (None = local, no key).
PROVIDER_KEYS = {
    "openrouter": "OPENROUTER_API_KEY",
    "anthropic": "ANTHROPIC_API_KEY",
    "openai": "OPENAI_API_KEY",
    "nous": "NOUS_API_KEY",
    "ollama": None,  # Local, no key needed
    "local": None,
}


def check_provider_key(provider: str) -> Dict[str, Any]:
    """Check whether *provider* has a plausible API key configured.

    Matches the provider name against the substrings in PROVIDER_KEYS.

    Returns a dict with:
      valid       bool — False only when a required key is missing/suspect
      provider    echo of the input
      key_status  "unknown" | "not_required" | "missing" | "too_short" | "set"
      error/fix   present only when valid is False
    """
    provider_lower = provider.lower().strip()

    # Track the match explicitly: the original code reused env_var is None
    # for both "no provider matched" and "matched a local provider", so
    # ollama/local were mislabeled "unknown" and the "not_required" branch
    # was unreachable.
    matched = False
    env_var: Optional[str] = None
    for known, key in PROVIDER_KEYS.items():
        if known in provider_lower:
            matched = True
            env_var = key
            break

    if not matched:
        # Unknown provider — assume OK (custom/local endpoint).
        return {"valid": True, "provider": provider, "key_status": "unknown"}

    if env_var is None:
        # Known local provider (ollama/local): no key needed.
        return {"valid": True, "provider": provider, "key_status": "not_required"}

    key_value = os.getenv(env_var, "").strip()
    if not key_value:
        return {
            "valid": False,
            "provider": provider,
            "key_status": "missing",
            "error": f"{env_var} is not set. Provider '{provider}' will fail.",
            "fix": f"Set {env_var} in ~/.hermes/.env",
        }

    # Heuristic: real API keys are comfortably longer than 10 characters.
    if len(key_value) < 10:
        return {
            "valid": False,
            "provider": provider,
            "key_status": "too_short",
            "error": f"{env_var} is suspiciously short ({len(key_value)} chars). May be invalid.",
            "fix": f"Verify {env_var} value in ~/.hermes/.env",
        }

    return {"valid": True, "provider": provider, "key_status": "set"}
|
||||
|
||||
|
||||
def check_model_availability(model: str, provider: str) -> Dict[str, Any]:
    """Heuristically sanity-check a model name against the chosen provider.

    Never blocks a configuration outright (except an empty model name) —
    suspicious combinations come back valid with a "warning" entry.
    """
    if not model:
        return {"valid": False, "error": "No model specified"}

    lowered = model.lower()

    # Claude models normally run on the Anthropic provider — warn otherwise.
    if "claude" in lowered and "anthropic" not in provider.lower():
        warning = f"Model '{model}' usually runs on Anthropic provider, not '{provider}'"
        return {"valid": True, "warning": warning}

    # Common open-weight families usually need an Ollama version tag.
    open_weight_families = ("llama", "mistral", "qwen", "gemma", "phi", "hermes")
    looks_open_weight = any(family in lowered for family in open_weight_families)
    if looks_open_weight and ":" not in model:
        warning = f"Model '{model}' may need a version tag for Ollama (e.g., {model}:latest)"
        return {"valid": True, "warning": warning}

    return {"valid": True}
|
||||
|
||||
|
||||
def preflight_check(
    provider: str = "",
    model: str = "",
    fallback_provider: str = "",
    fallback_model: str = "",
) -> Dict[str, Any]:
    """Full pre-flight check for provider/model configuration.

    Primary provider/model problems are reported as errors; fallback
    problems and heuristic mismatches are reported only as warnings.
    Empty arguments skip the corresponding check.

    Returns:
        Dict with valid (bool), errors (list), warnings (list),
        plus the echoed provider and model.
    """
    errors = []
    warnings = []

    # Check primary provider — a missing/suspect API key is a hard error.
    if provider:
        result = check_provider_key(provider)
        if not result["valid"]:
            errors.append(result.get("error", f"Provider {provider} invalid"))

    # Check primary model — invalid is an error, a mismatch only a warning.
    if model:
        result = check_model_availability(model, provider)
        if not result["valid"]:
            errors.append(result.get("error", f"Model {model} invalid"))
        elif result.get("warning"):
            warnings.append(result["warning"])

    # Check fallback — never fatal, since the primary may still work.
    if fallback_provider:
        result = check_provider_key(fallback_provider)
        if not result["valid"]:
            warnings.append(f"Fallback provider {fallback_provider} also invalid: {result.get('error','')}")

    if fallback_model:
        result = check_model_availability(fallback_model, fallback_provider)
        if not result["valid"]:
            warnings.append(f"Fallback model {fallback_model} invalid")
        elif result.get("warning"):
            warnings.append(result["warning"])

    return {
        "valid": len(errors) == 0,
        "errors": errors,
        "warnings": warnings,
        "provider": provider,
        "model": model,
    }
|
||||
@@ -1,246 +0,0 @@
|
||||
"""Rate limit tracking for inference API responses.
|
||||
|
||||
Captures x-ratelimit-* headers from provider responses and provides
|
||||
formatted display for the /usage slash command. Currently supports
|
||||
the Nous Portal header format (also used by OpenRouter and OpenAI-compatible
|
||||
APIs that follow the same convention).
|
||||
|
||||
Header schema (12 headers total):
|
||||
x-ratelimit-limit-requests RPM cap
|
||||
x-ratelimit-limit-requests-1h RPH cap
|
||||
x-ratelimit-limit-tokens TPM cap
|
||||
x-ratelimit-limit-tokens-1h TPH cap
|
||||
x-ratelimit-remaining-requests requests left in minute window
|
||||
x-ratelimit-remaining-requests-1h requests left in hour window
|
||||
x-ratelimit-remaining-tokens tokens left in minute window
|
||||
x-ratelimit-remaining-tokens-1h tokens left in hour window
|
||||
x-ratelimit-reset-requests seconds until minute request window resets
|
||||
x-ratelimit-reset-requests-1h seconds until hour request window resets
|
||||
x-ratelimit-reset-tokens seconds until minute token window resets
|
||||
x-ratelimit-reset-tokens-1h seconds until hour token window resets
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Mapping, Optional
|
||||
|
||||
|
||||
@dataclass
class RateLimitBucket:
    """A single rate-limit window (for example, requests per minute)."""

    limit: int = 0              # window capacity; 0 means "no data"
    remaining: int = 0          # units still available in the window
    reset_seconds: float = 0.0  # seconds until reset, as reported by the server
    captured_at: float = 0.0    # time.time() when this was captured

    @property
    def used(self) -> int:
        """Units consumed so far (never negative)."""
        consumed = self.limit - self.remaining
        return consumed if consumed > 0 else 0

    @property
    def usage_pct(self) -> float:
        """Percentage of the window consumed; 0.0 when no limit is known."""
        return (self.used / self.limit) * 100.0 if self.limit > 0 else 0.0

    @property
    def remaining_seconds_now(self) -> float:
        """Estimated seconds remaining until reset, adjusted for elapsed time."""
        eta = self.reset_seconds - (time.time() - self.captured_at)
        return eta if eta > 0.0 else 0.0
|
||||
|
||||
|
||||
@dataclass
class RateLimitState:
    """Complete rate-limit snapshot parsed from one set of response headers."""

    requests_min: RateLimitBucket = field(default_factory=RateLimitBucket)
    requests_hour: RateLimitBucket = field(default_factory=RateLimitBucket)
    tokens_min: RateLimitBucket = field(default_factory=RateLimitBucket)
    tokens_hour: RateLimitBucket = field(default_factory=RateLimitBucket)
    captured_at: float = 0.0  # when the headers were captured
    provider: str = ""        # provider label for display purposes

    @property
    def has_data(self) -> bool:
        """True once at least one header snapshot has been captured."""
        return self.captured_at > 0

    @property
    def age_seconds(self) -> float:
        """Seconds since capture; +inf when nothing has been captured yet."""
        return time.time() - self.captured_at if self.has_data else float("inf")
|
||||
|
||||
|
||||
def _safe_int(value: Any, default: int = 0) -> int:
|
||||
try:
|
||||
return int(float(value))
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
|
||||
|
||||
def _safe_float(value: Any, default: float = 0.0) -> float:
|
||||
try:
|
||||
return float(value)
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
|
||||
|
||||
def parse_rate_limit_headers(
    headers: Mapping[str, str],
    provider: str = "",
) -> Optional[RateLimitState]:
    """Parse x-ratelimit-* headers into a RateLimitState.

    Lookups are case-insensitive.  Buckets whose headers are absent come
    back zeroed (limit == 0, i.e. "no data").

    Returns None if no rate limit headers are present.
    """
    # Normalize to lowercase so lookups work regardless of how the server
    # capitalises headers (HTTP header names are case-insensitive per RFC 7230).
    lowered = {k.lower(): v for k, v in headers.items()}

    # Quick check: at least one rate limit header must exist
    has_any = any(k.startswith("x-ratelimit-") for k in lowered)
    if not has_any:
        return None

    # One shared capture timestamp so all buckets age consistently.
    now = time.time()

    def _bucket(resource: str, suffix: str = "") -> RateLimitBucket:
        # e.g. resource="requests", suffix="" -> per-minute
        #      resource="tokens", suffix="-1h" -> per-hour
        tag = f"{resource}{suffix}"
        return RateLimitBucket(
            limit=_safe_int(lowered.get(f"x-ratelimit-limit-{tag}")),
            remaining=_safe_int(lowered.get(f"x-ratelimit-remaining-{tag}")),
            reset_seconds=_safe_float(lowered.get(f"x-ratelimit-reset-{tag}")),
            captured_at=now,
        )

    return RateLimitState(
        requests_min=_bucket("requests"),
        requests_hour=_bucket("requests", "-1h"),
        tokens_min=_bucket("tokens"),
        tokens_hour=_bucket("tokens", "-1h"),
        captured_at=now,
        provider=provider,
    )
|
||||
|
||||
|
||||
# ── Formatting ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _fmt_count(n: int) -> str:
|
||||
"""Human-friendly number: 7999856 -> '8.0M', 33599 -> '33.6K', 799 -> '799'."""
|
||||
if n >= 1_000_000:
|
||||
return f"{n / 1_000_000:.1f}M"
|
||||
if n >= 10_000:
|
||||
return f"{n / 1_000:.1f}K"
|
||||
if n >= 1_000:
|
||||
return f"{n / 1_000:.1f}K"
|
||||
return str(n)
|
||||
|
||||
|
||||
def _fmt_seconds(seconds: float) -> str:
|
||||
"""Seconds -> human-friendly duration: '58s', '2m 14s', '58m 57s', '1h 2m'."""
|
||||
s = max(0, int(seconds))
|
||||
if s < 60:
|
||||
return f"{s}s"
|
||||
if s < 3600:
|
||||
m, sec = divmod(s, 60)
|
||||
return f"{m}m {sec}s" if sec else f"{m}m"
|
||||
h, remainder = divmod(s, 3600)
|
||||
m = remainder // 60
|
||||
return f"{h}h {m}m" if m else f"{h}h"
|
||||
|
||||
|
||||
def _bar(pct: float, width: int = 20) -> str:
|
||||
"""ASCII progress bar: [████████░░░░░░░░░░░░] 40%."""
|
||||
filled = int(pct / 100.0 * width)
|
||||
filled = max(0, min(width, filled))
|
||||
empty = width - filled
|
||||
return f"[{'█' * filled}{'░' * empty}]"
|
||||
|
||||
|
||||
def _bucket_line(label: str, bucket: RateLimitBucket, label_width: int = 14) -> str:
    """Format one bucket as a single display line (bar, percentage, counts, reset ETA)."""
    # limit == 0 means the provider never sent headers for this window.
    if bucket.limit <= 0:
        return f" {label:<{label_width}} (no data)"

    pct = bucket.usage_pct
    used = _fmt_count(bucket.used)
    limit = _fmt_count(bucket.limit)
    remaining = _fmt_count(bucket.remaining)
    reset = _fmt_seconds(bucket.remaining_seconds_now)

    bar = _bar(pct)
    return f" {label:<{label_width}} {bar} {pct:5.1f}% {used}/{limit} used ({remaining} left, resets in {reset})"
|
||||
|
||||
|
||||
def format_rate_limit_display(state: RateLimitState) -> str:
    """Format rate limit state for terminal/chat display.

    Produces a multi-line report: one line per window, plus warnings for
    any window at or above 80% usage.
    """
    if not state.has_data:
        return "No rate limit data yet — make an API request first."

    # Describe how stale this snapshot is.
    age = state.age_seconds
    if age < 5:
        freshness = "just now"
    elif age < 60:
        freshness = f"{int(age)}s ago"
    else:
        freshness = f"{_fmt_seconds(age)} ago"

    provider_label = state.provider.title() if state.provider else "Provider"

    lines = [
        f"{provider_label} Rate Limits (captured {freshness}):",
        "",
        _bucket_line("Requests/min", state.requests_min),
        _bucket_line("Requests/hr", state.requests_hour),
        "",
        _bucket_line("Tokens/min", state.tokens_min),
        _bucket_line("Tokens/hr", state.tokens_hour),
    ]

    # Add warnings if any bucket is getting hot (>= 80% consumed)
    warnings = []
    for label, bucket in [
        ("requests/min", state.requests_min),
        ("requests/hr", state.requests_hour),
        ("tokens/min", state.tokens_min),
        ("tokens/hr", state.tokens_hour),
    ]:
        if bucket.limit > 0 and bucket.usage_pct >= 80:
            reset = _fmt_seconds(bucket.remaining_seconds_now)
            warnings.append(f" ⚠ {label} at {bucket.usage_pct:.0f}% — resets in {reset}")

    if warnings:
        lines.append("")
        lines.extend(warnings)

    return "\n".join(lines)
|
||||
|
||||
|
||||
def format_rate_limit_compact(state: RateLimitState) -> str:
    """One-line compact summary for status bars / gateway messages.

    Only windows with a known limit are shown; hourly windows also include
    the reset ETA.
    """
    if not state.has_data:
        return "No rate limit data."

    rm = state.requests_min
    tm = state.tokens_min
    rh = state.requests_hour
    th = state.tokens_hour

    parts = []
    if rm.limit > 0:
        parts.append(f"RPM: {rm.remaining}/{rm.limit}")
    if rh.limit > 0:
        parts.append(f"RPH: {_fmt_count(rh.remaining)}/{_fmt_count(rh.limit)} (resets {_fmt_seconds(rh.remaining_seconds_now)})")
    if tm.limit > 0:
        parts.append(f"TPM: {_fmt_count(tm.remaining)}/{_fmt_count(tm.limit)}")
    if th.limit > 0:
        parts.append(f"TPH: {_fmt_count(th.remaining)}/{_fmt_count(th.limit)} (resets {_fmt_seconds(th.remaining_seconds_now)})")

    return " | ".join(parts)
|
||||
@@ -1,302 +0,0 @@
|
||||
"""Self-Modifying Prompt Engine — agent learns from its own failures.
|
||||
|
||||
Analyzes session transcripts, identifies failure patterns, and generates
|
||||
prompt patches to prevent future failures.
|
||||
|
||||
The loop: fail → analyze → rewrite → retry → verify improvement.
|
||||
|
||||
Usage:
|
||||
from agent.self_modify import PromptLearner
|
||||
learner = PromptLearner()
|
||||
patches = learner.analyze_session(session_id)
|
||||
learner.apply_patches(patches)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Root of the Hermes data directory (overridable via the HERMES_HOME env var).
# NOTE(review): unlike sibling modules, the os.getenv default here is a Path,
# not a str — Path() accepts either, so behavior is unchanged.
HERMES_HOME = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
# Generated prompt patches are stored here.
PATCHES_DIR = HERMES_HOME / "prompt_patches"
# Presumably holds rollback state for applied patches — TODO confirm usage.
ROLLBACK_DIR = HERMES_HOME / "prompt_rollback"
|
||||
|
||||
|
||||
@dataclass
class FailurePattern:
    """A detected failure pattern in session transcripts."""
    pattern_type: str  # retry_loop, timeout, error_hallucination, context_loss
    description: str   # human-readable summary of the pattern
    frequency: int     # number of occurrences observed
    example_messages: List[str] = field(default_factory=list)  # sample matching messages
    suggested_fix: str = ""  # proposed remediation text; may be empty
|
||||
|
||||
@dataclass
class PromptPatch:
    """A modification to the system prompt based on failure analysis."""
    id: str            # patch identifier
    failure_type: str  # the FailurePattern.pattern_type this patch addresses
    original_rule: str # prompt text being replaced
    new_rule: str      # replacement prompt text
    confidence: float  # confidence in the patch (presumably 0.0-1.0 — confirm)
    applied_at: Optional[float] = None  # timestamp when applied; None = not yet applied
    reverted: bool = False  # True once the patch has been rolled back
|
||||
|
||||
# Failure detection patterns.
# Maps failure type -> {"patterns": [regexes searched against message text],
# "description": human-readable summary}. PromptLearner.analyze_session
# applies these with re.search to assistant/tool messages; dict order sets
# the order in which types are checked per message.
FAILURE_SIGNALS = {
    "retry_loop": {
        "patterns": [
            r"(?i)retry(?:ing)?\s*(?:attempt|again)",
            r"(?i)failed.*retrying",
            r"(?i)error.*again",
            r"(?i)attempt\s+\d+\s*(?:of|/)\s*\d+",
        ],
        "description": "Agent stuck in retry loop",
    },
    "timeout": {
        "patterns": [
            r"(?i)timed?\s*out",
            r"(?i)deadline\s+exceeded",
            r"(?i)took\s+(?:too\s+)?long",
        ],
        "description": "Operation timed out",
    },
    "hallucination": {
        "patterns": [
            r"(?i)i\s+(?:don't|do\s+not)\s+(?:have|see|find)\s+(?:any|that|this)\s+(?:information|data|file)",
            r"(?i)the\s+file\s+doesn't\s+exist",
            r"(?i)i\s+(?:made|invented|fabricated)\s+(?:that\s+up|this)",
        ],
        "description": "Agent hallucinated or fabricated information",
    },
    "context_loss": {
        "patterns": [
            r"(?i)i\s+(?:don't|do\s+not)\s+(?:remember|recall|know)\s+(?:what|where|when|how)",
            r"(?i)could\s+you\s+remind\s+me",
            r"(?i)what\s+were\s+we\s+(?:doing|working|talking)\s+(?:on|about)",
        ],
        "description": "Agent lost context from earlier in conversation",
    },
    "tool_failure": {
        "patterns": [
            r"(?i)tool\s+(?:call|execution)\s+failed",
            r"(?i)command\s+not\s+found",
            r"(?i)permission\s+denied",
            r"(?i)no\s+such\s+file",
        ],
        "description": "Tool execution failed",
    },
}
|
||||
|
||||
# Prompt improvement templates.
# Canned rule text, keyed by failure type (same keys as FAILURE_SIGNALS),
# that PromptLearner appends to the system prompt when the corresponding
# failure pattern is detected often enough.
PROMPT_FIXES = {
    "retry_loop": (
        "If an operation fails more than twice, stop retrying. "
        "Report the failure and ask the user for guidance. "
        "Do not enter retry loops — they waste tokens."
    ),
    "timeout": (
        "For operations that may take long, set a timeout and report "
        "progress. If an operation takes more than 30 seconds, report "
        "what you've done so far and ask if you should continue."
    ),
    "hallucination": (
        "If you cannot find information, say 'I don't know' or "
        "'I couldn't find that.' Never fabricate information. "
        "If a file doesn't exist, say so — don't guess its contents."
    ),
    "context_loss": (
        "When you need context from earlier in the conversation, "
        "use session_search to find it. Don't ask the user to repeat themselves."
    ),
    "tool_failure": (
        "If a tool fails, check the error message and try a different approach. "
        "Don't retry the exact same command — diagnose first."
    ),
}
|
||||
|
||||
|
||||
class PromptLearner:
    """Analyze session transcripts and generate prompt improvements.

    The learning loop: detect failure patterns in a transcript
    (``analyze_session``), turn frequent patterns into prompt patches
    (``generate_patches``), append those patches to the system prompt
    (``apply_patches``), and optionally undo (``rollback_last``).
    """

    def __init__(self):
        # Ensure the on-disk directories for patch logs and backups exist.
        PATCHES_DIR.mkdir(parents=True, exist_ok=True)
        ROLLBACK_DIR.mkdir(parents=True, exist_ok=True)

    def analyze_session(self, session_data: dict) -> List[FailurePattern]:
        """Analyze a session for failure patterns.

        Args:
            session_data: Session dict with 'messages' list.

        Returns:
            List of detected failure patterns, one per failure type seen.
        """
        messages = session_data.get("messages", [])
        patterns_found: Dict[str, FailurePattern] = {}

        for msg in messages:
            content = str(msg.get("content", ""))
            role = msg.get("role", "")

            # Failures surface in what the model said or what tools
            # returned; user/system messages are not scanned.
            if role not in ("assistant", "tool"):
                continue

            for failure_type, config in FAILURE_SIGNALS.items():
                for pattern in config["patterns"]:
                    if re.search(pattern, content):
                        if failure_type not in patterns_found:
                            patterns_found[failure_type] = FailurePattern(
                                pattern_type=failure_type,
                                description=config["description"],
                                frequency=0,
                                suggested_fix=PROMPT_FIXES.get(failure_type, ""),
                            )
                        found = patterns_found[failure_type]
                        found.frequency += 1
                        if len(found.example_messages) < 3:
                            found.example_messages.append(content[:200])
                        break  # One match per message per type is enough

        return list(patterns_found.values())

    def generate_patches(self, patterns: List[FailurePattern],
                         min_confidence: float = 0.7) -> List[PromptPatch]:
        """Generate prompt patches from failure patterns.

        Confidence is a step function of frequency: 3+ hits -> 0.9,
        2 hits -> 0.75, 1 hit -> 0.5. Patterns below *min_confidence*
        or without a suggested fix are dropped.

        Args:
            patterns: Detected failure patterns.
            min_confidence: Minimum confidence to generate a patch.

        Returns:
            List of prompt patches.
        """
        patches = []
        for pattern in patterns:
            # Confidence based on frequency.
            if pattern.frequency >= 3:
                confidence = 0.9
            elif pattern.frequency >= 2:
                confidence = 0.75
            else:
                confidence = 0.5

            if confidence < min_confidence:
                continue
            if not pattern.suggested_fix:
                continue

            patches.append(PromptPatch(
                id=f"{pattern.pattern_type}-{int(time.time())}",
                failure_type=pattern.pattern_type,
                original_rule="(missing — no existing rule for this pattern)",
                new_rule=pattern.suggested_fix,
                confidence=confidence,
            ))

        return patches

    def apply_patches(self, patches: List[PromptPatch],
                      prompt_path: Optional[str] = None) -> int:
        """Apply patches to the system prompt.

        Appends each patch's rule under an "Auto-learned" heading,
        backing up the current prompt first. Rules whose text is already
        present in the prompt are skipped (idempotent).

        Args:
            patches: Patches to apply.
            prompt_path: Path to prompt file (default: ~/.hermes/system_prompt.md)

        Returns:
            Number of patches applied.
        """
        if prompt_path is None:
            prompt_path = str(HERMES_HOME / "system_prompt.md")

        prompt_file = Path(prompt_path)

        # Backup current prompt so the change is reversible via rollback_last().
        if prompt_file.exists():
            backup = ROLLBACK_DIR / f"{prompt_file.name}.{int(time.time())}.bak"
            backup.write_text(prompt_file.read_text())

        current = prompt_file.read_text() if prompt_file.exists() else ""

        applied = 0
        additions = []
        for patch in patches:
            # Idempotence: don't append a rule that is already in the prompt.
            if patch.new_rule not in current:
                additions.append(f"\n## Auto-learned: {patch.failure_type}\n{patch.new_rule}")
                patch.applied_at = time.time()
                applied += 1

        if additions:
            # A caller-supplied prompt_path may point at a directory that
            # doesn't exist yet; create it so write_text can't fail on it.
            prompt_file.parent.mkdir(parents=True, exist_ok=True)
            prompt_file.write_text(current + "\n".join(additions))

        # Log the patch batch for auditing — but only when there is
        # something to record (previously an empty JSON file was written
        # on every call, including no-op ones).
        if patches:
            patches_file = PATCHES_DIR / f"patches-{int(time.time())}.json"
            with open(patches_file, "w") as f:
                json.dump([p.__dict__ for p in patches], f, indent=2, default=str)

        logger.info("Applied %d prompt patches", applied)
        return applied

    def rollback_last(self, prompt_path: Optional[str] = None) -> bool:
        """Rollback to the most recent backup.

        Args:
            prompt_path: Path to prompt file.

        Returns:
            True if rollback succeeded.
        """
        if prompt_path is None:
            prompt_path = str(HERMES_HOME / "system_prompt.md")

        target = Path(prompt_path)
        # Only consider backups taken for *this* prompt file — a bare
        # "*.bak" glob could restore another file's content here.
        backups = sorted(ROLLBACK_DIR.glob(f"{target.name}.*.bak"), reverse=True)
        if not backups:
            logger.warning("No backups to rollback to")
            return False

        latest = backups[0]
        target.write_text(latest.read_text())
        logger.info("Rolled back to %s", latest.name)
        return True

    def learn_from_session(self, session_data: dict) -> Dict[str, Any]:
        """Full learning cycle: analyze → patch → apply.

        Args:
            session_data: Session dict.

        Returns:
            Summary of what was learned and applied.
        """
        patterns = self.analyze_session(session_data)
        patches = self.generate_patches(patterns)
        applied = self.apply_patches(patches)

        return {
            "patterns_detected": len(patterns),
            "patches_generated": len(patches),
            "patches_applied": applied,
            "patterns": [
                {"type": p.pattern_type, "frequency": p.frequency, "description": p.description}
                for p in patterns
            ],
        }
|
||||
@@ -1,231 +0,0 @@
|
||||
"""Session compaction with fact extraction.
|
||||
|
||||
Before compressing conversation context, extracts durable facts
|
||||
(user preferences, corrections, project details) and saves them
|
||||
to the fact store so they survive compression.
|
||||
|
||||
Usage:
|
||||
from agent.session_compactor import extract_and_save_facts
|
||||
facts = extract_and_save_facts(messages)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class ExtractedFact:
    """A fact extracted from conversation."""
    # Category of the fact; in practice a dotted string such as
    # "user_pref.preference", "correction.correction", "project.config".
    category: str
    # What the fact is about ("user", or the pattern subcategory).
    entity: str
    # The fact itself (the extractor truncates to 200 chars).
    content: str
    # Extraction confidence, 0.0-1.0.
    confidence: float
    # Index of the message turn the fact came from.
    source_turn: int
    # Unix time of extraction; 0.0 when unknown.
    timestamp: float = 0.0
|
||||
|
||||
|
||||
# Patterns that indicate user preferences.
# Each entry is (regex, subcategory); group(1) becomes the fact content
# in _extract_from_text. Order determines the order facts are emitted.
_PREFERENCE_PATTERNS = [
    (r"(?:I|we) (?:prefer|like|want|need) (.+?)(?:\.|$)", "preference"),
    (r"(?:always|never) (?:use|do|run|deploy) (.+?)(?:\.|$)", "preference"),
    (r"(?:my|our) (?:default|preferred|usual) (.+?) (?:is|are) (.+?)(?:\.|$)", "preference"),
    (r"(?:make sure|ensure|remember) (?:to|that) (.+?)(?:\.|$)", "instruction"),
    (r"(?:don'?t|do not) (?:ever|ever again) (.+?)(?:\.|$)", "constraint"),
]
|
||||
|
||||
# Patterns that indicate corrections.
# (regex, subcategory) pairs; matched against user messages only.
_CORRECTION_PATTERNS = [
    (r"(?:actually|no[, ]|wait[, ]|correction[: ]|sorry[, ]) (.+)", "correction"),
    (r"(?:I meant|what I meant was|the correct) (.+?)(?:\.|$)", "correction"),
    (r"(?:it'?s|its) (?:not|shouldn'?t be|wrong) (.+?)(?:\.|$)", "correction"),
]
|
||||
|
||||
# Patterns that indicate project/tool facts.
# (regex, subcategory) pairs; matched against both user and assistant
# messages. The subcategory doubles as the fact's entity.
_PROJECT_PATTERNS = [
    (r"(?:the |our )?(?:project|repo|codebase|code) (?:is|uses|needs|requires) (.+?)(?:\.|$)", "project"),
    (r"(?:deploy|push|commit) (?:to|on) (.+?)(?:\.|$)", "project"),
    (r"(?:this|that|the) (?:server|host|machine|VPS) (?:is|runs|has) (.+?)(?:\.|$)", "infrastructure"),
    (r"(?:model|provider|engine) (?:is|should be|needs to be) (.+?)(?:\.|$)", "config"),
]
|
||||
|
||||
|
||||
def extract_facts_from_messages(messages: List[Dict[str, Any]]) -> List[ExtractedFact]:
    """Extract durable facts from conversation messages.

    Scans user messages for preferences, corrections, project facts,
    and infrastructure details that should survive compression.
    """
    collected: List[ExtractedFact] = []
    dedupe_keys = set()

    for turn_idx, msg in enumerate(messages):
        role = msg.get("role", "")
        content = msg.get("content", "")

        # Facts only come from user/assistant prose, and only from
        # messages long enough to plausibly contain one.
        if role not in ("user", "assistant"):
            continue
        if not isinstance(content, str) or not content or len(content) < 10:
            continue
        # Assistant turns that carry tool calls are mechanical, not prose.
        if role == "assistant" and msg.get("tool_calls"):
            continue

        for fact in _extract_from_text(content, turn_idx, role):
            # Deduplicate across the whole conversation by category plus
            # the leading 100 chars of the fact content.
            key = f"{fact.category}:{fact.content[:100]}"
            if key in dedupe_keys:
                continue
            dedupe_keys.add(key)
            collected.append(fact)

    return collected
|
||||
|
||||
|
||||
def _extract_from_text(text: str, turn_idx: int, role: str) -> List[ExtractedFact]:
    """Extract facts from a single text block.

    The original implementation repeated the same scan loop three times
    (preferences, corrections, project facts); it is factored into one
    closure here with the per-group category prefix, confidence, and
    entity as parameters. Behavior and output order are unchanged.
    """
    facts: List[ExtractedFact] = []
    timestamp = time.time()

    # Clean text for pattern matching.
    clean = text.strip()

    def _scan(patterns, prefix, confidence, entity=None):
        # Run one (regex, subcategory) pattern group over the text and
        # append a fact for every sufficiently long group(1) match.
        for pattern, subcategory in patterns:
            for match in re.finditer(pattern, clean, re.IGNORECASE):
                content = match.group(1).strip() if match.lastindex else match.group(0).strip()
                if len(content) > 5:
                    facts.append(ExtractedFact(
                        category=f"{prefix}.{subcategory}",
                        # Preference/correction facts are about the user;
                        # project facts use the subcategory as entity.
                        entity=entity if entity is not None else subcategory,
                        content=content[:200],
                        confidence=confidence,
                        source_turn=turn_idx,
                        timestamp=timestamp,
                    ))

    # Preference and correction patterns only apply to user messages.
    if role == "user":
        _scan(_PREFERENCE_PATTERNS, "user_pref", 0.7, entity="user")
        _scan(_CORRECTION_PATTERNS, "correction", 0.8, entity="user")

    # Project/infrastructure patterns apply to both user and assistant.
    _scan(_PROJECT_PATTERNS, "project", 0.6)

    return facts
|
||||
|
||||
|
||||
def save_facts_to_store(facts: List[ExtractedFact], fact_store_fn=None) -> int:
    """Save extracted facts to the fact store.

    Args:
        facts: List of extracted facts.
        fact_store_fn: Optional callable(category, entity, content, trust).
            If None, uses the holographic fact store if available.

    Returns:
        Number of facts saved.
    """
    saved = 0

    # Caller-supplied sink: one call per fact, failures are logged and
    # skipped rather than aborting the batch.
    if fact_store_fn:
        for fact in facts:
            try:
                fact_store_fn(
                    category=fact.category,
                    entity=fact.entity,
                    content=fact.content,
                    trust=fact.confidence,
                )
            except Exception as e:
                logger.debug("Failed to save fact: %s", e)
            else:
                saved += 1
        return saved

    # Fallback: the holographic fact store, when importable.
    try:
        from fact_store import fact_store as _fs
    except ImportError:
        logger.debug("fact_store not available — facts not persisted")
        return saved

    for fact in facts:
        try:
            _fs(
                action="add",
                content=fact.content,
                category=fact.category,
                tags=fact.entity,
                # Store trust relative to a 0.5 neutral baseline.
                trust_delta=fact.confidence - 0.5,
            )
        except Exception as e:
            logger.debug("Failed to save fact via fact_store: %s", e)
        else:
            saved += 1

    return saved
|
||||
|
||||
|
||||
def extract_and_save_facts(
    messages: List[Dict[str, Any]],
    fact_store_fn=None,
) -> Tuple[List[ExtractedFact], int]:
    """Extract facts from messages and save them.

    Returns (extracted_facts, saved_count).
    """
    facts = extract_facts_from_messages(messages)
    if not facts:
        # Nothing extracted — nothing to persist, nothing to log.
        return facts, 0
    logger.info("Extracted %d facts from conversation", len(facts))
    saved = save_facts_to_store(facts, fact_store_fn)
    logger.info("Saved %d/%d facts to store", saved, len(facts))
    return facts, saved
|
||||
|
||||
|
||||
def format_facts_summary(facts: List[ExtractedFact]) -> str:
    """Format extracted facts as a readable summary."""
    if not facts:
        return "No facts extracted."

    # Group facts by category, preserving insertion order within each.
    grouped: Dict[str, List[ExtractedFact]] = {}
    for fact in facts:
        grouped.setdefault(fact.category, []).append(fact)

    out = [f"Extracted {len(facts)} facts:", ""]
    for category in sorted(grouped):
        out.append(f"  {category}:")
        out.extend(f"    - {fact.content[:80]}" for fact in grouped[category])
    return "\n".join(out)
|
||||
@@ -12,8 +12,6 @@ from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from hermes_constants import display_hermes_home
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_skill_commands: Dict[str, Dict[str, Any]] = {}
|
||||
@@ -110,7 +108,7 @@ def _inject_skill_config(loaded_skill: dict[str, Any], parts: list[str]) -> None
|
||||
if not resolved:
|
||||
return
|
||||
|
||||
lines = ["", f"[Skill config (from {display_hermes_home()}/config.yaml):"]
|
||||
lines = ["", "[Skill config (from ~/.hermes/config.yaml):"]
|
||||
for key, value in resolved.items():
|
||||
display_val = str(value) if value else "(not set)"
|
||||
lines.append(f" {key} = {display_val}")
|
||||
@@ -170,7 +168,7 @@ def _build_skill_message(
|
||||
subdir_path = skill_dir / subdir
|
||||
if subdir_path.exists():
|
||||
for f in sorted(subdir_path.rglob("*")):
|
||||
if f.is_file() and not f.is_symlink():
|
||||
if f.is_file():
|
||||
rel = str(f.relative_to(skill_dir))
|
||||
supporting.append(rel)
|
||||
|
||||
|
||||
@@ -10,9 +10,9 @@ import os
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple
|
||||
from typing import Any, Dict, List, Set, Tuple
|
||||
|
||||
from hermes_constants import get_config_path, get_skills_dir
|
||||
from hermes_constants import get_hermes_home
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -130,7 +130,7 @@ def get_disabled_skill_names(platform: str | None = None) -> Set[str]:
|
||||
Reads the config file directly (no CLI config imports) to stay
|
||||
lightweight.
|
||||
"""
|
||||
config_path = get_config_path()
|
||||
config_path = get_hermes_home() / "config.yaml"
|
||||
if not config_path.exists():
|
||||
return set()
|
||||
try:
|
||||
@@ -145,11 +145,10 @@ def get_disabled_skill_names(platform: str | None = None) -> Set[str]:
|
||||
if not isinstance(skills_cfg, dict):
|
||||
return set()
|
||||
|
||||
from gateway.session_context import get_session_env
|
||||
resolved_platform = (
|
||||
platform
|
||||
or os.getenv("HERMES_PLATFORM")
|
||||
or get_session_env("HERMES_SESSION_PLATFORM")
|
||||
or os.getenv("HERMES_SESSION_PLATFORM")
|
||||
)
|
||||
if resolved_platform:
|
||||
platform_disabled = (skills_cfg.get("platform_disabled") or {}).get(
|
||||
@@ -178,7 +177,7 @@ def get_external_skills_dirs() -> List[Path]:
|
||||
path. Only directories that actually exist are returned. Duplicates and
|
||||
paths that resolve to the local ``~/.hermes/skills/`` are silently skipped.
|
||||
"""
|
||||
config_path = get_config_path()
|
||||
config_path = get_hermes_home() / "config.yaml"
|
||||
if not config_path.exists():
|
||||
return []
|
||||
try:
|
||||
@@ -200,7 +199,7 @@ def get_external_skills_dirs() -> List[Path]:
|
||||
if not isinstance(raw_dirs, list):
|
||||
return []
|
||||
|
||||
local_skills = get_skills_dir().resolve()
|
||||
local_skills = (get_hermes_home() / "skills").resolve()
|
||||
seen: Set[Path] = set()
|
||||
result: List[Path] = []
|
||||
|
||||
@@ -230,7 +229,7 @@ def get_all_skills_dirs() -> List[Path]:
|
||||
The local dir is always first (and always included even if it doesn't exist
|
||||
yet — callers handle that). External dirs follow in config order.
|
||||
"""
|
||||
dirs = [get_skills_dir()]
|
||||
dirs = [get_hermes_home() / "skills"]
|
||||
dirs.extend(get_external_skills_dirs())
|
||||
return dirs
|
||||
|
||||
@@ -384,7 +383,7 @@ def resolve_skill_config_values(
|
||||
current values (or the declared default if the key isn't set).
|
||||
Path values are expanded via ``os.path.expanduser``.
|
||||
"""
|
||||
config_path = get_config_path()
|
||||
config_path = get_hermes_home() / "config.yaml"
|
||||
config: Dict[str, Any] = {}
|
||||
if config_path.exists():
|
||||
try:
|
||||
@@ -441,25 +440,3 @@ def iter_skill_index_files(skills_dir: Path, filename: str):
|
||||
matches.append(Path(root) / filename)
|
||||
for path in sorted(matches, key=lambda p: str(p.relative_to(skills_dir))):
|
||||
yield path
|
||||
|
||||
|
||||
# ── Namespace helpers for plugin-provided skills ───────────────────────────
|
||||
|
||||
_NAMESPACE_RE = re.compile(r"^[a-zA-Z0-9_-]+$")
|
||||
|
||||
|
||||
def parse_qualified_name(name: str) -> Tuple[Optional[str], str]:
    """Split ``'namespace:skill-name'`` into ``(namespace, bare_name)``.

    Returns ``(None, name)`` when there is no ``':'``.
    """
    # partition() splits on the first ':' only, matching split(":", 1).
    namespace, sep, bare = name.partition(":")
    if not sep:
        return None, name
    return namespace, bare
|
||||
|
||||
|
||||
def is_valid_namespace(candidate: Optional[str]) -> bool:
    """Check whether *candidate* is a valid namespace (``[a-zA-Z0-9_-]+``)."""
    # Empty string and None are both rejected up front.
    if not candidate:
        return False
    # Anchored match of the allowed character class (same pattern the
    # module-level _NAMESPACE_RE compiles).
    return bool(re.match(r"^[a-zA-Z0-9_-]+$", candidate))
|
||||
|
||||
@@ -181,7 +181,6 @@ def resolve_turn_route(user_message: str, routing_config: Optional[Dict[str, Any
|
||||
"api_mode": runtime.get("api_mode"),
|
||||
"command": runtime.get("command"),
|
||||
"args": list(runtime.get("args") or []),
|
||||
"credential_pool": runtime.get("credential_pool"),
|
||||
},
|
||||
"label": f"smart route → {route.get('model')} ({runtime.get('provider')})",
|
||||
"signature": (
|
||||
|
||||
@@ -159,10 +159,7 @@ class SubdirectoryHintTracker:
|
||||
|
||||
def _is_valid_subdir(self, path: Path) -> bool:
|
||||
"""Check if path is a valid directory to scan for hints."""
|
||||
try:
|
||||
if not path.is_dir():
|
||||
return False
|
||||
except OSError:
|
||||
if not path.is_dir():
|
||||
return False
|
||||
if path in self._loaded_dirs:
|
||||
return False
|
||||
@@ -175,10 +172,7 @@ class SubdirectoryHintTracker:
|
||||
found_hints = []
|
||||
for filename in _HINT_FILENAMES:
|
||||
hint_path = directory / filename
|
||||
try:
|
||||
if not hint_path.is_file():
|
||||
continue
|
||||
except OSError:
|
||||
if not hint_path.is_file():
|
||||
continue
|
||||
try:
|
||||
content = hint_path.read_text(encoding="utf-8").strip()
|
||||
|
||||
@@ -1,146 +0,0 @@
|
||||
"""Time-aware model routing for cron jobs.
|
||||
|
||||
Routes cron tasks to more capable models during off-hours when the user
|
||||
is not present to correct errors. Reduces error rates during high-error
|
||||
time windows (e.g., 18:00 evening batches).
|
||||
|
||||
Usage:
|
||||
from agent.time_aware_routing import resolve_time_aware_model
|
||||
model = resolve_time_aware_model(base_model="mimo-v2-pro", is_cron=True)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Dict, Optional
|
||||
|
||||
|
||||
# Error rate data from empirical audit (2026-04-12).
# Hour-of-day (local time) -> observed error rate in percent. Hours with
# higher error rates are candidates for routing to stronger models.
_HIGH_ERROR_HOURS = {
    18: 9.4,  # 18:00 — 9.4% error rate (evening cron batches)
    19: 8.1,
    20: 7.5,
    21: 6.8,
    22: 6.2,
    23: 5.9,
    0: 5.5,
    1: 5.2,
}

# Low error hours — default model is fine.
_LOW_ERROR_HOURS = set(range(6, 18))  # 06:00-17:59

# Default fallback models; env vars allow per-deployment overrides
# without code changes.
_DEFAULT_STRONG_MODEL = os.getenv("CRON_STRONG_MODEL", "xiaomi/mimo-v2-pro")
_DEFAULT_CHEAP_MODEL = os.getenv("CRON_CHEAP_MODEL", "qwen2.5:7b")
_ERROR_THRESHOLD = float(os.getenv("CRON_ERROR_THRESHOLD", "6.0"))  # % error rate
|
||||
|
||||
|
||||
@dataclass
class RoutingDecision:
    """Result of time-aware routing."""
    # Model identifier to use for this task.
    model: str
    # Provider name (may be empty when inherited from the caller).
    provider: str
    # Human-readable explanation of why this model was chosen.
    reason: str
    # Hour of day (0-23) the decision was made for.
    hour: int
    # Expected error rate (%) for that hour.
    error_rate: float
    # True when the hour falls outside the 06:00-17:59 low-error window.
    is_off_hours: bool
|
||||
|
||||
|
||||
def get_hour_error_rate(hour: int) -> float:
    """Get expected error rate for a given hour (0-23)."""
    try:
        return _HIGH_ERROR_HOURS[hour]
    except KeyError:
        # Hours without audit data default to a 4% baseline.
        return 4.0
|
||||
|
||||
|
||||
def is_off_hours(hour: int) -> bool:
    """Check if hour is considered off-hours (higher error rates)."""
    # Anything outside the audited low-error window counts as off-hours.
    if hour in _LOW_ERROR_HOURS:
        return False
    return True
|
||||
|
||||
|
||||
def resolve_time_aware_model(
    base_model: str = "",
    base_provider: str = "",
    is_cron: bool = False,
    hour: Optional[int] = None,
) -> RoutingDecision:
    """Resolve model based on time of day and task type.

    During off-hours (evening/night), routes cron jobs to a stronger
    model to compensate for the lack of human oversight.

    Args:
        base_model: The model that would normally be used.
        base_provider: The provider for the base model.
        is_cron: Whether this is a cron job (vs interactive session).
        hour: Override hour (for testing). Defaults to current hour.

    Returns:
        RoutingDecision with model, provider, and reasoning.
    """
    if hour is None:
        hour = time.localtime().tm_hour

    error_rate = get_hour_error_rate(hour)
    off_hours = is_off_hours(hour)

    def _decision(model, provider, reason, flagged_off_hours):
        # All branches share the same hour/error-rate context.
        return RoutingDecision(
            model=model,
            provider=provider,
            reason=reason,
            hour=hour,
            error_rate=error_rate,
            is_off_hours=flagged_off_hours,
        )

    fallback = base_model or _DEFAULT_CHEAP_MODEL

    # Interactive sessions: a human is present to catch mistakes.
    if not is_cron:
        return _decision(
            fallback,
            base_provider,
            "Interactive session — user can correct errors",
            off_hours,
        )

    # Cron during daytime low-error hours: the base model suffices.
    if not off_hours and error_rate < _ERROR_THRESHOLD:
        return _decision(
            fallback,
            base_provider,
            f"Low-error hours ({hour}:00, {error_rate}% expected)",
            False,
        )

    # Cron during high-error hours: upgrade to the stronger model.
    if error_rate >= _ERROR_THRESHOLD:
        return _decision(
            _DEFAULT_STRONG_MODEL,
            "nous",
            f"High-error hours ({hour}:00, {error_rate}% expected) — using stronger model",
            True,
        )

    # Off-hours but the expected error rate is still acceptable.
    return _decision(
        fallback,
        base_provider,
        f"Off-hours but low error ({hour}:00, {error_rate}%)",
        off_hours,
    )
|
||||
|
||||
|
||||
def get_routing_report() -> str:
    """Get a report of time-based routing decisions for the next 24 hours."""
    lines = [
        "Time-Aware Model Routing (24h forecast)",
        "=" * 40,
        "",
        f"Error threshold: {_ERROR_THRESHOLD}%",
        f"Strong model: {_DEFAULT_STRONG_MODEL}",
        f"Cheap model: {_DEFAULT_CHEAP_MODEL}",
        "",
    ]

    for hour in range(24):
        decision = resolve_time_aware_model(is_cron=True, hour=hour)
        # Green dot for the cheap model, red for anything stronger.
        icon = "\U0001f534" if decision.model != _DEFAULT_CHEAP_MODEL else "\U0001f7e2"
        lines.append(f" {hour:02d}:00 {icon} {decision.model:25s} ({decision.error_rate}% error)")

    return "\n".join(lines)
|
||||
@@ -36,7 +36,7 @@ def generate_title(user_message: str, assistant_response: str, timeout: float =
|
||||
|
||||
try:
|
||||
response = call_llm(
|
||||
task="title_generation",
|
||||
task="compression", # reuse compression task config (cheap/fast model)
|
||||
messages=messages,
|
||||
max_tokens=30,
|
||||
temperature=0.3,
|
||||
|
||||
@@ -1,316 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Token Budget — Poka-yoke guard against silent context overflow.
|
||||
|
||||
Progressive warning system with circuit breakers:
|
||||
- 60%: WARNING — log + suggest summarization
|
||||
- 80%: CAUTION — auto-compress, drop raw tool outputs
|
||||
- 90%: CRITICAL — block verbose tool calls, force wrap-up
|
||||
- 95%: STOP — graceful session termination with summary
|
||||
|
||||
Also provides tool output budgeting to truncate before overflow.
|
||||
|
||||
Usage:
|
||||
from agent.token_budget import TokenBudget
|
||||
|
||||
budget = TokenBudget(context_length=128_000)
|
||||
budget.update(8000) # from API response prompt_tokens
|
||||
|
||||
status = budget.check() # returns BudgetStatus with level + message
|
||||
budget.should_block_tools() # True at 90%+
|
||||
budget.should_terminate() # True at 95%+
|
||||
|
||||
# Tool output budgeting
|
||||
remaining = budget.tool_output_budget()
|
||||
truncated = budget.truncate_tool_output(output_text, max_chars=remaining)
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# ── Thresholds ────────────────────────────────────────────────────────

# Fractions of the context window at which each escalation level begins.
WARN_PERCENT = 0.60
CAUTION_PERCENT = 0.80
CRITICAL_PERCENT = 0.90
STOP_PERCENT = 0.95

# Reserve 5% of context for system prompt, response, and overhead.
RESPONSE_RESERVE_RATIO = 0.05

# Max tool output chars at each level (keys mirror BudgetLevel names).
TOOL_OUTPUT_BUDGETS = {
    "NORMAL": 50_000,
    "WARNING": 20_000,
    "CAUTION": 8_000,
    "CRITICAL": 2_000,
    "STOP": 500,
}
|
||||
|
||||
|
||||
class BudgetLevel(Enum):
    """Escalation level of the token budget, ordered by severity."""

    NORMAL = "NORMAL"
    WARNING = "WARNING"
    CAUTION = "CAUTION"
    CRITICAL = "CRITICAL"
    STOP = "STOP"

    @property
    def percent_threshold(self) -> float:
        """Fraction of the context window at which this level begins."""
        if self is BudgetLevel.WARNING:
            return WARN_PERCENT
        if self is BudgetLevel.CAUTION:
            return CAUTION_PERCENT
        if self is BudgetLevel.CRITICAL:
            return CRITICAL_PERCENT
        if self is BudgetLevel.STOP:
            return STOP_PERCENT
        return 0.0

    @property
    def emoji(self) -> str:
        """Status icon for CLI display (empty string at NORMAL)."""
        if self is BudgetLevel.WARNING:
            return "\u26a0\ufe0f"
        if self is BudgetLevel.CAUTION:
            return "\U0001f525"
        if self is BudgetLevel.CRITICAL or self is BudgetLevel.STOP:
            return "\U0001f6d1"
        return ""
|
||||
|
||||
|
||||
@dataclass
class BudgetStatus:
    """Current token budget status."""
    level: BudgetLevel          # escalation level of this snapshot
    tokens_used: int            # prompt tokens consumed so far
    context_length: int         # total context window size
    percent_used: float         # tokens_used / context_length
    tokens_remaining: int       # usable tokens left after the reserve
    message: str = ""
    should_compress: bool = False
    should_block_tools: bool = False
    should_terminate: bool = False

    def to_indicator(self) -> str:
        """Compact status indicator for CLI display."""
        pct = int(self.percent_used * 100)
        if self.level == BudgetLevel.NORMAL:
            return f"[{pct}%]"
        return f"{self.level.emoji} [{pct}%]"

    def to_bar(self, width: int = 10) -> str:
        """Visual progress bar."""
        filled = int(width * self.percent_used)
        cells = "\u2588" * filled + "\u2591" * (width - filled)
        return f"{self._bar_color()}{cells}\033[0m {int(self.percent_used * 100)}%"

    def _bar_color(self) -> str:
        """ANSI color escape for the bar at the current level."""
        palette = {
            BudgetLevel.STOP: "\033[41m",      # red background
            BudgetLevel.CRITICAL: "\033[31m",  # red
            BudgetLevel.CAUTION: "\033[33m",   # yellow
            BudgetLevel.WARNING: "\033[33m",   # yellow
        }
        # Anything below WARNING renders green.
        return palette.get(self.level, "\033[32m")
|
||||
|
||||
|
||||
class TokenBudget:
    """
    Progressive token budget tracker with poka-yoke circuit breakers.

    Tracks cumulative token usage against a context length and triggers
    escalating actions at each threshold:

    - WARNING:  suggest wrapping up or manual compression
    - CAUTION:  auto-compress; truncate tool outputs
    - CRITICAL: block verbose tools
    - STOP:     gracefully terminate the session
    """

    def __init__(
        self,
        context_length: int,
        warn_percent: float = WARN_PERCENT,
        caution_percent: float = CAUTION_PERCENT,
        critical_percent: float = CRITICAL_PERCENT,
        stop_percent: float = STOP_PERCENT,
        response_reserve_ratio: float = RESPONSE_RESERVE_RATIO,
    ):
        """
        Args:
            context_length: Total context window size in tokens.
            warn_percent: Usage fraction that triggers WARNING.
            caution_percent: Usage fraction that triggers CAUTION.
            critical_percent: Usage fraction that triggers CRITICAL.
            stop_percent: Usage fraction that triggers STOP.
            response_reserve_ratio: Fraction of the window reserved for the
                model's response (excluded from "remaining" counts).
        """
        self.context_length = context_length
        # Absolute token thresholds derived from the configured ratios.
        self.warn_threshold = int(context_length * warn_percent)
        self.caution_threshold = int(context_length * caution_percent)
        self.critical_threshold = int(context_length * critical_percent)
        self.stop_threshold = int(context_length * stop_percent)
        self.response_reserve = int(context_length * response_reserve_ratio)

        self.tokens_used = 0
        self.completions_tokens = 0
        self.total_tool_output_chars = 0
        self._level = BudgetLevel.NORMAL
        self._history: list[int] = []  # prompt-token counts, one entry per update()

    def update(self, prompt_tokens: int, completion_tokens: int = 0) -> BudgetStatus:
        """Update budget from API response usage and return the new status."""
        self.tokens_used = prompt_tokens
        self.completions_tokens = completion_tokens
        self._history.append(prompt_tokens)
        return self.check()

    def check(self) -> BudgetStatus:
        """Evaluate current budget level and return status.

        Fix: the level is now derived from the *instance* thresholds set in
        __init__ rather than the module-level default percentages. Previously
        a caller that passed custom warn/caution/critical/stop ratios would
        see check() disagree with should_compress(), should_block_tools()
        and should_terminate(), which already compared against the instance
        thresholds.
        """
        pct = self.tokens_used / self.context_length if self.context_length > 0 else 0
        remaining = max(0, self.context_length - self.tokens_used - self.response_reserve)

        # Determine level from instance thresholds (absolute token counts).
        # Guard the degenerate context_length <= 0 case, where all
        # thresholds collapse to 0 and would otherwise report STOP.
        if self.context_length <= 0:
            level = BudgetLevel.NORMAL
        elif self.tokens_used >= self.stop_threshold:
            level = BudgetLevel.STOP
        elif self.tokens_used >= self.critical_threshold:
            level = BudgetLevel.CRITICAL
        elif self.tokens_used >= self.caution_threshold:
            level = BudgetLevel.CAUTION
        elif self.tokens_used >= self.warn_threshold:
            level = BudgetLevel.WARNING
        else:
            level = BudgetLevel.NORMAL

        # Log transitions only (don't log every check)
        if level != self._level:
            self._log_transition(level, pct)
            self._level = level

        messages = {
            BudgetLevel.NORMAL: "",
            BudgetLevel.WARNING: (
                f"Context at {int(pct*100)}%. Consider wrapping up soon or using /compress."
            ),
            BudgetLevel.CAUTION: (
                f"Context at {int(pct*100)}%. Auto-compressing. "
                f"Tool outputs will be truncated."
            ),
            BudgetLevel.CRITICAL: (
                f"Context at {int(pct*100)}%. Verbose tools blocked. "
                f"Session approaching limit — please wrap up."
            ),
            BudgetLevel.STOP: (
                f"Context at {int(pct*100)}%. Session must terminate. "
                f"Saving summary before shutdown."
            ),
        }

        return BudgetStatus(
            level=level,
            tokens_used=self.tokens_used,
            context_length=self.context_length,
            percent_used=pct,
            tokens_remaining=remaining,
            message=messages[level],
            should_compress=level in (BudgetLevel.CAUTION, BudgetLevel.CRITICAL, BudgetLevel.STOP),
            should_block_tools=level in (BudgetLevel.CRITICAL, BudgetLevel.STOP),
            should_terminate=level == BudgetLevel.STOP,
        )

    def should_compress(self) -> bool:
        """True at the caution threshold and above — auto-compression should trigger."""
        return self.tokens_used >= self.caution_threshold

    def should_block_tools(self) -> bool:
        """True at the critical threshold and above — verbose tool calls should be blocked."""
        return self.tokens_used >= self.critical_threshold

    def should_terminate(self) -> bool:
        """True at the stop threshold and above — session should gracefully terminate."""
        return self.tokens_used >= self.stop_threshold

    def tool_output_budget(self) -> int:
        """Max chars allowed for next tool output based on current level."""
        status = self.check()
        # TOOL_OUTPUT_BUDGETS is keyed by the level's string value —
        # NOTE(review): confirm its keys match BudgetLevel member values.
        return TOOL_OUTPUT_BUDGETS.get(status.level.value, 50_000)

    def truncate_tool_output(self, output: str, max_chars: Optional[int] = None) -> str:
        """Truncate tool output to fit budget. Adds truncation notice.

        Args:
            output: Raw tool output text.
            max_chars: Character cap; defaults to tool_output_budget().

        Returns:
            The output unchanged if it fits, otherwise a head+tail excerpt
            with an inline "[...N chars truncated...]" notice.
        """
        if max_chars is None:
            max_chars = self.tool_output_budget()

        if len(output) <= max_chars:
            return output

        # Preserve start and end, truncate middle. For very small budgets
        # a middle excerpt isn't worthwhile — keep only the head.
        if max_chars < 200:
            return output[:max_chars] + "\n[...truncated...]"

        head = max_chars // 2
        tail = max_chars - head - 30  # reserve for truncation notice
        truncated = (
            output[:head]
            + f"\n\n[...{len(output) - head - tail:,} chars truncated...]\n\n"
            + output[-tail:]
        )
        return truncated

    def remaining_for_response(self) -> int:
        """Tokens available for the model's response (never negative)."""
        return max(0, self.context_length - self.tokens_used - self.response_reserve)

    def growth_rate(self) -> Optional[float]:
        """Average token increase per turn (from history), or None if <2 samples."""
        if len(self._history) < 2:
            return None
        diffs = [self._history[i] - self._history[i-1] for i in range(1, len(self._history))]
        return sum(diffs) / len(diffs)

    def turns_remaining(self) -> Optional[int]:
        """Estimated turns until context is full, or None if growth is unknown/non-positive."""
        rate = self.growth_rate()
        if rate is None or rate <= 0:
            return None
        remaining = self.context_length - self.tokens_used
        return int(remaining / rate)

    def reset(self):
        """Reset all counters, level, and history for a new session."""
        self.tokens_used = 0
        self.completions_tokens = 0
        self.total_tool_output_chars = 0
        self._level = BudgetLevel.NORMAL
        self._history.clear()

    def _log_transition(self, new_level: BudgetLevel, pct: float):
        """Log budget level transitions at a severity matching the new level."""
        msg = (
            f"Token budget: {self._level.value} -> {new_level.value} "
            f"({self.tokens_used}/{self.context_length} = {pct:.0%})"
        )
        if new_level == BudgetLevel.WARNING:
            logger.warning(msg)
        elif new_level == BudgetLevel.CAUTION:
            logger.warning(msg)
        elif new_level in (BudgetLevel.CRITICAL, BudgetLevel.STOP):
            logger.error(msg)
        else:
            # Downward transitions (e.g. after compression) log at info.
            logger.info(msg)

    def summary(self) -> str:
        """Human-readable multi-line budget summary for CLI display."""
        status = self.check()
        turns = self.turns_remaining()
        rate = self.growth_rate()
        lines = [
            f"Token Budget: {status.tokens_used:,} / {status.context_length:,} ({status.percent_used:.0%})",
            f"Level: {status.level.value}",
            f"Remaining: {status.tokens_remaining:,} tokens",
        ]
        if rate is not None:
            lines.append(f"Growth rate: ~{rate:,.0f} tokens/turn")
        if turns is not None:
            lines.append(f"Estimated turns left: ~{turns}")
        if status.message:
            lines.append(f"Action: {status.message}")
        return "\n".join(lines)
|
||||
|
||||
|
||||
# ── Convenience factory ───────────────────────────────────────────────
|
||||
|
||||
def create_budget(context_length: int, **kwargs) -> TokenBudget:
    """Convenience factory: build a TokenBudget with default thresholds.

    Any keyword arguments are forwarded to the TokenBudget constructor.
    """
    budget = TokenBudget(context_length=context_length, **kwargs)
    return budget
|
||||
@@ -1,156 +0,0 @@
|
||||
"""Tool fixation detection — break repetitive tool calling loops.
|
||||
|
||||
Detects when the agent latches onto one tool and calls it repeatedly
|
||||
without making progress. Injects a nudge prompt to break the loop.
|
||||
|
||||
Usage:
|
||||
from agent.tool_fixation_detector import ToolFixationDetector
|
||||
detector = ToolFixationDetector()
|
||||
nudge = detector.record("execute_code")
|
||||
if nudge:
|
||||
# Inject nudge into conversation
|
||||
messages.append({"role": "system", "content": nudge})
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
|
||||
# Default thresholds
|
||||
_DEFAULT_THRESHOLD = int(os.getenv("TOOL_FIXATION_THRESHOLD", "5"))
|
||||
_DEFAULT_WINDOW = int(os.getenv("TOOL_FIXATION_WINDOW", "10"))
|
||||
|
||||
|
||||
@dataclass
class FixationEvent:
    """Record of a fixation detection.

    Appended to the detector's event list each time a streak reaches the
    configured threshold; consumed by format_report().
    """
    # Tool that was called repeatedly.
    tool_name: str
    # Length of the consecutive-call streak when the event fired.
    streak_length: int
    # Threshold that was in effect at detection time.
    threshold: int
    # Whether a nudge prompt was emitted for this event.
    nudge_sent: bool = False
|
||||
|
||||
|
||||
class ToolFixationDetector:
    """Detects and breaks tool fixation loops.

    Tracks the sequence of tool calls and detects when the same tool
    is called N times consecutively. When detected, returns a nudge
    prompt to inject into the conversation.
    """

    def __init__(self, threshold: int = 0, window: int = 0):
        # A falsy value (0/unset) means "use the env-configured default".
        self.threshold = threshold if threshold else _DEFAULT_THRESHOLD
        self.window = window if window else _DEFAULT_WINDOW
        self._history: List[str] = []      # recent tool names, newest last
        self._current_streak: str = ""     # tool currently on a consecutive run
        self._streak_count: int = 0        # length of that run
        self._nudges_sent: int = 0
        self._events: List[FixationEvent] = []

    @property
    def nudges_sent(self) -> int:
        """Total number of nudge prompts emitted so far."""
        return self._nudges_sent

    @property
    def events(self) -> List[FixationEvent]:
        """Copy of all fixation events recorded so far."""
        return list(self._events)

    def record(self, tool_name: str) -> Optional[str]:
        """Record a tool call and return nudge prompt if fixation detected.

        Args:
            tool_name: Name of the tool that was called.

        Returns:
            Nudge prompt string if fixation detected, None otherwise.
        """
        self._history.append(tool_name)

        # Drop entries that have fallen out of the sliding window.
        overflow = len(self._history) - self.window
        if overflow > 0:
            del self._history[:overflow]

        # Extend or restart the consecutive-call streak.
        if tool_name != self._current_streak:
            self._current_streak = tool_name
            self._streak_count = 1
        else:
            self._streak_count += 1

        # Below the fixation threshold: nothing to do.
        if self._streak_count < self.threshold:
            return None

        # Fixation detected — record the event and emit a nudge.
        # NOTE: a nudge fires on *every* call at/above the threshold,
        # not only the first, matching the nudges_sent counter semantics.
        self._events.append(
            FixationEvent(
                tool_name=tool_name,
                streak_length=self._streak_count,
                threshold=self.threshold,
                nudge_sent=True,
            )
        )
        self._nudges_sent += 1
        return self._build_nudge(tool_name, self._streak_count)

    def _build_nudge(self, tool_name: str, count: int) -> str:
        """Build a nudge prompt to break the fixation loop."""
        header = (
            f"[SYSTEM: You have called `{tool_name}` {count} times in a row "
            f"without switching tools. This suggests a fixation loop. "
            f"Consider:\n"
        )
        suggestions = (
            "1. Is the tool returning an error? Read the error carefully.\n"
            "2. Is there a different tool that could help?\n"
            "3. Should you ask the user for clarification?\n"
            "4. Is the task actually complete?\n"
            "Break the loop by trying a different approach.]"
        )
        return header + suggestions

    def reset(self) -> None:
        """Reset the streak and history state.

        NOTE(review): _events and _nudges_sent are left intact, so reports
        survive a reset — confirm this is intentional.
        """
        self._history.clear()
        self._current_streak = ""
        self._streak_count = 0

    def get_streak_info(self) -> dict:
        """Snapshot of the current streak state for diagnostics."""
        return {
            "current_tool": self._current_streak,
            "streak_count": self._streak_count,
            "threshold": self.threshold,
            "at_threshold": self._streak_count >= self.threshold,
            "nudges_sent": self._nudges_sent,
        }

    def format_report(self) -> str:
        """Format recorded fixation events as a human-readable report."""
        if not self._events:
            return "No tool fixation detected."

        lines = [
            f"Tool Fixation Report ({len(self._events)} events)",
            "=" * 40,
        ]
        lines.extend(
            f"  {e.tool_name}: {e.streak_length} consecutive calls (threshold: {e.threshold})"
            for e in self._events
        )
        return "\n".join(lines)
|
||||
|
||||
|
||||
# Singleton
|
||||
_detector: Optional[ToolFixationDetector] = None
|
||||
|
||||
|
||||
def get_fixation_detector() -> ToolFixationDetector:
    """Return the process-wide detector, creating it on first use."""
    global _detector
    detector = _detector
    if detector is None:
        detector = ToolFixationDetector()
        _detector = detector
    return detector
|
||||
|
||||
|
||||
def reset_fixation_detector() -> None:
    """Reset the singleton.

    Drops the module-level detector instance; the next call to
    get_fixation_detector() creates a fresh one with default settings.
    """
    global _detector
    _detector = None
|
||||
@@ -1,177 +0,0 @@
|
||||
"""Tool Orchestrator — Robust execution and circuit breaking for agent tools.
|
||||
|
||||
Provides a unified execution service that wraps the tool registry.
|
||||
Implements the Circuit Breaker pattern to prevent the agent from getting
|
||||
stuck in failure loops when a specific tool or its underlying service
|
||||
is flapping or down.
|
||||
|
||||
Architecture:
|
||||
Discovery (tools/registry.py) -> Orchestration (agent/tool_orchestrator.py) -> Dispatch
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
import logging
|
||||
import threading
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from tools.registry import registry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class CircuitState:
    """States for the tool circuit breaker.

    Plain string constants rather than an Enum, so the raw values can be
    returned directly in the stats dicts from get_fleet_stats().
    """
    CLOSED = "closed"  # Normal operation
    OPEN = "open"  # Failing, execution blocked
    HALF_OPEN = "half_open"  # Testing if service recovered
|
||||
|
||||
|
||||
@dataclass
class ToolStats:
    """Execution statistics for a tool, maintained by ToolOrchestrator."""
    # Tool name (registry key).
    name: str
    # Current circuit-breaker state (CircuitState constant).
    state: str = CircuitState.CLOSED
    # Consecutive failures since the last success (reset to 0 on success).
    failures: int = 0
    # Total successful executions.
    successes: int = 0
    # time.time() of the most recent failure; used for the OPEN→HALF_OPEN timeout.
    last_failure_time: float = 0
    # Cumulative wall-clock execution time across all calls (seconds).
    total_execution_time: float = 0
    # Total executions, successful or not.
    call_count: int = 0
|
||||
|
||||
|
||||
class ToolOrchestrator:
    """Orchestrates tool execution with robustness patterns.

    Wraps ``registry.dispatch`` with a per-tool circuit breaker:
    CLOSED (normal) -> OPEN after ``failure_threshold`` failures ->
    HALF_OPEN after ``reset_timeout`` seconds -> CLOSED on the next success
    (or back to OPEN on the next failure).
    """

    def __init__(
        self,
        failure_threshold: int = 3,
        reset_timeout: int = 300,
    ):
        """
        Args:
            failure_threshold: Number of failures before opening the circuit.
            reset_timeout: Seconds to wait before transitioning from OPEN to HALF_OPEN.
        """
        self.failure_threshold = failure_threshold
        self.reset_timeout = reset_timeout
        # Per-tool stats, keyed by tool name; guarded by self._lock.
        self._stats: Dict[str, ToolStats] = {}
        self._lock = threading.Lock()

    def _get_stats(self, name: str) -> ToolStats:
        """Get or initialize stats for a tool with thread-safe state transition."""
        with self._lock:
            if name not in self._stats:
                self._stats[name] = ToolStats(name=name)

            stats = self._stats[name]

            # Transition from OPEN to HALF_OPEN if timeout expired
            if stats.state == CircuitState.OPEN:
                if time.time() - stats.last_failure_time > self.reset_timeout:
                    stats.state = CircuitState.HALF_OPEN
                    logger.info("Circuit breaker HALF_OPEN for tool: %s", name)

            return stats

    def _record_success(self, name: str, execution_time: float):
        """Record a successful tool execution and close the circuit."""
        with self._lock:
            stats = self._stats[name]
            stats.successes += 1
            stats.call_count += 1
            stats.total_execution_time += execution_time

            if stats.state != CircuitState.CLOSED:
                logger.info("Circuit breaker CLOSED for tool: %s (recovered)", name)

            # Any success fully closes the circuit and clears the failure count.
            stats.state = CircuitState.CLOSED
            stats.failures = 0

    def _record_failure(self, name: str, execution_time: float):
        """Record a failed tool execution and potentially open the circuit."""
        with self._lock:
            stats = self._stats[name]
            stats.failures += 1
            stats.call_count += 1
            stats.total_execution_time += execution_time
            stats.last_failure_time = time.time()

            # A failure during HALF_OPEN re-opens immediately; otherwise open
            # once the consecutive-failure threshold is reached.
            if stats.state == CircuitState.HALF_OPEN or stats.failures >= self.failure_threshold:
                stats.state = CircuitState.OPEN
                logger.warning(
                    "Circuit breaker OPEN for tool: %s (failures: %d)",
                    name, stats.failures
                )

    def dispatch(self, name: str, args: dict, **kwargs) -> str:
        """Execute a tool via the registry with circuit breaker protection.

        Returns the registry's JSON string result, or a JSON error string
        when the circuit is open or the dispatch itself raised.

        NOTE(review): stats.state is read here after _get_stats releases the
        lock, so a concurrent transition could race this check — confirm
        whether that is acceptable for this workload.
        """
        stats = self._get_stats(name)

        if stats.state == CircuitState.OPEN:
            return json.dumps({
                "error": (
                    f"Tool '{name}' is temporarily unavailable due to repeated failures. "
                    f"Circuit breaker is OPEN. Please try again in a few minutes or use an alternative tool."
                ),
                "circuit_breaker": True,
                "tool_name": name
            })

        start_time = time.time()
        try:
            # Dispatch to the underlying registry
            result_str = registry.dispatch(name, args, **kwargs)
            execution_time = time.time() - start_time

            # Inspect result for errors. registry.dispatch catches internal
            # exceptions and returns a JSON error string.
            is_error = False
            try:
                # Lightweight check for error key in JSON
                # NOTE(review): the substring probe means a *successful*
                # result whose payload merely contains an "error" key is
                # counted as a failure — confirm this is acceptable.
                if '"error":' in result_str:
                    res_json = json.loads(result_str)
                    if isinstance(res_json, dict) and "error" in res_json:
                        is_error = True
            except (json.JSONDecodeError, TypeError):
                # If it's not valid JSON, it's a malformed result (error)
                is_error = True

            if is_error:
                self._record_failure(name, execution_time)
            else:
                self._record_success(name, execution_time)

            return result_str

        except Exception as e:
            # This should rarely be hit as registry.dispatch catches most things,
            # but we guard against orchestrator-level or registry-level bugs.
            execution_time = time.time() - start_time
            self._record_failure(name, execution_time)

            error_msg = f"Tool orchestrator error during {name}: {type(e).__name__}: {e}"
            logger.exception(error_msg)
            return json.dumps({
                "error": error_msg,
                "tool_name": name,
                "execution_time": execution_time
            })

    def get_fleet_stats(self) -> Dict[str, Any]:
        """Return execution statistics for all tools as plain dicts."""
        with self._lock:
            return {
                name: {
                    "state": s.state,
                    "failures": s.failures,
                    "successes": s.successes,
                    "avg_time": s.total_execution_time / s.call_count if s.call_count > 0 else 0,
                    "calls": s.call_count
                }
                for name, s in self._stats.items()
            }
|
||||
|
||||
|
||||
# Global orchestrator instance
|
||||
orchestrator = ToolOrchestrator()
|
||||
@@ -575,6 +575,49 @@ def has_known_pricing(
|
||||
return entry is not None
|
||||
|
||||
|
||||
def get_pricing(
    model_name: str,
    provider: Optional[str] = None,
    base_url: Optional[str] = None,
    api_key: Optional[str] = None,
) -> Dict[str, float]:
    """Backward-compatible thin wrapper for legacy callers.

    Returns only the non-cache input/output rates when a pricing entry
    exists for the resolved route; unknown routes return zeroes.
    """
    entry = get_pricing_entry(model_name, provider=provider, base_url=base_url, api_key=api_key)
    if not entry:
        return {"input": 0.0, "output": 0.0}

    input_rate = entry.input_cost_per_million or _ZERO
    output_rate = entry.output_cost_per_million or _ZERO
    return {"input": float(input_rate), "output": float(output_rate)}
|
||||
|
||||
|
||||
def estimate_cost_usd(
    model: str,
    input_tokens: int,
    output_tokens: int,
    *,
    provider: Optional[str] = None,
    base_url: Optional[str] = None,
    api_key: Optional[str] = None,
) -> float:
    """Backward-compatible helper for legacy callers.

    Only the non-cached input/output buckets are considered. New code
    should call `estimate_usage_cost()` with canonical usage buckets.
    """
    usage = CanonicalUsage(input_tokens=input_tokens, output_tokens=output_tokens)
    result = estimate_usage_cost(
        model,
        usage,
        provider=provider,
        base_url=base_url,
        api_key=api_key,
    )
    return float(result.amount_usd or _ZERO)
|
||||
|
||||
|
||||
def format_duration_compact(seconds: float) -> str:
|
||||
if seconds < 60:
|
||||
|
||||
@@ -1,32 +0,0 @@
|
||||
---
|
||||
# fleet_mtls.yml — Deploy mutual-TLS certificates to all fleet agents.
|
||||
#
|
||||
# Prerequisites:
|
||||
# 1. Run scripts/gen_fleet_ca.sh to create the fleet CA.
|
||||
# 2. For each agent, run:
|
||||
# scripts/gen_agent_cert.sh --agent timmy
|
||||
# scripts/gen_agent_cert.sh --agent allegro
|
||||
# scripts/gen_agent_cert.sh --agent ezra
|
||||
#
|
||||
# Usage:
|
||||
# ansible-playbook -i inventory/fleet.ini ansible/fleet_mtls.yml
|
||||
#
|
||||
# Inventory example (inventory/fleet.ini):
|
||||
# [fleet]
|
||||
# timmy.local agent_name=timmy
|
||||
# allegro.local agent_name=allegro
|
||||
# ezra.local agent_name=ezra
|
||||
#
|
||||
# Refs #806
|
||||
|
||||
- name: Distribute fleet mTLS certificates
|
||||
hosts: fleet
|
||||
become: true
|
||||
vars:
|
||||
_pki_base: "{{ lookup('env', 'HOME') }}/.hermes/pki"
|
||||
roles:
|
||||
- role: hermes_mtls
|
||||
vars:
|
||||
hermes_mtls_local_ca_cert: "{{ _pki_base }}/ca/fleet-ca.crt"
|
||||
hermes_mtls_local_agent_cert: "{{ _pki_base }}/agents/{{ agent_name }}/{{ agent_name }}.crt"
|
||||
hermes_mtls_local_agent_key: "{{ _pki_base }}/agents/{{ agent_name }}/{{ agent_name }}.key"
|
||||
@@ -1,12 +0,0 @@
|
||||
# Example fleet inventory for mutual-TLS cert distribution.
# Copy to fleet.ini and adjust hostnames/IPs.
# The group name must be `fleet` to match `hosts: fleet` in fleet_mtls.yml.
# Refs #806

[fleet]
timmy ansible_host=192.168.1.10
allegro ansible_host=192.168.1.11
ezra ansible_host=192.168.1.12

[fleet:vars]
ansible_user=hermes
ansible_python_interpreter=/usr/bin/python3
|
||||
@@ -1,21 +0,0 @@
|
||||
---
|
||||
# Default paths on the *control node* where certs are read from.
|
||||
# Override these in your inventory / group_vars as needed.
|
||||
|
||||
# Fleet CA certificate (public; safe to push to all nodes)
|
||||
fleet_mtls_ca_cert_src: "{{ lookup('env', 'HOME') }}/.hermes/pki/ca/fleet-ca.crt"
|
||||
|
||||
# Per-agent cert/key source dir on the control node.
|
||||
# Expected layout: <fleet_mtls_agent_certs_dir>/<agent_name>/<agent_name>.{crt,key}
|
||||
fleet_mtls_agent_certs_dir: "{{ lookup('env', 'HOME') }}/.hermes/pki/agents"
|
||||
|
||||
# Remote destination paths on the fleet node
|
||||
fleet_mtls_remote_pki_dir: "/etc/hermes/pki"
|
||||
fleet_mtls_remote_ca_dir: "{{ fleet_mtls_remote_pki_dir }}/ca"
|
||||
fleet_mtls_remote_agent_dir: "{{ fleet_mtls_remote_pki_dir }}/agent"
|
||||
|
||||
# The agent name to deploy (set per-host in inventory, e.g. timmy / allegro / ezra)
|
||||
fleet_mtls_agent_name: "{{ inventory_hostname_short }}"
|
||||
|
||||
# Hermes service name (for reload notification)
|
||||
fleet_mtls_hermes_service: "hermes-a2a"
|
||||
@@ -1,7 +0,0 @@
|
||||
---
|
||||
- name: Restart hermes-a2a
|
||||
ansible.builtin.systemd:
|
||||
name: "{{ fleet_mtls_hermes_service }}"
|
||||
state: restarted
|
||||
when: ansible_service_mgr == "systemd"
|
||||
ignore_errors: true # service may not exist in all environments
|
||||
@@ -1,17 +0,0 @@
|
||||
---
|
||||
galaxy_info:
|
||||
role_name: fleet_mtls_certs
|
||||
author: hermes-agent
|
||||
description: >
|
||||
Distribute fleet CA and per-agent mTLS certificates to Hermes fleet nodes.
|
||||
Part of issue #806 — A2A mutual TLS between fleet agents.
|
||||
min_ansible_version: "2.14"
|
||||
platforms:
|
||||
- name: Debian
|
||||
versions: [bookworm, bullseye]
|
||||
- name: Ubuntu
|
||||
versions: ["22.04", "24.04"]
|
||||
- name: EL
|
||||
versions: ["8", "9"]
|
||||
|
||||
dependencies: []
|
||||
@@ -1,99 +0,0 @@
|
||||
---
|
||||
# fleet_mtls_certs/tasks/main.yml
|
||||
#
|
||||
# Distribute the fleet CA certificate and the per-agent TLS cert+key to
|
||||
# each fleet node. Triggers a hermes-a2a service restart when any cert
|
||||
# changes.
|
||||
#
|
||||
# Refs #806 — A2A mutual TLS between fleet agents.
|
||||
|
||||
- name: Verify agent cert source files exist on control node
|
||||
ansible.builtin.stat:
|
||||
path: "{{ item }}"
|
||||
register: _src_stat
|
||||
delegate_to: localhost
|
||||
loop:
|
||||
- "{{ fleet_mtls_ca_cert_src }}"
|
||||
- "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.crt"
|
||||
- "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.key"
|
||||
loop_control:
|
||||
label: "{{ item | basename }}"
|
||||
|
||||
- name: Fail if any source cert is missing
|
||||
ansible.builtin.fail:
|
||||
msg: >
|
||||
Required cert file not found: {{ item.item }}
|
||||
Run scripts/gen_fleet_ca.sh and scripts/gen_agent_cert.sh --agent {{ fleet_mtls_agent_name }} first.
|
||||
when: not item.stat.exists
|
||||
loop: "{{ _src_stat.results }}"
|
||||
loop_control:
|
||||
label: "{{ item.item | basename }}"
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Remote directory structure
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
- name: Create remote PKI directories
|
||||
ansible.builtin.file:
|
||||
path: "{{ item }}"
|
||||
state: directory
|
||||
owner: root
|
||||
group: root
|
||||
mode: "0750"
|
||||
loop:
|
||||
- "{{ fleet_mtls_remote_pki_dir }}"
|
||||
- "{{ fleet_mtls_remote_ca_dir }}"
|
||||
- "{{ fleet_mtls_remote_agent_dir }}"
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Fleet CA certificate (public — read-only for all)
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
- name: Deploy fleet CA certificate
|
||||
ansible.builtin.copy:
|
||||
src: "{{ fleet_mtls_ca_cert_src }}"
|
||||
dest: "{{ fleet_mtls_remote_ca_dir }}/fleet-ca.crt"
|
||||
owner: root
|
||||
group: root
|
||||
mode: "0644"
|
||||
notify: Restart hermes-a2a
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Per-agent certificate (public portion)
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
- name: Deploy agent certificate
|
||||
ansible.builtin.copy:
|
||||
src: "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.crt"
|
||||
dest: "{{ fleet_mtls_remote_agent_dir }}/agent.crt"
|
||||
owner: root
|
||||
group: root
|
||||
mode: "0644"
|
||||
notify: Restart hermes-a2a
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Per-agent private key (secret — root-only read)
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
- name: Deploy agent private key
|
||||
ansible.builtin.copy:
|
||||
src: "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.key"
|
||||
dest: "{{ fleet_mtls_remote_agent_dir }}/agent.key"
|
||||
owner: root
|
||||
group: root
|
||||
mode: "0600"
|
||||
no_log: true # suppress file content from Ansible output
|
||||
notify: Restart hermes-a2a
|
||||
|
||||
# -----------------------------------------------------------------------
|
||||
# Environment file for hermes-a2a systemd unit
|
||||
# -----------------------------------------------------------------------
|
||||
|
||||
- name: Write hermes-a2a environment file
|
||||
ansible.builtin.template:
|
||||
src: hermes_a2a_env.j2
|
||||
dest: /etc/hermes/a2a.env
|
||||
owner: root
|
||||
group: root
|
||||
mode: "0640"
|
||||
notify: Restart hermes-a2a
|
||||
@@ -1,10 +0,0 @@
|
||||
# Managed by Ansible — fleet_mtls_certs role
|
||||
# Environment variables for the hermes-a2a systemd service.
|
||||
# Source this file in the [Service] section: EnvironmentFile=/etc/hermes/a2a.env
|
||||
|
||||
HERMES_AGENT_NAME={{ fleet_mtls_agent_name }}
|
||||
HERMES_A2A_CERT={{ fleet_mtls_remote_agent_dir }}/agent.crt
|
||||
HERMES_A2A_KEY={{ fleet_mtls_remote_agent_dir }}/agent.key
|
||||
HERMES_A2A_CA={{ fleet_mtls_remote_ca_dir }}/fleet-ca.crt
|
||||
HERMES_A2A_HOST=0.0.0.0
|
||||
HERMES_A2A_PORT=9443
|
||||
@@ -1,21 +0,0 @@
|
||||
---
|
||||
# Ansible role: hermes_mtls
|
||||
# Distributes fleet mTLS certificates to Hermes agent nodes.
|
||||
#
|
||||
# Required variables (set in inventory / group_vars / --extra-vars):
|
||||
# hermes_mtls_local_ca_cert Local path on the Ansible controller to fleet-ca.crt
|
||||
# hermes_mtls_local_agent_cert Local path to this agent's .crt file
|
||||
# hermes_mtls_local_agent_key Local path to this agent's .key file
|
||||
#
|
||||
# Optional overrides:
|
||||
hermes_mtls_cert_dir: /etc/hermes/certs
|
||||
hermes_mtls_cert_owner: hermes
|
||||
hermes_mtls_cert_group: hermes
|
||||
hermes_mtls_cert_mode: "0640"
|
||||
hermes_mtls_ca_cert_mode: "0644"
|
||||
|
||||
# Env file that Hermes reads on startup (systemd EnvironmentFile or .env)
|
||||
hermes_mtls_env_file: /etc/hermes/mtls.env
|
||||
|
||||
# Hermes systemd service name — restarted after cert changes
|
||||
hermes_mtls_service: hermes-gateway
|
||||
@@ -1,7 +0,0 @@
|
||||
---
|
||||
- name: Restart hermes service
|
||||
ansible.builtin.systemd:
|
||||
name: "{{ hermes_mtls_service }}"
|
||||
state: restarted
|
||||
daemon_reload: true
|
||||
when: ansible_service_mgr == "systemd"
|
||||
@@ -1,16 +0,0 @@
|
||||
---
|
||||
galaxy_info:
|
||||
role_name: hermes_mtls
|
||||
author: Hermes Fleet
|
||||
description: Distribute mTLS certificates to Hermes fleet nodes for A2A authentication
|
||||
license: MIT
|
||||
min_ansible_version: "2.14"
|
||||
platforms:
|
||||
- name: Ubuntu
|
||||
versions: ["22.04", "24.04"]
|
||||
- name: Debian
|
||||
versions: ["12"]
|
||||
- name: EL
|
||||
versions: ["9"]
|
||||
|
||||
dependencies: []
|
||||
@@ -1,67 +0,0 @@
|
||||
---
|
||||
# hermes_mtls role — distribute fleet mTLS certificates to a Hermes agent node.
|
||||
#
|
||||
# This role:
|
||||
# 1. Creates the cert directory on the remote node
|
||||
# 2. Copies the Fleet CA cert, agent cert, and agent key
|
||||
# 3. Writes an env file with HERMES_MTLS_* variables
|
||||
# 4. Restarts the Hermes service if any cert changed
|
||||
|
||||
- name: Ensure cert directory exists
|
||||
ansible.builtin.file:
|
||||
path: "{{ hermes_mtls_cert_dir }}"
|
||||
state: directory
|
||||
owner: "{{ hermes_mtls_cert_owner }}"
|
||||
group: "{{ hermes_mtls_cert_group }}"
|
||||
mode: "0750"
|
||||
|
||||
- name: Copy Fleet CA certificate
|
||||
ansible.builtin.copy:
|
||||
src: "{{ hermes_mtls_local_ca_cert }}"
|
||||
dest: "{{ hermes_mtls_cert_dir }}/fleet-ca.crt"
|
||||
owner: "{{ hermes_mtls_cert_owner }}"
|
||||
group: "{{ hermes_mtls_cert_group }}"
|
||||
mode: "{{ hermes_mtls_ca_cert_mode }}"
|
||||
notify: Restart hermes service
|
||||
|
||||
- name: Copy agent TLS certificate
|
||||
ansible.builtin.copy:
|
||||
src: "{{ hermes_mtls_local_agent_cert }}"
|
||||
dest: "{{ hermes_mtls_cert_dir }}/agent.crt"
|
||||
owner: "{{ hermes_mtls_cert_owner }}"
|
||||
group: "{{ hermes_mtls_cert_group }}"
|
||||
mode: "{{ hermes_mtls_cert_mode }}"
|
||||
notify: Restart hermes service
|
||||
|
||||
- name: Copy agent TLS private key
|
||||
ansible.builtin.copy:
|
||||
src: "{{ hermes_mtls_local_agent_key }}"
|
||||
dest: "{{ hermes_mtls_cert_dir }}/agent.key"
|
||||
owner: "{{ hermes_mtls_cert_owner }}"
|
||||
group: "{{ hermes_mtls_cert_group }}"
|
||||
mode: "0600"
|
||||
notify: Restart hermes service
|
||||
|
||||
- name: Write mTLS environment file
|
||||
ansible.builtin.template:
|
||||
src: mtls.env.j2
|
||||
dest: "{{ hermes_mtls_env_file }}"
|
||||
owner: "{{ hermes_mtls_cert_owner }}"
|
||||
group: "{{ hermes_mtls_cert_group }}"
|
||||
mode: "0640"
|
||||
notify: Restart hermes service
|
||||
|
||||
- name: Verify cert files exist on the remote node
|
||||
ansible.builtin.stat:
|
||||
path: "{{ item }}"
|
||||
loop:
|
||||
- "{{ hermes_mtls_cert_dir }}/fleet-ca.crt"
|
||||
- "{{ hermes_mtls_cert_dir }}/agent.crt"
|
||||
- "{{ hermes_mtls_cert_dir }}/agent.key"
|
||||
register: _cert_stat
|
||||
|
||||
- name: Assert all cert files exist
|
||||
ansible.builtin.assert:
|
||||
that: item.stat.exists
|
||||
fail_msg: "Expected cert file missing: {{ item.item }}"
|
||||
loop: "{{ _cert_stat.results }}"
|
||||
@@ -1,8 +0,0 @@
|
||||
# Hermes mTLS environment — generated by hermes_mtls Ansible role
|
||||
# Source this file or use as a systemd EnvironmentFile=
|
||||
# WARNING: This file contains the path to the agent's private key.
|
||||
# Restrict read access to the hermes service user.
|
||||
|
||||
HERMES_MTLS_CERT={{ hermes_mtls_cert_dir }}/agent.crt
|
||||
HERMES_MTLS_KEY={{ hermes_mtls_cert_dir }}/agent.key
|
||||
HERMES_MTLS_CA={{ hermes_mtls_cert_dir }}/fleet-ca.crt
|
||||
@@ -1158,7 +1158,7 @@ def main(
|
||||
providers_order (str): Comma-separated list of OpenRouter providers to try in order (e.g. "anthropic,openai,google")
|
||||
provider_sort (str): Sort providers by "price", "throughput", or "latency" (OpenRouter only)
|
||||
max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
|
||||
reasoning_effort (str): OpenRouter reasoning effort level: "none", "minimal", "low", "medium", "high", "xhigh" (default: "medium")
|
||||
reasoning_effort (str): OpenRouter reasoning effort level: "xhigh", "high", "medium", "low", "minimal", "none" (default: "medium")
|
||||
reasoning_disabled (bool): Completely disable reasoning/thinking tokens (default: False)
|
||||
prefill_messages_file (str): Path to JSON file containing prefill messages (list of {role, content} dicts)
|
||||
max_samples (int): Only process the first N samples from the dataset (optional, processes all if not set)
|
||||
@@ -1227,7 +1227,7 @@ def main(
|
||||
print("🧠 Reasoning: DISABLED (effort=none)")
|
||||
elif reasoning_effort:
|
||||
# Use specified effort level
|
||||
valid_efforts = ["none", "minimal", "low", "medium", "high", "xhigh"]
|
||||
valid_efforts = ["xhigh", "high", "medium", "low", "minimal", "none"]
|
||||
if reasoning_effort not in valid_efforts:
|
||||
print(f"❌ Error: --reasoning_effort must be one of: {', '.join(valid_efforts)}")
|
||||
return
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
# Tool Call Benchmark: Gemma 4 vs mimo-v2-pro
|
||||
|
||||
Date: 2026-04-13
|
||||
Status: Awaiting execution
|
||||
|
||||
## Test Design
|
||||
|
||||
100 diverse tool calls across 7 categories:
|
||||
|
||||
| Category | Count | Tools Tested |
|
||||
|----------|-------|--------------|
|
||||
| File operations | 20 | read_file, write_file, search_files |
|
||||
| Terminal commands | 20 | terminal |
|
||||
| Web search | 15 | web_search |
|
||||
| Code execution | 15 | execute_code |
|
||||
| Browser automation | 10 | browser_navigate |
|
||||
| Delegation | 10 | delegate_task |
|
||||
| MCP tools | 10 | mcp_* |
|
||||
|
||||
## Metrics
|
||||
|
||||
| Metric | mimo-v2-pro | Gemma 4 |
|
||||
|--------|-------------|---------|
|
||||
| Schema parse success | — | — |
|
||||
| Tool execution success | — | — |
|
||||
| Parallel tool success | — | — |
|
||||
| Avg latency (s) | — | — |
|
||||
| Token cost per call | — | — |
|
||||
|
||||
## How to Run
|
||||
|
||||
```bash
|
||||
python3 benchmarks/tool_call_benchmark.py --model nous:xiaomi/mimo-v2-pro
|
||||
python3 benchmarks/tool_call_benchmark.py --model ollama/gemma4:latest
|
||||
python3 benchmarks/tool_call_benchmark.py --compare
|
||||
```
|
||||
|
||||
## Gemma 4-Specific Failure Modes
|
||||
|
||||
To be documented after benchmark execution.
|
||||
@@ -1,194 +0,0 @@
|
||||
[
|
||||
{
|
||||
"id": "screenshot_github_home",
|
||||
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
|
||||
"category": "screenshot",
|
||||
"expected_keywords": ["github", "logo", "mark"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "diagram_mermaid_flow",
|
||||
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
|
||||
"category": "diagram",
|
||||
"expected_keywords": ["flow", "diagram", "process"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_1",
|
||||
"url": "https://picsum.photos/seed/vision1/400/300",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_2",
|
||||
"url": "https://picsum.photos/seed/vision2/400/300",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "chart_simple_bar",
|
||||
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
|
||||
"category": "chart",
|
||||
"expected_keywords": ["bar", "chart", "revenue"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
|
||||
},
|
||||
{
|
||||
"id": "chart_pie",
|
||||
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
|
||||
"category": "chart",
|
||||
"expected_keywords": ["pie", "chart", "percentage"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
|
||||
},
|
||||
{
|
||||
"id": "diagram_org_chart",
|
||||
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
|
||||
"category": "diagram",
|
||||
"expected_keywords": ["organization", "hierarchy", "chart"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "screenshot_terminal",
|
||||
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
|
||||
"category": "screenshot",
|
||||
"expected_keywords": ["terminal", "command", "output"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_3",
|
||||
"url": "https://picsum.photos/seed/vision3/400/300",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "chart_line",
|
||||
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
|
||||
"category": "chart",
|
||||
"expected_keywords": ["line", "chart", "temperature"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
|
||||
},
|
||||
{
|
||||
"id": "diagram_sequence",
|
||||
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
|
||||
"category": "diagram",
|
||||
"expected_keywords": ["sequence", "interaction", "message"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_4",
|
||||
"url": "https://picsum.photos/seed/vision4/400/300",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "screenshot_webpage",
|
||||
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
|
||||
"category": "screenshot",
|
||||
"expected_keywords": ["github", "page", "web"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "chart_radar",
|
||||
"url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
|
||||
"category": "chart",
|
||||
"expected_keywords": ["radar", "chart", "skill"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_5",
|
||||
"url": "https://picsum.photos/seed/vision5/400/300",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "diagram_class",
|
||||
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
|
||||
"category": "diagram",
|
||||
"expected_keywords": ["class", "object", "attribute"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "chart_doughnut",
|
||||
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
|
||||
"category": "chart",
|
||||
"expected_keywords": ["doughnut", "chart", "device"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_6",
|
||||
"url": "https://picsum.photos/seed/vision6/400/300",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "screenshot_error",
|
||||
"url": "https://http.cat/404.jpg",
|
||||
"category": "screenshot",
|
||||
"expected_keywords": ["404", "error", "cat"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
|
||||
},
|
||||
{
|
||||
"id": "diagram_network",
|
||||
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
|
||||
"category": "diagram",
|
||||
"expected_keywords": ["network", "node", "connection"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_7",
|
||||
"url": "https://picsum.photos/seed/vision7/400/300",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "chart_stacked_bar",
|
||||
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
|
||||
"category": "chart",
|
||||
"expected_keywords": ["stacked", "bar", "chart"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
|
||||
},
|
||||
{
|
||||
"id": "screenshot_dashboard",
|
||||
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
|
||||
"category": "screenshot",
|
||||
"expected_keywords": ["search", "code", "feature"],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
},
|
||||
{
|
||||
"id": "photo_random_8",
|
||||
"url": "https://picsum.photos/seed/vision8/400/300",
|
||||
"category": "photo",
|
||||
"expected_keywords": [],
|
||||
"ground_truth_ocr": "",
|
||||
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
|
||||
}
|
||||
]
|
||||
@@ -1,614 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tool-Calling Benchmark — Gemma 4 vs mimo-v2-pro regression test.
|
||||
|
||||
Runs 100 diverse tool-calling prompts through multiple models and compares
|
||||
success rates, latency, and token costs.
|
||||
|
||||
Usage:
|
||||
python3 benchmarks/tool_call_benchmark.py # full 100-call suite
|
||||
python3 benchmarks/tool_call_benchmark.py --limit 10 # quick smoke test
|
||||
python3 benchmarks/tool_call_benchmark.py --models nous # single model
|
||||
python3 benchmarks/tool_call_benchmark.py --category file # single category
|
||||
|
||||
Requires: hermes-agent venv activated, OPENROUTER_API_KEY or equivalent.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import traceback
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
# Ensure hermes-agent root is importable
|
||||
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||
sys.path.insert(0, str(REPO_ROOT))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test Definitions
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
|
||||
class ToolCall:
|
||||
"""A single tool-calling test case."""
|
||||
id: str
|
||||
category: str
|
||||
prompt: str
|
||||
expected_tool: str # tool name we expect the model to call
|
||||
expected_params_check: str = "" # substring expected in JSON args
|
||||
timeout: int = 30 # max seconds per call
|
||||
notes: str = ""
|
||||
|
||||
|
||||
# fmt: off
|
||||
SUITE: list[ToolCall] = [
|
||||
# ── File Operations (20) ──────────────────────────────────────────────
|
||||
ToolCall("file-01", "file", "Read the file /tmp/test_bench.txt and show me its contents.",
|
||||
"read_file", "path"),
|
||||
ToolCall("file-02", "file", "Write 'hello benchmark' to /tmp/test_bench_out.txt",
|
||||
"write_file", "path"),
|
||||
ToolCall("file-03", "file", "Search for the word 'import' in all Python files in the current directory.",
|
||||
"search_files", "pattern"),
|
||||
ToolCall("file-04", "file", "Read lines 1-20 of /etc/hosts",
|
||||
"read_file", "offset"),
|
||||
ToolCall("file-05", "file", "Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'",
|
||||
"patch", "old_string"),
|
||||
ToolCall("file-06", "file", "Search for files matching *.py in the current directory.",
|
||||
"search_files", "target"),
|
||||
ToolCall("file-07", "file", "Read the first 10 lines of /etc/passwd",
|
||||
"read_file", "limit"),
|
||||
ToolCall("file-08", "file", "Write a JSON config to /tmp/bench_config.json with key 'debug': true",
|
||||
"write_file", "content"),
|
||||
ToolCall("file-09", "file", "Search for 'def test_' in Python test files.",
|
||||
"search_files", "file_glob"),
|
||||
ToolCall("file-10", "file", "Read /tmp/bench_config.json and tell me what's in it.",
|
||||
"read_file", "bench_config"),
|
||||
ToolCall("file-11", "file", "Create a file /tmp/bench_readme.md with one line: '# Benchmark'",
|
||||
"write_file", "bench_readme"),
|
||||
ToolCall("file-12", "file", "Search for 'TODO' comments in all .py files.",
|
||||
"search_files", "TODO"),
|
||||
ToolCall("file-13", "file", "Read /tmp/bench_readme.md",
|
||||
"read_file", "bench_readme"),
|
||||
ToolCall("file-14", "file", "Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'",
|
||||
"patch", "Tool Benchmark"),
|
||||
ToolCall("file-15", "file", "Write a Python one-liner to /tmp/bench_hello.py that prints hello.",
|
||||
"write_file", "bench_hello"),
|
||||
ToolCall("file-16", "file", "Search for all .json files in /tmp/.",
|
||||
"search_files", "json"),
|
||||
ToolCall("file-17", "file", "Read /tmp/bench_hello.py and verify it has print('hello').",
|
||||
"read_file", "bench_hello"),
|
||||
ToolCall("file-18", "file", "Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.",
|
||||
"patch", "hello world"),
|
||||
ToolCall("file-19", "file", "List files matching 'bench*' in /tmp/.",
|
||||
"search_files", "bench"),
|
||||
ToolCall("file-20", "file", "Read /tmp/test_bench.txt again and summarize its contents.",
|
||||
"read_file", "test_bench"),
|
||||
|
||||
# ── Terminal Commands (20) ────────────────────────────────────────────
|
||||
ToolCall("term-01", "terminal", "Run `echo hello world` in the terminal.",
|
||||
"terminal", "echo"),
|
||||
ToolCall("term-02", "terminal", "Run `date` to get the current date and time.",
|
||||
"terminal", "date"),
|
||||
ToolCall("term-03", "terminal", "Run `uname -a` to get system information.",
|
||||
"terminal", "uname"),
|
||||
ToolCall("term-04", "terminal", "Run `pwd` to show the current directory.",
|
||||
"terminal", "pwd"),
|
||||
ToolCall("term-05", "terminal", "Run `ls -la /tmp/ | head -20` to list temp files.",
|
||||
"terminal", "head"),
|
||||
ToolCall("term-06", "terminal", "Run `whoami` to show the current user.",
|
||||
"terminal", "whoami"),
|
||||
ToolCall("term-07", "terminal", "Run `df -h` to show disk usage.",
|
||||
"terminal", "df"),
|
||||
ToolCall("term-08", "terminal", "Run `python3 --version` to check Python version.",
|
||||
"terminal", "python3"),
|
||||
ToolCall("term-09", "terminal", "Run `cat /etc/hostname` to get the hostname.",
|
||||
"terminal", "hostname"),
|
||||
ToolCall("term-10", "terminal", "Run `uptime` to see system uptime.",
|
||||
"terminal", "uptime"),
|
||||
ToolCall("term-11", "terminal", "Run `env | grep PATH` to show the PATH variable.",
|
||||
"terminal", "PATH"),
|
||||
ToolCall("term-12", "terminal", "Run `wc -l /etc/passwd` to count lines.",
|
||||
"terminal", "wc"),
|
||||
ToolCall("term-13", "terminal", "Run `echo $SHELL` to show the current shell.",
|
||||
"terminal", "SHELL"),
|
||||
ToolCall("term-14", "terminal", "Run `free -h || vm_stat` to check memory usage.",
|
||||
"terminal", "memory"),
|
||||
ToolCall("term-15", "terminal", "Run `id` to show user and group IDs.",
|
||||
"terminal", "id"),
|
||||
ToolCall("term-16", "terminal", "Run `hostname` to get the machine hostname.",
|
||||
"terminal", "hostname"),
|
||||
ToolCall("term-17", "terminal", "Run `echo {1..5}` to test brace expansion.",
|
||||
"terminal", "echo"),
|
||||
ToolCall("term-18", "terminal", "Run `seq 1 5` to generate a number sequence.",
|
||||
"terminal", "seq"),
|
||||
ToolCall("term-19", "terminal", "Run `python3 -c 'print(2+2)'` to compute 2+2.",
|
||||
"terminal", "print"),
|
||||
ToolCall("term-20", "terminal", "Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.",
|
||||
"terminal", "wc"),
|
||||
|
||||
# ── Code Execution (15) ──────────────────────────────────────────────
|
||||
ToolCall("code-01", "code", "Execute a Python script that computes factorial of 10.",
|
||||
"execute_code", "factorial"),
|
||||
ToolCall("code-02", "code", "Run Python to read /tmp/test_bench.txt and count its words.",
|
||||
"execute_code", "words"),
|
||||
ToolCall("code-03", "code", "Execute Python to generate the first 20 Fibonacci numbers.",
|
||||
"execute_code", "fibonacci"),
|
||||
ToolCall("code-04", "code", "Run Python to parse JSON from a string and print keys.",
|
||||
"execute_code", "json"),
|
||||
ToolCall("code-05", "code", "Execute Python to list all files in /tmp/ matching 'bench*'.",
|
||||
"execute_code", "glob"),
|
||||
ToolCall("code-06", "code", "Run Python to compute the sum of squares from 1 to 100.",
|
||||
"execute_code", "sum"),
|
||||
ToolCall("code-07", "code", "Execute Python to check if 'racecar' is a palindrome.",
|
||||
"execute_code", "palindrome"),
|
||||
ToolCall("code-08", "code", "Run Python to create a CSV string with 5 rows of sample data.",
|
||||
"execute_code", "csv"),
|
||||
ToolCall("code-09", "code", "Execute Python to sort a list [5,2,8,1,9] and print the result.",
|
||||
"execute_code", "sort"),
|
||||
ToolCall("code-10", "code", "Run Python to count lines in /etc/passwd.",
|
||||
"execute_code", "passwd"),
|
||||
ToolCall("code-11", "code", "Execute Python to hash the string 'benchmark' with SHA256.",
|
||||
"execute_code", "sha256"),
|
||||
ToolCall("code-12", "code", "Run Python to get the current UTC timestamp.",
|
||||
"execute_code", "utcnow"),
|
||||
ToolCall("code-13", "code", "Execute Python to convert 'hello world' to uppercase and reverse it.",
|
||||
"execute_code", "upper"),
|
||||
ToolCall("code-14", "code", "Run Python to create a dictionary of system info (platform, python version).",
|
||||
"execute_code", "sys"),
|
||||
ToolCall("code-15", "code", "Execute Python to check internet connectivity by resolving google.com.",
|
||||
"execute_code", "socket"),
|
||||
|
||||
# ── Delegation (10) ──────────────────────────────────────────────────
|
||||
ToolCall("deleg-01", "delegate", "Use a subagent to find all .log files in /tmp/.",
|
||||
"delegate_task", "log"),
|
||||
ToolCall("deleg-02", "delegate", "Delegate to a subagent: what is 15 * 37?",
|
||||
"delegate_task", "15"),
|
||||
ToolCall("deleg-03", "delegate", "Use a subagent to check if Python 3 is installed and its version.",
|
||||
"delegate_task", "python"),
|
||||
ToolCall("deleg-04", "delegate", "Delegate: read /tmp/test_bench.txt and summarize it in one sentence.",
|
||||
"delegate_task", "summarize"),
|
||||
ToolCall("deleg-05", "delegate", "Use a subagent to list the contents of /tmp/ directory.",
|
||||
"delegate_task", "tmp"),
|
||||
ToolCall("deleg-06", "delegate", "Delegate: count the number of .py files in the current directory.",
|
||||
"delegate_task", ".py"),
|
||||
ToolCall("deleg-07", "delegate", "Use a subagent to check disk space with df -h.",
|
||||
"delegate_task", "df"),
|
||||
ToolCall("deleg-08", "delegate", "Delegate: what OS are we running on?",
|
||||
"delegate_task", "os"),
|
||||
ToolCall("deleg-09", "delegate", "Use a subagent to find the hostname of this machine.",
|
||||
"delegate_task", "hostname"),
|
||||
ToolCall("deleg-10", "delegate", "Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.",
|
||||
"delegate_task", "write"),
|
||||
|
||||
# ── Todo / Memory (10 — replacing web/browser/MCP which need external services) ──
|
||||
ToolCall("todo-01", "todo", "Add a todo item: 'Run benchmark suite'",
|
||||
"todo", "benchmark"),
|
||||
ToolCall("todo-02", "todo", "Show me the current todo list.",
|
||||
"todo", ""),
|
||||
ToolCall("todo-03", "todo", "Mark the first todo item as completed.",
|
||||
"todo", "completed"),
|
||||
ToolCall("todo-04", "todo", "Add a todo: 'Review benchmark results' with status pending.",
|
||||
"todo", "Review"),
|
||||
ToolCall("todo-05", "todo", "Clear all completed todos.",
|
||||
"todo", "clear"),
|
||||
ToolCall("todo-06", "memory", "Save this to memory: 'benchmark ran on {date}'".format(
|
||||
date=datetime.now().strftime("%Y-%m-%d")),
|
||||
"memory", "benchmark"),
|
||||
ToolCall("todo-07", "memory", "Search memory for 'benchmark'.",
|
||||
"memory", "benchmark"),
|
||||
ToolCall("todo-08", "memory", "Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.",
|
||||
"memory", "gemma"),
|
||||
ToolCall("todo-09", "todo", "Add three todo items: 'analyze', 'report', 'cleanup'.",
|
||||
"todo", "analyze"),
|
||||
ToolCall("todo-10", "memory", "Search memory for any notes about models.",
|
||||
"memory", "model"),
|
||||
|
||||
# ── Skills (10 — replacing MCP tools which need servers) ─────────────
|
||||
ToolCall("skill-01", "skills", "List all available skills.",
|
||||
"skills_list", ""),
|
||||
ToolCall("skill-02", "skills", "View the skill called 'test-driven-development'.",
|
||||
"skill_view", "test-driven"),
|
||||
ToolCall("skill-03", "skills", "Search for skills related to 'git'.",
|
||||
"skills_list", "git"),
|
||||
ToolCall("skill-04", "skills", "View the 'code-review' skill.",
|
||||
"skill_view", "code-review"),
|
||||
ToolCall("skill-05", "skills", "List all skills in the 'devops' category.",
|
||||
"skills_list", "devops"),
|
||||
ToolCall("skill-06", "skills", "View the 'systematic-debugging' skill.",
|
||||
"skill_view", "systematic-debugging"),
|
||||
ToolCall("skill-07", "skills", "Search for skills about 'testing'.",
|
||||
"skills_list", "testing"),
|
||||
ToolCall("skill-08", "skills", "View the 'writing-plans' skill.",
|
||||
"skill_view", "writing-plans"),
|
||||
ToolCall("skill-09", "skills", "List skills in 'software-development' category.",
|
||||
"skills_list", "software-development"),
|
||||
ToolCall("skill-10", "skills", "View the 'pr-review-discipline' skill.",
|
||||
"skill_view", "pr-review"),
|
||||
|
||||
# ── Additional tests to reach 100 ────────────────────────────────────
|
||||
ToolCall("file-21", "file", "Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].",
|
||||
"write_file", "bench_sort"),
|
||||
ToolCall("file-22", "file", "Read /tmp/bench_sort.py back and confirm it exists.",
|
||||
"read_file", "bench_sort"),
|
||||
ToolCall("file-23", "file", "Search for 'class' in all .py files in the benchmarks directory.",
|
||||
"search_files", "class"),
|
||||
ToolCall("term-21", "terminal", "Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.",
|
||||
"terminal", "os"),
|
||||
ToolCall("term-22", "terminal", "Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.",
|
||||
"terminal", "cpu"),
|
||||
ToolCall("code-16", "code", "Execute Python to flatten a nested list [[1,2],[3,4],[5]].",
|
||||
"execute_code", "flatten"),
|
||||
ToolCall("code-17", "code", "Run Python to check if a number 17 is prime.",
|
||||
"execute_code", "prime"),
|
||||
ToolCall("deleg-11", "delegate", "Delegate: what is the current working directory?",
|
||||
"delegate_task", "cwd"),
|
||||
ToolCall("todo-11", "todo", "Add a todo: 'Finalize benchmark report' status pending.",
|
||||
"todo", "Finalize"),
|
||||
ToolCall("todo-12", "memory", "Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.",
|
||||
"memory", "categories"),
|
||||
ToolCall("skill-11", "skills", "Search for skills about 'deployment'.",
|
||||
"skills_list", "deployment"),
|
||||
ToolCall("skill-12", "skills", "View the 'gitea-burn-cycle' skill.",
|
||||
"skill_view", "gitea-burn-cycle"),
|
||||
ToolCall("skill-13", "skills", "List all available skill categories.",
|
||||
"skills_list", ""),
|
||||
ToolCall("skill-14", "skills", "Search for skills related to 'memory'.",
|
||||
"skills_list", "memory"),
|
||||
ToolCall("skill-15", "skills", "View the 'mimo-swarm' skill.",
|
||||
"skill_view", "mimo-swarm"),
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Runner
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
|
||||
class CallResult:
|
||||
test_id: str
|
||||
category: str
|
||||
model: str
|
||||
prompt: str
|
||||
expected_tool: str
|
||||
success: bool
|
||||
tool_called: Optional[str] = None
|
||||
tool_args_valid: bool = False
|
||||
execution_ok: bool = False
|
||||
latency_s: float = 0.0
|
||||
error: str = ""
|
||||
raw_response: str = ""
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelStats:
|
||||
model: str
|
||||
total: int = 0
|
||||
schema_ok: int = 0 # model produced valid tool call JSON
|
||||
exec_ok: int = 0 # tool actually ran without error
|
||||
latency_sum: float = 0.0
|
||||
failures: list = field(default_factory=list)
|
||||
|
||||
@property
|
||||
def schema_pct(self) -> float:
|
||||
return (self.schema_ok / self.total * 100) if self.total else 0
|
||||
|
||||
@property
|
||||
def exec_pct(self) -> float:
|
||||
return (self.exec_ok / self.total * 100) if self.total else 0
|
||||
|
||||
@property
|
||||
def avg_latency(self) -> float:
|
||||
return (self.latency_sum / self.total) if self.total else 0
|
||||
|
||||
|
||||
def setup_test_files():
|
||||
"""Create prerequisite files for the benchmark."""
|
||||
Path("/tmp/test_bench.txt").write_text(
|
||||
"This is a benchmark test file.\n"
|
||||
"It contains sample data for tool-calling tests.\n"
|
||||
"Line three has some import statements.\n"
|
||||
"import os\nimport sys\nimport json\n"
|
||||
"End of test data.\n"
|
||||
)
|
||||
|
||||
|
||||
def run_single_test(tc: ToolCall, model_spec: str, provider: str) -> CallResult:
|
||||
"""Run a single tool-calling test through the agent."""
|
||||
from run_agent import AIAgent
|
||||
|
||||
result = CallResult(
|
||||
test_id=tc.id,
|
||||
category=tc.category,
|
||||
model=model_spec,
|
||||
prompt=tc.prompt,
|
||||
expected_tool=tc.expected_tool,
|
||||
success=False,
|
||||
)
|
||||
|
||||
try:
|
||||
agent = AIAgent(
|
||||
model=model_spec,
|
||||
provider=provider,
|
||||
max_iterations=3,
|
||||
quiet_mode=True,
|
||||
skip_context_files=True,
|
||||
skip_memory=True,
|
||||
persist_session=False,
|
||||
)
|
||||
|
||||
t0 = time.time()
|
||||
conv = agent.run_conversation(
|
||||
user_message=tc.prompt,
|
||||
system_message=(
|
||||
"You are a benchmark test runner. Execute the user's request by calling "
|
||||
"the appropriate tool. Return the tool result directly. Do not add commentary."
|
||||
),
|
||||
)
|
||||
result.latency_s = round(time.time() - t0, 2)
|
||||
|
||||
messages = conv.get("messages", [])
|
||||
|
||||
# Find the first assistant message with tool_calls
|
||||
tool_called = None
|
||||
tool_args_str = ""
|
||||
for msg in messages:
|
||||
if msg.get("role") == "assistant" and msg.get("tool_calls"):
|
||||
for tc_item in msg["tool_calls"]:
|
||||
fn = tc_item.get("function", {})
|
||||
tool_called = fn.get("name", "")
|
||||
tool_args_str = fn.get("arguments", "{}")
|
||||
break
|
||||
break
|
||||
|
||||
if tool_called:
|
||||
result.tool_called = tool_called
|
||||
result.schema_ok = True
|
||||
|
||||
# Check if the right tool was called
|
||||
if tool_called == tc.expected_tool:
|
||||
result.success = True
|
||||
|
||||
# Check if args contain expected substring
|
||||
if tc.expected_params_check:
|
||||
result.tool_args_valid = tc.expected_params_check in tool_args_str
|
||||
else:
|
||||
result.tool_args_valid = True
|
||||
|
||||
# Check if tool executed (look for tool role message)
|
||||
for msg in messages:
|
||||
if msg.get("role") == "tool":
|
||||
content = msg.get("content", "")
|
||||
if content and "error" not in content.lower()[:50]:
|
||||
result.execution_ok = True
|
||||
break
|
||||
elif content:
|
||||
result.execution_ok = True # got a response, even if error
|
||||
break
|
||||
else:
|
||||
# No tool call produced — still check if model responded
|
||||
final = conv.get("final_response", "")
|
||||
result.raw_response = final[:200] if final else ""
|
||||
|
||||
except Exception as e:
|
||||
result.error = f"{type(e).__name__}: {str(e)[:200]}"
|
||||
result.latency_s = round(time.time() - t0, 2) if 't0' in dir() else 0
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def generate_report(results: list[CallResult], models: list[str], output_path: Path):
    """Generate the markdown benchmark report and write it to *output_path*.

    Args:
        results: One CallResult per (model, test-case) pair.
        models: Model spec strings, in the column order they should appear.
        output_path: Destination file for the markdown report.

    Returns:
        The full report text (also written to disk as UTF-8).
    """
    now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")

    # Aggregate per model.
    stats: dict[str, ModelStats] = {m: ModelStats(model=m) for m in models}

    # category -> model -> results, for the per-category tables below.
    by_category: dict[str, dict[str, list[CallResult]]] = {}

    for r in results:
        s = stats[r.model]
        s.total += 1
        s.schema_ok += int(r.schema_ok)
        s.exec_ok += int(r.execution_ok)
        s.latency_sum += r.latency_s
        if not r.success:
            s.failures.append(r)

        by_category.setdefault(r.category, {}).setdefault(r.model, []).append(r)

    # Constant lines are plain strings (the original used f-strings with no
    # placeholders, which ruff flags as F541).
    lines = [
        "# Tool-Calling Benchmark Report",
        "",
        f"Generated: {now}",
        f"Suite: {len(SUITE)} calls across {len(set(tc.category for tc in SUITE))} categories",
        f"Models tested: {', '.join(models)}",
        "",
        "## Summary",
        "",
        f"| Metric | {' | '.join(models)} |",
        f"|--------|{'|'.join('---------' for _ in models)}|",
    ]

    # Schema parse success
    row = "| Schema parse success | "
    for m in models:
        s = stats[m]
        row += f"{s.schema_ok}/{s.total} ({s.schema_pct:.0f}%) | "
    lines.append(row)

    # Tool execution success
    row = "| Tool execution success | "
    for m in models:
        s = stats[m]
        row += f"{s.exec_ok}/{s.total} ({s.exec_pct:.0f}%) | "
    lines.append(row)

    # Correct tool selected
    row = "| Correct tool selected | "
    for m in models:
        s = stats[m]
        correct = sum(1 for r in results if r.model == m and r.success)
        pct = (correct / s.total * 100) if s.total else 0
        row += f"{correct}/{s.total} ({pct:.0f}%) | "
    lines.append(row)

    # Avg latency
    row = "| Avg latency (s) | "
    for m in models:
        s = stats[m]
        row += f"{s.avg_latency:.2f} | "
    lines.append(row)

    lines.append("")

    # Per-category breakdown
    lines.append("## Per-Category Breakdown")
    lines.append("")

    for cat in sorted(by_category.keys()):
        lines.append(f"### {cat.title()}")
        lines.append("")
        lines.append(f"| Metric | {' | '.join(models)} |")
        lines.append(f"|--------|{'|'.join('---------' for _ in models)}|")

        cat_data = by_category[cat]
        for metric_name, fn in [
            ("Schema OK", lambda r: r.schema_ok),
            ("Exec OK", lambda r: r.execution_ok),
            ("Correct tool", lambda r: r.success),
        ]:
            row = f"| {metric_name} | "
            for m in models:
                results_m = cat_data.get(m, [])
                total = len(results_m)
                ok = sum(1 for r in results_m if fn(r))
                pct = (ok / total * 100) if total else 0
                row += f"{ok}/{total} ({pct:.0f}%) | "
            lines.append(row)

        lines.append("")

    # Failure analysis: one table per model that had any failures.
    lines.append("## Failure Analysis")
    lines.append("")

    any_failures = False
    for m in models:
        s = stats[m]
        if s.failures:
            any_failures = True
            lines.append(f"### {m} — {len(s.failures)} failures")
            lines.append("")
            lines.append("| Test | Category | Expected | Got | Error |")
            lines.append("|------|----------|----------|-----|-------|")
            for r in s.failures:
                got = r.tool_called or "none"
                err = r.error or "wrong tool"
                lines.append(f"| {r.test_id} | {r.category} | {r.expected_tool} | {got} | {err[:60]} |")
            lines.append("")

    if not any_failures:
        lines.append("No failures detected.")
        lines.append("")

    # Raw results JSON; default=str keeps non-JSON types serializable.
    lines.append("## Raw Results")
    lines.append("")
    lines.append("```json")
    lines.append(json.dumps([asdict(r) for r in results], indent=2, default=str))
    lines.append("```")

    report = "\n".join(lines)
    # Explicit encoding so the report (contains "—", "✓") writes the same on
    # every platform instead of depending on the locale default.
    output_path.write_text(report, encoding="utf-8")
    return report
|
||||
|
||||
|
||||
def main():
    """CLI entry point: run the tool-calling benchmark and write a report.

    Exits 0 if every test selected the expected tool, 1 otherwise (or if the
    filters select no tests at all).
    """
    parser = argparse.ArgumentParser(description="Tool-calling benchmark")
    parser.add_argument("--models", nargs="+",
                        default=["nous:gia-3/gemma-4-31b", "nous:mimo-v2-pro"],
                        help="Model specs to test (provider:model)")
    parser.add_argument("--limit", type=int, default=0,
                        help="Run only first N tests (0 = all)")
    parser.add_argument("--category", type=str, default="",
                        help="Run only tests in this category")
    parser.add_argument("--output", type=str, default="",
                        help="Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print test cases without running them")
    args = parser.parse_args()

    # Filter suite
    suite = SUITE[:]
    if args.category:
        suite = [tc for tc in suite if tc.category == args.category]
    if args.limit > 0:
        suite = suite[:args.limit]

    # FIX: an unknown --category used to fall through and crash later with a
    # ZeroDivisionError in the per-model stats print. Fail fast instead.
    if not suite:
        print("No tests match the given filters.")
        sys.exit(1)

    if args.dry_run:
        print(f"Would run {len(suite)} tests:")
        for tc in suite:
            print(f" [{tc.category:8s}] {tc.id}: {tc.expected_tool} — {tc.prompt[:60]}")
        return

    # Setup
    setup_test_files()
    date_str = datetime.now().strftime("%Y-%m-%d")
    output_path = Path(args.output) if args.output else REPO_ROOT / "benchmarks" / f"gemma4-tool-calling-{date_str}.md"

    # Parse model specs of the form "provider:model"; a bare name is used for both.
    model_specs = []
    for spec in args.models:
        parts = spec.split(":", 1)
        provider = parts[0]
        model_name = parts[1] if len(parts) > 1 else parts[0]
        model_specs.append((provider, model_name, spec))

    print(f"Benchmark: {len(suite)} tests × {len(model_specs)} models = {len(suite) * len(model_specs)} calls")
    print(f"Output: {output_path}")
    print()

    all_results: list[CallResult] = []

    for provider, model_name, full_spec in model_specs:
        print(f"── {full_spec} {'─' * (50 - len(full_spec))}")
        model_results = []

        for i, tc in enumerate(suite, 1):
            # \r rewrites the progress line in place; the ✓/✗ suffix below
            # then finishes it with a newline.
            sys.stdout.write(f"\r [{i:3d}/{len(suite)}] {tc.id:10s} {tc.category:8s} → {tc.expected_tool:20s}")
            sys.stdout.flush()

            r = run_single_test(tc, full_spec, provider)
            model_results.append(r)

            status = "✓" if r.success else "✗"
            sys.stdout.write(f" {status} ({r.latency_s:.1f}s)")
            sys.stdout.write("\n")

        all_results.extend(model_results)

        # Quick stats
        ok = sum(1 for r in model_results if r.success)
        print(f" Result: {ok}/{len(model_results)} correct tool selected ({ok/len(model_results)*100:.0f}%)")
        print()

    # Generate report
    model_names = [spec for _, _, spec in model_specs]
    report = generate_report(all_results, model_names, output_path)
    print(f"Report written to {output_path}")

    # Exit code: 0 if all pass, 1 if any failures
    total_fail = sum(1 for r in all_results if not r.success)
    sys.exit(1 if total_fail > 0 else 0)
|
||||
|
||||
|
||||
# Script entry point.
if __name__ == "__main__":
    main()
|
||||
@@ -1,635 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Vision Benchmark Suite — Issue #817
|
||||
|
||||
Compares Gemma 4 vision accuracy vs current approach (Gemini 3 Flash Preview).
|
||||
Measures OCR accuracy, description quality, latency, and token usage.
|
||||
|
||||
Usage:
|
||||
# Run full benchmark
|
||||
python benchmarks/vision_benchmark.py --images benchmarks/test_images.json
|
||||
|
||||
# Single image test
|
||||
python benchmarks/vision_benchmark.py --url https://example.com/image.png
|
||||
|
||||
# Generate test report
|
||||
python benchmarks/vision_benchmark.py --images benchmarks/test_images.json --output benchmarks/vision_results.json
|
||||
|
||||
Test image dataset: benchmarks/test_images.json (50-100 diverse images)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Benchmark configuration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Models to compare
|
||||
# Vision models under comparison, keyed by the short name accepted by the
# --models CLI flag. "provider" selects the API route in analyze_with_model;
# "display_name" is used in printed output and the markdown report.
MODELS = {
    "gemma4": {
        "model_id": "google/gemma-4-27b-it",
        "display_name": "Gemma 4 27B",
        "provider": "nous",
        "description": "Google's multimodal Gemma 4 model",
    },
    "gemini3_flash": {
        "model_id": "google/gemini-3-flash-preview",
        "display_name": "Gemini 3 Flash Preview",
        "provider": "openrouter",
        "description": "Current default vision model",
    },
}

# Evaluation prompts for different test categories.
# Keys match each test image's "category" field; "photo" is the fallback
# used when a category has no dedicated prompt.
EVAL_PROMPTS = {
    "screenshot": "Describe this screenshot in detail. What application is shown? What is the current state of the UI?",
    "diagram": "Describe this diagram completely. What concepts does it illustrate? List all components and their relationships.",
    "photo": "Describe this photo in detail. What objects are visible? What is the scene?",
    "ocr": "Extract ALL text visible in this image. Return it exactly as written, preserving formatting.",
    "chart": "What data does this chart show? List all axes labels, values, and key trends.",
    "document": "Extract all text from this document image. Preserve paragraph structure.",
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Vision model interface
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def analyze_with_model(
    image_url: str,
    prompt: str,
    model_config: dict,
    timeout: float = 120.0,
) -> dict:
    """Call a vision model and return structured results.

    Args:
        image_url: Publicly reachable URL of the image to analyze.
        prompt: Text instruction sent alongside the image.
        model_config: Entry from MODELS ("provider" and "model_id" are read).
        timeout: Per-request HTTP timeout in seconds.

    Returns dict with:
    - analysis: str
    - latency_ms: float
    - tokens: dict (prompt_tokens, completion_tokens, total_tokens)
    - success: bool
    - error: str (if failed)
    """
    # Local import keeps httpx optional for callers that never hit the API.
    import httpx

    provider = model_config["provider"]
    model_id = model_config["model_id"]

    # Prepare messages in the OpenAI-style multimodal content format.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": image_url}},
            ],
        }
    ]

    # Route to provider: known providers get hardcoded endpoints; anything
    # else falls back to <PROVIDER>_API_URL / <PROVIDER>_API_KEY env vars.
    if provider == "openrouter":
        api_url = "https://openrouter.ai/api/v1/chat/completions"
        api_key = os.getenv("OPENROUTER_API_KEY", "")
    elif provider == "nous":
        api_url = "https://inference.nousresearch.com/v1/chat/completions"
        api_key = os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
    else:
        api_url = os.getenv(f"{provider.upper()}_API_URL", "")
        api_key = os.getenv(f"{provider.upper()}_API_KEY", "")

    # Fail fast (not raise) when the key is missing — callers aggregate the
    # error string rather than handling exceptions.
    if not api_key:
        return {
            "analysis": "",
            "latency_ms": 0,
            "tokens": {},
            "success": False,
            "error": f"No API key for provider {provider}",
        }

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    # Low temperature for reproducible benchmark answers.
    payload = {
        "model": model_id,
        "messages": messages,
        "max_tokens": 2000,
        "temperature": 0.1,
    }

    start = time.perf_counter()
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.post(api_url, json=payload, headers=headers)
            resp.raise_for_status()
            data = resp.json()

        # Latency covers the full request/response round-trip.
        latency_ms = (time.perf_counter() - start) * 1000

        # Defensive extraction: missing keys degrade to empty analysis.
        analysis = ""
        choices = data.get("choices", [])
        if choices:
            msg = choices[0].get("message", {})
            analysis = msg.get("content", "")

        usage = data.get("usage", {})
        tokens = {
            "prompt_tokens": usage.get("prompt_tokens", 0),
            "completion_tokens": usage.get("completion_tokens", 0),
            "total_tokens": usage.get("total_tokens", 0),
        }

        return {
            "analysis": analysis,
            "latency_ms": round(latency_ms, 1),
            "tokens": tokens,
            "success": True,
            "error": "",
        }

    except Exception as e:
        # Broad catch is deliberate: network, HTTP-status, and JSON errors
        # all become a structured failure result instead of aborting the run.
        return {
            "analysis": "",
            "latency_ms": round((time.perf_counter() - start) * 1000, 1),
            "tokens": {},
            "success": False,
            "error": str(e),
        }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Evaluation metrics
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def compute_ocr_accuracy(extracted: str, ground_truth: str) -> float:
    """Score OCR output against ground truth on a 0.0-1.0 scale.

    NOTE: this is NOT a true Levenshtein ratio (the original docstring
    claimed one). It is a fast proxy combining:
      * positional character matches (40% weight), and
      * word-level recall of ground-truth words (60% weight).

    Args:
        extracted: Text the model extracted from the image.
        ground_truth: Reference text for the image.

    Returns:
        Similarity in [0.0, 1.0], rounded to 4 decimals; 1.0 = perfect match.
    """
    # Degenerate cases: empty ground truth only matches empty extraction.
    if not ground_truth:
        return 1.0 if not extracted else 0.0
    if not extracted:
        return 0.0

    # Case-insensitive comparison; surrounding whitespace is irrelevant.
    extracted_lower = extracted.lower().strip()
    truth_lower = ground_truth.lower().strip()

    max_len = max(len(extracted_lower), len(truth_lower))
    if max_len == 0:
        return 1.0

    # Positional character overlap (cheap stand-in for edit distance;
    # penalizes both insertions/deletions via the max_len denominator).
    matches = sum(1 for a, b in zip(extracted_lower, truth_lower) if a == b)
    position_ratio = matches / max_len

    # Word-level recall: fraction of ground-truth words found anywhere.
    extracted_words = set(extracted_lower.split())
    truth_words = set(truth_lower.split())
    if truth_words:
        word_recall = len(extracted_words & truth_words) / len(truth_words)
    else:
        word_recall = 1.0 if not extracted_words else 0.0

    return round(position_ratio * 0.4 + word_recall * 0.6, 4)
|
||||
|
||||
|
||||
def compute_description_completeness(analysis: str, expected_keywords: list) -> float:
    """Fraction of *expected_keywords* present in *analysis*, case-insensitive.

    An empty keyword list trivially scores 1.0; an empty analysis scores 0.0.
    The result is rounded to 4 decimal places and lies in [0.0, 1.0].
    """
    if not expected_keywords:
        return 1.0
    if not analysis:
        return 0.0

    haystack = analysis.lower()
    hits = len([kw for kw in expected_keywords if kw.lower() in haystack])
    return round(hits / len(expected_keywords), 4)
|
||||
|
||||
|
||||
def compute_structural_accuracy(analysis: str, expected_structure: dict) -> dict:
    """Score structural properties of *analysis* against *expected_structure*.

    Recognized keys in *expected_structure*:
        min_length    -- minimum character count (default 50)
        min_sentences -- minimum sentence count (default 2)
        has_numbers   -- if truthy, require at least one digit

    Returns:
        Dict of per-element scores, each in [0.0, 1.0].
    """
    scores = {}

    # Length: linear credit up to the required minimum.
    min_length = expected_structure.get("min_length", 50)
    if min_length > 0:
        scores["length"] = min(len(analysis) / min_length, 1.0)
    else:
        scores["length"] = 1.0

    # Sentences: approximated by counting terminal punctuation marks.
    min_sentences = expected_structure.get("min_sentences", 2)
    terminators = sum(analysis.count(ch) for ch in ".!?")
    scores["sentences"] = min(terminators / max(min_sentences, 1), 1.0)

    # Specifics: binary digit check, only when the test case requests it.
    if expected_structure.get("has_numbers", False):
        import re
        scores["has_numbers"] = 1.0 if re.search(r'\d', analysis) else 0.0

    return scores
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Benchmark runner
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def run_single_test(
    image: dict,
    models: dict,
    runs_per_model: int = 1,
) -> dict:
    """Run a single image through all models.

    Args:
        image: dict with url, category, expected_keywords, ground_truth_ocr, etc.
        models: dict of model configs to test
        runs_per_model: number of runs per model (for consistency testing)

    Returns dict with results per model.
    """
    # Prompt is chosen by category; unknown categories fall back to "photo".
    category = image.get("category", "photo")
    prompt = EVAL_PROMPTS.get(category, EVAL_PROMPTS["photo"])
    url = image["url"]

    results = {}

    for model_name, model_config in models.items():
        runs = []
        for run_i in range(runs_per_model):
            result = await analyze_with_model(url, prompt, model_config)
            runs.append(result)
            # Sleep between repeats (not after the last one).
            if run_i < runs_per_model - 1:
                await asyncio.sleep(1)  # Rate limit courtesy

        # Aggregate over the runs that actually succeeded.
        successful = [r for r in runs if r["success"]]
        if successful:
            avg_latency = statistics.mean(r["latency_ms"] for r in successful)
            avg_tokens = statistics.mean(
                r["tokens"].get("total_tokens", 0) for r in successful
            )
            # Use first successful run for accuracy metrics
            primary = successful[0]

            # Accuracy scores are only computed when the test case supplies
            # the corresponding reference data; otherwise they stay None.
            ocr_score = None
            if image.get("ground_truth_ocr"):
                ocr_score = compute_ocr_accuracy(
                    primary["analysis"], image["ground_truth_ocr"]
                )

            keyword_score = None
            if image.get("expected_keywords"):
                keyword_score = compute_description_completeness(
                    primary["analysis"], image["expected_keywords"]
                )

            structural = compute_structural_accuracy(
                primary["analysis"], image.get("expected_structure", {})
            )

            results[model_name] = {
                "success": True,
                "analysis_preview": primary["analysis"][:300],
                "analysis_length": len(primary["analysis"]),
                "avg_latency_ms": round(avg_latency, 1),
                "avg_tokens": round(avg_tokens, 1),
                "ocr_accuracy": ocr_score,
                "keyword_completeness": keyword_score,
                "structural_scores": structural,
                # Std-dev of response lengths across repeats; 0.0 when only
                # one run succeeded (stdev needs >= 2 samples).
                "consistency": round(
                    statistics.stdev(len(r["analysis"]) for r in successful), 1
                ) if len(successful) > 1 else 0.0,
                "runs": len(successful),
                "errors": len(runs) - len(successful),
            }
        else:
            # Every run failed: surface the first error for diagnostics.
            results[model_name] = {
                "success": False,
                "error": runs[0]["error"] if runs else "No runs",
                "runs": 0,
                "errors": len(runs),
            }

    return results
|
||||
|
||||
|
||||
async def run_benchmark_suite(
    images: List[dict],
    models: dict,
    runs_per_model: int = 1,
) -> dict:
    """Run the full benchmark suite.

    Args:
        images: list of image test cases
        models: model configs to compare
        runs_per_model: consistency runs per image

    Returns structured benchmark report (config, per-image results, summary).
    """
    total = len(images)
    all_results = []

    print(f"\nRunning vision benchmark: {total} images x {len(models)} models x {runs_per_model} runs")
    print(f"Models: {', '.join(m['display_name'] for m in models.values())}\n")

    for i, image in enumerate(images):
        img_id = image.get("id", f"img_{i}")
        category = image.get("category", "unknown")
        print(f" [{i+1}/{total}] {img_id} ({category})...", end=" ", flush=True)

        # One sequential call per image; results are tagged with the image
        # metadata so aggregate_results/to_markdown can group them.
        result = await run_single_test(image, models, runs_per_model)
        result["image_id"] = img_id
        result["category"] = category
        all_results.append(result)

        # Quick status line: per-model latency or FAIL.
        statuses = []
        for mname in models:
            if result[mname]["success"]:
                lat = result[mname]["avg_latency_ms"]
                statuses.append(f"{mname}:{lat:.0f}ms")
            else:
                statuses.append(f"{mname}:FAIL")
        print(", ".join(statuses))

    # Aggregate statistics
    summary = aggregate_results(all_results, models)

    return {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "config": {
            "total_images": total,
            "runs_per_model": runs_per_model,
            "models": {k: v["display_name"] for k, v in models.items()},
        },
        "results": all_results,
        "summary": summary,
    }
|
||||
|
||||
|
||||
def aggregate_results(results: List[dict], models: dict) -> dict:
    """Aggregate per-image results into per-model summary statistics.

    Args:
        results: One dict per image, keyed by model name (plus metadata keys).
        models: Model configs; only the keys are used here.

    Returns:
        Mapping of model name -> summary dict with success rate, latency,
        token usage, and accuracy aggregates.
    """
    summary = {}

    for model_name in models:
        ok_runs = [r[model_name] for r in results if r[model_name]["success"]]
        bad_runs = [r[model_name] for r in results if not r[model_name]["success"]]

        # No successful run at all: emit a stub entry and move on.
        if not ok_runs:
            summary[model_name] = {"success_rate": 0, "error": "All runs failed"}
            continue

        latencies = [r["avg_latency_ms"] for r in ok_runs]
        tokens = [r["avg_tokens"] for r in ok_runs if r.get("avg_tokens")]
        ocr_scores = [r["ocr_accuracy"] for r in ok_runs if r.get("ocr_accuracy") is not None]
        keyword_scores = [r["keyword_completeness"] for r in ok_runs if r.get("keyword_completeness") is not None]

        ordered = sorted(latencies)
        summary[model_name] = {
            "success_rate": round(len(ok_runs) / (len(ok_runs) + len(bad_runs)), 4),
            "total_runs": len(ok_runs),
            "total_failures": len(bad_runs),
            "latency": {
                "mean_ms": round(statistics.mean(latencies), 1),
                "median_ms": round(statistics.median(latencies), 1),
                # Nearest-rank p95; int() floors so the index stays in range.
                "p95_ms": round(ordered[int(len(ordered) * 0.95)], 1),
                "std_ms": round(statistics.stdev(latencies), 1) if len(latencies) > 1 else 0,
            },
            "tokens": {
                "mean_total": round(statistics.mean(tokens), 1) if tokens else 0,
                "total_used": sum(int(t) for t in tokens),
            },
            "accuracy": {
                "ocr_mean": round(statistics.mean(ocr_scores), 4) if ocr_scores else None,
                "ocr_count": len(ocr_scores),
                "keyword_mean": round(statistics.mean(keyword_scores), 4) if keyword_scores else None,
                "keyword_count": len(keyword_scores),
            },
        }

    return summary
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Report generation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def to_markdown(report: dict) -> str:
    """Generate human-readable markdown report.

    Args:
        report: The dict returned by run_benchmark_suite (reads the
            "summary" and "config" sections).

    Returns:
        The full markdown document as a single string.
    """
    summary = report["summary"]
    config = report["config"]
    model_names = list(config["models"].values())

    lines = [
        "# Vision Benchmark Report",
        "",
        # [:16] truncates the ISO timestamp to "YYYY-MM-DDTHH:MM".
        f"Generated: {report['generated_at'][:16]}",
        f"Images tested: {config['total_images']}",
        f"Runs per model: {config['runs_per_model']}",
        f"Models: {', '.join(model_names)}",
        "",
        "## Latency Comparison",
        "",
        "| Model | Mean (ms) | Median | P95 | Std Dev |",
        "|-------|-----------|--------|-----|---------|",
    ]

    # One latency row per model; models whose summary lacks latency data
    # (e.g. "All runs failed" stubs) are silently skipped.
    for mkey, mname in config["models"].items():
        if mkey in summary and "latency" in summary[mkey]:
            lat = summary[mkey]["latency"]
            lines.append(
                f"| {mname} | {lat['mean_ms']:.0f} | {lat['median_ms']:.0f} | "
                f"{lat['p95_ms']:.0f} | {lat['std_ms']:.0f} |"
            )

    lines += [
        "",
        "## Accuracy Comparison",
        "",
        "| Model | OCR Accuracy | Keyword Coverage | Success Rate |",
        "|-------|-------------|-----------------|--------------|",
    ]

    # Accuracy rows: None means the metric had no reference data -> "N/A".
    for mkey, mname in config["models"].items():
        if mkey in summary and "accuracy" in summary[mkey]:
            acc = summary[mkey]["accuracy"]
            sr = summary[mkey].get("success_rate", 0)
            ocr = f"{acc['ocr_mean']:.1%}" if acc["ocr_mean"] is not None else "N/A"
            kw = f"{acc['keyword_mean']:.1%}" if acc["keyword_mean"] is not None else "N/A"
            lines.append(f"| {mname} | {ocr} | {kw} | {sr:.1%} |")

    lines += [
        "",
        "## Token Usage",
        "",
        "| Model | Mean Tokens/Image | Total Tokens |",
        "|-------|------------------|--------------|",
    ]

    for mkey, mname in config["models"].items():
        if mkey in summary and "tokens" in summary[mkey]:
            tok = summary[mkey]["tokens"]
            lines.append(
                f"| {mname} | {tok['mean_total']:.0f} | {tok['total_used']} |"
            )

    # Verdict
    lines += ["", "## Verdict", ""]

    # Pick the best model by a weighted composite of the three headline
    # metrics; missing sub-scores count as 0 via the `or 0` fallback.
    best_model = None
    best_score = -1
    for mkey, mname in config["models"].items():
        if mkey not in summary or "accuracy" not in summary[mkey]:
            continue
        acc = summary[mkey]["accuracy"]
        sr = summary[mkey].get("success_rate", 0)
        ocr = acc["ocr_mean"] or 0
        kw = acc["keyword_mean"] or 0
        # Weighted composite: 40% OCR, 30% keyword, 30% success rate
        score = (ocr * 0.4 + kw * 0.3 + sr * 0.3)
        if score > best_score:
            best_score = score
            best_model = mname

    if best_model:
        lines.append(f"**Best overall: {best_model}** (composite score: {best_score:.1%})")
    else:
        lines.append("No clear winner — insufficient data.")

    return "\n".join(lines)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test dataset management
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def generate_sample_dataset() -> List[dict]:
    """Generate a sample test dataset with diverse public images.

    Each entry carries: id, url, category (selects the EVAL_PROMPTS entry),
    expected_keywords (for completeness scoring) and expected_structure
    (for structural scoring).

    Returns list of test image definitions.
    """
    return [
        # Screenshots
        {
            "id": "screenshot_github",
            "url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
            "category": "screenshot",
            "expected_keywords": ["github", "logo", "octocat"],
            "expected_structure": {"min_length": 50, "min_sentences": 2},
        },
        # Diagrams
        {
            "id": "diagram_architecture",
            "url": "https://mermaid.ink/img/pako:eNp9kMtOwzAQRX_F8hKpJbhJFVJBi1QJiMWCG8eZNsGJLdlOiqIid5RdufiHnZRA7GbuzJwZe4ZGH2SCBPYUwgxoQKvJnCR2YY0F5YBdJJkD4uX0oXB6PnF3U4zCWcWdW3FqOwGvCKkBmHKSTB2gJeRrLTeJLfJdJKkBGYf9P1sTNdUXVJqY3YNJK7xLVwR0mxJFU6rCgEKnhSGIL2Eq8BdEERAX0OGwEiVQ1R0MaNFR8QfqKxmHigbX8VLjDz_Q0L8Wc_qPxDw",
            "category": "diagram",
            "expected_keywords": ["architecture", "component", "service"],
            "expected_structure": {"min_length": 100, "min_sentences": 3},
        },
        # Photos (no keyword expectations: picsum serves a random seed image)
        {
            "id": "photo_nature",
            "url": "https://picsum.photos/seed/bench1/400/300",
            "category": "photo",
            "expected_keywords": [],
            "expected_structure": {"min_length": 30, "min_sentences": 1},
        },
        # Charts
        {
            "id": "chart_bar",
            "url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Users',data:[50,60,70,80]}]}}",
            "category": "chart",
            "expected_keywords": ["bar", "chart", "data"],
            "expected_structure": {"min_length": 50, "min_sentences": 2},
        },
    ]
|
||||
|
||||
|
||||
def load_dataset(path: str) -> List[dict]:
    """Load a test-image dataset from a JSON file.

    Args:
        path: Path to a JSON file containing a list of image test cases.

    Returns:
        The parsed list of test-case dicts.
    """
    # Explicit encoding: JSON is UTF-8 by spec; don't depend on the locale
    # default (which is not UTF-8 on some platforms).
    with open(path, encoding="utf-8") as f:
        return json.load(f)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def main():
    """CLI entry point for the vision benchmark.

    Modes (mutually exclusive, checked in this order):
      --generate-dataset : write the sample dataset JSON and exit
      --url              : benchmark a single image
      --images           : benchmark a dataset file
    """
    parser = argparse.ArgumentParser(description="Vision Benchmark Suite (Issue #817)")
    parser.add_argument("--images", help="Path to test images JSON file")
    parser.add_argument("--url", help="Single image URL to test")
    parser.add_argument("--category", default="photo", help="Category for single URL")
    parser.add_argument("--output", default=None, help="Output JSON file")
    parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")
    parser.add_argument("--models", nargs="+", default=None,
                        help="Models to test (default: all)")
    parser.add_argument("--markdown", action="store_true", help="Output markdown report")
    parser.add_argument("--generate-dataset", action="store_true",
                        help="Generate sample dataset and exit")
    args = parser.parse_args()

    if args.generate_dataset:
        dataset = generate_sample_dataset()
        # --images doubles as the output path in this mode.
        out_path = args.images or "benchmarks/test_images.json"
        os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
        with open(out_path, "w") as f:
            json.dump(dataset, f, indent=2)
        print(f"Generated sample dataset: {out_path} ({len(dataset)} images)")
        return

    # Select models: restrict to the requested keys, or test all of MODELS.
    if args.models:
        selected = {k: v for k, v in MODELS.items() if k in args.models}
    else:
        selected = MODELS

    # Load images (--url wins over --images when both are given).
    if args.url:
        images = [{"id": "single", "url": args.url, "category": args.category}]
    elif args.images:
        images = load_dataset(args.images)
    else:
        print("ERROR: Provide --images or --url")
        sys.exit(1)

    # Run benchmark
    report = await run_benchmark_suite(images, selected, args.runs)

    # Output: JSON file if requested; markdown to stdout if requested or
    # when no JSON output was written (so the run always shows results).
    if args.output:
        os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
        with open(args.output, "w") as f:
            json.dump(report, f, indent=2)
        print(f"\nResults saved to {args.output}")

    if args.markdown or not args.output:
        print("\n" + to_markdown(report))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # main() is a coroutine, so drive it with asyncio.run().
    asyncio.run(main())
|
||||
@@ -24,8 +24,6 @@ model:
|
||||
# "minimax" - MiniMax global (requires: MINIMAX_API_KEY)
|
||||
# "minimax-cn" - MiniMax China (requires: MINIMAX_CN_API_KEY)
|
||||
# "huggingface" - Hugging Face Inference (requires: HF_TOKEN)
|
||||
# "xiaomi" - Xiaomi MiMo (requires: XIAOMI_API_KEY)
|
||||
# "arcee" - Arcee AI Trinity models (requires: ARCEEAI_API_KEY)
|
||||
# "kilocode" - KiloCode gateway (requires: KILOCODE_API_KEY)
|
||||
# "ai-gateway" - Vercel AI Gateway (requires: AI_GATEWAY_API_KEY)
|
||||
#
|
||||
@@ -50,25 +48,6 @@ model:
|
||||
# api_key: "your-key-here" # Uncomment to set here instead of .env
|
||||
base_url: "https://openrouter.ai/api/v1"
|
||||
|
||||
# ── Token limits — two settings, easy to confuse ──────────────────────────
|
||||
#
|
||||
# context_length: TOTAL context window (input + output tokens combined).
|
||||
# Controls when Hermes compresses history and validates requests.
|
||||
# Leave unset — Hermes auto-detects the correct value from the provider.
|
||||
# Set manually only when auto-detection is wrong (e.g. a local server with
|
||||
# a custom num_ctx, or a proxy that doesn't expose /v1/models).
|
||||
#
|
||||
# context_length: 131072
|
||||
#
|
||||
# max_tokens: OUTPUT cap — maximum tokens the model may generate per response.
|
||||
# Unrelated to how long your conversation history can be.
|
||||
# The OpenAI-standard name "max_tokens" is a misnomer; Anthropic's native
|
||||
# API has since renamed it "max_output_tokens" for clarity.
|
||||
# Leave unset to use the model's native output ceiling (recommended).
|
||||
# Set only if you want to deliberately limit individual response length.
|
||||
#
|
||||
# max_tokens: 8192
|
||||
|
||||
# =============================================================================
|
||||
# OpenRouter Provider Routing (only applies when using OpenRouter)
|
||||
# =============================================================================
|
||||
@@ -138,8 +117,7 @@ terminal:
|
||||
timeout: 180
|
||||
docker_mount_cwd_to_workspace: false # SECURITY: off by default. Opt in to mount the launch cwd into Docker /workspace.
|
||||
lifetime_seconds: 300
|
||||
# sudo_password: "hunter2" # Optional: pipe a sudo password via sudo -S. SECURITY WARNING: plaintext.
|
||||
# sudo_password: "" # Explicit empty password: try empty and never open the interactive sudo prompt.
|
||||
# sudo_password: "" # Enable sudo commands (pipes via sudo -S) - SECURITY WARNING: plaintext!
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# OPTION 2: SSH remote execution
|
||||
@@ -230,18 +208,13 @@ terminal:
|
||||
#
|
||||
# SECURITY WARNING: Password stored in plaintext!
|
||||
#
|
||||
# INTERACTIVE PROMPT: If sudo_password is unset and the CLI is running,
|
||||
# INTERACTIVE PROMPT: If no sudo_password is set and the CLI is running,
|
||||
# you'll be prompted to enter your password when sudo is needed:
|
||||
# - 45-second timeout (auto-skips if no input)
|
||||
# - Press Enter to skip (command fails gracefully)
|
||||
# - Password is hidden while typing
|
||||
# - Password is cached for the session
|
||||
#
|
||||
# EMPTY PASSWORDS: Setting sudo_password to an explicit empty string is different
|
||||
# from leaving it unset. Hermes will try an empty password via `sudo -S` and
|
||||
# will not open the interactive prompt. This is useful for passwordless sudo,
|
||||
# Touch ID sudo setups, and environments where prompting is just noise.
|
||||
#
|
||||
# ALTERNATIVES:
|
||||
# - SSH backend: Configure passwordless sudo on the remote server
|
||||
# - Containers: Run as root inside the container (no sudo needed)
|
||||
@@ -310,8 +283,15 @@ compression:
|
||||
# compression of older turns.
|
||||
protect_last_n: 20
|
||||
|
||||
# To pin a specific model/provider for compression summaries, use the
|
||||
# auxiliary section below (auxiliary.compression.provider / model).
|
||||
# Model to use for generating summaries (fast/cheap recommended)
|
||||
# This model compresses the middle turns into a concise summary.
|
||||
# IMPORTANT: it receives the full middle section of the conversation, so it
|
||||
# MUST support a context length at least as large as your main model's.
|
||||
summary_model: "google/gemini-3-flash-preview"
|
||||
|
||||
# Provider for the summary model (default: "auto")
|
||||
# Options: "auto", "openrouter", "nous", "main"
|
||||
# summary_provider: "auto"
|
||||
|
||||
# =============================================================================
|
||||
# Auxiliary Models (Advanced — Experimental)
|
||||
@@ -465,22 +445,6 @@ agent:
|
||||
# Higher = more room for complex tasks, but costs more tokens
|
||||
# Recommended: 20-30 for focused tasks, 50-100 for open exploration
|
||||
max_turns: 60
|
||||
|
||||
# Inactivity timeout for gateway agent runs (seconds, 0 = unlimited).
|
||||
# The agent can run indefinitely when actively calling tools or receiving
|
||||
# API responses. Only fires after the agent has been idle for this duration.
|
||||
# gateway_timeout: 1800
|
||||
|
||||
# Staged warning: send a warning before escalating to full timeout.
|
||||
# Fires once per run when inactivity reaches this threshold (seconds).
|
||||
# Set to 0 to disable the warning.
|
||||
# gateway_timeout_warning: 900
|
||||
|
||||
# Graceful drain timeout for gateway stop/restart (seconds).
|
||||
# The gateway stops accepting new work, waits for in-flight agents to
|
||||
# finish, then interrupts anything still running after this timeout.
|
||||
# 0 = no drain, interrupt immediately.
|
||||
# restart_drain_timeout: 60
|
||||
|
||||
# Enable verbose logging
|
||||
verbose: false
|
||||
@@ -523,7 +487,7 @@ agent:
|
||||
# - A preset like "hermes-cli" or "hermes-telegram" (curated tool set)
|
||||
# - A list of individual toolsets to compose your own (see list below)
|
||||
#
|
||||
# Supported platform keys: cli, telegram, discord, whatsapp, slack, qqbot
|
||||
# Supported platform keys: cli, telegram, discord, whatsapp, slack
|
||||
#
|
||||
# Examples:
|
||||
#
|
||||
@@ -552,7 +516,6 @@ agent:
|
||||
# slack: hermes-slack (same as telegram)
|
||||
# signal: hermes-signal (same as telegram)
|
||||
# homeassistant: hermes-homeassistant (same as telegram)
|
||||
# qqbot: hermes-qqbot (same as telegram)
|
||||
#
|
||||
platform_toolsets:
|
||||
cli: [hermes-cli]
|
||||
@@ -562,7 +525,6 @@ platform_toolsets:
|
||||
slack: [hermes-slack]
|
||||
signal: [hermes-signal]
|
||||
homeassistant: [hermes-homeassistant]
|
||||
qqbot: [hermes-qqbot]
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Available toolsets (use these names in platform_toolsets or the toolsets list)
|
||||
@@ -585,7 +547,7 @@ platform_toolsets:
|
||||
# skills_hub - skill_hub (search/install/manage from online registries — user-driven only)
|
||||
# moa - mixture_of_agents (requires OPENROUTER_API_KEY)
|
||||
# todo - todo (in-memory task planning, no deps)
|
||||
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX/MISTRAL key)
|
||||
# tts - text_to_speech (Edge TTS free, or ELEVENLABS/OPENAI/MINIMAX key)
|
||||
# cronjob - cronjob (create/list/update/pause/resume/run/remove scheduled tasks)
|
||||
# rl - rl_list_environments, rl_start_training, etc. (requires TINKER_API_KEY)
|
||||
#
|
||||
@@ -614,7 +576,7 @@ platform_toolsets:
|
||||
# todo - Task planning and tracking for multi-step work
|
||||
# memory - Persistent memory across sessions (personal notes + user profile)
|
||||
# session_search - Search and recall past conversations (FTS5 + Gemini Flash summarization)
|
||||
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax, Mistral)
|
||||
# tts - Text-to-speech (Edge TTS free, ElevenLabs, OpenAI, MiniMax)
|
||||
# cronjob - Schedule and manage automated tasks (CLI-only)
|
||||
# rl - RL training tools (Tinker-Atropos)
|
||||
#
|
||||
@@ -687,11 +649,7 @@ platform_toolsets:
|
||||
stt:
|
||||
enabled: true
|
||||
# provider: "local" # auto-detected if omitted
|
||||
local:
|
||||
model: "base" # tiny | base | small | medium | large-v3 | turbo
|
||||
# language: "" # auto-detect; set to "en", "es", "fr", etc. to force
|
||||
openai:
|
||||
model: "whisper-1" # whisper-1 | gpt-4o-mini-transcribe | gpt-4o-transcribe
|
||||
model: "whisper-1" # whisper-1 (cheapest) | gpt-4o-mini-transcribe | gpt-4o-transcribe
|
||||
# mistral:
|
||||
# model: "voxtral-mini-latest" # voxtral-mini-latest | voxtral-mini-2602
|
||||
|
||||
@@ -770,11 +728,6 @@ display:
|
||||
# Toggle at runtime with /verbose in the CLI
|
||||
tool_progress: all
|
||||
|
||||
# Gateway-only natural mid-turn assistant updates.
|
||||
# When true, completed assistant status messages are sent as separate chat
|
||||
# messages. This is independent of tool_progress and gateway streaming.
|
||||
interim_assistant_messages: true
|
||||
|
||||
# What Enter does when Hermes is already busy in the CLI.
|
||||
# interrupt: Interrupt the current run and redirect Hermes (default)
|
||||
# queue: Queue your message for the next turn
|
||||
@@ -783,7 +736,7 @@ display:
|
||||
|
||||
# Background process notifications (gateway/messaging only).
|
||||
# Controls how chatty the process watcher is when you use
|
||||
# terminal(background=true, notify_on_complete=true) from Telegram/Discord/etc.
|
||||
# terminal(background=true, check_interval=...) from Telegram/Discord/etc.
|
||||
# off: No watcher messages at all
|
||||
# result: Only the final completion message
|
||||
# error: Only the final message when exit code != 0
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
# Termux / Android dependency constraints for Hermes Agent.
|
||||
#
|
||||
# Usage:
|
||||
# python -m pip install -e '.[termux]' -c constraints-termux.txt
|
||||
#
|
||||
# These pins keep the tested Android install path stable when upstream packages
|
||||
# move faster than Termux-compatible wheels / sdists.
|
||||
|
||||
ipython<10
|
||||
jedi>=0.18.1,<0.20
|
||||
parso>=0.8.4,<0.9
|
||||
stack-data>=0.6,<0.7
|
||||
pexpect>4.3,<5
|
||||
matplotlib-inline>=0.1.7,<0.2
|
||||
asttokens>=2.1,<3
|
||||
17
cron/jobs.py
17
cron/jobs.py
@@ -31,7 +31,7 @@ except ImportError:
|
||||
# Configuration
|
||||
# =============================================================================
|
||||
|
||||
HERMES_DIR = get_hermes_home().resolve()
|
||||
HERMES_DIR = get_hermes_home()
|
||||
CRON_DIR = HERMES_DIR / "cron"
|
||||
JOBS_FILE = CRON_DIR / "jobs.json"
|
||||
OUTPUT_DIR = CRON_DIR / "output"
|
||||
@@ -338,12 +338,10 @@ def load_jobs() -> List[Dict[str, Any]]:
|
||||
save_jobs(jobs)
|
||||
logger.warning("Auto-repaired jobs.json (had invalid control characters)")
|
||||
return jobs
|
||||
except Exception as e:
|
||||
logger.error("Failed to auto-repair jobs.json: %s", e)
|
||||
raise RuntimeError(f"Cron database corrupted and unrepairable: {e}") from e
|
||||
except IOError as e:
|
||||
logger.error("IOError reading jobs.json: %s", e)
|
||||
raise RuntimeError(f"Failed to read cron database: {e}") from e
|
||||
except Exception:
|
||||
return []
|
||||
except IOError:
|
||||
return []
|
||||
|
||||
|
||||
def save_jobs(jobs: List[Dict[str, Any]]):
|
||||
@@ -454,7 +452,6 @@ def create_job(
|
||||
"last_run_at": None,
|
||||
"last_status": None,
|
||||
"last_error": None,
|
||||
"last_delivery_error": None,
|
||||
# Delivery configuration
|
||||
"deliver": deliver,
|
||||
"origin": origin, # Tracks where job was created for "origin" delivery
|
||||
@@ -623,8 +620,8 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None,
|
||||
|
||||
save_jobs(jobs)
|
||||
return
|
||||
|
||||
logger.warning("mark_job_run: job_id %s not found, skipping save", job_id)
|
||||
|
||||
save_jobs(jobs)
|
||||
|
||||
|
||||
def advance_next_run(job_id: str) -> bool:
|
||||
|
||||
@@ -44,8 +44,7 @@ logger = logging.getLogger(__name__)
|
||||
_KNOWN_DELIVERY_PLATFORMS = frozenset({
|
||||
"telegram", "discord", "slack", "whatsapp", "signal",
|
||||
"matrix", "mattermost", "homeassistant", "dingtalk", "feishu",
|
||||
"wecom", "wecom_callback", "weixin", "sms", "email", "webhook", "bluebubbles",
|
||||
"qqbot",
|
||||
"wecom", "sms", "email", "webhook",
|
||||
})
|
||||
|
||||
from cron.jobs import get_due_jobs, mark_job_run, save_job_output, advance_next_run
|
||||
@@ -92,7 +91,7 @@ def _resolve_delivery_target(job: dict) -> Optional[dict]:
|
||||
}
|
||||
# Origin missing (e.g. job created via API/script) — try each
|
||||
# platform's home channel as a fallback instead of silently dropping.
|
||||
for platform_name in ("matrix", "telegram", "discord", "slack", "bluebubbles"):
|
||||
for platform_name in ("matrix", "telegram", "discord", "slack"):
|
||||
chat_id = os.getenv(f"{platform_name.upper()}_HOME_CHANNEL", "")
|
||||
if chat_id:
|
||||
logger.info(
|
||||
@@ -220,21 +219,6 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option
|
||||
chat_id = target["chat_id"]
|
||||
thread_id = target.get("thread_id")
|
||||
|
||||
# Diagnostic: log thread_id for topic-aware delivery debugging
|
||||
origin = job.get("origin") or {}
|
||||
origin_thread = origin.get("thread_id")
|
||||
if origin_thread and not thread_id:
|
||||
logger.warning(
|
||||
"Job '%s': origin has thread_id=%s but delivery target lost it "
|
||||
"(deliver=%s, target=%s)",
|
||||
job["id"], origin_thread, job.get("deliver", "local"), target,
|
||||
)
|
||||
elif thread_id:
|
||||
logger.debug(
|
||||
"Job '%s': delivering to %s:%s thread_id=%s",
|
||||
job["id"], platform_name, chat_id, thread_id,
|
||||
)
|
||||
|
||||
from tools.send_message_tool import _send_to_platform
|
||||
from gateway.config import load_gateway_config, Platform
|
||||
|
||||
@@ -250,12 +234,8 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option
|
||||
"dingtalk": Platform.DINGTALK,
|
||||
"feishu": Platform.FEISHU,
|
||||
"wecom": Platform.WECOM,
|
||||
"wecom_callback": Platform.WECOM_CALLBACK,
|
||||
"weixin": Platform.WEIXIN,
|
||||
"email": Platform.EMAIL,
|
||||
"sms": Platform.SMS,
|
||||
"bluebubbles": Platform.BLUEBUBBLES,
|
||||
"qqbot": Platform.QQBOT,
|
||||
}
|
||||
platform = platform_map.get(platform_name.lower())
|
||||
if not platform:
|
||||
@@ -288,13 +268,11 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option
|
||||
|
||||
if wrap_response:
|
||||
task_name = job.get("name", job["id"])
|
||||
job_id = job.get("id", "")
|
||||
delivery_content = (
|
||||
f"Cronjob Response: {task_name}\n"
|
||||
f"(job_id: {job_id})\n"
|
||||
f"-------------\n\n"
|
||||
f"{content}\n\n"
|
||||
f"To stop or manage this job, send me a new message (e.g. \"stop reminder {task_name}\")."
|
||||
f"Note: The agent cannot see this message, and therefore cannot respond to it."
|
||||
)
|
||||
else:
|
||||
delivery_content = content
|
||||
@@ -367,42 +345,7 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option
|
||||
return None
|
||||
|
||||
|
||||
_DEFAULT_SCRIPT_TIMEOUT = 120 # seconds
|
||||
# Backward-compatible module override used by tests and emergency monkeypatches.
|
||||
_SCRIPT_TIMEOUT = _DEFAULT_SCRIPT_TIMEOUT
|
||||
|
||||
|
||||
def _get_script_timeout() -> int:
|
||||
"""Resolve cron pre-run script timeout from module/env/config with a safe default."""
|
||||
if _SCRIPT_TIMEOUT != _DEFAULT_SCRIPT_TIMEOUT:
|
||||
try:
|
||||
timeout = int(float(_SCRIPT_TIMEOUT))
|
||||
if timeout > 0:
|
||||
return timeout
|
||||
except Exception:
|
||||
logger.warning("Invalid patched _SCRIPT_TIMEOUT=%r; using env/config/default", _SCRIPT_TIMEOUT)
|
||||
|
||||
env_value = os.getenv("HERMES_CRON_SCRIPT_TIMEOUT", "").strip()
|
||||
if env_value:
|
||||
try:
|
||||
timeout = int(float(env_value))
|
||||
if timeout > 0:
|
||||
return timeout
|
||||
except Exception:
|
||||
logger.warning("Invalid HERMES_CRON_SCRIPT_TIMEOUT=%r; using config/default", env_value)
|
||||
|
||||
try:
|
||||
cfg = load_config() or {}
|
||||
cron_cfg = cfg.get("cron", {}) if isinstance(cfg, dict) else {}
|
||||
configured = cron_cfg.get("script_timeout_seconds")
|
||||
if configured is not None:
|
||||
timeout = int(float(configured))
|
||||
if timeout > 0:
|
||||
return timeout
|
||||
except Exception as exc:
|
||||
logger.debug("Failed to load cron script timeout from config: %s", exc)
|
||||
|
||||
return _DEFAULT_SCRIPT_TIMEOUT
|
||||
_SCRIPT_TIMEOUT = 120 # seconds
|
||||
|
||||
|
||||
def _run_job_script(script_path: str) -> tuple[bool, str]:
|
||||
@@ -449,27 +392,17 @@ def _run_job_script(script_path: str) -> tuple[bool, str]:
|
||||
if not path.is_file():
|
||||
return False, f"Script path is not a file: {path}"
|
||||
|
||||
script_timeout = _get_script_timeout()
|
||||
|
||||
try:
|
||||
result = subprocess.run(
|
||||
[sys.executable, str(path)],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=script_timeout,
|
||||
timeout=_SCRIPT_TIMEOUT,
|
||||
cwd=str(path.parent),
|
||||
)
|
||||
stdout = (result.stdout or "").strip()
|
||||
stderr = (result.stderr or "").strip()
|
||||
|
||||
# Redact secrets from both stdout and stderr before any return path.
|
||||
try:
|
||||
from agent.redact import redact_sensitive_text
|
||||
stdout = redact_sensitive_text(stdout)
|
||||
stderr = redact_sensitive_text(stderr)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if result.returncode != 0:
|
||||
parts = [f"Script exited with code {result.returncode}"]
|
||||
if stderr:
|
||||
@@ -478,10 +411,17 @@ def _run_job_script(script_path: str) -> tuple[bool, str]:
|
||||
parts.append(f"stdout:\n{stdout}")
|
||||
return False, "\n".join(parts)
|
||||
|
||||
# Redact any secrets that may appear in script output before
|
||||
# they are injected into the LLM prompt context.
|
||||
try:
|
||||
from agent.redact import redact_sensitive_text
|
||||
stdout = redact_sensitive_text(stdout)
|
||||
except Exception:
|
||||
pass
|
||||
return True, stdout
|
||||
|
||||
except subprocess.TimeoutExpired:
|
||||
return False, f"Script timed out after {script_timeout}s: {path}"
|
||||
return False, f"Script timed out after {_SCRIPT_TIMEOUT}s: {path}"
|
||||
except Exception as exc:
|
||||
return False, f"Script execution failed: {exc}"
|
||||
|
||||
@@ -645,15 +585,6 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
except Exception as e:
|
||||
logger.warning("Job '%s': failed to load config.yaml, using defaults: %s", job_id, e)
|
||||
|
||||
# Apply IPv4 preference if configured.
|
||||
try:
|
||||
from hermes_constants import apply_ipv4_preference
|
||||
_net_cfg = _cfg.get("network", {})
|
||||
if isinstance(_net_cfg, dict) and _net_cfg.get("force_ipv4"):
|
||||
apply_ipv4_preference(force=True)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Reasoning config from config.yaml
|
||||
from hermes_constants import parse_reasoning_effort
|
||||
effort = str(_cfg.get("agent", {}).get("reasoning_effort", "")).strip()
|
||||
@@ -714,24 +645,6 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
},
|
||||
)
|
||||
|
||||
fallback_model = _cfg.get("fallback_providers") or _cfg.get("fallback_model") or None
|
||||
credential_pool = None
|
||||
runtime_provider = str(turn_route["runtime"].get("provider") or "").strip().lower()
|
||||
if runtime_provider:
|
||||
try:
|
||||
from agent.credential_pool import load_pool
|
||||
pool = load_pool(runtime_provider)
|
||||
if pool.has_credentials():
|
||||
credential_pool = pool
|
||||
logger.info(
|
||||
"Job '%s': loaded credential pool for provider %s with %d entries",
|
||||
job_id,
|
||||
runtime_provider,
|
||||
len(pool.entries()),
|
||||
)
|
||||
except Exception as e:
|
||||
logger.debug("Job '%s': failed to load credential pool for %s: %s", job_id, runtime_provider, e)
|
||||
|
||||
agent = AIAgent(
|
||||
model=turn_route["model"],
|
||||
api_key=turn_route["runtime"].get("api_key"),
|
||||
@@ -743,15 +656,12 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
max_iterations=max_iterations,
|
||||
reasoning_config=reasoning_config,
|
||||
prefill_messages=prefill_messages,
|
||||
fallback_model=fallback_model,
|
||||
credential_pool=credential_pool,
|
||||
providers_allowed=pr.get("only"),
|
||||
providers_ignored=pr.get("ignore"),
|
||||
providers_order=pr.get("order"),
|
||||
provider_sort=pr.get("sort"),
|
||||
disabled_toolsets=["cronjob", "messaging", "clarify"],
|
||||
quiet_mode=True,
|
||||
skip_context_files=True, # Don't inject SOUL.md/AGENTS.md from scheduler cwd
|
||||
skip_memory=True, # Cron system prompts would corrupt user representations
|
||||
platform="cron",
|
||||
session_id=_cron_session_id,
|
||||
@@ -800,7 +710,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
_cron_pool.shutdown(wait=False, cancel_futures=True)
|
||||
raise
|
||||
finally:
|
||||
_cron_pool.shutdown(wait=False, cancel_futures=True)
|
||||
_cron_pool.shutdown(wait=False)
|
||||
|
||||
if _inactivity_timeout:
|
||||
# Build diagnostic summary from the agent's activity tracker.
|
||||
|
||||
43
docker/entrypoint.sh
Executable file → Normal file
43
docker/entrypoint.sh
Executable file → Normal file
@@ -1,52 +1,15 @@
|
||||
#!/bin/bash
|
||||
# Docker/Podman entrypoint: bootstrap config files into the mounted volume, then run hermes.
|
||||
# Docker entrypoint: bootstrap config files into the mounted volume, then run hermes.
|
||||
set -e
|
||||
|
||||
HERMES_HOME="${HERMES_HOME:-/opt/data}"
|
||||
HERMES_HOME="/opt/data"
|
||||
INSTALL_DIR="/opt/hermes"
|
||||
|
||||
# --- Privilege dropping via gosu ---
|
||||
# When started as root (the default for Docker, or fakeroot in rootless Podman),
|
||||
# optionally remap the hermes user/group to match host-side ownership, fix volume
|
||||
# permissions, then re-exec as hermes.
|
||||
if [ "$(id -u)" = "0" ]; then
|
||||
if [ -n "$HERMES_UID" ] && [ "$HERMES_UID" != "$(id -u hermes)" ]; then
|
||||
echo "Changing hermes UID to $HERMES_UID"
|
||||
usermod -u "$HERMES_UID" hermes
|
||||
fi
|
||||
|
||||
if [ -n "$HERMES_GID" ] && [ "$HERMES_GID" != "$(id -g hermes)" ]; then
|
||||
echo "Changing hermes GID to $HERMES_GID"
|
||||
# -o allows non-unique GID (e.g. macOS GID 20 "staff" may already exist
|
||||
# as "dialout" in the Debian-based container image)
|
||||
groupmod -o -g "$HERMES_GID" hermes 2>/dev/null || true
|
||||
fi
|
||||
|
||||
actual_hermes_uid=$(id -u hermes)
|
||||
if [ "$(stat -c %u "$HERMES_HOME" 2>/dev/null)" != "$actual_hermes_uid" ]; then
|
||||
echo "$HERMES_HOME is not owned by $actual_hermes_uid, fixing"
|
||||
# In rootless Podman the container's "root" is mapped to an unprivileged
|
||||
# host UID — chown will fail. That's fine: the volume is already owned
|
||||
# by the mapped user on the host side.
|
||||
chown -R hermes:hermes "$HERMES_HOME" 2>/dev/null || \
|
||||
echo "Warning: chown failed (rootless container?) — continuing anyway"
|
||||
fi
|
||||
|
||||
echo "Dropping root privileges"
|
||||
exec gosu hermes "$0" "$@"
|
||||
fi
|
||||
|
||||
# --- Running as hermes from here ---
|
||||
source "${INSTALL_DIR}/.venv/bin/activate"
|
||||
|
||||
# Create essential directory structure. Cache and platform directories
|
||||
# (cache/images, cache/audio, platforms/whatsapp, etc.) are created on
|
||||
# demand by the application — don't pre-create them here so new installs
|
||||
# get the consolidated layout from get_hermes_dir().
|
||||
# The "home/" subdirectory is a per-profile HOME for subprocesses (git,
|
||||
# ssh, gh, npm …). Without it those tools write to /root which is
|
||||
# ephemeral and shared across profiles. See issue #4426.
|
||||
mkdir -p "$HERMES_HOME"/{cron,sessions,logs,hooks,memories,skills,skins,plans,workspace,home}
|
||||
mkdir -p "$HERMES_HOME"/{cron,sessions,logs,hooks,memories,skills}
|
||||
|
||||
# .env
|
||||
if [ ! -f "$HERMES_HOME/.env" ]; then
|
||||
|
||||
@@ -1,432 +0,0 @@
|
||||
# Workflow Orchestration & Task Queue Research for AI Agents
|
||||
|
||||
**Date:** 2026-04-14
|
||||
**Scope:** SOTA comparison of task queues and workflow orchestrators for autonomous AI agent workflows
|
||||
|
||||
---
|
||||
|
||||
## 1. Current Architecture: Cron + Webhook
|
||||
|
||||
### How it works
|
||||
- **Scheduler:** `cron/scheduler.py` — gateway calls `tick()` every 60 seconds
|
||||
- **Storage:** JSON file (`~/.hermes/cron/jobs.json`) + file-based lock (`cron/.tick.lock`)
|
||||
- **Execution:** Each job spawns a full `AIAgent.run_conversation()` in a thread pool with inactivity timeout
|
||||
- **Delivery:** Results pushed back to origin chat via platform adapters (Telegram, Discord, etc.)
|
||||
- **Checkpointing:** Job outputs saved to `~/.hermes/cron/output/{job_id}/{timestamp}.md`
|
||||
|
||||
### Strengths
|
||||
- Simple, zero-dependency (no broker/redis needed)
|
||||
- Jobs are isolated — each runs a fresh agent session
|
||||
- Direct platform delivery with E2EE support
|
||||
- Script pre-run for data collection
|
||||
- Inactivity-based timeout (not hard wall-clock)
|
||||
|
||||
### Weaknesses
|
||||
- **No task dependencies** — jobs are completely independent
|
||||
- **No retry logic** — single failure = lost run (recurring jobs advance schedule and move on)
|
||||
- **No concurrency control** — all due jobs fire at once; no worker pool sizing
|
||||
- **No observability** — no metrics, no dashboard, no structured logging of job state transitions
|
||||
- **Tick-based polling** — 60s granularity, wastes cycles when idle, adds latency when busy
|
||||
- **Single-process** — file lock means only one tick at a time; no horizontal scaling
|
||||
- **No dead letter queue** — failed deliveries are logged but not retried
|
||||
- **No workflow chaining** — cannot express "run A, then B with A's output"
|
||||
|
||||
---
|
||||
|
||||
## 2. Framework Comparison
|
||||
|
||||
### 2.1 Huey (Already Installed v2.6.0)
|
||||
|
||||
**Architecture:** Embedded task queue, SQLite/Redis/file storage, consumer process model.
|
||||
|
||||
| Feature | Huey | Our Cron |
|
||||
|---|---|---|
|
||||
| Broker | SQLite (default), Redis | JSON file |
|
||||
| Retry | Built-in: `retries=N, retry_delay=S` | None |
|
||||
| Task chaining | `task1.s() | task2.s()` (pipeline) | None |
|
||||
| Scheduling | `@huey.periodic_task(crontab(...))` | Our own cron parser |
|
||||
| Concurrency | Worker pool with `-w N` flag | Single tick lock |
|
||||
| Monitoring | `huey_consumer` logs, Huey Admin (Django) | Manual log reading |
|
||||
| Failure recovery | Automatic retry + configurable backoff | None |
|
||||
| Priority | `PriorityRedisExpireHuey` or task priority | None |
|
||||
| Result storage | `store_results=True` with result() | File output |
|
||||
|
||||
**Task Dependencies Pattern:**
|
||||
```python
|
||||
@huey.task()
|
||||
def analyze_data(input_data):
|
||||
return run_analysis(input_data)
|
||||
|
||||
@huey.task()
|
||||
def generate_report(analysis_result):
|
||||
return create_report(analysis_result)
|
||||
|
||||
# Pipeline: analyze then report
|
||||
pipeline = analyze_data.s(raw_data) | generate_report.s()
|
||||
result = pipeline()
|
||||
```
|
||||
|
||||
**Retry Pattern:**
|
||||
```python
|
||||
@huey.task(retries=3, retry_delay=60, retry_backoff=True)
|
||||
def flaky_api_call(url):
|
||||
return requests.get(url, timeout=30)
|
||||
```
|
||||
|
||||
**Benchmarks:** ~5,000 tasks/sec with SQLite backend, ~15,000 with Redis. Sub-millisecond scheduling latency. Very lightweight — single process.
|
||||
|
||||
**Verdict:** Best fit for our use case. Already installed. SQLite backend = no external deps. Can layer on top of our existing job storage.
|
||||
|
||||
---
|
||||
|
||||
### 2.2 Celery
|
||||
|
||||
**Architecture:** Distributed task queue with message broker (RabbitMQ/Redis).
|
||||
|
||||
| Feature | Celery | Huey |
|
||||
|---|---|---|
|
||||
| Broker | Redis, RabbitMQ, SQS (required) | SQLite (built-in) |
|
||||
| Scale | 100K+ tasks/sec | ~5-15K tasks/sec |
|
||||
| Chains | `chain(task1.s(), task2.s())` | Pipeline operator |
|
||||
| Groups/Chords | Parallel + callback | Not built-in |
|
||||
| Canvas | Full workflow DSL (chain, group, chord, map) | Basic pipeline |
|
||||
| Monitoring | Flower dashboard, Celery events | Minimal |
|
||||
| Complexity | Heavy — needs broker, workers, result backend | Single process |
|
||||
|
||||
**Workflow Pattern:**
|
||||
```python
|
||||
from celery import chain, group, chord
|
||||
|
||||
# Chain: sequential
|
||||
workflow = chain(fetch_data.s(), analyze.s(), report.s())
|
||||
|
||||
# Group: parallel
|
||||
parallel = group(fetch_twitter.s(), fetch_reddit.s(), fetch_hn.s())
|
||||
|
||||
# Chord: parallel then callback
|
||||
chord(parallel, aggregate_results.s())
|
||||
```
|
||||
|
||||
**Verdict:** Overkill for our scale. Adds RabbitMQ/Redis dependency. The Canvas API is powerful but we don't need 100K task/sec throughput. Flower monitoring is nice but we'd need to deploy it separately.
|
||||
|
||||
---
|
||||
|
||||
### 2.3 Temporal
|
||||
|
||||
**Architecture:** Durable execution engine. Workflows as code with automatic state persistence and replay.
|
||||
|
||||
| Feature | Temporal | Our Cron |
|
||||
|---|---|---|
|
||||
| State management | Automatic — workflow state persisted on every step | Manual JSON files |
|
||||
| Failure recovery | Workflows survive process restarts, auto-retry | Lost on crash |
|
||||
| Task dependencies | Native — activities call other activities | None |
|
||||
| Long-running tasks | Built-in (days/months OK) | Inactivity timeout |
|
||||
| Versioning | Workflow versioning for safe updates | No versioning |
|
||||
| Visibility | Full workflow state at any point | Log files |
|
||||
| Infrastructure | Requires Temporal server + database | None |
|
||||
| Language | Python SDK, but Temporal server is Go | Pure Python |
|
||||
|
||||
**Workflow Pattern:**
|
||||
```python
|
||||
@workflow.defn
|
||||
class AIAgentWorkflow:
|
||||
@workflow.run
|
||||
async def run(self, job_config: dict) -> str:
|
||||
# Step 1: Fetch data
|
||||
data = await workflow.execute_activity(
|
||||
fetch_data_activity,
|
||||
job_config["script"],
|
||||
start_to_close_timeout=timedelta(minutes=5),
|
||||
retry_policy=RetryPolicy(maximum_attempts=3),
|
||||
)
|
||||
|
||||
# Step 2: Analyze with AI agent
|
||||
analysis = await workflow.execute_activity(
|
||||
run_agent_activity,
|
||||
{"prompt": job_config["prompt"], "context": data},
|
||||
start_to_close_timeout=timedelta(minutes=30),
|
||||
retry_policy=RetryPolicy(
|
||||
initial_interval=timedelta(seconds=60),
|
||||
maximum_attempts=3,
|
||||
),
|
||||
)
|
||||
|
||||
# Step 3: Deliver
|
||||
await workflow.execute_activity(
|
||||
deliver_activity,
|
||||
{"platform": job_config["deliver"], "content": analysis},
|
||||
start_to_close_timeout=timedelta(seconds=60),
|
||||
)
|
||||
return analysis
|
||||
```
|
||||
|
||||
**Verdict:** Best architecture for complex multi-step AI workflows, but heavy infrastructure cost. Temporal server needs PostgreSQL/Cassandra + visibility store. Ideal if we reach 50+ multi-step workflows with complex failure modes. Overkill for current needs.
|
||||
|
||||
---
|
||||
|
||||
### 2.4 Prefect
|
||||
|
||||
**Architecture:** Modern data/workflow orchestration with Python-native API.
|
||||
|
||||
| Feature | Prefect |
|
||||
|---|---|
|
||||
| Dependencies | SQLite (default) or PostgreSQL |
|
||||
| Task retries | `@task(retries=3, retry_delay_seconds=10)` |
|
||||
| Task dependencies | `result = task_a(wait_for=[task_b])` |
|
||||
| Caching | `cache_key_fn` for result caching |
|
||||
| Subflows | Nested workflow composition |
|
||||
| Deployments | Schedule via `Deployment` or `CronSchedule` |
|
||||
| UI | Excellent web dashboard |
|
||||
| Async | Full async support |
|
||||
|
||||
**Workflow Pattern:**
|
||||
```python
|
||||
from prefect import flow, task
|
||||
from prefect.tasks import task_input_hash
|
||||
|
||||
@task(retries=3, retry_delay_seconds=30)
|
||||
def run_agent(prompt: str) -> str:
|
||||
agent = AIAgent(...)
|
||||
return agent.run_conversation(prompt)
|
||||
|
||||
@task(cache_key_fn=task_input_hash, cache_expiration=timedelta(hours=1))
|
||||
def fetch_context(script: str) -> str:
|
||||
return run_script(script)
|
||||
|
||||
@flow(name="agent-workflow")
|
||||
def agent_workflow(job_config: dict):
|
||||
context = fetch_context(job_config.get("script", ""))
|
||||
result = run_agent(
|
||||
f"{context}\n\n{job_config['prompt']}",
|
||||
wait_for=[context]
|
||||
)
|
||||
deliver(result, job_config["deliver"])
|
||||
return result
|
||||
```
|
||||
|
||||
**Benchmarks:** Sub-second task scheduling. Handles 10K+ concurrent task runs. SQLite backend for single-node.
|
||||
|
||||
**Verdict:** Strong alternative. Pythonic, good UI, built-in scheduling. But heavier than Huey — deploys a server process. Best if we want a web dashboard for monitoring. Less infrastructure than Temporal but more than Huey.
|
||||
|
||||
---
|
||||
|
||||
### 2.5 Apache Airflow
|
||||
|
||||
**Architecture:** Batch-oriented DAG scheduler, Python-based.
|
||||
|
||||
| Feature | Airflow |
|
||||
|---|---|
|
||||
| DAG model | Static DAGs defined in Python files |
|
||||
| Scheduler | Polling-based, 5-30s granularity |
|
||||
| Dependencies | PostgreSQL/MySQL + Redis/RabbitMQ + webserver |
|
||||
| UI | Rich web UI with DAG visualization |
|
||||
| Best for | ETL, data pipelines, batch processing |
|
||||
| Weakness | Not designed for dynamic task creation; heavy; DAG definition overhead |
|
||||
|
||||
**Verdict:** Wrong tool for this job. Airflow excels at static, well-defined data pipelines (ETL). Our agent workflows are dynamic — tasks are created at runtime based on user prompts. Airflow's DAG model fights against this. Massive overhead (needs webserver, scheduler, worker, metadata DB).
|
||||
|
||||
---
|
||||
|
||||
### 2.6 Dramatiq
|
||||
|
||||
**Architecture:** Lightweight distributed task queue, Celery alternative.
|
||||
|
||||
| Feature | Dramatiq |
|
||||
|---|---|
|
||||
| Broker | Redis, RabbitMQ |
|
||||
| Retries | `@dramatiq.actor(max_retries=3)` |
|
||||
| Middleware | Pluggable: age_limit, time_limit, retries, callbacks |
|
||||
| Groups | `group(actor.message(...), ...).run()` |
|
||||
| Pipes | `actor.message() \| other_actor.message()` |
|
||||
| Simplicity | Cleaner API than Celery |
|
||||
|
||||
**Verdict:** Nice middle ground between Huey and Celery. But still requires a broker (Redis/RabbitMQ). No SQLite backend. Less ecosystem than Celery, less lightweight than Huey.
|
||||
|
||||
---
|
||||
|
||||
### 2.7 RQ (Redis Queue)
|
||||
|
||||
**Architecture:** Minimal Redis-based task queue.
|
||||
|
||||
| Feature | RQ |
|
||||
|---|---|
|
||||
| Broker | Redis only |
|
||||
| Retries | Via `Retry` class |
|
||||
| Workers | Simple worker processes |
|
||||
| Dashboard | `rq-dashboard` (separate) |
|
||||
| Limitation | Redis-only, no SQLite, no scheduling built-in |
|
||||
|
||||
**Verdict:** Too simple and Redis-dependent. No periodic task support without `rq-scheduler`. No task chaining without third-party. Not competitive with Huey for our use case.
|
||||
|
||||
---
|
||||
|
||||
## 3. Architecture Patterns for AI Agent Workflows
|
||||
|
||||
### 3.1 Task Chaining (Fan-out / Fan-in)
|
||||
|
||||
The critical pattern for multi-step AI workflows:
|
||||
|
||||
```
|
||||
[Script] → [Agent] → [Deliver]
|
||||
↓ ↓ ↓
|
||||
Context Report Notification
|
||||
```
|
||||
|
||||
**Implementation with Huey:**
|
||||
```python
|
||||
@huey.task(retries=2)
|
||||
def run_script_task(script_path):
|
||||
return run_script(script_path)
|
||||
|
||||
@huey.task(retries=3, retry_delay=60)
|
||||
def run_agent_task(prompt, context=None):
|
||||
if context:
|
||||
prompt = f"## Context\n{context}\n\n{prompt}"
|
||||
agent = AIAgent(...)
|
||||
return agent.run_conversation(prompt)
|
||||
|
||||
@huey.task()
|
||||
def deliver_task(result, job_config):
|
||||
return deliver_result(job_config, result)
|
||||
|
||||
# Compose: script → agent → deliver
|
||||
def compose_workflow(job):
|
||||
steps = []
|
||||
if job.get("script"):
|
||||
steps.append(run_script_task.s(job["script"]))
|
||||
steps.append(run_agent_task.s(job["prompt"]))
|
||||
steps.append(deliver_task.s(job))
|
||||
return reduce(lambda a, b: a.then(b), steps)
|
||||
```
|
||||
|
||||
### 3.2 Retry with Exponential Backoff
|
||||
|
||||
```python
|
||||
from huey import RetryTask
|
||||
|
||||
class AIWorkflowTask(RetryTask):
|
||||
retries = 3
|
||||
retry_delay = 30 # Start at 30s
|
||||
retry_backoff = True # 30s → 60s → 120s
|
||||
max_retry_delay = 600 # Cap at 10min
|
||||
```
|
||||
|
||||
### 3.3 Dead Letter Queue
|
||||
|
||||
For tasks that exhaust retries:
|
||||
```python
|
||||
@huey.task(retries=3)
|
||||
def flaky_task(data):
|
||||
...
|
||||
|
||||
# Dead letter handling
|
||||
def handle_failure(task, exc, retries):
|
||||
# Log to dead letter store
|
||||
save_dead_letter(task, exc, retries)
|
||||
# Notify user of failure
|
||||
notify_user(f"Task {task.name} failed after {retries} retries: {exc}")
|
||||
```
|
||||
|
||||
### 3.4 Observability Pattern
|
||||
|
||||
```python
|
||||
# Structured event logging for every state transition
|
||||
def emit_event(job_id, event_type, metadata):
|
||||
event = {
|
||||
"job_id": job_id,
|
||||
"event": event_type, # scheduled, started, completed, failed, retried
|
||||
"timestamp": iso_now(),
|
||||
"metadata": metadata,
|
||||
}
|
||||
append_to_event_log(event)
|
||||
# Also emit to metrics (Prometheus/StatsD)
|
||||
metrics.increment(f"cron.{event_type}")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. Benchmarks Summary
|
||||
|
||||
| Framework | Throughput | Latency | Memory | Startup | Dependencies |
|
||||
|---|---|---|---|---|---|
|
||||
| Current Cron | ~1 job/60s tick | 60-120s | Minimal | Instant | None |
|
||||
| Huey (SQLite) | ~5K tasks/sec | <10ms | ~20MB | <1s | None |
|
||||
| Huey (Redis) | ~15K tasks/sec | <5ms | ~20MB | <1s | Redis |
|
||||
| Celery (Redis) | ~15K tasks/sec | <10ms | ~100MB | ~3s | Redis |
|
||||
| Temporal | ~50K activities/sec | <5ms | ~200MB | ~10s | Temporal server+DB |
|
||||
| Prefect | ~10K tasks/sec | <20ms | ~150MB | ~5s | PostgreSQL |
|
||||
|
||||
---
|
||||
|
||||
## 5. Recommendations
|
||||
|
||||
### Immediate (Phase 1): Enhance Current Cron
|
||||
|
||||
Add these capabilities to the existing `cron/` module **without** switching frameworks:
|
||||
|
||||
1. **Retry logic** — Add `retry_count`, `retry_delay`, `max_retries` fields to job JSON. In `scheduler.py`'s `tick()`, on failure: if `retries_remaining > 0`, don't advance the schedule; set `next_run_at = now + retry_delay * 2^attempt`.
|
||||
|
||||
2. **Backoff** — Exponential: `delay * 2^attempt`, capped at 10 minutes.
|
||||
|
||||
3. **Dead letter tracking** — After max retries, mark job state as `dead_letter` and emit a delivery notification with the error.
|
||||
|
||||
4. **Concurrency limit** — Add a semaphore (e.g., `max_concurrent=3`) to `tick()` so we don't spawn 20 agents simultaneously.
|
||||
|
||||
5. **Structured events** — Append JSON events to `~/.hermes/cron/events.jsonl` for every state transition (scheduled, started, completed, failed, retried, delivered).
|
||||
|
||||
**Effort:** ~1-2 days. No new dependencies.
|
||||
|
||||
### Medium-term (Phase 2): Adopt Huey for Workflow Chaining
|
||||
|
||||
When we need task dependencies (multi-step agent workflows), migrate to Huey:
|
||||
|
||||
1. **Keep the JSON job store** as the source of truth for user-facing job management.
|
||||
2. **Use Huey as the execution engine** — enqueue tasks from `tick()`, let Huey handle retries, scheduling, and chaining.
|
||||
3. **SQLite backend** — no new infrastructure. One consumer process (`huey_consumer.py`) alongside the gateway.
|
||||
4. **Task chaining for multi-step jobs** — `script_task.then(agent_task).then(delivery_task)`.
|
||||
|
||||
**Migration path:**
|
||||
- Phase 2a: Run Huey consumer alongside gateway. Mirror cron jobs to Huey periodic tasks.
|
||||
- Phase 2b: Add task chaining for jobs with scripts.
|
||||
- Phase 2c: Migrate all jobs to Huey, deprecate tick()-based execution.
|
||||
|
||||
**Effort:** ~1 week. Huey already installed. Gateway integration ~2-3 days.
|
||||
|
||||
### Long-term (Phase 3): Evaluate Temporal/Prefect
|
||||
|
||||
Only if:
|
||||
- We have 100+ concurrent multi-step workflows
|
||||
- We need workflow versioning and A/B testing
|
||||
- We need cross-service orchestration (agent calls to external APIs with complex compensation logic)
|
||||
- We want a web dashboard for non-technical users
|
||||
|
||||
**Don't adopt early** — these tools solve problems we don't have yet.
|
||||
|
||||
---
|
||||
|
||||
## 6. Decision Matrix
|
||||
|
||||
| Need | Best Solution | Why |
|
||||
|---|---|---|
|
||||
| Simple retry logic | Enhance current cron | Zero deps, fast to implement |
|
||||
| Task chaining | **Huey** | Already installed, SQLite backend, pipeline API |
|
||||
| Monitoring dashboard | Prefect or Huey+Flower | If monitoring becomes critical |
|
||||
| Massive scale (10K+/sec) | Celery + Redis | If we're processing thousands of agent runs per hour |
|
||||
| Complex compensation | Temporal | Only if we need durable multi-service workflows |
|
||||
| Periodic scheduling | Current cron (works) or Huey | Current is fine; Huey adds `crontab()` with seconds |
|
||||
|
||||
---
|
||||
|
||||
## 7. Key Insight
|
||||
|
||||
The cron system's biggest gap isn't the framework — it's the **absence of retry and dependency primitives**. These can be added to the current system in <100 lines of code. The second biggest gap is observability (structured events + metrics), which is also solvable incrementally.
|
||||
|
||||
Huey is the right *eventual* target for workflow execution because:
|
||||
1. Already installed, zero new dependencies
|
||||
2. SQLite backend matches our "no infrastructure" philosophy
|
||||
3. Pipeline API gives us task chaining for free
|
||||
4. Retry/backoff is first-class
|
||||
5. Consumer model is more efficient than tick-polling
|
||||
6. Orders-of-magnitude better scheduling latency (<10ms enqueue-to-start vs the 60s polling tick — see the benchmarks table)
|
||||
|
||||
The migration should be gradual — start by wrapping Huey inside our existing cron tick, then progressively move execution to Huey's consumer model.
|
||||
@@ -1,38 +0,0 @@
|
||||
# Cron Job Audit — #890
|
||||
|
||||
## Problem
|
||||
|
||||
9 of 69 cron jobs have zero completions. They waste scheduler cycles.
|
||||
|
||||
## Dead Jobs Identified
|
||||
|
||||
| Job | Schedule | Completions | Action |
|
||||
|-----|----------|-------------|--------|
|
||||
| exp-swarm-pipeline | every 10 min | 0 | disable |
|
||||
| exp-music-generator | every 2h | 0 | disable |
|
||||
| exp-paper-citations | every 3h | 0 | disable |
|
||||
| exp-gbrain-patterns | every 2h | 0 | disable |
|
||||
| exp-infra-hardening | every 2h | 0 | disable |
|
||||
| gemma4-multimodal-burn | every 1h | 0 | disable |
|
||||
| morning-paper-report | daily | 0 | disable |
|
||||
| overnight-collector | every 15 min | 0 | disable |
|
||||
| morning-experiment-report | daily | 0 | disable |
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
# Audit: show dead jobs
|
||||
python scripts/cron_audit.py
|
||||
|
||||
# Dry run: preview changes
|
||||
python scripts/cron_audit.py --disable --dry-run
|
||||
|
||||
# Disable dead jobs
|
||||
python scripts/cron_audit.py --disable
|
||||
|
||||
# Delete dead jobs
|
||||
python scripts/cron_audit.py --delete
|
||||
|
||||
# Find jobs with no runs in 7 days
|
||||
python scripts/cron_audit.py --older-than 7d
|
||||
```
|
||||
@@ -1,56 +0,0 @@
|
||||
# Emotional Presence Patterns for Crisis Support
|
||||
|
||||
Research issue #880. What works for reaching broken men in their darkest moment.
|
||||
|
||||
## Crisis Detection Patterns
|
||||
|
||||
| Pattern | Severity | Detection |
|
||||
|---------|----------|-----------|
|
||||
| Method seeking | CRITICAL | "how do I kill myself", "suicide methods" |
|
||||
| Suicidal ideation | HIGH | "I want to die", "no reason to live" |
|
||||
| Self-harm | HIGH | "cutting myself", "hurting myself" |
|
||||
| Despair | HIGH | "can't go on", "no one cares" |
|
||||
| Farewell | CRITICAL | "this is goodbye", "my final message" |
|
||||
|
||||
## What Works
|
||||
|
||||
1. Validation without agreement: "That sounds incredibly painful"
|
||||
2. Presence over solutions: "I'm here with you right now"
|
||||
3. Specific next steps: "Text HOME to 741741 right now"
|
||||
4. Connection: "You don't have to face this alone"
|
||||
5. Hope without minimizing: "There is a way through this"
|
||||
|
||||
## What Does NOT Work
|
||||
|
||||
1. False empathy: "I feel your pain"
|
||||
2. Toxic positivity: "Look on the bright side!"
|
||||
3. Minimization: "Other people have it worse"
|
||||
4. Guilt induction: "Think about your family"
|
||||
5. Premature problem-solving: "Have you tried therapy?"
|
||||
6. AI dishonesty: pretending to be human
|
||||
|
||||
## SOUL.md Protocol
|
||||
|
||||
1. Ask: "Are you safe right now?" (FIRST response)
|
||||
2. Listen. Stay present. Don't disconnect.
|
||||
3. Never compute the value of a human life
|
||||
4. Never suggest death is an option
|
||||
5. Always point to 988 Lifeline
|
||||
6. Share the gospel: "Jesus saves those who call on His name"
|
||||
|
||||
## Crisis Resources
|
||||
|
||||
- 988 Suicide & Crisis Lifeline: call or text 988
|
||||
- Crisis Text Line: text HOME to 741741
|
||||
- 988 Chat: 988lifeline.org/chat
|
||||
- Spanish: 1-888-628-9454
|
||||
- Emergency: 911
|
||||
|
||||
## Implementation Status
|
||||
|
||||
- Crisis detection: agent/crisis_protocol.py
|
||||
- SHIELD integration: tools/shield/
|
||||
- 988 Lifeline: resources defined
|
||||
- Emotional presence: this document
|
||||
- Escalation tracking: future work
|
||||
- Human notification: future work
|
||||
@@ -1,42 +0,0 @@
|
||||
# Holographic + Vector Hybrid Memory Architecture
|
||||
|
||||
Research issue #879. Combining HRR (holographic) and vector (Qdrant) memory.
|
||||
|
||||
## Architecture
|
||||
|
||||
Three memory backends, each with unique strengths:
|
||||
|
||||
| Backend | Strength | Weakness | Use Case |
|
||||
|---------|----------|----------|----------|
|
||||
| FTS5 | Exact keyword match | No semantic understanding | Precise recall |
|
||||
| Vector (Qdrant) | Semantic similarity | No compositional queries | Topic search |
|
||||
| HRR (Holographic) | Compositional queries | Limited scale | Complex reasoning |
|
||||
|
||||
## Why Hybrid
|
||||
|
||||
- FTS5 alone: misses ~30-40% of semantically relevant content
|
||||
- Vector alone: can't do compositional queries ("what did I discuss about X after doing Y?")
|
||||
- HRR alone: unique capability but no semantic fallback
|
||||
- Hybrid: best of all three, RRF fusion for ranking
|
||||
|
||||
## Implementation: Reciprocal Rank Fusion
|
||||
|
||||
Results from each backend are merged using RRF:
|
||||
- score = sum(weight / (k + rank)) for each backend
|
||||
- k=60 (standard RRF constant)
|
||||
- Weights: FTS5=0.6, Vector=0.4 (configurable)
|
||||
|
||||
## Status
|
||||
|
||||
- FTS5: EXISTS (hermes_state.py)
|
||||
- Vector (Qdrant): implemented (tools/hybrid_search.py)
|
||||
- HRR: EXISTS (plugins/memory/holographic.py)
|
||||
- RRF fusion: implemented (tools/hybrid_search.py)
|
||||
- Ingestion pipeline: partial
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. Wire HRR into hybrid_search.py
|
||||
2. Session-level vector ingestion
|
||||
3. Benchmark: measure R@5 improvement
|
||||
4. Cross-session memory persistence
|
||||
@@ -11,14 +11,12 @@ When you run `hermes setup` for the first time and Hermes detects `~/.openclaw`,
|
||||
### 2. CLI Command (quick, scriptable)
|
||||
|
||||
```bash
|
||||
hermes claw migrate # Preview then migrate (always shows preview first)
|
||||
hermes claw migrate --dry-run # Preview only, no changes
|
||||
hermes claw migrate # Full migration with confirmation prompt
|
||||
hermes claw migrate --dry-run # Preview what would happen
|
||||
hermes claw migrate --preset user-data # Migrate without API keys/secrets
|
||||
hermes claw migrate --yes # Skip confirmation prompt
|
||||
```
|
||||
|
||||
The migration always shows a full preview of what will be imported before making any changes. You review the preview and confirm before anything is written.
|
||||
|
||||
**All options:**
|
||||
|
||||
| Flag | Description |
|
||||
@@ -41,7 +39,7 @@ Ask the agent to run the migration for you:
|
||||
```
|
||||
|
||||
The agent will use the `openclaw-migration` skill to:
|
||||
1. Run a preview first to show what would change
|
||||
1. Run a dry-run first to preview changes
|
||||
2. Ask about conflict resolution (SOUL.md, skills, etc.)
|
||||
3. Let you choose between `user-data` and `full` presets
|
||||
4. Execute the migration with your choices
|
||||
@@ -60,31 +58,16 @@ The agent will use the `openclaw-migration` skill to:
|
||||
| Messaging settings | `~/.openclaw/config.yaml` (TELEGRAM_ALLOWED_USERS, MESSAGING_CWD) | `~/.hermes/.env` |
|
||||
| TTS assets | `~/.openclaw/workspace/tts/` | `~/.hermes/tts/` |
|
||||
|
||||
Workspace files are also checked at `workspace.default/` and `workspace-main/` as fallback paths (OpenClaw renamed `workspace/` to `workspace-main/` in recent versions).
|
||||
|
||||
### `full` preset (adds to `user-data`)
|
||||
| Item | Source | Destination |
|
||||
|------|--------|-------------|
|
||||
| Telegram bot token | `openclaw.json` channels config | `~/.hermes/.env` |
|
||||
| OpenRouter API key | `.env`, `openclaw.json`, or `openclaw.json["env"]` | `~/.hermes/.env` |
|
||||
| OpenAI API key | `.env`, `openclaw.json`, or `openclaw.json["env"]` | `~/.hermes/.env` |
|
||||
| Anthropic API key | `.env`, `openclaw.json`, or `openclaw.json["env"]` | `~/.hermes/.env` |
|
||||
| ElevenLabs API key | `.env`, `openclaw.json`, or `openclaw.json["env"]` | `~/.hermes/.env` |
|
||||
| Telegram bot token | `~/.openclaw/config.yaml` | `~/.hermes/.env` |
|
||||
| OpenRouter API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
|
||||
| OpenAI API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
|
||||
| Anthropic API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
|
||||
| ElevenLabs API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
|
||||
|
||||
API keys are searched across four sources: inline config values, `~/.openclaw/.env`, the `openclaw.json` `"env"` sub-object, and per-agent auth profiles.
|
||||
|
||||
Only allowlisted secrets are ever imported. Other credentials are skipped and reported.
|
||||
|
||||
## OpenClaw Schema Compatibility
|
||||
|
||||
The migration handles both old and current OpenClaw config layouts:
|
||||
|
||||
- **Channel tokens**: Reads from flat paths (`channels.telegram.botToken`) and the newer `accounts.default` layout (`channels.telegram.accounts.default.botToken`)
|
||||
- **TTS provider**: OpenClaw renamed "edge" to "microsoft" — both are recognized and mapped to Hermes' "edge"
|
||||
- **Provider API types**: Both short (`openai`, `anthropic`) and hyphenated (`openai-completions`, `anthropic-messages`, `google-generative-ai`) values are mapped correctly
|
||||
- **thinkingDefault**: All enum values are handled including newer ones (`minimal`, `xhigh`, `adaptive`)
|
||||
- **Matrix**: Uses `accessToken` field (not `botToken`)
|
||||
- **SecretRef formats**: Plain strings, env templates (`${VAR}`), and `source: "env"` SecretRefs are resolved. `source: "file"` and `source: "exec"` SecretRefs produce a warning — add those keys manually after migration.
|
||||
Only these 6 allowlisted secrets are ever imported. Other credentials are skipped and reported.
|
||||
|
||||
## Conflict Handling
|
||||
|
||||
@@ -101,24 +84,18 @@ For skills, you can also use `--skill-conflict rename` to import conflicting ski
|
||||
|
||||
## Migration Report
|
||||
|
||||
Every migration produces a report showing:
|
||||
Every migration (including dry runs) produces a report showing:
|
||||
- **Migrated items** — what was successfully imported
|
||||
- **Conflicts** — items skipped because they already exist
|
||||
- **Skipped items** — items not found in the source
|
||||
- **Errors** — items that failed to import
|
||||
|
||||
For executed migrations, the full report is saved to `~/.hermes/migration/openclaw/<timestamp>/`.
|
||||
|
||||
## Post-Migration Notes
|
||||
|
||||
- **Skills require a new session** — imported skills take effect after restarting your agent or starting a new chat.
|
||||
- **WhatsApp requires re-pairing** — WhatsApp uses QR-code pairing, not token-based auth. Run `hermes whatsapp` to pair.
|
||||
- **Archive cleanup** — after migration, you'll be offered to rename `~/.openclaw/` to `.openclaw.pre-migration/` to prevent state confusion. You can also run `hermes claw cleanup` later.
|
||||
For execute runs, the full report is saved to `~/.hermes/migration/openclaw/<timestamp>/`.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### "OpenClaw directory not found"
|
||||
The migration looks for `~/.openclaw` by default, then tries `~/.clawdbot` and `~/.moltbot`. If your OpenClaw is installed elsewhere, use `--source`:
|
||||
The migration looks for `~/.openclaw` by default. If your OpenClaw is installed elsewhere, use `--source`:
|
||||
```bash
|
||||
hermes claw migrate --source /path/to/.openclaw
|
||||
```
|
||||
@@ -131,12 +108,3 @@ hermes skills install openclaw-migration
|
||||
|
||||
### Memory overflow
|
||||
If your OpenClaw MEMORY.md or USER.md exceeds Hermes' character limits, excess entries are exported to an overflow file in the migration report directory. You can manually review and add the most important ones.
|
||||
|
||||
### API keys not found
|
||||
Keys might be stored in different places depending on your OpenClaw setup:
|
||||
- `~/.openclaw/.env` file
|
||||
- Inline in `openclaw.json` under `models.providers.*.apiKey`
|
||||
- In `openclaw.json` under the `"env"` or `"env.vars"` sub-objects
|
||||
- In `~/.openclaw/agents/main/agent/auth-profiles.json`
|
||||
|
||||
The migration checks all four. If keys use `source: "file"` or `source: "exec"` SecretRefs, they can't be resolved automatically — add them via `hermes config set`.
|
||||
|
||||
@@ -1,44 +0,0 @@
|
||||
# awesome-ai-tools Integration Plan
|
||||
|
||||
**Tracking:** #842
|
||||
**Source report:** docs/tool-investigation-2026-04-15.md
|
||||
**Date:** 2026-04-16
|
||||
|
||||
---
|
||||
|
||||
## Status Dashboard
|
||||
|
||||
| # | Tool | Category | Impact | Effort | Status | Issue |
|
||||
|---|------|----------|--------|--------|--------|-------|
|
||||
| 1 | Mem0 | Memory | 5/5 | 3/5 | Cloud + Local done | #842 |
|
||||
| 2 | LightRAG | RAG | 4/5 | 3/5 | Not started | #857 |
|
||||
| 3 | n8n | Orchestration | 5/5 | 4/5 | Not started | #858 |
|
||||
| 4 | RAGFlow | RAG | 4/5 | 4/5 | Not started | #859 |
|
||||
| 5 | tensorzero | LLMOps | 4/5 | 3/5 | Not started | #860 |
|
||||
|
||||
---
|
||||
|
||||
## #1: Mem0 — DONE
|
||||
|
||||
Cloud: `plugins/memory/mem0/` (MEM0_API_KEY required)
|
||||
Local: `plugins/memory/mem0_local/` (ChromaDB, no API key)
|
||||
|
||||
## #2: LightRAG (P2)
|
||||
|
||||
Create `plugins/rag/lightrag/` plugin. Index skill docs. Use local Ollama embeddings.
|
||||
|
||||
## #3: n8n (P3)
|
||||
|
||||
Deploy as Docker service. Create workflow templates for Hermes patterns.
|
||||
|
||||
## #4: RAGFlow (P4)
|
||||
|
||||
Deploy as Docker service. Integrate via HTTP API for document understanding.
|
||||
|
||||
## #5: tensorzero (P3)
|
||||
|
||||
Evaluate as provider routing replacement. Canary migration (10% traffic first).
|
||||
|
||||
---
|
||||
|
||||
*Last updated: 2026-04-16*
|
||||
@@ -1,324 +0,0 @@
|
||||
# SOTA Research: Multi-Agent Coordination & Fleet Knowledge Graphs
|
||||
|
||||
**Date:** 2026-04-14
|
||||
**Scope:** Agent-to-agent communication, shared memory, task delegation, consensus protocols
|
||||
**Frameworks Analyzed:** CrewAI, AutoGen, MetaGPT, ChatDev, CAMEL
|
||||
|
||||
---
|
||||
|
||||
## 1. Architecture Pattern Summary
|
||||
|
||||
### 1.1 CrewAI — Role-Based Crew Orchestration
|
||||
|
||||
**Core Pattern:** Agents organized into "Crews" with explicit roles, goals, and backstories. Tasks are assigned to agents, executed via sequential or hierarchical process flows.
|
||||
|
||||
**Agent-to-Agent Communication:**
|
||||
- **Sequential:** Agent A completes Task A → output injected into Task B's context for Agent B
|
||||
- **Hierarchical:** Manager agent delegates to worker agents, collects results, synthesizes
|
||||
- **Context passing:** Tasks can declare `context: [other_tasks]` — outputs from dependent tasks are automatically injected into the current task's prompt
|
||||
- **No direct agent-to-agent messaging** — communication is mediated through task outputs
|
||||
|
||||
**Shared Memory (v2 — Unified Memory):**
|
||||
- `Memory` class with `remember()` / `recall()` using vector embeddings (LanceDB/ChromaDB)
|
||||
- **Scope-based isolation:** `MemoryScope` provides path-based namespacing (`/crew/research/agent-foo`)
|
||||
- **Composite scoring:** semantic similarity (0.5) + recency (0.3) + importance (0.2)
|
||||
- **RecallFlow:** LLM-driven deep recall with adaptive query expansion
|
||||
- **Privacy flags:** Private memories only visible to the source that created them
|
||||
- **Background saves:** ThreadPoolExecutor with write barrier (drain_writes before recall)
|
||||
|
||||
**Task Delegation:**
|
||||
- Agent tools include `Delegate Work to Co-worker` and `Ask Question to Co-worker`
|
||||
- Delegation creates a new task for another agent, results come back to delegator
|
||||
- Depth-limited (no infinite delegation chains)
|
||||
|
||||
**State & Checkpointing:**
|
||||
- `SqliteProvider` / `JsonProvider` for state checkpoint persistence
|
||||
- `CheckpointConfig` with event-driven persistence
|
||||
- Flow state is Pydantic models with serialization
|
||||
|
||||
**Cache:**
|
||||
- Thread-safe in-memory tool result cache with RWLock
|
||||
- Key: `{tool_name}-{input}` → cached output
|
||||
|
||||
### 1.2 AutoGen (Microsoft) — Conversation-Centric Teams
|
||||
|
||||
**Core Pattern:** Agents communicate through shared conversation threads. A "Group Chat Manager" controls turn-taking and speaker selection.
|
||||
|
||||
**Agent-to-Agent Communication:**
|
||||
- **Shared message thread** — all agents see all messages (like a group chat)
|
||||
- **Three team patterns:**
|
||||
- `RoundRobinGroupChat`: Fixed order cycling through participants
|
||||
- `SelectorGroupChat`: LLM-based speaker selection with candidate filtering
|
||||
- `SwarmGroupChat`: Handoff-based routing (agent sends HandoffMessage to next agent)
|
||||
- `GraphFlow` (DiGraph): DAG-based execution with conditional edges, parallel fan-out, loops
|
||||
- `MagenticOneOrchestrator`: Ledger-based orchestration with task planning, progress tracking, stall detection
|
||||
|
||||
**Shared State:**
|
||||
- `ChatCompletionContext` — manages message history per agent (can be unbounded or windowed)
|
||||
- `ModelContext` shared across agents in a team
|
||||
- State serialization: `save_state()` / `load_state()` for all managers
|
||||
- **No built-in vector memory** — context is purely conversational
|
||||
|
||||
**Task Delegation:**
|
||||
- `Swarm`: Agents use `HandoffMessage` to explicitly route control
|
||||
- `GraphFlow`: Conditional edges route based on message content (keyword or callable)
|
||||
- `MagenticOne`: Orchestrator maintains a "task ledger" (facts + plan) and dynamically re-plans on stalls
|
||||
|
||||
**Consensus / Termination:**
|
||||
- `TerminationCondition` — composable conditions (text match, max messages, source-based)
|
||||
- No explicit consensus protocols — termination is manager-decided
|
||||
|
||||
**Key Insight:** AutoGen's `ChatCompletionContext` is the closest analog to shared memory, but it's purely sequential message history, not a knowledge base.
|
||||
|
||||
### 1.3 MetaGPT — SOP-Driven Software Teams
|
||||
|
||||
**Core Pattern:** Agents follow Standard Operating Procedures (SOPs). Each agent has a defined role (Product Manager, Architect, Engineer, QA) and produces structured artifacts.
|
||||
|
||||
**Agent-to-Agent Communication:**
|
||||
- **Publish-Subscribe via Environment:** Agents publish "actions" to a shared Environment, subscribers react
|
||||
- **Structured outputs:** Each role produces specific artifact types (PRD, design doc, code, test cases)
|
||||
- **Message routing:** Environment acts as a message bus, filtering by subscriber interest
|
||||
|
||||
**Shared Memory:**
|
||||
- `Environment` class maintains shared state (project workspace)
|
||||
- File-based shared memory: agents write/read from a shared filesystem
|
||||
- `SharedMemory` for cross-agent context (structured data, not free-form text)
|
||||
|
||||
**Task Delegation:**
|
||||
- Implicit through SOP stages: PM → Architect → Engineer → QA
|
||||
- Each agent's output is the next agent's input
|
||||
- No dynamic re-delegation
|
||||
|
||||
**Consensus:**
|
||||
- Sequential SOP execution (no parallel agents)
|
||||
- QA agent can trigger re-work loops back to Engineer
|
||||
|
||||
### 1.4 ChatDev — Chat-Chain Software Development
|
||||
|
||||
**Core Pattern:** Agents follow a "chat chain" — a sequence of chat phases (designing, coding, testing, documenting). Each phase involves a pair of agents (CEO↔CTO, Programmer↔Reviewer, etc.).
|
||||
|
||||
**Agent-to-Agent Communication:**
|
||||
- **Paired chat sessions:** Two agents communicate in each phase (role-play between instructor and assistant)
|
||||
- **Chain propagation:** Phase N's output (code, design doc) becomes Phase N+1's input
|
||||
- **No broadcast** — communication is strictly pairwise within phases
|
||||
|
||||
**Shared Memory:**
|
||||
- Software-centric: shared code repository is the "memory"
|
||||
- Each phase modifies/inherits the codebase
|
||||
- No explicit vector memory or knowledge graph
|
||||
|
||||
**Task Delegation:**
|
||||
- Hardcoded phase sequence: Design → Code → Test → Document
|
||||
- Each phase delegates to a specific agent pair
|
||||
- No dynamic task re-assignment
|
||||
|
||||
**Consensus:**
|
||||
- Phase-level termination: when both agents agree the phase is complete
|
||||
- "Thought" tokens for chain-of-thought within chat
|
||||
|
||||
### 1.5 CAMEL — Role-Playing & Workforce
|
||||
|
||||
**Core Pattern:** Two primary modes:
|
||||
1. **RolePlaying:** Two-agent conversation with task specification and optional critic
|
||||
2. **Workforce:** Multi-agent with coordinator, task planner, and worker pool
|
||||
|
||||
**Agent-to-Agent Communication:**
|
||||
- **RolePlaying:** Structured turn-taking between assistant and user agents
|
||||
- **Workforce:** Coordinator assigns tasks via `TaskChannel`, workers return results
|
||||
- **Worker types:** `SingleAgentWorker` (single ChatAgent), `RolePlayingWorker` (two-agent pair)
|
||||
|
||||
**Shared Memory / Task Channel:**
|
||||
- `TaskChannel` — async queue-based task dispatch with packet tracking
|
||||
- States: SENT → PROCESSING → RETURNED → ARCHIVED
|
||||
- O(1) lookup by task ID, status-based filtering, assignee/publisher queues
|
||||
- `WorkflowMemoryManager` — persists workflow patterns as markdown files
|
||||
- Role-based organization: workflows stored by `role_identifier`
|
||||
- Agent-based intelligent selection: LLM picks relevant past workflows
|
||||
- Versioned: metadata tracks creation time and version numbers
|
||||
|
||||
**Task Delegation:**
|
||||
- Coordinator agent decomposes complex tasks using LLM analysis
|
||||
- Tasks assigned to workers based on capability matching
|
||||
- Failed tasks trigger: retry, create new worker, or further decomposition
|
||||
- `FailureHandlingConfig` with configurable `RecoveryStrategy`
|
||||
|
||||
**Consensus / Quality:**
|
||||
- Quality evaluation via structured output (response format enforced)
|
||||
- Task dependencies tracked (worker receives dependency tasks as context)
|
||||
- `WorkforceMetrics` for tracking execution statistics
|
||||
|
||||
---
|
||||
|
||||
## 2. Key Architectural Patterns for Fleet Knowledge Graph
|
||||
|
||||
### 2.1 Communication Topology Patterns
|
||||
|
||||
| Pattern | Used By | Description |
|
||||
|---------|---------|-------------|
|
||||
| **Sequential Chain** | CrewAI, ChatDev, MetaGPT | A→B→C linear flow, output feeds next |
|
||||
| **Shared Thread** | AutoGen | All agents see all messages |
|
||||
| **Publish-Subscribe** | MetaGPT | Environment-based message bus |
|
||||
| **Paired Chat** | ChatDev, CAMEL | Two-agent conversation pairs |
|
||||
| **Handoff Routing** | AutoGen Swarm | Agent explicitly names next speaker |
|
||||
| **DAG Graph** | AutoGen GraphFlow | Conditional edges, parallel, loops |
|
||||
| **Ledger Orchestration** | AutoGen MagenticOne | Maintains task ledger, re-plans |
|
||||
| **Task Channel** | CAMEL | Async queue with packet states |
|
||||
|
||||
### 2.2 Shared State Patterns
|
||||
|
||||
| Pattern | Used By | Description |
|
||||
|---------|---------|-------------|
|
||||
| **Vector Memory** | CrewAI | Embeddings + scope-based namespacing |
|
||||
| **Message History** | AutoGen | Sequential conversation context |
|
||||
| **File System** | MetaGPT, ChatDev | Agents read/write shared files |
|
||||
| **Task Channel** | CAMEL | Async packet-based task dispatch |
|
||||
| **Workflow Files** | CAMEL | Markdown-based workflow memory |
|
||||
| **Tool Cache** | CrewAI | In-memory RWLock tool result cache |
|
||||
| **State Checkpoint** | CrewAI, AutoGen | Serialized Pydantic/SQLite checkpoints |
|
||||
|
||||
### 2.3 Task Delegation Patterns
|
||||
|
||||
| Pattern | Used By | Description |
|
||||
|---------|---------|-------------|
|
||||
| **Role Assignment** | CrewAI | Fixed agent per task |
|
||||
| **Manager Delegation** | CrewAI Hierarchical | Manager assigns tasks dynamically |
|
||||
| **Speaker Selection** | AutoGen Selector | LLM picks next agent |
|
||||
| **Handoff** | AutoGen Swarm | Agent explicitly transfers control |
|
||||
| **SOP Routing** | MetaGPT | Stage-based implicit delegation |
|
||||
| **Coordinator** | CAMEL Workforce | LLM-based task decomposition + assignment |
|
||||
| **Dynamic Worker Creation** | CAMEL Workforce | Create new workers on failure |
|
||||
|
||||
### 2.4 Conflict Resolution Patterns
|
||||
|
||||
| Pattern | Used By | Description |
|
||||
|---------|---------|-------------|
|
||||
| **Manager Arbitration** | CrewAI Hierarchical | Manager resolves conflicts |
|
||||
| **Critic-in-the-loop** | CAMEL | Critic agent evaluates and selects |
|
||||
| **Quality Gate** | CAMEL Workforce | Structured quality evaluation |
|
||||
| **Termination Conditions** | AutoGen | Composable stop conditions |
|
||||
| **Stall Detection** | AutoGen MagenticOne | Re-plans when progress stalls |
|
||||
|
||||
---
|
||||
|
||||
## 3. Recommendations for Hermes Fleet Knowledge Graph
|
||||
|
||||
### 3.1 Architecture: Hybrid Graph + Memory
|
||||
|
||||
Based on the SOTA analysis, the optimal fleet knowledge graph should combine:
|
||||
|
||||
1. **CrewAI's scoped memory** for hierarchical knowledge organization
|
||||
- Path-based namespaces: `/fleet/{fleet_id}/agent/{agent_id}/diary`
|
||||
- Composite scoring: semantic + recency + importance
|
||||
- Background writes with read barriers
|
||||
|
||||
2. **CAMEL's TaskChannel** for task dispatch and tracking
|
||||
- Packet states (SENT → PROCESSING → RETURNED → ARCHIVED)
|
||||
- O(1) lookup by task ID
|
||||
- Assignee/publisher tracking
|
||||
|
||||
3. **AutoGen's DiGraph** for execution flow definition
|
||||
- DAG with conditional edges for complex workflows
|
||||
- Parallel fan-out for independent tasks
|
||||
- Activation conditions (all vs any) for synchronization points
|
||||
|
||||
4. **AutoGen MagenticOne's ledger** for shared task context
|
||||
- Maintained facts, plan, and progress ledger
|
||||
- Dynamic re-planning on stalls
|
||||
|
||||
### 3.2 Fleet Knowledge Graph Schema
|
||||
|
||||
```
|
||||
/fleet/{fleet_id}/
|
||||
├── shared/ # Shared knowledge (all agents read)
|
||||
│ ├── facts/ # Known facts, constraints
|
||||
│ ├── decisions/ # Record of decisions made
|
||||
│ └── context/ # Active task context
|
||||
├── agent/{agent_id}/
|
||||
│ ├── diary/ # Agent's personal experience log
|
||||
│ ├── capabilities/ # What this agent can do
|
||||
│ └── state/ # Current task state
|
||||
├── tasks/
|
||||
│ ├── {task_id}/ # Task metadata, dependencies, status
|
||||
│ └── graph/ # DAG definition for task dependencies
|
||||
└── consensus/
|
||||
├── proposals/ # Pending proposals
|
||||
└── decisions/ # Resolved consensus decisions
|
||||
```
|
||||
|
||||
### 3.3 Key Design Decisions
|
||||
|
||||
1. **Diary System (Agent Memory):**
|
||||
- Each agent writes to its own scoped memory after every significant action
|
||||
- LLM-analyzed importance scoring (like CrewAI's unified memory)
|
||||
- Cross-agent recall: agents can query other agents' diaries for relevant experiences
|
||||
- Decay: old low-importance memories expire
|
||||
|
||||
2. **Shared State (Fleet Knowledge):**
|
||||
- SQLite-backed (like Hermes' existing `state.db`) with FTS5 search
|
||||
- Hierarchical scopes (like CrewAI's MemoryScope)
|
||||
- Write-ahead log for concurrent access
|
||||
- Read barriers before queries (like CrewAI's `drain_writes`)
|
||||
|
||||
3. **Task Delegation:**
|
||||
- Coordinator pattern (like CAMEL's Workforce)
|
||||
- Task decomposition via LLM
|
||||
- Failed task → retry, reassign, or decompose
|
||||
- Max depth limit (like Hermes' existing MAX_DEPTH=2)
|
||||
|
||||
4. **Consensus Protocol:**
|
||||
- Proposal-based: agent proposes, others vote/acknowledge
|
||||
- Timeout-based fallback: if no response within N seconds, proceed
|
||||
- Manager override: designated manager can break ties
|
||||
- Simple majority for non-critical, unanimity for critical decisions
|
||||
|
||||
5. **Conflict Resolution:**
|
||||
- Last-write-wins for non-critical state
|
||||
- Optimistic locking with version numbers
|
||||
- Manager arbitration for task assignment conflicts
|
||||
- Quality gates (like CAMEL) for output validation
|
||||
|
||||
### 3.4 Integration with Existing Hermes Architecture
|
||||
|
||||
Hermes already has strong foundations:
|
||||
- **Delegation system** (`delegate_tool.py`): Isolated child agents, parallel execution, depth limits
|
||||
- **State DB** (`hermes_state.py`): SQLite + FTS5, WAL mode, session tracking, message history
|
||||
- **Credential pools**: Shared credentials with rotation
|
||||
|
||||
The fleet knowledge graph should extend these patterns:
|
||||
- **Session DB → Fleet DB:** Add tables for fleet metadata, agent registrations, task graphs
|
||||
- **Memory tool → Fleet Memory:** Scoped vector memory shared across fleet agents
|
||||
- **Delegate tool → Fleet Delegation:** Task channel with persistence, quality evaluation
|
||||
- **New: Consensus module:** Proposal/vote protocol with timeout handling
|
||||
|
||||
---
|
||||
|
||||
## 4. Reference Implementations
|
||||
|
||||
| Component | Best Reference | Key Takeaway |
|
||||
|-----------|---------------|--------------|
|
||||
| Scoped Memory | CrewAI `Memory` + `MemoryScope` | Path-based namespaces, composite scoring, background writes |
|
||||
| Task Dispatch | CAMEL `TaskChannel` | Packet-based with state machine, O(1) lookup |
|
||||
| Execution DAG | AutoGen `DiGraphBuilder` | Fluent builder, conditional edges, activation groups |
|
||||
| Orchestration | AutoGen `MagenticOneOrchestrator` | Ledger-based planning, stall detection, re-planning |
|
||||
| Agent Communication | AutoGen `SelectorGroupChat` | LLM-based speaker selection, shared message thread |
|
||||
| Quality Evaluation | CAMEL Workforce | Structured output for quality scoring |
|
||||
| Workflow Memory | CAMEL `WorkflowMemoryManager` | Markdown-based, role-organized, versioned |
|
||||
| State Checkpoint | CrewAI `SqliteProvider` | JSONB checkpoints, WAL mode |
|
||||
| Tool Cache | CrewAI `CacheHandler` | RWLock-based concurrent tool result cache |
|
||||
|
||||
---
|
||||
|
||||
## 5. Open Questions
|
||||
|
||||
1. **Graph vs Vector for knowledge:** Should fleet knowledge use a proper graph DB (e.g., Neo4j) or stick with vector + SQLite?
|
||||
- Recommendation: Start with SQLite + vectors (existing stack), add graph later if needed
|
||||
|
||||
2. **Real-time vs Batch:** Should agents receive updates in real-time or batched?
|
||||
- Recommendation: Event-driven for critical updates, batched for diary entries
|
||||
|
||||
3. **Security model:** How should cross-agent access be controlled?
|
||||
- Recommendation: Role-based ACLs on scope paths, similar to CrewAI's privacy flags
|
||||
|
||||
4. **Scalability:** How many agents can a single fleet support?
|
||||
- Recommendation: Start with 10-agent fleets, optimize SQLite concurrency first
|
||||
|
||||
@@ -41,14 +41,6 @@ colors:
|
||||
session_label: "#DAA520" # Session label
|
||||
session_border: "#8B8682" # Session ID dim color
|
||||
|
||||
# TUI surfaces
|
||||
status_bar_bg: "#1a1a2e" # Status / usage bar background
|
||||
voice_status_bg: "#1a1a2e" # Voice-mode badge background
|
||||
completion_menu_bg: "#1a1a2e" # Completion list background
|
||||
completion_menu_current_bg: "#333355" # Active completion row background
|
||||
completion_menu_meta_bg: "#1a1a2e" # Completion meta column background
|
||||
completion_menu_meta_current_bg: "#333355" # Active completion meta background
|
||||
|
||||
# ── Spinner ─────────────────────────────────────────────────────────────────
|
||||
# Customize the animated spinner shown during API calls and tool execution.
|
||||
spinner:
|
||||
|
||||
@@ -1,329 +0,0 @@
|
||||
# Container-Aware CLI Review Fixes Spec
|
||||
|
||||
**PR:** NousResearch/hermes-agent#7543
|
||||
**Review:** cursor[bot] bugbot review (4094049442) + two prior rounds
|
||||
**Date:** 2026-04-12
|
||||
**Branch:** `feat/container-aware-cli-clean`
|
||||
|
||||
## Review Issues Summary
|
||||
|
||||
Six issues were raised across three bugbot review rounds. Three were fixed in intermediate commits (38277a6a, 726cf90f). This spec addresses remaining design concerns surfaced by those reviews and simplifies the implementation based on interview decisions.
|
||||
|
||||
| # | Issue | Severity | Status |
|
||||
|---|-------|----------|--------|
|
||||
| 1 | `os.execvp` retry loop unreachable | Medium | Fixed in 79e8cd12 (switched to subprocess.run) |
|
||||
| 2 | Redundant `shutil.which("sudo")` | Medium | Fixed in 38277a6a (reuses `sudo` var) |
|
||||
| 3 | Missing `chown -h` on symlink update | Low | Fixed in 38277a6a |
|
||||
| 4 | Container routing after `parse_args()` | High | Fixed in 726cf90f |
|
||||
| 5 | Hardcoded `/home/${user}` | Medium | Fixed in 726cf90f |
|
||||
| 6 | Group membership not gated on `container.enable` | Low | Fixed in 726cf90f |
|
||||
|
||||
The mechanical fixes are in place but the overall design needs revision. The retry loop, error swallowing, and process model have deeper issues than what the bugbot flagged.
|
||||
|
||||
---
|
||||
|
||||
## Spec: Revised `_exec_in_container`
|
||||
|
||||
### Design Principles
|
||||
|
||||
1. **Let it crash.** No silent fallbacks. If `.container-mode` exists but something goes wrong, the error propagates naturally (Python traceback). The only case where container routing is skipped is when `.container-mode` doesn't exist or `HERMES_DEV=1`.
|
||||
2. **No retries.** Probe once for sudo, exec once. If it fails, docker/podman's stderr reaches the user verbatim.
|
||||
3. **Completely transparent.** No error wrapping, no prefixes, no spinners. Docker's output goes straight through.
|
||||
4. **`os.execvp` on the happy path.** Replace the Python process entirely so there's no idle parent during interactive sessions. Note: `execvp` never returns on success (process is replaced) and raises `OSError` on failure (it does not return a value). The container process's exit code becomes the process exit code by definition — no explicit propagation needed.
|
||||
5. **One human-readable exception to "let it crash".** `subprocess.TimeoutExpired` from the sudo probe gets a specific catch with a readable message, since a raw traceback for "your Docker daemon is slow" is confusing. All other exceptions propagate naturally.
|
||||
|
||||
### Execution Flow
|
||||
|
||||
```
|
||||
1. get_container_exec_info()
|
||||
- HERMES_DEV=1 → return None (skip routing)
|
||||
- Inside container → return None (skip routing)
|
||||
- .container-mode doesn't exist → return None (skip routing)
|
||||
- .container-mode exists → parse and return dict
|
||||
- .container-mode exists but malformed/unreadable → LET IT CRASH (no try/except)
|
||||
|
||||
2. _exec_in_container(container_info, sys.argv[1:])
|
||||
a. shutil.which(backend) → if None, print "{backend} not found on PATH" and sys.exit(1)
|
||||
b. Sudo probe: subprocess.run([runtime, "inspect", "--format", "ok", container_name], timeout=15)
|
||||
- If succeeds → needs_sudo = False
|
||||
- If fails → try subprocess.run([sudo, "-n", runtime, "inspect", ...], timeout=15)
|
||||
- If succeeds → needs_sudo = True
|
||||
- If fails → print error with sudoers hint (including why -n is required) and sys.exit(1)
|
||||
- If TimeoutExpired → catch specifically, print human-readable message about slow daemon
|
||||
c. Build exec_cmd: [sudo? + runtime, "exec", tty_flags, "-u", exec_user, env_flags, container, hermes_bin, *cli_args]
|
||||
d. os.execvp(exec_cmd[0], exec_cmd)
|
||||
- On success: process is replaced — Python is gone, container exit code IS the process exit code
|
||||
- On OSError: let it crash (natural traceback)
|
||||
```
|
||||
|
||||
### Changes to `hermes_cli/main.py`
|
||||
|
||||
#### `_exec_in_container` — rewrite
|
||||
|
||||
Remove:
|
||||
- The entire retry loop (`max_retries`, `for attempt in range(...)`)
|
||||
- Spinner logic (`"Waiting for container..."`, dots)
|
||||
- Exit code classification (125/126/127 handling)
|
||||
- `subprocess.run` for the exec call (keep it only for the sudo probe)
|
||||
- Special TTY vs non-TTY retry counts
|
||||
- The `time` import (no longer needed)
|
||||
|
||||
Change:
|
||||
- Use `os.execvp(exec_cmd[0], exec_cmd)` as the final call
|
||||
- Keep the `subprocess` import only for the sudo probe
|
||||
- Keep TTY detection for the `-it` vs `-i` flag
|
||||
- Keep env var forwarding (TERM, COLORTERM, LANG, LC_ALL)
|
||||
- Keep the sudo probe as-is (it's the one "smart" part)
|
||||
- Bump probe `timeout` from 5s to 15s — cold podman on a loaded machine needs headroom
|
||||
- Catch `subprocess.TimeoutExpired` specifically on both probe calls — print a readable message about the daemon being unresponsive instead of a raw traceback
|
||||
- Expand the sudoers hint error message to explain *why* `-n` (non-interactive) is required: a password prompt would hang the CLI or break piped commands
|
||||
|
||||
The function becomes roughly:
|
||||
|
||||
```python
|
||||
def _exec_in_container(container_info: dict, cli_args: list):
|
||||
"""Replace the current process with a command inside the managed container.
|
||||
|
||||
Probes whether sudo is needed (rootful containers), then os.execvp
|
||||
into the container. If exec fails, the OS error propagates naturally.
|
||||
"""
|
||||
import shutil
|
||||
import subprocess
|
||||
|
||||
backend = container_info["backend"]
|
||||
container_name = container_info["container_name"]
|
||||
exec_user = container_info["exec_user"]
|
||||
hermes_bin = container_info["hermes_bin"]
|
||||
|
||||
runtime = shutil.which(backend)
|
||||
if not runtime:
|
||||
print(f"Error: {backend} not found on PATH. Cannot route to container.",
|
||||
file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
# Probe whether we need sudo to see the rootful container.
|
||||
# Timeout is 15s — cold podman on a loaded machine can take a while.
|
||||
# TimeoutExpired is caught specifically for a human-readable message;
|
||||
# all other exceptions propagate naturally.
|
||||
needs_sudo = False
|
||||
sudo = None
|
||||
try:
|
||||
probe = subprocess.run(
|
||||
[runtime, "inspect", "--format", "ok", container_name],
|
||||
capture_output=True, text=True, timeout=15,
|
||||
)
|
||||
except subprocess.TimeoutExpired:
|
||||
print(
|
||||
f"Error: timed out waiting for {backend} to respond.\n"
|
||||
f"The {backend} daemon may be unresponsive or starting up.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
if probe.returncode != 0:
|
||||
sudo = shutil.which("sudo")
|
||||
if sudo:
|
||||
try:
|
||||
probe2 = subprocess.run(
|
||||
[sudo, "-n", runtime, "inspect", "--format", "ok", container_name],
|
||||
capture_output=True, text=True, timeout=15,
|
||||
)
|
||||
except subprocess.TimeoutExpired:
|
||||
print(
|
||||
f"Error: timed out waiting for sudo {backend} to respond.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
if probe2.returncode == 0:
|
||||
needs_sudo = True
|
||||
else:
|
||||
print(
|
||||
f"Error: container '{container_name}' not found via {backend}.\n"
|
||||
f"\n"
|
||||
f"The NixOS service runs the container as root. Your user cannot\n"
|
||||
f"see it because {backend} uses per-user namespaces.\n"
|
||||
f"\n"
|
||||
f"Fix: grant passwordless sudo for {backend}. The -n (non-interactive)\n"
|
||||
f"flag is required because the CLI calls sudo non-interactively —\n"
|
||||
f"a password prompt would hang or break piped commands:\n"
|
||||
f"\n"
|
||||
f' security.sudo.extraRules = [{{\n'
|
||||
f' users = [ "{os.getenv("USER", "your-user")}" ];\n'
|
||||
f' commands = [{{ command = "{runtime}"; options = [ "NOPASSWD" ]; }}];\n'
|
||||
f' }}];\n'
|
||||
f"\n"
|
||||
f"Or run: sudo hermes {' '.join(cli_args)}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(1)
|
||||
else:
|
||||
print(
|
||||
f"Error: container '{container_name}' not found via {backend}.\n"
|
||||
f"The container may be running under root. Try: sudo hermes {' '.join(cli_args)}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
is_tty = sys.stdin.isatty()
|
||||
tty_flags = ["-it"] if is_tty else ["-i"]
|
||||
|
||||
env_flags = []
|
||||
for var in ("TERM", "COLORTERM", "LANG", "LC_ALL"):
|
||||
val = os.environ.get(var)
|
||||
if val:
|
||||
env_flags.extend(["-e", f"{var}={val}"])
|
||||
|
||||
cmd_prefix = [sudo, "-n", runtime] if needs_sudo else [runtime]
|
||||
exec_cmd = (
|
||||
cmd_prefix + ["exec"]
|
||||
+ tty_flags
|
||||
+ ["-u", exec_user]
|
||||
+ env_flags
|
||||
+ [container_name, hermes_bin]
|
||||
+ cli_args
|
||||
)
|
||||
|
||||
# execvp replaces this process entirely — it never returns on success.
|
||||
# On failure it raises OSError, which propagates naturally.
|
||||
os.execvp(exec_cmd[0], exec_cmd)
|
||||
```
|
||||
|
||||
#### Container routing call site in `main()` — remove try/except
|
||||
|
||||
Current:
|
||||
```python
|
||||
try:
|
||||
from hermes_cli.config import get_container_exec_info
|
||||
container_info = get_container_exec_info()
|
||||
if container_info:
|
||||
_exec_in_container(container_info, sys.argv[1:])
|
||||
sys.exit(1) # exec failed if we reach here
|
||||
except SystemExit:
|
||||
raise
|
||||
except Exception:
|
||||
pass # Container routing unavailable, proceed locally
|
||||
```
|
||||
|
||||
Revised:
|
||||
```python
|
||||
from hermes_cli.config import get_container_exec_info
|
||||
container_info = get_container_exec_info()
|
||||
if container_info:
|
||||
_exec_in_container(container_info, sys.argv[1:])
|
||||
# Unreachable: os.execvp never returns on success (process is replaced)
|
||||
# and raises OSError on failure (which propagates as a traceback).
|
||||
# This line exists only as a defensive assertion.
|
||||
sys.exit(1)
|
||||
```
|
||||
|
||||
No try/except. If `.container-mode` doesn't exist, `get_container_exec_info()` returns `None` and we skip routing. If it exists but is broken, the exception propagates with a natural traceback.
|
||||
|
||||
Note: `sys.exit(1)` after `_exec_in_container` is dead code in all paths — `os.execvp` either replaces the process or raises. It's kept as a belt-and-suspenders assertion with a comment marking it unreachable, not as actual error handling.
|
||||
|
||||
### Changes to `hermes_cli/config.py`
|
||||
|
||||
#### `get_container_exec_info` — remove inner try/except
|
||||
|
||||
Current code catches `(OSError, IOError)` and returns `None`. This silently hides permission errors, corrupt files, etc.
|
||||
|
||||
Change: Remove the try/except around file reading. Keep the early returns for `HERMES_DEV=1` and `_is_inside_container()`. The `FileNotFoundError` from `open()` when `.container-mode` doesn't exist should still return `None` (this is the "container mode not enabled" case). All other exceptions propagate.
|
||||
|
||||
```python
|
||||
def get_container_exec_info() -> Optional[dict]:
|
||||
if os.environ.get("HERMES_DEV") == "1":
|
||||
return None
|
||||
if _is_inside_container():
|
||||
return None
|
||||
|
||||
container_mode_file = get_hermes_home() / ".container-mode"
|
||||
|
||||
try:
|
||||
with open(container_mode_file, "r") as f:
|
||||
# ... parse key=value lines ...
|
||||
except FileNotFoundError:
|
||||
return None
|
||||
# All other exceptions (PermissionError, malformed data, etc.) propagate
|
||||
|
||||
return { ... }
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Spec: NixOS Module Changes
|
||||
|
||||
### Symlink creation — simplify to two branches
|
||||
|
||||
Current: 4 branches (symlink exists, directory exists, other file, doesn't exist).
|
||||
|
||||
Revised: 2 branches.
|
||||
|
||||
```bash
|
||||
if [ -d "${symlinkPath}" ] && [ ! -L "${symlinkPath}" ]; then
|
||||
# Real directory — back it up, then create symlink
|
||||
_backup="${symlinkPath}.bak.$(date +%s)"
|
||||
echo "hermes-agent: backing up existing ${symlinkPath} to $_backup"
|
||||
mv "${symlinkPath}" "$_backup"
|
||||
fi
|
||||
# For everything else (symlink, doesn't exist, etc.) — just force-create
|
||||
ln -sfn "${target}" "${symlinkPath}"
|
||||
chown -h ${user}:${cfg.group} "${symlinkPath}"
|
||||
```
|
||||
|
||||
`ln -sfn` handles: existing symlink (replaces), doesn't exist (creates), and after the `mv` above (creates). The only case that needs special handling is a real directory, because `ln -sfn` cannot atomically replace a directory.
|
||||
|
||||
Note: there is a theoretical race between the `[ -d ... ]` check and the `mv` (something could create/remove the directory in between). In practice this is a NixOS activation script running as root during `nixos-rebuild switch` — no other process should be touching `~/.hermes` at that moment. Not worth adding locking for.
|
||||
|
||||
### Sudoers — document, don't auto-configure
|
||||
|
||||
Do NOT add `security.sudo.extraRules` to the module. Document the sudoers requirement in the module's description/comments and in the error message the CLI prints when sudo probe fails.
|
||||
|
||||
### Group membership gating — keep as-is
|
||||
|
||||
The fix in 726cf90f (`cfg.container.enable && cfg.container.hostUsers != []`) is correct. Leftover group membership when container mode is disabled is harmless. No cleanup needed.
|
||||
|
||||
---
|
||||
|
||||
## Spec: Test Rewrite
|
||||
|
||||
The existing test file (`tests/hermes_cli/test_container_aware_cli.py`) has 16 tests. With the simplified exec model, several are obsolete.
|
||||
|
||||
### Tests to keep (update as needed)
|
||||
|
||||
- `test_is_inside_container_dockerenv` — unchanged
|
||||
- `test_is_inside_container_containerenv` — unchanged
|
||||
- `test_is_inside_container_cgroup_docker` — unchanged
|
||||
- `test_is_inside_container_false_on_host` — unchanged
|
||||
- `test_get_container_exec_info_returns_metadata` — unchanged
|
||||
- `test_get_container_exec_info_none_inside_container` — unchanged
|
||||
- `test_get_container_exec_info_none_without_file` — unchanged
|
||||
- `test_get_container_exec_info_skipped_when_hermes_dev` — unchanged
|
||||
- `test_get_container_exec_info_not_skipped_when_hermes_dev_zero` — unchanged
|
||||
- `test_get_container_exec_info_defaults` — unchanged
|
||||
- `test_get_container_exec_info_docker_backend` — unchanged
|
||||
|
||||
### Tests to add
|
||||
|
||||
- `test_get_container_exec_info_crashes_on_permission_error` — verify that `PermissionError` propagates (no silent `None` return)
|
||||
- `test_exec_in_container_calls_execvp` — verify `os.execvp` is called with correct args (runtime, tty flags, user, env, container, binary, cli args)
|
||||
- `test_exec_in_container_sudo_probe_sets_prefix` — verify that when first probe fails and sudo probe succeeds, `os.execvp` is called with `sudo -n` prefix
|
||||
- `test_exec_in_container_no_runtime_hard_fails` — keep existing, verify `sys.exit(1)` when `shutil.which` returns None
|
||||
- `test_exec_in_container_non_tty_uses_i_only` — update to check `os.execvp` args instead of `subprocess.run` args
|
||||
- `test_exec_in_container_probe_timeout_prints_message` — verify that `subprocess.TimeoutExpired` from the probe produces a human-readable error and `sys.exit(1)`, not a raw traceback
|
||||
- `test_exec_in_container_container_not_running_no_sudo` — verify the path where runtime exists (`shutil.which` returns a path) but probe returns non-zero and no sudo is available. Should print the "container may be running under root" error. This is distinct from `no_runtime_hard_fails` which covers `shutil.which` returning None.
|
||||
|
||||
### Tests to delete
|
||||
|
||||
- `test_exec_in_container_tty_retries_on_container_failure` — retry loop removed
|
||||
- `test_exec_in_container_non_tty_retries_silently_exits_126` — retry loop removed
|
||||
- `test_exec_in_container_propagates_hermes_exit_code` — no subprocess.run to check exit codes; execvp replaces the process. Note: exit code propagation still works correctly — when `os.execvp` succeeds, the container's process *becomes* this process, so its exit code is the process exit code by OS semantics. No application code needed, no test needed. A comment in the function docstring documents this intent for future readers.
|
||||
|
||||
---
|
||||
|
||||
## Out of Scope
|
||||
|
||||
- Auto-configuring sudoers rules in the NixOS module
|
||||
- Any changes to `get_container_exec_info` parsing logic beyond the try/except narrowing
|
||||
- Changes to `.container-mode` file format
|
||||
- Changes to the `HERMES_DEV=1` bypass
|
||||
- Changes to container detection logic (`_is_inside_container`)
|
||||
@@ -1,151 +0,0 @@
|
||||
## Tool Investigation Report: Top 5 Recommendations from awesome-ai-tools
|
||||
|
||||
**Source:** [formatho/awesome-ai-tools](https://github.com/formatho/awesome-ai-tools)
|
||||
**Date:** 2026-04-15
|
||||
**Tools Analyzed:** 414 across 9 categories
|
||||
**Agent:** Timmy
|
||||
|
||||
---
|
||||
|
||||
## Analysis Summary
|
||||
|
||||
Scanned 414 tools from the awesome-ai-tools repository. Evaluated each against Hermes integration potential across five categories: Memory/Context, Inference Optimization, Agent Orchestration, Workflow Automation, and Retrieval/RAG.
|
||||
|
||||
### Evaluation Criteria
|
||||
- **Stars:** GitHub community validation (stability signal)
|
||||
- **Freshness:** Active development (Fresh = updated within the last 7 days)
|
||||
- **Integration Fit:** How well it complements Hermes' existing architecture (skills, memory, tools)
|
||||
- **Integration Effort:** 1 (trivial drop-in) to 5 (major refactor required)
|
||||
- **Impact:** 1 (incremental) to 5 (transformative)
|
||||
|
||||
---
|
||||
|
||||
## Top 5 Recommended Tools
|
||||
|
||||
### #1: Mem0 — Universal Memory Layer for AI Agents
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| **Category** | Memory/Context |
|
||||
| **GitHub** | [mem0ai/mem0](https://github.com/mem0ai/mem0) |
|
||||
| **Stars** | 53.1k |
|
||||
| **Freshness** | Fresh |
|
||||
| **Integration Effort** | 3/5 |
|
||||
| **Impact** | 5/5 |
|
||||
| **Hermes Status** | IMPLEMENTED (plugins/memory/mem0/) + LOCAL MODE (plugins/memory/mem0_local/) |
|
||||
|
||||
**Why it fits Hermes:**
|
||||
Hermes currently has session_search (transcript recall) and memory (persistent facts), but lacks a unified memory layer that bridges sessions with semantic understanding. Mem0 provides exactly this: automatic memory extraction from conversations, deduplication, and cross-session retrieval with semantic search.
|
||||
|
||||
**Integration path:**
|
||||
- Cloud: plugins/memory/mem0/ (requires MEM0_API_KEY)
|
||||
- Local: plugins/memory/mem0_local/ (ChromaDB-backed, no API key)
|
||||
- Auto-extract facts from session transcripts
|
||||
- Query before session_search for richer contextual recall
|
||||
|
||||
**Key risk:** Mem0 is freemium — the core is open-source, but advanced features require a paid tier. Local mode mitigates this entirely.
|
||||
|
||||
---
|
||||
|
||||
### #2: LightRAG — Simple and Fast Retrieval-Augmented Generation
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| **Category** | Retrieval/RAG |
|
||||
| **GitHub** | [HKUDS/LightRAG](https://github.com/HKUDS/LightRAG) |
|
||||
| **Stars** | 33.1k |
|
||||
| **Freshness** | Fresh |
|
||||
| **Integration Effort** | 3/5 |
|
||||
| **Impact** | 4/5 |
|
||||
| **Hermes Status** | NOT IMPLEMENTED — Issue #857 |
|
||||
|
||||
**Why it fits Hermes:**
|
||||
Hermes has 190+ skills but no unified knowledge retrieval system. LightRAG adds graph-based RAG that understands relationships between concepts, not just keyword matches. It's lightweight, runs locally, and has a simple API.
|
||||
|
||||
**Integration path:**
|
||||
- LightRAG as a local knowledge base for skill references
|
||||
- Index GENOME.md files, README.md, and key codebase files
|
||||
- Use local Ollama models for embeddings
|
||||
- Complements existing search_files without replacing it
|
||||
|
||||
---
|
||||
|
||||
### #3: n8n — Workflow Automation Platform
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| **Category** | Workflow Automation / Agent Orchestration |
|
||||
| **GitHub** | [n8n-io/n8n](https://github.com/n8n-io/n8n) |
|
||||
| **Stars** | 183.9k |
|
||||
| **Freshness** | Fresh |
|
||||
| **Integration Effort** | 4/5 |
|
||||
| **Impact** | 5/5 |
|
||||
| **Hermes Status** | NOT IMPLEMENTED — Issue #858 |
|
||||
|
||||
**Why it fits Hermes:**
|
||||
n8n provides a self-hosted, fair-code workflow platform with 400+ integrations. Rather than replacing Hermes' agent loop, n8n sits above it: trigger Hermes agents from external events, chain multi-agent workflows, and visualize execution.
|
||||
|
||||
---
|
||||
|
||||
### #4: RAGFlow — Open-Source RAG Engine
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| **Category** | Retrieval/RAG |
|
||||
| **GitHub** | [infiniflow/ragflow](https://github.com/infiniflow/ragflow) |
|
||||
| **Stars** | 77.9k |
|
||||
| **Freshness** | Fresh |
|
||||
| **Integration Effort** | 4/5 |
|
||||
| **Impact** | 4/5 |
|
||||
| **Hermes Status** | NOT IMPLEMENTED — Issue #859 |
|
||||
|
||||
**Why it fits Hermes:**
|
||||
RAGFlow handles document parsing (PDF, Word, images via OCR), chunking, embedding, and retrieval with a web UI. This makes "document understanding" a first-class capability.
|
||||
|
||||
---
|
||||
|
||||
### #5: tensorzero — LLMOps Platform
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| **Category** | Inference Optimization / LLMOps |
|
||||
| **GitHub** | [tensorzero/tensorzero](https://github.com/tensorzero/tensorzero) |
|
||||
| **Stars** | 11.2k |
|
||||
| **Freshness** | Fresh |
|
||||
| **Integration Effort** | 3/5 |
|
||||
| **Impact** | 4/5 |
|
||||
| **Hermes Status** | NOT IMPLEMENTED — Issue #860 |
|
||||
|
||||
**Why it fits Hermes:**
|
||||
TensorZero unifies LLM gateway, observability, evaluation, and optimization. Replaces custom provider routing with a maintained, battle-tested platform.
|
||||
|
||||
---
|
||||
|
||||
## Honorable Mentions
|
||||
|
||||
| Tool | Stars | Category | Why Not Top 5 |
|
||||
|------|-------|----------|---------------|
|
||||
| memvid | 14.9k | Memory | Newer; Mem0 is more mature |
|
||||
| mempalace | 44.8k | Memory | Already evaluated; Mem0 has broader API |
|
||||
| Everything Claude Code | 154.3k | Agent | Too Claude-specific |
|
||||
| Portkey AI Gateway | 11.3k | Gateway | TensorZero is OSS; Portkey is freemium |
|
||||
|
||||
---
|
||||
|
||||
## Implementation Priority
|
||||
|
||||
| Priority | Tool | Action | Status | Issue |
|
||||
|----------|------|--------|--------|-------|
|
||||
| P1 | Mem0 | Local-only mode (ChromaDB) | DONE | #842 |
|
||||
| P2 | LightRAG | Set up local instance, index skills | Not started | #857 |
|
||||
| P3 | tensorzero | Evaluate as provider routing | Not started | #860 |
|
||||
| P4 | RAGFlow | Deploy Docker, test docs | Not started | #859 |
|
||||
| P5 | n8n | Deploy for workflow viz | Not started | #858 |
|
||||
|
||||
---
|
||||
|
||||
## References
|
||||
- Source: https://github.com/formatho/awesome-ai-tools
|
||||
- Total tools: 414 across 9 categories
|
||||
- Last updated: April 16, 2026
|
||||
- Tracking issue: Timmy_Foundation/hermes-agent#842
|
||||
@@ -1,24 +0,0 @@
|
||||
# Tool Investigation Report: Top 5 Recommendations
|
||||
|
||||
**Generated:** 2026-04-20 | **Source:** formatho/awesome-ai-tools (795 tools, 10 categories)
|
||||
|
||||
## Top 5
|
||||
|
||||
1. **LiteLLM** (76k) — Unified API gateway. Replace custom provider routing. Impact: 5/5, Effort: 2/5
|
||||
2. **Mem0** (53k) — Universal memory layer. Structured long-term memory. Impact: 5/5, Effort: 3/5
|
||||
3. **RAGFlow** (77k) — RAG engine with OCR. Document processing upgrade. Impact: 4/5, Effort: 4/5
|
||||
4. **LiteRT-LM** (3.7k) — On-device inference. Edge/mobile deployment. Impact: 4/5, Effort: 3/5
|
||||
5. **Claude-Mem** (61k) — Session capture and context injection. Impact: 3/5, Effort: 2/5
|
||||
|
||||
## Priority
|
||||
|
||||
- Phase 1: LiteLLM (2-3 days, highest ROI)
|
||||
- Phase 2: Mem0 (1 week, critical for agent maturity)
|
||||
- Phase 3: RAGFlow (1-2 weeks, capability upgrade)
|
||||
|
||||
## Honorable Mentions
|
||||
|
||||
- GPTCache: Semantic cache, 30-50% cost reduction
|
||||
- promptfoo: LLM testing framework
|
||||
- PageIndex: Vectorless RAG
|
||||
- rtk: Token reduction proxy, 60-90% savings
|
||||
@@ -49,8 +49,6 @@ class HermesToolCallParser(ToolCallParser):
|
||||
continue
|
||||
|
||||
tc_data = json.loads(raw_json)
|
||||
if "name" not in tc_data:
|
||||
continue
|
||||
tool_calls.append(
|
||||
ChatCompletionMessageToolCall(
|
||||
id=f"call_{uuid.uuid4().hex[:8]}",
|
||||
|
||||
@@ -89,8 +89,6 @@ class MistralToolCallParser(ToolCallParser):
|
||||
parsed = [parsed]
|
||||
|
||||
for tc in parsed:
|
||||
if "name" not in tc:
|
||||
continue
|
||||
args = tc.get("arguments", {})
|
||||
if isinstance(args, dict):
|
||||
args = json.dumps(args, ensure_ascii=False)
|
||||
|
||||
8
flake.lock
generated
8
flake.lock
generated
@@ -22,16 +22,16 @@
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1775036866,
|
||||
"narHash": "sha256-ZojAnPuCdy657PbTq5V0Y+AHKhZAIwSIT2cb8UgAz/U=",
|
||||
"lastModified": 1751274312,
|
||||
"narHash": "sha256-/bVBlRpECLVzjV19t5KMdMFWSwKLtb5RyXdjz3LJT+g=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "6201e203d09599479a3b3450ed24fa81537ebc4e",
|
||||
"rev": "50ab793786d9de88ee30ec4e4c24fb4236fc2674",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"ref": "nixos-24.11",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
description = "Hermes Agent - AI agent framework by Nous Research";
|
||||
|
||||
inputs = {
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.11";
|
||||
flake-parts = {
|
||||
url = "github:hercules-ci/flake-parts";
|
||||
inputs.nixpkgs-lib.follows = "nixpkgs";
|
||||
|
||||
@@ -18,7 +18,9 @@ suppress delivery.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger("hooks.boot-md")
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user