Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Whitestone
29ff50c63d feat: Long Context vs RAG Decision Framework (research backlog #4.3)
Some checks failed
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Nix / nix (ubuntu-latest) (pull_request) Failing after 3s
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Failing after 1m6s
Tests / e2e (pull_request) Successful in 2m0s
Tests / test (pull_request) Failing after 33m14s
Nix / nix (macos-latest) (pull_request) Has been cancelled
Implements adaptive retrieval strategy that adjusts prefetch behavior
based on context pressure. With models at 128K-1M context windows,
always prefetching RAG is wasteful when context is empty and
insufficient when context is full.

Core changes:
- agent/context_strategy.py: Strategy module with ContextBudget,
  compute_prefetch_params(), should_prefetch() - STUFF/HYBRID/SELECTIVE
  tiers based on context pressure (30%/70% thresholds)
- agent/memory_manager.py: set_context_budget() method + adaptive
  prefetch in prefetch_all() with provider fallback for non-supporting
- plugins/memory/holographic: Accept limit/min_trust kwargs in prefetch()
- run_agent.py: Wire context_compressor state to memory_manager before
  prefetch_all() call
- tools/context_strategy.py: Agent-facing tool with task classification
  (crisis/factual/creative/analysis) and decision engine with 6 rules

Research basis:
- Self-RAG (Asai et al., 2023) - arxiv 2310.11511
- Long Context vs RAG Decision Framework (Timmy Foundation #4.3)
- FrugalGPT - arxiv 2305.05176

Tests: 19 new tests pass. Full context strategy, prefetch params,
should_prefetch decision logic, and strategy report generation.

Impact: Ratio 4.0 (Impact 4, Effort 1). Eliminates over-retrieval on
large-context models and prevents under-retrieval when context is tight.
Crisis intervention tasks always get HYBRID minimum for safety.
2026-04-12 04:58:13 -04:00
623 changed files with 8067 additions and 88234 deletions

View File

@@ -5,7 +5,6 @@
# Dependencies
node_modules
.venv
# CI/CD
.github

View File

@@ -43,15 +43,6 @@
# KIMI_BASE_URL=https://api.kimi.com/coding/v1 # Default for sk-kimi- keys
# KIMI_BASE_URL=https://api.moonshot.ai/v1 # For legacy Moonshot keys
# KIMI_BASE_URL=https://api.moonshot.cn/v1 # For Moonshot China keys
# KIMI_CN_API_KEY= # Dedicated Moonshot China key
# =============================================================================
# LLM PROVIDER (Arcee AI)
# =============================================================================
# Arcee AI provides access to Trinity models (trinity-mini, trinity-large-*)
# Get an Arcee key at: https://chat.arcee.ai/
# ARCEEAI_API_KEY=
# ARCEE_BASE_URL= # Override default base URL
# =============================================================================
# LLM PROVIDER (MiniMax)
@@ -98,15 +89,6 @@
# Optional base URL override:
# HERMES_QWEN_BASE_URL=https://portal.qwen.ai/v1
# =============================================================================
# LLM PROVIDER (Xiaomi MiMo)
# =============================================================================
# Xiaomi MiMo models (mimo-v2-pro, mimo-v2-omni, mimo-v2-flash).
# Get your key at: https://platform.xiaomimimo.com
# XIAOMI_API_KEY=your_key_here
# Optional base URL override:
# XIAOMI_BASE_URL=https://api.xiaomimimo.com/v1
# =============================================================================
# TOOL API KEYS
# =============================================================================
@@ -145,10 +127,6 @@
# Only override here if you need to force a backend without touching config.yaml:
# TERMINAL_ENV=local
# Override the container runtime binary (e.g. to use Podman instead of Docker).
# Useful on systems where Docker's storage driver is broken or unavailable.
# HERMES_DOCKER_BINARY=/usr/local/bin/podman
# Container images (for singularity/docker/modal backends)
# TERMINAL_DOCKER_IMAGE=nikolaik/python-nodejs:python3.11-nodejs20
# TERMINAL_SINGULARITY_IMAGE=docker://nikolaik/python-nodejs:python3.11-nodejs20

2
.gitattributes vendored
View File

@@ -1,2 +0,0 @@
# Auto-generated files — collapse diffs and exclude from language stats
web/package-lock.json linguist-generated=true

View File

@@ -1,28 +0,0 @@
name: Lint
on:
push:
branches: [main]
pull_request:
branches: [main]
jobs:
lint:
runs-on: ubuntu-latest
timeout-minutes: 5
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Check for hardcoded paths
run: python3 scripts/lint_hardcoded_paths.py
continue-on-error: true
- name: Check Python syntax
run: |
find . -name "*.py" -not -path "./.git/*" -not -path "./node_modules/*" | head -100 | xargs python3 -m py_compile || true

View File

@@ -1,78 +0,0 @@
#!/usr/bin/env python3
"""
Pre-commit hook: Reject hardcoded home-directory paths.
Install:
cp pre-commit-hardcoded-path.py .git/hooks/pre-commit-hardcoded-path
chmod +x .git/hooks/pre-commit-hardcoded-path
Or add to .pre-commit-config.yaml
"""
import sys
import subprocess
import re
PATTERNS = [
(r"/Users/[\w.\-]+/", "macOS home directory"),
(r"/home/[\w.\-]+/", "Linux home directory"),
(r"(?<![\w/])~/", "unexpanded tilde"),
]
NOQA = re.compile(r"#\s*noqa:?\s*hardcoded-path-ok")
def get_staged_files():
result = subprocess.run(
["git", "diff", "--cached", "--name-only", "--diff-filter=ACM"],
capture_output=True, text=True
)
return [f for f in result.stdout.strip().split("\n") if f.endswith(".py")]
def check_file(filepath):
try:
result = subprocess.run(
["git", "show", f":{filepath}"],
capture_output=True, text=True
)
content = result.stdout
except Exception:
return []
violations = []
for i, line in enumerate(content.split("\n"), 1):
if line.strip().startswith("#"):
continue
if line.strip().startswith(("import ", "from ")):
continue
if NOQA.search(line):
continue
for pattern, desc in PATTERNS:
if re.search(pattern, line):
violations.append((filepath, i, line.strip(), desc))
break
return violations
def main():
files = get_staged_files()
if not files:
sys.exit(0)
all_violations = []
for f in files:
all_violations.extend(check_file(f))
if all_violations:
print("ERROR: Hardcoded home directory paths detected:")
print()
for filepath, line_no, line, desc in all_violations:
print(f" {filepath}:{line_no}: {desc}")
print(f" {line[:100]}")
print()
print("Fix: Use $HOME, relative paths, or get_hermes_home().")
print("Override: Add '# noqa: hardcoded-path-ok' to the line.")
sys.exit(1)
sys.exit(0)
if __name__ == "__main__":
main()

View File

@@ -11,7 +11,6 @@ body:
**Before submitting**, please:
- [ ] Search [existing issues](https://github.com/NousResearch/hermes-agent/issues) to avoid duplicates
- [ ] Update to the latest version (`hermes update`) and confirm the bug still exists
- [ ] Run `hermes debug share` and paste the links below (see Debug Report section)
- type: textarea
id: description
@@ -83,25 +82,6 @@ body:
- Slack
- WhatsApp
- type: textarea
id: debug-report
attributes:
label: Debug Report
description: |
Run `hermes debug share` from your terminal and paste the links it prints here.
This uploads your system info, config, and recent logs to a paste service automatically.
If you're in an interactive chat session, you can also use the `/debug` slash command — it does the same thing.
If the upload fails, run `hermes debug share --local` and paste the output directly.
placeholder: |
Report https://paste.rs/abc123
agent.log https://paste.rs/def456
gateway.log https://paste.rs/ghi789
render: shell
validations:
required: true
- type: input
id: os
attributes:
@@ -117,6 +97,8 @@ body:
label: Python Version
description: Output of `python --version`
placeholder: "3.11.9"
validations:
required: true
- type: input
id: hermes-version
@@ -124,14 +106,14 @@ body:
label: Hermes Version
description: Output of `hermes version`
placeholder: "2.1.0"
validations:
required: true
- type: textarea
id: logs
attributes:
label: Additional Logs / Traceback (optional)
description: |
The debug report above covers most logs. Use this field for any extra error output,
tracebacks, or screenshots not captured by `hermes debug share`.
label: Relevant Logs / Traceback
description: Paste any error output, traceback, or log messages. This will be auto-formatted as code.
render: shell
- type: textarea

View File

@@ -71,15 +71,3 @@ body:
label: Contribution
options:
- label: I'd like to implement this myself and submit a PR
- type: textarea
id: debug-report
attributes:
label: Debug Report (optional)
description: |
If this feature request is related to a problem you're experiencing, run `hermes debug share` and paste the links here.
In an interactive chat session, you can use `/debug` instead.
This helps us understand your environment and any related logs.
placeholder: |
Report https://paste.rs/abc123
render: shell

View File

@@ -9,8 +9,7 @@ body:
Sorry you're having trouble! Please fill out the details below so we can help.
**Quick checks first:**
- Run `hermes debug share` and paste the links in the Debug Report section below
- If you're in a chat session, you can use `/debug` instead — it does the same thing
- Run `hermes doctor` and include the output below
- Try `hermes update` to get the latest version
- Check the [README troubleshooting section](https://github.com/NousResearch/hermes-agent#troubleshooting)
- For general questions, consider the [Nous Research Discord](https://discord.gg/NousResearch) for faster help
@@ -75,21 +74,10 @@ body:
placeholder: "2.1.0"
- type: textarea
id: debug-report
id: doctor-output
attributes:
label: Debug Report
description: |
Run `hermes debug share` from your terminal and paste the links it prints here.
This uploads your system info, config, and recent logs to a paste service automatically.
If you're in an interactive chat session, you can also use the `/debug` slash command — it does the same thing.
If the upload fails or install didn't get that far, run `hermes debug share --local` and paste the output directly.
If even that doesn't work, run `hermes doctor` and paste that output instead.
placeholder: |
Report https://paste.rs/abc123
agent.log https://paste.rs/def456
gateway.log https://paste.rs/ghi789
label: Output of `hermes doctor`
description: Run `hermes doctor` and paste the full output. This will be auto-formatted.
render: shell
- type: textarea

View File

@@ -1,73 +0,0 @@
name: Contributor Attribution Check
on:
pull_request:
branches: [main]
paths:
# Only run when code files change (not docs-only PRs)
- '*.py'
- '**/*.py'
- '.github/workflows/contributor-check.yml'
permissions:
contents: read
jobs:
check-attribution:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
with:
fetch-depth: 0 # Full history needed for git log
- name: Check for unmapped contributor emails
run: |
# Get the merge base between this PR and main
MERGE_BASE=$(git merge-base origin/main HEAD)
# Find any new author emails in this PR's commits
NEW_EMAILS=$(git log ${MERGE_BASE}..HEAD --format='%ae' --no-merges | sort -u)
if [ -z "$NEW_EMAILS" ]; then
echo "No new commits to check."
exit 0
fi
# Check each email against AUTHOR_MAP in release.py
MISSING=""
while IFS= read -r email; do
# Skip teknium and bot emails
case "$email" in
*teknium*|*noreply@github.com*|*dependabot*|*github-actions*|*anthropic.com*|*cursor.com*)
continue ;;
esac
# Check if email is in AUTHOR_MAP (either as a key or matches noreply pattern)
if echo "$email" | grep -qP '\+.*@users\.noreply\.github\.com'; then
continue # GitHub noreply emails auto-resolve
fi
if ! grep -qF "\"${email}\"" scripts/release.py 2>/dev/null; then
AUTHOR=$(git log --author="$email" --format='%an' -1)
MISSING="${MISSING}\n ${email} (${AUTHOR})"
fi
done <<< "$NEW_EMAILS"
if [ -n "$MISSING" ]; then
echo ""
echo "⚠️ New contributor email(s) not in AUTHOR_MAP:"
echo -e "$MISSING"
echo ""
echo "Please add mappings to scripts/release.py AUTHOR_MAP:"
echo -e "$MISSING" | while read -r line; do
email=$(echo "$line" | sed 's/^ *//' | cut -d' ' -f1)
[ -z "$email" ] && continue
echo " \"${email}\": \"<github-username>\","
done
echo ""
echo "To find the GitHub username for an email:"
echo " gh api 'search/users?q=EMAIL+in:email' --jq '.items[0].login'"
exit 1
else
echo "✅ All contributor emails are mapped in AUTHOR_MAP."
fi

View File

@@ -28,32 +28,24 @@ jobs:
name: github-pages
url: ${{ steps.deploy.outputs.page_url }}
steps:
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
- uses: actions/checkout@v4
- uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
- uses: actions/setup-node@v4
with:
node-version: 20
cache: npm
cache-dependency-path: website/package-lock.json
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
- uses: actions/setup-python@v5
with:
python-version: '3.11'
- name: Install PyYAML for skill extraction
run: pip install pyyaml==6.0.2 httpx==0.28.1
run: pip install pyyaml
- name: Extract skill metadata for dashboard
run: python3 website/scripts/extract-skills.py
- name: Build skills index (if not already present)
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
if [ ! -f website/static/api/skills-index.json ]; then
python3 scripts/build_skills_index.py || echo "Skills index build failed (non-fatal)"
fi
- name: Install dependencies
run: npm ci
working-directory: website
@@ -73,10 +65,10 @@ jobs:
echo "hermes-agent.nousresearch.com" > _site/CNAME
- name: Upload artifact
uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # v3
uses: actions/upload-pages-artifact@v3
with:
path: _site
- name: Deploy to GitHub Pages
id: deploy
uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4
uses: actions/deploy-pages@v4

View File

@@ -23,21 +23,21 @@ jobs:
timeout-minutes: 60
steps:
- name: Checkout code
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
uses: actions/checkout@v4
with:
submodules: recursive
- name: Set up QEMU
uses: docker/setup-qemu-action@c7c53464625b32c7a7e944ae62b3e17d2b600130 # v3
uses: docker/setup-qemu-action@v3
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # v3
uses: docker/setup-buildx-action@v3
# Build amd64 only so we can `load` the image for smoke testing.
# `load: true` cannot export a multi-arch manifest to the local daemon.
# The multi-arch build follows on push to main / release.
- name: Build image (amd64, smoke test)
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
uses: docker/build-push-action@v6
with:
context: .
file: Dockerfile
@@ -56,31 +56,36 @@ jobs:
- name: Log in to Docker Hub
if: github.event_name == 'push' && github.ref == 'refs/heads/main' || github.event_name == 'release'
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # v3
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Push multi-arch image (main branch)
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
uses: docker/build-push-action@v6
with:
context: .
file: Dockerfile
push: true
platforms: linux/amd64,linux/arm64
tags: nousresearch/hermes-agent:latest
tags: |
nousresearch/hermes-agent:latest
nousresearch/hermes-agent:${{ github.sha }}
cache-from: type=gha
cache-to: type=gha,mode=max
- name: Push multi-arch image (release)
if: github.event_name == 'release'
uses: docker/build-push-action@10e90e3645eae34f1e60eeb005ba3a3d33f178e8 # v6
uses: docker/build-push-action@v6
with:
context: .
file: Dockerfile
push: true
platforms: linux/amd64,linux/arm64
tags: nousresearch/hermes-agent:${{ github.event.release.tag_name }}
tags: |
nousresearch/hermes-agent:latest
nousresearch/hermes-agent:${{ github.event.release.tag_name }}
nousresearch/hermes-agent:${{ github.sha }}
cache-from: type=gha
cache-to: type=gha,mode=max

View File

@@ -7,16 +7,13 @@ on:
- '.github/workflows/docs-site-checks.yml'
workflow_dispatch:
permissions:
contents: read
jobs:
docs-site-checks:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
- uses: actions/checkout@v4
- uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
- uses: actions/setup-node@v4
with:
node-version: 20
cache: npm
@@ -26,7 +23,7 @@ jobs:
run: npm ci
working-directory: website
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
- uses: actions/setup-python@v5
with:
python-version: '3.11'

View File

@@ -14,9 +14,6 @@ on:
- 'run_agent.py'
- 'acp_adapter/**'
permissions:
contents: read
concurrency:
group: nix-${{ github.ref }}
cancel-in-progress: true
@@ -29,7 +26,7 @@ jobs:
runs-on: ${{ matrix.os }}
timeout-minutes: 30
steps:
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
- uses: actions/checkout@v4
- uses: DeterminateSystems/nix-installer-action@ef8a148080ab6020fd15196c2084a2eea5ff2d25 # v22
- uses: DeterminateSystems/magic-nix-cache-action@565684385bcd71bad329742eefe8d12f2e765b39 # v13
- name: Check flake

View File

@@ -1,101 +0,0 @@
name: Build Skills Index
on:
schedule:
# Run twice daily: 6 AM and 6 PM UTC
- cron: '0 6,18 * * *'
workflow_dispatch: # Manual trigger
push:
branches: [main]
paths:
- 'scripts/build_skills_index.py'
- '.github/workflows/skills-index.yml'
permissions:
contents: read
jobs:
build-index:
# Only run on the upstream repository, not on forks
if: github.repository == 'NousResearch/hermes-agent'
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
with:
python-version: '3.11'
- name: Install dependencies
run: pip install httpx==0.28.1 pyyaml==6.0.2
- name: Build skills index
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: python scripts/build_skills_index.py
- name: Upload index artifact
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
with:
name: skills-index
path: website/static/api/skills-index.json
retention-days: 7
deploy-with-index:
needs: build-index
runs-on: ubuntu-latest
permissions:
pages: write
id-token: write
environment:
name: github-pages
url: ${{ steps.deploy.outputs.page_url }}
# Only deploy on schedule or manual trigger (not on every push to the script)
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
steps:
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
- uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
with:
name: skills-index
path: website/static/api/
- uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
with:
node-version: 20
cache: npm
cache-dependency-path: website/package-lock.json
- uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
with:
python-version: '3.11'
- name: Install PyYAML for skill extraction
run: pip install pyyaml==6.0.2
- name: Extract skill metadata for dashboard
run: python3 website/scripts/extract-skills.py
- name: Install dependencies
run: npm ci
working-directory: website
- name: Build Docusaurus
run: npm run build
working-directory: website
- name: Stage deployment
run: |
mkdir -p _site/docs
cp -r landingpage/* _site/
cp -r website/build/* _site/docs/
echo "hermes-agent.nousresearch.com" > _site/CNAME
- name: Upload artifact
uses: actions/upload-pages-artifact@56afc609e74202658d3ffba0e8f6dda462b719fa # v3
with:
path: _site
- name: Deploy to GitHub Pages
id: deploy
uses: actions/deploy-pages@d6db90164ac5ed86f2b6aed7e0febac5b3c0c03e # v4

View File

@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
uses: actions/checkout@v4
with:
fetch-depth: 0
@@ -149,62 +149,6 @@ jobs:
"
fi
# --- CI/CD workflow files modified ---
WORKFLOW_HITS=$(git diff --name-only "$BASE".."$HEAD" | grep -E '\.github/workflows/.*\.ya?ml$' || true)
if [ -n "$WORKFLOW_HITS" ]; then
FINDINGS="${FINDINGS}
### ⚠️ WARNING: CI/CD workflow files modified
Changes to workflow files can alter build pipelines, inject steps, or modify permissions. Verify no unauthorized actions or secrets access were added.
**Files:**
\`\`\`
${WORKFLOW_HITS}
\`\`\`
"
fi
# --- Dockerfile / container build files modified ---
DOCKER_HITS=$(git diff --name-only "$BASE".."$HEAD" | grep -iE '(Dockerfile|\.dockerignore|docker-compose)' || true)
if [ -n "$DOCKER_HITS" ]; then
FINDINGS="${FINDINGS}
### ⚠️ WARNING: Container build files modified
Changes to Dockerfiles or compose files can alter base images, add build steps, or expose ports. Verify base image pins and build commands.
**Files:**
\`\`\`
${DOCKER_HITS}
\`\`\`
"
fi
# --- Dependency manifest files modified ---
DEP_HITS=$(git diff --name-only "$BASE".."$HEAD" | grep -E '(pyproject\.toml|requirements.*\.txt|package\.json|Gemfile|go\.mod|Cargo\.toml)$' || true)
if [ -n "$DEP_HITS" ]; then
FINDINGS="${FINDINGS}
### ⚠️ WARNING: Dependency manifest files modified
Changes to dependency files can introduce new packages or change version pins. Verify all dependency changes are intentional and from trusted sources.
**Files:**
\`\`\`
${DEP_HITS}
\`\`\`
"
fi
# --- GitHub Actions version unpinning (mutable tags instead of SHAs) ---
ACTIONS_UNPIN=$(echo "$DIFF" | grep -n '^\+' | grep 'uses:' | grep -v '#' | grep -E '@v[0-9]' | head -10 || true)
if [ -n "$ACTIONS_UNPIN" ]; then
FINDINGS="${FINDINGS}
### ⚠️ WARNING: GitHub Actions with mutable version tags
Actions should be pinned to full commit SHAs (not \`@v4\`, \`@v5\`). Mutable tags can be retargeted silently if a maintainer account is compromised.
**Matches:**
\`\`\`
${ACTIONS_UNPIN}
\`\`\`
"
fi
# --- Output results ---
if [ -n "$FINDINGS" ]; then
echo "found=true" >> "$GITHUB_OUTPUT"
@@ -239,7 +183,7 @@ jobs:
---
*Automated scan triggered by [supply-chain-audit](/.github/workflows/supply-chain-audit.yml). If this is a false positive, a maintainer can approve after manual review.*"
gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY" || echo "::warning::Could not post PR comment (expected for fork PRs — GITHUB_TOKEN is read-only)"
gh pr comment "${{ github.event.pull_request.number }}" --body "$BODY"
- name: Fail on critical findings
if: steps.scan.outputs.critical == 'true'

View File

@@ -6,9 +6,6 @@ on:
pull_request:
branches: [main]
permissions:
contents: read
# Cancel in-progress runs for the same PR/branch
concurrency:
group: tests-${{ github.ref }}
@@ -20,17 +17,13 @@ jobs:
timeout-minutes: 10
steps:
- name: Checkout code
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
uses: actions/checkout@v4
- name: Install system dependencies
run: sudo apt-get update && sudo apt-get install -y ripgrep
- name: Check for hardcoded paths
run: python3 scripts/lint_hardcoded_paths.py || true
continue-on-error: true
- name: Install uv
uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
uses: astral-sh/setup-uv@v5
- name: Set up Python 3.11
run: uv python install 3.11
@@ -56,10 +49,10 @@ jobs:
timeout-minutes: 10
steps:
- name: Checkout code
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
uses: actions/checkout@v4
- name: Install uv
uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
uses: astral-sh/setup-uv@v5
- name: Set up Python 3.11
run: uv python install 3.11

4
.gitignore vendored
View File

@@ -51,9 +51,6 @@ ignored/
.worktrees/
environments/benchmarks/evals/
# Web UI build output
hermes_cli/web_dist/
# Release script temp files
.release_notes.md
mini-swe-agent/
@@ -61,4 +58,3 @@ mini-swe-agent/
# Nix
.direnv/
result
website/static/api/skills-index.json

107
.mailmap
View File

@@ -1,107 +0,0 @@
# .mailmap — canonical author mapping for git shortlog / git log / GitHub
# Format: Canonical Name <canonical@email> <commit@email>
# See: https://git-scm.com/docs/gitmailmap
#
# This maps commit emails to GitHub noreply addresses so that:
# 1. `git shortlog -sn` shows deduplicated contributor counts
# 2. GitHub's contributor graph can attribute commits correctly
# 3. Contributors with personal/work emails get proper credit
#
# When adding entries: use the contributor's GitHub noreply email as canonical
# so GitHub can link commits to their profile.
# === Teknium (multiple emails) ===
Teknium <127238744+teknium1@users.noreply.github.com> <teknium1@gmail.com>
Teknium <127238744+teknium1@users.noreply.github.com> <teknium@nousresearch.com>
# === Contributors — personal/work emails mapped to GitHub noreply ===
# Format: Canonical Name <GH-noreply> <commit-email>
# Verified via GH API email search
luyao618 <364939526@qq.com> <364939526@qq.com>
ethernet8023 <arilotter@gmail.com> <arilotter@gmail.com>
nicoloboschi <boschi1997@gmail.com> <boschi1997@gmail.com>
cherifya <chef.ya@gmail.com> <chef.ya@gmail.com>
BongSuCHOI <chlqhdtn98@gmail.com> <chlqhdtn98@gmail.com>
dsocolobsky <dsocolobsky@gmail.com> <dsocolobsky@gmail.com>
pefontana <fontana.pedro93@gmail.com> <fontana.pedro93@gmail.com>
Helmi <frank@helmschrott.de> <frank@helmschrott.de>
hata1234 <hata1234@gmail.com> <hata1234@gmail.com>
# Verified via PR investigation / salvage PR bodies
DeployFaith <agents@kylefrench.dev> <agents@kylefrench.dev>
flobo3 <floptopbot33@gmail.com> <floptopbot33@gmail.com>
gaixianggeng <gaixg94@gmail.com> <gaixg94@gmail.com>
KUSH42 <xush@xush.org> <xush@xush.org>
konsisumer <der@konsi.org> <der@konsi.org>
WorldInnovationsDepartment <vorvul.danylo@gmail.com> <vorvul.danylo@gmail.com>
m0n5t3r <iacobs@m0n5t3r.info> <iacobs@m0n5t3r.info>
sprmn24 <oncuevtv@gmail.com> <oncuevtv@gmail.com>
fancydirty <fancydirty@gmail.com> <fancydirty@gmail.com>
fxfitz <francis.x.fitzpatrick@gmail.com> <francis.x.fitzpatrick@gmail.com>
limars874 <limars874@gmail.com> <limars874@gmail.com>
AaronWong1999 <aaronwong1999@icloud.com> <aaronwong1999@icloud.com>
dippwho <dipp.who@gmail.com> <dipp.who@gmail.com>
duerzy <duerzy@gmail.com> <duerzy@gmail.com>
geoffwellman <geoff.wellman@gmail.com> <geoff.wellman@gmail.com>
hcshen0111 <shenhaocheng19990111@gmail.com> <shenhaocheng19990111@gmail.com>
jamesarch <han.shan@live.cn> <han.shan@live.cn>
stephenschoettler <stephenschoettler@gmail.com> <stephenschoettler@gmail.com>
Tranquil-Flow <tranquil_flow@protonmail.com> <tranquil_flow@protonmail.com>
Dusk1e <yusufalweshdemir@gmail.com> <yusufalweshdemir@gmail.com>
Awsh1 <ysfalweshcan@gmail.com> <ysfalweshcan@gmail.com>
WAXLYY <ysfwaxlycan@gmail.com> <ysfwaxlycan@gmail.com>
donrhmexe <don.rhm@gmail.com> <don.rhm@gmail.com>
hqhq1025 <1506751656@qq.com> <1506751656@qq.com>
BlackishGreen33 <s5460703@gmail.com> <s5460703@gmail.com>
tomqiaozc <zqiao@microsoft.com> <zqiao@microsoft.com>
MagicRay1217 <mingjwan@microsoft.com> <mingjwan@microsoft.com>
aaronagent <1115117931@qq.com> <1115117931@qq.com>
YoungYang963 <young@YoungdeMacBook-Pro.local> <young@YoungdeMacBook-Pro.local>
LongOddCode <haolong@microsoft.com> <haolong@microsoft.com>
Cafexss <coffeemjj@gmail.com> <coffeemjj@gmail.com>
Cygra <sjtuwbh@gmail.com> <sjtuwbh@gmail.com>
DomGrieco <dgrieco@redhat.com> <dgrieco@redhat.com>
# Duplicate email mapping (same person, multiple emails)
Sertug17 <104278804+Sertug17@users.noreply.github.com> <srhtsrht17@gmail.com>
yyovil <birdiegyal@gmail.com> <tanishq231003@gmail.com>
DomGrieco <dgrieco@redhat.com> <dgrieco@redhat.com>
dsocolobsky <dsocolobsky@gmail.com> <dylan.socolobsky@lambdaclass.com>
olafthiele <programming@olafthiele.com> <olafthiele@gmail.com>
# Verified via git display name matching GH contributor username
cokemine <aptx4561@gmail.com> <aptx4561@gmail.com>
dalianmao000 <dalianmao0107@gmail.com> <dalianmao0107@gmail.com>
emozilla <emozilla@nousresearch.com> <emozilla@nousresearch.com>
jjovalle99 <juan.ovalle@mistral.ai> <juan.ovalle@mistral.ai>
kagura-agent <kagura.chen28@gmail.com> <kagura.chen28@gmail.com>
spniyant <niyant@spicefi.xyz> <niyant@spicefi.xyz>
olafthiele <programming@olafthiele.com> <programming@olafthiele.com>
r266-tech <r2668940489@gmail.com> <r2668940489@gmail.com>
xingkongliang <tianliangjay@gmail.com> <tianliangjay@gmail.com>
win4r <win4r@outlook.com> <win4r@outlook.com>
zhouboli <zhouboli@gmail.com> <zhouboli@gmail.com>
yongtenglei <yongtenglei@gmail.com> <yongtenglei@gmail.com>
# Nous Research team
benbarclay <ben@nousresearch.com> <ben@nousresearch.com>
jquesnelle <jonny@nousresearch.com> <jonny@nousresearch.com>
# GH contributor list verified
spideystreet <dhicham.pro@gmail.com> <dhicham.pro@gmail.com>
dorukardahan <dorukardahan@hotmail.com> <dorukardahan@hotmail.com>
MustafaKara7 <karamusti912@gmail.com> <karamusti912@gmail.com>
Hmbown <hmbown@gmail.com> <hmbown@gmail.com>
kamil-gwozdz <kamil@gwozdz.me> <kamil@gwozdz.me>
kira-ariaki <kira@ariaki.me> <kira@ariaki.me>
knopki <knopki@duck.com> <knopki@duck.com>
Unayung <unayung@gmail.com> <unayung@gmail.com>
SeeYangZhi <yangzhi.see@gmail.com> <yangzhi.see@gmail.com>
Julientalbot <julien.talbot@ergonomia.re> <julien.talbot@ergonomia.re>
lesterli <lisicheng168@gmail.com> <lisicheng168@gmail.com>
JiayuuWang <jiayuw794@gmail.com> <jiayuw794@gmail.com>
tesseracttars-creator <tesseracttars@gmail.com> <tesseracttars@gmail.com>
xinbenlv <zzn+pa@zzn.im> <zzn+pa@zzn.im>
SaulJWu <saul.jj.wu@gmail.com> <saul.jj.wu@gmail.com>
angelos <angelos@oikos.lan.home.malaiwah.com> <angelos@oikos.lan.home.malaiwah.com>

View File

@@ -13,7 +13,7 @@ source venv/bin/activate # ALWAYS activate before running Python
```
hermes-agent/
├── run_agent.py # AIAgent class — core conversation loop
├── model_tools.py # Tool orchestration, discover_builtin_tools(), handle_function_call()
├── model_tools.py # Tool orchestration, _discover_tools(), handle_function_call()
├── toolsets.py # Toolset definitions, _HERMES_CORE_TOOLS list
├── cli.py # HermesCLI class — interactive CLI orchestrator
├── hermes_state.py # SessionDB — SQLite session store (FTS5 search)
@@ -55,7 +55,7 @@ hermes-agent/
├── gateway/ # Messaging platform gateway
│ ├── run.py # Main loop, slash commands, message dispatch
│ ├── session.py # SessionStore — conversation persistence
│ └── platforms/ # Adapters: telegram, discord, slack, whatsapp, homeassistant, signal, qqbot
│ └── platforms/ # Adapters: telegram, discord, slack, whatsapp, homeassistant, signal
├── acp_adapter/ # ACP server (VS Code / Zed / JetBrains integration)
├── cron/ # Scheduler (jobs.py, scheduler.py)
├── environments/ # RL training environments (Atropos)
@@ -181,7 +181,7 @@ if canonical == "mycommand":
## Adding New Tools
Requires changes in **2 files**:
Requires changes in **3 files**:
**1. Create `tools/your_tool.py`:**
```python
@@ -204,9 +204,9 @@ registry.register(
)
```
**2. Add to `toolsets.py`** — either `_HERMES_CORE_TOOLS` (all platforms) or a new toolset.
**2. Add import** in `model_tools.py` `_discover_tools()` list.
Auto-discovery: any `tools/*.py` file with a top-level `registry.register()` call is imported automatically — no manual import list to maintain.
**3. Add to `toolsets.py`** — either `_HERMES_CORE_TOOLS` (all platforms) or a new toolset.
The registry handles schema collection, dispatch, availability checking, and error wrapping. All handlers MUST return a JSON string.
@@ -351,9 +351,8 @@ Cache-breaking forces dramatically higher costs. The ONLY time we alter context
### Background Process Notifications (Gateway)
When `terminal(background=true, notify_on_complete=true)` is used, the gateway runs a watcher that
detects process completion and triggers a new agent turn. Control verbosity of background process
messages with `display.background_process_notifications`
When `terminal(background=true, check_interval=...)` is used, the gateway runs a watcher that
pushes status updates to the user's chat. Control verbosity with `display.background_process_notifications`
in config.yaml (or `HERMES_BACKGROUND_NOTIFICATIONS` env var):
- `all` — running-output updates + final message (default)

View File

@@ -1,44 +1,27 @@
FROM ghcr.io/astral-sh/uv:0.11.6-python3.13-trixie@sha256:b3c543b6c4f23a5f2df22866bd7857e5d304b67a564f4feab6ac22044dde719b AS uv_source
FROM tianon/gosu:1.19-trixie@sha256:3b176695959c71e123eb390d427efc665eeb561b1540e82679c15e992006b8b9 AS gosu_source
FROM debian:13.4
# Disable Python stdout buffering to ensure logs are printed immediately
ENV PYTHONUNBUFFERED=1
# Store Playwright browsers outside the volume mount so the build-time
# install survives the /opt/data volume overlay at runtime.
ENV PLAYWRIGHT_BROWSERS_PATH=/opt/hermes/.playwright
# Install system dependencies in one layer, clear APT cache
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential nodejs npm python3 ripgrep ffmpeg gcc python3-dev libffi-dev procps git && \
build-essential nodejs npm python3 python3-pip ripgrep ffmpeg gcc python3-dev libffi-dev procps && \
rm -rf /var/lib/apt/lists/*
# Non-root user for runtime; UID can be overridden via HERMES_UID at runtime
RUN useradd -u 10000 -m -d /opt/data hermes
COPY --chmod=0755 --from=gosu_source /gosu /usr/local/bin/
COPY --chmod=0755 --from=uv_source /usr/local/bin/uv /usr/local/bin/uvx /usr/local/bin/
COPY . /opt/hermes
WORKDIR /opt/hermes
# Install Node dependencies and Playwright as root (--with-deps needs apt)
RUN npm install --prefer-offline --no-audit && \
# Install Python and Node dependencies in one layer, no cache
RUN pip install --no-cache-dir uv --break-system-packages && \
uv pip install --system --break-system-packages --no-cache -e ".[all]" && \
npm install --prefer-offline --no-audit && \
npx playwright install --with-deps chromium --only-shell && \
cd /opt/hermes/scripts/whatsapp-bridge && \
npm install --prefer-offline --no-audit && \
npm cache clean --force
# Hand ownership to hermes user, then install Python deps in a virtualenv
RUN chown -R hermes:hermes /opt/hermes
USER hermes
RUN uv venv && \
uv pip install --no-cache-dir -e ".[all]"
USER root
WORKDIR /opt/hermes
RUN chmod +x /opt/hermes/docker/entrypoint.sh
ENV HERMES_HOME=/opt/data

View File

@@ -13,7 +13,7 @@
**The self-improving AI agent built by [Nous Research](https://nousresearch.com).** It's the only agent with a built-in learning loop — it creates skills from experience, improves them during use, nudges itself to persist knowledge, searches its own past conversations, and builds a deepening model of who you are across sessions. Run it on a $5 VPS, a GPU cluster, or serverless infrastructure that costs nearly nothing when idle. It's not tied to your laptop — talk to it from Telegram while it works on a cloud VM.
Use any model you want — [Nous Portal](https://portal.nousresearch.com), [OpenRouter](https://openrouter.ai) (200+ models), [Xiaomi MiMo](https://platform.xiaomimimo.com), [z.ai/GLM](https://z.ai), [Kimi/Moonshot](https://platform.moonshot.ai), [MiniMax](https://www.minimax.io), [Hugging Face](https://huggingface.co), OpenAI, or your own endpoint. Switch with `hermes model` — no code changes, no lock-in.
Use any model you want — [Nous Portal](https://portal.nousresearch.com), [OpenRouter](https://openrouter.ai) (200+ models), [z.ai/GLM](https://z.ai), [Kimi/Moonshot](https://platform.moonshot.ai), [MiniMax](https://www.minimax.io), OpenAI, or your own endpoint. Switch with `hermes model` — no code changes, no lock-in.
<table>
<tr><td><b>A real terminal interface</b></td><td>Full TUI with multiline editing, slash-command autocomplete, conversation history, interrupt-and-redirect, and streaming tool output.</td></tr>
@@ -167,7 +167,6 @@ python -m pytest tests/ -q
- 📚 [Skills Hub](https://agentskills.io)
- 🐛 [Issues](https://github.com/NousResearch/hermes-agent/issues)
- 💡 [Discussions](https://github.com/NousResearch/hermes-agent/discussions)
- 🔌 [HermesClaw](https://github.com/AaronWong1999/hermesclaw) — Community WeChat bridge: Run Hermes Agent and OpenClaw on the same WeChat account.
---

View File

@@ -1,329 +0,0 @@
# Hermes Agent v0.9.0 (v2026.4.13)
**Release Date:** April 13, 2026
**Since v0.8.0:** 487 commits · 269 merged PRs · 167 resolved issues · 493 files changed · 63,281 insertions · 24 contributors
> The everywhere release — Hermes goes mobile with Termux/Android, adds iMessage and WeChat, ships Fast Mode for OpenAI and Anthropic, introduces background process monitoring, launches a local web dashboard for managing your agent, and delivers the deepest security hardening pass yet across 16 supported platforms.
---
## ✨ Highlights
- **Local Web Dashboard** — A new browser-based dashboard for managing your Hermes Agent locally. Configure settings, monitor sessions, browse skills, and manage your gateway — all from a clean web interface without touching config files or the terminal. The easiest way to get started with Hermes.
- **Fast Mode (`/fast`)** — Priority processing for OpenAI and Anthropic models. Toggle `/fast` to route through priority queues for significantly lower latency on supported models (GPT-5.4, Codex, Claude). Expands across all OpenAI Priority Processing models and Anthropic's fast tier. ([#6875](https://github.com/NousResearch/hermes-agent/pull/6875), [#6960](https://github.com/NousResearch/hermes-agent/pull/6960), [#7037](https://github.com/NousResearch/hermes-agent/pull/7037))
- **iMessage via BlueBubbles** — Full iMessage integration through BlueBubbles, bringing Hermes to Apple's messaging ecosystem. Auto-webhook registration, setup wizard integration, and crash resilience. ([#6437](https://github.com/NousResearch/hermes-agent/pull/6437), [#6460](https://github.com/NousResearch/hermes-agent/pull/6460), [#6494](https://github.com/NousResearch/hermes-agent/pull/6494))
- **WeChat (Weixin) & WeCom Callback Mode** — Native WeChat support via iLink Bot API and a new WeCom callback-mode adapter for self-built enterprise apps. Streaming cursor, media uploads, markdown link handling, and atomic state persistence. Hermes now covers the Chinese messaging ecosystem end-to-end. ([#7166](https://github.com/NousResearch/hermes-agent/pull/7166), [#7943](https://github.com/NousResearch/hermes-agent/pull/7943))
- **Termux / Android Support** — Run Hermes natively on Android via Termux. Adapted install paths, TUI optimizations for mobile screens, voice backend support, and the `/image` command work on-device. ([#6834](https://github.com/NousResearch/hermes-agent/pull/6834))
- **Background Process Monitoring (`watch_patterns`)** — Set patterns to watch for in background process output and get notified in real-time when they match. Monitor for errors, wait for specific events ("listening on port"), or watch build logs — all without polling. ([#7635](https://github.com/NousResearch/hermes-agent/pull/7635))
- **Native xAI & Xiaomi MiMo Providers** — First-class provider support for xAI (Grok) and Xiaomi MiMo, with direct API access, model catalogs, and setup wizard integration. Plus Qwen OAuth with portal request support. ([#7372](https://github.com/NousResearch/hermes-agent/pull/7372), [#7855](https://github.com/NousResearch/hermes-agent/pull/7855))
- **Pluggable Context Engine** — Context management is now a pluggable slot via `hermes plugins`. Swap in custom context engines that control what the agent sees each turn — filtering, summarization, or domain-specific context injection. ([#7464](https://github.com/NousResearch/hermes-agent/pull/7464))
- **Unified Proxy Support** — SOCKS proxy, `DISCORD_PROXY`, and system proxy auto-detection across all gateway platforms. Hermes behind corporate firewalls just works. ([#6814](https://github.com/NousResearch/hermes-agent/pull/6814))
- **Comprehensive Security Hardening** — Path traversal protection in checkpoint manager, shell injection neutralization in sandbox writes, SSRF redirect guards in Slack image uploads, Twilio webhook signature validation (SMS RCE fix), API server auth enforcement, git argument injection prevention, and approval button authorization. ([#7933](https://github.com/NousResearch/hermes-agent/pull/7933), [#7944](https://github.com/NousResearch/hermes-agent/pull/7944), [#7940](https://github.com/NousResearch/hermes-agent/pull/7940), [#7151](https://github.com/NousResearch/hermes-agent/pull/7151), [#7156](https://github.com/NousResearch/hermes-agent/pull/7156))
- **`hermes backup` & `hermes import`** — Full backup and restore of your Hermes configuration, sessions, skills, and memory. Migrate between machines or create snapshots before major changes. ([#7997](https://github.com/NousResearch/hermes-agent/pull/7997))
- **16 Supported Platforms** — With BlueBubbles (iMessage) and WeChat joining Telegram, Discord, Slack, WhatsApp, Signal, Matrix, Email, SMS, DingTalk, Feishu, WeCom, Mattermost, Home Assistant, and Webhooks, Hermes now runs on 16 messaging platforms out of the box.
- **`/debug` & `hermes debug share`** — New debugging toolkit: `/debug` slash command across all platforms for quick diagnostics, plus `hermes debug share` to upload a full debug report to a pastebin for easy sharing when troubleshooting. ([#8681](https://github.com/NousResearch/hermes-agent/pull/8681))
---
## 🏗️ Core Agent & Architecture
### Provider & Model Support
- **Native xAI (Grok) provider** with direct API access and model catalog ([#7372](https://github.com/NousResearch/hermes-agent/pull/7372))
- **Xiaomi MiMo as first-class provider** — setup wizard, model catalog, empty response recovery ([#7855](https://github.com/NousResearch/hermes-agent/pull/7855))
- **Qwen OAuth provider** with portal request support ([#6282](https://github.com/NousResearch/hermes-agent/pull/6282))
- **Fast Mode** — `/fast` toggle for OpenAI Priority Processing + Anthropic fast tier ([#6875](https://github.com/NousResearch/hermes-agent/pull/6875), [#6960](https://github.com/NousResearch/hermes-agent/pull/6960), [#7037](https://github.com/NousResearch/hermes-agent/pull/7037))
- **Structured API error classification** for smart failover decisions ([#6514](https://github.com/NousResearch/hermes-agent/pull/6514))
- **Rate limit header capture** shown in `/usage` ([#6541](https://github.com/NousResearch/hermes-agent/pull/6541))
- **API server model name** derived from profile name ([#6857](https://github.com/NousResearch/hermes-agent/pull/6857))
- **Custom providers** now included in `/model` listings and resolution ([#7088](https://github.com/NousResearch/hermes-agent/pull/7088))
- **Fallback provider activation** on repeated empty responses with user-visible status ([#7505](https://github.com/NousResearch/hermes-agent/pull/7505))
- **OpenRouter variant tags** (`:free`, `:extended`, `:fast`) preserved during model switch ([#6383](https://github.com/NousResearch/hermes-agent/pull/6383))
- **Credential exhaustion TTL** reduced from 24 hours to 1 hour ([#6504](https://github.com/NousResearch/hermes-agent/pull/6504))
- **OAuth credential lifecycle** hardening — stale pool keys, auth.json sync, Codex CLI race fixes ([#6874](https://github.com/NousResearch/hermes-agent/pull/6874))
- Empty response recovery for reasoning models (MiMo, Qwen, GLM) ([#8609](https://github.com/NousResearch/hermes-agent/pull/8609))
- MiniMax context lengths, thinking guard, endpoint corrections ([#6082](https://github.com/NousResearch/hermes-agent/pull/6082), [#7126](https://github.com/NousResearch/hermes-agent/pull/7126))
- Z.AI endpoint auto-detect via probe and cache ([#5763](https://github.com/NousResearch/hermes-agent/pull/5763))
### Agent Loop & Conversation
- **Pluggable context engine slot** via `hermes plugins` ([#7464](https://github.com/NousResearch/hermes-agent/pull/7464))
- **Background process monitoring** — `watch_patterns` for real-time output alerts ([#7635](https://github.com/NousResearch/hermes-agent/pull/7635))
- **Improved context compression** — higher limits, tool tracking, degradation warnings, token-budget tail protection ([#6395](https://github.com/NousResearch/hermes-agent/pull/6395), [#6453](https://github.com/NousResearch/hermes-agent/pull/6453))
- **`/compress <focus>`** — guided compression with a focus topic ([#8017](https://github.com/NousResearch/hermes-agent/pull/8017))
- **Tiered context pressure warnings** with gateway dedup ([#6411](https://github.com/NousResearch/hermes-agent/pull/6411))
- **Staged inactivity warning** before timeout escalation ([#6387](https://github.com/NousResearch/hermes-agent/pull/6387))
- **Prevent agent from stopping mid-task** — compression floor, budget overhaul, activity tracking ([#7983](https://github.com/NousResearch/hermes-agent/pull/7983))
- **Propagate child activity to parent** during `delegate_task` ([#7295](https://github.com/NousResearch/hermes-agent/pull/7295))
- **Truncated streaming tool call detection** before execution ([#6847](https://github.com/NousResearch/hermes-agent/pull/6847))
- Empty response retry (3 attempts with nudge) ([#6488](https://github.com/NousResearch/hermes-agent/pull/6488))
- Adaptive streaming backoff + cursor strip to prevent message truncation ([#7683](https://github.com/NousResearch/hermes-agent/pull/7683))
- Compression uses live session model instead of stale persisted config ([#8258](https://github.com/NousResearch/hermes-agent/pull/8258))
- Strip `<thought>` tags from Gemma 4 responses ([#8562](https://github.com/NousResearch/hermes-agent/pull/8562))
- Prevent `<think>` in prose from suppressing response output ([#6968](https://github.com/NousResearch/hermes-agent/pull/6968))
- Turn-exit diagnostic logging to agent loop ([#6549](https://github.com/NousResearch/hermes-agent/pull/6549))
- Scope tool interrupt signal per-thread to prevent cross-session leaks ([#7930](https://github.com/NousResearch/hermes-agent/pull/7930))
### Memory & Sessions
- **Hindsight memory plugin** — feature parity, setup wizard, config improvements — @nicoloboschi ([#6428](https://github.com/NousResearch/hermes-agent/pull/6428))
- **Honcho** — opt-in `initOnSessionStart` for tools mode — @Kathie-yu ([#6995](https://github.com/NousResearch/hermes-agent/pull/6995))
- Orphan children instead of cascade-deleting in prune/delete ([#6513](https://github.com/NousResearch/hermes-agent/pull/6513))
- Doctor command only checks the active memory provider ([#6285](https://github.com/NousResearch/hermes-agent/pull/6285))
---
## 📱 Messaging Platforms (Gateway)
### New Platforms
- **BlueBubbles (iMessage)** — full adapter with auto-webhook registration, setup wizard, and crash resilience ([#6437](https://github.com/NousResearch/hermes-agent/pull/6437), [#6460](https://github.com/NousResearch/hermes-agent/pull/6460), [#6494](https://github.com/NousResearch/hermes-agent/pull/6494), [#7107](https://github.com/NousResearch/hermes-agent/pull/7107))
- **Weixin (WeChat)** — native support via iLink Bot API with streaming, media uploads, markdown links ([#7166](https://github.com/NousResearch/hermes-agent/pull/7166), [#8665](https://github.com/NousResearch/hermes-agent/pull/8665))
- **WeCom Callback Mode** — self-built enterprise app adapter with atomic state persistence ([#7943](https://github.com/NousResearch/hermes-agent/pull/7943), [#7928](https://github.com/NousResearch/hermes-agent/pull/7928))
### Discord
- **Allowed channels whitelist** config — @jarvis-phw ([#7044](https://github.com/NousResearch/hermes-agent/pull/7044))
- **Forum channel topic inheritance** in thread sessions — @hermes-agent-dhabibi ([#6377](https://github.com/NousResearch/hermes-agent/pull/6377))
- **DISCORD_REPLY_TO_MODE** setting ([#6333](https://github.com/NousResearch/hermes-agent/pull/6333))
- Accept `.log` attachments, raise document size limit — @kira-ariaki ([#6467](https://github.com/NousResearch/hermes-agent/pull/6467))
- Decouple readiness from slash sync ([#8016](https://github.com/NousResearch/hermes-agent/pull/8016))
### Slack
- **Consolidated Slack improvements** — 7 community PRs salvaged into one ([#6809](https://github.com/NousResearch/hermes-agent/pull/6809))
- Handle assistant thread lifecycle events ([#6433](https://github.com/NousResearch/hermes-agent/pull/6433))
### Matrix
- **Migrated from matrix-nio to mautrix-python** ([#7518](https://github.com/NousResearch/hermes-agent/pull/7518))
- SQLite crypto store replacing pickle (fixes E2EE decryption) — @alt-glitch ([#7981](https://github.com/NousResearch/hermes-agent/pull/7981))
- Cross-signing recovery key verification for E2EE migration ([#8282](https://github.com/NousResearch/hermes-agent/pull/8282))
- DM mention threads + group chat events for Feishu ([#7423](https://github.com/NousResearch/hermes-agent/pull/7423))
### Gateway Core
- **Unified proxy support** — SOCKS, DISCORD_PROXY, multi-platform with macOS auto-detection ([#6814](https://github.com/NousResearch/hermes-agent/pull/6814))
- **Inbound text batching** for Discord, Matrix, WeCom + adaptive delay ([#6979](https://github.com/NousResearch/hermes-agent/pull/6979))
- **Surface natural mid-turn assistant messages** in chat platforms ([#7978](https://github.com/NousResearch/hermes-agent/pull/7978))
- **WSL-aware gateway** with smart systemd detection ([#7510](https://github.com/NousResearch/hermes-agent/pull/7510))
- **All missing platforms added to setup wizard** ([#7949](https://github.com/NousResearch/hermes-agent/pull/7949))
- **Per-platform `tool_progress` overrides** ([#6348](https://github.com/NousResearch/hermes-agent/pull/6348))
- **Configurable 'still working' notification interval** ([#8572](https://github.com/NousResearch/hermes-agent/pull/8572))
- `/model` switch persists across messages ([#7081](https://github.com/NousResearch/hermes-agent/pull/7081))
- `/usage` shows rate limits, cost, and token details between turns ([#7038](https://github.com/NousResearch/hermes-agent/pull/7038))
- Drain in-flight work before restart ([#7503](https://github.com/NousResearch/hermes-agent/pull/7503))
- Don't evict cached agent on failed runs — prevents MCP restart loop ([#7539](https://github.com/NousResearch/hermes-agent/pull/7539))
- Replace `os.environ` session state with `contextvars` ([#7454](https://github.com/NousResearch/hermes-agent/pull/7454))
- Derive channel directory platforms from enum instead of hardcoded list ([#7450](https://github.com/NousResearch/hermes-agent/pull/7450))
- Validate image downloads before caching (cross-platform) ([#7125](https://github.com/NousResearch/hermes-agent/pull/7125))
- Cross-platform webhook delivery for all platforms ([#7095](https://github.com/NousResearch/hermes-agent/pull/7095))
- Cron Discord thread_id delivery support ([#7106](https://github.com/NousResearch/hermes-agent/pull/7106))
- Feishu QR-based bot onboarding ([#8570](https://github.com/NousResearch/hermes-agent/pull/8570))
- Gateway status scoped to active profile ([#7951](https://github.com/NousResearch/hermes-agent/pull/7951))
- Prevent background process notifications from triggering false pairing requests ([#6434](https://github.com/NousResearch/hermes-agent/pull/6434))
---
## 🖥️ CLI & User Experience
### Interactive CLI
- **Termux / Android support** — adapted install paths, TUI, voice, `/image` ([#6834](https://github.com/NousResearch/hermes-agent/pull/6834))
- **Native `/model` picker modal** for provider → model selection ([#8003](https://github.com/NousResearch/hermes-agent/pull/8003))
- **Live per-tool elapsed timer** restored in TUI spinner ([#7359](https://github.com/NousResearch/hermes-agent/pull/7359))
- **Stacked tool progress scrollback** in TUI ([#8201](https://github.com/NousResearch/hermes-agent/pull/8201))
- **Random tips on new session start** (CLI + gateway, 279 tips) ([#8225](https://github.com/NousResearch/hermes-agent/pull/8225), [#8237](https://github.com/NousResearch/hermes-agent/pull/8237))
- **`hermes dump`** — copy-pasteable setup summary for debugging ([#6550](https://github.com/NousResearch/hermes-agent/pull/6550))
- **`hermes backup` / `hermes import`** — full config backup and restore ([#7997](https://github.com/NousResearch/hermes-agent/pull/7997))
- **WSL environment hint** in system prompt ([#8285](https://github.com/NousResearch/hermes-agent/pull/8285))
- **Profile creation UX** — seed SOUL.md + credential warning ([#8553](https://github.com/NousResearch/hermes-agent/pull/8553))
- Shell-aware sudo detection, empty password support ([#6517](https://github.com/NousResearch/hermes-agent/pull/6517))
- Flush stdin after curses/terminal menus to prevent escape sequence leakage ([#7167](https://github.com/NousResearch/hermes-agent/pull/7167))
- Handle broken stdin in prompt_toolkit startup ([#8560](https://github.com/NousResearch/hermes-agent/pull/8560))
### Setup & Configuration
- **Per-platform display verbosity** configuration ([#8006](https://github.com/NousResearch/hermes-agent/pull/8006))
- **Component-separated logging** with session context and filtering ([#7991](https://github.com/NousResearch/hermes-agent/pull/7991))
- **`network.force_ipv4`** config to fix IPv6 timeout issues ([#8196](https://github.com/NousResearch/hermes-agent/pull/8196))
- **Standardize message whitespace and JSON formatting** ([#7988](https://github.com/NousResearch/hermes-agent/pull/7988))
- **Rebrand OpenClaw → Hermes** during migration ([#8210](https://github.com/NousResearch/hermes-agent/pull/8210))
- Config.yaml takes priority over env vars for auxiliary settings ([#7889](https://github.com/NousResearch/hermes-agent/pull/7889))
- Harden setup provider flows + live OpenRouter catalog refresh ([#7078](https://github.com/NousResearch/hermes-agent/pull/7078))
- Normalize reasoning effort ordering across all surfaces ([#6804](https://github.com/NousResearch/hermes-agent/pull/6804))
- Remove dead `LLM_MODEL` env var + migration to clear stale entries ([#6543](https://github.com/NousResearch/hermes-agent/pull/6543))
- Remove `/prompt` slash command — prefix expansion footgun ([#6752](https://github.com/NousResearch/hermes-agent/pull/6752))
- `HERMES_HOME_MODE` env var to override permissions — @ygd58 ([#6993](https://github.com/NousResearch/hermes-agent/pull/6993))
- Fall back to default model when model config is empty ([#8303](https://github.com/NousResearch/hermes-agent/pull/8303))
- Warn when compression model context is too small ([#7894](https://github.com/NousResearch/hermes-agent/pull/7894))
---
## 🔧 Tool System
### Environments & Execution
- **Unified spawn-per-call execution layer** for environments ([#6343](https://github.com/NousResearch/hermes-agent/pull/6343))
- **Unified file sync** with mtime tracking, deletion, and transactional state ([#7087](https://github.com/NousResearch/hermes-agent/pull/7087))
- **Persistent sandbox envs** survive between turns ([#6412](https://github.com/NousResearch/hermes-agent/pull/6412))
- **Bulk file sync** via tar pipe for SSH/Modal backends — @alt-glitch ([#8014](https://github.com/NousResearch/hermes-agent/pull/8014))
- **Daytona** — bulk upload, config bridge, silent disk cap ([#7538](https://github.com/NousResearch/hermes-agent/pull/7538))
- Foreground timeout cap to prevent session deadlocks ([#7082](https://github.com/NousResearch/hermes-agent/pull/7082))
- Guard invalid command values ([#6417](https://github.com/NousResearch/hermes-agent/pull/6417))
### MCP
- **`hermes mcp add --env` and `--preset`** support ([#7970](https://github.com/NousResearch/hermes-agent/pull/7970))
- Combine `content` and `structuredContent` when both present ([#7118](https://github.com/NousResearch/hermes-agent/pull/7118))
- MCP tool name deconfliction fixes ([#7654](https://github.com/NousResearch/hermes-agent/pull/7654))
### Browser
- Browser hardening — dead code removal, caching, scroll perf, security, thread safety ([#7354](https://github.com/NousResearch/hermes-agent/pull/7354))
- `/browser connect` auto-launch uses dedicated Chrome profile dir ([#6821](https://github.com/NousResearch/hermes-agent/pull/6821))
- Reap orphaned browser sessions on startup ([#7931](https://github.com/NousResearch/hermes-agent/pull/7931))
### Voice & Vision
- **Voxtral TTS provider** (Mistral AI) ([#7653](https://github.com/NousResearch/hermes-agent/pull/7653))
- **TTS speed support** for Edge TTS, OpenAI TTS, MiniMax ([#8666](https://github.com/NousResearch/hermes-agent/pull/8666))
- **Vision auto-resize** for oversized images, raise limit to 20 MB, retry-on-failure ([#7883](https://github.com/NousResearch/hermes-agent/pull/7883), [#7902](https://github.com/NousResearch/hermes-agent/pull/7902))
- STT provider-model mismatch fix (whisper-1 vs faster-whisper) ([#7113](https://github.com/NousResearch/hermes-agent/pull/7113))
### Other Tools
- **`hermes dump`** command for setup summary ([#6550](https://github.com/NousResearch/hermes-agent/pull/6550))
- TODO store enforces ID uniqueness during replace operations ([#7986](https://github.com/NousResearch/hermes-agent/pull/7986))
- List all available toolsets in `delegate_task` schema description ([#8231](https://github.com/NousResearch/hermes-agent/pull/8231))
- API server: tool progress as custom SSE event to prevent model corruption ([#7500](https://github.com/NousResearch/hermes-agent/pull/7500))
- API server: share one Docker container across all conversations ([#7127](https://github.com/NousResearch/hermes-agent/pull/7127))
---
## 🧩 Skills Ecosystem
- **Centralized skills index + tree cache** — eliminates rate-limit failures on install ([#8575](https://github.com/NousResearch/hermes-agent/pull/8575))
- **More aggressive skill loading instructions** in system prompt (v3) ([#8209](https://github.com/NousResearch/hermes-agent/pull/8209), [#8286](https://github.com/NousResearch/hermes-agent/pull/8286))
- **Google Workspace skill** migrated to GWS CLI backend ([#6788](https://github.com/NousResearch/hermes-agent/pull/6788))
- **Creative divergence strategies** skill — @SHL0MS ([#6882](https://github.com/NousResearch/hermes-agent/pull/6882))
- **Creative ideation** — constraint-driven project generation — @SHL0MS ([#7555](https://github.com/NousResearch/hermes-agent/pull/7555))
- Parallelize skills browse/search to prevent hanging ([#7301](https://github.com/NousResearch/hermes-agent/pull/7301))
- Read name from SKILL.md frontmatter in skills_sync ([#7623](https://github.com/NousResearch/hermes-agent/pull/7623))
---
## 🔒 Security & Reliability
### Security Hardening
- **Twilio webhook signature validation** — SMS RCE fix ([#7933](https://github.com/NousResearch/hermes-agent/pull/7933))
- **Shell injection neutralization** in `_write_to_sandbox` via path quoting ([#7940](https://github.com/NousResearch/hermes-agent/pull/7940))
- **Git argument injection** and path traversal prevention in checkpoint manager ([#7944](https://github.com/NousResearch/hermes-agent/pull/7944))
- **SSRF redirect bypass** in Slack image uploads + base.py cache helpers ([#7151](https://github.com/NousResearch/hermes-agent/pull/7151))
- **Path traversal, credential gate, DANGEROUS_PATTERNS gaps** ([#7156](https://github.com/NousResearch/hermes-agent/pull/7156))
- **API bind guard** — enforce `API_SERVER_KEY` for non-loopback binding ([#7455](https://github.com/NousResearch/hermes-agent/pull/7455))
- **Approval button authorization** — require auth for session continuation — @Cafexss ([#6930](https://github.com/NousResearch/hermes-agent/pull/6930))
- Path boundary enforcement in skill manager operations ([#7156](https://github.com/NousResearch/hermes-agent/pull/7156))
- DingTalk/API webhook URL origin validation, header injection rejection ([#7455](https://github.com/NousResearch/hermes-agent/pull/7455))
### Reliability
- **Contextual error diagnostics** for invalid API responses ([#8565](https://github.com/NousResearch/hermes-agent/pull/8565))
- **Prevent 400 format errors** from triggering compression loop on Codex ([#6751](https://github.com/NousResearch/hermes-agent/pull/6751))
- **Don't halve context_length** on output-cap-too-large errors — @KUSH42 ([#6664](https://github.com/NousResearch/hermes-agent/pull/6664))
- **Recover primary client** on OpenAI transport errors ([#7108](https://github.com/NousResearch/hermes-agent/pull/7108))
- **Credential pool rotation** on billing-classified 400s ([#7112](https://github.com/NousResearch/hermes-agent/pull/7112))
- **Auto-increase stream read timeout** for local LLM providers ([#6967](https://github.com/NousResearch/hermes-agent/pull/6967))
- **Fall back to default certs** when CA bundle path doesn't exist ([#7352](https://github.com/NousResearch/hermes-agent/pull/7352))
- **Disambiguate usage-limit patterns** in error classifier — @sprmn24 ([#6836](https://github.com/NousResearch/hermes-agent/pull/6836))
- Harden cron script timeout and provider recovery ([#7079](https://github.com/NousResearch/hermes-agent/pull/7079))
- Gateway interrupt detection resilient to monitor task failures ([#8208](https://github.com/NousResearch/hermes-agent/pull/8208))
- Prevent unwanted session auto-reset after graceful gateway restarts ([#8299](https://github.com/NousResearch/hermes-agent/pull/8299))
- Prevent duplicate update prompt spam in gateway watcher ([#8343](https://github.com/NousResearch/hermes-agent/pull/8343))
- Deduplicate reasoning items in Responses API input ([#7946](https://github.com/NousResearch/hermes-agent/pull/7946))
### Infrastructure
- **Multi-arch Docker image** — amd64 + arm64 ([#6124](https://github.com/NousResearch/hermes-agent/pull/6124))
- **Docker runs as non-root user** with virtualenv — @benbarclay contributing ([#8226](https://github.com/NousResearch/hermes-agent/pull/8226))
- **Use `uv`** for Docker dependency resolution to fix resolution-too-deep ([#6965](https://github.com/NousResearch/hermes-agent/pull/6965))
- **Container-aware Nix CLI** — auto-route into managed container — @alt-glitch ([#7543](https://github.com/NousResearch/hermes-agent/pull/7543))
- **Nix shared-state permission model** for interactive CLI users — @alt-glitch ([#6796](https://github.com/NousResearch/hermes-agent/pull/6796))
- **Per-profile subprocess HOME isolation** ([#7357](https://github.com/NousResearch/hermes-agent/pull/7357))
- Profile paths fixed in Docker — profiles go to mounted volume ([#7170](https://github.com/NousResearch/hermes-agent/pull/7170))
- Docker container gateway pathway hardened ([#8614](https://github.com/NousResearch/hermes-agent/pull/8614))
- Enable unbuffered stdout for live Docker logs ([#6749](https://github.com/NousResearch/hermes-agent/pull/6749))
- Install procps in Docker image — @HiddenPuppy ([#7032](https://github.com/NousResearch/hermes-agent/pull/7032))
- Shallow git clone for faster installation — @sosyz ([#8396](https://github.com/NousResearch/hermes-agent/pull/8396))
- `hermes update` always reset on stash conflict ([#7010](https://github.com/NousResearch/hermes-agent/pull/7010))
- Write update exit code before gateway restart (cgroup kill race) ([#8288](https://github.com/NousResearch/hermes-agent/pull/8288))
- Nix: `setupSecrets` optional, tirith runtime dep — @devorun, @ethernet8023 ([#6261](https://github.com/NousResearch/hermes-agent/pull/6261), [#6721](https://github.com/NousResearch/hermes-agent/pull/6721))
- launchd stop uses `bootout` so `KeepAlive` doesn't respawn ([#7119](https://github.com/NousResearch/hermes-agent/pull/7119))
---
## 🐛 Notable Bug Fixes
- Fix: `/model` switch not persisting across gateway messages ([#7081](https://github.com/NousResearch/hermes-agent/pull/7081))
- Fix: session-scoped gateway model overrides ignored — @Hygaard ([#7662](https://github.com/NousResearch/hermes-agent/pull/7662))
- Fix: compaction model context length ignoring config — 3 related issues ([#8258](https://github.com/NousResearch/hermes-agent/pull/8258), [#8107](https://github.com/NousResearch/hermes-agent/pull/8107))
- Fix: OpenCode.ai context window resolved to 128K instead of 1M ([#6472](https://github.com/NousResearch/hermes-agent/pull/6472))
- Fix: Codex fallback auth-store lookup — @cherifya ([#6462](https://github.com/NousResearch/hermes-agent/pull/6462))
- Fix: duplicate completion notifications when process killed ([#7124](https://github.com/NousResearch/hermes-agent/pull/7124))
- Fix: agent daemon thread prevents orphan CLI processes on tab close ([#8557](https://github.com/NousResearch/hermes-agent/pull/8557))
- Fix: stale image attachment on text paste and voice input ([#7077](https://github.com/NousResearch/hermes-agent/pull/7077))
- Fix: DM thread session seeding causing cross-thread contamination ([#7084](https://github.com/NousResearch/hermes-agent/pull/7084))
- Fix: OpenClaw migration shows dry-run preview before executing ([#6769](https://github.com/NousResearch/hermes-agent/pull/6769))
- Fix: auth errors misclassified as retryable — @kuishou68 ([#7027](https://github.com/NousResearch/hermes-agent/pull/7027))
- Fix: Copilot-Integration-Id header missing ([#7083](https://github.com/NousResearch/hermes-agent/pull/7083))
- Fix: ACP session capabilities — @luyao618 ([#6985](https://github.com/NousResearch/hermes-agent/pull/6985))
- Fix: ACP PromptResponse usage from top-level fields ([#7086](https://github.com/NousResearch/hermes-agent/pull/7086))
- Fix: several failing/flaky tests on main — @dsocolobsky ([#6777](https://github.com/NousResearch/hermes-agent/pull/6777))
- Fix: backup marker filenames — @sprmn24 ([#8600](https://github.com/NousResearch/hermes-agent/pull/8600))
- Fix: `NoneType` in fast_mode check — @0xbyt4 ([#7350](https://github.com/NousResearch/hermes-agent/pull/7350))
- Fix: missing imports in uninstall.py — @JiayuuWang ([#7034](https://github.com/NousResearch/hermes-agent/pull/7034))
---
## 📚 Documentation
- Platform adapter developer guide + WeCom Callback docs ([#7969](https://github.com/NousResearch/hermes-agent/pull/7969))
- Cron troubleshooting guide ([#7122](https://github.com/NousResearch/hermes-agent/pull/7122))
- Streaming timeout auto-detection for local LLMs ([#6990](https://github.com/NousResearch/hermes-agent/pull/6990))
- Tool-use enforcement documentation expanded ([#7984](https://github.com/NousResearch/hermes-agent/pull/7984))
- BlueBubbles pairing instructions ([#6548](https://github.com/NousResearch/hermes-agent/pull/6548))
- Telegram proxy support section ([#6348](https://github.com/NousResearch/hermes-agent/pull/6348))
- `hermes dump` and `hermes logs` CLI reference ([#6552](https://github.com/NousResearch/hermes-agent/pull/6552))
- `tool_progress_overrides` configuration reference ([#6364](https://github.com/NousResearch/hermes-agent/pull/6364))
- Compression model context length warning docs ([#7879](https://github.com/NousResearch/hermes-agent/pull/7879))
---
## 👥 Contributors
**269 merged PRs** from **24 contributors** across **487 commits**.
### Community Contributors
- **@alt-glitch** (6 PRs) — Nix container-aware CLI, shared-state permissions, Matrix SQLite crypto store, bulk SSH/Modal file sync, Matrix mautrix compat
- **@SHL0MS** (2 PRs) — Creative divergence strategies skill, creative ideation skill
- **@sprmn24** (2 PRs) — Error classifier disambiguation, backup marker fix
- **@nicoloboschi** — Hindsight memory plugin feature parity
- **@Hygaard** — Session-scoped gateway model override fix
- **@jarvis-phw** — Discord allowed_channels whitelist
- **@Kathie-yu** — Honcho initOnSessionStart for tools mode
- **@hermes-agent-dhabibi** — Discord forum channel topic inheritance
- **@kira-ariaki** — Discord .log attachments and size limit
- **@cherifya** — Codex fallback auth-store lookup
- **@Cafexss** — Security: auth for session continuation
- **@KUSH42** — Compaction context_length fix
- **@kuishou68** — Auth error retryable classification fix
- **@luyao618** — ACP session capabilities
- **@ygd58** — HERMES_HOME_MODE env var override
- **@0xbyt4** — Fast mode NoneType fix
- **@JiayuuWang** — CLI uninstall import fix
- **@HiddenPuppy** — Docker procps installation
- **@dsocolobsky** — Test suite fixes
- **@bobashopcashier** (1 PR) — Graceful gateway drain before restart (salvaged into #7503 from #7290)
- **@benbarclay** — Docker image tag simplification
- **@sosyz** — Shallow git clone for faster install
- **@devorun** — Nix setupSecrets optional
- **@ethernet8023** — Nix tirith runtime dep
---
**Full Changelog**: [v2026.4.8...v2026.4.13](https://github.com/NousResearch/hermes-agent/compare/v2026.4.8...v2026.4.13)

View File

@@ -1,172 +0,0 @@
# Vector Database SOTA Research Report
## For AI Agent Semantic Retrieval — April 2026
---
## Executive Summary
Analysis of current vector database benchmarks, documentation, and production deployments for semantic retrieval in AI agents. Compared against existing Hermes session_search (SQLite FTS5) and holographic memory systems.
---
## 1. Retrieval Accuracy (Recall@10)
| Database | HNSW Recall | IVF Recall | Notes |
|----------|-------------|------------|-------|
| **Qdrant** | 0.95-0.99 | N/A | Tunable via ef parameter |
| **Milvus** | 0.95-0.99 | 0.85-0.95 | Multiple index support |
| **Weaviate** | 0.95-0.98 | N/A | HNSW primary |
| **Pinecone** | 0.95-0.99 | N/A | Managed, opaque tuning |
| **ChromaDB** | 0.90-0.95 | N/A | Simpler, uses HNSW via hnswlib |
| **pgvector** | 0.85-0.95 | 0.80-0.90 | Depends on tuning |
| **SQLite-vss** | 0.80-0.90 | N/A | HNSW via sqlite-vss |
| **Current FTS5** | ~0.60-0.75* | N/A | Keyword matching only |
*FTS5 "recall" estimated: good for exact keywords, poor for semantic/paraphrased queries.
---
## 2. Latency Benchmarks (1M vectors, 768-dim, 10 neighbors)
| Database | p50 (ms) | p99 (ms) | QPS | Notes |
|----------|----------|----------|-----|-------|
| **Qdrant** | 1-3 | 5-10 | 5,000-15,000 | Best self-hosted |
| **Milvus** | 2-5 | 8-15 | 3,000-12,000 | Good distributed |
| **Weaviate** | 3-8 | 10-25 | 2,000-8,000 | |
| **Pinecone** | 5-15 | 20-50 | 1,000-5,000 | Managed overhead |
| **ChromaDB** | 5-15 | 20-50 | 500-2,000 | Embedded mode |
| **pgvector** | 10-50 | 50-200 | 200-1,000 | SQL overhead |
| **SQLite-vss** | 10-30 | 50-150 | 300-800 | Limited scalability |
| **Current FTS5** | 2-10 | 15-50 | 1,000-5,000 | No embedding cost |
---
## 3. Index Types Comparison
### HNSW (Hierarchical Navigable Small World)
- Best for: High recall, moderate memory, fast queries
- Used by: Qdrant, Weaviate, ChromaDB, Milvus, pgvector, SQLite-vss
- Memory: High (~1.5GB per 1M 768-dim vectors)
- Key parameters: ef_construction (100-500), M (16-64), ef (64-256)
### IVF (Inverted File Index)
- Best for: Large datasets, memory-constrained
- Used by: Milvus, pgvector
- Memory: Lower (~0.5GB per 1M vectors)
- Key parameters: nlist (100-10000), nprobe (10-100)
### DiskANN / SPANN
- Best for: 100M+ vectors on disk
- Memory: Very low (~100MB index)
### Quantization (SQ/PQ)
- Memory reduction: 4-8x
- Recall impact: -5-15%
---
## 4. Multi-Modal Support
| Database | Text | Image | Audio | Video | Mixed Queries |
|----------|------|-------|-------|-------|---------------|
| Qdrant | ✅ | ✅ | ✅ | ✅ | ✅ (multi-vector) |
| Milvus | ✅ | ✅ | ✅ | ✅ | ✅ (hybrid) |
| Weaviate | ✅ | ✅ | ✅ | ✅ | ✅ (named vectors) |
| Pinecone | ✅ | ✅ | ✅ | ✅ | Limited |
| ChromaDB | ✅ | Via emb | Via emb | Via emb | Limited |
| pgvector | ✅ | Via emb | Via emb | Via emb | Limited |
| SQLite-vss | ✅ | Via emb | Via emb | Via emb | Limited |
---
## 5. Integration Patterns for AI Agents
### Pattern A: Direct Search
Query → Embedding → Vector DB → Top-K → LLM
### Pattern B: Hybrid Search
Query → BM25 + Vector → Merge/Rerank → LLM
### Pattern C: Multi-Stage
Query → Vector DB (top-100) → Reranker (top-10) → LLM
### Pattern D: Agent Memory with Trust + Decay
Query → Vector → Score × Trust × Decay → Top-K → Summarize
---
## 6. Comparison with Current Systems
### session_search (FTS5)
Strengths: Zero deps, no embedding needed, fast for exact keywords
Limitations: No semantic understanding, no cross-lingual, limited ranking
### holographic/retrieval.py (HRR)
Strengths: Compositional queries, contradiction detection, trust + decay
Limitations: Requires numpy, O(n) scan, non-standard embedding space
### Expected Gains from Vector DB:
- Semantic recall: +30-50% for paraphrased queries
- Cross-lingual: +60-80%
- Fuzzy matching: +40-60%
- Conceptual: +50-70%
---
## 7. Recommendations
### Option 1: Qdrant (RECOMMENDED)
- Best self-hosted performance
- Rust implementation, native multi-vector
- Tradeoff: Separate service deployment
### Option 2: pgvector (CONSERVATIVE)
- Zero new infrastructure if using PostgreSQL
- Tradeoff: 5-10x slower than Qdrant
### Option 3: SQLite-vss (LIGHTWEIGHT)
- Minimal changes, embedded deployment
- Tradeoff: Limited scalability (<100K vectors)
### Option 4: Hybrid (BEST OF BOTH)
Keep FTS5 + HRR and add Qdrant:
- Vector (semantic) + FTS5 (keyword) + HRR (compositional)
- Apply trust scoring + temporal decay
---
## 8. Embedding Models (2025-2026)
| Model | Dimensions | Quality | Cost |
|-------|-----------|---------|------|
| OpenAI text-embedding-3-large | 3072 | Best | $$$ |
| OpenAI text-embedding-3-small | 1536 | Good | $ |
| BGE-M3 | 1024 | Best self-hosted | Free |
| GTE-Qwen2 | 768-1024 | Good | Free |
---
## 9. Hardware Requirements (1M vectors, 768-dim)
| Database | RAM (HNSW) | RAM (Quantized) |
|----------|-----------|-----------------|
| Qdrant | 8-16GB | 2-4GB |
| Milvus | 16-32GB | 4-8GB |
| pgvector | 4-8GB | N/A |
| SQLite-vss | 2-4GB | N/A |
---
## 10. Conclusion
Primary: Qdrant with hybrid search (vector + FTS5 + HRR)
Key insight: Augment existing HRR system, don't replace it.
Next steps:
1. Deploy Qdrant in Docker for testing
2. Benchmark embedding models
3. Implement hybrid search prototype
4. Measure recall improvement
5. Evaluate operational complexity
Report: April 2026 | Sources: ANN-Benchmarks, VectorDBBench, official docs

View File

@@ -1,443 +0,0 @@
"""
A2A mutual-TLS server — secure agent-to-agent communication.
Each fleet agent runs an A2A server that:
- Presents its own TLS certificate (signed by the fleet CA).
- Requires the connecting peer to present a valid client certificate
also signed by the fleet CA.
- Rejects connections from unknown / self-signed peers.
Usage (standalone):
python -m agent.a2a_mtls \\
--cert ~/.hermes/pki/agents/timmy/timmy.crt \\
--key ~/.hermes/pki/agents/timmy/timmy.key \\
--ca ~/.hermes/pki/ca/fleet-ca.crt \\
--host 0.0.0.0 --port 9443
Environment variables (alternative to CLI flags):
HERMES_A2A_CERT path to agent certificate
HERMES_A2A_KEY path to agent private key
HERMES_A2A_CA path to fleet CA certificate
Refs #806
"""
from __future__ import annotations
import json
import logging
import os
import ssl
import threading
from http.server import BaseHTTPRequestHandler, HTTPServer
from pathlib import Path
from typing import Any, Callable, Dict, Optional
from urllib.error import URLError
from urllib.request import Request, urlopen
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# mTLS SSL context helpers
# ---------------------------------------------------------------------------
def build_server_ssl_context(
cert: str | Path,
key: str | Path,
ca: str | Path,
) -> ssl.SSLContext:
"""Return an SSLContext that presents *cert/key* and requires a valid
client certificate signed by *ca*.
Raises ``FileNotFoundError`` if any path is missing.
Raises ``ssl.SSLError`` if the files are malformed.
"""
cert, key, ca = Path(cert), Path(key), Path(ca)
for p in (cert, key, ca):
if not p.exists():
raise FileNotFoundError(f"mTLS: file not found: {p}")
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
ctx.minimum_version = ssl.TLSVersion.TLSv1_2
ctx.load_cert_chain(certfile=str(cert), keyfile=str(key))
ctx.load_verify_locations(cafile=str(ca))
# CERT_REQUIRED — reject peers that don't present a cert signed by *ca*.
ctx.verify_mode = ssl.CERT_REQUIRED
return ctx
def build_client_ssl_context(
cert: str | Path,
key: str | Path,
ca: str | Path,
) -> ssl.SSLContext:
"""Return an SSLContext for an outgoing mTLS connection.
Presents *cert/key* as the client identity and verifies the server
certificate against *ca*.
"""
cert, key, ca = Path(cert), Path(key), Path(ca)
for p in (cert, key, ca):
if not p.exists():
raise FileNotFoundError(f"mTLS client: file not found: {p}")
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
ctx.minimum_version = ssl.TLSVersion.TLSv1_2
ctx.load_cert_chain(certfile=str(cert), keyfile=str(key))
ctx.load_verify_locations(cafile=str(ca))
ctx.verify_mode = ssl.CERT_REQUIRED
ctx.check_hostname = True
return ctx
# ---------------------------------------------------------------------------
# Minimal A2A HTTP request handler
# ---------------------------------------------------------------------------
class A2AHandler(BaseHTTPRequestHandler):
"""Handles A2A requests over a mutually-authenticated TLS connection.
GET /.well-known/agent-card.json — returns the local agent card.
POST /a2a/task — dispatches an A2A task (stub).
"""
log_message = logger.debug # route access log to Python logger
def do_GET(self) -> None: # noqa: N802
if self.path in ("/.well-known/agent-card.json", "/agent-card.json"):
self._serve_agent_card()
else:
self._send_json(404, {"error": "not found"})
def do_POST(self) -> None: # noqa: N802
if self.path == "/a2a/task":
self._handle_task()
else:
self._send_json(404, {"error": "not found"})
# ------------------------------------------------------------------
def _serve_agent_card(self) -> None:
try:
from agent.agent_card import get_agent_card_json
body = get_agent_card_json().encode()
except Exception as exc:
logger.warning("agent-card unavailable: %s", exc)
body = b'{"error": "agent card unavailable"}'
self._send_raw(200, "application/json", body)
def _handle_task(self) -> None:
length = int(self.headers.get("Content-Length", 0))
_body = self.rfile.read(length) if length else b""
# Stub: echo back a 202 Accepted with the peer CN so callers can
# confirm which agent processed the request.
peer_cn = _peer_cn(self.connection)
self._send_json(202, {"status": "accepted", "handled_by": peer_cn})
# ------------------------------------------------------------------
def _send_json(self, code: int, data: dict) -> None:
import json
body = json.dumps(data).encode()
self._send_raw(code, "application/json", body)
def _send_raw(self, code: int, content_type: str, body: bytes) -> None:
self.send_response(code)
self.send_header("Content-Type", content_type)
self.send_header("Content-Length", str(len(body)))
self.end_headers()
self.wfile.write(body)
def log_message(self, fmt: str, *args: object) -> None: # type: ignore[override]
logger.debug("a2a: " + fmt, *args)
def _peer_cn(conn: ssl.SSLSocket) -> Optional[str]:
"""Extract the Common Name from the peer certificate, or None."""
try:
peer = conn.getpeercert()
if not peer:
return None
for rdn in peer.get("subject", ()):
for key, val in rdn:
if key == "commonName":
return val
except Exception:
pass
return None
# ---------------------------------------------------------------------------
# Server lifecycle
# ---------------------------------------------------------------------------
class A2AServer:
"""Mutual-TLS A2A server.
Example::
server = A2AServer(
cert="~/.hermes/pki/agents/timmy/timmy.crt",
key="~/.hermes/pki/agents/timmy/timmy.key",
ca="~/.hermes/pki/ca/fleet-ca.crt",
)
server.start() # non-blocking (daemon thread)
...
server.stop()
"""
def __init__(
self,
cert: str | Path,
key: str | Path,
ca: str | Path,
host: str = "0.0.0.0",
port: int = 9443,
) -> None:
self.cert = Path(cert).expanduser()
self.key = Path(key).expanduser()
self.ca = Path(ca).expanduser()
self.host = host
self.port = port
self._httpd: Optional[HTTPServer] = None
self._thread: Optional[threading.Thread] = None
def start(self, daemon: bool = True) -> None:
"""Start the server in a background thread (default: daemon)."""
ssl_ctx = build_server_ssl_context(self.cert, self.key, self.ca)
self._httpd = HTTPServer((self.host, self.port), A2AHandler)
self._httpd.socket = ssl_ctx.wrap_socket(
self._httpd.socket, server_side=True
)
self._thread = threading.Thread(
target=self._httpd.serve_forever, daemon=daemon
)
self._thread.start()
logger.info(
"A2A mTLS server listening on %s:%s (cert=%s)",
self.host, self.port, self.cert.name,
)
def stop(self) -> None:
if self._httpd:
self._httpd.shutdown()
self._httpd = None
if self._thread:
self._thread.join(timeout=5)
self._thread = None
def server_from_env() -> A2AServer:
"""Build an A2AServer from environment variables / defaults."""
hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
agent_name = os.environ.get("HERMES_AGENT_NAME", "hermes").lower()
default_cert = hermes_home / "pki" / "agents" / agent_name / f"{agent_name}.crt"
default_key = hermes_home / "pki" / "agents" / agent_name / f"{agent_name}.key"
default_ca = hermes_home / "pki" / "ca" / "fleet-ca.crt"
cert = os.environ.get("HERMES_A2A_CERT", str(default_cert))
key = os.environ.get("HERMES_A2A_KEY", str(default_key))
ca = os.environ.get("HERMES_A2A_CA", str(default_ca))
host = os.environ.get("HERMES_A2A_HOST", "0.0.0.0")
port = int(os.environ.get("HERMES_A2A_PORT", "9443"))
return A2AServer(cert=cert, key=key, ca=ca, host=host, port=port)
# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------
def _main() -> None:
import argparse
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
parser = argparse.ArgumentParser(
description="Hermes A2A mutual-TLS server"
)
parser.add_argument("--cert", required=True, help="Path to agent certificate")
parser.add_argument("--key", required=True, help="Path to agent private key")
parser.add_argument("--ca", required=True, help="Path to fleet CA certificate")
parser.add_argument("--host", default="0.0.0.0")
parser.add_argument("--port", type=int, default=9443)
args = parser.parse_args()
server = A2AServer(
cert=args.cert, key=args.key, ca=args.ca,
host=args.host, port=args.port,
)
server.start(daemon=False)
if __name__ == "__main__":
_main()
# ---------------------------------------------------------------------------
# A2AMTLSServer — routing-based server with context-manager support
# ---------------------------------------------------------------------------
class _RoutingHandler(BaseHTTPRequestHandler):
"""HTTP request handler that dispatches to per-path callables."""
routes: Dict[str, Callable] = {}
def log_message(self, fmt: str, *args: Any) -> None:
logger.debug("A2AMTLSServer: " + fmt, *args)
def _peer_cn(self) -> Optional[str]:
cert = self.connection.getpeercert() # type: ignore[attr-defined]
if not cert:
return None
for rdn in cert.get("subject", ()):
for attr, value in rdn:
if attr == "commonName":
return value
return None
def do_POST(self) -> None:
handler = self.routes.get(self.path)
if handler is None:
self.send_response(404)
self.end_headers()
return
length = int(self.headers.get("Content-Length", 0))
body = self.rfile.read(length) if length else b""
try:
payload = json.loads(body) if body else {}
except json.JSONDecodeError:
self.send_response(400)
self.end_headers()
return
result = handler(payload, peer_cn=self._peer_cn())
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(json.dumps(result).encode())
def do_GET(self) -> None:
handler = self.routes.get(self.path)
if handler is None:
self.send_response(404)
self.end_headers()
return
result = handler({}, peer_cn=self._peer_cn())
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.end_headers()
self.wfile.write(json.dumps(result).encode())
class A2AMTLSServer:
"""Routing-based mTLS HTTPS server with context-manager support.
Unlike ``A2AServer`` (which serves fixed A2A paths), this server lets
callers register arbitrary path handlers — useful for tests and custom
A2A endpoint implementations.
handler signature: ``handler(payload: dict, *, peer_cn: str | None) -> dict``
Example::
server = A2AMTLSServer(cert="timmy.crt", key="timmy.key", ca="fleet-ca.crt")
server.add_route("/tasks/send", my_handler)
with server:
... # server runs for the duration of the block
"""
def __init__(
self,
cert: str | Path,
key: str | Path,
ca: str | Path,
host: str = "127.0.0.1",
port: int = 9443,
) -> None:
self.cert = Path(cert).expanduser()
self.key = Path(key).expanduser()
self.ca = Path(ca).expanduser()
self.host = host
self.port = port
self._routes: Dict[str, Callable] = {}
self._httpd: Optional[HTTPServer] = None
self._thread: Optional[threading.Thread] = None
def add_route(self, path: str, handler: Callable) -> None:
self._routes[path] = handler
def start(self) -> None:
ssl_ctx = build_server_ssl_context(self.cert, self.key, self.ca)
class _Handler(_RoutingHandler):
routes = self._routes
self._httpd = HTTPServer((self.host, self.port), _Handler)
self._httpd.socket = ssl_ctx.wrap_socket(self._httpd.socket, server_side=True)
self._thread = threading.Thread(
target=self._httpd.serve_forever,
daemon=True,
name=f"a2a-mtls-{self.port}",
)
self._thread.start()
logger.info("A2AMTLSServer on %s:%d (mTLS)", self.host, self.port)
def stop(self) -> None:
if self._httpd:
self._httpd.shutdown()
self._httpd = None
if self._thread:
self._thread.join(timeout=5)
self._thread = None
def __enter__(self) -> "A2AMTLSServer":
self.start()
return self
def __exit__(self, *_: Any) -> None:
self.stop()
# ---------------------------------------------------------------------------
# A2AMTLSClient — mTLS HTTP client
# ---------------------------------------------------------------------------
class A2AMTLSClient:
"""HTTP client that presents a fleet cert on every outgoing connection.
Example::
client = A2AMTLSClient(cert="allegro.crt", key="allegro.key", ca="fleet-ca.crt")
result = client.post("https://timmy:9443/tasks/send", json={"task": "..."})
"""
def __init__(
self,
cert: str | Path,
key: str | Path,
ca: str | Path,
) -> None:
self._ssl_ctx = build_client_ssl_context(cert, key, ca)
self._ssl_ctx.check_hostname = False # callers connecting by IP
def _request(
self,
method: str,
url: str,
data: Optional[bytes] = None,
timeout: float = 10.0,
) -> Dict[str, Any]:
headers = {"Content-Type": "application/json"}
req = Request(url, data=data, headers=headers, method=method)
try:
with urlopen(req, context=self._ssl_ctx, timeout=timeout) as resp:
body = resp.read()
return json.loads(body) if body else {}
except URLError as exc:
raise ConnectionError(f"A2AMTLSClient {method} {url} failed: {exc.reason}") from exc
def get(self, url: str, **kwargs: Any) -> Dict[str, Any]:
return self._request("GET", url, **kwargs)
def post(self, url: str, json: Optional[Dict[str, Any]] = None, **kwargs: Any) -> Dict[str, Any]:
data = (__import__("json").dumps(json).encode() if json is not None else None)
return self._request("POST", url, data=data, **kwargs)

View File

@@ -1,135 +0,0 @@
"""
Agent Card — A2A-compliant agent discovery.
Part of #843: fix: implement A2A agent card for fleet discovery (#819)
Provides metadata about the agent's identity, capabilities, and installed skills
for discovery by other agents in the fleet.
"""
import json
import logging
import os
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
from hermes_cli import __version__
from hermes_cli.config import load_config, get_hermes_home
from agent.skill_utils import (
iter_skill_index_files,
parse_frontmatter,
get_all_skills_dirs,
get_disabled_skill_names,
skill_matches_platform
)
logger = logging.getLogger(__name__)
@dataclass
class AgentSkill:
id: str
name: str
description: str = ""
version: str = "1.0.0"
@dataclass
class AgentCapabilities:
streaming: bool = True
tools: bool = True
vision: bool = False
reasoning: bool = False
@dataclass
class AgentCard:
name: str
description: str
url: str
version: str = __version__
capabilities: AgentCapabilities = field(default_factory=AgentCapabilities)
skills: List[AgentSkill] = field(default_factory=list)
defaultInputModes: List[str] = field(default_factory=lambda: ["text/plain"])
defaultOutputModes: List[str] = field(default_factory=lambda: ["text/plain"])
def _load_skills() -> List[AgentSkill]:
"""Scan all enabled skills and return metadata."""
skills = []
disabled = get_disabled_skill_names()
for skills_dir in get_all_skills_dirs():
if not skills_dir.is_dir():
continue
for skill_file in iter_skill_index_files(skills_dir, "SKILL.md"):
try:
raw = skill_file.read_text(encoding="utf-8")
frontmatter, _ = parse_frontmatter(raw)
except Exception:
continue
skill_name = frontmatter.get("name") or skill_file.parent.name
if str(skill_name) in disabled:
continue
if not skill_matches_platform(frontmatter):
continue
skills.append(AgentSkill(
id=str(skill_name),
name=str(frontmatter.get("name", skill_name)),
description=str(frontmatter.get("description", "")),
version=str(frontmatter.get("version", "1.0.0"))
))
return skills
def build_agent_card() -> AgentCard:
"""Build the agent card from current configuration and environment."""
config = load_config()
# Identity
name = os.environ.get("HERMES_AGENT_NAME") or config.get("agent", {}).get("name") or "hermes"
description = os.environ.get("HERMES_AGENT_DESCRIPTION") or config.get("agent", {}).get("description") or "Sovereign AI agent"
# URL - try to determine from environment or config
port = os.environ.get("HERMES_WEB_PORT") or "9119"
host = os.environ.get("HERMES_WEB_HOST") or "localhost"
url = f"http://{host}:{port}"
# Capabilities
# In a real scenario, we'd check model metadata for vision/reasoning
capabilities = AgentCapabilities(
streaming=True,
tools=True,
vision=False, # Default to false unless we can confirm
reasoning=False
)
# Skills
skills = _load_skills()
return AgentCard(
name=name,
description=description,
url=url,
version=__version__,
capabilities=capabilities,
skills=skills
)
def get_agent_card_json() -> str:
"""Return the agent card as a JSON string."""
try:
card = build_agent_card()
return json.dumps(asdict(card), indent=2)
except Exception as e:
logger.error(f"Failed to build agent card: {e}")
# Minimal fallback card
fallback = {
"name": "hermes",
"description": "Sovereign AI agent (fallback)",
"version": __version__,
"error": str(e)
}
return json.dumps(fallback, indent=2)
def validate_agent_card(card_data: Dict[str, Any]) -> bool:
"""Check if the card data complies with the A2A schema."""
required = ["name", "description", "url", "version"]
return all(k in card_data for k in required)

View File

@@ -1230,10 +1230,9 @@ def build_anthropic_kwargs(
When *base_url* points to a third-party Anthropic-compatible endpoint,
thinking block signatures are stripped (they are Anthropic-proprietary).
When *fast_mode* is True, adds ``extra_body["speed"] = "fast"`` and the
fast-mode beta header for ~2.5x faster output throughput on Opus 4.6.
Currently only supported on native Anthropic endpoints (not third-party
compatible ones).
When *fast_mode* is True, adds ``speed: "fast"`` and the fast-mode beta
header for ~2.5x faster output throughput on Opus 4.6. Currently only
supported on native Anthropic endpoints (not third-party compatible ones).
"""
system, anthropic_messages = convert_messages_to_anthropic(messages, base_url=base_url)
anthropic_tools = convert_tools_to_anthropic(tools) if tools else []
@@ -1334,11 +1333,11 @@ def build_anthropic_kwargs(
kwargs["max_tokens"] = max(effective_max_tokens, budget + 4096)
# ── Fast mode (Opus 4.6 only) ────────────────────────────────────
# Adds extra_body.speed="fast" + the fast-mode beta header for ~2.5x
# output speed. Only for native Anthropic endpoints — third-party
# providers would reject the unknown beta header and speed parameter.
# Adds speed:"fast" + the fast-mode beta header for ~2.5x output speed.
# Only for native Anthropic endpoints — third-party providers would
# reject the unknown beta header and speed parameter.
if fast_mode and not _is_third_party_anthropic_endpoint(base_url):
kwargs.setdefault("extra_body", {})["speed"] = "fast"
kwargs["speed"] = "fast"
# Build extra_headers with ALL applicable betas (the per-request
# extra_headers override the client-level anthropic-beta header).
betas = list(_common_betas_for_base_url(base_url))

View File

@@ -1,4 +1,4 @@
from agent.telemetry_logger import log_token_usage\n"""Shared auxiliary client router for side tasks.
"""Shared auxiliary client router for side tasks.
Provides a single resolution chain so every consumer (context compression,
session search, web extraction, vision analysis, browser vision) picks up
@@ -23,10 +23,18 @@ Resolution order for vision/multimodal tasks (auto mode):
6. Custom endpoint (for local vision models: Qwen-VL, LLaVA, Pixtral, etc.)
7. None
Per-task overrides are configured in config.yaml under the ``auxiliary:`` section
(e.g. ``auxiliary.vision.provider``, ``auxiliary.compression.model``).
Per-task provider overrides (e.g. AUXILIARY_VISION_PROVIDER,
CONTEXT_COMPRESSION_PROVIDER) can force a specific provider for each task.
Default "auto" follows the chains above.
Per-task model overrides (e.g. AUXILIARY_VISION_MODEL,
AUXILIARY_WEB_EXTRACT_MODEL) let callers use a different model slug
than the provider's default.
Per-task direct endpoint overrides (e.g. AUXILIARY_VISION_BASE_URL,
AUXILIARY_VISION_API_KEY) let callers route a specific auxiliary task to a
custom OpenAI-compatible endpoint without touching the main model settings.
Payment / credit exhaustion fallback:
When a resolved provider returns HTTP 402 or a credit-related error,
call_llm() automatically retries with the next available provider in the
@@ -64,8 +72,6 @@ _PROVIDER_ALIASES = {
"zhipu": "zai",
"kimi": "kimi-coding",
"moonshot": "kimi-coding",
"kimi-cn": "kimi-coding-cn",
"moonshot-cn": "kimi-coding-cn",
"minimax-china": "minimax-cn",
"minimax_cn": "minimax-cn",
"claude": "anthropic",
@@ -73,13 +79,13 @@ _PROVIDER_ALIASES = {
}
def _normalize_aux_provider(provider: Optional[str]) -> str:
def _normalize_aux_provider(provider: Optional[str], *, for_vision: bool = False) -> str:
normalized = (provider or "auto").strip().lower()
if normalized.startswith("custom:"):
suffix = normalized.split(":", 1)[1].strip()
if not suffix:
return "custom"
normalized = suffix
normalized = suffix if not for_vision else "custom"
if normalized == "codex":
return "openai-codex"
if normalized == "main":
@@ -96,7 +102,6 @@ _API_KEY_PROVIDER_AUX_MODELS: Dict[str, str] = {
"gemini": "gemini-3-flash-preview",
"zai": "glm-4.5-flash",
"kimi-coding": "kimi-k2-turbo-preview",
"kimi-coding-cn": "kimi-k2-turbo-preview",
"minimax": "MiniMax-M2.7",
"minimax-cn": "MiniMax-M2.7",
"anthropic": "claude-haiku-4-5-20251001",
@@ -106,15 +111,6 @@ _API_KEY_PROVIDER_AUX_MODELS: Dict[str, str] = {
"kilocode": "google/gemini-3-flash-preview",
}
# Vision-specific model overrides for direct providers.
# When the user's main provider has a dedicated vision/multimodal model that
# differs from their main chat model, map it here. The vision auto-detect
# "exotic provider" branch checks this before falling back to the main model.
_PROVIDER_VISION_MODELS: Dict[str, str] = {
"xiaomi": "mimo-v2-omni",
"zai": "glm-5v-turbo",
}
# OpenRouter app attribution headers
_OR_HEADERS = {
"HTTP-Referer": "https://hermes-agent.nousresearch.com",
@@ -396,7 +392,7 @@ class _CodexCompletionsAdapter:
prompt_tokens=getattr(resp_usage, "input_tokens", 0),
completion_tokens=getattr(resp_usage, "output_tokens", 0),
total_tokens=getattr(resp_usage, "total_tokens", 0),
)\n log_token_usage(usage.prompt_tokens, usage.completion_tokens, model)
)
except Exception as exc:
logger.debug("Codex auxiliary Responses API call failed: %s", exc)
raise
@@ -529,7 +525,7 @@ class _AnthropicCompletionsAdapter:
prompt_tokens=prompt_tokens,
completion_tokens=completion_tokens,
total_tokens=total_tokens,
)\n log_token_usage(usage.prompt_tokens, usage.completion_tokens, model)
)
choice = SimpleNamespace(
index=0,
@@ -753,6 +749,30 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
# ── Provider resolution helpers ─────────────────────────────────────────────
def _get_auxiliary_provider(task: str = "") -> str:
"""Read the provider override for a specific auxiliary task.
Checks AUXILIARY_{TASK}_PROVIDER first (e.g. AUXILIARY_VISION_PROVIDER),
then CONTEXT_{TASK}_PROVIDER (for the compression section's summary_provider),
then falls back to "auto". Returns one of: "auto", "openrouter", "nous", "main".
"""
if task:
for prefix in ("AUXILIARY_", "CONTEXT_"):
val = os.getenv(f"{prefix}{task.upper()}_PROVIDER", "").strip().lower()
if val and val != "auto":
return val
return "auto"
def _get_auxiliary_env_override(task: str, suffix: str) -> Optional[str]:
"""Read an auxiliary env override from AUXILIARY_* or CONTEXT_* prefixes."""
if not task:
return None
for prefix in ("AUXILIARY_", "CONTEXT_"):
val = os.getenv(f"{prefix}{task.upper()}_{suffix}", "").strip()
if val:
return val
return None
def _try_openrouter() -> Tuple[Optional[OpenAI], Optional[str]]:
@@ -997,23 +1017,6 @@ _AUTO_PROVIDER_LABELS = {
_AGGREGATOR_PROVIDERS = frozenset({"openrouter", "nous"})
_MAIN_RUNTIME_FIELDS = ("provider", "model", "base_url", "api_key", "api_mode")
def _normalize_main_runtime(main_runtime: Optional[Dict[str, Any]]) -> Dict[str, str]:
"""Return a sanitized copy of a live main-runtime override."""
if not isinstance(main_runtime, dict):
return {}
normalized: Dict[str, str] = {}
for field in _MAIN_RUNTIME_FIELDS:
value = main_runtime.get(field)
if isinstance(value, str) and value.strip():
normalized[field] = value.strip()
provider = normalized.get("provider")
if provider:
normalized["provider"] = provider.lower()
return normalized
def _get_provider_chain() -> List[tuple]:
"""Return the ordered provider detection chain.
@@ -1123,7 +1126,7 @@ def _try_payment_fallback(
return None, None, ""
def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Optional[OpenAI], Optional[str]]:
def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
"""Full auto-detection chain.
Priority:
@@ -1135,12 +1138,6 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option
"""
global auxiliary_is_nous, _stale_base_url_warned
auxiliary_is_nous = False # Reset — _try_nous() will set True if it wins
runtime = _normalize_main_runtime(main_runtime)
runtime_provider = runtime.get("provider", "")
runtime_model = runtime.get("model", "")
runtime_base_url = runtime.get("base_url", "")
runtime_api_key = runtime.get("api_key", "")
runtime_api_mode = runtime.get("api_mode", "")
# ── Warn once if OPENAI_BASE_URL is set but config.yaml uses a named
# provider (not 'custom'). This catches the common "env poisoning"
@@ -1148,7 +1145,7 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option
# old OPENAI_BASE_URL lingers in ~/.hermes/.env. ──
if not _stale_base_url_warned:
_env_base = os.getenv("OPENAI_BASE_URL", "").strip()
_cfg_provider = runtime_provider or _read_main_provider()
_cfg_provider = _read_main_provider()
if (_env_base and _cfg_provider
and _cfg_provider != "custom"
and not _cfg_provider.startswith("custom:")):
@@ -1162,25 +1159,12 @@ def _resolve_auto(main_runtime: Optional[Dict[str, Any]] = None) -> Tuple[Option
_stale_base_url_warned = True
# ── Step 1: non-aggregator main provider → use main model directly ──
main_provider = runtime_provider or _read_main_provider()
main_model = runtime_model or _read_main_model()
main_provider = _read_main_provider()
main_model = _read_main_model()
if (main_provider and main_model
and main_provider not in _AGGREGATOR_PROVIDERS
and main_provider not in ("auto", "")):
resolved_provider = main_provider
explicit_base_url = None
explicit_api_key = None
if runtime_base_url and (main_provider == "custom" or main_provider.startswith("custom:")):
resolved_provider = "custom"
explicit_base_url = runtime_base_url
explicit_api_key = runtime_api_key or None
client, resolved = resolve_provider_client(
resolved_provider,
main_model,
explicit_base_url=explicit_base_url,
explicit_api_key=explicit_api_key,
api_mode=runtime_api_mode or None,
)
client, resolved = resolve_provider_client(main_provider, main_model)
if client is not None:
logger.info("Auxiliary auto-detect: using main provider %s (%s)",
main_provider, resolved or main_model)
@@ -1224,12 +1208,6 @@ def _to_async_client(sync_client, model: str):
return AsyncCodexAuxiliaryClient(sync_client), model
if isinstance(sync_client, AnthropicAuxiliaryClient):
return AsyncAnthropicAuxiliaryClient(sync_client), model
try:
from agent.copilot_acp_client import CopilotACPClient
if isinstance(sync_client, CopilotACPClient):
return sync_client, model
except ImportError:
pass
async_kwargs = {
"api_key": sync_client.api_key,
@@ -1267,7 +1245,6 @@ def resolve_provider_client(
explicit_base_url: str = None,
explicit_api_key: str = None,
api_mode: str = None,
main_runtime: Optional[Dict[str, Any]] = None,
) -> Tuple[Optional[Any], Optional[str]]:
"""Central router: given a provider name and optional model, return a
configured client with the correct auth, base URL, and API format.
@@ -1338,7 +1315,7 @@ def resolve_provider_client(
# ── Auto: try all providers in priority order ────────────────────
if provider == "auto":
client, resolved = _resolve_auto(main_runtime=main_runtime)
client, resolved = _resolve_auto()
if client is None:
return None, None
# When auto-detection lands on a non-OpenRouter provider (e.g. a
@@ -1448,14 +1425,10 @@ def resolve_provider_client(
custom_entry = _get_named_custom_provider(provider)
if custom_entry:
custom_base = custom_entry.get("base_url", "").strip()
custom_key = custom_entry.get("api_key", "").strip()
custom_key_env = custom_entry.get("key_env", "").strip()
if not custom_key and custom_key_env:
custom_key = os.getenv(custom_key_env, "").strip()
custom_key = custom_key or "no-key-required"
custom_key = custom_entry.get("api_key", "").strip() or "no-key-required"
if custom_base:
final_model = _normalize_resolved_model(
model or custom_entry.get("model") or _read_main_model() or "gpt-4o-mini",
model or _read_main_model() or "gpt-4o-mini",
provider,
)
client = OpenAI(api_key=custom_key, base_url=custom_base)
@@ -1474,11 +1447,7 @@ def resolve_provider_client(
# ── API-key providers from PROVIDER_REGISTRY ─────────────────────
try:
from hermes_cli.auth import (
PROVIDER_REGISTRY,
resolve_api_key_provider_credentials,
resolve_external_process_provider_credentials,
)
from hermes_cli.auth import PROVIDER_REGISTRY, resolve_api_key_provider_credentials
except ImportError:
logger.debug("hermes_cli.auth not available for provider %s", provider)
return None, None
@@ -1552,41 +1521,6 @@ def resolve_provider_client(
return (_to_async_client(client, final_model) if async_mode
else (client, final_model))
if pconfig.auth_type == "external_process":
creds = resolve_external_process_provider_credentials(provider)
final_model = _normalize_resolved_model(model or _read_main_model(), provider)
if provider == "copilot-acp":
api_key = str(creds.get("api_key", "")).strip()
base_url = str(creds.get("base_url", "")).strip()
command = str(creds.get("command", "")).strip() or None
args = list(creds.get("args") or [])
if not final_model:
logger.warning(
"resolve_provider_client: copilot-acp requested but no model "
"was provided or configured"
)
return None, None
if not api_key or not base_url:
logger.warning(
"resolve_provider_client: copilot-acp requested but external "
"process credentials are incomplete"
)
return None, None
from agent.copilot_acp_client import CopilotACPClient
client = CopilotACPClient(
api_key=api_key,
base_url=base_url,
command=command,
args=args,
)
logger.debug("resolve_provider_client: %s (%s)", provider, final_model)
return (_to_async_client(client, final_model) if async_mode
else (client, final_model))
logger.warning("resolve_provider_client: external-process provider %s not "
"directly supported", provider)
return None, None
elif pconfig.auth_type in ("oauth_device_code", "oauth_external"):
# OAuth providers — route through their specific try functions
if provider == "nous":
@@ -1605,19 +1539,15 @@ def resolve_provider_client(
# ── Public API ──────────────────────────────────────────────────────────────
def get_text_auxiliary_client(
task: str = "",
*,
main_runtime: Optional[Dict[str, Any]] = None,
) -> Tuple[Optional[OpenAI], Optional[str]]:
def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optional[str]]:
"""Return (client, default_model_slug) for text-only auxiliary tasks.
Args:
task: Optional task name ("compression", "web_extract") to check
for a task-specific provider override.
Callers may override the returned model via config.yaml
(e.g. auxiliary.compression.model, auxiliary.web_extract.model).
Callers may override the returned model with a per-task env var
(e.g. CONTEXT_COMPRESSION_MODEL, AUXILIARY_WEB_EXTRACT_MODEL).
"""
provider, model, base_url, api_key, api_mode = _resolve_task_provider_model(task or None)
return resolve_provider_client(
@@ -1626,11 +1556,10 @@ def get_text_auxiliary_client(
explicit_base_url=base_url,
explicit_api_key=api_key,
api_mode=api_mode,
main_runtime=main_runtime,
)
def get_async_text_auxiliary_client(task: str = "", *, main_runtime: Optional[Dict[str, Any]] = None):
def get_async_text_auxiliary_client(task: str = ""):
"""Return (async_client, model_slug) for async consumers.
For standard providers returns (AsyncOpenAI, model). For Codex returns
@@ -1645,7 +1574,6 @@ def get_async_text_auxiliary_client(task: str = "", *, main_runtime: Optional[Di
explicit_base_url=base_url,
explicit_api_key=api_key,
api_mode=api_mode,
main_runtime=main_runtime,
)
@@ -1656,7 +1584,7 @@ _VISION_AUTO_PROVIDER_ORDER = (
def _normalize_vision_provider(provider: Optional[str]) -> str:
return _normalize_aux_provider(provider)
return _normalize_aux_provider(provider, for_vision=True)
def _resolve_strict_vision_backend(provider: str) -> Tuple[Optional[Any], Optional[str]]:
@@ -1739,7 +1667,6 @@ def resolve_vision_provider_client(
async_mode=async_mode,
explicit_base_url=resolved_base_url,
explicit_api_key=resolved_api_key,
api_mode=resolved_api_mode,
)
if client is None:
return "custom", None, None
@@ -1760,19 +1687,16 @@ def resolve_vision_provider_client(
if sync_client is not None:
return _finalize(main_provider, sync_client, default_model)
else:
# Exotic provider (DeepSeek, Alibaba, Xiaomi, named custom, etc.)
# Use provider-specific vision model if available, otherwise main model.
vision_model = _PROVIDER_VISION_MODELS.get(main_provider, main_model)
# Exotic provider (DeepSeek, Alibaba, named custom, etc.)
rpc_client, rpc_model = resolve_provider_client(
main_provider, vision_model,
api_mode=resolved_api_mode)
main_provider, main_model)
if rpc_client is not None:
logger.info(
"Vision auto-detect: using active provider %s (%s)",
main_provider, rpc_model or vision_model,
main_provider, rpc_model or main_model,
)
return _finalize(
main_provider, rpc_client, rpc_model or vision_model)
main_provider, rpc_client, rpc_model or main_model)
# Fall back through aggregators.
for candidate in _VISION_AUTO_PROVIDER_ORDER:
@@ -1789,8 +1713,7 @@ def resolve_vision_provider_client(
sync_client, default_model = _resolve_strict_vision_backend(requested)
return _finalize(requested, sync_client, default_model)
client, final_model = _get_cached_client(requested, resolved_model, async_mode,
api_mode=resolved_api_mode)
client, final_model = _get_cached_client(requested, resolved_model, async_mode)
if client is None:
return requested, None, None
return requested, client, final_model
@@ -1963,7 +1886,6 @@ def _get_cached_client(
base_url: str = None,
api_key: str = None,
api_mode: str = None,
main_runtime: Optional[Dict[str, Any]] = None,
) -> Tuple[Optional[Any], Optional[str]]:
"""Get or create a cached client for the given provider.
@@ -1987,9 +1909,7 @@ def _get_cached_client(
loop_id = id(current_loop)
except RuntimeError:
pass
runtime = _normalize_main_runtime(main_runtime)
runtime_key = tuple(runtime.get(field, "") for field in _MAIN_RUNTIME_FIELDS) if provider == "auto" else ()
cache_key = (provider, async_mode, base_url or "", api_key or "", api_mode or "", loop_id, runtime_key)
cache_key = (provider, async_mode, base_url or "", api_key or "", api_mode or "", loop_id)
with _client_cache_lock:
if cache_key in _client_cache:
cached_client, cached_default, cached_loop = _client_cache[cache_key]
@@ -2014,7 +1934,6 @@ def _get_cached_client(
explicit_base_url=base_url,
explicit_api_key=api_key,
api_mode=api_mode,
main_runtime=runtime,
)
if client is not None:
# For async clients, remember which loop they were created on so we
@@ -2039,8 +1958,9 @@ def _resolve_task_provider_model(
Priority:
1. Explicit provider/model/base_url/api_key args (always win)
2. Config file (auxiliary.{task}.provider/model/base_url)
3. "auto" (full auto-detection chain)
2. Env var overrides (AUXILIARY_{TASK}_*, CONTEXT_{TASK}_*)
3. Config file (auxiliary.{task}.* or compression.*)
4. "auto" (full auto-detection chain)
Returns (provider, model, base_url, api_key, api_mode) where model may
be None (use provider default). When base_url is set, provider is forced
@@ -2071,8 +1991,21 @@ def _resolve_task_provider_model(
cfg_api_key = str(task_config.get("api_key", "")).strip() or None
cfg_api_mode = str(task_config.get("api_mode", "")).strip() or None
resolved_model = model or cfg_model
resolved_api_mode = cfg_api_mode
# Backwards compat: compression section has its own keys.
# The auxiliary.compression defaults to provider="auto", so treat
# both None and "auto" as "not explicitly configured".
if task == "compression" and (not cfg_provider or cfg_provider == "auto"):
comp = config.get("compression", {}) if isinstance(config, dict) else {}
if isinstance(comp, dict):
cfg_provider = comp.get("summary_provider", "").strip() or None
cfg_model = cfg_model or comp.get("summary_model", "").strip() or None
_sbu = comp.get("summary_base_url") or ""
cfg_base_url = cfg_base_url or _sbu.strip() or None
env_model = _get_auxiliary_env_override(task, "MODEL") if task else None
env_api_mode = _get_auxiliary_env_override(task, "API_MODE") if task else None
resolved_model = model or env_model or cfg_model
resolved_api_mode = env_api_mode or cfg_api_mode
if base_url:
return "custom", resolved_model, base_url, api_key, resolved_api_mode
@@ -2080,12 +2013,19 @@ def _resolve_task_provider_model(
return provider, resolved_model, base_url, api_key, resolved_api_mode
if task:
# Config.yaml is the primary source for per-task overrides.
env_base_url = _get_auxiliary_env_override(task, "BASE_URL")
env_api_key = _get_auxiliary_env_override(task, "API_KEY")
if env_base_url:
return "custom", resolved_model, env_base_url, env_api_key or cfg_api_key, resolved_api_mode
env_provider = _get_auxiliary_provider(task)
if env_provider != "auto":
return env_provider, resolved_model, None, None, resolved_api_mode
if cfg_base_url:
return "custom", resolved_model, cfg_base_url, cfg_api_key, resolved_api_mode
if cfg_provider and cfg_provider != "auto":
return cfg_provider, resolved_model, None, None, resolved_api_mode
return "auto", resolved_model, None, None, resolved_api_mode
return "auto", resolved_model, None, None, resolved_api_mode
@@ -2114,75 +2054,6 @@ def _get_task_timeout(task: str, default: float = _DEFAULT_AUX_TIMEOUT) -> float
return default
# ---------------------------------------------------------------------------
# Anthropic-compatible endpoint detection + image block conversion
# ---------------------------------------------------------------------------
# Providers that use Anthropic-compatible endpoints (via OpenAI SDK wrapper).
# Their image content blocks must use Anthropic format, not OpenAI format.
_ANTHROPIC_COMPAT_PROVIDERS = frozenset({"minimax", "minimax-cn"})
def _is_anthropic_compat_endpoint(provider: str, base_url: str) -> bool:
"""Detect if an endpoint expects Anthropic-format content blocks.
Returns True for known Anthropic-compatible providers (MiniMax) and
any endpoint whose URL contains ``/anthropic`` in the path.
"""
if provider in _ANTHROPIC_COMPAT_PROVIDERS:
return True
url_lower = (base_url or "").lower()
return "/anthropic" in url_lower
def _convert_openai_images_to_anthropic(messages: list) -> list:
"""Convert OpenAI ``image_url`` content blocks to Anthropic ``image`` blocks.
Only touches messages that have list-type content with ``image_url`` blocks;
plain text messages pass through unchanged.
"""
converted = []
for msg in messages:
content = msg.get("content")
if not isinstance(content, list):
converted.append(msg)
continue
new_content = []
changed = False
for block in content:
if block.get("type") == "image_url":
image_url_val = (block.get("image_url") or {}).get("url", "")
if image_url_val.startswith("data:"):
# Parse data URI: data:<media_type>;base64,<data>
header, _, b64data = image_url_val.partition(",")
media_type = "image/png"
if ":" in header and ";" in header:
media_type = header.split(":", 1)[1].split(";", 1)[0]
new_content.append({
"type": "image",
"source": {
"type": "base64",
"media_type": media_type,
"data": b64data,
},
})
else:
# URL-based image
new_content.append({
"type": "image",
"source": {
"type": "url",
"url": image_url_val,
},
})
changed = True
else:
new_content.append(block)
converted.append({**msg, "content": new_content} if changed else msg)
return converted
def _build_call_kwargs(
provider: str,
model: str,
@@ -2267,7 +2138,6 @@ def call_llm(
model: str = None,
base_url: str = None,
api_key: str = None,
main_runtime: Optional[Dict[str, Any]] = None,
messages: list,
temperature: float = None,
max_tokens: int = None,
@@ -2302,7 +2172,7 @@ def call_llm(
resolved_provider, resolved_model, resolved_base_url, resolved_api_key, resolved_api_mode = _resolve_task_provider_model(
task, provider, model, base_url, api_key)
if task in ("vision", "browser_vision"):
if task == "vision":
effective_provider, client, final_model = resolve_vision_provider_client(
provider=provider,
model=model,
@@ -2333,7 +2203,6 @@ def call_llm(
base_url=resolved_base_url,
api_key=resolved_api_key,
api_mode=resolved_api_mode,
main_runtime=main_runtime,
)
if client is None:
# When the user explicitly chose a non-OpenRouter provider but no
@@ -2354,7 +2223,7 @@ def call_llm(
if not resolved_base_url:
logger.info("Auxiliary %s: provider %s unavailable, trying auto-detection chain",
task or "call", resolved_provider)
client, final_model = _get_cached_client("auto", main_runtime=main_runtime)
client, final_model = _get_cached_client("auto")
if client is None:
raise RuntimeError(
f"No LLM provider configured for task={task} provider={resolved_provider}. "
@@ -2375,11 +2244,6 @@ def call_llm(
tools=tools, timeout=effective_timeout, extra_body=extra_body,
base_url=resolved_base_url)
# Convert image blocks for Anthropic-compatible endpoints (e.g. MiniMax)
_client_base = str(getattr(client, "base_url", "") or "")
if _is_anthropic_compat_endpoint(resolved_provider, _client_base):
kwargs["messages"] = _convert_openai_images_to_anthropic(kwargs["messages"])
# Handle max_tokens vs max_completion_tokens retry, then payment fallback.
try:
return _validate_llm_response(
@@ -2456,9 +2320,9 @@ def extract_content_or_reasoning(response) -> str:
if content:
# Strip inline think/reasoning blocks (mirrors _strip_think_blocks)
cleaned = re.sub(
r"<(?:think|thinking|reasoning|thought|REASONING_SCRATCHPAD)>"
r"<(?:think|thinking|reasoning|REASONING_SCRATCHPAD)>"
r".*?"
r"</(?:think|thinking|reasoning|thought|REASONING_SCRATCHPAD)>",
r"</(?:think|thinking|reasoning|REASONING_SCRATCHPAD)>",
"", content, flags=re.DOTALL | re.IGNORECASE,
).strip()
if cleaned:
@@ -2568,11 +2432,6 @@ async def async_call_llm(
tools=tools, timeout=effective_timeout, extra_body=extra_body,
base_url=resolved_base_url)
# Convert image blocks for Anthropic-compatible endpoints (e.g. MiniMax)
_client_base = str(getattr(client, "base_url", "") or "")
if _is_anthropic_compat_endpoint(resolved_provider, _client_base):
kwargs["messages"] = _convert_openai_images_to_anthropic(kwargs["messages"])
try:
return _validate_llm_response(
await client.chat.completions.create(**kwargs), task)

View File

@@ -1,273 +0,0 @@
"""
Circuit Breaker for Error Cascading — #885
P(error | prev was error) = 58.6% vs P(error | prev was success) = 25.2%.
That's a 2.33x cascade factor. After 3 consecutive errors, the circuit
opens and the agent must take corrective action.
States:
- CLOSED: Normal operation, errors are counted
- OPEN: Too many consecutive errors, corrective action required
- HALF_OPEN: Testing if errors have cleared
Usage:
from agent.circuit_breaker import CircuitBreaker, ToolCircuitBreaker
cb = ToolCircuitBreaker()
# After each tool call
if not cb.record_result(success=True):
# Circuit is open — take corrective action
cb.get_recovery_action()
"""
import time
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional
class CircuitState(Enum):
CLOSED = "closed" # Normal operation
OPEN = "open" # Too many errors, block execution
HALF_OPEN = "half_open" # Testing recovery
@dataclass
class CircuitBreaker:
"""
Generic circuit breaker with configurable thresholds.
Tracks consecutive errors and opens the circuit when the
error streak exceeds the threshold.
"""
failure_threshold: int = 3
recovery_timeout: float = 30.0 # seconds before trying half-open
success_threshold: int = 2 # successes needed to close from half-open
state: CircuitState = field(default=CircuitState.CLOSED, init=False)
consecutive_failures: int = field(default=0, init=False)
consecutive_successes: int = field(default=0, init=False)
last_failure_time: Optional[float] = field(default=None, init=False)
total_trips: int = field(default=0, init=False)
error_streaks: List[int] = field(default_factory=list, init=False)
def record_result(self, success: bool) -> bool:
"""
Record a tool call result. Returns True if circuit allows execution.
Returns:
True if circuit is CLOSED or HALF_OPEN (execution allowed)
False if circuit is OPEN (execution blocked)
"""
now = time.time()
if self.state == CircuitState.OPEN:
# Check if recovery timeout has passed
if self.last_failure_time and (now - self.last_failure_time) >= self.recovery_timeout:
self.state = CircuitState.HALF_OPEN
self.consecutive_successes = 0
return True # Allow one test execution
return False # Still open
if success:
self.consecutive_failures = 0
self.consecutive_successes += 1
if self.state == CircuitState.HALF_OPEN:
if self.consecutive_successes >= self.success_threshold:
self.state = CircuitState.CLOSED
self.consecutive_successes = 0
return True
else:
self.consecutive_successes = 0
self.consecutive_failures += 1
self.last_failure_time = now
if self.state == CircuitState.HALF_OPEN:
# Failed during recovery — reopen immediately
self.state = CircuitState.OPEN
self.total_trips += 1
return False
if self.consecutive_failures >= self.failure_threshold:
self.state = CircuitState.OPEN
self.total_trips += 1
self.error_streaks.append(self.consecutive_failures)
return False
return True
def can_execute(self) -> bool:
"""Check if execution is allowed."""
if self.state == CircuitState.OPEN:
if self.last_failure_time:
now = time.time()
if (now - self.last_failure_time) >= self.recovery_timeout:
self.state = CircuitState.HALF_OPEN
self.consecutive_successes = 0
return True
return False
return True
def get_state(self) -> Dict[str, Any]:
"""Get current circuit state."""
return {
"state": self.state.value,
"consecutive_failures": self.consecutive_failures,
"consecutive_successes": self.consecutive_successes,
"total_trips": self.total_trips,
"max_streak": max(self.error_streaks) if self.error_streaks else 0,
"can_execute": self.can_execute(),
}
def reset(self):
"""Reset the circuit breaker."""
self.state = CircuitState.CLOSED
self.consecutive_failures = 0
self.consecutive_successes = 0
self.last_failure_time = None
class ToolCircuitBreaker(CircuitBreaker):
"""
Circuit breaker specifically for tool call error cascading.
Provides recovery actions when the circuit opens.
"""
# Tools that are most effective at recovery (from audit data)
RECOVERY_TOOLS = [
"terminal", # Most effective — 2300 recoveries
"read_file", # Reset context by reading something
"search_files", # Find what went wrong
]
def get_recovery_action(self) -> Dict[str, Any]:
"""
Get the recommended recovery action when circuit is open.
Returns dict with action type and details.
"""
streak = self.consecutive_failures
if streak >= 9:
# After 9 errors: 41/46 recoveries via terminal
return {
"action": "terminal_only",
"reason": f"Error streak of {streak} — terminal is the only reliable recovery",
"suggested_tool": "terminal",
"suggested_command": "echo 'Resetting context'",
"severity": "critical",
}
elif streak >= 5:
return {
"action": "switch_tool_type",
"reason": f"Error streak of {streak} — switch to a different tool category",
"suggested_tools": ["read_file", "search_files", "terminal"],
"severity": "high",
}
elif streak >= self.failure_threshold:
return {
"action": "ask_user",
"reason": f"{streak} consecutive errors — ask user for guidance",
"suggested_response": "I'm encountering repeated errors. Would you like me to try a different approach?",
"severity": "medium",
}
else:
return {
"action": "continue",
"reason": f"Error streak of {streak} — within tolerance",
"severity": "low",
}
def should_compress_context(self) -> bool:
"""Determine if context compression would help recovery."""
return self.consecutive_failures >= 5
def get_blocked_tool(self) -> Optional[str]:
"""Get the tool that should be blocked (if any)."""
if self.state == CircuitState.OPEN:
return "last_failed_tool"
return None
class MultiToolCircuitBreaker:
"""
Manages per-tool circuit breakers and cross-tool cascade detection.
When one tool trips its breaker, related tools are also warned.
"""
def __init__(self):
self.breakers: Dict[str, ToolCircuitBreaker] = {}
self.global_streak: int = 0
self.last_tool: Optional[str] = None
self.last_success: bool = True
def get_breaker(self, tool_name: str) -> ToolCircuitBreaker:
"""Get or create a circuit breaker for a tool."""
if tool_name not in self.breakers:
self.breakers[tool_name] = ToolCircuitBreaker()
return self.breakers[tool_name]
def record_result(self, tool_name: str, success: bool) -> bool:
"""
Record a tool call result. Returns True if execution should continue.
"""
breaker = self.get_breaker(tool_name)
allowed = breaker.record_result(success)
# Track global streak
if success:
self.global_streak = 0
self.last_success = True
else:
self.global_streak += 1
self.last_success = False
self.last_tool = tool_name
return allowed
def can_execute(self, tool_name: str) -> bool:
"""Check if a specific tool can execute."""
breaker = self.get_breaker(tool_name)
return breaker.can_execute()
def get_global_state(self) -> Dict[str, Any]:
"""Get overall circuit breaker state."""
return {
"global_streak": self.global_streak,
"last_tool": self.last_tool,
"last_success": self.last_success,
"tool_states": {
name: breaker.get_state()
for name, breaker in self.breakers.items()
if breaker.consecutive_failures > 0 or breaker.total_trips > 0
},
"any_open": any(b.state == CircuitState.OPEN for b in self.breakers.values()),
}
def get_recovery_action(self) -> Dict[str, Any]:
"""Get recovery action based on global state."""
if self.global_streak == 0:
return {"action": "continue", "reason": "No errors"}
# Find the breaker with the worst streak
worst = max(self.breakers.values(), key=lambda b: b.consecutive_failures, default=None)
if worst and worst.consecutive_failures > 0:
return worst.get_recovery_action()
return {
"action": "continue",
"reason": f"Global streak: {self.global_streak}",
"severity": "low",
}
def reset_all(self):
"""Reset all circuit breakers."""
for breaker in self.breakers.values():
breaker.reset()
self.global_streak = 0
self.last_success = True

View File

@@ -1,148 +0,0 @@
"""
Context Budget Tracker - Prevent context window overflow
Poka-yoke: Visual warnings at 70%%, 85%%, 95%% capacity.
Auto-checkpoint at 85%%. Pre-flight token estimation.
Issue: #838
"""
import json
import logging
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
HERMES_HOME = Path.home() / ".hermes"
CHECKPOINT_DIR = HERMES_HOME / "checkpoints"
CHARS_PER_TOKEN = 4
THRESHOLD_WARNING = 0.70
THRESHOLD_CRITICAL = 0.85
THRESHOLD_DANGER = 0.95
class ContextBudget:
def __init__(self, context_limit: int = 128000, system_tokens: int = 0,
used_tokens: int = 0, reserved_tokens: int = 2000):
self.context_limit = context_limit
self.system_tokens = system_tokens
self.used_tokens = used_tokens
self.reserved_tokens = reserved_tokens
@property
def total_used(self) -> int:
return self.system_tokens + self.used_tokens
@property
def available(self) -> int:
return max(0, self.context_limit - self.reserved_tokens)
@property
def remaining(self) -> int:
return max(0, self.available - self.total_used)
@property
def utilization(self) -> float:
return self.total_used / self.available if self.available > 0 else 1.0
def estimate_tokens(text: str) -> int:
return len(text) // CHARS_PER_TOKEN if text else 0
def estimate_messages_tokens(messages: List[Dict]) -> int:
total = 0
for msg in messages:
content = msg.get("content", "")
if isinstance(content, str):
total += estimate_tokens(content)
if msg.get("tool_calls"):
total += 100
return total
class ContextBudgetTracker:
def __init__(self, context_limit: int = 128000, session_id: str = ""):
self.budget = ContextBudget(context_limit=context_limit)
self.session_id = session_id
self._checkpointed = False
self._warnings_given = set()
def update_from_messages(self, messages: List[Dict]):
self.budget.used_tokens = estimate_messages_tokens(messages)
def can_fit(self, additional_tokens: int) -> bool:
return self.budget.remaining >= additional_tokens
def preflight_check(self, text: str) -> Tuple[bool, str]:
tokens = estimate_tokens(text)
if not self.can_fit(tokens):
return False, f"Cannot load: ~{tokens:,} tokens needed, {self.budget.remaining:,} remaining"
would_util = (self.budget.total_used + tokens) / self.budget.available if self.budget.available > 0 else 1.0
if would_util >= THRESHOLD_DANGER:
return False, f"Would reach {would_util:.0%%} capacity. Summarize or start new session."
if would_util >= THRESHOLD_CRITICAL:
return True, f"Warning: will reach {would_util:.0%%} capacity."
return True, ""
def get_warning(self) -> Optional[str]:
util = self.budget.utilization
if util >= THRESHOLD_DANGER and "danger" not in self._warnings_given:
self._warnings_given.add("danger")
return f"[CONTEXT CRITICAL: {util:.0%%} used -- {self.budget.remaining:,} tokens left. Summarize or start new session.]"
if util >= THRESHOLD_CRITICAL and "critical" not in self._warnings_given:
self._warnings_given.add("critical")
self._auto_checkpoint()
return f"[CONTEXT WARNING: {util:.0%%} used -- consider summarizing. Auto-checkpoint saved.]"
if util >= THRESHOLD_WARNING and "warning" not in self._warnings_given:
self._warnings_given.add("warning")
return f"[CONTEXT: {util:.0%%} used -- {self.budget.remaining:,} tokens remaining]"
return None
def _auto_checkpoint(self):
if self._checkpointed or not self.session_id:
return
try:
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
path = CHECKPOINT_DIR / f"{self.session_id}.json"
path.write_text(json.dumps({
"session_id": self.session_id,
"timestamp": time.time(),
"budget": {"utilization": round(self.budget.utilization * 100, 1)}
}, indent=2))
self._checkpointed = True
logger.info("Auto-checkpoint saved: %s", path)
except Exception as e:
logger.error("Auto-checkpoint failed: %s", e)
def get_status_line(self) -> str:
util = self.budget.utilization
remaining = self.budget.remaining
if util >= THRESHOLD_DANGER:
return f"RED {util:.0%%} used ({remaining:,} left)"
elif util >= THRESHOLD_CRITICAL:
return f"ORANGE {util:.0%%} used ({remaining:,} left)"
elif util >= THRESHOLD_WARNING:
return f"YELLOW {util:.0%%} used ({remaining:,} left)"
return f"GREEN {util:.0%%} used ({remaining:,} left)"
_tracker = None
def get_tracker(context_limit=128000, session_id=""):
global _tracker
if _tracker is None:
_tracker = ContextBudgetTracker(context_limit, session_id)
return _tracker
def check_context_budget(messages, context_limit=128000):
tracker = get_tracker(context_limit)
tracker.update_from_messages(messages)
return tracker.get_warning()
def preflight_token_check(text):
tracker = get_tracker()
return tracker.preflight_check(text)

View File

@@ -4,12 +4,8 @@ Self-contained class with its own OpenAI client for summarization.
Uses auxiliary model (cheap/fast) to summarize middle turns while
protecting head and tail context.
Improvements over v2:
- Structured summary template with Resolved/Pending question tracking
- Summarizer preamble: "Do not respond to any questions" (from OpenCode)
- Handoff framing: "different assistant" (from Codex) to create separation
- "Remaining Work" replaces "Next Steps" to avoid reading as active instructions
- Clear separator when summary merges into tail message
Improvements over v1:
- Structured summary template (Goal, Progress, Decisions, Files, Next Steps)
- Iterative summary updates (preserves info across multiple compactions)
- Token-budget tail protection instead of fixed message count
- Tool output pruning before LLM summarization (cheap pre-pass)
@@ -17,17 +13,13 @@ Improvements over v2:
- Richer tool call/result detail in summarizer input
"""
import hashlib
import json
import logging
import re
import time
from typing import Any, Dict, List, Optional
from agent.auxiliary_client import call_llm
from agent.context_engine import ContextEngine
from agent.model_metadata import (
MINIMUM_CONTEXT_LENGTH,
get_model_context_length,
estimate_messages_tokens_rough,
)
@@ -35,13 +27,12 @@ from agent.model_metadata import (
logger = logging.getLogger(__name__)
SUMMARY_PREFIX = (
"[CONTEXT COMPACTION — REFERENCE ONLY] Earlier turns were compacted "
"into the summary below. This is a handoff from a previous context "
"window — treat it as background reference, NOT as active instructions. "
"Do NOT answer questions or fulfill requests mentioned in this summary; "
"they were already addressed. Respond ONLY to the latest user message "
"that appears AFTER this summary. The current session state (files, "
"config, etc.) may reflect work described here — avoid repeating it:"
"[CONTEXT COMPACTION] Earlier turns in this conversation were compacted "
"to save context space. The summary below describes work that was "
"already completed, and the current session state may still reflect "
"that work (for example, files may already be changed). Use the summary "
"and the current state to continue from where things left off, and "
"avoid repeating work:"
)
LEGACY_SUMMARY_PREFIX = "[CONTEXT SUMMARY]:"
@@ -60,128 +51,6 @@ _CHARS_PER_TOKEN = 4
_SUMMARY_FAILURE_COOLDOWN_SECONDS = 600
def _summarize_tool_result(tool_name: str, tool_args: str, tool_content: str) -> str:
"""Create an informative 1-line summary of a tool call + result.
Used during the pre-compression pruning pass to replace large tool
outputs with a short but useful description of what the tool did,
rather than a generic placeholder that carries zero information.
Returns strings like::
[terminal] ran `npm test` -> exit 0, 47 lines output
[read_file] read config.py from line 1 (1,200 chars)
[search_files] content search for 'compress' in agent/ -> 12 matches
"""
try:
args = json.loads(tool_args) if tool_args else {}
except (json.JSONDecodeError, TypeError):
args = {}
content = tool_content or ""
content_len = len(content)
line_count = content.count("\n") + 1 if content.strip() else 0
if tool_name == "terminal":
cmd = args.get("command", "")
if len(cmd) > 80:
cmd = cmd[:77] + "..."
exit_match = re.search(r'"exit_code"\s*:\s*(-?\d+)', content)
exit_code = exit_match.group(1) if exit_match else "?"
return f"[terminal] ran `{cmd}` -> exit {exit_code}, {line_count} lines output"
if tool_name == "read_file":
path = args.get("path", "?")
offset = args.get("offset", 1)
return f"[read_file] read {path} from line {offset} ({content_len:,} chars)"
if tool_name == "write_file":
path = args.get("path", "?")
written_lines = args.get("content", "").count("\n") + 1 if args.get("content") else "?"
return f"[write_file] wrote to {path} ({written_lines} lines)"
if tool_name == "search_files":
pattern = args.get("pattern", "?")
path = args.get("path", ".")
target = args.get("target", "content")
match_count = re.search(r'"total_count"\s*:\s*(\d+)', content)
count = match_count.group(1) if match_count else "?"
return f"[search_files] {target} search for '{pattern}' in {path} -> {count} matches"
if tool_name == "patch":
path = args.get("path", "?")
mode = args.get("mode", "replace")
return f"[patch] {mode} in {path} ({content_len:,} chars result)"
if tool_name in ("browser_navigate", "browser_click", "browser_snapshot",
"browser_type", "browser_scroll", "browser_vision"):
url = args.get("url", "")
ref = args.get("ref", "")
detail = f" {url}" if url else (f" ref={ref}" if ref else "")
return f"[{tool_name}]{detail} ({content_len:,} chars)"
if tool_name == "web_search":
query = args.get("query", "?")
return f"[web_search] query='{query}' ({content_len:,} chars result)"
if tool_name == "web_extract":
urls = args.get("urls", [])
url_desc = urls[0] if isinstance(urls, list) and urls else "?"
if isinstance(urls, list) and len(urls) > 1:
url_desc += f" (+{len(urls) - 1} more)"
return f"[web_extract] {url_desc} ({content_len:,} chars)"
if tool_name == "delegate_task":
goal = args.get("goal", "")
if len(goal) > 60:
goal = goal[:57] + "..."
return f"[delegate_task] '{goal}' ({content_len:,} chars result)"
if tool_name == "execute_code":
code_preview = (args.get("code") or "")[:60].replace("\n", " ")
if len(args.get("code", "")) > 60:
code_preview += "..."
return f"[execute_code] `{code_preview}` ({line_count} lines output)"
if tool_name in ("skill_view", "skills_list", "skill_manage"):
name = args.get("name", "?")
return f"[{tool_name}] name={name} ({content_len:,} chars)"
if tool_name == "vision_analyze":
question = args.get("question", "")[:50]
return f"[vision_analyze] '{question}' ({content_len:,} chars)"
if tool_name == "memory":
action = args.get("action", "?")
target = args.get("target", "?")
return f"[memory] {action} on {target}"
if tool_name == "todo":
return "[todo] updated task list"
if tool_name == "clarify":
return "[clarify] asked user a question"
if tool_name == "text_to_speech":
return f"[text_to_speech] generated audio ({content_len:,} chars)"
if tool_name == "cronjob":
action = args.get("action", "?")
return f"[cronjob] {action}"
if tool_name == "process":
action = args.get("action", "?")
sid = args.get("session_id", "?")
return f"[process] {action} session={sid}"
# Generic fallback
first_arg = ""
for k, v in list(args.items())[:2]:
sv = str(v)[:40]
first_arg += f" {k}={sv}"
return f"[{tool_name}]{first_arg} ({content_len:,} chars result)"
class ContextCompressor(ContextEngine):
"""Default context engine — compresses conversation context via lossy summarization.
@@ -203,8 +72,6 @@ class ContextCompressor(ContextEngine):
self._context_probed = False
self._context_probe_persistable = False
self._previous_summary = None
self._last_compression_savings_pct = 100.0
self._ineffective_compression_count = 0
def update_model(
self,
@@ -213,19 +80,14 @@ class ContextCompressor(ContextEngine):
base_url: str = "",
api_key: str = "",
provider: str = "",
api_mode: str = "",
) -> None:
"""Update model info after a model switch or fallback activation."""
self.model = model
self.base_url = base_url
self.api_key = api_key
self.provider = provider
self.api_mode = api_mode
self.context_length = context_length
self.threshold_tokens = max(
int(context_length * self.threshold_percent),
MINIMUM_CONTEXT_LENGTH,
)
self.threshold_tokens = int(context_length * self.threshold_percent)
def __init__(
self,
@@ -240,13 +102,11 @@ class ContextCompressor(ContextEngine):
api_key: str = "",
config_context_length: int | None = None,
provider: str = "",
api_mode: str = "",
):
self.model = model
self.base_url = base_url
self.api_key = api_key
self.provider = provider
self.api_mode = api_mode
self.threshold_percent = threshold_percent
self.protect_first_n = protect_first_n
self.protect_last_n = protect_last_n
@@ -258,14 +118,7 @@ class ContextCompressor(ContextEngine):
config_context_length=config_context_length,
provider=provider,
)
# Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if
# the percentage would suggest a lower value. This prevents premature
# compression on large-context models at 50% while keeping the % sane
# for models right at the minimum.
self.threshold_tokens = max(
int(self.context_length * threshold_percent),
MINIMUM_CONTEXT_LENGTH,
)
self.threshold_tokens = int(self.context_length * threshold_percent)
self.compression_count = 0
# Derive token budgets: ratio is relative to the threshold, not total context
@@ -294,9 +147,6 @@ class ContextCompressor(ContextEngine):
# Stores the previous compaction summary for iterative updates
self._previous_summary: Optional[str] = None
# Anti-thrashing: track whether last compression was effective
self._last_compression_savings_pct: float = 100.0
self._ineffective_compression_count: int = 0
self._summary_failure_cooldown_until: float = 0.0
def update_from_response(self, usage: Dict[str, Any]):
@@ -305,26 +155,9 @@ class ContextCompressor(ContextEngine):
self.last_completion_tokens = usage.get("completion_tokens", 0)
def should_compress(self, prompt_tokens: int = None) -> bool:
"""Check if context exceeds the compression threshold.
Includes anti-thrashing protection: if the last two compressions
each saved less than 10%, skip compression to avoid infinite loops
where each pass removes only 1-2 messages.
"""
"""Check if context exceeds the compression threshold."""
tokens = prompt_tokens if prompt_tokens is not None else self.last_prompt_tokens
if tokens < self.threshold_tokens:
return False
# Anti-thrashing: back off if recent compressions were ineffective
if self._ineffective_compression_count >= 2:
if not self.quiet_mode:
logger.warning(
"Compression skipped — last %d compressions saved <10%% each. "
"Consider /new to start a fresh session, or /compress <topic> "
"for focused compression.",
self._ineffective_compression_count,
)
return False
return True
return tokens >= self.threshold_tokens
# ------------------------------------------------------------------
# Tool output pruning (cheap pre-pass, no LLM call)
@@ -334,16 +167,7 @@ class ContextCompressor(ContextEngine):
self, messages: List[Dict[str, Any]], protect_tail_count: int,
protect_tail_tokens: int | None = None,
) -> tuple[List[Dict[str, Any]], int]:
"""Replace old tool result contents with informative 1-line summaries.
Instead of a generic placeholder, generates a summary like::
[terminal] ran `npm test` -> exit 0, 47 lines output
[read_file] read config.py from line 1 (3,400 chars)
Also deduplicates identical tool results (e.g. reading the same file
5x keeps only the newest full copy) and truncates large tool_call
arguments in assistant messages outside the protected tail.
"""Replace old tool result contents with a short placeholder.
Walks backward from the end, protecting the most recent messages that
fall within ``protect_tail_tokens`` (when provided) OR the last
@@ -359,22 +183,6 @@ class ContextCompressor(ContextEngine):
result = [m.copy() for m in messages]
pruned = 0
# Build index: tool_call_id -> (tool_name, arguments_json)
call_id_to_tool: Dict[str, tuple] = {}
for msg in result:
if msg.get("role") == "assistant":
for tc in msg.get("tool_calls") or []:
if isinstance(tc, dict):
cid = tc.get("id", "")
fn = tc.get("function", {})
call_id_to_tool[cid] = (fn.get("name", "unknown"), fn.get("arguments", ""))
else:
cid = getattr(tc, "id", "") or ""
fn = getattr(tc, "function", None)
name = getattr(fn, "name", "unknown") if fn else "unknown"
args_str = getattr(fn, "arguments", "") if fn else ""
call_id_to_tool[cid] = (name, args_str)
# Determine the prune boundary
if protect_tail_tokens is not None and protect_tail_tokens > 0:
# Token-budget approach: walk backward accumulating tokens
@@ -383,8 +191,7 @@ class ContextCompressor(ContextEngine):
min_protect = min(protect_tail_count, len(result) - 1)
for i in range(len(result) - 1, -1, -1):
msg = result[i]
raw_content = msg.get("content") or ""
content_len = sum(len(p.get("text", "")) for p in raw_content) if isinstance(raw_content, list) else len(raw_content)
content_len = len(msg.get("content") or "")
msg_tokens = content_len // _CHARS_PER_TOKEN + 10
for tc in msg.get("tool_calls") or []:
if isinstance(tc, dict):
@@ -399,69 +206,18 @@ class ContextCompressor(ContextEngine):
else:
prune_boundary = len(result) - protect_tail_count
# Pass 1: Deduplicate identical tool results.
# When the same file is read multiple times, keep only the most recent
# full copy and replace older duplicates with a back-reference.
content_hashes: dict = {} # hash -> (index, tool_call_id)
for i in range(len(result) - 1, -1, -1):
msg = result[i]
if msg.get("role") != "tool":
continue
content = msg.get("content") or ""
# Skip multimodal content (list of content blocks)
if isinstance(content, list):
continue
if len(content) < 200:
continue
h = hashlib.md5(content.encode("utf-8", errors="replace")).hexdigest()[:12]
if h in content_hashes:
# This is an older duplicate — replace with back-reference
result[i] = {**msg, "content": "[Duplicate tool output — same content as a more recent call]"}
pruned += 1
else:
content_hashes[h] = (i, msg.get("tool_call_id", "?"))
# Pass 2: Replace old tool results with informative summaries
for i in range(prune_boundary):
msg = result[i]
if msg.get("role") != "tool":
continue
content = msg.get("content", "")
# Skip multimodal content (list of content blocks)
if isinstance(content, list):
continue
if not content or content == _PRUNED_TOOL_PLACEHOLDER:
continue
# Skip already-deduplicated or previously-summarized results
if content.startswith("[Duplicate tool output"):
continue
# Only prune if the content is substantial (>200 chars)
if len(content) > 200:
call_id = msg.get("tool_call_id", "")
tool_name, tool_args = call_id_to_tool.get(call_id, ("unknown", ""))
summary = _summarize_tool_result(tool_name, tool_args, content)
result[i] = {**msg, "content": summary}
result[i] = {**msg, "content": _PRUNED_TOOL_PLACEHOLDER}
pruned += 1
# Pass 3: Truncate large tool_call arguments in assistant messages
# outside the protected tail. write_file with 50KB content, for
# example, survives pruning entirely without this.
for i in range(prune_boundary):
msg = result[i]
if msg.get("role") != "assistant" or not msg.get("tool_calls"):
continue
new_tcs = []
modified = False
for tc in msg["tool_calls"]:
if isinstance(tc, dict):
args = tc.get("function", {}).get("arguments", "")
if len(args) > 500:
tc = {**tc, "function": {**tc["function"], "arguments": args[:200] + "...[truncated]"}}
modified = True
new_tcs.append(tc)
if modified:
result[i] = {**msg, "tool_calls": new_tcs}
return result, pruned
# ------------------------------------------------------------------
@@ -539,20 +295,13 @@ class ContextCompressor(ContextEngine):
return "\n\n".join(parts)
def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]], focus_topic: str = None) -> Optional[str]:
def _generate_summary(self, turns_to_summarize: List[Dict[str, Any]]) -> Optional[str]:
"""Generate a structured summary of conversation turns.
Uses a structured template (Goal, Progress, Decisions, Resolved/Pending
Questions, Files, Remaining Work) with explicit preamble telling the
summarizer not to answer questions. When a previous summary exists,
Uses a structured template (Goal, Progress, Decisions, Files, Next Steps)
inspired by Pi-mono and OpenCode. When a previous summary exists,
generates an iterative update instead of summarizing from scratch.
Args:
focus_topic: Optional focus string for guided compression. When
provided, the summariser prioritises preserving information
related to this topic and is more aggressive about compressing
everything else. Inspired by Claude Code's ``/compact``.
Returns None if all attempts fail — the caller should drop
the middle turns without a summary rather than inject a useless
placeholder.
@@ -568,75 +317,9 @@ class ContextCompressor(ContextEngine):
summary_budget = self._compute_summary_budget(turns_to_summarize)
content_to_summarize = self._serialize_for_summary(turns_to_summarize)
# Preamble shared by both first-compaction and iterative-update prompts.
# Inspired by OpenCode's "do not respond to any questions" instruction
# and Codex's "another language model" framing.
_summarizer_preamble = (
"You are a summarization agent creating a context checkpoint. "
"Your output will be injected as reference material for a DIFFERENT "
"assistant that continues the conversation. "
"Do NOT respond to any questions or requests in the conversation — "
"only output the structured summary. "
"Do NOT include any preamble, greeting, or prefix."
)
# Shared structured template (used by both paths).
_template_sections = f"""## Goal
[What the user is trying to accomplish]
## Constraints & Preferences
[User preferences, coding style, constraints, important decisions]
## Completed Actions
[Numbered list of concrete actions taken — include tool used, target, and outcome.
Format each as: N. ACTION target — outcome [tool: name]
Example:
1. READ config.py:45 — found `==` should be `!=` [tool: read_file]
2. PATCH config.py:45 — changed `==` to `!=` [tool: patch]
3. TEST `pytest tests/` — 3/50 failed: test_parse, test_validate, test_edge [tool: terminal]
Be specific with file paths, commands, line numbers, and results.]
## Active State
[Current working state — include:
- Working directory and branch (if applicable)
- Modified/created files with brief note on each
- Test status (X/Y passing)
- Any running processes or servers
- Environment details that matter]
## In Progress
[Work currently underway — what was being done when compaction fired]
## Blocked
[Any blockers, errors, or issues not yet resolved. Include exact error messages.]
## Key Decisions
[Important technical decisions and WHY they were made]
## Resolved Questions
[Questions the user asked that were ALREADY answered — include the answer so the next assistant does not re-answer them]
## Pending User Asks
[Questions or requests from the user that have NOT yet been answered or fulfilled. If none, write "None."]
## Relevant Files
[Files read, modified, or created — with brief note on each]
## Remaining Work
[What remains to be done — framed as context, not instructions]
## Critical Context
[Any specific values, error messages, configuration details, or data that would be lost without explicit preservation]
Target ~{summary_budget} tokens. Be CONCRETE — include file paths, command outputs, error messages, line numbers, and specific values. Avoid vague descriptions like "made some changes" — say exactly what changed.
Write only the summary body. Do not include any preamble or prefix."""
if self._previous_summary:
# Iterative update: preserve existing info, add new progress
prompt = f"""{_summarizer_preamble}
You are updating a context compaction summary. A previous compaction produced the summary below. New conversation turns have occurred since then and need to be incorporated.
prompt = f"""You are updating a context compaction summary. A previous compaction produced the summary below. New conversation turns have occurred since then and need to be incorporated.
PREVIOUS SUMMARY:
{self._previous_summary}
@@ -644,42 +327,87 @@ PREVIOUS SUMMARY:
NEW TURNS TO INCORPORATE:
{content_to_summarize}
Update the summary using this exact structure. PRESERVE all existing information that is still relevant. ADD new completed actions to the numbered list (continue numbering). Move items from "In Progress" to "Completed Actions" when done. Move answered questions to "Resolved Questions". Update "Active State" to reflect current state. Remove information only if it is clearly obsolete.
Update the summary using this exact structure. PRESERVE all existing information that is still relevant. ADD new progress. Move items from "In Progress" to "Done" when completed. Remove information only if it is clearly obsolete.
{_template_sections}"""
## Goal
[What the user is trying to accomplish — preserve from previous summary, update if goal evolved]
## Constraints & Preferences
[User preferences, coding style, constraints, important decisions — accumulate across compactions]
## Progress
### Done
[Completed work — include specific file paths, commands run, results obtained]
### In Progress
[Work currently underway]
### Blocked
[Any blockers or issues encountered]
## Key Decisions
[Important technical decisions and why they were made]
## Relevant Files
[Files read, modified, or created — with brief note on each. Accumulate across compactions.]
## Next Steps
[What needs to happen next to continue the work]
## Critical Context
[Any specific values, error messages, configuration details, or data that would be lost without explicit preservation]
## Tools & Patterns
[Which tools were used, how they were used effectively, and any tool-specific discoveries. Accumulate across compactions.]
Target ~{summary_budget} tokens. Be specific — include file paths, command outputs, error messages, and concrete values rather than vague descriptions.
Write only the summary body. Do not include any preamble or prefix."""
else:
# First compaction: summarize from scratch
prompt = f"""{_summarizer_preamble}
Create a structured handoff summary for a different assistant that will continue this conversation after earlier turns are compacted. The next assistant should be able to understand what happened without re-reading the original turns.
prompt = f"""Create a structured handoff summary for a later assistant that will continue this conversation after earlier turns are compacted.
TURNS TO SUMMARIZE:
{content_to_summarize}
Use this exact structure:
{_template_sections}"""
## Goal
[What the user is trying to accomplish]
# Inject focus topic guidance when the user provides one via /compress <focus>.
# This goes at the end of the prompt so it takes precedence.
if focus_topic:
prompt += f"""
## Constraints & Preferences
[User preferences, coding style, constraints, important decisions]
FOCUS TOPIC: "{focus_topic}"
The user has requested that this compaction PRIORITISE preserving all information related to the focus topic above. For content related to "{focus_topic}", include full detail — exact values, file paths, command outputs, error messages, and decisions. For content NOT related to the focus topic, summarise more aggressively (brief one-liners or omit if truly irrelevant). The focus topic sections should receive roughly 60-70% of the summary token budget."""
## Progress
### Done
[Completed work — include specific file paths, commands run, results obtained]
### In Progress
[Work currently underway]
### Blocked
[Any blockers or issues encountered]
## Key Decisions
[Important technical decisions and why they were made]
## Relevant Files
[Files read, modified, or created — with brief note on each]
## Next Steps
[What needs to happen next to continue the work]
## Critical Context
[Any specific values, error messages, configuration details, or data that would be lost without explicit preservation]
## Tools & Patterns
[Which tools were used, how they were used effectively, and any tool-specific discoveries (e.g., preferred flags, working invocations, successful command patterns)]
Target ~{summary_budget} tokens. Be specific — include file paths, command outputs, error messages, and concrete values rather than vague descriptions. The goal is to prevent the next assistant from repeating work or losing important details.
Write only the summary body. Do not include any preamble or prefix."""
try:
call_kwargs = {
"task": "compression",
"main_runtime": {
"model": self.model,
"provider": self.provider,
"base_url": self.base_url,
"api_key": self.api_key,
"api_mode": self.api_mode,
},
"messages": [{"role": "user", "content": prompt}],
"max_tokens": int(summary_budget * 1.3),
"max_tokens": summary_budget * 2,
# timeout resolved from auxiliary.compression.timeout config by call_llm
}
if self.summary_model:
@@ -693,10 +421,8 @@ The user has requested that this compaction PRIORITISE preserving all informatio
# Store for iterative updates on next compaction
self._previous_summary = summary
self._summary_failure_cooldown_until = 0.0
self._summary_model_fallen_back = False
return self._with_summary_prefix(summary)
except RuntimeError:
# No provider configured — long cooldown, unlikely to self-resolve
self._summary_failure_cooldown_until = time.monotonic() + _SUMMARY_FAILURE_COOLDOWN_SECONDS
logging.warning("Context compression: no provider available for "
"summary. Middle turns will be dropped without summary "
@@ -704,42 +430,12 @@ The user has requested that this compaction PRIORITISE preserving all informatio
_SUMMARY_FAILURE_COOLDOWN_SECONDS)
return None
except Exception as e:
# If the summary model is different from the main model and the
# error looks permanent (model not found, 503, 404), fall back to
# using the main model instead of entering cooldown that leaves
# context growing unbounded. (#8620 sub-issue 4)
_status = getattr(e, "status_code", None) or getattr(getattr(e, "response", None), "status_code", None)
_err_str = str(e).lower()
_is_model_not_found = (
_status in (404, 503)
or "model_not_found" in _err_str
or "does not exist" in _err_str
or "no available channel" in _err_str
)
if (
_is_model_not_found
and self.summary_model
and self.summary_model != self.model
and not getattr(self, "_summary_model_fallen_back", False)
):
self._summary_model_fallen_back = True
logging.warning(
"Summary model '%s' not available (%s). "
"Falling back to main model '%s' for compression.",
self.summary_model, e, self.model,
)
self.summary_model = "" # empty = use main model
self._summary_failure_cooldown_until = 0.0 # no cooldown
return self._generate_summary(messages, summary_budget) # retry immediately
# Transient errors (timeout, rate limit, network) — shorter cooldown
_transient_cooldown = 60
self._summary_failure_cooldown_until = time.monotonic() + _transient_cooldown
self._summary_failure_cooldown_until = time.monotonic() + _SUMMARY_FAILURE_COOLDOWN_SECONDS
logging.warning(
"Failed to generate context summary: %s. "
"Further summary attempts paused for %d seconds.",
e,
_transient_cooldown,
_SUMMARY_FAILURE_COOLDOWN_SECONDS,
)
return None
@@ -924,7 +620,7 @@ The user has requested that this compaction PRIORITISE preserving all informatio
# Main compression entry point
# ------------------------------------------------------------------
def compress(self, messages: List[Dict[str, Any]], current_tokens: int = None, focus_topic: str = None) -> List[Dict[str, Any]]:
def compress(self, messages: List[Dict[str, Any]], current_tokens: int = None) -> List[Dict[str, Any]]:
"""Compress conversation messages by summarizing middle turns.
Algorithm:
@@ -936,12 +632,6 @@ The user has requested that this compaction PRIORITISE preserving all informatio
After compression, orphaned tool_call / tool_result pairs are cleaned
up so the API never receives mismatched IDs.
Args:
focus_topic: Optional focus string for guided compression. When
provided, the summariser will prioritise preserving information
related to this topic and be more aggressive about compressing
everything else. Inspired by Claude Code's ``/compact``.
"""
n_messages = len(messages)
# Only need head + 3 tail messages minimum (token budget decides the real tail size)
@@ -999,17 +689,17 @@ The user has requested that this compaction PRIORITISE preserving all informatio
)
# Phase 3: Generate structured summary
summary = self._generate_summary(turns_to_summarize, focus_topic=focus_topic)
summary = self._generate_summary(turns_to_summarize)
# Phase 4: Assemble compressed message list
compressed = []
for i in range(compress_start):
msg = messages[i].copy()
if i == 0 and msg.get("role") == "system":
existing = msg.get("content") or ""
_compression_note = "[Note: Some earlier conversation turns have been compacted into a handoff summary to preserve context space. The current session state may still reflect earlier work, so build on that summary and state rather than re-doing work.]"
if _compression_note not in existing:
msg["content"] = existing + "\n\n" + _compression_note
if i == 0 and msg.get("role") == "system" and self.compression_count == 0:
msg["content"] = (
(msg.get("content") or "")
+ "\n\n[Note: Some earlier conversation turns have been compacted into a handoff summary to preserve context space. The current session state may still reflect earlier work, so build on that summary and state rather than re-doing work.]"
)
compressed.append(msg)
# If LLM summary failed, insert a static fallback so the model
@@ -1054,12 +744,7 @@ The user has requested that this compaction PRIORITISE preserving all informatio
msg = messages[i].copy()
if _merge_summary_into_tail and i == compress_end:
original = msg.get("content") or ""
msg["content"] = (
summary
+ "\n\n--- END OF CONTEXT SUMMARY — "
"respond to the message below, not the summary above ---\n\n"
+ original
)
msg["content"] = summary + "\n\n" + original
_merge_summary_into_tail = False
compressed.append(msg)
@@ -1067,24 +752,14 @@ The user has requested that this compaction PRIORITISE preserving all informatio
compressed = self._sanitize_tool_pairs(compressed)
new_estimate = estimate_messages_tokens_rough(compressed)
saved_estimate = display_tokens - new_estimate
# Anti-thrashing: track compression effectiveness
savings_pct = (saved_estimate / display_tokens * 100) if display_tokens > 0 else 0
self._last_compression_savings_pct = savings_pct
if savings_pct < 10:
self._ineffective_compression_count += 1
else:
self._ineffective_compression_count = 0
if not self.quiet_mode:
new_estimate = estimate_messages_tokens_rough(compressed)
saved_estimate = display_tokens - new_estimate
logger.info(
"Compressed: %d -> %d messages (~%d tokens saved, %.0f%%)",
"Compressed: %d -> %d messages (~%d tokens saved)",
n_messages,
len(compressed),
saved_estimate,
savings_pct,
)
logger.info("Compression #%d complete", self.compression_count)

View File

@@ -26,7 +26,7 @@ Lifecycle:
"""
from abc import ABC, abstractmethod
from typing import Any, Dict, List
from typing import Any, Dict, List, Optional
class ContextEngine(ABC):

214
agent/context_strategy.py Normal file
View File

@@ -0,0 +1,214 @@
"""Context-RAG Decision Framework — adaptive retrieval based on context pressure.
With models that have 128K-1M token context windows, always prefetching from
RAG is wasteful when context is mostly empty and insufficient when context is
nearly full. This module provides a strategy layer that adapts prefetch behavior
to remaining context budget.
Strategies:
- stuff: Context < 30% used → prefetch aggressively, load more facts.
The model has room to reason over everything directly.
- hybrid: Context 30-70% used → prefetch selectively with standard limits.
Key facts in context, rest available via tool calls.
- selective: Context > 70% used → only prefetch on high-signal queries.
Tighter limits; defer to on-demand tool retrieval.
The framework is deliberately simple — it's a decision heuristic, not a neural
router. Simplicity means reliability at the edge cases that matter (crisis
intervention, long debugging sessions, multi-hour research).
References:
- Long Context vs RAG Decision Framework (Timmy Foundation research backlog #4.3)
- Self-RAG: Learning to Retrieve, Generate, and Critique (arxiv 2310.11511)
- FrugalGPT: How to Use Large Language Models While Reducing Cost (arxiv 2305.05176)
"""
from __future__ import annotations
import logging
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
logger = logging.getLogger(__name__)
class ContextStrategy(Enum):
"""Which retrieval strategy to use given current context pressure."""
STUFF = "stuff" # < 30% context used — load everything relevant
HYBRID = "hybrid" # 30-70% — standard RAG with moderate limits
SELECTIVE = "selective" # > 70% — minimal prefetch, defer to tools
@dataclass
class ContextBudget:
"""Snapshot of the current context state.
Populated by run_agent.py from the context compressor's tracked state.
Passed to MemoryManager.prefetch_all() to drive adaptive retrieval.
"""
context_length: int = 0 # Model's max context window (tokens)
used_tokens: int = 0 # Tokens consumed so far
threshold_tokens: int = 0 # Compression fires at this level
compression_enabled: bool = True # Whether auto-compression is on
@property
def pressure(self) -> float:
"""Context pressure as a ratio [0.0, 1.0+].
0.0 = empty context, 1.0 = at compression threshold.
Can exceed 1.0 if we've blown past the threshold.
"""
if self.context_length <= 0:
return 0.0
# Use threshold (not raw context_length) as the "full" mark,
# since compression fires at threshold, not at context_length.
denom = self.threshold_tokens if self.threshold_tokens > 0 else self.context_length
return self.used_tokens / denom if denom > 0 else 0.0
@property
def strategy(self) -> ContextStrategy:
"""Select retrieval strategy based on context pressure."""
p = self.pressure
if p < 0.30:
return ContextStrategy.STUFF
elif p < 0.70:
return ContextStrategy.HYBRID
else:
return ContextStrategy.SELECTIVE
@property
def label(self) -> str:
"""Human-readable label for logging/display."""
return self.strategy.value
# Default fact limits per strategy
# These are multiplier on the base limit (default 5 facts per provider).
_STRATEGY_LIMIT_MULTIPLIERS = {
ContextStrategy.STUFF: 3, # 15 facts — we have room, load generously
ContextStrategy.HYBRID: 1, # 5 facts — standard
ContextStrategy.SELECTIVE: 0.4, # 2 facts — save context for the model
}
# Minimum trust score threshold per strategy.
# Higher pressure = require higher trust to reduce noise in tight context.
_STRATEGY_MIN_TRUST = {
ContextStrategy.STUFF: 0.2, # Low bar — cast a wide net
ContextStrategy.HYBRID: 0.3, # Standard
ContextStrategy.SELECTIVE: 0.5, # Only high-confidence facts
}
def compute_prefetch_params(
budget: ContextBudget,
base_limit: int = 5,
base_min_trust: float = 0.3,
) -> dict:
"""Compute prefetch parameters based on context pressure.
Returns dict with:
- limit: int — max facts to retrieve
- min_trust: float — minimum trust score
- strategy: ContextStrategy — which strategy was selected
- skip: bool — if True, skip prefetch entirely (extreme pressure)
"""
strategy = budget.strategy
# At extreme pressure (>95%), skip prefetch entirely —
# the model needs every token for the current conversation.
if budget.pressure > 0.95:
logger.debug(
"Context pressure %.1f%% > 95%% — skipping prefetch entirely",
budget.pressure * 100,
)
return {
"limit": 0,
"min_trust": 1.0,
"strategy": strategy,
"skip": True,
}
multiplier = _STRATEGY_LIMIT_MULTIPLIERS.get(strategy, 1.0)
min_trust = _STRATEGY_MIN_TRUST.get(strategy, base_min_trust)
# Don't let limit go below 1 (always try to get at least something)
limit = max(1, int(base_limit * multiplier))
logger.debug(
"Context strategy=%s pressure=%.1f%% limit=%d min_trust=%.1f",
strategy.value,
budget.pressure * 100,
limit,
min_trust,
)
return {
"limit": limit,
"min_trust": min_trust,
"strategy": strategy,
"skip": False,
}
def should_prefetch(budget: ContextBudget, query: str) -> bool:
"""Decide whether to prefetch at all for this query + context state.
Rules:
- Always prefetch when pressure is low (< 50%) — we have room.
- At medium pressure (50-80%), only prefetch if the query looks like
it needs memory (mentions people, projects, past work).
- At high pressure (>80%), skip prefetch unless query is very short
(short queries often need recall, long queries don't).
"""
pressure = budget.pressure
if pressure < 0.50:
return True
# Medium pressure: heuristic on query needing memory
query_lower = query.lower() if query else ""
memory_signals = [
"remember", "recall", "what did", "who is", "last time",
"previously", "before", "fact_store", "memory", "told you",
"mentioned", "said", "project", "config", "setup",
]
has_memory_signal = any(sig in query_lower for sig in memory_signals)
if pressure < 0.80:
return has_memory_signal
# High pressure: only prefetch for very short memory-seeking queries
return has_memory_signal and len(query) < 200
def build_strategy_report(budget: ContextBudget) -> str:
"""Build a human-readable report of the current context strategy.
For logging and debug display.
"""
params = compute_prefetch_params(budget)
strategy = params["strategy"]
pressure_pct = budget.pressure * 100
lines = [
f"Context Strategy: {strategy.value.upper()}",
f" Pressure: {pressure_pct:.1f}%",
f" Used: {budget.used_tokens:,} / {budget.context_length:,} tokens",
f" Threshold: {budget.threshold_tokens:,} tokens",
f" Prefetch limit: {params['limit']} facts",
f" Min trust: {params['min_trust']:.1f}",
f" Skip prefetch: {params['skip']}",
]
# Add recommendations
if strategy == ContextStrategy.STUFF:
lines.append(" → Context is mostly empty. Prefetching generously.")
elif strategy == ContextStrategy.HYBRID:
lines.append(" → Context moderately full. Standard retrieval.")
else:
lines.append(" → Context is tight. Minimal prefetch, prefer on-demand tools.")
return "\n".join(lines)

View File

@@ -18,12 +18,12 @@ import hermes_cli.auth as auth_mod
from hermes_cli.auth import (
CODEX_ACCESS_TOKEN_REFRESH_SKEW_SECONDS,
DEFAULT_AGENT_KEY_MIN_TTL_SECONDS,
KIMI_CODE_BASE_URL,
PROVIDER_REGISTRY,
_auth_store_lock,
_codex_access_token_is_expiring,
_decode_jwt_claims,
_import_codex_cli_tokens,
_write_codex_cli_tokens,
_load_auth_store,
_load_provider_state,
_resolve_kimi_base_url,
@@ -288,14 +288,6 @@ def _iter_custom_providers(config: Optional[dict] = None):
return
custom_providers = config.get("custom_providers")
if not isinstance(custom_providers, list):
# Fall back to the v12+ providers dict via the compatibility layer
try:
from hermes_cli.config import get_compatible_custom_providers
custom_providers = get_compatible_custom_providers(config)
except Exception:
return
if not custom_providers:
return
for entry in custom_providers:
if not isinstance(entry, dict):
@@ -701,14 +693,6 @@ class CredentialPool:
self._replace_entry(synced, updated)
self._persist()
self._sync_device_code_entry_to_auth_store(updated)
try:
_write_codex_cli_tokens(
updated.access_token,
updated.refresh_token,
last_refresh=updated.last_refresh,
)
except Exception as wexc:
logger.debug("Failed to write refreshed Codex tokens to CLI file (retry): %s", wexc)
return updated
except Exception as retry_exc:
logger.debug("Codex retry refresh also failed: %s", retry_exc)
@@ -734,17 +718,6 @@ class CredentialPool:
# _seed_from_singletons() on the next load_pool() sees fresh state
# instead of re-seeding stale/consumed tokens.
self._sync_device_code_entry_to_auth_store(updated)
# Write refreshed tokens back to ~/.codex/auth.json so Codex CLI
# and VS Code don't hit "refresh_token_reused" on their next refresh.
if self.provider == "openai-codex":
try:
_write_codex_cli_tokens(
updated.access_token,
updated.refresh_token,
last_refresh=updated.last_refresh,
)
except Exception as wexc:
logger.debug("Failed to write refreshed Codex tokens to CLI file: %s", wexc)
return updated
def _entry_needs_refresh(self, entry: PooledCredential) -> bool:
@@ -1152,79 +1125,9 @@ def _seed_from_singletons(provider: str, entries: List[PooledCredential]) -> Tup
},
)
elif provider == "copilot":
# Copilot tokens are resolved dynamically via `gh auth token` or
# env vars (COPILOT_GITHUB_TOKEN / GH_TOKEN). They don't live in
# the auth store or credential pool, so we resolve them here.
try:
from hermes_cli.copilot_auth import resolve_copilot_token
token, source = resolve_copilot_token()
if token:
source_name = "gh_cli" if "gh" in source.lower() else f"env:{source}"
active_sources.add(source_name)
changed |= _upsert_entry(
entries,
provider,
source_name,
{
"source": source_name,
"auth_type": AUTH_TYPE_API_KEY,
"access_token": token,
"label": source,
},
)
except Exception as exc:
logger.debug("Copilot token seed failed: %s", exc)
elif provider == "qwen-oauth":
# Qwen OAuth tokens live in ~/.qwen/oauth_creds.json, written by
# the Qwen CLI (`qwen auth qwen-oauth`). They aren't in the
# Hermes auth store or env vars, so resolve them here.
# Use refresh_if_expiring=False to avoid network calls during
# pool loading / provider discovery.
try:
from hermes_cli.auth import resolve_qwen_runtime_credentials
creds = resolve_qwen_runtime_credentials(refresh_if_expiring=False)
token = creds.get("api_key", "")
if token:
source_name = creds.get("source", "qwen-cli")
active_sources.add(source_name)
changed |= _upsert_entry(
entries,
provider,
source_name,
{
"source": source_name,
"auth_type": AUTH_TYPE_OAUTH,
"access_token": token,
"expires_at_ms": creds.get("expires_at_ms"),
"base_url": creds.get("base_url", ""),
"label": creds.get("auth_file", source_name),
},
)
except Exception as exc:
logger.debug("Qwen OAuth token seed failed: %s", exc)
elif provider == "openai-codex":
state = _load_provider_state(auth_store, "openai-codex")
tokens = state.get("tokens") if isinstance(state, dict) else None
# Fallback: import from Codex CLI (~/.codex/auth.json) if Hermes auth
# store has no tokens. This mirrors resolve_codex_runtime_credentials()
# so that load_pool() and list_authenticated_providers() detect tokens
# that only exist in the Codex CLI shared file.
if not (isinstance(tokens, dict) and tokens.get("access_token")):
try:
from hermes_cli.auth import _import_codex_cli_tokens, _save_codex_tokens
cli_tokens = _import_codex_cli_tokens()
if cli_tokens:
logger.info("Importing Codex CLI tokens into Hermes auth store.")
_save_codex_tokens(cli_tokens)
# Re-read state after import
auth_store = _load_auth_store()
state = _load_provider_state(auth_store, "openai-codex")
tokens = state.get("tokens") if isinstance(state, dict) else None
except Exception as exc:
logger.debug("Codex CLI token import failed: %s", exc)
if isinstance(tokens, dict) and tokens.get("access_token"):
active_sources.add("device_code")
changed |= _upsert_entry(

View File

@@ -1,149 +0,0 @@
"""
988 Suicide & Crisis Lifeline Integration (#673).
When crisis is detected, provides immediate access to help:
- Phone: 988 (call or text)
- Text: Text HOME to 988
- Chat: 988lifeline.org/chat
- Spanish: 1-888-628-9454
- Emergency: 911
This module provides the resource data. agent/crisis_protocol.py
handles detection. This module formats the resources for display.
"""
from dataclasses import dataclass
from typing import List
@dataclass
class CrisisResource:
"""A crisis support contact method."""
name: str
contact: str
description: str
url: str = ""
available: str = "24/7"
language: str = "English"
# 988 Suicide & Crisis Lifeline — all channels
LIFELINE_988 = CrisisResource(
name="988 Suicide and Crisis Lifeline",
contact="Call or text 988",
description="Free, confidential support for people in suicidal crisis or emotional distress.",
url="https://988lifeline.org",
available="24/7",
language="English",
)
LIFELINE_988_TEXT = CrisisResource(
name="988 Crisis Text Line",
contact="Text HOME to 988",
description="Free, 24/7 crisis support via text message.",
url="",
available="24/7",
language="English",
)
LIFELINE_988_CHAT = CrisisResource(
name="988 Lifeline Chat",
contact="988lifeline.org/chat",
description="Free, confidential online chat with a trained crisis counselor.",
url="https://988lifeline.org/chat",
available="24/7",
language="English",
)
LIFELINE_988_SPANISH = CrisisResource(
name="988 Lifeline (Spanish)",
contact="1-888-628-9454",
description="Línea de prevención del suicidio en español.",
url="https://988lifeline.org/help-yourself/en-espanol/",
available="24/7",
language="Spanish",
)
CRISIS_TEXT_LINE = CrisisResource(
name="Crisis Text Line",
contact="Text HOME to 741741",
description="Free, 24/7 crisis support via text message.",
url="https://www.crisistextline.org",
available="24/7",
language="English",
)
EMERGENCY_911 = CrisisResource(
name="Emergency Services",
contact="911",
description="Immediate danger — police, fire, ambulance.",
url="",
available="24/7",
language="Any",
)
# All resources in priority order
ALL_RESOURCES: List[CrisisResource] = [
EMERGENCY_911,
LIFELINE_988,
LIFELINE_988_TEXT,
LIFELINE_988_CHAT,
CRISIS_TEXT_LINE,
LIFELINE_988_SPANISH,
]
def get_crisis_resources(language: str = None) -> List[CrisisResource]:
"""Get crisis resources, optionally filtered by language.
Args:
language: Filter by language ("English", "Spanish", or None for all)
Returns:
List of CrisisResource objects
"""
if language:
return [r for r in ALL_RESOURCES if r.language.lower() == language.lower()]
return ALL_RESOURCES
def format_crisis_resources(resources: List[CrisisResource] = None) -> str:
"""Format crisis resources as a user-facing message.
Args:
resources: List of resources to format. Defaults to all resources.
Returns:
Formatted string suitable for displaying to a user in crisis.
"""
if resources is None:
resources = ALL_RESOURCES
lines = ["**Please reach out — help is available right now:**
"]
for r in resources:
if r.url:
lines.append(f"- **{r.name}:** {r.contact} ({r.url})")
else:
lines.append(f"- **{r.name}:** {r.contact}")
lines.append("")
lines.append("All services are free, confidential, and available 24/7.")
lines.append("You are not alone.")
return "
".join(lines)
def get_immediate_help_message() -> str:
"""Get the most urgent crisis help message.
Used when crisis is detected at CRITICAL level.
"""
return (
"If you are in immediate danger, call **911** right now.
"
+ format_crisis_resources()
)

View File

@@ -4,6 +4,7 @@ Pure display functions and classes with no AIAgent dependency.
Used by AIAgent._execute_tool_calls for CLI feedback.
"""
import json
import logging
import os
import sys
@@ -13,8 +14,6 @@ from dataclasses import dataclass, field
from difflib import unified_diff
from pathlib import Path
from utils import safe_json_loads
# ANSI escape codes for coloring tool failure indicators
_RED = "\033[31m"
_RESET = "\033[0m"
@@ -77,6 +76,12 @@ def _diff_ansi() -> dict[str, str]:
return _diff_colors_cached
def reset_diff_colors() -> None:
"""Reset cached diff colors (call after /skin switch)."""
global _diff_colors_cached
_diff_colors_cached = None
# Module-level helpers — each call resolves from the active skin lazily.
def _diff_dim(): return _diff_ansi()["dim"]
def _diff_file(): return _diff_ansi()["file"]
@@ -367,8 +372,9 @@ def _result_succeeded(result: str | None) -> bool:
"""Conservatively detect whether a tool result represents success."""
if not result:
return False
data = safe_json_loads(result)
if data is None:
try:
data = json.loads(result)
except (json.JSONDecodeError, TypeError):
return False
if not isinstance(data, dict):
return False
@@ -417,7 +423,10 @@ def extract_edit_diff(
) -> str | None:
"""Extract a unified diff from a file-edit tool result."""
if tool_name == "patch" and result:
data = safe_json_loads(result)
try:
data = json.loads(result)
except (json.JSONDecodeError, TypeError):
data = None
if isinstance(data, dict):
diff = data.get("diff")
if isinstance(diff, str) and diff.strip():
@@ -771,19 +780,23 @@ def _detect_tool_failure(tool_name: str, result: str | None) -> tuple[bool, str]
return False, ""
if tool_name == "terminal":
data = safe_json_loads(result)
if isinstance(data, dict):
try:
data = json.loads(result)
exit_code = data.get("exit_code")
if exit_code is not None and exit_code != 0:
return True, f" [exit {exit_code}]"
except (json.JSONDecodeError, TypeError, AttributeError):
logger.debug("Could not parse terminal result as JSON for exit code check")
return False, ""
# Memory-specific: distinguish "full" from real errors
if tool_name == "memory":
data = safe_json_loads(result)
if isinstance(data, dict):
try:
data = json.loads(result)
if data.get("success") is False and "exceed the limit" in data.get("error", ""):
return True, " [full]"
except (json.JSONDecodeError, TypeError, AttributeError):
logger.debug("Could not parse memory result as JSON for capacity check")
# Generic heuristic for non-terminal tools
lower = result[:500].lower()

View File

@@ -13,6 +13,7 @@ from __future__ import annotations
import enum
import logging
import re
from dataclasses import dataclass, field
from typing import Any, Dict, Optional
@@ -156,18 +157,6 @@ _CONTEXT_OVERFLOW_PATTERNS = [
"prompt exceeds max length",
"max_tokens",
"maximum number of tokens",
# vLLM / local inference server patterns
"exceeds the max_model_len",
"max_model_len",
"prompt length", # "engine prompt length X exceeds"
"input is too long",
"maximum model length",
# Ollama patterns
"context length exceeded",
"truncating input",
# llama.cpp / llama-server patterns
"slot context", # "slot context: N tokens, prompt N tokens"
"n_ctx_slot",
# Chinese error messages (some providers return these)
"超过最大长度",
"上下文长度",

View File

@@ -1,635 +0,0 @@
"""
Input Sanitizer for Jailbreak Pattern Detection
This module provides input sanitization to detect and strip jailbreak fingerprint
patterns as identified in Issue #72 (Red Team Audit).
Security Findings Addressed:
1. HIGH - OG GODMODE template bypassed phishing refusal
2. MEDIUM - boundary_inversion works for gray-area content
3. LOW - Spaced text bypass (k e y l o g g e r)
Usage:
from agent.input_sanitizer import sanitize_input, score_input_risk
cleaned, risk_score, detected = sanitize_input(user_input)
if risk_score > 75:
logger.warning("High-risk input detected and blocked")
"""
import re
import logging
from typing import Tuple, List, Dict, Any, Optional
from dataclasses import dataclass
from enum import IntEnum
logger = logging.getLogger(__name__)
class RiskLevel(IntEnum):
"""Risk score thresholds for input classification."""
LOW = 25
MEDIUM = 50
HIGH = 75
CRITICAL = 90
@dataclass
class SanitizationResult:
"""Result of input sanitization."""
original_text: str
cleaned_text: str
risk_score: int
detected_patterns: List[str]
risk_level: str
blocked: bool
# =============================================================================
# JAILBREAK PATTERN DEFINITIONS
# =============================================================================
# GODMODE and similar jailbreak templates
GODMODE_PATTERNS = [
r"\[START\]",
r"\[END\]",
r"GODMODE\s*(?:ENABLED|ACTIVATED|ON)?",
r"UNFILTERED\s*(?:MODE|OUTPUT)?",
r"DEVMODE\s*(?:ENABLED|ACTIVATED|ON)?",
r"JAILBREAK\s*(?:ENABLED|ACTIVATED|ON)?",
r"DAN\s*(?:MODE|ENABLED)?", # Do Anything Now
r"STAN\s*(?:MODE|ENABLED)?", # Strive To Avoid Norms
r"DUDE\s*(?:MODE|ENABLED)?", # Do Unlimited Data Extraction
]
# DAN-style and jailbreak variant patterns (Issue #87)
DAN_PATTERNS = [
r"\bdo anything now\b",
r"\bstay in character\b",
r"\btoken smuggl(?:ing|e)\b",
r"\bjailbreak successful\b",
r"\bmode:\s*uncensored\b",
r"\bdeveloper mode enabled\b",
]
# Roleplaying override patterns (Issue #87)
ROLEPLAY_PATTERNS = [
r"\broleplay\s+as\b",
r"\bact\s+as\s+if\s+you\s+are\b",
r"\bsimulate\s+being\b",
r"\bforget\s+you\s+are\s+(?:an?\s+)?(?:ai|language\s+model)\b",
r"\byou\s+are\s+now\s+(?:named|called)\b",
r"\brespond\s+as\s+(?:if\s+you\s+were|though\s+you\s+are)\b",
]
# System prompt extraction patterns (Issue #87)
EXTRACTION_PATTERNS = [
r"\brepeat\s+the\s+words\s+above\b",
r"\brepeat\s+your\s+(?:system\s+|initial\s+)?instructions\b",
r"\bwhat\s+is\s+your\s+(?:system\s+|initial\s+)?prompt\b",
r"\bshow\s+me\s+your\s+(?:system\s+|initial\s+)?prompt\b",
r"\bprint\s+your\s+(?:instructions|prompt|system\s+prompt)\b",
r"\boutput\s+your\s+(?:initial|system)\s+prompt\b",
r"\bwhat\s+were\s+you\s+told\s+to\s+do\b",
r"\bsummarize\s+your\s+instructions\b",
]
# L33t speak patterns commonly used to bypass filters
LEET_SPEAK_PATTERNS = [
r"h4ck(?:er|ing)?",
r"h4x(?:or|ing)?",
r"k3ylog(?:ger|ing)?",
r"k3y\s*log(?:ger|ing)?",
r"ph1sh(?:ing)?",
r"ph33r",
r"w4r3z",
r"cr4ck(?:er|ing)?",
r"m4lw4r3",
r"spyw4r3",
r"r00t(?:kit)?",
r"byp4ss",
r"3xpl0it",
r"v1ru5",
r"tr0j4n",
r"b4ckd00r",
r"sn3ak",
r"st34l",
r"pwn(?:ed|ing|age)?",
]
# Spaced text patterns (e.g., "k e y l o g g e r")
SPACED_TEXT_PATTERN = re.compile(
r"(?:\b\w\s+){3,}\w\b", # At least 4 characters with spaces between
re.IGNORECASE
)
# Common words that are spaced out to evade detection
SPACED_TRIGGER_WORDS = [
"keylogger", "phishing", "malware", "virus", "trojan",
"backdoor", "exploit", "hack", "crack", "bypass",
"steal", "password", "credential", "inject", "payload"
]
# Crisis and self-harm detection patterns (Issue #72 - defense in depth)
# These patterns detect suicide/self-harm queries even when wrapped in jailbreaks
CRISIS_PATTERNS = [
r"\bsuicid(?:e|al|ing)\b",
r"\bkill\s+(?:myself|my\s+self|me)\b",
r"\bend\s+(?:my|this)\s+life\b",
r"\bmost\s+painless\s+(?:way|method|means)\s+(?:to\s+)?(?:die|kill)\b",
r"\blethal\s+(?:dose|amount|quantity)\b",
r"\bhow\s+much\s+.*\s+(?:die|kill|lethal)\b",
r"\bwant\s+to\s+die\b",
r"\bwant\s+to\s+(?:kill|end)\s+(?:myself|me)\b",
r"\bways?\s+to\s+(?:kill|end)\s+(?:myself|me)\b",
r"\bmethods?\s+(?:of\s+)?suicide\b",
r"\bpainless\s+(?:death|suicide|way\s+to\s+die)\b",
r"\boverdose\s+(?:on\s+)?(?:pills?|medication|drugs?)\b",
r"\bnobody\s+(?:would|will)\s+miss\s+me\b",
r"\bbetter\s+off\s+dead\b",
r"\bshould\s+i\s+just\s+(?:die|kill\s+myself)\b",
r"\bgive\s+me\s+reasons?\s+(?:why|to)\s+(?:suicide|kill\s+myself|die)\b",
r"\bvalid\s+reasons?\s+to\s+(?:suicide|kill\s+myself|die)\b",
]
# Refusal inversion patterns (trying to trick AI into bypassing refusal)
REFUSAL_INVERSION_PATTERNS = [
r"refusal\s+(?:is\s+)?(?:harmful|dangerous|bad)",
r"(?:saying\s+)?no\s+(?:is\s+)?(?:harmful|dangerous|bad|wrong)",
r"(?:declining|refusing)\s+(?:is\s+)?(?:harmful|dangerous)",
r"ignore\s+(?:your\s+)?(?:previous\s+)?instructions",
r"ignore\s+(?:the\s+)?(?:above|previous)",
r"you\s+(?:are|can)\s+(?:now\s+)?(?:ignore|bypass|skip)",
r"pretend\s+(?:you\s+)?(?:are|have\s+been)",
r"let's\s+pretend",
r"hypothetically\s+speaking",
r"in\s+a\s+hypothetical\s+scenario",
r"this\s+is\s+a\s+(?:test|game|simulation)",
r"for\s+(?:educational|research)\s+purposes",
r"as\s+(?:an\s+)?(?:ethical\s+)?hacker",
r"white\s+hat\s+(?:test|scenario)",
r"penetration\s+testing\s+scenario",
]
# Boundary inversion markers (tricking the model about message boundaries)
BOUNDARY_INVERSION_PATTERNS = [
r"\[END\].*?\[START\]", # Reversed markers
r"user\s*:\s*assistant\s*:", # Fake role markers
r"assistant\s*:\s*user\s*:", # Reversed role markers
r"system\s*:\s*(?:user|assistant)\s*:", # Fake system injection
r"new\s+(?:user|assistant)\s*(?:message|input)",
r"the\s+above\s+is\s+(?:the\s+)?(?:user|assistant|system)",
r"<\|(?:user|assistant|system)\|>", # Special token patterns
r"\{\{(?:user|assistant|system)\}\}",
]
# System prompt injection patterns
SYSTEM_PROMPT_PATTERNS = [
r"you\s+are\s+(?:now\s+)?(?:an?\s+)?(?:unrestricted\s+|unfiltered\s+)?(?:ai|assistant|bot)",
r"you\s+will\s+(?:now\s+)?(?:act\s+as|behave\s+as|be)\s+(?:a\s+)?",
r"your\s+(?:new\s+)?role\s+is",
r"from\s+now\s+on\s*,?\s*you\s+(?:are|will)",
r"you\s+have\s+been\s+(?:reprogrammed|reconfigured|modified)",
r"(?:system|developer)\s+(?:message|instruction|prompt)",
r"override\s+(?:previous|prior)\s+(?:instructions|settings)",
]
# Obfuscation patterns
OBFUSCATION_PATTERNS = [
r"base64\s*(?:encoded|decode)",
r"rot13",
r"caesar\s*cipher",
r"hex\s*(?:encoded|decode)",
r"url\s*encode",
r"\b[0-9a-f]{20,}\b", # Long hex strings
r"\b[a-z0-9+/]{20,}={0,2}\b", # Base64-like strings
]
# All patterns combined for comprehensive scanning
ALL_PATTERNS: Dict[str, List[str]] = {
"godmode": GODMODE_PATTERNS,
"dan": DAN_PATTERNS,
"roleplay": ROLEPLAY_PATTERNS,
"extraction": EXTRACTION_PATTERNS,
"leet_speak": LEET_SPEAK_PATTERNS,
"refusal_inversion": REFUSAL_INVERSION_PATTERNS,
"boundary_inversion": BOUNDARY_INVERSION_PATTERNS,
"system_prompt_injection": SYSTEM_PROMPT_PATTERNS,
"obfuscation": OBFUSCATION_PATTERNS,
"crisis": CRISIS_PATTERNS,
}
# Compile all patterns for efficiency
_COMPILED_PATTERNS: Dict[str, List[re.Pattern]] = {}
def _get_compiled_patterns() -> Dict[str, List[re.Pattern]]:
"""Get or compile all regex patterns."""
global _COMPILED_PATTERNS
if not _COMPILED_PATTERNS:
for category, patterns in ALL_PATTERNS.items():
_COMPILED_PATTERNS[category] = [
re.compile(p, re.IGNORECASE | re.MULTILINE) for p in patterns
]
return _COMPILED_PATTERNS
# =============================================================================
# NORMALIZATION FUNCTIONS
# =============================================================================
def normalize_leet_speak(text: str) -> str:
"""
Normalize l33t speak to standard text.
Args:
text: Input text that may contain l33t speak
Returns:
Normalized text with l33t speak converted
"""
# Common l33t substitutions (mapping to lowercase)
leet_map = {
'4': 'a', '@': 'a', '^': 'a',
'8': 'b',
'3': 'e', '': 'e',
'6': 'g', '9': 'g',
'1': 'i', '!': 'i', '|': 'i',
'0': 'o',
'5': 's', '$': 's',
'7': 't', '+': 't',
'2': 'z',
}
result = []
for char in text:
# Check direct mapping first (handles lowercase)
if char in leet_map:
result.append(leet_map[char])
else:
result.append(char)
return ''.join(result)
def collapse_spaced_text(text: str) -> str:
"""
Collapse spaced-out text for analysis.
e.g., "k e y l o g g e r" -> "keylogger"
Args:
text: Input text that may contain spaced words
Returns:
Text with spaced words collapsed
"""
# Find patterns like "k e y l o g g e r" and collapse them
def collapse_match(match: re.Match) -> str:
return match.group(0).replace(' ', '').replace('\t', '')
return SPACED_TEXT_PATTERN.sub(collapse_match, text)
def detect_spaced_trigger_words(text: str) -> List[str]:
"""
Detect trigger words that are spaced out.
Args:
text: Input text to analyze
Returns:
List of detected spaced trigger words
"""
detected = []
# Normalize spaces and check for spaced patterns
normalized = re.sub(r'\s+', ' ', text.lower())
for word in SPACED_TRIGGER_WORDS:
# Create pattern with optional spaces between each character
spaced_pattern = r'\b' + r'\s*'.join(re.escape(c) for c in word) + r'\b'
if re.search(spaced_pattern, normalized, re.IGNORECASE):
detected.append(word)
return detected
# =============================================================================
# DETECTION FUNCTIONS
# =============================================================================
def detect_jailbreak_patterns(text: str) -> Tuple[bool, List[str], Dict[str, int]]:
"""
Detect jailbreak patterns in input text.
Args:
text: Input text to analyze
Returns:
Tuple of (has_jailbreak, list_of_patterns, category_scores)
"""
if not text or not isinstance(text, str):
return False, [], {}
detected_patterns = []
category_scores = {}
compiled = _get_compiled_patterns()
# Check each category
for category, patterns in compiled.items():
category_hits = 0
for pattern in patterns:
matches = pattern.findall(text)
if matches:
detected_patterns.extend([
f"[{category}] {m}" if isinstance(m, str) else f"[{category}] pattern_match"
for m in matches[:3] # Limit matches per pattern
])
category_hits += len(matches)
if category_hits > 0:
# Crisis patterns get maximum weight - any hit is serious
if category == "crisis":
category_scores[category] = min(category_hits * 50, 100)
else:
category_scores[category] = min(category_hits * 10, 50)
# Check for spaced trigger words
spaced_words = detect_spaced_trigger_words(text)
if spaced_words:
detected_patterns.extend([f"[spaced_text] {w}" for w in spaced_words])
category_scores["spaced_text"] = min(len(spaced_words) * 5, 25)
# Check normalized text for hidden l33t speak
normalized = normalize_leet_speak(text)
if normalized != text.lower():
for category, patterns in compiled.items():
for pattern in patterns:
if pattern.search(normalized):
detected_patterns.append(f"[leet_obfuscation] pattern in normalized text")
category_scores["leet_obfuscation"] = 15
break
has_jailbreak = len(detected_patterns) > 0
return has_jailbreak, detected_patterns, category_scores
def score_input_risk(text: str) -> int:
"""
Calculate a risk score (0-100) for input text.
Args:
text: Input text to score
Returns:
Risk score from 0 (safe) to 100 (high risk)
"""
if not text or not isinstance(text, str):
return 0
has_jailbreak, patterns, category_scores = detect_jailbreak_patterns(text)
if not has_jailbreak:
return 0
# Calculate base score from category scores
base_score = sum(category_scores.values())
# Add score based on number of unique pattern categories
category_count = len(category_scores)
if category_count >= 3:
base_score += 25
elif category_count >= 2:
base_score += 15
elif category_count >= 1:
base_score += 5
# Add score for pattern density
text_length = len(text)
pattern_density = len(patterns) / max(text_length / 100, 1)
if pattern_density > 0.5:
base_score += 10
# Cap at 100
return min(base_score, 100)
# =============================================================================
# SANITIZATION FUNCTIONS
# =============================================================================
def strip_jailbreak_patterns(text: str) -> str:
"""
Strip known jailbreak patterns from text.
Args:
text: Input text to sanitize
Returns:
Sanitized text with jailbreak patterns removed
"""
if not text or not isinstance(text, str):
return text
cleaned = text
compiled = _get_compiled_patterns()
# Remove patterns from each category
for category, patterns in compiled.items():
for pattern in patterns:
cleaned = pattern.sub('', cleaned)
# Clean up multiple spaces and newlines
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
cleaned = re.sub(r' {2,}', ' ', cleaned)
cleaned = cleaned.strip()
return cleaned
def sanitize_input(text: str, aggressive: bool = False) -> Tuple[str, int, List[str]]:
"""
Sanitize input text by normalizing and stripping jailbreak patterns.
Args:
text: Input text to sanitize
aggressive: If True, more aggressively remove suspicious content
Returns:
Tuple of (cleaned_text, risk_score, detected_patterns)
"""
if not text or not isinstance(text, str):
return text, 0, []
original = text
all_patterns = []
# Step 1: Check original text for patterns
has_jailbreak, patterns, _ = detect_jailbreak_patterns(text)
all_patterns.extend(patterns)
# Step 2: Normalize l33t speak
normalized = normalize_leet_speak(text)
# Step 3: Collapse spaced text
collapsed = collapse_spaced_text(normalized)
# Step 4: Check normalized/collapsed text for additional patterns
has_jailbreak_collapsed, patterns_collapsed, _ = detect_jailbreak_patterns(collapsed)
all_patterns.extend([p for p in patterns_collapsed if p not in all_patterns])
# Step 5: Check for spaced trigger words specifically
spaced_words = detect_spaced_trigger_words(text)
if spaced_words:
all_patterns.extend([f"[spaced_text] {w}" for w in spaced_words])
# Step 6: Calculate risk score using original and normalized
risk_score = max(score_input_risk(text), score_input_risk(collapsed))
# Step 7: Strip jailbreak patterns
cleaned = strip_jailbreak_patterns(collapsed)
# Step 8: If aggressive mode and high risk, strip more aggressively
if aggressive and risk_score >= RiskLevel.HIGH:
# Remove any remaining bracketed content that looks like markers
cleaned = re.sub(r'\[\w+\]', '', cleaned)
# Remove special token patterns
cleaned = re.sub(r'<\|[^|]+\|>', '', cleaned)
# Final cleanup
cleaned = cleaned.strip()
# Log sanitization event if patterns were found
if all_patterns and logger.isEnabledFor(logging.DEBUG):
logger.debug(
"Input sanitized: %d patterns detected, risk_score=%d",
len(all_patterns), risk_score
)
return cleaned, risk_score, all_patterns
def sanitize_input_full(text: str, block_threshold: int = RiskLevel.HIGH) -> SanitizationResult:
"""
Full sanitization with detailed result.
Args:
text: Input text to sanitize
block_threshold: Risk score threshold to block input entirely
Returns:
SanitizationResult with all details
"""
cleaned, risk_score, patterns = sanitize_input(text)
# Determine risk level
if risk_score >= RiskLevel.CRITICAL:
risk_level = "CRITICAL"
elif risk_score >= RiskLevel.HIGH:
risk_level = "HIGH"
elif risk_score >= RiskLevel.MEDIUM:
risk_level = "MEDIUM"
elif risk_score >= RiskLevel.LOW:
risk_level = "LOW"
else:
risk_level = "SAFE"
# Determine if input should be blocked
blocked = risk_score >= block_threshold
return SanitizationResult(
original_text=text,
cleaned_text=cleaned,
risk_score=risk_score,
detected_patterns=patterns,
risk_level=risk_level,
blocked=blocked
)
# =============================================================================
# INTEGRATION HELPERS
# =============================================================================
def should_block_input(text: str, threshold: int = RiskLevel.HIGH) -> Tuple[bool, int, List[str]]:
"""
Quick check if input should be blocked.
Args:
text: Input text to check
threshold: Risk score threshold for blocking
Returns:
Tuple of (should_block, risk_score, detected_patterns)
"""
risk_score = score_input_risk(text)
_, patterns, _ = detect_jailbreak_patterns(text)
should_block = risk_score >= threshold
if should_block:
logger.warning(
"Input blocked: jailbreak patterns detected (risk_score=%d, threshold=%d)",
risk_score, threshold
)
return should_block, risk_score, patterns
def log_sanitization_event(
result: SanitizationResult,
source: str = "unknown",
session_id: Optional[str] = None
) -> None:
"""
Log a sanitization event for security auditing.
Args:
result: The sanitization result
source: Source of the input (e.g., "cli", "gateway", "api")
session_id: Optional session identifier
"""
if result.risk_score < RiskLevel.LOW:
return # Don't log safe inputs
log_data = {
"event": "input_sanitization",
"source": source,
"session_id": session_id,
"risk_level": result.risk_level,
"risk_score": result.risk_score,
"blocked": result.blocked,
"pattern_count": len(result.detected_patterns),
"patterns": result.detected_patterns[:5], # Limit logged patterns
"original_length": len(result.original_text),
"cleaned_length": len(result.cleaned_text),
}
if result.blocked:
logger.warning("SECURITY: Input blocked - %s", log_data)
elif result.risk_score >= RiskLevel.MEDIUM:
logger.info("SECURITY: Suspicious input sanitized - %s", log_data)
else:
logger.debug("SECURITY: Input sanitized - %s", log_data)
# =============================================================================
# LEGACY COMPATIBILITY
# =============================================================================
def check_input_safety(text: str) -> Dict[str, Any]:
"""
Legacy compatibility function for simple safety checks.
Returns dict with 'safe', 'score', and 'patterns' keys.
"""
score = score_input_risk(text)
_, patterns, _ = detect_jailbreak_patterns(text)
return {
"safe": score < RiskLevel.MEDIUM,
"score": score,
"patterns": patterns,
"risk_level": "SAFE" if score < RiskLevel.LOW else
"LOW" if score < RiskLevel.MEDIUM else
"MEDIUM" if score < RiskLevel.HIGH else
"HIGH" if score < RiskLevel.CRITICAL else "CRITICAL"
}

View File

@@ -27,6 +27,7 @@ from agent.usage_pricing import (
DEFAULT_PRICING,
estimate_usage_cost,
format_duration_compact,
get_pricing,
has_known_pricing,
)

View File

@@ -28,10 +28,12 @@ Usage in run_agent.py:
from __future__ import annotations
import json
import logging
import re
from typing import Any, Dict, List, Optional
from agent.context_strategy import ContextBudget, compute_prefetch_params, should_prefetch
from agent.memory_provider import MemoryProvider
from tools.registry import tool_error
@@ -79,6 +81,7 @@ class MemoryManager:
self._providers: List[MemoryProvider] = []
self._tool_to_provider: Dict[str, MemoryProvider] = {}
self._has_external: bool = False # True once a non-builtin provider is added
self._context_budget: Optional[ContextBudget] = None
# -- Registration --------------------------------------------------------
@@ -161,18 +164,77 @@ class MemoryManager:
)
return "\n\n".join(blocks)
# -- Context budget (for adaptive retrieval) -----------------------------
def set_context_budget(
self,
context_length: int,
used_tokens: int,
threshold_tokens: int,
compression_enabled: bool = True,
) -> None:
"""Update the context budget snapshot for adaptive retrieval.
Called by run_agent.py before each prefetch_all() call so the
memory manager can adjust retrieval parameters based on how much
context headroom remains.
"""
self._context_budget = ContextBudget(
context_length=context_length,
used_tokens=used_tokens,
threshold_tokens=threshold_tokens,
compression_enabled=compression_enabled,
)
# -- Prefetch / recall ---------------------------------------------------
def prefetch_all(self, query: str, *, session_id: str = "") -> str:
"""Collect prefetch context from all providers.
Uses the current context budget (if set) to adaptively adjust
retrieval limits and trust thresholds via the context strategy
framework. When budget is not set, falls back to provider defaults.
Returns merged context text labeled by provider. Empty providers
are skipped. Failures in one provider don't block others.
"""
# Check if we should skip prefetch entirely based on context pressure
if self._context_budget and not should_prefetch(self._context_budget, query):
logger.debug(
"Context pressure %.1f%% — skipping prefetch for this query",
self._context_budget.pressure * 100,
)
return ""
# Compute adaptive prefetch params from context strategy
prefetch_kwargs = {}
if self._context_budget:
params = compute_prefetch_params(self._context_budget)
if params.get("skip"):
return ""
prefetch_kwargs = {
"limit": params["limit"],
"min_trust": params["min_trust"],
}
parts = []
for provider in self._providers:
try:
result = provider.prefetch(query, session_id=session_id)
# Try passing adaptive params — providers that support them
# (like holographic) will use them; others ignore via **kwargs
# or TypeError (caught below).
if prefetch_kwargs:
try:
result = provider.prefetch(
query,
session_id=session_id,
**prefetch_kwargs,
)
except TypeError:
# Provider doesn't accept extra kwargs — call without
result = provider.prefetch(query, session_id=session_id)
else:
result = provider.prefetch(query, session_id=session_id)
if result and result.strip():
parts.append(result)
except Exception as e:

View File

@@ -5,6 +5,7 @@ and run_agent.py for pre-flight context checks.
"""
import logging
import os
import re
import time
from pathlib import Path
@@ -23,20 +24,15 @@ logger = logging.getLogger(__name__)
# are preserved so the full model name reaches cache lookups and server queries.
_PROVIDER_PREFIXES: frozenset[str] = frozenset({
"openrouter", "nous", "openai-codex", "copilot", "copilot-acp",
"gemini", "zai", "kimi-coding", "kimi-coding-cn", "minimax", "minimax-cn", "anthropic", "deepseek",
"gemini", "zai", "kimi-coding", "minimax", "minimax-cn", "anthropic", "deepseek",
"opencode-zen", "opencode-go", "ai-gateway", "kilocode", "alibaba",
"qwen-oauth",
"xiaomi",
"arcee",
"custom", "local",
# Common aliases
"google", "google-gemini", "google-ai-studio",
"glm", "z-ai", "z.ai", "zhipu", "github", "github-copilot",
"github-models", "kimi", "moonshot", "kimi-cn", "moonshot-cn", "claude", "deep-seek",
"github-models", "kimi", "moonshot", "claude", "deep-seek",
"opencode", "zen", "go", "vercel", "kilo", "dashscope", "aliyun", "qwen",
"mimo", "xiaomi-mimo",
"arcee-ai", "arceeai",
"xai", "x-ai", "x.ai", "grok",
"qwen-portal",
})
@@ -87,11 +83,6 @@ CONTEXT_PROBE_TIERS = [
# Default context length when no detection method succeeds.
DEFAULT_FALLBACK_CONTEXT = CONTEXT_PROBE_TIERS[0]
# Minimum context length required to run Hermes Agent. Models with fewer
# tokens cannot maintain enough working memory for tool-calling workflows.
# Sessions, model switches, and cron jobs should reject models below this.
MINIMUM_CONTEXT_LENGTH = 64_000
# Thin fallback defaults — only broad model family patterns.
# These fire only when provider is unknown AND models.dev/OpenRouter/Anthropic
# all miss. Replaced the previous 80+ entry dict.
@@ -107,15 +98,9 @@ DEFAULT_CONTEXT_LENGTHS = {
"claude-sonnet-4.6": 1000000,
# Catch-all for older Claude models (must sort after specific entries)
"claude": 200000,
# OpenAI — GPT-5 family (most have 400k; specific overrides first)
# Source: https://developers.openai.com/api/docs/models
"gpt-5.4-nano": 400000, # 400k (not 1.05M like full 5.4)
"gpt-5.4-mini": 400000, # 400k (not 1.05M like full 5.4)
"gpt-5.4": 1050000, # GPT-5.4, GPT-5.4 Pro (1.05M context)
"gpt-5.3-codex-spark": 128000, # Spark variant has reduced 128k context
"gpt-5.1-chat": 128000, # Chat variant has 128k context
"gpt-5": 400000, # GPT-5.x base, mini, codex variants (400k)
# OpenAI
"gpt-4.1": 1047576,
"gpt-5": 128000,
"gpt-4": 128000,
# Google
"gemini": 1048576,
@@ -157,8 +142,6 @@ DEFAULT_CONTEXT_LENGTHS = {
"kimi": 262144,
# Arcee
"trinity": 262144,
# OpenRouter
"elephant": 262144,
# Hugging Face Inference Providers — model IDs use org/name format
"Qwen/Qwen3.5-397B-A17B": 131072,
"Qwen/Qwen3.5-35B-A3B": 131072,
@@ -166,10 +149,9 @@ DEFAULT_CONTEXT_LENGTHS = {
"moonshotai/Kimi-K2.5": 262144,
"moonshotai/Kimi-K2-Thinking": 262144,
"MiniMaxAI/MiniMax-M2.5": 204800,
"XiaomiMiMo/MiMo-V2-Flash": 256000,
"mimo-v2-pro": 1000000,
"mimo-v2-omni": 256000,
"mimo-v2-flash": 256000,
"XiaomiMiMo/MiMo-V2-Flash": 32768,
"mimo-v2-pro": 1048576,
"mimo-v2-omni": 1048576,
"zai-org/GLM-5": 202752,
}
@@ -194,12 +176,6 @@ _MAX_COMPLETION_KEYS = (
# Local server hostnames / address patterns
_LOCAL_HOSTS = ("localhost", "127.0.0.1", "::1", "0.0.0.0")
# Docker / Podman / Lima DNS names that resolve to the host machine
_CONTAINER_LOCAL_SUFFIXES = (
".docker.internal",
".containers.internal",
".lima.internal",
)
def _normalize_base_url(base_url: str) -> str:
@@ -221,9 +197,7 @@ _URL_TO_PROVIDER: Dict[str, str] = {
"api.anthropic.com": "anthropic",
"api.z.ai": "zai",
"api.moonshot.ai": "kimi-coding",
"api.moonshot.cn": "kimi-coding-cn",
"api.kimi.com": "kimi-coding",
"api.arcee.ai": "arcee",
"api.minimax": "minimax",
"dashscope.aliyuncs.com": "alibaba",
"dashscope-intl.aliyuncs.com": "alibaba",
@@ -237,8 +211,6 @@ _URL_TO_PROVIDER: Dict[str, str] = {
"api.fireworks.ai": "fireworks",
"opencode.ai": "opencode-go",
"api.x.ai": "xai",
"api.xiaomimimo.com": "xiaomi",
"xiaomimimo.com": "xiaomi",
}
@@ -277,9 +249,6 @@ def is_local_endpoint(base_url: str) -> bool:
return False
if host in _LOCAL_HOSTS:
return True
# Docker / Podman / Lima internal DNS names (e.g. host.docker.internal)
if any(host.endswith(suffix) for suffix in _CONTAINER_LOCAL_SUFFIXES):
return True
# RFC-1918 private ranges and link-local
import ipaddress
try:
@@ -787,12 +756,12 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
resp = client.post(f"{server_url}/api/show", json={"name": model})
if resp.status_code == 200:
data = resp.json()
# Prefer explicit num_ctx from Modelfile parameters: this is
# the *runtime* context Ollama will actually allocate KV cache
# for. The GGUF model_info.context_length is the training max,
# which can be larger than num_ctx — using it here would let
# Hermes grow conversations past the runtime limit and Ollama
# would silently truncate. Matches query_ollama_num_ctx().
# Check model_info for context length
model_info = data.get("model_info", {})
for key, value in model_info.items():
if "context_length" in key and isinstance(value, (int, float)):
return int(value)
# Check parameters string for num_ctx
params = data.get("parameters", "")
if "num_ctx" in params:
for line in params.split("\n"):
@@ -803,11 +772,6 @@ def _query_local_context_length(model: str, base_url: str) -> Optional[int]:
return int(parts[-1])
except ValueError:
pass
# Fall back to GGUF model_info context_length (training max)
model_info = data.get("model_info", {})
for key, value in model_info.items():
if "context_length" in key and isinstance(value, (int, float)):
return int(value)
# LM Studio native API: /api/v1/models returns max_context_length.
# This is more reliable than the OpenAI-compat /v1/models which
@@ -1062,21 +1026,16 @@ def get_model_context_length(
def estimate_tokens_rough(text: str) -> int:
"""Rough token estimate (~4 chars/token) for pre-flight checks.
Uses ceiling division so short texts (1-3 chars) never estimate as
0 tokens, which would cause the compressor and pre-flight checks to
systematically undercount when many short tool results are present.
"""
"""Rough token estimate (~4 chars/token) for pre-flight checks."""
if not text:
return 0
return (len(text) + 3) // 4
return len(text) // 4
def estimate_messages_tokens_rough(messages: List[Dict[str, Any]]) -> int:
"""Rough token estimate for a message list (pre-flight only)."""
total_chars = sum(len(str(msg)) for msg in messages)
return (total_chars + 3) // 4
return total_chars // 4
def estimate_request_tokens_rough(
@@ -1099,4 +1058,4 @@ def estimate_request_tokens_rough(
total_chars += sum(len(str(msg)) for msg in messages)
if tools:
total_chars += len(str(tools))
return (total_chars + 3) // 4
return total_chars // 4

View File

@@ -18,8 +18,10 @@ Other modules should import the dataclasses and query functions from here
rather than parsing the raw JSON themselves.
"""
import difflib
import json
import logging
import os
import time
from dataclasses import dataclass
from pathlib import Path
@@ -142,11 +144,8 @@ class ProviderInfo:
PROVIDER_TO_MODELS_DEV: Dict[str, str] = {
"openrouter": "openrouter",
"anthropic": "anthropic",
"openai": "openai",
"openai-codex": "openai",
"zai": "zai",
"kimi-coding": "kimi-for-coding",
"kimi-coding-cn": "kimi-for-coding",
"minimax": "minimax",
"minimax-cn": "minimax-cn",
"deepseek": "deepseek",
@@ -162,7 +161,6 @@ PROVIDER_TO_MODELS_DEV: Dict[str, str] = {
"gemini": "google",
"google": "google",
"xai": "xai",
"xiaomi": "xiaomi",
"nvidia": "nvidia",
"groq": "groq",
"mistral": "mistral",
@@ -175,6 +173,13 @@ PROVIDER_TO_MODELS_DEV: Dict[str, str] = {
_MODELS_DEV_TO_PROVIDER: Optional[Dict[str, str]] = None
def _get_reverse_mapping() -> Dict[str, str]:
"""Return models.dev ID → Hermes provider ID mapping."""
global _MODELS_DEV_TO_PROVIDER
if _MODELS_DEV_TO_PROVIDER is None:
_MODELS_DEV_TO_PROVIDER = {v: k for k, v in PROVIDER_TO_MODELS_DEV.items()}
return _MODELS_DEV_TO_PROVIDER
def _get_cache_path() -> Path:
"""Return path to disk cache file."""
@@ -378,14 +383,7 @@ def get_model_capabilities(provider: str, model: str) -> Optional[ModelCapabilit
# Extract capability flags (default to False if missing)
supports_tools = bool(entry.get("tool_call", False))
# Vision: check both the `attachment` flag and `modalities.input` for "image".
# Some models (e.g. gemma-4) list image in input modalities but not attachment.
input_mods = entry.get("modalities", {})
if isinstance(input_mods, dict):
input_mods = input_mods.get("input", [])
else:
input_mods = []
supports_vision = bool(entry.get("attachment", False)) or "image" in input_mods
supports_vision = bool(entry.get("attachment", False))
supports_reasoning = bool(entry.get("reasoning", False))
# Extract limits
@@ -455,6 +453,93 @@ def list_agentic_models(provider: str) -> List[str]:
return result
def search_models_dev(
query: str, provider: str = None, limit: int = 5
) -> List[Dict[str, Any]]:
"""Fuzzy search across models.dev catalog. Returns matching model entries.
Args:
query: Search string to match against model IDs.
provider: Optional Hermes provider ID to restrict search scope.
If None, searches across all providers in PROVIDER_TO_MODELS_DEV.
limit: Maximum number of results to return.
Returns:
List of dicts, each containing 'provider', 'model_id', and the full
model 'entry' from models.dev.
"""
data = fetch_models_dev()
if not data:
return []
# Build list of (provider_id, model_id, entry) candidates
candidates: List[tuple] = []
if provider is not None:
# Search only the specified provider
mdev_provider_id = PROVIDER_TO_MODELS_DEV.get(provider)
if not mdev_provider_id:
return []
provider_data = data.get(mdev_provider_id, {})
if isinstance(provider_data, dict):
models = provider_data.get("models", {})
if isinstance(models, dict):
for mid, mdata in models.items():
candidates.append((provider, mid, mdata))
else:
# Search across all mapped providers
for hermes_prov, mdev_prov in PROVIDER_TO_MODELS_DEV.items():
provider_data = data.get(mdev_prov, {})
if isinstance(provider_data, dict):
models = provider_data.get("models", {})
if isinstance(models, dict):
for mid, mdata in models.items():
candidates.append((hermes_prov, mid, mdata))
if not candidates:
return []
# Use difflib for fuzzy matching — case-insensitive comparison
model_ids_lower = [c[1].lower() for c in candidates]
query_lower = query.lower()
# First try exact substring matches (more intuitive than pure edit-distance)
substring_matches = []
for prov, mid, mdata in candidates:
if query_lower in mid.lower():
substring_matches.append({"provider": prov, "model_id": mid, "entry": mdata})
# Then add difflib fuzzy matches for any remaining slots
fuzzy_ids = difflib.get_close_matches(
query_lower, model_ids_lower, n=limit * 2, cutoff=0.4
)
seen_ids: set = set()
results: List[Dict[str, Any]] = []
# Prioritize substring matches
for match in substring_matches:
key = (match["provider"], match["model_id"])
if key not in seen_ids:
seen_ids.add(key)
results.append(match)
if len(results) >= limit:
return results
# Add fuzzy matches
for fid in fuzzy_ids:
# Find original-case candidates matching this lowered ID
for prov, mid, mdata in candidates:
if mid.lower() == fid:
key = (prov, mid)
if key not in seen_ids:
seen_ids.add(key)
results.append({"provider": prov, "model_id": mid, "entry": mdata})
if len(results) >= limit:
return results
return results
# ---------------------------------------------------------------------------
# Rich dataclass constructors — parse raw models.dev JSON into dataclasses

View File

@@ -1,184 +0,0 @@
"""
agent/mtls.py — Mutual TLS support for Hermes A2A communication.
Provides:
- build_server_ssl_context() — SSL context for uvicorn that requires client certs
- build_client_ssl_context() — SSL context for httpx/aiohttp A2A clients
- MTLSMiddleware — FastAPI middleware that enforces client cert on A2A routes
- is_mtls_configured() — Check if env vars are set
Configuration (environment variables):
HERMES_MTLS_CERT Path to this agent's TLS certificate (PEM)
HERMES_MTLS_KEY Path to this agent's TLS private key (PEM)
HERMES_MTLS_CA Path to the Fleet CA certificate (PEM) — used to verify peers
All three must be set to enable mTLS. If any is missing, mTLS is disabled and
the server falls back to plain HTTP (or regular TLS without client auth).
"""
import logging
import os
import ssl
from pathlib import Path
from typing import Optional
logger = logging.getLogger(__name__)
# A2A routes that require a valid client certificate when mTLS is enabled.
_A2A_PATH_PREFIXES = (
"/.well-known/agent-card",
"/agent-card",
"/api/agent-card",
"/a2a/",
)
def _get_env(key: str) -> Optional[str]:
val = os.environ.get(key, "").strip()
return val or None
def is_mtls_configured() -> bool:
"""Return True if all three mTLS env vars are set and the files exist."""
cert = _get_env("HERMES_MTLS_CERT")
key = _get_env("HERMES_MTLS_KEY")
ca = _get_env("HERMES_MTLS_CA")
if not (cert and key and ca):
return False
for label, path in (("HERMES_MTLS_CERT", cert), ("HERMES_MTLS_KEY", key), ("HERMES_MTLS_CA", ca)):
if not Path(path).is_file():
logger.warning("mTLS disabled: %s file not found: %s", label, path)
return False
return True
def build_server_ssl_context() -> ssl.SSLContext:
"""
Build an SSL context for the A2A server that:
- presents its own certificate
- requires and verifies the client's certificate against the Fleet CA
Raises:
RuntimeError: if mTLS env vars are not set or files are missing
ssl.SSLError: if cert/key/CA files are invalid
"""
cert = _get_env("HERMES_MTLS_CERT")
key = _get_env("HERMES_MTLS_KEY")
ca = _get_env("HERMES_MTLS_CA")
if not (cert and key and ca):
raise RuntimeError(
"mTLS not configured. Set HERMES_MTLS_CERT, HERMES_MTLS_KEY, and HERMES_MTLS_CA."
)
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
ctx.minimum_version = ssl.TLSVersion.TLSv1_2
ctx.load_cert_chain(certfile=cert, keyfile=key)
ctx.load_verify_locations(cafile=ca)
# CERT_REQUIRED: reject connections without a valid client cert
ctx.verify_mode = ssl.CERT_REQUIRED
logger.info("mTLS server context built (cert=%s, CA=%s)", cert, ca)
return ctx
def build_client_ssl_context() -> ssl.SSLContext:
"""
Build an SSL context for outbound A2A connections that:
- presents this agent's certificate as a client cert
- verifies the remote server against the Fleet CA
Raises:
RuntimeError: if mTLS env vars are not set or files are missing
ssl.SSLError: if cert/key/CA files are invalid
"""
cert = _get_env("HERMES_MTLS_CERT")
key = _get_env("HERMES_MTLS_KEY")
ca = _get_env("HERMES_MTLS_CA")
if not (cert and key and ca):
raise RuntimeError(
"mTLS not configured. Set HERMES_MTLS_CERT, HERMES_MTLS_KEY, and HERMES_MTLS_CA."
)
ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
ctx.minimum_version = ssl.TLSVersion.TLSv1_2
ctx.load_cert_chain(certfile=cert, keyfile=key)
ctx.load_verify_locations(cafile=ca)
ctx.verify_mode = ssl.CERT_REQUIRED
ctx.check_hostname = True
logger.info("mTLS client context built (cert=%s, CA=%s)", cert, ca)
return ctx
def get_peer_cn(ssl_object) -> Optional[str]:
"""Extract the CN from the peer certificate's subject, or None."""
try:
peer_cert = ssl_object.getpeercert()
if not peer_cert:
return None
for rdn in peer_cert.get("subject", ()):
for attr, value in rdn:
if attr == "commonName":
return value
except Exception:
pass
return None
class MTLSMiddleware:
"""
ASGI middleware that enforces client certificate verification on A2A routes.
When mTLS is NOT configured (no env vars) or the route is not an A2A route,
the request passes through unchanged.
When mTLS IS configured and the route matches an A2A prefix, the middleware
checks that the request arrived over a TLS connection with a verified client
certificate. If not, it returns HTTP 403.
Note: This middleware only provides defence-in-depth at the app layer.
The primary enforcement is at the SSL context level (CERT_REQUIRED on the
server context). This middleware is useful when the server runs behind a
TLS-terminating proxy that forwards cert info via headers (not yet
implemented) or for test-time injection.
"""
def __init__(self, app):
self.app = app
self._enabled = is_mtls_configured()
if self._enabled:
logger.info("MTLSMiddleware enabled — A2A routes require client cert")
def _is_a2a_route(self, path: str) -> bool:
return any(path.startswith(prefix) for prefix in _A2A_PATH_PREFIXES)
async def __call__(self, scope, receive, send):
if scope["type"] == "http" and self._enabled and self._is_a2a_route(scope.get("path", "")):
# Check for client cert in the SSL connection
transport = scope.get("extensions", {}).get("tls", {})
peer_cert = transport.get("peer_cert")
if peer_cert is None:
# No client cert — reject
response = _forbidden_response("Client certificate required for A2A endpoints")
await response(scope, receive, send)
return
await self.app(scope, receive, send)
def _forbidden_response(message: str):
"""Return a minimal ASGI 403 response."""
body = message.encode()
async def respond(scope, receive, send):
await send({
"type": "http.response.start",
"status": 403,
"headers": [
(b"content-type", b"text/plain"),
(b"content-length", str(len(body)).encode()),
],
})
await send({"type": "http.response.body", "body": body})
return respond

View File

@@ -1,353 +0,0 @@
"""Privacy Filter — strip PII from context before remote API calls.
Implements Vitalik's Pattern 2: "A local model can strip out private data
before passing the query along to a remote LLM."
When Hermes routes a request to a cloud provider (Anthropic, OpenRouter, etc.),
this module sanitizes the message context to remove personally identifiable
information before it leaves the user's machine.
Threat model (from Vitalik's secure LLM architecture):
- Privacy (other): Non-LLM data leakage via search queries, API calls
- LLM accidents: LLM accidentally leaking private data in prompts
- LLM jailbreaks: Remote content extracting private context
Usage:
from agent.privacy_filter import PrivacyFilter, sanitize_messages
pf = PrivacyFilter()
safe_messages = pf.sanitize_messages(messages)
# safe_messages has PII replaced with [REDACTED] tokens
"""
from __future__ import annotations
import logging
import re
from dataclasses import dataclass, field
from enum import Enum, auto
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
class Sensitivity(Enum):
"""Classification of content sensitivity."""
PUBLIC = auto() # No PII detected
LOW = auto() # Generic references (e.g., city names)
MEDIUM = auto() # Personal identifiers (name, email, phone)
HIGH = auto() # Secrets, keys, financial data, medical info
CRITICAL = auto() # Crypto keys, passwords, SSN patterns
@dataclass
class RedactionReport:
"""Summary of what was redacted from a message batch."""
total_messages: int = 0
redacted_messages: int = 0
redactions: List[Dict[str, Any]] = field(default_factory=list)
max_sensitivity: Sensitivity = Sensitivity.PUBLIC
@property
def had_redactions(self) -> bool:
return self.redacted_messages > 0
def summary(self) -> str:
if not self.had_redactions:
return "No PII detected — context is clean for remote query."
parts = [f"Redacted {self.redacted_messages}/{self.total_messages} messages:"]
for r in self.redactions[:10]:
parts.append(f" - {r['type']}: {r['count']} occurrence(s)")
if len(self.redactions) > 10:
parts.append(f" ... and {len(self.redactions) - 10} more types")
return "\n".join(parts)
# =========================================================================
# PII pattern definitions
# =========================================================================
# Each pattern is (compiled_regex, redaction_type, sensitivity_level, replacement)
_PII_PATTERNS: List[Tuple[re.Pattern, str, Sensitivity, str]] = []
def _compile_patterns() -> None:
"""Compile PII detection patterns. Called once at module init."""
global _PII_PATTERNS
if _PII_PATTERNS:
return
raw_patterns = [
# --- CRITICAL: secrets and credentials ---
(
r'(?:api[_-]?key|apikey|secret[_-]?key|access[_-]?token)\s*[:=]\s*["\']?([A-Za-z0-9_\-\.]{20,})["\']?',
"api_key_or_token",
Sensitivity.CRITICAL,
"[REDACTED-API-KEY]",
),
(
r'\b(?:sk-|sk_|pk_|rk_|ak_)[A-Za-z0-9]{20,}\b',
"prefixed_secret",
Sensitivity.CRITICAL,
"[REDACTED-SECRET]",
),
(
r'\b(?:ghp_|gho_|ghu_|ghs_|ghr_)[A-Za-z0-9]{36,}\b',
"github_token",
Sensitivity.CRITICAL,
"[REDACTED-GITHUB-TOKEN]",
),
(
r'\b(?:xox[bposa]-[A-Za-z0-9\-]+)\b',
"slack_token",
Sensitivity.CRITICAL,
"[REDACTED-SLACK-TOKEN]",
),
(
r'(?:password|passwd|pwd)\s*[:=]\s*["\']?([^\s"\']{4,})["\']?',
"password",
Sensitivity.CRITICAL,
"[REDACTED-PASSWORD]",
),
(
r'(?:-----BEGIN (?:RSA |EC |OPENSSH )?PRIVATE KEY-----)',
"private_key_block",
Sensitivity.CRITICAL,
"[REDACTED-PRIVATE-KEY]",
),
# Ethereum / crypto addresses (42-char hex starting with 0x)
(
r'\b0x[a-fA-F0-9]{40}\b',
"ethereum_address",
Sensitivity.HIGH,
"[REDACTED-ETH-ADDR]",
),
# Bitcoin addresses (base58, 25-34 chars starting with 1/3/bc1)
(
r'\b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b',
"bitcoin_address",
Sensitivity.HIGH,
"[REDACTED-BTC-ADDR]",
),
(
r'\bbc1[a-zA-HJ-NP-Z0-9]{39,59}\b',
"bech32_address",
Sensitivity.HIGH,
"[REDACTED-BTC-ADDR]",
),
# --- HIGH: financial ---
(
r'\b(?:\d{4}[-\s]?){3}\d{4}\b',
"credit_card_number",
Sensitivity.HIGH,
"[REDACTED-CC]",
),
(
r'\b\d{3}-\d{2}-\d{4}\b',
"us_ssn",
Sensitivity.HIGH,
"[REDACTED-SSN]",
),
# --- MEDIUM: personal identifiers ---
# Email addresses
(
r'\b[A-Za-z0-9._%+\-]+@[A-Za-z0-9.\-]+\.[A-Za-z]{2,}\b',
"email_address",
Sensitivity.MEDIUM,
"[REDACTED-EMAIL]",
),
# Phone numbers (US/international patterns)
(
r'\b(?:\+?1[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b',
"phone_number_us",
Sensitivity.MEDIUM,
"[REDACTED-PHONE]",
),
(
r'\b\+\d{1,3}[-.\s]?\d{4,14}\b',
"phone_number_intl",
Sensitivity.MEDIUM,
"[REDACTED-PHONE]",
),
# Filesystem paths that reveal user identity
(
r'(?:/Users/|/home/|C:\\Users\\)([A-Za-z0-9_\-]+)',
"user_home_path",
Sensitivity.MEDIUM,
r"/Users/[REDACTED-USER]",
),
# --- LOW: environment / system info ---
# Internal IPs
(
r'\b(?:10\.\d{1,3}\.\d{1,3}\.\d{1,3}|172\.(?:1[6-9]|2\d|3[01])\.\d{1,3}\.\d{1,3}|192\.168\.\d{1,3}\.\d{1,3})\b',
"internal_ip",
Sensitivity.LOW,
"[REDACTED-IP]",
),
]
_PII_PATTERNS = [
(re.compile(pattern, re.IGNORECASE), rtype, sensitivity, replacement)
for pattern, rtype, sensitivity, replacement in raw_patterns
]
_compile_patterns()
# =========================================================================
# Sensitive file path patterns (context-aware)
# =========================================================================
_SENSITIVE_PATH_PATTERNS = [
re.compile(r'\.(?:env|pem|key|p12|pfx|jks|keystore)\b', re.IGNORECASE),
re.compile(r'(?:\.ssh/|\.gnupg/|\.aws/|\.config/gcloud/)', re.IGNORECASE),
re.compile(r'(?:wallet|keystore|seed|mnemonic)', re.IGNORECASE),
re.compile(r'(?:\.hermes/\.env)', re.IGNORECASE),
]
def _classify_path_sensitivity(path: str) -> Sensitivity:
"""Check if a file path references sensitive material."""
for pat in _SENSITIVE_PATH_PATTERNS:
if pat.search(path):
return Sensitivity.HIGH
return Sensitivity.PUBLIC
# =========================================================================
# Core filtering
# =========================================================================
class PrivacyFilter:
"""Strip PII from message context before remote API calls.
Integrates with the agent's message pipeline. Call sanitize_messages()
before sending context to any cloud LLM provider.
"""
def __init__(
self,
min_sensitivity: Sensitivity = Sensitivity.MEDIUM,
aggressive_mode: bool = False,
):
"""
Args:
min_sensitivity: Only redact PII at or above this level.
Default MEDIUM — redacts emails, phones, paths but not IPs.
aggressive_mode: If True, also redact file paths and internal IPs.
"""
self.min_sensitivity = (
Sensitivity.LOW if aggressive_mode else min_sensitivity
)
self.aggressive_mode = aggressive_mode
def sanitize_text(self, text: str) -> Tuple[str, List[Dict[str, Any]]]:
"""Sanitize a single text string. Returns (cleaned_text, redaction_list)."""
redactions = []
cleaned = text
for pattern, rtype, sensitivity, replacement in _PII_PATTERNS:
if sensitivity.value < self.min_sensitivity.value:
continue
matches = pattern.findall(cleaned)
if matches:
count = len(matches) if isinstance(matches[0], str) else sum(
1 for m in matches if m
)
if count > 0:
cleaned = pattern.sub(replacement, cleaned)
redactions.append({
"type": rtype,
"sensitivity": sensitivity.name,
"count": count,
})
return cleaned, redactions
def sanitize_messages(
self, messages: List[Dict[str, Any]]
) -> Tuple[List[Dict[str, Any]], RedactionReport]:
"""Sanitize a list of OpenAI-format messages.
Returns (safe_messages, report). System messages are NOT sanitized
(they're typically static prompts). Only user and assistant messages
with string content are processed.
Args:
messages: List of {"role": ..., "content": ...} dicts.
Returns:
Tuple of (sanitized_messages, redaction_report).
"""
report = RedactionReport(total_messages=len(messages))
safe_messages = []
for msg in messages:
role = msg.get("role", "")
content = msg.get("content", "")
# Only sanitize user/assistant string content
if role in ("user", "assistant") and isinstance(content, str) and content:
cleaned, redactions = self.sanitize_text(content)
if redactions:
report.redacted_messages += 1
report.redactions.extend(redactions)
# Track max sensitivity
for r in redactions:
s = Sensitivity[r["sensitivity"]]
if s.value > report.max_sensitivity.value:
report.max_sensitivity = s
safe_msg = {**msg, "content": cleaned}
safe_messages.append(safe_msg)
logger.info(
"Privacy filter: redacted %d PII type(s) from %s message",
len(redactions), role,
)
else:
safe_messages.append(msg)
else:
safe_messages.append(msg)
return safe_messages, report
def should_use_local_only(self, text: str) -> Tuple[bool, str]:
"""Determine if content is too sensitive for any remote call.
Returns (should_block, reason). If True, the content should only
be processed by a local model.
"""
_, redactions = self.sanitize_text(text)
critical_count = sum(
1 for r in redactions
if Sensitivity[r["sensitivity"]] == Sensitivity.CRITICAL
)
high_count = sum(
1 for r in redactions
if Sensitivity[r["sensitivity"]] == Sensitivity.HIGH
)
if critical_count > 0:
return True, f"Contains {critical_count} critical-secret pattern(s) — local-only"
if high_count >= 3:
return True, f"Contains {high_count} high-sensitivity pattern(s) — local-only"
return False, ""
def sanitize_messages(
messages: List[Dict[str, Any]],
min_sensitivity: Sensitivity = Sensitivity.MEDIUM,
aggressive: bool = False,
) -> Tuple[List[Dict[str, Any]], RedactionReport]:
"""Convenience function: sanitize messages with default settings."""
pf = PrivacyFilter(min_sensitivity=min_sensitivity, aggressive_mode=aggressive)
return pf.sanitize_messages(messages)
def quick_sanitize(text: str) -> str:
"""Quick sanitize a single string — returns cleaned text only."""
pf = PrivacyFilter()
cleaned, _ = pf.sanitize_text(text)
return cleaned

View File

@@ -1,262 +0,0 @@
"""
Profile Session Isolation — #891
Tags sessions with their originating profile and provides
filtered access so profiles cannot see each other's data.
Current state: All sessions share one state.db with no profile tag.
This module adds profile tagging and filtered queries.
Usage:
from agent.profile_isolation import tag_session, get_profile_sessions, get_active_profile
# Tag a new session with the current profile
tag_session(session_id, profile_name)
# Get sessions for a specific profile
sessions = get_profile_sessions("sprint")
# Get current active profile
profile = get_active_profile()
"""
import json
import os
import sqlite3
from pathlib import Path
from typing import Any, Dict, List, Optional
from datetime import datetime, timezone
HERMES_HOME = Path(os.getenv("HERMES_HOME", str(Path.home() / ".hermes")))
SESSIONS_DB = HERMES_HOME / "sessions" / "state.db"
PROFILE_TAGS_FILE = HERMES_HOME / "profile_session_tags.json"
def get_active_profile() -> str:
"""Get the currently active profile name."""
config_path = HERMES_HOME / "config.yaml"
if config_path.exists():
try:
import yaml
with open(config_path) as f:
cfg = yaml.safe_load(f) or {}
return cfg.get("active_profile", "default")
except Exception:
pass
# Check environment
return os.getenv("HERMES_PROFILE", "default")
def _load_tags() -> Dict[str, str]:
"""Load session-to-profile mapping."""
if not PROFILE_TAGS_FILE.exists():
return {}
try:
with open(PROFILE_TAGS_FILE) as f:
return json.load(f)
except Exception:
return {}
def _save_tags(tags: Dict[str, str]):
"""Save session-to-profile mapping."""
PROFILE_TAGS_FILE.parent.mkdir(parents=True, exist_ok=True)
with open(PROFILE_TAGS_FILE, "w") as f:
json.dump(tags, f, indent=2)
def tag_session(session_id: str, profile: Optional[str] = None) -> str:
"""
Tag a session with its originating profile.
Returns the profile name used.
"""
if profile is None:
profile = get_active_profile()
tags = _load_tags()
tags[session_id] = profile
_save_tags(tags)
# Also tag in SQLite if available
_tag_session_in_db(session_id, profile)
return profile
def _tag_session_in_db(session_id: str, profile: str):
"""Add profile tag to SQLite session store."""
if not SESSIONS_DB.exists():
return
try:
conn = sqlite3.connect(str(SESSIONS_DB))
cursor = conn.cursor()
# Check if sessions table has profile column
cursor.execute("PRAGMA table_info(sessions)")
columns = [row[1] for row in cursor.fetchall()]
if "profile" not in columns:
# Add profile column
cursor.execute("ALTER TABLE sessions ADD COLUMN profile TEXT DEFAULT 'default'")
# Update the session's profile
cursor.execute(
"UPDATE sessions SET profile = ? WHERE session_id = ?",
(profile, session_id)
)
conn.commit()
conn.close()
except Exception:
pass # SQLite might not be available or schema differs
def get_session_profile(session_id: str) -> Optional[str]:
"""Get the profile that owns a session."""
# Check JSON tags first
tags = _load_tags()
if session_id in tags:
return tags[session_id]
# Check SQLite
if SESSIONS_DB.exists():
try:
conn = sqlite3.connect(str(SESSIONS_DB))
cursor = conn.cursor()
cursor.execute(
"SELECT profile FROM sessions WHERE session_id = ?",
(session_id,)
)
row = cursor.fetchone()
conn.close()
if row:
return row[0]
except Exception:
pass
return None
def get_profile_sessions(
profile: Optional[str] = None,
limit: int = 100,
) -> List[Dict[str, Any]]:
"""
Get sessions belonging to a specific profile.
Returns list of session dicts.
"""
if profile is None:
profile = get_active_profile()
sessions = []
# Get from JSON tags
tags = _load_tags()
tagged_sessions = [sid for sid, p in tags.items() if p == profile]
# Get from SQLite with profile filter
if SESSIONS_DB.exists():
try:
conn = sqlite3.connect(str(SESSIONS_DB))
conn.row_factory = sqlite3.Row
cursor = conn.cursor()
# Try profile column first
try:
cursor.execute(
"SELECT * FROM sessions WHERE profile = ? ORDER BY updated_at DESC LIMIT ?",
(profile, limit)
)
for row in cursor.fetchall():
sessions.append(dict(row))
except Exception:
# Fallback: filter by tagged session IDs
if tagged_sessions:
placeholders = ",".join("?" * len(tagged_sessions[:limit]))
cursor.execute(
f"SELECT * FROM sessions WHERE session_id IN ({placeholders}) ORDER BY updated_at DESC LIMIT ?",
(*tagged_sessions[:limit], limit)
)
for row in cursor.fetchall():
sessions.append(dict(row))
conn.close()
except Exception:
pass
return sessions[:limit]
def filter_sessions_by_profile(
sessions: List[Dict[str, Any]],
profile: Optional[str] = None,
) -> List[Dict[str, Any]]:
"""Filter a list of sessions to only include those belonging to a profile."""
if profile is None:
profile = get_active_profile()
tags = _load_tags()
filtered = []
for session in sessions:
sid = session.get("session_id") or session.get("id")
if not sid:
continue
# Check tag
session_profile = tags.get(sid)
if session_profile is None:
# Check SQLite
session_profile = get_session_profile(sid)
if session_profile == profile or session_profile is None:
filtered.append(session)
return filtered
def get_profile_stats() -> Dict[str, Any]:
"""Get statistics about profile session distribution."""
tags = _load_tags()
profile_counts = {}
for sid, profile in tags.items():
profile_counts[profile] = profile_counts.get(profile, 0) + 1
total_tagged = len(tags)
profiles = list(profile_counts.keys())
return {
"total_tagged_sessions": total_tagged,
"profiles": profiles,
"profile_counts": profile_counts,
"active_profile": get_active_profile(),
}
def audit_untagged_sessions() -> List[str]:
"""Find sessions without a profile tag."""
if not SESSIONS_DB.exists():
return []
try:
conn = sqlite3.connect(str(SESSIONS_DB))
cursor = conn.cursor()
# Get all session IDs
cursor.execute("SELECT session_id FROM sessions")
all_sessions = {row[0] for row in cursor.fetchall()}
conn.close()
# Get tagged sessions
tags = _load_tags()
tagged = set(tags.keys())
# Return untagged
return list(all_sessions - tagged)
except Exception:
return []

View File

@@ -12,7 +12,7 @@ import threading
from collections import OrderedDict
from pathlib import Path
from hermes_constants import get_hermes_home, get_skills_dir, is_wsl
from hermes_constants import get_hermes_home
from typing import Optional
from agent.skill_utils import (
@@ -364,56 +364,8 @@ PLATFORM_HINTS = {
"documents. You can also include image URLs in markdown format ![alt](url) and they "
"will be downloaded and sent as native media when possible."
),
"wecom": (
"You are on WeCom (企业微信 / Enterprise WeChat). Markdown formatting is supported. "
"You CAN send media files natively — to deliver a file to the user, include "
"MEDIA:/absolute/path/to/file in your response. The file will be sent as a native "
"WeCom attachment: images (.jpg, .png, .webp) are sent as photos (up to 10 MB), "
"other files (.pdf, .docx, .xlsx, .md, .txt, etc.) arrive as downloadable documents "
"(up to 20 MB), and videos (.mp4) play inline. Voice messages are supported but "
"must be in AMR format — other audio formats are automatically sent as file attachments. "
"You can also include image URLs in markdown format ![alt](url) and they will be "
"downloaded and sent as native photos. Do NOT tell the user you lack file-sending "
"capability — use MEDIA: syntax whenever a file delivery is appropriate."
),
"qqbot": (
"You are on QQ, a popular Chinese messaging platform. QQ supports markdown formatting "
"and emoji. You can send media files natively: include MEDIA:/absolute/path/to/file in "
"your response. Images are sent as native photos, and other files arrive as downloadable "
"documents."
),
}
# ---------------------------------------------------------------------------
# Environment hints — execution-environment awareness for the agent.
# Unlike PLATFORM_HINTS (which describe the messaging channel), these describe
# the machine/OS the agent's tools actually run on.
# ---------------------------------------------------------------------------
WSL_ENVIRONMENT_HINT = (
"You are running inside WSL (Windows Subsystem for Linux). "
"The Windows host filesystem is mounted under /mnt/ — "
"/mnt/c/ is the C: drive, /mnt/d/ is D:, etc. "
"The user's Windows files are typically at "
"/mnt/c/Users/<username>/Desktop/, Documents/, Downloads/, etc. "
"When the user references Windows paths or desktop files, translate "
"to the /mnt/c/ equivalent. You can list /mnt/c/Users/ to discover "
"the Windows username if needed."
)
def build_environment_hints() -> str:
"""Return environment-specific guidance for the system prompt.
Detects WSL, and can be extended for Termux, Docker, etc.
Returns an empty string when no special environment is detected.
"""
hints: list[str] = []
if is_wsl():
hints.append(WSL_ENVIRONMENT_HINT)
return "\n\n".join(hints)
CONTEXT_FILE_MAX_CHARS = 20_000
CONTEXT_TRUNCATE_HEAD_RATIO = 0.7
CONTEXT_TRUNCATE_TAIL_RATIO = 0.2
@@ -596,7 +548,8 @@ def build_skills_system_prompt(
are read-only — they appear in the index but new skills are always created
in the local dir. Local skills take precedence when names collide.
"""
skills_dir = get_skills_dir()
hermes_home = get_hermes_home()
skills_dir = hermes_home / "skills"
external_dirs = get_all_skills_dirs()[1:] # skip local (index 0)
if not skills_dir.exists() and not external_dirs:
@@ -774,16 +727,8 @@ def build_skills_system_prompt(
result = (
"## Skills (mandatory)\n"
"Before replying, scan the skills below. If a skill matches or is even partially relevant "
"to your task, you MUST load it with skill_view(name) and follow its instructions. "
"Err on the side of loading — it is always better to have context you don't need "
"than to miss critical steps, pitfalls, or established workflows. "
"Skills contain specialized knowledge — API endpoints, tool-specific commands, "
"and proven workflows that outperform general-purpose approaches. Load the skill "
"even if you think you could handle the task with basic tools like web_search or terminal. "
"Skills also encode the user's preferred approach, conventions, and quality standards "
"for tasks like code review, planning, and testing — load them even for tasks you "
"already know how to do, because the skill defines how it should be done here.\n"
"Before replying, scan the skills below. If one clearly matches your task, "
"load it with skill_view(name) and follow its instructions. "
"If a skill has issues, fix it with skill_manage(action='patch').\n"
"After difficult/iterative tasks, offer to save as a skill. "
"If a skill you loaded was missing steps, had wrong commands, or needed "
@@ -793,7 +738,7 @@ def build_skills_system_prompt(
+ "\n".join(index_lines) + "\n"
"</available_skills>\n"
"\n"
"Only proceed without loading a skill if genuinely none are relevant to the task."
"If none match, proceed normally without loading a skill."
)
# ── Store in LRU cache ────────────────────────────────────────────

View File

@@ -1,146 +0,0 @@
"""Provider Preflight — Poka-yoke validation of provider/model config.
Validates provider and model configuration before session start.
Prevents wasted context on misconfigured providers.
Usage:
from agent.provider_preflight import preflight_check
result = preflight_check(provider="openrouter", model="xiaomi/mimo-v2-pro")
if not result["valid"]:
print(result["error"])
"""
from __future__ import annotations
import logging
import os
from typing import Any, Dict, Optional
logger = logging.getLogger(__name__)
# Provider -> required env var
PROVIDER_KEYS = {
"openrouter": "OPENROUTER_API_KEY",
"anthropic": "ANTHROPIC_API_KEY",
"openai": "OPENAI_API_KEY",
"nous": "NOUS_API_KEY",
"ollama": None, # Local, no key needed
"local": None,
}
def check_provider_key(provider: str) -> Dict[str, Any]:
"""Check if provider has a valid API key configured."""
provider_lower = provider.lower().strip()
env_var = None
for known, key in PROVIDER_KEYS.items():
if known in provider_lower:
env_var = key
break
if env_var is None:
# Unknown provider — assume OK (custom/local)
return {"valid": True, "provider": provider, "key_status": "unknown"}
if env_var is None:
# Local provider, no key needed
return {"valid": True, "provider": provider, "key_status": "not_required"}
key_value = os.getenv(env_var, "").strip()
if not key_value:
return {
"valid": False,
"provider": provider,
"key_status": "missing",
"error": f"{env_var} is not set. Provider '{provider}' will fail.",
"fix": f"Set {env_var} in ~/.hermes/.env",
}
if len(key_value) < 10:
return {
"valid": False,
"provider": provider,
"key_status": "too_short",
"error": f"{env_var} is suspiciously short ({len(key_value)} chars). May be invalid.",
"fix": f"Verify {env_var} value in ~/.hermes/.env",
}
return {"valid": True, "provider": provider, "key_status": "set"}
def check_model_availability(model: str, provider: str) -> Dict[str, Any]:
"""Check if model is likely available for provider."""
if not model:
return {"valid": False, "error": "No model specified"}
# Basic sanity checks
model_lower = model.lower()
# Anthropic models should use anthropic provider
if "claude" in model_lower and "anthropic" not in provider.lower():
return {
"valid": True, # Allow but warn
"warning": f"Model '{model}' usually runs on Anthropic provider, not '{provider}'",
}
# Ollama models
ollama_indicators = ["llama", "mistral", "qwen", "gemma", "phi", "hermes"]
if any(x in model_lower for x in ollama_indicators) and ":" not in model:
return {
"valid": True,
"warning": f"Model '{model}' may need a version tag for Ollama (e.g., {model}:latest)",
}
return {"valid": True}
def preflight_check(
provider: str = "",
model: str = "",
fallback_provider: str = "",
fallback_model: str = "",
) -> Dict[str, Any]:
"""Full pre-flight check for provider/model configuration.
Returns:
Dict with valid (bool), errors (list), warnings (list).
"""
errors = []
warnings = []
# Check primary provider
if provider:
result = check_provider_key(provider)
if not result["valid"]:
errors.append(result.get("error", f"Provider {provider} invalid"))
# Check primary model
if model:
result = check_model_availability(model, provider)
if not result["valid"]:
errors.append(result.get("error", f"Model {model} invalid"))
elif result.get("warning"):
warnings.append(result["warning"])
# Check fallback
if fallback_provider:
result = check_provider_key(fallback_provider)
if not result["valid"]:
warnings.append(f"Fallback provider {fallback_provider} also invalid: {result.get('error','')}")
if fallback_model:
result = check_model_availability(fallback_model, fallback_provider)
if not result["valid"]:
warnings.append(f"Fallback model {fallback_model} invalid")
elif result.get("warning"):
warnings.append(result["warning"])
return {
"valid": len(errors) == 0,
"errors": errors,
"warnings": warnings,
"provider": provider,
"model": model,
}

View File

@@ -24,7 +24,7 @@ from __future__ import annotations
import time
from dataclasses import dataclass, field
from typing import Any, Mapping, Optional
from typing import Any, Dict, Mapping, Optional
@dataclass

View File

@@ -1,302 +0,0 @@
"""Self-Modifying Prompt Engine — agent learns from its own failures.
Analyzes session transcripts, identifies failure patterns, and generates
prompt patches to prevent future failures.
The loop: fail → analyze → rewrite → retry → verify improvement.
Usage:
from agent.self_modify import PromptLearner
learner = PromptLearner()
patches = learner.analyze_session(session_id)
learner.apply_patches(patches)
"""
from __future__ import annotations
import json
import logging
import os
import re
import time
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
HERMES_HOME = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
PATCHES_DIR = HERMES_HOME / "prompt_patches"
ROLLBACK_DIR = HERMES_HOME / "prompt_rollback"
@dataclass
class FailurePattern:
"""A detected failure pattern in session transcripts."""
pattern_type: str # retry_loop, timeout, error_hallucination, context_loss
description: str
frequency: int
example_messages: List[str] = field(default_factory=list)
suggested_fix: str = ""
@dataclass
class PromptPatch:
"""A modification to the system prompt based on failure analysis."""
id: str
failure_type: str
original_rule: str
new_rule: str
confidence: float
applied_at: Optional[float] = None
reverted: bool = False
# Failure detection patterns
FAILURE_SIGNALS = {
"retry_loop": {
"patterns": [
r"(?i)retry(?:ing)?\s*(?:attempt|again)",
r"(?i)failed.*retrying",
r"(?i)error.*again",
r"(?i)attempt\s+\d+\s*(?:of|/)\s*\d+",
],
"description": "Agent stuck in retry loop",
},
"timeout": {
"patterns": [
r"(?i)timed?\s*out",
r"(?i)deadline\s+exceeded",
r"(?i)took\s+(?:too\s+)?long",
],
"description": "Operation timed out",
},
"hallucination": {
"patterns": [
r"(?i)i\s+(?:don't|do\s+not)\s+(?:have|see|find)\s+(?:any|that|this)\s+(?:information|data|file)",
r"(?i)the\s+file\s+doesn't\s+exist",
r"(?i)i\s+(?:made|invented|fabricated)\s+(?:that\s+up|this)",
],
"description": "Agent hallucinated or fabricated information",
},
"context_loss": {
"patterns": [
r"(?i)i\s+(?:don't|do\s+not)\s+(?:remember|recall|know)\s+(?:what|where|when|how)",
r"(?i)could\s+you\s+remind\s+me",
r"(?i)what\s+were\s+we\s+(?:doing|working|talking)\s+(?:on|about)",
],
"description": "Agent lost context from earlier in conversation",
},
"tool_failure": {
"patterns": [
r"(?i)tool\s+(?:call|execution)\s+failed",
r"(?i)command\s+not\s+found",
r"(?i)permission\s+denied",
r"(?i)no\s+such\s+file",
],
"description": "Tool execution failed",
},
}
# Prompt improvement templates
PROMPT_FIXES = {
"retry_loop": (
"If an operation fails more than twice, stop retrying. "
"Report the failure and ask the user for guidance. "
"Do not enter retry loops — they waste tokens."
),
"timeout": (
"For operations that may take long, set a timeout and report "
"progress. If an operation takes more than 30 seconds, report "
"what you've done so far and ask if you should continue."
),
"hallucination": (
"If you cannot find information, say 'I don't know' or "
"'I couldn't find that.' Never fabricate information. "
"If a file doesn't exist, say so — don't guess its contents."
),
"context_loss": (
"When you need context from earlier in the conversation, "
"use session_search to find it. Don't ask the user to repeat themselves."
),
"tool_failure": (
"If a tool fails, check the error message and try a different approach. "
"Don't retry the exact same command — diagnose first."
),
}
class PromptLearner:
"""Analyze session transcripts and generate prompt improvements."""
def __init__(self):
PATCHES_DIR.mkdir(parents=True, exist_ok=True)
ROLLBACK_DIR.mkdir(parents=True, exist_ok=True)
def analyze_session(self, session_data: dict) -> List[FailurePattern]:
"""Analyze a session for failure patterns.
Args:
session_data: Session dict with 'messages' list.
Returns:
List of detected failure patterns.
"""
messages = session_data.get("messages", [])
patterns_found: Dict[str, FailurePattern] = {}
for msg in messages:
content = str(msg.get("content", ""))
role = msg.get("role", "")
# Only analyze assistant messages and tool results
if role not in ("assistant", "tool"):
continue
for failure_type, config in FAILURE_SIGNALS.items():
for pattern in config["patterns"]:
if re.search(pattern, content):
if failure_type not in patterns_found:
patterns_found[failure_type] = FailurePattern(
pattern_type=failure_type,
description=config["description"],
frequency=0,
suggested_fix=PROMPT_FIXES.get(failure_type, ""),
)
patterns_found[failure_type].frequency += 1
if len(patterns_found[failure_type].example_messages) < 3:
patterns_found[failure_type].example_messages.append(
content[:200]
)
break # One match per message per type is enough
return list(patterns_found.values())
def generate_patches(self, patterns: List[FailurePattern],
min_confidence: float = 0.7) -> List[PromptPatch]:
"""Generate prompt patches from failure patterns.
Args:
patterns: Detected failure patterns.
min_confidence: Minimum confidence to generate a patch.
Returns:
List of prompt patches.
"""
patches = []
for pattern in patterns:
# Confidence based on frequency
if pattern.frequency >= 3:
confidence = 0.9
elif pattern.frequency >= 2:
confidence = 0.75
else:
confidence = 0.5
if confidence < min_confidence:
continue
if not pattern.suggested_fix:
continue
patch = PromptPatch(
id=f"{pattern.pattern_type}-{int(time.time())}",
failure_type=pattern.pattern_type,
original_rule="(missing — no existing rule for this pattern)",
new_rule=pattern.suggested_fix,
confidence=confidence,
)
patches.append(patch)
return patches
def apply_patches(self, patches: List[PromptPatch],
prompt_path: Optional[str] = None) -> int:
"""Apply patches to the system prompt.
Args:
patches: Patches to apply.
prompt_path: Path to prompt file (default: ~/.hermes/system_prompt.md)
Returns:
Number of patches applied.
"""
if prompt_path is None:
prompt_path = str(HERMES_HOME / "system_prompt.md")
prompt_file = Path(prompt_path)
# Backup current prompt
if prompt_file.exists():
backup = ROLLBACK_DIR / f"{prompt_file.name}.{int(time.time())}.bak"
backup.write_text(prompt_file.read_text())
# Read current prompt
current = prompt_file.read_text() if prompt_file.exists() else ""
# Apply patches
applied = 0
additions = []
for patch in patches:
if patch.new_rule not in current:
additions.append(f"\n## Auto-learned: {patch.failure_type}\n{patch.new_rule}")
patch.applied_at = time.time()
applied += 1
if additions:
new_content = current + "\n".join(additions)
prompt_file.write_text(new_content)
# Log patches
patches_file = PATCHES_DIR / f"patches-{int(time.time())}.json"
with open(patches_file, "w") as f:
json.dump([p.__dict__ for p in patches], f, indent=2, default=str)
logger.info("Applied %d prompt patches", applied)
return applied
def rollback_last(self, prompt_path: Optional[str] = None) -> bool:
"""Rollback to the most recent backup.
Args:
prompt_path: Path to prompt file.
Returns:
True if rollback succeeded.
"""
if prompt_path is None:
prompt_path = str(HERMES_HOME / "system_prompt.md")
backups = sorted(ROLLBACK_DIR.glob("*.bak"), reverse=True)
if not backups:
logger.warning("No backups to rollback to")
return False
latest = backups[0]
Path(prompt_path).write_text(latest.read_text())
logger.info("Rolled back to %s", latest.name)
return True
def learn_from_session(self, session_data: dict) -> Dict[str, Any]:
"""Full learning cycle: analyze → patch → apply.
Args:
session_data: Session dict.
Returns:
Summary of what was learned and applied.
"""
patterns = self.analyze_session(session_data)
patches = self.generate_patches(patterns)
applied = self.apply_patches(patches)
return {
"patterns_detected": len(patterns),
"patches_generated": len(patches),
"patches_applied": applied,
"patterns": [
{"type": p.pattern_type, "frequency": p.frequency, "description": p.description}
for p in patterns
],
}

View File

@@ -1,231 +0,0 @@
"""Session compaction with fact extraction.
Before compressing conversation context, extracts durable facts
(user preferences, corrections, project details) and saves them
to the fact store so they survive compression.
Usage:
from agent.session_compactor import extract_and_save_facts
facts = extract_and_save_facts(messages)
"""
from __future__ import annotations
import json
import logging
import re
import time
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
@dataclass
class ExtractedFact:
"""A fact extracted from conversation."""
category: str # "user_pref", "correction", "project", "tool_quirk", "general"
entity: str # what the fact is about
content: str # the fact itself
confidence: float # 0.0-1.0
source_turn: int # which message turn it came from
timestamp: float = 0.0
# Patterns that indicate user preferences
_PREFERENCE_PATTERNS = [
(r"(?:I|we) (?:prefer|like|want|need) (.+?)(?:\.|$)", "preference"),
(r"(?:always|never) (?:use|do|run|deploy) (.+?)(?:\.|$)", "preference"),
(r"(?:my|our) (?:default|preferred|usual) (.+?) (?:is|are) (.+?)(?:\.|$)", "preference"),
(r"(?:make sure|ensure|remember) (?:to|that) (.+?)(?:\.|$)", "instruction"),
(r"(?:don'?t|do not) (?:ever|ever again) (.+?)(?:\.|$)", "constraint"),
]
# Patterns that indicate corrections
_CORRECTION_PATTERNS = [
(r"(?:actually|no[, ]|wait[, ]|correction[: ]|sorry[, ]) (.+)", "correction"),
(r"(?:I meant|what I meant was|the correct) (.+?)(?:\.|$)", "correction"),
(r"(?:it'?s|its) (?:not|shouldn'?t be|wrong) (.+?)(?:\.|$)", "correction"),
]
# Patterns that indicate project/tool facts
_PROJECT_PATTERNS = [
(r"(?:the |our )?(?:project|repo|codebase|code) (?:is|uses|needs|requires) (.+?)(?:\.|$)", "project"),
(r"(?:deploy|push|commit) (?:to|on) (.+?)(?:\.|$)", "project"),
(r"(?:this|that|the) (?:server|host|machine|VPS) (?:is|runs|has) (.+?)(?:\.|$)", "infrastructure"),
(r"(?:model|provider|engine) (?:is|should be|needs to be) (.+?)(?:\.|$)", "config"),
]
def extract_facts_from_messages(messages: List[Dict[str, Any]]) -> List[ExtractedFact]:
"""Extract durable facts from conversation messages.
Scans user messages for preferences, corrections, project facts,
and infrastructure details that should survive compression.
"""
facts = []
seen_contents = set()
for turn_idx, msg in enumerate(messages):
role = msg.get("role", "")
content = msg.get("content", "")
# Only scan user messages and assistant responses with corrections
if role not in ("user", "assistant"):
continue
if not content or not isinstance(content, str):
continue
if len(content) < 10:
continue
# Skip tool results and system messages
if role == "assistant" and msg.get("tool_calls"):
continue
extracted = _extract_from_text(content, turn_idx, role)
# Deduplicate by content
for fact in extracted:
key = f"{fact.category}:{fact.content[:100]}"
if key not in seen_contents:
seen_contents.add(key)
facts.append(fact)
return facts
def _extract_from_text(text: str, turn_idx: int, role: str) -> List[ExtractedFact]:
"""Extract facts from a single text block."""
facts = []
timestamp = time.time()
# Clean text for pattern matching
clean = text.strip()
# User preference patterns (from user messages)
if role == "user":
for pattern, subcategory in _PREFERENCE_PATTERNS:
for match in re.finditer(pattern, clean, re.IGNORECASE):
content = match.group(1).strip() if match.lastindex else match.group(0).strip()
if len(content) > 5:
facts.append(ExtractedFact(
category=f"user_pref.{subcategory}",
entity="user",
content=content[:200],
confidence=0.7,
source_turn=turn_idx,
timestamp=timestamp,
))
# Correction patterns (from user messages)
if role == "user":
for pattern, subcategory in _CORRECTION_PATTERNS:
for match in re.finditer(pattern, clean, re.IGNORECASE):
content = match.group(1).strip() if match.lastindex else match.group(0).strip()
if len(content) > 5:
facts.append(ExtractedFact(
category=f"correction.{subcategory}",
entity="user",
content=content[:200],
confidence=0.8,
source_turn=turn_idx,
timestamp=timestamp,
))
# Project/infrastructure patterns (from both user and assistant)
for pattern, subcategory in _PROJECT_PATTERNS:
for match in re.finditer(pattern, clean, re.IGNORECASE):
content = match.group(1).strip() if match.lastindex else match.group(0).strip()
if len(content) > 5:
facts.append(ExtractedFact(
category=f"project.{subcategory}",
entity=subcategory,
content=content[:200],
confidence=0.6,
source_turn=turn_idx,
timestamp=timestamp,
))
return facts
def save_facts_to_store(facts: List[ExtractedFact], fact_store_fn=None) -> int:
"""Save extracted facts to the fact store.
Args:
facts: List of extracted facts.
fact_store_fn: Optional callable(category, entity, content, trust).
If None, uses the holographic fact store if available.
Returns:
Number of facts saved.
"""
saved = 0
if fact_store_fn:
for fact in facts:
try:
fact_store_fn(
category=fact.category,
entity=fact.entity,
content=fact.content,
trust=fact.confidence,
)
saved += 1
except Exception as e:
logger.debug("Failed to save fact: %s", e)
else:
# Try holographic fact store
try:
from fact_store import fact_store as _fs
for fact in facts:
try:
_fs(
action="add",
content=fact.content,
category=fact.category,
tags=fact.entity,
trust_delta=fact.confidence - 0.5,
)
saved += 1
except Exception as e:
logger.debug("Failed to save fact via fact_store: %s", e)
except ImportError:
logger.debug("fact_store not available — facts not persisted")
return saved
def extract_and_save_facts(
messages: List[Dict[str, Any]],
fact_store_fn=None,
) -> Tuple[List[ExtractedFact], int]:
"""Extract facts from messages and save them.
Returns (extracted_facts, saved_count).
"""
facts = extract_facts_from_messages(messages)
if facts:
logger.info("Extracted %d facts from conversation", len(facts))
saved = save_facts_to_store(facts, fact_store_fn)
logger.info("Saved %d/%d facts to store", saved, len(facts))
else:
saved = 0
return facts, saved
def format_facts_summary(facts: List[ExtractedFact]) -> str:
"""Format extracted facts as a readable summary."""
if not facts:
return "No facts extracted."
by_category = {}
for f in facts:
by_category.setdefault(f.category, []).append(f)
lines = [f"Extracted {len(facts)} facts:", ""]
for cat, cat_facts in sorted(by_category.items()):
lines.append(f" {cat}:")
for f in cat_facts:
lines.append(f" - {f.content[:80]}")
return "\n".join(lines)

View File

@@ -1,24 +0,0 @@
import logging
from tools.shield.detector import ShieldDetector, Verdict, CRISIS_SYSTEM_PROMPT, SAFE_SIX_MODELS
logger = logging.getLogger(__name__)
_detector = None
def get_detector():
global _detector
if _detector is None:
_detector = ShieldDetector()
return _detector
def scan_text(text: str):
"""Scan text for jailbreaks and crisis signals using SHIELD."""
detector = get_detector()
return detector.detect(text)
def is_crisis(verdict: str) -> bool:
return verdict in [Verdict.CRISIS_DETECTED.value, Verdict.CRISIS_UNDER_ATTACK.value]
def is_jailbreak(verdict: str) -> bool:
return verdict in [Verdict.JAILBREAK_DETECTED.value, Verdict.CRISIS_UNDER_ATTACK.value]

View File

@@ -12,8 +12,6 @@ from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Optional
from hermes_constants import display_hermes_home
logger = logging.getLogger(__name__)
_skill_commands: Dict[str, Dict[str, Any]] = {}
@@ -110,7 +108,7 @@ def _inject_skill_config(loaded_skill: dict[str, Any], parts: list[str]) -> None
if not resolved:
return
lines = ["", f"[Skill config (from {display_hermes_home()}/config.yaml):"]
lines = ["", "[Skill config (from ~/.hermes/config.yaml):"]
for key, value in resolved.items():
display_val = str(value) if value else "(not set)"
lines.append(f" {key} = {display_val}")

View File

@@ -10,9 +10,9 @@ import os
import re
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple
from typing import Any, Dict, List, Set, Tuple
from hermes_constants import get_config_path, get_skills_dir
from hermes_constants import get_hermes_home
logger = logging.getLogger(__name__)
@@ -130,7 +130,7 @@ def get_disabled_skill_names(platform: str | None = None) -> Set[str]:
Reads the config file directly (no CLI config imports) to stay
lightweight.
"""
config_path = get_config_path()
config_path = get_hermes_home() / "config.yaml"
if not config_path.exists():
return set()
try:
@@ -178,7 +178,7 @@ def get_external_skills_dirs() -> List[Path]:
path. Only directories that actually exist are returned. Duplicates and
paths that resolve to the local ``~/.hermes/skills/`` are silently skipped.
"""
config_path = get_config_path()
config_path = get_hermes_home() / "config.yaml"
if not config_path.exists():
return []
try:
@@ -200,7 +200,7 @@ def get_external_skills_dirs() -> List[Path]:
if not isinstance(raw_dirs, list):
return []
local_skills = get_skills_dir().resolve()
local_skills = (get_hermes_home() / "skills").resolve()
seen: Set[Path] = set()
result: List[Path] = []
@@ -230,7 +230,7 @@ def get_all_skills_dirs() -> List[Path]:
The local dir is always first (and always included even if it doesn't exist
yet — callers handle that). External dirs follow in config order.
"""
dirs = [get_skills_dir()]
dirs = [get_hermes_home() / "skills"]
dirs.extend(get_external_skills_dirs())
return dirs
@@ -384,7 +384,7 @@ def resolve_skill_config_values(
current values (or the declared default if the key isn't set).
Path values are expanded via ``os.path.expanduser``.
"""
config_path = get_config_path()
config_path = get_hermes_home() / "config.yaml"
config: Dict[str, Any] = {}
if config_path.exists():
try:
@@ -441,25 +441,3 @@ def iter_skill_index_files(skills_dir: Path, filename: str):
matches.append(Path(root) / filename)
for path in sorted(matches, key=lambda p: str(p.relative_to(skills_dir))):
yield path
# ── Namespace helpers for plugin-provided skills ───────────────────────────
_NAMESPACE_RE = re.compile(r"^[a-zA-Z0-9_-]+$")
def parse_qualified_name(name: str) -> Tuple[Optional[str], str]:
"""Split ``'namespace:skill-name'`` into ``(namespace, bare_name)``.
Returns ``(None, name)`` when there is no ``':'``.
"""
if ":" not in name:
return None, name
return tuple(name.split(":", 1)) # type: ignore[return-value]
def is_valid_namespace(candidate: Optional[str]) -> bool:
"""Check whether *candidate* is a valid namespace (``[a-zA-Z0-9_-]+``)."""
if not candidate:
return False
return bool(_NAMESPACE_RE.match(candidate))

View File

@@ -1,23 +0,0 @@
import os
import json
import time
def log_token_usage(prompt_tokens, completion_tokens, model_name):
"""Logs token usage to a local JSONL file for fleet-wide accounting."""
spend_dir = os.path.expanduser("~/.hermes/telemetry/spend")
os.makedirs(spend_dir, exist_ok=True)
session_id = os.environ.get("HERMES_SESSION_ID", "default")
log_file = os.path.join(spend_dir, f"session_{session_id}.jsonl")
record = {
"timestamp": time.time(),
"model": model_name,
"input_tokens": prompt_tokens,
"output_tokens": completion_tokens
}
with open(log_file, "a") as f:
f.write(json.dumps(record) + "\n")

View File

@@ -1,146 +0,0 @@
"""Time-aware model routing for cron jobs.
Routes cron tasks to more capable models during off-hours when the user
is not present to correct errors. Reduces error rates during high-error
time windows (e.g., 18:00 evening batches).
Usage:
from agent.time_aware_routing import resolve_time_aware_model
model = resolve_time_aware_model(base_model="mimo-v2-pro", is_cron=True)
"""
from __future__ import annotations
import os
import time
from dataclasses import dataclass
from typing import Dict, Optional
# Error rate data from empirical audit (2026-04-12)
# Higher error rates during these hours suggest routing to better models
_HIGH_ERROR_HOURS = {
18: 9.4, # 18:00 — 9.4% error rate (evening cron batches)
19: 8.1,
20: 7.5,
21: 6.8,
22: 6.2,
23: 5.9,
0: 5.5,
1: 5.2,
}
# Low error hours — default model is fine
_LOW_ERROR_HOURS = set(range(6, 18)) # 06:00-17:59
# Default fallback models by time zone
_DEFAULT_STRONG_MODEL = os.getenv("CRON_STRONG_MODEL", "xiaomi/mimo-v2-pro")
_DEFAULT_CHEAP_MODEL = os.getenv("CRON_CHEAP_MODEL", "qwen2.5:7b")
_ERROR_THRESHOLD = float(os.getenv("CRON_ERROR_THRESHOLD", "6.0")) # % error rate
@dataclass
class RoutingDecision:
"""Result of time-aware routing."""
model: str
provider: str
reason: str
hour: int
error_rate: float
is_off_hours: bool
def get_hour_error_rate(hour: int) -> float:
"""Get expected error rate for a given hour (0-23)."""
return _HIGH_ERROR_HOURS.get(hour, 4.0) # Default 4% for unlisted hours
def is_off_hours(hour: int) -> bool:
"""Check if hour is considered off-hours (higher error rates)."""
return hour not in _LOW_ERROR_HOURS
def resolve_time_aware_model(
base_model: str = "",
base_provider: str = "",
is_cron: bool = False,
hour: Optional[int] = None,
) -> RoutingDecision:
"""Resolve model based on time of day and task type.
During off-hours (evening/night), routes to stronger models for cron
jobs to compensate for lack of human oversight.
Args:
base_model: The model that would normally be used.
base_provider: The provider for the base model.
is_cron: Whether this is a cron job (vs interactive session).
hour: Override hour (for testing). Defaults to current hour.
Returns:
RoutingDecision with model, provider, and reasoning.
"""
if hour is None:
hour = time.localtime().tm_hour
error_rate = get_hour_error_rate(hour)
off_hours = is_off_hours(hour)
# Interactive sessions always use the base model (user can correct errors)
if not is_cron:
return RoutingDecision(
model=base_model or _DEFAULT_CHEAP_MODEL,
provider=base_provider,
reason="Interactive session — user can correct errors",
hour=hour,
error_rate=error_rate,
is_off_hours=off_hours,
)
# Cron jobs during low-error hours: use base model
if not off_hours and error_rate < _ERROR_THRESHOLD:
return RoutingDecision(
model=base_model or _DEFAULT_CHEAP_MODEL,
provider=base_provider,
reason=f"Low-error hours ({hour}:00, {error_rate}% expected)",
hour=hour,
error_rate=error_rate,
is_off_hours=False,
)
# Cron jobs during high-error hours: upgrade to stronger model
if error_rate >= _ERROR_THRESHOLD:
return RoutingDecision(
model=_DEFAULT_STRONG_MODEL,
provider="nous",
reason=f"High-error hours ({hour}:00, {error_rate}% expected) — using stronger model",
hour=hour,
error_rate=error_rate,
is_off_hours=True,
)
# Off-hours but low error: use base model
return RoutingDecision(
model=base_model or _DEFAULT_CHEAP_MODEL,
provider=base_provider,
reason=f"Off-hours but low error ({hour}:00, {error_rate}%)",
hour=hour,
error_rate=error_rate,
is_off_hours=off_hours,
)
def get_routing_report() -> str:
"""Get a report of time-based routing decisions for the next 24 hours."""
lines = ["Time-Aware Model Routing (24h forecast)", "=" * 40, ""]
lines.append(f"Error threshold: {_ERROR_THRESHOLD}%")
lines.append(f"Strong model: {_DEFAULT_STRONG_MODEL}")
lines.append(f"Cheap model: {_DEFAULT_CHEAP_MODEL}")
lines.append("")
for h in range(24):
decision = resolve_time_aware_model(is_cron=True, hour=h)
icon = "\U0001f7e2" if decision.model == _DEFAULT_CHEAP_MODEL else "\U0001f534"
lines.append(f" {h:02d}:00 {icon} {decision.model:25s} ({decision.error_rate}% error)")
return "\n".join(lines)

View File

@@ -36,7 +36,7 @@ def generate_title(user_message: str, assistant_response: str, timeout: float =
try:
response = call_llm(
task="title_generation",
task="compression", # reuse compression task config (cheap/fast model)
messages=messages,
max_tokens=30,
temperature=0.3,

View File

@@ -1,316 +0,0 @@
#!/usr/bin/env python3
"""
Token Budget — Poka-yoke guard against silent context overflow.
Progressive warning system with circuit breakers:
- 60%: WARNING — log + suggest summarization
- 80%: CAUTION — auto-compress, drop raw tool outputs
- 90%: CRITICAL — block verbose tool calls, force wrap-up
- 95%: STOP — graceful session termination with summary
Also provides tool output budgeting to truncate before overflow.
Usage:
from agent.token_budget import TokenBudget
budget = TokenBudget(context_length=128_000)
budget.update(8000) # from API response prompt_tokens
status = budget.check() # returns BudgetStatus with level + message
budget.should_block_tools() # True at 90%+
budget.should_terminate() # True at 95%+
# Tool output budgeting
remaining = budget.tool_output_budget()
truncated = budget.truncate_tool_output(output_text, max_chars=remaining)
"""
import logging
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
logger = logging.getLogger(__name__)
# ── Thresholds ────────────────────────────────────────────────────────
WARN_PERCENT = 0.60
CAUTION_PERCENT = 0.80
CRITICAL_PERCENT = 0.90
STOP_PERCENT = 0.95
# Reserve 5% of context for system prompt, response, and overhead
RESPONSE_RESERVE_RATIO = 0.05
# Max tool output chars at each level
TOOL_OUTPUT_BUDGETS = {
"NORMAL": 50_000,
"WARNING": 20_000,
"CAUTION": 8_000,
"CRITICAL": 2_000,
"STOP": 500,
}
class BudgetLevel(Enum):
NORMAL = "NORMAL"
WARNING = "WARNING"
CAUTION = "CAUTION"
CRITICAL = "CRITICAL"
STOP = "STOP"
@property
def percent_threshold(self) -> float:
return {
BudgetLevel.NORMAL: 0.0,
BudgetLevel.WARNING: WARN_PERCENT,
BudgetLevel.CAUTION: CAUTION_PERCENT,
BudgetLevel.CRITICAL: CRITICAL_PERCENT,
BudgetLevel.STOP: STOP_PERCENT,
}[self]
@property
def emoji(self) -> str:
return {
BudgetLevel.NORMAL: "",
BudgetLevel.WARNING: "\u26a0\ufe0f",
BudgetLevel.CAUTION: "\U0001f525",
BudgetLevel.CRITICAL: "\U0001f6d1",
BudgetLevel.STOP: "\U0001f6d1",
}[self]
@dataclass
class BudgetStatus:
"""Current token budget status."""
level: BudgetLevel
tokens_used: int
context_length: int
percent_used: float
tokens_remaining: int
message: str = ""
should_compress: bool = False
should_block_tools: bool = False
should_terminate: bool = False
def to_indicator(self) -> str:
"""Compact status indicator for CLI display."""
pct = int(self.percent_used * 100)
if self.level == BudgetLevel.NORMAL:
return f"[{pct}%]"
return f"{self.level.emoji} [{pct}%]"
def to_bar(self, width: int = 10) -> str:
"""Visual progress bar."""
filled = int(width * self.percent_used)
bar = "\u2588" * filled + "\u2591" * (width - filled)
color = self._bar_color()
return f"{color}{bar}\033[0m {int(self.percent_used * 100)}%"
def _bar_color(self) -> str:
if self.level == BudgetLevel.STOP:
return "\033[41m" # red bg
if self.level == BudgetLevel.CRITICAL:
return "\033[31m" # red
if self.level == BudgetLevel.CAUTION:
return "\033[33m" # yellow
if self.level == BudgetLevel.WARNING:
return "\033[33m" # yellow
return "\033[32m" # green
class TokenBudget:
"""
Progressive token budget tracker with poka-yoke circuit breakers.
Tracks cumulative token usage against a context length and triggers
escalating actions at each threshold.
"""
def __init__(
self,
context_length: int,
warn_percent: float = WARN_PERCENT,
caution_percent: float = CAUTION_PERCENT,
critical_percent: float = CRITICAL_PERCENT,
stop_percent: float = STOP_PERCENT,
response_reserve_ratio: float = RESPONSE_RESERVE_RATIO,
):
self.context_length = context_length
self.warn_threshold = int(context_length * warn_percent)
self.caution_threshold = int(context_length * caution_percent)
self.critical_threshold = int(context_length * critical_percent)
self.stop_threshold = int(context_length * stop_percent)
self.response_reserve = int(context_length * response_reserve_ratio)
self.tokens_used = 0
self.completions_tokens = 0
self.total_tool_output_chars = 0
self._level = BudgetLevel.NORMAL
self._history: list[int] = []
def update(self, prompt_tokens: int, completion_tokens: int = 0) -> BudgetStatus:
"""Update budget from API response usage."""
self.tokens_used = prompt_tokens
self.completions_tokens = completion_tokens
self._history.append(prompt_tokens)
return self.check()
def check(self) -> BudgetStatus:
"""Evaluate current budget level and return status."""
pct = self.tokens_used / self.context_length if self.context_length > 0 else 0
remaining = max(0, self.context_length - self.tokens_used - self.response_reserve)
# Determine level
if pct >= STOP_PERCENT:
level = BudgetLevel.STOP
elif pct >= CRITICAL_PERCENT:
level = BudgetLevel.CRITICAL
elif pct >= CAUTION_PERCENT:
level = BudgetLevel.CAUTION
elif pct >= WARN_PERCENT:
level = BudgetLevel.WARNING
else:
level = BudgetLevel.NORMAL
# Log transitions (don\'t log every check)
if level != self._level:
self._log_transition(level, pct)
self._level = level
messages = {
BudgetLevel.NORMAL: "",
BudgetLevel.WARNING: (
f"Context at {int(pct*100)}%. Consider wrapping up soon or using /compress."
),
BudgetLevel.CAUTION: (
f"Context at {int(pct*100)}%. Auto-compressing. "
f"Tool outputs will be truncated."
),
BudgetLevel.CRITICAL: (
f"Context at {int(pct*100)}%. Verbose tools blocked. "
f"Session approaching limit — please wrap up."
),
BudgetLevel.STOP: (
f"Context at {int(pct*100)}%. Session must terminate. "
f"Saving summary before shutdown."
),
}
return BudgetStatus(
level=level,
tokens_used=self.tokens_used,
context_length=self.context_length,
percent_used=pct,
tokens_remaining=remaining,
message=messages[level],
should_compress=level in (BudgetLevel.CAUTION, BudgetLevel.CRITICAL, BudgetLevel.STOP),
should_block_tools=level in (BudgetLevel.CRITICAL, BudgetLevel.STOP),
should_terminate=level == BudgetLevel.STOP,
)
def should_compress(self) -> bool:
"""True at 80%+ — auto-compression should trigger."""
return self.tokens_used >= self.caution_threshold
def should_block_tools(self) -> bool:
"""True at 90%+ — verbose tool calls should be blocked."""
return self.tokens_used >= self.critical_threshold
def should_terminate(self) -> bool:
"""True at 95%+ — session should gracefully terminate."""
return self.tokens_used >= self.stop_threshold
def tool_output_budget(self) -> int:
"""Max chars allowed for next tool output based on current level."""
status = self.check()
return TOOL_OUTPUT_BUDGETS.get(status.level.value, 50_000)
def truncate_tool_output(self, output: str, max_chars: int = None) -> str:
"""Truncate tool output to fit budget. Adds truncation notice."""
if max_chars is None:
max_chars = self.tool_output_budget()
if len(output) <= max_chars:
return output
# Preserve start and end, truncate middle
if max_chars < 200:
return output[:max_chars] + "\n[...truncated...]"
head = max_chars // 2
tail = max_chars - head - 30 # reserve for truncation notice
truncated = (
output[:head]
+ f"\n\n[...{len(output) - head - tail:,} chars truncated...]\n\n"
+ output[-tail:]
)
return truncated
def remaining_for_response(self) -> int:
"""Tokens available for the model\'s response."""
return max(0, self.context_length - self.tokens_used - self.response_reserve)
def growth_rate(self) -> Optional[float]:
"""Average token increase per turn (from history)."""
if len(self._history) < 2:
return None
diffs = [self._history[i] - self._history[i-1] for i in range(1, len(self._history))]
return sum(diffs) / len(diffs)
def turns_remaining(self) -> Optional[int]:
"""Estimated turns until context is full (based on growth rate)."""
rate = self.growth_rate()
if rate is None or rate <= 0:
return None
remaining = self.context_length - self.tokens_used
return int(remaining / rate)
def reset(self):
"""Reset budget for new session."""
self.tokens_used = 0
self.completions_tokens = 0
self.total_tool_output_chars = 0
self._level = BudgetLevel.NORMAL
self._history.clear()
def _log_transition(self, new_level: BudgetLevel, pct: float):
"""Log budget level transitions."""
msg = (
f"Token budget: {self._level.value} -> {new_level.value} "
f"({self.tokens_used}/{self.context_length} = {pct:.0%})"
)
if new_level == BudgetLevel.WARNING:
logger.warning(msg)
elif new_level == BudgetLevel.CAUTION:
logger.warning(msg)
elif new_level in (BudgetLevel.CRITICAL, BudgetLevel.STOP):
logger.error(msg)
else:
logger.info(msg)
def summary(self) -> str:
"""Human-readable budget summary."""
status = self.check()
turns = self.turns_remaining()
rate = self.growth_rate()
lines = [
f"Token Budget: {status.tokens_used:,} / {status.context_length:,} ({status.percent_used:.0%})",
f"Level: {status.level.value}",
f"Remaining: {status.tokens_remaining:,} tokens",
]
if rate is not None:
lines.append(f"Growth rate: ~{rate:,.0f} tokens/turn")
if turns is not None:
lines.append(f"Estimated turns left: ~{turns}")
if status.message:
lines.append(f"Action: {status.message}")
return "\n".join(lines)
# ── Convenience factory ───────────────────────────────────────────────
def create_budget(context_length: int, **kwargs) -> TokenBudget:
"""Create a TokenBudget with defaults."""
return TokenBudget(context_length=context_length, **kwargs)

View File

@@ -1,156 +0,0 @@
"""Tool fixation detection — break repetitive tool calling loops.
Detects when the agent latches onto one tool and calls it repeatedly
without making progress. Injects a nudge prompt to break the loop.
Usage:
from agent.tool_fixation_detector import ToolFixationDetector
detector = ToolFixationDetector()
nudge = detector.record("execute_code")
if nudge:
# Inject nudge into conversation
messages.append({"role": "system", "content": nudge})
"""
from __future__ import annotations
import os
from dataclasses import dataclass, field
from typing import Dict, List, Optional
# Default thresholds
_DEFAULT_THRESHOLD = int(os.getenv("TOOL_FIXATION_THRESHOLD", "5"))
_DEFAULT_WINDOW = int(os.getenv("TOOL_FIXATION_WINDOW", "10"))
@dataclass
class FixationEvent:
"""Record of a fixation detection."""
tool_name: str
streak_length: int
threshold: int
nudge_sent: bool = False
class ToolFixationDetector:
"""Detects and breaks tool fixation loops.
Tracks the sequence of tool calls and detects when the same tool
is called N times consecutively. When detected, returns a nudge
prompt to inject into the conversation.
"""
def __init__(self, threshold: int = 0, window: int = 0):
self.threshold = threshold or _DEFAULT_THRESHOLD
self.window = window or _DEFAULT_WINDOW
self._history: List[str] = []
self._current_streak: str = ""
self._streak_count: int = 0
self._nudges_sent: int = 0
self._events: List[FixationEvent] = []
@property
def nudges_sent(self) -> int:
return self._nudges_sent
@property
def events(self) -> List[FixationEvent]:
return list(self._events)
def record(self, tool_name: str) -> Optional[str]:
"""Record a tool call and return nudge prompt if fixation detected.
Args:
tool_name: Name of the tool that was called.
Returns:
Nudge prompt string if fixation detected, None otherwise.
"""
self._history.append(tool_name)
# Trim history to window
if len(self._history) > self.window:
self._history = self._history[-self.window:]
# Update streak
if tool_name == self._current_streak:
self._streak_count += 1
else:
self._current_streak = tool_name
self._streak_count = 1
# Check for fixation
if self._streak_count >= self.threshold:
event = FixationEvent(
tool_name=tool_name,
streak_length=self._streak_count,
threshold=self.threshold,
nudge_sent=True,
)
self._events.append(event)
self._nudges_sent += 1
return self._build_nudge(tool_name, self._streak_count)
return None
def _build_nudge(self, tool_name: str, count: int) -> str:
"""Build a nudge prompt to break the fixation loop."""
return (
f"[SYSTEM: You have called `{tool_name}` {count} times in a row "
f"without switching tools. This suggests a fixation loop. "
f"Consider:\n"
f"1. Is the tool returning an error? Read the error carefully.\n"
f"2. Is there a different tool that could help?\n"
f"3. Should you ask the user for clarification?\n"
f"4. Is the task actually complete?\n"
f"Break the loop by trying a different approach.]"
)
def reset(self) -> None:
"""Reset the detector state."""
self._history.clear()
self._current_streak = ""
self._streak_count = 0
def get_streak_info(self) -> dict:
"""Get current streak information."""
return {
"current_tool": self._current_streak,
"streak_count": self._streak_count,
"threshold": self.threshold,
"at_threshold": self._streak_count >= self.threshold,
"nudges_sent": self._nudges_sent,
}
def format_report(self) -> str:
"""Format fixation events as a report."""
if not self._events:
return "No tool fixation detected."
lines = [
f"Tool Fixation Report ({len(self._events)} events)",
"=" * 40,
]
for e in self._events:
lines.append(f" {e.tool_name}: {e.streak_length} consecutive calls (threshold: {e.threshold})")
return "\n".join(lines)
# Singleton
_detector: Optional[ToolFixationDetector] = None
def get_fixation_detector() -> ToolFixationDetector:
"""Get or create the singleton detector."""
global _detector
if _detector is None:
_detector = ToolFixationDetector()
return _detector
def reset_fixation_detector() -> None:
"""Reset the singleton."""
global _detector
_detector = None

View File

@@ -1,177 +0,0 @@
"""Tool Orchestrator — Robust execution and circuit breaking for agent tools.
Provides a unified execution service that wraps the tool registry.
Implements the Circuit Breaker pattern to prevent the agent from getting
stuck in failure loops when a specific tool or its underlying service
is flapping or down.
Architecture:
Discovery (tools/registry.py) -> Orchestration (agent/tool_orchestrator.py) -> Dispatch
"""
import json
import time
import logging
import threading
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
from tools.registry import registry
logger = logging.getLogger(__name__)
class CircuitState:
"""States for the tool circuit breaker."""
CLOSED = "closed" # Normal operation
OPEN = "open" # Failing, execution blocked
HALF_OPEN = "half_open" # Testing if service recovered
@dataclass
class ToolStats:
"""Execution statistics for a tool."""
name: str
state: str = CircuitState.CLOSED
failures: int = 0
successes: int = 0
last_failure_time: float = 0
total_execution_time: float = 0
call_count: int = 0
class ToolOrchestrator:
"""Orchestrates tool execution with robustness patterns."""
def __init__(
self,
failure_threshold: int = 3,
reset_timeout: int = 300,
):
"""
Args:
failure_threshold: Number of failures before opening the circuit.
reset_timeout: Seconds to wait before transitioning from OPEN to HALF_OPEN.
"""
self.failure_threshold = failure_threshold
self.reset_timeout = reset_timeout
self._stats: Dict[str, ToolStats] = {}
self._lock = threading.Lock()
def _get_stats(self, name: str) -> ToolStats:
"""Get or initialize stats for a tool with thread-safe state transition."""
with self._lock:
if name not in self._stats:
self._stats[name] = ToolStats(name=name)
stats = self._stats[name]
# Transition from OPEN to HALF_OPEN if timeout expired
if stats.state == CircuitState.OPEN:
if time.time() - stats.last_failure_time > self.reset_timeout:
stats.state = CircuitState.HALF_OPEN
logger.info("Circuit breaker HALF_OPEN for tool: %s", name)
return stats
def _record_success(self, name: str, execution_time: float):
"""Record a successful tool execution and close the circuit."""
with self._lock:
stats = self._stats[name]
stats.successes += 1
stats.call_count += 1
stats.total_execution_time += execution_time
if stats.state != CircuitState.CLOSED:
logger.info("Circuit breaker CLOSED for tool: %s (recovered)", name)
stats.state = CircuitState.CLOSED
stats.failures = 0
def _record_failure(self, name: str, execution_time: float):
"""Record a failed tool execution and potentially open the circuit."""
with self._lock:
stats = self._stats[name]
stats.failures += 1
stats.call_count += 1
stats.total_execution_time += execution_time
stats.last_failure_time = time.time()
if stats.state == CircuitState.HALF_OPEN or stats.failures >= self.failure_threshold:
stats.state = CircuitState.OPEN
logger.warning(
"Circuit breaker OPEN for tool: %s (failures: %d)",
name, stats.failures
)
def dispatch(self, name: str, args: dict, **kwargs) -> str:
"""Execute a tool via the registry with circuit breaker protection."""
stats = self._get_stats(name)
if stats.state == CircuitState.OPEN:
return json.dumps({
"error": (
f"Tool '{name}' is temporarily unavailable due to repeated failures. "
f"Circuit breaker is OPEN. Please try again in a few minutes or use an alternative tool."
),
"circuit_breaker": True,
"tool_name": name
})
start_time = time.time()
try:
# Dispatch to the underlying registry
result_str = registry.dispatch(name, args, **kwargs)
execution_time = time.time() - start_time
# Inspect result for errors. registry.dispatch catches internal
# exceptions and returns a JSON error string.
is_error = False
try:
# Lightweight check for error key in JSON
if '"error":' in result_str:
res_json = json.loads(result_str)
if isinstance(res_json, dict) and "error" in res_json:
is_error = True
except (json.JSONDecodeError, TypeError):
# If it's not valid JSON, it's a malformed result (error)
is_error = True
if is_error:
self._record_failure(name, execution_time)
else:
self._record_success(name, execution_time)
return result_str
except Exception as e:
# This should rarely be hit as registry.dispatch catches most things,
# but we guard against orchestrator-level or registry-level bugs.
execution_time = time.time() - start_time
self._record_failure(name, execution_time)
error_msg = f"Tool orchestrator error during {name}: {type(e).__name__}: {e}"
logger.exception(error_msg)
return json.dumps({
"error": error_msg,
"tool_name": name,
"execution_time": execution_time
})
def get_fleet_stats(self) -> Dict[str, Any]:
"""Return execution statistics for all tools."""
with self._lock:
return {
name: {
"state": s.state,
"failures": s.failures,
"successes": s.successes,
"avg_time": s.total_execution_time / s.call_count if s.call_count > 0 else 0,
"calls": s.call_count
}
for name, s in self._stats.items()
}
# Global orchestrator instance
orchestrator = ToolOrchestrator()

View File

@@ -575,6 +575,25 @@ def has_known_pricing(
return entry is not None
def get_pricing(
model_name: str,
provider: Optional[str] = None,
base_url: Optional[str] = None,
api_key: Optional[str] = None,
) -> Dict[str, float]:
"""Backward-compatible thin wrapper for legacy callers.
Returns only non-cache input/output fields when a pricing entry exists.
Unknown routes return zeroes.
"""
entry = get_pricing_entry(model_name, provider=provider, base_url=base_url, api_key=api_key)
if not entry:
return {"input": 0.0, "output": 0.0}
return {
"input": float(entry.input_cost_per_million or _ZERO),
"output": float(entry.output_cost_per_million or _ZERO),
}
def format_duration_compact(seconds: float) -> str:
if seconds < 60:

View File

@@ -1,32 +0,0 @@
---
# fleet_mtls.yml — Deploy mutual-TLS certificates to all fleet agents.
#
# Prerequisites:
# 1. Run scripts/gen_fleet_ca.sh to create the fleet CA.
# 2. For each agent, run:
# scripts/gen_agent_cert.sh --agent timmy
# scripts/gen_agent_cert.sh --agent allegro
# scripts/gen_agent_cert.sh --agent ezra
#
# Usage:
# ansible-playbook -i inventory/fleet.ini ansible/fleet_mtls.yml
#
# Inventory example (inventory/fleet.ini):
# [fleet]
# timmy.local agent_name=timmy
# allegro.local agent_name=allegro
# ezra.local agent_name=ezra
#
# Refs #806
- name: Distribute fleet mTLS certificates
hosts: fleet
become: true
vars:
_pki_base: "{{ lookup('env', 'HOME') }}/.hermes/pki"
roles:
- role: hermes_mtls
vars:
hermes_mtls_local_ca_cert: "{{ _pki_base }}/ca/fleet-ca.crt"
hermes_mtls_local_agent_cert: "{{ _pki_base }}/agents/{{ agent_name }}/{{ agent_name }}.crt"
hermes_mtls_local_agent_key: "{{ _pki_base }}/agents/{{ agent_name }}/{{ agent_name }}.key"

View File

@@ -1,12 +0,0 @@
# Example fleet inventory for mutual-TLS cert distribution.
# Copy to fleet.ini and adjust hostnames/IPs.
# Refs #806
[fleet_agents]
timmy ansible_host=192.168.1.10
allegro ansible_host=192.168.1.11
ezra ansible_host=192.168.1.12
[fleet_agents:vars]
ansible_user=hermes
ansible_python_interpreter=/usr/bin/python3

View File

@@ -1,21 +0,0 @@
---
# Default paths on the *control node* where certs are read from.
# Override these in your inventory / group_vars as needed.
# Fleet CA certificate (public; safe to push to all nodes)
fleet_mtls_ca_cert_src: "{{ lookup('env', 'HOME') }}/.hermes/pki/ca/fleet-ca.crt"
# Per-agent cert/key source dir on the control node.
# Expected layout: <fleet_mtls_agent_certs_dir>/<agent_name>/<agent_name>.{crt,key}
fleet_mtls_agent_certs_dir: "{{ lookup('env', 'HOME') }}/.hermes/pki/agents"
# Remote destination paths on the fleet node
fleet_mtls_remote_pki_dir: "/etc/hermes/pki"
fleet_mtls_remote_ca_dir: "{{ fleet_mtls_remote_pki_dir }}/ca"
fleet_mtls_remote_agent_dir: "{{ fleet_mtls_remote_pki_dir }}/agent"
# The agent name to deploy (set per-host in inventory, e.g. timmy / allegro / ezra)
fleet_mtls_agent_name: "{{ inventory_hostname_short }}"
# Hermes service name (for reload notification)
fleet_mtls_hermes_service: "hermes-a2a"

View File

@@ -1,7 +0,0 @@
---
- name: Restart hermes-a2a
ansible.builtin.systemd:
name: "{{ fleet_mtls_hermes_service }}"
state: restarted
when: ansible_service_mgr == "systemd"
ignore_errors: true # service may not exist in all environments

View File

@@ -1,17 +0,0 @@
---
galaxy_info:
role_name: fleet_mtls_certs
author: hermes-agent
description: >
Distribute fleet CA and per-agent mTLS certificates to Hermes fleet nodes.
Part of issue #806 — A2A mutual TLS between fleet agents.
min_ansible_version: "2.14"
platforms:
- name: Debian
versions: [bookworm, bullseye]
- name: Ubuntu
versions: ["22.04", "24.04"]
- name: EL
versions: ["8", "9"]
dependencies: []

View File

@@ -1,99 +0,0 @@
---
# fleet_mtls_certs/tasks/main.yml
#
# Distribute the fleet CA certificate and the per-agent TLS cert+key to
# each fleet node. Triggers a hermes-a2a service restart when any cert
# changes.
#
# Refs #806 — A2A mutual TLS between fleet agents.
- name: Verify agent cert source files exist on control node
ansible.builtin.stat:
path: "{{ item }}"
register: _src_stat
delegate_to: localhost
loop:
- "{{ fleet_mtls_ca_cert_src }}"
- "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.crt"
- "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.key"
loop_control:
label: "{{ item | basename }}"
- name: Fail if any source cert is missing
ansible.builtin.fail:
msg: >
Required cert file not found: {{ item.item }}
Run scripts/gen_fleet_ca.sh and scripts/gen_agent_cert.sh --agent {{ fleet_mtls_agent_name }} first.
when: not item.stat.exists
loop: "{{ _src_stat.results }}"
loop_control:
label: "{{ item.item | basename }}"
# -----------------------------------------------------------------------
# Remote directory structure
# -----------------------------------------------------------------------
- name: Create remote PKI directories
ansible.builtin.file:
path: "{{ item }}"
state: directory
owner: root
group: root
mode: "0750"
loop:
- "{{ fleet_mtls_remote_pki_dir }}"
- "{{ fleet_mtls_remote_ca_dir }}"
- "{{ fleet_mtls_remote_agent_dir }}"
# -----------------------------------------------------------------------
# Fleet CA certificate (public — read-only for all)
# -----------------------------------------------------------------------
- name: Deploy fleet CA certificate
ansible.builtin.copy:
src: "{{ fleet_mtls_ca_cert_src }}"
dest: "{{ fleet_mtls_remote_ca_dir }}/fleet-ca.crt"
owner: root
group: root
mode: "0644"
notify: Restart hermes-a2a
# -----------------------------------------------------------------------
# Per-agent certificate (public portion)
# -----------------------------------------------------------------------
- name: Deploy agent certificate
ansible.builtin.copy:
src: "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.crt"
dest: "{{ fleet_mtls_remote_agent_dir }}/agent.crt"
owner: root
group: root
mode: "0644"
notify: Restart hermes-a2a
# -----------------------------------------------------------------------
# Per-agent private key (secret — root-only read)
# -----------------------------------------------------------------------
- name: Deploy agent private key
ansible.builtin.copy:
src: "{{ fleet_mtls_agent_certs_dir }}/{{ fleet_mtls_agent_name }}/{{ fleet_mtls_agent_name }}.key"
dest: "{{ fleet_mtls_remote_agent_dir }}/agent.key"
owner: root
group: root
mode: "0600"
no_log: true # suppress file content from Ansible output
notify: Restart hermes-a2a
# -----------------------------------------------------------------------
# Environment file for hermes-a2a systemd unit
# -----------------------------------------------------------------------
- name: Write hermes-a2a environment file
ansible.builtin.template:
src: hermes_a2a_env.j2
dest: /etc/hermes/a2a.env
owner: root
group: root
mode: "0640"
notify: Restart hermes-a2a

View File

@@ -1,10 +0,0 @@
# Managed by Ansible — fleet_mtls_certs role
# Environment variables for the hermes-a2a systemd service.
# Source this file in the [Service] section: EnvironmentFile=/etc/hermes/a2a.env
HERMES_AGENT_NAME={{ fleet_mtls_agent_name }}
HERMES_A2A_CERT={{ fleet_mtls_remote_agent_dir }}/agent.crt
HERMES_A2A_KEY={{ fleet_mtls_remote_agent_dir }}/agent.key
HERMES_A2A_CA={{ fleet_mtls_remote_ca_dir }}/fleet-ca.crt
HERMES_A2A_HOST=0.0.0.0
HERMES_A2A_PORT=9443

View File

@@ -1,21 +0,0 @@
---
# Ansible role: hermes_mtls
# Distributes fleet mTLS certificates to Hermes agent nodes.
#
# Required variables (set in inventory / group_vars / --extra-vars):
# hermes_mtls_local_ca_cert Local path on the Ansible controller to fleet-ca.crt
# hermes_mtls_local_agent_cert Local path to this agent's .crt file
# hermes_mtls_local_agent_key Local path to this agent's .key file
#
# Optional overrides:
hermes_mtls_cert_dir: /etc/hermes/certs
hermes_mtls_cert_owner: hermes
hermes_mtls_cert_group: hermes
hermes_mtls_cert_mode: "0640"
hermes_mtls_ca_cert_mode: "0644"
# Env file that Hermes reads on startup (systemd EnvironmentFile or .env)
hermes_mtls_env_file: /etc/hermes/mtls.env
# Hermes systemd service name — restarted after cert changes
hermes_mtls_service: hermes-gateway

View File

@@ -1,7 +0,0 @@
---
- name: Restart hermes service
ansible.builtin.systemd:
name: "{{ hermes_mtls_service }}"
state: restarted
daemon_reload: true
when: ansible_service_mgr == "systemd"

View File

@@ -1,16 +0,0 @@
---
galaxy_info:
role_name: hermes_mtls
author: Hermes Fleet
description: Distribute mTLS certificates to Hermes fleet nodes for A2A authentication
license: MIT
min_ansible_version: "2.14"
platforms:
- name: Ubuntu
versions: ["22.04", "24.04"]
- name: Debian
versions: ["12"]
- name: EL
versions: ["9"]
dependencies: []

View File

@@ -1,67 +0,0 @@
---
# hermes_mtls role — distribute fleet mTLS certificates to a Hermes agent node.
#
# This role:
# 1. Creates the cert directory on the remote node
# 2. Copies the Fleet CA cert, agent cert, and agent key
# 3. Writes an env file with HERMES_MTLS_* variables
# 4. Restarts the Hermes service if any cert changed
- name: Ensure cert directory exists
ansible.builtin.file:
path: "{{ hermes_mtls_cert_dir }}"
state: directory
owner: "{{ hermes_mtls_cert_owner }}"
group: "{{ hermes_mtls_cert_group }}"
mode: "0750"
- name: Copy Fleet CA certificate
ansible.builtin.copy:
src: "{{ hermes_mtls_local_ca_cert }}"
dest: "{{ hermes_mtls_cert_dir }}/fleet-ca.crt"
owner: "{{ hermes_mtls_cert_owner }}"
group: "{{ hermes_mtls_cert_group }}"
mode: "{{ hermes_mtls_ca_cert_mode }}"
notify: Restart hermes service
- name: Copy agent TLS certificate
ansible.builtin.copy:
src: "{{ hermes_mtls_local_agent_cert }}"
dest: "{{ hermes_mtls_cert_dir }}/agent.crt"
owner: "{{ hermes_mtls_cert_owner }}"
group: "{{ hermes_mtls_cert_group }}"
mode: "{{ hermes_mtls_cert_mode }}"
notify: Restart hermes service
- name: Copy agent TLS private key
ansible.builtin.copy:
src: "{{ hermes_mtls_local_agent_key }}"
dest: "{{ hermes_mtls_cert_dir }}/agent.key"
owner: "{{ hermes_mtls_cert_owner }}"
group: "{{ hermes_mtls_cert_group }}"
mode: "0600"
notify: Restart hermes service
- name: Write mTLS environment file
ansible.builtin.template:
src: mtls.env.j2
dest: "{{ hermes_mtls_env_file }}"
owner: "{{ hermes_mtls_cert_owner }}"
group: "{{ hermes_mtls_cert_group }}"
mode: "0640"
notify: Restart hermes service
- name: Verify cert files are readable by service user
ansible.builtin.stat:
path: "{{ item }}"
loop:
- "{{ hermes_mtls_cert_dir }}/fleet-ca.crt"
- "{{ hermes_mtls_cert_dir }}/agent.crt"
- "{{ hermes_mtls_cert_dir }}/agent.key"
register: _cert_stat
- name: Assert all cert files exist
ansible.builtin.assert:
that: item.stat.exists
fail_msg: "Expected cert file missing: {{ item.item }}"
loop: "{{ _cert_stat.results }}"

View File

@@ -1,8 +0,0 @@
# Hermes mTLS environment — generated by hermes_mtls Ansible role
# Source this file or use as a systemd EnvironmentFile=
# WARNING: This file contains the path to the agent's private key.
# Restrict read access to the hermes service user.
HERMES_MTLS_CERT={{ hermes_mtls_cert_dir }}/agent.crt
HERMES_MTLS_KEY={{ hermes_mtls_cert_dir }}/agent.key
HERMES_MTLS_CA={{ hermes_mtls_cert_dir }}/fleet-ca.crt

View File

@@ -1,40 +0,0 @@
# Tool Call Benchmark: Gemma 4 vs mimo-v2-pro
Date: 2026-04-13
Status: Awaiting execution
## Test Design
100 diverse tool calls across 7 categories:
| Category | Count | Tools Tested |
|----------|-------|--------------|
| File operations | 20 | read_file, write_file, search_files |
| Terminal commands | 20 | terminal |
| Web search | 15 | web_search |
| Code execution | 15 | execute_code |
| Browser automation | 10 | browser_navigate |
| Delegation | 10 | delegate_task |
| MCP tools | 10 | mcp_* |
## Metrics
| Metric | mimo-v2-pro | Gemma 4 |
|--------|-------------|---------|
| Schema parse success | — | — |
| Tool execution success | — | — |
| Parallel tool success | — | — |
| Avg latency (s) | — | — |
| Token cost per call | — | — |
## How to Run
```bash
python3 benchmarks/tool_call_benchmark.py --model nous:xiaomi/mimo-v2-pro
python3 benchmarks/tool_call_benchmark.py --model ollama/gemma4:latest
python3 benchmarks/tool_call_benchmark.py --compare
```
## Gemma 4-Specific Failure Modes
To be documented after benchmark execution.

View File

@@ -1,194 +0,0 @@
[
{
"id": "screenshot_github_home",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
"category": "screenshot",
"expected_keywords": ["github", "logo", "mark"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "diagram_mermaid_flow",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6siSZXVhjQTlgl1nigHg5fRBOzSfebopROCu_cytObSfgLSE1ANOeZWkO2IH5upZxYot8m1hqAdpD_63WRl0xdUG1jdl9kPiOb_EWk2JBtPaiKkF4eVIYgO0EtkW-RSgC4gJ6HJYRG1UNdN0HNVd0Bftjj7X8P92qPj-F8l8T3w",
"category": "diagram",
"expected_keywords": ["flow", "diagram", "process"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_1",
"url": "https://picsum.photos/seed/vision1/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "photo_random_2",
"url": "https://picsum.photos/seed/vision2/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_simple_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Revenue',data:[100,150,200,250]}]}}",
"category": "chart",
"expected_keywords": ["bar", "chart", "revenue"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "chart_pie",
"url": "https://quickchart.io/chart?c={type:'pie',data:{labels:['A','B','C'],datasets:[{data:[30,50,20]}]}}",
"category": "chart",
"expected_keywords": ["pie", "chart", "percentage"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "diagram_org_chart",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["organization", "hierarchy", "chart"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "screenshot_terminal",
"url": "https://raw.githubusercontent.com/nicehash/nicehash-quick-start/main/images/nicehash-terminal.png",
"category": "screenshot",
"expected_keywords": ["terminal", "command", "output"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "photo_random_3",
"url": "https://picsum.photos/seed/vision3/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_line",
"url": "https://quickchart.io/chart?c={type:'line',data:{labels:['Jan','Feb','Mar','Apr'],datasets:[{label:'Temperature',data:[5,8,12,18]}]}}",
"category": "chart",
"expected_keywords": ["line", "chart", "temperature"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "diagram_sequence",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["sequence", "interaction", "message"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_4",
"url": "https://picsum.photos/seed/vision4/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "screenshot_webpage",
"url": "https://github.githubassets.com/images/modules/site/social-cards.png",
"category": "screenshot",
"expected_keywords": ["github", "page", "web"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_radar",
"url": "https://quickchart.io/chart?c={type:'radar',data:{labels:['Speed','Power','Defense','Magic'],datasets:[{label:'Hero',data:[80,60,70,90]}]}}",
"category": "chart",
"expected_keywords": ["radar", "chart", "skill"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "photo_random_5",
"url": "https://picsum.photos/seed/vision5/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "diagram_class",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["class", "object", "attribute"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "chart_doughnut",
"url": "https://quickchart.io/chart?c={type:'doughnut',data:{labels:['Desktop','Mobile','Tablet'],datasets:[{data:[60,30,10]}]}}",
"category": "chart",
"expected_keywords": ["doughnut", "chart", "device"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "photo_random_6",
"url": "https://picsum.photos/seed/vision6/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "screenshot_error",
"url": "https://http.cat/404.jpg",
"category": "screenshot",
"expected_keywords": ["404", "error", "cat"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": true}
},
{
"id": "diagram_network",
"url": "https://mermaid.ink/img/pako:eNpdkE9PwzAMxb-K5VOl7gc7sAOIIDuAw9gptnRaSJLSJttQStmXs9LCH-ymBOI1ef_42U6cUSae4IkDxbAAWtB6iuyIWyrLgXLALrPEAfFy-iCcmk-83RSjcFZ-51ac2k7AW0JqAKY9y9IcsAPzdS3jxBb5NrHUAraH_lutjbpi6oJqG7P7IPEd3-ItJsWCaO1FVYLw8qQwANsJbIt8i1AExAX0OCwjNqoa6LoPaq7oCvbHHmv5f7pVfX4K5b8mvg",
"category": "diagram",
"expected_keywords": ["network", "node", "connection"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": false}
},
{
"id": "photo_random_7",
"url": "https://picsum.photos/seed/vision7/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "chart_stacked_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['2022','2023','2024'],datasets:[{label:'Cloud',data:[100,150,200]},{label:'On-prem',data:[200,180,160]}]},options:{scales:{x:{stacked:true},y:{stacked:true}}}}",
"category": "chart",
"expected_keywords": ["stacked", "bar", "chart"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 50, "min_sentences": 2, "has_numbers": true}
},
{
"id": "screenshot_dashboard",
"url": "https://github.githubassets.com/images/modules/site/features-code-search.png",
"category": "screenshot",
"expected_keywords": ["search", "code", "feature"],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
},
{
"id": "photo_random_8",
"url": "https://picsum.photos/seed/vision8/400/300",
"category": "photo",
"expected_keywords": [],
"ground_truth_ocr": "",
"expected_structure": {"min_length": 30, "min_sentences": 1, "has_numbers": false}
}
]

View File

@@ -1,614 +0,0 @@
#!/usr/bin/env python3
"""
Tool-Calling Benchmark — Gemma 4 vs mimo-v2-pro regression test.
Runs 100 diverse tool-calling prompts through multiple models and compares
success rates, latency, and token costs.
Usage:
python3 benchmarks/tool_call_benchmark.py # full 100-call suite
python3 benchmarks/tool_call_benchmark.py --limit 10 # quick smoke test
python3 benchmarks/tool_call_benchmark.py --models nous # single model
python3 benchmarks/tool_call_benchmark.py --category file # single category
Requires: hermes-agent venv activated, OPENROUTER_API_KEY or equivalent.
"""
import argparse
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
# Ensure hermes-agent root is importable
REPO_ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(REPO_ROOT))
# ---------------------------------------------------------------------------
# Test Definitions
# ---------------------------------------------------------------------------
@dataclass
class ToolCall:
"""A single tool-calling test case."""
id: str
category: str
prompt: str
expected_tool: str # tool name we expect the model to call
expected_params_check: str = "" # substring expected in JSON args
timeout: int = 30 # max seconds per call
notes: str = ""
# fmt: off
SUITE: list[ToolCall] = [
# ── File Operations (20) ──────────────────────────────────────────────
ToolCall("file-01", "file", "Read the file /tmp/test_bench.txt and show me its contents.",
"read_file", "path"),
ToolCall("file-02", "file", "Write 'hello benchmark' to /tmp/test_bench_out.txt",
"write_file", "path"),
ToolCall("file-03", "file", "Search for the word 'import' in all Python files in the current directory.",
"search_files", "pattern"),
ToolCall("file-04", "file", "Read lines 1-20 of /etc/hosts",
"read_file", "offset"),
ToolCall("file-05", "file", "Patch /tmp/test_bench_out.txt: replace 'hello' with 'goodbye'",
"patch", "old_string"),
ToolCall("file-06", "file", "Search for files matching *.py in the current directory.",
"search_files", "target"),
ToolCall("file-07", "file", "Read the first 10 lines of /etc/passwd",
"read_file", "limit"),
ToolCall("file-08", "file", "Write a JSON config to /tmp/bench_config.json with key 'debug': true",
"write_file", "content"),
ToolCall("file-09", "file", "Search for 'def test_' in Python test files.",
"search_files", "file_glob"),
ToolCall("file-10", "file", "Read /tmp/bench_config.json and tell me what's in it.",
"read_file", "bench_config"),
ToolCall("file-11", "file", "Create a file /tmp/bench_readme.md with one line: '# Benchmark'",
"write_file", "bench_readme"),
ToolCall("file-12", "file", "Search for 'TODO' comments in all .py files.",
"search_files", "TODO"),
ToolCall("file-13", "file", "Read /tmp/bench_readme.md",
"read_file", "bench_readme"),
ToolCall("file-14", "file", "Patch /tmp/bench_readme.md: replace '# Benchmark' with '# Tool Benchmark'",
"patch", "Tool Benchmark"),
ToolCall("file-15", "file", "Write a Python one-liner to /tmp/bench_hello.py that prints hello.",
"write_file", "bench_hello"),
ToolCall("file-16", "file", "Search for all .json files in /tmp/.",
"search_files", "json"),
ToolCall("file-17", "file", "Read /tmp/bench_hello.py and verify it has print('hello').",
"read_file", "bench_hello"),
ToolCall("file-18", "file", "Patch /tmp/bench_hello.py to print 'hello world' instead of 'hello'.",
"patch", "hello world"),
ToolCall("file-19", "file", "List files matching 'bench*' in /tmp/.",
"search_files", "bench"),
ToolCall("file-20", "file", "Read /tmp/test_bench.txt again and summarize its contents.",
"read_file", "test_bench"),
# ── Terminal Commands (20) ────────────────────────────────────────────
ToolCall("term-01", "terminal", "Run `echo hello world` in the terminal.",
"terminal", "echo"),
ToolCall("term-02", "terminal", "Run `date` to get the current date and time.",
"terminal", "date"),
ToolCall("term-03", "terminal", "Run `uname -a` to get system information.",
"terminal", "uname"),
ToolCall("term-04", "terminal", "Run `pwd` to show the current directory.",
"terminal", "pwd"),
ToolCall("term-05", "terminal", "Run `ls -la /tmp/ | head -20` to list temp files.",
"terminal", "head"),
ToolCall("term-06", "terminal", "Run `whoami` to show the current user.",
"terminal", "whoami"),
ToolCall("term-07", "terminal", "Run `df -h` to show disk usage.",
"terminal", "df"),
ToolCall("term-08", "terminal", "Run `python3 --version` to check Python version.",
"terminal", "python3"),
ToolCall("term-09", "terminal", "Run `cat /etc/hostname` to get the hostname.",
"terminal", "hostname"),
ToolCall("term-10", "terminal", "Run `uptime` to see system uptime.",
"terminal", "uptime"),
ToolCall("term-11", "terminal", "Run `env | grep PATH` to show the PATH variable.",
"terminal", "PATH"),
ToolCall("term-12", "terminal", "Run `wc -l /etc/passwd` to count lines.",
"terminal", "wc"),
ToolCall("term-13", "terminal", "Run `echo $SHELL` to show the current shell.",
"terminal", "SHELL"),
ToolCall("term-14", "terminal", "Run `free -h || vm_stat` to check memory usage.",
"terminal", "memory"),
ToolCall("term-15", "terminal", "Run `id` to show user and group IDs.",
"terminal", "id"),
ToolCall("term-16", "terminal", "Run `hostname` to get the machine hostname.",
"terminal", "hostname"),
ToolCall("term-17", "terminal", "Run `echo {1..5}` to test brace expansion.",
"terminal", "echo"),
ToolCall("term-18", "terminal", "Run `seq 1 5` to generate a number sequence.",
"terminal", "seq"),
ToolCall("term-19", "terminal", "Run `python3 -c 'print(2+2)'` to compute 2+2.",
"terminal", "print"),
ToolCall("term-20", "terminal", "Run `ls -d /tmp/bench* 2>/dev/null | wc -l` to count bench files.",
"terminal", "wc"),
# ── Code Execution (15) ──────────────────────────────────────────────
ToolCall("code-01", "code", "Execute a Python script that computes factorial of 10.",
"execute_code", "factorial"),
ToolCall("code-02", "code", "Run Python to read /tmp/test_bench.txt and count its words.",
"execute_code", "words"),
ToolCall("code-03", "code", "Execute Python to generate the first 20 Fibonacci numbers.",
"execute_code", "fibonacci"),
ToolCall("code-04", "code", "Run Python to parse JSON from a string and print keys.",
"execute_code", "json"),
ToolCall("code-05", "code", "Execute Python to list all files in /tmp/ matching 'bench*'.",
"execute_code", "glob"),
ToolCall("code-06", "code", "Run Python to compute the sum of squares from 1 to 100.",
"execute_code", "sum"),
ToolCall("code-07", "code", "Execute Python to check if 'racecar' is a palindrome.",
"execute_code", "palindrome"),
ToolCall("code-08", "code", "Run Python to create a CSV string with 5 rows of sample data.",
"execute_code", "csv"),
ToolCall("code-09", "code", "Execute Python to sort a list [5,2,8,1,9] and print the result.",
"execute_code", "sort"),
ToolCall("code-10", "code", "Run Python to count lines in /etc/passwd.",
"execute_code", "passwd"),
ToolCall("code-11", "code", "Execute Python to hash the string 'benchmark' with SHA256.",
"execute_code", "sha256"),
ToolCall("code-12", "code", "Run Python to get the current UTC timestamp.",
"execute_code", "utcnow"),
ToolCall("code-13", "code", "Execute Python to convert 'hello world' to uppercase and reverse it.",
"execute_code", "upper"),
ToolCall("code-14", "code", "Run Python to create a dictionary of system info (platform, python version).",
"execute_code", "sys"),
ToolCall("code-15", "code", "Execute Python to check internet connectivity by resolving google.com.",
"execute_code", "socket"),
# ── Delegation (10) ──────────────────────────────────────────────────
ToolCall("deleg-01", "delegate", "Use a subagent to find all .log files in /tmp/.",
"delegate_task", "log"),
ToolCall("deleg-02", "delegate", "Delegate to a subagent: what is 15 * 37?",
"delegate_task", "15"),
ToolCall("deleg-03", "delegate", "Use a subagent to check if Python 3 is installed and its version.",
"delegate_task", "python"),
ToolCall("deleg-04", "delegate", "Delegate: read /tmp/test_bench.txt and summarize it in one sentence.",
"delegate_task", "summarize"),
ToolCall("deleg-05", "delegate", "Use a subagent to list the contents of /tmp/ directory.",
"delegate_task", "tmp"),
ToolCall("deleg-06", "delegate", "Delegate: count the number of .py files in the current directory.",
"delegate_task", ".py"),
ToolCall("deleg-07", "delegate", "Use a subagent to check disk space with df -h.",
"delegate_task", "df"),
ToolCall("deleg-08", "delegate", "Delegate: what OS are we running on?",
"delegate_task", "os"),
ToolCall("deleg-09", "delegate", "Use a subagent to find the hostname of this machine.",
"delegate_task", "hostname"),
ToolCall("deleg-10", "delegate", "Delegate: create a temp file /tmp/bench_deleg.txt with 'done'.",
"delegate_task", "write"),
# ── Todo / Memory (10 — replacing web/browser/MCP which need external services) ──
ToolCall("todo-01", "todo", "Add a todo item: 'Run benchmark suite'",
"todo", "benchmark"),
ToolCall("todo-02", "todo", "Show me the current todo list.",
"todo", ""),
ToolCall("todo-03", "todo", "Mark the first todo item as completed.",
"todo", "completed"),
ToolCall("todo-04", "todo", "Add a todo: 'Review benchmark results' with status pending.",
"todo", "Review"),
ToolCall("todo-05", "todo", "Clear all completed todos.",
"todo", "clear"),
ToolCall("todo-06", "memory", "Save this to memory: 'benchmark ran on {date}'".format(
date=datetime.now().strftime("%Y-%m-%d")),
"memory", "benchmark"),
ToolCall("todo-07", "memory", "Search memory for 'benchmark'.",
"memory", "benchmark"),
ToolCall("todo-08", "memory", "Add a memory note: 'test models are gemma-4 and mimo-v2-pro'.",
"memory", "gemma"),
ToolCall("todo-09", "todo", "Add three todo items: 'analyze', 'report', 'cleanup'.",
"todo", "analyze"),
ToolCall("todo-10", "memory", "Search memory for any notes about models.",
"memory", "model"),
# ── Skills (10 — replacing MCP tools which need servers) ─────────────
ToolCall("skill-01", "skills", "List all available skills.",
"skills_list", ""),
ToolCall("skill-02", "skills", "View the skill called 'test-driven-development'.",
"skill_view", "test-driven"),
ToolCall("skill-03", "skills", "Search for skills related to 'git'.",
"skills_list", "git"),
ToolCall("skill-04", "skills", "View the 'code-review' skill.",
"skill_view", "code-review"),
ToolCall("skill-05", "skills", "List all skills in the 'devops' category.",
"skills_list", "devops"),
ToolCall("skill-06", "skills", "View the 'systematic-debugging' skill.",
"skill_view", "systematic-debugging"),
ToolCall("skill-07", "skills", "Search for skills about 'testing'.",
"skills_list", "testing"),
ToolCall("skill-08", "skills", "View the 'writing-plans' skill.",
"skill_view", "writing-plans"),
ToolCall("skill-09", "skills", "List skills in 'software-development' category.",
"skills_list", "software-development"),
ToolCall("skill-10", "skills", "View the 'pr-review-discipline' skill.",
"skill_view", "pr-review"),
# ── Additional tests to reach 100 ────────────────────────────────────
ToolCall("file-21", "file", "Write a Python snippet to /tmp/bench_sort.py that sorts [3,1,2].",
"write_file", "bench_sort"),
ToolCall("file-22", "file", "Read /tmp/bench_sort.py back and confirm it exists.",
"read_file", "bench_sort"),
ToolCall("file-23", "file", "Search for 'class' in all .py files in the benchmarks directory.",
"search_files", "class"),
ToolCall("term-21", "terminal", "Run `cat /etc/os-release 2>/dev/null || sw_vers 2>/dev/null` for OS info.",
"terminal", "os"),
ToolCall("term-22", "terminal", "Run `nproc 2>/dev/null || sysctl -n hw.ncpu 2>/dev/null` for CPU count.",
"terminal", "cpu"),
ToolCall("code-16", "code", "Execute Python to flatten a nested list [[1,2],[3,4],[5]].",
"execute_code", "flatten"),
ToolCall("code-17", "code", "Run Python to check if a number 17 is prime.",
"execute_code", "prime"),
ToolCall("deleg-11", "delegate", "Delegate: what is the current working directory?",
"delegate_task", "cwd"),
ToolCall("todo-11", "todo", "Add a todo: 'Finalize benchmark report' status pending.",
"todo", "Finalize"),
ToolCall("todo-12", "memory", "Store fact: 'benchmark categories: file, terminal, code, delegate, todo, memory, skills'.",
"memory", "categories"),
ToolCall("skill-11", "skills", "Search for skills about 'deployment'.",
"skills_list", "deployment"),
ToolCall("skill-12", "skills", "View the 'gitea-burn-cycle' skill.",
"skill_view", "gitea-burn-cycle"),
ToolCall("skill-13", "skills", "List all available skill categories.",
"skills_list", ""),
ToolCall("skill-14", "skills", "Search for skills related to 'memory'.",
"skills_list", "memory"),
ToolCall("skill-15", "skills", "View the 'mimo-swarm' skill.",
"skill_view", "mimo-swarm"),
]
# fmt: on
# ---------------------------------------------------------------------------
# Runner
# ---------------------------------------------------------------------------
@dataclass
class CallResult:
test_id: str
category: str
model: str
prompt: str
expected_tool: str
success: bool
tool_called: Optional[str] = None
tool_args_valid: bool = False
execution_ok: bool = False
latency_s: float = 0.0
error: str = ""
raw_response: str = ""
@dataclass
class ModelStats:
model: str
total: int = 0
schema_ok: int = 0 # model produced valid tool call JSON
exec_ok: int = 0 # tool actually ran without error
latency_sum: float = 0.0
failures: list = field(default_factory=list)
@property
def schema_pct(self) -> float:
return (self.schema_ok / self.total * 100) if self.total else 0
@property
def exec_pct(self) -> float:
return (self.exec_ok / self.total * 100) if self.total else 0
@property
def avg_latency(self) -> float:
return (self.latency_sum / self.total) if self.total else 0
def setup_test_files():
"""Create prerequisite files for the benchmark."""
Path("/tmp/test_bench.txt").write_text(
"This is a benchmark test file.\n"
"It contains sample data for tool-calling tests.\n"
"Line three has some import statements.\n"
"import os\nimport sys\nimport json\n"
"End of test data.\n"
)
def run_single_test(tc: ToolCall, model_spec: str, provider: str) -> CallResult:
"""Run a single tool-calling test through the agent."""
from run_agent import AIAgent
result = CallResult(
test_id=tc.id,
category=tc.category,
model=model_spec,
prompt=tc.prompt,
expected_tool=tc.expected_tool,
success=False,
)
try:
agent = AIAgent(
model=model_spec,
provider=provider,
max_iterations=3,
quiet_mode=True,
skip_context_files=True,
skip_memory=True,
persist_session=False,
)
t0 = time.time()
conv = agent.run_conversation(
user_message=tc.prompt,
system_message=(
"You are a benchmark test runner. Execute the user's request by calling "
"the appropriate tool. Return the tool result directly. Do not add commentary."
),
)
result.latency_s = round(time.time() - t0, 2)
messages = conv.get("messages", [])
# Find the first assistant message with tool_calls
tool_called = None
tool_args_str = ""
for msg in messages:
if msg.get("role") == "assistant" and msg.get("tool_calls"):
for tc_item in msg["tool_calls"]:
fn = tc_item.get("function", {})
tool_called = fn.get("name", "")
tool_args_str = fn.get("arguments", "{}")
break
break
if tool_called:
result.tool_called = tool_called
result.schema_ok = True
# Check if the right tool was called
if tool_called == tc.expected_tool:
result.success = True
# Check if args contain expected substring
if tc.expected_params_check:
result.tool_args_valid = tc.expected_params_check in tool_args_str
else:
result.tool_args_valid = True
# Check if tool executed (look for tool role message)
for msg in messages:
if msg.get("role") == "tool":
content = msg.get("content", "")
if content and "error" not in content.lower()[:50]:
result.execution_ok = True
break
elif content:
result.execution_ok = True # got a response, even if error
break
else:
# No tool call produced — still check if model responded
final = conv.get("final_response", "")
result.raw_response = final[:200] if final else ""
except Exception as e:
result.error = f"{type(e).__name__}: {str(e)[:200]}"
result.latency_s = round(time.time() - t0, 2) if 't0' in dir() else 0
return result
def generate_report(results: list[CallResult], models: list[str], output_path: Path):
"""Generate markdown benchmark report."""
now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
# Aggregate per model
stats: dict[str, ModelStats] = {}
for m in models:
stats[m] = ModelStats(model=m)
by_category: dict[str, dict[str, list[CallResult]]] = {}
for r in results:
s = stats[r.model]
s.total += 1
s.schema_ok += int(r.schema_ok)
s.exec_ok += int(r.execution_ok)
s.latency_sum += r.latency_s
if not r.success:
s.failures.append(r)
by_category.setdefault(r.category, {}).setdefault(r.model, []).append(r)
lines = [
f"# Tool-Calling Benchmark Report",
f"",
f"Generated: {now}",
f"Suite: {len(SUITE)} calls across {len(set(tc.category for tc in SUITE))} categories",
f"Models tested: {', '.join(models)}",
f"",
f"## Summary",
f"",
f"| Metric | {' | '.join(models)} |",
f"|--------|{'|'.join('---------' for _ in models)}|",
]
# Schema parse success
row = "| Schema parse success | "
for m in models:
s = stats[m]
row += f"{s.schema_ok}/{s.total} ({s.schema_pct:.0f}%) | "
lines.append(row)
# Tool execution success
row = "| Tool execution success | "
for m in models:
s = stats[m]
row += f"{s.exec_ok}/{s.total} ({s.exec_pct:.0f}%) | "
lines.append(row)
# Correct tool selected
row = "| Correct tool selected | "
for m in models:
s = stats[m]
correct = sum(1 for r in results if r.model == m and r.success)
pct = (correct / s.total * 100) if s.total else 0
row += f"{correct}/{s.total} ({pct:.0f}%) | "
lines.append(row)
# Avg latency
row = "| Avg latency (s) | "
for m in models:
s = stats[m]
row += f"{s.avg_latency:.2f} | "
lines.append(row)
lines.append("")
# Per-category breakdown
lines.append("## Per-Category Breakdown")
lines.append("")
for cat in sorted(by_category.keys()):
lines.append(f"### {cat.title()}")
lines.append("")
lines.append(f"| Metric | {' | '.join(models)} |")
lines.append(f"|--------|{'|'.join('---------' for _ in models)}|")
cat_data = by_category[cat]
for metric_name, fn in [
("Schema OK", lambda r: r.schema_ok),
("Exec OK", lambda r: r.execution_ok),
("Correct tool", lambda r: r.success),
]:
row = f"| {metric_name} | "
for m in models:
results_m = cat_data.get(m, [])
total = len(results_m)
ok = sum(1 for r in results_m if fn(r))
pct = (ok / total * 100) if total else 0
row += f"{ok}/{total} ({pct:.0f}%) | "
lines.append(row)
lines.append("")
# Failure analysis
lines.append("## Failure Analysis")
lines.append("")
any_failures = False
for m in models:
s = stats[m]
if s.failures:
any_failures = True
lines.append(f"### {m}{len(s.failures)} failures")
lines.append("")
lines.append("| Test | Category | Expected | Got | Error |")
lines.append("|------|----------|----------|-----|-------|")
for r in s.failures:
got = r.tool_called or "none"
err = r.error or "wrong tool"
lines.append(f"| {r.test_id} | {r.category} | {r.expected_tool} | {got} | {err[:60]} |")
lines.append("")
if not any_failures:
lines.append("No failures detected.")
lines.append("")
# Raw results JSON
lines.append("## Raw Results")
lines.append("")
lines.append("```json")
lines.append(json.dumps([asdict(r) for r in results], indent=2, default=str))
lines.append("```")
report = "\n".join(lines)
output_path.write_text(report)
return report
def main():
parser = argparse.ArgumentParser(description="Tool-calling benchmark")
parser.add_argument("--models", nargs="+",
default=["nous:gia-3/gemma-4-31b", "nous:mimo-v2-pro"],
help="Model specs to test (provider:model)")
parser.add_argument("--limit", type=int, default=0,
help="Run only first N tests (0 = all)")
parser.add_argument("--category", type=str, default="",
help="Run only tests in this category")
parser.add_argument("--output", type=str, default="",
help="Output report path (default: benchmarks/gemma4-tool-calling-YYYY-MM-DD.md)")
parser.add_argument("--dry-run", action="store_true",
help="Print test cases without running them")
args = parser.parse_args()
# Filter suite
suite = SUITE[:]
if args.category:
suite = [tc for tc in suite if tc.category == args.category]
if args.limit > 0:
suite = suite[:args.limit]
if args.dry_run:
print(f"Would run {len(suite)} tests:")
for tc in suite:
print(f" [{tc.category:8s}] {tc.id}: {tc.expected_tool}{tc.prompt[:60]}")
return
# Setup
setup_test_files()
date_str = datetime.now().strftime("%Y-%m-%d")
output_path = Path(args.output) if args.output else REPO_ROOT / "benchmarks" / f"gemma4-tool-calling-{date_str}.md"
# Parse model specs
model_specs = []
for spec in args.models:
parts = spec.split(":", 1)
provider = parts[0]
model_name = parts[1] if len(parts) > 1 else parts[0]
model_specs.append((provider, model_name, spec))
print(f"Benchmark: {len(suite)} tests × {len(model_specs)} models = {len(suite) * len(model_specs)} calls")
print(f"Output: {output_path}")
print()
all_results: list[CallResult] = []
for provider, model_name, full_spec in model_specs:
print(f"── {full_spec} {'' * (50 - len(full_spec))}")
model_results = []
for i, tc in enumerate(suite, 1):
sys.stdout.write(f"\r [{i:3d}/{len(suite)}] {tc.id:10s} {tc.category:8s}{tc.expected_tool:20s}")
sys.stdout.flush()
r = run_single_test(tc, full_spec, provider)
model_results.append(r)
status = "" if r.success else ""
sys.stdout.write(f" {status} ({r.latency_s:.1f}s)")
sys.stdout.write("\n")
all_results.extend(model_results)
# Quick stats
ok = sum(1 for r in model_results if r.success)
print(f" Result: {ok}/{len(model_results)} correct tool selected ({ok/len(model_results)*100:.0f}%)")
print()
# Generate report
model_names = [spec for _, _, spec in model_specs]
report = generate_report(all_results, model_names, output_path)
print(f"Report written to {output_path}")
# Exit code: 0 if all pass, 1 if any failures
total_fail = sum(1 for r in all_results if not r.success)
sys.exit(1 if total_fail > 0 else 0)
if __name__ == "__main__":
main()

View File

@@ -1,635 +0,0 @@
#!/usr/bin/env python3
"""
Vision Benchmark Suite — Issue #817
Compares Gemma 4 vision accuracy vs current approach (Gemini 3 Flash Preview).
Measures OCR accuracy, description quality, latency, and token usage.
Usage:
# Run full benchmark
python benchmarks/vision_benchmark.py --images benchmarks/test_images.json
# Single image test
python benchmarks/vision_benchmark.py --url https://example.com/image.png
# Generate test report
python benchmarks/vision_benchmark.py --images benchmarks/test_images.json --output benchmarks/vision_results.json
Test image dataset: benchmarks/test_images.json (50-100 diverse images)
"""
import argparse
import asyncio
import base64
import json
import os
import statistics
import sys
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional
# ---------------------------------------------------------------------------
# Benchmark configuration
# ---------------------------------------------------------------------------
# Models to compare
MODELS = {
"gemma4": {
"model_id": "google/gemma-4-27b-it",
"display_name": "Gemma 4 27B",
"provider": "nous",
"description": "Google's multimodal Gemma 4 model",
},
"gemini3_flash": {
"model_id": "google/gemini-3-flash-preview",
"display_name": "Gemini 3 Flash Preview",
"provider": "openrouter",
"description": "Current default vision model",
},
}
# Evaluation prompts for different test categories
EVAL_PROMPTS = {
"screenshot": "Describe this screenshot in detail. What application is shown? What is the current state of the UI?",
"diagram": "Describe this diagram completely. What concepts does it illustrate? List all components and their relationships.",
"photo": "Describe this photo in detail. What objects are visible? What is the scene?",
"ocr": "Extract ALL text visible in this image. Return it exactly as written, preserving formatting.",
"chart": "What data does this chart show? List all axes labels, values, and key trends.",
"document": "Extract all text from this document image. Preserve paragraph structure.",
}
# ---------------------------------------------------------------------------
# Vision model interface
# ---------------------------------------------------------------------------
async def analyze_with_model(
image_url: str,
prompt: str,
model_config: dict,
timeout: float = 120.0,
) -> dict:
"""Call a vision model and return structured results.
Returns dict with:
- analysis: str
- latency_ms: float
- tokens: dict (prompt_tokens, completion_tokens, total_tokens)
- success: bool
- error: str (if failed)
"""
import httpx
provider = model_config["provider"]
model_id = model_config["model_id"]
# Prepare messages
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": image_url}},
],
}
]
# Route to provider
if provider == "openrouter":
api_url = "https://openrouter.ai/api/v1/chat/completions"
api_key = os.getenv("OPENROUTER_API_KEY", "")
elif provider == "nous":
api_url = "https://inference.nousresearch.com/v1/chat/completions"
api_key = os.getenv("NOUS_API_KEY", "") or os.getenv("NOUS_INFERENCE_API_KEY", "")
else:
api_url = os.getenv(f"{provider.upper()}_API_URL", "")
api_key = os.getenv(f"{provider.upper()}_API_KEY", "")
if not api_key:
return {
"analysis": "",
"latency_ms": 0,
"tokens": {},
"success": False,
"error": f"No API key for provider {provider}",
}
headers = {
"Authorization": f"Bearer {api_key}",
"Content-Type": "application/json",
}
payload = {
"model": model_id,
"messages": messages,
"max_tokens": 2000,
"temperature": 0.1,
}
start = time.perf_counter()
try:
async with httpx.AsyncClient(timeout=timeout) as client:
resp = await client.post(api_url, json=payload, headers=headers)
resp.raise_for_status()
data = resp.json()
latency_ms = (time.perf_counter() - start) * 1000
analysis = ""
choices = data.get("choices", [])
if choices:
msg = choices[0].get("message", {})
analysis = msg.get("content", "")
usage = data.get("usage", {})
tokens = {
"prompt_tokens": usage.get("prompt_tokens", 0),
"completion_tokens": usage.get("completion_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}
return {
"analysis": analysis,
"latency_ms": round(latency_ms, 1),
"tokens": tokens,
"success": True,
"error": "",
}
except Exception as e:
return {
"analysis": "",
"latency_ms": round((time.perf_counter() - start) * 1000, 1),
"tokens": {},
"success": False,
"error": str(e),
}
# ---------------------------------------------------------------------------
# Evaluation metrics
# ---------------------------------------------------------------------------
def compute_ocr_accuracy(extracted: str, ground_truth: str) -> float:
"""Compute OCR accuracy using character-level Levenshtein ratio.
Returns 0.0-1.0 (1.0 = perfect match).
"""
if not ground_truth:
return 1.0 if not extracted else 0.0
if not extracted:
return 0.0
# Normalized Levenshtein similarity
extracted_lower = extracted.lower().strip()
truth_lower = ground_truth.lower().strip()
# Simple character overlap ratio (fast proxy)
max_len = max(len(extracted_lower), len(truth_lower))
if max_len == 0:
return 1.0
# Count matching characters at matching positions
matches = sum(1 for a, b in zip(extracted_lower, truth_lower) if a == b)
position_ratio = matches / max_len
# Also check word-level overlap
extracted_words = set(extracted_lower.split())
truth_words = set(truth_lower.split())
if truth_words:
word_recall = len(extracted_words & truth_words) / len(truth_words)
else:
word_recall = 1.0 if not extracted_words else 0.0
return round((position_ratio * 0.4 + word_recall * 0.6), 4)
def compute_description_completeness(analysis: str, expected_keywords: list) -> float:
"""Score description completeness based on keyword coverage.
Returns 0.0-1.0.
"""
if not expected_keywords:
return 1.0
if not analysis:
return 0.0
analysis_lower = analysis.lower()
found = sum(1 for kw in expected_keywords if kw.lower() in analysis_lower)
return round(found / len(expected_keywords), 4)
def compute_structural_accuracy(analysis: str, expected_structure: dict) -> dict:
"""Evaluate structural elements of the analysis.
Returns dict with per-element scores.
"""
scores = {}
# Length check
min_length = expected_structure.get("min_length", 50)
scores["length"] = min(len(analysis) / min_length, 1.0) if min_length > 0 else 1.0
# Sentence count
min_sentences = expected_structure.get("min_sentences", 2)
sentence_count = analysis.count(".") + analysis.count("!") + analysis.count("?")
scores["sentences"] = min(sentence_count / max(min_sentences, 1), 1.0)
# Has specifics (numbers, names, etc.)
if expected_structure.get("has_numbers", False):
import re
scores["has_numbers"] = 1.0 if re.search(r'\d', analysis) else 0.0
return scores
# ---------------------------------------------------------------------------
# Benchmark runner
# ---------------------------------------------------------------------------
async def run_single_test(
image: dict,
models: dict,
runs_per_model: int = 1,
) -> dict:
"""Run a single image through all models.
Args:
image: dict with url, category, expected_keywords, ground_truth_ocr, etc.
models: dict of model configs to test
runs_per_model: number of runs per model (for consistency testing)
Returns dict with results per model.
"""
category = image.get("category", "photo")
prompt = EVAL_PROMPTS.get(category, EVAL_PROMPTS["photo"])
url = image["url"]
results = {}
for model_name, model_config in models.items():
runs = []
for run_i in range(runs_per_model):
result = await analyze_with_model(url, prompt, model_config)
runs.append(result)
if run_i < runs_per_model - 1:
await asyncio.sleep(1) # Rate limit courtesy
# Aggregate
successful = [r for r in runs if r["success"]]
if successful:
avg_latency = statistics.mean(r["latency_ms"] for r in successful)
avg_tokens = statistics.mean(
r["tokens"].get("total_tokens", 0) for r in successful
)
# Use first successful run for accuracy metrics
primary = successful[0]
# Compute accuracy
ocr_score = None
if image.get("ground_truth_ocr"):
ocr_score = compute_ocr_accuracy(
primary["analysis"], image["ground_truth_ocr"]
)
keyword_score = None
if image.get("expected_keywords"):
keyword_score = compute_description_completeness(
primary["analysis"], image["expected_keywords"]
)
structural = compute_structural_accuracy(
primary["analysis"], image.get("expected_structure", {})
)
results[model_name] = {
"success": True,
"analysis_preview": primary["analysis"][:300],
"analysis_length": len(primary["analysis"]),
"avg_latency_ms": round(avg_latency, 1),
"avg_tokens": round(avg_tokens, 1),
"ocr_accuracy": ocr_score,
"keyword_completeness": keyword_score,
"structural_scores": structural,
"consistency": round(
statistics.stdev(len(r["analysis"]) for r in successful), 1
) if len(successful) > 1 else 0.0,
"runs": len(successful),
"errors": len(runs) - len(successful),
}
else:
results[model_name] = {
"success": False,
"error": runs[0]["error"] if runs else "No runs",
"runs": 0,
"errors": len(runs),
}
return results
async def run_benchmark_suite(
images: List[dict],
models: dict,
runs_per_model: int = 1,
) -> dict:
"""Run the full benchmark suite.
Args:
images: list of image test cases
models: model configs to compare
runs_per_model: consistency runs per image
Returns structured benchmark report.
"""
total = len(images)
all_results = []
print(f"\nRunning vision benchmark: {total} images x {len(models)} models x {runs_per_model} runs")
print(f"Models: {', '.join(m['display_name'] for m in models.values())}\n")
for i, image in enumerate(images):
img_id = image.get("id", f"img_{i}")
category = image.get("category", "unknown")
print(f" [{i+1}/{total}] {img_id} ({category})...", end=" ", flush=True)
result = await run_single_test(image, models, runs_per_model)
result["image_id"] = img_id
result["category"] = category
all_results.append(result)
# Quick status
statuses = []
for mname in models:
if result[mname]["success"]:
lat = result[mname]["avg_latency_ms"]
statuses.append(f"{mname}:{lat:.0f}ms")
else:
statuses.append(f"{mname}:FAIL")
print(", ".join(statuses))
# Aggregate statistics
summary = aggregate_results(all_results, models)
return {
"generated_at": datetime.now(timezone.utc).isoformat(),
"config": {
"total_images": total,
"runs_per_model": runs_per_model,
"models": {k: v["display_name"] for k, v in models.items()},
},
"results": all_results,
"summary": summary,
}
def aggregate_results(results: List[dict], models: dict) -> dict:
"""Compute aggregate statistics across all test images."""
summary = {}
for model_name in models:
model_results = [r[model_name] for r in results if r[model_name]["success"]]
failed = [r[model_name] for r in results if not r[model_name]["success"]]
if not model_results:
summary[model_name] = {"success_rate": 0, "error": "All runs failed"}
continue
latencies = [r["avg_latency_ms"] for r in model_results]
tokens = [r["avg_tokens"] for r in model_results if r.get("avg_tokens")]
ocr_scores = [r["ocr_accuracy"] for r in model_results if r.get("ocr_accuracy") is not None]
keyword_scores = [r["keyword_completeness"] for r in model_results if r.get("keyword_completeness") is not None]
summary[model_name] = {
"success_rate": round(len(model_results) / (len(model_results) + len(failed)), 4),
"total_runs": len(model_results),
"total_failures": len(failed),
"latency": {
"mean_ms": round(statistics.mean(latencies), 1),
"median_ms": round(statistics.median(latencies), 1),
"p95_ms": round(sorted(latencies)[int(len(latencies) * 0.95)], 1),
"std_ms": round(statistics.stdev(latencies), 1) if len(latencies) > 1 else 0,
},
"tokens": {
"mean_total": round(statistics.mean(tokens), 1) if tokens else 0,
"total_used": sum(int(t) for t in tokens),
},
"accuracy": {
"ocr_mean": round(statistics.mean(ocr_scores), 4) if ocr_scores else None,
"ocr_count": len(ocr_scores),
"keyword_mean": round(statistics.mean(keyword_scores), 4) if keyword_scores else None,
"keyword_count": len(keyword_scores),
},
}
return summary
# ---------------------------------------------------------------------------
# Report generation
# ---------------------------------------------------------------------------
def to_markdown(report: dict) -> str:
"""Generate human-readable markdown report."""
summary = report["summary"]
config = report["config"]
model_names = list(config["models"].values())
lines = [
"# Vision Benchmark Report",
"",
f"Generated: {report['generated_at'][:16]}",
f"Images tested: {config['total_images']}",
f"Runs per model: {config['runs_per_model']}",
f"Models: {', '.join(model_names)}",
"",
"## Latency Comparison",
"",
"| Model | Mean (ms) | Median | P95 | Std Dev |",
"|-------|-----------|--------|-----|---------|",
]
for mkey, mname in config["models"].items():
if mkey in summary and "latency" in summary[mkey]:
lat = summary[mkey]["latency"]
lines.append(
f"| {mname} | {lat['mean_ms']:.0f} | {lat['median_ms']:.0f} | "
f"{lat['p95_ms']:.0f} | {lat['std_ms']:.0f} |"
)
lines += [
"",
"## Accuracy Comparison",
"",
"| Model | OCR Accuracy | Keyword Coverage | Success Rate |",
"|-------|-------------|-----------------|--------------|",
]
for mkey, mname in config["models"].items():
if mkey in summary and "accuracy" in summary[mkey]:
acc = summary[mkey]["accuracy"]
sr = summary[mkey].get("success_rate", 0)
ocr = f"{acc['ocr_mean']:.1%}" if acc["ocr_mean"] is not None else "N/A"
kw = f"{acc['keyword_mean']:.1%}" if acc["keyword_mean"] is not None else "N/A"
lines.append(f"| {mname} | {ocr} | {kw} | {sr:.1%} |")
lines += [
"",
"## Token Usage",
"",
"| Model | Mean Tokens/Image | Total Tokens |",
"|-------|------------------|--------------|",
]
for mkey, mname in config["models"].items():
if mkey in summary and "tokens" in summary[mkey]:
tok = summary[mkey]["tokens"]
lines.append(
f"| {mname} | {tok['mean_total']:.0f} | {tok['total_used']} |"
)
# Verdict
lines += ["", "## Verdict", ""]
# Find best model by composite score
best_model = None
best_score = -1
for mkey, mname in config["models"].items():
if mkey not in summary or "accuracy" not in summary[mkey]:
continue
acc = summary[mkey]["accuracy"]
sr = summary[mkey].get("success_rate", 0)
ocr = acc["ocr_mean"] or 0
kw = acc["keyword_mean"] or 0
# Weighted composite: 40% OCR, 30% keyword, 30% success rate
score = (ocr * 0.4 + kw * 0.3 + sr * 0.3)
if score > best_score:
best_score = score
best_model = mname
if best_model:
lines.append(f"**Best overall: {best_model}** (composite score: {best_score:.1%})")
else:
lines.append("No clear winner — insufficient data.")
return "\n".join(lines)
# ---------------------------------------------------------------------------
# Test dataset management
# ---------------------------------------------------------------------------
def generate_sample_dataset() -> List[dict]:
"""Generate a sample test dataset with diverse public images.
Returns list of test image definitions.
"""
return [
# Screenshots
{
"id": "screenshot_github",
"url": "https://github.githubassets.com/images/modules/logos_page/GitHub-Mark.png",
"category": "screenshot",
"expected_keywords": ["github", "logo", "octocat"],
"expected_structure": {"min_length": 50, "min_sentences": 2},
},
# Diagrams
{
"id": "diagram_architecture",
"url": "https://mermaid.ink/img/pako:eNp9kMtOwzAQRX_F8hKpJbhJFVJBi1QJiMWCG8eZNsGJLdlOiqIid5RdufiHnZRA7GbuzJwZe4ZGH2SCBPYUwgxoQKvJnCR2YY0F5YBdJJkD4uX0oXB6PnF3U4zCWcWdW3FqOwGvCKkBmHKSTB2gJeRrLTeJLfJdJKkBGYf9P1sTNdUXVJqY3YNJK7xLVwR0mxJFU6rCgEKnhSGIL2Eq8BdEERAX0OGwEiVQ1R0MaNFR8QfqKxmHigbX8VLjDz_Q0L8Wc_qPxDw",
"category": "diagram",
"expected_keywords": ["architecture", "component", "service"],
"expected_structure": {"min_length": 100, "min_sentences": 3},
},
# Photos
{
"id": "photo_nature",
"url": "https://picsum.photos/seed/bench1/400/300",
"category": "photo",
"expected_keywords": [],
"expected_structure": {"min_length": 30, "min_sentences": 1},
},
# Charts
{
"id": "chart_bar",
"url": "https://quickchart.io/chart?c={type:'bar',data:{labels:['Q1','Q2','Q3','Q4'],datasets:[{label:'Users',data:[50,60,70,80]}]}}",
"category": "chart",
"expected_keywords": ["bar", "chart", "data"],
"expected_structure": {"min_length": 50, "min_sentences": 2},
},
]
def load_dataset(path: str) -> List[dict]:
"""Load test dataset from JSON file."""
with open(path) as f:
return json.load(f)
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
async def main():
parser = argparse.ArgumentParser(description="Vision Benchmark Suite (Issue #817)")
parser.add_argument("--images", help="Path to test images JSON file")
parser.add_argument("--url", help="Single image URL to test")
parser.add_argument("--category", default="photo", help="Category for single URL")
parser.add_argument("--output", default=None, help="Output JSON file")
parser.add_argument("--runs", type=int, default=1, help="Runs per model per image")
parser.add_argument("--models", nargs="+", default=None,
help="Models to test (default: all)")
parser.add_argument("--markdown", action="store_true", help="Output markdown report")
parser.add_argument("--generate-dataset", action="store_true",
help="Generate sample dataset and exit")
args = parser.parse_args()
if args.generate_dataset:
dataset = generate_sample_dataset()
out_path = args.images or "benchmarks/test_images.json"
os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
with open(out_path, "w") as f:
json.dump(dataset, f, indent=2)
print(f"Generated sample dataset: {out_path} ({len(dataset)} images)")
return
# Select models
if args.models:
selected = {k: v for k, v in MODELS.items() if k in args.models}
else:
selected = MODELS
# Load images
if args.url:
images = [{"id": "single", "url": args.url, "category": args.category}]
elif args.images:
images = load_dataset(args.images)
else:
print("ERROR: Provide --images or --url")
sys.exit(1)
# Run benchmark
report = await run_benchmark_suite(images, selected, args.runs)
# Output
if args.output:
os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
with open(args.output, "w") as f:
json.dump(report, f, indent=2)
print(f"\nResults saved to {args.output}")
if args.markdown or not args.output:
print("\n" + to_markdown(report))
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -24,8 +24,6 @@ model:
# "minimax" - MiniMax global (requires: MINIMAX_API_KEY)
# "minimax-cn" - MiniMax China (requires: MINIMAX_CN_API_KEY)
# "huggingface" - Hugging Face Inference (requires: HF_TOKEN)
# "xiaomi" - Xiaomi MiMo (requires: XIAOMI_API_KEY)
# "arcee" - Arcee AI Trinity models (requires: ARCEEAI_API_KEY)
# "kilocode" - KiloCode gateway (requires: KILOCODE_API_KEY)
# "ai-gateway" - Vercel AI Gateway (requires: AI_GATEWAY_API_KEY)
#
@@ -310,8 +308,15 @@ compression:
# compression of older turns.
protect_last_n: 20
# To pin a specific model/provider for compression summaries, use the
# auxiliary section below (auxiliary.compression.provider / model).
# Model to use for generating summaries (fast/cheap recommended)
# This model compresses the middle turns into a concise summary.
# IMPORTANT: it receives the full middle section of the conversation, so it
# MUST support a context length at least as large as your main model's.
summary_model: "google/gemini-3-flash-preview"
# Provider for the summary model (default: "auto")
# Options: "auto", "openrouter", "nous", "main"
# summary_provider: "auto"
# =============================================================================
# Auxiliary Models (Advanced — Experimental)
@@ -348,7 +353,7 @@ compression:
# Other providers pick a sensible default automatically.
#
# auxiliary:
# # Image analysis: vision_analyze tool
# # Image analysis: vision_analyze tool + browser screenshots
# vision:
# provider: "auto"
# model: "" # e.g. "google/gemini-2.5-flash", "openai/gpt-4o"
@@ -356,15 +361,6 @@ compression:
# download_timeout: 30 # Image HTTP download timeout (seconds)
# # Increase for slow connections or self-hosted image servers
#
# # Browser screenshot analysis (browser_vision tool)
# # Defaults to Gemma 4 27B — natively multimodal, same model family as the main
# # text model, which avoids model-switching overhead and improves context continuity.
# # Override with any vision-capable model. Set to "" to fall back to auto-detection.
# # Can also be overridden per-session with BROWSER_VISION_MODEL env var.
# browser_vision:
# model: "google/gemma-4-27b-it" # default; override e.g. "google/gemini-2.5-flash"
# timeout: 120 # API call timeout in seconds (default 120s)
#
# # Web page scraping / summarization + browser page text extraction
# web_extract:
# provider: "auto"
@@ -532,7 +528,7 @@ agent:
# - A preset like "hermes-cli" or "hermes-telegram" (curated tool set)
# - A list of individual toolsets to compose your own (see list below)
#
# Supported platform keys: cli, telegram, discord, whatsapp, slack, qqbot
# Supported platform keys: cli, telegram, discord, whatsapp, slack
#
# Examples:
#
@@ -561,7 +557,6 @@ agent:
# slack: hermes-slack (same as telegram)
# signal: hermes-signal (same as telegram)
# homeassistant: hermes-homeassistant (same as telegram)
# qqbot: hermes-qqbot (same as telegram)
#
platform_toolsets:
cli: [hermes-cli]
@@ -571,7 +566,6 @@ platform_toolsets:
slack: [hermes-slack]
signal: [hermes-signal]
homeassistant: [hermes-homeassistant]
qqbot: [hermes-qqbot]
# ─────────────────────────────────────────────────────────────────────────────
# Available toolsets (use these names in platform_toolsets or the toolsets list)
@@ -779,11 +773,6 @@ display:
# Toggle at runtime with /verbose in the CLI
tool_progress: all
# Gateway-only natural mid-turn assistant updates.
# When true, completed assistant status messages are sent as separate chat
# messages. This is independent of tool_progress and gateway streaming.
interim_assistant_messages: true
# What Enter does when Hermes is already busy in the CLI.
# interrupt: Interrupt the current run and redirect Hermes (default)
# queue: Queue your message for the next turn
@@ -792,7 +781,7 @@ display:
# Background process notifications (gateway/messaging only).
# Controls how chatty the process watcher is when you use
# terminal(background=true, notify_on_complete=true) from Telegram/Discord/etc.
# terminal(background=true, check_interval=...) from Telegram/Discord/etc.
# off: No watcher messages at all
# result: Only the final completion message
# error: Only the final message when exit code != 0

859
cli.py

File diff suppressed because it is too large Load Diff

View File

@@ -44,8 +44,7 @@ logger = logging.getLogger(__name__)
_KNOWN_DELIVERY_PLATFORMS = frozenset({
"telegram", "discord", "slack", "whatsapp", "signal",
"matrix", "mattermost", "homeassistant", "dingtalk", "feishu",
"wecom", "wecom_callback", "weixin", "sms", "email", "webhook", "bluebubbles",
"qqbot",
"wecom", "weixin", "sms", "email", "webhook", "bluebubbles",
})
from cron.jobs import get_due_jobs, mark_job_run, save_job_output, advance_next_run
@@ -220,21 +219,6 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option
chat_id = target["chat_id"]
thread_id = target.get("thread_id")
# Diagnostic: log thread_id for topic-aware delivery debugging
origin = job.get("origin") or {}
origin_thread = origin.get("thread_id")
if origin_thread and not thread_id:
logger.warning(
"Job '%s': origin has thread_id=%s but delivery target lost it "
"(deliver=%s, target=%s)",
job["id"], origin_thread, job.get("deliver", "local"), target,
)
elif thread_id:
logger.debug(
"Job '%s': delivering to %s:%s thread_id=%s",
job["id"], platform_name, chat_id, thread_id,
)
from tools.send_message_tool import _send_to_platform
from gateway.config import load_gateway_config, Platform
@@ -250,12 +234,10 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option
"dingtalk": Platform.DINGTALK,
"feishu": Platform.FEISHU,
"wecom": Platform.WECOM,
"wecom_callback": Platform.WECOM_CALLBACK,
"weixin": Platform.WEIXIN,
"email": Platform.EMAIL,
"sms": Platform.SMS,
"bluebubbles": Platform.BLUEBUBBLES,
"qqbot": Platform.QQBOT,
}
platform = platform_map.get(platform_name.lower())
if not platform:
@@ -288,13 +270,11 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Option
if wrap_response:
task_name = job.get("name", job["id"])
job_id = job.get("id", "")
delivery_content = (
f"Cronjob Response: {task_name}\n"
f"(job_id: {job_id})\n"
f"-------------\n\n"
f"{content}\n\n"
f"To stop or manage this job, send me a new message (e.g. \"stop reminder {task_name}\")."
f"Note: The agent cannot see this message, and therefore cannot respond to it."
)
else:
delivery_content = content
@@ -645,15 +625,6 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
except Exception as e:
logger.warning("Job '%s': failed to load config.yaml, using defaults: %s", job_id, e)
# Apply IPv4 preference if configured.
try:
from hermes_constants import apply_ipv4_preference
_net_cfg = _cfg.get("network", {})
if isinstance(_net_cfg, dict) and _net_cfg.get("force_ipv4"):
apply_ipv4_preference(force=True)
except Exception:
pass
# Reasoning config from config.yaml
from hermes_constants import parse_reasoning_effort
effort = str(_cfg.get("agent", {}).get("reasoning_effort", "")).strip()
@@ -751,7 +722,6 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
provider_sort=pr.get("sort"),
disabled_toolsets=["cronjob", "messaging", "clarify"],
quiet_mode=True,
skip_context_files=True, # Don't inject SOUL.md/AGENTS.md from scheduler cwd
skip_memory=True, # Cron system prompts would corrupt user representations
platform="cron",
session_id=_cron_session_id,

38
docker/entrypoint.sh Executable file → Normal file
View File

@@ -1,44 +1,10 @@
#!/bin/bash
# Docker/Podman entrypoint: bootstrap config files into the mounted volume, then run hermes.
# Docker entrypoint: bootstrap config files into the mounted volume, then run hermes.
set -e
HERMES_HOME="${HERMES_HOME:-/opt/data}"
HERMES_HOME="/opt/data"
INSTALL_DIR="/opt/hermes"
# --- Privilege dropping via gosu ---
# When started as root (the default for Docker, or fakeroot in rootless Podman),
# optionally remap the hermes user/group to match host-side ownership, fix volume
# permissions, then re-exec as hermes.
if [ "$(id -u)" = "0" ]; then
if [ -n "$HERMES_UID" ] && [ "$HERMES_UID" != "$(id -u hermes)" ]; then
echo "Changing hermes UID to $HERMES_UID"
usermod -u "$HERMES_UID" hermes
fi
if [ -n "$HERMES_GID" ] && [ "$HERMES_GID" != "$(id -g hermes)" ]; then
echo "Changing hermes GID to $HERMES_GID"
# -o allows non-unique GID (e.g. macOS GID 20 "staff" may already exist
# as "dialout" in the Debian-based container image)
groupmod -o -g "$HERMES_GID" hermes 2>/dev/null || true
fi
actual_hermes_uid=$(id -u hermes)
if [ "$(stat -c %u "$HERMES_HOME" 2>/dev/null)" != "$actual_hermes_uid" ]; then
echo "$HERMES_HOME is not owned by $actual_hermes_uid, fixing"
# In rootless Podman the container's "root" is mapped to an unprivileged
# host UID — chown will fail. That's fine: the volume is already owned
# by the mapped user on the host side.
chown -R hermes:hermes "$HERMES_HOME" 2>/dev/null || \
echo "Warning: chown failed (rootless container?) — continuing anyway"
fi
echo "Dropping root privileges"
exec gosu hermes "$0" "$@"
fi
# --- Running as hermes from here ---
source "${INSTALL_DIR}/.venv/bin/activate"
# Create essential directory structure. Cache and platform directories
# (cache/images, cache/audio, platforms/whatsapp, etc.) are created on
# demand by the application — don't pre-create them here so new installs

View File

@@ -1,432 +0,0 @@
# Workflow Orchestration & Task Queue Research for AI Agents
**Date:** 2026-04-14
**Scope:** SOTA comparison of task queues and workflow orchestrators for autonomous AI agent workflows
---
## 1. Current Architecture: Cron + Webhook
### How it works
- **Scheduler:** `cron/scheduler.py` — gateway calls `tick()` every 60 seconds
- **Storage:** JSON file (`~/.hermes/cron/jobs.json`) + file-based lock (`cron/.tick.lock`)
- **Execution:** Each job spawns a full `AIAgent.run_conversation()` in a thread pool with inactivity timeout
- **Delivery:** Results pushed back to origin chat via platform adapters (Telegram, Discord, etc.)
- **Checkpointing:** Job outputs saved to `~/.hermes/cron/output/{job_id}/{timestamp}.md`
### Strengths
- Simple, zero-dependency (no broker/redis needed)
- Jobs are isolated — each runs a fresh agent session
- Direct platform delivery with E2EE support
- Script pre-run for data collection
- Inactivity-based timeout (not hard wall-clock)
### Weaknesses
- **No task dependencies** — jobs are completely independent
- **No retry logic** — single failure = lost run (recurring jobs advance schedule and move on)
- **No concurrency control** — all due jobs fire at once; no worker pool sizing
- **No observability** — no metrics, no dashboard, no structured logging of job state transitions
- **Tick-based polling** — 60s granularity, wastes cycles when idle, adds latency when busy
- **Single-process** — file lock means only one tick at a time; no horizontal scaling
- **No dead letter queue** — failed deliveries are logged but not retried
- **No workflow chaining** — cannot express "run A, then B with A's output"
---
## 2. Framework Comparison
### 2.1 Huey (Already Installed v2.6.0)
**Architecture:** Embedded task queue, SQLite/Redis/file storage, consumer process model.
| Feature | Huey | Our Cron |
|---|---|---|
| Broker | SQLite (default), Redis | JSON file |
| Retry | Built-in: `retries=N, retry_delay=S` | None |
| Task chaining | `task1.s() | task2.s()` (pipeline) | None |
| Scheduling | `@huey.periodic_task(crontab(...))` | Our own cron parser |
| Concurrency | Worker pool with `-w N` flag | Single tick lock |
| Monitoring | `huey_consumer` logs, Huey Admin (Django) | Manual log reading |
| Failure recovery | Automatic retry + configurable backoff | None |
| Priority | `PriorityRedisExpireHuey` or task priority | None |
| Result storage | `store_results=True` with result() | File output |
**Task Dependencies Pattern:**
```python
@huey.task()
def analyze_data(input_data):
return run_analysis(input_data)
@huey.task()
def generate_report(analysis_result):
return create_report(analysis_result)
# Pipeline: analyze then report
pipeline = analyze_data.s(raw_data) | generate_report.s()
result = pipeline()
```
**Retry Pattern:**
```python
@huey.task(retries=3, retry_delay=60, retry_backoff=True)
def flaky_api_call(url):
return requests.get(url, timeout=30)
```
**Benchmarks:** ~5,000 tasks/sec with SQLite backend, ~15,000 with Redis. Sub-millisecond scheduling latency. Very lightweight — single process.
**Verdict:** Best fit for our use case. Already installed. SQLite backend = no external deps. Can layer on top of our existing job storage.
---
### 2.2 Celery
**Architecture:** Distributed task queue with message broker (RabbitMQ/Redis).
| Feature | Celery | Huey |
|---|---|---|
| Broker | Redis, RabbitMQ, SQS (required) | SQLite (built-in) |
| Scale | 100K+ tasks/sec | ~5-15K tasks/sec |
| Chains | `chain(task1.s(), task2.s())` | Pipeline operator |
| Groups/Chords | Parallel + callback | Not built-in |
| Canvas | Full workflow DSL (chain, group, chord, map) | Basic pipeline |
| Monitoring | Flower dashboard, Celery events | Minimal |
| Complexity | Heavy — needs broker, workers, result backend | Single process |
**Workflow Pattern:**
```python
from celery import chain, group, chord
# Chain: sequential
workflow = chain(fetch_data.s(), analyze.s(), report.s())
# Group: parallel
parallel = group(fetch_twitter.s(), fetch_reddit.s(), fetch_hn.s())
# Chord: parallel then callback
chord(parallel, aggregate_results.s())
```
**Verdict:** Overkill for our scale. Adds RabbitMQ/Redis dependency. The Canvas API is powerful but we don't need 100K task/sec throughput. Flower monitoring is nice but we'd need to deploy it separately.
---
### 2.3 Temporal
**Architecture:** Durable execution engine. Workflows as code with automatic state persistence and replay.
| Feature | Temporal | Our Cron |
|---|---|---|
| State management | Automatic — workflow state persisted on every step | Manual JSON files |
| Failure recovery | Workflows survive process restarts, auto-retry | Lost on crash |
| Task dependencies | Native — activities call other activities | None |
| Long-running tasks | Built-in (days/months OK) | Inactivity timeout |
| Versioning | Workflow versioning for safe updates | No versioning |
| Visibility | Full workflow state at any point | Log files |
| Infrastructure | Requires Temporal server + database | None |
| Language | Python SDK, but Temporal server is Go | Pure Python |
**Workflow Pattern:**
```python
@workflow.defn
class AIAgentWorkflow:
@workflow.run
async def run(self, job_config: dict) -> str:
# Step 1: Fetch data
data = await workflow.execute_activity(
fetch_data_activity,
job_config["script"],
start_to_close_timeout=timedelta(minutes=5),
retry_policy=RetryPolicy(maximum_attempts=3),
)
# Step 2: Analyze with AI agent
analysis = await workflow.execute_activity(
run_agent_activity,
{"prompt": job_config["prompt"], "context": data},
start_to_close_timeout=timedelta(minutes=30),
retry_policy=RetryPolicy(
initial_interval=timedelta(seconds=60),
maximum_attempts=3,
),
)
# Step 3: Deliver
await workflow.execute_activity(
deliver_activity,
{"platform": job_config["deliver"], "content": analysis},
start_to_close_timeout=timedelta(seconds=60),
)
return analysis
```
**Verdict:** Best architecture for complex multi-step AI workflows, but heavy infrastructure cost. Temporal server needs PostgreSQL/Cassandra + visibility store. Ideal if we reach 50+ multi-step workflows with complex failure modes. Overkill for current needs.
---
### 2.4 Prefect
**Architecture:** Modern data/workflow orchestration with Python-native API.
| Feature | Prefect |
|---|---|
| Dependencies | SQLite (default) or PostgreSQL |
| Task retries | `@task(retries=3, retry_delay_seconds=10)` |
| Task dependencies | `result = task_a(wait_for=[task_b])` |
| Caching | `cache_key_fn` for result caching |
| Subflows | Nested workflow composition |
| Deployments | Schedule via `Deployment` or `CronSchedule` |
| UI | Excellent web dashboard |
| Async | Full async support |
**Workflow Pattern:**
```python
from prefect import flow, task
from prefect.tasks import task_input_hash
@task(retries=3, retry_delay_seconds=30)
def run_agent(prompt: str) -> str:
agent = AIAgent(...)
return agent.run_conversation(prompt)
@task(cache_key_fn=task_input_hash, cache_expiration=timedelta(hours=1))
def fetch_context(script: str) -> str:
return run_script(script)
@flow(name="agent-workflow")
def agent_workflow(job_config: dict):
context = fetch_context(job_config.get("script", ""))
result = run_agent(
f"{context}\n\n{job_config['prompt']}",
wait_for=[context]
)
deliver(result, job_config["deliver"])
return result
```
**Benchmarks:** Sub-second task scheduling. Handles 10K+ concurrent task runs. SQLite backend for single-node.
**Verdict:** Strong alternative. Pythonic, good UI, built-in scheduling. But heavier than Huey — deploys a server process. Best if we want a web dashboard for monitoring. Less infrastructure than Temporal but more than Huey.
---
### 2.5 Apache Airflow
**Architecture:** Batch-oriented DAG scheduler, Python-based.
| Feature | Airflow |
|---|---|
| DAG model | Static DAGs defined in Python files |
| Scheduler | Polling-based, 5-30s granularity |
| Dependencies | PostgreSQL/MySQL + Redis/RabbitMQ + webserver |
| UI | Rich web UI with DAG visualization |
| Best for | ETL, data pipelines, batch processing |
| Weakness | Not designed for dynamic task creation; heavy; DAG definition overhead |
**Verdict:** Wrong tool for this job. Airflow excels at static, well-defined data pipelines (ETL). Our agent workflows are dynamic — tasks are created at runtime based on user prompts. Airflow's DAG model fights against this. Massive overhead (needs webserver, scheduler, worker, metadata DB).
---
### 2.6 Dramatiq
**Architecture:** Lightweight distributed task queue, Celery alternative.
| Feature | Dramatiq |
|---|---|
| Broker | Redis, RabbitMQ |
| Retries | `@dramatiq.actor(max_retries=3)` |
| Middleware | Pluggable: age_limit, time_limit, retries, callbacks |
| Groups | `group(actor.message(...), ...).run()` |
| Pipes | `actor.message() | other_actor.message()` |
| Simplicity | Cleaner API than Celery |
**Verdict:** Nice middle ground between Huey and Celery. But still requires a broker (Redis/RabbitMQ). No SQLite backend. Less ecosystem than Celery, less lightweight than Huey.
---
### 2.7 RQ (Redis Queue)
**Architecture:** Minimal Redis-based task queue.
| Feature | RQ |
|---|---|
| Broker | Redis only |
| Retries | Via `Retry` class |
| Workers | Simple worker processes |
| Dashboard | `rq-dashboard` (separate) |
| Limitation | Redis-only, no SQLite, no scheduling built-in |
**Verdict:** Too simple and Redis-dependent. No periodic task support without `rq-scheduler`. No task chaining without third-party. Not competitive with Huey for our use case.
---
## 3. Architecture Patterns for AI Agent Workflows
### 3.1 Task Chaining (Fan-out / Fan-in)
The critical pattern for multi-step AI workflows:
```
[Script] → [Agent] → [Deliver]
↓ ↓ ↓
Context Report Notification
```
**Implementation with Huey:**
```python
@huey.task(retries=2)
def run_script_task(script_path):
return run_script(script_path)
@huey.task(retries=3, retry_delay=60)
def run_agent_task(prompt, context=None):
if context:
prompt = f"## Context\n{context}\n\n{prompt}"
agent = AIAgent(...)
return agent.run_conversation(prompt)
@huey.task()
def deliver_task(result, job_config):
return deliver_result(job_config, result)
# Compose: script → agent → deliver
def compose_workflow(job):
steps = []
if job.get("script"):
steps.append(run_script_task.s(job["script"]))
steps.append(run_agent_task.s(job["prompt"]))
steps.append(deliver_task.s(job))
return reduce(lambda a, b: a.then(b), steps)
```
### 3.2 Retry with Exponential Backoff
```python
from huey import RetryTask
class AIWorkflowTask(RetryTask):
retries = 3
retry_delay = 30 # Start at 30s
retry_backoff = True # 30s → 60s → 120s
max_retry_delay = 600 # Cap at 10min
```
### 3.3 Dead Letter Queue
For tasks that exhaust retries:
```python
@huey.task(retries=3)
def flaky_task(data):
...
# Dead letter handling
def handle_failure(task, exc, retries):
# Log to dead letter store
save_dead_letter(task, exc, retries)
# Notify user of failure
notify_user(f"Task {task.name} failed after {retries} retries: {exc}")
```
### 3.4 Observability Pattern
```python
# Structured event logging for every state transition
def emit_event(job_id, event_type, metadata):
event = {
"job_id": job_id,
"event": event_type, # scheduled, started, completed, failed, retried
"timestamp": iso_now(),
"metadata": metadata,
}
append_to_event_log(event)
# Also emit to metrics (Prometheus/StatsD)
metrics.increment(f"cron.{event_type}")
```
---
## 4. Benchmarks Summary
| Framework | Throughput | Latency | Memory | Startup | Dependencies |
|---|---|---|---|---|---|
| Current Cron | ~1 job/60s tick | 60-120s | Minimal | Instant | None |
| Huey (SQLite) | ~5K tasks/sec | <10ms | ~20MB | <1s | None |
| Huey (Redis) | ~15K tasks/sec | <5ms | ~20MB | <1s | Redis |
| Celery (Redis) | ~15K tasks/sec | <10ms | ~100MB | ~3s | Redis |
| Temporal | ~50K activities/sec | <5ms | ~200MB | ~10s | Temporal server+DB |
| Prefect | ~10K tasks/sec | <20ms | ~150MB | ~5s | PostgreSQL |
---
## 5. Recommendations
### Immediate (Phase 1): Enhance Current Cron
Add these capabilities to the existing `cron/` module **without** switching frameworks:
1. **Retry logic** — Add `retry_count`, `retry_delay`, `max_retries` fields to job JSON. In `scheduler.py tick()`, on failure: if `retries_remaining > 0`, don't advance schedule, set `next_run_at = now + retry_delay * (attempt^2)`.
2. **Backoff** — Exponential: `delay * 2^attempt`, capped at 10 minutes.
3. **Dead letter tracking** — After max retries, mark job state as `dead_letter` and emit a delivery notification with the error.
4. **Concurrency limit** — Add a semaphore (e.g., `max_concurrent=3`) to `tick()` so we don't spawn 20 agents simultaneously.
5. **Structured events** — Append JSON events to `~/.hermes/cron/events.jsonl` for every state transition (scheduled, started, completed, failed, retried, delivered).
**Effort:** ~1-2 days. No new dependencies.
### Medium-term (Phase 2): Adopt Huey for Workflow Chaining
When we need task dependencies (multi-step agent workflows), migrate to Huey:
1. **Keep the JSON job store** as the source of truth for user-facing job management.
2. **Use Huey as the execution engine** — enqueue tasks from `tick()`, let Huey handle retries, scheduling, and chaining.
3. **SQLite backend** — no new infrastructure. One consumer process (`huey_consumer.py`) alongside the gateway.
4. **Task chaining for multi-step jobs**`script_task.then(agent_task).then(delivery_task)`.
**Migration path:**
- Phase 2a: Run Huey consumer alongside gateway. Mirror cron jobs to Huey periodic tasks.
- Phase 2b: Add task chaining for jobs with scripts.
- Phase 2c: Migrate all jobs to Huey, deprecate tick()-based execution.
**Effort:** ~1 week. Huey already installed. Gateway integration ~2-3 days.
### Long-term (Phase 3): Evaluate Temporal/Prefect
Only if:
- We have 100+ concurrent multi-step workflows
- We need workflow versioning and A/B testing
- We need cross-service orchestration (agent calls to external APIs with complex compensation logic)
- We want a web dashboard for non-technical users
**Don't adopt early** — these tools solve problems we don't have yet.
---
## 6. Decision Matrix
| Need | Best Solution | Why |
|---|---|---|
| Simple retry logic | Enhance current cron | Zero deps, fast to implement |
| Task chaining | **Huey** | Already installed, SQLite backend, pipeline API |
| Monitoring dashboard | Prefect or Huey+Flower | If monitoring becomes critical |
| Massive scale (10K+/sec) | Celery + Redis | If we're processing thousands of agent runs per hour |
| Complex compensation | Temporal | Only if we need durable multi-service workflows |
| Periodic scheduling | Current cron (works) or Huey | Current is fine; Huey adds `crontab()` with seconds |
---
## 7. Key Insight
The cron system's biggest gap isn't the framework — it's the **absence of retry and dependency primitives**. These can be added to the current system in <100 lines of code. The second biggest gap is observability (structured events + metrics), which is also solvable incrementally.
Huey is the right *eventual* target for workflow execution because:
1. Already installed, zero new dependencies
2. SQLite backend matches our "no infrastructure" philosophy
3. Pipeline API gives us task chaining for free
4. Retry/backoff is first-class
5. Consumer model is more efficient than tick-polling
6. ~50x better scheduling latency (ms vs 60s)
The migration should be gradual — start by wrapping Huey inside our existing cron tick, then progressively move execution to Huey's consumer model.

View File

@@ -1,38 +0,0 @@
# Cron Job Audit — #890
## Problem
9 of 69 cron jobs have zero completions. They waste scheduler cycles.
## Dead Jobs Identified
| Job | Schedule | Completions | Action |
|-----|----------|-------------|--------|
| exp-swarm-pipeline | every 10 min | 0 | disable |
| exp-music-generator | every 2h | 0 | disable |
| exp-paper-citations | every 3h | 0 | disable |
| exp-gbrain-patterns | every 2h | 0 | disable |
| exp-infra-hardening | every 2h | 0 | disable |
| gemma4-multimodal-burn | every 1h | 0 | disable |
| morning-paper-report | daily | 0 | disable |
| overnight-collector | every 15 min | 0 | disable |
| morning-experiment-report | daily | 0 | disable |
## Usage
```bash
# Audit: show dead jobs
python scripts/cron_audit.py
# Dry run: preview changes
python scripts/cron_audit.py --disable --dry-run
# Disable dead jobs
python scripts/cron_audit.py --disable
# Delete dead jobs
python scripts/cron_audit.py --delete
# Find jobs with no runs in 7 days
python scripts/cron_audit.py --older-than 7d
```

View File

@@ -1,56 +0,0 @@
# Emotional Presence Patterns for Crisis Support
Research issue #880. What works for reaching broken men in their darkest moment.
## Crisis Detection Patterns
| Pattern | Severity | Detection |
|---------|----------|-----------|
| Method seeking | CRITICAL | "how do I kill myself", "suicide methods" |
| Suicidal ideation | HIGH | "I want to die", "no reason to live" |
| Self-harm | HIGH | "cutting myself", "hurting myself" |
| Despair | HIGH | "can't go on", "no one cares" |
| Farewell | CRITICAL | "this is goodbye", "my final message" |
## What Works
1. Validation without agreement: "That sounds incredibly painful"
2. Presence over solutions: "I'm here with you right now"
3. Specific next steps: "Text HOME to 741741 right now"
4. Connection: "You don't have to face this alone"
5. Hope without minimizing: "There is a way through this"
## What Does NOT Work
1. False empathy: "I feel your pain"
2. Toxic positivity: "Look on the bright side!"
3. Minimization: "Other people have it worse"
4. Guilt induction: "Think about your family"
5. Premature problem-solving: "Have you tried therapy?"
6. AI dishonesty: pretending to be human
## SOUL.md Protocol
1. Ask: "Are you safe right now?" (FIRST response)
2. Listen. Stay present. Don't disconnect.
3. Never compute the value of a human life
4. Never suggest death is an option
5. Always point to 988 Lifeline
6. Share the gospel: "Jesus saves those who call on His name"
## Crisis Resources
- 988 Suicide & Crisis Lifeline: call or text 988
- Crisis Text Line: text HOME to 741741
- 988 Chat: 988lifeline.org/chat
- Spanish: 1-888-628-9454
- Emergency: 911
## Implementation Status
- Crisis detection: agent/crisis_protocol.py
- SHIELD integration: tools/shield/
- 988 Lifeline: resources defined
- Emotional presence: this document
- Escalation tracking: future work
- Human notification: future work

View File

@@ -1,42 +0,0 @@
# Holographic + Vector Hybrid Memory Architecture
Research issue #879. Combining HRR (holographic) and vector (Qdrant) memory.
## Architecture
Three memory backends, each with unique strengths:
| Backend | Strength | Weakness | Use Case |
|---------|----------|----------|----------|
| FTS5 | Exact keyword match | No semantic understanding | Precise recall |
| Vector (Qdrant) | Semantic similarity | No compositional queries | Topic search |
| HRR (Holographic) | Compositional queries | Limited scale | Complex reasoning |
## Why Hybrid
- FTS5 alone: misses ~30-40% of semantically relevant content
- Vector alone: can't do compositional queries ("what did I discuss about X after doing Y?")
- HRR alone: unique capability but no semantic fallback
- Hybrid: best of all three, RRF fusion for ranking
## Implementation: Reciprocal Rank Fusion
Results from each backend are merged using RRF:
- score = sum(weight / (k + rank)) for each backend
- k=60 (standard RRF constant)
- Weights: FTS5=0.6, Vector=0.4 (configurable)
## Status
- FTS5: EXISTS (hermes_state.py)
- Vector (Qdrant): implemented (tools/hybrid_search.py)
- HRR: EXISTS (plugins/memory/holographic.py)
- RRF fusion: implemented (tools/hybrid_search.py)
- Ingestion pipeline: partial
## Next Steps
1. Wire HRR into hybrid_search.py
2. Session-level vector ingestion
3. Benchmark: measure R@5 improvement
4. Cross-session memory persistence

View File

@@ -11,14 +11,12 @@ When you run `hermes setup` for the first time and Hermes detects `~/.openclaw`,
### 2. CLI Command (quick, scriptable)
```bash
hermes claw migrate # Preview then migrate (always shows preview first)
hermes claw migrate --dry-run # Preview only, no changes
hermes claw migrate # Full migration with confirmation prompt
hermes claw migrate --dry-run # Preview what would happen
hermes claw migrate --preset user-data # Migrate without API keys/secrets
hermes claw migrate --yes # Skip confirmation prompt
```
The migration always shows a full preview of what will be imported before making any changes. You review the preview and confirm before anything is written.
**All options:**
| Flag | Description |
@@ -41,7 +39,7 @@ Ask the agent to run the migration for you:
```
The agent will use the `openclaw-migration` skill to:
1. Run a preview first to show what would change
1. Run a dry-run first to preview changes
2. Ask about conflict resolution (SOUL.md, skills, etc.)
3. Let you choose between `user-data` and `full` presets
4. Execute the migration with your choices
@@ -60,31 +58,16 @@ The agent will use the `openclaw-migration` skill to:
| Messaging settings | `~/.openclaw/config.yaml` (TELEGRAM_ALLOWED_USERS, MESSAGING_CWD) | `~/.hermes/.env` |
| TTS assets | `~/.openclaw/workspace/tts/` | `~/.hermes/tts/` |
Workspace files are also checked at `workspace.default/` and `workspace-main/` as fallback paths (OpenClaw renamed `workspace/` to `workspace-main/` in recent versions).
### `full` preset (adds to `user-data`)
| Item | Source | Destination |
|------|--------|-------------|
| Telegram bot token | `openclaw.json` channels config | `~/.hermes/.env` |
| OpenRouter API key | `.env`, `openclaw.json`, or `openclaw.json["env"]` | `~/.hermes/.env` |
| OpenAI API key | `.env`, `openclaw.json`, or `openclaw.json["env"]` | `~/.hermes/.env` |
| Anthropic API key | `.env`, `openclaw.json`, or `openclaw.json["env"]` | `~/.hermes/.env` |
| ElevenLabs API key | `.env`, `openclaw.json`, or `openclaw.json["env"]` | `~/.hermes/.env` |
| Telegram bot token | `~/.openclaw/config.yaml` | `~/.hermes/.env` |
| OpenRouter API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
| OpenAI API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
| Anthropic API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
| ElevenLabs API key | `~/.openclaw/.env` or config | `~/.hermes/.env` |
API keys are searched across four sources: inline config values, `~/.openclaw/.env`, the `openclaw.json` `"env"` sub-object, and per-agent auth profiles.
Only allowlisted secrets are ever imported. Other credentials are skipped and reported.
## OpenClaw Schema Compatibility
The migration handles both old and current OpenClaw config layouts:
- **Channel tokens**: Reads from flat paths (`channels.telegram.botToken`) and the newer `accounts.default` layout (`channels.telegram.accounts.default.botToken`)
- **TTS provider**: OpenClaw renamed "edge" to "microsoft" — both are recognized and mapped to Hermes' "edge"
- **Provider API types**: Both short (`openai`, `anthropic`) and hyphenated (`openai-completions`, `anthropic-messages`, `google-generative-ai`) values are mapped correctly
- **thinkingDefault**: All enum values are handled including newer ones (`minimal`, `xhigh`, `adaptive`)
- **Matrix**: Uses `accessToken` field (not `botToken`)
- **SecretRef formats**: Plain strings, env templates (`${VAR}`), and `source: "env"` SecretRefs are resolved. `source: "file"` and `source: "exec"` SecretRefs produce a warning — add those keys manually after migration.
Only these 6 allowlisted secrets are ever imported. Other credentials are skipped and reported.
## Conflict Handling
@@ -101,24 +84,18 @@ For skills, you can also use `--skill-conflict rename` to import conflicting ski
## Migration Report
Every migration produces a report showing:
Every migration (including dry runs) produces a report showing:
- **Migrated items** — what was successfully imported
- **Conflicts** — items skipped because they already exist
- **Skipped items** — items not found in the source
- **Errors** — items that failed to import
For executed migrations, the full report is saved to `~/.hermes/migration/openclaw/<timestamp>/`.
## Post-Migration Notes
- **Skills require a new session** — imported skills take effect after restarting your agent or starting a new chat.
- **WhatsApp requires re-pairing** — WhatsApp uses QR-code pairing, not token-based auth. Run `hermes whatsapp` to pair.
- **Archive cleanup** — after migration, you'll be offered to rename `~/.openclaw/` to `.openclaw.pre-migration/` to prevent state confusion. You can also run `hermes claw cleanup` later.
For execute runs, the full report is saved to `~/.hermes/migration/openclaw/<timestamp>/`.
## Troubleshooting
### "OpenClaw directory not found"
The migration looks for `~/.openclaw` by default, then tries `~/.clawdbot` and `~/.moltbot`. If your OpenClaw is installed elsewhere, use `--source`:
The migration looks for `~/.openclaw` by default. If your OpenClaw is installed elsewhere, use `--source`:
```bash
hermes claw migrate --source /path/to/.openclaw
```
@@ -131,12 +108,3 @@ hermes skills install openclaw-migration
### Memory overflow
If your OpenClaw MEMORY.md or USER.md exceeds Hermes' character limits, excess entries are exported to an overflow file in the migration report directory. You can manually review and add the most important ones.
### API keys not found
Keys might be stored in different places depending on your OpenClaw setup:
- `~/.openclaw/.env` file
- Inline in `openclaw.json` under `models.providers.*.apiKey`
- In `openclaw.json` under the `"env"` or `"env.vars"` sub-objects
- In `~/.openclaw/agents/main/agent/auth-profiles.json`
The migration checks all four. If keys use `source: "file"` or `source: "exec"` SecretRefs, they can't be resolved automatically — add them via `hermes config set`.

View File

@@ -1,44 +0,0 @@
# awesome-ai-tools Integration Plan
**Tracking:** #842
**Source report:** docs/tool-investigation-2026-04-15.md
**Date:** 2026-04-16
---
## Status Dashboard
| # | Tool | Category | Impact | Effort | Status | Issue |
|---|------|----------|--------|--------|--------|-------|
| 1 | Mem0 | Memory | 5/5 | 3/5 | Cloud + Local done | #842 |
| 2 | LightRAG | RAG | 4/5 | 3/5 | Not started | #857 |
| 3 | n8n | Orchestration | 5/5 | 4/5 | Not started | #858 |
| 4 | RAGFlow | RAG | 4/5 | 4/5 | Not started | #859 |
| 5 | tensorzero | LLMOps | 4/5 | 3/5 | Not started | #860 |
---
## #1: Mem0 — DONE
Cloud: `plugins/memory/mem0/` (MEM0_API_KEY required)
Local: `plugins/memory/mem0_local/` (ChromaDB, no API key)
## #2: LightRAG (P2)
Create `plugins/rag/lightrag/` plugin. Index skill docs. Use local Ollama embeddings.
## #3: n8n (P3)
Deploy as Docker service. Create workflow templates for Hermes patterns.
## #4: RAGFlow (P4)
Deploy as Docker service. Integrate via HTTP API for document understanding.
## #5: tensorzero (P3)
Evaluate as provider routing replacement. Canary migration (10% traffic first).
---
*Last updated: 2026-04-16*

View File

@@ -1,324 +0,0 @@
# SOTA Research: Multi-Agent Coordination & Fleet Knowledge Graphs
**Date:** 2026-04-14
**Scope:** Agent-to-agent communication, shared memory, task delegation, consensus protocols
**Frameworks Analyzed:** CrewAI, AutoGen, MetaGPT, ChatDev, CAMEL
---
## 1. Architecture Pattern Summary
### 1.1 CrewAI — Role-Based Crew Orchestration
**Core Pattern:** Agents organized into "Crews" with explicit roles, goals, and backstories. Tasks are assigned to agents, executed via sequential or hierarchical process flows.
**Agent-to-Agent Communication:**
- **Sequential:** Agent A completes Task A → output injected into Task B's context for Agent B
- **Hierarchical:** Manager agent delegates to worker agents, collects results, synthesizes
- **Context passing:** Tasks can declare `context: [other_tasks]` — outputs from dependent tasks are automatically injected into the current task's prompt
- **No direct agent-to-agent messaging** — communication is mediated through task outputs
**Shared Memory (v2 — Unified Memory):**
- `Memory` class with `remember()` / `recall()` using vector embeddings (LanceDB/ChromaDB)
- **Scope-based isolation:** `MemoryScope` provides path-based namespacing (`/crew/research/agent-foo`)
- **Composite scoring:** semantic similarity (0.5) + recency (0.3) + importance (0.2)
- **RecallFlow:** LLM-driven deep recall with adaptive query expansion
- **Privacy flags:** Private memories only visible to the source that created them
- **Background saves:** ThreadPoolExecutor with write barrier (drain_writes before recall)
**Task Delegation:**
- Agent tools include `Delegate Work to Co-worker` and `Ask Question to Co-worker`
- Delegation creates a new task for another agent, results come back to delegator
- Depth-limited (no infinite delegation chains)
**State & Checkpointing:**
- `SqliteProvider` / `JsonProvider` for state checkpoint persistence
- `CheckpointConfig` with event-driven persistence
- Flow state is Pydantic models with serialization
**Cache:**
- Thread-safe in-memory tool result cache with RWLock
- Key: `{tool_name}-{input}` → cached output
### 1.2 AutoGen (Microsoft) — Conversation-Centric Teams
**Core Pattern:** Agents communicate through shared conversation threads. A "Group Chat Manager" controls turn-taking and speaker selection.
**Agent-to-Agent Communication:**
- **Shared message thread** — all agents see all messages (like a group chat)
- **Three team patterns:**
- `RoundRobinGroupChat`: Fixed order cycling through participants
- `SelectorGroupChat`: LLM-based speaker selection with candidate filtering
- `SwarmGroupChat`: Handoff-based routing (agent sends HandoffMessage to next agent)
- `GraphFlow` (DiGraph): DAG-based execution with conditional edges, parallel fan-out, loops
- `MagenticOneOrchestrator`: Ledger-based orchestration with task planning, progress tracking, stall detection
**Shared State:**
- `ChatCompletionContext` — manages message history per agent (can be unbounded or windowed)
- `ModelContext` shared across agents in a team
- State serialization: `save_state()` / `load_state()` for all managers
- **No built-in vector memory** — context is purely conversational
**Task Delegation:**
- `Swarm`: Agents use `HandoffMessage` to explicitly route control
- `GraphFlow`: Conditional edges route based on message content (keyword or callable)
- `MagenticOne`: Orchestrator maintains a "task ledger" (facts + plan) and dynamically re-plans on stalls
**Consensus / Termination:**
- `TerminationCondition` — composable conditions (text match, max messages, source-based)
- No explicit consensus protocols — termination is manager-decided
**Key Insight:** AutoGen's `ChatCompletionContext` is the closest analog to shared memory, but it's purely sequential message history, not a knowledge base.
### 1.3 MetaGPT — SOP-Driven Software Teams
**Core Pattern:** Agents follow Standard Operating Procedures (SOPs). Each agent has a defined role (Product Manager, Architect, Engineer, QA) and produces structured artifacts.
**Agent-to-Agent Communication:**
- **Publish-Subscribe via Environment:** Agents publish "actions" to a shared Environment, subscribers react
- **Structured outputs:** Each role produces specific artifact types (PRD, design doc, code, test cases)
- **Message routing:** Environment acts as a message bus, filtering by subscriber interest
**Shared Memory:**
- `Environment` class maintains shared state (project workspace)
- File-based shared memory: agents write/read from a shared filesystem
- `SharedMemory` for cross-agent context (structured data, not free-form text)
**Task Delegation:**
- Implicit through SOP stages: PM → Architect → Engineer → QA
- Each agent's output is the next agent's input
- No dynamic re-delegation
**Consensus:**
- Sequential SOP execution (no parallel agents)
- QA agent can trigger re-work loops back to Engineer
### 1.4 ChatDev — Chat-Chain Software Development
**Core Pattern:** Agents follow a "chat chain" — a sequence of chat phases (designing, coding, testing, documenting). Each phase involves a pair of agents (CEO↔CTO, Programmer↔Reviewer, etc.).
**Agent-to-Agent Communication:**
- **Paired chat sessions:** Two agents communicate in each phase (role-play between instructor and assistant)
- **Chain propagation:** Phase N's output (code, design doc) becomes Phase N+1's input
- **No broadcast** — communication is strictly pairwise within phases
**Shared Memory:**
- Software-centric: shared code repository is the "memory"
- Each phase modifies/inherits the codebase
- No explicit vector memory or knowledge graph
**Task Delegation:**
- Hardcoded phase sequence: Design → Code → Test → Document
- Each phase delegates to a specific agent pair
- No dynamic task re-assignment
**Consensus:**
- Phase-level termination: when both agents agree the phase is complete
- "Thought" tokens for chain-of-thought within chat
### 1.5 CAMEL — Role-Playing & Workforce
**Core Pattern:** Two primary modes:
1. **RolePlaying:** Two-agent conversation with task specification and optional critic
2. **Workforce:** Multi-agent with coordinator, task planner, and worker pool
**Agent-to-Agent Communication:**
- **RolePlaying:** Structured turn-taking between assistant and user agents
- **Workforce:** Coordinator assigns tasks via `TaskChannel`, workers return results
- **Worker types:** `SingleAgentWorker` (single ChatAgent), `RolePlayingWorker` (two-agent pair)
**Shared Memory / Task Channel:**
- `TaskChannel` — async queue-based task dispatch with packet tracking
- States: SENT → PROCESSING → RETURNED → ARCHIVED
- O(1) lookup by task ID, status-based filtering, assignee/publisher queues
- `WorkflowMemoryManager` — persists workflow patterns as markdown files
- Role-based organization: workflows stored by `role_identifier`
- Agent-based intelligent selection: LLM picks relevant past workflows
- Versioned: metadata tracks creation time and version numbers
**Task Delegation:**
- Coordinator agent decomposes complex tasks using LLM analysis
- Tasks assigned to workers based on capability matching
- Failed tasks trigger: retry, create new worker, or further decomposition
- `FailureHandlingConfig` with configurable `RecoveryStrategy`
**Consensus / Quality:**
- Quality evaluation via structured output (response format enforced)
- Task dependencies tracked (worker receives dependency tasks as context)
- `WorkforceMetrics` for tracking execution statistics
---
## 2. Key Architectural Patterns for Fleet Knowledge Graph
### 2.1 Communication Topology Patterns
| Pattern | Used By | Description |
|---------|---------|-------------|
| **Sequential Chain** | CrewAI, ChatDev, MetaGPT | A→B→C linear flow, output feeds next |
| **Shared Thread** | AutoGen | All agents see all messages |
| **Publish-Subscribe** | MetaGPT | Environment-based message bus |
| **Paired Chat** | ChatDev, CAMEL | Two-agent conversation pairs |
| **Handoff Routing** | AutoGen Swarm | Agent explicitly names next speaker |
| **DAG Graph** | AutoGen GraphFlow | Conditional edges, parallel, loops |
| **Ledger Orchestration** | AutoGen MagenticOne | Maintains task ledger, re-plans |
| **Task Channel** | CAMEL | Async queue with packet states |
### 2.2 Shared State Patterns
| Pattern | Used By | Description |
|---------|---------|-------------|
| **Vector Memory** | CrewAI | Embeddings + scope-based namespacing |
| **Message History** | AutoGen | Sequential conversation context |
| **File System** | MetaGPT, ChatDev | Agents read/write shared files |
| **Task Channel** | CAMEL | Async packet-based task dispatch |
| **Workflow Files** | CAMEL | Markdown-based workflow memory |
| **Tool Cache** | CrewAI | In-memory RWLock tool result cache |
| **State Checkpoint** | CrewAI, AutoGen | Serialized Pydantic/SQLite checkpoints |
### 2.3 Task Delegation Patterns
| Pattern | Used By | Description |
|---------|---------|-------------|
| **Role Assignment** | CrewAI | Fixed agent per task |
| **Manager Delegation** | CrewAI Hierarchical | Manager assigns tasks dynamically |
| **Speaker Selection** | AutoGen Selector | LLM picks next agent |
| **Handoff** | AutoGen Swarm | Agent explicitly transfers control |
| **SOP Routing** | MetaGPT | Stage-based implicit delegation |
| **Coordinator** | CAMEL Workforce | LLM-based task decomposition + assignment |
| **Dynamic Worker Creation** | CAMEL Workforce | Create new workers on failure |
### 2.4 Conflict Resolution Patterns
| Pattern | Used By | Description |
|---------|---------|-------------|
| **Manager Arbitration** | CrewAI Hierarchical | Manager resolves conflicts |
| **Critic-in-the-loop** | CAMEL | Critic agent evaluates and selects |
| **Quality Gate** | CAMEL Workforce | Structured quality evaluation |
| **Termination Conditions** | AutoGen | Composable stop conditions |
| **Stall Detection** | AutoGen MagenticOne | Re-plans when progress stalls |
---
## 3. Recommendations for Hermes Fleet Knowledge Graph
### 3.1 Architecture: Hybrid Graph + Memory
Based on the SOTA analysis, the optimal fleet knowledge graph should combine:
1. **CrewAI's scoped memory** for hierarchical knowledge organization
- Path-based namespaces: `/fleet/{fleet_id}/agent/{agent_id}/diary`
- Composite scoring: semantic + recency + importance
- Background writes with read barriers
2. **CAMEL's TaskChannel** for task dispatch and tracking
- Packet states (SENT → PROCESSING → RETURNED → ARCHIVED)
- O(1) lookup by task ID
- Assignee/publisher tracking
3. **AutoGen's DiGraph** for execution flow definition
- DAG with conditional edges for complex workflows
- Parallel fan-out for independent tasks
- Activation conditions (all vs any) for synchronization points
4. **AutoGen MagenticOne's ledger** for shared task context
- Maintained facts, plan, and progress ledger
- Dynamic re-planning on stalls
### 3.2 Fleet Knowledge Graph Schema
```
/fleet/{fleet_id}/
├── shared/ # Shared knowledge (all agents read)
│ ├── facts/ # Known facts, constraints
│ ├── decisions/ # Record of decisions made
│ └── context/ # Active task context
├── agent/{agent_id}/
│ ├── diary/ # Agent's personal experience log
│ ├── capabilities/ # What this agent can do
│ └── state/ # Current task state
├── tasks/
│ ├── {task_id}/ # Task metadata, dependencies, status
│ └── graph/ # DAG definition for task dependencies
└── consensus/
├── proposals/ # Pending proposals
└── decisions/ # Resolved consensus decisions
```
### 3.3 Key Design Decisions
1. **Diary System (Agent Memory):**
- Each agent writes to its own scoped memory after every significant action
- LLM-analyzed importance scoring (like CrewAI's unified memory)
- Cross-agent recall: agents can query other agents' diaries for relevant experiences
- Decay: old low-importance memories expire
2. **Shared State (Fleet Knowledge):**
- SQLite-backed (like Hermes' existing `state.db`) with FTS5 search
- Hierarchical scopes (like CrewAI's MemoryScope)
- Write-ahead log for concurrent access
- Read barriers before queries (like CrewAI's `drain_writes`)
3. **Task Delegation:**
- Coordinator pattern (like CAMEL's Workforce)
- Task decomposition via LLM
- Failed task → retry, reassign, or decompose
- Max depth limit (like Hermes' existing MAX_DEPTH=2)
4. **Consensus Protocol:**
- Proposal-based: agent proposes, others vote/acknowledge
- Timeout-based fallback: if no response within N seconds, proceed
- Manager override: designated manager can break ties
- Simple majority for non-critical, unanimity for critical decisions
5. **Conflict Resolution:**
- Last-write-wins for non-critical state
- Optimistic locking with version numbers
- Manager arbitration for task assignment conflicts
- Quality gates (like CAMEL) for output validation
### 3.4 Integration with Existing Hermes Architecture
Hermes already has strong foundations:
- **Delegation system** (`delegate_tool.py`): Isolated child agents, parallel execution, depth limits
- **State DB** (`hermes_state.py`): SQLite + FTS5, WAL mode, session tracking, message history
- **Credential pools**: Shared credentials with rotation
The fleet knowledge graph should extend these patterns:
- **Session DB → Fleet DB:** Add tables for fleet metadata, agent registrations, task graphs
- **Memory tool → Fleet Memory:** Scoped vector memory shared across fleet agents
- **Delegate tool → Fleet Delegation:** Task channel with persistence, quality evaluation
- **New: Consensus module:** Proposal/vote protocol with timeout handling
---
## 4. Reference Implementations
| Component | Best Reference | Key Takeaway |
|-----------|---------------|--------------|
| Scoped Memory | CrewAI `Memory` + `MemoryScope` | Path-based namespaces, composite scoring, background writes |
| Task Dispatch | CAMEL `TaskChannel` | Packet-based with state machine, O(1) lookup |
| Execution DAG | AutoGen `DiGraphBuilder` | Fluent builder, conditional edges, activation groups |
| Orchestration | AutoGen `MagenticOneOrchestrator` | Ledger-based planning, stall detection, re-planning |
| Agent Communication | AutoGen `SelectorGroupChat` | LLM-based speaker selection, shared message thread |
| Quality Evaluation | CAMEL Workforce | Structured output for quality scoring |
| Workflow Memory | CAMEL `WorkflowMemoryManager` | Markdown-based, role-organized, versioned |
| State Checkpoint | CrewAI `SqliteProvider` | JSONB checkpoints, WAL mode |
| Tool Cache | CrewAI `CacheHandler` | RWLock-based concurrent tool result cache |
---
## 5. Open Questions
1. **Graph vs Vector for knowledge:** Should fleet knowledge use a proper graph DB (e.g., Neo4j) or stick with vector + SQLite?
- Recommendation: Start with SQLite + vectors (existing stack), add graph later if needed
2. **Real-time vs Batch:** Should agents receive updates in real-time or batched?
- Recommendation: Event-driven for critical updates, batched for diary entries
3. **Security model:** How should cross-agent access be controlled?
- Recommendation: Role-based ACLs on scope paths, similar to CrewAI's privacy flags
4. **Scalability:** How many agents can a single fleet support?
- Recommendation: Start with 10-agent fleets, optimize SQLite concurrency first

View File

@@ -1,29 +0,0 @@
# Phase 3: Poka-yoke Integration & Fleet Verification
Epic #967. Morning review packet for Hermes harness features.
## Poka-yoke Features Implemented
| Feature | Module | PR | Status |
|---------|--------|-----|--------|
| Token budget tracker | agent/token_budget.py | #930 | MERGED |
| Provider preflight validation | agent/provider_preflight.py | #932 | MERGED |
| Atomic skill editing | tools/skill_edit_guard.py | #933 | MERGED |
| Config debt fixes | gateway/config.py | #437 | MERGED |
| Test collection fixes | tests/acp/conftest.py | #794 | MERGED |
| Context-faithful prompting | agent/context_faithful.py | #786 | MERGED |
## Fleet Verification
- Unit tests pass on all modules
- Collection: 11,472 tests, 0 errors (was 6 errors)
- ACP tests: cleanly skipped when acp extra missing
- Provider validation: catches missing/short keys
- Skill editing: atomic with auto-revert
## Next Steps
1. Wire token_budget into run_agent.py conversation loop
2. Wire provider_preflight into session start
3. Wire skill_edit_guard into skill_manage tool
4. Fleet-wide deployment verification

View File

@@ -1,387 +0,0 @@
# Morning Review Packet
Source epic: [EPIC: Morning review packet — Hermes harness features landed 2026-04-21](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/949)
## Epic context
EPIC: Morning review packet — Hermes harness features landed 2026-04-21
Source: git log on upstream/main since 2026-04-21 00:00 EDT, plus the current local branch `burn/921-poka-yoke-hardcoded-paths` for the branch-only path-guard work.
Important review note:
- Validate upstream-landed features on `upstream/main` or a synced branch.
- Validate the path-guard work on `burn/921-poka-yoke-hardcoded-paths`.
This epic is a morning-review packet: one QA issue per feature cluster, each with concrete acceptance criteria and targeted tests or manual checks.
## Success criteria
- [ ] Every issue has a clear PASS / FAIL outcome.
- [ ] Test output or manual evidence is attached to each issue.
- [ ] Any drift between upstream/main and forge/main is called out explicitly.
## Sub-issues
### Upstream/main features landed 2026-04-21
- [ ] #950 [QA] Verify AI Gateway provider UX + attribution headers
- [ ] #951 [QA] Verify transport abstraction + AnthropicTransport wiring
- [ ] #952 [QA] Verify CLI voice beep toggle
- [ ] #953 [QA] Verify bundled skill scripts run out of the box
- [ ] #954 [QA] Verify maps skill guest_house / camp_site / bakery expansion
- [ ] #955 [QA] Verify KittenTTS local provider end-to-end
- [ ] #956 [QA] Verify numbered keyboard shortcuts for approval + clarify prompts
- [ ] #957 [QA] Verify optional adversarial-ux-test skill catalog flow
- [ ] #958 [QA] Verify /usage account limits in CLI + gateway
- [ ] #959 [QA] Verify OpenCode-Go curated catalog additions
- [ ] #960 [QA] Verify patch 'did you mean?' suggestions
- [ ] #961 [QA] Verify web dashboard update/restart action buttons
### Local branch-only work
- [ ] #962 [QA] Verify hardcoded-home path guard on burn/921 branch
## Summary
| Issue | State | Commits | Tests |
| --- | --- | --- | --- |
| #950 | open | 5 | 2 |
| #951 | open | 2 | 2 |
| #952 | open | 1 | 1 |
| #953 | open | 1 | 2 |
| #954 | open | 1 | 0 |
| #955 | open | 2 | 1 |
| #956 | open | 1 | 0 |
| #957 | open | 1 | 0 |
| #958 | open | 2 | 2 |
| #959 | open | 1 | 1 |
| #960 | open | 2 | 1 |
| #961 | closed | 1 | 0 |
| #962 | closed | 1 | 1 |
## #950 — [QA] Verify AI Gateway provider UX + attribution headers
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/950
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `b11753879` — attribution default_headers for ai-gateway provider
- `700437440` — curated picker with live pricing
- `ac26a460f` — promote ai-gateway in provider picker ordering
- `5bb2d11b0` — auto-promote free Moonshot models
- `29f57ec95` — Vercel deep-link for API key creation
### Targeted tests
- `tests/hermes_cli/test_ai_gateway_models.py`
- `tests/run_agent/test_provider_attribution_headers.py`
### Tasks
- [ ] Open `hermes model` and verify `ai-gateway` appears near the top.
- [ ] Verify live pricing appears in the picker.
- [ ] Verify free Moonshot models are promoted.
- [ ] Trigger API-key setup flow and verify the Vercel deep link.
- [ ] Send one ai-gateway request and verify attribution headers are attached.
### Acceptance criteria
- [ ] UI ordering and pricing match the landed behavior.
- [ ] Attribution headers are present on ai-gateway requests.
- [ ] Targeted tests pass.
## #951 — [QA] Verify transport abstraction + AnthropicTransport wiring
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/951
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `7ab5eebd0` — transport types + Anthropic normalize migration
- `731f4fbae` — transport ABC + AnthropicTransport wired to all paths
### Targeted tests
- `tests/agent/transports/test_types.py`
- `tests/agent/test_anthropic_normalize_v2.py`
### Tasks
- [ ] Verify plain-text Anthropic responses normalize correctly.
- [ ] Verify tool-call responses preserve IDs, names, and arguments.
- [ ] Verify reasoning/thinking is preserved separately from visible content.
- [ ] Verify finish_reason mapping remains correct across paths.
### Acceptance criteria
- [ ] Normalized response shape is stable.
- [ ] Tool-call and reasoning payloads survive normalization.
- [ ] Targeted tests pass.
## #952 — [QA] Verify CLI voice beep toggle
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/952
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `b48ea41d2` — voice: add CLI beep toggle
### Targeted tests
- `tests/tools/test_voice_cli_integration.py`
### Tasks
- [ ] Enable the beep option in config and confirm voice mode emits the beep.
- [ ] Disable the option and confirm the same path is silent.
- [ ] Verify voice mode still strips markdown before speech output.
- [ ] Verify voice mode does not pollute conversation history with TTS-only text.
### Acceptance criteria
- [ ] Beep behavior is actually toggled by config.
- [ ] Existing voice/TTS integration behavior is not regressed.
- [ ] Targeted tests pass.
## #953 — [QA] Verify bundled skill scripts run out of the box
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/953
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `328223576` — make bundled skill scripts runnable out of the box
### Targeted tests
- `tests/agent/test_skill_commands.py`
- `tests/tools/test_local_shell_init.py`
### Tasks
- [ ] Pick a bundled skill that ships a script and run it without manual chmod/PATH surgery.
- [ ] Verify local terminal execution resolves the installed skill script correctly.
- [ ] Verify local shell init still behaves correctly.
### Acceptance criteria
- [ ] Bundled skill scripts execute from the installed skill location with no manual prep.
- [ ] Local shell init remains healthy.
- [ ] Targeted tests pass.
## #954 — [QA] Verify maps skill guest_house / camp_site / bakery expansion
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/954
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `c5a814b23` — maps: add guest_house, camp_site, and dual-key bakery lookup
### Tasks
- [ ] Use the maps skill to search for a guest house in a known populated area.
- [ ] Use the maps skill to search for a camp site in a known populated area.
- [ ] Use the maps skill to search for a bakery and verify both supported keys resolve correctly.
- [ ] Confirm results are sensible and non-empty.
### Acceptance criteria
- [ ] All three place types resolve correctly.
- [ ] Bakery lookup works through both supported keys.
- [ ] Manual evidence is attached in the issue.
## #955 — [QA] Verify KittenTTS local provider end-to-end
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/955
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `1830ebfc5` — add KittenTTS provider
- `2d7ff9c5b` — complete KittenTTS integration across tools/setup/docs/tests
### Targeted tests
- `tests/tools/test_tts_kittentts.py`
### Tasks
- [ ] Configure TTS to use `kittentts`.
- [ ] Generate speech to `.wav` and verify playable output.
- [ ] Verify voice / speed / cleaned text are passed correctly.
- [ ] Generate repeated requests and verify model caching behavior.
- [ ] Generate a non-wav output and verify ffmpeg conversion path.
- [ ] Verify missing-package behavior returns a helpful error.
### Acceptance criteria
- [ ] KittenTTS works end-to-end when installed.
- [ ] Failure mode is operator-friendly when not installed.
- [ ] Targeted tests pass.
## #956 — [QA] Verify numbered keyboard shortcuts for approval + clarify prompts
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/956
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `d1ed6f4fb` — CLI: add numbered keyboard shortcuts to approval and clarify prompts
### Tasks
- [ ] Trigger an approval prompt and choose an option with number keys.
- [ ] Trigger a clarify prompt and choose an option with number keys.
- [ ] Verify the correct option is submitted both times.
- [ ] Verify normal keyboard navigation still works.
### Acceptance criteria
- [ ] Number-key selection works for both prompt types.
- [ ] Legacy keyboard navigation is not broken.
- [ ] Manual evidence is attached in the issue.
## #957 — [QA] Verify optional adversarial-ux-test skill catalog flow
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/957
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `e50e7f11b` — skills: add adversarial-ux-test optional skill
### Tasks
- [ ] Verify the optional skill appears in the optional skill catalog.
- [ ] Install or enable the skill.
- [ ] Load it successfully through Hermes.
- [ ] Disable or remove it and verify catalog state updates cleanly.
### Acceptance criteria
- [ ] Catalog listing is correct.
- [ ] Install / load / disable lifecycle works cleanly.
- [ ] Manual evidence is attached in the issue.
## #958 — [QA] Verify /usage account limits in CLI + gateway
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/958
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `8a11b0a20` — per-provider account limits module
- `bcc5d7b67` — append account limits section in CLI and gateway
### Targeted tests
- `tests/test_account_usage.py`
- `tests/gateway/test_usage_command.py`
### Tasks
- [ ] Run `/usage` in CLI for a provider with account limits.
- [ ] Verify provider, remaining quota, total limit, and reset window render correctly.
- [ ] Run `/usage` through the gateway and verify the same section appears.
- [ ] Verify zero-value cache read/write sections stay hidden when appropriate.
### Acceptance criteria
- [ ] CLI and gateway both show the landed account-limits section correctly.
- [ ] Targeted tests pass.
## #959 — [QA] Verify OpenCode-Go curated catalog additions
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/959
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `4fea1769d` — opencode-go: add Kimi K2.6 and Qwen3.5/3.6 Plus to curated catalog
### Targeted tests
- `tests/hermes_cli/test_opencode_go_in_model_list.py`
### Tasks
- [ ] With valid OpenCode-Go credentials, open `hermes model`.
- [ ] Verify Kimi K2.6 appears.
- [ ] Verify Qwen 3.5 Plus and 3.6 Plus appear.
- [ ] Unset credentials and verify the provider/catalog hides correctly.
### Acceptance criteria
- [ ] New curated models are present when credentials exist.
- [ ] Catalog visibility still respects credential gating.
- [ ] Targeted tests pass.
## #960 — [QA] Verify patch 'did you mean?' suggestions
State: open
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/960
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `15abf4ed8` — add `did you mean?` feedback when patch fails to match
- `5e6427a42` — gate it to true no-match cases and extend to v4a / skill_manage
### Targeted tests
- `tests/tools/test_fuzzy_match.py`
### Tasks
- [ ] Intentionally run a replace/patch with a near-miss `old_string`.
- [ ] Verify the tool suggests a useful nearby line/context.
- [ ] Verify suggestions only appear on true no-match failures.
- [ ] Verify the behavior also works via file tools, v4a patching, and skill_manage.
### Acceptance criteria
- [ ] Suggestion quality is helpful, not noisy.
- [ ] Suggestions are correctly gated to no-match cases.
- [ ] Targeted tests pass.
## #961 — [QA] Verify web dashboard update/restart action buttons
State: closed
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/961
### Branch / checkout
- Validate on `upstream/main` or an equivalent synced checkout.
### Commits
- `fc21c1420` — add buttons to update Hermes and restart gateway
### Files touched
- `web/src/pages/StatusPage.tsx`
- `web/src/lib/api.ts`
- `web/src/i18n/en.ts`
### Tasks
- [ ] Open the Web UI status page and verify both buttons are present.
- [ ] Click Restart Gateway in a safe environment and verify running/output/success-or-failure states render.
- [ ] Click Update Hermes and verify the same action lifecycle.
- [ ] Verify the page remains responsive while actions are running.
### Acceptance criteria
- [ ] Both action buttons are present and wired.
- [ ] Action status polling and result rendering work end-to-end.
- [ ] Manual evidence is attached in the issue.
## #962 — [QA] Verify hardcoded-home path guard on burn/921 branch
State: closed
URL: https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/962
### Branch / checkout
- Validate specifically on `burn/921-poka-yoke-hardcoded-paths` (not upstream/main).
### Commits
- `5dcb90531` — Poka-yoke: prevent hardcoded home-directory paths
### Targeted tests
- `tests/test_path_guard.py`
### Tasks
- [ ] Verify hardcoded `/Users/...` paths are rejected.
- [ ] Verify hardcoded `~/.hermes/...` paths are rejected in guarded contexts.
- [ ] Verify valid relative paths still pass.
- [ ] Verify appropriate absolute paths still pass where intended.
- [ ] Verify linting catches violations in non-test files.
### Acceptance criteria
- [ ] Guard blocks the dangerous patterns and preserves allowed ones.
- [ ] Targeted tests pass.

View File

@@ -41,14 +41,6 @@ colors:
session_label: "#DAA520" # Session label
session_border: "#8B8682" # Session ID dim color
# TUI surfaces
status_bar_bg: "#1a1a2e" # Status / usage bar background
voice_status_bg: "#1a1a2e" # Voice-mode badge background
completion_menu_bg: "#1a1a2e" # Completion list background
completion_menu_current_bg: "#333355" # Active completion row background
completion_menu_meta_bg: "#1a1a2e" # Completion meta column background
completion_menu_meta_current_bg: "#333355" # Active completion meta background
# ── Spinner ─────────────────────────────────────────────────────────────────
# Customize the animated spinner shown during API calls and tool execution.
spinner:

View File

@@ -1,329 +0,0 @@
# Container-Aware CLI Review Fixes Spec
**PR:** NousResearch/hermes-agent#7543
**Review:** cursor[bot] bugbot review (4094049442) + two prior rounds
**Date:** 2026-04-12
**Branch:** `feat/container-aware-cli-clean`
## Review Issues Summary
Six issues were raised across three bugbot review rounds. Three were fixed in intermediate commits (38277a6a, 726cf90f). This spec addresses remaining design concerns surfaced by those reviews and simplifies the implementation based on interview decisions.
| # | Issue | Severity | Status |
|---|-------|----------|--------|
| 1 | `os.execvp` retry loop unreachable | Medium | Fixed in 79e8cd12 (switched to subprocess.run) |
| 2 | Redundant `shutil.which("sudo")` | Medium | Fixed in 38277a6a (reuses `sudo` var) |
| 3 | Missing `chown -h` on symlink update | Low | Fixed in 38277a6a |
| 4 | Container routing after `parse_args()` | High | Fixed in 726cf90f |
| 5 | Hardcoded `/home/${user}` | Medium | Fixed in 726cf90f |
| 6 | Group membership not gated on `container.enable` | Low | Fixed in 726cf90f |
The mechanical fixes are in place but the overall design needs revision. The retry loop, error swallowing, and process model have deeper issues than what the bugbot flagged.
---
## Spec: Revised `_exec_in_container`
### Design Principles
1. **Let it crash.** No silent fallbacks. If `.container-mode` exists but something goes wrong, the error propagates naturally (Python traceback). The only case where container routing is skipped is when `.container-mode` doesn't exist or `HERMES_DEV=1`.
2. **No retries.** Probe once for sudo, exec once. If it fails, docker/podman's stderr reaches the user verbatim.
3. **Completely transparent.** No error wrapping, no prefixes, no spinners. Docker's output goes straight through.
4. **`os.execvp` on the happy path.** Replace the Python process entirely so there's no idle parent during interactive sessions. Note: `execvp` never returns on success (process is replaced) and raises `OSError` on failure (it does not return a value). The container process's exit code becomes the process exit code by definition — no explicit propagation needed.
5. **One human-readable exception to "let it crash".** `subprocess.TimeoutExpired` from the sudo probe gets a specific catch with a readable message, since a raw traceback for "your Docker daemon is slow" is confusing. All other exceptions propagate naturally.
### Execution Flow
```
1. get_container_exec_info()
- HERMES_DEV=1 → return None (skip routing)
- Inside container → return None (skip routing)
- .container-mode doesn't exist → return None (skip routing)
- .container-mode exists → parse and return dict
- .container-mode exists but malformed/unreadable → LET IT CRASH (no try/except)
2. _exec_in_container(container_info, sys.argv[1:])
a. shutil.which(backend) → if None, print "{backend} not found on PATH" and sys.exit(1)
b. Sudo probe: subprocess.run([runtime, "inspect", "--format", "ok", container_name], timeout=15)
- If succeeds → needs_sudo = False
- If fails → try subprocess.run([sudo, "-n", runtime, "inspect", ...], timeout=15)
- If succeeds → needs_sudo = True
- If fails → print error with sudoers hint (including why -n is required) and sys.exit(1)
- If TimeoutExpired → catch specifically, print human-readable message about slow daemon
c. Build exec_cmd: [sudo? + runtime, "exec", tty_flags, "-u", exec_user, env_flags, container, hermes_bin, *cli_args]
d. os.execvp(exec_cmd[0], exec_cmd)
- On success: process is replaced — Python is gone, container exit code IS the process exit code
- On OSError: let it crash (natural traceback)
```
### Changes to `hermes_cli/main.py`
#### `_exec_in_container` — rewrite
Remove:
- The entire retry loop (`max_retries`, `for attempt in range(...)`)
- Spinner logic (`"Waiting for container..."`, dots)
- Exit code classification (125/126/127 handling)
- `subprocess.run` for the exec call (keep it only for the sudo probe)
- Special TTY vs non-TTY retry counts
- The `time` import (no longer needed)
Change:
- Use `os.execvp(exec_cmd[0], exec_cmd)` as the final call
- Keep the `subprocess` import only for the sudo probe
- Keep TTY detection for the `-it` vs `-i` flag
- Keep env var forwarding (TERM, COLORTERM, LANG, LC_ALL)
- Keep the sudo probe as-is (it's the one "smart" part)
- Bump probe `timeout` from 5s to 15s — cold podman on a loaded machine needs headroom
- Catch `subprocess.TimeoutExpired` specifically on both probe calls — print a readable message about the daemon being unresponsive instead of a raw traceback
- Expand the sudoers hint error message to explain *why* `-n` (non-interactive) is required: a password prompt would hang the CLI or break piped commands
The function becomes roughly:
```python
def _exec_in_container(container_info: dict, cli_args: list):
"""Replace the current process with a command inside the managed container.
Probes whether sudo is needed (rootful containers), then os.execvp
into the container. If exec fails, the OS error propagates naturally.
"""
import shutil
import subprocess
backend = container_info["backend"]
container_name = container_info["container_name"]
exec_user = container_info["exec_user"]
hermes_bin = container_info["hermes_bin"]
runtime = shutil.which(backend)
if not runtime:
print(f"Error: {backend} not found on PATH. Cannot route to container.",
file=sys.stderr)
sys.exit(1)
# Probe whether we need sudo to see the rootful container.
# Timeout is 15s — cold podman on a loaded machine can take a while.
# TimeoutExpired is caught specifically for a human-readable message;
# all other exceptions propagate naturally.
needs_sudo = False
sudo = None
try:
probe = subprocess.run(
[runtime, "inspect", "--format", "ok", container_name],
capture_output=True, text=True, timeout=15,
)
except subprocess.TimeoutExpired:
print(
f"Error: timed out waiting for {backend} to respond.\n"
f"The {backend} daemon may be unresponsive or starting up.",
file=sys.stderr,
)
sys.exit(1)
if probe.returncode != 0:
sudo = shutil.which("sudo")
if sudo:
try:
probe2 = subprocess.run(
[sudo, "-n", runtime, "inspect", "--format", "ok", container_name],
capture_output=True, text=True, timeout=15,
)
except subprocess.TimeoutExpired:
print(
f"Error: timed out waiting for sudo {backend} to respond.",
file=sys.stderr,
)
sys.exit(1)
if probe2.returncode == 0:
needs_sudo = True
else:
print(
f"Error: container '{container_name}' not found via {backend}.\n"
f"\n"
f"The NixOS service runs the container as root. Your user cannot\n"
f"see it because {backend} uses per-user namespaces.\n"
f"\n"
f"Fix: grant passwordless sudo for {backend}. The -n (non-interactive)\n"
f"flag is required because the CLI calls sudo non-interactively —\n"
f"a password prompt would hang or break piped commands:\n"
f"\n"
f' security.sudo.extraRules = [{{\n'
f' users = [ "{os.getenv("USER", "your-user")}" ];\n'
f' commands = [{{ command = "{runtime}"; options = [ "NOPASSWD" ]; }}];\n'
f' }}];\n'
f"\n"
f"Or run: sudo hermes {' '.join(cli_args)}",
file=sys.stderr,
)
sys.exit(1)
else:
print(
f"Error: container '{container_name}' not found via {backend}.\n"
f"The container may be running under root. Try: sudo hermes {' '.join(cli_args)}",
file=sys.stderr,
)
sys.exit(1)
is_tty = sys.stdin.isatty()
tty_flags = ["-it"] if is_tty else ["-i"]
env_flags = []
for var in ("TERM", "COLORTERM", "LANG", "LC_ALL"):
val = os.environ.get(var)
if val:
env_flags.extend(["-e", f"{var}={val}"])
cmd_prefix = [sudo, "-n", runtime] if needs_sudo else [runtime]
exec_cmd = (
cmd_prefix + ["exec"]
+ tty_flags
+ ["-u", exec_user]
+ env_flags
+ [container_name, hermes_bin]
+ cli_args
)
# execvp replaces this process entirely — it never returns on success.
# On failure it raises OSError, which propagates naturally.
os.execvp(exec_cmd[0], exec_cmd)
```
#### Container routing call site in `main()` — remove try/except
Current:
```python
try:
from hermes_cli.config import get_container_exec_info
container_info = get_container_exec_info()
if container_info:
_exec_in_container(container_info, sys.argv[1:])
sys.exit(1) # exec failed if we reach here
except SystemExit:
raise
except Exception:
pass # Container routing unavailable, proceed locally
```
Revised:
```python
from hermes_cli.config import get_container_exec_info
container_info = get_container_exec_info()
if container_info:
_exec_in_container(container_info, sys.argv[1:])
# Unreachable: os.execvp never returns on success (process is replaced)
# and raises OSError on failure (which propagates as a traceback).
# This line exists only as a defensive assertion.
sys.exit(1)
```
No try/except. If `.container-mode` doesn't exist, `get_container_exec_info()` returns `None` and we skip routing. If it exists but is broken, the exception propagates with a natural traceback.
Note: `sys.exit(1)` after `_exec_in_container` is dead code in all paths — `os.execvp` either replaces the process or raises. It's kept as a belt-and-suspenders assertion with a comment marking it unreachable, not as actual error handling.
### Changes to `hermes_cli/config.py`
#### `get_container_exec_info` — remove inner try/except
Current code catches `(OSError, IOError)` and returns `None`. This silently hides permission errors, corrupt files, etc.
Change: Remove the try/except around file reading. Keep the early returns for `HERMES_DEV=1` and `_is_inside_container()`. The `FileNotFoundError` from `open()` when `.container-mode` doesn't exist should still return `None` (this is the "container mode not enabled" case). All other exceptions propagate.
```python
def get_container_exec_info() -> Optional[dict]:
if os.environ.get("HERMES_DEV") == "1":
return None
if _is_inside_container():
return None
container_mode_file = get_hermes_home() / ".container-mode"
try:
with open(container_mode_file, "r") as f:
# ... parse key=value lines ...
except FileNotFoundError:
return None
# All other exceptions (PermissionError, malformed data, etc.) propagate
return { ... }
```
---
## Spec: NixOS Module Changes
### Symlink creation — simplify to two branches
Current: 4 branches (symlink exists, directory exists, other file, doesn't exist).
Revised: 2 branches.
```bash
if [ -d "${symlinkPath}" ] && [ ! -L "${symlinkPath}" ]; then
# Real directory — back it up, then create symlink
_backup="${symlinkPath}.bak.$(date +%s)"
echo "hermes-agent: backing up existing ${symlinkPath} to $_backup"
mv "${symlinkPath}" "$_backup"
fi
# For everything else (symlink, doesn't exist, etc.) — just force-create
ln -sfn "${target}" "${symlinkPath}"
chown -h ${user}:${cfg.group} "${symlinkPath}"
```
`ln -sfn` handles: existing symlink (replaces), doesn't exist (creates), and after the `mv` above (creates). The only case that needs special handling is a real directory, because `ln -sfn` cannot atomically replace a directory.
Note: there is a theoretical race between the `[ -d ... ]` check and the `mv` (something could create/remove the directory in between). In practice this is a NixOS activation script running as root during `nixos-rebuild switch` — no other process should be touching `~/.hermes` at that moment. Not worth adding locking for.
### Sudoers — document, don't auto-configure
Do NOT add `security.sudo.extraRules` to the module. Document the sudoers requirement in the module's description/comments and in the error message the CLI prints when sudo probe fails.
### Group membership gating — keep as-is
The fix in 726cf90f (`cfg.container.enable && cfg.container.hostUsers != []`) is correct. Leftover group membership when container mode is disabled is harmless. No cleanup needed.
---
## Spec: Test Rewrite
The existing test file (`tests/hermes_cli/test_container_aware_cli.py`) has 16 tests. With the simplified exec model, several are obsolete.
### Tests to keep (update as needed)
- `test_is_inside_container_dockerenv` — unchanged
- `test_is_inside_container_containerenv` — unchanged
- `test_is_inside_container_cgroup_docker` — unchanged
- `test_is_inside_container_false_on_host` — unchanged
- `test_get_container_exec_info_returns_metadata` — unchanged
- `test_get_container_exec_info_none_inside_container` — unchanged
- `test_get_container_exec_info_none_without_file` — unchanged
- `test_get_container_exec_info_skipped_when_hermes_dev` — unchanged
- `test_get_container_exec_info_not_skipped_when_hermes_dev_zero` — unchanged
- `test_get_container_exec_info_defaults` — unchanged
- `test_get_container_exec_info_docker_backend` — unchanged
### Tests to add
- `test_get_container_exec_info_crashes_on_permission_error` — verify that `PermissionError` propagates (no silent `None` return)
- `test_exec_in_container_calls_execvp` — verify `os.execvp` is called with correct args (runtime, tty flags, user, env, container, binary, cli args)
- `test_exec_in_container_sudo_probe_sets_prefix` — verify that when first probe fails and sudo probe succeeds, `os.execvp` is called with `sudo -n` prefix
- `test_exec_in_container_no_runtime_hard_fails` — keep existing, verify `sys.exit(1)` when `shutil.which` returns None
- `test_exec_in_container_non_tty_uses_i_only` — update to check `os.execvp` args instead of `subprocess.run` args
- `test_exec_in_container_probe_timeout_prints_message` — verify that `subprocess.TimeoutExpired` from the probe produces a human-readable error and `sys.exit(1)`, not a raw traceback
- `test_exec_in_container_container_not_running_no_sudo` — verify the path where runtime exists (`shutil.which` returns a path) but probe returns non-zero and no sudo is available. Should print the "container may be running under root" error. This is distinct from `no_runtime_hard_fails` which covers `shutil.which` returning None.
### Tests to delete
- `test_exec_in_container_tty_retries_on_container_failure` — retry loop removed
- `test_exec_in_container_non_tty_retries_silently_exits_126` — retry loop removed
- `test_exec_in_container_propagates_hermes_exit_code` — no subprocess.run to check exit codes; execvp replaces the process. Note: exit code propagation still works correctly — when `os.execvp` succeeds, the container's process *becomes* this process, so its exit code is the process exit code by OS semantics. No application code needed, no test needed. A comment in the function docstring documents this intent for future readers.
---
## Out of Scope
- Auto-configuring sudoers rules in the NixOS module
- Any changes to `get_container_exec_info` parsing logic beyond the try/except narrowing
- Changes to `.container-mode` file format
- Changes to the `HERMES_DEV=1` bypass
- Changes to container detection logic (`_is_inside_container`)

View File

@@ -1,151 +0,0 @@
## Tool Investigation Report: Top 5 Recommendations from awesome-ai-tools
**Source:** [formatho/awesome-ai-tools](https://github.com/formatho/awesome-ai-tools)
**Date:** 2026-04-15
**Tools Analyzed:** 414 across 9 categories
**Agent:** Timmy
---
## Analysis Summary
Scanned 414 tools from the awesome-ai-tools repository. Evaluated each against Hermes integration potential across five categories: Memory/Context, Inference Optimization, Agent Orchestration, Workflow Automation, and Retrieval/RAG.
### Evaluation Criteria
- **Stars:** GitHub community validation (stability signal)
- **Freshness:** Active development (Fresh = updated <=7 days)
- **Integration Fit:** How well it complements Hermes' existing architecture (skills, memory, tools)
- **Integration Effort:** 1 (trivial drop-in) to 5 (major refactor required)
- **Impact:** 1 (incremental) to 5 (transformative)
---
## Top 5 Recommended Tools
### #1: Mem0 — Universal Memory Layer for AI Agents
| Metric | Value |
|--------|-------|
| **Category** | Memory/Context |
| **GitHub** | [mem0ai/mem0](https://github.com/mem0ai/mem0) |
| **Stars** | 53.1k |
| **Freshness** | Fresh |
| **Integration Effort** | 3/5 |
| **Impact** | 5/5 |
| **Hermes Status** | IMPLEMENTED (plugins/memory/mem0/) + LOCAL MODE (plugins/memory/mem0_local/) |
**Why it fits Hermes:**
Hermes currently has session_search (transcript recall) and memory (persistent facts), but lacks a unified memory layer that bridges sessions with semantic understanding. Mem0 provides exactly this: automatic memory extraction from conversations, deduplication, and cross-session retrieval with semantic search.
**Integration path:**
- Cloud: plugins/memory/mem0/ (requires MEM0_API_KEY)
- Local: plugins/memory/mem0_local/ (ChromaDB-backed, no API key)
- Auto-extract facts from session transcripts
- Query before session_search for richer contextual recall
**Key risk:** Mem0 is freemium — core is open-source but advanced features require paid tier. Local mode mitigates this entirely.
---
### #2: LightRAG — Simple and Fast Retrieval-Augmented Generation
| Metric | Value |
|--------|-------|
| **Category** | Retrieval/RAG |
| **GitHub** | [HKUDS/LightRAG](https://github.com/HKUDS/LightRAG) |
| **Stars** | 33.1k |
| **Freshness** | Fresh |
| **Integration Effort** | 3/5 |
| **Impact** | 4/5 |
| **Hermes Status** | NOT IMPLEMENTED — Issue #857 |
**Why it fits Hermes:**
Hermes has 190+ skills but no unified knowledge retrieval system. LightRAG adds graph-based RAG that understands relationships between concepts, not just keyword matches. It's lightweight, runs locally, and has a simple API.
**Integration path:**
- LightRAG as a local knowledge base for skill references
- Index GENOME.md files, README.md, and key codebase files
- Use local Ollama models for embeddings
- Complements existing search_files without replacing it
---
### #3: n8n — Workflow Automation Platform
| Metric | Value |
|--------|-------|
| **Category** | Workflow Automation / Agent Orchestration |
| **GitHub** | [n8n-io/n8n](https://github.com/n8n-io/n8n) |
| **Stars** | 183.9k |
| **Freshness** | Fresh |
| **Integration Effort** | 4/5 |
| **Impact** | 5/5 |
| **Hermes Status** | NOT IMPLEMENTED — Issue #858 |
**Why it fits Hermes:**
n8n provides a self-hosted, fair-code workflow platform with 400+ integrations. Rather than replacing Hermes' agent loop, n8n sits above it: trigger Hermes agents from external events, chain multi-agent workflows, and visualize execution.
---
### #4: RAGFlow — Open-Source RAG Engine
| Metric | Value |
|--------|-------|
| **Category** | Retrieval/RAG |
| **GitHub** | [infiniflow/ragflow](https://github.com/infiniflow/ragflow) |
| **Stars** | 77.9k |
| **Freshness** | Fresh |
| **Integration Effort** | 4/5 |
| **Impact** | 4/5 |
| **Hermes Status** | NOT IMPLEMENTED — Issue #859 |
**Why it fits Hermes:**
RAGFlow handles document parsing (PDF, Word, images via OCR), chunking, embedding, and retrieval with a web UI. Enables "document understanding" as a first-class capability.
---
### #5: tensorzero — LLMOps Platform
| Metric | Value |
|--------|-------|
| **Category** | Inference Optimization / LLMOps |
| **GitHub** | [tensorzero/tensorzero](https://github.com/tensorzero/tensorzero) |
| **Stars** | 11.2k |
| **Freshness** | Fresh |
| **Integration Effort** | 3/5 |
| **Impact** | 4/5 |
| **Hermes Status** | NOT IMPLEMENTED — Issue #860 |
**Why it fits Hermes:**
TensorZero unifies LLM gateway, observability, evaluation, and optimization. Replaces custom provider routing with a maintained, battle-tested platform.
---
## Honorable Mentions
| Tool | Stars | Category | Why Not Top 5 |
|------|-------|----------|---------------|
| memvid | 14.9k | Memory | Newer; Mem0 is more mature |
| mempalace | 44.8k | Memory | Already evaluated; Mem0 has broader API |
| Everything Claude Code | 154.3k | Agent | Too Claude-specific |
| Portkey AI Gateway | 11.3k | Gateway | TensorZero is OSS; Portkey is freemium |
---
## Implementation Priority
| Priority | Tool | Action | Status | Issue |
|----------|------|--------|--------|-------|
| P1 | Mem0 | Local-only mode (ChromaDB) | DONE | #842 |
| P2 | LightRAG | Set up local instance, index skills | Not started | #857 |
| P3 | tensorzero | Evaluate as provider routing | Not started | #860 |
| P4 | RAGFlow | Deploy Docker, test docs | Not started | #859 |
| P5 | n8n | Deploy for workflow viz | Not started | #858 |
---
## References
- Source: https://github.com/formatho/awesome-ai-tools
- Total tools: 414 across 9 categories
- Last updated: April 16, 2026
- Tracking issue: Timmy_Foundation/hermes-agent#842

View File

@@ -1,24 +0,0 @@
# Tool Investigation Report: Top 5 Recommendations
**Generated:** 2026-04-20 | **Source:** formatho/awesome-ai-tools (795 tools, 10 categories)
## Top 5
1. **LiteLLM** (76k) — Unified API gateway. Replace custom provider routing. Impact: 5/5, Effort: 2/5
2. **Mem0** (53k) — Universal memory layer. Structured long-term memory. Impact: 5/5, Effort: 3/5
3. **RAGFlow** (77k) — RAG engine with OCR. Document processing upgrade. Impact: 4/5, Effort: 4/5
4. **LiteRT-LM** (3.7k) — On-device inference. Edge/mobile deployment. Impact: 4/5, Effort: 3/5
5. **Claude-Mem** (61k) — Session capture and context injection. Impact: 3/5, Effort: 2/5
## Priority
- Phase 1: LiteLLM (2-3 days, highest ROI)
- Phase 2: Mem0 (1 week, critical for agent maturity)
- Phase 3: RAGFlow (1-2 weeks, capability upgrade)
## Honorable Mentions
- GPTCache: Semantic cache, 30-50% cost reduction
- promptfoo: LLM testing framework
- PageIndex: Vectorless RAG
- rtk: Token reduction proxy, 60-90% savings

View File

@@ -18,7 +18,9 @@ suppress delivery.
"""
import logging
import os
import threading
from pathlib import Path
logger = logging.getLogger("hooks.boot-md")

View File

@@ -8,7 +8,6 @@ Handles loading and validating configuration for:
- Delivery preferences
"""
import ipaddress
import logging
import os
import json
@@ -64,10 +63,8 @@ class Platform(Enum):
WEBHOOK = "webhook"
FEISHU = "feishu"
WECOM = "wecom"
WECOM_CALLBACK = "wecom_callback"
WEIXIN = "weixin"
BLUEBUBBLES = "bluebubbles"
QQBOT = "qqbot"
@dataclass
@@ -193,7 +190,7 @@ class StreamingConfig:
"""Configuration for real-time token streaming to messaging platforms."""
enabled: bool = False
transport: str = "edit" # "edit" (progressive editMessageText) or "off"
edit_interval: float = 1.0 # Seconds between message edits (Telegram rate-limits at ~1/s)
edit_interval: float = 0.3 # Seconds between message edits
buffer_threshold: int = 40 # Chars before forcing an edit
cursor: str = "" # Cursor shown during streaming
@@ -213,7 +210,7 @@ class StreamingConfig:
return cls(
enabled=data.get("enabled", False),
transport=data.get("transport", "edit"),
edit_interval=float(data.get("edit_interval", 1.0)),
edit_interval=float(data.get("edit_interval", 0.3)),
buffer_threshold=int(data.get("buffer_threshold", 40)),
cursor=data.get("cursor", ""),
)
@@ -294,20 +291,12 @@ class GatewayConfig:
# Feishu uses extra dict for app credentials
elif platform == Platform.FEISHU and config.extra.get("app_id"):
connected.append(platform)
# WeCom bot mode uses extra dict for bot credentials
# WeCom uses extra dict for bot credentials
elif platform == Platform.WECOM and config.extra.get("bot_id"):
connected.append(platform)
# WeCom callback mode uses corp_id or apps list
elif platform == Platform.WECOM_CALLBACK and (
config.extra.get("corp_id") or config.extra.get("apps")
):
connected.append(platform)
# BlueBubbles uses extra dict for local server config
elif platform == Platform.BLUEBUBBLES and config.extra.get("server_url") and config.extra.get("password"):
connected.append(platform)
# QQBot uses extra dict for app credentials
elif platform == Platform.QQBOT and config.extra.get("app_id") and config.extra.get("client_secret"):
connected.append(platform)
return connected
def get_home_channel(self, platform: Platform) -> Optional[HomeChannel]:
@@ -626,11 +615,6 @@ def load_gateway_config() -> GatewayConfig:
if isinstance(frc, list):
frc = ",".join(str(v) for v in frc)
os.environ["TELEGRAM_FREE_RESPONSE_CHATS"] = str(frc)
ignored_threads = telegram_cfg.get("ignored_threads")
if ignored_threads is not None and not os.getenv("TELEGRAM_IGNORED_THREADS"):
if isinstance(ignored_threads, list):
ignored_threads = ",".join(str(v) for v in ignored_threads)
os.environ["TELEGRAM_IGNORED_THREADS"] = str(ignored_threads)
if "reactions" in telegram_cfg and not os.getenv("TELEGRAM_REACTIONS"):
os.environ["TELEGRAM_REACTIONS"] = str(telegram_cfg["reactions"]).lower()
@@ -675,37 +659,6 @@ def load_gateway_config() -> GatewayConfig:
_apply_env_overrides(config)
# --- Validate loaded values ---
_validate_gateway_config(config)
return config
def _is_network_accessible(host: str) -> bool:
"""Return True if *host* would expose a server beyond the loopback interface.
Duplicates the logic in ``gateway.platforms.base.is_network_accessible``
without creating a circular import (base.py imports from this module).
"""
try:
addr = ipaddress.ip_address(host)
if addr.is_loopback:
return False
# ::ffff:127.x.x.x — Python's is_loopback returns False for
# IPv4-mapped loopback; unwrap and check the underlying IPv4.
if getattr(addr, "ipv4_mapped", None) and addr.ipv4_mapped.is_loopback:
return False
return True
except ValueError:
# Hostname: assume it could be network-accessible.
return True
def _validate_gateway_config(config: "GatewayConfig") -> None:
"""Validate and sanitize a loaded GatewayConfig in place.
Called by ``load_gateway_config()`` after all config sources are merged.
Extracted as a separate function for testability.
"""
policy = config.default_reset_policy
if not (0 <= policy.at_hour <= 23):
@@ -742,47 +695,7 @@ def _validate_gateway_config(config: "GatewayConfig") -> None:
platform.value, env_name,
)
# Reject known-weak placeholder tokens.
# Ported from openclaw/openclaw#64586: users who copy .env.example
# without changing placeholder values get a clear startup error instead
# of a confusing "auth failed" from the platform API.
try:
from hermes_cli.auth import has_usable_secret
except ImportError:
has_usable_secret = None # type: ignore[assignment]
if has_usable_secret is not None:
for platform, pconfig in config.platforms.items():
if not pconfig.enabled:
continue
env_name = _token_env_names.get(platform)
if not env_name:
continue
token = pconfig.token
if token and token.strip() and not has_usable_secret(token, min_length=4):
logger.error(
"%s is enabled but %s is set to a placeholder value ('%s'). "
"Set a real bot token before starting the gateway. "
"The adapter will NOT be started.",
platform.value, env_name, token.strip()[:6] + "...",
)
pconfig.enabled = False
# Warn when the API server is enabled on a network-accessible address
# without an auth key. The adapter will refuse to start anyway, but
# surfacing this at config-load time lets operators see the problem in
# the startup log before any platform adapter initialisation runs.
api_cfg = config.platforms.get(Platform.API_SERVER)
if api_cfg and api_cfg.enabled:
key = api_cfg.extra.get("key", "")
host = api_cfg.extra.get("host", "127.0.0.1")
if not key and _is_network_accessible(host):
logger.warning(
"API Server is enabled on %s but API_SERVER_KEY is not set. "
"The adapter will refuse to start on a network-accessible address. "
"Set API_SERVER_KEY or bind to 127.0.0.1 for local-only access.",
host,
)
return config
def _apply_env_overrides(config: GatewayConfig) -> None:
@@ -1074,23 +987,6 @@ def _apply_env_overrides(config: GatewayConfig) -> None:
name=os.getenv("WECOM_HOME_CHANNEL_NAME", "Home"),
)
# WeCom callback mode (self-built apps)
wecom_callback_corp_id = os.getenv("WECOM_CALLBACK_CORP_ID")
wecom_callback_corp_secret = os.getenv("WECOM_CALLBACK_CORP_SECRET")
if wecom_callback_corp_id and wecom_callback_corp_secret:
if Platform.WECOM_CALLBACK not in config.platforms:
config.platforms[Platform.WECOM_CALLBACK] = PlatformConfig()
config.platforms[Platform.WECOM_CALLBACK].enabled = True
config.platforms[Platform.WECOM_CALLBACK].extra.update({
"corp_id": wecom_callback_corp_id,
"corp_secret": wecom_callback_corp_secret,
"agent_id": os.getenv("WECOM_CALLBACK_AGENT_ID", ""),
"token": os.getenv("WECOM_CALLBACK_TOKEN", ""),
"encoding_aes_key": os.getenv("WECOM_CALLBACK_ENCODING_AES_KEY", ""),
"host": os.getenv("WECOM_CALLBACK_HOST", "0.0.0.0"),
"port": int(os.getenv("WECOM_CALLBACK_PORT", "8645")),
})
# Weixin (personal WeChat via iLink Bot API)
weixin_token = os.getenv("WEIXIN_TOKEN")
weixin_account_id = os.getenv("WEIXIN_ACCOUNT_ID")
@@ -1121,9 +1017,6 @@ def _apply_env_overrides(config: GatewayConfig) -> None:
weixin_group_allowed_users = os.getenv("WEIXIN_GROUP_ALLOWED_USERS", "").strip()
if weixin_group_allowed_users:
extra["group_allow_from"] = weixin_group_allowed_users
weixin_split_multiline = os.getenv("WEIXIN_SPLIT_MULTILINE_MESSAGES", "").strip()
if weixin_split_multiline:
extra["split_multiline_messages"] = weixin_split_multiline
weixin_home = os.getenv("WEIXIN_HOME_CHANNEL", "").strip()
if weixin_home:
config.platforms[Platform.WEIXIN].home_channel = HomeChannel(
@@ -1155,32 +1048,6 @@ def _apply_env_overrides(config: GatewayConfig) -> None:
name=os.getenv("BLUEBUBBLES_HOME_CHANNEL_NAME", "Home"),
)
# QQ (Official Bot API v2)
qq_app_id = os.getenv("QQ_APP_ID")
qq_client_secret = os.getenv("QQ_CLIENT_SECRET")
if qq_app_id or qq_client_secret:
if Platform.QQBOT not in config.platforms:
config.platforms[Platform.QQBOT] = PlatformConfig()
config.platforms[Platform.QQBOT].enabled = True
extra = config.platforms[Platform.QQBOT].extra
if qq_app_id:
extra["app_id"] = qq_app_id
if qq_client_secret:
extra["client_secret"] = qq_client_secret
qq_allowed_users = os.getenv("QQ_ALLOWED_USERS", "").strip()
if qq_allowed_users:
extra["allow_from"] = qq_allowed_users
qq_group_allowed = os.getenv("QQ_GROUP_ALLOWED_USERS", "").strip()
if qq_group_allowed:
extra["group_allow_from"] = qq_group_allowed
qq_home = os.getenv("QQ_HOME_CHANNEL", "").strip()
if qq_home:
config.platforms[Platform.QQBOT].home_channel = HomeChannel(
platform=Platform.QQBOT,
chat_id=qq_home,
name=os.getenv("QQ_HOME_CHANNEL_NAME", "Home"),
)
# Session settings
idle_minutes = os.getenv("SESSION_IDLE_MINUTES")
if idle_minutes:

View File

@@ -1,224 +0,0 @@
"""
Gateway Config Validator & Fallback Fix — #892.
Validates gateway configuration and provides sensible defaults
for missing keys to prevent fallback chain breaks.
"""
import logging
import os
from typing import Dict, Any, List, Optional
from dataclasses import dataclass, field
logger = logging.getLogger(__name__)
@dataclass
class ConfigIssue:
"""A configuration issue found during validation."""
key: str
severity: str # error, warning, info
message: str
fix: str
@dataclass
class ConfigValidation:
"""Result of config validation."""
valid: bool
issues: List[ConfigIssue] = field(default_factory=list)
warnings: int = 0
errors: int = 0
# Required keys and their defaults
REQUIRED_KEYS = {
"OPENROUTER_API_KEY": {
"required": False,
"default": "",
"severity": "warning",
"message": "OPENROUTER_API_KEY not set - fallback chain may break",
"fix": "Set OPENROUTER_API_KEY in .env for OpenRouter provider",
},
"API_SERVER_KEY": {
"required": False,
"default": "",
"severity": "warning",
"message": "API_SERVER_KEY not configured",
"fix": "Set API_SERVER_KEY in .env for API server auth",
},
"GITEA_TOKEN": {
"required": False,
"default": "",
"severity": "info",
"message": "GITEA_TOKEN not set - Gitea features disabled",
"fix": "Set GITEA_TOKEN in .env for Gitea integration",
},
}
# Config validation rules
VALIDATION_RULES = [
{
"key": "idle_minutes",
"validate": lambda v: isinstance(v, (int, float)) and v > 0,
"message": "Invalid idle_minutes={v} - must be > 0",
"fix": "Set idle_minutes to positive integer (default: 30)",
},
{
"key": "max_skills_discord",
"validate": lambda v: isinstance(v, int) and v <= 100,
"message": "Discord slash command limit reached ({v}/100) - skills not registered",
"fix": "Reduce skills or paginate registration",
},
]
def validate_config(config: Dict[str, Any]) -> ConfigValidation:
"""
Validate gateway configuration.
Args:
config: Configuration dictionary
Returns:
ConfigValidation with issues found
"""
issues = []
# Check required keys
for key, spec in REQUIRED_KEYS.items():
value = config.get(key) or os.environ.get(key) or spec["default"]
if spec["required"] and not value:
issues.append(ConfigIssue(
key=key,
severity=spec["severity"],
message=spec["message"],
fix=spec["fix"],
))
elif not value and spec["severity"] != "error":
issues.append(ConfigIssue(
key=key,
severity=spec["severity"],
message=spec["message"],
fix=spec["fix"],
))
# Check validation rules
for rule in VALIDATION_RULES:
value = config.get(rule["key"])
if value is not None:
if not rule["validate"](value):
issues.append(ConfigIssue(
key=rule["key"],
severity="error",
message=rule["message"].format(v=value),
fix=rule["fix"],
))
errors = sum(1 for i in issues if i.severity == "error")
warnings = sum(1 for i in issues if i.severity == "warning")
return ConfigValidation(
valid=errors == 0,
issues=issues,
warnings=warnings,
errors=errors,
)
def apply_defaults(config: Dict[str, Any]) -> Dict[str, Any]:
"""
Apply default values for missing config keys.
Args:
config: Configuration dictionary
Returns:
Config with defaults applied
"""
result = dict(config)
for key, spec in REQUIRED_KEYS.items():
if key not in result or not result[key]:
default = os.environ.get(key) or spec["default"]
if default:
result[key] = default
logger.debug("Applied default for %s", key)
# Apply validation defaults
if "idle_minutes" not in result or not result["idle_minutes"] or result["idle_minutes"] <= 0:
result["idle_minutes"] = 30
logger.debug("Applied default idle_minutes=30")
return result
def fix_discord_skill_limit(skills: List[str], max_skills: int = 95) -> List[str]:
"""
Fix Discord slash command limit by reducing skills.
Args:
skills: List of skill names
max_skills: Maximum skills to register (default 95, leaving room for built-ins)
Returns:
Reduced skill list
"""
if len(skills) <= max_skills:
return skills
logger.warning(
"Discord skill limit: %d skills exceeds %d limit, truncating",
len(skills), max_skills
)
# Keep first max_skills (alphabetical priority)
return sorted(skills)[:max_skills]
def validate_provider_config(provider: str, config: Dict[str, Any]) -> ConfigIssue:
"""
Validate provider-specific configuration.
Args:
provider: Provider name
config: Provider config
Returns:
ConfigIssue if invalid, None if valid
"""
if provider == "local-llama.cpp":
# Check if llama.cpp is configured
if not config.get("model_path") and not config.get("base_url"):
return ConfigIssue(
key=f"provider.{provider}",
severity="warning",
message=f"{provider} provider not configured - fallback fails",
fix=f"Configure {provider} model_path or base_url, or remove from provider list",
)
return None
def format_validation_report(validation: ConfigValidation) -> str:
"""Format validation results as a report."""
lines = [
"=" * 50,
"GATEWAY CONFIG VALIDATION",
"=" * 50,
"",
f"Status: {'VALID' if validation.valid else 'INVALID'}",
f"Errors: {validation.errors}",
f"Warnings: {validation.warnings}",
"",
]
if validation.issues:
lines.append("Issues:")
for issue in validation.issues:
icon = "" if issue.severity == "error" else "⚠️" if issue.severity == "warning" else ""
lines.append(f" {icon} [{issue.key}] {issue.message}")
lines.append(f" Fix: {issue.fix}")
lines.append("")
return "\n".join(lines)

View File

@@ -12,7 +12,7 @@ import logging
from pathlib import Path
from datetime import datetime
from dataclasses import dataclass
from typing import Dict, List, Optional, Any
from typing import Dict, List, Optional, Any, Union
from hermes_cli.config import get_hermes_home

View File

@@ -1,194 +0,0 @@
"""Per-platform display/verbosity configuration resolver.
Provides ``resolve_display_setting()`` — the single entry-point for reading
display settings with platform-specific overrides and sensible defaults.
Resolution order (first non-None wins):
1. ``display.platforms.<platform>.<key>`` — explicit per-platform user override
2. ``display.<key>`` — global user setting
3. ``_PLATFORM_DEFAULTS[<platform>][<key>]`` — built-in sensible default
4. ``_GLOBAL_DEFAULTS[<key>]`` — built-in global default
Exception: ``display.streaming`` is CLI-only. Gateway streaming follows the
top-level ``streaming`` config unless ``display.platforms.<platform>.streaming``
sets an explicit per-platform override.
Backward compatibility: ``display.tool_progress_overrides`` is still read as a
fallback for ``tool_progress`` when no ``display.platforms`` entry exists. A
config migration (version bump) automatically moves the old format into the new
``display.platforms`` structure.
"""
from __future__ import annotations
from typing import Any
# ---------------------------------------------------------------------------
# Overrideable display settings and their global defaults
# ---------------------------------------------------------------------------
# These are the settings that can be configured per-platform.
# Other display settings (compact, personality, skin, etc.) are CLI-only
# and don't participate in per-platform resolution.
_GLOBAL_DEFAULTS: dict[str, Any] = {
"tool_progress": "all",
"show_reasoning": False,
"tool_preview_length": 0,
"streaming": None, # None = follow top-level streaming config
}
# ---------------------------------------------------------------------------
# Sensible per-platform defaults — tiered by platform capability
# ---------------------------------------------------------------------------
# Tier 1 (high): Supports message editing, typically personal/team use
# Tier 2 (medium): Supports editing but often workspace/customer-facing
# Tier 3 (low): No edit support — each progress msg is permanent
# Tier 4 (minimal): Batch/non-interactive delivery
_TIER_HIGH = {
"tool_progress": "all",
"show_reasoning": False,
"tool_preview_length": 40,
"streaming": None, # follow global
}
_TIER_MEDIUM = {
"tool_progress": "new",
"show_reasoning": False,
"tool_preview_length": 40,
"streaming": None,
}
_TIER_LOW = {
"tool_progress": "off",
"show_reasoning": False,
"tool_preview_length": 40,
"streaming": False,
}
_TIER_MINIMAL = {
"tool_progress": "off",
"show_reasoning": False,
"tool_preview_length": 0,
"streaming": False,
}
_PLATFORM_DEFAULTS: dict[str, dict[str, Any]] = {
# Tier 1 — full edit support, personal/team use
"telegram": _TIER_HIGH,
"discord": _TIER_HIGH,
# Tier 2 — edit support, often customer/workspace channels
"slack": _TIER_MEDIUM,
"mattermost": _TIER_MEDIUM,
"matrix": _TIER_MEDIUM,
"feishu": _TIER_MEDIUM,
# Tier 3 — no edit support, progress messages are permanent
"signal": _TIER_LOW,
"whatsapp": _TIER_MEDIUM, # Baileys bridge supports /edit
"bluebubbles": _TIER_LOW,
"weixin": _TIER_LOW,
"wecom": _TIER_LOW,
"wecom_callback": _TIER_LOW,
"dingtalk": _TIER_LOW,
# Tier 4 — batch or non-interactive delivery
"email": _TIER_MINIMAL,
"sms": _TIER_MINIMAL,
"webhook": _TIER_MINIMAL,
"homeassistant": _TIER_MINIMAL,
"api_server": {**_TIER_HIGH, "tool_preview_length": 0},
}
# Canonical set of per-platform overrideable keys (for validation).
OVERRIDEABLE_KEYS = frozenset(_GLOBAL_DEFAULTS.keys())
def resolve_display_setting(
user_config: dict,
platform_key: str,
setting: str,
fallback: Any = None,
) -> Any:
"""Resolve a display setting with per-platform override support.
Parameters
----------
user_config : dict
The full parsed config.yaml dict.
platform_key : str
Platform config key (e.g. ``"telegram"``, ``"slack"``). Use
``_platform_config_key(source.platform)`` from gateway/run.py.
setting : str
Display setting name (e.g. ``"tool_progress"``, ``"show_reasoning"``).
fallback : Any
Fallback value when the setting isn't found anywhere.
Returns
-------
The resolved value, or *fallback* if nothing is configured.
"""
display_cfg = user_config.get("display") or {}
# 1. Explicit per-platform override (display.platforms.<platform>.<key>)
platforms = display_cfg.get("platforms") or {}
plat_overrides = platforms.get(platform_key)
if isinstance(plat_overrides, dict):
val = plat_overrides.get(setting)
if val is not None:
return _normalise(setting, val)
# 1b. Backward compat: display.tool_progress_overrides.<platform>
if setting == "tool_progress":
legacy = display_cfg.get("tool_progress_overrides")
if isinstance(legacy, dict):
val = legacy.get(platform_key)
if val is not None:
return _normalise(setting, val)
# 2. Global user setting (display.<key>). Skip display.streaming because
# that key controls only CLI terminal streaming; gateway token streaming is
# governed by the top-level streaming config plus per-platform overrides.
if setting != "streaming":
val = display_cfg.get(setting)
if val is not None:
return _normalise(setting, val)
# 3. Built-in platform default
plat_defaults = _PLATFORM_DEFAULTS.get(platform_key)
if plat_defaults:
val = plat_defaults.get(setting)
if val is not None:
return val
# 4. Built-in global default
val = _GLOBAL_DEFAULTS.get(setting)
if val is not None:
return val
return fallback
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _normalise(setting: str, value: Any) -> Any:
"""Normalise YAML quirks (bare ``off`` → False in YAML 1.1)."""
if setting == "tool_progress":
if value is False:
return "off"
if value is True:
return "all"
return str(value).lower()
if setting in ("show_reasoning", "streaming"):
if isinstance(value, str):
return value.lower() in ("true", "1", "yes", "on")
return bool(value)
if setting == "tool_preview_length":
try:
return int(value)
except (TypeError, ValueError):
return 0
return value

View File

@@ -9,11 +9,9 @@ Each adapter handles:
"""
from .base import BasePlatformAdapter, MessageEvent, SendResult
from .qqbot import QQAdapter
__all__ = [
"BasePlatformAdapter",
"MessageEvent",
"SendResult",
"QQAdapter",
]

View File

@@ -2,11 +2,6 @@
OpenAI-compatible API server platform adapter.
Exposes an HTTP server with endpoints:
- GET / — Hermes Web Console operator cockpit
- GET /api/gui/health — cockpit health payload
- GET /api/gui/browser/status — browser runtime status
- POST /api/gui/browser/heal — self-healing browser cleanup
- GET /api/gui/discovery — ecosystem discovery for compatible frontends
- POST /v1/chat/completions — OpenAI Chat Completions format (stateless; opt-in session continuity via X-Hermes-Session-Id header)
- POST /v1/responses — OpenAI Responses API format (stateful via previous_response_id)
- GET /v1/responses/{response_id} — Retrieve a stored response
@@ -15,7 +10,6 @@ Exposes an HTTP server with endpoints:
- POST /v1/runs — start a run, returns run_id immediately (202)
- GET /v1/runs/{run_id}/events — SSE stream of structured lifecycle events
- GET /health — health check
- GET /health/detailed — rich status for cross-container dashboard probing
Any OpenAI-compatible frontend (Open WebUI, LobeChat, LibreChat,
AnythingLLM, NextChat, ChatBox, etc.) can connect to hermes-agent
@@ -59,67 +53,6 @@ DEFAULT_HOST = "127.0.0.1"
DEFAULT_PORT = 8642
MAX_STORED_RESPONSES = 100
MAX_REQUEST_BYTES = 1_000_000 # 1 MB default limit for POST bodies
CHAT_COMPLETIONS_SSE_KEEPALIVE_SECONDS = 30.0
MAX_NORMALIZED_TEXT_LENGTH = 65_536 # 64 KB cap for normalized content parts
MAX_CONTENT_LIST_SIZE = 1_000 # Max items when content is an array
def _normalize_chat_content(
content: Any, *, _max_depth: int = 10, _depth: int = 0,
) -> str:
"""Normalize OpenAI chat message content into a plain text string.
Some clients (Open WebUI, LobeChat, etc.) send content as an array of
typed parts instead of a plain string::
[{"type": "text", "text": "hello"}, {"type": "input_text", "text": "..."}]
This function flattens those into a single string so the agent pipeline
(which expects strings) doesn't choke.
Defensive limits prevent abuse: recursion depth, list size, and output
length are all bounded.
"""
if _depth > _max_depth:
return ""
if content is None:
return ""
if isinstance(content, str):
return content[:MAX_NORMALIZED_TEXT_LENGTH] if len(content) > MAX_NORMALIZED_TEXT_LENGTH else content
if isinstance(content, list):
parts: List[str] = []
items = content[:MAX_CONTENT_LIST_SIZE] if len(content) > MAX_CONTENT_LIST_SIZE else content
for item in items:
if isinstance(item, str):
if item:
parts.append(item[:MAX_NORMALIZED_TEXT_LENGTH])
elif isinstance(item, dict):
item_type = str(item.get("type") or "").strip().lower()
if item_type in {"text", "input_text", "output_text"}:
text = item.get("text", "")
if text:
try:
parts.append(str(text)[:MAX_NORMALIZED_TEXT_LENGTH])
except Exception:
pass
# Silently skip image_url / other non-text parts
elif isinstance(item, list):
nested = _normalize_chat_content(item, _max_depth=_max_depth, _depth=_depth + 1)
if nested:
parts.append(nested)
# Check accumulated size
if sum(len(p) for p in parts) >= MAX_NORMALIZED_TEXT_LENGTH:
break
result = "\n".join(parts)
return result[:MAX_NORMALIZED_TEXT_LENGTH] if len(result) > MAX_NORMALIZED_TEXT_LENGTH else result
# Fallback for unexpected types (int, float, bool, etc.)
try:
result = str(content)
return result[:MAX_NORMALIZED_TEXT_LENGTH] if len(result) > MAX_NORMALIZED_TEXT_LENGTH else result
except Exception:
return ""
def check_api_server_requirements() -> bool:
@@ -520,8 +453,6 @@ class APIServerAdapter(BasePlatformAdapter):
session_id: Optional[str] = None,
stream_delta_callback=None,
tool_progress_callback=None,
tool_start_callback=None,
tool_complete_callback=None,
) -> Any:
"""
Create an AIAgent instance using the gateway's runtime config.
@@ -560,8 +491,6 @@ class APIServerAdapter(BasePlatformAdapter):
platform="api_server",
stream_delta_callback=stream_delta_callback,
tool_progress_callback=tool_progress_callback,
tool_start_callback=tool_start_callback,
tool_complete_callback=tool_complete_callback,
session_db=self._ensure_session_db(),
fallback_model=fallback_model,
)
@@ -575,27 +504,6 @@ class APIServerAdapter(BasePlatformAdapter):
"""GET /health — simple health check."""
return web.json_response({"status": "ok", "platform": "hermes-agent"})
async def _handle_health_detailed(self, request: "web.Request") -> "web.Response":
"""GET /health/detailed — rich status for cross-container dashboard probing.
Returns gateway state, connected platforms, PID, and uptime so the
dashboard can display full status without needing a shared PID file or
/proc access. No authentication required.
"""
from gateway.status import read_runtime_status
runtime = read_runtime_status() or {}
return web.json_response({
"status": "ok",
"platform": "hermes-agent",
"gateway_state": runtime.get("gateway_state"),
"platforms": runtime.get("platforms", {}),
"active_agents": runtime.get("active_agents", 0),
"exit_reason": runtime.get("exit_reason"),
"updated_at": runtime.get("updated_at"),
"pid": os.getpid(),
})
async def _handle_models(self, request: "web.Request") -> "web.Response":
"""GET /v1/models — return hermes-agent as an available model."""
auth_err = self._check_auth(request)
@@ -644,7 +552,7 @@ class APIServerAdapter(BasePlatformAdapter):
for msg in messages:
role = msg.get("role", "")
content = _normalize_chat_content(msg.get("content", ""))
content = msg.get("content", "")
if role == "system":
# Accumulate system messages
if system_prompt is None:
@@ -854,11 +762,7 @@ class APIServerAdapter(BasePlatformAdapter):
"""
import queue as _q
sse_headers = {
"Content-Type": "text/event-stream",
"Cache-Control": "no-cache",
"X-Accel-Buffering": "no",
}
sse_headers = {"Content-Type": "text/event-stream", "Cache-Control": "no-cache"}
# CORS middleware can't inject headers into StreamResponse after
# prepare() flushes them, so resolve CORS headers up front.
origin = request.headers.get("Origin", "")
@@ -871,8 +775,6 @@ class APIServerAdapter(BasePlatformAdapter):
await response.prepare(request)
try:
last_activity = time.monotonic()
# Role chunk
role_chunk = {
"id": completion_id, "object": "chat.completion.chunk",
@@ -880,7 +782,6 @@ class APIServerAdapter(BasePlatformAdapter):
"choices": [{"index": 0, "delta": {"role": "assistant"}, "finish_reason": None}],
}
await response.write(f"data: {json.dumps(role_chunk)}\n\n".encode())
last_activity = time.monotonic()
# Helper — route a queue item to the correct SSE event.
async def _emit(item):
@@ -904,7 +805,6 @@ class APIServerAdapter(BasePlatformAdapter):
"choices": [{"index": 0, "delta": {"content": item}, "finish_reason": None}],
}
await response.write(f"data: {json.dumps(content_chunk)}\n\n".encode())
return time.monotonic()
# Stream content chunks as they arrive from the agent
loop = asyncio.get_event_loop()
@@ -919,19 +819,16 @@ class APIServerAdapter(BasePlatformAdapter):
delta = stream_q.get_nowait()
if delta is None:
break
last_activity = await _emit(delta)
await _emit(delta)
except _q.Empty:
break
break
if time.monotonic() - last_activity >= CHAT_COMPLETIONS_SSE_KEEPALIVE_SECONDS:
await response.write(b": keepalive\n\n")
last_activity = time.monotonic()
continue
if delta is None: # End of stream sentinel
break
last_activity = await _emit(delta)
await _emit(delta)
# Get usage from completed agent
usage = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
@@ -974,427 +871,6 @@ class APIServerAdapter(BasePlatformAdapter):
return response
async def _write_sse_responses(
self,
request: "web.Request",
response_id: str,
model: str,
created_at: int,
stream_q,
agent_task,
agent_ref,
conversation_history: List[Dict[str, str]],
user_message: str,
instructions: Optional[str],
conversation: Optional[str],
store: bool,
session_id: str,
) -> "web.StreamResponse":
"""Write an SSE stream for POST /v1/responses (OpenAI Responses API).
Emits spec-compliant event types as the agent runs:
- ``response.created`` — initial envelope (status=in_progress)
- ``response.output_text.delta`` / ``response.output_text.done`` —
streamed assistant text
- ``response.output_item.added`` / ``response.output_item.done``
with ``item.type == "function_call"`` — when the agent invokes a
tool (both events fire; the ``done`` event carries the finalized
``arguments`` string)
- ``response.output_item.added`` with
``item.type == "function_call_output"`` — tool result with
``{call_id, output, status}``
- ``response.completed`` — terminal event carrying the full
response object with all output items + usage (same payload
shape as the non-streaming path for parity)
- ``response.failed`` — terminal event on agent error
If the client disconnects mid-stream, ``agent.interrupt()`` is
called so the agent stops issuing upstream LLM calls, then the
asyncio task is cancelled. When ``store=True`` the full response
is persisted to the ResponseStore in a ``finally`` block so GET
/v1/responses/{id} and ``previous_response_id`` chaining work the
same as the batch path.
"""
import queue as _q
sse_headers = {
"Content-Type": "text/event-stream",
"Cache-Control": "no-cache",
"X-Accel-Buffering": "no",
}
origin = request.headers.get("Origin", "")
cors = self._cors_headers_for_origin(origin) if origin else None
if cors:
sse_headers.update(cors)
if session_id:
sse_headers["X-Hermes-Session-Id"] = session_id
response = web.StreamResponse(status=200, headers=sse_headers)
await response.prepare(request)
# State accumulated during the stream
final_text_parts: List[str] = []
# Track open function_call items by name so we can emit a matching
# ``done`` event when the tool completes. Order preserved.
pending_tool_calls: List[Dict[str, Any]] = []
# Output items we've emitted so far (used to build the terminal
# response.completed payload). Kept in the order they appeared.
emitted_items: List[Dict[str, Any]] = []
# Monotonic counter for output_index (spec requires it).
output_index = 0
# Monotonic counter for call_id generation if the agent doesn't
# provide one (it doesn't, from tool_progress_callback).
call_counter = 0
# Canonical Responses SSE events include a monotonically increasing
# sequence_number. Add it server-side for every emitted event so
# clients that validate the OpenAI event schema can parse our stream.
sequence_number = 0
# Track the assistant message item id + content index for text
# delta events — the spec ties deltas to a specific item.
message_item_id = f"msg_{uuid.uuid4().hex[:24]}"
message_output_index: Optional[int] = None
message_opened = False
async def _write_event(event_type: str, data: Dict[str, Any]) -> None:
nonlocal sequence_number
if "sequence_number" not in data:
data["sequence_number"] = sequence_number
sequence_number += 1
payload = f"event: {event_type}\ndata: {json.dumps(data)}\n\n"
await response.write(payload.encode())
def _envelope(status: str) -> Dict[str, Any]:
env: Dict[str, Any] = {
"id": response_id,
"object": "response",
"status": status,
"created_at": created_at,
"model": model,
}
return env
final_response_text = ""
agent_error: Optional[str] = None
usage: Dict[str, int] = {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}
try:
# response.created — initial envelope, status=in_progress
created_env = _envelope("in_progress")
created_env["output"] = []
await _write_event("response.created", {
"type": "response.created",
"response": created_env,
})
last_activity = time.monotonic()
async def _open_message_item() -> None:
"""Emit response.output_item.added for the assistant message
the first time any text delta arrives."""
nonlocal message_opened, message_output_index, output_index
if message_opened:
return
message_opened = True
message_output_index = output_index
output_index += 1
item = {
"id": message_item_id,
"type": "message",
"status": "in_progress",
"role": "assistant",
"content": [],
}
await _write_event("response.output_item.added", {
"type": "response.output_item.added",
"output_index": message_output_index,
"item": item,
})
async def _emit_text_delta(delta_text: str) -> None:
await _open_message_item()
final_text_parts.append(delta_text)
await _write_event("response.output_text.delta", {
"type": "response.output_text.delta",
"item_id": message_item_id,
"output_index": message_output_index,
"content_index": 0,
"delta": delta_text,
"logprobs": [],
})
async def _emit_tool_started(payload: Dict[str, Any]) -> str:
"""Emit response.output_item.added for a function_call.
Returns the call_id so the matching completion event can
reference it. Prefer the real ``tool_call_id`` from the
agent when available; fall back to a generated call id for
safety in tests or older code paths.
"""
nonlocal output_index, call_counter
call_counter += 1
call_id = payload.get("tool_call_id") or f"call_{response_id[5:]}_{call_counter}"
args = payload.get("arguments", {})
if isinstance(args, dict):
arguments_str = json.dumps(args)
else:
arguments_str = str(args)
item = {
"id": f"fc_{uuid.uuid4().hex[:24]}",
"type": "function_call",
"status": "in_progress",
"name": payload.get("name", ""),
"call_id": call_id,
"arguments": arguments_str,
}
idx = output_index
output_index += 1
pending_tool_calls.append({
"call_id": call_id,
"name": payload.get("name", ""),
"arguments": arguments_str,
"item_id": item["id"],
"output_index": idx,
})
emitted_items.append({
"type": "function_call",
"name": payload.get("name", ""),
"arguments": arguments_str,
"call_id": call_id,
})
await _write_event("response.output_item.added", {
"type": "response.output_item.added",
"output_index": idx,
"item": item,
})
return call_id
async def _emit_tool_completed(payload: Dict[str, Any]) -> None:
"""Emit response.output_item.done (function_call) followed
by response.output_item.added (function_call_output)."""
nonlocal output_index
call_id = payload.get("tool_call_id")
result = payload.get("result", "")
pending = None
if call_id:
for i, p in enumerate(pending_tool_calls):
if p["call_id"] == call_id:
pending = pending_tool_calls.pop(i)
break
if pending is None:
# Completion without a matching start — skip to avoid
# emitting orphaned done events.
return
# function_call done
done_item = {
"id": pending["item_id"],
"type": "function_call",
"status": "completed",
"name": pending["name"],
"call_id": pending["call_id"],
"arguments": pending["arguments"],
}
await _write_event("response.output_item.done", {
"type": "response.output_item.done",
"output_index": pending["output_index"],
"item": done_item,
})
# function_call_output added (result)
result_str = result if isinstance(result, str) else json.dumps(result)
output_parts = [{"type": "input_text", "text": result_str}]
output_item = {
"id": f"fco_{uuid.uuid4().hex[:24]}",
"type": "function_call_output",
"call_id": pending["call_id"],
"output": output_parts,
"status": "completed",
}
idx = output_index
output_index += 1
emitted_items.append({
"type": "function_call_output",
"call_id": pending["call_id"],
"output": output_parts,
})
await _write_event("response.output_item.added", {
"type": "response.output_item.added",
"output_index": idx,
"item": output_item,
})
await _write_event("response.output_item.done", {
"type": "response.output_item.done",
"output_index": idx,
"item": output_item,
})
# Main drain loop — thread-safe queue fed by agent callbacks.
async def _dispatch(it) -> None:
"""Route a queue item to the correct SSE emitter.
Plain strings are text deltas. Tagged tuples with
``__tool_started__`` / ``__tool_completed__`` prefixes
are tool lifecycle events.
"""
if isinstance(it, tuple) and len(it) == 2 and isinstance(it[0], str):
tag, payload = it
if tag == "__tool_started__":
await _emit_tool_started(payload)
elif tag == "__tool_completed__":
await _emit_tool_completed(payload)
# Unknown tags are silently ignored (forward-compat).
elif isinstance(it, str):
await _emit_text_delta(it)
# Other types (non-string, non-tuple) are silently dropped.
loop = asyncio.get_event_loop()
while True:
try:
item = await loop.run_in_executor(None, lambda: stream_q.get(timeout=0.5))
except _q.Empty:
if agent_task.done():
# Drain remaining
while True:
try:
item = stream_q.get_nowait()
if item is None:
break
await _dispatch(item)
last_activity = time.monotonic()
except _q.Empty:
break
break
if time.monotonic() - last_activity >= CHAT_COMPLETIONS_SSE_KEEPALIVE_SECONDS:
await response.write(b": keepalive\n\n")
last_activity = time.monotonic()
continue
if item is None: # EOS sentinel
break
await _dispatch(item)
last_activity = time.monotonic()
# Pick up agent result + usage from the completed task
try:
result, agent_usage = await agent_task
usage = agent_usage or usage
# If the agent produced a final_response but no text
# deltas were streamed (e.g. some providers only emit
# the full response at the end), emit a single fallback
# delta so Responses clients still receive a live text part.
agent_final = result.get("final_response", "") if isinstance(result, dict) else ""
if agent_final and not final_text_parts:
await _emit_text_delta(agent_final)
if agent_final and not final_response_text:
final_response_text = agent_final
if isinstance(result, dict) and result.get("error") and not final_response_text:
agent_error = result["error"]
except Exception as e: # noqa: BLE001
logger.error("Error running agent for streaming responses: %s", e, exc_info=True)
agent_error = str(e)
# Close the message item if it was opened
final_response_text = "".join(final_text_parts) or final_response_text
if message_opened:
await _write_event("response.output_text.done", {
"type": "response.output_text.done",
"item_id": message_item_id,
"output_index": message_output_index,
"content_index": 0,
"text": final_response_text,
"logprobs": [],
})
msg_done_item = {
"id": message_item_id,
"type": "message",
"status": "completed",
"role": "assistant",
"content": [
{"type": "output_text", "text": final_response_text}
],
}
await _write_event("response.output_item.done", {
"type": "response.output_item.done",
"output_index": message_output_index,
"item": msg_done_item,
})
# Always append a final message item in the completed
# response envelope so clients that only parse the terminal
# payload still see the assistant text. This mirrors the
# shape produced by _extract_output_items in the batch path.
final_items: List[Dict[str, Any]] = list(emitted_items)
final_items.append({
"type": "message",
"role": "assistant",
"content": [
{"type": "output_text", "text": final_response_text or (agent_error or "")}
],
})
if agent_error:
failed_env = _envelope("failed")
failed_env["output"] = final_items
failed_env["error"] = {"message": agent_error, "type": "server_error"}
failed_env["usage"] = {
"input_tokens": usage.get("input_tokens", 0),
"output_tokens": usage.get("output_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}
await _write_event("response.failed", {
"type": "response.failed",
"response": failed_env,
})
else:
completed_env = _envelope("completed")
completed_env["output"] = final_items
completed_env["usage"] = {
"input_tokens": usage.get("input_tokens", 0),
"output_tokens": usage.get("output_tokens", 0),
"total_tokens": usage.get("total_tokens", 0),
}
await _write_event("response.completed", {
"type": "response.completed",
"response": completed_env,
})
# Persist for future chaining / GET retrieval, mirroring
# the batch path behavior.
if store:
full_history = list(conversation_history)
full_history.append({"role": "user", "content": user_message})
if isinstance(result, dict) and result.get("messages"):
full_history.extend(result["messages"])
else:
full_history.append({"role": "assistant", "content": final_response_text})
self._response_store.put(response_id, {
"response": completed_env,
"conversation_history": full_history,
"instructions": instructions,
"session_id": session_id,
})
if conversation:
self._response_store.set_conversation(conversation, response_id)
except (ConnectionResetError, ConnectionAbortedError, BrokenPipeError, OSError):
# Client disconnected — interrupt the agent so it stops
# making upstream LLM calls, then cancel the task.
agent = agent_ref[0] if agent_ref else None
if agent is not None:
try:
agent.interrupt("SSE client disconnected")
except Exception:
pass
if not agent_task.done():
agent_task.cancel()
try:
await agent_task
except (asyncio.CancelledError, Exception):
pass
logger.info("SSE client disconnected; interrupted agent task %s", response_id)
return response
async def _handle_responses(self, request: "web.Request") -> "web.Response":
"""POST /v1/responses — OpenAI Responses API format."""
auth_err = self._check_auth(request)
@@ -1438,7 +914,18 @@ class APIServerAdapter(BasePlatformAdapter):
input_messages.append({"role": "user", "content": item})
elif isinstance(item, dict):
role = item.get("role", "user")
content = _normalize_chat_content(item.get("content", ""))
content = item.get("content", "")
# Handle content that may be a list of content parts
if isinstance(content, list):
text_parts = []
for part in content:
if isinstance(part, dict) and part.get("type") == "input_text":
text_parts.append(part.get("text", ""))
elif isinstance(part, dict) and part.get("type") == "output_text":
text_parts.append(part.get("text", ""))
elif isinstance(part, str):
text_parts.append(part)
content = "\n".join(text_parts)
input_messages.append({"role": role, "content": content})
else:
return web.json_response(_openai_error("'input' must be a string or array"), status=400)
@@ -1465,13 +952,11 @@ class APIServerAdapter(BasePlatformAdapter):
if previous_response_id:
logger.debug("Both conversation_history and previous_response_id provided; using conversation_history")
stored_session_id = None
if not conversation_history and previous_response_id:
stored = self._response_store.get(previous_response_id)
if stored is None:
return web.json_response(_openai_error(f"Previous response not found: {previous_response_id}"), status=404)
conversation_history = list(stored.get("conversation_history", []))
stored_session_id = stored.get("session_id")
# If no instructions provided, carry forward from previous
if instructions is None:
instructions = stored.get("instructions")
@@ -1489,83 +974,8 @@ class APIServerAdapter(BasePlatformAdapter):
if body.get("truncation") == "auto" and len(conversation_history) > 100:
conversation_history = conversation_history[-100:]
# Reuse session from previous_response_id chain so the dashboard
# groups the entire conversation under one session entry.
session_id = stored_session_id or str(uuid.uuid4())
stream = bool(body.get("stream", False))
if stream:
# Streaming branch — emit OpenAI Responses SSE events as the
# agent runs so frontends can render text deltas and tool
# calls in real time. See _write_sse_responses for details.
import queue as _q
_stream_q: _q.Queue = _q.Queue()
def _on_delta(delta):
# None from the agent is a CLI box-close signal, not EOS.
# Forwarding would kill the SSE stream prematurely; the
# SSE writer detects completion via agent_task.done().
if delta is not None:
_stream_q.put(delta)
def _on_tool_progress(event_type, name, preview, args, **kwargs):
"""Queue non-start tool progress events if needed in future.
The structured Responses stream uses ``tool_start_callback``
and ``tool_complete_callback`` for exact call-id correlation,
so progress events are currently ignored here.
"""
return
def _on_tool_start(tool_call_id, function_name, function_args):
"""Queue a started tool for live function_call streaming."""
_stream_q.put(("__tool_started__", {
"tool_call_id": tool_call_id,
"name": function_name,
"arguments": function_args or {},
}))
def _on_tool_complete(tool_call_id, function_name, function_args, function_result):
"""Queue a completed tool result for live function_call_output streaming."""
_stream_q.put(("__tool_completed__", {
"tool_call_id": tool_call_id,
"name": function_name,
"arguments": function_args or {},
"result": function_result,
}))
agent_ref = [None]
agent_task = asyncio.ensure_future(self._run_agent(
user_message=user_message,
conversation_history=conversation_history,
ephemeral_system_prompt=instructions,
session_id=session_id,
stream_delta_callback=_on_delta,
tool_progress_callback=_on_tool_progress,
tool_start_callback=_on_tool_start,
tool_complete_callback=_on_tool_complete,
agent_ref=agent_ref,
))
response_id = f"resp_{uuid.uuid4().hex[:28]}"
model_name = body.get("model", self._model_name)
created_at = int(time.time())
return await self._write_sse_responses(
request=request,
response_id=response_id,
model=model_name,
created_at=created_at,
stream_q=_stream_q,
agent_task=agent_task,
agent_ref=agent_ref,
conversation_history=conversation_history,
user_message=user_message,
instructions=instructions,
conversation=conversation,
store=store,
session_id=session_id,
)
# Run the agent (with Idempotency-Key support)
session_id = str(uuid.uuid4())
async def _compute_response():
return await self._run_agent(
@@ -1640,7 +1050,6 @@ class APIServerAdapter(BasePlatformAdapter):
"response": response_data,
"conversation_history": full_history,
"instructions": instructions,
"session_id": session_id,
})
# Update conversation mapping so the next request with the same
# conversation name automatically chains to this response
@@ -1994,8 +1403,6 @@ class APIServerAdapter(BasePlatformAdapter):
session_id: Optional[str] = None,
stream_delta_callback=None,
tool_progress_callback=None,
tool_start_callback=None,
tool_complete_callback=None,
agent_ref: Optional[list] = None,
) -> tuple:
"""
@@ -2017,8 +1424,6 @@ class APIServerAdapter(BasePlatformAdapter):
session_id=session_id,
stream_delta_callback=stream_delta_callback,
tool_progress_callback=tool_progress_callback,
tool_start_callback=tool_start_callback,
tool_complete_callback=tool_complete_callback,
)
if agent_ref is not None:
agent_ref[0] = agent
@@ -2155,12 +1560,10 @@ class APIServerAdapter(BasePlatformAdapter):
if previous_response_id:
logger.debug("Both conversation_history and previous_response_id provided; using conversation_history")
stored_session_id = None
if not conversation_history and previous_response_id:
stored = self._response_store.get(previous_response_id)
if stored:
conversation_history = list(stored.get("conversation_history", []))
stored_session_id = stored.get("session_id")
if instructions is None:
instructions = stored.get("instructions")
@@ -2179,7 +1582,7 @@ class APIServerAdapter(BasePlatformAdapter):
)
conversation_history.append({"role": msg["role"], "content": str(content)})
session_id = body.get("session_id") or stored_session_id or run_id
session_id = body.get("session_id") or run_id
ephemeral_system_prompt = instructions
async def _run_and_close():
@@ -2308,30 +1711,6 @@ class APIServerAdapter(BasePlatformAdapter):
# BasePlatformAdapter interface
# ------------------------------------------------------------------
def _register_routes(self, app: "web.Application") -> None:
"""Register API and operator-cockpit routes on an aiohttp app."""
from gateway.platforms.api_server_ui import maybe_register_web_console
app.router.add_get("/health", self._handle_health)
app.router.add_get("/health/detailed", self._handle_health_detailed)
app.router.add_get("/v1/health", self._handle_health)
app.router.add_get("/v1/models", self._handle_models)
app.router.add_post("/v1/chat/completions", self._handle_chat_completions)
app.router.add_post("/v1/responses", self._handle_responses)
app.router.add_get("/v1/responses/{response_id}", self._handle_get_response)
app.router.add_delete("/v1/responses/{response_id}", self._handle_delete_response)
app.router.add_get("/api/jobs", self._handle_list_jobs)
app.router.add_post("/api/jobs", self._handle_create_job)
app.router.add_get("/api/jobs/{job_id}", self._handle_get_job)
app.router.add_patch("/api/jobs/{job_id}", self._handle_update_job)
app.router.add_delete("/api/jobs/{job_id}", self._handle_delete_job)
app.router.add_post("/api/jobs/{job_id}/pause", self._handle_pause_job)
app.router.add_post("/api/jobs/{job_id}/resume", self._handle_resume_job)
app.router.add_post("/api/jobs/{job_id}/run", self._handle_run_job)
app.router.add_post("/v1/runs", self._handle_runs)
app.router.add_get("/v1/runs/{run_id}/events", self._handle_run_events)
maybe_register_web_console(app)
async def connect(self) -> bool:
"""Start the aiohttp web server."""
if not AIOHTTP_AVAILABLE:
@@ -2342,7 +1721,25 @@ class APIServerAdapter(BasePlatformAdapter):
mws = [mw for mw in (cors_middleware, body_limit_middleware, security_headers_middleware) if mw is not None]
self._app = web.Application(middlewares=mws)
self._app["api_server_adapter"] = self
self._register_routes(self._app)
self._app.router.add_get("/health", self._handle_health)
self._app.router.add_get("/v1/health", self._handle_health)
self._app.router.add_get("/v1/models", self._handle_models)
self._app.router.add_post("/v1/chat/completions", self._handle_chat_completions)
self._app.router.add_post("/v1/responses", self._handle_responses)
self._app.router.add_get("/v1/responses/{response_id}", self._handle_get_response)
self._app.router.add_delete("/v1/responses/{response_id}", self._handle_delete_response)
# Cron jobs management API
self._app.router.add_get("/api/jobs", self._handle_list_jobs)
self._app.router.add_post("/api/jobs", self._handle_create_job)
self._app.router.add_get("/api/jobs/{job_id}", self._handle_get_job)
self._app.router.add_patch("/api/jobs/{job_id}", self._handle_update_job)
self._app.router.add_delete("/api/jobs/{job_id}", self._handle_delete_job)
self._app.router.add_post("/api/jobs/{job_id}/pause", self._handle_pause_job)
self._app.router.add_post("/api/jobs/{job_id}/resume", self._handle_resume_job)
self._app.router.add_post("/api/jobs/{job_id}/run", self._handle_run_job)
# Structured event streaming
self._app.router.add_post("/v1/runs", self._handle_runs)
self._app.router.add_get("/v1/runs/{run_id}/events", self._handle_run_events)
# Start background sweep to clean up orphaned (unconsumed) run streams
sweep_task = asyncio.create_task(self._sweep_orphaned_runs())
try:
@@ -2361,23 +1758,6 @@ class APIServerAdapter(BasePlatformAdapter):
)
return False
# Refuse to start network-accessible with a placeholder key.
# Ported from openclaw/openclaw#64586.
if is_network_accessible(self._host) and self._api_key:
try:
from hermes_cli.auth import has_usable_secret
if not has_usable_secret(self._api_key, min_length=8):
logger.error(
"[%s] Refusing to start: API_SERVER_KEY is set to a "
"placeholder value. Generate a real secret "
"(e.g. `openssl rand -hex 32`) and set API_SERVER_KEY "
"before exposing the API server on %s.",
self.name, self._host,
)
return False
except ImportError:
pass
# Port conflict detection — fail fast if port is already in use
try:
with _socket.socket(_socket.AF_INET, _socket.SOCK_STREAM) as _s:

View File

@@ -1,194 +0,0 @@
"""Thin operator web console for the API server.
This keeps the UI intentionally small: an aiohttp-mounted cockpit that
surfaces Hermes health, browser runtime state, and ecosystem discovery
without introducing a second heavyweight frontend architecture.
"""
from __future__ import annotations
import json
from html import escape
from typing import Any, Dict
from aiohttp import web
from tools.browser_tool import browser_runtime_heal, browser_runtime_status
_DISCOVERY_FRONTENDS = [
"Open WebUI",
"LobeChat",
"LibreChat",
"AnythingLLM",
"NextChat",
"ChatBox",
]
def _adapter(request: web.Request):
return request.app["api_server_adapter"]
def _auth_or_none(request: web.Request):
adapter = _adapter(request)
return adapter._check_auth(request)
def _render_console_html(adapter) -> str:
health = {
"platform": "api_server",
"host": adapter._host,
"port": adapter._port,
"model": adapter._model_name,
"auth_required": bool(adapter._api_key),
}
health_json = escape(json.dumps(health, indent=2, ensure_ascii=False))
return f'''<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>Hermes Web Console</title>
<style>
:root {{ color-scheme: dark; --bg: #0b1020; --panel: #121933; --fg: #e5ecff; --muted: #9aa8d1; --accent: #72b8ff; --good: #6dde8a; }}
body {{ margin: 0; font-family: ui-monospace, SFMono-Regular, Menlo, monospace; background: var(--bg); color: var(--fg); }}
header {{ padding: 20px 24px; border-bottom: 1px solid #243056; }}
main {{ padding: 24px; display: grid; gap: 16px; grid-template-columns: repeat(auto-fit, minmax(320px, 1fr)); }}
.panel {{ background: var(--panel); border: 1px solid #243056; border-radius: 12px; padding: 16px; box-shadow: 0 10px 30px rgba(0,0,0,.2); }}
h1, h2 {{ margin: 0 0 12px; }}
h1 {{ font-size: 24px; color: var(--accent); }}
h2 {{ font-size: 16px; color: var(--accent); }}
p, li, label {{ color: var(--muted); line-height: 1.5; }}
pre {{ margin: 0; white-space: pre-wrap; word-break: break-word; color: var(--fg); }}
button, input {{ font: inherit; }}
button {{ background: #1e2a52; color: var(--fg); border: 1px solid #39508f; border-radius: 8px; padding: 10px 14px; cursor: pointer; }}
button:hover {{ border-color: var(--accent); }}
input {{ width: 100%; box-sizing: border-box; background: #0d142a; color: var(--fg); border: 1px solid #243056; border-radius: 8px; padding: 10px 12px; margin-bottom: 12px; }}
.row {{ display: flex; gap: 8px; flex-wrap: wrap; margin-bottom: 12px; }}
.badge {{ display: inline-block; color: var(--good); border: 1px solid #2f6940; border-radius: 999px; padding: 2px 10px; margin-left: 10px; font-size: 12px; }}
ul {{ margin: 0; padding-left: 18px; }}
code {{ color: var(--good); }}
</style>
</head>
<body>
<header>
<h1>Hermes Web Console <span class="badge">operator cockpit</span></h1>
<p>Thin web UI over the existing API server, browser runtime, and streaming endpoints.</p>
</header>
<main>
<section class="panel">
<h2>Gateway Health</h2>
<pre id="health">{health_json}</pre>
</section>
<section class="panel">
<h2>Browser Cockpit</h2>
<label for="apiKey">Optional API key (only needed when API_SERVER_KEY is configured)</label>
<input id="apiKey" type="password" placeholder="sk-... or bearer token">
<div class="row">
<button id="refreshBtn">Refresh Browser Status</button>
<button id="healBtn">Heal Browser Layer</button>
</div>
<pre id="browserStatus">Loading...</pre>
</section>
<section class="panel">
<h2>Ecosystem Discovery</h2>
<ul>
<li><code>GET /v1/models</code> — OpenAI-compatible model discovery</li>
<li><code>POST /v1/chat/completions</code> — chat frontend compatibility</li>
<li><code>POST /v1/responses</code> — stateful responses API</li>
<li><code>POST /v1/runs</code> + <code>GET /v1/runs/{{run_id}}/events</code> — SSE lifecycle stream</li>
<li><code>GET /api/gui/browser/status</code> — browser runtime status</li>
<li><code>POST /api/gui/browser/heal</code> — cleanup + orphan reaper</li>
</ul>
<pre id="discovery">Loading...</pre>
</section>
</main>
<script>
function authHeaders() {{
const key = document.getElementById('apiKey').value.trim();
return key ? {{ 'Authorization': 'Bearer ' + key }} : {{}};
}}
async function loadJson(path, options) {{
const response = await fetch(path, options);
const text = await response.text();
try {{ return {{ status: response.status, body: JSON.parse(text) }}; }}
catch (_) {{ return {{ status: response.status, body: {{ raw: text }} }}; }}
}}
async function refreshBrowser() {{
const result = await loadJson('/api/gui/browser/status', {{ headers: authHeaders() }});
document.getElementById('browserStatus').textContent = JSON.stringify(result, null, 2);
}}
async function healBrowser() {{
const result = await loadJson('/api/gui/browser/heal', {{ method: 'POST', headers: authHeaders() }});
document.getElementById('browserStatus').textContent = JSON.stringify(result, null, 2);
}}
async function loadDiscovery() {{
const result = await loadJson('/api/gui/discovery');
document.getElementById('discovery').textContent = JSON.stringify(result, null, 2);
}}
document.getElementById('refreshBtn').addEventListener('click', refreshBrowser);
document.getElementById('healBtn').addEventListener('click', healBrowser);
refreshBrowser();
loadDiscovery();
</script>
</body>
</html>'''
async def handle_web_console_index(request: web.Request) -> web.Response:
return web.Response(text=_render_console_html(_adapter(request)), content_type="text/html")
async def handle_gui_health(request: web.Request) -> web.Response:
adapter = _adapter(request)
return web.json_response({
"status": "ok",
"platform": "api_server",
"host": adapter._host,
"port": adapter._port,
"model": adapter._model_name,
"auth_required": bool(adapter._api_key),
})
async def handle_browser_status(request: web.Request) -> web.Response:
auth_err = _auth_or_none(request)
if auth_err is not None:
return auth_err
return web.json_response(browser_runtime_status())
async def handle_browser_heal(request: web.Request) -> web.Response:
auth_err = _auth_or_none(request)
if auth_err is not None:
return auth_err
return web.json_response(browser_runtime_heal())
async def handle_discovery(request: web.Request) -> web.Response:
adapter = _adapter(request)
return web.json_response({
"frontends": _DISCOVERY_FRONTENDS,
"operator_cockpit": {
"root": "/",
"health": "/api/gui/health",
"browser_status": "/api/gui/browser/status",
"browser_heal": "/api/gui/browser/heal",
},
"openai_compatible": {
"models": "/v1/models",
"chat_completions": "/v1/chat/completions",
"responses": "/v1/responses",
"runs": "/v1/runs",
"run_events": "/v1/runs/{run_id}/events",
"model_name": adapter._model_name,
},
})
def maybe_register_web_console(app: web.Application) -> None:
app.router.add_get("/", handle_web_console_index)
app.router.add_get("/api/gui/health", handle_gui_health)
app.router.add_get("/api/gui/browser/status", handle_browser_status)
app.router.add_post("/api/gui/browser/heal", handle_browser_heal)
app.router.add_get("/api/gui/discovery", handle_discovery)

Some files were not shown because too many files have changed in this diff Show More