run_agent.py

#!/usr/bin/env python3
"""
AI Agent Runner with Tool Calling

This module provides a clean, standalone agent that can execute AI models
with tool calling capabilities. It handles the conversation loop, tool execution,
and response management.

Features:
- Automatic tool calling loop until completion
- Configurable model parameters
- Error handling and recovery
- Message history management
- Support for multiple model providers

Usage:
    from run_agent import AIAgent
    
    agent = AIAgent(base_url="http://localhost:30000/v1", model="claude-opus-4-20250514")
    response = agent.run_conversation("Tell me about the latest Python updates")
"""

import atexit
import asyncio
import base64
import concurrent.futures
import copy
import hashlib
import json
import logging
logger = logging.getLogger(__name__)
import os
import random
import re
import sys
import tempfile
import time
import threading
import weakref
from types import SimpleNamespace
import uuid
from typing import List, Dict, Any, Optional
from openai import OpenAI
import fire
from datetime import datetime
from pathlib import Path

# Load .env from ~/.hermes/.env first, then project root as dev fallback.
# User-managed env files should override stale shell exports on restart.
from hermes_cli.env_loader import load_hermes_dotenv

# Hermes home comes from HERMES_HOME when set, otherwise ~/.hermes; the
# project-level .env is the one sitting next to this file.
_hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
_project_env = Path(__file__).parent / '.env'
_loaded_env_paths = load_hermes_dotenv(
    hermes_home=_hermes_home,
    project_env=_project_env,
)
if not _loaded_env_paths:
    logger.info("No .env file found. Using system environment variables.")
else:
    # Report every env file that was actually loaded.
    for _env_path in _loaded_env_paths:
        logger.info("Loaded environment variables from %s", _env_path)

# Point mini-swe-agent at the Hermes home so it shares our config.
# setdefault() leaves any value already present in the environment untouched.
os.environ.setdefault("MSWEA_GLOBAL_CONFIG_DIR", str(_hermes_home))
os.environ.setdefault("MSWEA_SILENT_STARTUP", "1")

# Import our tool system
from model_tools import get_tool_definitions, handle_function_call, check_toolset_requirements
from tools.terminal_tool import cleanup_vm
from tools.interrupt import set_interrupt as _set_interrupt
from tools.browser_tool import cleanup_browser

import requests

from hermes_constants import OPENROUTER_BASE_URL, OPENROUTER_MODELS_URL

# Agent internals extracted to agent/ package for modularity
from agent.prompt_builder import (
    DEFAULT_AGENT_IDENTITY, PLATFORM_HINTS,
    MEMORY_GUIDANCE, SESSION_SEARCH_GUIDANCE, SKILLS_GUIDANCE,
)
from agent.model_metadata import (
    fetch_model_metadata, get_model_context_length,
    estimate_tokens_rough, estimate_messages_tokens_rough,
    get_next_probe_tier, parse_context_limit_from_error,
    save_context_length,
)
from agent.context_compressor import ContextCompressor
from agent.prompt_caching import apply_anthropic_cache_control
from agent.prompt_builder import build_skills_system_prompt, build_context_files_prompt
from agent.display import (
    KawaiiSpinner, build_tool_preview as _build_tool_preview,
    get_cute_tool_message as _get_cute_tool_message_impl,
    _detect_tool_failure,
    get_tool_emoji as _get_tool_emoji,
)
from agent.trajectory import (
    convert_scratchpad_to_think, has_incomplete_scratchpad,
    save_trajectory as _save_trajectory_to_file,
)
from utils import atomic_json_write

# Names of the Honcho tools; every entry carries the ``honcho_`` prefix.
# NOTE(review): the consumers of this set are outside the visible part of the
# file — confirm how it is used before adding or removing entries.
HONCHO_TOOL_NAMES = {
    "honcho_context",
    "honcho_profile",
    "honcho_search",
    "honcho_conclude",
}


class _SafeWriter:
    """Transparent stdio wrapper that catches OSError from broken pipes.

    When hermes-agent runs as a systemd service, Docker container, or headless
    daemon, the stdout/stderr pipe can become unavailable (idle timeout, buffer
    exhaustion, socket reset). Any print() call then raises
    ``OSError: [Errno 5] Input/output error``, which can crash agent setup or
    run_conversation() — especially via double-fault when an except handler
    also tries to print.

    This wrapper delegates all writes to the underlying stream and silently
    catches OSError. It is transparent when the wrapped stream is healthy.
    """

    __slots__ = ("_inner",)

    def __init__(self, inner):
        object.__setattr__(self, "_inner", inner)

    def write(self, data):
        try:
            return self._inner.write(data)
        except OSError:
            return len(data) if isinstance(data, str) else 0

    def flush(self):
        try:
            self._inner.flush()
        except OSError:
            pass

    def fileno(self):
        return self._inner.fileno()

    def isatty(self):
        try:
            return self._inner.isatty()
        except OSError:
            return False

    def __getattr__(self, name):
        return getattr(self._inner, name)


def _install_safe_stdio() -> None:
    """Replace ``sys.stdout`` / ``sys.stderr`` with ``_SafeWriter`` wrappers.

    A stream that is missing, ``None``, or already wrapped is left alone, so
    calling this repeatedly never stacks wrappers.
    """
    for attr in ("stdout", "stderr"):
        current = getattr(sys, attr, None)
        if current is None or isinstance(current, _SafeWriter):
            continue
        setattr(sys, attr, _SafeWriter(current))


class IterationBudget:
    """Thread-safe shared iteration counter for parent and child agents.

    Tracks total LLM-call iterations consumed across a parent agent and all
    its subagents.  A single ``IterationBudget`` is created by the parent
    and passed to every child so they share the same cap.

    ``execute_code`` (programmatic tool calling) iterations are refunded via
    :meth:`refund` so they don't eat into the budget.

    Every access to the internal counter, reads included, happens while
    holding the same lock.
    """

    def __init__(self, max_total: int):
        """Create a budget that allows at most *max_total* iterations."""
        self.max_total = max_total
        self._used = 0
        self._lock = threading.Lock()

    def consume(self) -> bool:
        """Try to consume one iteration.  Returns True if allowed."""
        with self._lock:
            if self._used >= self.max_total:
                return False
            self._used += 1
            return True

    def refund(self) -> None:
        """Give back one iteration (e.g. for execute_code turns).

        The counter never drops below zero; refunding an untouched budget is
        a no-op.
        """
        with self._lock:
            if self._used > 0:
                self._used -= 1

    @property
    def used(self) -> int:
        """Number of iterations consumed so far."""
        # Read under the lock, matching ``remaining`` and the mutators.
        with self._lock:
            return self._used

    @property
    def remaining(self) -> int:
        """Number of iterations still available (never negative)."""
        with self._lock:
            return max(0, self.max_total - self._used)


# Tools that must never run concurrently (interactive / user-facing).
# When any of these appears in a batch, we fall back to sequential execution
# for that batch.
_NEVER_PARALLEL_TOOLS = frozenset({"clarify"})

# Maximum number of concurrent worker threads for parallel tool execution.
_MAX_TOOL_WORKERS = 8


def _inject_honcho_turn_context(content, turn_context: str):
    """Append Honcho recall to the current-turn user message without mutating history.

    The returned content is sent to the API for this turn only. Keeping Honcho
    recall out of the system prompt preserves the stable cache prefix while
    still giving the model continuity context.
    """
    if not turn_context:
        return content

    note = (
        "[System note: The following Honcho memory was retrieved from prior "
        "sessions. It is continuity context for this turn only, not new user "
        "input.]\n\n"
        f"{turn_context}"
    )

    if isinstance(content, list):
        return list(content) + [{"type": "text", "text": note}]

    text = "" if content is None else str(content)
    if not text.strip():
        return note
    return f"{text}\n\n{note}"


class AIAgent:
    """
    AI Agent with tool calling capabilities.
    
    This class manages the conversation flow, tool execution, and response handling
    for AI models that support function calling.
    """
    
    def __init__(
        self,
        base_url: str = None,
        api_key: str = None,
        provider: str = None,
        api_mode: str = None,
        model: str = "anthropic/claude-opus-4.6",  # OpenRouter format
        max_iterations: int = 90,  # Default tool-calling iterations (shared with subagents)
        tool_delay: float = 1.0,
        enabled_toolsets: List[str] = None,
        disabled_toolsets: List[str] = None,
        save_trajectories: bool = False,
        verbose_logging: bool = False,
        quiet_mode: bool = False,
        ephemeral_system_prompt: str = None,
        log_prefix_chars: int = 100,
        log_prefix: str = "",
        providers_allowed: List[str] = None,
        providers_ignored: List[str] = None,
        providers_order: List[str] = None,
        provider_sort: str = None,
        provider_require_parameters: bool = False,
        provider_data_collection: str = None,
        session_id: str = None,
        tool_progress_callback: callable = None,
        thinking_callback: callable = None,
        reasoning_callback: callable = None,
        clarify_callback: callable = None,
        step_callback: callable = None,
        max_tokens: int = None,
        reasoning_config: Dict[str, Any] = None,
        prefill_messages: List[Dict[str, Any]] = None,
        platform: str = None,
        skip_context_files: bool = False,
        skip_memory: bool = False,
        session_db=None,
        honcho_session_key: str = None,
        honcho_manager=None,
        honcho_config=None,
        iteration_budget: "IterationBudget" = None,
        fallback_model: Dict[str, Any] = None,
        checkpoints_enabled: bool = False,
        checkpoint_max_snapshots: int = 50,
        pass_session_id: bool = False,
    ):
        """
        Initialize the AI Agent.

        Args:
            base_url (str): Base URL for the model API (optional)
            api_key (str): API key for authentication (optional, uses env var if not provided)
            provider (str): Provider identifier (optional; used for telemetry/routing hints)
            api_mode (str): API mode override: "chat_completions" or "codex_responses"
            model (str): Model name to use (default: "anthropic/claude-opus-4.6")
            max_iterations (int): Maximum number of tool calling iterations (default: 90)
            tool_delay (float): Delay between tool calls in seconds (default: 1.0)
            enabled_toolsets (List[str]): Only enable tools from these toolsets (optional)
            disabled_toolsets (List[str]): Disable tools from these toolsets (optional)
            save_trajectories (bool): Whether to save conversation trajectories to JSONL files (default: False)
            verbose_logging (bool): Enable verbose logging for debugging (default: False)
            quiet_mode (bool): Suppress progress output for clean CLI experience (default: False)
            ephemeral_system_prompt (str): System prompt used during agent execution but NOT saved to trajectories (optional)
            log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses (default: 100)
            log_prefix (str): Prefix to add to all log messages for identification in parallel processing (default: "")
            providers_allowed (List[str]): OpenRouter providers to allow (optional)
            providers_ignored (List[str]): OpenRouter providers to ignore (optional)
            providers_order (List[str]): OpenRouter providers to try in order (optional)
            provider_sort (str): Sort providers by price/throughput/latency (optional)
            session_id (str): Pre-generated session ID for logging (optional, auto-generated if not provided)
            tool_progress_callback (callable): Callback function(tool_name, args_preview) for progress notifications
            clarify_callback (callable): Callback function(question, choices) -> str for interactive user questions.
                Provided by the platform layer (CLI or gateway). If None, the clarify tool returns an error.
            max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
            reasoning_config (Dict): OpenRouter reasoning configuration override (e.g. {"effort": "none"} to disable thinking).
                If None, defaults to {"enabled": True, "effort": "medium"} for OpenRouter. Set to disable/customize reasoning.
            prefill_messages (List[Dict]): Messages to prepend to conversation history as prefilled context.
                Useful for injecting a few-shot example or priming the model's response style.
                Example: [{"role": "user", "content": "Hi!"}, {"role": "assistant", "content": "Hello!"}]
            platform (str): The interface platform the user is on (e.g. "cli", "telegram", "discord", "whatsapp").
                Used to inject platform-specific formatting hints into the system prompt.
            skip_context_files (bool): If True, skip auto-injection of SOUL.md, AGENTS.md, and .cursorrules
                into the system prompt. Use this for batch processing and data generation to avoid
                polluting trajectories with user-specific persona or project instructions.
            honcho_session_key (str): Session key for Honcho integration (e.g., "telegram:123456" or CLI session_id).
                When provided and Honcho is enabled in config, enables persistent cross-session user modeling.
            honcho_manager: Optional shared HonchoSessionManager owned by the caller.
            honcho_config: Optional HonchoClientConfig corresponding to honcho_manager.
        """
        _install_safe_stdio()

        self.model = model
        self.max_iterations = max_iterations
        # Shared iteration budget — parent creates, children inherit.
        # Consumed by every LLM turn across parent + all subagents.
        self.iteration_budget = iteration_budget or IterationBudget(max_iterations)
        self.tool_delay = tool_delay
        self.save_trajectories = save_trajectories
        self.verbose_logging = verbose_logging
        self.quiet_mode = quiet_mode
        self.ephemeral_system_prompt = ephemeral_system_prompt
        self.platform = platform  # "cli", "telegram", "discord", "whatsapp", etc.
        self.skip_context_files = skip_context_files
        self.pass_session_id = pass_session_id
        self.log_prefix_chars = log_prefix_chars
        self.log_prefix = f"{log_prefix} " if log_prefix else ""
        # Store effective base URL for feature detection (prompt caching, reasoning, etc.)
        # When no base_url is provided, the client defaults to OpenRouter, so reflect that here.
        self.base_url = base_url or OPENROUTER_BASE_URL
        provider_name = provider.strip().lower() if isinstance(provider, str) and provider.strip() else None
        self.provider = provider_name or "openrouter"
        if api_mode in {"chat_completions", "codex_responses", "anthropic_messages"}:
            self.api_mode = api_mode
        elif self.provider == "openai-codex":
            self.api_mode = "codex_responses"
        elif (provider_name is None) and "chatgpt.com/backend-api/codex" in self.base_url.lower():
            self.api_mode = "codex_responses"
            self.provider = "openai-codex"
        elif self.provider == "anthropic" or (provider_name is None and "api.anthropic.com" in self.base_url.lower()):
            self.api_mode = "anthropic_messages"
            self.provider = "anthropic"
        else:
            self.api_mode = "chat_completions"

        self.tool_progress_callback = tool_progress_callback
        self.thinking_callback = thinking_callback
        self.reasoning_callback = reasoning_callback
        self.clarify_callback = clarify_callback
        self.step_callback = step_callback
        self._last_reported_tool = None  # Track for "new tool" mode
        
        # Interrupt mechanism for breaking out of tool loops
        self._interrupt_requested = False
        self._interrupt_message = None  # Optional message that triggered interrupt
        self._client_lock = threading.RLock()
        
        # Subagent delegation state
        self._delegate_depth = 0        # 0 = top-level agent, incremented for children
        self._active_children = []      # Running child AIAgents (for interrupt propagation)
        
        # Store OpenRouter provider preferences
        self.providers_allowed = providers_allowed
        self.providers_ignored = providers_ignored
        self.providers_order = providers_order
        self.provider_sort = provider_sort
        self.provider_require_parameters = provider_require_parameters
        self.provider_data_collection = provider_data_collection

        # Store toolset filtering options
        self.enabled_toolsets = enabled_toolsets
        self.disabled_toolsets = disabled_toolsets
        
        # Model response configuration
        self.max_tokens = max_tokens  # None = use model default
        self.reasoning_config = reasoning_config  # None = use default (medium for OpenRouter)
        self.prefill_messages = prefill_messages or []  # Prefilled conversation turns
        
        # Anthropic prompt caching: auto-enabled for Claude models via OpenRouter.
        # Reduces input costs by ~75% on multi-turn conversations by caching the
        # conversation prefix. Uses system_and_3 strategy (4 breakpoints).
        is_openrouter = "openrouter" in self.base_url.lower()
        is_claude = "claude" in self.model.lower()
        is_native_anthropic = self.api_mode == "anthropic_messages"
        self._use_prompt_caching = (is_openrouter and is_claude) or is_native_anthropic
        self._cache_ttl = "5m"  # Default 5-minute TTL (1.25x write cost)
        
        # Iteration budget pressure: warn the LLM as it approaches max_iterations.
        # Warnings are injected into the last tool result JSON (not as separate
        # messages) so they don't break message structure or invalidate caching.
        self._budget_caution_threshold = 0.7   # 70% — nudge to start wrapping up
        self._budget_warning_threshold = 0.9   # 90% — urgent, respond now
        self._budget_pressure_enabled = True

        # Persistent error log -- always writes WARNING+ to ~/.hermes/logs/errors.log
        # so tool failures, API errors, etc. are inspectable after the fact.
        # In gateway mode, each incoming message creates a new AIAgent instance,
        # while the root logger is process-global. Re-adding the same errors.log
        # handler would cause each warning/error line to be written multiple times.
        from logging.handlers import RotatingFileHandler
        root_logger = logging.getLogger()
        error_log_dir = _hermes_home / "logs"
        error_log_path = error_log_dir / "errors.log"
        resolved_error_log_path = error_log_path.resolve()
        has_errors_log_handler = any(
            isinstance(handler, RotatingFileHandler)
            and Path(getattr(handler, "baseFilename", "")).resolve() == resolved_error_log_path
            for handler in root_logger.handlers
        )
        if not has_errors_log_handler:
            from agent.redact import RedactingFormatter
            error_log_dir.mkdir(parents=True, exist_ok=True)
            error_file_handler = RotatingFileHandler(
                error_log_path, maxBytes=2 * 1024 * 1024, backupCount=2,
            )
            error_file_handler.setLevel(logging.WARNING)
            error_file_handler.setFormatter(RedactingFormatter(
                '%(asctime)s %(levelname)s %(name)s: %(message)s',
            ))
            root_logger.addHandler(error_file_handler)

        if self.verbose_logging:
            logging.basicConfig(
                level=logging.DEBUG,
                format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                datefmt='%H:%M:%S'
            )
            for handler in logging.getLogger().handlers:
                handler.setFormatter(RedactingFormatter(
                    '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                    datefmt='%H:%M:%S',
                ))
            # Keep third-party libraries at WARNING level to reduce noise
            # We have our own retry and error logging that's more informative
            logging.getLogger('openai').setLevel(logging.WARNING)
            logging.getLogger('openai._base_client').setLevel(logging.WARNING)
            logging.getLogger('httpx').setLevel(logging.WARNING)
            logging.getLogger('httpcore').setLevel(logging.WARNING)
            logging.getLogger('asyncio').setLevel(logging.WARNING)
            # Suppress Modal/gRPC related debug spam
            logging.getLogger('hpack').setLevel(logging.WARNING)
            logging.getLogger('hpack.hpack').setLevel(logging.WARNING)
            logging.getLogger('grpc').setLevel(logging.WARNING)
            logging.getLogger('modal').setLevel(logging.WARNING)
            logging.getLogger('rex-deploy').setLevel(logging.INFO)  # Keep INFO for sandbox status
            logger.info("Verbose logging enabled (third-party library logs suppressed)")
        else:
            # Set logging to INFO level for important messages only
            logging.basicConfig(
                level=logging.INFO,
                format='%(asctime)s - %(levelname)s - %(message)s',
                datefmt='%H:%M:%S'
            )
            # Suppress noisy library logging
            logging.getLogger('openai').setLevel(logging.ERROR)
            logging.getLogger('openai._base_client').setLevel(logging.ERROR)
            logging.getLogger('httpx').setLevel(logging.ERROR)
            logging.getLogger('httpcore').setLevel(logging.ERROR)
            if self.quiet_mode:
                # In quiet mode (CLI default), suppress all tool/infra log
                # noise. The TUI has its own rich display for status; logger
                # INFO/WARNING messages just clutter it.
                for quiet_logger in [
                    'tools',               # all tools.* (terminal, browser, web, file, etc.)
                    'minisweagent',         # mini-swe-agent execution backend
                    'run_agent',            # agent runner internals
                    'trajectory_compressor',
                    'cron',                 # scheduler (only relevant in daemon mode)
                    'hermes_cli',           # CLI helpers
                ]:
                    logging.getLogger(quiet_logger).setLevel(logging.ERROR)
        
        # Internal stream callback (set during streaming TTS).
        # Initialized here so _vprint can reference it before run_conversation.
        self._stream_callback = None

        # Optional current-turn user-message override used when the API-facing
        # user message intentionally differs from the persisted transcript
        # (e.g. CLI voice mode adds a temporary prefix for the live call only).
        self._persist_user_message_idx = None
        self._persist_user_message_override = None

        # Cache anthropic image-to-text fallbacks per image payload/URL so a
        # single tool loop does not repeatedly re-run auxiliary vision on the
        # same image history.
        self._anthropic_image_fallback_cache: Dict[str, str] = {}

        # Initialize LLM client via centralized provider router.
        # The router handles auth resolution, base URL, headers, and
        # Codex/Anthropic wrapping for all known providers.
        # raw_codex=True because the main agent needs direct responses.stream()
        # access for Codex Responses API streaming.
        self._anthropic_client = None

        if self.api_mode == "anthropic_messages":
            from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token
            effective_key = api_key or resolve_anthropic_token() or ""
            self._anthropic_api_key = effective_key
            self._anthropic_base_url = base_url
            self._anthropic_client = build_anthropic_client(effective_key, base_url)
            # No OpenAI client needed for Anthropic mode
            self.client = None
            self._client_kwargs = {}
            if not self.quiet_mode:
                print(f"🤖 AI Agent initialized with model: {self.model} (Anthropic native)")
                if effective_key and len(effective_key) > 12:
                    print(f"🔑 Using token: {effective_key[:8]}...{effective_key[-4:]}")
        else:
            if api_key and base_url:
                # Explicit credentials from CLI/gateway — construct directly.
                # The runtime provider resolver already handled auth for us.
                client_kwargs = {"api_key": api_key, "base_url": base_url}
                effective_base = base_url
                if "openrouter" in effective_base.lower():
                    client_kwargs["default_headers"] = {
                        "HTTP-Referer": "https://hermes-agent.nousresearch.com",
                        "X-OpenRouter-Title": "Hermes Agent",
                        "X-OpenRouter-Categories": "productivity,cli-agent",
                    }
                elif "api.kimi.com" in effective_base.lower():
                    client_kwargs["default_headers"] = {
                        "User-Agent": "KimiCLI/1.3",
                    }
            else:
                # No explicit creds — use the centralized provider router
                from agent.auxiliary_client import resolve_provider_client
                _routed_client, _ = resolve_provider_client(
                    self.provider or "auto", model=self.model, raw_codex=True)
                if _routed_client is not None:
                    client_kwargs = {
                        "api_key": _routed_client.api_key,
                        "base_url": str(_routed_client.base_url),
                    }
                    # Preserve any default_headers the router set
                    if hasattr(_routed_client, '_default_headers') and _routed_client._default_headers:
                        client_kwargs["default_headers"] = dict(_routed_client._default_headers)
                else:
                    # Final fallback: try raw OpenRouter key
                    client_kwargs = {
                        "api_key": os.getenv("OPENROUTER_API_KEY", ""),
                        "base_url": OPENROUTER_BASE_URL,
                        "default_headers": {
                            "HTTP-Referer": "https://hermes-agent.nousresearch.com",
                            "X-OpenRouter-Title": "Hermes Agent",
                            "X-OpenRouter-Categories": "productivity,cli-agent",
                        },
                    }
            
            self._client_kwargs = client_kwargs  # stored for rebuilding after interrupt
            try:
                self.client = self._create_openai_client(client_kwargs, reason="agent_init", shared=True)
                if not self.quiet_mode:
                    print(f"🤖 AI Agent initialized with model: {self.model}")
                    if base_url:
                        print(f"🔗 Using custom base URL: {base_url}")
                    # Always show API key info (masked) for debugging auth issues
                    key_used = client_kwargs.get("api_key", "none")
                    if key_used and key_used != "dummy-key" and len(key_used) > 12:
                        print(f"🔑 Using API key: {key_used[:8]}...{key_used[-4:]}")
                    else:
                        print(f"⚠️  Warning: API key appears invalid or missing (got: '{key_used[:20] if key_used else 'none'}...')")
            except Exception as e:
                raise RuntimeError(f"Failed to initialize OpenAI client: {e}")
        
        # Provider fallback — a single backup model/provider tried when the
        # primary is exhausted (rate-limit, overload, connection failure).
        # Config shape: {"provider": "openrouter", "model": "anthropic/claude-sonnet-4"}
        self._fallback_model = fallback_model if isinstance(fallback_model, dict) else None
        self._fallback_activated = False
        if self._fallback_model:
            fb_p = self._fallback_model.get("provider", "")
            fb_m = self._fallback_model.get("model", "")
            if fb_p and fb_m and not self.quiet_mode:
                print(f"🔄 Fallback model: {fb_m} ({fb_p})")

        # Get available tools with filtering
        self.tools = get_tool_definitions(
            enabled_toolsets=enabled_toolsets,
            disabled_toolsets=disabled_toolsets,
            quiet_mode=self.quiet_mode,
        )
        
        # Show tool configuration and store valid tool names for validation
        self.valid_tool_names = set()
        if self.tools:
            self.valid_tool_names = {tool["function"]["name"] for tool in self.tools}
            tool_names = sorted(self.valid_tool_names)
            if not self.quiet_mode:
                print(f"🛠️  Loaded {len(self.tools)} tools: {', '.join(tool_names)}")
                
                # Show filtering info if applied
                if enabled_toolsets:
                    print(f"   ✅ Enabled toolsets: {', '.join(enabled_toolsets)}")
                if disabled_toolsets:
                    print(f"   ❌ Disabled toolsets: {', '.join(disabled_toolsets)}")
        elif not self.quiet_mode:
            print("🛠️  No tools loaded (all tools filtered out or unavailable)")
        
        # Check tool requirements
        if self.tools and not self.quiet_mode:
            requirements = check_toolset_requirements()
            missing_reqs = [name for name, available in requirements.items() if not available]
            if missing_reqs:
                print(f"⚠️  Some tools may not work due to missing requirements: {missing_reqs}")
        
        # Show trajectory saving status
        if self.save_trajectories and not self.quiet_mode:
            print("📝 Trajectory saving enabled")
        
        # Show ephemeral system prompt status
        if self.ephemeral_system_prompt and not self.quiet_mode:
            prompt_preview = self.ephemeral_system_prompt[:60] + "..." if len(self.ephemeral_system_prompt) > 60 else self.ephemeral_system_prompt
            print(f"🔒 Ephemeral system prompt: '{prompt_preview}' (not saved to trajectories)")
        
        # Show prompt caching status
        if self._use_prompt_caching and not self.quiet_mode:
            source = "native Anthropic" if is_native_anthropic else "Claude via OpenRouter"
            print(f"💾 Prompt caching: ENABLED ({source}, {self._cache_ttl} TTL)")
        
        # Session logging setup - auto-save conversation trajectories for debugging
        self.session_start = datetime.now()
        if session_id:
            # Use provided session ID (e.g., from CLI)
            self.session_id = session_id
        else:
            # Generate a new session ID
            timestamp_str = self.session_start.strftime("%Y%m%d_%H%M%S")
            short_uuid = uuid.uuid4().hex[:6]
            self.session_id = f"{timestamp_str}_{short_uuid}"
        
        # Session logs go into ~/.hermes/sessions/ alongside gateway sessions
        hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
        self.logs_dir = hermes_home / "sessions"
        self.logs_dir.mkdir(parents=True, exist_ok=True)
        self.session_log_file = self.logs_dir / f"session_{self.session_id}.json"
        
        # Track conversation messages for session logging
        self._session_messages: List[Dict[str, Any]] = []
        
        # Cached system prompt -- built once per session, only rebuilt on compression
        self._cached_system_prompt: Optional[str] = None
        
        # Filesystem checkpoint manager (transparent — not a tool)
        from tools.checkpoint_manager import CheckpointManager
        self._checkpoint_mgr = CheckpointManager(
            enabled=checkpoints_enabled,
            max_snapshots=checkpoint_max_snapshots,
        )
        
        # SQLite session store (optional -- provided by CLI or gateway)
        self._session_db = session_db
        self._last_flushed_db_idx = 0  # tracks DB-write cursor to prevent duplicate writes
        if self._session_db:
            try:
                self._session_db.create_session(
                    session_id=self.session_id,
                    source=self.platform or "cli",
                    model=self.model,
                    model_config={
                        "max_iterations": self.max_iterations,
                        "reasoning_config": reasoning_config,
                        "max_tokens": max_tokens,
                    },
                    user_id=None,
                )
            except Exception as e:
                logger.debug("Session DB create_session failed: %s", e)
        
        # In-memory todo list for task planning (one per agent/session)
        from tools.todo_tool import TodoStore
        self._todo_store = TodoStore()
        
        # Persistent memory (MEMORY.md + USER.md) -- loaded from disk
        self._memory_store = None
        self._memory_enabled = False
        self._user_profile_enabled = False
        self._memory_nudge_interval = 10
        self._memory_flush_min_turns = 6
        if not skip_memory:
            try:
                from hermes_cli.config import load_config as _load_mem_config
                mem_config = _load_mem_config().get("memory", {})
                self._memory_enabled = mem_config.get("memory_enabled", False)
                self._user_profile_enabled = mem_config.get("user_profile_enabled", False)
                self._memory_nudge_interval = int(mem_config.get("nudge_interval", 10))
                self._memory_flush_min_turns = int(mem_config.get("flush_min_turns", 6))
                if self._memory_enabled or self._user_profile_enabled:
                    from tools.memory_tool import MemoryStore
                    self._memory_store = MemoryStore(
                        memory_char_limit=mem_config.get("memory_char_limit", 2200),
                        user_char_limit=mem_config.get("user_char_limit", 1375),
                    )
                    self._memory_store.load_from_disk()
            except Exception:
                pass  # Memory is optional -- don't break agent init
        
        # Honcho AI-native memory (cross-session user modeling)
        # Reads ~/.honcho/config.json as the single source of truth.
        self._honcho = None  # HonchoSessionManager | None
        self._honcho_session_key = honcho_session_key
        self._honcho_config = None  # HonchoClientConfig | None
        self._honcho_exit_hook_registered = False
        if not skip_memory:
            try:
                if honcho_manager is not None:
                    hcfg = honcho_config or getattr(honcho_manager, "_config", None)
                    self._honcho_config = hcfg
                    if hcfg and self._honcho_should_activate(hcfg):
                        self._honcho = honcho_manager
                        self._activate_honcho(
                            hcfg,
                            enabled_toolsets=enabled_toolsets,
                            disabled_toolsets=disabled_toolsets,
                            session_db=session_db,
                        )
                else:
                    from honcho_integration.client import HonchoClientConfig, get_honcho_client
                    hcfg = HonchoClientConfig.from_global_config()
                    self._honcho_config = hcfg
                    if self._honcho_should_activate(hcfg):
                        from honcho_integration.session import HonchoSessionManager
                        client = get_honcho_client(hcfg)
                        self._honcho = HonchoSessionManager(
                            honcho=client,
                            config=hcfg,
                            context_tokens=hcfg.context_tokens,
                        )
                        self._activate_honcho(
                            hcfg,
                            enabled_toolsets=enabled_toolsets,
                            disabled_toolsets=disabled_toolsets,
                            session_db=session_db,
                        )
                    else:
                        if not hcfg.enabled:
                            logger.debug("Honcho disabled in global config")
                        elif not hcfg.api_key:
                            logger.debug("Honcho enabled but no API key configured")
                        else:
                            logger.debug("Honcho enabled but missing API key or disabled in config")
            except Exception as e:
                logger.warning("Honcho init failed — memory disabled: %s", e)
                print(f"  Honcho init failed: {e}")
                print("  Run 'hermes honcho setup' to reconfigure.")
                self._honcho = None

        # Tools are initially discovered before Honcho activation. If Honcho
        # stays inactive, remove any stale honcho_* tools from prior process state.
        if not self._honcho:
            self._strip_honcho_tools_from_surface()

        # Gate local memory writes based on per-peer memory modes.
        # AI peer governs MEMORY.md; user peer governs USER.md.
        # "honcho" = Honcho only, disable local writes.
        if self._honcho_config and self._honcho:
            _hcfg = self._honcho_config
            _agent_mode = _hcfg.peer_memory_mode(_hcfg.ai_peer)
            _user_mode = _hcfg.peer_memory_mode(_hcfg.peer_name or "user")
            if _agent_mode == "honcho":
                self._memory_flush_min_turns = 0
                self._memory_enabled = False
                logger.debug("peer %s memory_mode=honcho: local MEMORY.md writes disabled", _hcfg.ai_peer)
            if _user_mode == "honcho":
                self._user_profile_enabled = False
                logger.debug("peer %s memory_mode=honcho: local USER.md writes disabled", _hcfg.peer_name or "user")

        # Skills config: nudge interval for skill creation reminders
        self._skill_nudge_interval = 15
        try:
            from hermes_cli.config import load_config as _load_skills_config
            skills_config = _load_skills_config().get("skills", {})
            self._skill_nudge_interval = int(skills_config.get("creation_nudge_interval", 15))
        except Exception:
            pass
        
        # Initialize context compressor for automatic context management
        # Compresses conversation when approaching model's context limit
        # Configuration via config.yaml (compression section) or environment variables
        compression_threshold = float(os.getenv("CONTEXT_COMPRESSION_THRESHOLD", "0.50"))
        compression_enabled = os.getenv("CONTEXT_COMPRESSION_ENABLED", "true").lower() in ("true", "1", "yes")
        compression_summary_model = os.getenv("CONTEXT_COMPRESSION_MODEL") or None
        
        self.context_compressor = ContextCompressor(
            model=self.model,
            threshold_percent=compression_threshold,
            protect_first_n=3,
            protect_last_n=4,
            summary_target_tokens=500,
            summary_model_override=compression_summary_model,
            quiet_mode=self.quiet_mode,
            base_url=self.base_url,
        )
        self.compression_enabled = compression_enabled
        self._user_turn_count = 0

        # Cumulative token usage for the session
        self.session_prompt_tokens = 0
        self.session_completion_tokens = 0
        self.session_total_tokens = 0
        self.session_api_calls = 0
        
        if not self.quiet_mode:
            if compression_enabled:
                print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (compress at {int(compression_threshold*100)}% = {self.context_compressor.threshold_tokens:,})")
            else:
                print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)")
    
    def _vprint(self, *args, force: bool = False, **kwargs):
        """Verbose print — suppressed when streaming TTS is active.

        Pass ``force=True`` for error/warning messages that should always be
        shown even during streaming TTS playback.
        """
        if not force and getattr(self, "_stream_callback", None) is not None:
            return
        print(*args, **kwargs)

    def _max_tokens_param(self, value: int) -> dict:
        """Return the correct max tokens kwarg for the current provider.
        
        OpenAI's newer models (gpt-4o, o-series, gpt-5+) require
        'max_completion_tokens'. OpenRouter, local models, and older
        OpenAI models use 'max_tokens'.
        """
        _is_direct_openai = (
            "api.openai.com" in self.base_url.lower()
            and "openrouter" not in self.base_url.lower()
        )
        if _is_direct_openai:
            return {"max_completion_tokens": value}
        return {"max_tokens": value}

    def _has_content_after_think_block(self, content: str) -> bool:
        """
        Check if content has actual text after any <think></think> blocks.
        
        This detects cases where the model only outputs reasoning but no actual
        response, which indicates an incomplete generation that should be retried.
        
        Args:
            content: The assistant message content to check
            
        Returns:
            True if there's meaningful content after think blocks, False otherwise
        """
        if not content:
            return False
        
        # Remove all <think>...</think> blocks (including nested ones, non-greedy)
        cleaned = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL)
        
        # Check if there's any non-whitespace content remaining
        return bool(cleaned.strip())
    
    def _strip_think_blocks(self, content: str) -> str:
        """Remove <think>...</think> blocks from content, returning only visible text."""
        if not content:
            return ""
        return re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL)

    def _looks_like_codex_intermediate_ack(
        self,
        user_message: str,
        assistant_content: str,
        messages: List[Dict[str, Any]],
    ) -> bool:
        """Detect a planning/ack message that should continue instead of ending the turn."""
        if any(isinstance(msg, dict) and msg.get("role") == "tool" for msg in messages):
            return False

        assistant_text = self._strip_think_blocks(assistant_content or "").strip().lower()
        if not assistant_text:
            return False
        if len(assistant_text) > 1200:
            return False

        has_future_ack = bool(
            re.search(r"\b(i['’]ll|i will|let me|i can do that|i can help with that)\b", assistant_text)
        )
        if not has_future_ack:
            return False

        action_markers = (
            "look into",
            "look at",
            "inspect",
            "scan",
            "check",
            "analyz",
            "review",
            "explore",
            "read",
            "open",
            "run",
            "test",
            "fix",
            "debug",
            "search",
            "find",
            "walkthrough",
            "report back",
            "summarize",
        )
        workspace_markers = (
            "directory",
            "current directory",
            "current dir",
            "cwd",
            "repo",
            "repository",
            "codebase",
            "project",
            "folder",
            "filesystem",
            "file tree",
            "files",
            "path",
        )

        user_text = (user_message or "").strip().lower()
        user_targets_workspace = (
            any(marker in user_text for marker in workspace_markers)
            or "~/" in user_text
            or "/" in user_text
        )
        assistant_mentions_action = any(marker in assistant_text for marker in action_markers)
        assistant_targets_workspace = any(
            marker in assistant_text for marker in workspace_markers
        )
        return (user_targets_workspace or assistant_targets_workspace) and assistant_mentions_action
    
    
    def _extract_reasoning(self, assistant_message) -> Optional[str]:
        """
        Extract reasoning/thinking content from an assistant message.
        
        OpenRouter and various providers can return reasoning in multiple formats:
        1. message.reasoning - Direct reasoning field (DeepSeek, Qwen, etc.)
        2. message.reasoning_content - Alternative field (Moonshot AI, Novita, etc.)
        3. message.reasoning_details - Array of {type, summary, ...} objects (OpenRouter unified)
        
        Args:
            assistant_message: The assistant message object from the API response
            
        Returns:
            Combined reasoning text, or None if no reasoning found
        """
        reasoning_parts = []
        
        # Check direct reasoning field
        if hasattr(assistant_message, 'reasoning') and assistant_message.reasoning:
            reasoning_parts.append(assistant_message.reasoning)
        
        # Check reasoning_content field (alternative name used by some providers)
        if hasattr(assistant_message, 'reasoning_content') and assistant_message.reasoning_content:
            # Don't duplicate if same as reasoning
            if assistant_message.reasoning_content not in reasoning_parts:
                reasoning_parts.append(assistant_message.reasoning_content)
        
        # Check reasoning_details array (OpenRouter unified format)
        # Format: [{"type": "reasoning.summary", "summary": "...", ...}, ...]
        if hasattr(assistant_message, 'reasoning_details') and assistant_message.reasoning_details:
            for detail in assistant_message.reasoning_details:
                if isinstance(detail, dict):
                    # Extract summary from reasoning detail object
                    summary = detail.get('summary') or detail.get('content') or detail.get('text')
                    if summary and summary not in reasoning_parts:
                        reasoning_parts.append(summary)
        
        # Combine all reasoning parts
        if reasoning_parts:
            return "\n\n".join(reasoning_parts)
        
        return None
    
    def _cleanup_task_resources(self, task_id: str) -> None:
        """Release the VM and browser resources tied to ``task_id``.

        Both cleanups are attempted independently: a failure in one is
        swallowed (and logged as a warning when ``self.verbose_logging`` is
        on) so the other still runs.
        """
        for label, release in (("VM", cleanup_vm), ("browser", cleanup_browser)):
            try:
                release(task_id)
            except Exception as e:
                if self.verbose_logging:
                    logging.warning(f"Failed to cleanup {label} for task {task_id}: {e}")

    def _apply_persist_user_message_override(self, messages: List[Dict]) -> None:
        """Rewrite the current-turn user message before persistence/return.

        Some call paths need an API-only user-message variant without letting
        that synthetic text leak into persisted transcripts or resumed session
        history. When an override is configured for the active turn, mutate the
        in-memory messages list in place so both persistence and returned
        history stay clean.
        """
        idx = getattr(self, "_persist_user_message_idx", None)
        override = getattr(self, "_persist_user_message_override", None)
        if override is None or idx is None:
            return
        if 0 <= idx < len(messages):
            msg = messages[idx]
            if isinstance(msg, dict) and msg.get("role") == "user":
                msg["content"] = override

    def _persist_session(self, messages: List[Dict], conversation_history: Optional[List[Dict]] = None):
        """Save session state to both JSON log and SQLite on any exit path.

        Ensures conversations are never lost, even on errors or early returns.

        Args:
            messages: Full message list for the session; may be mutated in
                place by the user-message override step.
            conversation_history: Forwarded unchanged to
                ``_flush_messages_to_session_db``.
        """
        # Replace any API-only user message text before anything is written out.
        self._apply_persist_user_message_override(messages)
        # Remember the list so _save_session_log() can fall back to it later.
        self._session_messages = messages
        self._save_session_log(messages)
        self._flush_messages_to_session_db(messages, conversation_history)

    def _flush_messages_to_session_db(self, messages: List[Dict], conversation_history: List[Dict] = None):
        """Persist any un-flushed messages to the SQLite session store.

        Uses _last_flushed_db_idx to track which messages have already been
        written, so repeated calls (from multiple exit paths) only write
        truly new messages — preventing the duplicate-write bug (#860).
        """
        if not self._session_db:
            return
        self._apply_persist_user_message_override(messages)
        try:
            start_idx = len(conversation_history) if conversation_history else 0
            flush_from = max(start_idx, self._last_flushed_db_idx)
            for msg in messages[flush_from:]:
                role = msg.get("role", "unknown")
                content = msg.get("content")
                tool_calls_data = None
                if hasattr(msg, "tool_calls") and msg.tool_calls:
                    tool_calls_data = [
                        {"name": tc.function.name, "arguments": tc.function.arguments}
                        for tc in msg.tool_calls
                    ]
                elif isinstance(msg.get("tool_calls"), list):
                    tool_calls_data = msg["tool_calls"]
                self._session_db.append_message(
                    session_id=self.session_id,
                    role=role,
                    content=content,
                    tool_name=msg.get("tool_name"),
                    tool_calls=tool_calls_data,
                    tool_call_id=msg.get("tool_call_id"),
                    finish_reason=msg.get("finish_reason"),
                )
            self._last_flushed_db_idx = len(messages)
        except Exception as e:
            logger.debug("Session DB append_message failed: %s", e)

    def _get_messages_up_to_last_assistant(self, messages: List[Dict]) -> List[Dict]:
        """
        Get messages up to (but not including) the last assistant turn.
        
        This is used when we need to "roll back" to the last successful point
        in the conversation, typically when the final assistant message is
        incomplete or malformed.
        
        Args:
            messages: Full message list
            
        Returns:
            Messages up to the last complete assistant turn (ending with user/tool message)
        """
        if not messages:
            return []
        
        # Find the index of the last assistant message
        last_assistant_idx = None
        for i in range(len(messages) - 1, -1, -1):
            if messages[i].get("role") == "assistant":
                last_assistant_idx = i
                break
        
        if last_assistant_idx is None:
            # No assistant message found, return all messages
            return messages.copy()
        
        # Return everything up to (not including) the last assistant message
        return messages[:last_assistant_idx]
    
    def _format_tools_for_system_message(self) -> str:
        """
        Format tool definitions for the system message in the trajectory format.
        
        Returns:
            str: JSON string representation of tool definitions
        """
        if not self.tools:
            return "[]"
        
        # Convert tool definitions to the format expected in trajectories
        formatted_tools = []
        for tool in self.tools:
            func = tool["function"]
            formatted_tool = {
                "name": func["name"],
                "description": func.get("description", ""),
                "parameters": func.get("parameters", {}),
                "required": None  # Match the format in the example
            }
            formatted_tools.append(formatted_tool)
        
        return json.dumps(formatted_tools, ensure_ascii=False)
    
    def _convert_to_trajectory_format(self, messages: List[Dict[str, Any]], user_query: str, completed: bool) -> List[Dict[str, Any]]:
        """
        Convert internal message format to trajectory format for saving.

        The result is a list of ``{"from": ..., "value": ...}`` entries: a
        synthetic "system" entry embedding the tool definitions, a "human"
        entry holding ``user_query``, and then one entry per later message:

        * assistant turn -> "gpt" entry that always contains a ``<think>``
          block (empty when there is no reasoning); tool calls are rendered
          as ``<tool_call>`` XML blocks;
        * the run of tool results directly after an assistant tool-call turn
          -> a single "tool" entry of ``<tool_response>`` blocks;
        * user turn -> "human" entry.

        ``messages[0]`` is skipped because it is taken to be the user query.
        Messages with any other role, and tool messages that do not directly
        follow an assistant tool-call turn, produce no entry.

        Args:
            messages (List[Dict]): Internal message history
            user_query (str): Original user query
            completed (bool): Whether the conversation completed successfully
                (not read by this method)

        Returns:
            List[Dict]: Messages in trajectory format
        """
        trajectory = []
        
        # Add system message with tool definitions
        system_msg = (
            "You are a function calling AI model. You are provided with function signatures within <tools> </tools> XML tags. "
            "You may call one or more functions to assist with the user query. If available tools are not relevant in assisting "
            "with user query, just respond in natural conversational language. Don't make assumptions about what values to plug "
            "into functions. After calling & executing the functions, you will be provided with function results within "
            "<tool_response> </tool_response> XML tags. Here are the available tools:\n"
            f"<tools>\n{self._format_tools_for_system_message()}\n</tools>\n"
            "For each function call return a JSON object, with the following pydantic model json schema for each:\n"
            "{'title': 'FunctionCall', 'type': 'object', 'properties': {'name': {'title': 'Name', 'type': 'string'}, "
            "'arguments': {'title': 'Arguments', 'type': 'object'}}, 'required': ['name', 'arguments']}\n"
            "Each function call should be enclosed within <tool_call> </tool_call> XML tags.\n"
            "Example:\n<tool_call>\n{'name': <function-name>,'arguments': <args-dict>}\n</tool_call>"
        )
        
        trajectory.append({
            "from": "system",
            "value": system_msg
        })
        
        # Add the actual user prompt (from the dataset) as the first human message
        trajectory.append({
            "from": "human",
            "value": user_query
        })
        
        # Skip the first message (the user query) since we already added it above.
        # Prefill messages are injected at API-call time only (not in the messages
        # list), so no offset adjustment is needed here.
        i = 1
        
        # Manual index loop: the tool-call branch below advances ``i`` past the
        # tool messages it consumes.
        while i < len(messages):
            msg = messages[i]
            
            if msg["role"] == "assistant":
                # Check if this message has tool calls
                if "tool_calls" in msg and msg["tool_calls"]:
                    # Format assistant message with tool calls
                    # Add <think> tags around reasoning for trajectory storage
                    content = ""
                    
                    # Prepend reasoning in <think> tags if available (native thinking tokens)
                    if msg.get("reasoning") and msg["reasoning"].strip():
                        content = f"<think>\n{msg['reasoning']}\n</think>\n"
                    
                    if msg.get("content") and msg["content"].strip():
                        # Convert any <REASONING_SCRATCHPAD> tags to <think> tags
                        # (used when native thinking is disabled and model reasons via XML)
                        content += convert_scratchpad_to_think(msg["content"]) + "\n"
                    
                    # Add tool calls wrapped in XML tags
                    for tool_call in msg["tool_calls"]:
                        # Parse arguments - should always succeed since we validate during conversation
                        # but keep try-except as safety net
                        # (arguments that are already a non-string object are used as-is)
                        try:
                            arguments = json.loads(tool_call["function"]["arguments"]) if isinstance(tool_call["function"]["arguments"], str) else tool_call["function"]["arguments"]
                        except json.JSONDecodeError:
                            # This shouldn't happen since we validate and retry during conversation,
                            # but if it does, log warning and use empty dict
                            logging.warning(f"Unexpected invalid JSON in trajectory conversion: {tool_call['function']['arguments'][:100]}")
                            arguments = {}
                        
                        tool_call_json = {
                            "name": tool_call["function"]["name"],
                            "arguments": arguments
                        }
                        content += f"<tool_call>\n{json.dumps(tool_call_json, ensure_ascii=False)}\n</tool_call>\n"
                    
                    # Ensure every gpt turn has a <think> block (empty if no reasoning)
                    # so the format is consistent for training data
                    if "<think>" not in content:
                        content = "<think>\n</think>\n" + content
                    
                    trajectory.append({
                        "from": "gpt",
                        "value": content.rstrip()
                    })
                    
                    # Collect all subsequent tool responses
                    tool_responses = []
                    j = i + 1
                    while j < len(messages) and messages[j]["role"] == "tool":
                        tool_msg = messages[j]
                        # Format tool response with XML tags
                        tool_response = f"<tool_response>\n"
                        
                        # Try to parse tool content as JSON if it looks like JSON
                        # (AttributeError covers non-string content, e.g. None)
                        tool_content = tool_msg["content"]
                        try:
                            if tool_content.strip().startswith(("{", "[")):
                                tool_content = json.loads(tool_content)
                        except (json.JSONDecodeError, AttributeError):
                            pass  # Keep as string if not valid JSON
                        
                        # The tool name is taken from the tool call at the same
                        # position as this response, not looked up by tool_call_id.
                        # NOTE(review): assumes responses arrive in call order -- confirm.
                        tool_index = len(tool_responses)
                        tool_name = (
                            msg["tool_calls"][tool_index]["function"]["name"]
                            if tool_index < len(msg["tool_calls"])
                            else "unknown"
                        )
                        tool_response += json.dumps({
                            "tool_call_id": tool_msg.get("tool_call_id", ""),
                            "name": tool_name,
                            "content": tool_content
                        }, ensure_ascii=False)
                        tool_response += "\n</tool_response>"
                        tool_responses.append(tool_response)
                        j += 1
                    
                    # Add all tool responses as a single message
                    if tool_responses:
                        trajectory.append({
                            "from": "tool",
                            "value": "\n".join(tool_responses)
                        })
                        # j - 1 because the loop's trailing ``i += 1`` moves to index j.
                        i = j - 1  # Skip the tool messages we just processed
                
                else:
                    # Regular assistant message without tool calls
                    # Add <think> tags around reasoning for trajectory storage
                    content = ""
                    
                    # Prepend reasoning in <think> tags if available (native thinking tokens)
                    if msg.get("reasoning") and msg["reasoning"].strip():
                        content = f"<think>\n{msg['reasoning']}\n</think>\n"
                    
                    # Convert any <REASONING_SCRATCHPAD> tags to <think> tags
                    # (used when native thinking is disabled and model reasons via XML)
                    raw_content = msg["content"] or ""
                    content += convert_scratchpad_to_think(raw_content)
                    
                    # Ensure every gpt turn has a <think> block (empty if no reasoning)
                    if "<think>" not in content:
                        content = "<think>\n</think>\n" + content
                    
                    trajectory.append({
                        "from": "gpt",
                        "value": content.strip()
                    })
            
            elif msg["role"] == "user":
                trajectory.append({
                    "from": "human",
                    "value": msg["content"]
                })
            
            # Any other role falls through here without producing an entry.
            i += 1
        
        return trajectory
    
    def _save_trajectory(self, messages: List[Dict[str, Any]], user_query: str, completed: bool):
        """
        Save conversation trajectory to JSONL file.
        
        Args:
            messages (List[Dict]): Complete message history
            user_query (str): Original user query
            completed (bool): Whether the conversation completed successfully
        """
        if not self.save_trajectories:
            return
        
        trajectory = self._convert_to_trajectory_format(messages, user_query, completed)
        _save_trajectory_to_file(trajectory, self.model, completed)
    
    def _mask_api_key_for_logs(self, key: Optional[str]) -> Optional[str]:
        if not key:
            return None
        if len(key) <= 12:
            return "***"
        return f"{key[:8]}...{key[-4:]}"

    def _dump_api_request_debug(
        self,
        api_kwargs: Dict[str, Any],
        *,
        reason: str,
        error: Optional[Exception] = None,
    ) -> Optional[Path]:
        """
        Dump a debug-friendly HTTP request record for chat.completions.create().

        Captures the request body from api_kwargs (excluding transport-only keys
        like timeout). Intended for debugging provider-side 4xx failures where
        retries are not useful.
        """
        try:
            body = copy.deepcopy(api_kwargs)
            body.pop("timeout", None)
            body = {k: v for k, v in body.items() if v is not None}

            api_key = None
            try:
                api_key = getattr(self.client, "api_key", None)
            except Exception as e:
                logger.debug("Could not extract API key for debug dump: %s", e)

            dump_payload: Dict[str, Any] = {
                "timestamp": datetime.now().isoformat(),
                "session_id": self.session_id,
                "reason": reason,
                "request": {
                    "method": "POST",
                    "url": f"{self.base_url.rstrip('/')}/chat/completions",
                    "headers": {
                        "Authorization": f"Bearer {self._mask_api_key_for_logs(api_key)}",
                        "Content-Type": "application/json",
                    },
                    "body": body,
                },
            }

            if error is not None:
                error_info: Dict[str, Any] = {
                    "type": type(error).__name__,
                    "message": str(error),
                }
                for attr_name in ("status_code", "request_id", "code", "param", "type"):
                    attr_value = getattr(error, attr_name, None)
                    if attr_value is not None:
                        error_info[attr_name] = attr_value

                body_attr = getattr(error, "body", None)
                if body_attr is not None:
                    error_info["body"] = body_attr

                response_obj = getattr(error, "response", None)
                if response_obj is not None:
                    try:
                        error_info["response_status"] = getattr(response_obj, "status_code", None)
                        error_info["response_text"] = response_obj.text
                    except Exception as e:
                        logger.debug("Could not extract error response details: %s", e)

                dump_payload["error"] = error_info

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
            dump_file = self.logs_dir / f"request_dump_{self.session_id}_{timestamp}.json"
            dump_file.write_text(
                json.dumps(dump_payload, ensure_ascii=False, indent=2, default=str),
                encoding="utf-8",
            )

            self._vprint(f"{self.log_prefix}🧾 Request debug dump written to: {dump_file}")

            if os.getenv("HERMES_DUMP_REQUEST_STDOUT", "").strip().lower() in {"1", "true", "yes", "on"}:
                print(json.dumps(dump_payload, ensure_ascii=False, indent=2, default=str))

            return dump_file
        except Exception as dump_error:
            if self.verbose_logging:
                logging.warning(f"Failed to dump API request debug payload: {dump_error}")
            return None

    @staticmethod
    def _clean_session_content(content: str) -> str:
        """Convert REASONING_SCRATCHPAD to think tags and clean up whitespace."""
        if not content:
            return content
        content = convert_scratchpad_to_think(content)
        content = re.sub(r'\n+(<think>)', r'\n\1', content)
        content = re.sub(r'(</think>)\n+', r'\1\n', content)
        return content.strip()

    def _save_session_log(self, messages: List[Dict[str, Any]] = None):
        """
        Save the full raw session to a JSON file.

        Stores every message exactly as the agent sees it: user messages,
        assistant messages (with reasoning, finish_reason, tool_calls),
        tool responses (with tool_call_id, tool_name), and injected system
        messages (compression summaries, todo snapshots, etc.).

        REASONING_SCRATCHPAD tags are converted to <think> blocks for consistency.
        Overwritten after each turn so it always reflects the latest state.
        """
        messages = messages or self._session_messages
        if not messages:
            return

        try:
            # Clean assistant content for session logs
            cleaned = []
            for msg in messages:
                if msg.get("role") == "assistant" and msg.get("content"):
                    msg = dict(msg)
                    msg["content"] = self._clean_session_content(msg["content"])
                cleaned.append(msg)

            entry = {
                "session_id": self.session_id,
                "model": self.model,
                "base_url": self.base_url,
                "platform": self.platform,
                "session_start": self.session_start.isoformat(),
                "last_updated": datetime.now().isoformat(),
                "system_prompt": self._cached_system_prompt or "",
                "tools": self.tools or [],
                "message_count": len(cleaned),
                "messages": cleaned,
            }

            atomic_json_write(
                self.session_log_file,
                entry,
                indent=2,
                default=str,
            )

        except Exception as e:
            if self.verbose_logging:
                logging.warning(f"Failed to save session log: {e}")
    
    def interrupt(self, message: str = None) -> None:
        """
        Ask the agent to break out of its current tool-calling loop.

        Intended to be called from a different thread than the one running the
        agent (e.g. an input handler or a message receiver). Besides flagging
        this agent, it raises the shared tool interrupt signal so in-flight
        tool executions can stop early, and forwards the request to every
        agent in ``self._active_children``.

        Args:
            message: Optional new message that triggered the interrupt. It is
                stored on the agent and passed on to child agents.

        Example (CLI):
            # In a separate input thread:
            if user_typed_something:
                agent.interrupt(user_input)

        Example (Messaging):
            # When a new message arrives for an active session:
            if session_has_running_agent:
                running_agent.interrupt(new_message.text)
        """
        self._interrupt_requested = True
        self._interrupt_message = message

        # Tell every tool to abandon in-flight work right away.
        _set_interrupt(True)

        # Fan the request out to delegated sub-agents; one failing child must
        # not prevent the others from being interrupted.
        for child_agent in self._active_children:
            try:
                child_agent.interrupt(message)
            except Exception as exc:
                logger.debug("Failed to propagate interrupt to child agent: %s", exc)

        if self.quiet_mode:
            return

        # Echo the request, truncating long messages to 40 characters.
        if not message:
            detail = ""
        elif len(message) > 40:
            detail = f": '{message[:40]}...'"
        else:
            detail = f": '{message}'"
        print("\n⚡ Interrupt requested" + detail)
    
    def clear_interrupt(self) -> None:
        """Reset this agent's interrupt flag and message, then lower the shared tool interrupt signal."""
        self._interrupt_requested, self._interrupt_message = False, None
        _set_interrupt(False)
    
    def _hydrate_todo_store(self, history: List[Dict[str, Any]]) -> None:
        """
        Recover todo state from conversation history.
        
        The gateway creates a fresh AIAgent per message, so the in-memory
        TodoStore is empty. We scan the history for the most recent todo
        tool response and replay it to reconstruct the state.
        """
        # Walk history backwards to find the most recent todo tool response
        last_todo_response = None
        for msg in reversed(history):
            if msg.get("role") != "tool":
                continue
            content = msg.get("content", "")
            # Quick check: todo responses contain "todos" key
            if '"todos"' not in content:
                continue
            try:
                data = json.loads(content)
                if "todos" in data and isinstance(data["todos"], list):
                    last_todo_response = data["todos"]
                    break
            except (json.JSONDecodeError, TypeError):
                continue
        
        if last_todo_response:
            # Replay the items into the store (replace mode)
            self._todo_store.write(last_todo_response, merge=False)
            if not self.quiet_mode:
                self._vprint(f"{self.log_prefix}📋 Restored {len(last_todo_response)} todo item(s) from history")
        _set_interrupt(False)
    
    @property
    def is_interrupted(self) -> bool:
        """Check if an interrupt has been requested."""
        return self._interrupt_requested

    # ── Honcho integration helpers ──

    def _honcho_should_activate(self, hcfg) -> bool:
        """Return True when remote Honcho should be active."""
        if not hcfg or not hcfg.enabled or not hcfg.api_key:
            return False
        return True

    def _strip_honcho_tools_from_surface(self) -> None:
        """Remove Honcho tools from the active tool surface."""
        if not self.tools:
            self.valid_tool_names = set()
            return

        self.tools = [
            tool for tool in self.tools
            if tool.get("function", {}).get("name") not in HONCHO_TOOL_NAMES
        ]
        self.valid_tool_names = {
            tool["function"]["name"] for tool in self.tools
        } if self.tools else set()

    def _activate_honcho(
        self,
        hcfg,
        *,
        enabled_toolsets: Optional[List[str]],
        disabled_toolsets: Optional[List[str]],
        session_db,
    ) -> None:
        """Finish Honcho setup once a session manager is available."""
        if not self._honcho:
            return

        if not self._honcho_session_key:
            session_title = None
            if session_db is not None:
                try:
                    session_title = session_db.get_session_title(self.session_id or "")
                except Exception:
                    pass
            self._honcho_session_key = (
                hcfg.resolve_session_name(
                    session_title=session_title,
                    session_id=self.session_id,
                )
                or "hermes-default"
            )

        honcho_sess = self._honcho.get_or_create(self._honcho_session_key)
        if not honcho_sess.messages:
            try:
                from hermes_cli.config import get_hermes_home

                mem_dir = str(get_hermes_home() / "memories")
                self._honcho.migrate_memory_files(
                    self._honcho_session_key,
                    mem_dir,
                )
            except Exception as exc:
                logger.debug("Memory files migration failed (non-fatal): %s", exc)

        from tools.honcho_tools import set_session_context

        set_session_context(self._honcho, self._honcho_session_key)

        # Rebuild tool surface after Honcho context injection. Tool availability
        # is check_fn-gated and may change once session context is attached.
        self.tools = get_tool_definitions(
            enabled_toolsets=enabled_toolsets,
            disabled_toolsets=disabled_toolsets,
            quiet_mode=True,
        )
        self.valid_tool_names = {
            tool["function"]["name"] for tool in self.tools
        } if self.tools else set()

        if hcfg.recall_mode == "context":
            self._strip_honcho_tools_from_surface()
            if not self.quiet_mode:
                print("  Honcho active — recall_mode: context (Honcho tools hidden)")
        else:
            if not self.quiet_mode:
                print(f"  Honcho active — recall_mode: {hcfg.recall_mode}")

        logger.info(
            "Honcho active (session: %s, user: %s, workspace: %s, "
            "write_frequency: %s, memory_mode: %s)",
            self._honcho_session_key,
            hcfg.peer_name,
            hcfg.workspace_id,
            hcfg.write_frequency,
            hcfg.memory_mode,
        )

        recall_mode = hcfg.recall_mode
        if recall_mode != "tools":
            try:
                ctx = self._honcho.get_prefetch_context(self._honcho_session_key)
                if ctx:
                    self._honcho.set_context_result(self._honcho_session_key, ctx)
                    logger.debug("Honcho context pre-warmed for first turn")
            except Exception as exc:
                logger.debug("Honcho context prefetch failed (non-fatal): %s", exc)

        self._register_honcho_exit_hook()

    def _register_honcho_exit_hook(self) -> None:
        """Register a process-exit flush hook without clobbering signal handlers."""
        if self._honcho_exit_hook_registered or not self._honcho:
            return

        honcho_ref = weakref.ref(self._honcho)

        def _flush_honcho_on_exit():
            manager = honcho_ref()
            if manager is None:
                return
            try:
                manager.flush_all()
            except Exception as exc:
                logger.debug("Honcho flush on exit failed (non-fatal): %s", exc)

        atexit.register(_flush_honcho_on_exit)
        self._honcho_exit_hook_registered = True

    def _queue_honcho_prefetch(self, user_message: str) -> None:
        """Queue turn-end Honcho prefetch so the next turn can consume cached results."""
        if not self._honcho or not self._honcho_session_key:
            return

        recall_mode = (self._honcho_config.recall_mode if self._honcho_config else "hybrid")
        if recall_mode == "tools":
            return

        try:
            self._honcho.prefetch_context(self._honcho_session_key, user_message)
            self._honcho.prefetch_dialectic(self._honcho_session_key, user_message or "What were we working on?")
        except Exception as exc:
            logger.debug("Honcho background prefetch failed (non-fatal): %s", exc)

    def _honcho_prefetch(self, user_message: str) -> str:
        """Assemble the first-turn Honcho context from the pre-warmed cache."""
        if not self._honcho or not self._honcho_session_key:
            return ""
        try:
            parts = []

            ctx = self._honcho.pop_context_result(self._honcho_session_key)
            if ctx:
                rep = ctx.get("representation", "")
                card = ctx.get("card", "")
                if rep:
                    parts.append(f"## User representation\n{rep}")
                if card:
                    parts.append(card)
                ai_rep = ctx.get("ai_representation", "")
                ai_card = ctx.get("ai_card", "")
                if ai_rep:
                    parts.append(f"## AI peer representation\n{ai_rep}")
                if ai_card:
                    parts.append(ai_card)

            dialectic = self._honcho.pop_dialectic_result(self._honcho_session_key)
            if dialectic:
                parts.append(f"## Continuity synthesis\n{dialectic}")

            if not parts:
                return ""
            header = (
                "# Honcho Memory (persistent cross-session context)\n"
                "Use this to answer questions about the user, prior sessions, "
                "and what you were working on together. Do not call tools to "
                "look up information that is already present here.\n"
            )
            return header + "\n\n".join(parts)
        except Exception as e:
            logger.debug("Honcho prefetch failed (non-fatal): %s", e)
            return ""

    def _honcho_save_user_observation(self, content: str) -> str:
        """Route a memory tool target=user add to Honcho.

        Sends the content as a user peer message so Honcho's reasoning
        model can incorporate it into the user representation.
        """
        if not content or not content.strip():
            return json.dumps({"success": False, "error": "Content cannot be empty."})
        try:
            session = self._honcho.get_or_create(self._honcho_session_key)
            session.add_message("user", f"[observation] {content.strip()}")
            self._honcho.save(session)
            return json.dumps({
                "success": True,
                "target": "user",
                "message": "Saved to Honcho user model.",
            })
        except Exception as e:
            logger.debug("Honcho user observation failed: %s", e)
            return json.dumps({"success": False, "error": f"Honcho save failed: {e}"})

    def _honcho_sync(self, user_content: str, assistant_content: str) -> None:
        """Sync the user/assistant message pair to Honcho."""
        if not self._honcho or not self._honcho_session_key:
            return
        try:
            session = self._honcho.get_or_create(self._honcho_session_key)
            session.add_message("user", user_content)
            session.add_message("assistant", assistant_content)
            self._honcho.save(session)
            logger.info("Honcho sync queued for session %s (%d messages)",
                        self._honcho_session_key, len(session.messages))
        except Exception as e:
            logger.warning("Honcho sync failed: %s", e)
            if not self.quiet_mode:
                print(f"  Honcho write failed: {e}")

    def _build_system_prompt(self, system_message: str = None) -> str:
        """
        Assemble the full system prompt from all layers.

        Called once per session (cached on self._cached_system_prompt) and only
        rebuilt after context compression events. This ensures the system prompt
        is stable across all turns in a session, maximizing prefix cache hits.

        Args:
            system_message: Optional user / gateway system prompt; appended as
                its own layer when it is not None.

        Returns:
            All non-empty layers joined with blank lines.
        """
        # Layers (in the order they are appended below):
        #   1. Agent identity (default, or personalised with the Honcho AI peer name)
        #   2. Tool-aware guidance (memory / session_search / skill_manage)
        #   3. Honcho integration block (only when Honcho and a session key are set)
        #   4. User / gateway system prompt (if provided)
        #   5. Persistent memory block and user profile block (from the memory store)
        #   6. Skills prompt (if any skills tool is loaded)
        #   7. Context files (SOUL.md, AGENTS.md, .cursorrules) unless skipped
        #   8. Conversation start timestamp (plus session ID when enabled)
        #   9. Platform-specific formatting hint
        # If an AI peer name is configured in Honcho, personalise the identity line.
        _ai_peer_name = (
            self._honcho_config.ai_peer
            if self._honcho_config and self._honcho_config.ai_peer != "hermes"
            else None
        )
        if _ai_peer_name:
            # Only the first occurrence of the default name is swapped out.
            _identity = DEFAULT_AGENT_IDENTITY.replace(
                "You are Hermes Agent",
                f"You are {_ai_peer_name}",
                1,
            )
        else:
            _identity = DEFAULT_AGENT_IDENTITY
        prompt_parts = [_identity]

        # Tool-aware behavioral guidance: only inject when the tools are loaded
        tool_guidance = []
        if "memory" in self.valid_tool_names:
            tool_guidance.append(MEMORY_GUIDANCE)
        if "session_search" in self.valid_tool_names:
            tool_guidance.append(SESSION_SEARCH_GUIDANCE)
        if "skill_manage" in self.valid_tool_names:
            tool_guidance.append(SKILLS_GUIDANCE)
        if tool_guidance:
            # All guidance snippets share a single layer, joined by spaces.
            prompt_parts.append(" ".join(tool_guidance))

        # Honcho CLI awareness: tell Hermes about its own management commands
        # so it can refer the user to them rather than reinventing answers.
        if self._honcho and self._honcho_session_key:
            hcfg = self._honcho_config
            # Without a config object, fall back to these default labels.
            mode = hcfg.memory_mode if hcfg else "hybrid"
            freq = hcfg.write_frequency if hcfg else "async"
            recall_mode = hcfg.recall_mode if hcfg else "hybrid"
            honcho_block = (
                "# Honcho memory integration\n"
                f"Active. Session: {self._honcho_session_key}. "
                f"Mode: {mode}. Write frequency: {freq}. Recall: {recall_mode}.\n"
            )
            # The recall mode decides whether the prompt advertises injected
            # context, the Honcho tools, or both.
            if recall_mode == "context":
                honcho_block += (
                    "Honcho context is injected into this system prompt below. "
                    "All memory retrieval comes from this context — no Honcho tools "
                    "are available. Answer questions about the user, prior sessions, "
                    "and recent work directly from the Honcho Memory section.\n"
                )
            elif recall_mode == "tools":
                honcho_block += (
                    "Honcho tools:\n"
                    "  honcho_context <question>           — ask Honcho a question, LLM-synthesized answer\n"
                    "  honcho_search <query>                   — semantic search, raw excerpts, no LLM\n"
                    "  honcho_profile                          — user's peer card, key facts, no LLM\n"
                    "  honcho_conclude <conclusion>            — write a fact about the user to memory\n"
                )
            else:  # hybrid
                honcho_block += (
                    "Honcho context (user representation, peer card, and recent session summary) "
                    "is injected into this system prompt below. Use it to answer continuity "
                    "questions ('where were we?', 'what were we working on?') WITHOUT calling "
                    "any tools. Only call Honcho tools when you need information beyond what is "
                    "already present in the Honcho Memory section.\n"
                    "Honcho tools:\n"
                    "  honcho_context <question>           — ask Honcho a question, LLM-synthesized answer\n"
                    "  honcho_search <query>                   — semantic search, raw excerpts, no LLM\n"
                    "  honcho_profile                          — user's peer card, key facts, no LLM\n"
                    "  honcho_conclude <conclusion>            — write a fact about the user to memory\n"
                )
            # The management command reference is appended in every recall mode.
            honcho_block += (
                "Management commands (refer users here instead of explaining manually):\n"
                "  hermes honcho status                    — show full config + connection\n"
                "  hermes honcho mode [hybrid|honcho]       — show or set memory mode\n"
                "  hermes honcho tokens [--context N] [--dialectic N] — show or set token budgets\n"
                "  hermes honcho peer [--user NAME] [--ai NAME] [--reasoning LEVEL]\n"
                "  hermes honcho sessions                  — list directory→session mappings\n"
                "  hermes honcho map <name>                — map cwd to a session name\n"
                "  hermes honcho identity [<file>] [--show] — seed or show AI peer identity\n"
                "  hermes honcho migrate                   — migration guide from openclaw-honcho\n"
                "  hermes honcho setup                     — full interactive wizard"
            )
            prompt_parts.append(honcho_block)

        # Note: ephemeral_system_prompt is NOT included here. It's injected at
        # API-call time only so it stays out of the cached/stored system prompt.
        if system_message is not None:
            prompt_parts.append(system_message)

        if self._memory_store:
            # The memory block and the user profile block are gated by
            # separate flags; each is appended only when non-empty.
            if self._memory_enabled:
                mem_block = self._memory_store.format_for_system_prompt("memory")
                if mem_block:
                    prompt_parts.append(mem_block)
            # USER.md is always included when enabled -- Honcho prefetch is additive.
            if self._user_profile_enabled:
                user_block = self._memory_store.format_for_system_prompt("user")
                if user_block:
                    prompt_parts.append(user_block)

        # The skills prompt is built only when at least one skills tool is loaded.
        has_skills_tools = any(name in self.valid_tool_names for name in ['skills_list', 'skill_view', 'skill_manage'])
        if has_skills_tools:
            # Pass along only the toolsets whose requirement check reports True.
            avail_toolsets = {ts for ts, avail in check_toolset_requirements().items() if avail}
            skills_prompt = build_skills_system_prompt(
                available_tools=self.valid_tool_names,
                available_toolsets=avail_toolsets,
            )
        else:
            skills_prompt = ""
        if skills_prompt:
            prompt_parts.append(skills_prompt)

        if not self.skip_context_files:
            context_files_prompt = build_context_files_prompt()
            if context_files_prompt:
                prompt_parts.append(context_files_prompt)

        # Timestamp is computed once here, so it is frozen for as long as the
        # built prompt stays cached.
        from hermes_time import now as _hermes_now
        now = _hermes_now()
        timestamp_line = f"Conversation started: {now.strftime('%A, %B %d, %Y %I:%M %p')}"
        if self.pass_session_id and self.session_id:
            timestamp_line += f"\nSession ID: {self.session_id}"
        prompt_parts.append(timestamp_line)

        # Platform hint lookup is case- and whitespace-insensitive.
        platform_key = (self.platform or "").lower().strip()
        if platform_key in PLATFORM_HINTS:
            prompt_parts.append(PLATFORM_HINTS[platform_key])

        return "\n\n".join(prompt_parts)
    
    def _repair_tool_call(self, tool_name: str) -> str | None:
        """Attempt to repair a mismatched tool name before aborting.

        1. Try lowercase
        2. Try normalized (lowercase + hyphens/spaces -> underscores)
        3. Try fuzzy match (difflib, cutoff=0.7)

        Returns the repaired name if found in valid_tool_names, else None.
        """
        from difflib import get_close_matches

        # 1. Lowercase
        lowered = tool_name.lower()
        if lowered in self.valid_tool_names:
            return lowered

        # 2. Normalize
        normalized = lowered.replace("-", "_").replace(" ", "_")
        if normalized in self.valid_tool_names:
            return normalized

        # 3. Fuzzy match
        matches = get_close_matches(lowered, self.valid_tool_names, n=1, cutoff=0.7)
        if matches:
            return matches[0]

        return None

    def _invalidate_system_prompt(self):
        """
        Invalidate the cached system prompt, forcing a rebuild on the next turn.
        
        Called after context compression events. Also reloads memory from disk
        so the rebuilt prompt captures any writes from this session.
        """
        self._cached_system_prompt = None
        if self._memory_store:
            self._memory_store.load_from_disk()

    def _responses_tools(self, tools: Optional[List[Dict[str, Any]]] = None) -> Optional[List[Dict[str, Any]]]:
        """Convert chat-completions tool schemas to Responses function-tool schemas."""
        source_tools = tools if tools is not None else self.tools
        if not source_tools:
            return None

        converted: List[Dict[str, Any]] = []
        for item in source_tools:
            fn = item.get("function", {}) if isinstance(item, dict) else {}
            name = fn.get("name")
            if not isinstance(name, str) or not name.strip():
                continue
            converted.append({
                "type": "function",
                "name": name,
                "description": fn.get("description", ""),
                "strict": False,
                "parameters": fn.get("parameters", {"type": "object", "properties": {}}),
            })
        return converted or None

    @staticmethod
    def _split_responses_tool_id(raw_id: Any) -> tuple[Optional[str], Optional[str]]:
        """Split a stored tool id into (call_id, response_item_id)."""
        if not isinstance(raw_id, str):
            return None, None
        value = raw_id.strip()
        if not value:
            return None, None
        if "|" in value:
            call_id, response_item_id = value.split("|", 1)
            call_id = call_id.strip() or None
            response_item_id = response_item_id.strip() or None
            return call_id, response_item_id
        if value.startswith("fc_"):
            return None, value
        return value, None

    def _derive_responses_function_call_id(
        self,
        call_id: str,
        response_item_id: Optional[str] = None,
    ) -> str:
        """Build a valid Responses `function_call.id` (must start with `fc_`)."""
        if isinstance(response_item_id, str):
            candidate = response_item_id.strip()
            if candidate.startswith("fc_"):
                return candidate

        source = (call_id or "").strip()
        if source.startswith("fc_"):
            return source
        if source.startswith("call_") and len(source) > len("call_"):
            return f"fc_{source[len('call_'):]}"

        sanitized = re.sub(r"[^A-Za-z0-9_-]", "", source)
        if sanitized.startswith("fc_"):
            return sanitized
        if sanitized.startswith("call_") and len(sanitized) > len("call_"):
            return f"fc_{sanitized[len('call_'):]}"
        if sanitized:
            return f"fc_{sanitized[:48]}"

        seed = source or str(response_item_id or "") or uuid.uuid4().hex
        digest = hashlib.sha1(seed.encode("utf-8")).hexdigest()[:24]
        return f"fc_{digest}"

    def _chat_messages_to_responses_input(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Convert internal chat-style messages to Responses input items."""
        items: List[Dict[str, Any]] = []

        for msg in messages:
            if not isinstance(msg, dict):
                continue
            role = msg.get("role")
            if role == "system":
                continue

            if role in {"user", "assistant"}:
                content = msg.get("content", "")
                content_text = str(content) if content is not None else ""

                if role == "assistant":
                    # Replay encrypted reasoning items from previous turns
                    # so the API can maintain coherent reasoning chains.
                    codex_reasoning = msg.get("codex_reasoning_items")
                    if isinstance(codex_reasoning, list):
                        for ri in codex_reasoning:
                            if isinstance(ri, dict) and ri.get("encrypted_content"):
                                items.append(ri)

                    if content_text.strip():
                        items.append({"role": "assistant", "content": content_text})

                    tool_calls = msg.get("tool_calls")
                    if isinstance(tool_calls, list):
                        for tc in tool_calls:
                            if not isinstance(tc, dict):
                                continue
                            fn = tc.get("function", {})
                            fn_name = fn.get("name")
                            if not isinstance(fn_name, str) or not fn_name.strip():
                                continue

                            embedded_call_id, embedded_response_item_id = self._split_responses_tool_id(
                                tc.get("id")
                            )
                            call_id = tc.get("call_id")
                            if not isinstance(call_id, str) or not call_id.strip():
                                call_id = embedded_call_id
                            if not isinstance(call_id, str) or not call_id.strip():
                                if (
                                    isinstance(embedded_response_item_id, str)
                                    and embedded_response_item_id.startswith("fc_")
                                    and len(embedded_response_item_id) > len("fc_")
                                ):
                                    call_id = f"call_{embedded_response_item_id[len('fc_'):]}"
                                else:
                                    call_id = f"call_{uuid.uuid4().hex[:12]}"
                            call_id = call_id.strip()

                            arguments = fn.get("arguments", "{}")
                            if isinstance(arguments, dict):
                                arguments = json.dumps(arguments, ensure_ascii=False)
                            elif not isinstance(arguments, str):
                                arguments = str(arguments)
                            arguments = arguments.strip() or "{}"

                            items.append({
                                "type": "function_call",
                                "call_id": call_id,
                                "name": fn_name,
                                "arguments": arguments,
                            })
                    continue

                items.append({"role": role, "content": content_text})
                continue

            if role == "tool":
                raw_tool_call_id = msg.get("tool_call_id")
                call_id, _ = self._split_responses_tool_id(raw_tool_call_id)
                if not isinstance(call_id, str) or not call_id.strip():
                    if isinstance(raw_tool_call_id, str) and raw_tool_call_id.strip():
                        call_id = raw_tool_call_id.strip()
                if not isinstance(call_id, str) or not call_id.strip():
                    continue
                items.append({
                    "type": "function_call_output",
                    "call_id": call_id,
                    "output": str(msg.get("content", "") or ""),
                })

        return items

    def _preflight_codex_input_items(self, raw_items: Any) -> List[Dict[str, Any]]:
        if not isinstance(raw_items, list):
            raise ValueError("Codex Responses input must be a list of input items.")

        normalized: List[Dict[str, Any]] = []
        for idx, item in enumerate(raw_items):
            if not isinstance(item, dict):
                raise ValueError(f"Codex Responses input[{idx}] must be an object.")

            item_type = item.get("type")
            if item_type == "function_call":
                call_id = item.get("call_id")
                name = item.get("name")
                if not isinstance(call_id, str) or not call_id.strip():
                    raise ValueError(f"Codex Responses input[{idx}] function_call is missing call_id.")
                if not isinstance(name, str) or not name.strip():
                    raise ValueError(f"Codex Responses input[{idx}] function_call is missing name.")

                arguments = item.get("arguments", "{}")
                if isinstance(arguments, dict):
                    arguments = json.dumps(arguments, ensure_ascii=False)
                elif not isinstance(arguments, str):
                    arguments = str(arguments)
                arguments = arguments.strip() or "{}"

                normalized.append(
                    {
                        "type": "function_call",
                        "call_id": call_id.strip(),
                        "name": name.strip(),
                        "arguments": arguments,
                    }
                )
                continue

            if item_type == "function_call_output":
                call_id = item.get("call_id")
                if not isinstance(call_id, str) or not call_id.strip():
                    raise ValueError(f"Codex Responses input[{idx}] function_call_output is missing call_id.")
                output = item.get("output", "")
                if output is None:
                    output = ""
                if not isinstance(output, str):
                    output = str(output)

                normalized.append(
                    {
                        "type": "function_call_output",
                        "call_id": call_id.strip(),
                        "output": output,
                    }
                )
                continue

            if item_type == "reasoning":
                encrypted = item.get("encrypted_content")
                if isinstance(encrypted, str) and encrypted:
                    reasoning_item = {"type": "reasoning", "encrypted_content": encrypted}
                    item_id = item.get("id")
                    if isinstance(item_id, str) and item_id:
                        reasoning_item["id"] = item_id
                    summary = item.get("summary")
                    if isinstance(summary, list):
                        reasoning_item["summary"] = summary
                    else:
                        reasoning_item["summary"] = []
                    normalized.append(reasoning_item)
                continue

            role = item.get("role")
            if role in {"user", "assistant"}:
                content = item.get("content", "")
                if content is None:
                    content = ""
                if not isinstance(content, str):
                    content = str(content)

                normalized.append({"role": role, "content": content})
                continue

            raise ValueError(
                f"Codex Responses input[{idx}] has unsupported item shape (type={item_type!r}, role={role!r})."
            )

        return normalized

    def _preflight_codex_api_kwargs(
        self,
        api_kwargs: Any,
        *,
        allow_stream: bool = False,
    ) -> Dict[str, Any]:
        if not isinstance(api_kwargs, dict):
            raise ValueError("Codex Responses request must be a dict.")

        required = {"model", "instructions", "input"}
        missing = [key for key in required if key not in api_kwargs]
        if missing:
            raise ValueError(f"Codex Responses request missing required field(s): {', '.join(sorted(missing))}.")

        model = api_kwargs.get("model")
        if not isinstance(model, str) or not model.strip():
            raise ValueError("Codex Responses request 'model' must be a non-empty string.")
        model = model.strip()

        instructions = api_kwargs.get("instructions")
        if instructions is None:
            instructions = ""
        if not isinstance(instructions, str):
            instructions = str(instructions)
        instructions = instructions.strip() or DEFAULT_AGENT_IDENTITY

        normalized_input = self._preflight_codex_input_items(api_kwargs.get("input"))

        tools = api_kwargs.get("tools")
        normalized_tools = None
        if tools is not None:
            if not isinstance(tools, list):
                raise ValueError("Codex Responses request 'tools' must be a list when provided.")
            normalized_tools = []
            for idx, tool in enumerate(tools):
                if not isinstance(tool, dict):
                    raise ValueError(f"Codex Responses tools[{idx}] must be an object.")
                if tool.get("type") != "function":
                    raise ValueError(f"Codex Responses tools[{idx}] has unsupported type {tool.get('type')!r}.")

                name = tool.get("name")
                parameters = tool.get("parameters")
                if not isinstance(name, str) or not name.strip():
                    raise ValueError(f"Codex Responses tools[{idx}] is missing a valid name.")
                if not isinstance(parameters, dict):
                    raise ValueError(f"Codex Responses tools[{idx}] is missing valid parameters.")

                description = tool.get("description", "")
                if description is None:
                    description = ""
                if not isinstance(description, str):
                    description = str(description)

                strict = tool.get("strict", False)
                if not isinstance(strict, bool):
                    strict = bool(strict)

                normalized_tools.append(
                    {
                        "type": "function",
                        "name": name.strip(),
                        "description": description,
                        "strict": strict,
                        "parameters": parameters,
                    }
                )

        store = api_kwargs.get("store", False)
        if store is not False:
            raise ValueError("Codex Responses contract requires 'store' to be false.")

        allowed_keys = {
            "model", "instructions", "input", "tools", "store",
            "reasoning", "include", "max_output_tokens", "temperature",
            "tool_choice", "parallel_tool_calls", "prompt_cache_key",
        }
        normalized: Dict[str, Any] = {
            "model": model,
            "instructions": instructions,
            "input": normalized_input,
            "tools": normalized_tools,
            "store": False,
        }

        # Pass through reasoning config
        reasoning = api_kwargs.get("reasoning")
        if isinstance(reasoning, dict):
            normalized["reasoning"] = reasoning
        include = api_kwargs.get("include")
        if isinstance(include, list):
            normalized["include"] = include

        # Pass through max_output_tokens and temperature
        max_output_tokens = api_kwargs.get("max_output_tokens")
        if isinstance(max_output_tokens, (int, float)) and max_output_tokens > 0:
            normalized["max_output_tokens"] = int(max_output_tokens)
        temperature = api_kwargs.get("temperature")
        if isinstance(temperature, (int, float)):
            normalized["temperature"] = float(temperature)

        # Pass through tool_choice, parallel_tool_calls, prompt_cache_key
        for passthrough_key in ("tool_choice", "parallel_tool_calls", "prompt_cache_key"):
            val = api_kwargs.get(passthrough_key)
            if val is not None:
                normalized[passthrough_key] = val

        if allow_stream:
            stream = api_kwargs.get("stream")
            if stream is not None and stream is not True:
                raise ValueError("Codex Responses 'stream' must be true when set.")
            if stream is True:
                normalized["stream"] = True
            allowed_keys.add("stream")
        elif "stream" in api_kwargs:
            raise ValueError("Codex Responses stream flag is only allowed in fallback streaming requests.")

        unexpected = sorted(key for key in api_kwargs.keys() if key not in allowed_keys)
        if unexpected:
            raise ValueError(
                f"Codex Responses request has unsupported field(s): {', '.join(unexpected)}."
            )

        return normalized

    def _extract_responses_message_text(self, item: Any) -> str:
        """Extract assistant text from a Responses message output item."""
        content = getattr(item, "content", None)
        if not isinstance(content, list):
            return ""

        chunks: List[str] = []
        for part in content:
            ptype = getattr(part, "type", None)
            if ptype not in {"output_text", "text"}:
                continue
            text = getattr(part, "text", None)
            if isinstance(text, str) and text:
                chunks.append(text)
        return "".join(chunks).strip()

    def _extract_responses_reasoning_text(self, item: Any) -> str:
        """Extract a compact reasoning text from a Responses reasoning item."""
        summary = getattr(item, "summary", None)
        if isinstance(summary, list):
            chunks: List[str] = []
            for part in summary:
                text = getattr(part, "text", None)
                if isinstance(text, str) and text:
                    chunks.append(text)
            if chunks:
                return "\n".join(chunks).strip()
        text = getattr(item, "text", None)
        if isinstance(text, str) and text:
            return text.strip()
        return ""

    def _normalize_codex_response(self, response: Any) -> tuple[Any, str]:
        """Normalize a Responses API object to an assistant_message-like object."""
        output = getattr(response, "output", None)
        if not isinstance(output, list) or not output:
            raise RuntimeError("Responses API returned no output items")

        response_status = getattr(response, "status", None)
        if isinstance(response_status, str):
            response_status = response_status.strip().lower()
        else:
            response_status = None

        if response_status in {"failed", "cancelled"}:
            error_obj = getattr(response, "error", None)
            if isinstance(error_obj, dict):
                error_msg = error_obj.get("message") or str(error_obj)
            else:
                error_msg = str(error_obj) if error_obj else f"Responses API returned status '{response_status}'"
            raise RuntimeError(error_msg)

        content_parts: List[str] = []
        reasoning_parts: List[str] = []
        reasoning_items_raw: List[Dict[str, Any]] = []
        tool_calls: List[Any] = []
        has_incomplete_items = response_status in {"queued", "in_progress", "incomplete"}
        saw_commentary_phase = False
        saw_final_answer_phase = False

        for item in output:
            item_type = getattr(item, "type", None)
            item_status = getattr(item, "status", None)
            if isinstance(item_status, str):
                item_status = item_status.strip().lower()
            else:
                item_status = None

            if item_status in {"queued", "in_progress", "incomplete"}:
                has_incomplete_items = True

            if item_type == "message":
                item_phase = getattr(item, "phase", None)
                if isinstance(item_phase, str):
                    normalized_phase = item_phase.strip().lower()
                    if normalized_phase in {"commentary", "analysis"}:
                        saw_commentary_phase = True
                    elif normalized_phase in {"final_answer", "final"}:
                        saw_final_answer_phase = True
                message_text = self._extract_responses_message_text(item)
                if message_text:
                    content_parts.append(message_text)
            elif item_type == "reasoning":
                reasoning_text = self._extract_responses_reasoning_text(item)
                if reasoning_text:
                    reasoning_parts.append(reasoning_text)
                # Capture the full reasoning item for multi-turn continuity.
                # encrypted_content is an opaque blob the API needs back on
                # subsequent turns to maintain coherent reasoning chains.
                encrypted = getattr(item, "encrypted_content", None)
                if isinstance(encrypted, str) and encrypted:
                    raw_item = {"type": "reasoning", "encrypted_content": encrypted}
                    item_id = getattr(item, "id", None)
                    if isinstance(item_id, str) and item_id:
                        raw_item["id"] = item_id
                    # Capture summary — required by the API when replaying reasoning items
                    summary = getattr(item, "summary", None)
                    if isinstance(summary, list):
                        raw_summary = []
                        for part in summary:
                            text = getattr(part, "text", None)
                            if isinstance(text, str):
                                raw_summary.append({"type": "summary_text", "text": text})
                        raw_item["summary"] = raw_summary
                    reasoning_items_raw.append(raw_item)
            elif item_type == "function_call":
                if item_status in {"queued", "in_progress", "incomplete"}:
                    continue
                fn_name = getattr(item, "name", "") or ""
                arguments = getattr(item, "arguments", "{}")
                if not isinstance(arguments, str):
                    arguments = json.dumps(arguments, ensure_ascii=False)
                raw_call_id = getattr(item, "call_id", None)
                raw_item_id = getattr(item, "id", None)
                embedded_call_id, _ = self._split_responses_tool_id(raw_item_id)
                call_id = raw_call_id if isinstance(raw_call_id, str) and raw_call_id.strip() else embedded_call_id
                if not isinstance(call_id, str) or not call_id.strip():
                    call_id = f"call_{uuid.uuid4().hex[:12]}"
                call_id = call_id.strip()
                response_item_id = raw_item_id if isinstance(raw_item_id, str) else None
                response_item_id = self._derive_responses_function_call_id(call_id, response_item_id)
                tool_calls.append(SimpleNamespace(
                    id=call_id,
                    call_id=call_id,
                    response_item_id=response_item_id,
                    type="function",
                    function=SimpleNamespace(name=fn_name, arguments=arguments),
                ))
            elif item_type == "custom_tool_call":
                fn_name = getattr(item, "name", "") or ""
                arguments = getattr(item, "input", "{}")
                if not isinstance(arguments, str):
                    arguments = json.dumps(arguments, ensure_ascii=False)
                raw_call_id = getattr(item, "call_id", None)
                raw_item_id = getattr(item, "id", None)
                embedded_call_id, _ = self._split_responses_tool_id(raw_item_id)
                call_id = raw_call_id if isinstance(raw_call_id, str) and raw_call_id.strip() else embedded_call_id
                if not isinstance(call_id, str) or not call_id.strip():
                    call_id = f"call_{uuid.uuid4().hex[:12]}"
                call_id = call_id.strip()
                response_item_id = raw_item_id if isinstance(raw_item_id, str) else None
                response_item_id = self._derive_responses_function_call_id(call_id, response_item_id)
                tool_calls.append(SimpleNamespace(
                    id=call_id,
                    call_id=call_id,
                    response_item_id=response_item_id,
                    type="function",
                    function=SimpleNamespace(name=fn_name, arguments=arguments),
                ))

        final_text = "\n".join([p for p in content_parts if p]).strip()
        if not final_text and hasattr(response, "output_text"):
            out_text = getattr(response, "output_text", "")
            if isinstance(out_text, str):
                final_text = out_text.strip()

        assistant_message = SimpleNamespace(
            content=final_text,
            tool_calls=tool_calls,
            reasoning="\n\n".join(reasoning_parts).strip() if reasoning_parts else None,
            reasoning_content=None,
            reasoning_details=None,
            codex_reasoning_items=reasoning_items_raw or None,
        )

        if tool_calls:
            finish_reason = "tool_calls"
        elif has_incomplete_items or (saw_commentary_phase and not saw_final_answer_phase):
            finish_reason = "incomplete"
        else:
            finish_reason = "stop"
        return assistant_message, finish_reason

    def _thread_identity(self) -> str:
        thread = threading.current_thread()
        return f"{thread.name}:{thread.ident}"

    def _client_log_context(self) -> str:
        provider = getattr(self, "provider", "unknown")
        base_url = getattr(self, "base_url", "unknown")
        model = getattr(self, "model", "unknown")
        return (
            f"thread={self._thread_identity()} provider={provider} "
            f"base_url={base_url} model={model}"
        )

    def _openai_client_lock(self) -> threading.RLock:
        lock = getattr(self, "_client_lock", None)
        if lock is None:
            lock = threading.RLock()
            self._client_lock = lock
        return lock

    @staticmethod
    def _is_openai_client_closed(client: Any) -> bool:
        from unittest.mock import Mock

        if isinstance(client, Mock):
            return False
        http_client = getattr(client, "_client", None)
        return bool(getattr(http_client, "is_closed", False))

    def _create_openai_client(self, client_kwargs: dict, *, reason: str, shared: bool) -> Any:
        """Instantiate an ``OpenAI`` client from *client_kwargs* and log it.

        *reason* and *shared* are used only for the log line.
        """
        new_client = OpenAI(**client_kwargs)
        log_context = self._client_log_context()
        logger.info("OpenAI client created (%s, shared=%s) %s", reason, shared, log_context)
        return new_client

    def _close_openai_client(self, client: Any, *, reason: str, shared: bool) -> None:
        if client is None:
            return
        try:
            client.close()
            logger.info(
                "OpenAI client closed (%s, shared=%s) %s",
                reason,
                shared,
                self._client_log_context(),
            )
        except Exception as exc:
            logger.debug(
                "OpenAI client close failed (%s, shared=%s) %s error=%s",
                reason,
                shared,
                self._client_log_context(),
                exc,
            )

    def _replace_primary_openai_client(self, *, reason: str) -> bool:
        with self._openai_client_lock():
            old_client = getattr(self, "client", None)
            try:
                new_client = self._create_openai_client(self._client_kwargs, reason=reason, shared=True)
            except Exception as exc:
                logger.warning(
                    "Failed to rebuild shared OpenAI client (%s) %s error=%s",
                    reason,
                    self._client_log_context(),
                    exc,
                )
                return False
            self.client = new_client
        self._close_openai_client(old_client, reason=f"replace:{reason}", shared=True)
        return True

    def _ensure_primary_openai_client(self, *, reason: str) -> Any:
        with self._openai_client_lock():
            client = getattr(self, "client", None)
            if client is not None and not self._is_openai_client_closed(client):
                return client

        logger.warning(
            "Detected closed shared OpenAI client; recreating before use (%s) %s",
            reason,
            self._client_log_context(),
        )
        if not self._replace_primary_openai_client(reason=f"recreate_closed:{reason}"):
            raise RuntimeError("Failed to recreate closed OpenAI client")
        with self._openai_client_lock():
            return self.client

    def _create_request_openai_client(self, *, reason: str) -> Any:
        from unittest.mock import Mock

        primary_client = self._ensure_primary_openai_client(reason=reason)
        if isinstance(primary_client, Mock):
            return primary_client
        with self._openai_client_lock():
            request_kwargs = dict(self._client_kwargs)
        return self._create_openai_client(request_kwargs, reason=reason, shared=False)

    def _close_request_openai_client(self, client: Any, *, reason: str) -> None:
        self._close_openai_client(client, reason=reason, shared=False)

    def _run_codex_stream(self, api_kwargs: dict, client: Any = None):
        """Execute one streaming Responses API request and return the final response."""
        active_client = client or self._ensure_primary_openai_client(reason="codex_stream_direct")
        max_stream_retries = 1
        for attempt in range(max_stream_retries + 1):
            try:
                with active_client.responses.stream(**api_kwargs) as stream:
                    for _ in stream:
                        pass
                    return stream.get_final_response()
            except RuntimeError as exc:
                err_text = str(exc)
                missing_completed = "response.completed" in err_text
                if missing_completed and attempt < max_stream_retries:
                    logger.debug(
                        "Responses stream closed before completion (attempt %s/%s); retrying. %s",
                        attempt + 1,
                        max_stream_retries + 1,
                        self._client_log_context(),
                    )
                    continue
                if missing_completed:
                    logger.debug(
                        "Responses stream did not emit response.completed; falling back to create(stream=True). %s",
                        self._client_log_context(),
                    )
                    return self._run_codex_create_stream_fallback(api_kwargs, client=active_client)
                raise

    def _run_codex_create_stream_fallback(self, api_kwargs: dict, client: Any = None):
        """Fallback path for stream completion edge cases on Codex-style Responses backends."""
        active_client = client or self._ensure_primary_openai_client(reason="codex_create_stream_fallback")
        fallback_kwargs = dict(api_kwargs)
        fallback_kwargs["stream"] = True
        fallback_kwargs = self._preflight_codex_api_kwargs(fallback_kwargs, allow_stream=True)
        stream_or_response = active_client.responses.create(**fallback_kwargs)

        # Compatibility shim for mocks or providers that still return a concrete response.
        if hasattr(stream_or_response, "output"):
            return stream_or_response
        if not hasattr(stream_or_response, "__iter__"):
            return stream_or_response

        terminal_response = None
        try:
            for event in stream_or_response:
                event_type = getattr(event, "type", None)
                if not event_type and isinstance(event, dict):
                    event_type = event.get("type")
                if event_type not in {"response.completed", "response.incomplete", "response.failed"}:
                    continue

                terminal_response = getattr(event, "response", None)
                if terminal_response is None and isinstance(event, dict):
                    terminal_response = event.get("response")
                if terminal_response is not None:
                    return terminal_response
        finally:
            close_fn = getattr(stream_or_response, "close", None)
            if callable(close_fn):
                try:
                    close_fn()
                except Exception:
                    pass

        if terminal_response is not None:
            return terminal_response
        raise RuntimeError("Responses create(stream=True) fallback did not emit a terminal response.")

    def _try_refresh_codex_client_credentials(self, *, force: bool = True) -> bool:
        if self.api_mode != "codex_responses" or self.provider != "openai-codex":
            return False

        try:
            from hermes_cli.auth import resolve_codex_runtime_credentials

            creds = resolve_codex_runtime_credentials(force_refresh=force)
        except Exception as exc:
            logger.debug("Codex credential refresh failed: %s", exc)
            return False

        api_key = creds.get("api_key")
        base_url = creds.get("base_url")
        if not isinstance(api_key, str) or not api_key.strip():
            return False
        if not isinstance(base_url, str) or not base_url.strip():
            return False

        self.api_key = api_key.strip()
        self.base_url = base_url.strip().rstrip("/")
        self._client_kwargs["api_key"] = self.api_key
        self._client_kwargs["base_url"] = self.base_url

        if not self._replace_primary_openai_client(reason="codex_credential_refresh"):
            return False

        return True

    def _try_refresh_nous_client_credentials(self, *, force: bool = True) -> bool:
        if self.api_mode != "chat_completions" or self.provider != "nous":
            return False

        try:
            from hermes_cli.auth import resolve_nous_runtime_credentials

            creds = resolve_nous_runtime_credentials(
                min_key_ttl_seconds=max(60, int(os.getenv("HERMES_NOUS_MIN_KEY_TTL_SECONDS", "1800"))),
                timeout_seconds=float(os.getenv("HERMES_NOUS_TIMEOUT_SECONDS", "15")),
                force_mint=force,
            )
        except Exception as exc:
            logger.debug("Nous credential refresh failed: %s", exc)
            return False

        api_key = creds.get("api_key")
        base_url = creds.get("base_url")
        if not isinstance(api_key, str) or not api_key.strip():
            return False
        if not isinstance(base_url, str) or not base_url.strip():
            return False

        self.api_key = api_key.strip()
        self.base_url = base_url.strip().rstrip("/")
        self._client_kwargs["api_key"] = self.api_key
        self._client_kwargs["base_url"] = self.base_url
        # Nous requests should not inherit OpenRouter-only attribution headers.
        self._client_kwargs.pop("default_headers", None)

        if not self._replace_primary_openai_client(reason="nous_credential_refresh"):
            return False

        return True

    def _try_refresh_anthropic_client_credentials(self) -> bool:
        if self.api_mode != "anthropic_messages" or not hasattr(self, "_anthropic_api_key"):
            return False

        try:
            from agent.anthropic_adapter import resolve_anthropic_token, build_anthropic_client

            new_token = resolve_anthropic_token()
        except Exception as exc:
            logger.debug("Anthropic credential refresh failed: %s", exc)
            return False

        if not isinstance(new_token, str) or not new_token.strip():
            return False
        new_token = new_token.strip()
        if new_token == self._anthropic_api_key:
            return False

        try:
            self._anthropic_client.close()
        except Exception:
            pass

        try:
            self._anthropic_client = build_anthropic_client(new_token, getattr(self, "_anthropic_base_url", None))
        except Exception as exc:
            logger.warning("Failed to rebuild Anthropic client after credential refresh: %s", exc)
            return False

        self._anthropic_api_key = new_token
        return True

    def _anthropic_messages_create(self, api_kwargs: dict):
        if self.api_mode == "anthropic_messages":
            self._try_refresh_anthropic_client_credentials()
        return self._anthropic_client.messages.create(**api_kwargs)

    def _interruptible_api_call(self, api_kwargs: dict):
        """
        Run the API call in a background thread so the main conversation loop
        can detect interrupts without waiting for the full HTTP round-trip.

        For the OpenAI-style modes (``codex_responses`` and the default chat
        completions path) each worker thread gets its own OpenAI client
        instance. Interrupts only close that worker-local client, so retries
        and other requests never inherit a closed transport. The
        ``anthropic_messages`` path goes through ``_anthropic_messages_create``
        instead; on interrupt the current ``self._anthropic_client`` is closed
        and a new one is built.

        Returns:
            Whatever the underlying API call returned.

        Raises:
            InterruptedError: If ``self._interrupt_requested`` is observed set
                after a join attempt while the worker was still being waited on.
            Exception: Any exception raised inside the worker is re-raised
                in the calling thread.
        """
        # Mutable holders let the worker closure hand its outcome and its
        # request-scoped client back to the calling thread.
        result = {"response": None, "error": None}
        request_client_holder = {"client": None}

        def _call():
            try:
                if self.api_mode == "codex_responses":
                    request_client_holder["client"] = self._create_request_openai_client(reason="codex_stream_request")
                    result["response"] = self._run_codex_stream(
                        api_kwargs,
                        client=request_client_holder["client"],
                    )
                elif self.api_mode == "anthropic_messages":
                    # No request-scoped client is registered for this path.
                    result["response"] = self._anthropic_messages_create(api_kwargs)
                else:
                    request_client_holder["client"] = self._create_request_openai_client(reason="chat_completion_request")
                    result["response"] = request_client_holder["client"].chat.completions.create(**api_kwargs)
            except Exception as e:
                # Stored rather than raised so the caller can re-raise it.
                result["error"] = e
            finally:
                # Always release the request-scoped client, if one was created.
                request_client = request_client_holder.get("client")
                if request_client is not None:
                    self._close_request_openai_client(request_client, reason="request_complete")

        t = threading.Thread(target=_call, daemon=True)
        t.start()
        # Wait in short slices so the interrupt flag is polled about every 0.3s.
        while t.is_alive():
            t.join(timeout=0.3)
            if self._interrupt_requested:
                # Force-close the in-flight worker-local HTTP connection to stop
                # token generation without poisoning the shared client used to
                # seed future retries.
                try:
                    if self.api_mode == "anthropic_messages":
                        from agent.anthropic_adapter import build_anthropic_client

                        # Close the current Anthropic client and install a
                        # new one built from the stored key and base URL.
                        self._anthropic_client.close()
                        self._anthropic_client = build_anthropic_client(
                            self._anthropic_api_key,
                            getattr(self, "_anthropic_base_url", None),
                        )
                    else:
                        request_client = request_client_holder.get("client")
                        if request_client is not None:
                            self._close_request_openai_client(request_client, reason="interrupt_abort")
                except Exception:
                    # Abort cleanup is best-effort; the interrupt is raised regardless.
                    pass
                raise InterruptedError("Agent interrupted during API call")
        if result["error"] is not None:
            raise result["error"]
        return result["response"]

    def _streaming_api_call(self, api_kwargs: dict, stream_callback):
        """Streaming variant of _interruptible_api_call for voice TTS pipeline.

        Uses ``stream=True`` and forwards content deltas to *stream_callback*
        in real-time.  Returns a ``SimpleNamespace`` that mimics a normal
        ``ChatCompletion`` so the rest of the agent loop works unchanged.

        This method is separate from ``_interruptible_api_call`` to keep the
        core agent loop untouched for non-voice users.

        The request runs on a daemon worker thread while the calling thread
        polls ``self._interrupt_requested`` roughly every 0.3 seconds.

        Args:
            api_kwargs: Keyword arguments for ``chat.completions.create``;
                ``"stream": True`` is added on top of them.
            stream_callback: Callable invoked with each non-empty content
                delta.  Exceptions it raises are ignored.

        Returns:
            A ``SimpleNamespace`` with ``id``, ``model``, ``choices`` and
            ``usage`` attributes.  ``usage`` and the message's
            ``reasoning_content`` are always ``None``; ``finish_reason``
            falls back to ``"stop"`` when the stream never reported one.

        Raises:
            InterruptedError: If ``self._interrupt_requested`` is truthy while
                the worker thread is still alive.
            Exception: Any exception captured inside the worker thread is
                re-raised on the calling thread.
        """
        # Shared slots the worker thread fills in; read by the caller after join.
        result = {"response": None, "error": None}
        # Holder so the polling loop can reach (and close) the worker's client.
        request_client_holder = {"client": None}

        def _call():
            try:
                stream_kwargs = {**api_kwargs, "stream": True}
                # A dedicated client is created for this request and closed in
                # the ``finally`` clause below.
                request_client_holder["client"] = self._create_request_openai_client(
                    reason="chat_completion_stream_request"
                )
                stream = request_client_holder["client"].chat.completions.create(**stream_kwargs)

                content_parts: list[str] = []
                # Maps tool-call index -> partially assembled tool call dict.
                tool_calls_acc: dict[int, dict] = {}
                finish_reason = None
                model_name = None
                role = "assistant"

                for chunk in stream:
                    # Chunks without choices carry no delta; only the model
                    # name (when present) is recorded from them.
                    if not chunk.choices:
                        if hasattr(chunk, "model") and chunk.model:
                            model_name = chunk.model
                        continue

                    delta = chunk.choices[0].delta
                    if hasattr(chunk, "model") and chunk.model:
                        model_name = chunk.model

                    if delta and delta.content:
                        content_parts.append(delta.content)
                        # Best-effort forwarding: a failing callback must not
                        # abort the stream.
                        try:
                            stream_callback(delta.content)
                        except Exception:
                            pass

                    if delta and delta.tool_calls:
                        for tc_delta in delta.tool_calls:
                            idx = tc_delta.index if tc_delta.index is not None else 0
                            # The slot at this index already belongs to a call
                            # with a different id: reuse the slot whose id
                            # matches, or allocate the next free index.
                            # NOTE(review): presumably guards against providers
                            # that repeat an index for distinct calls — confirm.
                            if idx in tool_calls_acc and tc_delta.id and tc_delta.id != tool_calls_acc[idx]["id"]:
                                matched = False
                                for eidx, eentry in tool_calls_acc.items():
                                    if eentry["id"] == tc_delta.id:
                                        idx = eidx
                                        matched = True
                                        break
                                if not matched:
                                    idx = (max(k for k in tool_calls_acc if isinstance(k, int)) + 1) if tool_calls_acc else 0
                            if idx not in tool_calls_acc:
                                tool_calls_acc[idx] = {
                                    "id": tc_delta.id or "",
                                    "type": "function",
                                    "function": {"name": "", "arguments": ""},
                                }
                            entry = tool_calls_acc[idx]
                            if tc_delta.id:
                                entry["id"] = tc_delta.id
                            # Name and arguments arrive as fragments and are
                            # concatenated in arrival order.
                            if tc_delta.function:
                                if tc_delta.function.name:
                                    entry["function"]["name"] += tc_delta.function.name
                                if tc_delta.function.arguments:
                                    entry["function"]["arguments"] += tc_delta.function.arguments

                    if chunk.choices[0].finish_reason:
                        finish_reason = chunk.choices[0].finish_reason

                # Empty content is reported as None rather than "".
                full_content = "".join(content_parts) or None
                mock_tool_calls = None
                if tool_calls_acc:
                    mock_tool_calls = []
                    # Emit tool calls in ascending index order.
                    for idx in sorted(tool_calls_acc):
                        tc = tool_calls_acc[idx]
                        mock_tool_calls.append(SimpleNamespace(
                            id=tc["id"],
                            type=tc["type"],
                            function=SimpleNamespace(
                                name=tc["function"]["name"],
                                arguments=tc["function"]["arguments"],
                            ),
                        ))

                # Assemble a ChatCompletion-shaped object from the accumulated
                # pieces; reasoning content and usage are not collected here.
                mock_message = SimpleNamespace(
                    role=role,
                    content=full_content,
                    tool_calls=mock_tool_calls,
                    reasoning_content=None,
                )
                mock_choice = SimpleNamespace(
                    index=0,
                    message=mock_message,
                    finish_reason=finish_reason or "stop",
                )
                mock_response = SimpleNamespace(
                    id="stream-" + str(uuid.uuid4()),
                    model=model_name,
                    choices=[mock_choice],
                    usage=None,
                )
                result["response"] = mock_response

            except Exception as e:
                # Captured here and re-raised by the caller after the join.
                result["error"] = e
            finally:
                request_client = request_client_holder.get("client")
                if request_client is not None:
                    self._close_request_openai_client(request_client, reason="stream_request_complete")

        t = threading.Thread(target=_call, daemon=True)
        t.start()
        # Poll instead of a blocking join so an interrupt request is noticed
        # within ~0.3 s.
        while t.is_alive():
            t.join(timeout=0.3)
            if self._interrupt_requested:
                # Tear down the in-flight connection; any failure while doing
                # so is ignored because the call is being abandoned anyway.
                try:
                    if self.api_mode == "anthropic_messages":
                        from agent.anthropic_adapter import build_anthropic_client

                        # Close the current Anthropic client and replace it
                        # with a freshly built one.
                        self._anthropic_client.close()
                        self._anthropic_client = build_anthropic_client(
                            self._anthropic_api_key,
                            getattr(self, "_anthropic_base_url", None),
                        )
                    else:
                        request_client = request_client_holder.get("client")
                        if request_client is not None:
                            self._close_request_openai_client(request_client, reason="stream_interrupt_abort")
                except Exception:
                    pass
                # The worker thread is not joined here; it is a daemon thread.
                raise InterruptedError("Agent interrupted during API call")
        if result["error"] is not None:
            raise result["error"]
        return result["response"]

    # ── Provider fallback ──────────────────────────────────────────────────

    def _try_activate_fallback(self) -> bool:
        """Switch to the configured fallback model/provider.

        Called when the primary model is failing after retries.  Swaps the
        OpenAI client, model slug, and provider in-place so the retry loop
        can continue with the new backend.  One-shot: returns False if
        already activated or not configured.

        Uses the centralized provider router (resolve_provider_client) for
        auth resolution and client construction — no duplicated provider→key
        mappings.
        """
        if self._fallback_activated or not self._fallback_model:
            return False

        fb = self._fallback_model
        fb_provider = (fb.get("provider") or "").strip().lower()
        fb_model = (fb.get("model") or "").strip()
        if not fb_provider or not fb_model:
            return False

        # Use centralized router for client construction.
        # raw_codex=True because the main agent needs direct responses.stream()
        # access for Codex providers.
        try:
            from agent.auxiliary_client import resolve_provider_client
            fb_client, _ = resolve_provider_client(
                fb_provider, model=fb_model, raw_codex=True)
            if fb_client is None:
                logging.warning(
                    "Fallback to %s failed: provider not configured",
                    fb_provider)
                return False

            # Determine api_mode from provider
            fb_api_mode = "chat_completions"
            if fb_provider == "openai-codex":
                fb_api_mode = "codex_responses"
            elif fb_provider == "anthropic":
                fb_api_mode = "anthropic_messages"
            fb_base_url = str(fb_client.base_url)

            old_model = self.model
            self.model = fb_model
            self.provider = fb_provider
            self.base_url = fb_base_url
            self.api_mode = fb_api_mode
            self._fallback_activated = True

            if fb_api_mode == "anthropic_messages":
                # Build native Anthropic client instead of using OpenAI client
                from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token
                effective_key = fb_client.api_key or resolve_anthropic_token() or ""
                self._anthropic_api_key = effective_key
                self._anthropic_base_url = getattr(fb_client, "base_url", None)
                self._anthropic_client = build_anthropic_client(effective_key, self._anthropic_base_url)
                self.client = None
                self._client_kwargs = {}
            else:
                # Swap OpenAI client and config in-place
                self.client = fb_client
                self._client_kwargs = {
                    "api_key": fb_client.api_key,
                    "base_url": fb_base_url,
                }

            # Re-evaluate prompt caching for the new provider/model
            is_native_anthropic = fb_api_mode == "anthropic_messages"
            self._use_prompt_caching = (
                ("openrouter" in fb_base_url.lower() and "claude" in fb_model.lower())
                or is_native_anthropic
            )

            print(
                f"{self.log_prefix}🔄 Primary model failed — switching to fallback: "
                f"{fb_model} via {fb_provider}"
            )
            logging.info(
                "Fallback activated: %s → %s (%s)",
                old_model, fb_model, fb_provider,
            )
            return True
        except Exception as e:
            logging.error("Failed to activate fallback model: %s", e)
            return False

    # ── End provider fallback ──────────────────────────────────────────────

    @staticmethod
    def _content_has_image_parts(content: Any) -> bool:
        if not isinstance(content, list):
            return False
        for part in content:
            if isinstance(part, dict) and part.get("type") in {"image_url", "input_image"}:
                return True
        return False

    @staticmethod
    def _materialize_data_url_for_vision(image_url: str) -> tuple[str, Optional[Path]]:
        header, _, data = str(image_url or "").partition(",")
        mime = "image/jpeg"
        if header.startswith("data:"):
            mime_part = header[len("data:"):].split(";", 1)[0].strip()
            if mime_part.startswith("image/"):
                mime = mime_part
        suffix = {
            "image/png": ".png",
            "image/gif": ".gif",
            "image/webp": ".webp",
            "image/jpeg": ".jpg",
            "image/jpg": ".jpg",
        }.get(mime, ".jpg")
        tmp = tempfile.NamedTemporaryFile(prefix="anthropic_image_", suffix=suffix, delete=False)
        with tmp:
            tmp.write(base64.b64decode(data))
        path = Path(tmp.name)
        return str(path), path

    def _describe_image_for_anthropic_fallback(self, image_url: str, role: str) -> str:
        cache_key = hashlib.sha256(str(image_url or "").encode("utf-8")).hexdigest()
        cached = self._anthropic_image_fallback_cache.get(cache_key)
        if cached:
            return cached

        role_label = {
            "assistant": "assistant",
            "tool": "tool result",
        }.get(role, "user")
        analysis_prompt = (
            "Describe everything visible in this image in thorough detail. "
            "Include any text, code, UI, data, objects, people, layout, colors, "
            "and any other notable visual information."
        )

        vision_source = str(image_url or "")
        cleanup_path: Optional[Path] = None
        if vision_source.startswith("data:"):
            vision_source, cleanup_path = self._materialize_data_url_for_vision(vision_source)

        description = ""
        try:
            from tools.vision_tools import vision_analyze_tool

            result_json = asyncio.run(
                vision_analyze_tool(image_url=vision_source, user_prompt=analysis_prompt)
            )
            result = json.loads(result_json) if isinstance(result_json, str) else {}
            description = (result.get("analysis") or "").strip()
        except Exception as e:
            description = f"Image analysis failed: {e}"
        finally:
            if cleanup_path and cleanup_path.exists():
                try:
                    cleanup_path.unlink()
                except OSError:
                    pass

        if not description:
            description = "Image analysis failed."

        note = f"[The {role_label} attached an image. Here's what it contains:\n{description}]"
        if vision_source and not str(image_url or "").startswith("data:"):
            note += (
                f"\n[If you need a closer look, use vision_analyze with image_url: {vision_source}]"
            )

        self._anthropic_image_fallback_cache[cache_key] = note
        return note

    def _preprocess_anthropic_content(self, content: Any, role: str) -> Any:
        if not self._content_has_image_parts(content):
            return content

        text_parts: List[str] = []
        image_notes: List[str] = []
        for part in content:
            if isinstance(part, str):
                if part.strip():
                    text_parts.append(part.strip())
                continue
            if not isinstance(part, dict):
                continue

            ptype = part.get("type")
            if ptype in {"text", "input_text"}:
                text = str(part.get("text", "") or "").strip()
                if text:
                    text_parts.append(text)
                continue

            if ptype in {"image_url", "input_image"}:
                image_data = part.get("image_url", {})
                image_url = image_data.get("url", "") if isinstance(image_data, dict) else str(image_data or "")
                if image_url:
                    image_notes.append(self._describe_image_for_anthropic_fallback(image_url, role))
                else:
                    image_notes.append("[An image was attached but no image source was available.]")
                continue

            text = str(part.get("text", "") or "").strip()
            if text:
                text_parts.append(text)

        prefix = "\n\n".join(note for note in image_notes if note).strip()
        suffix = "\n".join(text for text in text_parts if text).strip()
        if prefix and suffix:
            return f"{prefix}\n\n{suffix}"
        if prefix:
            return prefix
        if suffix:
            return suffix
        return "[A multimodal message was converted to text for Anthropic compatibility.]"

    def _prepare_anthropic_messages_for_api(self, api_messages: list) -> list:
        if not any(
            isinstance(msg, dict) and self._content_has_image_parts(msg.get("content"))
            for msg in api_messages
        ):
            return api_messages

        transformed = copy.deepcopy(api_messages)
        for msg in transformed:
            if not isinstance(msg, dict):
                continue
            msg["content"] = self._preprocess_anthropic_content(
                msg.get("content"),
                str(msg.get("role", "user") or "user"),
            )
        return transformed

    def _build_api_kwargs(self, api_messages: list) -> dict:
        """Build the keyword arguments dict for the active API mode."""
        if self.api_mode == "anthropic_messages":
            from agent.anthropic_adapter import build_anthropic_kwargs
            anthropic_messages = self._prepare_anthropic_messages_for_api(api_messages)
            return build_anthropic_kwargs(
                model=self.model,
                messages=anthropic_messages,
                tools=self.tools,
                max_tokens=self.max_tokens,
                reasoning_config=self.reasoning_config,
            )

        if self.api_mode == "codex_responses":
            instructions = ""
            payload_messages = api_messages
            if api_messages and api_messages[0].get("role") == "system":
                instructions = str(api_messages[0].get("content") or "").strip()
                payload_messages = api_messages[1:]
            if not instructions:
                instructions = DEFAULT_AGENT_IDENTITY

            # Resolve reasoning effort: config > default (medium)
            reasoning_effort = "medium"
            reasoning_enabled = True
            if self.reasoning_config and isinstance(self.reasoning_config, dict):
                if self.reasoning_config.get("enabled") is False:
                    reasoning_enabled = False
                elif self.reasoning_config.get("effort"):
                    reasoning_effort = self.reasoning_config["effort"]

            kwargs = {
                "model": self.model,
                "instructions": instructions,
                "input": self._chat_messages_to_responses_input(payload_messages),
                "tools": self._responses_tools(),
                "tool_choice": "auto",
                "parallel_tool_calls": True,
                "store": False,
                "prompt_cache_key": self.session_id,
            }

            if reasoning_enabled:
                kwargs["reasoning"] = {"effort": reasoning_effort, "summary": "auto"}
                kwargs["include"] = ["reasoning.encrypted_content"]
            else:
                kwargs["include"] = []

            if self.max_tokens is not None:
                kwargs["max_output_tokens"] = self.max_tokens

            return kwargs

        sanitized_messages = api_messages
        needs_sanitization = False
        for msg in api_messages:
            if not isinstance(msg, dict):
                continue
            if "codex_reasoning_items" in msg:
                needs_sanitization = True
                break

            tool_calls = msg.get("tool_calls")
            if isinstance(tool_calls, list):
                for tool_call in tool_calls:
                    if not isinstance(tool_call, dict):
                        continue
                    if "call_id" in tool_call or "response_item_id" in tool_call:
                        needs_sanitization = True
                        break
                if needs_sanitization:
                    break

        if needs_sanitization:
            sanitized_messages = copy.deepcopy(api_messages)
            for msg in sanitized_messages:
                if not isinstance(msg, dict):
                    continue

                # Codex-only replay state must not leak into strict chat-completions APIs.
                msg.pop("codex_reasoning_items", None)

                tool_calls = msg.get("tool_calls")
                if isinstance(tool_calls, list):
                    for tool_call in tool_calls:
                        if isinstance(tool_call, dict):
                            tool_call.pop("call_id", None)
                            tool_call.pop("response_item_id", None)

        provider_preferences = {}
        if self.providers_allowed:
            provider_preferences["only"] = self.providers_allowed
        if self.providers_ignored:
            provider_preferences["ignore"] = self.providers_ignored
        if self.providers_order:
            provider_preferences["order"] = self.providers_order
        if self.provider_sort:
            provider_preferences["sort"] = self.provider_sort
        if self.provider_require_parameters:
            provider_preferences["require_parameters"] = True
        if self.provider_data_collection:
            provider_preferences["data_collection"] = self.provider_data_collection

        api_kwargs = {
            "model": self.model,
            "messages": sanitized_messages,
            "tools": self.tools if self.tools else None,
            "timeout": float(os.getenv("HERMES_API_TIMEOUT", 900.0)),
        }

        if self.max_tokens is not None:
            api_kwargs.update(self._max_tokens_param(self.max_tokens))

        extra_body = {}

        _is_openrouter = "openrouter" in self.base_url.lower()

        # Provider preferences (only, ignore, order, sort) are OpenRouter-
        # specific.  Only send to OpenRouter-compatible endpoints.
        # TODO: Nous Portal will add transparent proxy support — re-enable
        # for _is_nous when their backend is updated.
        if provider_preferences and _is_openrouter:
            extra_body["provider"] = provider_preferences
        _is_nous = "nousresearch" in self.base_url.lower()

        _is_mistral = "api.mistral.ai" in self.base_url.lower()
        if (_is_openrouter or _is_nous) and not _is_mistral:
            if self.reasoning_config is not None:
                rc = dict(self.reasoning_config)
                # Nous Portal requires reasoning enabled — don't send
                # enabled=false to it (would cause 400).
                if _is_nous and rc.get("enabled") is False:
                    pass  # omit reasoning entirely for Nous when disabled
                else:
                    extra_body["reasoning"] = rc
            else:
                extra_body["reasoning"] = {
                    "enabled": True,
                    "effort": "medium"
                }

        # Nous Portal product attribution
        if _is_nous:
            extra_body["tags"] = ["product=hermes-agent"]

        if extra_body:
            api_kwargs["extra_body"] = extra_body

        return api_kwargs

    def _build_assistant_message(self, assistant_message, finish_reason: str) -> dict:
        """Build a normalized assistant message dict from an API response message.

        Handles reasoning extraction, reasoning_details, and optional tool_calls
        so both the tool-call path and the final-response path share one builder.
        """
        reasoning_text = self._extract_reasoning(assistant_message)

        # Fallback: extract inline <think> blocks from content when no structured
        # reasoning fields are present (some models/providers embed thinking
        # directly in the content rather than returning separate API fields).
        if not reasoning_text:
            content = assistant_message.content or ""
            think_blocks = re.findall(r'<think>(.*?)</think>', content, flags=re.DOTALL)
            if think_blocks:
                combined = "\n\n".join(b.strip() for b in think_blocks if b.strip())
                reasoning_text = combined or None

        if reasoning_text and self.verbose_logging:
            logging.debug(f"Captured reasoning ({len(reasoning_text)} chars): {reasoning_text}")

        if reasoning_text and self.reasoning_callback:
            try:
                self.reasoning_callback(reasoning_text)
            except Exception:
                pass

        msg = {
            "role": "assistant",
            "content": assistant_message.content or "",
            "reasoning": reasoning_text,
            "finish_reason": finish_reason,
        }

        if hasattr(assistant_message, 'reasoning_details') and assistant_message.reasoning_details:
            # Pass reasoning_details back unmodified so providers (OpenRouter,
            # Anthropic, OpenAI) can maintain reasoning continuity across turns.
            # Each provider may include opaque fields (signature, encrypted_content)
            # that must be preserved exactly.
            raw_details = assistant_message.reasoning_details
            preserved = []
            for d in raw_details:
                if isinstance(d, dict):
                    preserved.append(d)
                elif hasattr(d, "__dict__"):
                    preserved.append(d.__dict__)
                elif hasattr(d, "model_dump"):
                    preserved.append(d.model_dump())
            if preserved:
                msg["reasoning_details"] = preserved

        # Codex Responses API: preserve encrypted reasoning items for
        # multi-turn continuity. These get replayed as input on the next turn.
        codex_items = getattr(assistant_message, "codex_reasoning_items", None)
        if codex_items:
            msg["codex_reasoning_items"] = codex_items

        if assistant_message.tool_calls:
            tool_calls = []
            for tool_call in assistant_message.tool_calls:
                raw_id = getattr(tool_call, "id", None)
                call_id = getattr(tool_call, "call_id", None)
                if not isinstance(call_id, str) or not call_id.strip():
                    embedded_call_id, _ = self._split_responses_tool_id(raw_id)
                    call_id = embedded_call_id
                if not isinstance(call_id, str) or not call_id.strip():
                    if isinstance(raw_id, str) and raw_id.strip():
                        call_id = raw_id.strip()
                    else:
                        call_id = f"call_{uuid.uuid4().hex[:12]}"
                call_id = call_id.strip()

                response_item_id = getattr(tool_call, "response_item_id", None)
                if not isinstance(response_item_id, str) or not response_item_id.strip():
                    _, embedded_response_item_id = self._split_responses_tool_id(raw_id)
                    response_item_id = embedded_response_item_id

                response_item_id = self._derive_responses_function_call_id(
                    call_id,
                    response_item_id if isinstance(response_item_id, str) else None,
                )

                tc_dict = {
                    "id": call_id,
                    "call_id": call_id,
                    "response_item_id": response_item_id,
                    "type": tool_call.type,
                    "function": {
                        "name": tool_call.function.name,
                        "arguments": tool_call.function.arguments
                    },
                }
                # Preserve extra_content (e.g. Gemini thought_signature) so it
                # is sent back on subsequent API calls.  Without this, Gemini 3
                # thinking models reject the request with a 400 error.
                extra = getattr(tool_call, "extra_content", None)
                if extra is not None:
                    if hasattr(extra, "model_dump"):
                        extra = extra.model_dump()
                    tc_dict["extra_content"] = extra
                tool_calls.append(tc_dict)
            msg["tool_calls"] = tool_calls

        return msg

    @staticmethod
    def _sanitize_tool_calls_for_strict_api(api_msg: dict) -> dict:
        """Strip Codex Responses API fields from tool_calls for strict providers.

        Providers like Mistral strictly validate the Chat Completions schema
        and reject unknown fields (call_id, response_item_id) with 422.
        These fields are preserved in the internal message history — this
        method only modifies the outgoing API copy.

        Creates new tool_call dicts rather than mutating in-place, so the
        original messages list retains call_id/response_item_id for Codex
        Responses API compatibility (e.g. if the session falls back to a
        Codex provider later).
        """
        tool_calls = api_msg.get("tool_calls")
        if not isinstance(tool_calls, list):
            return api_msg
        _STRIP_KEYS = {"call_id", "response_item_id"}
        api_msg["tool_calls"] = [
            {k: v for k, v in tc.items() if k not in _STRIP_KEYS}
            if isinstance(tc, dict) else tc
            for tc in tool_calls
        ]
        return api_msg

    def flush_memories(self, messages: list = None, min_turns: int = None):
        """Give the model one turn to persist memories before context is lost.

        Called before compression, session reset, or CLI exit. Injects a flush
        message, makes one API call, executes any memory tool calls, then
        strips all flush artifacts from the message list.

        Args:
            messages: The current conversation messages. If None, uses
                      self._session_messages (last run_conversation state).
            min_turns: Minimum user turns required to trigger the flush.
                       None = use config value (flush_min_turns).
                       0 = always flush (used for compression).
        """
        if self._memory_flush_min_turns == 0 and min_turns is None:
            return
        if "memory" not in self.valid_tool_names or not self._memory_store:
            return
        # honcho-only agent mode: skip local MEMORY.md flush
        _hcfg = getattr(self, '_honcho_config', None)
        if _hcfg and _hcfg.peer_memory_mode(_hcfg.ai_peer) == "honcho":
            return
        effective_min = min_turns if min_turns is not None else self._memory_flush_min_turns
        if self._user_turn_count < effective_min:
            return

        if messages is None:
            messages = getattr(self, '_session_messages', None)
        if not messages or len(messages) < 3:
            return

        flush_content = (
            "[System: The session is being compressed. "
            "Please save anything worth remembering to your memories.]"
        )
        _sentinel = f"__flush_{id(self)}_{time.monotonic()}"
        flush_msg = {"role": "user", "content": flush_content, "_flush_sentinel": _sentinel}
        messages.append(flush_msg)

        try:
            # Build API messages for the flush call
            _is_strict_api = "api.mistral.ai" in self.base_url.lower()
            api_messages = []
            for msg in messages:
                api_msg = msg.copy()
                if msg.get("role") == "assistant":
                    reasoning = msg.get("reasoning")
                    if reasoning:
                        api_msg["reasoning_content"] = reasoning
                api_msg.pop("reasoning", None)
                api_msg.pop("finish_reason", None)
                api_msg.pop("_flush_sentinel", None)
                if _is_strict_api:
                    self._sanitize_tool_calls_for_strict_api(api_msg)
                api_messages.append(api_msg)

            if self._cached_system_prompt:
                api_messages = [{"role": "system", "content": self._cached_system_prompt}] + api_messages

            # Make one API call with only the memory tool available
            memory_tool_def = None
            for t in (self.tools or []):
                if t.get("function", {}).get("name") == "memory":
                    memory_tool_def = t
                    break

            if not memory_tool_def:
                messages.pop()  # remove flush msg
                return

            # Use auxiliary client for the flush call when available --
            # it's cheaper and avoids Codex Responses API incompatibility.
            from agent.auxiliary_client import call_llm as _call_llm
            _aux_available = True
            try:
                response = _call_llm(
                    task="flush_memories",
                    messages=api_messages,
                    tools=[memory_tool_def],
                    temperature=0.3,
                    max_tokens=5120,
                    timeout=30.0,
                )
            except RuntimeError:
                _aux_available = False
                response = None

            if not _aux_available and self.api_mode == "codex_responses":
                # No auxiliary client -- use the Codex Responses path directly
                codex_kwargs = self._build_api_kwargs(api_messages)
                codex_kwargs["tools"] = self._responses_tools([memory_tool_def])
                codex_kwargs["temperature"] = 0.3
                if "max_output_tokens" in codex_kwargs:
                    codex_kwargs["max_output_tokens"] = 5120
                response = self._run_codex_stream(codex_kwargs)
            elif not _aux_available and self.api_mode == "anthropic_messages":
                # Native Anthropic — use the Anthropic client directly
                from agent.anthropic_adapter import build_anthropic_kwargs as _build_ant_kwargs
                ant_kwargs = _build_ant_kwargs(
                    model=self.model, messages=api_messages,
                    tools=[memory_tool_def], max_tokens=5120,
                    reasoning_config=None,
                )
                response = self._anthropic_messages_create(ant_kwargs)
            elif not _aux_available:
                api_kwargs = {
                    "model": self.model,
                    "messages": api_messages,
                    "tools": [memory_tool_def],
                    "temperature": 0.3,
                    **self._max_tokens_param(5120),
                }
                response = self._ensure_primary_openai_client(reason="flush_memories").chat.completions.create(**api_kwargs, timeout=30.0)

            # Extract tool calls from the response, handling all API formats
            tool_calls = []
            if self.api_mode == "codex_responses" and not _aux_available:
                assistant_msg, _ = self._normalize_codex_response(response)
                if assistant_msg and assistant_msg.tool_calls:
                    tool_calls = assistant_msg.tool_calls
            elif self.api_mode == "anthropic_messages" and not _aux_available:
                from agent.anthropic_adapter import normalize_anthropic_response as _nar_flush
                _flush_msg, _ = _nar_flush(response)
                if _flush_msg and _flush_msg.tool_calls:
                    tool_calls = _flush_msg.tool_calls
            elif hasattr(response, "choices") and response.choices:
                assistant_message = response.choices[0].message
                if assistant_message.tool_calls:
                    tool_calls = assistant_message.tool_calls

            for tc in tool_calls:
                if tc.function.name == "memory":
                    try:
                        args = json.loads(tc.function.arguments)
                        flush_target = args.get("target", "memory")
                        from tools.memory_tool import memory_tool as _memory_tool
                        result = _memory_tool(
                            action=args.get("action"),
                            target=flush_target,
                            content=args.get("content"),
                            old_text=args.get("old_text"),
                            store=self._memory_store,
                        )
                        if self._honcho and flush_target == "user" and args.get("action") == "add":
                            self._honcho_save_user_observation(args.get("content", ""))
                        if not self.quiet_mode:
                            print(f"  🧠 Memory flush: saved to {args.get('target', 'memory')}")
                    except Exception as e:
                        logger.debug("Memory flush tool call failed: %s", e)
        except Exception as e:
            logger.debug("Memory flush API call failed: %s", e)
        finally:
            # Strip flush artifacts: remove everything from the flush message onward.
            # Use sentinel marker instead of identity check for robustness.
            while messages and messages[-1].get("_flush_sentinel") != _sentinel:
                messages.pop()
                if not messages:
                    break
            if messages and messages[-1].get("_flush_sentinel") == _sentinel:
                messages.pop()

    def _compress_context(self, messages: list, system_message: str, *, approx_tokens: int = None, task_id: str = "default") -> tuple:
        """Compress conversation context and split the session in SQLite.

        Returns:
            (compressed_messages, new_system_prompt) tuple
        """
        # Pre-compression memory flush: let the model save memories before they're lost
        self.flush_memories(messages, min_turns=0)

        compressed = self.context_compressor.compress(messages, current_tokens=approx_tokens)

        todo_snapshot = self._todo_store.format_for_injection()
        if todo_snapshot:
            compressed.append({"role": "user", "content": todo_snapshot})

        # Preserve file-read history so the model doesn't re-read files
        # it already examined before compression.
        try:
            from tools.file_tools import get_read_files_summary
            read_files = get_read_files_summary(task_id)
            if read_files:
                file_list = "\n".join(
                    f"  - {f['path']} ({', '.join(f['regions'])})"
                    for f in read_files
                )
                compressed.append({"role": "user", "content": (
                    "[Files already read in this session — do NOT re-read these]\n"
                    f"{file_list}\n"
                    "Use the information from the context summary above. "
                    "Proceed with writing, editing, or responding."
                )})
        except Exception:
            pass  # Don't break compression if file tracking fails

        self._invalidate_system_prompt()
        new_system_prompt = self._build_system_prompt(system_message)
        self._cached_system_prompt = new_system_prompt

        if self._session_db:
            try:
                # Propagate title to the new session with auto-numbering
                old_title = self._session_db.get_session_title(self.session_id)
                self._session_db.end_session(self.session_id, "compression")
                old_session_id = self.session_id
                self.session_id = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:6]}"
                self._session_db.create_session(
                    session_id=self.session_id,
                    source=self.platform or "cli",
                    model=self.model,
                    parent_session_id=old_session_id,
                )
                # Auto-number the title for the continuation session
                if old_title:
                    try:
                        new_title = self._session_db.get_next_title_in_lineage(old_title)
                        self._session_db.set_session_title(self.session_id, new_title)
                    except (ValueError, Exception) as e:
                        logger.debug("Could not propagate title on compression: %s", e)
                self._session_db.update_system_prompt(self.session_id, new_system_prompt)
                # Reset flush cursor — new session starts with no messages written
                self._last_flushed_db_idx = 0
            except Exception as e:
                logger.debug("Session DB compression split failed: %s", e)

        return compressed, new_system_prompt

    def _execute_tool_calls(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
        """Execute tool calls from the assistant message and append results to messages.

        Dispatches to concurrent execution when multiple independent tool calls
        are present, falling back to sequential execution for single calls or
        when interactive tools (e.g. clarify) are in the batch.
        """
        tool_calls = assistant_message.tool_calls

        # Single tool call or interactive tool present → sequential
        if (len(tool_calls) <= 1
                or any(tc.function.name in _NEVER_PARALLEL_TOOLS for tc in tool_calls)):
            return self._execute_tool_calls_sequential(
                assistant_message, messages, effective_task_id, api_call_count
            )

        # Multiple non-interactive tools → concurrent
        return self._execute_tool_calls_concurrent(
            assistant_message, messages, effective_task_id, api_call_count
        )

    def _invoke_tool(self, function_name: str, function_args: dict, effective_task_id: str) -> str:
        """Invoke a single tool and return the result string. No display logic.

        Handles both agent-level tools (todo, memory, etc.) and registry-dispatched
        tools. Used by the concurrent execution path; the sequential path retains
        its own inline invocation for backward-compatible display handling.
        """
        if function_name == "todo":
            from tools.todo_tool import todo_tool as _todo_tool
            return _todo_tool(
                todos=function_args.get("todos"),
                merge=function_args.get("merge", False),
                store=self._todo_store,
            )
        elif function_name == "session_search":
            if not self._session_db:
                return json.dumps({"success": False, "error": "Session database not available."})
            from tools.session_search_tool import session_search as _session_search
            return _session_search(
                query=function_args.get("query", ""),
                role_filter=function_args.get("role_filter"),
                limit=function_args.get("limit", 3),
                db=self._session_db,
                current_session_id=self.session_id,
            )
        elif function_name == "memory":
            target = function_args.get("target", "memory")
            from tools.memory_tool import memory_tool as _memory_tool
            result = _memory_tool(
                action=function_args.get("action"),
                target=target,
                content=function_args.get("content"),
                old_text=function_args.get("old_text"),
                store=self._memory_store,
            )
            # Also send user observations to Honcho when active
            if self._honcho and target == "user" and function_args.get("action") == "add":
                self._honcho_save_user_observation(function_args.get("content", ""))
            return result
        elif function_name == "clarify":
            from tools.clarify_tool import clarify_tool as _clarify_tool
            return _clarify_tool(
                question=function_args.get("question", ""),
                choices=function_args.get("choices"),
                callback=self.clarify_callback,
            )
        elif function_name == "delegate_task":
            from tools.delegate_tool import delegate_task as _delegate_task
            return _delegate_task(
                goal=function_args.get("goal"),
                context=function_args.get("context"),
                toolsets=function_args.get("toolsets"),
                tasks=function_args.get("tasks"),
                max_iterations=function_args.get("max_iterations"),
                parent_agent=self,
            )
        else:
            return handle_function_call(
                function_name, function_args, effective_task_id,
                enabled_tools=list(self.valid_tool_names) if self.valid_tool_names else None,
            )

    def _execute_tool_calls_concurrent(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
        """Execute multiple tool calls concurrently using a thread pool.

        Results are collected in the original tool-call order and appended to
        messages so the API sees them in the expected sequence.
        """
        tool_calls = assistant_message.tool_calls
        num_tools = len(tool_calls)

        # ── Pre-flight: interrupt check ──────────────────────────────────
        if self._interrupt_requested:
            print(f"{self.log_prefix}⚡ Interrupt: skipping {num_tools} tool call(s)")
            for tc in tool_calls:
                messages.append({
                    "role": "tool",
                    "content": f"[Tool execution cancelled — {tc.function.name} was skipped due to user interrupt]",
                    "tool_call_id": tc.id,
                })
            return

        # ── Parse args + pre-execution bookkeeping ───────────────────────
        parsed_calls = []  # list of (tool_call, function_name, function_args)
        for tool_call in tool_calls:
            function_name = tool_call.function.name

            # Reset nudge counters
            if function_name == "memory":
                self._turns_since_memory = 0
            elif function_name == "skill_manage":
                self._iters_since_skill = 0

            try:
                function_args = json.loads(tool_call.function.arguments)
            except json.JSONDecodeError:
                function_args = {}
            if not isinstance(function_args, dict):
                function_args = {}

            # Checkpoint for file-mutating tools
            if function_name in ("write_file", "patch") and self._checkpoint_mgr.enabled:
                try:
                    file_path = function_args.get("path", "")
                    if file_path:
                        work_dir = self._checkpoint_mgr.get_working_dir_for_path(file_path)
                        self._checkpoint_mgr.ensure_checkpoint(work_dir, f"before {function_name}")
                except Exception:
                    pass

            parsed_calls.append((tool_call, function_name, function_args))

        # ── Logging / callbacks ──────────────────────────────────────────
        tool_names_str = ", ".join(name for _, name, _ in parsed_calls)
        if not self.quiet_mode:
            print(f"  ⚡ Concurrent: {num_tools} tool calls — {tool_names_str}")
            for i, (tc, name, args) in enumerate(parsed_calls, 1):
                args_str = json.dumps(args, ensure_ascii=False)
                if self.verbose_logging:
                    print(f"  📞 Tool {i}: {name}({list(args.keys())})")
                    print(f"     Args: {args_str}")
                else:
                    args_preview = args_str[:self.log_prefix_chars] + "..." if len(args_str) > self.log_prefix_chars else args_str
                    print(f"  📞 Tool {i}: {name}({list(args.keys())}) - {args_preview}")

        for _, name, args in parsed_calls:
            if self.tool_progress_callback:
                try:
                    preview = _build_tool_preview(name, args)
                    self.tool_progress_callback(name, preview, args)
                except Exception as cb_err:
                    logging.debug(f"Tool progress callback error: {cb_err}")

        # ── Concurrent execution ─────────────────────────────────────────
        # Each slot holds (function_name, function_args, function_result, duration, error_flag)
        results = [None] * num_tools

        def _run_tool(index, tool_call, function_name, function_args):
            """Worker function executed in a thread."""
            start = time.time()
            try:
                result = self._invoke_tool(function_name, function_args, effective_task_id)
            except Exception as tool_error:
                result = f"Error executing tool '{function_name}': {tool_error}"
                logger.error("_invoke_tool raised for %s: %s", function_name, tool_error, exc_info=True)
            duration = time.time() - start
            is_error, _ = _detect_tool_failure(function_name, result)
            results[index] = (function_name, function_args, result, duration, is_error)

        # Start spinner for CLI mode
        spinner = None
        if self.quiet_mode:
            face = random.choice(KawaiiSpinner.KAWAII_WAITING)
            spinner = KawaiiSpinner(f"{face} ⚡ running {num_tools} tools concurrently", spinner_type='dots')
            spinner.start()

        try:
            max_workers = min(num_tools, _MAX_TOOL_WORKERS)
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = []
                for i, (tc, name, args) in enumerate(parsed_calls):
                    f = executor.submit(_run_tool, i, tc, name, args)
                    futures.append(f)

                # Wait for all to complete (exceptions are captured inside _run_tool)
                concurrent.futures.wait(futures)
        finally:
            if spinner:
                # Build a summary message for the spinner stop
                completed = sum(1 for r in results if r is not None)
                total_dur = sum(r[3] for r in results if r is not None)
                spinner.stop(f"⚡ {completed}/{num_tools} tools completed in {total_dur:.1f}s total")

        # ── Post-execution: display per-tool results ─────────────────────
        for i, (tc, name, args) in enumerate(parsed_calls):
            r = results[i]
            if r is None:
                # Shouldn't happen, but safety fallback
                function_result = f"Error executing tool '{name}': thread did not return a result"
                tool_duration = 0.0
            else:
                function_name, function_args, function_result, tool_duration, is_error = r

                if is_error:
                    result_preview = function_result[:200] if len(function_result) > 200 else function_result
                    logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)

                if self.verbose_logging:
                    logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s")
                    logging.debug(f"Tool result ({len(function_result)} chars): {function_result}")

            # Print cute message per tool
            if self.quiet_mode:
                cute_msg = _get_cute_tool_message_impl(name, args, tool_duration, result=function_result)
                print(f"  {cute_msg}")
            elif not self.quiet_mode:
                if self.verbose_logging:
                    print(f"  ✅ Tool {i+1} completed in {tool_duration:.2f}s")
                    print(f"     Result: {function_result}")
                else:
                    response_preview = function_result[:self.log_prefix_chars] + "..." if len(function_result) > self.log_prefix_chars else function_result
                    print(f"  ✅ Tool {i+1} completed in {tool_duration:.2f}s - {response_preview}")

            # Truncate oversized results
            MAX_TOOL_RESULT_CHARS = 100_000
            if len(function_result) > MAX_TOOL_RESULT_CHARS:
                original_len = len(function_result)
                function_result = (
                    function_result[:MAX_TOOL_RESULT_CHARS]
                    + f"\n\n[Truncated: tool response was {original_len:,} chars, "
                    f"exceeding the {MAX_TOOL_RESULT_CHARS:,} char limit]"
                )

            # Append tool result message in order
            tool_msg = {
                "role": "tool",
                "content": function_result,
                "tool_call_id": tc.id,
            }
            messages.append(tool_msg)

        # ── Budget pressure injection ────────────────────────────────────
        budget_warning = self._get_budget_warning(api_call_count)
        if budget_warning and messages and messages[-1].get("role") == "tool":
            last_content = messages[-1]["content"]
            try:
                parsed = json.loads(last_content)
                if isinstance(parsed, dict):
                    parsed["_budget_warning"] = budget_warning
                    messages[-1]["content"] = json.dumps(parsed, ensure_ascii=False)
                else:
                    messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
            except (json.JSONDecodeError, TypeError):
                messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
            if not self.quiet_mode:
                remaining = self.max_iterations - api_call_count
                tier = "⚠️  WARNING" if remaining <= self.max_iterations * 0.1 else "💡 CAUTION"
                print(f"{self.log_prefix}{tier}: {remaining} iterations remaining")

    def _execute_tool_calls_sequential(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
        """Execute tool calls sequentially (original behavior). Used for single calls or interactive tools.

        For each call in ``assistant_message.tool_calls``, in order: check for
        a pending interrupt, parse the JSON arguments, fire the progress
        callback, take a checkpoint before file-mutating tools, run the tool
        (agent-level tools inline, everything else via
        ``handle_function_call``), truncate oversized output, and append a
        ``tool`` message to ``messages``. After the loop, a budget warning
        (if any) is merged into the last tool message.

        Args:
            assistant_message: Object whose ``tool_calls`` are executed.
            messages: Conversation list; tool result messages are appended
                in place.
            effective_task_id: Passed through to ``handle_function_call``.
            api_call_count: Passed to ``_get_budget_warning`` and used to
                compute the remaining iterations shown in the console.
        """
        for i, tool_call in enumerate(assistant_message.tool_calls, 1):
            # SAFETY: check interrupt BEFORE starting each tool.
            # If the user sent "stop" during a previous tool's execution,
            # do NOT start any more tools -- skip them all immediately.
            if self._interrupt_requested:
                # i is 1-based, so this slice starts at the current call.
                remaining_calls = assistant_message.tool_calls[i-1:]
                if remaining_calls:
                    self._vprint(f"{self.log_prefix}⚡ Interrupt: skipping {len(remaining_calls)} tool call(s)", force=True)
                # Every skipped call still gets a tool message carrying its tool_call_id.
                for skipped_tc in remaining_calls:
                    skipped_name = skipped_tc.function.name
                    skip_msg = {
                        "role": "tool",
                        "content": f"[Tool execution cancelled — {skipped_name} was skipped due to user interrupt]",
                        "tool_call_id": skipped_tc.id,
                    }
                    messages.append(skip_msg)
                break

            function_name = tool_call.function.name

            # Reset nudge counters when the relevant tool is actually used
            if function_name == "memory":
                self._turns_since_memory = 0
            elif function_name == "skill_manage":
                self._iters_since_skill = 0

            # Arguments that fail to parse, or that are not a JSON object,
            # fall back to an empty dict.
            try:
                function_args = json.loads(tool_call.function.arguments)
            except json.JSONDecodeError as e:
                logging.warning(f"Unexpected JSON error after validation: {e}")
                function_args = {}
            if not isinstance(function_args, dict):
                function_args = {}

            # Console announcement of the call (full args when verbose,
            # otherwise a preview cut at log_prefix_chars).
            if not self.quiet_mode:
                args_str = json.dumps(function_args, ensure_ascii=False)
                if self.verbose_logging:
                    print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())})")
                    print(f"     Args: {args_str}")
                else:
                    args_preview = args_str[:self.log_prefix_chars] + "..." if len(args_str) > self.log_prefix_chars else args_str
                    print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())}) - {args_preview}")

            # Callback errors are logged and never stop the tool from running.
            if self.tool_progress_callback:
                try:
                    preview = _build_tool_preview(function_name, function_args)
                    self.tool_progress_callback(function_name, preview, function_args)
                except Exception as cb_err:
                    logging.debug(f"Tool progress callback error: {cb_err}")

            # Checkpoint: snapshot working dir before file-mutating tools
            if function_name in ("write_file", "patch") and self._checkpoint_mgr.enabled:
                try:
                    file_path = function_args.get("path", "")
                    if file_path:
                        work_dir = self._checkpoint_mgr.get_working_dir_for_path(file_path)
                        self._checkpoint_mgr.ensure_checkpoint(
                            work_dir, f"before {function_name}"
                        )
                except Exception:
                    pass  # never block tool execution

            tool_start_time = time.time()

            # ── Dispatch ──────────────────────────────────────────────
            # Each branch must set both function_result and tool_duration.
            # NOTE: the agent-level branches (todo, session_search, memory,
            # clarify, delegate_task) have no except clause, so an exception
            # raised by those tools propagates out of this method.
            if function_name == "todo":
                from tools.todo_tool import todo_tool as _todo_tool
                function_result = _todo_tool(
                    todos=function_args.get("todos"),
                    merge=function_args.get("merge", False),
                    store=self._todo_store,
                )
                tool_duration = time.time() - tool_start_time
                if self.quiet_mode:
                    self._vprint(f"  {_get_cute_tool_message_impl('todo', function_args, tool_duration, result=function_result)}")
            elif function_name == "session_search":
                # Without a session DB the tool is answered with a JSON error.
                if not self._session_db:
                    function_result = json.dumps({"success": False, "error": "Session database not available."})
                else:
                    from tools.session_search_tool import session_search as _session_search
                    function_result = _session_search(
                        query=function_args.get("query", ""),
                        role_filter=function_args.get("role_filter"),
                        limit=function_args.get("limit", 3),
                        db=self._session_db,
                        current_session_id=self.session_id,
                    )
                tool_duration = time.time() - tool_start_time
                if self.quiet_mode:
                    self._vprint(f"  {_get_cute_tool_message_impl('session_search', function_args, tool_duration, result=function_result)}")
            elif function_name == "memory":
                target = function_args.get("target", "memory")
                from tools.memory_tool import memory_tool as _memory_tool
                function_result = _memory_tool(
                    action=function_args.get("action"),
                    target=target,
                    content=function_args.get("content"),
                    old_text=function_args.get("old_text"),
                    store=self._memory_store,
                )
                # Also send user observations to Honcho when active
                if self._honcho and target == "user" and function_args.get("action") == "add":
                    self._honcho_save_user_observation(function_args.get("content", ""))
                tool_duration = time.time() - tool_start_time
                if self.quiet_mode:
                    self._vprint(f"  {_get_cute_tool_message_impl('memory', function_args, tool_duration, result=function_result)}")
            elif function_name == "clarify":
                from tools.clarify_tool import clarify_tool as _clarify_tool
                function_result = _clarify_tool(
                    question=function_args.get("question", ""),
                    choices=function_args.get("choices"),
                    callback=self.clarify_callback,
                )
                tool_duration = time.time() - tool_start_time
                if self.quiet_mode:
                    self._vprint(f"  {_get_cute_tool_message_impl('clarify', function_args, tool_duration, result=function_result)}")
            elif function_name == "delegate_task":
                from tools.delegate_tool import delegate_task as _delegate_task
                # Spinner label: task count for a batch, otherwise the first
                # 30 characters of the goal.
                tasks_arg = function_args.get("tasks")
                if tasks_arg and isinstance(tasks_arg, list):
                    spinner_label = f"🔀 delegating {len(tasks_arg)} tasks"
                else:
                    goal_preview = (function_args.get("goal") or "")[:30]
                    spinner_label = f"🔀 {goal_preview}" if goal_preview else "🔀 delegating"
                spinner = None
                if self.quiet_mode:
                    face = random.choice(KawaiiSpinner.KAWAII_WAITING)
                    spinner = KawaiiSpinner(f"{face} {spinner_label}", spinner_type='dots')
                    spinner.start()
                # The spinner is published on self while the delegation runs
                # and cleared again in the finally block below.
                self._delegate_spinner = spinner
                # Stays None if _delegate_task raises, so the cute message
                # is then built with result=None.
                _delegate_result = None
                try:
                    function_result = _delegate_task(
                        goal=function_args.get("goal"),
                        context=function_args.get("context"),
                        toolsets=function_args.get("toolsets"),
                        tasks=tasks_arg,
                        max_iterations=function_args.get("max_iterations"),
                        parent_agent=self,
                    )
                    _delegate_result = function_result
                finally:
                    # Runs on success and on failure: clear the shared
                    # spinner reference and stop the spinner.
                    self._delegate_spinner = None
                    tool_duration = time.time() - tool_start_time
                    cute_msg = _get_cute_tool_message_impl('delegate_task', function_args, tool_duration, result=_delegate_result)
                    if spinner:
                        spinner.stop(cute_msg)
                    elif self.quiet_mode:
                        self._vprint(f"  {cute_msg}")
            elif self.quiet_mode and self._stream_callback is None:
                # Registry tool in quiet mode with no stream callback set:
                # run it behind a spinner.
                face = random.choice(KawaiiSpinner.KAWAII_WAITING)
                emoji = _get_tool_emoji(function_name)
                preview = _build_tool_preview(function_name, function_args) or function_name
                if len(preview) > 30:
                    preview = preview[:27] + "..."
                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots')
                spinner.start()
                # Stays None if the tool raises, so the cute message is then
                # built with result=None rather than the error string.
                _spinner_result = None
                try:
                    function_result = handle_function_call(
                        function_name, function_args, effective_task_id,
                        enabled_tools=list(self.valid_tool_names) if self.valid_tool_names else None,
                    )
                    _spinner_result = function_result
                except Exception as tool_error:
                    # The error text becomes the tool result instead of propagating.
                    function_result = f"Error executing tool '{function_name}': {tool_error}"
                    logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True)
                finally:
                    tool_duration = time.time() - tool_start_time
                    cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_spinner_result)
                    spinner.stop(cute_msg)
            else:
                # Registry tool without a spinner; no cute message is
                # printed in this branch.
                try:
                    function_result = handle_function_call(
                        function_name, function_args, effective_task_id,
                        enabled_tools=list(self.valid_tool_names) if self.valid_tool_names else None,
                    )
                except Exception as tool_error:
                    function_result = f"Error executing tool '{function_name}': {tool_error}"
                    logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True)
                tool_duration = time.time() - tool_start_time

            # Full result when verbose, otherwise at most the first 200 characters.
            result_preview = function_result if self.verbose_logging else (
                function_result[:200] if len(function_result) > 200 else function_result
            )

            # Log tool errors to the persistent error log so [error] tags
            # in the UI always have a corresponding detailed entry on disk.
            _is_error_result, _ = _detect_tool_failure(function_name, function_result)
            if _is_error_result:
                logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)

            if self.verbose_logging:
                logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s")
                logging.debug(f"Tool result ({len(function_result)} chars): {function_result}")

            # Guard against tools returning absurdly large content that would
            # blow up the context window. 100K chars ≈ 25K tokens — generous
            # enough for any reasonable tool output but prevents catastrophic
            # context explosions (e.g. accidental base64 image dumps).
            MAX_TOOL_RESULT_CHARS = 100_000
            if len(function_result) > MAX_TOOL_RESULT_CHARS:
                original_len = len(function_result)
                function_result = (
                    function_result[:MAX_TOOL_RESULT_CHARS]
                    + f"\n\n[Truncated: tool response was {original_len:,} chars, "
                    f"exceeding the {MAX_TOOL_RESULT_CHARS:,} char limit]"
                )

            # The (possibly truncated) result is what enters the conversation.
            tool_msg = {
                "role": "tool",
                "content": function_result,
                "tool_call_id": tool_call.id
            }
            messages.append(tool_msg)

            # Console completion line; printed after truncation, so it shows
            # the truncated text.
            if not self.quiet_mode:
                if self.verbose_logging:
                    print(f"  ✅ Tool {i} completed in {tool_duration:.2f}s")
                    print(f"     Result: {function_result}")
                else:
                    response_preview = function_result[:self.log_prefix_chars] + "..." if len(function_result) > self.log_prefix_chars else function_result
                    print(f"  ✅ Tool {i} completed in {tool_duration:.2f}s - {response_preview}")

            # Interrupt flagged once this tool has finished: the calls after
            # it are not started, and each gets a "skipped" tool message.
            if self._interrupt_requested and i < len(assistant_message.tool_calls):
                remaining = len(assistant_message.tool_calls) - i
                self._vprint(f"{self.log_prefix}⚡ Interrupt: skipping {remaining} remaining tool call(s)", force=True)
                # i is 1-based, so this slice starts at the call after the current one.
                for skipped_tc in assistant_message.tool_calls[i:]:
                    skipped_name = skipped_tc.function.name
                    skip_msg = {
                        "role": "tool",
                        "content": f"[Tool execution skipped — {skipped_name} was not started. User sent a new message]",
                        "tool_call_id": skipped_tc.id
                    }
                    messages.append(skip_msg)
                break

            # Optional pause between consecutive tools (not after the last one).
            if self.tool_delay > 0 and i < len(assistant_message.tool_calls):
                time.sleep(self.tool_delay)

        # ── Budget pressure injection ─────────────────────────────────
        # After all tool calls in this turn are processed, check if we're
        # approaching max_iterations. If so, inject a warning into the LAST
        # tool result's JSON so the LLM sees it naturally when reading results.
        budget_warning = self._get_budget_warning(api_call_count)
        if budget_warning and messages and messages[-1].get("role") == "tool":
            last_content = messages[-1]["content"]
            try:
                parsed = json.loads(last_content)
                if isinstance(parsed, dict):
                    # JSON-object result: add the warning as an extra key.
                    parsed["_budget_warning"] = budget_warning
                    messages[-1]["content"] = json.dumps(parsed, ensure_ascii=False)
                else:
                    # Valid JSON but not an object: append as plain text.
                    messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
            except (json.JSONDecodeError, TypeError):
                # Not JSON at all: append as plain text.
                messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
            if not self.quiet_mode:
                remaining = self.max_iterations - api_call_count
                # "WARNING" when 10% or less of max_iterations remains, "CAUTION" otherwise.
                tier = "⚠️  WARNING" if remaining <= self.max_iterations * 0.1 else "💡 CAUTION"
                print(f"{self.log_prefix}{tier}: {remaining} iterations remaining")

    def _get_budget_warning(self, api_call_count: int) -> Optional[str]:
        """Return a budget pressure string, or None if not yet needed.

        Two-tier system:
          - Caution (70%): nudge to consolidate work
          - Warning (90%): urgent, must respond now
        """
        if not self._budget_pressure_enabled or self.max_iterations <= 0:
            return None
        progress = api_call_count / self.max_iterations
        remaining = self.max_iterations - api_call_count
        if progress >= self._budget_warning_threshold:
            return (
                f"[BUDGET WARNING: Iteration {api_call_count}/{self.max_iterations}. "
                f"Only {remaining} iteration(s) left. "
                "Provide your final response NOW. No more tool calls unless absolutely critical.]"
            )
        if progress >= self._budget_caution_threshold:
            return (
                f"[BUDGET: Iteration {api_call_count}/{self.max_iterations}. "
                f"{remaining} iterations left. Start consolidating your work.]"
            )
        return None

    def _handle_max_iterations(self, messages: list, api_call_count: int) -> str:
        """Request a summary when max iterations are reached. Returns the final response text."""
        print(f"⚠️  Reached maximum iterations ({self.max_iterations}). Requesting summary...")

        summary_request = (
            "You've reached the maximum number of tool-calling iterations allowed. "
            "Please provide a final response summarizing what you've found and accomplished so far, "
            "without calling any more tools."
        )
        messages.append({"role": "user", "content": summary_request})

        try:
            # Build API messages, stripping internal-only fields
            # (finish_reason, reasoning) that strict APIs like Mistral reject with 422
            _is_strict_api = "api.mistral.ai" in self.base_url.lower()
            api_messages = []
            for msg in messages:
                api_msg = msg.copy()
                for internal_field in ("reasoning", "finish_reason"):
                    api_msg.pop(internal_field, None)
                if _is_strict_api:
                    self._sanitize_tool_calls_for_strict_api(api_msg)
                api_messages.append(api_msg)

            effective_system = self._cached_system_prompt or ""
            if self.ephemeral_system_prompt:
                effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
            if effective_system:
                api_messages = [{"role": "system", "content": effective_system}] + api_messages
            if self.prefill_messages:
                sys_offset = 1 if effective_system else 0
                for idx, pfm in enumerate(self.prefill_messages):
                    api_messages.insert(sys_offset + idx, pfm.copy())

            summary_extra_body = {}
            _is_openrouter = "openrouter" in self.base_url.lower()
            _is_nous = "nousresearch" in self.base_url.lower()
            if _is_openrouter or _is_nous:
                if self.reasoning_config is not None:
                    summary_extra_body["reasoning"] = self.reasoning_config
                else:
                    summary_extra_body["reasoning"] = {
                        "enabled": True,
                        "effort": "medium"
                    }
            if _is_nous:
                summary_extra_body["tags"] = ["product=hermes-agent"]

            if self.api_mode == "codex_responses":
                codex_kwargs = self._build_api_kwargs(api_messages)
                codex_kwargs.pop("tools", None)
                summary_response = self._run_codex_stream(codex_kwargs)
                assistant_message, _ = self._normalize_codex_response(summary_response)
                final_response = (assistant_message.content or "").strip() if assistant_message else ""
            else:
                summary_kwargs = {
                    "model": self.model,
                    "messages": api_messages,
                }
                if self.max_tokens is not None:
                    summary_kwargs.update(self._max_tokens_param(self.max_tokens))

                # Include provider routing preferences
                provider_preferences = {}
                if self.providers_allowed:
                    provider_preferences["only"] = self.providers_allowed
                if self.providers_ignored:
                    provider_preferences["ignore"] = self.providers_ignored
                if self.providers_order:
                    provider_preferences["order"] = self.providers_order
                if self.provider_sort:
                    provider_preferences["sort"] = self.provider_sort
                if provider_preferences:
                    summary_extra_body["provider"] = provider_preferences

                if summary_extra_body:
                    summary_kwargs["extra_body"] = summary_extra_body

                if self.api_mode == "anthropic_messages":
                    from agent.anthropic_adapter import build_anthropic_kwargs as _bak, normalize_anthropic_response as _nar
                    _ant_kw = _bak(model=self.model, messages=api_messages, tools=None,
                                   max_tokens=self.max_tokens, reasoning_config=self.reasoning_config)
                    summary_response = self._anthropic_messages_create(_ant_kw)
                    _msg, _ = _nar(summary_response)
                    final_response = (_msg.content or "").strip()
                else:
                    summary_response = self._ensure_primary_openai_client(reason="iteration_limit_summary").chat.completions.create(**summary_kwargs)

                    if summary_response.choices and summary_response.choices[0].message.content:
                        final_response = summary_response.choices[0].message.content
                    else:
                        final_response = ""

            if final_response:
                if "<think>" in final_response:
                    final_response = re.sub(r'<think>.*?</think>\s*', '', final_response, flags=re.DOTALL).strip()
                if final_response:
                    messages.append({"role": "assistant", "content": final_response})
                else:
                    final_response = "I reached the iteration limit and couldn't generate a summary."
            else:
                # Retry summary generation
                if self.api_mode == "codex_responses":
                    codex_kwargs = self._build_api_kwargs(api_messages)
                    codex_kwargs.pop("tools", None)
                    retry_response = self._run_codex_stream(codex_kwargs)
                    retry_msg, _ = self._normalize_codex_response(retry_response)
                    final_response = (retry_msg.content or "").strip() if retry_msg else ""
                elif self.api_mode == "anthropic_messages":
                    from agent.anthropic_adapter import build_anthropic_kwargs as _bak2, normalize_anthropic_response as _nar2
                    _ant_kw2 = _bak2(model=self.model, messages=api_messages, tools=None,
                                     max_tokens=self.max_tokens, reasoning_config=self.reasoning_config)
                    retry_response = self._anthropic_messages_create(_ant_kw2)
                    _retry_msg, _ = _nar2(retry_response)
                    final_response = (_retry_msg.content or "").strip()
                else:
                    summary_kwargs = {
                        "model": self.model,
                        "messages": api_messages,
                    }
                    if self.max_tokens is not None:
                        summary_kwargs.update(self._max_tokens_param(self.max_tokens))
                    if summary_extra_body:
                        summary_kwargs["extra_body"] = summary_extra_body

                    summary_response = self._ensure_primary_openai_client(reason="iteration_limit_summary_retry").chat.completions.create(**summary_kwargs)

                    if summary_response.choices and summary_response.choices[0].message.content:
                        final_response = summary_response.choices[0].message.content
                    else:
                        final_response = ""

                if final_response:
                    if "<think>" in final_response:
                        final_response = re.sub(r'<think>.*?</think>\s*', '', final_response, flags=re.DOTALL).strip()
                    if final_response:
                        messages.append({"role": "assistant", "content": final_response})
                    else:
                        final_response = "I reached the iteration limit and couldn't generate a summary."
                else:
                    final_response = "I reached the iteration limit and couldn't generate a summary."

        except Exception as e:
            logging.warning(f"Failed to get summary response: {e}")
            final_response = f"I reached the maximum iterations ({self.max_iterations}) but couldn't summarize. Error: {str(e)}"

        return final_response

    def run_conversation(
        self,
        user_message: str,
        system_message: str = None,
        conversation_history: List[Dict[str, Any]] = None,
        task_id: str = None,
        stream_callback: Optional[callable] = None,
        persist_user_message: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Run a complete conversation with tool calling until completion.

        Args:
            user_message (str): The user's message/question
            system_message (str): Custom system message (optional, overrides ephemeral_system_prompt if provided)
            conversation_history (List[Dict]): Previous conversation messages (optional)
            task_id (str): Unique identifier for this task to isolate VMs between concurrent tasks (optional, auto-generated if not provided)
            stream_callback: Optional callback invoked with each text delta during streaming.
                Used by the TTS pipeline to start audio generation before the full response.
                When None (default), API calls use the standard non-streaming path.
            persist_user_message: Optional clean user message to store in
                transcripts/history when user_message contains API-only
                synthetic prefixes.

        Returns:
            Dict: Complete conversation result with final response and message history
        """
        # Guard stdio against OSError from broken pipes (systemd/headless/daemon).
        # Installed once, transparent when streams are healthy, prevents crash on write.
        _install_safe_stdio()

        # Store stream callback for _interruptible_api_call to pick up
        self._stream_callback = stream_callback
        self._persist_user_message_idx = None
        self._persist_user_message_override = persist_user_message
        # Generate unique task_id if not provided to isolate VMs between concurrent tasks
        effective_task_id = task_id or str(uuid.uuid4())
        
        # Reset retry counters and iteration budget at the start of each turn
        # so subagent usage from a previous turn doesn't eat into the next one.
        self._invalid_tool_retries = 0
        self._invalid_json_retries = 0
        self._empty_content_retries = 0
        self._incomplete_scratchpad_retries = 0
        self._codex_incomplete_retries = 0
        self._last_content_with_tools = None
        self._turns_since_memory = 0
        self._iters_since_skill = 0
        self.iteration_budget = IterationBudget(self.max_iterations)
        
        # Initialize conversation (copy to avoid mutating the caller's list)
        messages = list(conversation_history) if conversation_history else []
        
        # Hydrate todo store from conversation history (gateway creates a fresh
        # AIAgent per message, so the in-memory store is empty -- we need to
        # recover the todo state from the most recent todo tool response in history)
        if conversation_history and not self._todo_store.has_items():
            self._hydrate_todo_store(conversation_history)
        
        # Prefill messages (few-shot priming) are injected at API-call time only,
        # never stored in the messages list. This keeps them ephemeral: they won't
        # be saved to session DB, session logs, or batch trajectories, but they're
        # automatically re-applied on every API call (including session continuations).
        
        # Track user turns for memory flush and periodic nudge logic
        self._user_turn_count += 1

        # Preserve the original user message before nudge injection.
        # Honcho should receive the actual user input, not system nudges.
        original_user_message = persist_user_message if persist_user_message is not None else user_message

        # Periodic memory nudge: remind the model to consider saving memories.
        # Counter resets whenever the memory tool is actually used.
        if (self._memory_nudge_interval > 0
                and "memory" in self.valid_tool_names
                and self._memory_store):
            self._turns_since_memory += 1
            if self._turns_since_memory >= self._memory_nudge_interval:
                user_message += (
                    "\n\n[System: You've had several exchanges in this session. "
                    "Consider whether there's anything worth saving to your memories.]"
                )
                self._turns_since_memory = 0

        # Skill creation nudge: fires on the first user message after a long tool loop.
        # The counter increments per API iteration in the tool loop and is checked here.
        if (self._skill_nudge_interval > 0
                and self._iters_since_skill >= self._skill_nudge_interval
                and "skill_manage" in self.valid_tool_names):
            user_message += (
                "\n\n[System: The previous task involved many steps. "
                "If you discovered a reusable workflow, consider saving it as a skill.]"
            )
            self._iters_since_skill = 0

        # Honcho prefetch consumption:
        # - First turn: bake into cached system prompt (stable for the session).
        # - Later turns: attach recall to the current-turn user message at
        #   API-call time only (never persisted to history / session DB).
        #
        # This keeps the system-prefix cache stable while still allowing turn N
        # to consume background prefetch results from turn N-1.
        self._honcho_context = ""
        self._honcho_turn_context = ""
        _recall_mode = (self._honcho_config.recall_mode if self._honcho_config else "hybrid")
        if self._honcho and self._honcho_session_key and _recall_mode != "tools":
            try:
                prefetched_context = self._honcho_prefetch(original_user_message)
                if prefetched_context:
                    if not conversation_history:
                        self._honcho_context = prefetched_context
                    else:
                        self._honcho_turn_context = prefetched_context
            except Exception as e:
                logger.debug("Honcho prefetch failed (non-fatal): %s", e)

        # Add user message
        user_msg = {"role": "user", "content": user_message}
        messages.append(user_msg)
        current_turn_user_idx = len(messages) - 1
        self._persist_user_message_idx = current_turn_user_idx
        
        if not self.quiet_mode:
            print(f"💬 Starting conversation: '{user_message[:60]}{'...' if len(user_message) > 60 else ''}'")
        
        # ── System prompt (cached per session for prefix caching) ──
        # Built once on first call, reused for all subsequent calls.
        # Only rebuilt after context compression events (which invalidate
        # the cache and reload memory from disk).
        #
        # For continuing sessions (gateway creates a fresh AIAgent per
        # message), we load the stored system prompt from the session DB
        # instead of rebuilding.  Rebuilding would pick up memory changes
        # from disk that the model already knows about (it wrote them!),
        # producing a different system prompt and breaking the Anthropic
        # prefix cache.
        if self._cached_system_prompt is None:
            stored_prompt = None
            if conversation_history and self._session_db:
                try:
                    session_row = self._session_db.get_session(self.session_id)
                    if session_row:
                        stored_prompt = session_row.get("system_prompt") or None
                except Exception:
                    pass  # Fall through to build fresh

            if stored_prompt:
                # Continuing session — reuse the exact system prompt from
                # the previous turn so the Anthropic cache prefix matches.
                self._cached_system_prompt = stored_prompt
            else:
                # First turn of a new session — build from scratch.
                self._cached_system_prompt = self._build_system_prompt(system_message)
                # Bake Honcho context into the prompt so it's stable for
                # the entire session (not re-fetched per turn).
                if self._honcho_context:
                    self._cached_system_prompt = (
                        self._cached_system_prompt + "\n\n" + self._honcho_context
                    ).strip()
                # Store the system prompt snapshot in SQLite
                if self._session_db:
                    try:
                        self._session_db.update_system_prompt(self.session_id, self._cached_system_prompt)
                    except Exception as e:
                        logger.debug("Session DB update_system_prompt failed: %s", e)

        active_system_prompt = self._cached_system_prompt

        # ── Preflight context compression ──
        # Before entering the main loop, check if the loaded conversation
        # history already exceeds the model's context threshold.  This handles
        # cases where a user switches to a model with a smaller context window
        # while having a large existing session — compress proactively rather
        # than waiting for an API error (which might be caught as a non-retryable
        # 4xx and abort the request entirely).
        if (
            self.compression_enabled
            and len(messages) > self.context_compressor.protect_first_n
                                + self.context_compressor.protect_last_n + 1
        ):
            _sys_tok_est = estimate_tokens_rough(active_system_prompt or "")
            _msg_tok_est = estimate_messages_tokens_rough(messages)
            _preflight_tokens = _sys_tok_est + _msg_tok_est

            if _preflight_tokens >= self.context_compressor.threshold_tokens:
                logger.info(
                    "Preflight compression: ~%s tokens >= %s threshold (model %s, ctx %s)",
                    f"{_preflight_tokens:,}",
                    f"{self.context_compressor.threshold_tokens:,}",
                    self.model,
                    f"{self.context_compressor.context_length:,}",
                )
                if not self.quiet_mode:
                    print(
                        f"📦 Preflight compression: ~{_preflight_tokens:,} tokens "
                        f">= {self.context_compressor.threshold_tokens:,} threshold"
                    )
                # May need multiple passes for very large sessions with small
                # context windows (each pass summarises the middle N turns).
                for _pass in range(3):
                    _orig_len = len(messages)
                    messages, active_system_prompt = self._compress_context(
                        messages, system_message, approx_tokens=_preflight_tokens,
                        task_id=effective_task_id,
                    )
                    if len(messages) >= _orig_len:
                        break  # Cannot compress further
                    # Re-estimate after compression
                    _sys_tok_est = estimate_tokens_rough(active_system_prompt or "")
                    _msg_tok_est = estimate_messages_tokens_rough(messages)
                    _preflight_tokens = _sys_tok_est + _msg_tok_est
                    if _preflight_tokens < self.context_compressor.threshold_tokens:
                        break  # Under threshold

        # Main conversation loop
        api_call_count = 0
        final_response = None
        interrupted = False
        codex_ack_continuations = 0
        length_continue_retries = 0
        truncated_response_prefix = ""
        
        # Clear any stale interrupt state at start
        self.clear_interrupt()
        
        while api_call_count < self.max_iterations and self.iteration_budget.remaining > 0:
            # Reset per-turn checkpoint dedup so each iteration can take one snapshot
            self._checkpoint_mgr.new_turn()

            # Check for interrupt request (e.g., user sent new message)
            if self._interrupt_requested:
                interrupted = True
                if not self.quiet_mode:
                    print(f"\n⚡ Breaking out of tool loop due to interrupt...")
                break
            
            api_call_count += 1
            if not self.iteration_budget.consume():
                if not self.quiet_mode:
                    print(f"\n⚠️  Session iteration budget exhausted ({self.iteration_budget.max_total} total across agent + subagents)")
                break

            # Fire step_callback for gateway hooks (agent:step event)
            if self.step_callback is not None:
                try:
                    prev_tools = []
                    for _m in reversed(messages):
                        if _m.get("role") == "assistant" and _m.get("tool_calls"):
                            prev_tools = [
                                tc["function"]["name"]
                                for tc in _m["tool_calls"]
                                if isinstance(tc, dict)
                            ]
                            break
                    self.step_callback(api_call_count, prev_tools)
                except Exception as _step_err:
                    logger.debug("step_callback error (iteration %s): %s", api_call_count, _step_err)

            # Track tool-calling iterations for skill nudge.
            # Counter resets whenever skill_manage is actually used.
            if (self._skill_nudge_interval > 0
                    and "skill_manage" in self.valid_tool_names):
                self._iters_since_skill += 1
            
            # Prepare messages for API call
            # If we have an ephemeral system prompt, prepend it to the messages
            # Note: Reasoning is embedded in content via <think> tags for trajectory storage.
            # However, providers like Moonshot AI require a separate 'reasoning_content' field
            # on assistant messages with tool_calls. We handle both cases here.
            api_messages = []
            for idx, msg in enumerate(messages):
                api_msg = msg.copy()

                if idx == current_turn_user_idx and msg.get("role") == "user" and self._honcho_turn_context:
                    api_msg["content"] = _inject_honcho_turn_context(
                        api_msg.get("content", ""), self._honcho_turn_context
                    )

                # For ALL assistant messages, pass reasoning back to the API
                # This ensures multi-turn reasoning context is preserved
                if msg.get("role") == "assistant":
                    reasoning_text = msg.get("reasoning")
                    if reasoning_text:
                        # Add reasoning_content for API compatibility (Moonshot AI, Novita, OpenRouter)
                        api_msg["reasoning_content"] = reasoning_text

                # Remove 'reasoning' field - it's for trajectory storage only
                # We've copied it to 'reasoning_content' for the API above
                if "reasoning" in api_msg:
                    api_msg.pop("reasoning")
                # Remove finish_reason - not accepted by strict APIs (e.g. Mistral)
                if "finish_reason" in api_msg:
                    api_msg.pop("finish_reason")
                # Strip Codex Responses API fields (call_id, response_item_id) for
                # strict providers like Mistral that reject unknown fields with 422.
                # Uses new dicts so the internal messages list retains the fields
                # for Codex Responses compatibility.
                if "api.mistral.ai" in self.base_url.lower():
                    self._sanitize_tool_calls_for_strict_api(api_msg)
                # Keep 'reasoning_details' - OpenRouter uses this for multi-turn reasoning context
                # The signature field helps maintain reasoning continuity
                api_messages.append(api_msg)

            # Build the final system message: cached prompt + ephemeral system prompt.
            # Ephemeral additions are API-call-time only (not persisted to session DB).
            # Honcho later-turn recall is intentionally kept OUT of the system prompt
            # so the stable cache prefix remains unchanged.
            effective_system = active_system_prompt or ""
            if self.ephemeral_system_prompt:
                effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
            if effective_system:
                api_messages = [{"role": "system", "content": effective_system}] + api_messages

            # Inject ephemeral prefill messages right after the system prompt
            # but before conversation history. Same API-call-time-only pattern.
            if self.prefill_messages:
                sys_offset = 1 if effective_system else 0
                for idx, pfm in enumerate(self.prefill_messages):
                    api_messages.insert(sys_offset + idx, pfm.copy())

            # Apply Anthropic prompt caching for Claude models via OpenRouter.
            # Auto-detected: if model name contains "claude" and base_url is OpenRouter,
            # inject cache_control breakpoints (system + last 3 messages) to reduce
            # input token costs by ~75% on multi-turn conversations.
            if self._use_prompt_caching:
                api_messages = apply_anthropic_cache_control(api_messages, cache_ttl=self._cache_ttl)

            # Safety net: strip orphaned tool results / add stubs for missing
            # results before sending to the API.  The compressor handles this
            # during compression, but orphans can also sneak in from session
            # loading or manual message manipulation.
            if hasattr(self, 'context_compressor') and self.context_compressor:
                api_messages = self.context_compressor._sanitize_tool_pairs(api_messages)

            # Calculate approximate request size for logging
            total_chars = sum(len(str(msg)) for msg in api_messages)
            approx_tokens = total_chars // 4  # Rough estimate: 4 chars per token
            
            # Thinking spinner for quiet mode (animated during API call)
            thinking_spinner = None
            
            if not self.quiet_mode:
                self._vprint(f"\n{self.log_prefix}🔄 Making API call #{api_call_count}/{self.max_iterations}...")
                self._vprint(f"{self.log_prefix}   📊 Request size: {len(api_messages)} messages, ~{approx_tokens:,} tokens (~{total_chars:,} chars)")
                self._vprint(f"{self.log_prefix}   🔧 Available tools: {len(self.tools) if self.tools else 0}")
            elif self._stream_callback is None:
                # Animated thinking spinner in quiet mode (skip during streaming TTS)
                face = random.choice(KawaiiSpinner.KAWAII_THINKING)
                verb = random.choice(KawaiiSpinner.THINKING_VERBS)
                if self.thinking_callback:
                    # CLI TUI mode: use prompt_toolkit widget instead of raw spinner
                    self.thinking_callback(f"{face} {verb}...")
                else:
                    spinner_type = random.choice(['brain', 'sparkle', 'pulse', 'moon', 'star'])
                    thinking_spinner = KawaiiSpinner(f"{face} {verb}...", spinner_type=spinner_type)
                    thinking_spinner.start()
            
            # Log request details if verbose
            if self.verbose_logging:
                logging.debug(f"API Request - Model: {self.model}, Messages: {len(messages)}, Tools: {len(self.tools) if self.tools else 0}")
                logging.debug(f"Last message role: {messages[-1]['role'] if messages else 'none'}")
                logging.debug(f"Total message size: ~{approx_tokens:,} tokens")
            
            api_start_time = time.time()
            retry_count = 0
            max_retries = 3
            compression_attempts = 0
            max_compression_attempts = 3
            codex_auth_retry_attempted = False
            anthropic_auth_retry_attempted = False
            nous_auth_retry_attempted = False
            restart_with_compressed_messages = False
            restart_with_length_continuation = False

            finish_reason = "stop"
            response = None  # Guard against UnboundLocalError if all retries fail

            while retry_count < max_retries:
                try:
                    api_kwargs = self._build_api_kwargs(api_messages)
                    if self.api_mode == "codex_responses":
                        api_kwargs = self._preflight_codex_api_kwargs(api_kwargs, allow_stream=False)

                    if os.getenv("HERMES_DUMP_REQUESTS", "").strip().lower() in {"1", "true", "yes", "on"}:
                        self._dump_api_request_debug(api_kwargs, reason="preflight")

                    cb = getattr(self, "_stream_callback", None)
                    if cb is not None and self.api_mode == "chat_completions":
                        response = self._streaming_api_call(api_kwargs, cb)
                    else:
                        response = self._interruptible_api_call(api_kwargs)
                        # Forward full response to TTS callback for non-streaming providers
                        # (e.g. Anthropic) so voice TTS still works via batch delivery.
                        if cb is not None and response:
                            try:
                                content = None
                                # Try choices first — _interruptible_api_call converts all
                                # providers (including Anthropic) to this format.
                                try:
                                    content = response.choices[0].message.content
                                except (AttributeError, IndexError):
                                    pass
                                # Fallback: Anthropic native content blocks
                                if not content and self.api_mode == "anthropic_messages":
                                    text_parts = [
                                        block.text for block in getattr(response, "content", [])
                                        if getattr(block, "type", None) == "text" and getattr(block, "text", None)
                                    ]
                                    content = " ".join(text_parts) if text_parts else None
                                if content:
                                    cb(content)
                            except Exception:
                                pass
                    
                    api_duration = time.time() - api_start_time
                    
                    # Stop thinking spinner silently -- the response box or tool
                    # execution messages that follow are more informative.
                    if thinking_spinner:
                        thinking_spinner.stop("")
                        thinking_spinner = None
                    if self.thinking_callback:
                        self.thinking_callback("")
                    
                    if not self.quiet_mode:
                        self._vprint(f"{self.log_prefix}⏱️  API call completed in {api_duration:.2f}s")
                    
                    if self.verbose_logging:
                        # Log response with provider info if available
                        resp_model = getattr(response, 'model', 'N/A') if response else 'N/A'
                        logging.debug(f"API Response received - Model: {resp_model}, Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}")
                    
                    # Validate response shape before proceeding
                    response_invalid = False
                    error_details = []
                    if self.api_mode == "codex_responses":
                        output_items = getattr(response, "output", None) if response is not None else None
                        if response is None:
                            response_invalid = True
                            error_details.append("response is None")
                        elif not isinstance(output_items, list):
                            response_invalid = True
                            error_details.append("response.output is not a list")
                        elif len(output_items) == 0:
                            response_invalid = True
                            error_details.append("response.output is empty")
                    elif self.api_mode == "anthropic_messages":
                        content_blocks = getattr(response, "content", None) if response is not None else None
                        if response is None:
                            response_invalid = True
                            error_details.append("response is None")
                        elif not isinstance(content_blocks, list):
                            response_invalid = True
                            error_details.append("response.content is not a list")
                        elif len(content_blocks) == 0:
                            response_invalid = True
                            error_details.append("response.content is empty")
                    else:
                        if response is None or not hasattr(response, 'choices') or response.choices is None or len(response.choices) == 0:
                            response_invalid = True
                            if response is None:
                                error_details.append("response is None")
                            elif not hasattr(response, 'choices'):
                                error_details.append("response has no 'choices' attribute")
                            elif response.choices is None:
                                error_details.append("response.choices is None")
                            else:
                                error_details.append("response.choices is empty")

                    if response_invalid:
                        # Stop spinner before printing error messages
                        if thinking_spinner:
                            thinking_spinner.stop(f"(´;ω;`) oops, retrying...")
                            thinking_spinner = None
                        if self.thinking_callback:
                            self.thinking_callback("")
                        
                        # This is often rate limiting or provider returning malformed response
                        retry_count += 1
                        
                        # Check for error field in response (some providers include this)
                        error_msg = "Unknown"
                        provider_name = "Unknown"
                        if response and hasattr(response, 'error') and response.error:
                            error_msg = str(response.error)
                            # Try to extract provider from error metadata
                            if hasattr(response.error, 'metadata') and response.error.metadata:
                                provider_name = response.error.metadata.get('provider_name', 'Unknown')
                        elif response and hasattr(response, 'message') and response.message:
                            error_msg = str(response.message)
                        
                        # Try to get provider from model field (OpenRouter often returns actual model used)
                        if provider_name == "Unknown" and response and hasattr(response, 'model') and response.model:
                            provider_name = f"model={response.model}"
                        
                        # Check for x-openrouter-provider or similar metadata
                        if provider_name == "Unknown" and response:
                            # Log all response attributes for debugging
                            resp_attrs = {k: str(v)[:100] for k, v in vars(response).items() if not k.startswith('_')}
                            if self.verbose_logging:
                                logging.debug(f"Response attributes for invalid response: {resp_attrs}")
                        
                        self._vprint(f"{self.log_prefix}⚠️  Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}", force=True)
                        self._vprint(f"{self.log_prefix}   🏢 Provider: {provider_name}", force=True)
                        self._vprint(f"{self.log_prefix}   📝 Provider message: {error_msg[:200]}", force=True)
                        self._vprint(f"{self.log_prefix}   ⏱️  Response time: {api_duration:.2f}s (fast response often indicates rate limiting)", force=True)
                        
                        if retry_count >= max_retries:
                            # Try fallback before giving up
                            if self._try_activate_fallback():
                                retry_count = 0
                                continue
                            self._vprint(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded for invalid responses. Giving up.", force=True)
                            logging.error(f"{self.log_prefix}Invalid API response after {max_retries} retries.")
                            self._persist_session(messages, conversation_history)
                            return {
                                "messages": messages,
                                "completed": False,
                                "api_calls": api_call_count,
                                "error": "Invalid API response shape. Likely rate limited or malformed provider response.",
                                "failed": True  # Mark as failure for filtering
                            }
                        
                        # Longer backoff for rate limiting (likely cause of None choices)
                        wait_time = min(5 * (2 ** (retry_count - 1)), 120)  # 5s, 10s, 20s, 40s, 80s, 120s
                        self._vprint(f"{self.log_prefix}⏳ Retrying in {wait_time}s (extended backoff for possible rate limit)...", force=True)
                        logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)} | Provider: {provider_name}")
                        
                        # Sleep in small increments to stay responsive to interrupts
                        sleep_end = time.time() + wait_time
                        while time.time() < sleep_end:
                            if self._interrupt_requested:
                                self._vprint(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True)
                                self._persist_session(messages, conversation_history)
                                self.clear_interrupt()
                                return {
                                    "final_response": f"Operation interrupted: retrying API call after rate limit (retry {retry_count}/{max_retries}).",
                                    "messages": messages,
                                    "api_calls": api_call_count,
                                    "completed": False,
                                    "interrupted": True,
                                }
                            time.sleep(0.2)
                        continue  # Retry the API call

                    # Check finish_reason before proceeding
                    if self.api_mode == "codex_responses":
                        status = getattr(response, "status", None)
                        incomplete_details = getattr(response, "incomplete_details", None)
                        incomplete_reason = None
                        if isinstance(incomplete_details, dict):
                            incomplete_reason = incomplete_details.get("reason")
                        else:
                            incomplete_reason = getattr(incomplete_details, "reason", None)
                        if status == "incomplete" and incomplete_reason in {"max_output_tokens", "length"}:
                            finish_reason = "length"
                        else:
                            finish_reason = "stop"
                    elif self.api_mode == "anthropic_messages":
                        stop_reason_map = {"end_turn": "stop", "tool_use": "tool_calls", "max_tokens": "length", "stop_sequence": "stop"}
                        finish_reason = stop_reason_map.get(response.stop_reason, "stop")
                    else:
                        finish_reason = response.choices[0].finish_reason

                    if finish_reason == "length":
                        self._vprint(f"{self.log_prefix}⚠️  Response truncated (finish_reason='length') - model hit max output tokens", force=True)

                        if self.api_mode == "chat_completions":
                            assistant_message = response.choices[0].message
                            if not assistant_message.tool_calls:
                                length_continue_retries += 1
                                interim_msg = self._build_assistant_message(assistant_message, finish_reason)
                                messages.append(interim_msg)
                                if assistant_message.content:
                                    truncated_response_prefix += assistant_message.content

                                if length_continue_retries < 3:
                                    self._vprint(
                                        f"{self.log_prefix}↻ Requesting continuation "
                                        f"({length_continue_retries}/3)..."
                                    )
                                    continue_msg = {
                                        "role": "user",
                                        "content": (
                                            "[System: Your previous response was truncated by the output "
                                            "length limit. Continue exactly where you left off. Do not "
                                            "restart or repeat prior text. Finish the answer directly.]"
                                        ),
                                    }
                                    messages.append(continue_msg)
                                    self._session_messages = messages
                                    self._save_session_log(messages)
                                    restart_with_length_continuation = True
                                    break

                                partial_response = self._strip_think_blocks(truncated_response_prefix).strip()
                                self._cleanup_task_resources(effective_task_id)
                                self._persist_session(messages, conversation_history)
                                return {
                                    "final_response": partial_response or None,
                                    "messages": messages,
                                    "api_calls": api_call_count,
                                    "completed": False,
                                    "partial": True,
                                    "error": "Response remained truncated after 3 continuation attempts",
                                }

                        # If we have prior messages, roll back to last complete state
                        if len(messages) > 1:
                            self._vprint(f"{self.log_prefix}   ⏪ Rolling back to last complete assistant turn")
                            rolled_back_messages = self._get_messages_up_to_last_assistant(messages)

                            self._cleanup_task_resources(effective_task_id)
                            self._persist_session(messages, conversation_history)

                            return {
                                "final_response": None,
                                "messages": rolled_back_messages,
                                "api_calls": api_call_count,
                                "completed": False,
                                "partial": True,
                                "error": "Response truncated due to output length limit"
                            }
                        else:
                            # First message was truncated - mark as failed
                            self._vprint(f"{self.log_prefix}❌ First response truncated - cannot recover", force=True)
                            self._persist_session(messages, conversation_history)
                            return {
                                "final_response": None,
                                "messages": messages,
                                "api_calls": api_call_count,
                                "completed": False,
                                "failed": True,
                                "error": "First response truncated due to output length limit"
                            }
                    
                    # Track actual token usage from response for context management
                    if hasattr(response, 'usage') and response.usage:
                        if self.api_mode in ("codex_responses", "anthropic_messages"):
                            prompt_tokens = getattr(response.usage, 'input_tokens', 0) or 0
                            completion_tokens = getattr(response.usage, 'output_tokens', 0) or 0
                            total_tokens = (
                                getattr(response.usage, 'total_tokens', None)
                                or (prompt_tokens + completion_tokens)
                            )
                        else:
                            prompt_tokens = getattr(response.usage, 'prompt_tokens', 0) or 0
                            completion_tokens = getattr(response.usage, 'completion_tokens', 0) or 0
                            total_tokens = getattr(response.usage, 'total_tokens', 0) or 0
                        usage_dict = {
                            "prompt_tokens": prompt_tokens,
                            "completion_tokens": completion_tokens,
                            "total_tokens": total_tokens,
                        }
                        self.context_compressor.update_from_response(usage_dict)

                        # Cache discovered context length after successful call
                        if self.context_compressor._context_probed:
                            ctx = self.context_compressor.context_length
                            save_context_length(self.model, self.base_url, ctx)
                            print(f"{self.log_prefix}💾 Cached context length: {ctx:,} tokens for {self.model}")
                            self.context_compressor._context_probed = False

                        self.session_prompt_tokens += prompt_tokens
                        self.session_completion_tokens += completion_tokens
                        self.session_total_tokens += total_tokens
                        self.session_api_calls += 1
                        
                        if self.verbose_logging:
                            logging.debug(f"Token usage: prompt={usage_dict['prompt_tokens']:,}, completion={usage_dict['completion_tokens']:,}, total={usage_dict['total_tokens']:,}")
                        
                        # Log cache hit stats when prompt caching is active
                        if self._use_prompt_caching:
                            if self.api_mode == "anthropic_messages":
                                # Anthropic uses cache_read_input_tokens / cache_creation_input_tokens
                                cached = getattr(response.usage, 'cache_read_input_tokens', 0) or 0
                                written = getattr(response.usage, 'cache_creation_input_tokens', 0) or 0
                            else:
                                # OpenRouter uses prompt_tokens_details.cached_tokens
                                details = getattr(response.usage, 'prompt_tokens_details', None)
                                cached = getattr(details, 'cached_tokens', 0) or 0 if details else 0
                                written = getattr(details, 'cache_write_tokens', 0) or 0 if details else 0
                            prompt = usage_dict["prompt_tokens"]
                            hit_pct = (cached / prompt * 100) if prompt > 0 else 0
                            if not self.quiet_mode:
                                self._vprint(f"{self.log_prefix}   💾 Cache: {cached:,}/{prompt:,} tokens ({hit_pct:.0f}% hit, {written:,} written)")
                    
                    break  # Success, exit retry loop

                except InterruptedError:
                    if thinking_spinner:
                        thinking_spinner.stop("")
                        thinking_spinner = None
                    if self.thinking_callback:
                        self.thinking_callback("")
                    api_elapsed = time.time() - api_start_time
                    self._vprint(f"{self.log_prefix}⚡ Interrupted during API call.", force=True)
                    self._persist_session(messages, conversation_history)
                    interrupted = True
                    final_response = f"Operation interrupted: waiting for model response ({api_elapsed:.1f}s elapsed)."
                    break

                except Exception as api_error:
                    # Stop spinner before printing error messages
                    if thinking_spinner:
                        thinking_spinner.stop(f"(╥_╥) error, retrying...")
                        thinking_spinner = None
                    if self.thinking_callback:
                        self.thinking_callback("")

                    status_code = getattr(api_error, "status_code", None)
                    if (
                        self.api_mode == "codex_responses"
                        and self.provider == "openai-codex"
                        and status_code == 401
                        and not codex_auth_retry_attempted
                    ):
                        codex_auth_retry_attempted = True
                        if self._try_refresh_codex_client_credentials(force=True):
                            self._vprint(f"{self.log_prefix}🔐 Codex auth refreshed after 401. Retrying request...")
                            continue
                    if (
                        self.api_mode == "chat_completions"
                        and self.provider == "nous"
                        and status_code == 401
                        and not nous_auth_retry_attempted
                    ):
                        nous_auth_retry_attempted = True
                        if self._try_refresh_nous_client_credentials(force=True):
                            print(f"{self.log_prefix}🔐 Nous agent key refreshed after 401. Retrying request...")
                            continue
                    if (
                        self.api_mode == "anthropic_messages"
                        and status_code == 401
                        and hasattr(self, '_anthropic_api_key')
                        and not anthropic_auth_retry_attempted
                    ):
                        anthropic_auth_retry_attempted = True
                        from agent.anthropic_adapter import _is_oauth_token
                        if self._try_refresh_anthropic_client_credentials():
                            print(f"{self.log_prefix}🔐 Anthropic credentials refreshed after 401. Retrying request...")
                            continue
                        # Credential refresh didn't help — show diagnostic info
                        key = self._anthropic_api_key
                        auth_method = "Bearer (OAuth/setup-token)" if _is_oauth_token(key) else "x-api-key (API key)"
                        print(f"{self.log_prefix}🔐 Anthropic 401 — authentication failed.")
                        print(f"{self.log_prefix}   Auth method: {auth_method}")
                        print(f"{self.log_prefix}   Token prefix: {key[:12]}..." if key and len(key) > 12 else f"{self.log_prefix}   Token: (empty or short)")
                        print(f"{self.log_prefix}   Troubleshooting:")
                        print(f"{self.log_prefix}     • Check ANTHROPIC_TOKEN in ~/.hermes/.env for Hermes-managed OAuth/setup tokens")
                        print(f"{self.log_prefix}     • Check ANTHROPIC_API_KEY in ~/.hermes/.env for API keys or legacy token values")
                        print(f"{self.log_prefix}     • For API keys: verify at https://console.anthropic.com/settings/keys")
                        print(f"{self.log_prefix}     • For Claude Code: run 'claude /login' to refresh, then retry")
                        print(f"{self.log_prefix}     • Clear stale keys: hermes config set ANTHROPIC_TOKEN \"\"")
                        print(f"{self.log_prefix}     • Legacy cleanup: hermes config set ANTHROPIC_API_KEY \"\"")

                    retry_count += 1
                    elapsed_time = time.time() - api_start_time
                    
                    # Enhanced error logging
                    error_type = type(api_error).__name__
                    error_msg = str(api_error).lower()
                    logger.warning(
                        "API call failed (attempt %s/%s) error_type=%s %s error=%s",
                        retry_count,
                        max_retries,
                        error_type,
                        self._client_log_context(),
                        api_error,
                    )

                    self._vprint(f"{self.log_prefix}⚠️  API call failed (attempt {retry_count}/{max_retries}): {error_type}", force=True)
                    self._vprint(f"{self.log_prefix}   ⏱️  Time elapsed before failure: {elapsed_time:.2f}s")
                    self._vprint(f"{self.log_prefix}   📝 Error: {str(api_error)[:200]}", force=True)
                    self._vprint(f"{self.log_prefix}   📊 Request context: {len(api_messages)} messages, ~{approx_tokens:,} tokens, {len(self.tools) if self.tools else 0} tools")
                    
                    # Check for interrupt before deciding to retry
                    if self._interrupt_requested:
                        self._vprint(f"{self.log_prefix}⚡ Interrupt detected during error handling, aborting retries.", force=True)
                        self._persist_session(messages, conversation_history)
                        self.clear_interrupt()
                        return {
                            "final_response": f"Operation interrupted: handling API error ({error_type}: {str(api_error)[:80]}).",
                            "messages": messages,
                            "api_calls": api_call_count,
                            "completed": False,
                            "interrupted": True,
                        }
                    
                    # Check for 413 payload-too-large BEFORE generic 4xx handler.
                    # A 413 is a payload-size error — the correct response is to
                    # compress history and retry, not abort immediately.
                    status_code = getattr(api_error, "status_code", None)
                    is_payload_too_large = (
                        status_code == 413
                        or 'request entity too large' in error_msg
                        or 'payload too large' in error_msg
                        or 'error code: 413' in error_msg
                    )

                    if is_payload_too_large:
                        compression_attempts += 1
                        if compression_attempts > max_compression_attempts:
                            self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached for payload-too-large error.", force=True)
                            logging.error(f"{self.log_prefix}413 compression failed after {max_compression_attempts} attempts.")
                            self._persist_session(messages, conversation_history)
                            return {
                                "messages": messages,
                                "completed": False,
                                "api_calls": api_call_count,
                                "error": f"Request payload too large: max compression attempts ({max_compression_attempts}) reached.",
                                "partial": True
                            }
                        self._vprint(f"{self.log_prefix}⚠️  Request payload too large (413) — compression attempt {compression_attempts}/{max_compression_attempts}...")

                        original_len = len(messages)
                        messages, active_system_prompt = self._compress_context(
                            messages, system_message, approx_tokens=approx_tokens,
                            task_id=effective_task_id,
                        )

                        if len(messages) < original_len:
                            self._vprint(f"{self.log_prefix}   🗜️  Compressed {original_len} → {len(messages)} messages, retrying...")
                            time.sleep(2)  # Brief pause between compression retries
                            restart_with_compressed_messages = True
                            break
                        else:
                            self._vprint(f"{self.log_prefix}❌ Payload too large and cannot compress further.", force=True)
                            logging.error(f"{self.log_prefix}413 payload too large. Cannot compress further.")
                            self._persist_session(messages, conversation_history)
                            return {
                                "messages": messages,
                                "completed": False,
                                "api_calls": api_call_count,
                                "error": "Request payload too large (413). Cannot compress further.",
                                "partial": True
                            }

                    # Check for context-length errors BEFORE generic 4xx handler.
                    # Local backends (LM Studio, Ollama, llama.cpp) often return
                    # HTTP 400 with messages like "Context size has been exceeded"
                    # which must trigger compression, not an immediate abort.
                    is_context_length_error = any(phrase in error_msg for phrase in [
                        'context length', 'context size', 'maximum context',
                        'token limit', 'too many tokens', 'reduce the length',
                        'exceeds the limit', 'context window',
                        'request entity too large',  # OpenRouter/Nous 413 safety net
                        'prompt is too long',  # Anthropic: "prompt is too long: N tokens > M maximum"
                    ])
                    
                    if is_context_length_error:
                        compressor = self.context_compressor
                        old_ctx = compressor.context_length

                        # Try to parse the actual limit from the error message
                        parsed_limit = parse_context_limit_from_error(error_msg)
                        if parsed_limit and parsed_limit < old_ctx:
                            new_ctx = parsed_limit
                            self._vprint(f"{self.log_prefix}⚠️  Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})", force=True)
                        else:
                            # Step down to the next probe tier
                            new_ctx = get_next_probe_tier(old_ctx)

                        if new_ctx and new_ctx < old_ctx:
                            compressor.context_length = new_ctx
                            compressor.threshold_tokens = int(new_ctx * compressor.threshold_percent)
                            compressor._context_probed = True
                            self._vprint(f"{self.log_prefix}⚠️  Context length exceeded — stepping down: {old_ctx:,} → {new_ctx:,} tokens", force=True)
                        else:
                            self._vprint(f"{self.log_prefix}⚠️  Context length exceeded at minimum tier — attempting compression...", force=True)

                        compression_attempts += 1
                        if compression_attempts > max_compression_attempts:
                            self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True)
                            logging.error(f"{self.log_prefix}Context compression failed after {max_compression_attempts} attempts.")
                            self._persist_session(messages, conversation_history)
                            return {
                                "messages": messages,
                                "completed": False,
                                "api_calls": api_call_count,
                                "error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.",
                                "partial": True
                            }
                        self._vprint(f"{self.log_prefix}   🗜️  Context compression attempt {compression_attempts}/{max_compression_attempts}...")

                        original_len = len(messages)
                        messages, active_system_prompt = self._compress_context(
                            messages, system_message, approx_tokens=approx_tokens,
                            task_id=effective_task_id,
                        )

                        if len(messages) < original_len or new_ctx and new_ctx < old_ctx:
                            if len(messages) < original_len:
                                self._vprint(f"{self.log_prefix}   🗜️  Compressed {original_len} → {len(messages)} messages, retrying...")
                            time.sleep(2)  # Brief pause between compression retries
                            restart_with_compressed_messages = True
                            break
                        else:
                            # Can't compress further and already at minimum tier
                            self._vprint(f"{self.log_prefix}❌ Context length exceeded and cannot compress further.", force=True)
                            self._vprint(f"{self.log_prefix}   💡 The conversation has accumulated too much content.", force=True)
                            logging.error(f"{self.log_prefix}Context length exceeded: {approx_tokens:,} tokens. Cannot compress further.")
                            self._persist_session(messages, conversation_history)
                            return {
                                "messages": messages,
                                "completed": False,
                                "api_calls": api_call_count,
                                "error": f"Context length exceeded ({approx_tokens:,} tokens). Cannot compress further.",
                                "partial": True
                            }

                    # Check for non-retryable client errors (4xx HTTP status codes).
                    # These indicate a problem with the request itself (bad model ID,
                    # invalid API key, forbidden, etc.) and will never succeed on retry.
                    # Note: 413 and context-length errors are excluded — handled above.
                    # Also catch local validation errors (ValueError, TypeError) — these
                    # are programming bugs, not transient failures.
                    is_local_validation_error = isinstance(api_error, (ValueError, TypeError))
                    is_client_status_error = isinstance(status_code, int) and 400 <= status_code < 500 and status_code != 413
                    is_client_error = (is_local_validation_error or is_client_status_error or any(phrase in error_msg for phrase in [
                        'error code: 401', 'error code: 403',
                        'error code: 404', 'error code: 422',
                        'is not a valid model', 'invalid model', 'model not found',
                        'invalid api key', 'invalid_api_key', 'authentication',
                        'unauthorized', 'forbidden', 'not found',
                    ])) and not is_context_length_error

                    if is_client_error:
                        # Try fallback before aborting — a different provider
                        # may not have the same issue (rate limit, auth, etc.)
                        if self._try_activate_fallback():
                            retry_count = 0
                            continue
                        self._dump_api_request_debug(
                            api_kwargs, reason="non_retryable_client_error", error=api_error,
                        )
                        self._vprint(f"{self.log_prefix}❌ Non-retryable client error detected. Aborting immediately.", force=True)
                        self._vprint(f"{self.log_prefix}   💡 This type of error won't be fixed by retrying.", force=True)
                        logging.error(f"{self.log_prefix}Non-retryable client error: {api_error}")
                        self._persist_session(messages, conversation_history)
                        return {
                            "final_response": None,
                            "messages": messages,
                            "api_calls": api_call_count,
                            "completed": False,
                            "failed": True,
                            "error": str(api_error),
                        }

                    if retry_count >= max_retries:
                        # Try fallback before giving up entirely
                        if self._try_activate_fallback():
                            retry_count = 0
                            continue
                        self._vprint(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded. Giving up.", force=True)
                        logging.error(f"{self.log_prefix}API call failed after {max_retries} retries. Last error: {api_error}")
                        logging.error(f"{self.log_prefix}Request details - Messages: {len(api_messages)}, Approx tokens: {approx_tokens:,}")
                        raise api_error

                    wait_time = min(2 ** retry_count, 60)  # Exponential backoff: 2s, 4s, 8s, 16s, 32s, 60s, 60s
                    logger.warning(
                        "Retrying API call in %ss (attempt %s/%s) %s error=%s",
                        wait_time,
                        retry_count,
                        max_retries,
                        self._client_log_context(),
                        api_error,
                    )
                    if retry_count >= max_retries:
                        self._vprint(f"{self.log_prefix}⚠️  API call failed after {retry_count} attempts: {str(api_error)[:100]}")
                        self._vprint(f"{self.log_prefix}⏳ Final retry in {wait_time}s...")
                    
                    # Sleep in small increments so we can respond to interrupts quickly
                    # instead of blocking the entire wait_time in one sleep() call
                    sleep_end = time.time() + wait_time
                    while time.time() < sleep_end:
                        if self._interrupt_requested:
                            self._vprint(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True)
                            self._persist_session(messages, conversation_history)
                            self.clear_interrupt()
                            return {
                                "final_response": f"Operation interrupted: retrying API call after error (retry {retry_count}/{max_retries}).",
                                "messages": messages,
                                "api_calls": api_call_count,
                                "completed": False,
                                "interrupted": True,
                            }
                        time.sleep(0.2)  # Check interrupt every 200ms
            
            # If the API call was interrupted, skip response processing
            if interrupted:
                break

            if restart_with_compressed_messages:
                api_call_count -= 1
                self.iteration_budget.refund()
                continue

            if restart_with_length_continuation:
                continue

            # Guard: if all retries exhausted without a successful response
            # (e.g. repeated context-length errors that exhausted retry_count),
            # the `response` variable is still None. Break out cleanly.
            if response is None:
                print(f"{self.log_prefix}❌ All API retries exhausted with no successful response.")
                self._persist_session(messages, conversation_history)
                break

            try:
                if self.api_mode == "codex_responses":
                    assistant_message, finish_reason = self._normalize_codex_response(response)
                elif self.api_mode == "anthropic_messages":
                    from agent.anthropic_adapter import normalize_anthropic_response
                    assistant_message, finish_reason = normalize_anthropic_response(response)
                else:
                    assistant_message = response.choices[0].message
                
                # Normalize content to string — some OpenAI-compatible servers
                # (llama-server, etc.) return content as a dict or list instead
                # of a plain string, which crashes downstream .strip() calls.
                if assistant_message.content is not None and not isinstance(assistant_message.content, str):
                    raw = assistant_message.content
                    if isinstance(raw, dict):
                        assistant_message.content = raw.get("text", "") or raw.get("content", "") or json.dumps(raw)
                    elif isinstance(raw, list):
                        # Multimodal content list — extract text parts
                        parts = []
                        for part in raw:
                            if isinstance(part, str):
                                parts.append(part)
                            elif isinstance(part, dict) and part.get("type") == "text":
                                parts.append(part.get("text", ""))
                            elif isinstance(part, dict) and "text" in part:
                                parts.append(str(part["text"]))
                        assistant_message.content = "\n".join(parts)
                    else:
                        assistant_message.content = str(raw)

                # Handle assistant response
                if assistant_message.content and not self.quiet_mode:
                    if self.verbose_logging:
                        self._vprint(f"{self.log_prefix}🤖 Assistant: {assistant_message.content}")
                    else:
                        self._vprint(f"{self.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}")

                # Notify progress callback of model's thinking (used by subagent
                # delegation to relay the child's reasoning to the parent display).
                # Guard: only fire for subagents (_delegate_depth >= 1) to avoid
                # spamming gateway platforms with the main agent's every thought.
                if (assistant_message.content and self.tool_progress_callback
                        and getattr(self, '_delegate_depth', 0) > 0):
                    _think_text = assistant_message.content.strip()
                    # Strip reasoning XML tags that shouldn't leak to parent display
                    _think_text = re.sub(
                        r'</?(?:REASONING_SCRATCHPAD|think|reasoning)>', '', _think_text
                    ).strip()
                    first_line = _think_text.split('\n')[0][:80] if _think_text else ""
                    if first_line:
                        try:
                            self.tool_progress_callback("_thinking", first_line)
                        except Exception:
                            pass
                
                # Check for incomplete <REASONING_SCRATCHPAD> (opened but never closed)
                # This means the model ran out of output tokens mid-reasoning — retry up to 2 times
                if has_incomplete_scratchpad(assistant_message.content or ""):
                    if not hasattr(self, '_incomplete_scratchpad_retries'):
                        self._incomplete_scratchpad_retries = 0
                    self._incomplete_scratchpad_retries += 1
                    
                    self._vprint(f"{self.log_prefix}⚠️  Incomplete <REASONING_SCRATCHPAD> detected (opened but never closed)")
                    
                    if self._incomplete_scratchpad_retries <= 2:
                        self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._incomplete_scratchpad_retries}/2)...")
                        # Don't add the broken message, just retry
                        continue
                    else:
                        # Max retries - discard this turn and save as partial
                        self._vprint(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.", force=True)
                        self._incomplete_scratchpad_retries = 0
                        
                        rolled_back_messages = self._get_messages_up_to_last_assistant(messages)
                        self._cleanup_task_resources(effective_task_id)
                        self._persist_session(messages, conversation_history)
                        
                        return {
                            "final_response": None,
                            "messages": rolled_back_messages,
                            "api_calls": api_call_count,
                            "completed": False,
                            "partial": True,
                            "error": "Incomplete REASONING_SCRATCHPAD after 2 retries"
                        }
                
                # Reset incomplete scratchpad counter on clean response
                if hasattr(self, '_incomplete_scratchpad_retries'):
                    self._incomplete_scratchpad_retries = 0

                if self.api_mode == "codex_responses" and finish_reason == "incomplete":
                    if not hasattr(self, "_codex_incomplete_retries"):
                        self._codex_incomplete_retries = 0
                    self._codex_incomplete_retries += 1

                    interim_msg = self._build_assistant_message(assistant_message, finish_reason)
                    interim_has_content = bool((interim_msg.get("content") or "").strip())
                    interim_has_reasoning = bool(interim_msg.get("reasoning", "").strip()) if isinstance(interim_msg.get("reasoning"), str) else False

                    if interim_has_content or interim_has_reasoning:
                        last_msg = messages[-1] if messages else None
                        duplicate_interim = (
                            isinstance(last_msg, dict)
                            and last_msg.get("role") == "assistant"
                            and last_msg.get("finish_reason") == "incomplete"
                            and (last_msg.get("content") or "") == (interim_msg.get("content") or "")
                            and (last_msg.get("reasoning") or "") == (interim_msg.get("reasoning") or "")
                        )
                        if not duplicate_interim:
                            messages.append(interim_msg)

                    if self._codex_incomplete_retries < 3:
                        if not self.quiet_mode:
                            self._vprint(f"{self.log_prefix}↻ Codex response incomplete; continuing turn ({self._codex_incomplete_retries}/3)")
                        self._session_messages = messages
                        self._save_session_log(messages)
                        continue

                    self._codex_incomplete_retries = 0
                    self._persist_session(messages, conversation_history)
                    return {
                        "final_response": None,
                        "messages": messages,
                        "api_calls": api_call_count,
                        "completed": False,
                        "partial": True,
                        "error": "Codex response remained incomplete after 3 continuation attempts",
                    }
                elif hasattr(self, "_codex_incomplete_retries"):
                    self._codex_incomplete_retries = 0
                
                # Check for tool calls
                if assistant_message.tool_calls:
                    if not self.quiet_mode:
                        self._vprint(f"{self.log_prefix}🔧 Processing {len(assistant_message.tool_calls)} tool call(s)...")
                    
                    if self.verbose_logging:
                        for tc in assistant_message.tool_calls:
                            logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...")
                    
                    # Validate tool call names - detect model hallucinations
                    # Repair mismatched tool names before validating
                    for tc in assistant_message.tool_calls:
                        if tc.function.name not in self.valid_tool_names:
                            repaired = self._repair_tool_call(tc.function.name)
                            if repaired:
                                print(f"{self.log_prefix}🔧 Auto-repaired tool name: '{tc.function.name}' -> '{repaired}'")
                                tc.function.name = repaired
                    invalid_tool_calls = [
                        tc.function.name for tc in assistant_message.tool_calls
                        if tc.function.name not in self.valid_tool_names
                    ]
                    if invalid_tool_calls:
                        # Track retries for invalid tool calls
                        if not hasattr(self, '_invalid_tool_retries'):
                            self._invalid_tool_retries = 0
                        self._invalid_tool_retries += 1

                        # Return helpful error to model — model can self-correct next turn
                        available = ", ".join(sorted(self.valid_tool_names))
                        invalid_name = invalid_tool_calls[0]
                        invalid_preview = invalid_name[:80] + "..." if len(invalid_name) > 80 else invalid_name
                        self._vprint(f"{self.log_prefix}⚠️  Unknown tool '{invalid_preview}' — sending error to model for self-correction ({self._invalid_tool_retries}/3)")

                        if self._invalid_tool_retries >= 3:
                            self._vprint(f"{self.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.", force=True)
                            self._invalid_tool_retries = 0
                            self._persist_session(messages, conversation_history)
                            return {
                                "final_response": None,
                                "messages": messages,
                                "api_calls": api_call_count,
                                "completed": False,
                                "partial": True,
                                "error": f"Model generated invalid tool call: {invalid_preview}"
                            }

                        assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
                        messages.append(assistant_msg)
                        for tc in assistant_message.tool_calls:
                            if tc.function.name not in self.valid_tool_names:
                                content = f"Tool '{tc.function.name}' does not exist. Available tools: {available}"
                            else:
                                content = f"Skipped: another tool call in this turn used an invalid name. Please retry this tool call."
                            messages.append({
                                "role": "tool",
                                "tool_call_id": tc.id,
                                "content": content,
                            })
                        continue
                    # Reset retry counter on successful tool call validation
                    if hasattr(self, '_invalid_tool_retries'):
                        self._invalid_tool_retries = 0
                    
                    # Validate tool call arguments are valid JSON
                    # Handle empty strings as empty objects (common model quirk)
                    invalid_json_args = []
                    for tc in assistant_message.tool_calls:
                        args = tc.function.arguments
                        if isinstance(args, (dict, list)):
                            tc.function.arguments = json.dumps(args)
                            continue
                        if args is not None and not isinstance(args, str):
                            tc.function.arguments = str(args)
                            args = tc.function.arguments
                        # Treat empty/whitespace strings as empty object
                        if not args or not args.strip():
                            tc.function.arguments = "{}"
                            continue
                        try:
                            json.loads(args)
                        except json.JSONDecodeError as e:
                            invalid_json_args.append((tc.function.name, str(e)))
                    
                    if invalid_json_args:
                        # Track retries for invalid JSON arguments
                        self._invalid_json_retries += 1
                        
                        tool_name, error_msg = invalid_json_args[0]
                        self._vprint(f"{self.log_prefix}⚠️  Invalid JSON in tool call arguments for '{tool_name}': {error_msg}")
                        
                        if self._invalid_json_retries < 3:
                            self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._invalid_json_retries}/3)...")
                            # Don't add anything to messages, just retry the API call
                            continue
                        else:
                            # Instead of returning partial, inject a helpful message and let model recover
                            self._vprint(f"{self.log_prefix}⚠️  Injecting recovery message for invalid JSON...")
                            self._invalid_json_retries = 0  # Reset for next attempt
                            
                            # Add a user message explaining the issue
                            recovery_msg = (
                                f"Your tool call to '{tool_name}' had invalid JSON arguments. "
                                f"Error: {error_msg}. "
                                f"For tools with no required parameters, use an empty object: {{}}. "
                                f"Please either retry the tool call with valid JSON, or respond without using that tool."
                            )
                            recovery_dict = {"role": "user", "content": recovery_msg}
                            messages.append(recovery_dict)
                            continue
                    
                    # Reset retry counter on successful JSON validation
                    self._invalid_json_retries = 0
                    
                    assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
                    
                    # If this turn has both content AND tool_calls, capture the content
                    # as a fallback final response. Common pattern: model delivers its
                    # answer and calls memory/skill tools as a side-effect in the same
                    # turn. If the follow-up turn after tools is empty, we use this.
                    turn_content = assistant_message.content or ""
                    if turn_content and self._has_content_after_think_block(turn_content):
                        self._last_content_with_tools = turn_content
                        # Show intermediate commentary so the user can follow along
                        if self.quiet_mode:
                            clean = self._strip_think_blocks(turn_content).strip()
                            if clean:
                                self._vprint(f"  ┊ 💬 {clean}")
                    
                    messages.append(assistant_msg)
                    
                    _msg_count_before_tools = len(messages)
                    self._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count)

                    # Refund the iteration if the ONLY tool(s) called were
                    # execute_code (programmatic tool calling).  These are
                    # cheap RPC-style calls that shouldn't eat the budget.
                    _tc_names = {tc.function.name for tc in assistant_message.tool_calls}
                    if _tc_names == {"execute_code"}:
                        self.iteration_budget.refund()
                    
                    # Estimate next prompt size using real token counts from the
                    # last API response + rough estimate of newly appended tool
                    # results.  This catches cases where tool results push the
                    # context past the limit that last_prompt_tokens alone misses
                    # (e.g. large file reads, web extractions).
                    _compressor = self.context_compressor
                    _new_tool_msgs = messages[_msg_count_before_tools:]
                    _new_chars = sum(len(str(m.get("content", "") or "")) for m in _new_tool_msgs)
                    _estimated_next_prompt = (
                        _compressor.last_prompt_tokens
                        + _compressor.last_completion_tokens
                        + _new_chars // 3  # conservative: JSON-heavy tool results ≈ 3 chars/token
                    )
                    if self.compression_enabled and _compressor.should_compress(_estimated_next_prompt):
                        messages, active_system_prompt = self._compress_context(
                            messages, system_message,
                            approx_tokens=self.context_compressor.last_prompt_tokens,
                            task_id=effective_task_id,
                        )
                    
                    # Save session log incrementally (so progress is visible even if interrupted)
                    self._session_messages = messages
                    self._save_session_log(messages)
                    
                    # Continue loop for next response
                    continue
                
                else:
                    # No tool calls - this is the final response
                    final_response = assistant_message.content or ""
                    
                    # Check if response only has think block with no actual content after it
                    if not self._has_content_after_think_block(final_response):
                        # If the previous turn already delivered real content alongside
                        # tool calls (e.g. "You're welcome!" + memory save), the model
                        # has nothing more to say. Use the earlier content immediately
                        # instead of wasting API calls on retries that won't help.
                        fallback = getattr(self, '_last_content_with_tools', None)
                        if fallback:
                            logger.debug("Empty follow-up after tool calls — using prior turn content as final response")
                            self._last_content_with_tools = None
                            self._empty_content_retries = 0
                            for i in range(len(messages) - 1, -1, -1):
                                msg = messages[i]
                                if msg.get("role") == "assistant" and msg.get("tool_calls"):
                                    tool_names = []
                                    for tc in msg["tool_calls"]:
                                        fn = tc.get("function", {})
                                        tool_names.append(fn.get("name", "unknown"))
                                    msg["content"] = f"Calling the {', '.join(tool_names)} tool{'s' if len(tool_names) > 1 else ''}..."
                                    break
                            final_response = self._strip_think_blocks(fallback).strip()
                            self._response_was_previewed = True
                            break

                        # No fallback available — this is a genuine empty response.
                        # Retry in case the model just had a bad generation.
                        if not hasattr(self, '_empty_content_retries'):
                            self._empty_content_retries = 0
                        self._empty_content_retries += 1
                        
                        reasoning_text = self._extract_reasoning(assistant_message)
                        self._vprint(f"{self.log_prefix}⚠️  Response only contains think block with no content after it")
                        if reasoning_text:
                            reasoning_preview = reasoning_text[:500] + "..." if len(reasoning_text) > 500 else reasoning_text
                            self._vprint(f"{self.log_prefix}   Reasoning: {reasoning_preview}")
                        else:
                            content_preview = final_response[:80] + "..." if len(final_response) > 80 else final_response
                            self._vprint(f"{self.log_prefix}   Content: '{content_preview}'")
                        
                        if self._empty_content_retries < 3:
                            self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._empty_content_retries}/3)...")
                            continue
                        else:
                            self._vprint(f"{self.log_prefix}❌ Max retries (3) for empty content exceeded.", force=True)
                            self._empty_content_retries = 0
                            
                            # If a prior tool_calls turn had real content, salvage it:
                            # rewrite that turn's content to a brief tool description,
                            # and use the original content as the final response here.
                            fallback = getattr(self, '_last_content_with_tools', None)
                            if fallback:
                                self._last_content_with_tools = None
                                # Find the last assistant message with tool_calls and rewrite it
                                for i in range(len(messages) - 1, -1, -1):
                                    msg = messages[i]
                                    if msg.get("role") == "assistant" and msg.get("tool_calls"):
                                        tool_names = []
                                        for tc in msg["tool_calls"]:
                                            fn = tc.get("function", {})
                                            tool_names.append(fn.get("name", "unknown"))
                                        msg["content"] = f"Calling the {', '.join(tool_names)} tool{'s' if len(tool_names) > 1 else ''}..."
                                        break
                                # Strip <think> blocks from fallback content for user display
                                final_response = self._strip_think_blocks(fallback).strip()
                                self._response_was_previewed = True
                                break
                            
                            # No fallback -- append the empty message as-is
                            empty_msg = {
                                "role": "assistant",
                                "content": final_response,
                                "reasoning": reasoning_text,
                                "finish_reason": finish_reason,
                            }
                            messages.append(empty_msg)
                            
                            self._cleanup_task_resources(effective_task_id)
                            self._persist_session(messages, conversation_history)
                            
                            return {
                                "final_response": final_response or None,
                                "messages": messages,
                                "api_calls": api_call_count,
                                "completed": False,
                                "partial": True,
                                "error": "Model generated only think blocks with no actual response after 3 retries"
                            }
                    
                    # Reset retry counter on successful content
                    if hasattr(self, '_empty_content_retries'):
                        self._empty_content_retries = 0

                    if (
                        self.api_mode == "codex_responses"
                        and self.valid_tool_names
                        and codex_ack_continuations < 2
                        and self._looks_like_codex_intermediate_ack(
                            user_message=user_message,
                            assistant_content=final_response,
                            messages=messages,
                        )
                    ):
                        codex_ack_continuations += 1
                        interim_msg = self._build_assistant_message(assistant_message, "incomplete")
                        messages.append(interim_msg)

                        continue_msg = {
                            "role": "user",
                            "content": (
                                "[System: Continue now. Execute the required tool calls and only "
                                "send your final answer after completing the task.]"
                            ),
                        }
                        messages.append(continue_msg)
                        self._session_messages = messages
                        self._save_session_log(messages)
                        continue

                    codex_ack_continuations = 0

                    if truncated_response_prefix:
                        final_response = truncated_response_prefix + final_response
                    
                    # Strip <think> blocks from user-facing response (keep raw in messages for trajectory)
                    final_response = self._strip_think_blocks(final_response).strip()
                    
                    final_msg = self._build_assistant_message(assistant_message, finish_reason)
                    
                    messages.append(final_msg)
                    
                    if not self.quiet_mode:
                        print(f"🎉 Conversation completed after {api_call_count} OpenAI-compatible API call(s)")
                    break
                
            except Exception as e:
                error_msg = f"Error during OpenAI-compatible API call #{api_call_count}: {str(e)}"
                print(f"❌ {error_msg}")
                
                if self.verbose_logging:
                    logging.exception("Detailed error information:")
                
                # If an assistant message with tool_calls was already appended,
                # the API expects a role="tool" result for every tool_call_id.
                # Fill in error results for any that weren't answered yet.
                pending_handled = False
                for idx in range(len(messages) - 1, -1, -1):
                    msg = messages[idx]
                    if not isinstance(msg, dict):
                        break
                    if msg.get("role") == "tool":
                        continue
                    if msg.get("role") == "assistant" and msg.get("tool_calls"):
                        answered_ids = {
                            m["tool_call_id"]
                            for m in messages[idx + 1:]
                            if isinstance(m, dict) and m.get("role") == "tool"
                        }
                        for tc in msg["tool_calls"]:
                            if tc["id"] not in answered_ids:
                                err_msg = {
                                    "role": "tool",
                                    "tool_call_id": tc["id"],
                                    "content": f"Error executing tool: {error_msg}",
                                }
                                messages.append(err_msg)
                        pending_handled = True
                    break
                
                if not pending_handled:
                    # Error happened before tool processing (e.g. response parsing).
                    # Use a user-role message so the model can see what went wrong
                    # without confusing the API with a fabricated assistant turn.
                    sys_err_msg = {
                        "role": "user",
                        "content": f"[System error during processing: {error_msg}]",
                    }
                    messages.append(sys_err_msg)
                
                # If we're near the limit, break to avoid infinite loops
                if api_call_count >= self.max_iterations - 1:
                    final_response = f"I apologize, but I encountered repeated errors: {error_msg}"
                    break
        
        if final_response is None and (
            api_call_count >= self.max_iterations
            or self.iteration_budget.remaining <= 0
        ):
            if self.iteration_budget.remaining <= 0 and not self.quiet_mode:
                print(f"\n⚠️  Session iteration budget exhausted ({self.iteration_budget.used}/{self.iteration_budget.max_total} used, including subagents)")
            final_response = self._handle_max_iterations(messages, api_call_count)
        
        # Determine if conversation completed successfully
        completed = final_response is not None and api_call_count < self.max_iterations

        # Save trajectory if enabled
        self._save_trajectory(messages, user_message, completed)

        # Clean up VM and browser for this task after conversation completes
        self._cleanup_task_resources(effective_task_id)

        # Persist session to both JSON log and SQLite
        self._persist_session(messages, conversation_history)

        # Sync conversation to Honcho for user modeling
        if final_response and not interrupted:
            self._honcho_sync(original_user_message, final_response)
            self._queue_honcho_prefetch(original_user_message)

        # Extract reasoning from the last assistant message (if any)
        last_reasoning = None
        for msg in reversed(messages):
            if msg.get("role") == "assistant" and msg.get("reasoning"):
                last_reasoning = msg["reasoning"]
                break

        # Build result with interrupt info if applicable
        result = {
            "final_response": final_response,
            "last_reasoning": last_reasoning,
            "messages": messages,
            "api_calls": api_call_count,
            "completed": completed,
            "partial": False,  # True only when stopped due to invalid tool calls
            "interrupted": interrupted,
            "response_previewed": getattr(self, "_response_was_previewed", False),
        }
        self._response_was_previewed = False
        
        # Include interrupt message if one triggered the interrupt
        if interrupted and self._interrupt_message:
            result["interrupt_message"] = self._interrupt_message
        
        # Clear interrupt state after handling
        self.clear_interrupt()

        # Clear stream callback so it doesn't leak into future calls
        self._stream_callback = None

        return result

    def chat(self, message: str, stream_callback: Optional[callable] = None) -> Optional[str]:
        """
        Simple chat interface that returns just the final response.

        Thin convenience wrapper: runs a full ``run_conversation`` turn and
        discards everything in the result dict except ``"final_response"``.

        Args:
            message (str): User message
            stream_callback: Optional callback forwarded unchanged to
                ``run_conversation``; invoked with each text delta during streaming.

        Returns:
            Optional[str]: Final assistant response. This is ``None`` when the
            conversation ended without any final text (for example, when the
            model kept returning empty content and the retry limit was hit).
        """
        result = self.run_conversation(message, stream_callback=stream_callback)
        # Index (rather than .get) on purpose: a result without this key would
        # indicate a broken run_conversation contract and should fail loudly.
        return result["final_response"]


def _cli_parse_toolsets(value):
    """Normalize a toolset CLI argument into a list of names, or None if empty.

    Accepts a comma-separated string ("web,vision") as well as a list/tuple of
    names, because Python Fire converts comma-separated flag values into tuples
    before they reach ``main``. Surrounding whitespace and blank entries are dropped.
    """
    if not value:
        return None
    if isinstance(value, str):
        parts = value.split(",")
    elif isinstance(value, (list, tuple, set, frozenset)):
        parts = list(value)
    else:
        # Any other scalar Fire may have produced (e.g. a number): treat as one name.
        parts = [value]
    names = [str(part).strip() for part in parts]
    names = [name for name in names if name]
    return names or None


def _cli_print_tool_listing():
    """Print every known toolset and tool, plus usage examples (for --list_tools)."""
    from model_tools import get_all_tool_names, get_toolset_for_tool, get_available_toolsets
    from toolsets import get_all_toolsets, get_toolset_info

    basic_names = ("web", "terminal", "vision", "creative", "reasoning")
    composite_names = ("research", "development", "analysis", "content_creation", "full_stack")

    print("📋 Available Tools & Toolsets:")
    print("-" * 50)

    # Show new toolsets system
    print("\n🎯 Predefined Toolsets (New System):")
    print("-" * 40)

    # Group by category; toolsets without info are skipped entirely.
    basic_toolsets = []
    composite_toolsets = []
    scenario_toolsets = []
    for name in get_all_toolsets():
        info = get_toolset_info(name)
        if not info:
            continue
        entry = (name, info)
        if name in basic_names:
            basic_toolsets.append(entry)
        elif name in composite_names:
            composite_toolsets.append(entry)
        else:
            scenario_toolsets.append(entry)

    print("\n📌 Basic Toolsets:")
    for name, info in basic_toolsets:
        tools_str = ', '.join(info['resolved_tools']) if info['resolved_tools'] else 'none'
        print(f"  • {name:15} - {info['description']}")
        print(f"    Tools: {tools_str}")

    print("\n📂 Composite Toolsets (built from other toolsets):")
    for name, info in composite_toolsets:
        includes_str = ', '.join(info['includes']) if info['includes'] else 'none'
        print(f"  • {name:15} - {info['description']}")
        print(f"    Includes: {includes_str}")
        print(f"    Total tools: {info['tool_count']}")

    print("\n🎭 Scenario-Specific Toolsets:")
    for name, info in scenario_toolsets:
        print(f"  • {name:20} - {info['description']}")
        print(f"    Total tools: {info['tool_count']}")

    # Show legacy toolset compatibility
    print("\n📦 Legacy Toolsets (for backward compatibility):")
    for name, info in get_available_toolsets().items():
        status = "✅" if info["available"] else "❌"
        print(f"  {status} {name}: {info['description']}")
        if not info["available"]:
            print(f"    Requirements: {', '.join(info['requirements'])}")

    # Show individual tools
    all_tools = get_all_tool_names()
    print(f"\n🔧 Individual Tools ({len(all_tools)} available):")
    for tool_name in sorted(all_tools):
        toolset = get_toolset_for_tool(tool_name)
        print(f"  📌 {tool_name} (from {toolset})")

    print("\n💡 Usage Examples:")
    print("  # Use predefined toolsets")
    print("  python run_agent.py --enabled_toolsets=research --query='search for Python news'")
    print("  python run_agent.py --enabled_toolsets=development --query='debug this code'")
    print("  python run_agent.py --enabled_toolsets=safe --query='analyze without terminal'")
    print("  ")
    print("  # Combine multiple toolsets")
    print("  python run_agent.py --enabled_toolsets=web,vision --query='analyze website'")
    print("  ")
    print("  # Disable toolsets")
    print("  python run_agent.py --disabled_toolsets=terminal --query='no command execution'")
    print("  ")
    print("  # Run with trajectory saving enabled")
    print("  python run_agent.py --save_trajectories --query='your question here'")


def main(
    query: str = None,
    model: str = "anthropic/claude-opus-4.6",
    api_key: str = None,
    base_url: str = "https://openrouter.ai/api/v1",
    max_turns: int = 10,
    enabled_toolsets: str = None,
    disabled_toolsets: str = None,
    list_tools: bool = False,
    save_trajectories: bool = False,
    save_sample: bool = False,
    verbose: bool = False,
    log_prefix_chars: int = 20
):
    """
    Main function for running the agent directly.

    Args:
        query (str): Natural language query for the agent. Defaults to Python 3.13 example.
        model (str): Model name to use (OpenRouter format: provider/model). Defaults to anthropic/claude-opus-4.6.
        api_key (str): API key for authentication. Uses OPENROUTER_API_KEY env var if not provided.
        base_url (str): Base URL for the model API. Defaults to https://openrouter.ai/api/v1
        max_turns (int): Maximum number of API call iterations. Defaults to 10.
        enabled_toolsets (str): Comma-separated list of toolsets to enable. Supports predefined
                              toolsets (e.g., "research", "development", "safe").
                              Multiple toolsets can be combined: "web,vision"
        disabled_toolsets (str): Comma-separated list of toolsets to disable (e.g., "terminal")
        list_tools (bool): Just list available tools and exit
        save_trajectories (bool): Save conversation trajectories to JSONL files (appends to trajectory_samples.jsonl). Defaults to False.
        save_sample (bool): Save a single trajectory sample to a UUID-named JSON file (sample_<id>.json) for inspection. Defaults to False.
        verbose (bool): Enable verbose logging for debugging. Defaults to False.
        log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses. Defaults to 20.

    Toolset Examples:
        - "research": Web search, extract, crawl + vision tools
    """
    print("🤖 AI Agent with Tool Calling")
    print("=" * 50)

    # Handle tool listing
    if list_tools:
        _cli_print_tool_listing()
        return

    # Parse toolset selection arguments. Fire hands "a,b" over as a tuple, so
    # the helper accepts both strings and sequences.
    enabled_toolsets_list = _cli_parse_toolsets(enabled_toolsets)
    disabled_toolsets_list = _cli_parse_toolsets(disabled_toolsets)

    if enabled_toolsets_list:
        print(f"🎯 Enabled toolsets: {enabled_toolsets_list}")

    if disabled_toolsets_list:
        print(f"🚫 Disabled toolsets: {disabled_toolsets_list}")

    if save_trajectories:
        print("💾 Trajectory saving: ENABLED")
        print("   - Successful conversations → trajectory_samples.jsonl")
        print("   - Failed conversations → failed_trajectories.jsonl")

    # Initialize agent with provided parameters
    try:
        agent = AIAgent(
            base_url=base_url,
            model=model,
            api_key=api_key,
            max_iterations=max_turns,
            enabled_toolsets=enabled_toolsets_list,
            disabled_toolsets=disabled_toolsets_list,
            save_trajectories=save_trajectories,
            verbose_logging=verbose,
            log_prefix_chars=log_prefix_chars
        )
    except RuntimeError as e:
        print(f"❌ Failed to initialize agent: {e}")
        return

    # Use provided query or default to Python 3.13 example
    if query is None:
        user_query = (
            "Tell me about the latest developments in Python 3.13 and what new features "
            "developers should know about. Please search for current information and try it out."
        )
    else:
        user_query = query

    print(f"\n📝 User Query: {user_query}")
    print("\n" + "=" * 50)

    # Run conversation
    result = agent.run_conversation(user_query)

    print("\n" + "=" * 50)
    print("📋 CONVERSATION SUMMARY")
    print("=" * 50)
    print(f"✅ Completed: {result['completed']}")
    print(f"📞 API Calls: {result['api_calls']}")
    print(f"💬 Messages: {len(result['messages'])}")

    if result['final_response']:
        print("\n🎯 FINAL RESPONSE:")
        print("-" * 30)
        print(result['final_response'])

    # Save sample trajectory to UUID-named file if requested
    if save_sample:
        sample_id = str(uuid.uuid4())[:8]
        sample_filename = f"sample_{sample_id}.json"

        # Convert messages to trajectory format (same as batch_runner)
        trajectory = agent._convert_to_trajectory_format(
            result['messages'],
            user_query,
            result['completed']
        )

        entry = {
            "conversations": trajectory,
            "timestamp": datetime.now().isoformat(),
            "model": model,
            "completed": result['completed'],
            "query": user_query
        }

        try:
            # Serialize before opening the file so a serialization failure
            # does not leave an empty/truncated sample file behind.
            payload = json.dumps(entry, ensure_ascii=False, indent=2)
            with open(sample_filename, "w", encoding="utf-8") as f:
                f.write(payload)
            print(f"\n💾 Sample trajectory saved to: {sample_filename}")
        except Exception as e:
            # Best-effort: a failed sample dump must not abort the CLI run.
            print(f"\n⚠️ Failed to save sample: {e}")

    print("\n👋 Agent execution completed!")


# Script entry point: hand main() to Python Fire so its keyword parameters can
# be supplied as command-line flags (e.g. --query=..., --enabled_toolsets=...).
if __name__ == "__main__":
    fire.Fire(main)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								#!/usr/bin/env python3
 								"""
 								AI Agent Runner with Tool Calling
 								This module provides a clean, standalone agent that can execute AI models
 								with tool calling capabilities. It handles the conversation loop, tool execution,
 								and response management.
 								Features:
 								- Automatic tool calling loop until completion
 								- Configurable model parameters
 								- Error handling and recovery
 								- Message history management
 								- Support for multiple model providers
 								Usage:
 								    from run_agent import AIAgent
 								    agent = AIAgent(base_url="http://localhost:30000/v1", model="claude-opus-4-20250514")
 								    response = agent.run_conversation("Tell me about the latest Python updates")
 								"""
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								import atexit
-												fix: convert anthropic image content blocks

											
										
										
											2026-03-14 23:21:09 -07:00
+								import asyncio
 								import base64
-												feat: concurrent tool execution with ThreadPoolExecutor

When the model returns multiple tool calls in a single response, they are
now executed concurrently using a thread pool instead of sequentially.
This significantly reduces wall-clock time when multiple independent tools
are batched (e.g. parallel web_search, read_file, terminal calls).

Architecture:
- _execute_tool_calls() dispatches to sequential or concurrent path
- Single tool calls and batches containing 'clarify' use sequential path
- Multiple non-interactive tools use ThreadPoolExecutor (max 8 workers)
- Results are collected and appended to messages in original order
- _invoke_tool() extracted as shared tool invocation helper

Safety:
- Pre-flight interrupt check skips all tools if interrupted
- Per-tool exception handling: one failure doesn't crash the batch
- Result truncation (100k char limit) applied per tool
- Budget pressure injection after all tools complete
- Checkpoints taken before file-mutating tools
- CLI spinner shows batch progress, then per-tool completion messages

Tests: 10 new tests covering dispatch logic, ordering, error handling,
interrupt behavior, truncation, and _invoke_tool routing.

											
										
										
											2026-03-13 02:51:51 -07:00
+								import concurrent.futures
-												Implement Anthropic prompt caching for Claude models via OpenRouter

- Introduced a caching strategy that reduces input token costs by ~75% on multi-turn conversations by caching the conversation prefix.
- Added functions to apply cache control markers to messages, enhancing efficiency in token usage.
- Updated AIAgent to auto-enable prompt caching for Claude models, with configurable cache TTL.
- Enhanced logging to track cache hit statistics when caching is active, improving monitoring of token usage.

											
										
										
											2026-02-10 06:49:41 +00:00
+								import copy
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								import hashlib
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								import json
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								import logging
-												refactor: implement structured logging across multiple modules

- Introduced logging functionality in cli.py, run_agent.py, scheduler.py, and various tool modules to replace print statements with structured logging.
- Enhanced error handling and informational messages to improve debugging and monitoring capabilities.
- Ensured consistent logging practices across the codebase, facilitating better traceability and maintenance.

											
										
										
											2026-02-21 03:11:11 -08:00
+								logger = logging.getLogger(__name__)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								import os
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								import random
-												Cleanup time!

											
										
										
											2026-02-20 23:23:32 -08:00
+								import re
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								import sys
-												fix: convert anthropic image content blocks

											
										
										
											2026-03-14 23:21:09 -07:00
+								import tempfile
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								import time
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								import threading
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								import weakref
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								from types import SimpleNamespace
-												Enhance session logging and interactive sudo support

- Implemented automatic session logging, saving conversation trajectories to the `logs/` directory in JSON format, with each session having a unique identifier.
- Updated the CLI to display the session ID in the welcome banner for easy reference.
- Introduced an interactive sudo password prompt in CLI mode, allowing users to enter their password with a 45-second timeout, enhancing user experience during command execution.
- Documented session logging and interactive sudo features in `README.md`, `cli.md`, and `cli-config.yaml.example` for better user guidance.

											
										
										
											2026-02-01 15:36:26 -08:00
+								import uuid
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								from typing import List, Dict, Any, Optional
 								from openai import OpenAI
 								import fire
 								from datetime import datetime
-												Add environment configuration file, restructure tool imports, and enhance README setup instructions

											
										
										
											2025-10-01 09:54:17 +00:00
+								from pathlib import Path
-												fix(config): reload .env over stale shell overrides

Hermes startup entrypoints now load ~/.hermes/.env and project fallback env files with user config taking precedence over stale shell-exported values. This makes model/provider/base URL changes in .env actually take effect after restarting Hermes. Adds a shared env loader plus regression coverage, and reproduces the original bug case where OPENAI_BASE_URL and HERMES_INFERENCE_PROVIDER remained stuck on old shell values before import.

											
										
										
											2026-03-15 06:46:28 -07:00
+								# Load .env from ~/.hermes/.env first, then project root as dev fallback.
 								# User-managed env files should override stale shell exports on restart.
 								from hermes_cli.env_loader import load_hermes_dotenv
-												Add environment configuration file, restructure tool imports, and enhance README setup instructions

											
										
										
											2025-10-01 09:54:17 +00:00
-												fix: resolve .env and config paths from HERMES_HOME, not PROJECT_ROOT

The `hermes` CLI entry point (hermes_cli/main.py) and the agent runner
(run_agent.py) only loaded .env from the project installation directory.
After the standard installer, code lives at ~/.hermes/hermes-agent/ but
config lives at ~/.hermes/ — so the .env was never found.

Aligns these entry points with the pattern already used by gateway/run.py
and rl_cli.py: load ~/.hermes/.env first, fall back to project root .env
for dev-mode compatibility.

Also fixes:
- status.py checking .env existence and API keys at PROJECT_ROOT
- doctor.py KeyError on tool availability (missing_vars vs env_vars)
- doctor.py checking logs/ and Skills Hub at PROJECT_ROOT instead of HERMES_HOME
- doctor.py redundant logs/ check (already covered by subdirectory loop)
- mini-swe-agent loading config from platformdirs default instead of ~/.hermes/

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

											
										
										
											2026-02-26 16:49:14 +11:00
+								_hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
 								_project_env = Path(__file__).parent / '.env'
-												fix(config): reload .env over stale shell overrides

Hermes startup entrypoints now load ~/.hermes/.env and project fallback env files with user config taking precedence over stale shell-exported values. This makes model/provider/base URL changes in .env actually take effect after restarting Hermes. Adds a shared env loader plus regression coverage, and reproduces the original bug case where OPENAI_BASE_URL and HERMES_INFERENCE_PROVIDER remained stuck on old shell values before import.

											
										
										
											2026-03-15 06:46:28 -07:00
+								_loaded_env_paths = load_hermes_dotenv(hermes_home=_hermes_home, project_env=_project_env)
 								if _loaded_env_paths:
 								    for _env_path in _loaded_env_paths:
 								        logger.info("Loaded environment variables from %s", _env_path)
-												refactor: implement structured logging across multiple modules

- Introduced logging functionality in cli.py, run_agent.py, scheduler.py, and various tool modules to replace print statements with structured logging.
- Enhanced error handling and informational messages to improve debugging and monitoring capabilities.
- Ensured consistent logging practices across the codebase, facilitating better traceability and maintenance.

											
										
										
											2026-02-21 03:11:11 -08:00
+								else:
-												fix: resolve .env and config paths from HERMES_HOME, not PROJECT_ROOT

The `hermes` CLI entry point (hermes_cli/main.py) and the agent runner
(run_agent.py) only loaded .env from the project installation directory.
After the standard installer, code lives at ~/.hermes/hermes-agent/ but
config lives at ~/.hermes/ — so the .env was never found.

Aligns these entry points with the pattern already used by gateway/run.py
and rl_cli.py: load ~/.hermes/.env first, fall back to project root .env
for dev-mode compatibility.

Also fixes:
- status.py checking .env existence and API keys at PROJECT_ROOT
- doctor.py KeyError on tool availability (missing_vars vs env_vars)
- doctor.py checking logs/ and Skills Hub at PROJECT_ROOT instead of HERMES_HOME
- doctor.py redundant logs/ check (already covered by subdirectory loop)
- mini-swe-agent loading config from platformdirs default instead of ~/.hermes/

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

											
										
										
											2026-02-26 16:49:14 +11:00
+								    logger.info("No .env file found. Using system environment variables.")
 								# Point mini-swe-agent at ~/.hermes/ so it shares our config
 								os.environ.setdefault("MSWEA_GLOBAL_CONFIG_DIR", str(_hermes_home))
 								os.environ.setdefault("MSWEA_SILENT_STARTUP", "1")
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
 								# Import our tool system
 								from model_tools import get_tool_definitions, handle_function_call, check_toolset_requirements
-												feat: enhance interrupt handling and container resource configuration

- Introduced a shared interrupt signaling mechanism to allow tools to check for user interrupts during long-running operations.
- Updated the AIAgent to handle interrupts more effectively, ensuring in-progress tool calls are canceled and multiple interrupt messages are combined into one prompt.
- Enhanced the CLI configuration to include container resource limits (CPU, memory, disk) and persistence options for Docker, Singularity, and Modal environments.
- Improved documentation to clarify interrupt behaviors and container resource settings, providing users with better guidance on configuration and usage.

											
										
										
											2026-02-23 02:11:33 -08:00
+								from tools.terminal_tool import cleanup_vm
 								from tools.interrupt import set_interrupt as _set_interrupt
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								from tools.browser_tool import cleanup_browser
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced the `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
+								import requests
-												Cleanup time!

											
										
										
											2026-02-20 23:23:32 -08:00
+								from hermes_constants import OPENROUTER_BASE_URL, OPENROUTER_MODELS_URL
-												Refactor Terminal and AIAgent cleanup

											
										
										
											2026-02-21 22:31:43 -08:00
+								# Agent internals extracted to agent/ package for modularity
-												Enhance agent guidance with memory and session search tools

- Introduced MEMORY_GUIDANCE and SESSION_SEARCH_GUIDANCE to improve agent's contextual awareness and proactive assistance.
- Updated AIAgent to conditionally include tool-aware guidance in prompts based on available tools.
- Enhanced descriptions in memory and session search schemas for clearer user instructions on when to utilize these features.

											
										
										
											2026-02-22 02:31:52 -08:00
+								from agent.prompt_builder import (
 								    DEFAULT_AGENT_IDENTITY, PLATFORM_HINTS,
-												feat: introduce skills management features in AIAgent and CLI

- Added skills configuration options in cli-config.yaml.example, including a nudge interval for skill creation reminders.
- Implemented skills guidance in AIAgent to prompt users to save reusable workflows after complex tasks.
- Enhanced skills indexing in the prompt builder to include descriptions from SKILL.md files for better context.
- Updated the agent's behavior to periodically remind users about potential skills during tool-calling iterations.

											
										
										
											2026-02-22 13:28:13 -08:00
+								    MEMORY_GUIDANCE, SESSION_SEARCH_GUIDANCE, SKILLS_GUIDANCE,
-												Enhance agent guidance with memory and session search tools

- Introduced MEMORY_GUIDANCE and SESSION_SEARCH_GUIDANCE to improve agent's contextual awareness and proactive assistance.
- Updated AIAgent to conditionally include tool-aware guidance in prompts based on available tools.
- Enhanced descriptions in memory and session search schemas for clearer user instructions on when to utilize these features.

											
										
										
											2026-02-22 02:31:52 -08:00
+								)
-												Refactor Terminal and AIAgent cleanup

											
										
										
											2026-02-21 22:31:43 -08:00
+								from agent.model_metadata import (
 								    fetch_model_metadata, get_model_context_length,
 								    estimate_tokens_rough, estimate_messages_tokens_rough,
-												feat: smart context length probing with persistent caching + banner display

Replaces the unsafe 128K fallback for unknown models with a descending
probe strategy (2M → 1M → 512K → 200K → 128K → 64K → 32K). When a
context-length error occurs, the agent steps down tiers and retries.
The discovered limit is cached per model+provider combo in
~/.hermes/context_length_cache.yaml so subsequent sessions skip probing.

Also parses API error messages to extract the actual context limit
(e.g. 'maximum context length is 32768 tokens') for instant resolution.

The CLI banner now displays the context window size next to the model
name (e.g. 'claude-opus-4 · 200K context · Nous Research').

Changes:
- agent/model_metadata.py: CONTEXT_PROBE_TIERS, persistent cache
  (save/load/get), parse_context_limit_from_error(), get_next_probe_tier()
- agent/context_compressor.py: accepts base_url, passes to metadata
- run_agent.py: step-down logic in context error handler, caches on success
- cli.py + hermes_cli/banner.py: context length in welcome banner
- tests: 22 new tests for probing, parsing, and caching

Addresses #132. PR #319's approach (8K default) rejected — too conservative.

											
										
										
											2026-03-05 16:09:57 -08:00
+								    get_next_probe_tier, parse_context_limit_from_error,
 								    save_context_length,
-												Refactor Terminal and AIAgent cleanup

											
										
										
											2026-02-21 22:31:43 -08:00
+								)
 								from agent.context_compressor import ContextCompressor
 								from agent.prompt_caching import apply_anthropic_cache_control
 								from agent.prompt_builder import build_skills_system_prompt, build_context_files_prompt
 								from agent.display import (
 								    KawaiiSpinner, build_tool_preview as _build_tool_preview,
 								    get_cute_tool_message as _get_cute_tool_message_impl,
-												feat(logging): implement persistent error logging for tool failures

- Introduce a separate error log for capturing warnings and errors related to tool execution, ensuring detailed inspection of issues post-failure.
- Enhance error handling in the AIAgent class to log exceptions with stack traces for better debugging.
- Add a similar error logging mechanism in the gateway to streamline debugging processes.

											
										
										
											2026-02-28 22:49:58 -08:00
+								    _detect_tool_failure,
-												feat(tools): centralize tool emoji metadata in registry + skin integration

- Add 'emoji' field to ToolEntry and 'get_emoji()' to ToolRegistry
- Add emoji= to all 50+ registry.register() calls across tool files
- Add get_tool_emoji() helper in agent/display.py with 3-tier resolution:
  skin override → registry default → hardcoded fallback
- Replace hardcoded emoji maps in run_agent.py, delegate_tool.py, and
  gateway/run.py with centralized get_tool_emoji() calls
- Add 'tool_emojis' field to SkinConfig so skins can override per-tool
  emojis (e.g. ares skin could use swords instead of wrenches)
- Add 11 tests (5 registry emoji, 6 display/skin integration)
- Update AGENTS.md skin docs table

Based on the approach from PR #1061 by ForgingAlex (emoji centralization
in registry). This salvage fixes several issues from the original:
- Does NOT split the cronjob tool (which would crash on missing schemas)
- Does NOT change image_generate toolset/requires_env/is_async
- Does NOT delete existing tests
- Completes the centralization (gateway/run.py was missed)
- Hooks into the skin system for full customizability

											
										
										
											2026-03-15 20:21:21 -07:00
+								    get_tool_emoji as _get_tool_emoji,
-												Refactor Terminal and AIAgent cleanup

											
										
										
											2026-02-21 22:31:43 -08:00
+								)
 								from agent.trajectory import (
 								    convert_scratchpad_to_think, has_incomplete_scratchpad,
 								    save_trajectory as _save_trajectory_to_file,
-												Add platform-specific formatting hints and identity for AIAgent

- Introduced a default agent identity prompt to ensure consistent behavior across platforms.
- Added platform-specific formatting hints for CLI, WhatsApp, Telegram, and Discord to guide the agent's output style.
- Updated the AIAgent initialization to accept a platform parameter, enhancing adaptability to different interfaces.

											
										
										
											2026-02-12 16:11:16 -08:00
+								)
-												fix: reuse shared atomic session log helper

											
										
										
											2026-03-14 02:56:13 -07:00
+								from utils import atomic_json_write
-												Add platform-specific formatting hints and identity for AIAgent

- Introduced a default agent identity prompt to ensure consistent behavior across platforms.
- Added platform-specific formatting hints for CLI, WhatsApp, Telegram, and Discord to guide the agent's output style.
- Updated the AIAgent initialization to accept a platform parameter, enhancing adaptability to different interfaces.

											
										
										
											2026-02-12 16:11:16 -08:00
-												fix(honcho): harden tool gating and migration peer routing

Prevent stale Honcho tool exposure in context/local modes, restore reliable async write retry behavior, and ensure SOUL.md migration uploads target the AI peer instead of the user peer. Also align Honcho CLI key checks with host-scoped apiKey resolution and lock the fixes with regression tests.

Made-with: Cursor

											
										
										
											2026-03-11 18:21:27 -04:00
# Names of the Honcho-backed tools, grouped so they can be tested for as a set.
# NOTE(review): presumably used to gate Honcho tool exposure — confirm at use sites.
HONCHO_TOOL_NAMES = {
    "honcho_context",
    "honcho_profile",
    "honcho_search",
    "honcho_conclude",
}
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
-												fix: guard all print() calls against OSError with _SafeWriter

When hermes-agent runs as a systemd service, Docker container, or
headless daemon, the stdout pipe can become unavailable (idle timeout,
buffer exhaustion, socket reset). Any print() call then raises
OSError: [Errno 5] Input/output error, crashing run_conversation()
and causing cron jobs to fail.

Rather than wrapping individual print() calls (68 in run_conversation
alone), this adds a transparent _SafeWriter wrapper installed once at
the start of run_conversation(). It delegates all writes to the real
stdout and silently catches OSError. Zero overhead on the happy path,
comprehensive coverage of all print calls including future ones.

Fixes #845

Co-authored-by: J0hnLawMississippi <J0hnLawMississippi@users.noreply.github.com>

											
										
										
											2026-03-11 09:19:10 -07:00
class _SafeWriter:
    """Transparent stdio wrapper that catches OSError from broken pipes.

    When hermes-agent runs as a systemd service, Docker container, or headless
    daemon, the stdout/stderr pipe can become unavailable (idle timeout, buffer
    exhaustion, socket reset). Any print() call then raises
    ``OSError: [Errno 5] Input/output error``, which can crash agent setup or
    run_conversation() — especially via double-fault when an except handler
    also tries to print.

    This wrapper delegates all writes to the underlying stream and silently
    catches OSError. It is transparent when the wrapped stream is healthy.
    """

    __slots__ = ("_inner",)

    def __init__(self, inner):
        """Wrap *inner*, the real stream object that receives all calls."""
        object.__setattr__(self, "_inner", inner)

    def write(self, data):
        """Write *data* to the wrapped stream.

        On OSError the failure is swallowed and a plausible "characters
        written" count is returned: ``len(data)`` for ``str`` input, ``0``
        for anything else.
        """
        try:
            return self._inner.write(data)
        except OSError:
            return len(data) if isinstance(data, str) else 0

    def flush(self):
        """Flush the wrapped stream, ignoring OSError."""
        try:
            self._inner.flush()
        except OSError:
            pass

    def fileno(self):
        """Return the wrapped stream's file descriptor (errors propagate)."""
        return self._inner.fileno()

    def isatty(self):
        """Return the wrapped stream's ``isatty()``; ``False`` on OSError."""
        try:
            return self._inner.isatty()
        except OSError:
            return False

    def __getattr__(self, name):
        """Delegate every other attribute lookup to the wrapped stream."""
        # ``__getattr__`` only runs when normal lookup fails.  If the
        # ``_inner`` slot itself is unset (instance built via ``__new__``,
        # e.g. by ``copy``), ``self._inner`` below would re-enter this method
        # forever; fail with AttributeError instead of RecursionError.
        if name == "_inner":
            raise AttributeError(name)
        return getattr(self._inner, name)
-												fix: guard init-time stdio writes

											
										
										
											2026-03-14 02:19:46 -07:00
def _install_safe_stdio() -> None:
    """Wrap stdout/stderr so best-effort console output cannot crash the agent.

    Each of ``sys.stdout`` and ``sys.stderr`` is replaced by a ``_SafeWriter``
    around the current stream.  Streams that are missing/``None`` or already
    wrapped are left alone, so calling this repeatedly is harmless.
    """
    for stream_name in ("stdout", "stderr"):
        stream = getattr(sys, stream_name, None)
        if stream is None or isinstance(stream, _SafeWriter):
            continue
        setattr(sys, stream_name, _SafeWriter(stream))
-												feat: shared iteration budget across parent + subagents

Subagent tool calls now count toward the same session-wide iteration
limit as the parent agent. Previously, each subagent had its own
independent counter, so a parent with max_iterations=60 could spawn
3 subagents each doing 50 calls = 150 total tool calls unmetered.

Changes:
- IterationBudget: thread-safe shared counter (run_agent.py)
  - consume(): try to use one iteration, returns False if exhausted
  - refund(): give back one iteration (for execute_code turns)
  - Thread-safe via Lock (subagents run in ThreadPoolExecutor)
- Parent creates the budget, children inherit it via delegate_tool.py
- execute_code turns are refunded (don't count against budget)
- Default raised from 60 → 90 to account for shared consumption
- Per-child cap (50) still applies as a safety valve

The per-child max_iterations (default 50) remains as a per-child
ceiling, but the shared budget is the hard session-wide limit.
A child stops at whichever comes first.

											
										
										
											2026-03-07 08:16:37 -08:00
class IterationBudget:
    """Thread-safe shared iteration counter for parent and child agents.

    Tracks total LLM-call iterations consumed across a parent agent and all
    its subagents.  A single ``IterationBudget`` is created by the parent
    and passed to every child so they share the same cap.

    ``execute_code`` (programmatic tool calling) iterations are refunded via
    :meth:`refund` so they don't eat into the budget.
    """

    def __init__(self, max_total: int):
        """Create a budget allowing at most *max_total* consumed iterations."""
        self.max_total = max_total
        self._used = 0
        self._lock = threading.Lock()

    def consume(self) -> bool:
        """Try to consume one iteration.  Returns True if allowed."""
        with self._lock:
            if self._used >= self.max_total:
                return False
            self._used += 1
            return True

    def refund(self) -> None:
        """Give back one iteration (e.g. for execute_code turns).

        The counter never drops below zero.
        """
        with self._lock:
            if self._used > 0:
                self._used -= 1

    @property
    def used(self) -> int:
        """Number of iterations currently consumed."""
        # Read under the same lock as every other accessor so all views of
        # the counter are taken consistently.
        with self._lock:
            return self._used

    @property
    def remaining(self) -> int:
        """Iterations still available; never negative."""
        with self._lock:
            return max(0, self.max_total - self._used)
-												feat: concurrent tool execution with ThreadPoolExecutor

When the model returns multiple tool calls in a single response, they are
now executed concurrently using a thread pool instead of sequentially.
This significantly reduces wall-clock time when multiple independent tools
are batched (e.g. parallel web_search, read_file, terminal calls).

Architecture:
- _execute_tool_calls() dispatches to sequential or concurrent path
- Single tool calls and batches containing 'clarify' use sequential path
- Multiple non-interactive tools use ThreadPoolExecutor (max 8 workers)
- Results are collected and appended to messages in original order
- _invoke_tool() extracted as shared tool invocation helper

Safety:
- Pre-flight interrupt check skips all tools if interrupted
- Per-tool exception handling: one failure doesn't crash the batch
- Result truncation (100k char limit) applied per tool
- Budget pressure injection after all tools complete
- Checkpoints taken before file-mutating tools
- CLI spinner shows batch progress, then per-tool completion messages

Tests: 10 new tests covering dispatch logic, ordering, error handling,
interrupt behavior, truncation, and _invoke_tool routing.

											
										
										
											2026-03-13 02:51:51 -07:00
# Tools that must never run concurrently (interactive / user-facing).
# When any of these appear in a batch, we fall back to sequential execution.
_NEVER_PARALLEL_TOOLS = frozenset({"clarify"})

# Maximum number of concurrent worker threads for parallel tool execution.
_MAX_TOOL_WORKERS = 8
-												fix: keep honcho recall out of cached system prefix (#1201)

Attach later-turn Honcho recall to the current-turn user message at API
call time instead of appending it to the system prompt. This preserves the
stable system-prefix cache while keeping Honcho continuity context
available for the turn.

Also adds regression coverage for the injection helper and for continuing
sessions so Honcho recall stays out of the system prompt.
											
										
										
											2026-03-13 21:07:00 -07:00
def _inject_honcho_turn_context(content, turn_context: str):
    """Append Honcho recall to the current-turn user message without mutating history.

    The returned content is sent to the API for this turn only. Keeping Honcho
    recall out of the system prompt preserves the stable cache prefix while
    still giving the model continuity context.

    Args:
        content: The user message content: a string, a list of content
            parts, ``None``, or any other value (which is stringified).
        turn_context: The recalled text to attach. When falsy, *content* is
            returned unchanged.

    Returns:
        A new list with a trailing ``{"type": "text", ...}`` part when
        *content* is a list (the input list is not modified); otherwise a
        string — the note alone when *content* is ``None`` or blank, else the
        original text followed by a blank line and the note.
    """
    if not turn_context:
        return content

    note = (
        "[System note: The following Honcho memory was retrieved from prior "
        "sessions. It is continuity context for this turn only, not new user "
        "input.]\n\n"
        f"{turn_context}"
    )

    if isinstance(content, list):
        # Copy so the caller's list is left untouched.
        return [*content, {"type": "text", "text": note}]

    text = "" if content is None else str(content)
    if not text.strip():
        return note
    return f"{text}\n\n{note}"
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								class AIAgent:
 								    """
 								    AI Agent with tool calling capabilities.
 								    This class manages the conversation flow, tool execution, and response handling
 								    for AI models that support function calling.
 								    """
 								    def __init__(
-												fix leakage

											
										
										
											2025-11-03 17:42:23 -05:00
+								        self,
 								        base_url: str = None,
 								        api_key: str = None,
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								        provider: str = None,
 								        api_mode: str = None,
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								        model: str = "anthropic/claude-opus-4.6",  # OpenRouter format
-												feat: shared iteration budget across parent + subagents

Subagent tool calls now count toward the same session-wide iteration
limit as the parent agent. Previously, each subagent had its own
independent counter, so a parent with max_iterations=60 could spawn
3 subagents each doing 50 calls = 150 total tool calls unmetered.

Changes:
- IterationBudget: thread-safe shared counter (run_agent.py)
  - consume(): try to use one iteration, returns False if exhausted
  - refund(): give back one iteration (for execute_code turns)
  - Thread-safe via Lock (subagents run in ThreadPoolExecutor)
- Parent creates the budget, children inherit it via delegate_tool.py
- execute_code turns are refunded (don't count against budget)
- Default raised from 60 → 90 to account for shared consumption
- Per-child cap (50) still applies as a safety valve

The per-child max_iterations (default 50) remains as a per-child
ceiling, but the shared budget is the hard session-wide limit.
A child stops at whichever comes first.

											
										
										
											2026-03-07 08:16:37 -08:00
+								        max_iterations: int = 90,  # Default tool-calling iterations (shared with subagents)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        tool_delay: float = 1.0,
 								        enabled_toolsets: List[str] = None,
 								        disabled_toolsets: List[str] = None,
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								        save_trajectories: bool = False,
-												Add ephemeral system prompt support in batch and agent runners. Update README with usage examples and documentation for the new feature. Ensure prompt is not saved to trajectories.

											
										
										
											2025-10-08 02:33:58 +00:00
+								        verbose_logging: bool = False,
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								        quiet_mode: bool = False,
-												add logging of prefix of tool call and tool response

											
										
										
											2025-11-07 14:43:44 -05:00
+								        ephemeral_system_prompt: str = None,
 								        log_prefix_chars: int = 100,
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
+								        log_prefix: str = "",
 								        providers_allowed: List[str] = None,
 								        providers_ignored: List[str] = None,
 								        providers_order: List[str] = None,
 								        provider_sort: str = None,
-												feat(provider-routing): add OpenRouter provider routing configuration

Introduced a new `provider_routing` section in the CLI configuration to control how requests are routed across providers when using OpenRouter. This includes options for sorting providers by throughput, latency, or price, as well as allowing or ignoring specific providers, setting the order of provider attempts, and managing data collection policies. Updated relevant classes and documentation to support these features, enhancing flexibility in provider selection.

											
										
										
											2026-03-01 18:24:27 -08:00
+								        provider_require_parameters: bool = False,
 								        provider_data_collection: str = None,
-												Enhance session logging and interactive sudo support

- Implemented automatic session logging, saving conversation trajectories to the `logs/` directory in JSON format, with each session having a unique identifier.
- Updated the CLI to display the session ID in the welcome banner for easy reference.
- Introduced an interactive sudo password prompt in CLI mode, allowing users to enter their password with a 45-second timeout, enhancing user experience during command execution.
- Documented session logging and interactive sudo features in `README.md`, `cli.md`, and `cli-config.yaml.example` for better user guidance.

											
										
										
											2026-02-01 15:36:26 -08:00
+								        session_id: str = None,
-												Add tool progress notifications for messaging channels

- Introduced a new callback mechanism in the AIAgent class to send tool progress messages during execution, enhancing user feedback in messaging platforms.
- Updated the GatewayRunner to support tool progress notifications, allowing users to enable or disable this feature via environment variables.
- Enhanced the CLI setup wizard to prompt users for enabling tool progress messages and selecting the notification mode (all or new), improving configuration options.
- Updated relevant documentation to reflect the new features and configuration settings for tool progress notifications.

											
										
										
											2026-02-03 14:54:43 -08:00
+								        tool_progress_callback: callable = None,
-												fix: stop terminal border flashing with steady cursor and TUI spinner widget

Cherry-picked and improved from PR #470 (fixes #464).

Problem: On Ubuntu 24.04 with ghostty + tmux, the prompt input box
border lines flash due to cursor blink and raw spinner terminal writes
conflicting with prompt_toolkit's rendering.

Changes:
- cli.py: Add CursorShape.BLOCK to Application() to disable cursor blink
- cli.py: Add thinking_callback + spinner_widget in TUI layout so
  thinking status displays as a proper prompt_toolkit widget instead of
  raw terminal writes that conflict with the TUI renderer
- run_agent.py: Add thinking_callback parameter to AIAgent; when set,
  uses the callback instead of KawaiiSpinner for thinking display

What was NOT changed (preserving existing behavior):
- agent/display.py: Untouched. KawaiiSpinner _write() stdout capture,
  _animate() logic, and 0.12s frame interval all preserved. This
  protects subagent stdout redirection and keeps smooth animations
  for non-CLI contexts (gateway, batch runner).
- Original emoji spinner types (brain/sparkle/pulse/moon/star) preserved
  for all non-CLI contexts.

Fixes from original PR #470:
- CursorShape.STEADY_BLOCK -> CursorShape.BLOCK (STEADY_BLOCK doesn't
  exist in prompt_toolkit 3.0.52)
- Removed duplicate self._spinner_text = '' line
- Removed redundant nested if-checks

Tested: 2706 tests pass, interactive CLI verified via tmux.

											
										
										
											2026-03-09 23:26:43 -07:00
+								        thinking_callback: callable = None,
-												feat(cli): add /reasoning command for effort level and display toggle

Combined implementation of reasoning management:
- /reasoning              Show current effort level and display state
- /reasoning <level>      Set reasoning effort (none, low, medium, high, xhigh)
- /reasoning show|on      Show model thinking/reasoning in output
- /reasoning hide|off     Hide model thinking/reasoning from output

Effort level changes persist to config and force agent re-init.
Display toggle updates the agent callback dynamically without re-init.

When display is enabled:
- Intermediate reasoning shown as dim [thinking] lines during tool loops
- Final reasoning shown in a bordered box above the response
- Long reasoning collapsed (5 lines intermediate, 10 lines final)

Also adds:
- reasoning_callback parameter to AIAgent
- last_reasoning in run_conversation result dict
- show_reasoning config option (display section, default: false)
- Display section in /config output
- 34 tests covering both features

Combines functionality from PR #789 and PR #790.

Co-authored-by: Aum Desai <Aum08Desai@users.noreply.github.com>
Co-authored-by: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com>

											
										
										
											2026-03-11 05:53:21 -07:00
+								        reasoning_callback: callable = None,
-												feat: introduce clarifying questions tool for interactive user engagement

- Added a new `clarify_tool` to enable the agent to ask structured multiple-choice or open-ended questions to users.
- Implemented callback functionality for user interaction, allowing the platform to handle UI presentation.
- Updated the CLI and agent to support clarify questions, including timeout handling and response management.
- Enhanced toolset definitions and requirements to include the clarify tool, ensuring availability across platforms.

											
										
										
											2026-02-19 20:06:14 -08:00
+								        clarify_callback: callable = None,
-												feat(hooks): introduce event hooks system for lifecycle management

Add a new hooks system allowing users to run custom code at key lifecycle points in the agent's operation. This includes support for events such as `gateway:startup`, `session:start`, `agent:step`, and more. Documentation for creating hooks and available events has been added to `README.md` and a new `hooks.md` file. Additionally, integrate step callbacks in the agent to facilitate hook execution during tool-calling iterations.

											
										
										
											2026-02-28 17:09:26 -08:00
+								        step_callback: callable = None,
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								        max_tokens: int = None,
 								        reasoning_config: Dict[str, Any] = None,
 								        prefill_messages: List[Dict[str, Any]] = None,
-												Add platform-specific formatting hints and identity for AIAgent

- Introduced a default agent identity prompt to ensure consistent behavior across platforms.
- Added platform-specific formatting hints for CLI, WhatsApp, Telegram, and Discord to guide the agent's output style.
- Updated the AIAgent initialization to accept a platform parameter, enhancing adaptability to different interfaces.

											
										
										
											2026-02-12 16:11:16 -08:00
+								        platform: str = None,
-												Add skip_context_files option to AIAgent for batch processing

- Introduced a new parameter `skip_context_files` in the AIAgent class to control the inclusion of context files (SOUL.md, AGENTS.md, .cursorrules) in the system prompt.
- Updated the `_process_single_prompt` function to set `skip_context_files` to `True`, preventing pollution of trajectories during batch processing and data generation.

											
										
										
											2026-02-16 22:40:31 -08:00
+								        skip_context_files: bool = False,
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), §-delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
+								        skip_memory: bool = False,
 								        session_db=None,
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								        honcho_session_key: str = None,
-												fix(honcho): enforce local mode and cache-safe warmup

											
										
										
											2026-03-10 02:05:28 -07:00
+								        honcho_manager=None,
 								        honcho_config=None,
-												feat: shared iteration budget across parent + subagents

Subagent tool calls now count toward the same session-wide iteration
limit as the parent agent. Previously, each subagent had its own
independent counter, so a parent with max_iterations=60 could spawn
3 subagents each doing 50 calls = 150 total tool calls unmetered.

Changes:
- IterationBudget: thread-safe shared counter (run_agent.py)
  - consume(): try to use one iteration, returns False if exhausted
  - refund(): give back one iteration (for execute_code turns)
  - Thread-safe via Lock (subagents run in ThreadPoolExecutor)
- Parent creates the budget, children inherit it via delegate_tool.py
- execute_code turns are refunded (don't count against budget)
- Default raised from 60 → 90 to account for shared consumption
- Per-child cap (50) still applies as a safety valve

The per-child max_iterations (default 50) remains as a per-child
ceiling, but the shared budget is the hard session-wide limit.
A child stops at whichever comes first.

											
										
										
											2026-03-07 08:16:37 -08:00
+								        iteration_budget: "IterationBudget" = None,
-												feat: simple fallback model for provider resilience

When the primary model/provider fails after retries (rate limit, overload,
auth errors, connection failures), Hermes automatically switches to a
configured fallback model for the remainder of the session.

Config (in ~/.hermes/config.yaml):

  fallback_model:
    provider: openrouter
    model: anthropic/claude-sonnet-4

Supports all major providers: OpenRouter, OpenAI, Nous, DeepSeek, Together,
Groq, Fireworks, Mistral, Gemini — plus custom endpoints via base_url and
api_key_env overrides.

Design principles:
- Dead simple: one fallback model, not a chain
- One-shot: switches once, doesn't ping-pong back
- Zero new dependencies: uses existing OpenAI client
- Minimal code: ~100 lines in run_agent.py, ~5 lines in cli.py/gateway
- Three trigger points: max retries exhausted, non-retryable client errors,
  and invalid response exhaustion

Does NOT trigger on context overflow or payload-too-large errors (those
are handled by the existing compression system).

Addresses #737.

25 new tests, 2492 total passing.

											
										
										
											2026-03-08 20:22:33 -07:00
+								        fallback_model: Dict[str, Any] = None,
-												feat: filesystem checkpoints and /rollback command

Automatic filesystem snapshots before destructive file operations,
with user-facing rollback.  Inspired by PR #559 (by @alireza78a).

Architecture:
- Shadow git repos at ~/.hermes/checkpoints/{hash}/ via GIT_DIR
- CheckpointManager: take/list/restore, turn-scoped dedup, pruning
- Transparent — the LLM never sees it, no tool schema, no tokens
- Once per turn — only first write_file/patch triggers a snapshot

Integration:
- Config: checkpoints.enabled + checkpoints.max_snapshots
- CLI flag: hermes --checkpoints
- Trigger: run_agent.py _execute_tool_calls() before write_file/patch
- /rollback slash command in CLI + gateway (list, restore by number)
- Pre-rollback snapshot auto-created on restore (undo the undo)

Safety:
- Never blocks file operations — all errors silently logged
- Skips root dir, home dir, dirs >50K files
- Disables gracefully when git not installed
- Shadow repo completely isolated from project git

Tests: 35 new tests, all passing (2798 total suite)
Docs: feature page, config reference, CLI commands reference

											
										
										
											2026-03-10 00:49:15 -07:00
+								        checkpoints_enabled: bool = False,
 								        checkpoint_max_snapshots: int = 50,
-												feat: include session ID in system prompt via --pass-session-id flag

Adds --pass-session-id CLI flag. When set, the agent's system prompt
includes the session ID:

  Conversation started: Sunday, March 08, 2026 06:32 PM
  Session ID: 20260308_183200_abc123

Usage:
  hermes --pass-session-id
  hermes chat --pass-session-id

Implementation threads the flag as a proper parameter through the full
chain (main.py → cli.py → run_agent.py) rather than using an env var,
avoiding collisions in multi-agent/multitenant setups.

Based on PR #726 by dmahan93, reworked to use instance parameter
instead of HERMES_PASS_SESSION_ID environment variable.

Co-authored-by: dmahan93 <dmahan93@users.noreply.github.com>

											
										
										
											2026-03-12 05:51:31 -07:00
+								        pass_session_id: bool = False,
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								    ):
 								        """
 								        Initialize the AI Agent.
-												fix leakage

											
										
										
											2025-11-03 17:42:23 -05:00
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        Args:
 								            base_url (str): Base URL for the model API (optional)
 								            api_key (str): API key for authentication (optional, uses env var if not provided)
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								            provider (str): Provider identifier (optional; used for telemetry/routing hints)
 								            api_mode (str): API mode override: "chat_completions" or "codex_responses"
-												Cleanup time!

											
										
										
											2026-02-20 23:23:32 -08:00
+								            model (str): Model name to use (default: "anthropic/claude-opus-4.6")
-												feat: shared iteration budget across parent + subagents

Subagent tool calls now count toward the same session-wide iteration
limit as the parent agent. Previously, each subagent had its own
independent counter, so a parent with max_iterations=60 could spawn
3 subagents each doing 50 calls = 150 total tool calls unmetered.

Changes:
- IterationBudget: thread-safe shared counter (run_agent.py)
  - consume(): try to use one iteration, returns False if exhausted
  - refund(): give back one iteration (for execute_code turns)
  - Thread-safe via Lock (subagents run in ThreadPoolExecutor)
- Parent creates the budget, children inherit it via delegate_tool.py
- execute_code turns are refunded (don't count against budget)
- Default raised from 60 → 90 to account for shared consumption
- Per-child cap (50) still applies as a safety valve

The per-child max_iterations (default 50) remains as a per-child
ceiling, but the shared budget is the hard session-wide limit.
A child stops at whichever comes first.

											
										
										
											2026-03-07 08:16:37 -08:00
+								            max_iterations (int): Maximum number of tool calling iterations (default: 90)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								            tool_delay (float): Delay between tool calls in seconds (default: 1.0)
 								            enabled_toolsets (List[str]): Only enable tools from these toolsets (optional)
 								            disabled_toolsets (List[str]): Disable tools from these toolsets (optional)
 								            save_trajectories (bool): Whether to save conversation trajectories to JSONL files (default: False)
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								            verbose_logging (bool): Enable verbose logging for debugging (default: False)
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								            quiet_mode (bool): Suppress progress output for clean CLI experience (default: False)
-												Add ephemeral system prompt support in batch and agent runners. Update README with usage examples and documentation for the new feature. Ensure prompt is not saved to trajectories.

											
										
										
											2025-10-08 02:33:58 +00:00
+								            ephemeral_system_prompt (str): System prompt used during agent execution but NOT saved to trajectories (optional)
-												Cleanup time!

											
										
										
											2026-02-20 23:23:32 -08:00
+								            log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses (default: 100)
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
+								            log_prefix (str): Prefix to add to all log messages for identification in parallel processing (default: "")
 								            providers_allowed (List[str]): OpenRouter providers to allow (optional)
 								            providers_ignored (List[str]): OpenRouter providers to ignore (optional)
 								            providers_order (List[str]): OpenRouter providers to try in order (optional)
 								            provider_sort (str): Sort providers by price/throughput/latency (optional)
-												Enhance session logging and interactive sudo support

- Implemented automatic session logging, saving conversation trajectories to the `logs/` directory in JSON format, with each session having a unique identifier.
- Updated the CLI to display the session ID in the welcome banner for easy reference.
- Introduced an interactive sudo password prompt in CLI mode, allowing users to enter their password with a 45-second timeout, enhancing user experience during command execution.
- Documented session logging and interactive sudo features in `README.md`, `cli.md`, and `cli-config.yaml.example` for better user guidance.

											
										
										
											2026-02-01 15:36:26 -08:00
+								            session_id (str): Pre-generated session ID for logging (optional, auto-generated if not provided)
-												Add tool progress notifications for messaging channels

- Introduced a new callback mechanism in the AIAgent class to send tool progress messages during execution, enhancing user feedback in messaging platforms.
- Updated the GatewayRunner to support tool progress notifications, allowing users to enable or disable this feature via environment variables.
- Enhanced the CLI setup wizard to prompt users for enabling tool progress messages and selecting the notification mode (all or new), improving configuration options.
- Updated relevant documentation to reflect the new features and configuration settings for tool progress notifications.

											
										
										
											2026-02-03 14:54:43 -08:00
+								            tool_progress_callback (callable): Callback function(tool_name, args_preview) for progress notifications
-												feat: introduce clarifying questions tool for interactive user engagement

- Added a new `clarify_tool` to enable the agent to ask structured multiple-choice or open-ended questions to users.
- Implemented callback functionality for user interaction, allowing the platform to handle UI presentation.
- Updated the CLI and agent to support clarify questions, including timeout handling and response management.
- Enhanced toolset definitions and requirements to include the clarify tool, ensuring availability across platforms.

											
										
										
											2026-02-19 20:06:14 -08:00
+								            clarify_callback (callable): Callback function(question, choices) -> str for interactive user questions.
 								                Provided by the platform layer (CLI or gateway). If None, the clarify tool returns an error.
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								            max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
 								            reasoning_config (Dict): OpenRouter reasoning configuration override (e.g. {"effort": "none"} to disable thinking).
-												feat: default reasoning effort from xhigh to medium

Reduces token usage and latency for most tasks by defaulting to
medium reasoning effort instead of xhigh. Users can still override
via config or CLI flag. Updates code, tests, example config, and docs.

											
										
										
											2026-03-07 10:14:19 -08:00
+								                If None, defaults to {"enabled": True, "effort": "medium"} for OpenRouter. Set to disable/customize reasoning.
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								            prefill_messages (List[Dict]): Messages to prepend to conversation history as prefilled context.
 								                Useful for injecting a few-shot example or priming the model's response style.
 								                Example: [{"role": "user", "content": "Hi!"}, {"role": "assistant", "content": "Hello!"}]
-												Add platform-specific formatting hints and identity for AIAgent

- Introduced a default agent identity prompt to ensure consistent behavior across platforms.
- Added platform-specific formatting hints for CLI, WhatsApp, Telegram, and Discord to guide the agent's output style.
- Updated the AIAgent initialization to accept a platform parameter, enhancing adaptability to different interfaces.

											
										
										
											2026-02-12 16:11:16 -08:00
+								            platform (str): The interface platform the user is on (e.g. "cli", "telegram", "discord", "whatsapp").
 								                Used to inject platform-specific formatting hints into the system prompt.
-												Add skip_context_files option to AIAgent for batch processing

- Introduced a new parameter `skip_context_files` in the AIAgent class to control the inclusion of context files (SOUL.md, AGENTS.md, .cursorrules) in the system prompt.
- Updated the `_process_single_prompt` function to set `skip_context_files` to `True`, preventing pollution of trajectories during batch processing and data generation.

											
										
										
											2026-02-16 22:40:31 -08:00
+								            skip_context_files (bool): If True, skip auto-injection of SOUL.md, AGENTS.md, and .cursorrules
 								                into the system prompt. Use this for batch processing and data generation to avoid
 								                polluting trajectories with user-specific persona or project instructions.
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								            honcho_session_key (str): Session key for Honcho integration (e.g., "telegram:123456" or CLI session_id).
 								                When provided and Honcho is enabled in config, enables persistent cross-session user modeling.
-												fix(honcho): enforce local mode and cache-safe warmup

											
										
										
											2026-03-10 02:05:28 -07:00
+								            honcho_manager: Optional shared HonchoSessionManager owned by the caller.
 								            honcho_config: Optional HonchoClientConfig corresponding to honcho_manager.
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        """
-												fix: guard init-time stdio writes

											
										
										
											2026-03-14 02:19:46 -07:00
+								        _install_safe_stdio()
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        self.model = model
 								        self.max_iterations = max_iterations
-												feat: shared iteration budget across parent + subagents

Subagent tool calls now count toward the same session-wide iteration
limit as the parent agent. Previously, each subagent had its own
independent counter, so a parent with max_iterations=60 could spawn
3 subagents each doing 50 calls = 150 total tool calls unmetered.

Changes:
- IterationBudget: thread-safe shared counter (run_agent.py)
  - consume(): try to use one iteration, returns False if exhausted
  - refund(): give back one iteration (for execute_code turns)
  - Thread-safe via Lock (subagents run in ThreadPoolExecutor)
- Parent creates the budget, children inherit it via delegate_tool.py
- execute_code turns are refunded (don't count against budget)
- Default raised from 60 → 90 to account for shared consumption
- Per-child cap (50) still applies as a safety valve

The per-child max_iterations (default 50) remains as a per-child
ceiling, but the shared budget is the hard session-wide limit.
A child stops at whichever comes first.

											
										
										
											2026-03-07 08:16:37 -08:00
+								        # Shared iteration budget — parent creates, children inherit.
 								        # Consumed by every LLM turn across parent + all subagents.
 								        self.iteration_budget = iteration_budget or IterationBudget(max_iterations)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        self.tool_delay = tool_delay
 								        self.save_trajectories = save_trajectories
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								        self.verbose_logging = verbose_logging
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								        self.quiet_mode = quiet_mode
-												Add ephemeral system prompt support in batch and agent runners. Update README with usage examples and documentation for the new feature. Ensure prompt is not saved to trajectories.

											
										
										
											2025-10-08 02:33:58 +00:00
+								        self.ephemeral_system_prompt = ephemeral_system_prompt
-												Add platform-specific formatting hints and identity for AIAgent

- Introduced a default agent identity prompt to ensure consistent behavior across platforms.
- Added platform-specific formatting hints for CLI, WhatsApp, Telegram, and Discord to guide the agent's output style.
- Updated the AIAgent initialization to accept a platform parameter, enhancing adaptability to different interfaces.

											
										
										
											2026-02-12 16:11:16 -08:00
+								        self.platform = platform  # "cli", "telegram", "discord", "whatsapp", etc.
-												Add skip_context_files option to AIAgent for batch processing

- Introduced a new parameter `skip_context_files` in the AIAgent class to control the inclusion of context files (SOUL.md, AGENTS.md, .cursorrules) in the system prompt.
- Updated the `_process_single_prompt` function to set `skip_context_files` to `True`, preventing pollution of trajectories during batch processing and data generation.

											
										
										
											2026-02-16 22:40:31 -08:00
+								        self.skip_context_files = skip_context_files
-												feat: include session ID in system prompt via --pass-session-id flag

Adds --pass-session-id CLI flag. When set, the agent's system prompt
includes the session ID:

  Conversation started: Sunday, March 08, 2026 06:32 PM
  Session ID: 20260308_183200_abc123

Usage:
  hermes --pass-session-id
  hermes chat --pass-session-id

Implementation threads the flag as a proper parameter through the full
chain (main.py → cli.py → run_agent.py) rather than using an env var,
avoiding collisions in multi-agent/multitenant setups.

Based on PR #726 by dmahan93, reworked to use instance parameter
instead of HERMES_PASS_SESSION_ID environment variable.

Co-authored-by: dmahan93 <dmahan93@users.noreply.github.com>

											
										
										
											2026-03-12 05:51:31 -07:00
+								        self.pass_session_id = pass_session_id
-												Add logging of a prefix of each tool call and tool response

											
										
										
											2025-11-07 14:43:44 -05:00
+								        self.log_prefix_chars = log_prefix_chars
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
+								        self.log_prefix = f"{log_prefix} " if log_prefix else ""
-												Implement Anthropic prompt caching for Claude models via OpenRouter

- Introduced a caching strategy that reduces input token costs by ~75% on multi-turn conversations by caching the conversation prefix.
- Added functions to apply cache control markers to messages, enhancing efficiency in token usage.
- Updated AIAgent to auto-enable prompt caching for Claude models, with configurable cache TTL.
- Enhanced logging to track cache hit statistics when caching is active, improving monitoring of token usage.

											
										
										
											2026-02-10 06:49:41 +00:00
+								        # Store effective base URL for feature detection (prompt caching, reasoning, etc.)
 								        # When no base_url is provided, the client defaults to OpenRouter, so reflect that here.
-												Cleanup time!

											
										
										
											2026-02-20 23:23:32 -08:00
+								        self.base_url = base_url or OPENROUTER_BASE_URL
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								        provider_name = provider.strip().lower() if isinstance(provider, str) and provider.strip() else None
 								        self.provider = provider_name or "openrouter"
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								        if api_mode in {"chat_completions", "codex_responses", "anthropic_messages"}:
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								            self.api_mode = api_mode
 								        elif self.provider == "openai-codex":
 								            self.api_mode = "codex_responses"
 								        elif (provider_name is None) and "chatgpt.com/backend-api/codex" in self.base_url.lower():
 								            self.api_mode = "codex_responses"
 								            self.provider = "openai-codex"
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								        elif self.provider == "anthropic" or (provider_name is None and "api.anthropic.com" in self.base_url.lower()):
 								            self.api_mode = "anthropic_messages"
 								            self.provider = "anthropic"
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								        else:
 								            self.api_mode = "chat_completions"
-												fix: allow Anthropic API URLs as custom OpenAI-compatible endpoints

Removed the hard block on base_url containing 'api.anthropic.com'.
Anthropic now offers an OpenAI-compatible /chat/completions endpoint,
so blocking their URL prevents legitimate use. If the endpoint isn't
compatible, the API call will fail with a proper error anyway.

Removed from: run_agent.py, mini_swe_runner.py
Updated test to verify Anthropic URLs are accepted.

											
										
										
											2026-03-07 23:36:35 -08:00
-												Add tool progress notifications for messaging channels

- Introduced a new callback mechanism in the AIAgent class to send tool progress messages during execution, enhancing user feedback in messaging platforms.
- Updated the GatewayRunner to support tool progress notifications, allowing users to enable or disable this feature via environment variables.
- Enhanced the CLI setup wizard to prompt users for enabling tool progress messages and selecting the notification mode (all or new), improving configuration options.
- Updated relevant documentation to reflect the new features and configuration settings for tool progress notifications.

											
										
										
											2026-02-03 14:54:43 -08:00
+								        self.tool_progress_callback = tool_progress_callback
-												fix: stop terminal border flashing with steady cursor and TUI spinner widget

Cherry-picked and improved from PR #470 (fixes #464).

Problem: On Ubuntu 24.04 with ghostty + tmux, the prompt input box
border lines flash due to cursor blink and raw spinner terminal writes
conflicting with prompt_toolkit's rendering.

Changes:
- cli.py: Add CursorShape.BLOCK to Application() to disable cursor blink
- cli.py: Add thinking_callback + spinner_widget in TUI layout so
  thinking status displays as a proper prompt_toolkit widget instead of
  raw terminal writes that conflict with the TUI renderer
- run_agent.py: Add thinking_callback parameter to AIAgent; when set,
  uses the callback instead of KawaiiSpinner for thinking display

What was NOT changed (preserving existing behavior):
- agent/display.py: Untouched. KawaiiSpinner _write() stdout capture,
  _animate() logic, and 0.12s frame interval all preserved. This
  protects subagent stdout redirection and keeps smooth animations
  for non-CLI contexts (gateway, batch runner).
- Original emoji spinner types (brain/sparkle/pulse/moon/star) preserved
  for all non-CLI contexts.

Fixes from original PR #470:
- CursorShape.STEADY_BLOCK -> CursorShape.BLOCK (STEADY_BLOCK doesn't
  exist in prompt_toolkit 3.0.52)
- Removed duplicate self._spinner_text = '' line
- Removed redundant nested if-checks

Tested: 2706 tests pass, interactive CLI verified via tmux.

											
										
										
											2026-03-09 23:26:43 -07:00
+								        self.thinking_callback = thinking_callback
-												feat(cli): add /reasoning command for effort level and display toggle

Combined implementation of reasoning management:
- /reasoning              Show current effort level and display state
- /reasoning <level>      Set reasoning effort (none, low, medium, high, xhigh)
- /reasoning show|on      Show model thinking/reasoning in output
- /reasoning hide|off     Hide model thinking/reasoning from output

Effort level changes persist to config and force agent re-init.
Display toggle updates the agent callback dynamically without re-init.

When display is enabled:
- Intermediate reasoning shown as dim [thinking] lines during tool loops
- Final reasoning shown in a bordered box above the response
- Long reasoning collapsed (5 lines intermediate, 10 lines final)

Also adds:
- reasoning_callback parameter to AIAgent
- last_reasoning in run_conversation result dict
- show_reasoning config option (display section, default: false)
- Display section in /config output
- 34 tests covering both features

Combines functionality from PR #789 and PR #790.

Co-authored-by: Aum Desai <Aum08Desai@users.noreply.github.com>
Co-authored-by: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com>

											
										
										
											2026-03-11 05:53:21 -07:00
+								        self.reasoning_callback = reasoning_callback
-												feat: introduce clarifying questions tool for interactive user engagement

- Added a new `clarify_tool` to enable the agent to ask structured multiple-choice or open-ended questions to users.
- Implemented callback functionality for user interaction, allowing the platform to handle UI presentation.
- Updated the CLI and agent to support clarify questions, including timeout handling and response management.
- Enhanced toolset definitions and requirements to include the clarify tool, ensuring availability across platforms.

											
										
										
											2026-02-19 20:06:14 -08:00
+								        self.clarify_callback = clarify_callback
-												feat(hooks): introduce event hooks system for lifecycle management

Add a new hooks system allowing users to run custom code at key lifecycle points in the agent's operation. This includes support for events such as `gateway:startup`, `session:start`, `agent:step`, and more. Documentation for creating hooks and available events has been added to `README.md` and a new `hooks.md` file. Additionally, integrate step callbacks in the agent to facilitate hook execution during tool-calling iterations.

											
										
										
											2026-02-28 17:09:26 -08:00
+								        self.step_callback = step_callback
-												Add tool progress notifications for messaging channels

- Introduced a new callback mechanism in the AIAgent class to send tool progress messages during execution, enhancing user feedback in messaging platforms.
- Updated the GatewayRunner to support tool progress notifications, allowing users to enable or disable this feature via environment variables.
- Enhanced the CLI setup wizard to prompt users for enabling tool progress messages and selecting the notification mode (all or new), improving configuration options.
- Updated relevant documentation to reflect the new features and configuration settings for tool progress notifications.

											
										
										
											2026-02-03 14:54:43 -08:00
+								        self._last_reported_tool = None  # Track for "new tool" mode
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
-												Implement interrupt handling for the agent and CLI input, plus a persistent prompt line at the bottom of the CLI :)

- Enhanced the AIAgent class to support interrupt requests, allowing for graceful interruption of ongoing tasks and processing of new messages.
- Updated the HermesCLI to manage user input in a persistent manner, enabling real-time interruption of the agent's conversation.
- Introduced a mechanism in the GatewayRunner to handle incoming messages while an agent is running, allowing for immediate response to user commands.
- Improved overall user experience by providing feedback during interruptions and ensuring that pending messages are processed correctly.

											
										
										
											2026-02-03 16:15:49 -08:00
+								        # Interrupt mechanism for breaking out of tool loops
 								        self._interrupt_requested = False
 								        self._interrupt_message = None  # Optional message that triggered interrupt
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								        self._client_lock = threading.RLock()
-												Implement interrupt handling for the agent and CLI input, plus a persistent prompt line at the bottom of the CLI :)

- Enhanced the AIAgent class to support interrupt requests, allowing for graceful interruption of ongoing tasks and processing of new messages.
- Updated the HermesCLI to manage user input in a persistent manner, enabling real-time interruption of the agent's conversation.
- Introduced a mechanism in the GatewayRunner to handle incoming messages while an agent is running, allowing for immediate response to user commands.
- Improved overall user experience by providing feedback during interruptions and ensuring that pending messages are processed correctly.

											
										
										
											2026-02-03 16:15:49 -08:00
-												feat: implement subagent delegation for task management

- Introduced the `delegate_task` tool, allowing the main agent to spawn child AIAgent instances with isolated context for complex tasks.
- Supported both single-task and batch processing (up to 3 concurrent tasks) to enhance task management capabilities.
- Updated configuration options for delegation, including maximum iterations and default toolsets for subagents.
- Enhanced documentation to provide clear guidance on using the delegation feature and its configuration.
- Added comprehensive tests to ensure the functionality and reliability of the delegation logic.

											
										
										
											2026-02-20 03:15:53 -08:00
+								        # Subagent delegation state
 								        self._delegate_depth = 0        # 0 = top-level agent, incremented for children
 								        self._active_children = []      # Running child AIAgents (for interrupt propagation)
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
+								        # Store OpenRouter provider preferences
 								        self.providers_allowed = providers_allowed
 								        self.providers_ignored = providers_ignored
 								        self.providers_order = providers_order
 								        self.provider_sort = provider_sort
-												feat(provider-routing): add OpenRouter provider routing configuration

Introduced a new `provider_routing` section in the CLI configuration to control how requests are routed across providers when using OpenRouter. This includes options for sorting providers by throughput, latency, or price, as well as allowing or ignoring specific providers, setting the order of provider attempts, and managing data collection policies. Updated relevant classes and documentation to support these features, enhancing flexibility in provider selection.

											
										
										
											2026-03-01 18:24:27 -08:00
+								        self.provider_require_parameters = provider_require_parameters
 								        self.provider_data_collection = provider_data_collection
-												fix leakage

											
										
										
											2025-11-03 17:42:23 -05:00
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        # Store toolset filtering options
 								        self.enabled_toolsets = enabled_toolsets
 								        self.disabled_toolsets = disabled_toolsets
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								        # Model response configuration
 								        self.max_tokens = max_tokens  # None = use model default
-												feat: default reasoning effort from xhigh to medium

Reduces token usage and latency for most tasks by defaulting to
medium reasoning effort instead of xhigh. Users can still override
via config or CLI flag. Updates code, tests, example config, and docs.

											
										
										
											2026-03-07 10:14:19 -08:00
+								        self.reasoning_config = reasoning_config  # None = use default (medium for OpenRouter)
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								        self.prefill_messages = prefill_messages or []  # Prefilled conversation turns
-												Implement Anthropic prompt caching for Claude models via OpenRouter

- Introduced a caching strategy that reduces input token costs by ~75% on multi-turn conversations by caching the conversation prefix.
- Added functions to apply cache control markers to messages, enhancing efficiency in token usage.
- Updated AIAgent to auto-enable prompt caching for Claude models, with configurable cache TTL.
- Enhanced logging to track cache hit statistics when caching is active, improving monitoring of token usage.

											
										
										
											2026-02-10 06:49:41 +00:00
+								        # Anthropic prompt caching: auto-enabled for Claude models via OpenRouter and for the native Anthropic API mode.
 								        # Reduces input costs by ~75% on multi-turn conversations by caching the
 								        # conversation prefix. Uses system_and_3 strategy (4 breakpoints).
 								        is_openrouter = "openrouter" in self.base_url.lower()
 								        is_claude = "claude" in self.model.lower()
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								        is_native_anthropic = self.api_mode == "anthropic_messages"
 								        self._use_prompt_caching = (is_openrouter and is_claude) or is_native_anthropic
-												Implement Anthropic prompt caching for Claude models via OpenRouter

- Introduced a caching strategy that reduces input token costs by ~75% on multi-turn conversations by caching the conversation prefix.
- Added functions to apply cache control markers to messages, enhancing efficiency in token usage.
- Updated AIAgent to auto-enable prompt caching for Claude models, with configurable cache TTL.
- Enhanced logging to track cache hit statistics when caching is active, improving monitoring of token usage.

											
										
										
											2026-02-10 06:49:41 +00:00
+								        self._cache_ttl = "5m"  # Default 5-minute TTL (1.25x write cost)
-												feat: iteration budget pressure via tool result injection

Two-tier warning system that nudges the LLM as it approaches
max_iterations, injected into the last tool result JSON rather
than as a separate system message:

- Caution (70%): {"_budget_warning": "[BUDGET: 42/60...]"}
- Warning (90%): {"_budget_warning": "[BUDGET WARNING: 54/60...]"}

For JSON tool results, adds a _budget_warning field to the existing
dict. For plain text results, appends the warning as text.

Key properties:
- No system messages injected mid-conversation
- No changes to message structure
- Prompt cache stays valid
- Configurable thresholds (0.7 / 0.9)
- Can be disabled: _budget_pressure_enabled = False

Inspired by PR #421 (@Bartok9) and issue #414.
8 tests covering thresholds, edge cases, JSON and text injection.

											
										
										
											2026-03-09 05:19:42 -07:00
+								        # Iteration budget pressure: warn the LLM as it approaches max_iterations.
 								        # Warnings are injected into the last tool result JSON (not as separate
 								        # messages) so they don't break message structure or invalidate caching.
 								        self._budget_caution_threshold = 0.7   # 70% — nudge to start wrapping up
 								        self._budget_warning_threshold = 0.9   # 90% — urgent, respond now
 								        self._budget_pressure_enabled = True
-												feat(logging): implement persistent error logging for tool failures

- Introduce a separate error log for capturing warnings and errors related to tool execution, ensuring detailed inspection of issues post-failure.
- Enhance error handling in the AIAgent class to log exceptions with stack traces for better debugging.
- Add a similar error logging mechanism in the gateway to streamline debugging processes.

											
										
										
											2026-02-28 22:49:58 -08:00
+								        # Persistent error log -- always writes WARNING+ to ~/.hermes/logs/errors.log
 								        # so tool failures, API errors, etc. are inspectable after the fact.
-												fix: prevent logging handler accumulation in gateway mode

Use exact Path comparison instead of endswith to detect existing
errors.log handlers, avoiding false positives from similarly-named
log files.

											
										
										
											2026-03-11 16:00:25 +08:00
+								        # In gateway mode, each incoming message creates a new AIAgent instance,
 								        # while the root logger is process-global. Re-adding the same errors.log
 								        # handler would cause each warning/error line to be written multiple times.
-												feat(logging): implement persistent error logging for tool failures

- Introduce a separate error log for capturing warnings and errors related to tool execution, ensuring detailed inspection of issues post-failure.
- Enhance error handling in the AIAgent class to log exceptions with stack traces for better debugging.
- Add a similar error logging mechanism in the gateway to streamline debugging processes.

											
										
										
											2026-02-28 22:49:58 -08:00
+								        from logging.handlers import RotatingFileHandler
-												fix: prevent logging handler accumulation in gateway mode

Use exact Path comparison instead of endswith to detect existing
errors.log handlers, avoiding false positives from similarly-named
log files.

											
										
										
											2026-03-11 16:00:25 +08:00
+								        root_logger = logging.getLogger()
 								        error_log_dir = _hermes_home / "logs"
 								        error_log_path = error_log_dir / "errors.log"
 								        resolved_error_log_path = error_log_path.resolve()
 								        has_errors_log_handler = any(
 								            isinstance(handler, RotatingFileHandler)
 								            and Path(getattr(handler, "baseFilename", "")).resolve() == resolved_error_log_path
 								            for handler in root_logger.handlers
-												feat(logging): implement persistent error logging for tool failures

- Introduce a separate error log for capturing warnings and errors related to tool execution, ensuring detailed inspection of issues post-failure.
- Enhance error handling in the AIAgent class to log exceptions with stack traces for better debugging.
- Add a similar error logging mechanism in the gateway to streamline debugging processes.

											
										
										
											2026-02-28 22:49:58 -08:00
+								        )
-												fix: prevent logging handler accumulation in gateway mode

Use exact Path comparison instead of endswith to detect existing
errors.log handlers, avoiding false positives from similarly-named
log files.

											
										
										
											2026-03-11 16:00:25 +08:00
+								        if not has_errors_log_handler:
 								            from agent.redact import RedactingFormatter
 								            error_log_dir.mkdir(parents=True, exist_ok=True)
 								            error_file_handler = RotatingFileHandler(
 								                error_log_path, maxBytes=2 * 1024 * 1024, backupCount=2,
 								            )
 								            error_file_handler.setLevel(logging.WARNING)
 								            error_file_handler.setFormatter(RedactingFormatter(
 								                '%(asctime)s %(levelname)s %(name)s: %(message)s',
 								            ))
 								            root_logger.addHandler(error_file_handler)
-												feat(logging): implement persistent error logging for tool failures

- Introduce a separate error log for capturing warnings and errors related to tool execution, ensuring detailed inspection of issues post-failure.
- Enhance error handling in the AIAgent class to log exceptions with stack traces for better debugging.
- Add a similar error logging mechanism in the gateway to streamline debugging processes.

											
										
										
											2026-02-28 22:49:58 -08:00
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								        if self.verbose_logging:
 								            logging.basicConfig(
 								                level=logging.DEBUG,
 								                format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
 								                datefmt='%H:%M:%S'
 								            )
-												refactor(cli, auth): Add Codex/OpenAI OAuth Support - finalized

- Replace `hermes login` with `hermes model` for selecting providers and managing authentication.
- Update documentation and CLI commands to reflect the new provider selection process.
- Introduce a new redaction system for logging sensitive information.
- Enhance Codex model discovery by integrating API fetching and local cache.
- Adjust max turns configuration logic for better clarity and precedence.
- Improve error handling and user feedback during authentication processes.

											
										
										
											2026-02-28 21:56:05 -08:00
+								            for handler in logging.getLogger().handlers:
 								                handler.setFormatter(RedactingFormatter(
 								                    '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
 								                    datefmt='%H:%M:%S',
 								                ))
-												Enhance logging and tool initialization for improved performance

- Updated logging configuration in `run_agent.py` to suppress debug messages from additional third-party libraries, reducing noise in logs.
- Enhanced shell scripts for terminal tasks to utilize Singularity for containerized execution, including pre-build SIF image logic and improved logging.
- Refactored tool initialization in `mixture_of_agents_tool.py`, `vision_tools.py`, and `web_tools.py` to implement lazy loading of API clients, optimizing resource usage and error handling.
- Updated ephemeral system prompts in shell scripts to provide clearer guidance on task execution and resource usage.

											
										
										
											2026-01-29 19:59:59 +00:00
+								            # Keep third-party libraries at WARNING level to reduce noise
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
+								            # We have our own retry and error logging that's more informative
 								            logging.getLogger('openai').setLevel(logging.WARNING)
 								            logging.getLogger('openai._base_client').setLevel(logging.WARNING)
-												some bugfixes

											
										
										
											2025-10-15 18:07:06 +00:00
+								            logging.getLogger('httpx').setLevel(logging.WARNING)
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
+								            logging.getLogger('httpcore').setLevel(logging.WARNING)
-												Enhance logging and tool initialization for improved performance

- Updated logging configuration in `run_agent.py` to suppress debug messages from additional third-party libraries, reducing noise in logs.
- Enhanced shell scripts for terminal tasks to utilize Singularity for containerized execution, including pre-build SIF image logic and improved logging.
- Refactored tool initialization in `mixture_of_agents_tool.py`, `vision_tools.py`, and `web_tools.py` to implement lazy loading of API clients, optimizing resource usage and error handling.
- Updated ephemeral system prompts in shell scripts to provide clearer guidance on task execution and resource usage.

											
										
										
											2026-01-29 19:59:59 +00:00
+								            logging.getLogger('asyncio').setLevel(logging.WARNING)
 								            # Suppress Modal/gRPC related debug spam
 								            logging.getLogger('hpack').setLevel(logging.WARNING)
 								            logging.getLogger('hpack.hpack').setLevel(logging.WARNING)
 								            logging.getLogger('grpc').setLevel(logging.WARNING)
 								            logging.getLogger('modal').setLevel(logging.WARNING)
 								            logging.getLogger('rex-deploy').setLevel(logging.INFO)  # Keep INFO for sandbox status
-												refactor: implement structured logging across multiple modules

- Introduced logging functionality in cli.py, run_agent.py, scheduler.py, and various tool modules to replace print statements with structured logging.
- Enhanced error handling and informational messages to improve debugging and monitoring capabilities.
- Ensured consistent logging practices across the codebase, facilitating better traceability and maintenance.

											
										
										
											2026-02-21 03:11:11 -08:00
+								            logger.info("Verbose logging enabled (third-party library logs suppressed)")
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								        else:
 								            # Set logging to INFO level for important messages only
 								            logging.basicConfig(
 								                level=logging.INFO,
 								                format='%(asctime)s - %(levelname)s - %(message)s',
 								                datefmt='%H:%M:%S'
 								            )
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
+								            # Suppress noisy library logging
 								            logging.getLogger('openai').setLevel(logging.ERROR)
 								            logging.getLogger('openai._base_client').setLevel(logging.ERROR)
 								            logging.getLogger('httpx').setLevel(logging.ERROR)
 								            logging.getLogger('httpcore').setLevel(logging.ERROR)
-												feat: enhance logging in AIAgent for quiet mode

- Added functionality to suppress logging noise from specific modules when in quiet mode, improving user experience in CLI.
- Updated terminal_tool.py to change the log level for fallback directory usage from warning to debug, providing clearer context without cluttering logs.

											
										
										
											2026-02-21 12:41:05 -08:00
+								            if self.quiet_mode:
 								                # In quiet mode (CLI default), suppress all tool/infra log
 								                # noise. The TUI has its own rich display for status; logger
 								                # INFO/WARNING messages just clutter it.
 								                for quiet_logger in [
 								                    'tools',               # all tools.* (terminal, browser, web, file, etc.)
 								                    'minisweagent',         # mini-swe-agent execution backend
 								                    'run_agent',            # agent runner internals
 								                    'trajectory_compressor',
 								                    'cron',                 # scheduler (only relevant in daemon mode)
 								                    'hermes_cli',           # CLI helpers
 								                ]:
 								                    logging.getLogger(quiet_logger).setLevel(logging.ERROR)
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								        # Internal stream callback (set during streaming TTS).
 								        # Initialized here so _vprint can reference it before run_conversation.
 								        self._stream_callback = None
-												fix: persist clean voice transcripts and /voice off state

- keep CLI voice prefixes API-local while storing the original user text
- persist explicit gateway off state and restore adapter auto-TTS suppression on restart
- add regression coverage for both behaviors

											
										
										
											2026-03-14 06:14:22 -07:00
+								        # Optional current-turn user-message override used when the API-facing
 								        # user message intentionally differs from the persisted transcript
 								        # (e.g. CLI voice mode adds a temporary prefix for the live call only).
 								        self._persist_user_message_idx = None
 								        self._persist_user_message_override = None
-												fix: convert anthropic image content blocks

											
										
										
											2026-03-14 23:21:09 -07:00
+								        # Cache anthropic image-to-text fallbacks per image payload/URL so a
 								        # single tool loop does not repeatedly re-run auxiliary vision on the
 								        # same image history.
 								        self._anthropic_image_fallback_cache: Dict[str, str] = {}
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								        # Initialize LLM client via centralized provider router.
-												refactor: route main agent client + fallback through centralized router

Phase 2 of the provider router migration — route the main agent's
client construction and fallback activation through
resolve_provider_client() instead of duplicated ad-hoc logic.

run_agent.py:
- __init__: When no explicit api_key/base_url, use
  resolve_provider_client(provider, raw_codex=True) for client
  construction. Explicit creds (from CLI/gateway runtime provider)
  still construct directly.
- _try_activate_fallback: Replace _resolve_fallback_credentials and
  its duplicated _FALLBACK_API_KEY_PROVIDERS / _FALLBACK_OAUTH_PROVIDERS
  dicts with a single resolve_provider_client() call. The router
  handles all provider types (API-key, OAuth, Codex) centrally.
- Remove _resolve_fallback_credentials method and both fallback dicts.

agent/auxiliary_client.py:
- Add raw_codex parameter to resolve_provider_client(). When True,
  returns the raw OpenAI client for Codex providers instead of wrapping
  in CodexAuxiliaryClient. The main agent needs this for direct
  responses.stream() access.

3251 passed, 2 pre-existing unrelated failures.

											
										
										
											2026-03-11 21:38:29 -07:00
+								        # The router handles auth resolution, base URL, headers, and
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								        # Codex/Anthropic wrapping for all known providers.
-												refactor: route main agent client + fallback through centralized router

Phase 2 of the provider router migration — route the main agent's
client construction and fallback activation through
resolve_provider_client() instead of duplicated ad-hoc logic.

run_agent.py:
- __init__: When no explicit api_key/base_url, use
  resolve_provider_client(provider, raw_codex=True) for client
  construction. Explicit creds (from CLI/gateway runtime provider)
  still construct directly.
- _try_activate_fallback: Replace _resolve_fallback_credentials and
  its duplicated _FALLBACK_API_KEY_PROVIDERS / _FALLBACK_OAUTH_PROVIDERS
  dicts with a single resolve_provider_client() call. The router
  handles all provider types (API-key, OAuth, Codex) centrally.
- Remove _resolve_fallback_credentials method and both fallback dicts.

agent/auxiliary_client.py:
- Add raw_codex parameter to resolve_provider_client(). When True,
  returns the raw OpenAI client for Codex providers instead of wrapping
  in CodexAuxiliaryClient. The main agent needs this for direct
  responses.stream() access.

3251 passed, 2 pre-existing unrelated failures.

											
										
										
											2026-03-11 21:38:29 -07:00
+								        # raw_codex=True because the main agent needs direct responses.stream()
 								        # access for Codex Responses API streaming.
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								        self._anthropic_client = None
 								        if self.api_mode == "anthropic_messages":
-												fix: separate Anthropic OAuth tokens from API keys

Persist OAuth/setup tokens in ANTHROPIC_TOKEN instead of ANTHROPIC_API_KEY.
Reserve ANTHROPIC_API_KEY for regular Console API keys.

Changes:
- anthropic_adapter: reorder resolve_anthropic_token() priority —
  ANTHROPIC_TOKEN first, ANTHROPIC_API_KEY as legacy fallback
- config: add save_anthropic_oauth_token() / save_anthropic_api_key() helpers
  that clear the opposing slot to prevent priority conflicts
- config: show_config() prefers ANTHROPIC_TOKEN for display
- setup: OAuth login and pasted setup-tokens write to ANTHROPIC_TOKEN
- setup: API key entry writes to ANTHROPIC_API_KEY and clears ANTHROPIC_TOKEN
- main: same fixes in _run_anthropic_oauth_flow() and _model_flow_anthropic()
- main: _has_any_provider_configured() checks ANTHROPIC_TOKEN
- doctor: use _is_oauth_token() for correct auth method validation
- runtime_provider: updated error message
- run_agent: simplified client init to use resolve_anthropic_token()
- run_agent: updated 401 troubleshooting messages
- status: prefer ANTHROPIC_TOKEN in status display
- tests: updated priority test, added persistence helper tests

Cherry-picked from PR #1141 by kshitijk4poor, rebased onto current main
with unrelated changes (web_policy config, blocklist CLI) removed.

Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>

											
										
										
											2026-03-13 02:09:52 -07:00
+								            from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token
 								            effective_key = api_key or resolve_anthropic_token() or ""
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								            self._anthropic_api_key = effective_key
-												fix: address PR review round 5 — streaming guard, VC auth, history prefix, auto-TTS control

1. Gate _streaming_api_call to chat_completions mode only — Anthropic and
   Codex fall back to _interruptible_api_call. Preserve Anthropic base_url
   across all client rebuild paths (interrupt, fallback, 401 refresh).

2. Discord VC synthetic events now use chat_type="channel" instead of
   defaulting to "dm" — prevents session bleed into DM context.
   Authorization runs before echoing transcript. Sanitize @everyone/@here
   in voice transcripts.

3. CLI voice prefix ("[Voice input...]") is now API-call-local only —
   stripped from returned history so it never persists to session DB or
   resumed sessions.

4. /voice off now disables base adapter auto-TTS via _auto_tts_disabled_chats
   set — voice input no longer triggers TTS when voice mode is off.

											
										
										
											2026-03-14 10:31:49 +03:00
+								            self._anthropic_base_url = base_url
-												fix: anthropic adapter — max_tokens ignored, fallback crash, proxy base_url filtered

- Pass self.max_tokens to build_anthropic_kwargs instead of hardcoded None
- Add anthropic case to _try_activate_fallback (was only handling openai-codex)
- Remove 'anthropic in base_url' filter that blocked custom proxy URLs

											
										
										
											2026-03-13 04:22:16 +03:00
+								            self._anthropic_client = build_anthropic_client(effective_key, base_url)
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								            # No OpenAI client needed for Anthropic mode
 								            self.client = None
 								            self._client_kwargs = {}
 								            if not self.quiet_mode:
 								                print(f"🤖 AI Agent initialized with model: {self.model} (Anthropic native)")
 								                if effective_key and len(effective_key) > 12:
 								                    print(f"🔑 Using token: {effective_key[:8]}...{effective_key[-4:]}")
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        else:
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								            if api_key and base_url:
 								                # Explicit credentials from CLI/gateway — construct directly.
 								                # The runtime provider resolver already handled auth for us.
 								                client_kwargs = {"api_key": api_key, "base_url": base_url}
 								                effective_base = base_url
 								                if "openrouter" in effective_base.lower():
 								                    client_kwargs["default_headers"] = {
-												fix: use hermes-agent.nousresearch.com as OpenRouter HTTP-Referer

* fix: stop rejecting unlisted models + auto-detect from /models endpoint

validate_requested_model() now accepts models not in the provider's API
listing with a warning instead of blocking. Removes hardcoded catalog
fallback for validation — if API is unreachable, accepts with a warning.

Model selection flows (setup + /model command) now probe the provider's
/models endpoint to get the real available models. Falls back to
hardcoded defaults with a clear warning when auto-detection fails:
'Could not auto-detect models — use Custom model if yours isn't listed.'

Z.AI setup no longer excludes GLM-5 on coding plans.

* fix: use hermes-agent.nousresearch.com as HTTP-Referer for OpenRouter

OpenRouter scrapes the favicon/logo from the HTTP-Referer URL for app
rankings. We were sending the GitHub repo URL, which gives us a generic
GitHub logo. Changed to the proper website URL so our actual branding
shows up in rankings.

Changed in run_agent.py (main agent client) and auxiliary_client.py
(vision/summarization clients).
											
										
										
											2026-03-12 16:20:22 -07:00
+								                        "HTTP-Referer": "https://hermes-agent.nousresearch.com",
-												refactor: route main agent client + fallback through centralized router

Phase 2 of the provider router migration — route the main agent's
client construction and fallback activation through
resolve_provider_client() instead of duplicated ad-hoc logic.

run_agent.py:
- __init__: When no explicit api_key/base_url, use
  resolve_provider_client(provider, raw_codex=True) for client
  construction. Explicit creds (from CLI/gateway runtime provider)
  still construct directly.
- _try_activate_fallback: Replace _resolve_fallback_credentials and
  its duplicated _FALLBACK_API_KEY_PROVIDERS / _FALLBACK_OAUTH_PROVIDERS
  dicts with a single resolve_provider_client() call. The router
  handles all provider types (API-key, OAuth, Codex) centrally.
- Remove _resolve_fallback_credentials method and both fallback dicts.

agent/auxiliary_client.py:
- Add raw_codex parameter to resolve_provider_client(). When True,
  returns the raw OpenAI client for Codex providers instead of wrapping
  in CodexAuxiliaryClient. The main agent needs this for direct
  responses.stream() access.

3251 passed, 2 pre-existing unrelated failures.

											
										
										
											2026-03-11 21:38:29 -07:00
+								                        "X-OpenRouter-Title": "Hermes Agent",
 								                        "X-OpenRouter-Categories": "productivity,cli-agent",
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								                    }
 								                elif "api.kimi.com" in effective_base.lower():
 								                    client_kwargs["default_headers"] = {
 								                        "User-Agent": "KimiCLI/1.3",
 								                    }
 								            else:
 								                # No explicit creds — use the centralized provider router
 								                from agent.auxiliary_client import resolve_provider_client
 								                _routed_client, _ = resolve_provider_client(
 								                    self.provider or "auto", model=self.model, raw_codex=True)
 								                if _routed_client is not None:
 								                    client_kwargs = {
 								                        "api_key": _routed_client.api_key,
 								                        "base_url": str(_routed_client.base_url),
 								                    }
 								                    # Preserve any default_headers the router set
 								                    if hasattr(_routed_client, '_default_headers') and _routed_client._default_headers:
 								                        client_kwargs["default_headers"] = dict(_routed_client._default_headers)
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								                else:
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								                    # Final fallback: try raw OpenRouter key
 								                    client_kwargs = {
 								                        "api_key": os.getenv("OPENROUTER_API_KEY", ""),
 								                        "base_url": OPENROUTER_BASE_URL,
 								                        "default_headers": {
-												merge: resolve conflicts with main (URL update to hermes-agent.nousresearch.com)

											
										
										
											2026-03-12 17:49:26 -07:00
+								                            "HTTP-Referer": "https://hermes-agent.nousresearch.com",
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								                            "X-OpenRouter-Title": "Hermes Agent",
 								                            "X-OpenRouter-Categories": "productivity,cli-agent",
 								                        },
 								                    }
 								            self._client_kwargs = client_kwargs  # stored for rebuilding after interrupt
 								            try:
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                self.client = self._create_openai_client(client_kwargs, reason="agent_init", shared=True)
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								                if not self.quiet_mode:
 								                    print(f"🤖 AI Agent initialized with model: {self.model}")
 								                    if base_url:
 								                        print(f"🔗 Using custom base URL: {base_url}")
 								                    # Always show API key info (masked) for debugging auth issues
 								                    key_used = client_kwargs.get("api_key", "none")
 								                    if key_used and key_used != "dummy-key" and len(key_used) > 12:
 								                        print(f"🔑 Using API key: {key_used[:8]}...{key_used[-4:]}")
 								                    else:
 								                        print(f"⚠️  Warning: API key appears invalid or missing (got: '{key_used[:20] if key_used else 'none'}...')")
 								            except Exception as e:
 								                raise RuntimeError(f"Failed to initialize OpenAI client: {e}")
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												feat: simple fallback model for provider resilience

When the primary model/provider fails after retries (rate limit, overload,
auth errors, connection failures), Hermes automatically switches to a
configured fallback model for the remainder of the session.

Config (in ~/.hermes/config.yaml):

  fallback_model:
    provider: openrouter
    model: anthropic/claude-sonnet-4

Supports all major providers: OpenRouter, OpenAI, Nous, DeepSeek, Together,
Groq, Fireworks, Mistral, Gemini — plus custom endpoints via base_url and
api_key_env overrides.

Design principles:
- Dead simple: one fallback model, not a chain
- One-shot: switches once, doesn't ping-pong back
- Zero new dependencies: uses existing OpenAI client
- Minimal code: ~100 lines in run_agent.py, ~5 lines in cli.py/gateway
- Three trigger points: max retries exhausted, non-retryable client errors,
  and invalid response exhaustion

Does NOT trigger on context overflow or payload-too-large errors (those
are handled by the existing compression system).

Addresses #737.

25 new tests, 2492 total passing.

											
										
										
											2026-03-08 20:22:33 -07:00
+								        # Provider fallback — a single backup model/provider tried when the
 								        # primary is exhausted (rate-limit, overload, connection failure).
 								        # Config shape: {"provider": "openrouter", "model": "anthropic/claude-sonnet-4"}
 								        self._fallback_model = fallback_model if isinstance(fallback_model, dict) else None
 								        self._fallback_activated = False
 								        if self._fallback_model:
 								            fb_p = self._fallback_model.get("provider", "")
 								            fb_m = self._fallback_model.get("model", "")
 								            if fb_p and fb_m and not self.quiet_mode:
 								                print(f"🔄 Fallback model: {fb_m} ({fb_p})")
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        # Get available tools with filtering
 								        self.tools = get_tool_definitions(
 								            enabled_toolsets=enabled_toolsets,
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								            disabled_toolsets=disabled_toolsets,
 								            quiet_mode=self.quiet_mode,
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        )
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								        # Show tool configuration and store valid tool names for validation
 								        self.valid_tool_names = set()
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        if self.tools:
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								            self.valid_tool_names = {tool["function"]["name"] for tool in self.tools}
 								            tool_names = sorted(self.valid_tool_names)
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								            if not self.quiet_mode:
 								                print(f"🛠️  Loaded {len(self.tools)} tools: {', '.join(tool_names)}")
 								                # Show filtering info if applied
 								                if enabled_toolsets:
 								                    print(f"   ✅ Enabled toolsets: {', '.join(enabled_toolsets)}")
 								                if disabled_toolsets:
 								                    print(f"   ❌ Disabled toolsets: {', '.join(disabled_toolsets)}")
 								        elif not self.quiet_mode:
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								            print("🛠️  No tools loaded (all tools filtered out or unavailable)")
 								        # Check tool requirements
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								        if self.tools and not self.quiet_mode:
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								            requirements = check_toolset_requirements()
 								            missing_reqs = [name for name, available in requirements.items() if not available]
 								            if missing_reqs:
 								                print(f"⚠️  Some tools may not work due to missing requirements: {missing_reqs}")
 								        # Show trajectory saving status
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								        if self.save_trajectories and not self.quiet_mode:
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								            print("📝 Trajectory saving enabled")
-												Add ephemeral system prompt support in batch and agent runners. Update README with usage examples and documentation for the new feature. Ensure prompt is not saved to trajectories.

											
										
										
											2025-10-08 02:33:58 +00:00
 								        # Show ephemeral system prompt status
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								        if self.ephemeral_system_prompt and not self.quiet_mode:
-												Add ephemeral system prompt support in batch and agent runners. Update README with usage examples and documentation for the new feature. Ensure prompt is not saved to trajectories.

											
										
										
											2025-10-08 02:33:58 +00:00
+								            prompt_preview = self.ephemeral_system_prompt[:60] + "..." if len(self.ephemeral_system_prompt) > 60 else self.ephemeral_system_prompt
 								            print(f"🔒 Ephemeral system prompt: '{prompt_preview}' (not saved to trajectories)")
-												Enhance session logging and interactive sudo support

- Implemented automatic session logging, saving conversation trajectories to the `logs/` directory in JSON format, with each session having a unique identifier.
- Updated the CLI to display the session ID in the welcome banner for easy reference.
- Introduced an interactive sudo password prompt in CLI mode, allowing users to enter their password with a 45-second timeout, enhancing user experience during command execution.
- Documented session logging and interactive sudo features in `README.md`, `cli.md`, and `cli-config.yaml.example` for better user guidance.

											
										
										
											2026-02-01 15:36:26 -08:00
-												Implement Anthropic prompt caching for Claude models via OpenRouter

- Introduced a caching strategy that reduces input token costs by ~75% on multi-turn conversations by caching the conversation prefix.
- Added functions to apply cache control markers to messages, enhancing efficiency in token usage.
- Updated AIAgent to auto-enable prompt caching for Claude models, with configurable cache TTL.
- Enhanced logging to track cache hit statistics when caching is active, improving monitoring of token usage.

											
										
										
											2026-02-10 06:49:41 +00:00
+								        # Show prompt caching status
 								        if self._use_prompt_caching and not self.quiet_mode:
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								            source = "native Anthropic" if is_native_anthropic else "Claude via OpenRouter"
 								            print(f"💾 Prompt caching: ENABLED ({source}, {self._cache_ttl} TTL)")
-												Implement Anthropic prompt caching for Claude models via OpenRouter

- Introduced a caching strategy that reduces input token costs by ~75% on multi-turn conversations by caching the conversation prefix.
- Added functions to apply cache control markers to messages, enhancing efficiency in token usage.
- Updated AIAgent to auto-enable prompt caching for Claude models, with configurable cache TTL.
- Enhanced logging to track cache hit statistics when caching is active, improving monitoring of token usage.

											
										
										
											2026-02-10 06:49:41 +00:00
-												Enhance session logging and interactive sudo support

- Implemented automatic session logging, saving conversation trajectories to the `logs/` directory in JSON format, with each session having a unique identifier.
- Updated the CLI to display the session ID in the welcome banner for easy reference.
- Introduced an interactive sudo password prompt in CLI mode, allowing users to enter their password with a 45-second timeout, enhancing user experience during command execution.
- Documented session logging and interactive sudo features in `README.md`, `cli.md`, and `cli-config.yaml.example` for better user guidance.

											
										
										
											2026-02-01 15:36:26 -08:00
+								        # Session logging setup - auto-save conversation trajectories for debugging
 								        self.session_start = datetime.now()
 								        if session_id:
 								            # Use provided session ID (e.g., from CLI)
 								            self.session_id = session_id
 								        else:
 								            # Generate a new session ID
 								            timestamp_str = self.session_start.strftime("%Y%m%d_%H%M%S")
 								            short_uuid = uuid.uuid4().hex[:6]
 								            self.session_id = f"{timestamp_str}_{short_uuid}"
-												fix: update session logging directory path in README and code

- Changed the session logging directory from `~/.hermes-agent/logs/` to `~/.hermes/sessions/` for consistency.
- Updated `run_agent.py` to use the new logging path, ensuring session logs are stored correctly alongside gateway sessions.

											
										
										
											2026-02-21 01:20:18 -08:00
+								        # Session logs go into ~/.hermes/sessions/ alongside gateway sessions
 								        hermes_home = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes"))
 								        self.logs_dir = hermes_home / "sessions"
 								        self.logs_dir.mkdir(parents=True, exist_ok=True)
-												Enhance session logging and interactive sudo support

- Implemented automatic session logging, saving conversation trajectories to the `logs/` directory in JSON format, with each session having a unique identifier.
- Updated the CLI to display the session ID in the welcome banner for easy reference.
- Introduced an interactive sudo password prompt in CLI mode, allowing users to enter their password with a 45-second timeout, enhancing user experience during command execution.
- Documented session logging and interactive sudo features in `README.md`, `cli.md`, and `cli-config.yaml.example` for better user guidance.

											
										
										
											2026-02-01 15:36:26 -08:00
+								        self.session_log_file = self.logs_dir / f"session_{self.session_id}.json"
 								        # Track conversation messages for session logging
 								        self._session_messages: List[Dict[str, Any]] = []
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
+								        # Cached system prompt -- built once per session, only rebuilt on compression
 								        self._cached_system_prompt: Optional[str] = None
-												feat: filesystem checkpoints and /rollback command

Automatic filesystem snapshots before destructive file operations,
with user-facing rollback.  Inspired by PR #559 (by @alireza78a).

Architecture:
- Shadow git repos at ~/.hermes/checkpoints/{hash}/ via GIT_DIR
- CheckpointManager: take/list/restore, turn-scoped dedup, pruning
- Transparent — the LLM never sees it, no tool schema, no tokens
- Once per turn — only first write_file/patch triggers a snapshot

Integration:
- Config: checkpoints.enabled + checkpoints.max_snapshots
- CLI flag: hermes --checkpoints
- Trigger: run_agent.py _execute_tool_calls() before write_file/patch
- /rollback slash command in CLI + gateway (list, restore by number)
- Pre-rollback snapshot auto-created on restore (undo the undo)

Safety:
- Never blocks file operations — all errors silently logged
- Skips root dir, home dir, dirs >50K files
- Disables gracefully when git not installed
- Shadow repo completely isolated from project git

Tests: 35 new tests, all passing (2798 total suite)
Docs: feature page, config reference, CLI commands reference

											
										
										
											2026-03-10 00:49:15 -07:00
+								        # Filesystem checkpoint manager (transparent — not a tool)
 								        from tools.checkpoint_manager import CheckpointManager
 								        self._checkpoint_mgr = CheckpointManager(
 								            enabled=checkpoints_enabled,
 								            max_snapshots=checkpoint_max_snapshots,
 								        )
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
+								        # SQLite session store (optional -- provided by CLI or gateway)
 								        self._session_db = session_db
-												fix: eliminate 3x SQLite message duplication in gateway sessions (#860)

Three separate code paths all wrote to the same SQLite state.db with
no deduplication, inflating session transcripts by 3-4x:

1. _log_msg_to_db() — wrote each message individually after append
2. _flush_messages_to_session_db() — re-wrote ALL new messages at
   every _persist_session() call (~18 exit points), with no tracking
   of what was already written
3. gateway append_to_transcript() — wrote everything a third time
   after the agent returned

Since load_transcript() prefers SQLite over JSONL, the inflated data
was loaded on every session resume, causing proportional token waste.

Fix:
- Remove _log_msg_to_db() and all 16 call sites (redundant with flush)
- Add _last_flushed_db_idx tracking in _flush_messages_to_session_db()
  so repeated _persist_session() calls only write truly new messages
- Reset flush cursor on compression (new session ID)
- Add skip_db parameter to SessionStore.append_to_transcript() so the
  gateway skips SQLite writes when the agent already persisted them
- Gateway now passes skip_db=True for agent-managed messages, still
  writes to JSONL as backup

Verified: a 12-message CLI session with tool calls produces exactly
12 SQLite rows with zero duplicates (previously would be 36-48).

Tests: 9 new tests covering flush deduplication, skip_db behavior,
compression reset, and initialization. Full suite passes (2869 tests).

											
										
										
											2026-03-10 15:22:44 -07:00
+								        self._last_flushed_db_idx = 0  # tracks DB-write cursor to prevent duplicate writes
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
+								        if self._session_db:
 								            try:
 								                self._session_db.create_session(
 								                    session_id=self.session_id,
 								                    source=self.platform or "cli",
 								                    model=self.model,
 								                    model_config={
 								                        "max_iterations": self.max_iterations,
 								                        "reasoning_config": reasoning_config,
 								                        "max_tokens": max_tokens,
 								                    },
 								                    user_id=None,
 								                )
-												refactor: enhance error handling with structured logging across multiple modules

- Updated various modules including cli.py, run_agent.py, gateway, and tools to replace silent exception handling with structured logging.
- Improved error messages to provide more context, aiding in debugging and monitoring.
- Ensured consistent logging practices throughout the codebase, enhancing traceability and maintainability.

											
										
										
											2026-02-21 03:32:11 -08:00
+								            except Exception as e:
 								                logger.debug("Session DB create_session failed: %s", e)
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
-												Add todo tool for agent task planning and management

Single `todo` tool that reads (no params) or writes (provide todos array
with merge flag). In-memory TodoStore on AIAgent, no system prompt
mutation, behavioral guidance in tool description only. State re-injected
after context compression events. Gateway sessions hydrate from
conversation history. Added to all platform toolsets.

Also wired into RL agent_loop.py with per-run TodoStore and fixed
browser_snapshot user_task passthrough from first user message.

											
										
										
											2026-02-17 17:02:33 -08:00
+								        # In-memory todo list for task planning (one per agent/session)
 								        from tools.todo_tool import TodoStore
 								        self._todo_store = TodoStore()
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
+								        # Persistent memory (MEMORY.md + USER.md) -- loaded from disk
 								        self._memory_store = None
 								        self._memory_enabled = False
 								        self._user_profile_enabled = False
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								        self._memory_nudge_interval = 10
 								        self._memory_flush_min_turns = 6
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
+								        if not skip_memory:
 								            try:
 								                from hermes_cli.config import load_config as _load_mem_config
 								                mem_config = _load_mem_config().get("memory", {})
 								                self._memory_enabled = mem_config.get("memory_enabled", False)
 								                self._user_profile_enabled = mem_config.get("user_profile_enabled", False)
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								                self._memory_nudge_interval = int(mem_config.get("nudge_interval", 10))
 								                self._memory_flush_min_turns = int(mem_config.get("flush_min_turns", 6))
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
+								                if self._memory_enabled or self._user_profile_enabled:
 								                    from tools.memory_tool import MemoryStore
 								                    self._memory_store = MemoryStore(
 								                        memory_char_limit=mem_config.get("memory_char_limit", 2200),
 								                        user_char_limit=mem_config.get("user_char_limit", 1375),
 								                    )
 								                    self._memory_store.load_from_disk()
 								            except Exception:
 								                pass  # Memory is optional -- don't break agent init
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								        # Honcho AI-native memory (cross-session user modeling)
 								        # Reads ~/.honcho/config.json as the single source of truth.
 								        self._honcho = None  # HonchoSessionManager | None
 								        self._honcho_session_key = honcho_session_key
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								        self._honcho_config = None  # HonchoClientConfig | None
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								        self._honcho_exit_hook_registered = False
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								        if not skip_memory:
 								            try:
-												fix(honcho): enforce local mode and cache-safe warmup

											
										
										
											2026-03-10 02:05:28 -07:00
+								                if honcho_manager is not None:
 								                    hcfg = honcho_config or getattr(honcho_manager, "_config", None)
 								                    self._honcho_config = hcfg
 								                    if hcfg and self._honcho_should_activate(hcfg):
 								                        self._honcho = honcho_manager
 								                        self._activate_honcho(
 								                            hcfg,
 								                            enabled_toolsets=enabled_toolsets,
 								                            disabled_toolsets=disabled_toolsets,
 								                            session_db=session_db,
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								                        )
 								                else:
-												fix(honcho): enforce local mode and cache-safe warmup

											
										
										
											2026-03-10 02:05:28 -07:00
+								                    from honcho_integration.client import HonchoClientConfig, get_honcho_client
 								                    hcfg = HonchoClientConfig.from_global_config()
 								                    self._honcho_config = hcfg
 								                    if self._honcho_should_activate(hcfg):
 								                        from honcho_integration.session import HonchoSessionManager
 								                        client = get_honcho_client(hcfg)
 								                        self._honcho = HonchoSessionManager(
 								                            honcho=client,
 								                            config=hcfg,
 								                            context_tokens=hcfg.context_tokens,
 								                        )
 								                        self._activate_honcho(
 								                            hcfg,
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								                            enabled_toolsets=enabled_toolsets,
 								                            disabled_toolsets=disabled_toolsets,
-												fix(honcho): enforce local mode and cache-safe warmup

											
										
										
											2026-03-10 02:05:28 -07:00
+								                            session_db=session_db,
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								                        )
 								                    else:
-												fix(honcho): enforce local mode and cache-safe warmup

											
										
										
											2026-03-10 02:05:28 -07:00
+								                        if not hcfg.enabled:
 								                            logger.debug("Honcho disabled in global config")
 								                        elif not hcfg.api_key:
 								                            logger.debug("Honcho enabled but no API key configured")
 								                        else:
-												refactor(honcho): remove local memory mode

The "local" memoryMode was redundant with enabled: false. Simplifies
the mode system to hybrid and honcho only.

											
										
										
											2026-03-12 16:23:34 -04:00
+								                            logger.debug("Honcho enabled but missing API key or disabled in config")
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								            except Exception as e:
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								                logger.warning("Honcho init failed — memory disabled: %s", e)
 								                print(f"  Honcho init failed: {e}")
 								                print("  Run 'hermes honcho setup' to reconfigure.")
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								                self._honcho = None
-												fix(honcho): harden tool gating and migration peer routing

Prevent stale Honcho tool exposure in context/local modes, restore reliable async write retry behavior, and ensure SOUL.md migration uploads target the AI peer instead of the user peer. Also align Honcho CLI key checks with host-scoped apiKey resolution and lock the fixes with regression tests.

Made-with: Cursor

											
										
										
											2026-03-11 18:21:27 -04:00
+								        # Tools are initially discovered before Honcho activation. If Honcho
 								        # stays inactive, remove any stale honcho_* tools from prior process state.
 								        if not self._honcho:
 								            self._strip_honcho_tools_from_surface()
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								        # Gate local memory writes based on per-peer memory modes.
 								        # AI peer governs MEMORY.md; user peer governs USER.md.
-												refactor(honcho): remove local memory mode

The "local" memoryMode was redundant with enabled: false. Simplifies
the mode system to hybrid and honcho only.

											
										
										
											2026-03-12 16:23:34 -04:00
+								        # "honcho" = Honcho only, disable local writes.
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								        if self._honcho_config and self._honcho:
 								            _hcfg = self._honcho_config
 								            _agent_mode = _hcfg.peer_memory_mode(_hcfg.ai_peer)
 								            _user_mode = _hcfg.peer_memory_mode(_hcfg.peer_name or "user")
 								            if _agent_mode == "honcho":
 								                self._memory_flush_min_turns = 0
 								                self._memory_enabled = False
 								                logger.debug("peer %s memory_mode=honcho: local MEMORY.md writes disabled", _hcfg.ai_peer)
 								            if _user_mode == "honcho":
 								                self._user_profile_enabled = False
 								                logger.debug("peer %s memory_mode=honcho: local USER.md writes disabled", _hcfg.peer_name or "user")
-												feat: introduce skills management features in AIAgent and CLI

- Added skills configuration options in cli-config.yaml.example, including a nudge interval for skill creation reminders.
- Implemented skills guidance in AIAgent to prompt users to save reusable workflows after complex tasks.
- Enhanced skills indexing in the prompt builder to include descriptions from SKILL.md files for better context.
- Updated the agent's behavior to periodically remind users about potential skills during tool-calling iterations.

											
										
										
											2026-02-22 13:28:13 -08:00
+								        # Skills config: nudge interval for skill creation reminders
 								        self._skill_nudge_interval = 15
 								        try:
 								            from hermes_cli.config import load_config as _load_skills_config
 								            skills_config = _load_skills_config().get("skills", {})
 								            self._skill_nudge_interval = int(skills_config.get("creation_nudge_interval", 15))
 								        except Exception:
 								            pass
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced the `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
+								        # Initialize context compressor for automatic context management
 								        # Compresses conversation when approaching model's context limit
-												refactor: update context compression configuration to use config.yaml and improve model handling

											
										
										
											2026-02-28 04:46:35 -08:00
+								        # Configuration via config.yaml (compression section) or environment variables
-												chore: lower default compression threshold from 85% to 50% (#1096)

* fix: ClawHub skill install — use /download ZIP endpoint

The ClawHub API v1 version endpoint only returns file metadata
(path, size, sha256, contentType) without inline content or download
URLs. Our code was looking for inline content in the metadata, which
never existed, causing all ClawHub installs to fail with:
'no inline/raw file content was available'

Fix: Use the /api/v1/download endpoint (same as the official clawhub
CLI) to download skills as ZIP bundles and extract files in-memory.

Changes:
- Add _download_zip() method that downloads and extracts ZIP bundles
- Retry on 429 rate limiting with Retry-After header support
- Path sanitization and binary file filtering for security
- Keep _extract_files() as a fallback for inline/raw content
- Also fix nested file lookup (version_data.version.files)

* chore: lower default compression threshold from 85% to 50%

Triggers context compression earlier — at 50% of the model's context
window instead of 85%. Updated in all five places where the default
is defined: context_compressor.py, cli.py, run_agent.py, config.py,
and gateway/run.py.
											
										
										
											2026-03-12 15:51:50 -07:00
+								        compression_threshold = float(os.getenv("CONTEXT_COMPRESSION_THRESHOLD", "0.50"))
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced the `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
+								        compression_enabled = os.getenv("CONTEXT_COMPRESSION_ENABLED", "true").lower() in ("true", "1", "yes")
-												refactor: update context compression configuration to use config.yaml and improve model handling

											
										
										
											2026-02-28 04:46:35 -08:00
+								        compression_summary_model = os.getenv("CONTEXT_COMPRESSION_MODEL") or None
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced the `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
 								        self.context_compressor = ContextCompressor(
 								            model=self.model,
 								            threshold_percent=compression_threshold,
-												refactor: remove unused compression model variable in AIAgent

- Eliminated the `compression_model` variable from the AIAgent class, as it was not being utilized.
- Cleaned up the context compressor initialization for improved clarity and maintainability.

											
										
										
											2026-02-22 02:17:33 -08:00
+								            protect_first_n=3,
 								            protect_last_n=4,
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced the `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
+								            summary_target_tokens=500,
-												refactor: update context compression configuration to use config.yaml and improve model handling

											
										
										
											2026-02-28 04:46:35 -08:00
+								            summary_model_override=compression_summary_model,
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced the `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
+								            quiet_mode=self.quiet_mode,
-												feat: smart context length probing with persistent caching + banner display

Replaces the unsafe 128K fallback for unknown models with a descending
probe strategy (2M → 1M → 512K → 200K → 128K → 64K → 32K). When a
context-length error occurs, the agent steps down tiers and retries.
The discovered limit is cached per model+provider combo in
~/.hermes/context_length_cache.yaml so subsequent sessions skip probing.

Also parses API error messages to extract the actual context limit
(e.g. 'maximum context length is 32768 tokens') for instant resolution.

The CLI banner now displays the context window size next to the model
name (e.g. 'claude-opus-4 · 200K context · Nous Research').

Changes:
- agent/model_metadata.py: CONTEXT_PROBE_TIERS, persistent cache
  (save/load/get), parse_context_limit_from_error(), get_next_probe_tier()
- agent/context_compressor.py: accepts base_url, passes to metadata
- run_agent.py: step-down logic in context error handler, caches on success
- cli.py + hermes_cli/banner.py: context length in welcome banner
- tests: 22 new tests for probing, parsing, and caching

Addresses #132. PR #319's approach (8K default) rejected — too conservative.

											
										
										
											2026-03-05 16:09:57 -08:00
+								            base_url=self.base_url,
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced the `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
+								        )
 								        self.compression_enabled = compression_enabled
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								        self._user_turn_count = 0
-												feat(cli): add /usage command to display session token usage

Introduced a new command "/usage" in the CLI to show cumulative token usage for the current session. This includes details on prompt tokens, completion tokens, total tokens, API calls, and context state. Updated command documentation to reflect this addition. Enhanced the AIAgent class to track token usage throughout the session.

											
										
										
											2026-03-01 00:23:19 -08:00
 								        # Cumulative token usage for the session
 								        self.session_prompt_tokens = 0
 								        self.session_completion_tokens = 0
 								        self.session_total_tokens = 0
 								        self.session_api_calls = 0
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced the `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
 								        if not self.quiet_mode:
 								            if compression_enabled:
 								                print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (compress at {int(compression_threshold*100)}% = {self.context_compressor.threshold_tokens:,})")
 								            else:
 								                print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)")
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												fix: address voice mode PR review (streaming TTS, prompt cache, _vprint)

Bug A: Replace stale _HAS_ELEVENLABS/_HAS_AUDIO boolean imports with
lazy import function calls (_import_elevenlabs, _import_sounddevice).
The old constants no longer exist in tts_tool -- the try/except
silently swallowed the ImportError, leaving streaming TTS dead.

Bug B: Use user message prefix instead of modifying system prompt for
voice mode instruction. Changing ephemeral_system_prompt mid-session
invalidates the prompt cache. Now the concise-response hint is
prepended to the user_message passed to run_conversation while
conversation_history keeps the original text.

Minor: Add force parameter to _vprint so critical error messages
(max retries, non-retryable errors, API failures) are always shown
even during streaming TTS playback.

Tests: 15 new tests in test_voice_cli_integration.py covering all
three fixes -- lazy import activation, message prefix behavior,
history cleanliness, system prompt stability, and AST verification
that all critical _vprint calls use force=True.

											
										
										
											2026-03-10 03:43:03 +03:00
+								    def _vprint(self, *args, force: bool = False, **kwargs):
 								        """Verbose print — suppressed when streaming TTS is active.
 								        Pass ``force=True`` for error/warning messages that should always be
 								        shown even during streaming TTS playback.
 								        """
 								        if not force and getattr(self, "_stream_callback", None) is not None:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								            return
 								        print(*args, **kwargs)
-												feat(api): implement dynamic max tokens handling for various providers

- Added _max_tokens_param method in AIAgent to return appropriate max tokens parameter based on the provider (OpenAI vs. others).
- Updated API calls in AIAgent to utilize the new max tokens handling.
- Introduced auxiliary_max_tokens_param function in auxiliary_client for consistent max tokens management across auxiliary clients.
- Refactored multiple tools to use auxiliary_max_tokens_param for improved compatibility with different models and providers.

											
										
										
											2026-02-26 20:23:56 -08:00
+								    def _max_tokens_param(self, value: int) -> dict:
 								        """Return the correct max tokens kwarg for the current provider.
 								        OpenAI's newer models (gpt-4o, o-series, gpt-5+) require
 								        'max_completion_tokens'. OpenRouter, local models, and older
 								        OpenAI models use 'max_tokens'.
 								        """
 								        _is_direct_openai = (
 								            "api.openai.com" in self.base_url.lower()
 								            and "openrouter" not in self.base_url.lower()
 								        )
 								        if _is_direct_openai:
 								            return {"max_completion_tokens": value}
 								        return {"max_tokens": value}
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								    def _has_content_after_think_block(self, content: str) -> bool:
 								        """
 								        Check if content has actual text after any <think></think> blocks.
 								        This detects cases where the model only outputs reasoning but no actual
 								        response, which indicates an incomplete generation that should be retried.
 								        Args:
 								            content: The assistant message content to check
 								        Returns:
 								            True if there's meaningful content after think blocks, False otherwise
 								        """
 								        if not content:
 								            return False
 								        # Remove all <think>...</think> blocks (including nested ones, non-greedy)
 								        cleaned = re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL)
 								        # Check if there's any non-whitespace content remaining
 								        return bool(cleaned.strip())
-												feat: enhance AIAgent's tool usage nudges and content handling

- Introduced a method to strip <think> blocks from content, improving text visibility.
- Implemented counters to reset nudge intervals when memory and skill tools are used, enhancing user guidance.
- Captured content from turns with tool calls to provide fallback responses, ensuring continuity in conversation.
- Updated nudge logic to remind users about saving memories and creating skills based on interaction patterns.

											
										
										
											2026-02-22 21:33:28 -08:00
+								    def _strip_think_blocks(self, content: str) -> str:
 								        """Remove <think>...</think> blocks from content, returning only visible text."""
 								        if not content:
 								            return ""
 								        return re.sub(r'<think>.*?</think>', '', content, flags=re.DOTALL)
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
 								    def _looks_like_codex_intermediate_ack(
 								        self,
 								        user_message: str,
 								        assistant_content: str,
 								        messages: List[Dict[str, Any]],
 								    ) -> bool:
 								        """Detect a planning/ack message that should continue instead of ending the turn."""
 								        if any(isinstance(msg, dict) and msg.get("role") == "tool" for msg in messages):
 								            return False
 								        assistant_text = self._strip_think_blocks(assistant_content or "").strip().lower()
 								        if not assistant_text:
 								            return False
 								        if len(assistant_text) > 1200:
 								            return False
 								        has_future_ack = bool(
 								            re.search(r"\b(i['’]ll|i will|let me|i can do that|i can help with that)\b", assistant_text)
 								        )
 								        if not has_future_ack:
 								            return False
 								        action_markers = (
 								            "look into",
-												Harden Codex stream handling and ack continuation

											
										
										
											2026-02-25 18:56:06 -08:00
+								            "look at",
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								            "inspect",
 								            "scan",
 								            "check",
 								            "analyz",
 								            "review",
 								            "explore",
 								            "read",
 								            "open",
 								            "run",
 								            "test",
 								            "fix",
 								            "debug",
 								            "search",
 								            "find",
 								            "walkthrough",
 								            "report back",
 								            "summarize",
 								        )
-												Harden Codex stream handling and ack continuation

											
										
										
											2026-02-25 18:56:06 -08:00
+								        workspace_markers = (
 								            "directory",
 								            "current directory",
 								            "current dir",
 								            "cwd",
 								            "repo",
 								            "repository",
 								            "codebase",
 								            "project",
 								            "folder",
 								            "filesystem",
 								            "file tree",
 								            "files",
 								            "path",
 								        )
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
 								        user_text = (user_message or "").strip().lower()
-												Harden Codex stream handling and ack continuation

											
										
										
											2026-02-25 18:56:06 -08:00
+								        user_targets_workspace = (
 								            any(marker in user_text for marker in workspace_markers)
 								            or "~/" in user_text
 								            or "/" in user_text
 								        )
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								        assistant_mentions_action = any(marker in assistant_text for marker in action_markers)
-												Harden Codex stream handling and ack continuation

											
										
										
											2026-02-25 18:56:06 -08:00
+								        assistant_targets_workspace = any(
 								            marker in assistant_text for marker in workspace_markers
 								        )
 								        return (user_targets_workspace or assistant_targets_workspace) and assistant_mentions_action
-												feat: enhance AIAgent's tool usage nudges and content handling

- Introduced a method to strip <think> blocks from content, improving text visibility.
- Implemented counters to reset nudge intervals when memory and skill tools are used, enhancing user guidance.
- Captured content from turns with tool calls to provide fallback responses, ensuring continuity in conversation.
- Updated nudge logic to remind users about saving memories and creating skills based on interaction patterns.

											
										
										
											2026-02-22 21:33:28 -08:00
-												Implement reasoning extraction and enhance assistant message handling

- Added a new method `_extract_reasoning` to extract reasoning content from assistant messages, accommodating multiple formats from various providers.
- Updated message handling to ensure all assistant messages include reasoning content for API compatibility, preserving multi-turn reasoning context.
- Enhanced logging to capture reasoning details for debugging and analysis.
- Modified the TODO.md to reflect changes in planning and task management, emphasizing the need for structured task decomposition and progress tracking.

											
										
										
											2026-02-01 22:48:18 -08:00
+								    def _extract_reasoning(self, assistant_message) -> Optional[str]:
 								        """
 								        Extract reasoning/thinking content from an assistant message.
 								        OpenRouter and various providers can return reasoning in multiple formats:
 . message.reasoning - Direct reasoning field (DeepSeek, Qwen, etc.)
 . message.reasoning_content - Alternative field (Moonshot AI, Novita, etc.)
 . message.reasoning_details - Array of {type, summary, ...} objects (OpenRouter unified)
 								        Args:
 								            assistant_message: The assistant message object from the API response
 								        Returns:
 								            Combined reasoning text, or None if no reasoning found
 								        """
 								        reasoning_parts = []
 								        # Check direct reasoning field
 								        if hasattr(assistant_message, 'reasoning') and assistant_message.reasoning:
 								            reasoning_parts.append(assistant_message.reasoning)
 								        # Check reasoning_content field (alternative name used by some providers)
 								        if hasattr(assistant_message, 'reasoning_content') and assistant_message.reasoning_content:
 								            # Don't duplicate if same as reasoning
 								            if assistant_message.reasoning_content not in reasoning_parts:
 								                reasoning_parts.append(assistant_message.reasoning_content)
 								        # Check reasoning_details array (OpenRouter unified format)
 								        # Format: [{"type": "reasoning.summary", "summary": "...", ...}, ...]
 								        if hasattr(assistant_message, 'reasoning_details') and assistant_message.reasoning_details:
 								            for detail in assistant_message.reasoning_details:
 								                if isinstance(detail, dict):
 								                    # Extract summary from reasoning detail object
 								                    summary = detail.get('summary') or detail.get('content') or detail.get('text')
 								                    if summary and summary not in reasoning_parts:
 								                        reasoning_parts.append(summary)
 								        # Combine all reasoning parts
 								        if reasoning_parts:
 								            return "\n\n".join(reasoning_parts)
 								        return None
-												Cleanup time!

											
										
										
											2026-02-20 23:23:32 -08:00
+								    def _cleanup_task_resources(self, task_id: str) -> None:
 								        """Clean up VM and browser resources for a given task."""
 								        try:
 								            cleanup_vm(task_id)
 								        except Exception as e:
 								            if self.verbose_logging:
 								                logging.warning(f"Failed to cleanup VM for task {task_id}: {e}")
 								        try:
 								            cleanup_browser(task_id)
 								        except Exception as e:
 								            if self.verbose_logging:
 								                logging.warning(f"Failed to cleanup browser for task {task_id}: {e}")
-												fix: persist clean voice transcripts and /voice off state

- keep CLI voice prefixes API-local while storing the original user text
- persist explicit gateway off state and restore adapter auto-TTS suppression on restart
- add regression coverage for both behaviors

											
										
										
											2026-03-14 06:14:22 -07:00
+								    def _apply_persist_user_message_override(self, messages: List[Dict]) -> None:
 								        """Rewrite the current-turn user message before persistence/return.
 								        Some call paths need an API-only user-message variant without letting
 								        that synthetic text leak into persisted transcripts or resumed session
 								        history. When an override is configured for the active turn, mutate the
 								        in-memory messages list in place so both persistence and returned
 								        history stay clean.
 								        """
 								        idx = getattr(self, "_persist_user_message_idx", None)
 								        override = getattr(self, "_persist_user_message_override", None)
 								        if override is None or idx is None:
 								            return
 								        if 0 <= idx < len(messages):
 								            msg = messages[idx]
 								            if isinstance(msg, dict) and msg.get("role") == "user":
 								                msg["content"] = override
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								    def _persist_session(self, messages: List[Dict], conversation_history: List[Dict] = None):
 								        """Save session state to both JSON log and SQLite on any exit path.
 								        Ensures conversations are never lost, even on errors or early returns.
 								        """
-												fix: persist clean voice transcripts and /voice off state

- keep CLI voice prefixes API-local while storing the original user text
- persist explicit gateway off state and restore adapter auto-TTS suppression on restart
- add regression coverage for both behaviors

											
										
										
											2026-03-14 06:14:22 -07:00
+								        self._apply_persist_user_message_override(messages)
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								        self._session_messages = messages
 								        self._save_session_log(messages)
 								        self._flush_messages_to_session_db(messages, conversation_history)
-												feat: update database schema and enhance message persistence

- Incremented schema version to 2 and added a new column `finish_reason` to the `messages` table.
- Implemented a method to flush un-logged messages to the session database, ensuring data integrity during conversation interruptions.
- Enhanced error handling to persist messages in various early-return scenarios, preventing data loss.

											
										
										
											2026-02-21 00:05:39 -08:00
+								    def _flush_messages_to_session_db(self, messages: List[Dict], conversation_history: List[Dict] = None):
-												fix: eliminate 3x SQLite message duplication in gateway sessions (#860)

Three separate code paths all wrote to the same SQLite state.db with
no deduplication, inflating session transcripts by 3-4x:

1. _log_msg_to_db() — wrote each message individually after append
2. _flush_messages_to_session_db() — re-wrote ALL new messages at
   every _persist_session() call (~18 exit points), with no tracking
   of what was already written
3. gateway append_to_transcript() — wrote everything a third time
   after the agent returned

Since load_transcript() prefers SQLite over JSONL, the inflated data
was loaded on every session resume, causing proportional token waste.

Fix:
- Remove _log_msg_to_db() and all 16 call sites (redundant with flush)
- Add _last_flushed_db_idx tracking in _flush_messages_to_session_db()
  so repeated _persist_session() calls only write truly new messages
- Reset flush cursor on compression (new session ID)
- Add skip_db parameter to SessionStore.append_to_transcript() so the
  gateway skips SQLite writes when the agent already persisted them
- Gateway now passes skip_db=True for agent-managed messages, still
  writes to JSONL as backup

Verified: a 12-message CLI session with tool calls produces exactly
12 SQLite rows with zero duplicates (previously would be 36-48).

Tests: 9 new tests covering flush deduplication, skip_db behavior,
compression reset, and initialization. Full suite passes (2869 tests).

											
										
										
											2026-03-10 15:22:44 -07:00
+								        """Persist any un-flushed messages to the SQLite session store.
-												feat: update database schema and enhance message persistence

- Incremented schema version to 2 and added a new column `finish_reason` to the `messages` table.
- Implemented a method to flush un-logged messages to the session database, ensuring data integrity during conversation interruptions.
- Enhanced error handling to persist messages in various early-return scenarios, preventing data loss.

											
										
										
											2026-02-21 00:05:39 -08:00
-												fix: eliminate 3x SQLite message duplication in gateway sessions (#860)

Three separate code paths all wrote to the same SQLite state.db with
no deduplication, inflating session transcripts by 3-4x:

1. _log_msg_to_db() — wrote each message individually after append
2. _flush_messages_to_session_db() — re-wrote ALL new messages at
   every _persist_session() call (~18 exit points), with no tracking
   of what was already written
3. gateway append_to_transcript() — wrote everything a third time
   after the agent returned

Since load_transcript() prefers SQLite over JSONL, the inflated data
was loaded on every session resume, causing proportional token waste.

Fix:
- Remove _log_msg_to_db() and all 16 call sites (redundant with flush)
- Add _last_flushed_db_idx tracking in _flush_messages_to_session_db()
  so repeated _persist_session() calls only write truly new messages
- Reset flush cursor on compression (new session ID)
- Add skip_db parameter to SessionStore.append_to_transcript() so the
  gateway skips SQLite writes when the agent already persisted them
- Gateway now passes skip_db=True for agent-managed messages, still
  writes to JSONL as backup

Verified: a 12-message CLI session with tool calls produces exactly
12 SQLite rows with zero duplicates (previously would be 36-48).

Tests: 9 new tests covering flush deduplication, skip_db behavior,
compression reset, and initialization. Full suite passes (2869 tests).

											
										
										
											2026-03-10 15:22:44 -07:00
+								        Uses _last_flushed_db_idx to track which messages have already been
 								        written, so repeated calls (from multiple exit paths) only write
 								        truly new messages — preventing the duplicate-write bug (#860).
-												feat: update database schema and enhance message persistence

- Incremented schema version to 2 and added a new column `finish_reason` to the `messages` table.
- Implemented a method to flush un-logged messages to the session database, ensuring data integrity during conversation interruptions.
- Enhanced error handling to persist messages in various early-return scenarios, preventing data loss.

											
										
										
											2026-02-21 00:05:39 -08:00
+								        """
 								        if not self._session_db:
 								            return
-												fix: persist clean voice transcripts and /voice off state

- keep CLI voice prefixes API-local while storing the original user text
- persist explicit gateway off state and restore adapter auto-TTS suppression on restart
- add regression coverage for both behaviors

											
										
										
											2026-03-14 06:14:22 -07:00
+								        self._apply_persist_user_message_override(messages)
-												feat: update database schema and enhance message persistence

- Incremented schema version to 2 and added a new column `finish_reason` to the `messages` table.
- Implemented a method to flush un-logged messages to the session database, ensuring data integrity during conversation interruptions.
- Enhanced error handling to persist messages in various early-return scenarios, preventing data loss.

											
										
										
											2026-02-21 00:05:39 -08:00
+								        try:
-												Cherry-pick 6 bug fixes from PR #76 and update documentation

Code fixes (run_agent.py):
- Fix off-by-one in _flush_messages_to_session_db skipping one message per flush
- Add clear_interrupt() to 3 early-return paths preventing stale interrupt state
- Wrap handle_function_call in try/except so tool crashes don't kill the conversation
- Replace fragile `is` identity check with _flush_sentinel marker for memory flush cleanup
- Fix retry loop off-by-one (6 attempts not 7)
- Remove redundant inline `import re`

											
										
										
											2026-02-27 03:21:49 -08:00
+								            start_idx = len(conversation_history) if conversation_history else 0
-												fix: eliminate 3x SQLite message duplication in gateway sessions (#860)

Three separate code paths all wrote to the same SQLite state.db with
no deduplication, inflating session transcripts by 3-4x:

1. _log_msg_to_db() — wrote each message individually after append
2. _flush_messages_to_session_db() — re-wrote ALL new messages at
   every _persist_session() call (~18 exit points), with no tracking
   of what was already written
3. gateway append_to_transcript() — wrote everything a third time
   after the agent returned

Since load_transcript() prefers SQLite over JSONL, the inflated data
was loaded on every session resume, causing proportional token waste.

Fix:
- Remove _log_msg_to_db() and all 16 call sites (redundant with flush)
- Add _last_flushed_db_idx tracking in _flush_messages_to_session_db()
  so repeated _persist_session() calls only write truly new messages
- Reset flush cursor on compression (new session ID)
- Add skip_db parameter to SessionStore.append_to_transcript() so the
  gateway skips SQLite writes when the agent already persisted them
- Gateway now passes skip_db=True for agent-managed messages, still
  writes to JSONL as backup

Verified: a 12-message CLI session with tool calls produces exactly
12 SQLite rows with zero duplicates (previously would be 36-48).

Tests: 9 new tests covering flush deduplication, skip_db behavior,
compression reset, and initialization. Full suite passes (2869 tests).

											
										
										
											2026-03-10 15:22:44 -07:00
+								            flush_from = max(start_idx, self._last_flushed_db_idx)
 								            for msg in messages[flush_from:]:
-												feat: update database schema and enhance message persistence

- Incremented schema version to 2 and added a new column `finish_reason` to the `messages` table.
- Implemented a method to flush un-logged messages to the session database, ensuring data integrity during conversation interruptions.
- Enhanced error handling to persist messages in various early-return scenarios, preventing data loss.

											
										
										
											2026-02-21 00:05:39 -08:00
+								                role = msg.get("role", "unknown")
 								                content = msg.get("content")
 								                tool_calls_data = None
 								                if hasattr(msg, "tool_calls") and msg.tool_calls:
 								                    tool_calls_data = [
 								                        {"name": tc.function.name, "arguments": tc.function.arguments}
 								                        for tc in msg.tool_calls
 								                    ]
 								                elif isinstance(msg.get("tool_calls"), list):
 								                    tool_calls_data = msg["tool_calls"]
 								                self._session_db.append_message(
 								                    session_id=self.session_id,
 								                    role=role,
 								                    content=content,
 								                    tool_name=msg.get("tool_name"),
 								                    tool_calls=tool_calls_data,
 								                    tool_call_id=msg.get("tool_call_id"),
 								                    finish_reason=msg.get("finish_reason"),
 								                )
-												fix: eliminate 3x SQLite message duplication in gateway sessions (#860)

Three separate code paths all wrote to the same SQLite state.db with
no deduplication, inflating session transcripts by 3-4x:

1. _log_msg_to_db() — wrote each message individually after append
2. _flush_messages_to_session_db() — re-wrote ALL new messages at
   every _persist_session() call (~18 exit points), with no tracking
   of what was already written
3. gateway append_to_transcript() — wrote everything a third time
   after the agent returned

Since load_transcript() prefers SQLite over JSONL, the inflated data
was loaded on every session resume, causing proportional token waste.

Fix:
- Remove _log_msg_to_db() and all 16 call sites (redundant with flush)
- Add _last_flushed_db_idx tracking in _flush_messages_to_session_db()
  so repeated _persist_session() calls only write truly new messages
- Reset flush cursor on compression (new session ID)
- Add skip_db parameter to SessionStore.append_to_transcript() so the
  gateway skips SQLite writes when the agent already persisted them
- Gateway now passes skip_db=True for agent-managed messages, still
  writes to JSONL as backup

Verified: a 12-message CLI session with tool calls produces exactly
12 SQLite rows with zero duplicates (previously would be 36-48).

Tests: 9 new tests covering flush deduplication, skip_db behavior,
compression reset, and initialization. Full suite passes (2869 tests).

											
										
										
											2026-03-10 15:22:44 -07:00
+								            self._last_flushed_db_idx = len(messages)
-												refactor: enhance error handling with structured logging across multiple modules

- Updated various modules including cli.py, run_agent.py, gateway, and tools to replace silent exception handling with structured logging.
- Improved error messages to provide more context, aiding in debugging and monitoring.
- Ensured consistent logging practices throughout the codebase, enhancing traceability and maintainability.

											
										
										
											2026-02-21 03:32:11 -08:00
+								        except Exception as e:
 								            logger.debug("Session DB append_message failed: %s", e)
-												feat: update database schema and enhance message persistence

- Incremented schema version to 2 and added a new column `finish_reason` to the `messages` table.
- Implemented a method to flush un-logged messages to the session database, ensuring data integrity during conversation interruptions.
- Enhanced error handling to persist messages in various early-return scenarios, preventing data loss.

											
										
										
											2026-02-21 00:05:39 -08:00
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								    def _get_messages_up_to_last_assistant(self, messages: List[Dict]) -> List[Dict]:
 								        """
 								        Get messages up to (but not including) the last assistant turn.
 								        This is used when we need to "roll back" to the last successful point
 								        in the conversation, typically when the final assistant message is
 								        incomplete or malformed.
 								        Args:
 								            messages: Full message list
 								        Returns:
 								            Messages up to the last complete assistant turn (ending with user/tool message)
 								        """
 								        if not messages:
 								            return []
 								        # Find the index of the last assistant message
 								        last_assistant_idx = None
 								        for i in range(len(messages) - 1, -1, -1):
 								            if messages[i].get("role") == "assistant":
 								                last_assistant_idx = i
 								                break
 								        if last_assistant_idx is None:
 								            # No assistant message found, return all messages
 								            return messages.copy()
 								        # Return everything up to (not including) the last assistant message
 								        return messages[:last_assistant_idx]
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								    def _format_tools_for_system_message(self) -> str:
 								        """
 								        Format tool definitions for the system message in the trajectory format.
 								        Returns:
 								            str: JSON string representation of tool definitions
 								        """
 								        if not self.tools:
 								            return "[]"
 								        # Convert tool definitions to the format expected in trajectories
 								        formatted_tools = []
 								        for tool in self.tools:
 								            func = tool["function"]
 								            formatted_tool = {
 								                "name": func["name"],
 								                "description": func.get("description", ""),
 								                "parameters": func.get("parameters", {}),
 								                "required": None  # Match the format in the example
 								            }
 								            formatted_tools.append(formatted_tool)
-												some cleanups

											
										
										
											2025-11-05 03:47:17 +00:00
+								        return json.dumps(formatted_tools, ensure_ascii=False)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
 								    def _convert_to_trajectory_format(self, messages: List[Dict[str, Any]], user_query: str, completed: bool) -> List[Dict[str, Any]]:
 								        """
 								        Convert internal message format to trajectory format for saving.
 								        Args:
 								            messages (List[Dict]): Internal message history
 								            user_query (str): Original user query
 								            completed (bool): Whether the conversation completed successfully
 								        Returns:
 								            List[Dict]: Messages in trajectory format
 								        """
 								        trajectory = []
 								        # Add system message with tool definitions
 								        system_msg = (
 								            "You are a function calling AI model. You are provided with function signatures within <tools> </tools> XML tags. "
 								            "You may call one or more functions to assist with the user query. If available tools are not relevant in assisting "
 								            "with user query, just respond in natural conversational language. Don't make assumptions about what values to plug "
 								            "into functions. After calling & executing the functions, you will be provided with function results within "
 								            "<tool_response> </tool_response> XML tags. Here are the available tools:\n"
 								            f"<tools>\n{self._format_tools_for_system_message()}\n</tools>\n"
 								            "For each function call return a JSON object, with the following pydantic model json schema for each:\n"
 								            "{'title': 'FunctionCall', 'type': 'object', 'properties': {'name': {'title': 'Name', 'type': 'string'}, "
 								            "'arguments': {'title': 'Arguments', 'type': 'object'}}, 'required': ['name', 'arguments']}\n"
 								            "Each function call should be enclosed within <tool_call> </tool_call> XML tags.\n"
 								            "Example:\n<tool_call>\n{'name': <function-name>,'arguments': <args-dict>}\n</tool_call>"
 								        )
 								        trajectory.append({
 								            "from": "system",
 								            "value": system_msg
 								        })
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								        # Add the actual user prompt (from the dataset) as the first human message
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        trajectory.append({
 								            "from": "human",
 								            "value": user_query
 								        })
-												feat: add ephemeral prefill messages and system prompt loading

- Implemented functionality to load ephemeral prefill messages from a JSON file, enhancing few-shot priming capabilities for the agent.
- Introduced a mechanism to load an ephemeral system prompt from environment variables or configuration files, ensuring dynamic prompt adjustments at API-call time.
- Updated the CLI and agent initialization to utilize the new prefill messages and system prompt, improving the overall interaction experience.
- Enhanced configuration options with new environment variables for prefill messages and system prompts, allowing for greater customization without persistence.

											
										
										
											2026-02-23 23:55:42 -08:00
+								        # Skip the first message (the user query) since we already added it above.
 								        # Prefill messages are injected at API-call time only (not in the messages
 								        # list), so no offset adjustment is needed here.
 								        i = 1
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        while i < len(messages):
 								            msg = messages[i]
 								            if msg["role"] == "assistant":
 								                # Check if this message has tool calls
 								                if "tool_calls" in msg and msg["tool_calls"]:
 								                    # Format assistant message with tool calls
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                    # Add <think> tags around reasoning for trajectory storage
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                    content = ""
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								                    # Prepend reasoning in <think> tags if available (native thinking tokens)
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
+								                    if msg.get("reasoning") and msg["reasoning"].strip():
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                        content = f"<think>\n{msg['reasoning']}\n</think>\n"
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                    if msg.get("content") and msg["content"].strip():
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								                        # Convert any <REASONING_SCRATCHPAD> tags to <think> tags
 								                        # (used when native thinking is disabled and model reasons via XML)
-												refactor: streamline scratchpad handling in AIAgent

- Removed static methods for converting and checking <REASONING_SCRATCHPAD> tags, simplifying the codebase.
- Replaced calls to the removed methods with direct function calls for better clarity and maintainability.
- Updated trajectory saving logic to utilize a dedicated function for improved organization and readability.

											
										
										
											2026-02-23 09:55:09 -08:00
+								                        content += convert_scratchpad_to_think(msg["content"]) + "\n"
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
 								                    # Add tool calls wrapped in XML tags
 								                    for tool_call in msg["tool_calls"]:
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                        # Parse arguments - should always succeed since we validate during conversation
 								                        # but keep try-except as safety net
 								                        try:
 								                            arguments = json.loads(tool_call["function"]["arguments"]) if isinstance(tool_call["function"]["arguments"], str) else tool_call["function"]["arguments"]
 								                        except json.JSONDecodeError:
 								                            # This shouldn't happen since we validate and retry during conversation,
 								                            # but if it does, log warning and use empty dict
 								                            logging.warning(f"Unexpected invalid JSON in trajectory conversion: {tool_call['function']['arguments'][:100]}")
 								                            arguments = {}
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                        tool_call_json = {
 								                            "name": tool_call["function"]["name"],
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                            "arguments": arguments
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                        }
-												some cleanups

											
										
										
											2025-11-05 03:47:17 +00:00
+								                        content += f"<tool_call>\n{json.dumps(tool_call_json, ensure_ascii=False)}\n</tool_call>\n"
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												Refactor BatchRunner and AIAgent for enhanced reasoning and tool management, improved tool definitions for fileops

- Updated `ALL_POSSIBLE_TOOLS` to auto-derive from `TOOL_TO_TOOLSET_MAP` for consistent schema.
- Introduced `_extract_reasoning_stats` function to track reasoning coverage in assistant turns.
- Enhanced `_process_batch_worker` to discard prompts with no reasoning and aggregate reasoning statistics.
- Updated documentation and comments for clarity on new features and changes.

											
										
										
											2026-02-08 20:19:14 +00:00
+								                    # Ensure every gpt turn has a <think> block (empty if no reasoning)
 								                    # so the format is consistent for training data
 								                    if "<think>" not in content:
 								                        content = "<think>\n</think>\n" + content
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                    trajectory.append({
 								                        "from": "gpt",
 								                        "value": content.rstrip()
 								                    })
 								                    # Collect all subsequent tool responses
 								                    tool_responses = []
 								                    j = i + 1
 								                    while j < len(messages) and messages[j]["role"] == "tool":
 								                        tool_msg = messages[j]
 								                        # Format tool response with XML tags
 								                        tool_response = f"<tool_response>\n"
 								                        # Try to parse tool content as JSON if it looks like JSON
 								                        tool_content = tool_msg["content"]
 								                        try:
 								                            if tool_content.strip().startswith(("{", "[")):
 								                                tool_content = json.loads(tool_content)
 								                        except (json.JSONDecodeError, AttributeError):
 								                            pass  # Keep as string if not valid JSON
-												feat: secure skill env setup on load (core #688)

When a skill declares required_environment_variables in its YAML
frontmatter, missing env vars trigger a secure TUI prompt (identical
to the sudo password widget) when the skill is loaded. Secrets flow
directly to ~/.hermes/.env, never entering LLM context.

Key changes:
- New required_environment_variables frontmatter field for skills
- Secure TUI widget (masked input, 120s timeout)
- Gateway safety: messaging platforms show local setup guidance
- Legacy prerequisites.env_vars normalized into new format
- Remote backend handling: conservative setup_needed=True
- Env var name validation, file permissions hardened to 0o600
- Redact patterns extended for secret-related JSON fields
- 12 existing skills updated with prerequisites declarations
- ~48 new tests covering skip, timeout, gateway, remote backends
- Dynamic panel widget sizing (fixes hardcoded width from original PR)

Cherry-picked from PR #723 by kshitijk4poor, rebased onto current main
with conflict resolution.

Fixes #688

Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>

											
										
										
											2026-03-13 03:14:04 -07:00
+								                        tool_index = len(tool_responses)
 								                        tool_name = (
 								                            msg["tool_calls"][tool_index]["function"]["name"]
 								                            if tool_index < len(msg["tool_calls"])
 								                            else "unknown"
 								                        )
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                        tool_response += json.dumps({
 								                            "tool_call_id": tool_msg.get("tool_call_id", ""),
-												feat: secure skill env setup on load (core #688)

When a skill declares required_environment_variables in its YAML
frontmatter, missing env vars trigger a secure TUI prompt (identical
to the sudo password widget) when the skill is loaded. Secrets flow
directly to ~/.hermes/.env, never entering LLM context.

Key changes:
- New required_environment_variables frontmatter field for skills
- Secure TUI widget (masked input, 120s timeout)
- Gateway safety: messaging platforms show local setup guidance
- Legacy prerequisites.env_vars normalized into new format
- Remote backend handling: conservative setup_needed=True
- Env var name validation, file permissions hardened to 0o600
- Redact patterns extended for secret-related JSON fields
- 12 existing skills updated with prerequisites declarations
- ~48 new tests covering skip, timeout, gateway, remote backends
- Dynamic panel widget sizing (fixes hardcoded width from original PR)

Cherry-picked from PR #723 by kshitijk4poor, rebased onto current main
with conflict resolution.

Fixes #688

Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>

											
										
										
											2026-03-13 03:14:04 -07:00
+								                            "name": tool_name,
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                            "content": tool_content
-												some cleanups

											
										
										
											2025-11-05 03:47:17 +00:00
+								                        }, ensure_ascii=False)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                        tool_response += "\n</tool_response>"
 								                        tool_responses.append(tool_response)
 								                        j += 1
 								                    # Add all tool responses as a single message
 								                    if tool_responses:
 								                        trajectory.append({
 								                            "from": "tool",
 								                            "value": "\n".join(tool_responses)
 								                        })
 								                        i = j - 1  # Skip the tool messages we just processed
 								                else:
 								                    # Regular assistant message without tool calls
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                    # Add <think> tags around reasoning for trajectory storage
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
+								                    content = ""
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								                    # Prepend reasoning in <think> tags if available (native thinking tokens)
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
+								                    if msg.get("reasoning") and msg["reasoning"].strip():
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                        content = f"<think>\n{msg['reasoning']}\n</think>\n"
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								                    # Convert any <REASONING_SCRATCHPAD> tags to <think> tags
 								                    # (used when native thinking is disabled and model reasons via XML)
 								                    raw_content = msg["content"] or ""
-												refactor: streamline scratchpad handling in AIAgent

- Removed static methods for converting and checking <REASONING_SCRATCHPAD> tags, simplifying the codebase.
- Replaced calls to the removed methods with direct function calls for better clarity and maintainability.
- Updated trajectory saving logic to utilize a dedicated function for improved organization and readability.

											
										
										
											2026-02-23 09:55:09 -08:00
+								                    content += convert_scratchpad_to_think(raw_content)
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
-												Refactor BatchRunner and AIAgent for enhanced reasoning and tool management, improved tool definitions for fileops

- Updated `ALL_POSSIBLE_TOOLS` to auto-derive from `TOOL_TO_TOOLSET_MAP` for consistent schema.
- Introduced `_extract_reasoning_stats` function to track reasoning coverage in assistant turns.
- Enhanced `_process_batch_worker` to discard prompts with no reasoning and aggregate reasoning statistics.
- Updated documentation and comments for clarity on new features and changes.

											
										
										
											2026-02-08 20:19:14 +00:00
+								                    # Ensure every gpt turn has a <think> block (empty if no reasoning)
 								                    if "<think>" not in content:
 								                        content = "<think>\n</think>\n" + content
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                    trajectory.append({
 								                        "from": "gpt",
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                        "value": content.strip()
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                    })
 								            elif msg["role"] == "user":
 								                trajectory.append({
 								                    "from": "human",
 								                    "value": msg["content"]
 								                })
 								            i += 1
 								        return trajectory
 								    def _save_trajectory(self, messages: List[Dict[str, Any]], user_query: str, completed: bool):
 								        """
 								        Persist the conversation as a trajectory record, when enabled.

 								        Nothing happens if ``self.save_trajectories`` is falsy. Otherwise the
 								        message history is converted with ``_convert_to_trajectory_format`` and
 								        the result is handed to ``_save_trajectory_to_file`` together with
 								        ``self.model`` and the completion flag.

 								        Args:
 								            messages (List[Dict]): Complete message history
 								            user_query (str): Original user query
 								            completed (bool): Whether the conversation completed successfully
 								        """
 								        if self.save_trajectories:
 								            _save_trajectory_to_file(
 								                self._convert_to_trajectory_format(messages, user_query, completed),
 								                self.model,
 								                completed,
 								            )
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												feat: add multi-provider authentication and inference provider selection

- Implemented a multi-provider authentication system for the Hermes Agent, supporting OAuth for Nous Portal and traditional API key methods for OpenRouter and custom endpoints.
- Enhanced CLI with commands for logging in and out of providers, allowing users to authenticate and manage their credentials easily.
- Updated configuration options to select inference providers, with detailed documentation on usage and setup.
- Improved status reporting to include authentication status and provider details, enhancing user awareness of their current configuration.
- Added new files for authentication handling and updated existing components to integrate the new provider system.

											
										
										
											2026-02-20 17:24:00 -08:00
+								    def _mask_api_key_for_logs(self, key: Optional[str]) -> Optional[str]:
 								        if not key:
 								            return None
 								        if len(key) <= 12:
 								            return "***"
 								        return f"{key[:8]}...{key[-4:]}"
 								    def _dump_api_request_debug(
 								        self,
 								        api_kwargs: Dict[str, Any],
 								        *,
 								        reason: str,
 								        error: Optional[Exception] = None,
 								    ) -> Optional[Path]:
 								        """
 								        Dump a debug-friendly HTTP request record for chat.completions.create().
 								        Captures the request body from api_kwargs (excluding transport-only keys
 								        like timeout). Intended for debugging provider-side 4xx failures where
 								        retries are not useful.
 								        """
 								        try:
 								            body = copy.deepcopy(api_kwargs)
 								            body.pop("timeout", None)
 								            body = {k: v for k, v in body.items() if v is not None}
 								            api_key = None
 								            try:
 								                api_key = getattr(self.client, "api_key", None)
-												refactor: enhance error handling with structured logging across multiple modules

- Updated various modules including cli.py, run_agent.py, gateway, and tools to replace silent exception handling with structured logging.
- Improved error messages to provide more context, aiding in debugging and monitoring.
- Ensured consistent logging practices throughout the codebase, enhancing traceability and maintainability.

											
										
										
											2026-02-21 03:32:11 -08:00
+								            except Exception as e:
 								                logger.debug("Could not extract API key for debug dump: %s", e)
-												feat: add multi-provider authentication and inference provider selection

- Implemented a multi-provider authentication system for the Hermes Agent, supporting OAuth for Nous Portal and traditional API key methods for OpenRouter and custom endpoints.
- Enhanced CLI with commands for logging in and out of providers, allowing users to authenticate and manage their credentials easily.
- Updated configuration options to select inference providers, with detailed documentation on usage and setup.
- Improved status reporting to include authentication status and provider details, enhancing user awareness of their current configuration.
- Added new files for authentication handling and updated existing components to integrate the new provider system.

											
										
										
											2026-02-20 17:24:00 -08:00
 								            dump_payload: Dict[str, Any] = {
 								                "timestamp": datetime.now().isoformat(),
 								                "session_id": self.session_id,
 								                "reason": reason,
 								                "request": {
 								                    "method": "POST",
 								                    "url": f"{self.base_url.rstrip('/')}/chat/completions",
 								                    "headers": {
 								                        "Authorization": f"Bearer {self._mask_api_key_for_logs(api_key)}",
 								                        "Content-Type": "application/json",
 								                    },
 								                    "body": body,
 								                },
 								            }
 								            if error is not None:
 								                error_info: Dict[str, Any] = {
 								                    "type": type(error).__name__,
 								                    "message": str(error),
 								                }
 								                for attr_name in ("status_code", "request_id", "code", "param", "type"):
 								                    attr_value = getattr(error, attr_name, None)
 								                    if attr_value is not None:
 								                        error_info[attr_name] = attr_value
 								                body_attr = getattr(error, "body", None)
 								                if body_attr is not None:
 								                    error_info["body"] = body_attr
 								                response_obj = getattr(error, "response", None)
 								                if response_obj is not None:
 								                    try:
 								                        error_info["response_status"] = getattr(response_obj, "status_code", None)
 								                        error_info["response_text"] = response_obj.text
-												refactor: enhance error handling with structured logging across multiple modules

- Updated various modules including cli.py, run_agent.py, gateway, and tools to replace silent exception handling with structured logging.
- Improved error messages to provide more context, aiding in debugging and monitoring.
- Ensured consistent logging practices throughout the codebase, enhancing traceability and maintainability.

											
										
										
											2026-02-21 03:32:11 -08:00
+								                    except Exception as e:
 								                        logger.debug("Could not extract error response details: %s", e)
-												feat: add multi-provider authentication and inference provider selection

- Implemented a multi-provider authentication system for the Hermes Agent, supporting OAuth for Nous Portal and traditional API key methods for OpenRouter and custom endpoints.
- Enhanced CLI with commands for logging in and out of providers, allowing users to authenticate and manage their credentials easily.
- Updated configuration options to select inference providers, with detailed documentation on usage and setup.
- Improved status reporting to include authentication status and provider details, enhancing user awareness of their current configuration.
- Added new files for authentication handling and updated existing components to integrate the new provider system.

											
										
										
											2026-02-20 17:24:00 -08:00
 								                dump_payload["error"] = error_info
 								            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
 								            dump_file = self.logs_dir / f"request_dump_{self.session_id}_{timestamp}.json"
 								            dump_file.write_text(
 								                json.dumps(dump_payload, ensure_ascii=False, indent=2, default=str),
 								                encoding="utf-8",
 								            )
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								            self._vprint(f"{self.log_prefix}🧾 Request debug dump written to: {dump_file}")
-												feat: add multi-provider authentication and inference provider selection

- Implemented a multi-provider authentication system for the Hermes Agent, supporting OAuth for Nous Portal and traditional API key methods for OpenRouter and custom endpoints.
- Enhanced CLI with commands for logging in and out of providers, allowing users to authenticate and manage their credentials easily.
- Updated configuration options to select inference providers, with detailed documentation on usage and setup.
- Improved status reporting to include authentication status and provider details, enhancing user awareness of their current configuration.
- Added new files for authentication handling and updated existing components to integrate the new provider system.

											
										
										
											2026-02-20 17:24:00 -08:00
 								            if os.getenv("HERMES_DUMP_REQUEST_STDOUT", "").strip().lower() in {"1", "true", "yes", "on"}:
 								                print(json.dumps(dump_payload, ensure_ascii=False, indent=2, default=str))
 								            return dump_file
 								        except Exception as dump_error:
 								            if self.verbose_logging:
 								                logging.warning(f"Failed to dump API request debug payload: {dump_error}")
 								            return None
-												refactor: enhance session content handling in AIAgent and update TTS output path

- Introduced a new static method `_clean_session_content` in the `AIAgent` class to convert REASONING_SCRATCHPAD tags to <think> blocks and clean up whitespace in session logs.
- Updated the `_save_session_log` method to utilize the cleaned content for assistant messages, ensuring consistency in session logs.
- Changed the default output directory for TTS audio files from `~/voice-memos` to `~/.hermes/audio_cache`, reflecting a more appropriate storage location.

											
										
										
											2026-02-25 04:22:03 -08:00
+								    @staticmethod
 								    def _clean_session_content(content: str) -> str:
 								        """Convert REASONING_SCRATCHPAD to think tags and clean up whitespace."""
 								        if not content:
 								            return content
 								        content = convert_scratchpad_to_think(content)
 								        content = re.sub(r'\n+(<think>)', r'\n\1', content)
 								        content = re.sub(r'(</think>)\n+', r'\1\n', content)
 								        return content.strip()
-												Enhance session logging and interactive sudo support

- Implemented automatic session logging, saving conversation trajectories to the `logs/` directory in JSON format, with each session having a unique identifier.
- Updated the CLI to display the session ID in the welcome banner for easy reference.
- Introduced an interactive sudo password prompt in CLI mode, allowing users to enter their password with a 45-second timeout, enhancing user experience during command execution.
- Documented session logging and interactive sudo features in `README.md`, `cli.md`, and `cli-config.yaml.example` for better user guidance.

											
										
										
											2026-02-01 15:36:26 -08:00
+								    def _save_session_log(self, messages: List[Dict[str, Any]] = None):
 								        """
-												refactor: remove temporary API payload logging and enhance session log structure

- Eliminated the `_log_api_payload` method used for temporary debugging, streamlining the codebase.
- Updated the `_save_session_log` method to save the full raw session, including all messages and metadata, improving the clarity and completeness of session logs.
- Adjusted session log entry to include additional context such as `base_url` and `platform` for better tracking.

											
										
										
											2026-02-21 01:26:37 -08:00
+								        Save the full raw session to a JSON file.
 								        Stores every message exactly as the agent sees it: user messages,
 								        assistant messages (with reasoning, finish_reason, tool_calls),
 								        tool responses (with tool_call_id, tool_name), and injected system
 								        messages (compression summaries, todo snapshots, etc.).
-												refactor: enhance session content handling in AIAgent and update TTS output path

- Introduced a new static method `_clean_session_content` in the `AIAgent` class to convert REASONING_SCRATCHPAD tags to <think> blocks and clean up whitespace in session logs.
- Updated the `_save_session_log` method to utilize the cleaned content for assistant messages, ensuring consistency in session logs.
- Changed the default output directory for TTS audio files from `~/voice-memos` to `~/.hermes/audio_cache`, reflecting a more appropriate storage location.

											
										
										
											2026-02-25 04:22:03 -08:00
+								        REASONING_SCRATCHPAD tags are converted to <think> blocks for consistency.
-												refactor: remove temporary API payload logging and enhance session log structure

- Eliminated the `_log_api_payload` method used for temporary debugging, streamlining the codebase.
- Updated the `_save_session_log` method to save the full raw session, including all messages and metadata, improving the clarity and completeness of session logs.
- Adjusted session log entry to include additional context such as `base_url` and `platform` for better tracking.

											
										
										
											2026-02-21 01:26:37 -08:00
+								        Overwritten after each turn so it always reflects the latest state.
-												Enhance session logging and interactive sudo support

- Implemented automatic session logging, saving conversation trajectories to the `logs/` directory in JSON format, with each session having a unique identifier.
- Updated the CLI to display the session ID in the welcome banner for easy reference.
- Introduced an interactive sudo password prompt in CLI mode, allowing users to enter their password with a 45-second timeout, enhancing user experience during command execution.
- Documented session logging and interactive sudo features in `README.md`, `cli.md`, and `cli-config.yaml.example` for better user guidance.

											
										
										
											2026-02-01 15:36:26 -08:00
+								        """
 								        messages = messages or self._session_messages
 								        if not messages:
 								            return
-												refactor: remove temporary API payload logging and enhance session log structure

- Eliminated the `_log_api_payload` method used for temporary debugging, streamlining the codebase.
- Updated the `_save_session_log` method to save the full raw session, including all messages and metadata, improving the clarity and completeness of session logs.
- Adjusted session log entry to include additional context such as `base_url` and `platform` for better tracking.

											
										
										
											2026-02-21 01:26:37 -08:00
-												Enhance session logging and interactive sudo support

- Implemented automatic session logging, saving conversation trajectories to the `logs/` directory in JSON format, with each session having a unique identifier.
- Updated the CLI to display the session ID in the welcome banner for easy reference.
- Introduced an interactive sudo password prompt in CLI mode, allowing users to enter their password with a 45-second timeout, enhancing user experience during command execution.
- Documented session logging and interactive sudo features in `README.md`, `cli.md`, and `cli-config.yaml.example` for better user guidance.

											
										
										
											2026-02-01 15:36:26 -08:00
+								        try:
-												refactor: enhance session content handling in AIAgent and update TTS output path

- Introduced a new static method `_clean_session_content` in the `AIAgent` class to convert REASONING_SCRATCHPAD tags to <think> blocks and clean up whitespace in session logs.
- Updated the `_save_session_log` method to utilize the cleaned content for assistant messages, ensuring consistency in session logs.
- Changed the default output directory for TTS audio files from `~/voice-memos` to `~/.hermes/audio_cache`, reflecting a more appropriate storage location.

											
										
										
											2026-02-25 04:22:03 -08:00
+								            # Clean assistant content for session logs
 								            cleaned = []
 								            for msg in messages:
 								                if msg.get("role") == "assistant" and msg.get("content"):
 								                    msg = dict(msg)
 								                    msg["content"] = self._clean_session_content(msg["content"])
 								                cleaned.append(msg)
-												Enhance session logging and interactive sudo support

- Implemented automatic session logging, saving conversation trajectories to the `logs/` directory in JSON format, with each session having a unique identifier.
- Updated the CLI to display the session ID in the welcome banner for easy reference.
- Introduced an interactive sudo password prompt in CLI mode, allowing users to enter their password with a 45-second timeout, enhancing user experience during command execution.
- Documented session logging and interactive sudo features in `README.md`, `cli.md`, and `cli-config.yaml.example` for better user guidance.

											
										
										
											2026-02-01 15:36:26 -08:00
+								            entry = {
 								                "session_id": self.session_id,
 								                "model": self.model,
-												refactor: remove temporary API payload logging and enhance session log structure

- Eliminated the `_log_api_payload` method used for temporary debugging, streamlining the codebase.
- Updated the `_save_session_log` method to save the full raw session, including all messages and metadata, improving the clarity and completeness of session logs.
- Adjusted session log entry to include additional context such as `base_url` and `platform` for better tracking.

											
										
										
											2026-02-21 01:26:37 -08:00
+								                "base_url": self.base_url,
 								                "platform": self.platform,
-												Enhance session logging and interactive sudo support

- Implemented automatic session logging, saving conversation trajectories to the `logs/` directory in JSON format, with each session having a unique identifier.
- Updated the CLI to display the session ID in the welcome banner for easy reference.
- Introduced an interactive sudo password prompt in CLI mode, allowing users to enter their password with a 45-second timeout, enhancing user experience during command execution.
- Documented session logging and interactive sudo features in `README.md`, `cli.md`, and `cli-config.yaml.example` for better user guidance.

											
										
										
											2026-02-01 15:36:26 -08:00
+								                "session_start": self.session_start.isoformat(),
 								                "last_updated": datetime.now().isoformat(),
-												feat(agent): include system prompt in agent status output

Added the system prompt to the AIAgent class's status output, ensuring that the current system prompt is included in the agent's status information. This enhancement improves visibility into the agent's configuration during runtime.

											
										
										
											2026-03-01 23:50:54 -08:00
+								                "system_prompt": self._cached_system_prompt or "",
-												feat(agent): include tools in agent status output

Added the tools attribute to the AIAgent class's status output, ensuring that the current tools used by the agent are included in the status information. This enhancement improves the visibility of the agent's capabilities during runtime.

											
										
										
											2026-03-02 00:13:41 -08:00
+								                "tools": self.tools or [],
-												refactor: enhance session content handling in AIAgent and update TTS output path

- Introduced a new static method `_clean_session_content` in the `AIAgent` class to convert REASONING_SCRATCHPAD tags to <think> blocks and clean up whitespace in session logs.
- Updated the `_save_session_log` method to utilize the cleaned content for assistant messages, ensuring consistency in session logs.
- Changed the default output directory for TTS audio files from `~/voice-memos` to `~/.hermes/audio_cache`, reflecting a more appropriate storage location.

											
										
										
											2026-02-25 04:22:03 -08:00
+								                "message_count": len(cleaned),
 								                "messages": cleaned,
-												Enhance session logging and interactive sudo support

- Implemented automatic session logging, saving conversation trajectories to the `logs/` directory in JSON format, with each session having a unique identifier.
- Updated the CLI to display the session ID in the welcome banner for easy reference.
- Introduced an interactive sudo password prompt in CLI mode, allowing users to enter their password with a 45-second timeout, enhancing user experience during command execution.
- Documented session logging and interactive sudo features in `README.md`, `cli.md`, and `cli-config.yaml.example` for better user guidance.

											
										
										
											2026-02-01 15:36:26 -08:00
+								            }
-												refactor: remove temporary API payload logging and enhance session log structure

- Eliminated the `_log_api_payload` method used for temporary debugging, streamlining the codebase.
- Updated the `_save_session_log` method to save the full raw session, including all messages and metadata, improving the clarity and completeness of session logs.
- Adjusted session log entry to include additional context such as `base_url` and `platform` for better tracking.

											
										
										
											2026-02-21 01:26:37 -08:00
-												fix: reuse shared atomic session log helper

											
										
										
											2026-03-14 02:56:13 -07:00
+								            atomic_json_write(
 								                self.session_log_file,
 								                entry,
 								                indent=2,
 								                default=str,
 								            )
-												refactor: remove temporary API payload logging and enhance session log structure

- Eliminated the `_log_api_payload` method used for temporary debugging, streamlining the codebase.
- Updated the `_save_session_log` method to save the full raw session, including all messages and metadata, improving the clarity and completeness of session logs.
- Adjusted session log entry to include additional context such as `base_url` and `platform` for better tracking.

											
										
										
											2026-02-21 01:26:37 -08:00
-												Enhance session logging and interactive sudo support

- Implemented automatic session logging, saving conversation trajectories to the `logs/` directory in JSON format, with each session having a unique identifier.
- Updated the CLI to display the session ID in the welcome banner for easy reference.
- Introduced an interactive sudo password prompt in CLI mode, allowing users to enter their password with a 45-second timeout, enhancing user experience during command execution.
- Documented session logging and interactive sudo features in `README.md`, `cli.md`, and `cli-config.yaml.example` for better user guidance.

											
										
										
											2026-02-01 15:36:26 -08:00
+								        except Exception as e:
 								            if self.verbose_logging:
 								                logging.warning(f"Failed to save session log: {e}")
-												Implement interrupt handling for agent and CLI input and persistent prompt line at bottom of CLI :)

- Enhanced the AIAgent class to support interrupt requests, allowing for graceful interruption of ongoing tasks and processing of new messages.
- Updated the HermesCLI to manage user input in a persistent manner, enabling real-time interruption of the agent's conversation.
- Introduced a mechanism in the GatewayRunner to handle incoming messages while an agent is running, allowing for immediate response to user commands.
- Improved overall user experience by providing feedback during interruptions and ensuring that pending messages are processed correctly.

											
										
										
											2026-02-03 16:15:49 -08:00
+								    def interrupt(self, message: str = None) -> None:
 								        """
 								        Request the agent to interrupt its current tool-calling loop.
 								        Call this from another thread (e.g., input handler, message receiver)
 								        to gracefully stop the agent and process a new message.
-												Implement interrupt handling for long-running tool executions in AIAgent

- Added functionality to signal and terminate long-running terminal commands when a new user message is received, allowing for immediate agent response.
- Introduced a global interrupt event in the terminal tool to facilitate early termination of subprocesses.
- Updated the AIAgent class to handle interrupts gracefully, ensuring that remaining tool calls are skipped and appropriate messages are returned to maintain valid message sequences.

											
										
										
											2026-02-10 16:34:27 -08:00
+								        Also signals long-running tool executions (e.g. terminal commands)
 								        to terminate early, so the agent can respond immediately.
-												Implement interrupt handling for agent and CLI input and persistent prompt line at bottom of CLI :)

- Enhanced the AIAgent class to support interrupt requests, allowing for graceful interruption of ongoing tasks and processing of new messages.
- Updated the HermesCLI to manage user input in a persistent manner, enabling real-time interruption of the agent's conversation.
- Introduced a mechanism in the GatewayRunner to handle incoming messages while an agent is running, allowing for immediate response to user commands.
- Improved overall user experience by providing feedback during interruptions and ensuring that pending messages are processed correctly.

											
										
										
											2026-02-03 16:15:49 -08:00
+								        Args:
 								            message: Optional new message that triggered the interrupt.
 								                     If provided, the agent will include this in its response context.
 								        Example (CLI):
 								            # In a separate input thread:
 								            if user_typed_something:
 								                agent.interrupt(user_input)
 								        Example (Messaging):
 								            # When new message arrives for active session:
 								            if session_has_running_agent:
 								                running_agent.interrupt(new_message.text)
 								        """
 								        self._interrupt_requested = True
 								        self._interrupt_message = message
-												feat: enhance interrupt handling and container resource configuration

- Introduced a shared interrupt signaling mechanism to allow tools to check for user interrupts during long-running operations.
- Updated the AIAgent to handle interrupts more effectively, ensuring in-progress tool calls are canceled and multiple interrupt messages are combined into one prompt.
- Enhanced the CLI configuration to include container resource limits (CPU, memory, disk) and persistence options for Docker, Singularity, and Modal environments.
- Improved documentation to clarify interrupt behaviors and container resource settings, providing users with better guidance on configuration and usage.

											
										
										
											2026-02-23 02:11:33 -08:00
+								        # Signal all tools to abort any in-flight operations immediately
 								        _set_interrupt(True)
-												feat: implement subagent delegation for task management

- Introduced the `delegate_task` tool, allowing the main agent to spawn child AIAgent instances with isolated context for complex tasks.
- Supported both single-task and batch processing (up to 3 concurrent tasks) to enhance task management capabilities.
- Updated configuration options for delegation, including maximum iterations and default toolsets for subagents.
- Enhanced documentation to provide clear guidance on using the delegation feature and its configuration.
- Added comprehensive tests to ensure the functionality and reliability of the delegation logic.

											
										
										
											2026-02-20 03:15:53 -08:00
+								        # Propagate interrupt to any running child agents (subagent delegation)
 								        for child in self._active_children:
 								            try:
 								                child.interrupt(message)
-												refactor: enhance error handling with structured logging across multiple modules

- Updated various modules including cli.py, run_agent.py, gateway, and tools to replace silent exception handling with structured logging.
- Improved error messages to provide more context, aiding in debugging and monitoring.
- Ensured consistent logging practices throughout the codebase, enhancing traceability and maintainability.

											
										
										
											2026-02-21 03:32:11 -08:00
+								            except Exception as e:
 								                logger.debug("Failed to propagate interrupt to child agent: %s", e)
-												Implement interrupt handling for agent and CLI input and persistent prompt line at bottom of CLI :)

- Enhanced the AIAgent class to support interrupt requests, allowing for graceful interruption of ongoing tasks and processing of new messages.
- Updated the HermesCLI to manage user input in a persistent manner, enabling real-time interruption of the agent's conversation.
- Introduced a mechanism in the GatewayRunner to handle incoming messages while an agent is running, allowing for immediate response to user commands.
- Improved overall user experience by providing feedback during interruptions and ensuring that pending messages are processed correctly.

											
										
										
											2026-02-03 16:15:49 -08:00
+								        if not self.quiet_mode:
 								            print(f"\n⚡ Interrupt requested" + (f": '{message[:40]}...'" if message and len(message) > 40 else f": '{message}'" if message else ""))
 								    def clear_interrupt(self) -> None:
-												refactor: enhance interrupt handling in AIAgent class

- Updated the `clear_interrupt` method to also reset the global tool interrupt signal, improving the clarity of interrupt management within the agent.
- This change ensures that all interrupt states are properly cleared, enhancing the reliability of the agent's operation.

											
										
										
											2026-02-25 03:45:47 -08:00
+								        """Clear any pending interrupt request and the global tool interrupt signal."""
-												Implement interrupt handling for agent and CLI input and persistent prompt line at bottom of CLI :)

- Enhanced the AIAgent class to support interrupt requests, allowing for graceful interruption of ongoing tasks and processing of new messages.
- Updated the HermesCLI to manage user input in a persistent manner, enabling real-time interruption of the agent's conversation.
- Introduced a mechanism in the GatewayRunner to handle incoming messages while an agent is running, allowing for immediate response to user commands.
- Improved overall user experience by providing feedback during interruptions and ensuring that pending messages are processed correctly.

											
										
										
											2026-02-03 16:15:49 -08:00
+								        self._interrupt_requested = False
 								        self._interrupt_message = None
-												refactor: enhance interrupt handling in AIAgent class

- Updated the `clear_interrupt` method to also reset the global tool interrupt signal, improving the clarity of interrupt management within the agent.
- This change ensures that all interrupt states are properly cleared, enhancing the reliability of the agent's operation.

											
										
										
											2026-02-25 03:45:47 -08:00
+								        _set_interrupt(False)
-												Add todo tool for agent task planning and management

Single `todo` tool that reads (no params) or writes (provide todos array
with merge flag). In-memory TodoStore on AIAgent, no system prompt
mutation, behavioral guidance in tool description only. State re-injected
after context compression events. Gateway sessions hydrate from
conversation history. Added to all platform toolsets.

Also wired into RL agent_loop.py with per-run TodoStore and fixed
browser_snapshot user_task passthrough from first user message.

											
										
										
											2026-02-17 17:02:33 -08:00
 								    def _hydrate_todo_store(self, history: List[Dict[str, Any]]) -> None:
 								        """
 								        Recover todo state from conversation history.
 								        The gateway creates a fresh AIAgent per message, so the in-memory
 								        TodoStore is empty. We scan the history for the most recent todo
 								        tool response and replay it to reconstruct the state.
 								        """
 								        # Walk history backwards to find the most recent todo tool response
 								        last_todo_response = None
 								        for msg in reversed(history):
 								            if msg.get("role") != "tool":
 								                continue
 								            content = msg.get("content", "")
 								            # Quick check: todo responses contain "todos" key
 								            if '"todos"' not in content:
 								                continue
 								            try:
 								                data = json.loads(content)
 								                if "todos" in data and isinstance(data["todos"], list):
 								                    last_todo_response = data["todos"]
 								                    break
 								            except (json.JSONDecodeError, TypeError):
 								                continue
 								        if last_todo_response:
 								            # Replay the items into the store (replace mode)
 								            self._todo_store.write(last_todo_response, merge=False)
 								            if not self.quiet_mode:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                self._vprint(f"{self.log_prefix}📋 Restored {len(last_todo_response)} todo item(s) from history")
-												feat: enhance interrupt handling and container resource configuration

- Introduced a shared interrupt signaling mechanism to allow tools to check for user interrupts during long-running operations.
- Updated the AIAgent to handle interrupts more effectively, ensuring in-progress tool calls are canceled and multiple interrupt messages are combined into one prompt.
- Enhanced the CLI configuration to include container resource limits (CPU, memory, disk) and persistence options for Docker, Singularity, and Modal environments.
- Improved documentation to clarify interrupt behaviors and container resource settings, providing users with better guidance on configuration and usage.

											
										
										
											2026-02-23 02:11:33 -08:00
+								        _set_interrupt(False)
-												Implement interrupt handling for agent and CLI input and persistent prompt line at bottom of CLI :)

- Enhanced the AIAgent class to support interrupt requests, allowing for graceful interruption of ongoing tasks and processing of new messages.
- Updated the HermesCLI to manage user input in a persistent manner, enabling real-time interruption of the agent's conversation.
- Introduced a mechanism in the GatewayRunner to handle incoming messages while an agent is running, allowing for immediate response to user commands.
- Improved overall user experience by providing feedback during interruptions and ensuring that pending messages are processed correctly.

											
										
										
											2026-02-03 16:15:49 -08:00
 								    @property
 								    def is_interrupted(self) -> bool:
 								        """Report whether an interrupt request is currently pending.

 								        Returns:
 								            The current value of ``self._interrupt_requested``, which
 								            ``interrupt()`` sets to True and ``clear_interrupt()`` resets
 								            to False.
 								        """
 								        pending = self._interrupt_requested
 								        return pending
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
 								    # ── Honcho integration helpers ──
-												fix(honcho): enforce local mode and cache-safe warmup

											
										
										
											2026-03-10 02:05:28 -07:00
+								    def _honcho_should_activate(self, hcfg) -> bool:
 								        """Return True when remote Honcho should be active."""
 								        if not hcfg or not hcfg.enabled or not hcfg.api_key:
 								            return False
-												refactor(honcho): remove local memory mode

The "local" memoryMode was redundant with enabled: false. Simplifies
the mode system to hybrid and honcho only.

											
										
										
											2026-03-12 16:23:34 -04:00
+								        return True
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
-												fix(honcho): harden tool gating and migration peer routing

Prevent stale Honcho tool exposure in context/local modes, restore reliable async write retry behavior, and ensure SOUL.md migration uploads target the AI peer instead of the user peer. Also align Honcho CLI key checks with host-scoped apiKey resolution and lock the fixes with regression tests.

Made-with: Cursor

											
										
										
											2026-03-11 18:21:27 -04:00
+								    def _strip_honcho_tools_from_surface(self) -> None:
 								        """Remove Honcho tools from the active tool surface."""
 								        if not self.tools:
 								            self.valid_tool_names = set()
 								            return
 								        self.tools = [
 								            tool for tool in self.tools
 								            if tool.get("function", {}).get("name") not in HONCHO_TOOL_NAMES
 								        ]
 								        self.valid_tool_names = {
 								            tool["function"]["name"] for tool in self.tools
 								        } if self.tools else set()
-												fix(honcho): enforce local mode and cache-safe warmup

											
										
										
											2026-03-10 02:05:28 -07:00
+								    def _activate_honcho(
 								        self,
 								        hcfg,
 								        *,
 								        enabled_toolsets: Optional[List[str]],
 								        disabled_toolsets: Optional[List[str]],
 								        session_db,
 								    ) -> None:
 								        """Finish Honcho setup once a session manager is available."""
 								        if not self._honcho:
 								            return
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
-												fix(honcho): enforce local mode and cache-safe warmup

											
										
										
											2026-03-10 02:05:28 -07:00
+								        if not self._honcho_session_key:
 								            session_title = None
 								            if session_db is not None:
 								                try:
 								                    session_title = session_db.get_session_title(self.session_id or "")
 								                except Exception:
 								                    pass
 								            self._honcho_session_key = (
 								                hcfg.resolve_session_name(
 								                    session_title=session_title,
 								                    session_id=self.session_id,
 								                )
 								                or "hermes-default"
 								            )
 								        honcho_sess = self._honcho.get_or_create(self._honcho_session_key)
 								        if not honcho_sess.messages:
 								            try:
 								                from hermes_cli.config import get_hermes_home
 								                mem_dir = str(get_hermes_home() / "memories")
 								                self._honcho.migrate_memory_files(
 								                    self._honcho_session_key,
 								                    mem_dir,
 								                )
 								            except Exception as exc:
 								                logger.debug("Memory files migration failed (non-fatal): %s", exc)
 								        from tools.honcho_tools import set_session_context
 								        set_session_context(self._honcho, self._honcho_session_key)
-												fix(honcho): harden tool gating and migration peer routing

Prevent stale Honcho tool exposure in context/local modes, restore reliable async write retry behavior, and ensure SOUL.md migration uploads target the AI peer instead of the user peer. Also align Honcho CLI key checks with host-scoped apiKey resolution and lock the fixes with regression tests.

Made-with: Cursor

											
										
										
											2026-03-11 18:21:27 -04:00
+								        # Rebuild tool surface after Honcho context injection. Tool availability
 								        # is check_fn-gated and may change once session context is attached.
 								        self.tools = get_tool_definitions(
 								            enabled_toolsets=enabled_toolsets,
 								            disabled_toolsets=disabled_toolsets,
 								            quiet_mode=True,
 								        )
 								        self.valid_tool_names = {
 								            tool["function"]["name"] for tool in self.tools
 								        } if self.tools else set()
 								        if hcfg.recall_mode == "context":
 								            self._strip_honcho_tools_from_surface()
 								            if not self.quiet_mode:
-												refactor(honcho): rename memory tools to Honcho tools, clarify recall mode language

Replace "memory tools" with "Honcho tools" and "pre-warmed/prefetch"
with "auto-injected context" in all user-facing strings and docs.

											
										
										
											2026-03-12 16:26:10 -04:00
+								                print("  Honcho active — recall_mode: context (Honcho tools hidden)")
-												fix(honcho): harden tool gating and migration peer routing

Prevent stale Honcho tool exposure in context/local modes, restore reliable async write retry behavior, and ensure SOUL.md migration uploads target the AI peer instead of the user peer. Also align Honcho CLI key checks with host-scoped apiKey resolution and lock the fixes with regression tests.

Made-with: Cursor

											
										
										
											2026-03-11 18:21:27 -04:00
+								        else:
-												fix(honcho): enforce local mode and cache-safe warmup

											
										
										
											2026-03-10 02:05:28 -07:00
+								            if not self.quiet_mode:
 								                print(f"  Honcho active — recall_mode: {hcfg.recall_mode}")
 								        logger.info(
 								            "Honcho active (session: %s, user: %s, workspace: %s, "
 								            "write_frequency: %s, memory_mode: %s)",
 								            self._honcho_session_key,
 								            hcfg.peer_name,
 								            hcfg.workspace_id,
 								            hcfg.write_frequency,
 								            hcfg.memory_mode,
 								        )
 								        recall_mode = hcfg.recall_mode
 								        if recall_mode != "tools":
 								            try:
 								                ctx = self._honcho.get_prefetch_context(self._honcho_session_key)
 								                if ctx:
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								                    self._honcho.set_context_result(self._honcho_session_key, ctx)
-												fix(honcho): enforce local mode and cache-safe warmup

											
										
										
											2026-03-10 02:05:28 -07:00
+								                    logger.debug("Honcho context pre-warmed for first turn")
 								            except Exception as exc:
 								                logger.debug("Honcho context prefetch failed (non-fatal): %s", exc)
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								        self._register_honcho_exit_hook()
-												fix(honcho): enforce local mode and cache-safe warmup

											
										
										
											2026-03-10 02:05:28 -07:00
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								    def _register_honcho_exit_hook(self) -> None:
 								        """Register a process-exit flush hook without clobbering signal handlers."""
 								        if self._honcho_exit_hook_registered or not self._honcho:
 								            return
-												fix(honcho): enforce local mode and cache-safe warmup

											
										
										
											2026-03-10 02:05:28 -07:00
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								        honcho_ref = weakref.ref(self._honcho)
 								        def _flush_honcho_on_exit():
 								            """Flush the Honcho manager if the weak reference still resolves.

 								            Any exception raised by the flush is logged at debug level and
 								            suppressed.
 								            """
 								            live_manager = honcho_ref()
 								            if live_manager is not None:
 								                try:
 								                    live_manager.flush_all()
 								                except Exception as err:
 								                    logger.debug("Honcho flush on exit failed (non-fatal): %s", err)
-												fix(honcho): enforce local mode and cache-safe warmup

											
										
										
											2026-03-10 02:05:28 -07:00
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								        atexit.register(_flush_honcho_on_exit)
 								        self._honcho_exit_hook_registered = True
 								    def _queue_honcho_prefetch(self, user_message: str) -> None:
 								        """Queue turn-end Honcho prefetch so the next turn can consume cached results."""
 								        if not self._honcho or not self._honcho_session_key:
 								            return
 								        recall_mode = (self._honcho_config.recall_mode if self._honcho_config else "hybrid")
 								        if recall_mode == "tools":
 								            return
 								        try:
 								            self._honcho.prefetch_context(self._honcho_session_key, user_message)
 								            self._honcho.prefetch_dialectic(self._honcho_session_key, user_message or "What were we working on?")
 								        except Exception as exc:
 								            logger.debug("Honcho background prefetch failed (non-fatal): %s", exc)
-												fix(honcho): enforce local mode and cache-safe warmup

											
										
										
											2026-03-10 02:05:28 -07:00
 								    def _honcho_prefetch(self, user_message: str) -> str:
 								        """Assemble the first-turn Honcho context from the pre-warmed cache."""
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								        if not self._honcho or not self._honcho_session_key:
 								            return ""
 								        try:
 								            parts = []
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
 								            ctx = self._honcho.pop_context_result(self._honcho_session_key)
 								            if ctx:
 								                rep = ctx.get("representation", "")
 								                card = ctx.get("card", "")
 								                if rep:
 								                    parts.append(f"## User representation\n{rep}")
 								                if card:
 								                    parts.append(card)
 								                ai_rep = ctx.get("ai_representation", "")
 								                ai_card = ctx.get("ai_card", "")
 								                if ai_rep:
 								                    parts.append(f"## AI peer representation\n{ai_rep}")
 								                if ai_card:
 								                    parts.append(ai_card)
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								            dialectic = self._honcho.pop_dialectic_result(self._honcho_session_key)
 								            if dialectic:
 								                parts.append(f"## Continuity synthesis\n{dialectic}")
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								            if not parts:
 								                return ""
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								            header = (
 								                "# Honcho Memory (persistent cross-session context)\n"
 								                "Use this to answer questions about the user, prior sessions, "
 								                "and what you were working on together. Do not call tools to "
 								                "look up information that is already present here.\n"
 								            )
 								            return header + "\n\n".join(parts)
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								        except Exception as e:
 								            logger.debug("Honcho prefetch failed (non-fatal): %s", e)
 								            return ""
-												feat: integrate Honcho with USER.md memory system

When Honcho is active:
- System prompt uses Honcho prefetch instead of USER.md
- memory tool target=user add routes to Honcho
- MEMORY.md untouched in all cases

When disabled, everything works as before.

Also wires up contextTokens config to cap prefetch size.

											
										
										
											2026-02-26 18:02:24 -05:00
+								    def _honcho_save_user_observation(self, content: str) -> str:
 								        """Route a memory tool target=user add to Honcho.
 								        Sends the content as a user peer message so Honcho's reasoning
 								        model can incorporate it into the user representation.
 								        """
 								        if not content or not content.strip():
 								            return json.dumps({"success": False, "error": "Content cannot be empty."})
 								        try:
 								            session = self._honcho.get_or_create(self._honcho_session_key)
 								            session.add_message("user", f"[observation] {content.strip()}")
 								            self._honcho.save(session)
 								            return json.dumps({
 								                "success": True,
 								                "target": "user",
 								                "message": "Saved to Honcho user model.",
 								            })
 								        except Exception as e:
 								            logger.debug("Honcho user observation failed: %s", e)
 								            return json.dumps({"success": False, "error": f"Honcho save failed: {e}"})
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								    def _honcho_sync(self, user_content: str, assistant_content: str) -> None:
 								        """Sync the user/assistant message pair to Honcho."""
 								        if not self._honcho or not self._honcho_session_key:
 								            return
 								        try:
 								            session = self._honcho.get_or_create(self._honcho_session_key)
 								            session.add_message("user", user_content)
 								            session.add_message("assistant", assistant_content)
 								            self._honcho.save(session)
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								            logger.info("Honcho sync queued for session %s (%d messages)",
 								                        self._honcho_session_key, len(session.messages))
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								        except Exception as e:
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								            logger.warning("Honcho sync failed: %s", e)
 								            if not self.quiet_mode:
 								                print(f"  Honcho write failed: {e}")
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
+								    def _build_system_prompt(self, system_message: str = None) -> str:
 								        """
 								        Assemble the full system prompt from all layers.
 								        Called once per session (cached on self._cached_system_prompt) and only
 								        rebuilt after context compression events. This ensures the system prompt
 								        is stable across all turns in a session, maximizing prefix cache hits.
 								        """
 								        # Layers (in order):
 								        #   1. Default agent identity (always present)
 								        #   2. User / gateway system prompt (if provided)
 								        #   3. Persistent memory (frozen snapshot)
 								        #   4. Skills guidance (if skills tools are loaded)
 								        #   5. Context files (SOUL.md, AGENTS.md, .cursorrules)
 								        #   6. Current date & time (frozen at build time)
 								        #   7. Platform-specific formatting hint
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								        # If an AI peer name is configured in Honcho, personalise the identity line.
 								        _ai_peer_name = (
 								            self._honcho_config.ai_peer
 								            if self._honcho_config and self._honcho_config.ai_peer != "hermes"
 								            else None
 								        )
 								        if _ai_peer_name:
 								            _identity = DEFAULT_AGENT_IDENTITY.replace(
 								                "You are Hermes Agent",
 								                f"You are {_ai_peer_name}",
 ,
 								            )
 								        else:
 								            _identity = DEFAULT_AGENT_IDENTITY
 								        prompt_parts = [_identity]
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
-												Enhance agent guidance with memory and session search tools

- Introduced MEMORY_GUIDANCE and SESSION_SEARCH_GUIDANCE to improve agent's contextual awareness and proactive assistance.
- Updated AIAgent to conditionally include tool-aware guidance in prompts based on available tools.
- Enhanced descriptions in memory and session search schemas for clearer user instructions on when to utilize these features.

											
										
										
											2026-02-22 02:31:52 -08:00
+								        # Tool-aware behavioral guidance: only inject when the tools are loaded
 								        tool_guidance = []
 								        if "memory" in self.valid_tool_names:
 								            tool_guidance.append(MEMORY_GUIDANCE)
 								        if "session_search" in self.valid_tool_names:
 								            tool_guidance.append(SESSION_SEARCH_GUIDANCE)
-												feat: introduce skills management features in AIAgent and CLI

- Added skills configuration options in cli-config.yaml.example, including a nudge interval for skill creation reminders.
- Implemented skills guidance in AIAgent to prompt users to save reusable workflows after complex tasks.
- Enhanced skills indexing in the prompt builder to include descriptions from SKILL.md files for better context.
- Updated the agent's behavior to periodically remind users about potential skills during tool-calling iterations.

											
										
										
											2026-02-22 13:28:13 -08:00
+								        if "skill_manage" in self.valid_tool_names:
 								            tool_guidance.append(SKILLS_GUIDANCE)
-												Enhance agent guidance with memory and session search tools

- Introduced MEMORY_GUIDANCE and SESSION_SEARCH_GUIDANCE to improve agent's contextual awareness and proactive assistance.
- Updated AIAgent to conditionally include tool-aware guidance in prompts based on available tools.
- Enhanced descriptions in memory and session search schemas for clearer user instructions on when to utilize these features.

											
										
										
											2026-02-22 02:31:52 -08:00
+								        if tool_guidance:
 								            prompt_parts.append(" ".join(tool_guidance))
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								        # Honcho CLI awareness: tell Hermes about its own management commands
 								        # so it can refer the user to them rather than reinventing answers.
 								        if self._honcho and self._honcho_session_key:
 								            hcfg = self._honcho_config
 								            mode = hcfg.memory_mode if hcfg else "hybrid"
 								            freq = hcfg.write_frequency if hcfg else "async"
-												refactor(honcho): rename recallMode "auto" to "hybrid"

Matches the mental model: hybrid = context + tools,
context = context only, tools = tools only.

											
										
										
											2026-03-09 17:46:51 -04:00
+								            recall_mode = hcfg.recall_mode if hcfg else "hybrid"
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								            honcho_block = (
 								                "# Honcho memory integration\n"
 								                f"Active. Session: {self._honcho_session_key}. "
 								                f"Mode: {mode}. Write frequency: {freq}. Recall: {recall_mode}.\n"
 								            )
 								            if recall_mode == "context":
 								                honcho_block += (
-												refactor(honcho): rename memory tools to Honcho tools, clarify recall mode language

Replace "memory tools" with "Honcho tools" and "pre-warmed/prefetch"
with "auto-injected context" in all user-facing strings and docs.

											
										
										
											2026-03-12 16:26:10 -04:00
+								                    "Honcho context is injected into this system prompt below. "
 								                    "All memory retrieval comes from this context — no Honcho tools "
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								                    "are available. Answer questions about the user, prior sessions, "
 								                    "and recent work directly from the Honcho Memory section.\n"
 								                )
 								            elif recall_mode == "tools":
 								                honcho_block += (
-												refactor(honcho): rename memory tools to Honcho tools, clarify recall mode language

Replace "memory tools" with "Honcho tools" and "pre-warmed/prefetch"
with "auto-injected context" in all user-facing strings and docs.

											
										
										
											2026-03-12 16:26:10 -04:00
+								                    "Honcho tools:\n"
-												refactor(honcho): rename query_user_context to honcho_context

Consistent naming: all honcho tools now prefixed with honcho_
(honcho_context, honcho_search, honcho_profile, honcho_conclude).

											
										
										
											2026-03-09 17:59:30 -04:00
+								                    "  honcho_context <question>           — ask Honcho a question, LLM-synthesized answer\n"
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								                    "  honcho_search <query>                   — semantic search, raw excerpts, no LLM\n"
-												feat(honcho): add honcho_conclude tool for writing facts back to memory

New tool lets Hermes persist conclusions about the user (preferences,
corrections, project context) directly to Honcho via the conclusions
API. Feeds into the user's peer card and representation.

											
										
										
											2026-03-09 17:55:31 -04:00
+								                    "  honcho_profile                          — user's peer card, key facts, no LLM\n"
 								                    "  honcho_conclude <conclusion>            — write a fact about the user to memory\n"
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								                )
-												feat(honcho): add honcho_conclude tool for writing facts back to memory

New tool lets Hermes persist conclusions about the user (preferences,
corrections, project context) directly to Honcho via the conclusions
API. Feeds into the user's peer card and representation.

											
										
										
											2026-03-09 17:55:31 -04:00
+								            else:  # hybrid
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								                honcho_block += (
 								                    "Honcho context (user representation, peer card, and recent session summary) "
-												refactor(honcho): rename memory tools to Honcho tools, clarify recall mode language

Replace "memory tools" with "Honcho tools" and "pre-warmed/prefetch"
with "auto-injected context" in all user-facing strings and docs.

											
										
										
											2026-03-12 16:26:10 -04:00
+								                    "is injected into this system prompt below. Use it to answer continuity "
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								                    "questions ('where were we?', 'what were we working on?') WITHOUT calling "
-												refactor(honcho): rename memory tools to Honcho tools, clarify recall mode language

Replace "memory tools" with "Honcho tools" and "pre-warmed/prefetch"
with "auto-injected context" in all user-facing strings and docs.

											
										
										
											2026-03-12 16:26:10 -04:00
+								                    "any tools. Only call Honcho tools when you need information beyond what is "
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								                    "already present in the Honcho Memory section.\n"
-												refactor(honcho): rename memory tools to Honcho tools, clarify recall mode language

Replace "memory tools" with "Honcho tools" and "pre-warmed/prefetch"
with "auto-injected context" in all user-facing strings and docs.

											
										
										
											2026-03-12 16:26:10 -04:00
+								                    "Honcho tools:\n"
-												refactor(honcho): rename query_user_context to honcho_context

Consistent naming: all honcho tools now prefixed with honcho_
(honcho_context, honcho_search, honcho_profile, honcho_conclude).

											
										
										
											2026-03-09 17:59:30 -04:00
+								                    "  honcho_context <question>           — ask Honcho a question, LLM-synthesized answer\n"
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								                    "  honcho_search <query>                   — semantic search, raw excerpts, no LLM\n"
-												feat(honcho): add honcho_conclude tool for writing facts back to memory

New tool lets Hermes persist conclusions about the user (preferences,
corrections, project context) directly to Honcho via the conclusions
API. Feeds into the user's peer card and representation.

											
										
										
											2026-03-09 17:55:31 -04:00
+								                    "  honcho_profile                          — user's peer card, key facts, no LLM\n"
 								                    "  honcho_conclude <conclusion>            — write a fact about the user to memory\n"
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								                )
 								            honcho_block += (
 								                "Management commands (refer users here instead of explaining manually):\n"
 								                "  hermes honcho status                    — show full config + connection\n"
-												refactor(honcho): remove local memory mode

The "local" memoryMode was redundant with enabled: false. Simplifies
the mode system to hybrid and honcho only.

											
										
										
											2026-03-12 16:23:34 -04:00
+								                "  hermes honcho mode [hybrid|honcho]       — show or set memory mode\n"
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								                "  hermes honcho tokens [--context N] [--dialectic N] — show or set token budgets\n"
 								                "  hermes honcho peer [--user NAME] [--ai NAME] [--reasoning LEVEL]\n"
 								                "  hermes honcho sessions                  — list directory→session mappings\n"
 								                "  hermes honcho map <name>                — map cwd to a session name\n"
 								                "  hermes honcho identity [<file>] [--show] — seed or show AI peer identity\n"
 								                "  hermes honcho migrate                   — migration guide from openclaw-honcho\n"
 								                "  hermes honcho setup                     — full interactive wizard"
 								            )
 								            prompt_parts.append(honcho_block)
-												feat: add ephemeral prefill messages and system prompt loading

- Implemented functionality to load ephemeral prefill messages from a JSON file, enhancing few-shot priming capabilities for the agent.
- Introduced a mechanism to load an ephemeral system prompt from environment variables or configuration files, ensuring dynamic prompt adjustments at API-call time.
- Updated the CLI and agent initialization to utilize the new prefill messages and system prompt, improving the overall interaction experience.
- Enhanced configuration options with new environment variables for prefill messages and system prompts, allowing for greater customization without persistence.

											
										
										
											2026-02-23 23:55:42 -08:00
+								        # Note: ephemeral_system_prompt is NOT included here. It's injected at
 								        # API-call time only so it stays out of the cached/stored system prompt.
 								        if system_message is not None:
 								            prompt_parts.append(system_message)
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
 								        if self._memory_store:
 								            if self._memory_enabled:
 								                mem_block = self._memory_store.format_for_system_prompt("memory")
 								                if mem_block:
 								                    prompt_parts.append(mem_block)
-												refactor: run Honcho and USER.md in tandem

USER.md stays in system prompt when Honcho is active -- prefetch is
additive context, not a replacement. Memory tool user observations
write to both USER.md (local) and Honcho (cross-session) simultaneously.

											
										
										
											2026-02-26 18:07:33 -05:00
+								            # USER.md is always included when enabled -- Honcho prefetch is additive.
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
+								            if self._user_profile_enabled:
 								                user_block = self._memory_store.format_for_system_prompt("user")
 								                if user_block:
 								                    prompt_parts.append(user_block)
-												feat: introduce skill management tool for agent-created skills and skills migration to ~/.hermes

- Added a new `skill_manager_tool` to enable agents to create, update, and delete their own skills, enhancing procedural memory capabilities.
- Updated the skills directory structure to support user-created skills in `~/.hermes/skills/`, allowing for better organization and management.
- Enhanced the CLI and documentation to reflect the new skill management functionalities, including detailed instructions on creating and modifying skills.
- Implemented a manifest-based syncing mechanism for bundled skills to ensure user modifications are preserved during updates.

											
										
										
											2026-02-19 18:25:53 -08:00
+								        has_skills_tools = any(name in self.valid_tool_names for name in ['skills_list', 'skill_view', 'skill_manage'])
-												feat: conditional skill activation based on tool availability

Skills can now declare fallback_for_toolsets, fallback_for_tools,
requires_toolsets, and requires_tools in their SKILL.md frontmatter.
The system prompt builder filters skills automatically based on which
tools are available in the current session.

- Add _read_skill_conditions() to parse conditional frontmatter fields
- Add _skill_should_show() to evaluate conditions against available tools
- Update build_skills_system_prompt() to accept and apply tool availability
- Pass valid_tool_names and available toolsets from run_agent.py
- Backward compatible: skills without conditions always show; calling
  build_skills_system_prompt() with no args preserves existing behavior

Closes #539

											
										
										
											2026-03-09 23:13:39 +03:00
+								        if has_skills_tools:
 								            avail_toolsets = {ts for ts, avail in check_toolset_requirements().items() if avail}
 								            skills_prompt = build_skills_system_prompt(
 								                available_tools=self.valid_tool_names,
 								                available_toolsets=avail_toolsets,
 								            )
 								        else:
 								            skills_prompt = ""
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
+								        if skills_prompt:
 								            prompt_parts.append(skills_prompt)
 								        if not self.skip_context_files:
 								            context_files_prompt = build_context_files_prompt()
 								            if context_files_prompt:
 								                prompt_parts.append(context_files_prompt)
-												fix(timezone): add timezone-aware clock across agent, cron, and execute_code

											
										
										
											2026-03-03 11:57:18 +05:30
+								        from hermes_time import now as _hermes_now
 								        now = _hermes_now()
-												feat: include session ID in system prompt via --pass-session-id flag

Adds --pass-session-id CLI flag. When set, the agent's system prompt
includes the session ID:

  Conversation started: Sunday, March 08, 2026 06:32 PM
  Session ID: 20260308_183200_abc123

Usage:
  hermes --pass-session-id
  hermes chat --pass-session-id

Implementation threads the flag as a proper parameter through the full
chain (main.py → cli.py → run_agent.py) rather than using an env var,
avoiding collisions in multi-agent/multitenant setups.

Based on PR #726 by dmahan93, reworked to use instance parameter
instead of HERMES_PASS_SESSION_ID environment variable.

Co-authored-by: dmahan93 <dmahan93@users.noreply.github.com>

											
										
										
											2026-03-12 05:51:31 -07:00
+								        timestamp_line = f"Conversation started: {now.strftime('%A, %B %d, %Y %I:%M %p')}"
 								        if self.pass_session_id and self.session_id:
 								            timestamp_line += f"\nSession ID: {self.session_id}"
 								        prompt_parts.append(timestamp_line)
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
 								        platform_key = (self.platform or "").lower().strip()
 								        if platform_key in PLATFORM_HINTS:
 								            prompt_parts.append(PLATFORM_HINTS[platform_key])
 								        return "\n\n".join(prompt_parts)
-												fix: tool call repair — auto-lowercase, fuzzy match, helpful error on unknown tool (#520)

- Add _repair_tool_call(): tries lowercase, normalize, then fuzzy match (difflib 0.7)
- Replace 3-retry-then-abort with graceful error: model receives helpful message and self-corrects
- Conversation stays alive instead of dying on hallucinated tool names

Closes #520

											
										
										
											2026-03-08 18:39:43 +03:00
+								    def _repair_tool_call(self, tool_name: str) -> str | None:
 								        """Attempt to repair a mismatched tool name before aborting.
 . Try lowercase
 . Try normalized (lowercase + hyphens/spaces -> underscores)
 . Try fuzzy match (difflib, cutoff=0.7)
 								        Returns the repaired name if found in valid_tool_names, else None.
 								        """
 								        from difflib import get_close_matches
 								        # 1. Lowercase
 								        lowered = tool_name.lower()
 								        if lowered in self.valid_tool_names:
 								            return lowered
 								        # 2. Normalize
 								        normalized = lowered.replace("-", "_").replace(" ", "_")
 								        if normalized in self.valid_tool_names:
 								            return normalized
 								        # 3. Fuzzy match
 								        matches = get_close_matches(lowered, self.valid_tool_names, n=1, cutoff=0.7)
 								        if matches:
 								            return matches[0]
 								        return None
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
+								    def _invalidate_system_prompt(self):
 								        """
 								        Invalidate the cached system prompt, forcing a rebuild on the next turn.
 								        Called after context compression events. Also reloads memory from disk
 								        so the rebuilt prompt captures any writes from this session.
 								        """
 								        self._cached_system_prompt = None
 								        if self._memory_store:
 								            self._memory_store.load_from_disk()
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								    def _responses_tools(self, tools: Optional[List[Dict[str, Any]]] = None) -> Optional[List[Dict[str, Any]]]:
 								        """Convert chat-completions tool schemas to Responses function-tool schemas."""
 								        source_tools = tools if tools is not None else self.tools
 								        if not source_tools:
 								            return None
 								        converted: List[Dict[str, Any]] = []
 								        for item in source_tools:
 								            fn = item.get("function", {}) if isinstance(item, dict) else {}
 								            name = fn.get("name")
 								            if not isinstance(name, str) or not name.strip():
 								                continue
 								            converted.append({
 								                "type": "function",
 								                "name": name,
 								                "description": fn.get("description", ""),
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								                "strict": False,
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                "parameters": fn.get("parameters", {"type": "object", "properties": {}}),
 								            })
 								        return converted or None
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								    @staticmethod
 								    def _split_responses_tool_id(raw_id: Any) -> tuple[Optional[str], Optional[str]]:
 								        """Split a stored tool id into (call_id, response_item_id)."""
 								        if not isinstance(raw_id, str):
 								            return None, None
 								        value = raw_id.strip()
 								        if not value:
 								            return None, None
 								        if "|" in value:
 								            call_id, response_item_id = value.split("|", 1)
 								            call_id = call_id.strip() or None
 								            response_item_id = response_item_id.strip() or None
 								            return call_id, response_item_id
 								        if value.startswith("fc_"):
 								            return None, value
 								        return value, None
 								    def _derive_responses_function_call_id(
 								        self,
 								        call_id: str,
 								        response_item_id: Optional[str] = None,
 								    ) -> str:
 								        """Build a valid Responses `function_call.id` (must start with `fc_`)."""
 								        if isinstance(response_item_id, str):
 								            candidate = response_item_id.strip()
 								            if candidate.startswith("fc_"):
 								                return candidate
 								        source = (call_id or "").strip()
 								        if source.startswith("fc_"):
 								            return source
 								        if source.startswith("call_") and len(source) > len("call_"):
 								            return f"fc_{source[len('call_'):]}"
 								        sanitized = re.sub(r"[^A-Za-z0-9_-]", "", source)
 								        if sanitized.startswith("fc_"):
 								            return sanitized
 								        if sanitized.startswith("call_") and len(sanitized) > len("call_"):
 								            return f"fc_{sanitized[len('call_'):]}"
 								        if sanitized:
 								            return f"fc_{sanitized[:48]}"
 								        seed = source or str(response_item_id or "") or uuid.uuid4().hex
 								        digest = hashlib.sha1(seed.encode("utf-8")).hexdigest()[:24]
 								        return f"fc_{digest}"
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								    def _chat_messages_to_responses_input(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
 								        """Convert internal chat-style messages to Responses input items."""
 								        items: List[Dict[str, Any]] = []
 								        for msg in messages:
 								            if not isinstance(msg, dict):
 								                continue
 								            role = msg.get("role")
 								            if role == "system":
 								                continue
 								            if role in {"user", "assistant"}:
 								                content = msg.get("content", "")
 								                content_text = str(content) if content is not None else ""
 								                if role == "assistant":
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								                    # Replay encrypted reasoning items from previous turns
 								                    # so the API can maintain coherent reasoning chains.
 								                    codex_reasoning = msg.get("codex_reasoning_items")
 								                    if isinstance(codex_reasoning, list):
 								                        for ri in codex_reasoning:
 								                            if isinstance(ri, dict) and ri.get("encrypted_content"):
 								                                items.append(ri)
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                    if content_text.strip():
 								                        items.append({"role": "assistant", "content": content_text})
 								                    tool_calls = msg.get("tool_calls")
 								                    if isinstance(tool_calls, list):
 								                        for tc in tool_calls:
 								                            if not isinstance(tc, dict):
 								                                continue
 								                            fn = tc.get("function", {})
 								                            fn_name = fn.get("name")
 								                            if not isinstance(fn_name, str) or not fn_name.strip():
 								                                continue
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								                            embedded_call_id, embedded_response_item_id = self._split_responses_tool_id(
 								                                tc.get("id")
 								                            )
 								                            call_id = tc.get("call_id")
 								                            if not isinstance(call_id, str) or not call_id.strip():
 								                                call_id = embedded_call_id
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                            if not isinstance(call_id, str) or not call_id.strip():
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								                                if (
 								                                    isinstance(embedded_response_item_id, str)
 								                                    and embedded_response_item_id.startswith("fc_")
 								                                    and len(embedded_response_item_id) > len("fc_")
 								                                ):
 								                                    call_id = f"call_{embedded_response_item_id[len('fc_'):]}"
 								                                else:
 								                                    call_id = f"call_{uuid.uuid4().hex[:12]}"
 								                            call_id = call_id.strip()
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                            arguments = fn.get("arguments", "{}")
 								                            if isinstance(arguments, dict):
 								                                arguments = json.dumps(arguments, ensure_ascii=False)
 								                            elif not isinstance(arguments, str):
 								                                arguments = str(arguments)
 								                            arguments = arguments.strip() or "{}"
 								                            items.append({
 								                                "type": "function_call",
 								                                "call_id": call_id,
 								                                "name": fn_name,
 								                                "arguments": arguments,
 								                            })
 								                    continue
 								                items.append({"role": role, "content": content_text})
 								                continue
 								            if role == "tool":
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								                raw_tool_call_id = msg.get("tool_call_id")
 								                call_id, _ = self._split_responses_tool_id(raw_tool_call_id)
 								                if not isinstance(call_id, str) or not call_id.strip():
 								                    if isinstance(raw_tool_call_id, str) and raw_tool_call_id.strip():
 								                        call_id = raw_tool_call_id.strip()
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                if not isinstance(call_id, str) or not call_id.strip():
 								                    continue
 								                items.append({
 								                    "type": "function_call_output",
 								                    "call_id": call_id,
 								                    "output": str(msg.get("content", "") or ""),
 								                })
 								        return items
-												Harden Codex auth refresh and responses compatibility

											
										
										
											2026-02-25 19:27:54 -08:00
+								    def _preflight_codex_input_items(self, raw_items: Any) -> List[Dict[str, Any]]:
 								        if not isinstance(raw_items, list):
 								            raise ValueError("Codex Responses input must be a list of input items.")
 								        normalized: List[Dict[str, Any]] = []
 								        for idx, item in enumerate(raw_items):
 								            if not isinstance(item, dict):
 								                raise ValueError(f"Codex Responses input[{idx}] must be an object.")
 								            item_type = item.get("type")
 								            if item_type == "function_call":
 								                call_id = item.get("call_id")
 								                name = item.get("name")
 								                if not isinstance(call_id, str) or not call_id.strip():
 								                    raise ValueError(f"Codex Responses input[{idx}] function_call is missing call_id.")
 								                if not isinstance(name, str) or not name.strip():
 								                    raise ValueError(f"Codex Responses input[{idx}] function_call is missing name.")
 								                arguments = item.get("arguments", "{}")
 								                if isinstance(arguments, dict):
 								                    arguments = json.dumps(arguments, ensure_ascii=False)
 								                elif not isinstance(arguments, str):
 								                    arguments = str(arguments)
 								                arguments = arguments.strip() or "{}"
 								                normalized.append(
 								                    {
 								                        "type": "function_call",
 								                        "call_id": call_id.strip(),
 								                        "name": name.strip(),
 								                        "arguments": arguments,
 								                    }
 								                )
 								                continue
 								            if item_type == "function_call_output":
 								                call_id = item.get("call_id")
 								                if not isinstance(call_id, str) or not call_id.strip():
 								                    raise ValueError(f"Codex Responses input[{idx}] function_call_output is missing call_id.")
 								                output = item.get("output", "")
 								                if output is None:
 								                    output = ""
 								                if not isinstance(output, str):
 								                    output = str(output)
 								                normalized.append(
 								                    {
 								                        "type": "function_call_output",
 								                        "call_id": call_id.strip(),
 								                        "output": output,
 								                    }
 								                )
 								                continue
-												feat(agent): enhance reasoning handling and configuration

Added support for processing encrypted reasoning content within the AIAgent class. Introduced logic to determine reasoning effort and enable/disable reasoning based on configuration settings. Updated the kwargs to reflect these changes, ensuring proper handling of reasoning parameters during agent execution.

											
										
										
											2026-03-01 16:15:20 -08:00
+								            if item_type == "reasoning":
 								                encrypted = item.get("encrypted_content")
 								                if isinstance(encrypted, str) and encrypted:
 								                    reasoning_item = {"type": "reasoning", "encrypted_content": encrypted}
 								                    item_id = item.get("id")
 								                    if isinstance(item_id, str) and item_id:
 								                        reasoning_item["id"] = item_id
-												feat(agent): add summary handling for reasoning items

Enhanced the AIAgent class to capture and normalize summary information for reasoning items. Implemented logic to handle summaries as lists, ensuring proper formatting for API interactions. Updated tests to validate the inclusion of summaries in reasoning items, both for existing and default cases.

											
										
										
											2026-03-01 20:03:03 -08:00
+								                    summary = item.get("summary")
 								                    if isinstance(summary, list):
 								                        reasoning_item["summary"] = summary
 								                    else:
 								                        reasoning_item["summary"] = []
-												feat(agent): enhance reasoning handling and configuration

Added support for processing encrypted reasoning content within the AIAgent class. Introduced logic to determine reasoning effort and enable/disable reasoning based on configuration settings. Updated the kwargs to reflect these changes, ensuring proper handling of reasoning parameters during agent execution.

											
										
										
											2026-03-01 16:15:20 -08:00
+								                    normalized.append(reasoning_item)
 								                continue
-												Harden Codex auth refresh and responses compatibility

											
										
										
											2026-02-25 19:27:54 -08:00
+								            role = item.get("role")
 								            if role in {"user", "assistant"}:
 								                content = item.get("content", "")
 								                if content is None:
 								                    content = ""
 								                if not isinstance(content, str):
 								                    content = str(content)
 								                normalized.append({"role": role, "content": content})
 								                continue
 								            raise ValueError(
 								                f"Codex Responses input[{idx}] has unsupported item shape (type={item_type!r}, role={role!r})."
 								            )
 								        return normalized
 								    def _preflight_codex_api_kwargs(
 								        self,
 								        api_kwargs: Any,
 								        *,
 								        allow_stream: bool = False,
 								    ) -> Dict[str, Any]:
 								        if not isinstance(api_kwargs, dict):
 								            raise ValueError("Codex Responses request must be a dict.")
 								        required = {"model", "instructions", "input"}
 								        missing = [key for key in required if key not in api_kwargs]
 								        if missing:
 								            raise ValueError(f"Codex Responses request missing required field(s): {', '.join(sorted(missing))}.")
 								        model = api_kwargs.get("model")
 								        if not isinstance(model, str) or not model.strip():
 								            raise ValueError("Codex Responses request 'model' must be a non-empty string.")
 								        model = model.strip()
 								        instructions = api_kwargs.get("instructions")
 								        if instructions is None:
 								            instructions = ""
 								        if not isinstance(instructions, str):
 								            instructions = str(instructions)
 								        instructions = instructions.strip() or DEFAULT_AGENT_IDENTITY
 								        normalized_input = self._preflight_codex_input_items(api_kwargs.get("input"))
 								        tools = api_kwargs.get("tools")
 								        normalized_tools = None
 								        if tools is not None:
 								            if not isinstance(tools, list):
 								                raise ValueError("Codex Responses request 'tools' must be a list when provided.")
 								            normalized_tools = []
 								            for idx, tool in enumerate(tools):
 								                if not isinstance(tool, dict):
 								                    raise ValueError(f"Codex Responses tools[{idx}] must be an object.")
 								                if tool.get("type") != "function":
 								                    raise ValueError(f"Codex Responses tools[{idx}] has unsupported type {tool.get('type')!r}.")
 								                name = tool.get("name")
 								                parameters = tool.get("parameters")
 								                if not isinstance(name, str) or not name.strip():
 								                    raise ValueError(f"Codex Responses tools[{idx}] is missing a valid name.")
 								                if not isinstance(parameters, dict):
 								                    raise ValueError(f"Codex Responses tools[{idx}] is missing valid parameters.")
 								                description = tool.get("description", "")
 								                if description is None:
 								                    description = ""
 								                if not isinstance(description, str):
 								                    description = str(description)
 								                strict = tool.get("strict", False)
 								                if not isinstance(strict, bool):
 								                    strict = bool(strict)
 								                normalized_tools.append(
 								                    {
 								                        "type": "function",
 								                        "name": name.strip(),
 								                        "description": description,
 								                        "strict": strict,
 								                        "parameters": parameters,
 								                    }
 								                )
 								        store = api_kwargs.get("store", False)
 								        if store is not False:
 								            raise ValueError("Codex Responses contract requires 'store' to be false.")
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								        allowed_keys = {
 								            "model", "instructions", "input", "tools", "store",
 								            "reasoning", "include", "max_output_tokens", "temperature",
-												fix: allow tool_choice, parallel_tool_calls, prompt_cache_key in codex preflight

_preflight_codex_api_kwargs rejected these three fields as unsupported,
but _build_api_kwargs adds them to every codex request. This caused a
ValueError before _interruptible_api_call was reached, which was caught
by the retry loop and retried with exponential backoff — appearing as
an infinite hang in tests (275s total backoff across 6 retries).

The fix adds these keys to allowed_keys and passes them through to the
normalized request dict.

This fixes the hanging test_cron_run_job_codex_path_handles_internal_401_refresh
test (now passes in 2.6s instead of timing out).

											
										
										
											2026-03-11 07:00:14 -07:00
+								            "tool_choice", "parallel_tool_calls", "prompt_cache_key",
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								        }
-												Harden Codex auth refresh and responses compatibility

											
										
										
											2026-02-25 19:27:54 -08:00
+								        normalized: Dict[str, Any] = {
 								            "model": model,
 								            "instructions": instructions,
 								            "input": normalized_input,
 								            "tools": normalized_tools,
 								            "store": False,
 								        }
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								        # Pass through reasoning config
 								        reasoning = api_kwargs.get("reasoning")
 								        if isinstance(reasoning, dict):
 								            normalized["reasoning"] = reasoning
 								        include = api_kwargs.get("include")
 								        if isinstance(include, list):
 								            normalized["include"] = include
 								        # Pass through max_output_tokens and temperature
 								        max_output_tokens = api_kwargs.get("max_output_tokens")
 								        if isinstance(max_output_tokens, (int, float)) and max_output_tokens > 0:
 								            normalized["max_output_tokens"] = int(max_output_tokens)
 								        temperature = api_kwargs.get("temperature")
 								        if isinstance(temperature, (int, float)):
 								            normalized["temperature"] = float(temperature)
-												fix: allow tool_choice, parallel_tool_calls, prompt_cache_key in codex preflight

_preflight_codex_api_kwargs rejected these three fields as unsupported,
but _build_api_kwargs adds them to every codex request. This caused a
ValueError before _interruptible_api_call was reached, which was caught
by the retry loop and retried with exponential backoff — appearing as
an infinite hang in tests (275s total backoff across 6 retries).

The fix adds these keys to allowed_keys and passes them through to the
normalized request dict.

This fixes the hanging test_cron_run_job_codex_path_handles_internal_401_refresh
test (now passes in 2.6s instead of timing out).

											
										
										
											2026-03-11 07:00:14 -07:00
+								        # Pass through tool_choice, parallel_tool_calls, prompt_cache_key
 								        for passthrough_key in ("tool_choice", "parallel_tool_calls", "prompt_cache_key"):
 								            val = api_kwargs.get(passthrough_key)
 								            if val is not None:
 								                normalized[passthrough_key] = val
-												Harden Codex auth refresh and responses compatibility

											
										
										
											2026-02-25 19:27:54 -08:00
+								        if allow_stream:
 								            stream = api_kwargs.get("stream")
 								            if stream is not None and stream is not True:
 								                raise ValueError("Codex Responses 'stream' must be true when set.")
 								            if stream is True:
 								                normalized["stream"] = True
 								            allowed_keys.add("stream")
 								        elif "stream" in api_kwargs:
 								            raise ValueError("Codex Responses stream flag is only allowed in fallback streaming requests.")
 								        unexpected = sorted(key for key in api_kwargs.keys() if key not in allowed_keys)
 								        if unexpected:
 								            raise ValueError(
 								                f"Codex Responses request has unsupported field(s): {', '.join(unexpected)}."
 								            )
 								        return normalized
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								    def _extract_responses_message_text(self, item: Any) -> str:
 								        """Extract assistant text from a Responses message output item."""
 								        content = getattr(item, "content", None)
 								        if not isinstance(content, list):
 								            return ""
 								        chunks: List[str] = []
 								        for part in content:
 								            ptype = getattr(part, "type", None)
 								            if ptype not in {"output_text", "text"}:
 								                continue
 								            text = getattr(part, "text", None)
 								            if isinstance(text, str) and text:
 								                chunks.append(text)
 								        return "".join(chunks).strip()
 								    def _extract_responses_reasoning_text(self, item: Any) -> str:
 								        """Extract a compact reasoning text from a Responses reasoning item."""
 								        summary = getattr(item, "summary", None)
 								        if isinstance(summary, list):
 								            chunks: List[str] = []
 								            for part in summary:
 								                text = getattr(part, "text", None)
 								                if isinstance(text, str) and text:
 								                    chunks.append(text)
 								            if chunks:
 								                return "\n".join(chunks).strip()
 								        text = getattr(item, "text", None)
 								        if isinstance(text, str) and text:
 								            return text.strip()
 								        return ""
 								    def _normalize_codex_response(self, response: Any) -> tuple[Any, str]:
 								        """Normalize a Responses API object to an assistant_message-like object."""
 								        output = getattr(response, "output", None)
 								        if not isinstance(output, list) or not output:
 								            raise RuntimeError("Responses API returned no output items")
 								        response_status = getattr(response, "status", None)
 								        if isinstance(response_status, str):
 								            response_status = response_status.strip().lower()
 								        else:
 								            response_status = None
 								        if response_status in {"failed", "cancelled"}:
 								            error_obj = getattr(response, "error", None)
 								            if isinstance(error_obj, dict):
 								                error_msg = error_obj.get("message") or str(error_obj)
 								            else:
 								                error_msg = str(error_obj) if error_obj else f"Responses API returned status '{response_status}'"
 								            raise RuntimeError(error_msg)
 								        content_parts: List[str] = []
 								        reasoning_parts: List[str] = []
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								        reasoning_items_raw: List[Dict[str, Any]] = []
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								        tool_calls: List[Any] = []
 								        has_incomplete_items = response_status in {"queued", "in_progress", "incomplete"}
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								        saw_commentary_phase = False
 								        saw_final_answer_phase = False
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
 								        for item in output:
 								            item_type = getattr(item, "type", None)
 								            item_status = getattr(item, "status", None)
 								            if isinstance(item_status, str):
 								                item_status = item_status.strip().lower()
 								            else:
 								                item_status = None
 								            if item_status in {"queued", "in_progress", "incomplete"}:
 								                has_incomplete_items = True
 								            if item_type == "message":
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								                item_phase = getattr(item, "phase", None)
 								                if isinstance(item_phase, str):
 								                    normalized_phase = item_phase.strip().lower()
 								                    if normalized_phase in {"commentary", "analysis"}:
 								                        saw_commentary_phase = True
 								                    elif normalized_phase in {"final_answer", "final"}:
 								                        saw_final_answer_phase = True
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                message_text = self._extract_responses_message_text(item)
 								                if message_text:
 								                    content_parts.append(message_text)
 								            elif item_type == "reasoning":
 								                reasoning_text = self._extract_responses_reasoning_text(item)
 								                if reasoning_text:
 								                    reasoning_parts.append(reasoning_text)
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								                # Capture the full reasoning item for multi-turn continuity.
 								                # encrypted_content is an opaque blob the API needs back on
 								                # subsequent turns to maintain coherent reasoning chains.
 								                encrypted = getattr(item, "encrypted_content", None)
 								                if isinstance(encrypted, str) and encrypted:
 								                    raw_item = {"type": "reasoning", "encrypted_content": encrypted}
 								                    item_id = getattr(item, "id", None)
 								                    if isinstance(item_id, str) and item_id:
 								                        raw_item["id"] = item_id
-												feat(agent): add summary handling for reasoning items

Enhanced the AIAgent class to capture and normalize summary information for reasoning items. Implemented logic to handle summaries as lists, ensuring proper formatting for API interactions. Updated tests to validate the inclusion of summaries in reasoning items, both for existing and default cases.

											
										
										
											2026-03-01 20:03:03 -08:00
+								                    # Capture summary — required by the API when replaying reasoning items
 								                    summary = getattr(item, "summary", None)
 								                    if isinstance(summary, list):
 								                        raw_summary = []
 								                        for part in summary:
 								                            text = getattr(part, "text", None)
 								                            if isinstance(text, str):
 								                                raw_summary.append({"type": "summary_text", "text": text})
 								                        raw_item["summary"] = raw_summary
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								                    reasoning_items_raw.append(raw_item)
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								            elif item_type == "function_call":
 								                if item_status in {"queued", "in_progress", "incomplete"}:
 								                    continue
 								                fn_name = getattr(item, "name", "") or ""
 								                arguments = getattr(item, "arguments", "{}")
 								                if not isinstance(arguments, str):
-												fix: use json.dumps instead of str() for Codex Responses API arguments

When the Responses API returns tool call arguments as a dict,
str(dict) produces Python repr with single quotes (e.g. {'key': 'val'})
which is invalid JSON. Downstream json.loads() fails silently and the
tool gets called with empty arguments, losing all parameters.

Affects both function_call and custom_tool_call item types in
_normalize_codex_response().

											
										
										
											2026-03-13 00:01:14 +03:00
+								                    arguments = json.dumps(arguments, ensure_ascii=False)
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								                raw_call_id = getattr(item, "call_id", None)
 								                raw_item_id = getattr(item, "id", None)
 								                embedded_call_id, _ = self._split_responses_tool_id(raw_item_id)
 								                call_id = raw_call_id if isinstance(raw_call_id, str) and raw_call_id.strip() else embedded_call_id
 								                if not isinstance(call_id, str) or not call_id.strip():
 								                    call_id = f"call_{uuid.uuid4().hex[:12]}"
 								                call_id = call_id.strip()
 								                response_item_id = raw_item_id if isinstance(raw_item_id, str) else None
 								                response_item_id = self._derive_responses_function_call_id(call_id, response_item_id)
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                tool_calls.append(SimpleNamespace(
 								                    id=call_id,
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								                    call_id=call_id,
 								                    response_item_id=response_item_id,
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                    type="function",
 								                    function=SimpleNamespace(name=fn_name, arguments=arguments),
 								                ))
 								            elif item_type == "custom_tool_call":
 								                fn_name = getattr(item, "name", "") or ""
 								                arguments = getattr(item, "input", "{}")
 								                if not isinstance(arguments, str):
-												fix: use json.dumps instead of str() for Codex Responses API arguments

When the Responses API returns tool call arguments as a dict,
str(dict) produces Python repr with single quotes (e.g. {'key': 'val'})
which is invalid JSON. Downstream json.loads() fails silently and the
tool gets called with empty arguments, losing all parameters.

Affects both function_call and custom_tool_call item types in
_normalize_codex_response().

											
										
										
											2026-03-13 00:01:14 +03:00
+								                    arguments = json.dumps(arguments, ensure_ascii=False)
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								                raw_call_id = getattr(item, "call_id", None)
 								                raw_item_id = getattr(item, "id", None)
 								                embedded_call_id, _ = self._split_responses_tool_id(raw_item_id)
 								                call_id = raw_call_id if isinstance(raw_call_id, str) and raw_call_id.strip() else embedded_call_id
 								                if not isinstance(call_id, str) or not call_id.strip():
 								                    call_id = f"call_{uuid.uuid4().hex[:12]}"
 								                call_id = call_id.strip()
 								                response_item_id = raw_item_id if isinstance(raw_item_id, str) else None
 								                response_item_id = self._derive_responses_function_call_id(call_id, response_item_id)
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                tool_calls.append(SimpleNamespace(
 								                    id=call_id,
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								                    call_id=call_id,
 								                    response_item_id=response_item_id,
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                    type="function",
 								                    function=SimpleNamespace(name=fn_name, arguments=arguments),
 								                ))
 								        final_text = "\n".join([p for p in content_parts if p]).strip()
 								        if not final_text and hasattr(response, "output_text"):
 								            out_text = getattr(response, "output_text", "")
 								            if isinstance(out_text, str):
 								                final_text = out_text.strip()
 								        assistant_message = SimpleNamespace(
 								            content=final_text,
 								            tool_calls=tool_calls,
 								            reasoning="\n\n".join(reasoning_parts).strip() if reasoning_parts else None,
 								            reasoning_content=None,
 								            reasoning_details=None,
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								            codex_reasoning_items=reasoning_items_raw or None,
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								        )
 								        if tool_calls:
 								            finish_reason = "tool_calls"
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								        elif has_incomplete_items or (saw_commentary_phase and not saw_final_answer_phase):
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								            finish_reason = "incomplete"
 								        else:
 								            finish_reason = "stop"
 								        return assistant_message, finish_reason
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								    def _thread_identity(self) -> str:
 								        thread = threading.current_thread()
 								        return f"{thread.name}:{thread.ident}"
 								    def _client_log_context(self) -> str:
 								        provider = getattr(self, "provider", "unknown")
 								        base_url = getattr(self, "base_url", "unknown")
 								        model = getattr(self, "model", "unknown")
 								        return (
 								            f"thread={self._thread_identity()} provider={provider} "
 								            f"base_url={base_url} model={model}"
 								        )
 								    def _openai_client_lock(self) -> threading.RLock:
 								        lock = getattr(self, "_client_lock", None)
 								        if lock is None:
 								            lock = threading.RLock()
 								            self._client_lock = lock
 								        return lock
 								    @staticmethod
 								    def _is_openai_client_closed(client: Any) -> bool:
 								        from unittest.mock import Mock
 								        if isinstance(client, Mock):
 								            return False
 								        http_client = getattr(client, "_client", None)
 								        return bool(getattr(http_client, "is_closed", False))
 								    def _create_openai_client(self, client_kwargs: dict, *, reason: str, shared: bool) -> Any:
 								        """Construct an ``OpenAI`` client from ``client_kwargs`` and log the event.
 								        Args:
 								            client_kwargs: Keyword arguments forwarded to ``OpenAI(...)``.
 								            reason: Label recorded in the log line.
 								            shared: Logged flag telling whether this is the shared client.
 								        Returns:
 								            The newly constructed client.
 								        """
 								        new_client = OpenAI(**client_kwargs)
 								        context = self._client_log_context()
 								        logger.info("OpenAI client created (%s, shared=%s) %s", reason, shared, context)
 								        return new_client
 								    def _close_openai_client(self, client: Any, *, reason: str, shared: bool) -> None:
 								        """Close ``client`` on a best-effort basis and log the outcome.
 								        ``None`` is ignored. A failure while closing is logged at debug level
 								        rather than raised.
 								        """
 								        if client is None:
 								            return
 								        try:
 								            client.close()
 								            # Logged inside the try so the success line only appears after close() returned.
 								            logger.info("OpenAI client closed (%s, shared=%s) %s", reason, shared, self._client_log_context())
 								        except Exception as close_error:
 								            logger.debug(
 								                "OpenAI client close failed (%s, shared=%s) %s error=%s",
 								                reason, shared, self._client_log_context(), close_error,
 								            )
 								    def _replace_primary_openai_client(self, *, reason: str) -> bool:
 								        """Swap ``self.client`` for a freshly built shared client.
 								        The replacement is built from ``self._client_kwargs`` while the client
 								        lock is held; the previous client is closed after the lock is released.
 								        Args:
 								            reason: Label included in the log lines for this rebuild.
 								        Returns:
 								            True when the new client was installed; False when construction
 								            raised, in which case ``self.client`` is left as it was.
 								        """
 								        with self._openai_client_lock():
 								            old_client = getattr(self, "client", None)
 								            try:
 								                new_client = self._create_openai_client(self._client_kwargs, reason=reason, shared=True)
 								            except Exception as exc:
 								                logger.warning(
 								                    "Failed to rebuild shared OpenAI client (%s) %s error=%s",
 								                    reason,
 								                    self._client_log_context(),
 								                    exc,
 								                )
 								                return False
 								            self.client = new_client
 								        # Close the superseded client outside the lock; a None old client is a no-op.
 								        self._close_openai_client(old_client, reason=f"replace:{reason}", shared=True)
 								        return True
 								    def _ensure_primary_openai_client(self, *, reason: str) -> Any:
 								        """Return a usable shared client, rebuilding it when needed.
 								        Args:
 								            reason: Label included in log lines and in the rebuild reason.
 								        Returns:
 								            ``self.client`` if it exists and is not closed, otherwise the
 								            newly rebuilt shared client.
 								        Raises:
 								            RuntimeError: If the shared client could not be recreated.
 								        """
 								        with self._openai_client_lock():
 								            client = getattr(self, "client", None)
 								            if client is not None and not self._is_openai_client_closed(client):
 								                return client
 								        # Reached when the client is closed and also when it is missing/None;
 								        # the warning text mentions only the closed case.
 								        # NOTE(review): the lock is released between the check above and the
 								        # rebuild below, so concurrent callers may each rebuild - confirm intended.
 								        logger.warning(
 								            "Detected closed shared OpenAI client; recreating before use (%s) %s",
 								            reason,
 								            self._client_log_context(),
 								        )
 								        if not self._replace_primary_openai_client(reason=f"recreate_closed:{reason}"):
 								            raise RuntimeError("Failed to recreate closed OpenAI client")
 								        with self._openai_client_lock():
 								            return self.client
 								    def _create_request_openai_client(self, *, reason: str) -> Any:
 								        from unittest.mock import Mock
 								        primary_client = self._ensure_primary_openai_client(reason=reason)
 								        if isinstance(primary_client, Mock):
 								            return primary_client
 								        with self._openai_client_lock():
 								            request_kwargs = dict(self._client_kwargs)
 								        return self._create_openai_client(request_kwargs, reason=reason, shared=False)
 								    def _close_request_openai_client(self, client: Any, *, reason: str) -> None:
 								        """Close a per-request client by delegating to ``_close_openai_client`` with ``shared=False``."""
 								        self._close_openai_client(client, reason=reason, shared=False)
 								    def _run_codex_stream(self, api_kwargs: dict, client: Any = None):
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								        """Execute one streaming Responses API request and return the final response."""
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								        active_client = client or self._ensure_primary_openai_client(reason="codex_stream_direct")
-												Harden Codex stream handling and ack continuation

											
										
										
											2026-02-25 18:56:06 -08:00
+								        max_stream_retries = 1
 								        for attempt in range(max_stream_retries + 1):
 								            try:
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                with active_client.responses.stream(**api_kwargs) as stream:
-												Harden Codex stream handling and ack continuation

											
										
										
											2026-02-25 18:56:06 -08:00
+								                    for _ in stream:
 								                        pass
 								                    return stream.get_final_response()
 								            except RuntimeError as exc:
 								                err_text = str(exc)
 								                missing_completed = "response.completed" in err_text
 								                if missing_completed and attempt < max_stream_retries:
 								                    logger.debug(
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                        "Responses stream closed before completion (attempt %s/%s); retrying. %s",
-												Harden Codex stream handling and ack continuation

											
										
										
											2026-02-25 18:56:06 -08:00
+								                        attempt + 1,
 								                        max_stream_retries + 1,
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                        self._client_log_context(),
-												Harden Codex stream handling and ack continuation

											
										
										
											2026-02-25 18:56:06 -08:00
+								                    )
 								                    continue
 								                if missing_completed:
 								                    logger.debug(
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                        "Responses stream did not emit response.completed; falling back to create(stream=True). %s",
 								                        self._client_log_context(),
-												Harden Codex stream handling and ack continuation

											
										
										
											2026-02-25 18:56:06 -08:00
+								                    )
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                    return self._run_codex_create_stream_fallback(api_kwargs, client=active_client)
-												Harden Codex stream handling and ack continuation

											
										
										
											2026-02-25 18:56:06 -08:00
+								                raise
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								    def _run_codex_create_stream_fallback(self, api_kwargs: dict, client: Any = None):
-												Fix Codex stream fallback for Responses completion gaps

											
										
										
											2026-02-25 19:08:11 -08:00
+								        """Fallback path for stream completion edge cases on Codex-style Responses backends."""
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								        active_client = client or self._ensure_primary_openai_client(reason="codex_create_stream_fallback")
-												Fix Codex stream fallback for Responses completion gaps

											
										
										
											2026-02-25 19:08:11 -08:00
+								        fallback_kwargs = dict(api_kwargs)
 								        fallback_kwargs["stream"] = True
-												Harden Codex auth refresh and responses compatibility

											
										
										
											2026-02-25 19:27:54 -08:00
+								        fallback_kwargs = self._preflight_codex_api_kwargs(fallback_kwargs, allow_stream=True)
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								        stream_or_response = active_client.responses.create(**fallback_kwargs)
-												Fix Codex stream fallback for Responses completion gaps

											
										
										
											2026-02-25 19:08:11 -08:00
 								        # Compatibility shim for mocks or providers that still return a concrete response.
 								        if hasattr(stream_or_response, "output"):
 								            return stream_or_response
 								        if not hasattr(stream_or_response, "__iter__"):
 								            return stream_or_response
 								        terminal_response = None
 								        try:
 								            for event in stream_or_response:
 								                event_type = getattr(event, "type", None)
 								                if not event_type and isinstance(event, dict):
 								                    event_type = event.get("type")
 								                if event_type not in {"response.completed", "response.incomplete", "response.failed"}:
 								                    continue
 								                terminal_response = getattr(event, "response", None)
 								                if terminal_response is None and isinstance(event, dict):
 								                    terminal_response = event.get("response")
 								                if terminal_response is not None:
 								                    return terminal_response
 								        finally:
 								            close_fn = getattr(stream_or_response, "close", None)
 								            if callable(close_fn):
 								                try:
 								                    close_fn()
 								                except Exception:
 								                    pass
 								        if terminal_response is not None:
 								            return terminal_response
 								        raise RuntimeError("Responses create(stream=True) fallback did not emit a terminal response.")
-												Harden Codex auth refresh and responses compatibility

											
										
										
											2026-02-25 19:27:54 -08:00
+								    def _try_refresh_codex_client_credentials(self, *, force: bool = True) -> bool:
 								        if self.api_mode != "codex_responses" or self.provider != "openai-codex":
 								            return False
 								        try:
 								            from hermes_cli.auth import resolve_codex_runtime_credentials
 								            creds = resolve_codex_runtime_credentials(force_refresh=force)
 								        except Exception as exc:
 								            logger.debug("Codex credential refresh failed: %s", exc)
 								            return False
 								        api_key = creds.get("api_key")
 								        base_url = creds.get("base_url")
 								        if not isinstance(api_key, str) or not api_key.strip():
 								            return False
 								        if not isinstance(base_url, str) or not base_url.strip():
 								            return False
 								        self.api_key = api_key.strip()
 								        self.base_url = base_url.strip().rstrip("/")
 								        self._client_kwargs["api_key"] = self.api_key
 								        self._client_kwargs["base_url"] = self.base_url
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								        if not self._replace_primary_openai_client(reason="codex_credential_refresh"):
-												Harden Codex auth refresh and responses compatibility

											
										
										
											2026-02-25 19:27:54 -08:00
+								            return False
 								        return True
-												fix: implement Nous credential refresh on 401 error for retry logic

											
										
										
											2026-03-07 13:34:23 +11:00
+								    def _try_refresh_nous_client_credentials(self, *, force: bool = True) -> bool:
 								        if self.api_mode != "chat_completions" or self.provider != "nous":
 								            return False
 								        try:
 								            from hermes_cli.auth import resolve_nous_runtime_credentials
 								            creds = resolve_nous_runtime_credentials(
 								                min_key_ttl_seconds=max(60, int(os.getenv("HERMES_NOUS_MIN_KEY_TTL_SECONDS", "1800"))),
 								                timeout_seconds=float(os.getenv("HERMES_NOUS_TIMEOUT_SECONDS", "15")),
 								                force_mint=force,
 								            )
 								        except Exception as exc:
 								            logger.debug("Nous credential refresh failed: %s", exc)
 								            return False
 								        api_key = creds.get("api_key")
 								        base_url = creds.get("base_url")
 								        if not isinstance(api_key, str) or not api_key.strip():
 								            return False
 								        if not isinstance(base_url, str) or not base_url.strip():
 								            return False
 								        self.api_key = api_key.strip()
 								        self.base_url = base_url.strip().rstrip("/")
 								        self._client_kwargs["api_key"] = self.api_key
 								        self._client_kwargs["base_url"] = self.base_url
 								        # Nous requests should not inherit OpenRouter-only attribution headers.
 								        self._client_kwargs.pop("default_headers", None)
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								        if not self._replace_primary_openai_client(reason="nous_credential_refresh"):
-												fix: implement Nous credential refresh on 401 error for retry logic

											
										
										
											2026-03-07 13:34:23 +11:00
+								            return False
 								        return True
-												fix: refresh Anthropic OAuth before stale env tokens

											
										
										
											2026-03-14 19:22:31 -07:00
+								    def _try_refresh_anthropic_client_credentials(self) -> bool:
 								        if self.api_mode != "anthropic_messages" or not hasattr(self, "_anthropic_api_key"):
 								            return False
 								        try:
 								            from agent.anthropic_adapter import resolve_anthropic_token, build_anthropic_client
 								            new_token = resolve_anthropic_token()
 								        except Exception as exc:
 								            logger.debug("Anthropic credential refresh failed: %s", exc)
 								            return False
 								        if not isinstance(new_token, str) or not new_token.strip():
 								            return False
 								        new_token = new_token.strip()
 								        if new_token == self._anthropic_api_key:
 								            return False
 								        try:
 								            self._anthropic_client.close()
 								        except Exception:
 								            pass
 								        try:
 								            self._anthropic_client = build_anthropic_client(new_token, getattr(self, "_anthropic_base_url", None))
 								        except Exception as exc:
 								            logger.warning("Failed to rebuild Anthropic client after credential refresh: %s", exc)
 								            return False
 								        self._anthropic_api_key = new_token
 								        return True
-												fix: preflight Anthropic auth and prefer Claude store

											
										
										
											2026-03-14 19:38:55 -07:00
+								    def _anthropic_messages_create(self, api_kwargs: dict):
 								        if self.api_mode == "anthropic_messages":
 								            self._try_refresh_anthropic_client_credentials()
 								        return self._anthropic_client.messages.create(**api_kwargs)
-												feat: implement interactive prompts for sudo password and command approval in CLI

- Added methods for handling sudo password and dangerous command approval prompts using a callback mechanism in cli.py.
- Integrated these prompts with the prompt_toolkit UI for improved user experience.
- Updated terminal_tool.py to support callback registration for interactive prompts, enhancing the CLI's interactivity.
- Introduced a background thread for API calls in run_agent.py to allow for interrupt handling during long-running operations.
- Enhanced error handling for interrupted API calls, ensuring graceful degradation of user experience.

											
										
										
											2026-02-21 12:15:40 -08:00
+								    def _interruptible_api_call(self, api_kwargs: dict):
 								        """
 								        Run the API call in a background thread so the main conversation loop
 								        can detect interrupts without waiting for the full HTTP round-trip.
-												feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).

											
										
										
											2026-03-03 23:03:42 +03:00
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								        Each worker thread gets its own OpenAI client instance. Interrupts only
 								        close that worker-local client, so retries and other requests never
 								        inherit a closed transport.
-												feat: implement interactive prompts for sudo password and command approval in CLI

- Added methods for handling sudo password and dangerous command approval prompts using a callback mechanism in cli.py.
- Integrated these prompts with the prompt_toolkit UI for improved user experience.
- Updated terminal_tool.py to support callback registration for interactive prompts, enhancing the CLI's interactivity.
- Introduced a background thread for API calls in run_agent.py to allow for interrupt handling during long-running operations.
- Enhanced error handling for interrupted API calls, ensuring graceful degradation of user experience.

											
										
										
											2026-02-21 12:15:40 -08:00
+								        """
 								        result = {"response": None, "error": None}
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								        request_client_holder = {"client": None}
-												feat: implement interactive prompts for sudo password and command approval in CLI

- Added methods for handling sudo password and dangerous command approval prompts using a callback mechanism in cli.py.
- Integrated these prompts with the prompt_toolkit UI for improved user experience.
- Updated terminal_tool.py to support callback registration for interactive prompts, enhancing the CLI's interactivity.
- Introduced a background thread for API calls in run_agent.py to allow for interrupt handling during long-running operations.
- Enhanced error handling for interrupted API calls, ensuring graceful degradation of user experience.

											
										
										
											2026-02-21 12:15:40 -08:00
 								        def _call():
 								            try:
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                if self.api_mode == "codex_responses":
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                    request_client_holder["client"] = self._create_request_openai_client(reason="codex_stream_request")
 								                    result["response"] = self._run_codex_stream(
 								                        api_kwargs,
 								                        client=request_client_holder["client"],
 								                    )
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								                elif self.api_mode == "anthropic_messages":
-												fix: preflight Anthropic auth and prefer Claude store

											
										
										
											2026-03-14 19:38:55 -07:00
+								                    result["response"] = self._anthropic_messages_create(api_kwargs)
-												fix: address voice mode review feedback

1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and
   openai are never imported at module level. Each is imported only when
   the feature is explicitly activated, preventing crashes in headless
   environments (SSH, Docker, WSL, no PortAudio).

2. No core agent loop changes: streaming TTS path extracted from
   _interruptible_api_call() into separate _streaming_api_call() method.
   The original method is restored to its upstream form.

3. Configurable key binding: push-to-talk key changed from Ctrl+R
   (conflicts with readline reverse-search) to Ctrl+B by default.
   Configurable via voice.push_to_talk_key in config.yaml.

4. Environment detection: new detect_audio_environment() function checks
   for SSH, Docker, WSL, and missing audio devices before enabling voice
   mode. Auto-disables with clear warnings in incompatible environments.

5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream,
   sd.OutputStream) wrapped in try/except with ImportError/OSError
   handling. Failures produce warnings, not crashes.

											
										
										
											2026-03-09 12:48:49 +03:00
+								                else:
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                    request_client_holder["client"] = self._create_request_openai_client(reason="chat_completion_request")
 								                    result["response"] = request_client_holder["client"].chat.completions.create(**api_kwargs)
-												fix: address voice mode review feedback

1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and
   openai are never imported at module level. Each is imported only when
   the feature is explicitly activated, preventing crashes in headless
   environments (SSH, Docker, WSL, no PortAudio).

2. No core agent loop changes: streaming TTS path extracted from
   _interruptible_api_call() into separate _streaming_api_call() method.
   The original method is restored to its upstream form.

3. Configurable key binding: push-to-talk key changed from Ctrl+R
   (conflicts with readline reverse-search) to Ctrl+B by default.
   Configurable via voice.push_to_talk_key in config.yaml.

4. Environment detection: new detect_audio_environment() function checks
   for SSH, Docker, WSL, and missing audio devices before enabling voice
   mode. Auto-disables with clear warnings in incompatible environments.

5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream,
   sd.OutputStream) wrapped in try/except with ImportError/OSError
   handling. Failures produce warnings, not crashes.

											
										
										
											2026-03-09 12:48:49 +03:00
+								            except Exception as e:
 								                result["error"] = e
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								            finally:
 								                request_client = request_client_holder.get("client")
 								                if request_client is not None:
 								                    self._close_request_openai_client(request_client, reason="request_complete")
-												feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).

											
										
										
											2026-03-03 23:03:42 +03:00
-												fix: address voice mode review feedback

1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and
   openai are never imported at module level. Each is imported only when
   the feature is explicitly activated, preventing crashes in headless
   environments (SSH, Docker, WSL, no PortAudio).

2. No core agent loop changes: streaming TTS path extracted from
   _interruptible_api_call() into separate _streaming_api_call() method.
   The original method is restored to its upstream form.

3. Configurable key binding: push-to-talk key changed from Ctrl+R
   (conflicts with readline reverse-search) to Ctrl+B by default.
   Configurable via voice.push_to_talk_key in config.yaml.

4. Environment detection: new detect_audio_environment() function checks
   for SSH, Docker, WSL, and missing audio devices before enabling voice
   mode. Auto-disables with clear warnings in incompatible environments.

5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream,
   sd.OutputStream) wrapped in try/except with ImportError/OSError
   handling. Failures produce warnings, not crashes.

											
										
										
											2026-03-09 12:48:49 +03:00
+								        t = threading.Thread(target=_call, daemon=True)
 								        t.start()
 								        while t.is_alive():
 								            t.join(timeout=0.3)
 								            if self._interrupt_requested:
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                # Force-close the in-flight worker-local HTTP connection to stop
 								                # token generation without poisoning the shared client used to
 								                # seed future retries.
-												fix: address voice mode review feedback

1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and
   openai are never imported at module level. Each is imported only when
   the feature is explicitly activated, preventing crashes in headless
   environments (SSH, Docker, WSL, no PortAudio).

2. No core agent loop changes: streaming TTS path extracted from
   _interruptible_api_call() into separate _streaming_api_call() method.
   The original method is restored to its upstream form.

3. Configurable key binding: push-to-talk key changed from Ctrl+R
   (conflicts with readline reverse-search) to Ctrl+B by default.
   Configurable via voice.push_to_talk_key in config.yaml.

4. Environment detection: new detect_audio_environment() function checks
   for SSH, Docker, WSL, and missing audio devices before enabling voice
   mode. Auto-disables with clear warnings in incompatible environments.

5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream,
   sd.OutputStream) wrapped in try/except with ImportError/OSError
   handling. Failures produce warnings, not crashes.

											
										
										
											2026-03-09 12:48:49 +03:00
+								                try:
-												fix: restore Anthropic interrupt handler in _interruptible_api_call

Rebase auto-merge silently overwrote main's Anthropic-aware interrupt
handler with the older OpenAI-only version. Without this fix, interrupting
an Anthropic API call closes the wrong client and leaves token generation
running on the Anthropic side.

											
										
										
											2026-03-13 15:09:49 +03:00
+								                    if self.api_mode == "anthropic_messages":
 								                        from agent.anthropic_adapter import build_anthropic_client
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
 								                        self._anthropic_client.close()
 								                        self._anthropic_client = build_anthropic_client(
 								                            self._anthropic_api_key,
 								                            getattr(self, "_anthropic_base_url", None),
 								                        )
-												fix: restore Anthropic interrupt handler in _interruptible_api_call

Rebase auto-merge silently overwrote main's Anthropic-aware interrupt
handler with the older OpenAI-only version. Without this fix, interrupting
an Anthropic API call closes the wrong client and leaves token generation
running on the Anthropic side.

											
										
										
											2026-03-13 15:09:49 +03:00
+								                    else:
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                        request_client = request_client_holder.get("client")
 								                        if request_client is not None:
 								                            self._close_request_openai_client(request_client, reason="interrupt_abort")
-												fix: address voice mode review feedback

1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and
   openai are never imported at module level. Each is imported only when
   the feature is explicitly activated, preventing crashes in headless
   environments (SSH, Docker, WSL, no PortAudio).

2. No core agent loop changes: streaming TTS path extracted from
   _interruptible_api_call() into separate _streaming_api_call() method.
   The original method is restored to its upstream form.

3. Configurable key binding: push-to-talk key changed from Ctrl+R
   (conflicts with readline reverse-search) to Ctrl+B by default.
   Configurable via voice.push_to_talk_key in config.yaml.

4. Environment detection: new detect_audio_environment() function checks
   for SSH, Docker, WSL, and missing audio devices before enabling voice
   mode. Auto-disables with clear warnings in incompatible environments.

5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream,
   sd.OutputStream) wrapped in try/except with ImportError/OSError
   handling. Failures produce warnings, not crashes.

											
										
										
											2026-03-09 12:48:49 +03:00
+								                except Exception:
 								                    pass
 								                raise InterruptedError("Agent interrupted during API call")
 								        if result["error"] is not None:
 								            raise result["error"]
 								        return result["response"]
 								    def _streaming_api_call(self, api_kwargs: dict, stream_callback):
 								        """Streaming variant of _interruptible_api_call for voice TTS pipeline.
 								        Uses ``stream=True`` and forwards content deltas to *stream_callback*
 								        in real-time.  Returns a ``SimpleNamespace`` that mimics a normal
 								        ``ChatCompletion`` so the rest of the agent loop works unchanged.
 								        This method is separate from ``_interruptible_api_call`` to keep the
 								        core agent loop untouched for non-voice users.
 								        """
 								        result = {"response": None, "error": None}
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								        request_client_holder = {"client": None}
-												fix: address voice mode review feedback

1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and
   openai are never imported at module level. Each is imported only when
   the feature is explicitly activated, preventing crashes in headless
   environments (SSH, Docker, WSL, no PortAudio).

2. No core agent loop changes: streaming TTS path extracted from
   _interruptible_api_call() into separate _streaming_api_call() method.
   The original method is restored to its upstream form.

3. Configurable key binding: push-to-talk key changed from Ctrl+R
   (conflicts with readline reverse-search) to Ctrl+B by default.
   Configurable via voice.push_to_talk_key in config.yaml.

4. Environment detection: new detect_audio_environment() function checks
   for SSH, Docker, WSL, and missing audio devices before enabling voice
   mode. Auto-disables with clear warnings in incompatible environments.

5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream,
   sd.OutputStream) wrapped in try/except with ImportError/OSError
   handling. Failures produce warnings, not crashes.

											
										
										
											2026-03-09 12:48:49 +03:00
 								        def _call():
 								            try:
-												feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).

											
										
										
											2026-03-03 23:03:42 +03:00
+								                stream_kwargs = {**api_kwargs, "stream": True}
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                request_client_holder["client"] = self._create_request_openai_client(
 								                    reason="chat_completion_stream_request"
 								                )
 								                stream = request_client_holder["client"].chat.completions.create(**stream_kwargs)
-												feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).

											
										
										
											2026-03-03 23:03:42 +03:00
 								                content_parts: list[str] = []
-												fix: address voice mode review feedback

1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and
   openai are never imported at module level. Each is imported only when
   the feature is explicitly activated, preventing crashes in headless
   environments (SSH, Docker, WSL, no PortAudio).

2. No core agent loop changes: streaming TTS path extracted from
   _interruptible_api_call() into separate _streaming_api_call() method.
   The original method is restored to its upstream form.

3. Configurable key binding: push-to-talk key changed from Ctrl+R
   (conflicts with readline reverse-search) to Ctrl+B by default.
   Configurable via voice.push_to_talk_key in config.yaml.

4. Environment detection: new detect_audio_environment() function checks
   for SSH, Docker, WSL, and missing audio devices before enabling voice
   mode. Auto-disables with clear warnings in incompatible environments.

5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream,
   sd.OutputStream) wrapped in try/except with ImportError/OSError
   handling. Failures produce warnings, not crashes.

											
										
										
											2026-03-09 12:48:49 +03:00
+								                tool_calls_acc: dict[int, dict] = {}
-												feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).

											
										
										
											2026-03-03 23:03:42 +03:00
+								                finish_reason = None
 								                model_name = None
 								                role = "assistant"
 								                for chunk in stream:
 								                    if not chunk.choices:
 								                        if hasattr(chunk, "model") and chunk.model:
 								                            model_name = chunk.model
 								                        continue
 								                    delta = chunk.choices[0].delta
 								                    if hasattr(chunk, "model") and chunk.model:
 								                        model_name = chunk.model
 								                    if delta and delta.content:
 								                        content_parts.append(delta.content)
 								                        try:
-												fix: address voice mode review feedback

1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and
   openai are never imported at module level. Each is imported only when
   the feature is explicitly activated, preventing crashes in headless
   environments (SSH, Docker, WSL, no PortAudio).

2. No core agent loop changes: streaming TTS path extracted from
   _interruptible_api_call() into separate _streaming_api_call() method.
   The original method is restored to its upstream form.

3. Configurable key binding: push-to-talk key changed from Ctrl+R
   (conflicts with readline reverse-search) to Ctrl+B by default.
   Configurable via voice.push_to_talk_key in config.yaml.

4. Environment detection: new detect_audio_environment() function checks
   for SSH, Docker, WSL, and missing audio devices before enabling voice
   mode. Auto-disables with clear warnings in incompatible environments.

5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream,
   sd.OutputStream) wrapped in try/except with ImportError/OSError
   handling. Failures produce warnings, not crashes.

											
										
										
											2026-03-09 12:48:49 +03:00
+								                            stream_callback(delta.content)
-												feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).

											
										
										
											2026-03-03 23:03:42 +03:00
+								                        except Exception:
 								                            pass
 								                    if delta and delta.tool_calls:
 								                        for tc_delta in delta.tool_calls:
-												fix: streaming tool call parsing, error handling, and fake HA state mutation

- Fix Gemini streaming tool call merge bug: multiple tool calls with same
  index but different IDs are now parsed as separate calls instead of
  concatenating names (e.g. ha_call_serviceha_call_service)
- Handle partial results in voice mode: show error and stop continuous
  mode when agent returns partial/failed results with empty response
- Fix error display during streaming TTS: error messages are shown in
  full response box even when streaming box was already opened
- Add duplicate sentence filter in TTS: skip near-duplicate sentences
  from LLM repetition
- Fix fake HA server state mutation: turn_on/turn_off/set_temperature
  correctly update entity states; temperature sensor simulates change
  when thermostat is adjusted

											
										
										
											2026-03-07 01:49:12 +03:00
+								                            idx = tc_delta.index if tc_delta.index is not None else 0
 								                            if idx in tool_calls_acc and tc_delta.id and tc_delta.id != tool_calls_acc[idx]["id"]:
 								                                matched = False
 								                                for eidx, eentry in tool_calls_acc.items():
 								                                    if eentry["id"] == tc_delta.id:
 								                                        idx = eidx
 								                                        matched = True
 								                                        break
 								                                if not matched:
 								                                    idx = (max(k for k in tool_calls_acc if isinstance(k, int)) + 1) if tool_calls_acc else 0
-												feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).

											
										
										
											2026-03-03 23:03:42 +03:00
+								                            if idx not in tool_calls_acc:
 								                                tool_calls_acc[idx] = {
 								                                    "id": tc_delta.id or "",
 								                                    "type": "function",
 								                                    "function": {"name": "", "arguments": ""},
 								                                }
 								                            entry = tool_calls_acc[idx]
 								                            if tc_delta.id:
 								                                entry["id"] = tc_delta.id
 								                            if tc_delta.function:
 								                                if tc_delta.function.name:
 								                                    entry["function"]["name"] += tc_delta.function.name
 								                                if tc_delta.function.arguments:
 								                                    entry["function"]["arguments"] += tc_delta.function.arguments
 								                    if chunk.choices[0].finish_reason:
 								                        finish_reason = chunk.choices[0].finish_reason
 								                full_content = "".join(content_parts) or None
 								                mock_tool_calls = None
 								                if tool_calls_acc:
 								                    mock_tool_calls = []
 								                    for idx in sorted(tool_calls_acc):
 								                        tc = tool_calls_acc[idx]
 								                        mock_tool_calls.append(SimpleNamespace(
 								                            id=tc["id"],
 								                            type=tc["type"],
 								                            function=SimpleNamespace(
 								                                name=tc["function"]["name"],
 								                                arguments=tc["function"]["arguments"],
 								                            ),
 								                        ))
 								                mock_message = SimpleNamespace(
 								                    role=role,
 								                    content=full_content,
 								                    tool_calls=mock_tool_calls,
 								                    reasoning_content=None,
 								                )
 								                mock_choice = SimpleNamespace(
 								                    index=0,
 								                    message=mock_message,
 								                    finish_reason=finish_reason or "stop",
 								                )
 								                mock_response = SimpleNamespace(
 								                    id="stream-" + str(uuid.uuid4()),
 								                    model=model_name,
 								                    choices=[mock_choice],
 								                    usage=None,
 								                )
 								                result["response"] = mock_response
-												feat: implement interactive prompts for sudo password and command approval in CLI

- Added methods for handling sudo password and dangerous command approval prompts using a callback mechanism in cli.py.
- Integrated these prompts with the prompt_toolkit UI for improved user experience.
- Updated terminal_tool.py to support callback registration for interactive prompts, enhancing the CLI's interactivity.
- Introduced a background thread for API calls in run_agent.py to allow for interrupt handling during long-running operations.
- Enhanced error handling for interrupted API calls, ensuring graceful degradation of user experience.

											
										
										
											2026-02-21 12:15:40 -08:00
+								            except Exception as e:
 								                result["error"] = e
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								            finally:
 								                request_client = request_client_holder.get("client")
 								                if request_client is not None:
 								                    self._close_request_openai_client(request_client, reason="stream_request_complete")
-												feat: implement interactive prompts for sudo password and command approval in CLI

- Added methods for handling sudo password and dangerous command approval prompts using a callback mechanism in cli.py.
- Integrated these prompts with the prompt_toolkit UI for improved user experience.
- Updated terminal_tool.py to support callback registration for interactive prompts, enhancing the CLI's interactivity.
- Introduced a background thread for API calls in run_agent.py to allow for interrupt handling during long-running operations.
- Enhanced error handling for interrupted API calls, ensuring graceful degradation of user experience.

											
										
										
											2026-02-21 12:15:40 -08:00
 								        t = threading.Thread(target=_call, daemon=True)
 								        t.start()
 								        while t.is_alive():
 								            t.join(timeout=0.3)
 								            if self._interrupt_requested:
-												feat: enhance interrupt handling and container resource configuration

- Introduced a shared interrupt signaling mechanism to allow tools to check for user interrupts during long-running operations.
- Updated the AIAgent to handle interrupts more effectively, ensuring in-progress tool calls are canceled and multiple interrupt messages are combined into one prompt.
- Enhanced the CLI configuration to include container resource limits (CPU, memory, disk) and persistence options for Docker, Singularity, and Modal environments.
- Improved documentation to clarify interrupt behaviors and container resource settings, providing users with better guidance on configuration and usage.

											
										
										
											2026-02-23 02:11:33 -08:00
+								                try:
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								                    if self.api_mode == "anthropic_messages":
 								                        from agent.anthropic_adapter import build_anthropic_client
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
 								                        self._anthropic_client.close()
 								                        self._anthropic_client = build_anthropic_client(
 								                            self._anthropic_api_key,
 								                            getattr(self, "_anthropic_base_url", None),
 								                        )
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								                    else:
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                        request_client = request_client_holder.get("client")
 								                        if request_client is not None:
 								                            self._close_request_openai_client(request_client, reason="stream_interrupt_abort")
-												feat: enhance interrupt handling and container resource configuration

- Introduced a shared interrupt signaling mechanism to allow tools to check for user interrupts during long-running operations.
- Updated the AIAgent to handle interrupts more effectively, ensuring in-progress tool calls are canceled and multiple interrupt messages are combined into one prompt.
- Enhanced the CLI configuration to include container resource limits (CPU, memory, disk) and persistence options for Docker, Singularity, and Modal environments.
- Improved documentation to clarify interrupt behaviors and container resource settings, providing users with better guidance on configuration and usage.

											
										
										
											2026-02-23 02:11:33 -08:00
+								                except Exception:
 								                    pass
-												feat: implement interactive prompts for sudo password and command approval in CLI

- Added methods for handling sudo password and dangerous command approval prompts using a callback mechanism in cli.py.
- Integrated these prompts with the prompt_toolkit UI for improved user experience.
- Updated terminal_tool.py to support callback registration for interactive prompts, enhancing the CLI's interactivity.
- Introduced a background thread for API calls in run_agent.py to allow for interrupt handling during long-running operations.
- Enhanced error handling for interrupted API calls, ensuring graceful degradation of user experience.

											
										
										
											2026-02-21 12:15:40 -08:00
+								                raise InterruptedError("Agent interrupted during API call")
 								        if result["error"] is not None:
 								            raise result["error"]
 								        return result["response"]
-												feat: simple fallback model for provider resilience

When the primary model/provider fails after retries (rate limit, overload,
auth errors, connection failures), Hermes automatically switches to a
configured fallback model for the remainder of the session.

Config (in ~/.hermes/config.yaml):

  fallback_model:
    provider: openrouter
    model: anthropic/claude-sonnet-4

Supports all major providers: OpenRouter, OpenAI, Nous, DeepSeek, Together,
Groq, Fireworks, Mistral, Gemini — plus custom endpoints via base_url and
api_key_env overrides.

Design principles:
- Dead simple: one fallback model, not a chain
- One-shot: switches once, doesn't ping-pong back
- Zero new dependencies: uses existing OpenAI client
- Minimal code: ~100 lines in run_agent.py, ~5 lines in cli.py/gateway
- Three trigger points: max retries exhausted, non-retryable client errors,
  and invalid response exhaustion

Does NOT trigger on context overflow or payload-too-large errors (those
are handled by the existing compression system).

Addresses #737.

25 new tests, 2492 total passing.

											
										
										
											2026-03-08 20:22:33 -07:00
+								    # ── Provider fallback ──────────────────────────────────────────────────
 								    def _try_activate_fallback(self) -> bool:
 								        """Switch to the configured fallback model/provider.
 								        Called when the primary model is failing after retries.  Swaps the
 								        OpenAI client, model slug, and provider in-place so the retry loop
 								        can continue with the new backend.  One-shot: returns False if
 								        already activated or not configured.
-												refactor: route main agent client + fallback through centralized router

Phase 2 of the provider router migration — route the main agent's
client construction and fallback activation through
resolve_provider_client() instead of duplicated ad-hoc logic.

run_agent.py:
- __init__: When no explicit api_key/base_url, use
  resolve_provider_client(provider, raw_codex=True) for client
  construction. Explicit creds (from CLI/gateway runtime provider)
  still construct directly.
- _try_activate_fallback: Replace _resolve_fallback_credentials and
  its duplicated _FALLBACK_API_KEY_PROVIDERS / _FALLBACK_OAUTH_PROVIDERS
  dicts with a single resolve_provider_client() call. The router
  handles all provider types (API-key, OAuth, Codex) centrally.
- Remove _resolve_fallback_credentials method and both fallback dicts.

agent/auxiliary_client.py:
- Add raw_codex parameter to resolve_provider_client(). When True,
  returns the raw OpenAI client for Codex providers instead of wrapping
  in CodexAuxiliaryClient. The main agent needs this for direct
  responses.stream() access.

3251 passed, 2 pre-existing unrelated failures.

											
										
										
											2026-03-11 21:38:29 -07:00
 								        Uses the centralized provider router (resolve_provider_client) for
 								        auth resolution and client construction — no duplicated provider→key
 								        mappings.
-												feat: simple fallback model for provider resilience

When the primary model/provider fails after retries (rate limit, overload,
auth errors, connection failures), Hermes automatically switches to a
configured fallback model for the remainder of the session.

Config (in ~/.hermes/config.yaml):

  fallback_model:
    provider: openrouter
    model: anthropic/claude-sonnet-4

Supports all major providers: OpenRouter, OpenAI, Nous, DeepSeek, Together,
Groq, Fireworks, Mistral, Gemini — plus custom endpoints via base_url and
api_key_env overrides.

Design principles:
- Dead simple: one fallback model, not a chain
- One-shot: switches once, doesn't ping-pong back
- Zero new dependencies: uses existing OpenAI client
- Minimal code: ~100 lines in run_agent.py, ~5 lines in cli.py/gateway
- Three trigger points: max retries exhausted, non-retryable client errors,
  and invalid response exhaustion

Does NOT trigger on context overflow or payload-too-large errors (those
are handled by the existing compression system).

Addresses #737.

25 new tests, 2492 total passing.

											
										
										
											2026-03-08 20:22:33 -07:00
+								        """
 								        if self._fallback_activated or not self._fallback_model:
 								            return False
 								        fb = self._fallback_model
 								        fb_provider = (fb.get("provider") or "").strip().lower()
 								        fb_model = (fb.get("model") or "").strip()
 								        if not fb_provider or not fb_model:
 								            return False
-												refactor: route main agent client + fallback through centralized router

Phase 2 of the provider router migration — route the main agent's
client construction and fallback activation through
resolve_provider_client() instead of duplicated ad-hoc logic.

run_agent.py:
- __init__: When no explicit api_key/base_url, use
  resolve_provider_client(provider, raw_codex=True) for client
  construction. Explicit creds (from CLI/gateway runtime provider)
  still construct directly.
- _try_activate_fallback: Replace _resolve_fallback_credentials and
  its duplicated _FALLBACK_API_KEY_PROVIDERS / _FALLBACK_OAUTH_PROVIDERS
  dicts with a single resolve_provider_client() call. The router
  handles all provider types (API-key, OAuth, Codex) centrally.
- Remove _resolve_fallback_credentials method and both fallback dicts.

agent/auxiliary_client.py:
- Add raw_codex parameter to resolve_provider_client(). When True,
  returns the raw OpenAI client for Codex providers instead of wrapping
  in CodexAuxiliaryClient. The main agent needs this for direct
  responses.stream() access.

3251 passed, 2 pre-existing unrelated failures.

											
										
										
											2026-03-11 21:38:29 -07:00
+								        # Use centralized router for client construction.
 								        # raw_codex=True because the main agent needs direct responses.stream()
 								        # access for Codex providers.
-												feat: simple fallback model for provider resilience

When the primary model/provider fails after retries (rate limit, overload,
auth errors, connection failures), Hermes automatically switches to a
configured fallback model for the remainder of the session.

Config (in ~/.hermes/config.yaml):

  fallback_model:
    provider: openrouter
    model: anthropic/claude-sonnet-4

Supports all major providers: OpenRouter, OpenAI, Nous, DeepSeek, Together,
Groq, Fireworks, Mistral, Gemini — plus custom endpoints via base_url and
api_key_env overrides.

Design principles:
- Dead simple: one fallback model, not a chain
- One-shot: switches once, doesn't ping-pong back
- Zero new dependencies: uses existing OpenAI client
- Minimal code: ~100 lines in run_agent.py, ~5 lines in cli.py/gateway
- Three trigger points: max retries exhausted, non-retryable client errors,
  and invalid response exhaustion

Does NOT trigger on context overflow or payload-too-large errors (those
are handled by the existing compression system).

Addresses #737.

25 new tests, 2492 total passing.

											
										
										
											2026-03-08 20:22:33 -07:00
+								        try:
-												refactor: route main agent client + fallback through centralized router

Phase 2 of the provider router migration — route the main agent's
client construction and fallback activation through
resolve_provider_client() instead of duplicated ad-hoc logic.

run_agent.py:
- __init__: When no explicit api_key/base_url, use
  resolve_provider_client(provider, raw_codex=True) for client
  construction. Explicit creds (from CLI/gateway runtime provider)
  still construct directly.
- _try_activate_fallback: Replace _resolve_fallback_credentials and
  its duplicated _FALLBACK_API_KEY_PROVIDERS / _FALLBACK_OAUTH_PROVIDERS
  dicts with a single resolve_provider_client() call. The router
  handles all provider types (API-key, OAuth, Codex) centrally.
- Remove _resolve_fallback_credentials method and both fallback dicts.

agent/auxiliary_client.py:
- Add raw_codex parameter to resolve_provider_client(). When True,
  returns the raw OpenAI client for Codex providers instead of wrapping
  in CodexAuxiliaryClient. The main agent needs this for direct
  responses.stream() access.

3251 passed, 2 pre-existing unrelated failures.

											
										
										
											2026-03-11 21:38:29 -07:00
+								            from agent.auxiliary_client import resolve_provider_client
 								            fb_client, _ = resolve_provider_client(
 								                fb_provider, model=fb_model, raw_codex=True)
 								            if fb_client is None:
 								                logging.warning(
 								                    "Fallback to %s failed: provider not configured",
 								                    fb_provider)
 								                return False
-												feat: simple fallback model for provider resilience

When the primary model/provider fails after retries (rate limit, overload,
auth errors, connection failures), Hermes automatically switches to a
configured fallback model for the remainder of the session.

Config (in ~/.hermes/config.yaml):

  fallback_model:
    provider: openrouter
    model: anthropic/claude-sonnet-4

Supports all major providers: OpenRouter, OpenAI, Nous, DeepSeek, Together,
Groq, Fireworks, Mistral, Gemini — plus custom endpoints via base_url and
api_key_env overrides.

Design principles:
- Dead simple: one fallback model, not a chain
- One-shot: switches once, doesn't ping-pong back
- Zero new dependencies: uses existing OpenAI client
- Minimal code: ~100 lines in run_agent.py, ~5 lines in cli.py/gateway
- Three trigger points: max retries exhausted, non-retryable client errors,
  and invalid response exhaustion

Does NOT trigger on context overflow or payload-too-large errors (those
are handled by the existing compression system).

Addresses #737.

25 new tests, 2492 total passing.

											
										
										
											2026-03-08 20:22:33 -07:00
-												refactor: route main agent client + fallback through centralized router

Phase 2 of the provider router migration — route the main agent's
client construction and fallback activation through
resolve_provider_client() instead of duplicated ad-hoc logic.

run_agent.py:
- __init__: When no explicit api_key/base_url, use
  resolve_provider_client(provider, raw_codex=True) for client
  construction. Explicit creds (from CLI/gateway runtime provider)
  still construct directly.
- _try_activate_fallback: Replace _resolve_fallback_credentials and
  its duplicated _FALLBACK_API_KEY_PROVIDERS / _FALLBACK_OAUTH_PROVIDERS
  dicts with a single resolve_provider_client() call. The router
  handles all provider types (API-key, OAuth, Codex) centrally.
- Remove _resolve_fallback_credentials method and both fallback dicts.

agent/auxiliary_client.py:
- Add raw_codex parameter to resolve_provider_client(). When True,
  returns the raw OpenAI client for Codex providers instead of wrapping
  in CodexAuxiliaryClient. The main agent needs this for direct
  responses.stream() access.

3251 passed, 2 pre-existing unrelated failures.

											
										
										
											2026-03-11 21:38:29 -07:00
+								            # Determine api_mode from provider
 								            fb_api_mode = "chat_completions"
 								            if fb_provider == "openai-codex":
 								                fb_api_mode = "codex_responses"
-												fix: anthropic adapter — max_tokens ignored, fallback crash, proxy base_url filtered

- Pass self.max_tokens to build_anthropic_kwargs instead of hardcoded None
- Add anthropic case to _try_activate_fallback (was only handling openai-codex)
- Remove 'anthropic in base_url' filter that blocked custom proxy URLs

											
										
										
											2026-03-13 04:22:16 +03:00
+								            elif fb_provider == "anthropic":
 								                fb_api_mode = "anthropic_messages"
-												refactor: route main agent client + fallback through centralized router

Phase 2 of the provider router migration — route the main agent's
client construction and fallback activation through
resolve_provider_client() instead of duplicated ad-hoc logic.

run_agent.py:
- __init__: When no explicit api_key/base_url, use
  resolve_provider_client(provider, raw_codex=True) for client
  construction. Explicit creds (from CLI/gateway runtime provider)
  still construct directly.
- _try_activate_fallback: Replace _resolve_fallback_credentials and
  its duplicated _FALLBACK_API_KEY_PROVIDERS / _FALLBACK_OAUTH_PROVIDERS
  dicts with a single resolve_provider_client() call. The router
  handles all provider types (API-key, OAuth, Codex) centrally.
- Remove _resolve_fallback_credentials method and both fallback dicts.

agent/auxiliary_client.py:
- Add raw_codex parameter to resolve_provider_client(). When True,
  returns the raw OpenAI client for Codex providers instead of wrapping
  in CodexAuxiliaryClient. The main agent needs this for direct
  responses.stream() access.

3251 passed, 2 pre-existing unrelated failures.

											
										
										
											2026-03-11 21:38:29 -07:00
+								            fb_base_url = str(fb_client.base_url)
-												feat: simple fallback model for provider resilience

When the primary model/provider fails after retries (rate limit, overload,
auth errors, connection failures), Hermes automatically switches to a
configured fallback model for the remainder of the session.

Config (in ~/.hermes/config.yaml):

  fallback_model:
    provider: openrouter
    model: anthropic/claude-sonnet-4

Supports all major providers: OpenRouter, OpenAI, Nous, DeepSeek, Together,
Groq, Fireworks, Mistral, Gemini — plus custom endpoints via base_url and
api_key_env overrides.

Design principles:
- Dead simple: one fallback model, not a chain
- One-shot: switches once, doesn't ping-pong back
- Zero new dependencies: uses existing OpenAI client
- Minimal code: ~100 lines in run_agent.py, ~5 lines in cli.py/gateway
- Three trigger points: max retries exhausted, non-retryable client errors,
  and invalid response exhaustion

Does NOT trigger on context overflow or payload-too-large errors (those
are handled by the existing compression system).

Addresses #737.

25 new tests, 2492 total passing.

											
										
										
											2026-03-08 20:22:33 -07:00
+								            old_model = self.model
 								            self.model = fb_model
 								            self.provider = fb_provider
 								            self.base_url = fb_base_url
-												feat: add openai-codex as fallback provider

Codex OAuth uses a different auth flow (OAuth tokens, not env vars)
and a different API mode (codex_responses, not chat_completions).
The fallback now handles this specially:

- Resolves credentials via resolve_codex_runtime_credentials()
- Sets api_mode to codex_responses
- Fails gracefully if no Codex OAuth session exists

Also added to the commented-out config.yaml example.
2 new tests (codex activation + graceful failure).

											
										
										
											2026-03-08 21:34:15 -07:00
+								            self.api_mode = fb_api_mode
-												feat: simple fallback model for provider resilience

When the primary model/provider fails after retries (rate limit, overload,
auth errors, connection failures), Hermes automatically switches to a
configured fallback model for the remainder of the session.

Config (in ~/.hermes/config.yaml):

  fallback_model:
    provider: openrouter
    model: anthropic/claude-sonnet-4

Supports all major providers: OpenRouter, OpenAI, Nous, DeepSeek, Together,
Groq, Fireworks, Mistral, Gemini — plus custom endpoints via base_url and
api_key_env overrides.

Design principles:
- Dead simple: one fallback model, not a chain
- One-shot: switches once, doesn't ping-pong back
- Zero new dependencies: uses existing OpenAI client
- Minimal code: ~100 lines in run_agent.py, ~5 lines in cli.py/gateway
- Three trigger points: max retries exhausted, non-retryable client errors,
  and invalid response exhaustion

Does NOT trigger on context overflow or payload-too-large errors (those
are handled by the existing compression system).

Addresses #737.

25 new tests, 2492 total passing.

											
										
										
											2026-03-08 20:22:33 -07:00
+								            self._fallback_activated = True
-												fix: anthropic adapter — max_tokens ignored, fallback crash, proxy base_url filtered

- Pass self.max_tokens to build_anthropic_kwargs instead of hardcoded None
- Add anthropic case to _try_activate_fallback (was only handling openai-codex)
- Remove 'anthropic in base_url' filter that blocked custom proxy URLs

											
										
										
											2026-03-13 04:22:16 +03:00
+								            if fb_api_mode == "anthropic_messages":
 								                # Build native Anthropic client instead of using OpenAI client
 								                from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token
 								                effective_key = fb_client.api_key or resolve_anthropic_token() or ""
 								                self._anthropic_api_key = effective_key
-												fix: address PR review round 5 — streaming guard, VC auth, history prefix, auto-TTS control

1. Gate _streaming_api_call to chat_completions mode only — Anthropic and
   Codex fall back to _interruptible_api_call. Preserve Anthropic base_url
   across all client rebuild paths (interrupt, fallback, 401 refresh).

2. Discord VC synthetic events now use chat_type="channel" instead of
   defaulting to "dm" — prevents session bleed into DM context.
   Authorization runs before echoing transcript. Sanitize @everyone/@here
   in voice transcripts.

3. CLI voice prefix ("[Voice input...]") is now API-call-local only —
   stripped from returned history so it never persists to session DB or
   resumed sessions.

4. /voice off now disables base adapter auto-TTS via _auto_tts_disabled_chats
   set — voice input no longer triggers TTS when voice mode is off.

											
										
										
											2026-03-14 10:31:49 +03:00
+								                self._anthropic_base_url = getattr(fb_client, "base_url", None)
 								                self._anthropic_client = build_anthropic_client(effective_key, self._anthropic_base_url)
-												fix: anthropic adapter — max_tokens ignored, fallback crash, proxy base_url filtered

- Pass self.max_tokens to build_anthropic_kwargs instead of hardcoded None
- Add anthropic case to _try_activate_fallback (was only handling openai-codex)
- Remove 'anthropic in base_url' filter that blocked custom proxy URLs

											
										
										
											2026-03-13 04:22:16 +03:00
+								                self.client = None
 								                self._client_kwargs = {}
 								            else:
 								                # Swap OpenAI client and config in-place
 								                self.client = fb_client
 								                self._client_kwargs = {
 								                    "api_key": fb_client.api_key,
 								                    "base_url": fb_base_url,
 								                }
-												feat: simple fallback model for provider resilience

When the primary model/provider fails after retries (rate limit, overload,
auth errors, connection failures), Hermes automatically switches to a
configured fallback model for the remainder of the session.

Config (in ~/.hermes/config.yaml):

  fallback_model:
    provider: openrouter
    model: anthropic/claude-sonnet-4

Supports all major providers: OpenRouter, OpenAI, Nous, DeepSeek, Together,
Groq, Fireworks, Mistral, Gemini — plus custom endpoints via base_url and
api_key_env overrides.

Design principles:
- Dead simple: one fallback model, not a chain
- One-shot: switches once, doesn't ping-pong back
- Zero new dependencies: uses existing OpenAI client
- Minimal code: ~100 lines in run_agent.py, ~5 lines in cli.py/gateway
- Three trigger points: max retries exhausted, non-retryable client errors,
  and invalid response exhaustion

Does NOT trigger on context overflow or payload-too-large errors (those
are handled by the existing compression system).

Addresses #737.

25 new tests, 2492 total passing.

											
										
										
											2026-03-08 20:22:33 -07:00
+								            # Re-evaluate prompt caching for the new provider/model
-												fix: anthropic adapter — max_tokens ignored, fallback crash, proxy base_url filtered

- Pass self.max_tokens to build_anthropic_kwargs instead of hardcoded None
- Add anthropic case to _try_activate_fallback (was only handling openai-codex)
- Remove 'anthropic in base_url' filter that blocked custom proxy URLs

											
										
										
											2026-03-13 04:22:16 +03:00
+								            is_native_anthropic = fb_api_mode == "anthropic_messages"
-												feat: simple fallback model for provider resilience

When the primary model/provider fails after retries (rate limit, overload,
auth errors, connection failures), Hermes automatically switches to a
configured fallback model for the remainder of the session.

Config (in ~/.hermes/config.yaml):

  fallback_model:
    provider: openrouter
    model: anthropic/claude-sonnet-4

Supports all major providers: OpenRouter, OpenAI, Nous, DeepSeek, Together,
Groq, Fireworks, Mistral, Gemini — plus custom endpoints via base_url and
api_key_env overrides.

Design principles:
- Dead simple: one fallback model, not a chain
- One-shot: switches once, doesn't ping-pong back
- Zero new dependencies: uses existing OpenAI client
- Minimal code: ~100 lines in run_agent.py, ~5 lines in cli.py/gateway
- Three trigger points: max retries exhausted, non-retryable client errors,
  and invalid response exhaustion

Does NOT trigger on context overflow or payload-too-large errors (those
are handled by the existing compression system).

Addresses #737.

25 new tests, 2492 total passing.

											
										
										
											2026-03-08 20:22:33 -07:00
+								            self._use_prompt_caching = (
-												fix: anthropic adapter — max_tokens ignored, fallback crash, proxy base_url filtered

- Pass self.max_tokens to build_anthropic_kwargs instead of hardcoded None
- Add anthropic case to _try_activate_fallback (was only handling openai-codex)
- Remove 'anthropic in base_url' filter that blocked custom proxy URLs

											
										
										
											2026-03-13 04:22:16 +03:00
+								                ("openrouter" in fb_base_url.lower() and "claude" in fb_model.lower())
 								                or is_native_anthropic
-												feat: simple fallback model for provider resilience

When the primary model/provider fails after retries (rate limit, overload,
auth errors, connection failures), Hermes automatically switches to a
configured fallback model for the remainder of the session.

Config (in ~/.hermes/config.yaml):

  fallback_model:
    provider: openrouter
    model: anthropic/claude-sonnet-4

Supports all major providers: OpenRouter, OpenAI, Nous, DeepSeek, Together,
Groq, Fireworks, Mistral, Gemini — plus custom endpoints via base_url and
api_key_env overrides.

Design principles:
- Dead simple: one fallback model, not a chain
- One-shot: switches once, doesn't ping-pong back
- Zero new dependencies: uses existing OpenAI client
- Minimal code: ~100 lines in run_agent.py, ~5 lines in cli.py/gateway
- Three trigger points: max retries exhausted, non-retryable client errors,
  and invalid response exhaustion

Does NOT trigger on context overflow or payload-too-large errors (those
are handled by the existing compression system).

Addresses #737.

25 new tests, 2492 total passing.

											
										
										
											2026-03-08 20:22:33 -07:00
+								            )
 								            print(
 								                f"{self.log_prefix}🔄 Primary model failed — switching to fallback: "
 								                f"{fb_model} via {fb_provider}"
 								            )
 								            logging.info(
 								                "Fallback activated: %s → %s (%s)",
 								                old_model, fb_model, fb_provider,
 								            )
 								            return True
 								        except Exception as e:
 								            logging.error("Failed to activate fallback model: %s", e)
 								            return False
 								    # ── End provider fallback ──────────────────────────────────────────────
-												fix: convert anthropic image content blocks

											
										
										
											2026-03-14 23:21:09 -07:00
+								    @staticmethod
 								    def _content_has_image_parts(content: Any) -> bool:
 								        if not isinstance(content, list):
 								            return False
 								        for part in content:
 								            if isinstance(part, dict) and part.get("type") in {"image_url", "input_image"}:
 								                return True
 								        return False
 								    @staticmethod
 								    def _materialize_data_url_for_vision(image_url: str) -> tuple[str, Optional[Path]]:
 								        header, _, data = str(image_url or "").partition(",")
 								        mime = "image/jpeg"
 								        if header.startswith("data:"):
 								            mime_part = header[len("data:"):].split(";", 1)[0].strip()
 								            if mime_part.startswith("image/"):
 								                mime = mime_part
 								        suffix = {
 								            "image/png": ".png",
 								            "image/gif": ".gif",
 								            "image/webp": ".webp",
 								            "image/jpeg": ".jpg",
 								            "image/jpg": ".jpg",
 								        }.get(mime, ".jpg")
 								        tmp = tempfile.NamedTemporaryFile(prefix="anthropic_image_", suffix=suffix, delete=False)
 								        with tmp:
 								            tmp.write(base64.b64decode(data))
 								        path = Path(tmp.name)
 								        return str(path), path
 								    def _describe_image_for_anthropic_fallback(self, image_url: str, role: str) -> str:
 								        cache_key = hashlib.sha256(str(image_url or "").encode("utf-8")).hexdigest()
 								        cached = self._anthropic_image_fallback_cache.get(cache_key)
 								        if cached:
 								            return cached
 								        role_label = {
 								            "assistant": "assistant",
 								            "tool": "tool result",
 								        }.get(role, "user")
 								        analysis_prompt = (
 								            "Describe everything visible in this image in thorough detail. "
 								            "Include any text, code, UI, data, objects, people, layout, colors, "
 								            "and any other notable visual information."
 								        )
 								        vision_source = str(image_url or "")
 								        cleanup_path: Optional[Path] = None
 								        if vision_source.startswith("data:"):
 								            vision_source, cleanup_path = self._materialize_data_url_for_vision(vision_source)
 								        description = ""
 								        try:
 								            from tools.vision_tools import vision_analyze_tool
 								            result_json = asyncio.run(
 								                vision_analyze_tool(image_url=vision_source, user_prompt=analysis_prompt)
 								            )
 								            result = json.loads(result_json) if isinstance(result_json, str) else {}
 								            description = (result.get("analysis") or "").strip()
 								        except Exception as e:
 								            description = f"Image analysis failed: {e}"
 								        finally:
 								            if cleanup_path and cleanup_path.exists():
 								                try:
 								                    cleanup_path.unlink()
 								                except OSError:
 								                    pass
 								        if not description:
 								            description = "Image analysis failed."
 								        note = f"[The {role_label} attached an image. Here's what it contains:\n{description}]"
 								        if vision_source and not str(image_url or "").startswith("data:"):
 								            note += (
 								                f"\n[If you need a closer look, use vision_analyze with image_url: {vision_source}]"
 								            )
 								        self._anthropic_image_fallback_cache[cache_key] = note
 								        return note
 								    def _preprocess_anthropic_content(self, content: Any, role: str) -> Any:
 								        if not self._content_has_image_parts(content):
 								            return content
 								        text_parts: List[str] = []
 								        image_notes: List[str] = []
 								        for part in content:
 								            if isinstance(part, str):
 								                if part.strip():
 								                    text_parts.append(part.strip())
 								                continue
 								            if not isinstance(part, dict):
 								                continue
 								            ptype = part.get("type")
 								            if ptype in {"text", "input_text"}:
 								                text = str(part.get("text", "") or "").strip()
 								                if text:
 								                    text_parts.append(text)
 								                continue
 								            if ptype in {"image_url", "input_image"}:
 								                image_data = part.get("image_url", {})
 								                image_url = image_data.get("url", "") if isinstance(image_data, dict) else str(image_data or "")
 								                if image_url:
 								                    image_notes.append(self._describe_image_for_anthropic_fallback(image_url, role))
 								                else:
 								                    image_notes.append("[An image was attached but no image source was available.]")
 								                continue
 								            text = str(part.get("text", "") or "").strip()
 								            if text:
 								                text_parts.append(text)
 								        prefix = "\n\n".join(note for note in image_notes if note).strip()
 								        suffix = "\n".join(text for text in text_parts if text).strip()
 								        if prefix and suffix:
 								            return f"{prefix}\n\n{suffix}"
 								        if prefix:
 								            return prefix
 								        if suffix:
 								            return suffix
 								        return "[A multimodal message was converted to text for Anthropic compatibility.]"
 								    def _prepare_anthropic_messages_for_api(self, api_messages: list) -> list:
 								        if not any(
 								            isinstance(msg, dict) and self._content_has_image_parts(msg.get("content"))
 								            for msg in api_messages
 								        ):
 								            return api_messages
 								        transformed = copy.deepcopy(api_messages)
 								        for msg in transformed:
 								            if not isinstance(msg, dict):
 								                continue
 								            msg["content"] = self._preprocess_anthropic_content(
 								                msg.get("content"),
 								                str(msg.get("role", "user") or "user"),
 								            )
 								        return transformed
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								    def _build_api_kwargs(self, api_messages: list) -> dict:
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								        """Build the keyword arguments dict for the active API mode."""
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								        if self.api_mode == "anthropic_messages":
 								            from agent.anthropic_adapter import build_anthropic_kwargs
-												fix: convert anthropic image content blocks

											
										
										
											2026-03-14 23:21:09 -07:00
+								            anthropic_messages = self._prepare_anthropic_messages_for_api(api_messages)
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								            return build_anthropic_kwargs(
 								                model=self.model,
-												fix: convert anthropic image content blocks

											
										
										
											2026-03-14 23:21:09 -07:00
+								                messages=anthropic_messages,
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								                tools=self.tools,
-												fix: anthropic adapter — max_tokens ignored, fallback crash, proxy base_url filtered

- Pass self.max_tokens to build_anthropic_kwargs instead of hardcoded None
- Add anthropic case to _try_activate_fallback (was only handling openai-codex)
- Remove 'anthropic in base_url' filter that blocked custom proxy URLs

											
										
										
											2026-03-13 04:22:16 +03:00
+								                max_tokens=self.max_tokens,
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								                reasoning_config=self.reasoning_config,
 								            )
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								        if self.api_mode == "codex_responses":
 								            instructions = ""
 								            payload_messages = api_messages
 								            if api_messages and api_messages[0].get("role") == "system":
 								                instructions = str(api_messages[0].get("content") or "").strip()
 								                payload_messages = api_messages[1:]
 								            if not instructions:
 								                instructions = DEFAULT_AGENT_IDENTITY
-												feat: default reasoning effort from xhigh to medium

Reduces token usage and latency for most tasks by defaulting to
medium reasoning effort instead of xhigh. Users can still override
via config or CLI flag. Updates code, tests, example config, and docs.

											
										
										
											2026-03-07 10:14:19 -08:00
+								            # Resolve reasoning effort: config > default (medium)
 								            reasoning_effort = "medium"
-												feat(agent): enhance reasoning handling and configuration

Added support for processing encrypted reasoning content within the AIAgent class. Introduced logic to determine reasoning effort and enable/disable reasoning based on configuration settings. Updated the kwargs to reflect these changes, ensuring proper handling of reasoning parameters during agent execution.

											
										
										
											2026-03-01 16:15:20 -08:00
+								            reasoning_enabled = True
 								            if self.reasoning_config and isinstance(self.reasoning_config, dict):
 								                if self.reasoning_config.get("enabled") is False:
 								                    reasoning_enabled = False
 								                elif self.reasoning_config.get("effort"):
 								                    reasoning_effort = self.reasoning_config["effort"]
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								            kwargs = {
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                "model": self.model,
 								                "instructions": instructions,
 								                "input": self._chat_messages_to_responses_input(payload_messages),
 								                "tools": self._responses_tools(),
-												fix: add missing Responses API parameters for Codex provider

Adds tool_choice, parallel_tool_calls, and prompt_cache_key to the
Codex Responses API request kwargs — matching what the official Codex
CLI sends.

- tool_choice: 'auto' — enables the model to proactively call tools.
  Without this, the model may default to not using tools, which explains
  reports of the agent claiming it lacks shell access (#747).
- parallel_tool_calls: True — allows the model to issue multiple tool
  calls in a single turn for efficiency.
- prompt_cache_key: session_id — enables server-side prompt caching
  across turns in the same session, reducing latency and cost.

Refs #747

											
										
										
											2026-03-11 04:28:31 -07:00
+								                "tool_choice": "auto",
 								                "parallel_tool_calls": True,
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                "store": False,
-												fix: add missing Responses API parameters for Codex provider

Adds tool_choice, parallel_tool_calls, and prompt_cache_key to the
Codex Responses API request kwargs — matching what the official Codex
CLI sends.

- tool_choice: 'auto' — enables the model to proactively call tools.
  Without this, the model may default to not using tools, which explains
  reports of the agent claiming it lacks shell access (#747).
- parallel_tool_calls: True — allows the model to issue multiple tool
  calls in a single turn for efficiency.
- prompt_cache_key: session_id — enables server-side prompt caching
  across turns in the same session, reducing latency and cost.

Refs #747

											
										
										
											2026-03-11 04:28:31 -07:00
+								                "prompt_cache_key": self.session_id,
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								            }
-												feat(agent): enhance reasoning handling and configuration

Added support for processing encrypted reasoning content within the AIAgent class. Introduced logic to determine reasoning effort and enable/disable reasoning based on configuration settings. Updated the kwargs to reflect these changes, ensuring proper handling of reasoning parameters during agent execution.

											
										
										
											2026-03-01 16:15:20 -08:00
+								            if reasoning_enabled:
 								                kwargs["reasoning"] = {"effort": reasoning_effort, "summary": "auto"}
 								                kwargs["include"] = ["reasoning.encrypted_content"]
 								            else:
 								                kwargs["include"] = []
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
 								            if self.max_tokens is not None:
 								                kwargs["max_output_tokens"] = self.max_tokens
 								            return kwargs
-												fix: preserve chat kwargs identity when no sanitization is needed

											
										
										
											2026-03-11 08:35:27 +00:00
+								        sanitized_messages = api_messages
 								        needs_sanitization = False
 								        for msg in api_messages:
-												fix: sanitize chat payloads and provider precedence

											
										
										
											2026-03-13 23:59:12 -07:00
+								            if not isinstance(msg, dict):
 								                continue
-												fix: preserve chat kwargs identity when no sanitization is needed

											
										
										
											2026-03-11 08:35:27 +00:00
+								            if "codex_reasoning_items" in msg:
 								                needs_sanitization = True
 								                break
-												fix: sanitize chat payloads and provider precedence

											
										
										
											2026-03-13 23:59:12 -07:00
 								            tool_calls = msg.get("tool_calls")
 								            if isinstance(tool_calls, list):
 								                for tool_call in tool_calls:
-												fix: preserve chat kwargs identity when no sanitization is needed

											
										
										
											2026-03-11 08:35:27 +00:00
+								                    if not isinstance(tool_call, dict):
 								                        continue
 								                    if "call_id" in tool_call or "response_item_id" in tool_call:
 								                        needs_sanitization = True
 								                        break
 								                if needs_sanitization:
 								                    break
 								        if needs_sanitization:
 								            sanitized_messages = copy.deepcopy(api_messages)
 								            for msg in sanitized_messages:
 								                if not isinstance(msg, dict):
 								                    continue
 								                # Codex-only replay state must not leak into strict chat-completions APIs.
 								                msg.pop("codex_reasoning_items", None)
 								                tool_calls = msg.get("tool_calls")
 								                if isinstance(tool_calls, list):
 								                    for tool_call in tool_calls:
 								                        if isinstance(tool_call, dict):
 								                            tool_call.pop("call_id", None)
 								                            tool_call.pop("response_item_id", None)
-												fix: sanitize chat payloads and provider precedence

											
										
										
											2026-03-13 23:59:12 -07:00
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								        provider_preferences = {}
 								        if self.providers_allowed:
 								            provider_preferences["only"] = self.providers_allowed
 								        if self.providers_ignored:
 								            provider_preferences["ignore"] = self.providers_ignored
 								        if self.providers_order:
 								            provider_preferences["order"] = self.providers_order
 								        if self.provider_sort:
 								            provider_preferences["sort"] = self.provider_sort
-												feat(provider-routing): add OpenRouter provider routing configuration

Introduced a new `provider_routing` section in the CLI configuration to control how requests are routed across providers when using OpenRouter. This includes options for sorting providers by throughput, latency, or price, as well as allowing or ignoring specific providers, setting the order of provider attempts, and managing data collection policies. Updated relevant classes and documentation to support these features, enhancing flexibility in provider selection.

											
										
										
											2026-03-01 18:24:27 -08:00
+								        if self.provider_require_parameters:
 								            provider_preferences["require_parameters"] = True
 								        if self.provider_data_collection:
 								            provider_preferences["data_collection"] = self.provider_data_collection
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
 								        api_kwargs = {
 								            "model": self.model,
-												fix: sanitize chat payloads and provider precedence

											
										
										
											2026-03-13 23:59:12 -07:00
+								            "messages": sanitized_messages,
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								            "tools": self.tools if self.tools else None,
-												fix: handle YAML null values in session reset policy + configurable API timeout (#1194)

* fix: Home Assistant event filtering now closed by default

Previously, when no watch_domains or watch_entities were configured,
ALL state_changed events passed through to the agent, causing users
to be flooded with notifications for every HA entity change.

Now events are dropped by default unless the user explicitly configures:
- watch_domains: list of domains to monitor (e.g. climate, light)
- watch_entities: list of specific entity IDs to monitor
- watch_all: true (new option — opt-in to receive all events)

A warning is logged at connect time if no filters are configured,
guiding users to set up their HA platform config.

All 49 gateway HA tests + 52 HA tool tests pass.

* docs: update Home Assistant integration documentation

- homeassistant.md: Fix event filtering docs to reflect closed-by-default
  behavior. Add watch_all option. Replace Python dict config example with
  YAML. Fix defaults table (was incorrectly showing 'all'). Add required
  configuration warning admonition.
- environment-variables.md: Add HASS_TOKEN and HASS_URL to Messaging section.
- messaging/index.md: Add Home Assistant to description, architecture
  diagram, platform toolsets table, and Next Steps links.

* fix(terminal): strip provider env vars from background and PTY subprocesses

Extends the env var blocklist from #1157 to also cover the two remaining
leaky paths in process_registry.py:

- spawn_local() PTY path (line 156)
- spawn_local() background Popen path (line 197)

Both were still using raw os.environ, leaking provider vars to background
processes and interactive PTY sessions. Now uses the same dynamic
_HERMES_PROVIDER_ENV_BLOCKLIST from local.py.

Explicit env_vars passed to spawn_local() still override the blocklist,
matching the existing behavior for callers that intentionally need these.

Gap identified by PR #1004 (@PeterFile).

* feat(delegate): add observability metadata to subagent results

Enrich delegate_task results with metadata from the child AIAgent:

- model: which model the child used
- exit_reason: completed | interrupted | max_iterations
- tokens.input / tokens.output: token counts
- tool_trace: per-tool-call trace with byte sizes and ok/error status

Tool trace uses tool_call_id matching to correctly pair parallel tool
calls with their results, with a fallback for messages without IDs.

Cherry-picked from PR #872 by @omerkaz, with fixes:
- Fixed parallel tool call trace pairing (was always updating last entry)
- Removed redundant 'iterations' field (identical to existing 'api_calls')
- Added test for parallel tool call trace correctness

Co-authored-by: omerkaz <omerkaz@users.noreply.github.com>

* feat(stt): add free local whisper transcription via faster-whisper

Replace OpenAI-only STT with a dual-provider system mirroring the TTS
architecture (Edge TTS free / ElevenLabs paid):

  STT: faster-whisper local (free, default) / OpenAI Whisper API (paid)

Changes:
- tools/transcription_tools.py: Full rewrite with provider dispatch,
  config loading, local faster-whisper backend, and OpenAI API backend.
  Auto-downloads model (~150MB for 'base') on first voice message.
  Singleton model instance reused across calls.
- pyproject.toml: Add faster-whisper>=1.0.0 as core dependency
- hermes_cli/config.py: Expand stt config to match TTS pattern with
  provider selection and per-provider model settings
- agent/context_compressor.py: Fix .strip() crash when LLM returns
  non-string content (dict from llama.cpp, None). Fixes #1100 partially.
- tests/: 23 new tests for STT providers + 2 for compressor fix
- docs/: Updated Voice & TTS page with STT provider table, model sizes,
  config examples, and fallback behavior

Fallback behavior:
- Local not installed → OpenAI API (if key set)
- OpenAI key not set → local whisper (if installed)
- Neither → graceful error message to user

Co-authored-by: Jah-yee <Jah-yee@users.noreply.github.com>

* fix: handle YAML null values in session reset policy + configurable API timeout

Two fixes from PR #888 by @Jah-yee:

1. SessionResetPolicy.from_dict() — data.get('at_hour', 4) returns None
   when the YAML key exists with a null value. Now explicitly checks for
   None and falls back to defaults. Zero remains a valid value.

2. API timeout — hardcoded 900s is now configurable via HERMES_API_TIMEOUT
   env var. Useful for slow local models (llama.cpp) that need longer.

Co-authored-by: Jah-yee <Jah-yee@users.noreply.github.com>

---------

Co-authored-by: omerkaz <omerkaz@users.noreply.github.com>
Co-authored-by: Jah-yee <Jah-yee@users.noreply.github.com>
											
										
										
											2026-03-13 11:16:42 -07:00
+								            "timeout": float(os.getenv("HERMES_API_TIMEOUT", 900.0)),
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								        }
 								        if self.max_tokens is not None:
-												feat(api): implement dynamic max tokens handling for various providers

- Added _max_tokens_param method in AIAgent to return appropriate max tokens parameter based on the provider (OpenAI vs. others).
- Updated API calls in AIAgent to utilize the new max tokens handling.
- Introduced auxiliary_max_tokens_param function in auxiliary_client for consistent max tokens management across auxiliary clients.
- Refactored multiple tools to use auxiliary_max_tokens_param for improved compatibility with different models and providers.

											
										
										
											2026-02-26 20:23:56 -08:00
+								            api_kwargs.update(self._max_tokens_param(self.max_tokens))
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
 								        extra_body = {}
-												refactor: streamline reasoning configuration checks in AIAgent

- Simplified the logic for determining support for reasoning based on the base URL by introducing clearer variable names.
- Added product attribution for the Nous Portal to the extra body of requests when applicable, enhancing tagging for better tracking.

											
										
										
											2026-02-25 16:49:41 -08:00
+								        _is_openrouter = "openrouter" in self.base_url.lower()
-												fix: don't send OpenRouter-specific provider preferences to Nous Portal

Two bugs in _build_api_kwargs that broke Nous Portal:

1. Provider preferences (only, ignore, order, sort) are OpenRouter-
   specific routing features. They were being sent in extra_body to ALL
   providers, including Nous Portal. When the config had
   providers_only=['google-vertex'], Nous Portal returned 404 'Inference
   host not found' because it doesn't have a google-vertex backend.

   Fix: Only include provider preferences when _is_openrouter is True.

2. Reasoning config with enabled=false was being sent to Nous Portal,
   which requires reasoning and returns 400 'Reasoning is mandatory for
   this endpoint and cannot be disabled.'

   Fix: Omit the reasoning parameter for Nous when enabled=false.

Root cause found via HERMES_DUMP_REQUESTS=1 which showed the exact
request payload being sent to Nous Portal's inference API.

											
										
										
											2026-03-11 22:41:33 -07:00
 								        # Provider preferences (only, ignore, order, sort) are OpenRouter-
-												revert: keep provider preferences for all providers (Nous will proxy)

Nous Portal backend will become a transparent proxy for OpenRouter-
specific parameters (provider preferences, etc.), so keep sending them
to all providers. The reasoning disabled fix is kept (that's a real
constraint of the Nous endpoint).

											
										
										
											2026-03-11 22:49:24 -07:00
+								        # specific.  Only send to OpenRouter-compatible endpoints.
 								        # TODO: Nous Portal will add transparent proxy support — re-enable
 								        # for _is_nous when their backend is updated.
-												fix: don't send OpenRouter-specific provider preferences to Nous Portal

Two bugs in _build_api_kwargs that broke Nous Portal:

1. Provider preferences (only, ignore, order, sort) are OpenRouter-
   specific routing features. They were being sent in extra_body to ALL
   providers, including Nous Portal. When the config had
   providers_only=['google-vertex'], Nous Portal returned 404 'Inference
   host not found' because it doesn't have a google-vertex backend.

   Fix: Only include provider preferences when _is_openrouter is True.

2. Reasoning config with enabled=false was being sent to Nous Portal,
   which requires reasoning and returns 400 'Reasoning is mandatory for
   this endpoint and cannot be disabled.'

   Fix: Omit the reasoning parameter for Nous when enabled=false.

Root cause found via HERMES_DUMP_REQUESTS=1 which showed the exact
request payload being sent to Nous Portal's inference API.

											
										
										
											2026-03-11 22:41:33 -07:00
+								        if provider_preferences and _is_openrouter:
 								            extra_body["provider"] = provider_preferences
-												refactor: streamline reasoning configuration checks in AIAgent

- Simplified the logic for determining support for reasoning based on the base URL by introducing clearer variable names.
- Added product attribution for the Nous Portal to the extra body of requests when applicable, enhancing tagging for better tracking.

											
										
										
											2026-02-25 16:49:41 -08:00
+								        _is_nous = "nousresearch" in self.base_url.lower()
-												fix(agent): strip finish_reason from assistant messages to fix Mistral 422 errors (#253)

* fix(agent): skip reasoning param for Mistral API to prevent 422 errors

* fix(agent): strip finish_reason from assistant messages to fix Mistral 422 errors
											
										
										
											2026-03-02 11:35:03 +03:00
+								        _is_mistral = "api.mistral.ai" in self.base_url.lower()
 								        if (_is_openrouter or _is_nous) and not _is_mistral:
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								            if self.reasoning_config is not None:
-												fix: don't send OpenRouter-specific provider preferences to Nous Portal

Two bugs in _build_api_kwargs that broke Nous Portal:

1. Provider preferences (only, ignore, order, sort) are OpenRouter-
   specific routing features. They were being sent in extra_body to ALL
   providers, including Nous Portal. When the config had
   providers_only=['google-vertex'], Nous Portal returned 404 'Inference
   host not found' because it doesn't have a google-vertex backend.

   Fix: Only include provider preferences when _is_openrouter is True.

2. Reasoning config with enabled=false was being sent to Nous Portal,
   which requires reasoning and returns 400 'Reasoning is mandatory for
   this endpoint and cannot be disabled.'

   Fix: Omit the reasoning parameter for Nous when enabled=false.

Root cause found via HERMES_DUMP_REQUESTS=1 which showed the exact
request payload being sent to Nous Portal's inference API.

											
										
										
											2026-03-11 22:41:33 -07:00
+								                rc = dict(self.reasoning_config)
 								                # Nous Portal requires reasoning enabled — don't send
 								                # enabled=false to it (would cause 400).
 								                if _is_nous and rc.get("enabled") is False:
 								                    pass  # omit reasoning entirely for Nous when disabled
 								                else:
 								                    extra_body["reasoning"] = rc
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								            else:
 								                extra_body["reasoning"] = {
 								                    "enabled": True,
-												feat: default reasoning effort from xhigh to medium

Reduces token usage and latency for most tasks by defaulting to
medium reasoning effort instead of xhigh. Users can still override
via config or CLI flag. Updates code, tests, example config, and docs.

											
										
										
											2026-03-07 10:14:19 -08:00
+								                    "effort": "medium"
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                }
-												refactor: streamline reasoning configuration checks in AIAgent

- Simplified the logic for determining support for reasoning based on the base URL by introducing clearer variable names.
- Added product attribution for the Nous Portal to the extra body of requests when applicable, enhancing tagging for better tracking.

											
										
										
											2026-02-25 16:49:41 -08:00
+								        # Nous Portal product attribution
 								        if _is_nous:
 								            extra_body["tags"] = ["product=hermes-agent"]
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								        if extra_body:
 								            api_kwargs["extra_body"] = extra_body
 								        return api_kwargs
 								    def _build_assistant_message(self, assistant_message, finish_reason: str) -> dict:
 								        """Build a normalized assistant message dict from an API response message.
 								        Handles reasoning extraction, reasoning_details, and optional tool_calls
 								        so both the tool-call path and the final-response path share one builder.
 								        """
 								        reasoning_text = self._extract_reasoning(assistant_message)
-												feat: include session ID in system prompt via --pass-session-id flag

Adds --pass-session-id CLI flag. When set, the agent's system prompt
includes the session ID:

  Conversation started: Sunday, March 08, 2026 06:32 PM
  Session ID: 20260308_183200_abc123

Usage:
  hermes --pass-session-id
  hermes chat --pass-session-id

Implementation threads the flag as a proper parameter through the full
chain (main.py → cli.py → run_agent.py) rather than using an env var,
avoiding collisions in multi-agent/multitenant setups.

Based on PR #726 by dmahan93, reworked to use instance parameter
instead of HERMES_PASS_SESSION_ID environment variable.

Co-authored-by: dmahan93 <dmahan93@users.noreply.github.com>

											
										
										
											2026-03-12 05:51:31 -07:00
+								        # Fallback: extract inline <think> blocks from content when no structured
 								        # reasoning fields are present (some models/providers embed thinking
 								        # directly in the content rather than returning separate API fields).
 								        if not reasoning_text:
 								            content = assistant_message.content or ""
 								            think_blocks = re.findall(r'<think>(.*?)</think>', content, flags=re.DOTALL)
 								            if think_blocks:
 								                combined = "\n\n".join(b.strip() for b in think_blocks if b.strip())
 								                reasoning_text = combined or None
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								        if reasoning_text and self.verbose_logging:
-												fix: verbose mode shows full untruncated output

* fix(cli): silence tirith prefetch install warnings at startup

* fix: verbose mode now shows full untruncated tool args, results, content, and think blocks

When tool progress is set to 'verbose' (via /verbose or config), the display
was still truncating tool arguments to 100 chars, tool results to 100-200 chars,
assistant content to 100 chars, and think blocks to 5 lines. This defeated the
purpose of verbose mode.

Changes:
- Tool args: show full JSON args (not truncated to log_prefix_chars)
- Tool results: show full result content in both display and debug logs
- Assistant content: show full content during tool-call loops
- Think blocks: show full reasoning text (not truncated to 5 lines/100 chars)
- Auto-enable reasoning display when verbose mode is active
- Fix initial agent creation to respect verbose config (was always quiet_mode=True)
- Updated verbose label to mention think blocks
											
										
										
											2026-03-15 20:03:37 -07:00
+								            logging.debug(f"Captured reasoning ({len(reasoning_text)} chars): {reasoning_text}")
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
-												feat(cli): add /reasoning command for effort level and display toggle

Combined implementation of reasoning management:
- /reasoning              Show current effort level and display state
- /reasoning <level>      Set reasoning effort (none, low, medium, high, xhigh)
- /reasoning show|on      Show model thinking/reasoning in output
- /reasoning hide|off     Hide model thinking/reasoning from output

Effort level changes persist to config and force agent re-init.
Display toggle updates the agent callback dynamically without re-init.

When display is enabled:
- Intermediate reasoning shown as dim [thinking] lines during tool loops
- Final reasoning shown in a bordered box above the response
- Long reasoning collapsed (5 lines intermediate, 10 lines final)

Also adds:
- reasoning_callback parameter to AIAgent
- last_reasoning in run_conversation result dict
- show_reasoning config option (display section, default: false)
- Display section in /config output
- 34 tests covering both features

Combines functionality from PR #789 and PR #790.

Co-authored-by: Aum Desai <Aum08Desai@users.noreply.github.com>
Co-authored-by: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com>

											
										
										
											2026-03-11 05:53:21 -07:00
+								        if reasoning_text and self.reasoning_callback:
 								            try:
 								                self.reasoning_callback(reasoning_text)
 								            except Exception:
 								                pass
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								        msg = {
 								            "role": "assistant",
 								            "content": assistant_message.content or "",
 								            "reasoning": reasoning_text,
 								            "finish_reason": finish_reason,
 								        }
 								        if hasattr(assistant_message, 'reasoning_details') and assistant_message.reasoning_details:
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								            # Pass reasoning_details back unmodified so providers (OpenRouter,
 								            # Anthropic, OpenAI) can maintain reasoning continuity across turns.
 								            # Each provider may include opaque fields (signature, encrypted_content)
 								            # that must be preserved exactly.
 								            raw_details = assistant_message.reasoning_details
 								            preserved = []
 								            for d in raw_details:
 								                if isinstance(d, dict):
 								                    preserved.append(d)
 								                elif hasattr(d, "__dict__"):
 								                    preserved.append(d.__dict__)
 								                elif hasattr(d, "model_dump"):
 								                    preserved.append(d.model_dump())
 								            if preserved:
 								                msg["reasoning_details"] = preserved
 								        # Codex Responses API: preserve encrypted reasoning items for
 								        # multi-turn continuity. These get replayed as input on the next turn.
 								        codex_items = getattr(assistant_message, "codex_reasoning_items", None)
 								        if codex_items:
 								            msg["codex_reasoning_items"] = codex_items
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
 								        if assistant_message.tool_calls:
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								            tool_calls = []
 								            for tool_call in assistant_message.tool_calls:
 								                raw_id = getattr(tool_call, "id", None)
 								                call_id = getattr(tool_call, "call_id", None)
 								                if not isinstance(call_id, str) or not call_id.strip():
 								                    embedded_call_id, _ = self._split_responses_tool_id(raw_id)
 								                    call_id = embedded_call_id
 								                if not isinstance(call_id, str) or not call_id.strip():
 								                    if isinstance(raw_id, str) and raw_id.strip():
 								                        call_id = raw_id.strip()
 								                    else:
 								                        call_id = f"call_{uuid.uuid4().hex[:12]}"
 								                call_id = call_id.strip()
 								                response_item_id = getattr(tool_call, "response_item_id", None)
 								                if not isinstance(response_item_id, str) or not response_item_id.strip():
 								                    _, embedded_response_item_id = self._split_responses_tool_id(raw_id)
 								                    response_item_id = embedded_response_item_id
 								                response_item_id = self._derive_responses_function_call_id(
 								                    call_id,
 								                    response_item_id if isinstance(response_item_id, str) else None,
 								                )
-												fix: preserve Gemini thought_signature in tool call messages

Gemini 3 thinking models attach extra_content with thought_signature
to function call responses. This must be echoed back on subsequent
API calls or the server rejects with a 400 error. The assistant
message builder was dropping this field, causing all Gemini 3 Flash/Pro
tool-calling flows to fail after the first function call.

											
										
										
											2026-02-28 18:01:13 +03:00
+								                tc_dict = {
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								                    "id": call_id,
 								                    "call_id": call_id,
 								                    "response_item_id": response_item_id,
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                    "type": tool_call.type,
 								                    "function": {
 								                        "name": tool_call.function.name,
 								                        "arguments": tool_call.function.arguments
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								                    },
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                }
-												fix: preserve Gemini thought_signature in tool call messages

Gemini 3 thinking models attach extra_content with thought_signature
to function call responses. This must be echoed back on subsequent
API calls or the server rejects with a 400 error. The assistant
message builder was dropping this field, causing all Gemini 3 Flash/Pro
tool-calling flows to fail after the first function call.

											
										
										
											2026-02-28 18:01:13 +03:00
+								                # Preserve extra_content (e.g. Gemini thought_signature) so it
 								                # is sent back on subsequent API calls.  Without this, Gemini 3
 								                # thinking models reject the request with a 400 error.
 								                extra = getattr(tool_call, "extra_content", None)
 								                if extra is not None:
 								                    if hasattr(extra, "model_dump"):
 								                        extra = extra.model_dump()
 								                    tc_dict["extra_content"] = extra
-												Merge remote-tracking branch 'origin/main' into feature/homeassistant-integration

# Conflicts:
#	run_agent.py

											
										
										
											2026-03-01 11:59:12 +03:00
+								                tool_calls.append(tc_dict)
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								            msg["tool_calls"] = tool_calls
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
 								        return msg
-												fix: strip call_id/response_item_id from tool_calls for Mistral compatibility

Mistral's API strictly validates the Chat Completions schema and rejects
unknown fields (call_id, response_item_id) with 422. These fields are
added by _build_assistant_message() for Codex Responses API support.

This fix:
- Only strips when targeting Mistral (api.mistral.ai in base_url)
- Creates new tool_call dicts instead of mutating originals (shallow
  copy safety — msg.copy() shares the tool_calls list)
- Preserves call_id/response_item_id in the internal message history
  so _chat_messages_to_responses_input() can still read them if the
  session falls back to a Codex provider mid-conversation

Applied in all 3 API message building locations:
- Main conversation loop (run_conversation)
- _handle_max_iterations()
- flush_memories()

Inspired by PR #864 (unmodeled-tyler) which identified the issue but
applied the fix unconditionally and mutated originals via shallow copy.

Co-authored-by: unmodeled-tyler <unmodeled.tyler@proton.me>

											
										
										
											2026-03-12 08:18:27 -07:00
+								    @staticmethod
 								    def _sanitize_tool_calls_for_strict_api(api_msg: dict) -> dict:
 								        """Strip Codex Responses API fields from tool_calls for strict providers.
 								        Providers like Mistral strictly validate the Chat Completions schema
 								        and reject unknown fields (call_id, response_item_id) with 422.
 								        These fields are preserved in the internal message history — this
 								        method only modifies the outgoing API copy.
 								        Creates new tool_call dicts rather than mutating in-place, so the
 								        original messages list retains call_id/response_item_id for Codex
 								        Responses API compatibility (e.g. if the session falls back to a
 								        Codex provider later).
 								        """
 								        tool_calls = api_msg.get("tool_calls")
 								        if not isinstance(tool_calls, list):
 								            return api_msg
 								        _STRIP_KEYS = {"call_id", "response_item_id"}
 								        api_msg["tool_calls"] = [
 								            {k: v for k, v in tc.items() if k not in _STRIP_KEYS}
 								            if isinstance(tc, dict) else tc
 								            for tc in tool_calls
 								        ]
 								        return api_msg
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								    def flush_memories(self, messages: list = None, min_turns: int = None):
 								        """Give the model one turn to persist memories before context is lost.
 								        Called before compression, session reset, or CLI exit. Injects a flush
 								        message, makes one API call, executes any memory tool calls, then
 								        strips all flush artifacts from the message list.
 								        Args:
 								            messages: The current conversation messages. If None, uses
 								                      self._session_messages (last run_conversation state).
 								            min_turns: Minimum user turns required to trigger the flush.
 								                       None = use config value (flush_min_turns).
 = always flush (used for compression).
 								        """
 								        if self._memory_flush_min_turns == 0 and min_turns is None:
 								            return
 								        if "memory" not in self.valid_tool_names or not self._memory_store:
 								            return
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								        # honcho-only agent mode: skip local MEMORY.md flush
 								        _hcfg = getattr(self, '_honcho_config', None)
 								        if _hcfg and _hcfg.peer_memory_mode(_hcfg.ai_peer) == "honcho":
 								            return
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								        effective_min = min_turns if min_turns is not None else self._memory_flush_min_turns
 								        if self._user_turn_count < effective_min:
 								            return
 								        if messages is None:
 								            messages = getattr(self, '_session_messages', None)
 								        if not messages or len(messages) < 3:
 								            return
 								        flush_content = (
 								            "[System: The session is being compressed. "
 								            "Please save anything worth remembering to your memories.]"
 								        )
-												Cherry-pick 6 bug fixes from PR #76 and update documentation

Code fixes (run_agent.py):
- Fix off-by-one in _flush_messages_to_session_db skipping one message per flush
- Add clear_interrupt() to 3 early-return paths preventing stale interrupt state
- Wrap handle_function_call in try/except so tool crashes don't kill the conversation
- Replace fragile `is` identity check with _flush_sentinel marker for memory flush cleanup
- Fix retry loop off-by-one (6 attempts not 7)
- Remove redundant inline `import re`

											
										
										
											2026-02-27 03:21:49 -08:00
+								        _sentinel = f"__flush_{id(self)}_{time.monotonic()}"
 								        flush_msg = {"role": "user", "content": flush_content, "_flush_sentinel": _sentinel}
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								        messages.append(flush_msg)
 								        try:
 								            # Build API messages for the flush call
-												fix: strip call_id/response_item_id from tool_calls for Mistral compatibility

Mistral's API strictly validates the Chat Completions schema and rejects
unknown fields (call_id, response_item_id) with 422. These fields are
added by _build_assistant_message() for Codex Responses API support.

This fix:
- Only strips when targeting Mistral (api.mistral.ai in base_url)
- Creates new tool_call dicts instead of mutating originals (shallow
  copy safety — msg.copy() shares the tool_calls list)
- Preserves call_id/response_item_id in the internal message history
  so _chat_messages_to_responses_input() can still read them if the
  session falls back to a Codex provider mid-conversation

Applied in all 3 API message building locations:
- Main conversation loop (run_conversation)
- _handle_max_iterations()
- flush_memories()

Inspired by PR #864 (unmodeled-tyler) which identified the issue but
applied the fix unconditionally and mutated originals via shallow copy.

Co-authored-by: unmodeled-tyler <unmodeled.tyler@proton.me>

											
										
										
											2026-03-12 08:18:27 -07:00
+								            _is_strict_api = "api.mistral.ai" in self.base_url.lower()
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								            api_messages = []
 								            for msg in messages:
 								                api_msg = msg.copy()
 								                if msg.get("role") == "assistant":
 								                    reasoning = msg.get("reasoning")
 								                    if reasoning:
 								                        api_msg["reasoning_content"] = reasoning
 								                api_msg.pop("reasoning", None)
-												fix(agent): strip finish_reason from assistant messages to fix Mistral 422 errors (#253)

* fix(agent): skip reasoning param for Mistral API to prevent 422 errors

* fix(agent): strip finish_reason from assistant messages to fix Mistral 422 errors
											
										
										
											2026-03-02 11:35:03 +03:00
+								                api_msg.pop("finish_reason", None)
-												fix(agent): strip _flush_sentinel from API messages

											
										
										
											2026-03-01 02:51:31 +03:00
+								                api_msg.pop("_flush_sentinel", None)
-												fix: strip call_id/response_item_id from tool_calls for Mistral compatibility

Mistral's API strictly validates the Chat Completions schema and rejects
unknown fields (call_id, response_item_id) with 422. These fields are
added by _build_assistant_message() for Codex Responses API support.

This fix:
- Only strips when targeting Mistral (api.mistral.ai in base_url)
- Creates new tool_call dicts instead of mutating originals (shallow
  copy safety — msg.copy() shares the tool_calls list)
- Preserves call_id/response_item_id in the internal message history
  so _chat_messages_to_responses_input() can still read them if the
  session falls back to a Codex provider mid-conversation

Applied in all 3 API message building locations:
- Main conversation loop (run_conversation)
- _handle_max_iterations()
- flush_memories()

Inspired by PR #864 (unmodeled-tyler) which identified the issue but
applied the fix unconditionally and mutated originals via shallow copy.

Co-authored-by: unmodeled-tyler <unmodeled.tyler@proton.me>

											
										
										
											2026-03-12 08:18:27 -07:00
+								                if _is_strict_api:
 								                    self._sanitize_tool_calls_for_strict_api(api_msg)
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								                api_messages.append(api_msg)
 								            if self._cached_system_prompt:
 								                api_messages = [{"role": "system", "content": self._cached_system_prompt}] + api_messages
 								            # Make one API call with only the memory tool available
 								            memory_tool_def = None
 								            for t in (self.tools or []):
 								                if t.get("function", {}).get("name") == "memory":
 								                    memory_tool_def = t
 								                    break
 								            if not memory_tool_def:
 								                messages.pop()  # remove flush msg
 								                return
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								            # Use auxiliary client for the flush call when available --
 								            # it's cheaper and avoids Codex Responses API incompatibility.
-												feat: call_llm/async_call_llm + config slots + migrate all consumers

Add centralized call_llm() and async_call_llm() functions that own the
full LLM request lifecycle:
  1. Resolve provider + model from task config or explicit args
  2. Get or create a cached client for that provider
  3. Format request args (max_tokens handling, provider extra_body)
  4. Make the API call with max_tokens/max_completion_tokens retry
  5. Return the response

Config: expanded auxiliary section with provider:model slots for all
tasks (compression, vision, web_extract, session_search, skills_hub,
mcp, flush_memories). Config version bumped to 7.

Migrated all auxiliary consumers:
- context_compressor.py: uses call_llm(task='compression')
- vision_tools.py: uses async_call_llm(task='vision')
- web_tools.py: uses async_call_llm(task='web_extract')
- session_search_tool.py: uses async_call_llm(task='session_search')
- browser_tool.py: uses call_llm(task='vision'/'web_extract')
- mcp_tool.py: uses call_llm(task='mcp')
- skills_guard.py: uses call_llm(provider='openrouter')
- run_agent.py flush_memories: uses call_llm(task='flush_memories')

Tests updated for context_compressor and MCP tool. Some test mocks
still need updating (15 remaining failures from mock pattern changes,
2 pre-existing).

											
										
										
											2026-03-11 20:52:19 -07:00
+								            from agent.auxiliary_client import call_llm as _call_llm
 								            _aux_available = True
 								            try:
 								                response = _call_llm(
 								                    task="flush_memories",
 								                    messages=api_messages,
 								                    tools=[memory_tool_def],
 								                    temperature=0.3,
 								                    max_tokens=5120,
 								                    timeout=30.0,
 								                )
 								            except RuntimeError:
 								                _aux_available = False
 								                response = None
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
-												feat: call_llm/async_call_llm + config slots + migrate all consumers

Add centralized call_llm() and async_call_llm() functions that own the
full LLM request lifecycle:
  1. Resolve provider + model from task config or explicit args
  2. Get or create a cached client for that provider
  3. Format request args (max_tokens handling, provider extra_body)
  4. Make the API call with max_tokens/max_completion_tokens retry
  5. Return the response

Config: expanded auxiliary section with provider:model slots for all
tasks (compression, vision, web_extract, session_search, skills_hub,
mcp, flush_memories). Config version bumped to 7.

Migrated all auxiliary consumers:
- context_compressor.py: uses call_llm(task='compression')
- vision_tools.py: uses async_call_llm(task='vision')
- web_tools.py: uses async_call_llm(task='web_extract')
- session_search_tool.py: uses async_call_llm(task='session_search')
- browser_tool.py: uses call_llm(task='vision'/'web_extract')
- mcp_tool.py: uses call_llm(task='mcp')
- skills_guard.py: uses call_llm(provider='openrouter')
- run_agent.py flush_memories: uses call_llm(task='flush_memories')

Tests updated for context_compressor and MCP tool. Some test mocks
still need updating (15 remaining failures from mock pattern changes,
2 pre-existing).

											
										
										
											2026-03-11 20:52:19 -07:00
+								            if not _aux_available and self.api_mode == "codex_responses":
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								                # No auxiliary client -- use the Codex Responses path directly
 								                codex_kwargs = self._build_api_kwargs(api_messages)
 								                codex_kwargs["tools"] = self._responses_tools([memory_tool_def])
 								                codex_kwargs["temperature"] = 0.3
 								                if "max_output_tokens" in codex_kwargs:
 								                    codex_kwargs["max_output_tokens"] = 5120
 								                response = self._run_codex_stream(codex_kwargs)
-												fix(anthropic): final polish — tool ID sanitization, crash guards, temp=1

Remaining issues from deep scan:

Adapter (agent/anthropic_adapter.py):
- Add _sanitize_tool_id() — Anthropic requires IDs matching [a-zA-Z0-9_-],
  now strips invalid chars and ensures non-empty (both tool_use and tool_result)
- Empty tool result content → '(no output)' placeholder (Anthropic rejects empty)
- Set temperature=1 when thinking type='enabled' on older models (required)
- normalize_model_name now case-insensitive for 'Anthropic/' prefix
- Fix stale docstrings referencing only ~/.claude/.credentials.json

Agent loop (run_agent.py):
- Guard memory flush path (line ~2684) — was calling self.client.chat.completions
  which is None in anthropic_messages mode. Now routes through Anthropic client.
- Guard summary generation path (line ~3171) — same crash when reaching
  iteration limit. Now builds proper Anthropic kwargs and normalizes response.
- Guard retry summary path (line ~3200) — same fix for the summary retry loop.

All three self.client.chat.completions.create() calls outside the main
loop now have anthropic_messages branches to prevent NoneType crashes.

											
										
										
											2026-03-12 17:23:09 -07:00
+								            elif not _aux_available and self.api_mode == "anthropic_messages":
 								                # Native Anthropic — use the Anthropic client directly
 								                from agent.anthropic_adapter import build_anthropic_kwargs as _build_ant_kwargs
 								                ant_kwargs = _build_ant_kwargs(
 								                    model=self.model, messages=api_messages,
 								                    tools=[memory_tool_def], max_tokens=5120,
 								                    reasoning_config=None,
 								                )
-												fix: preflight Anthropic auth and prefer Claude store

											
										
										
											2026-03-14 19:38:55 -07:00
+								                response = self._anthropic_messages_create(ant_kwargs)
-												feat: call_llm/async_call_llm + config slots + migrate all consumers

Add centralized call_llm() and async_call_llm() functions that own the
full LLM request lifecycle:
  1. Resolve provider + model from task config or explicit args
  2. Get or create a cached client for that provider
  3. Format request args (max_tokens handling, provider extra_body)
  4. Make the API call with max_tokens/max_completion_tokens retry
  5. Return the response

Config: expanded auxiliary section with provider:model slots for all
tasks (compression, vision, web_extract, session_search, skills_hub,
mcp, flush_memories). Config version bumped to 7.

Migrated all auxiliary consumers:
- context_compressor.py: uses call_llm(task='compression')
- vision_tools.py: uses async_call_llm(task='vision')
- web_tools.py: uses async_call_llm(task='web_extract')
- session_search_tool.py: uses async_call_llm(task='session_search')
- browser_tool.py: uses call_llm(task='vision'/'web_extract')
- mcp_tool.py: uses call_llm(task='mcp')
- skills_guard.py: uses call_llm(provider='openrouter')
- run_agent.py flush_memories: uses call_llm(task='flush_memories')

Tests updated for context_compressor and MCP tool. Some test mocks
still need updating (15 remaining failures from mock pattern changes,
2 pre-existing).

											
										
										
											2026-03-11 20:52:19 -07:00
+								            elif not _aux_available:
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								                api_kwargs = {
 								                    "model": self.model,
 								                    "messages": api_messages,
 								                    "tools": [memory_tool_def],
 								                    "temperature": 0.3,
 								                    **self._max_tokens_param(5120),
 								                }
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                response = self._ensure_primary_openai_client(reason="flush_memories").chat.completions.create(**api_kwargs, timeout=30.0)
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
-												fix(anthropic): guard memory flush tool_calls extraction for Anthropic response format

The memory flush path extracted tool_calls from the response assuming
OpenAI format (response.choices[0].message.tool_calls). When using
the Anthropic client directly (aux unavailable), the response is an
Anthropic Message object which has no .choices attribute. Now uses
normalize_anthropic_response() to extract tool_calls correctly.

											
										
										
											2026-03-12 17:35:01 -07:00
+								            # Extract tool calls from the response, handling all API formats
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								            tool_calls = []
-												feat: call_llm/async_call_llm + config slots + migrate all consumers

Add centralized call_llm() and async_call_llm() functions that own the
full LLM request lifecycle:
  1. Resolve provider + model from task config or explicit args
  2. Get or create a cached client for that provider
  3. Format request args (max_tokens handling, provider extra_body)
  4. Make the API call with max_tokens/max_completion_tokens retry
  5. Return the response

Config: expanded auxiliary section with provider:model slots for all
tasks (compression, vision, web_extract, session_search, skills_hub,
mcp, flush_memories). Config version bumped to 7.

Migrated all auxiliary consumers:
- context_compressor.py: uses call_llm(task='compression')
- vision_tools.py: uses async_call_llm(task='vision')
- web_tools.py: uses async_call_llm(task='web_extract')
- session_search_tool.py: uses async_call_llm(task='session_search')
- browser_tool.py: uses call_llm(task='vision'/'web_extract')
- mcp_tool.py: uses call_llm(task='mcp')
- skills_guard.py: uses call_llm(provider='openrouter')
- run_agent.py flush_memories: uses call_llm(task='flush_memories')

Tests updated for context_compressor and MCP tool. Some test mocks
still need updating (15 remaining failures from mock pattern changes,
2 pre-existing).

											
										
										
											2026-03-11 20:52:19 -07:00
+								            if self.api_mode == "codex_responses" and not _aux_available:
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								                assistant_msg, _ = self._normalize_codex_response(response)
 								                if assistant_msg and assistant_msg.tool_calls:
 								                    tool_calls = assistant_msg.tool_calls
-												fix(anthropic): guard memory flush tool_calls extraction for Anthropic response format

The memory flush path extracted tool_calls from the response assuming
OpenAI format (response.choices[0].message.tool_calls). When using
the Anthropic client directly (aux unavailable), the response is an
Anthropic Message object which has no .choices attribute. Now uses
normalize_anthropic_response() to extract tool_calls correctly.

											
										
										
											2026-03-12 17:35:01 -07:00
+								            elif self.api_mode == "anthropic_messages" and not _aux_available:
 								                from agent.anthropic_adapter import normalize_anthropic_response as _nar_flush
 								                _flush_msg, _ = _nar_flush(response)
 								                if _flush_msg and _flush_msg.tool_calls:
 								                    tool_calls = _flush_msg.tool_calls
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								            elif hasattr(response, "choices") and response.choices:
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								                assistant_message = response.choices[0].message
 								                if assistant_message.tool_calls:
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								                    tool_calls = assistant_message.tool_calls
 								            for tc in tool_calls:
 								                if tc.function.name == "memory":
 								                    try:
 								                        args = json.loads(tc.function.arguments)
 								                        flush_target = args.get("target", "memory")
 								                        from tools.memory_tool import memory_tool as _memory_tool
 								                        result = _memory_tool(
 								                            action=args.get("action"),
 								                            target=flush_target,
 								                            content=args.get("content"),
 								                            old_text=args.get("old_text"),
 								                            store=self._memory_store,
 								                        )
 								                        if self._honcho and flush_target == "user" and args.get("action") == "add":
 								                            self._honcho_save_user_observation(args.get("content", ""))
 								                        if not self.quiet_mode:
 								                            print(f"  🧠 Memory flush: saved to {args.get('target', 'memory')}")
 								                    except Exception as e:
 								                        logger.debug("Memory flush tool call failed: %s", e)
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								        except Exception as e:
 								            logger.debug("Memory flush API call failed: %s", e)
 								        finally:
-												Cherry-pick 6 bug fixes from PR #76 and update documentation

Code fixes (run_agent.py):
- Fix off-by-one in _flush_messages_to_session_db skipping one message per flush
- Add clear_interrupt() to 3 early-return paths preventing stale interrupt state
- Wrap handle_function_call in try/except so tool crashes don't kill the conversation
- Replace fragile `is` identity check with _flush_sentinel marker for memory flush cleanup
- Fix retry loop off-by-one (6 attempts not 7)
- Remove redundant inline `import re`

											
										
										
											2026-02-27 03:21:49 -08:00
+								            # Strip flush artifacts: remove everything from the flush message onward.
 								            # Use sentinel marker instead of identity check for robustness.
 								            while messages and messages[-1].get("_flush_sentinel") != _sentinel:
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								                messages.pop()
-												Cherry-pick 6 bug fixes from PR #76 and update documentation

Code fixes (run_agent.py):
- Fix off-by-one in _flush_messages_to_session_db skipping one message per flush
- Add clear_interrupt() to 3 early-return paths preventing stale interrupt state
- Wrap handle_function_call in try/except so tool crashes don't kill the conversation
- Replace fragile `is` identity check with _flush_sentinel marker for memory flush cleanup
- Fix retry loop off-by-one (6 attempts not 7)
- Remove redundant inline `import re`

											
										
										
											2026-02-27 03:21:49 -08:00
+								                if not messages:
 								                    break
 								            if messages and messages[-1].get("_flush_sentinel") == _sentinel:
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								                messages.pop()
-												fix: detect and warn on file re-read loops after context compression

When context compression summarizes conversation history, the agent
loses track of which files it already read and re-reads them in a loop.
Users report the agent reading the same files endlessly without writing.

Root cause: context compression is lossy — file contents and read history
are lost in the summary. After compression, the model thinks it hasn't
examined the files yet and reads them again.

Fix (two-part):
1. Track file reads per task in file_tools.py. When the same file region
   is read again, include a _warning in the response telling the model
   to stop re-reading and use existing information.
2. After context compression, inject a structured message listing all
   files already read in the session with explicit "do NOT re-read"
   instruction, preserving read history across compression boundaries.

Adds 16 tests covering warning detection, task isolation, summary
accuracy, tracker cleanup, and compression history injection.

											
										
										
											2026-03-08 20:44:42 +03:00
+								    def _compress_context(self, messages: list, system_message: str, *, approx_tokens: int = None, task_id: str = "default") -> tuple:
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								        """Compress conversation context and split the session in SQLite.
 								        Steps, in order: run a memory flush on the uncompressed messages,
 								        compress them with ``self.context_compressor``, append the todo
 								        snapshot and the list of already-read files as trailing user
 								        messages, rebuild and cache the system prompt, and, when a session
 								        DB is attached, end the current session and continue in a new
 								        child session.
 								        Args:
 								            messages: Conversation history to compress.
 								            system_message: Passed to ``_build_system_prompt`` when the
 								                system prompt is rebuilt.
 								            approx_tokens: Forwarded to the compressor as ``current_tokens``;
 								                may be None.
 								            task_id: Key used to look up the file-read summary.
 								        Returns:
 								            (compressed_messages, new_system_prompt) tuple
 								        """
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								        # Pre-compression memory flush: let the model save memories before they're lost
 								        self.flush_memories(messages, min_turns=0)
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								        compressed = self.context_compressor.compress(messages, current_tokens=approx_tokens)
 								        # Append the todo store's injection text, when non-empty, as a trailing user message.
 								        todo_snapshot = self._todo_store.format_for_injection()
 								        if todo_snapshot:
 								            compressed.append({"role": "user", "content": todo_snapshot})
-												fix: detect and warn on file re-read loops after context compression

When context compression summarizes conversation history, the agent
loses track of which files it already read and re-reads them in a loop.
Users report the agent reading the same files endlessly without writing.

Root cause: context compression is lossy — file contents and read history
are lost in the summary. After compression, the model thinks it hasn't
examined the files yet and reads them again.

Fix (two-part):
1. Track file reads per task in file_tools.py. When the same file region
   is read again, include a _warning in the response telling the model
   to stop re-reading and use existing information.
2. After context compression, inject a structured message listing all
   files already read in the session with explicit "do NOT re-read"
   instruction, preserving read history across compression boundaries.

Adds 16 tests covering warning detection, task isolation, summary
accuracy, tracker cleanup, and compression history injection.

											
										
										
											2026-03-08 20:44:42 +03:00
+								        # Preserve file-read history so the model doesn't re-read files
 								        # it already examined before compression.
 								        try:
 								            from tools.file_tools import get_read_files_summary
 								            read_files = get_read_files_summary(task_id)
 								            if read_files:
 								                # Each entry is indexed by 'path' and 'regions' (regions are joined as strings).
 								                file_list = "\n".join(
 								                    f"  - {f['path']} ({', '.join(f['regions'])})"
 								                    for f in read_files
 								                )
 								                compressed.append({"role": "user", "content": (
 								                    "[Files already read in this session — do NOT re-read these]\n"
 								                    f"{file_list}\n"
 								                    "Use the information from the context summary above. "
 								                    "Proceed with writing, editing, or responding."
 								                )})
 								        except Exception:
 								            pass  # Don't break compression if file tracking fails
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								        self._invalidate_system_prompt()
 								        new_system_prompt = self._build_system_prompt(system_message)
 								        # Keep the rebuilt prompt as the cached system prompt.
 								        self._cached_system_prompt = new_system_prompt
 								        # With a session DB attached, end the current session and continue in a child session.
 								        if self._session_db:
 								            try:
-												feat: session naming with unique titles, auto-lineage, rich listing, resume by name

- Schema v4: unique title index, migration from v2/v3
- set/get/resolve session titles with uniqueness enforcement
- Auto-lineage: context compression auto-numbers titles (Task -> Task #2 -> Task #3)
- resolve_session_by_title: auto-latest finds most recent continuation
- list_sessions_rich: preview (first 60 chars) + last_active timestamp
- CLI: -c accepts optional name arg (hermes -c 'my project')
- CLI: /title command with deferred mode (set before session exists)
- CLI: sessions list shows Title, Preview, Last Active, ID
- 27 new tests (1844 total passing)

											
										
										
											2026-03-08 15:20:29 -07:00
+								                # Propagate title to the new session with auto-numbering
 								                old_title = self._session_db.get_session_title(self.session_id)
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                self._session_db.end_session(self.session_id, "compression")
 								                old_session_id = self.session_id
 								                # New id: local timestamp plus the first 6 hex chars of a uuid4.
 								                self.session_id = f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{uuid.uuid4().hex[:6]}"
 								                self._session_db.create_session(
 								                    session_id=self.session_id,
 								                    source=self.platform or "cli",
 								                    model=self.model,
 								                    parent_session_id=old_session_id,
 								                )
-												feat: session naming with unique titles, auto-lineage, rich listing, resume by name

- Schema v4: unique title index, migration from v2/v3
- set/get/resolve session titles with uniqueness enforcement
- Auto-lineage: context compression auto-numbers titles (Task -> Task #2 -> Task #3)
- resolve_session_by_title: auto-latest finds most recent continuation
- list_sessions_rich: preview (first 60 chars) + last_active timestamp
- CLI: -c accepts optional name arg (hermes -c 'my project')
- CLI: /title command with deferred mode (set before session exists)
- CLI: sessions list shows Title, Preview, Last Active, ID
- 27 new tests (1844 total passing)

											
										
										
											2026-03-08 15:20:29 -07:00
+								                # Auto-number the title for the continuation session
 								                if old_title:
 								                    try:
 								                        new_title = self._session_db.get_next_title_in_lineage(old_title)
 								                        self._session_db.set_session_title(self.session_id, new_title)
 								                    # NOTE(review): ValueError is already a subclass of Exception, so the tuple is redundant.
 								                    except (ValueError, Exception) as e:
 								                        logger.debug("Could not propagate title on compression: %s", e)
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                self._session_db.update_system_prompt(self.session_id, new_system_prompt)
-												fix: eliminate 3x SQLite message duplication in gateway sessions (#860)

Three separate code paths all wrote to the same SQLite state.db with
no deduplication, inflating session transcripts by 3-4x:

1. _log_msg_to_db() — wrote each message individually after append
2. _flush_messages_to_session_db() — re-wrote ALL new messages at
   every _persist_session() call (~18 exit points), with no tracking
   of what was already written
3. gateway append_to_transcript() — wrote everything a third time
   after the agent returned

Since load_transcript() prefers SQLite over JSONL, the inflated data
was loaded on every session resume, causing proportional token waste.

Fix:
- Remove _log_msg_to_db() and all 16 call sites (redundant with flush)
- Add _last_flushed_db_idx tracking in _flush_messages_to_session_db()
  so repeated _persist_session() calls only write truly new messages
- Reset flush cursor on compression (new session ID)
- Add skip_db parameter to SessionStore.append_to_transcript() so the
  gateway skips SQLite writes when the agent already persisted them
- Gateway now passes skip_db=True for agent-managed messages, still
  writes to JSONL as backup

Verified: a 12-message CLI session with tool calls produces exactly
12 SQLite rows with zero duplicates (previously would be 36-48).

Tests: 9 new tests covering flush deduplication, skip_db behavior,
compression reset, and initialization. Full suite passes (2869 tests).

											
										
										
											2026-03-10 15:22:44 -07:00
+								                # Reset flush cursor — new session starts with no messages written
 								                self._last_flushed_db_idx = 0
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								            except Exception as e:
 								                # Best-effort: a failed split is only logged at debug level.
 								                # NOTE(review): self.session_id is reassigned before create_session runs,
 								                # so it may already hold the new id when a later DB call fails.
 								                logger.debug("Session DB compression split failed: %s", e)
 								        return compressed, new_system_prompt
-												feat: iteration budget pressure via tool result injection

Two-tier warning system that nudges the LLM as it approaches
max_iterations, injected into the last tool result JSON rather
than as a separate system message:

- Caution (70%): {"_budget_warning": "[BUDGET: 42/60...]"}
- Warning (90%): {"_budget_warning": "[BUDGET WARNING: 54/60...]"}

For JSON tool results, adds a _budget_warning field to the existing
dict. For plain text results, appends the warning as text.

Key properties:
- No system messages injected mid-conversation
- No changes to message structure
- Prompt cache stays valid
- Configurable thresholds (0.7 / 0.9)
- Can be disabled: _budget_pressure_enabled = False

Inspired by PR #421 (@Bartok9) and issue #414.
8 tests covering thresholds, edge cases, JSON and text injection.

											
										
										
											2026-03-09 05:19:42 -07:00
+								    def _execute_tool_calls(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
-												feat: concurrent tool execution with ThreadPoolExecutor

When the model returns multiple tool calls in a single response, they are
now executed concurrently using a thread pool instead of sequentially.
This significantly reduces wall-clock time when multiple independent tools
are batched (e.g. parallel web_search, read_file, terminal calls).

Architecture:
- _execute_tool_calls() dispatches to sequential or concurrent path
- Single tool calls and batches containing 'clarify' use sequential path
- Multiple non-interactive tools use ThreadPoolExecutor (max 8 workers)
- Results are collected and appended to messages in original order
- _invoke_tool() extracted as shared tool invocation helper

Safety:
- Pre-flight interrupt check skips all tools if interrupted
- Per-tool exception handling: one failure doesn't crash the batch
- Result truncation (100k char limit) applied per tool
- Budget pressure injection after all tools complete
- Checkpoints taken before file-mutating tools
- CLI spinner shows batch progress, then per-tool completion messages

Tests: 10 new tests covering dispatch logic, ordering, error handling,
interrupt behavior, truncation, and _invoke_tool routing.

											
										
										
											2026-03-13 02:51:51 -07:00
+								        """Execute tool calls from the assistant message and append results to messages.
 								        Dispatches to concurrent execution when multiple independent tool calls
 								        are present, falling back to sequential execution for single calls or
 								        when interactive tools (e.g. clarify) are in the batch.
 								        Args:
 								            assistant_message: Object exposing ``tool_calls``; each entry is
 								                read via ``.function.name`` here.
 								            messages: Conversation list handed to the chosen execution path.
 								            effective_task_id: Forwarded unchanged to the chosen execution path.
 								            api_call_count: Forwarded unchanged to the chosen execution path.
 								        Returns:
 								            The return value of the chosen execution path (annotated as None).
 								        """
 								        # NOTE(review): assumes tool_calls is a sized sequence; len() raises on None — confirm callers guard this.
 								        tool_calls = assistant_message.tool_calls
 								        # Single tool call or interactive tool present → sequential
 								        if (len(tool_calls) <= 1
 								                or any(tc.function.name in _NEVER_PARALLEL_TOOLS for tc in tool_calls)):
 								            return self._execute_tool_calls_sequential(
 								                assistant_message, messages, effective_task_id, api_call_count
 								            )
 								        # Multiple non-interactive tools → concurrent
 								        return self._execute_tool_calls_concurrent(
 								            assistant_message, messages, effective_task_id, api_call_count
 								        )
 								    def _invoke_tool(self, function_name: str, function_args: dict, effective_task_id: str) -> str:
 								        """Invoke a single tool and return the result string. No display logic.
 								        Handles both agent-level tools (todo, memory, etc.) and registry-dispatched
 								        tools. Used by the concurrent execution path; the sequential path retains
 								        its own inline invocation for backward-compatible display handling.
 								        """
 								        if function_name == "todo":
 								            from tools.todo_tool import todo_tool as _todo_tool
 								            return _todo_tool(
 								                todos=function_args.get("todos"),
 								                merge=function_args.get("merge", False),
 								                store=self._todo_store,
 								            )
 								        elif function_name == "session_search":
 								            if not self._session_db:
 								                return json.dumps({"success": False, "error": "Session database not available."})
 								            from tools.session_search_tool import session_search as _session_search
 								            return _session_search(
 								                query=function_args.get("query", ""),
 								                role_filter=function_args.get("role_filter"),
 								                limit=function_args.get("limit", 3),
 								                db=self._session_db,
 								                current_session_id=self.session_id,
 								            )
 								        elif function_name == "memory":
 								            target = function_args.get("target", "memory")
 								            from tools.memory_tool import memory_tool as _memory_tool
 								            result = _memory_tool(
 								                action=function_args.get("action"),
 								                target=target,
 								                content=function_args.get("content"),
 								                old_text=function_args.get("old_text"),
 								                store=self._memory_store,
 								            )
 								            # Also send user observations to Honcho when active
 								            if self._honcho and target == "user" and function_args.get("action") == "add":
 								                self._honcho_save_user_observation(function_args.get("content", ""))
 								            return result
 								        elif function_name == "clarify":
 								            from tools.clarify_tool import clarify_tool as _clarify_tool
 								            return _clarify_tool(
 								                question=function_args.get("question", ""),
 								                choices=function_args.get("choices"),
 								                callback=self.clarify_callback,
 								            )
 								        elif function_name == "delegate_task":
 								            from tools.delegate_tool import delegate_task as _delegate_task
 								            return _delegate_task(
 								                goal=function_args.get("goal"),
 								                context=function_args.get("context"),
 								                toolsets=function_args.get("toolsets"),
 								                tasks=function_args.get("tasks"),
 								                max_iterations=function_args.get("max_iterations"),
 								                parent_agent=self,
 								            )
 								        else:
 								            return handle_function_call(
 								                function_name, function_args, effective_task_id,
 								                enabled_tools=list(self.valid_tool_names) if self.valid_tool_names else None,
 								            )
 								    def _execute_tool_calls_concurrent(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
 								        """Execute multiple tool calls concurrently using a thread pool.
 								        Results are collected in the original tool-call order and appended to
 								        messages so the API sees them in the expected sequence.
 								        """
 								        tool_calls = assistant_message.tool_calls
 								        num_tools = len(tool_calls)
 								        # ── Pre-flight: interrupt check ──────────────────────────────────
 								        if self._interrupt_requested:
 								            print(f"{self.log_prefix}⚡ Interrupt: skipping {num_tools} tool call(s)")
 								            for tc in tool_calls:
 								                messages.append({
 								                    "role": "tool",
 								                    "content": f"[Tool execution cancelled — {tc.function.name} was skipped due to user interrupt]",
 								                    "tool_call_id": tc.id,
 								                })
 								            return
 								        # ── Parse args + pre-execution bookkeeping ───────────────────────
 								        parsed_calls = []  # list of (tool_call, function_name, function_args)
 								        for tool_call in tool_calls:
 								            function_name = tool_call.function.name
 								            # Reset nudge counters
 								            if function_name == "memory":
 								                self._turns_since_memory = 0
 								            elif function_name == "skill_manage":
 								                self._iters_since_skill = 0
 								            try:
 								                function_args = json.loads(tool_call.function.arguments)
 								            except json.JSONDecodeError:
 								                function_args = {}
 								            if not isinstance(function_args, dict):
 								                function_args = {}
 								            # Checkpoint for file-mutating tools
 								            if function_name in ("write_file", "patch") and self._checkpoint_mgr.enabled:
 								                try:
 								                    file_path = function_args.get("path", "")
 								                    if file_path:
 								                        work_dir = self._checkpoint_mgr.get_working_dir_for_path(file_path)
 								                        self._checkpoint_mgr.ensure_checkpoint(work_dir, f"before {function_name}")
 								                except Exception:
 								                    pass
 								            parsed_calls.append((tool_call, function_name, function_args))
 								        # ── Logging / callbacks ──────────────────────────────────────────
 								        tool_names_str = ", ".join(name for _, name, _ in parsed_calls)
 								        if not self.quiet_mode:
 								            print(f"  ⚡ Concurrent: {num_tools} tool calls — {tool_names_str}")
 								            for i, (tc, name, args) in enumerate(parsed_calls, 1):
 								                args_str = json.dumps(args, ensure_ascii=False)
-												fix: verbose mode shows full untruncated output

* fix(cli): silence tirith prefetch install warnings at startup

* fix: verbose mode now shows full untruncated tool args, results, content, and think blocks

When tool progress is set to 'verbose' (via /verbose or config), the display
was still truncating tool arguments to 100 chars, tool results to 100-200 chars,
assistant content to 100 chars, and think blocks to 5 lines. This defeated the
purpose of verbose mode.

Changes:
- Tool args: show full JSON args (not truncated to log_prefix_chars)
- Tool results: show full result content in both display and debug logs
- Assistant content: show full content during tool-call loops
- Think blocks: show full reasoning text (not truncated to 5 lines/100 chars)
- Auto-enable reasoning display when verbose mode is active
- Fix initial agent creation to respect verbose config (was always quiet_mode=True)
- Updated verbose label to mention think blocks
											
										
										
											2026-03-15 20:03:37 -07:00
+								                if self.verbose_logging:
 								                    print(f"  📞 Tool {i}: {name}({list(args.keys())})")
 								                    print(f"     Args: {args_str}")
 								                else:
 								                    args_preview = args_str[:self.log_prefix_chars] + "..." if len(args_str) > self.log_prefix_chars else args_str
 								                    print(f"  📞 Tool {i}: {name}({list(args.keys())}) - {args_preview}")
-												feat: concurrent tool execution with ThreadPoolExecutor

When the model returns multiple tool calls in a single response, they are
now executed concurrently using a thread pool instead of sequentially.
This significantly reduces wall-clock time when multiple independent tools
are batched (e.g. parallel web_search, read_file, terminal calls).

Architecture:
- _execute_tool_calls() dispatches to sequential or concurrent path
- Single tool calls and batches containing 'clarify' use sequential path
- Multiple non-interactive tools use ThreadPoolExecutor (max 8 workers)
- Results are collected and appended to messages in original order
- _invoke_tool() extracted as shared tool invocation helper

Safety:
- Pre-flight interrupt check skips all tools if interrupted
- Per-tool exception handling: one failure doesn't crash the batch
- Result truncation (100k char limit) applied per tool
- Budget pressure injection after all tools complete
- Checkpoints taken before file-mutating tools
- CLI spinner shows batch progress, then per-tool completion messages

Tests: 10 new tests covering dispatch logic, ordering, error handling,
interrupt behavior, truncation, and _invoke_tool routing.

											
										
										
											2026-03-13 02:51:51 -07:00
 								        for _, name, args in parsed_calls:
 								            if self.tool_progress_callback:
 								                try:
 								                    preview = _build_tool_preview(name, args)
 								                    self.tool_progress_callback(name, preview, args)
 								                except Exception as cb_err:
 								                    logging.debug(f"Tool progress callback error: {cb_err}")
 								        # ── Concurrent execution ─────────────────────────────────────────
 								        # Each slot holds (function_name, function_args, function_result, duration, error_flag)
 								        results = [None] * num_tools
 								        def _run_tool(index, tool_call, function_name, function_args):
 								            """Worker function executed in a thread."""
 								            start = time.time()
 								            try:
 								                result = self._invoke_tool(function_name, function_args, effective_task_id)
 								            except Exception as tool_error:
 								                result = f"Error executing tool '{function_name}': {tool_error}"
 								                logger.error("_invoke_tool raised for %s: %s", function_name, tool_error, exc_info=True)
 								            duration = time.time() - start
 								            is_error, _ = _detect_tool_failure(function_name, result)
 								            results[index] = (function_name, function_args, result, duration, is_error)
 								        # Start spinner for CLI mode
 								        spinner = None
 								        if self.quiet_mode:
 								            face = random.choice(KawaiiSpinner.KAWAII_WAITING)
 								            spinner = KawaiiSpinner(f"{face} ⚡ running {num_tools} tools concurrently", spinner_type='dots')
 								            spinner.start()
 								        try:
 								            max_workers = min(num_tools, _MAX_TOOL_WORKERS)
 								            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
 								                futures = []
 								                for i, (tc, name, args) in enumerate(parsed_calls):
 								                    f = executor.submit(_run_tool, i, tc, name, args)
 								                    futures.append(f)
 								                # Wait for all to complete (exceptions are captured inside _run_tool)
 								                concurrent.futures.wait(futures)
 								        finally:
 								            if spinner:
 								                # Build a summary message for the spinner stop
 								                completed = sum(1 for r in results if r is not None)
 								                total_dur = sum(r[3] for r in results if r is not None)
 								                spinner.stop(f"⚡ {completed}/{num_tools} tools completed in {total_dur:.1f}s total")
 								        # ── Post-execution: display per-tool results ─────────────────────
 								        for i, (tc, name, args) in enumerate(parsed_calls):
 								            r = results[i]
 								            if r is None:
 								                # Shouldn't happen, but safety fallback
 								                function_result = f"Error executing tool '{name}': thread did not return a result"
 								                tool_duration = 0.0
 								            else:
 								                function_name, function_args, function_result, tool_duration, is_error = r
 								                if is_error:
 								                    result_preview = function_result[:200] if len(function_result) > 200 else function_result
 								                    logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)
 								                if self.verbose_logging:
 								                    logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s")
-												fix: verbose mode shows full untruncated output

* fix(cli): silence tirith prefetch install warnings at startup

* fix: verbose mode now shows full untruncated tool args, results, content, and think blocks

When tool progress is set to 'verbose' (via /verbose or config), the display
was still truncating tool arguments to 100 chars, tool results to 100-200 chars,
assistant content to 100 chars, and think blocks to 5 lines. This defeated the
purpose of verbose mode.

Changes:
- Tool args: show full JSON args (not truncated to log_prefix_chars)
- Tool results: show full result content in both display and debug logs
- Assistant content: show full content during tool-call loops
- Think blocks: show full reasoning text (not truncated to 5 lines/100 chars)
- Auto-enable reasoning display when verbose mode is active
- Fix initial agent creation to respect verbose config (was always quiet_mode=True)
- Updated verbose label to mention think blocks
											
										
										
											2026-03-15 20:03:37 -07:00
+								                    logging.debug(f"Tool result ({len(function_result)} chars): {function_result}")
-												feat: concurrent tool execution with ThreadPoolExecutor

When the model returns multiple tool calls in a single response, they are
now executed concurrently using a thread pool instead of sequentially.
This significantly reduces wall-clock time when multiple independent tools
are batched (e.g. parallel web_search, read_file, terminal calls).

Architecture:
- _execute_tool_calls() dispatches to sequential or concurrent path
- Single tool calls and batches containing 'clarify' use sequential path
- Multiple non-interactive tools use ThreadPoolExecutor (max 8 workers)
- Results are collected and appended to messages in original order
- _invoke_tool() extracted as shared tool invocation helper

Safety:
- Pre-flight interrupt check skips all tools if interrupted
- Per-tool exception handling: one failure doesn't crash the batch
- Result truncation (100k char limit) applied per tool
- Budget pressure injection after all tools complete
- Checkpoints taken before file-mutating tools
- CLI spinner shows batch progress, then per-tool completion messages

Tests: 10 new tests covering dispatch logic, ordering, error handling,
interrupt behavior, truncation, and _invoke_tool routing.

											
										
										
											2026-03-13 02:51:51 -07:00
 								            # Print cute message per tool
 								            if self.quiet_mode:
 								                cute_msg = _get_cute_tool_message_impl(name, args, tool_duration, result=function_result)
 								                print(f"  {cute_msg}")
 								            elif not self.quiet_mode:
-												fix: verbose mode shows full untruncated output

* fix(cli): silence tirith prefetch install warnings at startup

* fix: verbose mode now shows full untruncated tool args, results, content, and think blocks

When tool progress is set to 'verbose' (via /verbose or config), the display
was still truncating tool arguments to 100 chars, tool results to 100-200 chars,
assistant content to 100 chars, and think blocks to 5 lines. This defeated the
purpose of verbose mode.

Changes:
- Tool args: show full JSON args (not truncated to log_prefix_chars)
- Tool results: show full result content in both display and debug logs
- Assistant content: show full content during tool-call loops
- Think blocks: show full reasoning text (not truncated to 5 lines/100 chars)
- Auto-enable reasoning display when verbose mode is active
- Fix initial agent creation to respect verbose config (was always quiet_mode=True)
- Updated verbose label to mention think blocks
											
										
										
											2026-03-15 20:03:37 -07:00
+								                if self.verbose_logging:
 								                    print(f"  ✅ Tool {i+1} completed in {tool_duration:.2f}s")
 								                    print(f"     Result: {function_result}")
 								                else:
 								                    response_preview = function_result[:self.log_prefix_chars] + "..." if len(function_result) > self.log_prefix_chars else function_result
 								                    print(f"  ✅ Tool {i+1} completed in {tool_duration:.2f}s - {response_preview}")
-												feat: concurrent tool execution with ThreadPoolExecutor

When the model returns multiple tool calls in a single response, they are
now executed concurrently using a thread pool instead of sequentially.
This significantly reduces wall-clock time when multiple independent tools
are batched (e.g. parallel web_search, read_file, terminal calls).

Architecture:
- _execute_tool_calls() dispatches to sequential or concurrent path
- Single tool calls and batches containing 'clarify' use sequential path
- Multiple non-interactive tools use ThreadPoolExecutor (max 8 workers)
- Results are collected and appended to messages in original order
- _invoke_tool() extracted as shared tool invocation helper

Safety:
- Pre-flight interrupt check skips all tools if interrupted
- Per-tool exception handling: one failure doesn't crash the batch
- Result truncation (100k char limit) applied per tool
- Budget pressure injection after all tools complete
- Checkpoints taken before file-mutating tools
- CLI spinner shows batch progress, then per-tool completion messages

Tests: 10 new tests covering dispatch logic, ordering, error handling,
interrupt behavior, truncation, and _invoke_tool routing.

											
										
										
											2026-03-13 02:51:51 -07:00
 								            # Truncate oversized results
 								            MAX_TOOL_RESULT_CHARS = 100_000
 								            if len(function_result) > MAX_TOOL_RESULT_CHARS:
 								                original_len = len(function_result)
 								                function_result = (
 								                    function_result[:MAX_TOOL_RESULT_CHARS]
 								                    + f"\n\n[Truncated: tool response was {original_len:,} chars, "
 								                    f"exceeding the {MAX_TOOL_RESULT_CHARS:,} char limit]"
 								                )
 								            # Append tool result message in order
 								            tool_msg = {
 								                "role": "tool",
 								                "content": function_result,
 								                "tool_call_id": tc.id,
 								            }
 								            messages.append(tool_msg)
 								        # ── Budget pressure injection ────────────────────────────────────
 								        budget_warning = self._get_budget_warning(api_call_count)
 								        if budget_warning and messages and messages[-1].get("role") == "tool":
 								            last_content = messages[-1]["content"]
 								            try:
 								                parsed = json.loads(last_content)
 								                if isinstance(parsed, dict):
 								                    parsed["_budget_warning"] = budget_warning
 								                    messages[-1]["content"] = json.dumps(parsed, ensure_ascii=False)
 								                else:
 								                    messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
 								            except (json.JSONDecodeError, TypeError):
 								                messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
 								            if not self.quiet_mode:
 								                remaining = self.max_iterations - api_call_count
 								                tier = "⚠️  WARNING" if remaining <= self.max_iterations * 0.1 else "💡 CAUTION"
 								                print(f"{self.log_prefix}{tier}: {remaining} iterations remaining")
 								    def _execute_tool_calls_sequential(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
 								        """Execute tool calls sequentially (original behavior). Used for single calls or interactive tools."""
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								        for i, tool_call in enumerate(assistant_message.tool_calls, 1):
-												feat: enhance interrupt handling and container resource configuration

- Introduced a shared interrupt signaling mechanism to allow tools to check for user interrupts during long-running operations.
- Updated the AIAgent to handle interrupts more effectively, ensuring in-progress tool calls are canceled and multiple interrupt messages are combined into one prompt.
- Enhanced the CLI configuration to include container resource limits (CPU, memory, disk) and persistence options for Docker, Singularity, and Modal environments.
- Improved documentation to clarify interrupt behaviors and container resource settings, providing users with better guidance on configuration and usage.

											
										
										
											2026-02-23 02:11:33 -08:00
+								            # SAFETY: check interrupt BEFORE starting each tool.
 								            # If the user sent "stop" during a previous tool's execution,
 								            # do NOT start any more tools -- skip them all immediately.
 								            if self._interrupt_requested:
 								                remaining_calls = assistant_message.tool_calls[i-1:]
 								                if remaining_calls:
-												fix: harden web gateway security and fix error swallowing

- Use hmac.compare_digest for timing-safe token comparison (3 endpoints)
- Default bind to 127.0.0.1 instead of 0.0.0.0
- Sanitize upload filenames with Path.name to prevent path traversal
- Add DOMPurify to sanitize marked.parse() output against XSS
- Replace add_static with authenticated media handler
- Hide token in group chats for /remote-control command
- Use ctypes.util.find_library for Opus instead of hardcoded paths
- Add force=True to 5 interrupt _vprint calls for visibility
- Log Opus decode errors and voice restart failures instead of swallowing

											
										
										
											2026-03-13 15:29:18 +03:00
+								                    self._vprint(f"{self.log_prefix}⚡ Interrupt: skipping {len(remaining_calls)} tool call(s)", force=True)
-												feat: enhance interrupt handling and container resource configuration

- Introduced a shared interrupt signaling mechanism to allow tools to check for user interrupts during long-running operations.
- Updated the AIAgent to handle interrupts more effectively, ensuring in-progress tool calls are canceled and multiple interrupt messages are combined into one prompt.
- Enhanced the CLI configuration to include container resource limits (CPU, memory, disk) and persistence options for Docker, Singularity, and Modal environments.
- Improved documentation to clarify interrupt behaviors and container resource settings, providing users with better guidance on configuration and usage.

											
										
										
											2026-02-23 02:11:33 -08:00
+								                for skipped_tc in remaining_calls:
-												fix: add context to interruption messages for model awareness

When the agent is interrupted, the model now receives descriptive
context instead of a generic 'Operation interrupted.' string:

- Tool skip messages include the tool name:
  '[Tool execution cancelled — terminal was skipped due to user interrupt]'
  '[Tool execution skipped — web_search was not started. User sent a new message]'

- API call interrupts include timing:
  'Operation interrupted: waiting for model response (4.2s elapsed).'

- Retry/error interrupts include retry context:
  'Operation interrupted: retrying API call after rate limit (retry 2/5).'
  'Operation interrupted: handling API error (Timeout: connection timed out).'

This helps the model understand what was happening when it was
interrupted, reducing wasted iterations spent re-discovering state.

											
										
										
											2026-03-08 18:58:23 -07:00
+								                    skipped_name = skipped_tc.function.name
-												feat: enhance interrupt handling and container resource configuration

- Introduced a shared interrupt signaling mechanism to allow tools to check for user interrupts during long-running operations.
- Updated the AIAgent to handle interrupts more effectively, ensuring in-progress tool calls are canceled and multiple interrupt messages are combined into one prompt.
- Enhanced the CLI configuration to include container resource limits (CPU, memory, disk) and persistence options for Docker, Singularity, and Modal environments.
- Improved documentation to clarify interrupt behaviors and container resource settings, providing users with better guidance on configuration and usage.

											
										
										
											2026-02-23 02:11:33 -08:00
+								                    skip_msg = {
 								                        "role": "tool",
-												fix: add context to interruption messages for model awareness

When the agent is interrupted, the model now receives descriptive
context instead of a generic 'Operation interrupted.' string:

- Tool skip messages include the tool name:
  '[Tool execution cancelled — terminal was skipped due to user interrupt]'
  '[Tool execution skipped — web_search was not started. User sent a new message]'

- API call interrupts include timing:
  'Operation interrupted: waiting for model response (4.2s elapsed).'

- Retry/error interrupts include retry context:
  'Operation interrupted: retrying API call after rate limit (retry 2/5).'
  'Operation interrupted: handling API error (Timeout: connection timed out).'

This helps the model understand what was happening when it was
interrupted, reducing wasted iterations spent re-discovering state.

											
										
										
											2026-03-08 18:58:23 -07:00
+								                        "content": f"[Tool execution cancelled — {skipped_name} was skipped due to user interrupt]",
-												feat: enhance interrupt handling and container resource configuration

- Introduced a shared interrupt signaling mechanism to allow tools to check for user interrupts during long-running operations.
- Updated the AIAgent to handle interrupts more effectively, ensuring in-progress tool calls are canceled and multiple interrupt messages are combined into one prompt.
- Enhanced the CLI configuration to include container resource limits (CPU, memory, disk) and persistence options for Docker, Singularity, and Modal environments.
- Improved documentation to clarify interrupt behaviors and container resource settings, providing users with better guidance on configuration and usage.

											
										
										
											2026-02-23 02:11:33 -08:00
+								                        "tool_call_id": skipped_tc.id,
 								                    }
 								                    messages.append(skip_msg)
 								                break
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								            function_name = tool_call.function.name
-												feat: enhance AIAgent's tool usage nudges and content handling

- Introduced a method to strip <think> blocks from content, improving text visibility.
- Implemented counters to reset nudge intervals when memory and skill tools are used, enhancing user guidance.
- Captured content from turns with tool calls to provide fallback responses, ensuring continuity in conversation.
- Updated nudge logic to remind users about saving memories and creating skills based on interaction patterns.

											
										
										
											2026-02-22 21:33:28 -08:00
+								            # Reset nudge counters when the relevant tool is actually used
 								            if function_name == "memory":
 								                self._turns_since_memory = 0
 								            elif function_name == "skill_manage":
 								                self._iters_since_skill = 0
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								            try:
 								                function_args = json.loads(tool_call.function.arguments)
 								            except json.JSONDecodeError as e:
 								                logging.warning(f"Unexpected JSON error after validation: {e}")
 								                function_args = {}
-												fix: add upstream guard for non-dict function_args + tests for build_tool_preview

Complements PR #453 by 0xbyt4. Adds isinstance(dict) guard in
run_agent.py to catch cases where json.loads returns non-dict
(e.g. null, list, string) before they reach downstream code.

Also adds 15 tests for build_tool_preview covering None args,
empty dicts, known/unknown tools, fallback keys, truncation,
and all special-cased tools (process, todo, memory, session_search).

											
										
										
											2026-03-09 21:01:40 -07:00
+								            if not isinstance(function_args, dict):
 								                function_args = {}
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
 								            if not self.quiet_mode:
 								                args_str = json.dumps(function_args, ensure_ascii=False)
-												fix: verbose mode shows full untruncated output

* fix(cli): silence tirith prefetch install warnings at startup

* fix: verbose mode now shows full untruncated tool args, results, content, and think blocks

When tool progress is set to 'verbose' (via /verbose or config), the display
was still truncating tool arguments to 100 chars, tool results to 100-200 chars,
assistant content to 100 chars, and think blocks to 5 lines. This defeated the
purpose of verbose mode.

Changes:
- Tool args: show full JSON args (not truncated to log_prefix_chars)
- Tool results: show full result content in both display and debug logs
- Assistant content: show full content during tool-call loops
- Think blocks: show full reasoning text (not truncated to 5 lines/100 chars)
- Auto-enable reasoning display when verbose mode is active
- Fix initial agent creation to respect verbose config (was always quiet_mode=True)
- Updated verbose label to mention think blocks
											
										
										
											2026-03-15 20:03:37 -07:00
+								                if self.verbose_logging:
 								                    print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())})")
 								                    print(f"     Args: {args_str}")
 								                else:
 								                    args_preview = args_str[:self.log_prefix_chars] + "..." if len(args_str) > self.log_prefix_chars else args_str
 								                    print(f"  📞 Tool {i}: {function_name}({list(function_args.keys())}) - {args_preview}")
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
 								            if self.tool_progress_callback:
 								                try:
 								                    preview = _build_tool_preview(function_name, function_args)
-												feat: show detailed tool call args in gateway based on config

Issue #263: Telegram/Discord/WhatsApp/Slack now show tool call details
based on display.tool_progress in config.yaml.

Changes:
- gateway/run.py: 'verbose' mode shows full args (keys + JSON, 200 char
  max). 'all' mode preview increased from 40 to 80 chars. Added missing
  tool emojis (execute_code, delegate_task, clarify, skill_manage,
  search_files).
- agent/display.py: Added execute_code, delegate_task, clarify,
  skill_manage to primary_args. Added 'code' and 'goal' to fallback keys.
- run_agent.py: Pass function_args dict to tool_progress_callback so
  gateway can format based on its own verbosity config.

Config usage:
  display:
    tool_progress: verbose  # off | new | all | verbose

											
										
										
											2026-03-02 05:23:15 -08:00
+								                    self.tool_progress_callback(function_name, preview, function_args)
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                except Exception as cb_err:
 								                    logging.debug(f"Tool progress callback error: {cb_err}")
-												feat: filesystem checkpoints and /rollback command

Automatic filesystem snapshots before destructive file operations,
with user-facing rollback.  Inspired by PR #559 (by @alireza78a).

Architecture:
- Shadow git repos at ~/.hermes/checkpoints/{hash}/ via GIT_DIR
- CheckpointManager: take/list/restore, turn-scoped dedup, pruning
- Transparent — the LLM never sees it, no tool schema, no tokens
- Once per turn — only first write_file/patch triggers a snapshot

Integration:
- Config: checkpoints.enabled + checkpoints.max_snapshots
- CLI flag: hermes --checkpoints
- Trigger: run_agent.py _execute_tool_calls() before write_file/patch
- /rollback slash command in CLI + gateway (list, restore by number)
- Pre-rollback snapshot auto-created on restore (undo the undo)

Safety:
- Never blocks file operations — all errors silently logged
- Skips root dir, home dir, dirs >50K files
- Disables gracefully when git not installed
- Shadow repo completely isolated from project git

Tests: 35 new tests, all passing (2798 total suite)
Docs: feature page, config reference, CLI commands reference

											
										
										
											2026-03-10 00:49:15 -07:00
+								            # Checkpoint: snapshot working dir before file-mutating tools
 								            if function_name in ("write_file", "patch") and self._checkpoint_mgr.enabled:
 								                try:
 								                    file_path = function_args.get("path", "")
 								                    if file_path:
 								                        work_dir = self._checkpoint_mgr.get_working_dir_for_path(file_path)
 								                        self._checkpoint_mgr.ensure_checkpoint(
 								                            work_dir, f"before {function_name}"
 								                        )
 								                except Exception:
 								                    pass  # never block tool execution
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								            tool_start_time = time.time()
 								            if function_name == "todo":
 								                from tools.todo_tool import todo_tool as _todo_tool
 								                function_result = _todo_tool(
 								                    todos=function_args.get("todos"),
 								                    merge=function_args.get("merge", False),
 								                    store=self._todo_store,
 								                )
 								                tool_duration = time.time() - tool_start_time
 								                if self.quiet_mode:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                    self._vprint(f"  {_get_cute_tool_message_impl('todo', function_args, tool_duration, result=function_result)}")
-												fix(gateway): Pass session_db to AIAgent, fixing session_search error

When running via the gateway (e.g. Telegram), the session_search tool
returned: {"error": "session_search must be handled by the agent loop"}

Root cause:
- gateway/run.py creates AIAgent without passing session_db=
- self._session_db is None in the agent instance
- The dispatch condition "elif function_name == 'session_search' and self._session_db"
  skips when _session_db is None, falling through to the generic error

This fix:
1. Initializes self._session_db in GatewayRunner.__init__()
2. Passes session_db to all AIAgent instantiations in gateway/run.py
3. Adds defensive fallback in run_agent.py to return a clear error when
   session_db is unavailable, instead of falling through

Fixes #105

											
										
										
											2026-02-27 00:32:17 -05:00
+								            elif function_name == "session_search":
 								                if not self._session_db:
 								                    function_result = json.dumps({"success": False, "error": "Session database not available."})
 								                else:
 								                    from tools.session_search_tool import session_search as _session_search
 								                    function_result = _session_search(
 								                        query=function_args.get("query", ""),
 								                        role_filter=function_args.get("role_filter"),
 								                        limit=function_args.get("limit", 3),
 								                        db=self._session_db,
-												fix: exclude current session from session_search results

session_search was returning the current session if it matched the
query, which is redundant — the agent already has the current
conversation context. This wasted an LLM summarization call and a
result slot.

Added current_session_id parameter to session_search(). The agent
passes self.session_id and the search filters out any results where
either the raw or parent-resolved session ID matches. Both the raw
match and the parent-resolved match are checked to handle child
sessions from delegation.

Two tests added verifying the exclusion works and that other
sessions are still returned.

											
										
										
											2026-03-04 06:06:40 -08:00
+								                        current_session_id=self.session_id,
-												fix(gateway): Pass session_db to AIAgent, fixing session_search error

When running via the gateway (e.g. Telegram), the session_search tool
returned: {"error": "session_search must be handled by the agent loop"}

Root cause:
- gateway/run.py creates AIAgent without passing session_db=
- self._session_db is None in the agent instance
- The dispatch condition "elif function_name == 'session_search' and self._session_db"
  skips when _session_db is None, falling through to the generic error

This fix:
1. Initializes self._session_db in GatewayRunner.__init__()
2. Passes session_db to all AIAgent instantiations in gateway/run.py
3. Adds defensive fallback in run_agent.py to return a clear error when
   session_db is unavailable, instead of falling through

Fixes #105

											
										
										
											2026-02-27 00:32:17 -05:00
+								                    )
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                tool_duration = time.time() - tool_start_time
 								                if self.quiet_mode:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                    self._vprint(f"  {_get_cute_tool_message_impl('session_search', function_args, tool_duration, result=function_result)}")
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								            elif function_name == "memory":
-												feat: integrate Honcho with USER.md memory system

When Honcho is active:
- System prompt uses Honcho prefetch instead of USER.md
- memory tool target=user add routes to Honcho
- MEMORY.md untouched in all cases

When disabled, everything works as before.

Also wires up contextTokens config to cap prefetch size.

											
										
										
											2026-02-26 18:02:24 -05:00
+								                target = function_args.get("target", "memory")
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                from tools.memory_tool import memory_tool as _memory_tool
 								                function_result = _memory_tool(
 								                    action=function_args.get("action"),
-												refactor: run Honcho and USER.md in tandem

USER.md stays in system prompt when Honcho is active -- prefetch is
additive context, not a replacement. Memory tool user observations
write to both USER.md (local) and Honcho (cross-session) simultaneously.

											
										
										
											2026-02-26 18:07:33 -05:00
+								                    target=target,
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                    content=function_args.get("content"),
 								                    old_text=function_args.get("old_text"),
 								                    store=self._memory_store,
 								                )
-												refactor: run Honcho and USER.md in tandem

USER.md stays in system prompt when Honcho is active -- prefetch is
additive context, not a replacement. Memory tool user observations
write to both USER.md (local) and Honcho (cross-session) simultaneously.

											
										
										
											2026-02-26 18:07:33 -05:00
+								                # Also send user observations to Honcho when active
-												feat: integrate Honcho with USER.md memory system

When Honcho is active:
- System prompt uses Honcho prefetch instead of USER.md
- memory tool target=user add routes to Honcho
- MEMORY.md untouched in all cases

When disabled, everything works as before.

Also wires up contextTokens config to cap prefetch size.

											
										
										
											2026-02-26 18:02:24 -05:00
+								                if self._honcho and target == "user" and function_args.get("action") == "add":
-												refactor: run Honcho and USER.md in tandem

USER.md stays in system prompt when Honcho is active -- prefetch is
additive context, not a replacement. Memory tool user observations
write to both USER.md (local) and Honcho (cross-session) simultaneously.

											
										
										
											2026-02-26 18:07:33 -05:00
+								                    self._honcho_save_user_observation(function_args.get("content", ""))
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                tool_duration = time.time() - tool_start_time
 								                if self.quiet_mode:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                    self._vprint(f"  {_get_cute_tool_message_impl('memory', function_args, tool_duration, result=function_result)}")
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								            elif function_name == "clarify":
 								                from tools.clarify_tool import clarify_tool as _clarify_tool
 								                function_result = _clarify_tool(
 								                    question=function_args.get("question", ""),
 								                    choices=function_args.get("choices"),
 								                    callback=self.clarify_callback,
 								                )
 								                tool_duration = time.time() - tool_start_time
 								                if self.quiet_mode:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                    self._vprint(f"  {_get_cute_tool_message_impl('clarify', function_args, tool_duration, result=function_result)}")
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								            elif function_name == "delegate_task":
 								                from tools.delegate_tool import delegate_task as _delegate_task
 								                tasks_arg = function_args.get("tasks")
 								                if tasks_arg and isinstance(tasks_arg, list):
 								                    spinner_label = f"🔀 delegating {len(tasks_arg)} tasks"
 								                else:
 								                    goal_preview = (function_args.get("goal") or "")[:30]
 								                    spinner_label = f"🔀 {goal_preview}" if goal_preview else "🔀 delegating"
 								                spinner = None
 								                if self.quiet_mode:
 								                    face = random.choice(KawaiiSpinner.KAWAII_WAITING)
 								                    spinner = KawaiiSpinner(f"{face} {spinner_label}", spinner_type='dots')
 								                    spinner.start()
 								                self._delegate_spinner = spinner
-												Hermes Agent UX Improvements

											
										
										
											2026-02-22 02:16:11 -08:00
+								                _delegate_result = None
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                try:
 								                    function_result = _delegate_task(
 								                        goal=function_args.get("goal"),
 								                        context=function_args.get("context"),
 								                        toolsets=function_args.get("toolsets"),
 								                        tasks=tasks_arg,
 								                        max_iterations=function_args.get("max_iterations"),
 								                        parent_agent=self,
 								                    )
-												Hermes Agent UX Improvements

											
										
										
											2026-02-22 02:16:11 -08:00
+								                    _delegate_result = function_result
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                finally:
 								                    self._delegate_spinner = None
 								                    tool_duration = time.time() - tool_start_time
-												Hermes Agent UX Improvements

											
										
										
											2026-02-22 02:16:11 -08:00
+								                    cute_msg = _get_cute_tool_message_impl('delegate_task', function_args, tool_duration, result=_delegate_result)
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                    if spinner:
 								                        spinner.stop(cute_msg)
 								                    elif self.quiet_mode:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                        self._vprint(f"  {cute_msg}")
 								            elif self.quiet_mode and self._stream_callback is None:
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                face = random.choice(KawaiiSpinner.KAWAII_WAITING)
-												feat(tools): centralize tool emoji metadata in registry + skin integration

- Add 'emoji' field to ToolEntry and 'get_emoji()' to ToolRegistry
- Add emoji= to all 50+ registry.register() calls across tool files
- Add get_tool_emoji() helper in agent/display.py with 3-tier resolution:
  skin override → registry default → hardcoded fallback
- Replace hardcoded emoji maps in run_agent.py, delegate_tool.py, and
  gateway/run.py with centralized get_tool_emoji() calls
- Add 'tool_emojis' field to SkinConfig so skins can override per-tool
  emojis (e.g. ares skin could use swords instead of wrenches)
- Add 11 tests (5 registry emoji, 6 display/skin integration)
- Update AGENTS.md skin docs table

Based on the approach from PR #1061 by ForgingAlex (emoji centralization
in registry). This salvage fixes several issues from the original:
- Does NOT split the cronjob tool (which would crash on missing schemas)
- Does NOT change image_generate toolset/requires_env/is_async
- Does NOT delete existing tests
- Completes the centralization (gateway/run.py was missed)
- Hooks into the skin system for full customizability

											
										
										
											2026-03-15 20:21:21 -07:00
+								                emoji = _get_tool_emoji(function_name)
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                preview = _build_tool_preview(function_name, function_args) or function_name
 								                if len(preview) > 30:
 								                    preview = preview[:27] + "..."
 								                spinner = KawaiiSpinner(f"{face} {emoji} {preview}", spinner_type='dots')
 								                spinner.start()
-												Hermes Agent UX Improvements

											
										
										
											2026-02-22 02:16:11 -08:00
+								                _spinner_result = None
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                try:
-												fix: wire up enabled_tools in agent loop + simplify sandbox tool selection

Completes the fix started in 8318a51 — handle_function_call() accepted
enabled_tools but run_agent.py never passed it. Now both call sites in
_execute_tool_calls() pass self.valid_tool_names, so each agent session
uses its own tool list instead of the process-global
_last_resolved_tool_names (which subagents can overwrite).

Also simplifies the redundant ternary in code_execution_tool.py:
sandbox_tools is already computed correctly (intersection with session
tools, or full SANDBOX_ALLOWED_TOOLS as fallback), so the conditional
was dead logic.

Inspired by PR #663 (JasonOA888). Closes #662.
Tests: 2857 passed.

											
										
										
											2026-03-10 06:35:28 -07:00
+								                    function_result = handle_function_call(
 								                        function_name, function_args, effective_task_id,
 								                        enabled_tools=list(self.valid_tool_names) if self.valid_tool_names else None,
 								                    )
-												Hermes Agent UX Improvements

											
										
										
											2026-02-22 02:16:11 -08:00
+								                    _spinner_result = function_result
-												Cherry-pick 6 bug fixes from PR #76 and update documentation

Code fixes (run_agent.py):
- Fix off-by-one in _flush_messages_to_session_db skipping one message per flush
- Add clear_interrupt() to 3 early-return paths preventing stale interrupt state
- Wrap handle_function_call in try/except so tool crashes don't kill the conversation
- Replace fragile `is` identity check with _flush_sentinel marker for memory flush cleanup
- Fix retry loop off-by-one (6 attempts not 7)
- Remove redundant inline `import re`

											
										
										
											2026-02-27 03:21:49 -08:00
+								                except Exception as tool_error:
 								                    function_result = f"Error executing tool '{function_name}': {tool_error}"
-												feat(logging): implement persistent error logging for tool failures

- Introduce a separate error log for capturing warnings and errors related to tool execution, ensuring detailed inspection of issues post-failure.
- Enhance error handling in the AIAgent class to log exceptions with stack traces for better debugging.
- Add a similar error logging mechanism in the gateway to streamline debugging processes.

											
										
										
											2026-02-28 22:49:58 -08:00
+								                    logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True)
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                finally:
 								                    tool_duration = time.time() - tool_start_time
-												Hermes Agent UX Improvements

											
										
										
											2026-02-22 02:16:11 -08:00
+								                    cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_spinner_result)
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                    spinner.stop(cute_msg)
 								            else:
-												Cherry-pick 6 bug fixes from PR #76 and update documentation

Code fixes (run_agent.py):
- Fix off-by-one in _flush_messages_to_session_db skipping one message per flush
- Add clear_interrupt() to 3 early-return paths preventing stale interrupt state
- Wrap handle_function_call in try/except so tool crashes don't kill the conversation
- Replace fragile `is` identity check with _flush_sentinel marker for memory flush cleanup
- Fix retry loop off-by-one (6 attempts not 7)
- Remove redundant inline `import re`

											
										
										
											2026-02-27 03:21:49 -08:00
+								                try:
-												fix: wire up enabled_tools in agent loop + simplify sandbox tool selection

Completes the fix started in 8318a51 — handle_function_call() accepted
enabled_tools but run_agent.py never passed it. Now both call sites in
_execute_tool_calls() pass self.valid_tool_names, so each agent session
uses its own tool list instead of the process-global
_last_resolved_tool_names (which subagents can overwrite).

Also simplifies the redundant ternary in code_execution_tool.py:
sandbox_tools is already computed correctly (intersection with session
tools, or full SANDBOX_ALLOWED_TOOLS as fallback), so the conditional
was dead logic.

Inspired by PR #663 (JasonOA888). Closes #662.
Tests: 2857 passed.

											
										
										
											2026-03-10 06:35:28 -07:00
+								                    function_result = handle_function_call(
 								                        function_name, function_args, effective_task_id,
 								                        enabled_tools=list(self.valid_tool_names) if self.valid_tool_names else None,
 								                    )
-												Cherry-pick 6 bug fixes from PR #76 and update documentation

Code fixes (run_agent.py):
- Fix off-by-one in _flush_messages_to_session_db skipping one message per flush
- Add clear_interrupt() to 3 early-return paths preventing stale interrupt state
- Wrap handle_function_call in try/except so tool crashes don't kill the conversation
- Replace fragile `is` identity check with _flush_sentinel marker for memory flush cleanup
- Fix retry loop off-by-one (6 attempts not 7)
- Remove redundant inline `import re`

											
										
										
											2026-02-27 03:21:49 -08:00
+								                except Exception as tool_error:
 								                    function_result = f"Error executing tool '{function_name}': {tool_error}"
-												feat(logging): implement persistent error logging for tool failures

- Introduce a separate error log for capturing warnings and errors related to tool execution, ensuring detailed inspection of issues post-failure.
- Enhance error handling in the AIAgent class to log exceptions with stack traces for better debugging.
- Add a similar error logging mechanism in the gateway to streamline debugging processes.

											
										
										
											2026-02-28 22:49:58 -08:00
+								                    logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True)
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                tool_duration = time.time() - tool_start_time
-												fix: verbose mode shows full untruncated output

* fix(cli): silence tirith prefetch install warnings at startup

* fix: verbose mode now shows full untruncated tool args, results, content, and think blocks

When tool progress is set to 'verbose' (via /verbose or config), the display
was still truncating tool arguments to 100 chars, tool results to 100-200 chars,
assistant content to 100 chars, and think blocks to 5 lines. This defeated the
purpose of verbose mode.

Changes:
- Tool args: show full JSON args (not truncated to log_prefix_chars)
- Tool results: show full result content in both display and debug logs
- Assistant content: show full content during tool-call loops
- Think blocks: show full reasoning text (not truncated to 5 lines/100 chars)
- Auto-enable reasoning display when verbose mode is active
- Fix initial agent creation to respect verbose config (was always quiet_mode=True)
- Updated verbose label to mention think blocks
											
										
										
											2026-03-15 20:03:37 -07:00
+								            result_preview = function_result if self.verbose_logging else (
 								                function_result[:200] if len(function_result) > 200 else function_result
 								            )
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
-												feat(logging): implement persistent error logging for tool failures

- Introduce a separate error log for capturing warnings and errors related to tool execution, ensuring detailed inspection of issues post-failure.
- Enhance error handling in the AIAgent class to log exceptions with stack traces for better debugging.
- Add a similar error logging mechanism in the gateway to streamline debugging processes.

											
										
										
											2026-02-28 22:49:58 -08:00
+								            # Log tool errors to the persistent error log so [error] tags
 								            # in the UI always have a corresponding detailed entry on disk.
 								            _is_error_result, _ = _detect_tool_failure(function_name, function_result)
 								            if _is_error_result:
 								                logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								            if self.verbose_logging:
 								                logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s")
-												fix: verbose mode shows full untruncated output

* fix(cli): silence tirith prefetch install warnings at startup

* fix: verbose mode now shows full untruncated tool args, results, content, and think blocks

When tool progress is set to 'verbose' (via /verbose or config), the display
was still truncating tool arguments to 100 chars, tool results to 100-200 chars,
assistant content to 100 chars, and think blocks to 5 lines. This defeated the
purpose of verbose mode.

Changes:
- Tool args: show full JSON args (not truncated to log_prefix_chars)
- Tool results: show full result content in both display and debug logs
- Assistant content: show full content during tool-call loops
- Think blocks: show full reasoning text (not truncated to 5 lines/100 chars)
- Auto-enable reasoning display when verbose mode is active
- Fix initial agent creation to respect verbose config (was always quiet_mode=True)
- Updated verbose label to mention think blocks
											
										
										
											2026-03-15 20:03:37 -07:00
+								                logging.debug(f"Tool result ({len(function_result)} chars): {function_result}")
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
-												feat: add ephemeral prefill messages and system prompt loading

- Implemented functionality to load ephemeral prefill messages from a JSON file, enhancing few-shot priming capabilities for the agent.
- Introduced a mechanism to load an ephemeral system prompt from environment variables or configuration files, ensuring dynamic prompt adjustments at API-call time.
- Updated the CLI and agent initialization to utilize the new prefill messages and system prompt, improving the overall interaction experience.
- Enhanced configuration options with new environment variables for prefill messages and system prompts, allowing for greater customization without persistence.

											
										
										
											2026-02-23 23:55:42 -08:00
+								            # Guard against tools returning absurdly large content that would
 								            # blow up the context window. 100K chars ≈ 25K tokens — generous
 								            # enough for any reasonable tool output but prevents catastrophic
 								            # context explosions (e.g. accidental base64 image dumps).
 								            MAX_TOOL_RESULT_CHARS = 100_000
 								            if len(function_result) > MAX_TOOL_RESULT_CHARS:
 								                original_len = len(function_result)
 								                function_result = (
 								                    function_result[:MAX_TOOL_RESULT_CHARS]
 								                    + f"\n\n[Truncated: tool response was {original_len:,} chars, "
 								                    f"exceeding the {MAX_TOOL_RESULT_CHARS:,} char limit]"
 								                )
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								            tool_msg = {
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                "role": "tool",
 								                "content": function_result,
 								                "tool_call_id": tool_call.id
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								            }
 								            messages.append(tool_msg)
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
 								            if not self.quiet_mode:
-												fix: verbose mode shows full untruncated output

* fix(cli): silence tirith prefetch install warnings at startup

* fix: verbose mode now shows full untruncated tool args, results, content, and think blocks

When tool progress is set to 'verbose' (via /verbose or config), the display
was still truncating tool arguments to 100 chars, tool results to 100-200 chars,
assistant content to 100 chars, and think blocks to 5 lines. This defeated the
purpose of verbose mode.

Changes:
- Tool args: show full JSON args (not truncated to log_prefix_chars)
- Tool results: show full result content in both display and debug logs
- Assistant content: show full content during tool-call loops
- Think blocks: show full reasoning text (not truncated to 5 lines/100 chars)
- Auto-enable reasoning display when verbose mode is active
- Fix initial agent creation to respect verbose config (was always quiet_mode=True)
- Updated verbose label to mention think blocks
											
										
										
											2026-03-15 20:03:37 -07:00
+								                if self.verbose_logging:
 								                    print(f"  ✅ Tool {i} completed in {tool_duration:.2f}s")
 								                    print(f"     Result: {function_result}")
 								                else:
 								                    response_preview = function_result[:self.log_prefix_chars] + "..." if len(function_result) > self.log_prefix_chars else function_result
 								                    print(f"  ✅ Tool {i} completed in {tool_duration:.2f}s - {response_preview}")
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
 								            if self._interrupt_requested and i < len(assistant_message.tool_calls):
 								                remaining = len(assistant_message.tool_calls) - i
-												fix: harden web gateway security and fix error swallowing

- Use hmac.compare_digest for timing-safe token comparison (3 endpoints)
- Default bind to 127.0.0.1 instead of 0.0.0.0
- Sanitize upload filenames with Path.name to prevent path traversal
- Add DOMPurify to sanitize marked.parse() output against XSS
- Replace add_static with authenticated media handler
- Hide token in group chats for /remote-control command
- Use ctypes.util.find_library for Opus instead of hardcoded paths
- Add force=True to 5 interrupt _vprint calls for visibility
- Log Opus decode errors and voice restart failures instead of swallowing

											
										
										
											2026-03-13 15:29:18 +03:00
+								                self._vprint(f"{self.log_prefix}⚡ Interrupt: skipping {remaining} remaining tool call(s)", force=True)
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                for skipped_tc in assistant_message.tool_calls[i:]:
-												fix: add context to interruption messages for model awareness

When the agent is interrupted, the model now receives descriptive
context instead of a generic 'Operation interrupted.' string:

- Tool skip messages include the tool name:
  '[Tool execution cancelled — terminal was skipped due to user interrupt]'
  '[Tool execution skipped — web_search was not started. User sent a new message]'

- API call interrupts include timing:
  'Operation interrupted: waiting for model response (4.2s elapsed).'

- Retry/error interrupts include retry context:
  'Operation interrupted: retrying API call after rate limit (retry 2/5).'
  'Operation interrupted: handling API error (Timeout: connection timed out).'

This helps the model understand what was happening when it was
interrupted, reducing wasted iterations spent re-discovering state.

											
										
										
											2026-03-08 18:58:23 -07:00
+								                    skipped_name = skipped_tc.function.name
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                    skip_msg = {
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                        "role": "tool",
-												fix: add context to interruption messages for model awareness

When the agent is interrupted, the model now receives descriptive
context instead of a generic 'Operation interrupted.' string:

- Tool skip messages include the tool name:
  '[Tool execution cancelled — terminal was skipped due to user interrupt]'
  '[Tool execution skipped — web_search was not started. User sent a new message]'

- API call interrupts include timing:
  'Operation interrupted: waiting for model response (4.2s elapsed).'

- Retry/error interrupts include retry context:
  'Operation interrupted: retrying API call after rate limit (retry 2/5).'
  'Operation interrupted: handling API error (Timeout: connection timed out).'

This helps the model understand what was happening when it was
interrupted, reducing wasted iterations spent re-discovering state.

											
										
										
											2026-03-08 18:58:23 -07:00
+								                        "content": f"[Tool execution skipped — {skipped_name} was not started. User sent a new message]",
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                        "tool_call_id": skipped_tc.id
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                    }
 								                    messages.append(skip_msg)
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                break
 								            if self.tool_delay > 0 and i < len(assistant_message.tool_calls):
 								                time.sleep(self.tool_delay)
-												feat: iteration budget pressure via tool result injection

Two-tier warning system that nudges the LLM as it approaches
max_iterations, injected into the last tool result JSON rather
than as a separate system message:

- Caution (70%): {"_budget_warning": "[BUDGET: 42/60...]"}
- Warning (90%): {"_budget_warning": "[BUDGET WARNING: 54/60...]"}

For JSON tool results, adds a _budget_warning field to the existing
dict. For plain text results, appends the warning as text.

Key properties:
- No system messages injected mid-conversation
- No changes to message structure
- Prompt cache stays valid
- Configurable thresholds (0.7 / 0.9)
- Can be disabled: _budget_pressure_enabled = False

Inspired by PR #421 (@Bartok9) and issue #414.
8 tests covering thresholds, edge cases, JSON and text injection.

											
										
										
											2026-03-09 05:19:42 -07:00
+								        # ── Budget pressure injection ─────────────────────────────────
 								        # After all tool calls in this turn are processed, check if we're
 								        # approaching max_iterations. If so, inject a warning into the LAST
 								        # tool result's JSON so the LLM sees it naturally when reading results.
 								        budget_warning = self._get_budget_warning(api_call_count)
 								        if budget_warning and messages and messages[-1].get("role") == "tool":
 								            last_content = messages[-1]["content"]
 								            try:
 								                parsed = json.loads(last_content)
 								                if isinstance(parsed, dict):
 								                    parsed["_budget_warning"] = budget_warning
 								                    messages[-1]["content"] = json.dumps(parsed, ensure_ascii=False)
 								                else:
 								                    messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
 								            except (json.JSONDecodeError, TypeError):
 								                messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
 								            if not self.quiet_mode:
 								                remaining = self.max_iterations - api_call_count
 								                tier = "⚠️  WARNING" if remaining <= self.max_iterations * 0.1 else "💡 CAUTION"
 								                print(f"{self.log_prefix}{tier}: {remaining} iterations remaining")
 								    def _get_budget_warning(self, api_call_count: int) -> Optional[str]:
 								        """Return a budget pressure string, or None if not yet needed.
 								        Two-tier system:
 								          - Caution (70%): nudge to consolidate work
 								          - Warning (90%): urgent, must respond now
 								        """
 								        if not self._budget_pressure_enabled or self.max_iterations <= 0:
 								            return None
 								        progress = api_call_count / self.max_iterations
 								        remaining = self.max_iterations - api_call_count
 								        if progress >= self._budget_warning_threshold:
 								            return (
 								                f"[BUDGET WARNING: Iteration {api_call_count}/{self.max_iterations}. "
 								                f"Only {remaining} iteration(s) left. "
 								                "Provide your final response NOW. No more tool calls unless absolutely critical.]"
 								            )
 								        if progress >= self._budget_caution_threshold:
 								            return (
 								                f"[BUDGET: Iteration {api_call_count}/{self.max_iterations}. "
 								                f"{remaining} iterations left. Start consolidating your work.]"
 								            )
 								        return None
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								    def _handle_max_iterations(self, messages: list, api_call_count: int) -> str:
 								        """Request a summary when max iterations are reached. Returns the final response text."""
 								        print(f"⚠️  Reached maximum iterations ({self.max_iterations}). Requesting summary...")
 								        summary_request = (
 								            "You've reached the maximum number of tool-calling iterations allowed. "
 								            "Please provide a final response summarizing what you've found and accomplished so far, "
 								            "without calling any more tools."
 								        )
 								        messages.append({"role": "user", "content": summary_request})
 								        try:
-												fix: strip internal fields from API messages in _handle_max_iterations

The flush_memories() and run_conversation() code paths already stripped
finish_reason and reasoning from API messages (added in 7a0b377 via PR
#253), but _handle_max_iterations() was missed. It was sending raw
messages.copy() which could include finish_reason, causing 422 errors
on strict APIs like Mistral when the agent hit max iterations.

Now strips the same internal fields consistently across all three API
call sites.

											
										
										
											2026-03-04 21:08:20 -08:00
+								            # Build API messages, stripping internal-only fields
 								            # (finish_reason, reasoning) that strict APIs like Mistral reject with 422
-												fix: strip call_id/response_item_id from tool_calls for Mistral compatibility

Mistral's API strictly validates the Chat Completions schema and rejects
unknown fields (call_id, response_item_id) with 422. These fields are
added by _build_assistant_message() for Codex Responses API support.

This fix:
- Only strips when targeting Mistral (api.mistral.ai in base_url)
- Creates new tool_call dicts instead of mutating originals (shallow
  copy safety — msg.copy() shares the tool_calls list)
- Preserves call_id/response_item_id in the internal message history
  so _chat_messages_to_responses_input() can still read them if the
  session falls back to a Codex provider mid-conversation

Applied in all 3 API message building locations:
- Main conversation loop (run_conversation)
- _handle_max_iterations()
- flush_memories()

Inspired by PR #864 (unmodeled-tyler) which identified the issue but
applied the fix unconditionally and mutated originals via shallow copy.

Co-authored-by: unmodeled-tyler <unmodeled.tyler@proton.me>

											
										
										
											2026-03-12 08:18:27 -07:00
+								            _is_strict_api = "api.mistral.ai" in self.base_url.lower()
-												fix: strip internal fields from API messages in _handle_max_iterations

The flush_memories() and run_conversation() code paths already stripped
finish_reason and reasoning from API messages (added in 7a0b377 via PR
#253), but _handle_max_iterations() was missed. It was sending raw
messages.copy() which could include finish_reason, causing 422 errors
on strict APIs like Mistral when the agent hit max iterations.

Now strips the same internal fields consistently across all three API
call sites.

											
										
										
											2026-03-04 21:08:20 -08:00
+								            api_messages = []
 								            for msg in messages:
 								                api_msg = msg.copy()
 								                for internal_field in ("reasoning", "finish_reason"):
 								                    api_msg.pop(internal_field, None)
-												fix: strip call_id/response_item_id from tool_calls for Mistral compatibility

Mistral's API strictly validates the Chat Completions schema and rejects
unknown fields (call_id, response_item_id) with 422. These fields are
added by _build_assistant_message() for Codex Responses API support.

This fix:
- Only strips when targeting Mistral (api.mistral.ai in base_url)
- Creates new tool_call dicts instead of mutating originals (shallow
  copy safety — msg.copy() shares the tool_calls list)
- Preserves call_id/response_item_id in the internal message history
  so _chat_messages_to_responses_input() can still read them if the
  session falls back to a Codex provider mid-conversation

Applied in all 3 API message building locations:
- Main conversation loop (run_conversation)
- _handle_max_iterations()
- flush_memories()

Inspired by PR #864 (unmodeled-tyler) which identified the issue but
applied the fix unconditionally and mutated originals via shallow copy.

Co-authored-by: unmodeled-tyler <unmodeled.tyler@proton.me>

											
										
										
											2026-03-12 08:18:27 -07:00
+								                if _is_strict_api:
 								                    self._sanitize_tool_calls_for_strict_api(api_msg)
-												fix: strip internal fields from API messages in _handle_max_iterations

The flush_memories() and run_conversation() code paths already stripped
finish_reason and reasoning from API messages (added in 7a0b377 via PR
#253), but _handle_max_iterations() was missed. It was sending raw
messages.copy() which could include finish_reason, causing 422 errors
on strict APIs like Mistral when the agent hit max iterations.

Now strips the same internal fields consistently across all three API
call sites.

											
										
										
											2026-03-04 21:08:20 -08:00
+								                api_messages.append(api_msg)
-												feat: add ephemeral prefill messages and system prompt loading

- Implemented functionality to load ephemeral prefill messages from a JSON file, enhancing few-shot priming capabilities for the agent.
- Introduced a mechanism to load an ephemeral system prompt from environment variables or configuration files, ensuring dynamic prompt adjustments at API-call time.
- Updated the CLI and agent initialization to utilize the new prefill messages and system prompt, improving the overall interaction experience.
- Enhanced configuration options with new environment variables for prefill messages and system prompts, allowing for greater customization without persistence.

											
										
										
											2026-02-23 23:55:42 -08:00
+								            effective_system = self._cached_system_prompt or ""
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								            if self.ephemeral_system_prompt:
-												feat: add ephemeral prefill messages and system prompt loading

- Implemented functionality to load ephemeral prefill messages from a JSON file, enhancing few-shot priming capabilities for the agent.
- Introduced a mechanism to load an ephemeral system prompt from environment variables or configuration files, ensuring dynamic prompt adjustments at API-call time.
- Updated the CLI and agent initialization to utilize the new prefill messages and system prompt, improving the overall interaction experience.
- Enhanced configuration options with new environment variables for prefill messages and system prompts, allowing for greater customization without persistence.

											
										
										
											2026-02-23 23:55:42 -08:00
+								                effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
 								            if effective_system:
 								                api_messages = [{"role": "system", "content": effective_system}] + api_messages
 								            if self.prefill_messages:
 								                sys_offset = 1 if effective_system else 0
 								                for idx, pfm in enumerate(self.prefill_messages):
 								                    api_messages.insert(sys_offset + idx, pfm.copy())
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
 								            summary_extra_body = {}
-												refactor: streamline reasoning configuration checks in AIAgent

- Simplified the logic for determining support for reasoning based on the base URL by introducing clearer variable names.
- Added product attribution for the Nous Portal to the extra body of requests when applicable, enhancing tagging for better tracking.

											
										
										
											2026-02-25 16:49:41 -08:00
+								            _is_openrouter = "openrouter" in self.base_url.lower()
 								            _is_nous = "nousresearch" in self.base_url.lower()
 								            if _is_openrouter or _is_nous:
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                if self.reasoning_config is not None:
 								                    summary_extra_body["reasoning"] = self.reasoning_config
 								                else:
 								                    summary_extra_body["reasoning"] = {
 								                        "enabled": True,
-												feat: default reasoning effort from xhigh to medium

Reduces token usage and latency for most tasks by defaulting to
medium reasoning effort instead of xhigh. Users can still override
via config or CLI flag. Updates code, tests, example config, and docs.

											
										
										
											2026-03-07 10:14:19 -08:00
+								                        "effort": "medium"
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                    }
-												refactor: streamline reasoning configuration checks in AIAgent

- Simplified the logic for determining support for reasoning based on the base URL by introducing clearer variable names.
- Added product attribution for the Nous Portal to the extra body of requests when applicable, enhancing tagging for better tracking.

											
										
										
											2026-02-25 16:49:41 -08:00
+								            if _is_nous:
 								                summary_extra_body["tags"] = ["product=hermes-agent"]
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								            if self.api_mode == "codex_responses":
 								                codex_kwargs = self._build_api_kwargs(api_messages)
-												fix: NoneType not iterable error when summarizing at max iterations

In _handle_max_iterations, the codex_responses path set tools=None to
prevent tool calls during summarization. However, the OpenAI SDK's
_make_tools() treats None as a valid value (not its Omit sentinel) and
tries to iterate over it, causing TypeError: 'NoneType' object is not
iterable.

Fix: use codex_kwargs.pop('tools', None) to remove the key entirely,
so the SDK never receives it and uses its default omit behavior.

Fixes #300

											
										
										
											2026-03-03 03:42:44 -08:00
+								                codex_kwargs.pop("tools", None)
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								                summary_response = self._run_codex_stream(codex_kwargs)
 								                assistant_message, _ = self._normalize_codex_response(summary_response)
 								                final_response = (assistant_message.content or "").strip() if assistant_message else ""
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								            else:
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                summary_kwargs = {
 								                    "model": self.model,
 								                    "messages": api_messages,
 								                }
 								                if self.max_tokens is not None:
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								                    summary_kwargs.update(self._max_tokens_param(self.max_tokens))
-												feat(provider-routing): add OpenRouter provider routing configuration

Introduced a new `provider_routing` section in the CLI configuration to control how requests are routed across providers when using OpenRouter. This includes options for sorting providers by throughput, latency, or price, as well as allowing or ignoring specific providers, setting the order of provider attempts, and managing data collection policies. Updated relevant classes and documentation to support these features, enhancing flexibility in provider selection.

											
										
										
											2026-03-01 18:24:27 -08:00
 								                # Include provider routing preferences
 								                provider_preferences = {}
 								                if self.providers_allowed:
 								                    provider_preferences["only"] = self.providers_allowed
 								                if self.providers_ignored:
 								                    provider_preferences["ignore"] = self.providers_ignored
 								                if self.providers_order:
 								                    provider_preferences["order"] = self.providers_order
 								                if self.provider_sort:
 								                    provider_preferences["sort"] = self.provider_sort
 								                if provider_preferences:
 								                    summary_extra_body["provider"] = provider_preferences
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                if summary_extra_body:
 								                    summary_kwargs["extra_body"] = summary_extra_body
-												fix(anthropic): final polish — tool ID sanitization, crash guards, temp=1

Remaining issues from deep scan:

Adapter (agent/anthropic_adapter.py):
- Add _sanitize_tool_id() — Anthropic requires IDs matching [a-zA-Z0-9_-],
  now strips invalid chars and ensures non-empty (both tool_use and tool_result)
- Empty tool result content → '(no output)' placeholder (Anthropic rejects empty)
- Set temperature=1 when thinking type='enabled' on older models (required)
- normalize_model_name now case-insensitive for 'Anthropic/' prefix
- Fix stale docstrings referencing only ~/.claude/.credentials.json

Agent loop (run_agent.py):
- Guard memory flush path (line ~2684) — was calling self.client.chat.completions
  which is None in anthropic_messages mode. Now routes through Anthropic client.
- Guard summary generation path (line ~3171) — same crash when reaching
  iteration limit. Now builds proper Anthropic kwargs and normalizes response.
- Guard retry summary path (line ~3200) — same fix for the summary retry loop.

All three self.client.chat.completions.create() calls outside the main
loop now have anthropic_messages branches to prevent NoneType crashes.

											
										
										
											2026-03-12 17:23:09 -07:00
+								                if self.api_mode == "anthropic_messages":
 								                    from agent.anthropic_adapter import build_anthropic_kwargs as _bak, normalize_anthropic_response as _nar
 								                    _ant_kw = _bak(model=self.model, messages=api_messages, tools=None,
 								                                   max_tokens=self.max_tokens, reasoning_config=self.reasoning_config)
-												fix: preflight Anthropic auth and prefer Claude store

											
										
										
											2026-03-14 19:38:55 -07:00
+								                    summary_response = self._anthropic_messages_create(_ant_kw)
-												fix(anthropic): final polish — tool ID sanitization, crash guards, temp=1

Remaining issues from deep scan:

Adapter (agent/anthropic_adapter.py):
- Add _sanitize_tool_id() — Anthropic requires IDs matching [a-zA-Z0-9_-],
  now strips invalid chars and ensures non-empty (both tool_use and tool_result)
- Empty tool result content → '(no output)' placeholder (Anthropic rejects empty)
- Set temperature=1 when thinking type='enabled' on older models (required)
- normalize_model_name now case-insensitive for 'Anthropic/' prefix
- Fix stale docstrings referencing only ~/.claude/.credentials.json

Agent loop (run_agent.py):
- Guard memory flush path (line ~2684) — was calling self.client.chat.completions
  which is None in anthropic_messages mode. Now routes through Anthropic client.
- Guard summary generation path (line ~3171) — same crash when reaching
  iteration limit. Now builds proper Anthropic kwargs and normalizes response.
- Guard retry summary path (line ~3200) — same fix for the summary retry loop.

All three self.client.chat.completions.create() calls outside the main
loop now have anthropic_messages branches to prevent NoneType crashes.

											
										
										
											2026-03-12 17:23:09 -07:00
+								                    _msg, _ = _nar(summary_response)
 								                    final_response = (_msg.content or "").strip()
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								                else:
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                    summary_response = self._ensure_primary_openai_client(reason="iteration_limit_summary").chat.completions.create(**summary_kwargs)
-												fix(anthropic): final polish — tool ID sanitization, crash guards, temp=1

Remaining issues from deep scan:

Adapter (agent/anthropic_adapter.py):
- Add _sanitize_tool_id() — Anthropic requires IDs matching [a-zA-Z0-9_-],
  now strips invalid chars and ensures non-empty (both tool_use and tool_result)
- Empty tool result content → '(no output)' placeholder (Anthropic rejects empty)
- Set temperature=1 when thinking type='enabled' on older models (required)
- normalize_model_name now case-insensitive for 'Anthropic/' prefix
- Fix stale docstrings referencing only ~/.claude/.credentials.json

Agent loop (run_agent.py):
- Guard memory flush path (line ~2684) — was calling self.client.chat.completions
  which is None in anthropic_messages mode. Now routes through Anthropic client.
- Guard summary generation path (line ~3171) — same crash when reaching
  iteration limit. Now builds proper Anthropic kwargs and normalizes response.
- Guard retry summary path (line ~3200) — same fix for the summary retry loop.

All three self.client.chat.completions.create() calls outside the main
loop now have anthropic_messages branches to prevent NoneType crashes.

											
										
										
											2026-03-12 17:23:09 -07:00
 								                    if summary_response.choices and summary_response.choices[0].message.content:
 								                        final_response = summary_response.choices[0].message.content
 								                    else:
 								                        final_response = ""
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
 								            if final_response:
 								                if "<think>" in final_response:
 								                    final_response = re.sub(r'<think>.*?</think>\s*', '', final_response, flags=re.DOTALL).strip()
 								                if final_response:
 								                    messages.append({"role": "assistant", "content": final_response})
 								                else:
 								                    final_response = "I reached the iteration limit and couldn't generate a summary."
 								            else:
 								                # Retry summary generation
 								                if self.api_mode == "codex_responses":
 								                    codex_kwargs = self._build_api_kwargs(api_messages)
-												fix: NoneType not iterable error when summarizing at max iterations

In _handle_max_iterations, the codex_responses path set tools=None to
prevent tool calls during summarization. However, the OpenAI SDK's
_make_tools() treats None as a valid value (not its Omit sentinel) and
tries to iterate over it, causing TypeError: 'NoneType' object is not
iterable.

Fix: use codex_kwargs.pop('tools', None) to remove the key entirely,
so the SDK never receives it and uses its default omit behavior.

Fixes #300

											
										
										
											2026-03-03 03:42:44 -08:00
+								                    codex_kwargs.pop("tools", None)
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								                    retry_response = self._run_codex_stream(codex_kwargs)
 								                    retry_msg, _ = self._normalize_codex_response(retry_response)
 								                    final_response = (retry_msg.content or "").strip() if retry_msg else ""
-												fix(anthropic): final polish — tool ID sanitization, crash guards, temp=1

Remaining issues from deep scan:

Adapter (agent/anthropic_adapter.py):
- Add _sanitize_tool_id() — Anthropic requires IDs matching [a-zA-Z0-9_-],
  now strips invalid chars and ensures non-empty (both tool_use and tool_result)
- Empty tool result content → '(no output)' placeholder (Anthropic rejects empty)
- Set temperature=1 when thinking type='enabled' on older models (required)
- normalize_model_name now case-insensitive for 'Anthropic/' prefix
- Fix stale docstrings referencing only ~/.claude/.credentials.json

Agent loop (run_agent.py):
- Guard memory flush path (line ~2684) — was calling self.client.chat.completions
  which is None in anthropic_messages mode. Now routes through Anthropic client.
- Guard summary generation path (line ~3171) — same crash when reaching
  iteration limit. Now builds proper Anthropic kwargs and normalizes response.
- Guard retry summary path (line ~3200) — same fix for the summary retry loop.

All three self.client.chat.completions.create() calls outside the main
loop now have anthropic_messages branches to prevent NoneType crashes.

											
										
										
											2026-03-12 17:23:09 -07:00
+								                elif self.api_mode == "anthropic_messages":
 								                    from agent.anthropic_adapter import build_anthropic_kwargs as _bak2, normalize_anthropic_response as _nar2
 								                    _ant_kw2 = _bak2(model=self.model, messages=api_messages, tools=None,
 								                                     max_tokens=self.max_tokens, reasoning_config=self.reasoning_config)
-												fix: preflight Anthropic auth and prefer Claude store

											
										
										
											2026-03-14 19:38:55 -07:00
+								                    retry_response = self._anthropic_messages_create(_ant_kw2)
-												fix(anthropic): final polish — tool ID sanitization, crash guards, temp=1

Remaining issues from deep scan:

Adapter (agent/anthropic_adapter.py):
- Add _sanitize_tool_id() — Anthropic requires IDs matching [a-zA-Z0-9_-],
  now strips invalid chars and ensures non-empty (both tool_use and tool_result)
- Empty tool result content → '(no output)' placeholder (Anthropic rejects empty)
- Set temperature=1 when thinking type='enabled' on older models (required)
- normalize_model_name now case-insensitive for 'Anthropic/' prefix
- Fix stale docstrings referencing only ~/.claude/.credentials.json

Agent loop (run_agent.py):
- Guard memory flush path (line ~2684) — was calling self.client.chat.completions
  which is None in anthropic_messages mode. Now routes through Anthropic client.
- Guard summary generation path (line ~3171) — same crash when reaching
  iteration limit. Now builds proper Anthropic kwargs and normalizes response.
- Guard retry summary path (line ~3200) — same fix for the summary retry loop.

All three self.client.chat.completions.create() calls outside the main
loop now have anthropic_messages branches to prevent NoneType crashes.

											
										
										
											2026-03-12 17:23:09 -07:00
+								                    _retry_msg, _ = _nar2(retry_response)
 								                    final_response = (_retry_msg.content or "").strip()
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								                else:
 								                    summary_kwargs = {
 								                        "model": self.model,
 								                        "messages": api_messages,
 								                    }
 								                    if self.max_tokens is not None:
-												fix: use _max_tokens_param in max-iterations retry path

The retry summary in _handle_max_iterations hardcodes max_tokens instead
of calling _max_tokens_param(). For direct OpenAI API users (gpt-4o,
o-series), the correct parameter name is max_completion_tokens. The first
attempt at line 2697 already uses _max_tokens_param correctly but the
retry path at line 2743 was missed.

											
										
										
											2026-03-05 17:49:37 +03:00
+								                        summary_kwargs.update(self._max_tokens_param(self.max_tokens))
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
+								                    if summary_extra_body:
 								                        summary_kwargs["extra_body"] = summary_extra_body
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                    summary_response = self._ensure_primary_openai_client(reason="iteration_limit_summary_retry").chat.completions.create(**summary_kwargs)
-												refactor(cli): Finalize OpenAI Codex Integration with OAuth

- Enhanced Codex model discovery by fetching available models from the API, with fallback to local cache and defaults.
- Updated the context compressor's summary target tokens to 2500 for improved performance.
- Added external credential detection for Codex CLI to streamline authentication.
- Refactored various components to ensure consistent handling of authentication and model selection across the application.

											
										
										
											2026-02-28 21:47:51 -08:00
 								                    if summary_response.choices and summary_response.choices[0].message.content:
 								                        final_response = summary_response.choices[0].message.content
 								                    else:
 								                        final_response = ""
 								                if final_response:
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                    if "<think>" in final_response:
 								                        final_response = re.sub(r'<think>.*?</think>\s*', '', final_response, flags=re.DOTALL).strip()
-												fix: add missing empty-content guard after think-block stripping in retry path

											
										
										
											2026-03-05 18:57:59 +03:00
+								                    if final_response:
 								                        messages.append({"role": "assistant", "content": final_response})
 								                    else:
 								                        final_response = "I reached the iteration limit and couldn't generate a summary."
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                else:
 								                    final_response = "I reached the iteration limit and couldn't generate a summary."
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
 								        except Exception as e:
 								            logging.warning(f"Failed to get summary response: {e}")
 								            final_response = f"I reached the maximum iterations ({self.max_iterations}) but couldn't summarize. Error: {str(e)}"
 								        return final_response
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								    def run_conversation(
-												prevent leakage of morph instances between tasks

											
										
										
											2025-11-04 03:32:43 -05:00
+								        self,
 								        user_message: str,
 								        system_message: str = None,
 								        conversation_history: List[Dict[str, Any]] = None,
-												feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).

											
										
										
											2026-03-03 23:03:42 +03:00
+								        task_id: str = None,
 								        stream_callback: Optional[callable] = None,
-												fix: persist clean voice transcripts and /voice off state

- keep CLI voice prefixes API-local while storing the original user text
- persist explicit gateway off state and restore adapter auto-TTS suppression on restart
- add regression coverage for both behaviors

											
										
										
											2026-03-14 06:14:22 -07:00
+								        persist_user_message: Optional[str] = None,
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								    ) -> Dict[str, Any]:
 								        """
 								        Run a complete conversation with tool calling until completion.
-												prevent leakage of morph instances between tasks

											
										
										
											2025-11-04 03:32:43 -05:00
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        Args:
 								            user_message (str): The user's message/question
-												Add ephemeral system prompt support in batch and agent runners. Update README with usage examples and documentation for the new feature. Ensure prompt is not saved to trajectories.

											
										
										
											2025-10-08 02:33:58 +00:00
+								            system_message (str): Custom system message (optional, overrides ephemeral_system_prompt if provided)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								            conversation_history (List[Dict]): Previous conversation messages (optional)
-												prevent leakage of morph instances between tasks

											
										
										
											2025-11-04 03:32:43 -05:00
+								            task_id (str): Unique identifier for this task to isolate VMs between concurrent tasks (optional, auto-generated if not provided)
-												feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).

											
										
										
											2026-03-03 23:03:42 +03:00
+								            stream_callback: Optional callback invoked with each text delta during streaming.
 								                Used by the TTS pipeline to start audio generation before the full response.
 								                When None (default), API calls use the standard non-streaming path.
-												fix: persist clean voice transcripts and /voice off state

- keep CLI voice prefixes API-local while storing the original user text
- persist explicit gateway off state and restore adapter auto-TTS suppression on restart
- add regression coverage for both behaviors

											
										
										
											2026-03-14 06:14:22 -07:00
+								            persist_user_message: Optional clean user message to store in
 								                transcripts/history when user_message contains API-only
 								                synthetic prefixes.
-												prevent leakage of morph instances between tasks

											
										
										
											2025-11-04 03:32:43 -05:00
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        Returns:
 								            Dict: Complete conversation result with final response and message history
 								        """
-												fix: guard init-time stdio writes

											
										
										
											2026-03-14 02:19:46 -07:00
+								        # Guard stdio against OSError from broken pipes (systemd/headless/daemon).
 								        # Installed once, transparent when streams are healthy, prevents crash on write.
 								        _install_safe_stdio()
-												fix: guard all print() calls against OSError with _SafeWriter

When hermes-agent runs as a systemd service, Docker container, or
headless daemon, the stdout pipe can become unavailable (idle timeout,
buffer exhaustion, socket reset). Any print() call then raises
OSError: [Errno 5] Input/output error, crashing run_conversation()
and causing cron jobs to fail.

Rather than wrapping individual print() calls (68 in run_conversation
alone), this adds a transparent _SafeWriter wrapper installed once at
the start of run_conversation(). It delegates all writes to the real
stdout and silently catches OSError. Zero overhead on the happy path,
comprehensive coverage of all print calls including future ones.

Fixes #845

Co-authored-by: J0hnLawMississippi <J0hnLawMississippi@users.noreply.github.com>

											
										
										
											2026-03-11 09:19:10 -07:00
-												feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).

											
										
										
											2026-03-03 23:03:42 +03:00
+								        # Store stream callback for _interruptible_api_call to pick up
 								        self._stream_callback = stream_callback
-												fix: persist clean voice transcripts and /voice off state

- keep CLI voice prefixes API-local while storing the original user text
- persist explicit gateway off state and restore adapter auto-TTS suppression on restart
- add regression coverage for both behaviors

											
										
										
											2026-03-14 06:14:22 -07:00
+								        self._persist_user_message_idx = None
 								        self._persist_user_message_override = persist_user_message
-												prevent leakage of morph instances between tasks

											
										
										
											2025-11-04 03:32:43 -05:00
+								        # Generate unique task_id if not provided to isolate VMs between concurrent tasks
 								        effective_task_id = task_id or str(uuid.uuid4())
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
-												Critical: fix the new subagent tool-call budget so it is scoped to each tool-call loop rather than to the whole session

											
										
										
											2026-03-07 10:32:51 -08:00
+								        # Reset retry counters and iteration budget at the start of each turn
 								        # so subagent usage from a previous turn doesn't eat into the next one.
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								        self._invalid_tool_retries = 0
 								        self._invalid_json_retries = 0
 								        self._empty_content_retries = 0
-												fix: reset all retry counters at start of run_conversation()

_incomplete_scratchpad_retries and _codex_incomplete_retries were not
reset at the start of run_conversation(). In CLI mode, where the same
AIAgent instance is reused across conversations, stale counters from
a previous conversation could carry over, causing premature retry
exhaustion and partial responses.

											
										
										
											2026-03-07 20:12:08 +03:00
+								        self._incomplete_scratchpad_retries = 0
 								        self._codex_incomplete_retries = 0
-												feat: enhance AIAgent's tool usage nudges and content handling

- Introduced a method to strip <think> blocks from content, improving text visibility.
- Implemented counters to reset nudge intervals when memory and skill tools are used, enhancing user guidance.
- Captured content from turns with tool calls to provide fallback responses, ensuring continuity in conversation.
- Updated nudge logic to remind users about saving memories and creating skills based on interaction patterns.

											
										
										
											2026-02-22 21:33:28 -08:00
+								        self._last_content_with_tools = None
 								        self._turns_since_memory = 0
 								        self._iters_since_skill = 0
-												Critical: fix the new subagent tool-call budget so it is scoped to each tool-call loop rather than to the whole session

											
										
										
											2026-03-07 10:32:51 -08:00
+								        self.iteration_budget = IterationBudget(self.max_iterations)
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
-												fix(agent): copy conversation_history to avoid mutating caller's list

											
										
										
											2026-03-01 03:06:13 +03:00
+								        # Initialize conversation (copy to avoid mutating the caller's list)
 								        messages = list(conversation_history) if conversation_history else []
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												Add todo tool for agent task planning and management

Single `todo` tool that reads (no params) or writes (provide todos array
with merge flag). In-memory TodoStore on AIAgent, no system prompt
mutation, behavioral guidance in tool description only. State re-injected
after context compression events. Gateway sessions hydrate from
conversation history. Added to all platform toolsets.

Also wired into RL agent_loop.py with per-run TodoStore and fixed
browser_snapshot user_task passthrough from first user message.

											
										
										
											2026-02-17 17:02:33 -08:00
+								        # Hydrate todo store from conversation history (gateway creates a fresh
 								        # AIAgent per message, so the in-memory store is empty -- we need to
 								        # recover the todo state from the most recent todo tool response in history)
 								        if conversation_history and not self._todo_store.has_items():
 								            self._hydrate_todo_store(conversation_history)
-												feat: add ephemeral prefill messages and system prompt loading

- Implemented functionality to load ephemeral prefill messages from a JSON file, enhancing few-shot priming capabilities for the agent.
- Introduced a mechanism to load an ephemeral system prompt from environment variables or configuration files, ensuring dynamic prompt adjustments at API-call time.
- Updated the CLI and agent initialization to utilize the new prefill messages and system prompt, improving the overall interaction experience.
- Enhanced configuration options with new environment variables for prefill messages and system prompts, allowing for greater customization without persistence.

											
										
										
											2026-02-23 23:55:42 -08:00
+								        # Prefill messages (few-shot priming) are injected at API-call time only,
 								        # never stored in the messages list. This keeps them ephemeral: they won't
 								        # be saved to session DB, session logs, or batch trajectories, but they're
 								        # automatically re-applied on every API call (including session continuations).
-												Enhance BatchRunner and AIAgent with new configuration options; the default model is now opus 4.6 and the default summarizer is gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								        # Track user turns for memory flush and periodic nudge logic
 								        self._user_turn_count += 1
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								        # Preserve the original user message before nudge injection.
 								        # Honcho should receive the actual user input, not system nudges.
-												fix: persist clean voice transcripts and /voice off state

- keep CLI voice prefixes API-local while storing the original user text
- persist explicit gateway off state and restore adapter auto-TTS suppression on restart
- add regression coverage for both behaviors

											
										
										
											2026-03-14 06:14:22 -07:00
+								        original_user_message = persist_user_message if persist_user_message is not None else user_message
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
-												feat: enhance AIAgent's tool usage nudges and content handling

- Introduced a method to strip <think> blocks from content, improving text visibility.
- Implemented counters to reset nudge intervals when memory and skill tools are used, enhancing user guidance.
- Captured content from turns with tool calls to provide fallback responses, ensuring continuity in conversation.
- Updated nudge logic to remind the model about saving memories and creating skills based on interaction patterns.

											
										
										
											2026-02-22 21:33:28 -08:00
+								        # Periodic memory nudge: remind the model to consider saving memories.
 								        # Counter resets whenever the memory tool is actually used.
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								        if (self._memory_nudge_interval > 0
 								                and "memory" in self.valid_tool_names
 								                and self._memory_store):
-												feat: enhance AIAgent's tool usage nudges and content handling

- Introduced a method to strip <think> blocks from content, improving text visibility.
- Implemented counters to reset nudge intervals when memory and skill tools are used, enhancing user guidance.
- Captured content from turns with tool calls to provide fallback responses, ensuring continuity in conversation.
- Updated nudge logic to remind the model about saving memories and creating skills based on interaction patterns.

											
										
										
											2026-02-22 21:33:28 -08:00
+								            self._turns_since_memory += 1
 								            if self._turns_since_memory >= self._memory_nudge_interval:
 								                user_message += (
 								                    "\n\n[System: You've had several exchanges in this session. "
 								                    "Consider whether there's anything worth saving to your memories.]"
 								                )
 								                self._turns_since_memory = 0
 								        # Skill creation nudge: fires on the first user message after a long tool loop.
 								        # The counter increments per API iteration in the tool loop and is checked here.
 								        if (self._skill_nudge_interval > 0
 								                and self._iters_since_skill >= self._skill_nudge_interval
 								                and "skill_manage" in self.valid_tool_names):
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								            user_message += (
-												feat: enhance AIAgent's tool usage nudges and content handling

- Introduced a method to strip <think> blocks from content, improving text visibility.
- Implemented counters to reset nudge intervals when memory and skill tools are used, enhancing user guidance.
- Captured content from turns with tool calls to provide fallback responses, ensuring continuity in conversation.
- Updated nudge logic to remind the model about saving memories and creating skills based on interaction patterns.

											
										
										
											2026-02-22 21:33:28 -08:00
+								                "\n\n[System: The previous task involved many steps. "
 								                "If you discovered a reusable workflow, consider saving it as a skill.]"
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
+								            )
-												feat: enhance AIAgent's tool usage nudges and content handling

- Introduced a method to strip <think> blocks from content, improving text visibility.
- Implemented counters to reset nudge intervals when memory and skill tools are used, enhancing user guidance.
- Captured content from turns with tool calls to provide fallback responses, ensuring continuity in conversation.
- Updated nudge logic to remind the model about saving memories and creating skills based on interaction patterns.

											
										
										
											2026-02-22 21:33:28 -08:00
+								            self._iters_since_skill = 0
-												feat: enhance memory management features in AIAgent and CLI

- Added configuration options for memory nudge interval and flush minimum turns in cli-config.yaml.example.
- Implemented memory flushing before conversation reset, clearing, and exit in the CLI to ensure memories are saved.
- Introduced a flush_memories method in AIAgent to handle memory persistence before context loss.
- Added periodic nudges to remind the agent to consider saving memories based on user interactions.

											
										
										
											2026-02-22 10:15:17 -08:00
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								        # Honcho prefetch consumption:
 								        # - First turn: bake into cached system prompt (stable for the session).
-												fix: keep honcho recall out of cached system prefix (#1201)

Attach later-turn Honcho recall to the current-turn user message at API
call time instead of appending it to the system prompt. This preserves the
stable system-prefix cache while keeping Honcho continuity context
available for the turn.

Also adds regression coverage for the injection helper and for continuing
sessions so Honcho recall stays out of the system prompt.
											
										
										
											2026-03-13 21:07:00 -07:00
+								        # - Later turns: attach recall to the current-turn user message at
 								        #   API-call time only (never persisted to history / session DB).
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								        #
-												fix: keep honcho recall out of cached system prefix (#1201)

Attach later-turn Honcho recall to the current-turn user message at API
call time instead of appending it to the system prompt. This preserves the
stable system-prefix cache while keeping Honcho continuity context
available for the turn.

Also adds regression coverage for the injection helper and for continuing
sessions so Honcho recall stays out of the system prompt.
											
										
										
											2026-03-13 21:07:00 -07:00
+								        # This keeps the system-prefix cache stable while still allowing turn N
 								        # to consume background prefetch results from turn N-1.
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								        self._honcho_context = ""
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								        self._honcho_turn_context = ""
-												refactor(honcho): rename recallMode "auto" to "hybrid"

Matches the mental model: hybrid = context + tools,
context = context only, tools = tools only.

											
										
										
											2026-03-09 17:46:51 -04:00
+								        _recall_mode = (self._honcho_config.recall_mode if self._honcho_config else "hybrid")
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								        if self._honcho and self._honcho_session_key and _recall_mode != "tools":
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								            try:
-												fix: persist clean voice transcripts and /voice off state

- keep CLI voice prefixes API-local while storing the original user text
- persist explicit gateway off state and restore adapter auto-TTS suppression on restart
- add regression coverage for both behaviors

											
										
										
											2026-03-14 06:14:22 -07:00
+								                prefetched_context = self._honcho_prefetch(original_user_message)
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								                if prefetched_context:
 								                    if not conversation_history:
 								                        self._honcho_context = prefetched_context
 								                    else:
 								                        self._honcho_turn_context = prefetched_context
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
+								            except Exception as e:
 								                logger.debug("Honcho prefetch failed (non-fatal): %s", e)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        # Add user message
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								        user_msg = {"role": "user", "content": user_message}
 								        messages.append(user_msg)
-												fix: keep honcho recall out of cached system prefix (#1201)

Attach later-turn Honcho recall to the current-turn user message at API
call time instead of appending it to the system prompt. This preserves the
stable system-prefix cache while keeping Honcho continuity context
available for the turn.

Also adds regression coverage for the injection helper and for continuing
sessions so Honcho recall stays out of the system prompt.
											
										
										
											2026-03-13 21:07:00 -07:00
+								        current_turn_user_idx = len(messages) - 1
-												fix: persist clean voice transcripts and /voice off state

- keep CLI voice prefixes API-local while storing the original user text
- persist explicit gateway off state and restore adapter auto-TTS suppression on restart
- add regression coverage for both behaviors

											
										
										
											2026-03-14 06:14:22 -07:00
+								        self._persist_user_message_idx = current_turn_user_idx
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								        if not self.quiet_mode:
 								            print(f"💬 Starting conversation: '{user_message[:60]}{'...' if len(user_message) > 60 else ''}'")
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
+								        # ── System prompt (cached per session for prefix caching) ──
 								        # Built once on first call, reused for all subsequent calls.
 								        # Only rebuilt after context compression events (which invalidate
 								        # the cache and reload memory from disk).
-												fix: stabilize system prompt across gateway turns for cache hits

Two changes to prevent unnecessary Anthropic prompt cache misses in the
gateway, where a fresh AIAgent is created per user message:

1. Reuse stored system prompt for continuing sessions:
   When conversation_history is non-empty, load the system prompt from
   the session DB instead of rebuilding from disk. The model already has
   updated memory in its conversation history (it wrote it!), so
   re-reading memory from disk produces a different system prompt that
   breaks the cache prefix.

2. Stabilize Honcho context per session:
   - Only prefetch Honcho context on the first turn (empty history)
   - Bake Honcho context into the cached system prompt and store to DB
   - Remove the per-turn Honcho injection from the API call loop

   This ensures the system message is identical across all turns in a
   session. Previously, re-fetching Honcho could return different context
   on each turn, changing the system message and invalidating the cache.

Both changes preserve the existing behavior for compression (which
invalidates the prompt and rebuilds from scratch) and for the CLI
(where the same AIAgent persists and the cached prompt is already
stable across turns).

Tests: 2556 passed (6 new)

											
										
										
											2026-03-09 01:50:58 -07:00
+								        #
 								        # For continuing sessions (gateway creates a fresh AIAgent per
 								        # message), we load the stored system prompt from the session DB
 								        # instead of rebuilding.  Rebuilding would pick up memory changes
 								        # from disk that the model already knows about (it wrote them!),
 								        # producing a different system prompt and breaking the Anthropic
 								        # prefix cache.
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
+								        if self._cached_system_prompt is None:
-												fix: stabilize system prompt across gateway turns for cache hits

Two changes to prevent unnecessary Anthropic prompt cache misses in the
gateway, where a fresh AIAgent is created per user message:

1. Reuse stored system prompt for continuing sessions:
   When conversation_history is non-empty, load the system prompt from
   the session DB instead of rebuilding from disk. The model already has
   updated memory in its conversation history (it wrote it!), so
   re-reading memory from disk produces a different system prompt that
   breaks the cache prefix.

2. Stabilize Honcho context per session:
   - Only prefetch Honcho context on the first turn (empty history)
   - Bake Honcho context into the cached system prompt and store to DB
   - Remove the per-turn Honcho injection from the API call loop

   This ensures the system message is identical across all turns in a
   session. Previously, re-fetching Honcho could return different context
   on each turn, changing the system message and invalidating the cache.

Both changes preserve the existing behavior for compression (which
invalidates the prompt and rebuilds from scratch) and for the CLI
(where the same AIAgent persists and the cached prompt is already
stable across turns).

Tests: 2556 passed (6 new)

											
										
										
											2026-03-09 01:50:58 -07:00
+								            stored_prompt = None
 								            if conversation_history and self._session_db:
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
+								                try:
-												fix: stabilize system prompt across gateway turns for cache hits

Two changes to prevent unnecessary Anthropic prompt cache misses in the
gateway, where a fresh AIAgent is created per user message:

1. Reuse stored system prompt for continuing sessions:
   When conversation_history is non-empty, load the system prompt from
   the session DB instead of rebuilding from disk. The model already has
   updated memory in its conversation history (it wrote it!), so
   re-reading memory from disk produces a different system prompt that
   breaks the cache prefix.

2. Stabilize Honcho context per session:
   - Only prefetch Honcho context on the first turn (empty history)
   - Bake Honcho context into the cached system prompt and store to DB
   - Remove the per-turn Honcho injection from the API call loop

   This ensures the system message is identical across all turns in a
   session. Previously, re-fetching Honcho could return different context
   on each turn, changing the system message and invalidating the cache.

Both changes preserve the existing behavior for compression (which
invalidates the prompt and rebuilds from scratch) and for the CLI
(where the same AIAgent persists and the cached prompt is already
stable across turns).

Tests: 2556 passed (6 new)

											
										
										
											2026-03-09 01:50:58 -07:00
+								                    session_row = self._session_db.get_session(self.session_id)
 								                    if session_row:
 								                        stored_prompt = session_row.get("system_prompt") or None
 								                except Exception:
 								                    pass  # Fall through to build fresh
 								            if stored_prompt:
 								                # Continuing session — reuse the exact system prompt from
 								                # the previous turn so the Anthropic cache prefix matches.
 								                self._cached_system_prompt = stored_prompt
 								            else:
 								                # First turn of a new session — build from scratch.
 								                self._cached_system_prompt = self._build_system_prompt(system_message)
 								                # Bake Honcho context into the prompt so it's stable for
 								                # the entire session (not re-fetched per turn).
 								                if self._honcho_context:
 								                    self._cached_system_prompt = (
 								                        self._cached_system_prompt + "\n\n" + self._honcho_context
 								                    ).strip()
 								                # Store the system prompt snapshot in SQLite
 								                if self._session_db:
 								                    try:
 								                        self._session_db.update_system_prompt(self.session_id, self._cached_system_prompt)
 								                    except Exception as e:
 								                        logger.debug("Session DB update_system_prompt failed: %s", e)
-												Add platform-specific formatting hints and identity for AIAgent

- Introduced a default agent identity prompt to ensure consistent behavior across platforms.
- Added platform-specific formatting hints for CLI, WhatsApp, Telegram, and Discord to guide the agent's output style.
- Updated the AIAgent initialization to accept a platform parameter, enhancing adaptability to different interfaces.

											
										
										
											2026-02-12 16:11:16 -08:00
-												feat: add persistent memory system + SQLite session store

Two-part implementation:

Part A - Curated Bounded Memory:
- New memory tool (tools/memory_tool.py) with MEMORY.md + USER.md stores
- Character-limited (2200/1375 chars), § delimited entries
- Frozen snapshot injected into system prompt at session start
- Model manages pruning via replace/remove with substring matching
- Usage indicator shown in system prompt header

Part B - SQLite Session Store:
- New hermes_state.py with SessionDB class, FTS5 full-text search
- Gateway session.py rewritten to dual-write SQLite + legacy JSONL
- Compression-triggered session splitting with parent_session_id chains
- New session_search tool with Gemini Flash summarization of matched sessions
- CLI session lifecycle (create on launch, close on exit)

Also:
- System prompt now cached per session, only rebuilt on compression
  (fixes prefix cache invalidation from date/time changes every turn)
- Config version bumped to 3, hermes doctor checks for new artifacts
- Disabled in batch_runner and RL environments

											
										
										
											2026-02-19 00:57:31 -08:00
+								        active_system_prompt = self._cached_system_prompt
-												Add platform-specific formatting hints and identity for AIAgent

- Introduced a default agent identity prompt to ensure consistent behavior across platforms.
- Added platform-specific formatting hints for CLI, WhatsApp, Telegram, and Discord to guide the agent's output style.
- Updated the AIAgent initialization to accept a platform parameter, enhancing adaptability to different interfaces.

											
										
										
											2026-02-12 16:11:16 -08:00
-												fix: preflight context compression + error handler ordering for model switches

Two fixes for the case where a user switches to a model with a smaller
context window while having a large existing session:

1. Preflight compression in run_conversation(): Before the main loop,
   estimate tokens of loaded history + system prompt. If the estimate exceeds the
   model's compression threshold (85% of context), compress proactively
   with up to 3 passes. This naturally handles model switches because
   the gateway creates a fresh AIAgent per message with the current
   model's context length.

2. Error handler reordering: Context-length errors (400 with 'maximum
   context length' etc.) are now checked BEFORE the generic 4xx handler.
   Previously, OpenRouter's 400-status context-length errors were caught
   as non-retryable client errors and aborted immediately, never reaching
   the compression+retry logic.

Reported by Sonicrida on Discord: 840-message session (2MB+) crashed
after switching from a large-context model to minimax via OpenRouter.

											
										
										
											2026-03-04 14:42:41 -08:00
+								        # ── Preflight context compression ──
 								        # Before entering the main loop, check if the loaded conversation
 								        # history already exceeds the model's context threshold.  This handles
 								        # cases where a user switches to a model with a smaller context window
 								        # while having a large existing session — compress proactively rather
 								        # than waiting for an API error (which might be caught as a non-retryable
 								        # 4xx and abort the request entirely).
 								        if (
 								            self.compression_enabled
 								            and len(messages) > self.context_compressor.protect_first_n
 								                                + self.context_compressor.protect_last_n + 1
 								        ):
 								            _sys_tok_est = estimate_tokens_rough(active_system_prompt or "")
 								            _msg_tok_est = estimate_messages_tokens_rough(messages)
 								            _preflight_tokens = _sys_tok_est + _msg_tok_est
 								            if _preflight_tokens >= self.context_compressor.threshold_tokens:
 								                logger.info(
 								                    "Preflight compression: ~%s tokens >= %s threshold (model %s, ctx %s)",
 								                    f"{_preflight_tokens:,}",
 								                    f"{self.context_compressor.threshold_tokens:,}",
 								                    self.model,
 								                    f"{self.context_compressor.context_length:,}",
 								                )
 								                if not self.quiet_mode:
 								                    print(
 								                        f"📦 Preflight compression: ~{_preflight_tokens:,} tokens "
 								                        f">= {self.context_compressor.threshold_tokens:,} threshold"
 								                    )
 								                # May need multiple passes for very large sessions with small
 								                # context windows (each pass summarises the middle N turns).
 								                for _pass in range(3):
 								                    _orig_len = len(messages)
 								                    messages, active_system_prompt = self._compress_context(
-												fix: detect and warn on file re-read loops after context compression

When context compression summarizes conversation history, the agent
loses track of which files it has already read and re-reads them in a loop.
Users report the agent reading the same files endlessly without writing.

Root cause: context compression is lossy — file contents and read history
are lost in the summary. After compression, the model thinks it hasn't
examined the files yet and reads them again.

Fix (two-part):
1. Track file reads per task in file_tools.py. When the same file region
   is read again, include a _warning in the response telling the model
   to stop re-reading and use existing information.
2. After context compression, inject a structured message listing all
   files already read in the session with explicit "do NOT re-read"
   instruction, preserving read history across compression boundaries.

Adds 16 tests covering warning detection, task isolation, summary
accuracy, tracker cleanup, and compression history injection.

											
										
										
											2026-03-08 20:44:42 +03:00
+								                        messages, system_message, approx_tokens=_preflight_tokens,
 								                        task_id=effective_task_id,
-												fix: preflight context compression + error handler ordering for model switches

Two fixes for the case where a user switches to a model with a smaller
context window while having a large existing session:

1. Preflight compression in run_conversation(): Before the main loop,
   estimate tokens of loaded history + system prompt. If the estimate exceeds the
   model's compression threshold (85% of context), compress proactively
   with up to 3 passes. This naturally handles model switches because
   the gateway creates a fresh AIAgent per message with the current
   model's context length.

2. Error handler reordering: Context-length errors (400 with 'maximum
   context length' etc.) are now checked BEFORE the generic 4xx handler.
   Previously, OpenRouter's 400-status context-length errors were caught
   as non-retryable client errors and aborted immediately, never reaching
   the compression+retry logic.

Reported by Sonicrida on Discord: 840-message session (2MB+) crashed
after switching from a large-context model to minimax via OpenRouter.

											
										
										
											2026-03-04 14:42:41 -08:00
+								                    )
 								                    if len(messages) >= _orig_len:
 								                        break  # Cannot compress further
 								                    # Re-estimate after compression
 								                    _sys_tok_est = estimate_tokens_rough(active_system_prompt or "")
 								                    _msg_tok_est = estimate_messages_tokens_rough(messages)
 								                    _preflight_tokens = _sys_tok_est + _msg_tok_est
 								                    if _preflight_tokens < self.context_compressor.threshold_tokens:
 								                        break  # Under threshold
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        # Main conversation loop
 								        api_call_count = 0
 								        final_response = None
-												Implement interrupt handling for agent and CLI input and persistent prompt line at bottom of CLI :)

- Enhanced the AIAgent class to support interrupt requests, allowing for graceful interruption of ongoing tasks and processing of new messages.
- Updated the HermesCLI to manage user input in a persistent manner, enabling real-time interruption of the agent's conversation.
- Introduced a mechanism in the GatewayRunner to handle incoming messages while an agent is running, allowing for immediate response to user commands.
- Improved overall user experience by providing feedback during interruptions and ensuring that pending messages are processed correctly.

											
										
										
											2026-02-03 16:15:49 -08:00
+								        interrupted = False
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
+								        codex_ack_continuations = 0
-												fix: continue after output-length truncation

											
										
										
											2026-03-07 18:45:17 -05:00
+								        length_continue_retries = 0
 								        truncated_response_prefix = ""
-												Implement interrupt handling for agent and CLI input and persistent prompt line at bottom of CLI :)

- Enhanced the AIAgent class to support interrupt requests, allowing for graceful interruption of ongoing tasks and processing of new messages.
- Updated the HermesCLI to manage user input in a persistent manner, enabling real-time interruption of the agent's conversation.
- Introduced a mechanism in the GatewayRunner to handle incoming messages while an agent is running, allowing for immediate response to user commands.
- Improved overall user experience by providing feedback during interruptions and ensuring that pending messages are processed correctly.

											
										
										
											2026-02-03 16:15:49 -08:00
 								        # Clear any stale interrupt state at start
 								        self.clear_interrupt()
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												feat: shared iteration budget across parent + subagents

Subagent tool calls now count toward the same session-wide iteration
limit as the parent agent. Previously, each subagent had its own
independent counter, so a parent with max_iterations=60 could spawn
3 subagents each doing 50 calls = 150 total tool calls unmetered.

Changes:
- IterationBudget: thread-safe shared counter (run_agent.py)
  - consume(): try to use one iteration, returns False if exhausted
  - refund(): give back one iteration (for execute_code turns)
  - Thread-safe via Lock (subagents run in ThreadPoolExecutor)
- Parent creates the budget, children inherit it via delegate_tool.py
- execute_code turns are refunded (don't count against budget)
- Default raised from 60 → 90 to account for shared consumption
- Per-child cap (50) still applies as a safety valve

The per-child max_iterations (default 50) remains as a per-child
ceiling, but the shared budget is the hard session-wide limit.
A child stops at whichever comes first.

											
										
										
											2026-03-07 08:16:37 -08:00
+								        while api_call_count < self.max_iterations and self.iteration_budget.remaining > 0:
-												feat: filesystem checkpoints and /rollback command

Automatic filesystem snapshots before destructive file operations,
with user-facing rollback.  Inspired by PR #559 (by @alireza78a).

Architecture:
- Shadow git repos at ~/.hermes/checkpoints/{hash}/ via GIT_DIR
- CheckpointManager: take/list/restore, turn-scoped dedup, pruning
- Transparent — the LLM never sees it, no tool schema, no tokens
- Once per turn — only first write_file/patch triggers a snapshot

Integration:
- Config: checkpoints.enabled + checkpoints.max_snapshots
- CLI flag: hermes --checkpoints
- Trigger: run_agent.py _execute_tool_calls() before write_file/patch
- /rollback slash command in CLI + gateway (list, restore by number)
- Pre-rollback snapshot auto-created on restore (undo the undo)

Safety:
- Never blocks file operations — all errors silently logged
- Skips root dir, home dir, dirs >50K files
- Disables gracefully when git not installed
- Shadow repo completely isolated from project git

Tests: 35 new tests, all passing (2798 total suite)
Docs: feature page, config reference, CLI commands reference

											
										
										
											2026-03-10 00:49:15 -07:00
+								            # Reset per-turn checkpoint dedup so each iteration can take one snapshot
 								            self._checkpoint_mgr.new_turn()
-												Implement interrupt handling for agent and CLI input and persistent prompt line at bottom of CLI :)

- Enhanced the AIAgent class to support interrupt requests, allowing for graceful interruption of ongoing tasks and processing of new messages.
- Updated the HermesCLI to manage user input in a persistent manner, enabling real-time interruption of the agent's conversation.
- Introduced a mechanism in the GatewayRunner to handle incoming messages while an agent is running, allowing for immediate response to user commands.
- Improved overall user experience by providing feedback during interruptions and ensuring that pending messages are processed correctly.

											
										
										
											2026-02-03 16:15:49 -08:00
+								            # Check for interrupt request (e.g., user sent new message)
 								            if self._interrupt_requested:
 								                interrupted = True
 								                if not self.quiet_mode:
 								                    print(f"\n⚡ Breaking out of tool loop due to interrupt...")
 								                break
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								            api_call_count += 1
-												feat: shared iteration budget across parent + subagents

Subagent tool calls now count toward the same session-wide iteration
limit as the parent agent. Previously, each subagent had its own
independent counter, so a parent with max_iterations=60 could spawn
3 subagents each doing 50 calls = 150 total tool calls unmetered.

Changes:
- IterationBudget: thread-safe shared counter (run_agent.py)
  - consume(): try to use one iteration, returns False if exhausted
  - refund(): give back one iteration (for execute_code turns)
  - Thread-safe via Lock (subagents run in ThreadPoolExecutor)
- Parent creates the budget, children inherit it via delegate_tool.py
- execute_code turns are refunded (don't count against budget)
- Default raised from 60 → 90 to account for shared consumption
- Per-child cap (50) still applies as a safety valve

The per-child max_iterations (default 50) remains as a per-child
ceiling, but the shared budget is the hard session-wide limit.
A child stops at whichever comes first.

											
										
										
											2026-03-07 08:16:37 -08:00
+								            if not self.iteration_budget.consume():
 								                if not self.quiet_mode:
 								                    print(f"\n⚠️  Session iteration budget exhausted ({self.iteration_budget.max_total} total across agent + subagents)")
 								                break
-												feat: introduce skills management features in AIAgent and CLI

- Added skills configuration options in cli-config.yaml.example, including a nudge interval for skill creation reminders.
- Implemented skills guidance in AIAgent to prompt users to save reusable workflows after complex tasks.
- Enhanced skills indexing in the prompt builder to include descriptions from SKILL.md files for better context.
- Updated the agent's behavior to periodically remind users about potential skills during tool-calling iterations.

											
										
										
											2026-02-22 13:28:13 -08:00
-												feat(hooks): introduce event hooks system for lifecycle management

Add a new hooks system allowing users to run custom code at key lifecycle points in the agent's operation. This includes support for events such as `gateway:startup`, `session:start`, `agent:step`, and more. Documentation for creating hooks and available events has been added to `README.md` and a new `hooks.md` file. Additionally, integrate step callbacks in the agent to facilitate hook execution during tool-calling iterations.

											
										
										
											2026-02-28 17:09:26 -08:00
+								            # Fire step_callback for gateway hooks (agent:step event)
 								            if self.step_callback is not None:
 								                try:
 								                    prev_tools = []
 								                    for _m in reversed(messages):
 								                        if _m.get("role") == "assistant" and _m.get("tool_calls"):
 								                            prev_tools = [
 								                                tc["function"]["name"]
 								                                for tc in _m["tool_calls"]
 								                                if isinstance(tc, dict)
 								                            ]
 								                            break
 								                    self.step_callback(api_call_count, prev_tools)
 								                except Exception as _step_err:
 								                    logger.debug("step_callback error (iteration %s): %s", api_call_count, _step_err)
-												feat: enhance AIAgent's tool usage nudges and content handling

- Introduced a method to strip <think> blocks from content, improving text visibility.
- Implemented counters to reset nudge intervals when memory and skill tools are used, enhancing user guidance.
- Captured content from turns with tool calls to provide fallback responses, ensuring continuity in conversation.
- Updated nudge logic to remind users about saving memories and creating skills based on interaction patterns.

											
										
										
											2026-02-22 21:33:28 -08:00
+								            # Track tool-calling iterations for skill nudge.
 								            # Counter resets whenever skill_manage is actually used.
-												feat: introduce skills management features in AIAgent and CLI

- Added skills configuration options in cli-config.yaml.example, including a nudge interval for skill creation reminders.
- Implemented skills guidance in AIAgent to prompt users to save reusable workflows after complex tasks.
- Enhanced skills indexing in the prompt builder to include descriptions from SKILL.md files for better context.
- Updated the agent's behavior to periodically remind users about potential skills during tool-calling iterations.

											
										
										
											2026-02-22 13:28:13 -08:00
+								            if (self._skill_nudge_interval > 0
 								                    and "skill_manage" in self.valid_tool_names):
-												feat: enhance AIAgent's tool usage nudges and content handling

- Introduced a method to strip <think> blocks from content, improving text visibility.
- Implemented counters to reset nudge intervals when memory and skill tools are used, enhancing user guidance.
- Captured content from turns with tool calls to provide fallback responses, ensuring continuity in conversation.
- Updated nudge logic to remind users about saving memories and creating skills based on interaction patterns.

											
										
										
											2026-02-22 21:33:28 -08:00
+								                self._iters_since_skill += 1
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
 								            # Prepare messages for API call
 								            # If we have an ephemeral system prompt, prepend it to the messages
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								            # Note: Reasoning is embedded in content via <think> tags for trajectory storage.
 								            # However, providers like Moonshot AI require a separate 'reasoning_content' field
 								            # on assistant messages with tool_calls. We handle both cases here.
 								            api_messages = []
-												fix: keep honcho recall out of cached system prefix (#1201)

Attach later-turn Honcho recall to the current-turn user message at API
call time instead of appending it to the system prompt. This preserves the
stable system-prefix cache while keeping Honcho continuity context
available for the turn.

Also adds regression coverage for the injection helper and for continuing
sessions so Honcho recall stays out of the system prompt.
											
										
										
											2026-03-13 21:07:00 -07:00
+								            for idx, msg in enumerate(messages):
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                api_msg = msg.copy()
-												fix: rebuild compressed payload before retry

											
										
										
											2026-03-07 15:13:45 -05:00
-												fix: keep honcho recall out of cached system prefix (#1201)

Attach later-turn Honcho recall to the current-turn user message at API
call time instead of appending it to the system prompt. This preserves the
stable system-prefix cache while keeping Honcho continuity context
available for the turn.

Also adds regression coverage for the injection helper and for continuing
sessions so Honcho recall stays out of the system prompt.
											
										
										
											2026-03-13 21:07:00 -07:00
+								                if idx == current_turn_user_idx and msg.get("role") == "user" and self._honcho_turn_context:
 								                    api_msg["content"] = _inject_honcho_turn_context(
 								                        api_msg.get("content", ""), self._honcho_turn_context
 								                    )
-												Implement reasoning extraction and enhance assistant message handling

- Added a new method `_extract_reasoning` to extract reasoning content from assistant messages, accommodating multiple formats from various providers.
- Updated message handling to ensure all assistant messages include reasoning content for API compatibility, preserving multi-turn reasoning context.
- Enhanced logging to capture reasoning details for debugging and analysis.
- Modified the TODO.md to reflect changes in planning and task management, emphasizing the need for structured task decomposition and progress tracking.

											
										
										
											2026-02-01 22:48:18 -08:00
+								                # For ALL assistant messages, pass reasoning back to the API
 								                # This ensures multi-turn reasoning context is preserved
 								                if msg.get("role") == "assistant":
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                    reasoning_text = msg.get("reasoning")
 								                    if reasoning_text:
-												Implement reasoning extraction and enhance assistant message handling

- Added a new method `_extract_reasoning` to extract reasoning content from assistant messages, accommodating multiple formats from various providers.
- Updated message handling to ensure all assistant messages include reasoning content for API compatibility, preserving multi-turn reasoning context.
- Enhanced logging to capture reasoning details for debugging and analysis.
- Modified the TODO.md to reflect changes in planning and task management, emphasizing the need for structured task decomposition and progress tracking.

											
										
										
											2026-02-01 22:48:18 -08:00
+								                        # Add reasoning_content for API compatibility (Moonshot AI, Novita, OpenRouter)
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                        api_msg["reasoning_content"] = reasoning_text
-												fix: rebuild compressed payload before retry

											
										
										
											2026-03-07 15:13:45 -05:00
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                # Remove 'reasoning' field - it's for trajectory storage only
-												Implement reasoning extraction and enhance assistant message handling

- Added a new method `_extract_reasoning` to extract reasoning content from assistant messages, accommodating multiple formats from various providers.
- Updated message handling to ensure all assistant messages include reasoning content for API compatibility, preserving multi-turn reasoning context.
- Enhanced logging to capture reasoning details for debugging and analysis.
- Modified the TODO.md to reflect changes in planning and task management, emphasizing the need for structured task decomposition and progress tracking.

											
										
										
											2026-02-01 22:48:18 -08:00
+								                # We've copied it to 'reasoning_content' for the API above
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                if "reasoning" in api_msg:
 								                    api_msg.pop("reasoning")
-												fix(agent): strip finish_reason from assistant messages to fix Mistral 422 errors (#253)

* fix(agent): skip reasoning param for Mistral API to prevent 422 errors

* fix(agent): strip finish_reason from assistant messages to fix Mistral 422 errors
											
										
										
											2026-03-02 11:35:03 +03:00
+								                # Remove finish_reason - not accepted by strict APIs (e.g. Mistral)
 								                if "finish_reason" in api_msg:
 								                    api_msg.pop("finish_reason")
-												fix: strip call_id/response_item_id from tool_calls for Mistral compatibility

Mistral's API strictly validates the Chat Completions schema and rejects
unknown fields (call_id, response_item_id) with 422. These fields are
added by _build_assistant_message() for Codex Responses API support.

This fix:
- Only strips when targeting Mistral (api.mistral.ai in base_url)
- Creates new tool_call dicts instead of mutating originals (shallow
  copy safety — msg.copy() shares the tool_calls list)
- Preserves call_id/response_item_id in the internal message history
  so _chat_messages_to_responses_input() can still read them if the
  session falls back to a Codex provider mid-conversation

Applied in all 3 API message building locations:
- Main conversation loop (run_conversation)
- _handle_max_iterations()
- flush_memories()

Inspired by PR #864 (unmodeled-tyler) which identified the issue but
applied the fix unconditionally and mutated originals via shallow copy.

Co-authored-by: unmodeled-tyler <unmodeled.tyler@proton.me>

											
										
										
											2026-03-12 08:18:27 -07:00
+								                # Strip Codex Responses API fields (call_id, response_item_id) for
 								                # strict providers like Mistral that reject unknown fields with 422.
 								                # Uses new dicts so the internal messages list retains the fields
 								                # for Codex Responses compatibility.
 								                if "api.mistral.ai" in self.base_url.lower():
 								                    self._sanitize_tool_calls_for_strict_api(api_msg)
-												Implement reasoning extraction and enhance assistant message handling

- Added a new method `_extract_reasoning` to extract reasoning content from assistant messages, accommodating multiple formats from various providers.
- Updated message handling to ensure all assistant messages include reasoning content for API compatibility, preserving multi-turn reasoning context.
- Enhanced logging to capture reasoning details for debugging and analysis.
- Modified the TODO.md to reflect changes in planning and task management, emphasizing the need for structured task decomposition and progress tracking.

											
										
										
											2026-02-01 22:48:18 -08:00
+								                # Keep 'reasoning_details' - OpenRouter uses this for multi-turn reasoning context
 								                # The signature field helps maintain reasoning continuity
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                api_messages.append(api_msg)
-												fix: rebuild compressed payload before retry

											
										
										
											2026-03-07 15:13:45 -05:00
-												feat: add ephemeral prefill messages and system prompt loading

- Implemented functionality to load ephemeral prefill messages from a JSON file, enhancing few-shot priming capabilities for the agent.
- Introduced a mechanism to load an ephemeral system prompt from environment variables or configuration files, ensuring dynamic prompt adjustments at API-call time.
- Updated the CLI and agent initialization to utilize the new prefill messages and system prompt, improving the overall interaction experience.
- Enhanced configuration options with new environment variables for prefill messages and system prompts, allowing for greater customization without persistence.

											
										
										
											2026-02-23 23:55:42 -08:00
+								            # Build the final system message: cached prompt + ephemeral system prompt.
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								            # Ephemeral additions are API-call-time only (not persisted to session DB).
-												fix: keep honcho recall out of cached system prefix (#1201)

Attach later-turn Honcho recall to the current-turn user message at API
call time instead of appending it to the system prompt. This preserves the
stable system-prefix cache while keeping Honcho continuity context
available for the turn.

Also adds regression coverage for the injection helper and for continuing
sessions so Honcho recall stays out of the system prompt.
											
										
										
											2026-03-13 21:07:00 -07:00
+								            # Honcho later-turn recall is intentionally kept OUT of the system prompt
 								            # so the stable cache prefix remains unchanged.
-												feat: add ephemeral prefill messages and system prompt loading

- Implemented functionality to load ephemeral prefill messages from a JSON file, enhancing few-shot priming capabilities for the agent.
- Introduced a mechanism to load an ephemeral system prompt from environment variables or configuration files, ensuring dynamic prompt adjustments at API-call time.
- Updated the CLI and agent initialization to utilize the new prefill messages and system prompt, improving the overall interaction experience.
- Enhanced configuration options with new environment variables for prefill messages and system prompts, allowing for greater customization without persistence.

											
										
										
											2026-02-23 23:55:42 -08:00
+								            effective_system = active_system_prompt or ""
 								            if self.ephemeral_system_prompt:
 								                effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
 								            if effective_system:
 								                api_messages = [{"role": "system", "content": effective_system}] + api_messages
-												fix: rebuild compressed payload before retry

											
										
										
											2026-03-07 15:13:45 -05:00
-												feat: add ephemeral prefill messages and system prompt loading

- Implemented functionality to load ephemeral prefill messages from a JSON file, enhancing few-shot priming capabilities for the agent.
- Introduced a mechanism to load an ephemeral system prompt from environment variables or configuration files, ensuring dynamic prompt adjustments at API-call time.
- Updated the CLI and agent initialization to utilize the new prefill messages and system prompt, improving the overall interaction experience.
- Enhanced configuration options with new environment variables for prefill messages and system prompts, allowing for greater customization without persistence.

											
										
										
											2026-02-23 23:55:42 -08:00
+								            # Inject ephemeral prefill messages right after the system prompt
 								            # but before conversation history. Same API-call-time-only pattern.
 								            if self.prefill_messages:
 								                sys_offset = 1 if effective_system else 0
 								                for idx, pfm in enumerate(self.prefill_messages):
 								                    api_messages.insert(sys_offset + idx, pfm.copy())
-												fix: rebuild compressed payload before retry

											
										
										
											2026-03-07 15:13:45 -05:00
-												Implement Anthropic prompt caching for Claude models via OpenRouter

- Introduced a caching strategy that reduces input token costs by ~75% on multi-turn conversations by caching the conversation prefix.
- Added functions to apply cache control markers to messages, enhancing efficiency in token usage.
- Updated AIAgent to auto-enable prompt caching for Claude models, with configurable cache TTL.
- Enhanced logging to track cache hit statistics when caching is active, improving monitoring of token usage.

											
										
										
											2026-02-10 06:49:41 +00:00
+								            # Apply Anthropic prompt caching for Claude models via OpenRouter.
 								            # Auto-detected: if model name contains "claude" and base_url is OpenRouter,
 								            # inject cache_control breakpoints (system + last 3 messages) to reduce
 								            # input token costs by ~75% on multi-turn conversations.
 								            if self._use_prompt_caching:
 								                api_messages = apply_anthropic_cache_control(api_messages, cache_ttl=self._cache_ttl)
-												fix: rebuild compressed payload before retry

											
										
										
											2026-03-07 15:13:45 -05:00
-												fix: sanitize orphaned tool-call/result pairs in message compression

Enhance message compression by adding a method to clean up orphaned tool-call and tool-result pairs. This ensures that the API receives well-formed messages, preventing errors related to mismatched IDs. The new functionality includes removing orphaned results and adding stub results for missing calls, improving overall message integrity during compression.

											
										
										
											2026-03-07 08:08:00 -08:00
+								            # Safety net: strip orphaned tool results / add stubs for missing
 								            # results before sending to the API.  The compressor handles this
 								            # during compression, but orphans can also sneak in from session
 								            # loading or manual message manipulation.
 								            if hasattr(self, 'context_compressor') and self.context_compressor:
 								                api_messages = self.context_compressor._sanitize_tool_pairs(api_messages)
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
+								            # Calculate approximate request size for logging
 								            total_chars = sum(len(str(msg)) for msg in api_messages)
 								            approx_tokens = total_chars // 4  # Rough estimate: 4 chars per token
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								            # Thinking spinner for quiet mode (animated during API call)
 								            thinking_spinner = None
 								            if not self.quiet_mode:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                self._vprint(f"\n{self.log_prefix}🔄 Making API call #{api_call_count}/{self.max_iterations}...")
 								                self._vprint(f"{self.log_prefix}   📊 Request size: {len(api_messages)} messages, ~{approx_tokens:,} tokens (~{total_chars:,} chars)")
 								                self._vprint(f"{self.log_prefix}   🔧 Available tools: {len(self.tools) if self.tools else 0}")
 								            elif self._stream_callback is None:
 								                # Animated thinking spinner in quiet mode (skip during streaming TTS)
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								                face = random.choice(KawaiiSpinner.KAWAII_THINKING)
 								                verb = random.choice(KawaiiSpinner.THINKING_VERBS)
-												fix: stop terminal border flashing with steady cursor and TUI spinner widget

Cherry-picked and improved from PR #470 (fixes #464).

Problem: On Ubuntu 24.04 with ghostty + tmux, the prompt input box
border lines flash due to cursor blink and raw spinner terminal writes
conflicting with prompt_toolkit's rendering.

Changes:
- cli.py: Add CursorShape.BLOCK to Application() to disable cursor blink
- cli.py: Add thinking_callback + spinner_widget in TUI layout so
  thinking status displays as a proper prompt_toolkit widget instead of
  raw terminal writes that conflict with the TUI renderer
- run_agent.py: Add thinking_callback parameter to AIAgent; when set,
  uses the callback instead of KawaiiSpinner for thinking display

What was NOT changed (preserving existing behavior):
- agent/display.py: Untouched. KawaiiSpinner _write() stdout capture,
  _animate() logic, and 0.12s frame interval all preserved. This
  protects subagent stdout redirection and keeps smooth animations
  for non-CLI contexts (gateway, batch runner).
- Original emoji spinner types (brain/sparkle/pulse/moon/star) preserved
  for all non-CLI contexts.

Fixes from original PR #470:
- CursorShape.STEADY_BLOCK -> CursorShape.BLOCK (STEADY_BLOCK doesn't
  exist in prompt_toolkit 3.0.52)
- Removed duplicate self._spinner_text = '' line
- Removed redundant nested if-checks

Tested: 2706 tests pass, interactive CLI verified via tmux.

											
										
										
											2026-03-09 23:26:43 -07:00
+								                if self.thinking_callback:
 								                    # CLI TUI mode: use prompt_toolkit widget instead of raw spinner
 								                    self.thinking_callback(f"{face} {verb}...")
 								                else:
 								                    spinner_type = random.choice(['brain', 'sparkle', 'pulse', 'moon', 'star'])
 								                    thinking_spinner = KawaiiSpinner(f"{face} {verb}...", spinner_type=spinner_type)
 								                    thinking_spinner.start()
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								            # Log request details if verbose
 								            if self.verbose_logging:
 								                logging.debug(f"API Request - Model: {self.model}, Messages: {len(messages)}, Tools: {len(self.tools) if self.tools else 0}")
 								                logging.debug(f"Last message role: {messages[-1]['role'] if messages else 'none'}")
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
+								                logging.debug(f"Total message size: ~{approx_tokens:,} tokens")
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
 								            api_start_time = time.time()
 								            retry_count = 0
-												fix: reduce max_retries to 3 and make ValueError/TypeError non-retryable

- max_retries reduced from 6 to 3 — 6 retries with exponential backoff
  could stall for ~275s total on persistent errors
- ValueError and TypeError now detected as non-retryable client errors
  and abort immediately instead of being retried with backoff (these are
  local validation/programming errors that will never succeed on retry)

											
										
										
											2026-03-11 07:04:46 -07:00
+								            max_retries = 3
-												fix: enhance payload and context compression handling

Added logic to manage multiple compression attempts for large payloads and context length errors. Introduced limits on compression attempts to prevent infinite retries, with appropriate logging and error handling. This ensures better resilience and user feedback when facing compression issues during API calls.

											
										
										
											2026-03-07 09:19:07 -08:00
+								            compression_attempts = 0
 								            max_compression_attempts = 3
-												Harden Codex auth refresh and responses compatibility

											
										
										
											2026-02-25 19:27:54 -08:00
+								            codex_auth_retry_attempted = False
-												fix(anthropic): deep scan fixes — auth, retries, edge cases

Fixes from comprehensive code review and cross-referencing with
clawdbot/OpenCode implementations:

CRITICAL:
- Add one-shot guard (anthropic_auth_retry_attempted) to prevent
  infinite 401 retry loops when credentials keep changing
- Fix _is_oauth_token(): managed keys from ~/.claude.json are NOT
  regular API keys (don't start with sk-ant-api). Inverted the logic:
  only sk-ant-api* is treated as API key auth, everything else uses
  Bearer auth + oauth beta headers

HIGH:
- Wrap json.loads(args) in try/except in message conversion — malformed
  tool_call arguments no longer crash the entire conversation
- Raise AuthError in runtime_provider when no Anthropic token found
  (was silently passing empty string, causing confusing API errors)
- Remove broken _try_anthropic() from auxiliary vision chain — the
  centralized router creates an OpenAI client for api_key providers
  which doesn't work with Anthropic's Messages API

MEDIUM:
- Handle empty assistant message content — Anthropic rejects empty
  content blocks, now inserts '(empty)' placeholder
- Fix setup.py existing_key logic — set to 'KEEP' sentinel instead
  of None to prevent falling through to the auth choice prompt
- Add debug logging to _fetch_anthropic_models on failure

Tests: 43 adapter tests (2 new for token detection), 3197 total passed

											
										
										
											2026-03-12 17:14:22 -07:00
+								            anthropic_auth_retry_attempted = False
-												fix: implement Nous credential refresh on 401 error for retry logic

											
										
										
											2026-03-07 13:34:23 +11:00
+								            nous_auth_retry_attempted = False
-												fix: rebuild compressed payload before retry

											
										
										
											2026-03-07 15:13:45 -05:00
+								            restart_with_compressed_messages = False
-												fix: continue after output-length truncation

											
										
										
											2026-03-07 18:45:17 -05:00
+								            restart_with_length_continuation = False
-												rate limits

											
										
										
											2025-11-17 18:35:36 -05:00
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								            finish_reason = "stop"
-												fix: enhance payload and context compression handling

Added logic to manage multiple compression attempts for large payloads and context length errors. Introduced limits on compression attempts to prevent infinite retries, with appropriate logging and error handling. This ensures better resilience and user feedback when facing compression issues during API calls.

											
										
										
											2026-03-07 09:19:07 -08:00
+								            response = None  # Guard against UnboundLocalError if all retries fail
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
-												Cherry-pick 6 bug fixes from PR #76 and update documentation

Code fixes (run_agent.py):
- Fix off-by-one in _flush_messages_to_session_db skipping one message per flush
- Add clear_interrupt() to 3 early-return paths preventing stale interrupt state
- Wrap handle_function_call in try/except so tool crashes don't kill the conversation
- Replace fragile `is` identity check with _flush_sentinel marker for memory flush cleanup
- Fix retry loop off-by-one (6 attempts not 7)
- Remove redundant inline `import re`

											
										
										
											2026-02-27 03:21:49 -08:00
+								            while retry_count < max_retries:
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								                try:
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                    api_kwargs = self._build_api_kwargs(api_messages)
-												Harden Codex auth refresh and responses compatibility

											
										
										
											2026-02-25 19:27:54 -08:00
+								                    if self.api_mode == "codex_responses":
 								                        api_kwargs = self._preflight_codex_api_kwargs(api_kwargs, allow_stream=False)
-												feat: add multi-provider authentication and inference provider selection

- Implemented a multi-provider authentication system for the Hermes Agent, supporting OAuth for Nous Portal and traditional API key methods for OpenRouter and custom endpoints.
- Enhanced CLI with commands for logging in and out of providers, allowing users to authenticate and manage their credentials easily.
- Updated configuration options to select inference providers, with detailed documentation on usage and setup.
- Improved status reporting to include authentication status and provider details, enhancing user awareness of their current configuration.
- Added new files for authentication handling and updated existing components to integrate the new provider system.

											
										
										
											2026-02-20 17:24:00 -08:00
 								                    if os.getenv("HERMES_DUMP_REQUESTS", "").strip().lower() in {"1", "true", "yes", "on"}:
 								                        self._dump_api_request_debug(api_kwargs, reason="preflight")
-												fix: address voice mode review feedback

1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and
   openai are never imported at module level. Each is imported only when
   the feature is explicitly activated, preventing crashes in headless
   environments (SSH, Docker, WSL, no PortAudio).

2. No core agent loop changes: streaming TTS path extracted from
   _interruptible_api_call() into separate _streaming_api_call() method.
   The original method is restored to its upstream form.

3. Configurable key binding: push-to-talk key changed from Ctrl+R
   (conflicts with readline reverse-search) to Ctrl+B by default.
   Configurable via voice.push_to_talk_key in config.yaml.

4. Environment detection: new detect_audio_environment() function checks
   for SSH, Docker, WSL, and missing audio devices before enabling voice
   mode. Auto-disables with clear warnings in incompatible environments.

5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream,
   sd.OutputStream) wrapped in try/except with ImportError/OSError
   handling. Failures produce warnings, not crashes.

											
										
										
											2026-03-09 12:48:49 +03:00
+								                    cb = getattr(self, "_stream_callback", None)
-												fix: address PR review round 5 — streaming guard, VC auth, history prefix, auto-TTS control

1. Gate _streaming_api_call to chat_completions mode only — Anthropic and
   Codex fall back to _interruptible_api_call. Preserve Anthropic base_url
   across all client rebuild paths (interrupt, fallback, 401 refresh).

2. Discord VC synthetic events now use chat_type="channel" instead of
   defaulting to "dm" — prevents session bleed into DM context.
   Authorization runs before echoing transcript. Sanitize @everyone/@here
   in voice transcripts.

3. CLI voice prefix ("[Voice input...]") is now API-call-local only —
   stripped from returned history so it never persists to session DB or
   resumed sessions.

4. /voice off now disables base adapter auto-TTS via _auto_tts_disabled_chats
   set — voice input no longer triggers TTS when voice mode is off.

											
										
										
											2026-03-14 10:31:49 +03:00
+								                    if cb is not None and self.api_mode == "chat_completions":
-												fix: address voice mode review feedback

1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and
   openai are never imported at module level. Each is imported only when
   the feature is explicitly activated, preventing crashes in headless
   environments (SSH, Docker, WSL, no PortAudio).

2. No core agent loop changes: streaming TTS path extracted from
   _interruptible_api_call() into separate _streaming_api_call() method.
   The original method is restored to its upstream form.

3. Configurable key binding: push-to-talk key changed from Ctrl+R
   (conflicts with readline reverse-search) to Ctrl+B by default.
   Configurable via voice.push_to_talk_key in config.yaml.

4. Environment detection: new detect_audio_environment() function checks
   for SSH, Docker, WSL, and missing audio devices before enabling voice
   mode. Auto-disables with clear warnings in incompatible environments.

5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream,
   sd.OutputStream) wrapped in try/except with ImportError/OSError
   handling. Failures produce warnings, not crashes.

											
										
										
											2026-03-09 12:48:49 +03:00
+								                        response = self._streaming_api_call(api_kwargs, cb)
 								                    else:
 								                        response = self._interruptible_api_call(api_kwargs)
-												fix: voice pipeline hardening — 7 bug fixes with tests

1. Anthropic + ElevenLabs TTS silence: forward full response to TTS
   callback for non-streaming providers (choices first, then native
   content blocks fallback).

2. Subprocess timeout kill: play_audio_file now kills the process on
   TimeoutExpired instead of leaving zombie processes.

3. Discord disconnect cleanup: leave all voice channels before closing
   the client to prevent leaked state.

4. Audio stream leak: close InputStream if stream.start() fails.

5. Race condition: read/write _on_silence_stop under lock in audio
   callback thread.

6. _vprint force=True: show API error, retry, and truncation messages
   even during streaming TTS.

7. _refresh_level lock: read _voice_recording under _voice_lock.

											
										
										
											2026-03-14 13:06:49 +03:00
+								                        # Forward full response to TTS callback for non-streaming providers
 								                        # (e.g. Anthropic) so voice TTS still works via batch delivery.
 								                        if cb is not None and response:
 								                            try:
 								                                content = None
 								                                # Try choices first — _interruptible_api_call converts all
 								                                # providers (including Anthropic) to this format.
 								                                try:
 								                                    content = response.choices[0].message.content
 								                                except (AttributeError, IndexError):
 								                                    pass
 								                                # Fallback: Anthropic native content blocks
 								                                if not content and self.api_mode == "anthropic_messages":
 								                                    text_parts = [
 								                                        block.text for block in getattr(response, "content", [])
 								                                        if getattr(block, "type", None) == "text" and getattr(block, "text", None)
 								                                    ]
 								                                    content = " ".join(text_parts) if text_parts else None
 								                                if content:
 								                                    cb(content)
 								                            except Exception:
 								                                pass
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
 								                    api_duration = time.time() - api_start_time
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
-												refactor: streamline thinking spinner behavior in AIAgent

- Updated the logic for stopping the thinking spinner to improve clarity in tool execution messages.
- Removed unnecessary checks for tool calls, simplifying the spinner's stop behavior while maintaining informative output for users.

											
										
										
											2026-02-19 01:56:04 -08:00
+								                    # Stop thinking spinner silently -- the response box or tool
 								                    # execution messages that follow are more informative.
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								                    if thinking_spinner:
-												refactor: streamline thinking spinner behavior in AIAgent

- Updated the logic for stopping the thinking spinner to improve clarity in tool execution messages.
- Removed unnecessary checks for tool calls, simplifying the spinner's stop behavior while maintaining informative output for users.

											
										
										
											2026-02-19 01:56:04 -08:00
+								                        thinking_spinner.stop("")
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								                        thinking_spinner = None
-												fix: stop terminal border flashing with steady cursor and TUI spinner widget

Cherry-picked and improved from PR #470 (fixes #464).

Problem: On Ubuntu 24.04 with ghostty + tmux, the prompt input box
border lines flash due to cursor blink and raw spinner terminal writes
conflicting with prompt_toolkit's rendering.

Changes:
- cli.py: Add CursorShape.BLOCK to Application() to disable cursor blink
- cli.py: Add thinking_callback + spinner_widget in TUI layout so
  thinking status displays as a proper prompt_toolkit widget instead of
  raw terminal writes that conflict with the TUI renderer
- run_agent.py: Add thinking_callback parameter to AIAgent; when set,
  uses the callback instead of KawaiiSpinner for thinking display

What was NOT changed (preserving existing behavior):
- agent/display.py: Untouched. KawaiiSpinner _write() stdout capture,
  _animate() logic, and 0.12s frame interval all preserved. This
  protects subagent stdout redirection and keeps smooth animations
  for non-CLI contexts (gateway, batch runner).
- Original emoji spinner types (brain/sparkle/pulse/moon/star) preserved
  for all non-CLI contexts.

Fixes from original PR #470:
- CursorShape.STEADY_BLOCK -> CursorShape.BLOCK (STEADY_BLOCK doesn't
  exist in prompt_toolkit 3.0.52)
- Removed duplicate self._spinner_text = '' line
- Removed redundant nested if-checks

Tested: 2706 tests pass, interactive CLI verified via tmux.

											
										
										
											2026-03-09 23:26:43 -07:00
+								                    if self.thinking_callback:
 								                        self.thinking_callback("")
-												Add a Claude Code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
 								                    if not self.quiet_mode:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                        self._vprint(f"{self.log_prefix}⏱️  API call completed in {api_duration:.2f}s")
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
 								                    if self.verbose_logging:
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                        # Log response with provider info if available
 								                        resp_model = getattr(response, 'model', 'N/A') if response else 'N/A'
 								                        logging.debug(f"API Response received - Model: {resp_model}, Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}")
-												Enhance BatchRunner and AIAgent with new configuration options; the default model is now Opus 4.6 and the default summarizer is Gemini Flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                    # Validate response shape before proceeding
 								                    response_invalid = False
 								                    error_details = []
 								                    if self.api_mode == "codex_responses":
 								                        output_items = getattr(response, "output", None) if response is not None else None
 								                        if response is None:
 								                            response_invalid = True
 								                            error_details.append("response is None")
 								                        elif not isinstance(output_items, list):
 								                            response_invalid = True
 								                            error_details.append("response.output is not a list")
 								                        elif len(output_items) == 0:
 								                            response_invalid = True
 								                            error_details.append("response.output is empty")
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								                    elif self.api_mode == "anthropic_messages":
 								                        content_blocks = getattr(response, "content", None) if response is not None else None
 								                        if response is None:
 								                            response_invalid = True
 								                            error_details.append("response is None")
 								                        elif not isinstance(content_blocks, list):
 								                            response_invalid = True
 								                            error_details.append("response.content is not a list")
 								                        elif len(content_blocks) == 0:
 								                            response_invalid = True
 								                            error_details.append("response.content is empty")
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                    else:
 								                        if response is None or not hasattr(response, 'choices') or response.choices is None or len(response.choices) == 0:
 								                            response_invalid = True
 								                            if response is None:
 								                                error_details.append("response is None")
 								                            elif not hasattr(response, 'choices'):
 								                                error_details.append("response has no 'choices' attribute")
 								                            elif response.choices is None:
 								                                error_details.append("response.choices is None")
 								                            else:
 								                                error_details.append("response.choices is empty")
 								                    if response_invalid:
-												Add a Claude Code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								                        # Stop spinner before printing error messages
 								                        if thinking_spinner:
 								                            thinking_spinner.stop(f"(´;ω;`) oops, retrying...")
 								                            thinking_spinner = None
-												fix: stop terminal border flashing with steady cursor and TUI spinner widget

Cherry-picked and improved from PR #470 (fixes #464).

Problem: On Ubuntu 24.04 with ghostty + tmux, the prompt input box
border lines flash due to cursor blink and raw spinner terminal writes
conflicting with prompt_toolkit's rendering.

Changes:
- cli.py: Add CursorShape.BLOCK to Application() to disable cursor blink
- cli.py: Add thinking_callback + spinner_widget in TUI layout so
  thinking status displays as a proper prompt_toolkit widget instead of
  raw terminal writes that conflict with the TUI renderer
- run_agent.py: Add thinking_callback parameter to AIAgent; when set,
  uses the callback instead of KawaiiSpinner for thinking display

What was NOT changed (preserving existing behavior):
- agent/display.py: Untouched. KawaiiSpinner _write() stdout capture,
  _animate() logic, and 0.12s frame interval all preserved. This
  protects subagent stdout redirection and keeps smooth animations
  for non-CLI contexts (gateway, batch runner).
- Original emoji spinner types (brain/sparkle/pulse/moon/star) preserved
  for all non-CLI contexts.

Fixes from original PR #470:
- CursorShape.STEADY_BLOCK -> CursorShape.BLOCK (STEADY_BLOCK doesn't
  exist in prompt_toolkit 3.0.52)
- Removed duplicate self._spinner_text = '' line
- Removed redundant nested if-checks

Tested: 2706 tests pass, interactive CLI verified via tmux.

											
										
										
											2026-03-09 23:26:43 -07:00
+								                        if self.thinking_callback:
 								                            self.thinking_callback("")
-												Add a Claude Code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								                        # This is often rate limiting or provider returning malformed response
 								                        retry_count += 1
 								                        # Check for error field in response (some providers include this)
 								                        error_msg = "Unknown"
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                        provider_name = "Unknown"
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								                        if response and hasattr(response, 'error') and response.error:
 								                            error_msg = str(response.error)
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                            # Try to extract provider from error metadata
 								                            if hasattr(response.error, 'metadata') and response.error.metadata:
 								                                provider_name = response.error.metadata.get('provider_name', 'Unknown')
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								                        elif response and hasattr(response, 'message') and response.message:
 								                            error_msg = str(response.message)
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                        # Try to get provider from model field (OpenRouter often returns actual model used)
 								                        if provider_name == "Unknown" and response and hasattr(response, 'model') and response.model:
 								                            provider_name = f"model={response.model}"
 								                        # Check for x-openrouter-provider or similar metadata
 								                        if provider_name == "Unknown" and response:
 								                            # Log all response attributes for debugging
 								                            resp_attrs = {k: str(v)[:100] for k, v in vars(response).items() if not k.startswith('_')}
 								                            if self.verbose_logging:
 								                                logging.debug(f"Response attributes for invalid response: {resp_attrs}")
-												fix: voice pipeline hardening — 7 bug fixes with tests

1. Anthropic + ElevenLabs TTS silence: forward full response to TTS
   callback for non-streaming providers (choices first, then native
   content blocks fallback).

2. Subprocess timeout kill: play_audio_file now kills the process on
   TimeoutExpired instead of leaving zombie processes.

3. Discord disconnect cleanup: leave all voice channels before closing
   the client to prevent leaked state.

4. Audio stream leak: close InputStream if stream.start() fails.

5. Race condition: read/write _on_silence_stop under lock in audio
   callback thread.

6. _vprint force=True: show API error, retry, and truncation messages
   even during streaming TTS.

7. _refresh_level lock: read _voice_recording under _voice_lock.

											
										
										
											2026-03-14 13:06:49 +03:00
+								                        self._vprint(f"{self.log_prefix}⚠️  Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}", force=True)
 								                        self._vprint(f"{self.log_prefix}   🏢 Provider: {provider_name}", force=True)
 								                        self._vprint(f"{self.log_prefix}   📝 Provider message: {error_msg[:200]}", force=True)
 								                        self._vprint(f"{self.log_prefix}   ⏱️  Response time: {api_duration:.2f}s (fast response often indicates rate limiting)", force=True)
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
-												fix: correct off-by-one in retry exhaustion checks

The retry exhaustion checks used > instead of >= to compare
retry_count against max_retries. Since the while loop condition is
retry_count < max_retries, the check retry_count > max_retries can
never be true inside the loop. When retries are exhausted, the loop
exits and falls through to response.choices[0] on an invalid response,
crashing with IndexError instead of returning a proper error.

											
										
										
											2026-03-01 02:27:26 +03:00
+								                        if retry_count >= max_retries:
-												feat: simple fallback model for provider resilience

When the primary model/provider fails after retries (rate limit, overload,
auth errors, connection failures), Hermes automatically switches to a
configured fallback model for the remainder of the session.

Config (in ~/.hermes/config.yaml):

  fallback_model:
    provider: openrouter
    model: anthropic/claude-sonnet-4

Supports all major providers: OpenRouter, OpenAI, Nous, DeepSeek, Together,
Groq, Fireworks, Mistral, Gemini — plus custom endpoints via base_url and
api_key_env overrides.

Design principles:
- Dead simple: one fallback model, not a chain
- One-shot: switches once, doesn't ping-pong back
- Zero new dependencies: uses existing OpenAI client
- Minimal code: ~100 lines in run_agent.py, ~5 lines in cli.py/gateway
- Three trigger points: max retries exhausted, non-retryable client errors,
  and invalid response exhaustion

Does NOT trigger on context overflow or payload-too-large errors (those
are handled by the existing compression system).

Addresses #737.

25 new tests, 2492 total passing.

											
										
										
											2026-03-08 20:22:33 -07:00
+								                            # Try fallback before giving up
 								                            if self._try_activate_fallback():
 								                                retry_count = 0
 								                                continue
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded for invalid responses. Giving up.", force=True)
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								                            logging.error(f"{self.log_prefix}Invalid API response after {max_retries} retries.")
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                            self._persist_session(messages, conversation_history)
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								                            return {
 								                                "messages": messages,
 								                                "completed": False,
 								                                "api_calls": api_call_count,
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                                "error": "Invalid API response shape. Likely rate limited or malformed provider response.",
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								                                "failed": True  # Mark as failure for filtering
 								                            }
 								                        # Longer backoff for rate limiting (likely cause of None choices)
 								                        wait_time = min(5 * (2 ** (retry_count - 1)), 120)  # 5s, 10s, 20s, 40s, 80s, 120s
-												fix: voice pipeline hardening — 7 bug fixes with tests

1. Anthropic + ElevenLabs TTS silence: forward full response to TTS
   callback for non-streaming providers (choices first, then native
   content blocks fallback).

2. Subprocess timeout kill: play_audio_file now kills the process on
   TimeoutExpired instead of leaving zombie processes.

3. Discord disconnect cleanup: leave all voice channels before closing
   the client to prevent leaked state.

4. Audio stream leak: close InputStream if stream.start() fails.

5. Race condition: read/write _on_silence_stop under lock in audio
   callback thread.

6. _vprint force=True: show API error, retry, and truncation messages
   even during streaming TTS.

7. _refresh_level lock: read _voice_recording under _voice_lock.

											
										
										
											2026-03-14 13:06:49 +03:00
+								                        self._vprint(f"{self.log_prefix}⏳ Retrying in {wait_time}s (extended backoff for possible rate limit)...", force=True)
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                        logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)} | Provider: {provider_name}")
-												Enhance BatchRunner and AIAgent with new configuration options; the default model is now Opus 4.6 and the default summarizer is Gemini Flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
 								                        # Sleep in small increments to stay responsive to interrupts
 								                        sleep_end = time.time() + wait_time
 								                        while time.time() < sleep_end:
 								                            if self._interrupt_requested:
-												fix: harden web gateway security and fix error swallowing

- Use hmac.compare_digest for timing-safe token comparison (3 endpoints)
- Default bind to 127.0.0.1 instead of 0.0.0.0
- Sanitize upload filenames with Path.name to prevent path traversal
- Add DOMPurify to sanitize marked.parse() output against XSS
- Replace add_static with authenticated media handler
- Hide token in group chats for /remote-control command
- Use ctypes.util.find_library for Opus instead of hardcoded paths
- Add force=True to 5 interrupt _vprint calls for visibility
- Log Opus decode errors and voice restart failures instead of swallowing

											
										
										
											2026-03-13 15:29:18 +03:00
+								                                self._vprint(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True)
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                                self._persist_session(messages, conversation_history)
-												Cherry-pick 6 bug fixes from PR #76 and update documentation

Code fixes (run_agent.py):
- Fix off-by-one in _flush_messages_to_session_db skipping one message per flush
- Add clear_interrupt() to 3 early-return paths preventing stale interrupt state
- Wrap handle_function_call in try/except so tool crashes don't kill the conversation
- Replace fragile `is` identity check with _flush_sentinel marker for memory flush cleanup
- Fix retry loop off-by-one (6 attempts not 7)
- Remove redundant inline `import re`

											
										
										
											2026-02-27 03:21:49 -08:00
+								                                self.clear_interrupt()
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								                                return {
-												fix: add context to interruption messages for model awareness

When the agent is interrupted, the model now receives descriptive
context instead of a generic 'Operation interrupted.' string:

- Tool skip messages include the tool name:
  '[Tool execution cancelled — terminal was skipped due to user interrupt]'
  '[Tool execution skipped — web_search was not started. User sent a new message]'

- API call interrupts include timing:
  'Operation interrupted: waiting for model response (4.2s elapsed).'

- Retry/error interrupts include retry context:
  'Operation interrupted: retrying API call after rate limit (retry 2/5).'
  'Operation interrupted: handling API error (Timeout: connection timed out).'

This helps the model understand what was happening when it was
interrupted, reducing wasted iterations spent re-discovering state.

											
										
										
											2026-03-08 18:58:23 -07:00
+								                                    "final_response": f"Operation interrupted: retrying API call after rate limit (retry {retry_count}/{max_retries}).",
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								                                    "messages": messages,
 								                                    "api_calls": api_call_count,
 								                                    "completed": False,
 								                                    "interrupted": True,
 								                                }
 								                            time.sleep(0.2)
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								                        continue  # Retry the API call
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                    # Check finish_reason before proceeding
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                    if self.api_mode == "codex_responses":
 								                        status = getattr(response, "status", None)
 								                        incomplete_details = getattr(response, "incomplete_details", None)
 								                        incomplete_reason = None
 								                        if isinstance(incomplete_details, dict):
 								                            incomplete_reason = incomplete_details.get("reason")
 								                        else:
 								                            incomplete_reason = getattr(incomplete_details, "reason", None)
 								                        if status == "incomplete" and incomplete_reason in {"max_output_tokens", "length"}:
 								                            finish_reason = "length"
 								                        else:
 								                            finish_reason = "stop"
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								                    elif self.api_mode == "anthropic_messages":
 								                        stop_reason_map = {"end_turn": "stop", "tool_use": "tool_calls", "max_tokens": "length", "stop_sequence": "stop"}
 								                        finish_reason = stop_reason_map.get(response.stop_reason, "stop")
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                    else:
 								                        finish_reason = response.choices[0].finish_reason
-												fix: continue after output-length truncation

											
										
										
											2026-03-07 18:45:17 -05:00
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                    if finish_reason == "length":
-												fix: voice pipeline hardening — 7 bug fixes with tests

1. Anthropic + ElevenLabs TTS silence: forward full response to TTS
   callback for non-streaming providers (choices first, then native
   content blocks fallback).

2. Subprocess timeout kill: play_audio_file now kills the process on
   TimeoutExpired instead of leaving zombie processes.

3. Discord disconnect cleanup: leave all voice channels before closing
   the client to prevent leaked state.

4. Audio stream leak: close InputStream if stream.start() fails.

5. Race condition: read/write _on_silence_stop under lock in audio
   callback thread.

6. _vprint force=True: show API error, retry, and truncation messages
   even during streaming TTS.

7. _refresh_level lock: read _voice_recording under _voice_lock.

											
										
										
											2026-03-14 13:06:49 +03:00
+								                        self._vprint(f"{self.log_prefix}⚠️  Response truncated (finish_reason='length') - model hit max output tokens", force=True)
-												fix: continue after output-length truncation

											
										
										
											2026-03-07 18:45:17 -05:00
 								                        if self.api_mode == "chat_completions":
 								                            assistant_message = response.choices[0].message
 								                            if not assistant_message.tool_calls:
 								                                length_continue_retries += 1
 								                                interim_msg = self._build_assistant_message(assistant_message, finish_reason)
 								                                messages.append(interim_msg)
 								                                if assistant_message.content:
 								                                    truncated_response_prefix += assistant_message.content
 								                                if length_continue_retries < 3:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                                    self._vprint(
-												fix: continue after output-length truncation

											
										
										
											2026-03-07 18:45:17 -05:00
+								                                        f"{self.log_prefix}↻ Requesting continuation "
 								                                        f"({length_continue_retries}/3)..."
 								                                    )
 								                                    continue_msg = {
 								                                        "role": "user",
 								                                        "content": (
 								                                            "[System: Your previous response was truncated by the output "
 								                                            "length limit. Continue exactly where you left off. Do not "
 								                                            "restart or repeat prior text. Finish the answer directly.]"
 								                                        ),
 								                                    }
 								                                    messages.append(continue_msg)
 								                                    self._session_messages = messages
 								                                    self._save_session_log(messages)
 								                                    restart_with_length_continuation = True
 								                                    break
 								                                partial_response = self._strip_think_blocks(truncated_response_prefix).strip()
 								                                self._cleanup_task_resources(effective_task_id)
 								                                self._persist_session(messages, conversation_history)
 								                                return {
 								                                    "final_response": partial_response or None,
 								                                    "messages": messages,
 								                                    "api_calls": api_call_count,
 								                                    "completed": False,
 								                                    "partial": True,
 								                                    "error": "Response remained truncated after 3 continuation attempts",
 								                                }
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                        # If we have prior messages, roll back to last complete state
 								                        if len(messages) > 1:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}   ⏪ Rolling back to last complete assistant turn")
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                            rolled_back_messages = self._get_messages_up_to_last_assistant(messages)
-												fix: continue after output-length truncation

											
										
										
											2026-03-07 18:45:17 -05:00
-												Cleanup time!

											
										
										
											2026-02-20 23:23:32 -08:00
+								                            self._cleanup_task_resources(effective_task_id)
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                            self._persist_session(messages, conversation_history)
-												fix: continue after output-length truncation

											
										
										
											2026-03-07 18:45:17 -05:00
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                            return {
 								                                "final_response": None,
 								                                "messages": rolled_back_messages,
 								                                "api_calls": api_call_count,
 								                                "completed": False,
 								                                "partial": True,
 								                                "error": "Response truncated due to output length limit"
 								                            }
 								                        else:
 								                            # First message was truncated - mark as failed
-												fix: address voice mode PR review (streaming TTS, prompt cache, _vprint)

Bug A: Replace stale _HAS_ELEVENLABS/_HAS_AUDIO boolean imports with
lazy import function calls (_import_elevenlabs, _import_sounddevice).
The old constants no longer exist in tts_tool -- the try/except
silently swallowed the ImportError, leaving streaming TTS dead.

Bug B: Use user message prefix instead of modifying system prompt for
voice mode instruction. Changing ephemeral_system_prompt mid-session
invalidates the prompt cache. Now the concise-response hint is
prepended to the user_message passed to run_conversation while
conversation_history keeps the original text.

Minor: Add force parameter to _vprint so critical error messages
(max retries, non-retryable errors, API failures) are always shown
even during streaming TTS playback.

Tests: 15 new tests in test_voice_cli_integration.py covering all
three fixes -- lazy import activation, message prefix behavior,
history cleanliness, system prompt stability, and AST verification
that all critical _vprint calls use force=True.

											
										
										
											2026-03-10 03:43:03 +03:00
+								                            self._vprint(f"{self.log_prefix}❌ First response truncated - cannot recover", force=True)
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                            self._persist_session(messages, conversation_history)
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                            return {
 								                                "final_response": None,
 								                                "messages": messages,
 								                                "api_calls": api_call_count,
 								                                "completed": False,
 								                                "failed": True,
 								                                "error": "First response truncated due to output length limit"
 								                            }
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
+								                    # Track actual token usage from response for context management
 								                    if hasattr(response, 'usage') and response.usage:
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								                        if self.api_mode in ("codex_responses", "anthropic_messages"):
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                            prompt_tokens = getattr(response.usage, 'input_tokens', 0) or 0
 								                            completion_tokens = getattr(response.usage, 'output_tokens', 0) or 0
 								                            total_tokens = (
 								                                getattr(response.usage, 'total_tokens', None)
 								                                or (prompt_tokens + completion_tokens)
 								                            )
 								                        else:
 								                            prompt_tokens = getattr(response.usage, 'prompt_tokens', 0) or 0
 								                            completion_tokens = getattr(response.usage, 'completion_tokens', 0) or 0
 								                            total_tokens = getattr(response.usage, 'total_tokens', 0) or 0
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
+								                        usage_dict = {
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                            "prompt_tokens": prompt_tokens,
 								                            "completion_tokens": completion_tokens,
 								                            "total_tokens": total_tokens,
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
+								                        }
 								                        self.context_compressor.update_from_response(usage_dict)
-												feat(cli): add /usage command to display session token usage

Introduced a new command "/usage" in the CLI to show cumulative token usage for the current session. This includes details on prompt tokens, completion tokens, total tokens, API calls, and context state. Updated command documentation to reflect this addition. Enhanced the AIAgent class to track token usage throughout the session.

											
										
										
											2026-03-01 00:23:19 -08:00
-												feat: smart context length probing with persistent caching + banner display

Replaces the unsafe 128K fallback for unknown models with a descending
probe strategy (2M → 1M → 512K → 200K → 128K → 64K → 32K). When a
context-length error occurs, the agent steps down tiers and retries.
The discovered limit is cached per model+provider combo in
~/.hermes/context_length_cache.yaml so subsequent sessions skip probing.

Also parses API error messages to extract the actual context limit
(e.g. 'maximum context length is 32768 tokens') for instant resolution.

The CLI banner now displays the context window size next to the model
name (e.g. 'claude-opus-4 · 200K context · Nous Research').

Changes:
- agent/model_metadata.py: CONTEXT_PROBE_TIERS, persistent cache
  (save/load/get), parse_context_limit_from_error(), get_next_probe_tier()
- agent/context_compressor.py: accepts base_url, passes to metadata
- run_agent.py: step-down logic in context error handler, caches on success
- cli.py + hermes_cli/banner.py: context length in welcome banner
- tests: 22 new tests for probing, parsing, and caching

Addresses #132. PR #319's approach (8K default) rejected — too conservative.

											
										
										
											2026-03-05 16:09:57 -08:00
+								                        # Cache discovered context length after successful call
 								                        if self.context_compressor._context_probed:
 								                            ctx = self.context_compressor.context_length
 								                            save_context_length(self.model, self.base_url, ctx)
 								                            print(f"{self.log_prefix}💾 Cached context length: {ctx:,} tokens for {self.model}")
 								                            self.context_compressor._context_probed = False
-												feat(cli): add /usage command to display session token usage

Introduced a new command "/usage" in the CLI to show cumulative token usage for the current session. This includes details on prompt tokens, completion tokens, total tokens, API calls, and context state. Updated command documentation to reflect this addition. Enhanced the AIAgent class to track token usage throughout the session.

											
										
										
											2026-03-01 00:23:19 -08:00
+								                        self.session_prompt_tokens += prompt_tokens
 								                        self.session_completion_tokens += completion_tokens
 								                        self.session_total_tokens += total_tokens
 								                        self.session_api_calls += 1
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
 								                        if self.verbose_logging:
 								                            logging.debug(f"Token usage: prompt={usage_dict['prompt_tokens']:,}, completion={usage_dict['completion_tokens']:,}, total={usage_dict['total_tokens']:,}")
-												Implement Anthropic prompt caching for Claude models via OpenRouter

- Introduced a caching strategy that reduces input token costs by ~75% on multi-turn conversations by caching the conversation prefix.
- Added functions to apply cache control markers to messages, enhancing efficiency in token usage.
- Updated AIAgent to auto-enable prompt caching for Claude models, with configurable cache TTL.
- Enhanced logging to track cache hit statistics when caching is active, improving monitoring of token usage.

											
										
										
											2026-02-10 06:49:41 +00:00
 								                        # Log cache hit stats when prompt caching is active
 								                        if self._use_prompt_caching:
-												fix(anthropic): address gaps found in deep-dive audit

After studying clawdbot (OpenClaw) and OpenCode implementations:

## Beta headers
- Add interleaved-thinking-2025-05-14 and fine-grained-tool-streaming-2025-05-14
  as common betas (sent with ALL auth types, not just OAuth)
- OAuth tokens additionally get oauth-2025-04-20
- API keys now also get the common betas (previously got none)

## Vision/image support
- Add _convert_vision_content() to convert OpenAI multimodal format
  (image_url blocks) to Anthropic format (image blocks with base64/url source)
- Handles both data: URIs (base64) and regular URLs

## Role alternation enforcement
- Anthropic strictly rejects consecutive same-role messages (400 error)
- Add post-processing step that merges consecutive user/assistant messages
- Handles string, list, and mixed content types during merge

## Tool choice support
- Add tool_choice parameter to build_anthropic_kwargs()
- Maps OpenAI values: auto→auto, required→any, none→omit, name→tool

## Cache metrics tracking
- Anthropic uses cache_read_input_tokens / cache_creation_input_tokens
  (different from OpenRouter's prompt_tokens_details.cached_tokens)
- Add api_mode-aware branch in run_agent.py cache stats logging

## Credential refresh on 401
- On 401 error during anthropic_messages mode, re-read credentials
  via resolve_anthropic_token() (picks up refreshed Claude Code tokens)
- Rebuild client if new token differs from current one
- Follows same pattern as Codex/Nous 401 refresh handlers

## Tests
- 44 adapter tests (8 new: vision conversion, role alternation, tool choice)
- Updated beta header tests to verify new structure
- Full suite: 3198 passed, 0 regressions

											
										
										
											2026-03-12 16:00:46 -07:00
+								                            if self.api_mode == "anthropic_messages":
 								                                # Anthropic uses cache_read_input_tokens / cache_creation_input_tokens
 								                                cached = getattr(response.usage, 'cache_read_input_tokens', 0) or 0
 								                                written = getattr(response.usage, 'cache_creation_input_tokens', 0) or 0
 								                            else:
 								                                # OpenRouter uses prompt_tokens_details.cached_tokens
 								                                details = getattr(response.usage, 'prompt_tokens_details', None)
 								                                cached = getattr(details, 'cached_tokens', 0) or 0 if details else 0
 								                                written = getattr(details, 'cache_write_tokens', 0) or 0 if details else 0
-												Implement Anthropic prompt caching for Claude models via OpenRouter

- Introduced a caching strategy that reduces input token costs by ~75% on multi-turn conversations by caching the conversation prefix.
- Added functions to apply cache control markers to messages, enhancing efficiency in token usage.
- Updated AIAgent to auto-enable prompt caching for Claude models, with configurable cache TTL.
- Enhanced logging to track cache hit statistics when caching is active, improving monitoring of token usage.

											
										
										
											2026-02-10 06:49:41 +00:00
+								                            prompt = usage_dict["prompt_tokens"]
 								                            hit_pct = (cached / prompt * 100) if prompt > 0 else 0
 								                            if not self.quiet_mode:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                                self._vprint(f"{self.log_prefix}   💾 Cache: {cached:,}/{prompt:,} tokens ({hit_pct:.0f}% hit, {written:,} written)")
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								                    break  # Success, exit retry loop
-												rate limits

											
										
										
											2025-11-17 18:35:36 -05:00
-												feat: implement interactive prompts for sudo password and command approval in CLI

- Added methods for handling sudo password and dangerous command approval prompts using a callback mechanism in cli.py.
- Integrated these prompts with the prompt_toolkit UI for improved user experience.
- Updated terminal_tool.py to support callback registration for interactive prompts, enhancing the CLI's interactivity.
- Introduced a background thread for API calls in run_agent.py to allow for interrupt handling during long-running operations.
- Enhanced error handling for interrupted API calls, ensuring graceful degradation of user experience.

											
										
										
											2026-02-21 12:15:40 -08:00
+								                except InterruptedError:
 								                    if thinking_spinner:
 								                        thinking_spinner.stop("")
 								                        thinking_spinner = None
-												fix: stop terminal border flashing with steady cursor and TUI spinner widget

Cherry-picked and improved from PR #470 (fixes #464).

Problem: On Ubuntu 24.04 with ghostty + tmux, the prompt input box
border lines flash due to cursor blink and raw spinner terminal writes
conflicting with prompt_toolkit's rendering.

Changes:
- cli.py: Add CursorShape.BLOCK to Application() to disable cursor blink
- cli.py: Add thinking_callback + spinner_widget in TUI layout so
  thinking status displays as a proper prompt_toolkit widget instead of
  raw terminal writes that conflict with the TUI renderer
- run_agent.py: Add thinking_callback parameter to AIAgent; when set,
  uses the callback instead of KawaiiSpinner for thinking display

What was NOT changed (preserving existing behavior):
- agent/display.py: Untouched. KawaiiSpinner _write() stdout capture,
  _animate() logic, and 0.12s frame interval all preserved. This
  protects subagent stdout redirection and keeps smooth animations
  for non-CLI contexts (gateway, batch runner).
- Original emoji spinner types (brain/sparkle/pulse/moon/star) preserved
  for all non-CLI contexts.

Fixes from original PR #470:
- CursorShape.STEADY_BLOCK -> CursorShape.BLOCK (STEADY_BLOCK doesn't
  exist in prompt_toolkit 3.0.52)
- Removed duplicate self._spinner_text = '' line
- Removed redundant nested if-checks

Tested: 2706 tests pass, interactive CLI verified via tmux.

											
										
										
											2026-03-09 23:26:43 -07:00
+								                    if self.thinking_callback:
 								                        self.thinking_callback("")
-												fix: add context to interruption messages for model awareness

When the agent is interrupted, the model now receives descriptive
context instead of a generic 'Operation interrupted.' string:

- Tool skip messages include the tool name:
  '[Tool execution cancelled — terminal was skipped due to user interrupt]'
  '[Tool execution skipped — web_search was not started. User sent a new message]'

- API call interrupts include timing:
  'Operation interrupted: waiting for model response (4.2s elapsed).'

- Retry/error interrupts include retry context:
  'Operation interrupted: retrying API call after rate limit (retry 2/5).'
  'Operation interrupted: handling API error (Timeout: connection timed out).'

This helps the model understand what was happening when it was
interrupted, reducing wasted iterations spent re-discovering state.

											
										
										
											2026-03-08 18:58:23 -07:00
+								                    api_elapsed = time.time() - api_start_time
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                    self._vprint(f"{self.log_prefix}⚡ Interrupted during API call.", force=True)
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                    self._persist_session(messages, conversation_history)
-												feat: implement interactive prompts for sudo password and command approval in CLI

- Added methods for handling sudo password and dangerous command approval prompts using a callback mechanism in cli.py.
- Integrated these prompts with the prompt_toolkit UI for improved user experience.
- Updated terminal_tool.py to support callback registration for interactive prompts, enhancing the CLI's interactivity.
- Introduced a background thread for API calls in run_agent.py to allow for interrupt handling during long-running operations.
- Enhanced error handling for interrupted API calls, ensuring graceful degradation of user experience.

											
										
										
											2026-02-21 12:15:40 -08:00
+								                    interrupted = True
-												fix: add context to interruption messages for model awareness

When the agent is interrupted, the model now receives descriptive
context instead of a generic 'Operation interrupted.' string:

- Tool skip messages include the tool name:
  '[Tool execution cancelled — terminal was skipped due to user interrupt]'
  '[Tool execution skipped — web_search was not started. User sent a new message]'

- API call interrupts include timing:
  'Operation interrupted: waiting for model response (4.2s elapsed).'

- Retry/error interrupts include retry context:
  'Operation interrupted: retrying API call after rate limit (retry 2/5).'
  'Operation interrupted: handling API error (Timeout: connection timed out).'

This helps the model understand what was happening when it was
interrupted, reducing wasted iterations spent re-discovering state.

											
										
										
											2026-03-08 18:58:23 -07:00
+								                    final_response = f"Operation interrupted: waiting for model response ({api_elapsed:.1f}s elapsed)."
-												feat: implement interactive prompts for sudo password and command approval in CLI

- Added methods for handling sudo password and dangerous command approval prompts using a callback mechanism in cli.py.
- Integrated these prompts with the prompt_toolkit UI for improved user experience.
- Updated terminal_tool.py to support callback registration for interactive prompts, enhancing the CLI's interactivity.
- Introduced a background thread for API calls in run_agent.py to allow for interrupt handling during long-running operations.
- Enhanced error handling for interrupted API calls, ensuring graceful degradation of user experience.

											
										
										
											2026-02-21 12:15:40 -08:00
+								                    break
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								                except Exception as api_error:
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								                    # Stop spinner before printing error messages
 								                    if thinking_spinner:
 								                        thinking_spinner.stop(f"(╥_╥) error, retrying...")
 								                        thinking_spinner = None
-												fix: stop terminal border flashing with steady cursor and TUI spinner widget

Cherry-picked and improved from PR #470 (fixes #464).

Problem: On Ubuntu 24.04 with ghostty + tmux, the prompt input box
border lines flash due to cursor blink and raw spinner terminal writes
conflicting with prompt_toolkit's rendering.

Changes:
- cli.py: Add CursorShape.BLOCK to Application() to disable cursor blink
- cli.py: Add thinking_callback + spinner_widget in TUI layout so
  thinking status displays as a proper prompt_toolkit widget instead of
  raw terminal writes that conflict with the TUI renderer
- run_agent.py: Add thinking_callback parameter to AIAgent; when set,
  uses the callback instead of KawaiiSpinner for thinking display

What was NOT changed (preserving existing behavior):
- agent/display.py: Untouched. KawaiiSpinner _write() stdout capture,
  _animate() logic, and 0.12s frame interval all preserved. This
  protects subagent stdout redirection and keeps smooth animations
  for non-CLI contexts (gateway, batch runner).
- Original emoji spinner types (brain/sparkle/pulse/moon/star) preserved
  for all non-CLI contexts.

Fixes from original PR #470:
- CursorShape.STEADY_BLOCK -> CursorShape.BLOCK (STEADY_BLOCK doesn't
  exist in prompt_toolkit 3.0.52)
- Removed duplicate self._spinner_text = '' line
- Removed redundant nested if-checks

Tested: 2706 tests pass, interactive CLI verified via tmux.

											
										
										
											2026-03-09 23:26:43 -07:00
+								                    if self.thinking_callback:
 								                        self.thinking_callback("")
-												Harden Codex auth refresh and responses compatibility

											
										
										
											2026-02-25 19:27:54 -08:00
 								                    status_code = getattr(api_error, "status_code", None)
 								                    if (
 								                        self.api_mode == "codex_responses"
 								                        and self.provider == "openai-codex"
 								                        and status_code == 401
 								                        and not codex_auth_retry_attempted
 								                    ):
 								                        codex_auth_retry_attempted = True
 								                        if self._try_refresh_codex_client_credentials(force=True):
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}🔐 Codex auth refreshed after 401. Retrying request...")
-												Harden Codex auth refresh and responses compatibility

											
										
										
											2026-02-25 19:27:54 -08:00
+								                            continue
-												fix: implement Nous credential refresh on 401 error for retry logic

											
										
										
											2026-03-07 13:34:23 +11:00
+								                    if (
 								                        self.api_mode == "chat_completions"
 								                        and self.provider == "nous"
 								                        and status_code == 401
 								                        and not nous_auth_retry_attempted
 								                    ):
 								                        nous_auth_retry_attempted = True
 								                        if self._try_refresh_nous_client_credentials(force=True):
 								                            print(f"{self.log_prefix}🔐 Nous agent key refreshed after 401. Retrying request...")
 								                            continue
-												fix(anthropic): address gaps found in deep-dive audit

After studying clawdbot (OpenClaw) and OpenCode implementations:

## Beta headers
- Add interleaved-thinking-2025-05-14 and fine-grained-tool-streaming-2025-05-14
  as common betas (sent with ALL auth types, not just OAuth)
- OAuth tokens additionally get oauth-2025-04-20
- API keys now also get the common betas (previously got none)

## Vision/image support
- Add _convert_vision_content() to convert OpenAI multimodal format
  (image_url blocks) to Anthropic format (image blocks with base64/url source)
- Handles both data: URIs (base64) and regular URLs

## Role alternation enforcement
- Anthropic strictly rejects consecutive same-role messages (400 error)
- Add post-processing step that merges consecutive user/assistant messages
- Handles string, list, and mixed content types during merge

## Tool choice support
- Add tool_choice parameter to build_anthropic_kwargs()
- Maps OpenAI values: auto→auto, required→any, none→omit, name→tool

## Cache metrics tracking
- Anthropic uses cache_read_input_tokens / cache_creation_input_tokens
  (different from OpenRouter's prompt_tokens_details.cached_tokens)
- Add api_mode-aware branch in run_agent.py cache stats logging

## Credential refresh on 401
- On 401 error during anthropic_messages mode, re-read credentials
  via resolve_anthropic_token() (picks up refreshed Claude Code tokens)
- Rebuild client if new token differs from current one
- Follows same pattern as Codex/Nous 401 refresh handlers

## Tests
- 44 adapter tests (8 new: vision conversion, role alternation, tool choice)
- Updated beta header tests to verify new structure
- Full suite: 3198 passed, 0 regressions

											
										
										
											2026-03-12 16:00:46 -07:00
+								                    if (
 								                        self.api_mode == "anthropic_messages"
 								                        and status_code == 401
 								                        and hasattr(self, '_anthropic_api_key')
-												fix(anthropic): deep scan fixes — auth, retries, edge cases

Fixes from comprehensive code review and cross-referencing with
clawdbot/OpenCode implementations:

CRITICAL:
- Add one-shot guard (anthropic_auth_retry_attempted) to prevent
  infinite 401 retry loops when credentials keep changing
- Fix _is_oauth_token(): managed keys from ~/.claude.json are NOT
  regular API keys (don't start with sk-ant-api). Inverted the logic:
  only sk-ant-api* is treated as API key auth, everything else uses
  Bearer auth + oauth beta headers

HIGH:
- Wrap json.loads(args) in try/except in message conversion — malformed
  tool_call arguments no longer crash the entire conversation
- Raise AuthError in runtime_provider when no Anthropic token found
  (was silently passing empty string, causing confusing API errors)
- Remove broken _try_anthropic() from auxiliary vision chain — the
  centralized router creates an OpenAI client for api_key providers
  which doesn't work with Anthropic's Messages API

MEDIUM:
- Handle empty assistant message content — Anthropic rejects empty
  content blocks, now inserts '(empty)' placeholder
- Fix setup.py existing_key logic — set to 'KEEP' sentinel instead
  of None to prevent falling through to the auth choice prompt
- Add debug logging to _fetch_anthropic_models on failure

Tests: 43 adapter tests (2 new for token detection), 3197 total passed

											
										
										
											2026-03-12 17:14:22 -07:00
+								                        and not anthropic_auth_retry_attempted
-												fix(anthropic): address gaps found in deep-dive audit

After studying clawdbot (OpenClaw) and OpenCode implementations:

## Beta headers
- Add interleaved-thinking-2025-05-14 and fine-grained-tool-streaming-2025-05-14
  as common betas (sent with ALL auth types, not just OAuth)
- OAuth tokens additionally get oauth-2025-04-20
- API keys now also get the common betas (previously got none)

## Vision/image support
- Add _convert_vision_content() to convert OpenAI multimodal format
  (image_url blocks) to Anthropic format (image blocks with base64/url source)
- Handles both data: URIs (base64) and regular URLs

## Role alternation enforcement
- Anthropic strictly rejects consecutive same-role messages (400 error)
- Add post-processing step that merges consecutive user/assistant messages
- Handles string, list, and mixed content types during merge

## Tool choice support
- Add tool_choice parameter to build_anthropic_kwargs()
- Maps OpenAI values: auto→auto, required→any, none→omit, name→tool

## Cache metrics tracking
- Anthropic uses cache_read_input_tokens / cache_creation_input_tokens
  (different from OpenRouter's prompt_tokens_details.cached_tokens)
- Add api_mode-aware branch in run_agent.py cache stats logging

## Credential refresh on 401
- On 401 error during anthropic_messages mode, re-read credentials
  via resolve_anthropic_token() (picks up refreshed Claude Code tokens)
- Rebuild client if new token differs from current one
- Follows same pattern as Codex/Nous 401 refresh handlers

## Tests
- 44 adapter tests (8 new: vision conversion, role alternation, tool choice)
- Updated beta header tests to verify new structure
- Full suite: 3198 passed, 0 regressions

											
										
										
											2026-03-12 16:00:46 -07:00
+								                    ):
-												fix(anthropic): deep scan fixes — auth, retries, edge cases

Fixes from comprehensive code review and cross-referencing with
clawdbot/OpenCode implementations:

CRITICAL:
- Add one-shot guard (anthropic_auth_retry_attempted) to prevent
  infinite 401 retry loops when credentials keep changing
- Fix _is_oauth_token(): managed keys from ~/.claude.json are NOT
  regular API keys (don't start with sk-ant-api). Inverted the logic:
  only sk-ant-api* is treated as API key auth, everything else uses
  Bearer auth + oauth beta headers

HIGH:
- Wrap json.loads(args) in try/except in message conversion — malformed
  tool_call arguments no longer crash the entire conversation
- Raise AuthError in runtime_provider when no Anthropic token found
  (was silently passing empty string, causing confusing API errors)
- Remove broken _try_anthropic() from auxiliary vision chain — the
  centralized router creates an OpenAI client for api_key providers
  which doesn't work with Anthropic's Messages API

MEDIUM:
- Handle empty assistant message content — Anthropic rejects empty
  content blocks, now inserts '(empty)' placeholder
- Fix setup.py existing_key logic — set to 'KEEP' sentinel instead
  of None to prevent falling through to the auth choice prompt
- Add debug logging to _fetch_anthropic_models on failure

Tests: 43 adapter tests (2 new for token detection), 3197 total passed

											
										
										
											2026-03-12 17:14:22 -07:00
+								                        anthropic_auth_retry_attempted = True
-												fix: refresh Anthropic OAuth before stale env tokens

											
										
										
											2026-03-14 19:22:31 -07:00
+								                        from agent.anthropic_adapter import _is_oauth_token
 								                        if self._try_refresh_anthropic_client_credentials():
-												fix(anthropic): address gaps found in deep-dive audit

After studying clawdbot (OpenClaw) and OpenCode implementations:

## Beta headers
- Add interleaved-thinking-2025-05-14 and fine-grained-tool-streaming-2025-05-14
  as common betas (sent with ALL auth types, not just OAuth)
- OAuth tokens additionally get oauth-2025-04-20
- API keys now also get the common betas (previously got none)

## Vision/image support
- Add _convert_vision_content() to convert OpenAI multimodal format
  (image_url blocks) to Anthropic format (image blocks with base64/url source)
- Handles both data: URIs (base64) and regular URLs

## Role alternation enforcement
- Anthropic strictly rejects consecutive same-role messages (400 error)
- Add post-processing step that merges consecutive user/assistant messages
- Handles string, list, and mixed content types during merge

## Tool choice support
- Add tool_choice parameter to build_anthropic_kwargs()
- Maps OpenAI values: auto→auto, required→any, none→omit, name→tool

## Cache metrics tracking
- Anthropic uses cache_read_input_tokens / cache_creation_input_tokens
  (different from OpenRouter's prompt_tokens_details.cached_tokens)
- Add api_mode-aware branch in run_agent.py cache stats logging

## Credential refresh on 401
- On 401 error during anthropic_messages mode, re-read credentials
  via resolve_anthropic_token() (picks up refreshed Claude Code tokens)
- Rebuild client if new token differs from current one
- Follows same pattern as Codex/Nous 401 refresh handlers

## Tests
- 44 adapter tests (8 new: vision conversion, role alternation, tool choice)
- Updated beta header tests to verify new structure
- Full suite: 3198 passed, 0 regressions

											
										
										
											2026-03-12 16:00:46 -07:00
+								                            print(f"{self.log_prefix}🔐 Anthropic credentials refreshed after 401. Retrying request...")
 								                            continue
-												fix(anthropic): add diagnostic output on 401 auth failures

When Anthropic returns 401 and credential refresh doesn't help,
now prints actionable troubleshooting info:
- Which auth method was used (Bearer vs x-api-key)
- Token prefix for debugging
- Common fixes (stale ANTHROPIC_API_KEY, verify key, refresh login)
- How to clear stale keys

											
										
										
											2026-03-12 19:09:06 -07:00
+								                        # Credential refresh didn't help — show diagnostic info
 								                        key = self._anthropic_api_key
 								                        auth_method = "Bearer (OAuth/setup-token)" if _is_oauth_token(key) else "x-api-key (API key)"
 								                        print(f"{self.log_prefix}🔐 Anthropic 401 — authentication failed.")
 								                        print(f"{self.log_prefix}   Auth method: {auth_method}")
 								                        print(f"{self.log_prefix}   Token prefix: {key[:12]}..." if key and len(key) > 12 else f"{self.log_prefix}   Token: (empty or short)")
 								                        print(f"{self.log_prefix}   Troubleshooting:")
-												fix: separate Anthropic OAuth tokens from API keys

Persist OAuth/setup tokens in ANTHROPIC_TOKEN instead of ANTHROPIC_API_KEY.
Reserve ANTHROPIC_API_KEY for regular Console API keys.

Changes:
- anthropic_adapter: reorder resolve_anthropic_token() priority —
  ANTHROPIC_TOKEN first, ANTHROPIC_API_KEY as legacy fallback
- config: add save_anthropic_oauth_token() / save_anthropic_api_key() helpers
  that clear the opposing slot to prevent priority conflicts
- config: show_config() prefers ANTHROPIC_TOKEN for display
- setup: OAuth login and pasted setup-tokens write to ANTHROPIC_TOKEN
- setup: API key entry writes to ANTHROPIC_API_KEY and clears ANTHROPIC_TOKEN
- main: same fixes in _run_anthropic_oauth_flow() and _model_flow_anthropic()
- main: _has_any_provider_configured() checks ANTHROPIC_TOKEN
- doctor: use _is_oauth_token() for correct auth method validation
- runtime_provider: updated error message
- run_agent: simplified client init to use resolve_anthropic_token()
- run_agent: updated 401 troubleshooting messages
- status: prefer ANTHROPIC_TOKEN in status display
- tests: updated priority test, added persistence helper tests

Cherry-picked from PR #1141 by kshitijk4poor, rebased onto current main
with unrelated changes (web_policy config, blocklist CLI) removed.

Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>

											
										
										
											2026-03-13 02:09:52 -07:00
+								                        print(f"{self.log_prefix}     • Check ANTHROPIC_TOKEN in ~/.hermes/.env for Hermes-managed OAuth/setup tokens")
 								                        print(f"{self.log_prefix}     • Check ANTHROPIC_API_KEY in ~/.hermes/.env for API keys or legacy token values")
-												fix(anthropic): add diagnostic output on 401 auth failures

When Anthropic returns 401 and credential refresh doesn't help,
now prints actionable troubleshooting info:
- Which auth method was used (Bearer vs x-api-key)
- Token prefix for debugging
- Common fixes (stale ANTHROPIC_API_KEY, verify key, refresh login)
- How to clear stale keys

											
										
										
											2026-03-12 19:09:06 -07:00
+								                        print(f"{self.log_prefix}     • For API keys: verify at https://console.anthropic.com/settings/keys")
 								                        print(f"{self.log_prefix}     • For Claude Code: run 'claude /login' to refresh, then retry")
-												fix: separate Anthropic OAuth tokens from API keys

Persist OAuth/setup tokens in ANTHROPIC_TOKEN instead of ANTHROPIC_API_KEY.
Reserve ANTHROPIC_API_KEY for regular Console API keys.

Changes:
- anthropic_adapter: reorder resolve_anthropic_token() priority —
  ANTHROPIC_TOKEN first, ANTHROPIC_API_KEY as legacy fallback
- config: add save_anthropic_oauth_token() / save_anthropic_api_key() helpers
  that clear the opposing slot to prevent priority conflicts
- config: show_config() prefers ANTHROPIC_TOKEN for display
- setup: OAuth login and pasted setup-tokens write to ANTHROPIC_TOKEN
- setup: API key entry writes to ANTHROPIC_API_KEY and clears ANTHROPIC_TOKEN
- main: same fixes in _run_anthropic_oauth_flow() and _model_flow_anthropic()
- main: _has_any_provider_configured() checks ANTHROPIC_TOKEN
- doctor: use _is_oauth_token() for correct auth method validation
- runtime_provider: updated error message
- run_agent: simplified client init to use resolve_anthropic_token()
- run_agent: updated 401 troubleshooting messages
- status: prefer ANTHROPIC_TOKEN in status display
- tests: updated priority test, added persistence helper tests

Cherry-picked from PR #1141 by kshitijk4poor, rebased onto current main
with unrelated changes (web_policy config, blocklist CLI) removed.

Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>

											
										
										
											2026-03-13 02:09:52 -07:00
+								                        print(f"{self.log_prefix}     • Clear stale keys: hermes config set ANTHROPIC_TOKEN \"\"")
 								                        print(f"{self.log_prefix}     • Legacy cleanup: hermes config set ANTHROPIC_API_KEY \"\"")
-												feat(agent): enhance reasoning handling and configuration

Added support for processing encrypted reasoning content within the AIAgent class. Introduced logic to determine reasoning effort and enable/disable reasoning based on configuration settings. Updated the kwargs to reflect these changes, ensuring proper handling of reasoning parameters during agent execution.

											
										
										
											2026-03-01 16:15:20 -08:00
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								                    retry_count += 1
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
+								                    elapsed_time = time.time() - api_start_time
 								                    # Enhanced error logging
 								                    error_type = type(api_error).__name__
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								                    error_msg = str(api_error).lower()
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                    logger.warning(
 								                        "API call failed (attempt %s/%s) error_type=%s %s error=%s",
 								                        retry_count,
 								                        max_retries,
 								                        error_type,
 								                        self._client_log_context(),
 								                        api_error,
 								                    )
-												fix: address voice mode PR review (streaming TTS, prompt cache, _vprint)

Bug A: Replace stale _HAS_ELEVENLABS/_HAS_AUDIO boolean imports with
lazy import function calls (_import_elevenlabs, _import_sounddevice).
The old constants no longer exist in tts_tool -- the try/except
silently swallowed the ImportError, leaving streaming TTS dead.

Bug B: Use user message prefix instead of modifying system prompt for
voice mode instruction. Changing ephemeral_system_prompt mid-session
invalidates the prompt cache. Now the concise-response hint is
prepended to the user_message passed to run_conversation while
conversation_history keeps the original text.

Minor: Add force parameter to _vprint so critical error messages
(max retries, non-retryable errors, API failures) are always shown
even during streaming TTS playback.

Tests: 15 new tests in test_voice_cli_integration.py covering all
three fixes -- lazy import activation, message prefix behavior,
history cleanliness, system prompt stability, and AST verification
that all critical _vprint calls use force=True.

											
										
										
											2026-03-10 03:43:03 +03:00
+								                    self._vprint(f"{self.log_prefix}⚠️  API call failed (attempt {retry_count}/{max_retries}): {error_type}", force=True)
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                    self._vprint(f"{self.log_prefix}   ⏱️  Time elapsed before failure: {elapsed_time:.2f}s")
-												fix: address voice mode PR review (streaming TTS, prompt cache, _vprint)

Bug A: Replace stale _HAS_ELEVENLABS/_HAS_AUDIO boolean imports with
lazy import function calls (_import_elevenlabs, _import_sounddevice).
The old constants no longer exist in tts_tool -- the try/except
silently swallowed the ImportError, leaving streaming TTS dead.

Bug B: Use user message prefix instead of modifying system prompt for
voice mode instruction. Changing ephemeral_system_prompt mid-session
invalidates the prompt cache. Now the concise-response hint is
prepended to the user_message passed to run_conversation while
conversation_history keeps the original text.

Minor: Add force parameter to _vprint so critical error messages
(max retries, non-retryable errors, API failures) are always shown
even during streaming TTS playback.

Tests: 15 new tests in test_voice_cli_integration.py covering all
three fixes -- lazy import activation, message prefix behavior,
history cleanliness, system prompt stability, and AST verification
that all critical _vprint calls use force=True.

											
										
										
											2026-03-10 03:43:03 +03:00
+								                    self._vprint(f"{self.log_prefix}   📝 Error: {str(api_error)[:200]}", force=True)
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                    self._vprint(f"{self.log_prefix}   📊 Request context: {len(api_messages)} messages, ~{approx_tokens:,} tokens, {len(self.tools) if self.tools else 0} tools")
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								                    # Check for interrupt before deciding to retry
 								                    if self._interrupt_requested:
-												fix: harden web gateway security and fix error swallowing

- Use hmac.compare_digest for timing-safe token comparison (3 endpoints)
- Default bind to 127.0.0.1 instead of 0.0.0.0
- Sanitize upload filenames with Path.name to prevent path traversal
- Add DOMPurify to sanitize marked.parse() output against XSS
- Replace add_static with authenticated media handler
- Hide token in group chats for /remote-control command
- Use ctypes.util.find_library for Opus instead of hardcoded paths
- Add force=True to 5 interrupt _vprint calls for visibility
- Log Opus decode errors and voice restart failures instead of swallowing

											
										
										
											2026-03-13 15:29:18 +03:00
+								                        self._vprint(f"{self.log_prefix}⚡ Interrupt detected during error handling, aborting retries.", force=True)
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                        self._persist_session(messages, conversation_history)
-												Cherry-pick 6 bug fixes from PR #76 and update documentation

Code fixes (run_agent.py):
- Fix off-by-one in _flush_messages_to_session_db skipping one message per flush
- Add clear_interrupt() to 3 early-return paths preventing stale interrupt state
- Wrap handle_function_call in try/except so tool crashes don't kill the conversation
- Replace fragile `is` identity check with _flush_sentinel marker for memory flush cleanup
- Fix retry loop off-by-one (6 attempts not 7)
- Remove redundant inline `import re`

											
										
										
											2026-02-27 03:21:49 -08:00
+								                        self.clear_interrupt()
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								                        return {
-												fix: add context to interruption messages for model awareness

When the agent is interrupted, the model now receives descriptive
context instead of a generic 'Operation interrupted.' string:

- Tool skip messages include the tool name:
  '[Tool execution cancelled — terminal was skipped due to user interrupt]'
  '[Tool execution skipped — web_search was not started. User sent a new message]'

- API call interrupts include timing:
  'Operation interrupted: waiting for model response (4.2s elapsed).'

- Retry/error interrupts include retry context:
  'Operation interrupted: retrying API call after rate limit (retry 2/5).'
  'Operation interrupted: handling API error (Timeout: connection timed out).'

This helps the model understand what was happening when it was
interrupted, reducing wasted iterations spent re-discovering state.

											
										
										
											2026-03-08 18:58:23 -07:00
+								                            "final_response": f"Operation interrupted: handling API error ({error_type}: {str(api_error)[:80]}).",
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								                            "messages": messages,
 								                            "api_calls": api_call_count,
 								                            "completed": False,
 								                            "interrupted": True,
 								                        }
-												fix(agent): handle 413 payload-too-large via compression instead of aborting

The 413 "Request Entity Too Large" error from the LLM API was caught by the
generic 4xx handler which aborts immediately. This is wrong for 413 — it's a
payload-size issue that can be resolved by compressing conversation history.

- Intercept 413 before the generic 4xx block and route to _compress_context
- Exclude 413 from generic is_client_error detection
- Add 'request entity too large' to context-length phrases as safety net
- Add tests for 413 compression behavior

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

											
										
										
											2026-02-27 12:21:27 -05:00
+								                    # Check for 413 payload-too-large BEFORE generic 4xx handler.
 								                    # A 413 is a payload-size error — the correct response is to
 								                    # compress history and retry, not abort immediately.
 								                    status_code = getattr(api_error, "status_code", None)
 								                    is_payload_too_large = (
 								                        status_code == 413
 								                        or 'request entity too large' in error_msg
-												fix(agent): enhance 413 error handling and improve conversation history management in tests

											
										
										
											2026-02-27 23:04:32 -08:00
+								                        or 'payload too large' in error_msg
-												fix(agent): handle 413 payload-too-large via compression instead of aborting

The 413 "Request Entity Too Large" error from the LLM API was caught by the
generic 4xx handler which aborts immediately. This is wrong for 413 — it's a
payload-size issue that can be resolved by compressing conversation history.

- Intercept 413 before the generic 4xx block and route to _compress_context
- Exclude 413 from generic is_client_error detection
- Add 'request entity too large' to context-length phrases as safety net
- Add tests for 413 compression behavior

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

											
										
										
											2026-02-27 12:21:27 -05:00
+								                        or 'error code: 413' in error_msg
 								                    )
 								                    if is_payload_too_large:
-												fix: enhance payload and context compression handling

Added logic to manage multiple compression attempts for large payloads and context length errors. Introduced limits on compression attempts to prevent infinite retries, with appropriate logging and error handling. This ensures better resilience and user feedback when facing compression issues during API calls.

											
										
										
											2026-03-07 09:19:07 -08:00
+								                        compression_attempts += 1
 								                        if compression_attempts > max_compression_attempts:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached for payload-too-large error.", force=True)
-												fix: enhance payload and context compression handling

Added logic to manage multiple compression attempts for large payloads and context length errors. Introduced limits on compression attempts to prevent infinite retries, with appropriate logging and error handling. This ensures better resilience and user feedback when facing compression issues during API calls.

											
										
										
											2026-03-07 09:19:07 -08:00
+								                            logging.error(f"{self.log_prefix}413 compression failed after {max_compression_attempts} attempts.")
 								                            self._persist_session(messages, conversation_history)
 								                            return {
 								                                "messages": messages,
 								                                "completed": False,
 								                                "api_calls": api_call_count,
 								                                "error": f"Request payload too large: max compression attempts ({max_compression_attempts}) reached.",
 								                                "partial": True
 								                            }
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                        self._vprint(f"{self.log_prefix}⚠️  Request payload too large (413) — compression attempt {compression_attempts}/{max_compression_attempts}...")
-												fix(agent): handle 413 payload-too-large via compression instead of aborting

The 413 "Request Entity Too Large" error from the LLM API was caught by the
generic 4xx handler which aborts immediately. This is wrong for 413 — it's a
payload-size issue that can be resolved by compressing conversation history.

- Intercept 413 before the generic 4xx block and route to _compress_context
- Exclude 413 from generic is_client_error detection
- Add 'request entity too large' to context-length phrases as safety net
- Add tests for 413 compression behavior

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

											
										
										
											2026-02-27 12:21:27 -05:00
 								                        original_len = len(messages)
 								                        messages, active_system_prompt = self._compress_context(
-												fix: detect and warn on file re-read loops after context compression

When context compression summarizes conversation history, the agent
loses track of which files it already read and re-reads them in a loop.
Users report the agent reading the same files endlessly without writing.

Root cause: context compression is lossy — file contents and read history
are lost in the summary. After compression, the model thinks it hasn't
examined the files yet and reads them again.

Fix (two-part):
1. Track file reads per task in file_tools.py. When the same file region
   is read again, include a _warning in the response telling the model
   to stop re-reading and use existing information.
2. After context compression, inject a structured message listing all
   files already read in the session with explicit "do NOT re-read"
   instruction, preserving read history across compression boundaries.

Adds 16 tests covering warning detection, task isolation, summary
accuracy, tracker cleanup, and compression history injection.

											
										
										
											2026-03-08 20:44:42 +03:00
+								                            messages, system_message, approx_tokens=approx_tokens,
 								                            task_id=effective_task_id,
-												fix(agent): handle 413 payload-too-large via compression instead of aborting

The 413 "Request Entity Too Large" error from the LLM API was caught by the
generic 4xx handler which aborts immediately. This is wrong for 413 — it's a
payload-size issue that can be resolved by compressing conversation history.

- Intercept 413 before the generic 4xx block and route to _compress_context
- Exclude 413 from generic is_client_error detection
- Add 'request entity too large' to context-length phrases as safety net
- Add tests for 413 compression behavior

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

											
										
										
											2026-02-27 12:21:27 -05:00
+								                        )
 								                        if len(messages) < original_len:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}   🗜️  Compressed {original_len} → {len(messages)} messages, retrying...")
-												fix: enhance payload and context compression handling

Added logic to manage multiple compression attempts for large payloads and context length errors. Introduced limits on compression attempts to prevent infinite retries, with appropriate logging and error handling. This ensures better resilience and user feedback when facing compression issues during API calls.

											
										
										
											2026-03-07 09:19:07 -08:00
+								                            time.sleep(2)  # Brief pause between compression retries
-												fix: rebuild compressed payload before retry

											
										
										
											2026-03-07 15:13:45 -05:00
+								                            restart_with_compressed_messages = True
 								                            break
-												fix(agent): handle 413 payload-too-large via compression instead of aborting

The 413 "Request Entity Too Large" error from the LLM API was caught by the
generic 4xx handler which aborts immediately. This is wrong for 413 — it's a
payload-size issue that can be resolved by compressing conversation history.

- Intercept 413 before the generic 4xx block and route to _compress_context
- Exclude 413 from generic is_client_error detection
- Add 'request entity too large' to context-length phrases as safety net
- Add tests for 413 compression behavior

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

											
										
										
											2026-02-27 12:21:27 -05:00
+								                        else:
-												fix: address voice mode PR review (streaming TTS, prompt cache, _vprint)

Bug A: Replace stale _HAS_ELEVENLABS/_HAS_AUDIO boolean imports with
lazy import function calls (_import_elevenlabs, _import_sounddevice).
The old constants no longer exist in tts_tool -- the try/except
silently swallowed the ImportError, leaving streaming TTS dead.

Bug B: Use user message prefix instead of modifying system prompt for
voice mode instruction. Changing ephemeral_system_prompt mid-session
invalidates the prompt cache. Now the concise-response hint is
prepended to the user_message passed to run_conversation while
conversation_history keeps the original text.

Minor: Add force parameter to _vprint so critical error messages
(max retries, non-retryable errors, API failures) are always shown
even during streaming TTS playback.

Tests: 15 new tests in test_voice_cli_integration.py covering all
three fixes -- lazy import activation, message prefix behavior,
history cleanliness, system prompt stability, and AST verification
that all critical _vprint calls use force=True.

											
										
										
											2026-03-10 03:43:03 +03:00
+								                            self._vprint(f"{self.log_prefix}❌ Payload too large and cannot compress further.", force=True)
-												fix(agent): handle 413 payload-too-large via compression instead of aborting

The 413 "Request Entity Too Large" error from the LLM API was caught by the
generic 4xx handler which aborts immediately. This is wrong for 413 — it's a
payload-size issue that can be resolved by compressing conversation history.

- Intercept 413 before the generic 4xx block and route to _compress_context
- Exclude 413 from generic is_client_error detection
- Add 'request entity too large' to context-length phrases as safety net
- Add tests for 413 compression behavior

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

											
										
										
											2026-02-27 12:21:27 -05:00
+								                            logging.error(f"{self.log_prefix}413 payload too large. Cannot compress further.")
 								                            self._persist_session(messages, conversation_history)
 								                            return {
 								                                "messages": messages,
 								                                "completed": False,
 								                                "api_calls": api_call_count,
 								                                "error": "Request payload too large (413). Cannot compress further.",
 								                                "partial": True
 								                            }
-												fix: preflight context compression + error handler ordering for model switches

Two fixes for the case where a user switches to a model with a smaller
context window while having a large existing session:

1. Preflight compression in run_conversation(): Before the main loop,
   estimate tokens of loaded history + system prompt. If it exceeds the
   model's compression threshold (85% of context), compress proactively
   with up to 3 passes. This naturally handles model switches because
   the gateway creates a fresh AIAgent per message with the current
   model's context length.

2. Error handler reordering: Context-length errors (400 with 'maximum
   context length' etc.) are now checked BEFORE the generic 4xx handler.
   Previously, OpenRouter's 400-status context-length errors were caught
   as non-retryable client errors and aborted immediately, never reaching
   the compression+retry logic.

Reported by Sonicrida on Discord: 840-message session (2MB+) crashed
after switching from a large-context model to minimax via OpenRouter.

											
										
										
											2026-03-04 14:42:41 -08:00
+								                    # Check for context-length errors BEFORE generic 4xx handler.
-												Fix context overrun crash with local LLM backends (fixes #348)

Local backends (LM Studio, Ollama, llama.cpp) return HTTP 400
with messages like "Context size has been exceeded" when the
context window is full. The error phrase list did not include
"context size" or "context window", so these errors fell through
to the generic 4xx abort handler instead of triggering compression.

Changes:
- Move context-length check above generic 4xx handler so it runs
  first (same pattern as the existing 413 check)
- Add "context size" and "context window" to the phrase list
- Guard 4xx handler with `not is_context_length_error` to prevent
  context-related 400s from being treated as non-retryable

											
										
										
											2026-03-05 01:12:34 +03:00
+								                    # Local backends (LM Studio, Ollama, llama.cpp) often return
 								                    # HTTP 400 with messages like "Context size has been exceeded"
 								                    # which must trigger compression, not an immediate abort.
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								                    is_context_length_error = any(phrase in error_msg for phrase in [
-												Fix context overrun crash with local LLM backends (fixes #348)

Local backends (LM Studio, Ollama, llama.cpp) return HTTP 400
with messages like "Context size has been exceeded" when the
context window is full. The error phrase list did not include
"context size" or "context window", so these errors fell through
to the generic 4xx abort handler instead of triggering compression.

Changes:
- Move context-length check above generic 4xx handler so it runs
  first (same pattern as the existing 413 check)
- Add "context size" and "context window" to the phrase list
- Guard 4xx handler with `not is_context_length_error` to prevent
  context-related 400s from being treated as non-retryable

											
										
										
											2026-03-05 01:12:34 +03:00
+								                        'context length', 'context size', 'maximum context',
 								                        'token limit', 'too many tokens', 'reduce the length',
 								                        'exceeds the limit', 'context window',
-												fix(agent): handle 413 payload-too-large via compression instead of aborting

The 413 "Request Entity Too Large" error from the LLM API was caught by the
generic 4xx handler which aborts immediately. This is wrong for 413 — it's a
payload-size issue that can be resolved by compressing conversation history.

- Intercept 413 before the generic 4xx block and route to _compress_context
- Exclude 413 from generic is_client_error detection
- Add 'request entity too large' to context-length phrases as safety net
- Add tests for 413 compression behavior

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

											
										
										
											2026-02-27 12:21:27 -05:00
+								                        'request entity too large',  # OpenRouter/Nous 413 safety net
-												fix: proactive compression after large tool results + Anthropic error detection

Two fixes for context overflow handling:

1. Proactive compression after tool execution: The compression check now
   estimates the next prompt size using real token counts from the last API
   response (prompt_tokens + completion_tokens) plus a conservative estimate
   of newly appended tool results (chars // 3 for JSON-heavy content).
   Previously, should_compress() only checked last_prompt_tokens which
   didn't account for tool results — so a 130k prompt + 100k chars of tool
   output would pass the 140k threshold check but fail the 200k API limit.

2. Safety net: Added 'prompt is too long' to context-length error detection
   phrases. Anthropic returns 'prompt is too long: N tokens > M maximum'
   on HTTP 400, which wasn't matched by existing phrases. This ensures
   compression fires even if the proactive check underestimates.

Fixes #813

											
										
										
											2026-03-11 08:04:52 -07:00
+								                        'prompt is too long',  # Anthropic: "prompt is too long: N tokens > M maximum"
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								                    ])
 								                    if is_context_length_error:
-												feat: smart context length probing with persistent caching + banner display

Replaces the unsafe 128K fallback for unknown models with a descending
probe strategy (2M → 1M → 512K → 200K → 128K → 64K → 32K). When a
context-length error occurs, the agent steps down tiers and retries.
The discovered limit is cached per model+provider combo in
~/.hermes/context_length_cache.yaml so subsequent sessions skip probing.

Also parses API error messages to extract the actual context limit
(e.g. 'maximum context length is 32768 tokens') for instant resolution.

The CLI banner now displays the context window size next to the model
name (e.g. 'claude-opus-4 · 200K context · Nous Research').

Changes:
- agent/model_metadata.py: CONTEXT_PROBE_TIERS, persistent cache
  (save/load/get), parse_context_limit_from_error(), get_next_probe_tier()
- agent/context_compressor.py: accepts base_url, passes to metadata
- run_agent.py: step-down logic in context error handler, caches on success
- cli.py + hermes_cli/banner.py: context length in welcome banner
- tests: 22 new tests for probing, parsing, and caching

Addresses #132. PR #319's approach (8K default) rejected — too conservative.

											
										
										
											2026-03-05 16:09:57 -08:00
+								                        compressor = self.context_compressor
 								                        old_ctx = compressor.context_length
 								                        # Try to parse the actual limit from the error message
 								                        parsed_limit = parse_context_limit_from_error(error_msg)
 								                        if parsed_limit and parsed_limit < old_ctx:
 								                            new_ctx = parsed_limit
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}⚠️  Context limit detected from API: {new_ctx:,} tokens (was {old_ctx:,})", force=True)
-												feat: smart context length probing with persistent caching + banner display

Replaces the unsafe 128K fallback for unknown models with a descending
probe strategy (2M → 1M → 512K → 200K → 128K → 64K → 32K). When a
context-length error occurs, the agent steps down tiers and retries.
The discovered limit is cached per model+provider combo in
~/.hermes/context_length_cache.yaml so subsequent sessions skip probing.

Also parses API error messages to extract the actual context limit
(e.g. 'maximum context length is 32768 tokens') for instant resolution.

The CLI banner now displays the context window size next to the model
name (e.g. 'claude-opus-4 · 200K context · Nous Research').

Changes:
- agent/model_metadata.py: CONTEXT_PROBE_TIERS, persistent cache
  (save/load/get), parse_context_limit_from_error(), get_next_probe_tier()
- agent/context_compressor.py: accepts base_url, passes to metadata
- run_agent.py: step-down logic in context error handler, caches on success
- cli.py + hermes_cli/banner.py: context length in welcome banner
- tests: 22 new tests for probing, parsing, and caching

Addresses #132. PR #319's approach (8K default) rejected — too conservative.

											
										
										
											2026-03-05 16:09:57 -08:00
+								                        else:
 								                            # Step down to the next probe tier
 								                            new_ctx = get_next_probe_tier(old_ctx)
 								                        if new_ctx and new_ctx < old_ctx:
 								                            compressor.context_length = new_ctx
 								                            compressor.threshold_tokens = int(new_ctx * compressor.threshold_percent)
 								                            compressor._context_probed = True
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}⚠️  Context length exceeded — stepping down: {old_ctx:,} → {new_ctx:,} tokens", force=True)
-												feat: smart context length probing with persistent caching + banner display

Replaces the unsafe 128K fallback for unknown models with a descending
probe strategy (2M → 1M → 512K → 200K → 128K → 64K → 32K). When a
context-length error occurs, the agent steps down tiers and retries.
The discovered limit is cached per model+provider combo in
~/.hermes/context_length_cache.yaml so subsequent sessions skip probing.

Also parses API error messages to extract the actual context limit
(e.g. 'maximum context length is 32768 tokens') for instant resolution.

The CLI banner now displays the context window size next to the model
name (e.g. 'claude-opus-4 · 200K context · Nous Research').

Changes:
- agent/model_metadata.py: CONTEXT_PROBE_TIERS, persistent cache
  (save/load/get), parse_context_limit_from_error(), get_next_probe_tier()
- agent/context_compressor.py: accepts base_url, passes to metadata
- run_agent.py: step-down logic in context error handler, caches on success
- cli.py + hermes_cli/banner.py: context length in welcome banner
- tests: 22 new tests for probing, parsing, and caching

Addresses #132. PR #319's approach (8K default) rejected — too conservative.

											
										
										
											2026-03-05 16:09:57 -08:00
+								                        else:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}⚠️  Context length exceeded at minimum tier — attempting compression...", force=True)
-												Fix context overrun crash with local LLM backends (fixes #348)

Local backends (LM Studio, Ollama, llama.cpp) return HTTP 400
with messages like "Context size has been exceeded" when the
context window is full. The error phrase list did not include
"context size" or "context window", so these errors fell through
to the generic 4xx abort handler instead of triggering compression.

Changes:
- Move context-length check above generic 4xx handler so it runs
  first (same pattern as the existing 413 check)
- Add "context size" and "context window" to the phrase list
- Guard 4xx handler with `not is_context_length_error` to prevent
  context-related 400s from being treated as non-retryable

											
										
										
											2026-03-05 01:12:34 +03:00
-												fix: enhance payload and context compression handling

Added logic to manage multiple compression attempts for large payloads and context length errors. Introduced limits on compression attempts to prevent infinite retries, with appropriate logging and error handling. This ensures better resilience and user feedback when facing compression issues during API calls.

											
										
										
											2026-03-07 09:19:07 -08:00
+								                        compression_attempts += 1
 								                        if compression_attempts > max_compression_attempts:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}❌ Max compression attempts ({max_compression_attempts}) reached.", force=True)
-												fix: enhance payload and context compression handling

Added logic to manage multiple compression attempts for large payloads and context length errors. Introduced limits on compression attempts to prevent infinite retries, with appropriate logging and error handling. This ensures better resilience and user feedback when facing compression issues during API calls.

											
										
										
											2026-03-07 09:19:07 -08:00
+								                            logging.error(f"{self.log_prefix}Context compression failed after {max_compression_attempts} attempts.")
 								                            self._persist_session(messages, conversation_history)
 								                            return {
 								                                "messages": messages,
 								                                "completed": False,
 								                                "api_calls": api_call_count,
 								                                "error": f"Context length exceeded: max compression attempts ({max_compression_attempts}) reached.",
 								                                "partial": True
 								                            }
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                        self._vprint(f"{self.log_prefix}   🗜️  Context compression attempt {compression_attempts}/{max_compression_attempts}...")
-												fix: enhance payload and context compression handling

Added logic to manage multiple compression attempts for large payloads and context length errors. Introduced limits on compression attempts to prevent infinite retries, with appropriate logging and error handling. This ensures better resilience and user feedback when facing compression issues during API calls.

											
										
										
											2026-03-07 09:19:07 -08:00
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
+								                        original_len = len(messages)
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                        messages, active_system_prompt = self._compress_context(
-												fix: detect and warn on file re-read loops after context compression

When context compression summarizes conversation history, the agent
loses track of which files it already read and re-reads them in a loop.
Users report the agent reading the same files endlessly without writing.

Root cause: context compression is lossy — file contents and read history
are lost in the summary. After compression, the model thinks it hasn't
examined the files yet and reads them again.

Fix (two-part):
1. Track file reads per task in file_tools.py. When the same file region
   is read again, include a _warning in the response telling the model
   to stop re-reading and use existing information.
2. After context compression, inject a structured message listing all
   files already read in the session with explicit "do NOT re-read"
   instruction, preserving read history across compression boundaries.

Adds 16 tests covering warning detection, task isolation, summary
accuracy, tracker cleanup, and compression history injection.

											
										
										
											2026-03-08 20:44:42 +03:00
+								                            messages, system_message, approx_tokens=approx_tokens,
 								                            task_id=effective_task_id,
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                        )
-												Fix context overrun crash with local LLM backends (fixes #348)

Local backends (LM Studio, Ollama, llama.cpp) return HTTP 400
with messages like "Context size has been exceeded" when the
context window is full. The error phrase list did not include
"context size" or "context window", so these errors fell through
to the generic 4xx abort handler instead of triggering compression.

Changes:
- Move context-length check above generic 4xx handler so it runs
  first (same pattern as the existing 413 check)
- Add "context size" and "context window" to the phrase list
- Guard 4xx handler with `not is_context_length_error` to prevent
  context-related 400s from being treated as non-retryable

											
										
										
											2026-03-05 01:12:34 +03:00
-												feat: smart context length probing with persistent caching + banner display

Replaces the unsafe 128K fallback for unknown models with a descending
probe strategy (2M → 1M → 512K → 200K → 128K → 64K → 32K). When a
context-length error occurs, the agent steps down tiers and retries.
The discovered limit is cached per model+provider combo in
~/.hermes/context_length_cache.yaml so subsequent sessions skip probing.

Also parses API error messages to extract the actual context limit
(e.g. 'maximum context length is 32768 tokens') for instant resolution.

The CLI banner now displays the context window size next to the model
name (e.g. 'claude-opus-4 · 200K context · Nous Research').

Changes:
- agent/model_metadata.py: CONTEXT_PROBE_TIERS, persistent cache
  (save/load/get), parse_context_limit_from_error(), get_next_probe_tier()
- agent/context_compressor.py: accepts base_url, passes to metadata
- run_agent.py: step-down logic in context error handler, caches on success
- cli.py + hermes_cli/banner.py: context length in welcome banner
- tests: 22 new tests for probing, parsing, and caching

Addresses #132. PR #319's approach (8K default) rejected — too conservative.

											
										
										
											2026-03-05 16:09:57 -08:00
+								                        if len(messages) < original_len or new_ctx and new_ctx < old_ctx:
 								                            if len(messages) < original_len:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                                self._vprint(f"{self.log_prefix}   🗜️  Compressed {original_len} → {len(messages)} messages, retrying...")
-												fix: enhance payload and context compression handling

Added logic to manage multiple compression attempts for large payloads and context length errors. Introduced limits on compression attempts to prevent infinite retries, with appropriate logging and error handling. This ensures better resilience and user feedback when facing compression issues during API calls.

											
										
										
											2026-03-07 09:19:07 -08:00
+								                            time.sleep(2)  # Brief pause between compression retries
-												fix: rebuild compressed payload before retry

											
										
										
											2026-03-07 15:13:45 -05:00
+								                            restart_with_compressed_messages = True
 								                            break
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
+								                        else:
-												feat: smart context length probing with persistent caching + banner display

Replaces the unsafe 128K fallback for unknown models with a descending
probe strategy (2M → 1M → 512K → 200K → 128K → 64K → 32K). When a
context-length error occurs, the agent steps down tiers and retries.
The discovered limit is cached per model+provider combo in
~/.hermes/context_length_cache.yaml so subsequent sessions skip probing.

Also parses API error messages to extract the actual context limit
(e.g. 'maximum context length is 32768 tokens') for instant resolution.

The CLI banner now displays the context window size next to the model
name (e.g. 'claude-opus-4 · 200K context · Nous Research').

Changes:
- agent/model_metadata.py: CONTEXT_PROBE_TIERS, persistent cache
  (save/load/get), parse_context_limit_from_error(), get_next_probe_tier()
- agent/context_compressor.py: accepts base_url, passes to metadata
- run_agent.py: step-down logic in context error handler, caches on success
- cli.py + hermes_cli/banner.py: context length in welcome banner
- tests: 22 new tests for probing, parsing, and caching

Addresses #132. PR #319's approach (8K default) rejected — too conservative.

											
										
										
											2026-03-05 16:09:57 -08:00
+								                            # Can't compress further and already at minimum tier
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}❌ Context length exceeded and cannot compress further.", force=True)
 								                            self._vprint(f"{self.log_prefix}   💡 The conversation has accumulated too much content.", force=True)
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
+								                            logging.error(f"{self.log_prefix}Context length exceeded: {approx_tokens:,} tokens. Cannot compress further.")
-												Fix session saving to DB: persist the full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                            self._persist_session(messages, conversation_history)
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced the `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
+								                            return {
 								                                "messages": messages,
 								                                "completed": False,
 								                                "api_calls": api_call_count,
 								                                "error": f"Context length exceeded ({approx_tokens:,} tokens). Cannot compress further.",
 								                                "partial": True
 								                            }
-												fix: preflight context compression + error handler ordering for model switches

Two fixes for the case where a user switches to a model with a smaller
context window while having a large existing session:

1. Preflight compression in run_conversation(): Before the main loop,
   estimate tokens of loaded history + system prompt. If it exceeds the
   model's compression threshold (85% of context), compress proactively
   with up to 3 passes. This naturally handles model switches because
   the gateway creates a fresh AIAgent per message with the current
   model's context length.

2. Error handler reordering: Context-length errors (400 with 'maximum
   context length' etc.) are now checked BEFORE the generic 4xx handler.
   Previously, OpenRouter's 400-status context-length errors were caught
   as non-retryable client errors and aborted immediately, never reaching
   the compression+retry logic.

Reported by Sonicrida on Discord: 840-message session (2MB+) crashed
after switching from a large-context model to minimax via OpenRouter.

											
										
										
											2026-03-04 14:42:41 -08:00
 								                    # Check for non-retryable client errors (4xx HTTP status codes).
 								                    # These indicate a problem with the request itself (bad model ID,
 								                    # invalid API key, forbidden, etc.) and will never succeed on retry.
-												Fix context overrun crash with local LLM backends (fixes #348)

Local backends (LM Studio, Ollama, llama.cpp) return HTTP 400
with messages like "Context size has been exceeded" when the
context window is full. The error phrase list did not include
"context size" or "context window", so these errors fell through
to the generic 4xx abort handler instead of triggering compression.

Changes:
- Move context-length check above generic 4xx handler so it runs
  first (same pattern as the existing 413 check)
- Add "context size" and "context window" to the phrase list
- Guard 4xx handler with `not is_context_length_error` to prevent
  context-related 400s from being treated as non-retryable

											
										
										
											2026-03-05 01:12:34 +03:00
+								                    # Note: 413 and context-length errors are excluded — handled above.
-												fix: reduce max_retries to 3 and make ValueError/TypeError non-retryable

- max_retries reduced from 6 to 3 — 6 retries with exponential backoff
  could stall for ~275s total on persistent errors
- ValueError and TypeError now detected as non-retryable client errors
  and abort immediately instead of being retried with backoff (these are
  local validation/programming errors that will never succeed on retry)

											
										
										
											2026-03-11 07:04:46 -07:00
+								                    # Also catch local validation errors (ValueError, TypeError) — these
 								                    # are programming bugs, not transient failures.
 								                    is_local_validation_error = isinstance(api_error, (ValueError, TypeError))
-												fix: preflight context compression + error handler ordering for model switches

Two fixes for the case where a user switches to a model with a smaller
context window while having a large existing session:

1. Preflight compression in run_conversation(): Before the main loop,
   estimate tokens of loaded history + system prompt. If it exceeds the
   model's compression threshold (85% of context), compress proactively
   with up to 3 passes. This naturally handles model switches because
   the gateway creates a fresh AIAgent per message with the current
   model's context length.

2. Error handler reordering: Context-length errors (400 with 'maximum
   context length' etc.) are now checked BEFORE the generic 4xx handler.
   Previously, OpenRouter's 400-status context-length errors were caught
   as non-retryable client errors and aborted immediately, never reaching
   the compression+retry logic.

Reported by Sonicrida on Discord: 840-message session (2MB+) crashed
after switching from a large-context model to minimax via OpenRouter.

											
										
										
											2026-03-04 14:42:41 -08:00
+								                    is_client_status_error = isinstance(status_code, int) and 400 <= status_code < 500 and status_code != 413
-												fix: reduce max_retries to 3 and make ValueError/TypeError non-retryable

- max_retries reduced from 6 to 3 — 6 retries with exponential backoff
  could stall for ~275s total on persistent errors
- ValueError and TypeError now detected as non-retryable client errors
  and abort immediately instead of being retried with backoff (these are
  local validation/programming errors that will never succeed on retry)

											
										
										
											2026-03-11 07:04:46 -07:00
+								                    is_client_error = (is_local_validation_error or is_client_status_error or any(phrase in error_msg for phrase in [
-												Fix context overrun crash with local LLM backends (fixes #348)

Local backends (LM Studio, Ollama, llama.cpp) return HTTP 400
with messages like "Context size has been exceeded" when the
context window is full. The error phrase list did not include
"context size" or "context window", so these errors fell through
to the generic 4xx abort handler instead of triggering compression.

Changes:
- Move context-length check above generic 4xx handler so it runs
  first (same pattern as the existing 413 check)
- Add "context size" and "context window" to the phrase list
- Guard 4xx handler with `not is_context_length_error` to prevent
  context-related 400s from being treated as non-retryable

											
										
										
											2026-03-05 01:12:34 +03:00
+								                        'error code: 401', 'error code: 403',
-												fix: preflight context compression + error handler ordering for model switches

Two fixes for the case where a user switches to a model with a smaller
context window while having a large existing session:

1. Preflight compression in run_conversation(): Before the main loop,
   estimate tokens of loaded history + system prompt. If it exceeds the
   model's compression threshold (85% of context), compress proactively
   with up to 3 passes. This naturally handles model switches because
   the gateway creates a fresh AIAgent per message with the current
   model's context length.

2. Error handler reordering: Context-length errors (400 with 'maximum
   context length' etc.) are now checked BEFORE the generic 4xx handler.
   Previously, OpenRouter's 400-status context-length errors were caught
   as non-retryable client errors and aborted immediately, never reaching
   the compression+retry logic.

Reported by Sonicrida on Discord: 840-message session (2MB+) crashed
after switching from a large-context model to minimax via OpenRouter.

											
										
										
											2026-03-04 14:42:41 -08:00
+								                        'error code: 404', 'error code: 422',
 								                        'is not a valid model', 'invalid model', 'model not found',
 								                        'invalid api key', 'invalid_api_key', 'authentication',
 								                        'unauthorized', 'forbidden', 'not found',
 								                    ])) and not is_context_length_error
 								                    if is_client_error:
-												feat: simple fallback model for provider resilience

When the primary model/provider fails after retries (rate limit, overload,
auth errors, connection failures), Hermes automatically switches to a
configured fallback model for the remainder of the session.

Config (in ~/.hermes/config.yaml):

  fallback_model:
    provider: openrouter
    model: anthropic/claude-sonnet-4

Supports all major providers: OpenRouter, OpenAI, Nous, DeepSeek, Together,
Groq, Fireworks, Mistral, Gemini — plus custom endpoints via base_url and
api_key_env overrides.

Design principles:
- Dead simple: one fallback model, not a chain
- One-shot: switches once, doesn't ping-pong back
- Zero new dependencies: uses existing OpenAI client
- Minimal code: ~100 lines in run_agent.py, ~5 lines in cli.py/gateway
- Three trigger points: max retries exhausted, non-retryable client errors,
  and invalid response exhaustion

Does NOT trigger on context overflow or payload-too-large errors (those
are handled by the existing compression system).

Addresses #737.

25 new tests, 2492 total passing.

											
										
										
											2026-03-08 20:22:33 -07:00
+								                        # Try fallback before aborting — a different provider
 								                        # may not have the same issue (rate limit, auth, etc.)
 								                        if self._try_activate_fallback():
 								                            retry_count = 0
 								                            continue
-												fix: preflight context compression + error handler ordering for model switches

Two fixes for the case where a user switches to a model with a smaller
context window while having a large existing session:

1. Preflight compression in run_conversation(): Before the main loop,
   estimate tokens of loaded history + system prompt. If it exceeds the
   model's compression threshold (85% of context), compress proactively
   with up to 3 passes. This naturally handles model switches because
   the gateway creates a fresh AIAgent per message with the current
   model's context length.

2. Error handler reordering: Context-length errors (400 with 'maximum
   context length' etc.) are now checked BEFORE the generic 4xx handler.
   Previously, OpenRouter's 400-status context-length errors were caught
   as non-retryable client errors and aborted immediately, never reaching
   the compression+retry logic.

Reported by Sonicrida on Discord: 840-message session (2MB+) crashed
after switching from a large-context model to minimax via OpenRouter.

											
										
										
											2026-03-04 14:42:41 -08:00
+								                        self._dump_api_request_debug(
 								                            api_kwargs, reason="non_retryable_client_error", error=api_error,
 								                        )
-												fix: address voice mode PR review (streaming TTS, prompt cache, _vprint)

Bug A: Replace stale _HAS_ELEVENLABS/_HAS_AUDIO boolean imports with
lazy import function calls (_import_elevenlabs, _import_sounddevice).
The old constants no longer exist in tts_tool -- the try/except
silently swallowed the ImportError, leaving streaming TTS dead.

Bug B: Use user message prefix instead of modifying system prompt for
voice mode instruction. Changing ephemeral_system_prompt mid-session
invalidates the prompt cache. Now the concise-response hint is
prepended to the user_message passed to run_conversation while
conversation_history keeps the original text.

Minor: Add force parameter to _vprint so critical error messages
(max retries, non-retryable errors, API failures) are always shown
even during streaming TTS playback.

Tests: 15 new tests in test_voice_cli_integration.py covering all
three fixes -- lazy import activation, message prefix behavior,
history cleanliness, system prompt stability, and AST verification
that all critical _vprint calls use force=True.

											
										
										
											2026-03-10 03:43:03 +03:00
+								                        self._vprint(f"{self.log_prefix}❌ Non-retryable client error detected. Aborting immediately.", force=True)
 								                        self._vprint(f"{self.log_prefix}   💡 This type of error won't be fixed by retrying.", force=True)
-												fix: preflight context compression + error handler ordering for model switches

Two fixes for the case where a user switches to a model with a smaller
context window while having a large existing session:

1. Preflight compression in run_conversation(): Before the main loop,
   estimate tokens of loaded history + system prompt. If it exceeds the
   model's compression threshold (85% of context), compress proactively
   with up to 3 passes. This naturally handles model switches because
   the gateway creates a fresh AIAgent per message with the current
   model's context length.

2. Error handler reordering: Context-length errors (400 with 'maximum
   context length' etc.) are now checked BEFORE the generic 4xx handler.
   Previously, OpenRouter's 400-status context-length errors were caught
   as non-retryable client errors and aborted immediately, never reaching
   the compression+retry logic.

Reported by Sonicrida on Discord: 840-message session (2MB+) crashed
after switching from a large-context model to minimax via OpenRouter.

											
										
										
											2026-03-04 14:42:41 -08:00
+								                        logging.error(f"{self.log_prefix}Non-retryable client error: {api_error}")
 								                        self._persist_session(messages, conversation_history)
 								                        return {
 								                            "final_response": None,
 								                            "messages": messages,
 								                            "api_calls": api_call_count,
 								                            "completed": False,
 								                            "failed": True,
 								                            "error": str(api_error),
 								                        }
-												Fix context overrun crash with local LLM backends (fixes #348)

Local backends (LM Studio, Ollama, llama.cpp) return HTTP 400
with messages like "Context size has been exceeded" when the
context window is full. The error phrase list did not include
"context size" or "context window", so these errors fell through
to the generic 4xx abort handler instead of triggering compression.

Changes:
- Move context-length check above generic 4xx handler so it runs
  first (same pattern as the existing 413 check)
- Add "context size" and "context window" to the phrase list
- Guard 4xx handler with `not is_context_length_error` to prevent
  context-related 400s from being treated as non-retryable

											
										
										
											2026-03-05 01:12:34 +03:00
-												fix: correct off-by-one in retry exhaustion checks

The retry exhaustion checks used > instead of >= to compare
retry_count against max_retries. Since the while loop condition is
retry_count < max_retries, the check retry_count > max_retries can
never be true inside the loop. When retries are exhausted, the loop
exits and falls through to response.choices[0] on an invalid response,
crashing with IndexError instead of returning a proper error.

											
										
										
											2026-03-01 02:27:26 +03:00
+								                    if retry_count >= max_retries:
-												feat: simple fallback model for provider resilience

When the primary model/provider fails after retries (rate limit, overload,
auth errors, connection failures), Hermes automatically switches to a
configured fallback model for the remainder of the session.

Config (in ~/.hermes/config.yaml):

  fallback_model:
    provider: openrouter
    model: anthropic/claude-sonnet-4

Supports all major providers: OpenRouter, OpenAI, Nous, DeepSeek, Together,
Groq, Fireworks, Mistral, Gemini — plus custom endpoints via base_url and
api_key_env overrides.

Design principles:
- Dead simple: one fallback model, not a chain
- One-shot: switches once, doesn't ping-pong back
- Zero new dependencies: uses existing OpenAI client
- Minimal code: ~100 lines in run_agent.py, ~5 lines in cli.py/gateway
- Three trigger points: max retries exhausted, non-retryable client errors,
  and invalid response exhaustion

Does NOT trigger on context overflow or payload-too-large errors (those
are handled by the existing compression system).

Addresses #737.

25 new tests, 2492 total passing.

											
										
										
											2026-03-08 20:22:33 -07:00
+								                        # Try fallback before giving up entirely
 								                        if self._try_activate_fallback():
 								                            retry_count = 0
 								                            continue
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                        self._vprint(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded. Giving up.", force=True)
-												updates for stability and speed

											
										
										
											2026-01-08 08:57:51 +00:00
+								                        logging.error(f"{self.log_prefix}API call failed after {max_retries} retries. Last error: {api_error}")
 								                        logging.error(f"{self.log_prefix}Request details - Messages: {len(api_messages)}, Approx tokens: {approx_tokens:,}")
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								                        raise api_error
-												rate limits

											
										
										
											2025-11-17 18:35:36 -05:00
 								                    wait_time = min(2 ** retry_count, 60)  # Exponential backoff: 2s, 4s, 8s, 16s, 32s, 60s, 60s
-												fix: prevent closed OpenAI client reuse across retries

Use per-request OpenAI clients inside _interruptible_api_call so interrupts and transport failures do not poison later retries. Also add closed-client detection/recreation for the shared client and regression tests covering retry and concurrency behavior.

											
										
										
											2026-03-14 04:13:05 +00:00
+								                    logger.warning(
 								                        "Retrying API call in %ss (attempt %s/%s) %s error=%s",
 								                        wait_time,
 								                        retry_count,
 								                        max_retries,
 								                        self._client_log_context(),
 								                        api_error,
 								                    )
-												fix(cli): increase max iterations for child agents and extend API call timeout for improved reliability

											
										
										
											2026-02-27 17:35:26 -08:00
+								                    if retry_count >= max_retries:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                        self._vprint(f"{self.log_prefix}⚠️  API call failed after {retry_count} attempts: {str(api_error)[:100]}")
 								                        self._vprint(f"{self.log_prefix}⏳ Final retry in {wait_time}s...")
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
 								                    # Sleep in small increments so we can respond to interrupts quickly
 								                    # instead of blocking the entire wait_time in one sleep() call
 								                    sleep_end = time.time() + wait_time
 								                    while time.time() < sleep_end:
 								                        if self._interrupt_requested:
-												fix: harden web gateway security and fix error swallowing

- Use hmac.compare_digest for timing-safe token comparison (3 endpoints)
- Default bind to 127.0.0.1 instead of 0.0.0.0
- Sanitize upload filenames with Path.name to prevent path traversal
- Add DOMPurify to sanitize marked.parse() output against XSS
- Replace add_static with authenticated media handler
- Hide token in group chats for /remote-control command
- Use ctypes.util.find_library for Opus instead of hardcoded paths
- Add force=True to 5 interrupt _vprint calls for visibility
- Log Opus decode errors and voice restart failures instead of swallowing

											
										
										
											2026-03-13 15:29:18 +03:00
+								                            self._vprint(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.", force=True)
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                            self._persist_session(messages, conversation_history)
-												Cherry-pick 6 bug fixes from PR #76 and update documentation

Code fixes (run_agent.py):
- Fix off-by-one in _flush_messages_to_session_db skipping one message per flush
- Add clear_interrupt() to 3 early-return paths preventing stale interrupt state
- Wrap handle_function_call in try/except so tool crashes don't kill the conversation
- Replace fragile `is` identity check with _flush_sentinel marker for memory flush cleanup
- Fix retry loop off-by-one (6 attempts not 7)
- Remove redundant inline `import re`

											
										
										
											2026-02-27 03:21:49 -08:00
+								                            self.clear_interrupt()
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								                            return {
-												fix: add context to interruption messages for model awareness

When the agent is interrupted, the model now receives descriptive
context instead of a generic 'Operation interrupted.' string:

- Tool skip messages include the tool name:
  '[Tool execution cancelled — terminal was skipped due to user interrupt]'
  '[Tool execution skipped — web_search was not started. User sent a new message]'

- API call interrupts include timing:
  'Operation interrupted: waiting for model response (4.2s elapsed).'

- Retry/error interrupts include retry context:
  'Operation interrupted: retrying API call after rate limit (retry 2/5).'
  'Operation interrupted: handling API error (Timeout: connection timed out).'

This helps the model understand what was happening when it was
interrupted, reducing wasted iterations spent re-discovering state.

											
										
										
											2026-03-08 18:58:23 -07:00
+								                                "final_response": f"Operation interrupted: retrying API call after error (retry {retry_count}/{max_retries}).",
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								                                "messages": messages,
 								                                "api_calls": api_call_count,
 								                                "completed": False,
 								                                "interrupted": True,
 								                            }
 								                        time.sleep(0.2)  # Check interrupt every 200ms
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
-												feat: implement interactive prompts for sudo password and command approval in CLI

- Added methods for handling sudo password and dangerous command approval prompts using a callback mechanism in cli.py.
- Integrated these prompts with the prompt_toolkit UI for improved user experience.
- Updated terminal_tool.py to support callback registration for interactive prompts, enhancing the CLI's interactivity.
- Introduced a background thread for API calls in run_agent.py to allow for interrupt handling during long-running operations.
- Enhanced error handling for interrupted API calls, ensuring graceful degradation of user experience.

											
										
										
											2026-02-21 12:15:40 -08:00
+								            # If the API call was interrupted, skip response processing
 								            if interrupted:
 								                break
-												fix: rebuild compressed payload before retry

											
										
										
											2026-03-07 15:13:45 -05:00
+								            if restart_with_compressed_messages:
 								                api_call_count -= 1
 								                self.iteration_budget.refund()
 								                continue
-												fix: continue after output-length truncation

											
										
										
											2026-03-07 18:45:17 -05:00
+								            if restart_with_length_continuation:
 								                continue
-												fix: enhance payload and context compression handling

Added logic to manage multiple compression attempts for large payloads and context length errors. Introduced limits on compression attempts to prevent infinite retries, with appropriate logging and error handling. This ensures better resilience and user feedback when facing compression issues during API calls.

											
										
										
											2026-03-07 09:19:07 -08:00
+								            # Guard: if all retries exhausted without a successful response
 								            # (e.g. repeated context-length errors that exhausted retry_count),
 								            # the `response` variable is still None. Break out cleanly.
 								            if response is None:
 								                print(f"{self.log_prefix}❌ All API retries exhausted with no successful response.")
 								                self._persist_session(messages, conversation_history)
 								                break
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								            try:
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                if self.api_mode == "codex_responses":
 								                    assistant_message, finish_reason = self._normalize_codex_response(response)
-												feat: native Anthropic provider with Claude Code credential auto-discovery

Add Anthropic as a first-class inference provider, bypassing OpenRouter
for direct API access. Uses the native Anthropic SDK with a full format
adapter (same pattern as the codex_responses api_mode).

## Auth (three methods, priority order)
1. ANTHROPIC_API_KEY env var (regular API key, sk-ant-api-*)
2. ANTHROPIC_TOKEN / CLAUDE_CODE_OAUTH_TOKEN env var (setup-token, sk-ant-oat-*)
3. Auto-discovery from ~/.claude/.credentials.json (Claude Code subscription)
   - Reads Claude Code's OAuth credentials
   - Checks token expiry with 60s buffer
   - Setup tokens use Bearer auth + anthropic-beta: oauth-2025-04-20 header
   - Regular API keys use standard x-api-key header

## Changes by file

### New files
- agent/anthropic_adapter.py — Client builder, message/tool/response
  format conversion, Claude Code credential reader, token resolver.
  Handles system prompt extraction, tool_use/tool_result blocks,
  thinking/reasoning, orphaned tool_use cleanup, cache_control.
- tests/test_anthropic_adapter.py — 36 tests covering all adapter logic

### Modified files
- pyproject.toml — Add anthropic>=0.39.0 dependency
- hermes_cli/auth.py — Add 'anthropic' to PROVIDER_REGISTRY with
  three env vars, plus 'claude'/'claude-code' aliases
- hermes_cli/models.py — Add model catalog, labels, aliases, provider order
- hermes_cli/main.py — Add 'anthropic' to --provider CLI choices
- hermes_cli/runtime_provider.py — Add Anthropic branch returning
  api_mode='anthropic_messages' (before generic api_key fallthrough)
- hermes_cli/setup.py — Add Anthropic setup wizard with Claude Code
  credential auto-discovery, model selection, OpenRouter tools prompt
- agent/auxiliary_client.py — Add claude-haiku-4-5 as aux model
- agent/model_metadata.py — Add bare Claude model context lengths
- run_agent.py — Add anthropic_messages api_mode:
  * Client init (Anthropic SDK instead of OpenAI)
  * API call dispatch (_anthropic_client.messages.create)
  * Response validation (content blocks)
  * finish_reason mapping (stop_reason -> finish_reason)
  * Token usage (input_tokens/output_tokens)
  * Response normalization (normalize_anthropic_response)
  * Client interrupt/rebuild
  * Prompt caching auto-enabled for native Anthropic
- tests/test_run_agent.py — Update test_anthropic_base_url_accepted to
  expect native routing, add test_prompt_caching_native_anthropic

											
										
										
											2026-03-12 15:47:45 -07:00
+								                elif self.api_mode == "anthropic_messages":
 								                    from agent.anthropic_adapter import normalize_anthropic_response
 								                    assistant_message, finish_reason = normalize_anthropic_response(response)
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                else:
 								                    assistant_message = response.choices[0].message
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												fix: handle non-string content from OpenAI-compatible servers (#759)

Some local LLM servers (llama-server, etc.) return message.content as
a dict or list instead of a plain string. This caused AttributeError
'dict object has no attribute strip' on every API call.

Normalizes content to string immediately after receiving the response:
- dict: extracts 'text' or 'content' field, falls back to json.dumps
- list: extracts text parts (OpenAI multimodal content format)
- other: str() conversion

Applied at the single point where response.choices[0].message is read
in the main agent loop, so all downstream .strip()/.startswith()/[:100]
operations work regardless of server implementation.

Closes #759

											
										
										
											2026-03-09 03:32:32 -07:00
+								                # Normalize content to string — some OpenAI-compatible servers
 								                # (llama-server, etc.) return content as a dict or list instead
 								                # of a plain string, which crashes downstream .strip() calls.
 								                if assistant_message.content is not None and not isinstance(assistant_message.content, str):
 								                    raw = assistant_message.content
 								                    if isinstance(raw, dict):
 								                        assistant_message.content = raw.get("text", "") or raw.get("content", "") or json.dumps(raw)
 								                    elif isinstance(raw, list):
 								                        # Multimodal content list — extract text parts
 								                        parts = []
 								                        for part in raw:
 								                            if isinstance(part, str):
 								                                parts.append(part)
 								                            elif isinstance(part, dict) and part.get("type") == "text":
 								                                parts.append(part.get("text", ""))
 								                            elif isinstance(part, dict) and "text" in part:
 								                                parts.append(str(part["text"]))
 								                        assistant_message.content = "\n".join(parts)
 								                    else:
 								                        assistant_message.content = str(raw)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                # Handle assistant response
-												Add a Claude Code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								                if assistant_message.content and not self.quiet_mode:
-												fix: verbose mode shows full untruncated output

* fix(cli): silence tirith prefetch install warnings at startup

* fix: verbose mode now shows full untruncated tool args, results, content, and think blocks

When tool progress is set to 'verbose' (via /verbose or config), the display
was still truncating tool arguments to 100 chars, tool results to 100-200 chars,
assistant content to 100 chars, and think blocks to 5 lines. This defeated the
purpose of verbose mode.

Changes:
- Tool args: show full JSON args (not truncated to log_prefix_chars)
- Tool results: show full result content in both display and debug logs
- Assistant content: show full content during tool-call loops
- Think blocks: show full reasoning text (not truncated to 5 lines/100 chars)
- Auto-enable reasoning display when verbose mode is active
- Fix initial agent creation to respect verbose config (was always quiet_mode=True)
- Updated verbose label to mention think blocks
											
										
										
											2026-03-15 20:03:37 -07:00
+								                    if self.verbose_logging:
 								                        self._vprint(f"{self.log_prefix}🤖 Assistant: {assistant_message.content}")
 								                    else:
 								                        self._vprint(f"{self.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}")
-												feat(gateway): expose subagent tool calls and thinking to user (fixes #169) (#186)

When subagents run via delegate_task, the user now sees real-time
progress instead of silence:

CLI: tree-view activity lines print above the delegation spinner
  🔀 Delegating: research quantum computing
     ├─ 💭 "I'll search for papers first..."
     ├─ 🔍 web_search  "quantum computing"
     ├─ 📖 read_file  "paper.pdf"
     └─ ⠹ working... (18.2s)

Gateway (Telegram/Discord): batched progress summaries sent every
5 tool calls to avoid message spam. Remaining tools flushed on
subagent completion.

Changes:
- agent/display.py: add KawaiiSpinner.print_above() to print
  status lines above an active spinner without disrupting animation.
  Uses captured stdout (self._out) so it works inside the child's
  redirect_stdout(devnull).

- tools/delegate_tool.py: add _build_child_progress_callback()
  that creates a per-child callback relaying tool calls and
  thinking events to the parent's spinner (CLI) or progress
  queue (gateway). Each child gets its own callback instance,
  so parallel subagents don't share state. Includes _flush()
  for gateway batch completion.

- run_agent.py: fire tool_progress_callback with '_thinking'
  event when the model produces text content. Guarded by
  _delegate_depth > 0 so only subagents fire this (prevents
  gateway spam from main agent). REASONING_SCRATCHPAD/think/
  reasoning XML tags are stripped before display.

Tests: 21 new tests covering print_above, callback builder,
thinking relay, SCRATCHPAD filtering, batching, flush, thread
isolation, delegate_depth guard, and prefix handling.
											
										
										
											2026-03-01 10:18:00 +03:00
 								                # Notify progress callback of model's thinking (used by subagent
 								                # delegation to relay the child's reasoning to the parent display).
 								                # Guard: only fire for subagents (_delegate_depth >= 1) to avoid
 								                # spamming gateway platforms with the main agent's every thought.
 								                if (assistant_message.content and self.tool_progress_callback
 								                        and getattr(self, '_delegate_depth', 0) > 0):
 								                    _think_text = assistant_message.content.strip()
 								                    # Strip reasoning XML tags that shouldn't leak to parent display
 								                    _think_text = re.sub(
 								                        r'</?(?:REASONING_SCRATCHPAD|think|reasoning)>', '', _think_text
 								                    ).strip()
 								                    first_line = _think_text.split('\n')[0][:80] if _think_text else ""
 								                    if first_line:
 								                        try:
 								                            self.tool_progress_callback("_thinking", first_line)
 								                        except Exception:
 								                            pass
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												Refactor BatchRunner and AIAgent for enhanced reasoning and tool management, improved tool definitions for fileops

- Updated `ALL_POSSIBLE_TOOLS` to auto-derive from `TOOL_TO_TOOLSET_MAP` for consistent schema.
- Introduced `_extract_reasoning_stats` function to track reasoning coverage in assistant turns.
- Enhanced `_process_batch_worker` to discard prompts with no reasoning and aggregate reasoning statistics.
- Updated documentation and comments for clarity on new features and changes.

											
										
										
											2026-02-08 20:19:14 +00:00
+								                # Check for incomplete <REASONING_SCRATCHPAD> (opened but never closed)
 								                # This means the model ran out of output tokens mid-reasoning — retry up to 2 times
-												refactor: streamline scratchpad handling in AIAgent

- Removed static methods for converting and checking <REASONING_SCRATCHPAD> tags, simplifying the codebase.
- Replaced calls to the removed methods with direct function calls for better clarity and maintainability.
- Updated trajectory saving logic to utilize a dedicated function for improved organization and readability.

											
										
										
											2026-02-23 09:55:09 -08:00
+								                if has_incomplete_scratchpad(assistant_message.content or ""):
-												Refactor BatchRunner and AIAgent for enhanced reasoning and tool management, improved tool definitions for fileops

- Updated `ALL_POSSIBLE_TOOLS` to auto-derive from `TOOL_TO_TOOLSET_MAP` for consistent schema.
- Introduced `_extract_reasoning_stats` function to track reasoning coverage in assistant turns.
- Enhanced `_process_batch_worker` to discard prompts with no reasoning and aggregate reasoning statistics.
- Updated documentation and comments for clarity on new features and changes.

											
										
										
											2026-02-08 20:19:14 +00:00
+								                    if not hasattr(self, '_incomplete_scratchpad_retries'):
 								                        self._incomplete_scratchpad_retries = 0
 								                    self._incomplete_scratchpad_retries += 1
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                    self._vprint(f"{self.log_prefix}⚠️  Incomplete <REASONING_SCRATCHPAD> detected (opened but never closed)")
-												Refactor BatchRunner and AIAgent for enhanced reasoning and tool management, improved tool definitions for fileops

- Updated `ALL_POSSIBLE_TOOLS` to auto-derive from `TOOL_TO_TOOLSET_MAP` for consistent schema.
- Introduced `_extract_reasoning_stats` function to track reasoning coverage in assistant turns.
- Enhanced `_process_batch_worker` to discard prompts with no reasoning and aggregate reasoning statistics.
- Updated documentation and comments for clarity on new features and changes.

											
										
										
											2026-02-08 20:19:14 +00:00
 								                    if self._incomplete_scratchpad_retries <= 2:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                        self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._incomplete_scratchpad_retries}/2)...")
-												Refactor BatchRunner and AIAgent for enhanced reasoning and tool management, improved tool definitions for fileops

- Updated `ALL_POSSIBLE_TOOLS` to auto-derive from `TOOL_TO_TOOLSET_MAP` for consistent schema.
- Introduced `_extract_reasoning_stats` function to track reasoning coverage in assistant turns.
- Enhanced `_process_batch_worker` to discard prompts with no reasoning and aggregate reasoning statistics.
- Updated documentation and comments for clarity on new features and changes.

											
										
										
											2026-02-08 20:19:14 +00:00
+								                        # Don't add the broken message, just retry
 								                        continue
 								                    else:
 								                        # Max retries - discard this turn and save as partial
-												fix: address voice mode PR review (streaming TTS, prompt cache, _vprint)

Bug A: Replace stale _HAS_ELEVENLABS/_HAS_AUDIO boolean imports with
lazy import function calls (_import_elevenlabs, _import_sounddevice).
The old constants no longer exist in tts_tool -- the try/except
silently swallowed the ImportError, leaving streaming TTS dead.

Bug B: Use user message prefix instead of modifying system prompt for
voice mode instruction. Changing ephemeral_system_prompt mid-session
invalidates the prompt cache. Now the concise-response hint is
prepended to the user_message passed to run_conversation while
conversation_history keeps the original text.

Minor: Add force parameter to _vprint so critical error messages
(max retries, non-retryable errors, API failures) are always shown
even during streaming TTS playback.

Tests: 15 new tests in test_voice_cli_integration.py covering all
three fixes -- lazy import activation, message prefix behavior,
history cleanliness, system prompt stability, and AST verification
that all critical _vprint calls use force=True.

											
										
										
											2026-03-10 03:43:03 +03:00
+								                        self._vprint(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.", force=True)
-												Refactor BatchRunner and AIAgent for enhanced reasoning and tool management, improved tool definitions for fileops

- Updated `ALL_POSSIBLE_TOOLS` to auto-derive from `TOOL_TO_TOOLSET_MAP` for consistent schema.
- Introduced `_extract_reasoning_stats` function to track reasoning coverage in assistant turns.
- Enhanced `_process_batch_worker` to discard prompts with no reasoning and aggregate reasoning statistics.
- Updated documentation and comments for clarity on new features and changes.

											
										
										
											2026-02-08 20:19:14 +00:00
+								                        self._incomplete_scratchpad_retries = 0
 								                        rolled_back_messages = self._get_messages_up_to_last_assistant(messages)
-												Cleanup time!

											
										
										
											2026-02-20 23:23:32 -08:00
+								                        self._cleanup_task_resources(effective_task_id)
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                        self._persist_session(messages, conversation_history)
-												Refactor BatchRunner and AIAgent for enhanced reasoning and tool management, improved tool definitions for fileops

- Updated `ALL_POSSIBLE_TOOLS` to auto-derive from `TOOL_TO_TOOLSET_MAP` for consistent schema.
- Introduced `_extract_reasoning_stats` function to track reasoning coverage in assistant turns.
- Enhanced `_process_batch_worker` to discard prompts with no reasoning and aggregate reasoning statistics.
- Updated documentation and comments for clarity on new features and changes.

											
										
										
											2026-02-08 20:19:14 +00:00
 								                        return {
 								                            "final_response": None,
 								                            "messages": rolled_back_messages,
 								                            "api_calls": api_call_count,
 								                            "completed": False,
 								                            "partial": True,
 								                            "error": "Incomplete REASONING_SCRATCHPAD after 2 retries"
 								                        }
 								                # Reset incomplete scratchpad counter on clean response
 								                if hasattr(self, '_incomplete_scratchpad_retries'):
 								                    self._incomplete_scratchpad_retries = 0
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
 								                if self.api_mode == "codex_responses" and finish_reason == "incomplete":
 								                    if not hasattr(self, "_codex_incomplete_retries"):
 								                        self._codex_incomplete_retries = 0
 								                    self._codex_incomplete_retries += 1
 								                    interim_msg = self._build_assistant_message(assistant_message, finish_reason)
-												fix: handle None message content across codebase (fixes #276)

The OpenAI API returns content: null on assistant messages with tool
calls. msg.get('content', '') returns None when the key exists with
value None, causing TypeError on len(), string concatenation, and
.strip() in downstream code paths.

Fixed 4 locations that process conversation messages:
- agent/auxiliary_client.py:84 — None passed to API calls
- cli.py:1288 — crash on content[:200] and len(content)
- run_agent.py:3444 — crash on None.strip()
- honcho_integration/session.py:445 — 'None' rendered in transcript

13 other instances were verified safe (already protected, only process
user/tool messages, or use the safe pattern).

Pattern: msg.get('content', '') → msg.get('content') or ''

Fixes #276

											
										
										
											2026-03-02 02:23:53 -08:00
+								                    interim_has_content = bool((interim_msg.get("content") or "").strip())
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                    interim_has_reasoning = bool(interim_msg.get("reasoning", "").strip()) if isinstance(interim_msg.get("reasoning"), str) else False
 								                    if interim_has_content or interim_has_reasoning:
 								                        last_msg = messages[-1] if messages else None
 								                        duplicate_interim = (
 								                            isinstance(last_msg, dict)
 								                            and last_msg.get("role") == "assistant"
 								                            and last_msg.get("finish_reason") == "incomplete"
 								                            and (last_msg.get("content") or "") == (interim_msg.get("content") or "")
 								                            and (last_msg.get("reasoning") or "") == (interim_msg.get("reasoning") or "")
 								                        )
 								                        if not duplicate_interim:
 								                            messages.append(interim_msg)
 								                    if self._codex_incomplete_retries < 3:
 								                        if not self.quiet_mode:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}↻ Codex response incomplete; continuing turn ({self._codex_incomplete_retries}/3)")
-												Add OpenAI Codex provider runtime and responses integration (without .agent/PLANS.md)

											
										
										
											2026-02-25 18:20:38 -08:00
+								                        self._session_messages = messages
 								                        self._save_session_log(messages)
 								                        continue
 								                    self._codex_incomplete_retries = 0
 								                    self._persist_session(messages, conversation_history)
 								                    return {
 								                        "final_response": None,
 								                        "messages": messages,
 								                        "api_calls": api_call_count,
 								                        "completed": False,
 								                        "partial": True,
 								                        "error": "Codex response remained incomplete after 3 continuation attempts",
 								                    }
 								                elif hasattr(self, "_codex_incomplete_retries"):
 								                    self._codex_incomplete_retries = 0
-												Refactor BatchRunner and AIAgent for enhanced reasoning and tool management, improved tool definitions for fileops

- Updated `ALL_POSSIBLE_TOOLS` to auto-derive from `TOOL_TO_TOOLSET_MAP` for consistent schema.
- Introduced `_extract_reasoning_stats` function to track reasoning coverage in assistant turns.
- Enhanced `_process_batch_worker` to discard prompts with no reasoning and aggregate reasoning statistics.
- Updated documentation and comments for clarity on new features and changes.

											
										
										
											2026-02-08 20:19:14 +00:00
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                # Check for tool calls
 								                if assistant_message.tool_calls:
-												Add a Claude Code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								                    if not self.quiet_mode:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                        self._vprint(f"{self.log_prefix}🔧 Processing {len(assistant_message.tool_calls)} tool call(s)...")
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								                    if self.verbose_logging:
 								                        for tc in assistant_message.tool_calls:
 								                            logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...")
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								                    # Validate tool call names - detect model hallucinations
-												fix: tool call repair — auto-lowercase, fuzzy match, helpful error on unknown tool (#520)

- Add _repair_tool_call(): tries lowercase, normalize, then fuzzy match (difflib 0.7)
- Replace 3-retry-then-abort with graceful error: model receives helpful message and self-corrects
- Conversation stays alive instead of dying on hallucinated tool names

Closes #520

											
										
										
											2026-03-08 18:39:43 +03:00
+								                    # Repair mismatched tool names before validating
 								                    for tc in assistant_message.tool_calls:
 								                        if tc.function.name not in self.valid_tool_names:
 								                            repaired = self._repair_tool_call(tc.function.name)
 								                            if repaired:
 								                                print(f"{self.log_prefix}🔧 Auto-repaired tool name: '{tc.function.name}' -> '{repaired}'")
 								                                tc.function.name = repaired
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								                    invalid_tool_calls = [
-												fix: tool call repair — auto-lowercase, fuzzy match, helpful error on unknown tool (#520)

- Add _repair_tool_call(): tries lowercase, normalize, then fuzzy match (difflib 0.7)
- Replace 3-retry-then-abort with graceful error: model receives helpful message and self-corrects
- Conversation stays alive instead of dying on hallucinated tool names

Closes #520

											
										
										
											2026-03-08 18:39:43 +03:00
+								                        tc.function.name for tc in assistant_message.tool_calls
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								                        if tc.function.name not in self.valid_tool_names
 								                    ]
 								                    if invalid_tool_calls:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                        # Track retries for invalid tool calls
 								                        if not hasattr(self, '_invalid_tool_retries'):
 								                            self._invalid_tool_retries = 0
 								                        self._invalid_tool_retries += 1
-												fix: tool call repair — auto-lowercase, fuzzy match, helpful error on unknown tool (#520)

- Add _repair_tool_call(): tries lowercase, normalize, then fuzzy match (difflib 0.7)
- Replace 3-retry-then-abort with graceful error: model receives helpful message and self-corrects
- Conversation stays alive instead of dying on hallucinated tool names

Closes #520

											
										
										
											2026-03-08 18:39:43 +03:00
+								                        # Return helpful error to model — model can self-correct next turn
 								                        available = ", ".join(sorted(self.valid_tool_names))
 								                        invalid_name = invalid_tool_calls[0]
 								                        invalid_preview = invalid_name[:80] + "..." if len(invalid_name) > 80 else invalid_name
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                        self._vprint(f"{self.log_prefix}⚠️  Unknown tool '{invalid_preview}' — sending error to model for self-correction ({self._invalid_tool_retries}/3)")
 								                        if self._invalid_tool_retries >= 3:
-												fix: address voice mode PR review (streaming TTS, prompt cache, _vprint)

Bug A: Replace stale _HAS_ELEVENLABS/_HAS_AUDIO boolean imports with
lazy import function calls (_import_elevenlabs, _import_sounddevice).
The old constants no longer exist in tts_tool -- the try/except
silently swallowed the ImportError, leaving streaming TTS dead.

Bug B: Use user message prefix instead of modifying system prompt for
voice mode instruction. Changing ephemeral_system_prompt mid-session
invalidates the prompt cache. Now the concise-response hint is
prepended to the user_message passed to run_conversation while
conversation_history keeps the original text.

Minor: Add force parameter to _vprint so critical error messages
(max retries, non-retryable errors, API failures) are always shown
even during streaming TTS playback.

Tests: 15 new tests in test_voice_cli_integration.py covering all
three fixes -- lazy import activation, message prefix behavior,
history cleanliness, system prompt stability, and AST verification
that all critical _vprint calls use force=True.

											
										
										
											2026-03-10 03:43:03 +03:00
+								                            self._vprint(f"{self.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.", force=True)
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._invalid_tool_retries = 0
 								                            self._persist_session(messages, conversation_history)
 								                            return {
 								                                "final_response": None,
 								                                "messages": messages,
 								                                "api_calls": api_call_count,
 								                                "completed": False,
 								                                "partial": True,
 								                                "error": f"Model generated invalid tool call: {invalid_preview}"
 								                            }
-												fix: tool call repair — auto-lowercase, fuzzy match, helpful error on unknown tool (#520)

- Add _repair_tool_call(): tries lowercase, normalize, then fuzzy match (difflib 0.7)
- Replace 3-retry-then-abort with graceful error: model receives helpful message and self-corrects
- Conversation stays alive instead of dying on hallucinated tool names

Closes #520

											
										
										
											2026-03-08 18:39:43 +03:00
+								                        assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
 								                        messages.append(assistant_msg)
 								                        for tc in assistant_message.tool_calls:
 								                            if tc.function.name not in self.valid_tool_names:
 								                                content = f"Tool '{tc.function.name}' does not exist. Available tools: {available}"
 								                            else:
 								                                content = f"Skipped: another tool call in this turn used an invalid name. Please retry this tool call."
 								                            messages.append({
 								                                "role": "tool",
 								                                "tool_call_id": tc.id,
 								                                "content": content,
 								                            })
 								                        continue
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								                    # Reset retry counter on successful tool call validation
 								                    if hasattr(self, '_invalid_tool_retries'):
 								                        self._invalid_tool_retries = 0
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                    # Validate tool call arguments are valid JSON
-												Update RL tools and enhance configuration management

- Modified `model_tools.py` to update default model IDs and add new RL function `rl_test_inference`.
- Enhanced `README.md` with installation instructions for submodules and updated API key usage.
- Improved `rl_cli.py` to load configuration from `~/.hermes/config.yaml` and set terminal working directory for RL tools.
- Updated `run_agent.py` to handle empty string arguments as empty objects for better JSON validation.
- Refined installation scripts to ensure submodules are cloned and installed correctly, enhancing setup experience.

											
										
										
											2026-02-04 13:57:59 -08:00
+								                    # Handle empty strings as empty objects (common model quirk)
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                    invalid_json_args = []
 								                    for tc in assistant_message.tool_calls:
-												Update RL tools and enhance configuration management

- Modified `model_tools.py` to update default model IDs and add new RL function `rl_test_inference`.
- Enhanced `README.md` with installation instructions for submodules and updated API key usage.
- Improved `rl_cli.py` to load configuration from `~/.hermes/config.yaml` and set terminal working directory for RL tools.
- Updated `run_agent.py` to handle empty string arguments as empty objects for better JSON validation.
- Refined installation scripts to ensure submodules are cloned and installed correctly, enhancing setup experience.

											
										
										
											2026-02-04 13:57:59 -08:00
+								                        args = tc.function.arguments
-												fix: handle dict tool call arguments from local backends

Normalize tool call arguments when OpenAI-compatible backends return parsed dict/list payloads instead of JSON strings. This prevents the .strip() crash during tool-call validation for llama.cpp and similar servers, while preserving existing empty-string and invalid-JSON handling. Adds a focused regression test for dict arguments in the agent loop.

											
										
										
											2026-03-15 08:00:19 -07:00
+								                        if isinstance(args, (dict, list)):
 								                            tc.function.arguments = json.dumps(args)
 								                            continue
 								                        if args is not None and not isinstance(args, str):
 								                            tc.function.arguments = str(args)
 								                            args = tc.function.arguments
-												Update RL tools and enhance configuration management

- Modified `model_tools.py` to update default model IDs and add new RL function `rl_test_inference`.
- Enhanced `README.md` with installation instructions for submodules and updated API key usage.
- Improved `rl_cli.py` to load configuration from `~/.hermes/config.yaml` and set terminal working directory for RL tools.
- Updated `run_agent.py` to handle empty string arguments as empty objects for better JSON validation.
- Refined installation scripts to ensure submodules are cloned and installed correctly, enhancing setup experience.

											
										
										
											2026-02-04 13:57:59 -08:00
+								                        # Treat empty/whitespace strings as empty object
 								                        if not args or not args.strip():
 								                            tc.function.arguments = "{}"
 								                            continue
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                        try:
-												Update RL tools and enhance configuration management

- Modified `model_tools.py` to update default model IDs and add new RL function `rl_test_inference`.
- Enhanced `README.md` with installation instructions for submodules and updated API key usage.
- Improved `rl_cli.py` to load configuration from `~/.hermes/config.yaml` and set terminal working directory for RL tools.
- Updated `run_agent.py` to handle empty string arguments as empty objects for better JSON validation.
- Refined installation scripts to ensure submodules are cloned and installed correctly, enhancing setup experience.

											
										
										
											2026-02-04 13:57:59 -08:00
+								                            json.loads(args)
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                        except json.JSONDecodeError as e:
 								                            invalid_json_args.append((tc.function.name, str(e)))
 								                    if invalid_json_args:
 								                        # Track retries for invalid JSON arguments
 								                        self._invalid_json_retries += 1
 								                        tool_name, error_msg = invalid_json_args[0]
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                        self._vprint(f"{self.log_prefix}⚠️  Invalid JSON in tool call arguments for '{tool_name}': {error_msg}")
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
 								                        if self._invalid_json_retries < 3:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._invalid_json_retries}/3)...")
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                            # Don't add anything to messages, just retry the API call
 								                            continue
 								                        else:
-												Enhance skills tool to have an arg so it is more reliably called, and error handling in agent

- Updated the `skills_categories` function to include a `verbose` parameter, allowing users to request skill counts per category.
- Modified the `handle_skills_function_call` method to pass the `verbose` argument to `skills_categories`.
- Improved error handling in the `AIAgent` class by injecting a recovery message when invalid JSON arguments are detected, guiding the model on how to correct its tool calls.
- Enhanced the `GatewayRunner` to return a user-friendly error message if the agent fails to generate a final response, improving overall user experience.

											
										
										
											2026-02-03 15:26:59 -08:00
+								                            # Instead of returning partial, inject a helpful message and let model recover
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}⚠️  Injecting recovery message for invalid JSON...")
-												Enhance skills tool to have an arg so it is more reliably called, and error handling in agent

- Updated the `skills_categories` function to include a `verbose` parameter, allowing users to request skill counts per category.
- Modified the `handle_skills_function_call` method to pass the `verbose` argument to `skills_categories`.
- Improved error handling in the `AIAgent` class by injecting a recovery message when invalid JSON arguments are detected, guiding the model on how to correct its tool calls.
- Enhanced the `GatewayRunner` to return a user-friendly error message if the agent fails to generate a final response, improving overall user experience.

											
										
										
											2026-02-03 15:26:59 -08:00
+								                            self._invalid_json_retries = 0  # Reset for next attempt
 								                            # Add a user message explaining the issue
 								                            recovery_msg = (
 								                                f"Your tool call to '{tool_name}' had invalid JSON arguments. "
 								                                f"Error: {error_msg}. "
 								                                f"For tools with no required parameters, use an empty object: {{}}. "
 								                                f"Please either retry the tool call with valid JSON, or respond without using that tool."
 								                            )
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                            recovery_dict = {"role": "user", "content": recovery_msg}
 								                            messages.append(recovery_dict)
-												Enhance skills tool to have an arg so it is more reliably called, and error handling in agent

- Updated the `skills_categories` function to include a `verbose` parameter, allowing users to request skill counts per category.
- Modified the `handle_skills_function_call` method to pass the `verbose` argument to `skills_categories`.
- Improved error handling in the `AIAgent` class by injecting a recovery message when invalid JSON arguments are detected, guiding the model on how to correct its tool calls.
- Enhanced the `GatewayRunner` to return a user-friendly error message if the agent fails to generate a final response, improving overall user experience.

											
										
										
											2026-02-03 15:26:59 -08:00
+								                            continue
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
 								                    # Reset retry counter on successful JSON validation
 								                    self._invalid_json_retries = 0
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                    assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
-												Implement reasoning extraction and enhance assistant message handling

- Added a new method `_extract_reasoning` to extract reasoning content from assistant messages, accommodating multiple formats from various providers.
- Updated message handling to ensure all assistant messages include reasoning content for API compatibility, preserving multi-turn reasoning context.
- Enhanced logging to capture reasoning details for debugging and analysis.
- Modified the TODO.md to reflect changes in planning and task management, emphasizing the need for structured task decomposition and progress tracking.

											
										
										
											2026-02-01 22:48:18 -08:00
-												feat: enhance AIAgent's tool usage nudges and content handling

- Introduced a method to strip <think> blocks from content, improving text visibility.
- Implemented counters to reset nudge intervals when memory and skill tools are used, enhancing user guidance.
- Captured content from turns with tool calls to provide fallback responses, ensuring continuity in conversation.
- Updated nudge logic to remind users about saving memories and creating skills based on interaction patterns.

											
										
										
											2026-02-22 21:33:28 -08:00
+								                    # If this turn has both content AND tool_calls, capture the content
 								                    # as a fallback final response. Common pattern: model delivers its
 								                    # answer and calls memory/skill tools as a side-effect in the same
 								                    # turn. If the follow-up turn after tools is empty, we use this.
 								                    turn_content = assistant_message.content or ""
 								                    if turn_content and self._has_content_after_think_block(turn_content):
 								                        self._last_content_with_tools = turn_content
 								                        # Show intermediate commentary so the user can follow along
 								                        if self.quiet_mode:
 								                            clean = self._strip_think_blocks(turn_content).strip()
 								                            if clean:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                                self._vprint(f"  ┊ 💬 {clean}")
-												feat: enhance AIAgent's tool usage nudges and content handling

- Introduced a method to strip <think> blocks from content, improving text visibility.
- Implemented counters to reset nudge intervals when memory and skill tools are used, enhancing user guidance.
- Captured content from turns with tool calls to provide fallback responses, ensuring continuity in conversation.
- Updated nudge logic to remind users about saving memories and creating skills based on interaction patterns.

											
										
										
											2026-02-22 21:33:28 -08:00
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                    messages.append(assistant_msg)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												fix: proactive compression after large tool results + Anthropic error detection

Two fixes for context overflow handling:

1. Proactive compression after tool execution: The compression check now
   estimates the next prompt size using real token counts from the last API
   response (prompt_tokens + completion_tokens) plus a conservative estimate
   of newly appended tool results (chars // 3 for JSON-heavy content).
   Previously, should_compress() only checked last_prompt_tokens which
   didn't account for tool results — so a 130k prompt + 100k chars of tool
   output would pass the 140k threshold check but fail the 200k API limit.

2. Safety net: Added 'prompt is too long' to context-length error detection
   phrases. Anthropic returns 'prompt is too long: N tokens > M maximum'
   on HTTP 400, which wasn't matched by existing phrases. This ensures
   compression fires even if the proactive check underestimates.

Fixes #813

											
										
										
											2026-03-11 08:04:52 -07:00
+								                    _msg_count_before_tools = len(messages)
-												feat: iteration budget pressure via tool result injection

Two-tier warning system that nudges the LLM as it approaches
max_iterations, injected into the last tool result JSON rather
than as a separate system message:

- Caution (70%): {"_budget_warning": "[BUDGET: 42/60...]"}
- Warning (90%): {"_budget_warning": "[BUDGET WARNING: 54/60...]"}

For JSON tool results, adds a _budget_warning field to the existing
dict. For plain text results, appends the warning as text.

Key properties:
- No system messages injected mid-conversation
- No changes to message structure
- Prompt cache stays valid
- Configurable thresholds (0.7 / 0.9)
- Can be disabled: _budget_pressure_enabled = False

Inspired by PR #421 (@Bartok9) and issue #414.
8 tests covering thresholds, edge cases, JSON and text injection.

											
										
										
											2026-03-09 05:19:42 -07:00
+								                    self._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count)
-												feat: shared iteration budget across parent + subagents

Subagent tool calls now count toward the same session-wide iteration
limit as the parent agent. Previously, each subagent had its own
independent counter, so a parent with max_iterations=60 could spawn
3 subagents each doing 50 calls = 150 total tool calls unmetered.

Changes:
- IterationBudget: thread-safe shared counter (run_agent.py)
  - consume(): try to use one iteration, returns False if exhausted
  - refund(): give back one iteration (for execute_code turns)
  - Thread-safe via Lock (subagents run in ThreadPoolExecutor)
- Parent creates the budget, children inherit it via delegate_tool.py
- execute_code turns are refunded (don't count against budget)
- Default raised from 60 → 90 to account for shared consumption
- Per-child cap (50) still applies as a safety valve

The per-child max_iterations (default 50) remains as a per-child
ceiling, but the shared budget is the hard session-wide limit.
A child stops at whichever comes first.

											
										
										
											2026-03-07 08:16:37 -08:00
 								                    # Refund the iteration if the ONLY tool(s) called were
 								                    # execute_code (programmatic tool calling).  These are
 								                    # cheap RPC-style calls that shouldn't eat the budget.
 								                    _tc_names = {tc.function.name for tc in assistant_message.tool_calls}
 								                    if _tc_names == {"execute_code"}:
 								                        self.iteration_budget.refund()
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												fix: proactive compression after large tool results + Anthropic error detection

Two fixes for context overflow handling:

1. Proactive compression after tool execution: The compression check now
   estimates the next prompt size using real token counts from the last API
   response (prompt_tokens + completion_tokens) plus a conservative estimate
   of newly appended tool results (chars // 3 for JSON-heavy content).
   Previously, should_compress() only checked last_prompt_tokens which
   didn't account for tool results — so a 130k prompt + 100k chars of tool
   output would pass the 140k threshold check but fail the 200k API limit.

2. Safety net: Added 'prompt is too long' to context-length error detection
   phrases. Anthropic returns 'prompt is too long: N tokens > M maximum'
   on HTTP 400, which wasn't matched by existing phrases. This ensures
   compression fires even if the proactive check underestimates.

Fixes #813

											
										
										
											2026-03-11 08:04:52 -07:00
+								                    # Estimate next prompt size using real token counts from the
 								                    # last API response + rough estimate of newly appended tool
 								                    # results.  This catches cases where tool results push the
 								                    # context past the limit that last_prompt_tokens alone misses
 								                    # (e.g. large file reads, web extractions).
 								                    _compressor = self.context_compressor
 								                    _new_tool_msgs = messages[_msg_count_before_tools:]
 								                    _new_chars = sum(len(str(m.get("content", "") or "")) for m in _new_tool_msgs)
 								                    _estimated_next_prompt = (
 								                        _compressor.last_prompt_tokens
 								                        + _compressor.last_completion_tokens
 								                        + _new_chars // 3  # conservative: JSON-heavy tool results ≈ 3 chars/token
 								                    )
 								                    if self.compression_enabled and _compressor.should_compress(_estimated_next_prompt):
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                        messages, active_system_prompt = self._compress_context(
 								                            messages, system_message,
-												fix: detect and warn on file re-read loops after context compression

When context compression summarizes conversation history, the agent
loses track of which files it already read and re-reads them in a loop.
Users report the agent reading the same files endlessly without writing.

Root cause: context compression is lossy — file contents and read history
are lost in the summary. After compression, the model thinks it hasn't
examined the files yet and reads them again.

Fix (two-part):
1. Track file reads per task in file_tools.py. When the same file region
   is read again, include a _warning in the response telling the model
   to stop re-reading and use existing information.
2. After context compression, inject a structured message listing all
   files already read in the session with explicit "do NOT re-read"
   instruction, preserving read history across compression boundaries.

Adds 16 tests covering warning detection, task isolation, summary
accuracy, tracker cleanup, and compression history injection.

											
										
										
											2026-03-08 20:44:42 +03:00
+								                            approx_tokens=self.context_compressor.last_prompt_tokens,
 								                            task_id=effective_task_id,
-												Add context compression feature for long conversations

- Implemented automatic context compression to manage long conversations that approach the model's context limit.
- Configured the feature to summarize middle turns while protecting the first three and last four turns, ensuring important context is retained.
- Added configuration options in `cli-config.yaml` and environment variables for enabling/disabling compression and setting thresholds.
- Updated documentation in `README.md`, `cli.md`, and `.env.example` to explain the context compression functionality and its configuration.
- Enhanced the `cli.py` to load compression settings into environment variables, ensuring seamless integration with the CLI.
- Completed the implementation of context compression as outlined in the TODO list, marking it as a significant enhancement to conversation management.

											
										
										
											2026-02-01 18:01:31 -08:00
+								                        )
-												Implement reasoning extraction and enhance assistant message handling

- Added a new method `_extract_reasoning` to extract reasoning content from assistant messages, accommodating multiple formats from various providers.
- Updated message handling to ensure all assistant messages include reasoning content for API compatibility, preserving multi-turn reasoning context.
- Enhanced logging to capture reasoning details for debugging and analysis.
- Modified the TODO.md to reflect changes in planning and task management, emphasizing the need for structured task decomposition and progress tracking.

											
										
										
											2026-02-01 22:48:18 -08:00
+								                    # Save session log incrementally (so progress is visible even if interrupted)
 								                    self._session_messages = messages
 								                    self._save_session_log(messages)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                    # Continue loop for next response
 								                    continue
 								                else:
 								                    # No tool calls - this is the final response
 								                    final_response = assistant_message.content or ""
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                    # Check if response only has think block with no actual content after it
 								                    if not self._has_content_after_think_block(final_response):
-												fix: improve handling of empty responses after tool calls

- Added fallback mechanism to utilize previous content when the model generates an empty response after tool calls, reducing unnecessary API retries.
- Enhanced logging to indicate when prior content is used as a final response.
- Updated logic to ensure that genuine empty responses are retried appropriately, maintaining user experience.

											
										
										
											2026-03-06 16:54:31 -08:00
+								                        # If the previous turn already delivered real content alongside
 								                        # tool calls (e.g. "You're welcome!" + memory save), the model
 								                        # has nothing more to say. Use the earlier content immediately
 								                        # instead of wasting API calls on retries that won't help.
 								                        fallback = getattr(self, '_last_content_with_tools', None)
 								                        if fallback:
 								                            logger.debug("Empty follow-up after tool calls — using prior turn content as final response")
 								                            self._last_content_with_tools = None
 								                            self._empty_content_retries = 0
 								                            for i in range(len(messages) - 1, -1, -1):
 								                                msg = messages[i]
 								                                if msg.get("role") == "assistant" and msg.get("tool_calls"):
 								                                    tool_names = []
 								                                    for tc in msg["tool_calls"]:
 								                                        fn = tc.get("function", {})
 								                                        tool_names.append(fn.get("name", "unknown"))
 								                                    msg["content"] = f"Calling the {', '.join(tool_names)} tool{'s' if len(tool_names) > 1 else ''}..."
 								                                    break
 								                            final_response = self._strip_think_blocks(fallback).strip()
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								                            self._response_was_previewed = True
-												fix: improve handling of empty responses after tool calls

- Added fallback mechanism to utilize previous content when the model generates an empty response after tool calls, reducing unnecessary API retries.
- Enhanced logging to indicate when prior content is used as a final response.
- Updated logic to ensure that genuine empty responses are retried appropriately, maintaining user experience.

											
										
										
											2026-03-06 16:54:31 -08:00
+								                            break
 								                        # No fallback available — this is a genuine empty response.
 								                        # Retry in case the model just had a bad generation.
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                        if not hasattr(self, '_empty_content_retries'):
 								                            self._empty_content_retries = 0
 								                        self._empty_content_retries += 1
-												feat: update database schema and enhance message persistence

- Incremented schema version to 2 and added a new column `finish_reason` to the `messages` table.
- Implemented a method to flush un-logged messages to the session database, ensuring data integrity during conversation interruptions.
- Enhanced error handling to persist messages in various early-return scenarios, preventing data loss.

											
										
										
											2026-02-21 00:05:39 -08:00
+								                        reasoning_text = self._extract_reasoning(assistant_message)
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                        self._vprint(f"{self.log_prefix}⚠️  Response only contains think block with no content after it")
-												feat: update database schema and enhance message persistence

- Incremented schema version to 2 and added a new column `finish_reason` to the `messages` table.
- Implemented a method to flush un-logged messages to the session database, ensuring data integrity during conversation interruptions.
- Enhanced error handling to persist messages in various early-return scenarios, preventing data loss.

											
										
										
											2026-02-21 00:05:39 -08:00
+								                        if reasoning_text:
 								                            reasoning_preview = reasoning_text[:500] + "..." if len(reasoning_text) > 500 else reasoning_text
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}   Reasoning: {reasoning_preview}")
-												feat: update database schema and enhance message persistence

- Incremented schema version to 2 and added a new column `finish_reason` to the `messages` table.
- Implemented a method to flush un-logged messages to the session database, ensuring data integrity during conversation interruptions.
- Enhanced error handling to persist messages in various early-return scenarios, preventing data loss.

											
										
										
											2026-02-21 00:05:39 -08:00
+								                        else:
 								                            content_preview = final_response[:80] + "..." if len(final_response) > 80 else final_response
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}   Content: '{content_preview}'")
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
 								                        if self._empty_content_retries < 3:
-												fix: suppress verbose logs during streaming TTS, improve hallucination filter, stop continuous mode on errors

- Add _vprint() helper to suppress log output when stream_callback is active
- Expand Whisper hallucination filter with multi-language phrases and regex pattern for repetitive text
- Stop continuous voice mode when agent returns a failed result (e.g. 429 rate limit)

											
										
										
											2026-03-06 01:51:10 +03:00
+								                            self._vprint(f"{self.log_prefix}🔄 Retrying API call ({self._empty_content_retries}/3)...")
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                            continue
 								                        else:
-												fix: address voice mode PR review (streaming TTS, prompt cache, _vprint)

Bug A: Replace stale _HAS_ELEVENLABS/_HAS_AUDIO boolean imports with
lazy import function calls (_import_elevenlabs, _import_sounddevice).
The old constants no longer exist in tts_tool -- the try/except
silently swallowed the ImportError, leaving streaming TTS dead.

Bug B: Use user message prefix instead of modifying system prompt for
voice mode instruction. Changing ephemeral_system_prompt mid-session
invalidates the prompt cache. Now the concise-response hint is
prepended to the user_message passed to run_conversation while
conversation_history keeps the original text.

Minor: Add force parameter to _vprint so critical error messages
(max retries, non-retryable errors, API failures) are always shown
even during streaming TTS playback.

Tests: 15 new tests in test_voice_cli_integration.py covering all
three fixes -- lazy import activation, message prefix behavior,
history cleanliness, system prompt stability, and AST verification
that all critical _vprint calls use force=True.

											
										
										
											2026-03-10 03:43:03 +03:00
+								                            self._vprint(f"{self.log_prefix}❌ Max retries (3) for empty content exceeded.", force=True)
-												feat: update database schema and enhance message persistence

- Incremented schema version to 2 and added a new column `finish_reason` to the `messages` table.
- Implemented a method to flush un-logged messages to the session database, ensuring data integrity during conversation interruptions.
- Enhanced error handling to persist messages in various early-return scenarios, preventing data loss.

											
										
										
											2026-02-21 00:05:39 -08:00
+								                            self._empty_content_retries = 0
-												feat: enhance AIAgent's tool usage nudges and content handling

- Introduced a method to strip <think> blocks from content, improving text visibility.
- Implemented counters to reset nudge intervals when memory and skill tools are used, enhancing user guidance.
- Captured content from turns with tool calls to provide fallback responses, ensuring continuity in conversation.
- Updated nudge logic to remind users about saving memories and creating skills based on interaction patterns.

											
										
										
											2026-02-22 21:33:28 -08:00
+								                            # If a prior tool_calls turn had real content, salvage it:
 								                            # rewrite that turn's content to a brief tool description,
 								                            # and use the original content as the final response here.
 								                            fallback = getattr(self, '_last_content_with_tools', None)
 								                            if fallback:
 								                                self._last_content_with_tools = None
 								                                # Find the last assistant message with tool_calls and rewrite it
 								                                for i in range(len(messages) - 1, -1, -1):
 								                                    msg = messages[i]
 								                                    if msg.get("role") == "assistant" and msg.get("tool_calls"):
 								                                        tool_names = []
 								                                        for tc in msg["tool_calls"]:
 								                                            fn = tc.get("function", {})
 								                                            tool_names.append(fn.get("name", "unknown"))
 								                                        msg["content"] = f"Calling the {', '.join(tool_names)} tool{'s' if len(tool_names) > 1 else ''}..."
 								                                        break
-												fix: strip <think> blocks from final response to users

Fixes #149

The _strip_think_blocks() method existed but was not applied to the
final_response in the normal completion path. This caused <think>...</think>
XML tags to leak into user-facing responses on all platforms (CLI, Telegram,
Discord, Slack, WhatsApp).

Changes:
- Strip think blocks from final_response before returning in normal path (line ~2600)
- Strip think blocks from fallback content when salvaging from prior tool_calls turn

Notes:
- The raw content with think blocks is preserved in messages[] for trajectory
  export - this only affects the user-facing final_response
- The _has_content_after_think_block() check still uses raw content before
  stripping, which is correct for detecting think-only responses

											
										
										
											2026-02-28 03:06:20 -05:00
+								                                # Strip <think> blocks from fallback content for user display
 								                                final_response = self._strip_think_blocks(fallback).strip()
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								                                self._response_was_previewed = True
-												feat: enhance AIAgent's tool usage nudges and content handling

- Introduced a method to strip <think> blocks from content, improving text visibility.
- Implemented counters to reset nudge intervals when memory and skill tools are used, enhancing user guidance.
- Captured content from turns with tool calls to provide fallback responses, ensuring continuity in conversation.
- Updated nudge logic to remind users about saving memories and creating skills based on interaction patterns.

											
										
										
											2026-02-22 21:33:28 -08:00
+								                                break
 								                            # No fallback -- append the empty message as-is
-												feat: update database schema and enhance message persistence

- Incremented schema version to 2 and added a new column `finish_reason` to the `messages` table.
- Implemented a method to flush un-logged messages to the session database, ensuring data integrity during conversation interruptions.
- Enhanced error handling to persist messages in various early-return scenarios, preventing data loss.

											
										
										
											2026-02-21 00:05:39 -08:00
+								                            empty_msg = {
 								                                "role": "assistant",
 								                                "content": final_response,
 								                                "reasoning": reasoning_text,
 								                                "finish_reason": finish_reason,
 								                            }
 								                            messages.append(empty_msg)
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
-												Cleanup time!

											
										
										
											2026-02-20 23:23:32 -08:00
+								                            self._cleanup_task_resources(effective_task_id)
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                            self._persist_session(messages, conversation_history)
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
 								                            return {
-												feat: update database schema and enhance message persistence

- Incremented schema version to 2 and added a new column `finish_reason` to the `messages` table.
- Implemented a method to flush un-logged messages to the session database, ensuring data integrity during conversation interruptions.
- Enhanced error handling to persist messages in various early-return scenarios, preventing data loss.

											
										
										
											2026-02-21 00:05:39 -08:00
+								                                "final_response": final_response or None,
 								                                "messages": messages,
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
+								                                "api_calls": api_call_count,
 								                                "completed": False,
 								                                "partial": True,
 								                                "error": "Model generated only think blocks with no actual response after 3 retries"
 								                            }
 								                    # Reset retry counter on successful content
 								                    if hasattr(self, '_empty_content_retries'):
 								                        self._empty_content_retries = 0
-												Fix Codex Responses continuation and schema parity

											
										
										
											2026-02-25 18:11:26 -08:00
 								                    if (
 								                        self.api_mode == "codex_responses"
 								                        and self.valid_tool_names
 								                        and codex_ack_continuations < 2
 								                        and self._looks_like_codex_intermediate_ack(
 								                            user_message=user_message,
 								                            assistant_content=final_response,
 								                            messages=messages,
 								                        )
 								                    ):
 								                        codex_ack_continuations += 1
 								                        interim_msg = self._build_assistant_message(assistant_message, "incomplete")
 								                        messages.append(interim_msg)
 								                        continue_msg = {
 								                            "role": "user",
 								                            "content": (
 								                                "[System: Continue now. Execute the required tool calls and only "
 								                                "send your final answer after completing the task.]"
 								                            ),
 								                        }
 								                        messages.append(continue_msg)
 								                        self._session_messages = messages
 								                        self._save_session_log(messages)
 								                        continue
 								                    codex_ack_continuations = 0
-												fix: continue after output-length truncation

											
										
										
											2026-03-07 18:45:17 -05:00
 								                    if truncated_response_prefix:
 								                        final_response = truncated_response_prefix + final_response
-												Implement enhanced response handling and tool call validation in run_agent

- Added methods to check for meaningful content after <think> blocks and to retrieve messages up to the last complete assistant turn.
- Introduced retry logic for handling truncated responses and invalid JSON arguments in tool calls, with a maximum retry limit.
- Improved logging for invalid JSON and empty responses, ensuring better error tracking and handling.
- Updated the batch data generation script to adjust dataset file, batch size, and ephemeral system prompt for improved context management.

											
										
										
											2026-01-10 13:04:43 +00:00
-												fix: strip <think> blocks from final response to users

Fixes #149

The _strip_think_blocks() method existed but was not applied to the
final_response in the normal completion path. This caused <think>...</think>
XML tags to leak into user-facing responses on all platforms (CLI, Telegram,
Discord, Slack, WhatsApp).

Changes:
- Strip think blocks from final_response before returning in normal path (line ~2600)
- Strip think blocks from fallback content when salvaging from prior tool_calls turn

Notes:
- The raw content with think blocks is preserved in messages[] for trajectory
  export - this only affects the user-facing final_response
- The _has_content_after_think_block() check still uses raw content before
  stripping, which is correct for detecting think-only responses

											
										
										
											2026-02-28 03:06:20 -05:00
+								                    # Strip <think> blocks from user-facing response (keep raw in messages for trajectory)
 								                    final_response = self._strip_think_blocks(final_response).strip()
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								                    final_msg = self._build_assistant_message(assistant_message, finish_reason)
-												Implement reasoning extraction and enhance assistant message handling

- Added a new method `_extract_reasoning` to extract reasoning content from assistant messages, accommodating multiple formats from various providers.
- Updated message handling to ensure all assistant messages include reasoning content for API compatibility, preserving multi-turn reasoning context.
- Enhanced logging to capture reasoning details for debugging and analysis.
- Modified the TODO.md to reflect changes in planning and task management, emphasizing the need for structured task decomposition and progress tracking.

											
										
										
											2026-02-01 22:48:18 -08:00
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								                    messages.append(final_msg)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
-												Add a claude code-like CLI

- Introduced `cli-config.yaml.example` to provide a template for configuring the CLI behavior, including model settings, terminal tool configurations, agent behavior, and toolsets.
- Created `cli.py` for an interactive terminal interface, allowing users to start the Hermes Agent with various options and toolsets.
- Added `hermes` launcher script for convenient CLI access.
- Updated `model_tools.py` to support quiet mode for suppressing output during tool initialization and execution.
- Enhanced logging in various tools to respect quiet mode, improving user experience by reducing unnecessary output.
- Added `prompt_toolkit` to `requirements.txt` for improved CLI interaction capabilities.
- Created `TODO.md` for future improvements and enhancements to the Hermes Agent framework.

											
										
										
											2026-01-31 06:30:48 +00:00
+								                    if not self.quiet_mode:
 								                        print(f"🎉 Conversation completed after {api_call_count} OpenAI-compatible API call(s)")
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                    break
 								            except Exception as e:
-												rate limits

											
										
										
											2025-11-17 18:35:36 -05:00
+								                error_msg = f"Error during OpenAI-compatible API call #{api_call_count}: {str(e)}"
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                print(f"❌ {error_msg}")
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								                if self.verbose_logging:
 								                    logging.exception("Detailed error information:")
-												Hermes Agent UX Improvements

											
										
										
											2026-02-22 02:16:11 -08:00
+								                # If an assistant message with tool_calls was already appended,
 								                # the API expects a role="tool" result for every tool_call_id.
 								                # Fill in error results for any that weren't answered yet.
 								                pending_handled = False
 								                for idx in range(len(messages) - 1, -1, -1):
 								                    msg = messages[idx]
 								                    if not isinstance(msg, dict):
 								                        break
 								                    if msg.get("role") == "tool":
 								                        continue
 								                    if msg.get("role") == "assistant" and msg.get("tool_calls"):
 								                        answered_ids = {
 								                            m["tool_call_id"]
 								                            for m in messages[idx + 1:]
 								                            if isinstance(m, dict) and m.get("role") == "tool"
 								                        }
 								                        for tc in msg["tool_calls"]:
 								                            if tc["id"] not in answered_ids:
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                                err_msg = {
-												Hermes Agent UX Improvements

											
										
										
											2026-02-22 02:16:11 -08:00
+								                                    "role": "tool",
 								                                    "tool_call_id": tc["id"],
 								                                    "content": f"Error executing tool: {error_msg}",
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                                }
 								                                messages.append(err_msg)
-												Hermes Agent UX Improvements

											
										
										
											2026-02-22 02:16:11 -08:00
+								                        pending_handled = True
 								                    break
 								                if not pending_handled:
 								                    # Error happened before tool processing (e.g. response parsing).
 								                    # Use a user-role message so the model can see what went wrong
 								                    # without confusing the API with a fabricated assistant turn.
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                    sys_err_msg = {
-												Hermes Agent UX Improvements

											
										
										
											2026-02-22 02:16:11 -08:00
+								                        "role": "user",
 								                        "content": f"[System error during processing: {error_msg}]",
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								                    }
 								                    messages.append(sys_err_msg)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
 								                # If we're near the limit, break to avoid infinite loops
 								                if api_call_count >= self.max_iterations - 1:
 								                    final_response = f"I apologize, but I encountered repeated errors: {error_msg}"
 								                    break
-												Critical: fix bug in the new subagent tool-call budget so that it is scoped to the tool-call loop rather than to the session

											
										
										
											2026-03-07 10:32:51 -08:00
+								        if final_response is None and (
 								            api_call_count >= self.max_iterations
 								            or self.iteration_budget.remaining <= 0
 								        ):
 								            if self.iteration_budget.remaining <= 0 and not self.quiet_mode:
 								                print(f"\n⚠️  Session iteration budget exhausted ({self.iteration_budget.used}/{self.iteration_budget.max_total} used, including subagents)")
-												refactor: enhance API interaction and message handling in AIAgent

- Introduced new methods in run_agent.py for building API keyword arguments and normalizing assistant messages from API responses.
- Added functionality for compressing conversation context and managing session state in SQLite.
- Improved tool call execution handling, including enhanced logging and error management.
- Updated path handling in multiple platform files to utilize pathlib for better compatibility and readability.

											
										
										
											2026-02-21 04:17:27 -08:00
+								            final_response = self._handle_max_iterations(messages, api_call_count)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
 								        # Determine if conversation completed successfully
 								        completed = final_response is not None and api_call_count < self.max_iterations
-												prevent leakage of morph instances between tasks

											
										
										
											2025-11-04 03:32:43 -05:00
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        # Save trajectory if enabled
 								        self._save_trajectory(messages, user_message, completed)
-												prevent leakage of morph instances between tasks

											
										
										
											2025-11-04 03:32:43 -05:00
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								        # Clean up VM and browser for this task after conversation completes
-												Cleanup time!

											
										
										
											2026-02-20 23:23:32 -08:00
+								        self._cleanup_task_resources(effective_task_id)
-												prevent leakage of morph instances between tasks

											
										
										
											2025-11-04 03:32:43 -05:00
-												Fix session saving to DB with full conversation history (not just user/assistant messages without tool calls)

											
										
										
											2026-02-22 17:10:24 -08:00
+								        # Persist session to both JSON log and SQLite
 								        self._persist_session(messages, conversation_history)
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
 								        # Sync conversation to Honcho for user modeling
 								        if final_response and not interrupted:
 								            self._honcho_sync(original_user_message, final_response)
-												fix(honcho): resolve review blockers for merge

Address merge-blocking review feedback by removing unsafe signal handler overrides, wiring next-turn Honcho prefetch, restoring per-directory session defaults, and exposing all Honcho tools to the model surface. Also harden prefetch cache access with public thread-safe accessors and remove duplicate browser cleanup code.

Made-with: Cursor

											
										
										
											2026-03-11 11:46:37 -04:00
+								            self._queue_honcho_prefetch(original_user_message)
-												feat: add Honcho AI-native memory integration

Opt-in persistent cross-session user modeling via Honcho. Reads
~/.honcho/config.json as single source of truth (shared with
Claude Code, Cursor, and other Honcho-enabled tools). Zero impact
when disabled or unconfigured.

- honcho_integration/ package (client, session manager, peer resolution)
- Host-based config resolution matching claude-honcho/cursor-honcho pattern
- Prefetch user context into system prompt per conversation turn
- Sync user/assistant messages to Honcho after each exchange
- query_user_context tool for mid-conversation dialectic reasoning
- Gated activation: requires ~/.honcho/config.json with enabled=true

											
										
										
											2026-02-25 19:34:25 -05:00
-												feat(cli): add /reasoning command for effort level and display toggle

Combined implementation of reasoning management:
- /reasoning              Show current effort level and display state
- /reasoning <level>      Set reasoning effort (none, low, medium, high, xhigh)
- /reasoning show|on      Show model thinking/reasoning in output
- /reasoning hide|off     Hide model thinking/reasoning from output

Effort level changes persist to config and force agent re-init.
Display toggle updates the agent callback dynamically without re-init.

When display is enabled:
- Intermediate reasoning shown as dim [thinking] lines during tool loops
- Final reasoning shown in a bordered box above the response
- Long reasoning collapsed (5 lines intermediate, 10 lines final)

Also adds:
- reasoning_callback parameter to AIAgent
- last_reasoning in run_conversation result dict
- show_reasoning config option (display section, default: false)
- Display section in /config output
- 34 tests covering both features

Combines functionality from PR #789 and PR #790.

Co-authored-by: Aum Desai <Aum08Desai@users.noreply.github.com>
Co-authored-by: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com>

											
										
										
											2026-03-11 05:53:21 -07:00
+								        # Extract reasoning from the last assistant message (if any)
 								        last_reasoning = None
 								        for msg in reversed(messages):
 								            if msg.get("role") == "assistant" and msg.get("reasoning"):
 								                last_reasoning = msg["reasoning"]
 								                break
-												Implement interrupt handling for agent and CLI input and persistent prompt line at bottom of CLI :)

- Enhanced the AIAgent class to support interrupt requests, allowing for graceful interruption of ongoing tasks and processing of new messages.
- Updated the HermesCLI to manage user input in a persistent manner, enabling real-time interruption of the agent's conversation.
- Introduced a mechanism in the GatewayRunner to handle incoming messages while an agent is running, allowing for immediate response to user commands.
- Improved overall user experience by providing feedback during interruptions and ensuring that pending messages are processed correctly.

											
										
										
											2026-02-03 16:15:49 -08:00
+								        # Build result with interrupt info if applicable
 								        result = {
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								            "final_response": final_response,
-												feat(cli): add /reasoning command for effort level and display toggle

Combined implementation of reasoning management:
- /reasoning              Show current effort level and display state
- /reasoning <level>      Set reasoning effort (none, low, medium, high, xhigh)
- /reasoning show|on      Show model thinking/reasoning in output
- /reasoning hide|off     Hide model thinking/reasoning from output

Effort level changes persist to config and force agent re-init.
Display toggle updates the agent callback dynamically without re-init.

When display is enabled:
- Intermediate reasoning shown as dim [thinking] lines during tool loops
- Final reasoning shown in a bordered box above the response
- Long reasoning collapsed (5 lines intermediate, 10 lines final)

Also adds:
- reasoning_callback parameter to AIAgent
- last_reasoning in run_conversation result dict
- show_reasoning config option (display section, default: false)
- Display section in /config output
- 34 tests covering both features

Combines functionality from PR #789 and PR #790.

Co-authored-by: Aum Desai <Aum08Desai@users.noreply.github.com>
Co-authored-by: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com>

											
										
										
											2026-03-11 05:53:21 -07:00
+								            "last_reasoning": last_reasoning,
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								            "messages": messages,
 								            "api_calls": api_call_count,
-												Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.

											
										
										
											2026-01-10 05:56:26 +00:00
+								            "completed": completed,
-												Implement interrupt handling for agent and CLI input and persistent prompt line at bottom of CLI :)

- Enhanced the AIAgent class to support interrupt requests, allowing for graceful interruption of ongoing tasks and processing of new messages.
- Updated the HermesCLI to manage user input in a persistent manner, enabling real-time interruption of the agent's conversation.
- Introduced a mechanism in the GatewayRunner to handle incoming messages while an agent is running, allowing for immediate response to user commands.
- Improved overall user experience by providing feedback during interruptions and ensuring that pending messages are processed correctly.

											
										
										
											2026-02-03 16:15:49 -08:00
+								            "partial": False,  # True only when stopped due to invalid tool calls
 								            "interrupted": interrupted,
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								            "response_previewed": getattr(self, "_response_was_previewed", False),
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        }
-												feat(honcho): async memory integration with prefetch pipeline and recallMode

Adds full Honcho memory integration to Hermes:

- Session manager with async background writes, memory modes (honcho/hybrid/local),
  and dialectic prefetch for first-turn context warming
- Agent integration: prefetch pipeline, tool surface gated by recallMode,
  system prompt context injection, SIGTERM/SIGINT flush handlers
- CLI commands: setup, status, mode, tokens, peer, identity, migrate
- recallMode setting (auto | context | tools) for A/B testing retrieval strategies
- Session strategies: per-session, per-repo (git tree root), per-directory, global
- Polymorphic memoryMode config: string shorthand or per-peer object overrides
- 97 tests covering async writes, client config, session resolution, and memory modes

											
										
										
											2026-03-09 15:58:22 -04:00
+								        self._response_was_previewed = False
-												Implement interrupt handling for agent and CLI input and persistent prompt line at bottom of CLI :)

- Enhanced the AIAgent class to support interrupt requests, allowing for graceful interruption of ongoing tasks and processing of new messages.
- Updated the HermesCLI to manage user input in a persistent manner, enabling real-time interruption of the agent's conversation.
- Introduced a mechanism in the GatewayRunner to handle incoming messages while an agent is running, allowing for immediate response to user commands.
- Improved overall user experience by providing feedback during interruptions and ensuring that pending messages are processed correctly.

											
										
										
											2026-02-03 16:15:49 -08:00
 								        # Include interrupt message if one triggered the interrupt
 								        if interrupted and self._interrupt_message:
 								            result["interrupt_message"] = self._interrupt_message
 								        # Clear interrupt state after handling
 								        self.clear_interrupt()
-												feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).

											
										
										
											2026-03-03 23:03:42 +03:00
 								        # Clear stream callback so it doesn't leak into future calls
 								        self._stream_callback = None
-												Implement interrupt handling for agent and CLI input and persistent prompt line at bottom of CLI :)

- Enhanced the AIAgent class to support interrupt requests, allowing for graceful interruption of ongoing tasks and processing of new messages.
- Updated the HermesCLI to manage user input in a persistent manner, enabling real-time interruption of the agent's conversation.
- Introduced a mechanism in the GatewayRunner to handle incoming messages while an agent is running, allowing for immediate response to user commands.
- Improved overall user experience by providing feedback during interruptions and ensuring that pending messages are processed correctly.

											
										
										
											2026-02-03 16:15:49 -08:00
+								        return result
-												feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).

											
										
										
											2026-03-03 23:03:42 +03:00
 								    def chat(self, message: str, stream_callback: Optional[callable] = None) -> str:
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        """
 								        Simple chat interface that returns just the final response.
-												feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).

											
										
										
											2026-03-03 23:03:42 +03:00
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        Args:
 								            message (str): User message
-												feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).

											
										
										
											2026-03-03 23:03:42 +03:00
+								            stream_callback: Optional callback invoked with each text delta during streaming.
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        Returns:
 								            str: Final assistant response
 								        """
-												feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).

											
										
										
											2026-03-03 23:03:42 +03:00
+								        result = self.run_conversation(message, stream_callback=stream_callback)
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        return result["final_response"]
 								def main(
 								    query: str = None,
-												Enhance BatchRunner and AIAgent with new configuration options, default model now opus 4.6, default summarizer gemini flash 3

- Added `max_tokens`, `reasoning_config`, and `prefill_messages` parameters to `BatchRunner` and `AIAgent` for improved model response control.
- Updated CLI to support new options for reasoning effort and prefill messages from a JSON file.
- Modified example configuration files to reflect changes in default model and summary model.
- Improved error handling for loading prefill messages and reasoning configurations in the CLI.
- Updated documentation to include new parameters and usage examples.

											
										
										
											2026-02-08 10:49:24 +00:00
+								    model: str = "anthropic/claude-opus-4.6",
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								    api_key: str = None,
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								    base_url: str = "https://openrouter.ai/api/v1",
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								    max_turns: int = 10,
 								    enabled_toolsets: str = None,
 								    disabled_toolsets: str = None,
 								    list_tools: bool = False,
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								    save_trajectories: bool = False,
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								    save_sample: bool = False,
-												add logging of prefix of tool call and tool response

											
										
										
											2025-11-07 14:43:44 -05:00
+								    verbose: bool = False,
 								    log_prefix_chars: int = 20
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								):
 								    """
 								    Main function for running the agent directly.
-												add logging of prefix of tool call and tool response

											
										
										
											2025-11-07 14:43:44 -05:00
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								    Args:
 								        query (str): Natural language query for the agent. Defaults to Python 3.13 example.
-												fix: update model version in agent configurations

Updated the default model version from "anthropic/claude-sonnet-4-20250514" to "anthropic/claude-sonnet-4.6" across multiple files including AGENTS.md, batch_runner.py, mini_swe_runner.py, and run_agent.py for consistency and to reflect the latest model improvements.

											
										
										
											2026-03-07 09:06:37 -08:00
+								        model (str): Model name to use (OpenRouter format: provider/model). Defaults to anthropic/claude-opus-4.6.
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								        api_key (str): API key for authentication. Uses OPENROUTER_API_KEY env var if not provided.
 								        base_url (str): Base URL for the model API. Defaults to https://openrouter.ai/api/v1
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        max_turns (int): Maximum number of API call iterations. Defaults to 10.
-												add logging of prefix of tool call and tool response

											
										
										
											2025-11-07 14:43:44 -05:00
+								        enabled_toolsets (str): Comma-separated list of toolsets to enable. Supports predefined
 								                              toolsets (e.g., "research", "development", "safe").
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								                              Multiple toolsets can be combined: "web,vision"
 								        disabled_toolsets (str): Comma-separated list of toolsets to disable (e.g., "terminal")
 								        list_tools (bool): Just list available tools and exit
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								        save_trajectories (bool): Save conversation trajectories to JSONL files (appends to trajectory_samples.jsonl). Defaults to False.
 								        save_sample (bool): Save a single trajectory sample to a UUID-named JSONL file for inspection. Defaults to False.
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								        verbose (bool): Enable verbose logging for debugging. Defaults to False.
-												add logging of prefix of tool call and tool response

											
										
										
											2025-11-07 14:43:44 -05:00
+								        log_prefix_chars (int): Number of characters to show in log previews for tool calls/responses. Defaults to 20.
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								    Toolset Examples:
 								        - "research": Web search, extract, crawl + vision tools
 								    """
 								    print("🤖 AI Agent with Tool Calling")
 								    print("=" * 50)
 								    # Handle tool listing
 								    if list_tools:
 								        from model_tools import get_all_tool_names, get_toolset_for_tool, get_available_toolsets
 								        from toolsets import get_all_toolsets, get_toolset_info
 								        print("📋 Available Tools & Toolsets:")
 								        print("-" * 50)
 								        # Show new toolsets system
 								        print("\n🎯 Predefined Toolsets (New System):")
 								        print("-" * 40)
 								        all_toolsets = get_all_toolsets()
 								        # Group by category
 								        basic_toolsets = []
 								        composite_toolsets = []
 								        scenario_toolsets = []
 								        for name, toolset in all_toolsets.items():
 								            info = get_toolset_info(name)
 								            if info:
 								                entry = (name, info)
 								                if name in ["web", "terminal", "vision", "creative", "reasoning"]:
 								                    basic_toolsets.append(entry)
 								                elif name in ["research", "development", "analysis", "content_creation", "full_stack"]:
 								                    composite_toolsets.append(entry)
 								                else:
 								                    scenario_toolsets.append(entry)
 								        # Print basic toolsets
 								        print("\n📌 Basic Toolsets:")
 								        for name, info in basic_toolsets:
 								            tools_str = ', '.join(info['resolved_tools']) if info['resolved_tools'] else 'none'
 								            print(f"  • {name:15} - {info['description']}")
 								            print(f"    Tools: {tools_str}")
 								        # Print composite toolsets
 								        print("\n📂 Composite Toolsets (built from other toolsets):")
 								        for name, info in composite_toolsets:
 								            includes_str = ', '.join(info['includes']) if info['includes'] else 'none'
 								            print(f"  • {name:15} - {info['description']}")
 								            print(f"    Includes: {includes_str}")
 								            print(f"    Total tools: {info['tool_count']}")
 								        # Print scenario-specific toolsets
 								        print("\n🎭 Scenario-Specific Toolsets:")
 								        for name, info in scenario_toolsets:
 								            print(f"  • {name:20} - {info['description']}")
 								            print(f"    Total tools: {info['tool_count']}")
 								        # Show legacy toolset compatibility
 								        print("\n📦 Legacy Toolsets (for backward compatibility):")
 								        legacy_toolsets = get_available_toolsets()
 								        for name, info in legacy_toolsets.items():
 								            status = "✅" if info["available"] else "❌"
 								            print(f"  {status} {name}: {info['description']}")
 								            if not info["available"]:
 								                print(f"    Requirements: {', '.join(info['requirements'])}")
 								        # Show individual tools
 								        all_tools = get_all_tool_names()
 								        print(f"\n🔧 Individual Tools ({len(all_tools)} available):")
 								        for tool_name in sorted(all_tools):
 								            toolset = get_toolset_for_tool(tool_name)
 								            print(f"  📌 {tool_name} (from {toolset})")
 								        print(f"\n💡 Usage Examples:")
 								        print(f"  # Use predefined toolsets")
 								        print(f"  python run_agent.py --enabled_toolsets=research --query='search for Python news'")
 								        print(f"  python run_agent.py --enabled_toolsets=development --query='debug this code'")
 								        print(f"  python run_agent.py --enabled_toolsets=safe --query='analyze without terminal'")
 								        print(f"  ")
 								        print(f"  # Combine multiple toolsets")
 								        print(f"  python run_agent.py --enabled_toolsets=web,vision --query='analyze website'")
 								        print(f"  ")
 								        print(f"  # Disable toolsets")
 								        print(f"  python run_agent.py --disabled_toolsets=terminal --query='no command execution'")
 								        print(f"  ")
 								        print(f"  # Run with trajectory saving enabled")
 								        print(f"  python run_agent.py --save_trajectories --query='your question here'")
 								        return
 								    # Parse toolset selection arguments
 								    enabled_toolsets_list = None
 								    disabled_toolsets_list = None
 								    if enabled_toolsets:
 								        enabled_toolsets_list = [t.strip() for t in enabled_toolsets.split(",")]
 								        print(f"🎯 Enabled toolsets: {enabled_toolsets_list}")
 								    if disabled_toolsets:
 								        disabled_toolsets_list = [t.strip() for t in disabled_toolsets.split(",")]
 								        print(f"🚫 Disabled toolsets: {disabled_toolsets_list}")
 								    if save_trajectories:
 								        print(f"💾 Trajectory saving: ENABLED")
 								        print(f"   - Successful conversations → trajectory_samples.jsonl")
 								        print(f"   - Failed conversations → failed_trajectories.jsonl")
 								    # Initialize agent with provided parameters
 								    try:
 								        agent = AIAgent(
 								            base_url=base_url,
 								            model=model,
 								            api_key=api_key,
 								            max_iterations=max_turns,
 								            enabled_toolsets=enabled_toolsets_list,
 								            disabled_toolsets=disabled_toolsets_list,
-												add better logging when requests fail

											
										
										
											2025-09-10 00:51:41 -07:00
+								            save_trajectories=save_trajectories,
-												add logging of prefix of tool call and tool response

											
										
										
											2025-11-07 14:43:44 -05:00
+								            verbose_logging=verbose,
 								            log_prefix_chars=log_prefix_chars
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								        )
 								    except RuntimeError as e:
 								        print(f"❌ Failed to initialize agent: {e}")
 								        return
 								    # Use provided query or default to Python 3.13 example
 								    if query is None:
 								        user_query = (
 								            "Tell me about the latest developments in Python 3.13 and what new features "
 								            "developers should know about. Please search for current information and try it out."
 								        )
 								    else:
 								        user_query = query
 								    print(f"\n📝 User Query: {user_query}")
 								    print("\n" + "=" * 50)
 								    # Run conversation
 								    result = agent.run_conversation(user_query)
 								    print("\n" + "=" * 50)
 								    print("📋 CONVERSATION SUMMARY")
 								    print("=" * 50)
 								    print(f"✅ Completed: {result['completed']}")
 								    print(f"📞 API Calls: {result['api_calls']}")
 								    print(f"💬 Messages: {len(result['messages'])}")
 								    if result['final_response']:
 								        print(f"\n🎯 FINAL RESPONSE:")
 								        print("-" * 30)
 								        print(result['final_response'])
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								    # Save sample trajectory to UUID-named file if requested
 								    if save_sample:
 								        sample_id = str(uuid.uuid4())[:8]
-												Add timeout configuration for trajectory processing

- Updated `trajectory_compression.yaml` to include a new `per_trajectory_timeout` setting, allowing for a timeout of 300 seconds per trajectory. This enhancement helps prevent hanging on problematic entries during processing, improving overall reliability and efficiency in trajectory handling.

											
										
										
											2026-01-30 07:34:58 +00:00
+								        sample_filename = f"sample_{sample_id}.json"
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
 								        # Convert messages to trajectory format (same as batch_runner)
 								        trajectory = agent._convert_to_trajectory_format(
 								            result['messages'],
 								            user_query,
 								            result['completed']
 								        )
 								        entry = {
 								            "conversations": trajectory,
 								            "timestamp": datetime.now().isoformat(),
 								            "model": model,
 								            "completed": result['completed'],
 								            "query": user_query
 								        }
 								        try:
 								            with open(sample_filename, "w", encoding="utf-8") as f:
-												Add timeout configuration for trajectory processing

- Updated `trajectory_compression.yaml` to include a new `per_trajectory_timeout` setting, allowing for a timeout of 300 seconds per trajectory. This enhancement helps prevent hanging on problematic entries during processing, improving overall reliability and efficiency in trajectory handling.

											
										
										
											2026-01-30 07:34:58 +00:00
+								                # Pretty-print JSON with indent for readability
 								                f.write(json.dumps(entry, ensure_ascii=False, indent=2))
-												Add browser automation tools and enhance environment configuration

- Introduced new browser automation tools in `browser_tool.py` for navigating, interacting with, and extracting content from web pages using the agent-browser CLI and Browserbase cloud execution.
- Updated `.env.example` to include new configuration options for Browserbase API keys and session settings.
- Enhanced `model_tools.py` and `toolsets.py` to integrate browser tools into the existing tool framework, ensuring consistent access across toolsets.
- Updated `README.md` with setup instructions for browser tools and their usage examples.
- Added new test script `test_modal_terminal.py` to validate Modal terminal backend functionality.
- Improved `run_agent.py` to support browser tool integration and logging enhancements for better tracking of API responses.

											
										
										
											2026-01-29 06:10:24 +00:00
+								            print(f"\n💾 Sample trajectory saved to: {sample_filename}")
 								        except Exception as e:
 								            print(f"\n⚠️ Failed to save sample: {e}")
-												Update to use toolsets and make them easy to create and configure

											
										
										
											2025-09-10 00:43:55 -07:00
+								    print("\n👋 Agent execution completed!")
 								# Script entry point: when this file is executed directly (not imported),
 								# hand `main` to python-fire so its parameters can be supplied as
 								# command-line flags (e.g. --query=..., --enabled_toolsets=...).
 								if __name__ == "__main__":
 								    fire.Fire(main)