feat(logging): implement persistent error logging for tool failures

- Introduce a separate error log for capturing warnings and errors related to tool execution, ensuring detailed inspection of issues post-failure.
- Enhance error handling in the AIAgent class to log exceptions with stack traces for better debugging.
- Add a similar error logging mechanism in the gateway to streamline debugging processes.
This commit is contained in:
teknium1
2026-02-28 22:49:58 -08:00
parent a7c2b9e280
commit 23d0b7af6a
2 changed files with 34 additions and 3 deletions

View File

@@ -2032,6 +2032,16 @@ async def start_gateway(config: Optional[GatewayConfig] = None) -> bool:
logging.getLogger().addHandler(file_handler)
logging.getLogger().setLevel(logging.INFO)
# Separate errors-only log for easy debugging
error_handler = RotatingFileHandler(
log_dir / 'errors.log',
maxBytes=2 * 1024 * 1024,
backupCount=2,
)
error_handler.setLevel(logging.WARNING)
error_handler.setFormatter(RedactingFormatter('%(asctime)s %(levelname)s %(name)s: %(message)s'))
logging.getLogger().addHandler(error_handler)
runner = GatewayRunner(config)
# Set up signal handlers

View File

@@ -89,6 +89,7 @@ from agent.prompt_builder import build_skills_system_prompt, build_context_files
from agent.display import (
KawaiiSpinner, build_tool_preview as _build_tool_preview,
get_cute_tool_message as _get_cute_tool_message_impl,
_detect_tool_failure,
)
from agent.trajectory import (
convert_scratchpad_to_think, has_incomplete_scratchpad,
@@ -247,8 +248,22 @@ class AIAgent:
self._use_prompt_caching = is_openrouter and is_claude
self._cache_ttl = "5m" # Default 5-minute TTL (1.25x write cost)
# Configure logging with secret redaction
# Persistent error log -- always writes WARNING+ to ~/.hermes/logs/errors.log
# so tool failures, API errors, etc. are inspectable after the fact.
from agent.redact import RedactingFormatter
_error_log_dir = Path.home() / ".hermes" / "logs"
_error_log_dir.mkdir(parents=True, exist_ok=True)
_error_log_path = _error_log_dir / "errors.log"
from logging.handlers import RotatingFileHandler
_error_file_handler = RotatingFileHandler(
_error_log_path, maxBytes=2 * 1024 * 1024, backupCount=2,
)
_error_file_handler.setLevel(logging.WARNING)
_error_file_handler.setFormatter(RedactingFormatter(
'%(asctime)s %(levelname)s %(name)s: %(message)s',
))
logging.getLogger().addHandler(_error_file_handler)
if self.verbose_logging:
logging.basicConfig(
level=logging.DEBUG,
@@ -2499,7 +2514,7 @@ class AIAgent:
_spinner_result = function_result
except Exception as tool_error:
function_result = f"Error executing tool '{function_name}': {tool_error}"
logger.error("handle_function_call raised for %s: %s", function_name, tool_error)
logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True)
finally:
tool_duration = time.time() - tool_start_time
cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_spinner_result)
@@ -2509,11 +2524,17 @@ class AIAgent:
function_result = handle_function_call(function_name, function_args, effective_task_id)
except Exception as tool_error:
function_result = f"Error executing tool '{function_name}': {tool_error}"
logger.error("handle_function_call raised for %s: %s", function_name, tool_error)
logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True)
tool_duration = time.time() - tool_start_time
result_preview = function_result[:200] if len(function_result) > 200 else function_result
# Log tool errors to the persistent error log so [error] tags
# in the UI always have a corresponding detailed entry on disk.
_is_error_result, _ = _detect_tool_failure(function_name, function_result)
if _is_error_result:
logger.warning("Tool %s returned error (%.2fs): %s", function_name, tool_duration, result_preview)
if self.verbose_logging:
logging.debug(f"Tool {function_name} completed in {tool_duration:.2f}s")
logging.debug(f"Tool result preview: {result_preview}...")