Merge branch 'main' into codex/align-codex-provider-conventions-mainrepo

This commit is contained in:
Teknium
2026-02-28 18:13:38 -08:00
committed by GitHub
96 changed files with 10884 additions and 447 deletions

View File

@@ -128,6 +128,7 @@ class AIAgent:
session_id: str = None,
tool_progress_callback: callable = None,
clarify_callback: callable = None,
step_callback: callable = None,
max_tokens: int = None,
reasoning_config: Dict[str, Any] = None,
prefill_messages: List[Dict[str, Any]] = None,
@@ -135,6 +136,7 @@ class AIAgent:
skip_context_files: bool = False,
skip_memory: bool = False,
session_db=None,
honcho_session_key: str = None,
):
"""
Initialize the AI Agent.
@@ -174,6 +176,8 @@ class AIAgent:
skip_context_files (bool): If True, skip auto-injection of SOUL.md, AGENTS.md, and .cursorrules
into the system prompt. Use this for batch processing and data generation to avoid
polluting trajectories with user-specific persona or project instructions.
honcho_session_key (str): Session key for Honcho integration (e.g., "telegram:123456" or CLI session_id).
When provided and Honcho is enabled in config, enables persistent cross-session user modeling.
"""
self.model = model
self.max_iterations = max_iterations
@@ -200,8 +204,16 @@ class AIAgent:
self.provider = "openai-codex"
else:
self.api_mode = "chat_completions"
if base_url and "api.anthropic.com" in base_url.strip().lower():
raise ValueError(
"Anthropic's native /v1/messages API is not supported yet (planned for a future release). "
"Hermes currently requires OpenAI-compatible /chat/completions endpoints. "
"To use Claude models now, route through OpenRouter (OPENROUTER_API_KEY) "
"or any OpenAI-compatible proxy that wraps the Anthropic API."
)
self.tool_progress_callback = tool_progress_callback
self.clarify_callback = clarify_callback
self.step_callback = step_callback
self._last_reported_tool = None # Track for "new tool" mode
# Interrupt mechanism for breaking out of tool loops
@@ -304,7 +316,7 @@ class AIAgent:
client_kwargs["default_headers"] = {
"HTTP-Referer": "https://github.com/NousResearch/hermes-agent",
"X-OpenRouter-Title": "Hermes Agent",
"X-OpenRouter-Categories": "cli-agent",
"X-OpenRouter-Categories": "productivity,cli-agent",
}
self._client_kwargs = client_kwargs # stored for rebuilding after interrupt
@@ -435,6 +447,46 @@ class AIAgent:
except Exception:
pass # Memory is optional -- don't break agent init
# Honcho AI-native memory (cross-session user modeling)
# Reads ~/.honcho/config.json as the single source of truth.
self._honcho = None # HonchoSessionManager | None
self._honcho_session_key = honcho_session_key
if not skip_memory:
try:
from honcho_integration.client import HonchoClientConfig, get_honcho_client
hcfg = HonchoClientConfig.from_global_config()
if hcfg.enabled and hcfg.api_key:
from honcho_integration.session import HonchoSessionManager
client = get_honcho_client(hcfg)
self._honcho = HonchoSessionManager(
honcho=client,
config=hcfg,
context_tokens=hcfg.context_tokens,
)
# Resolve session key: explicit arg > global sessions map > fallback
if not self._honcho_session_key:
self._honcho_session_key = (
hcfg.resolve_session_name()
or "hermes-default"
)
# Ensure session exists in Honcho
self._honcho.get_or_create(self._honcho_session_key)
# Inject session context into the honcho tool module
from tools.honcho_tools import set_session_context
set_session_context(self._honcho, self._honcho_session_key)
logger.info(
"Honcho active (session: %s, user: %s, workspace: %s)",
self._honcho_session_key, hcfg.peer_name, hcfg.workspace_id,
)
else:
if not hcfg.enabled:
logger.debug("Honcho disabled in global config")
elif not hcfg.api_key:
logger.debug("Honcho enabled but no API key configured")
except Exception as e:
logger.debug("Honcho init failed (non-fatal): %s", e)
self._honcho = None
# Skills config: nudge interval for skill creation reminders
self._skill_nudge_interval = 15
try:
@@ -446,9 +498,10 @@ class AIAgent:
# Initialize context compressor for automatic context management
# Compresses conversation when approaching model's context limit
# Configuration via environment variables (can be set in .env or cli-config.yaml)
# Configuration via config.yaml (compression section) or environment variables
compression_threshold = float(os.getenv("CONTEXT_COMPRESSION_THRESHOLD", "0.85"))
compression_enabled = os.getenv("CONTEXT_COMPRESSION_ENABLED", "true").lower() in ("true", "1", "yes")
compression_summary_model = os.getenv("CONTEXT_COMPRESSION_MODEL") or None
self.context_compressor = ContextCompressor(
model=self.model,
@@ -456,6 +509,7 @@ class AIAgent:
protect_first_n=3,
protect_last_n=4,
summary_target_tokens=500,
summary_model_override=compression_summary_model,
quiet_mode=self.quiet_mode,
)
self.compression_enabled = compression_enabled
@@ -467,6 +521,21 @@ class AIAgent:
else:
print(f"📊 Context limit: {self.context_compressor.context_length:,} tokens (auto-compression disabled)")
def _max_tokens_param(self, value: int) -> dict:
"""Return the correct max tokens kwarg for the current provider.
OpenAI's newer models (gpt-4o, o-series, gpt-5+) require
'max_completion_tokens'. OpenRouter, local models, and older
OpenAI models use 'max_tokens'.
"""
_is_direct_openai = (
"api.openai.com" in self.base_url.lower()
and "openrouter" not in self.base_url.lower()
)
if _is_direct_openai:
return {"max_completion_tokens": value}
return {"max_tokens": value}
def _has_content_after_think_block(self, content: str) -> bool:
"""
Check if content has actual text after any <think></think> blocks.
@@ -669,7 +738,7 @@ class AIAgent:
if not self._session_db:
return
try:
start_idx = (len(conversation_history) if conversation_history else 0) + 1
start_idx = len(conversation_history) if conversation_history else 0
for msg in messages[start_idx:]:
role = msg.get("role", "unknown")
content = msg.get("content")
@@ -1016,8 +1085,6 @@ class AIAgent:
if not content:
return content
content = convert_scratchpad_to_think(content)
# Strip extra newlines before/after think blocks
import re
content = re.sub(r'\n+(<think>)', r'\n\1', content)
content = re.sub(r'(</think>)\n+', r'\1\n', content)
return content.strip()
@@ -1144,7 +1211,67 @@ class AIAgent:
def is_interrupted(self) -> bool:
"""Check if an interrupt has been requested."""
return self._interrupt_requested
# ── Honcho integration helpers ──
def _honcho_prefetch(self, user_message: str) -> str:
"""Fetch user context from Honcho for system prompt injection.
Returns a formatted context block, or empty string if unavailable.
"""
if not self._honcho or not self._honcho_session_key:
return ""
try:
ctx = self._honcho.get_prefetch_context(self._honcho_session_key, user_message)
if not ctx:
return ""
parts = []
rep = ctx.get("representation", "")
card = ctx.get("card", "")
if rep:
parts.append(rep)
if card:
parts.append(card)
if not parts:
return ""
return "# Honcho User Context\n" + "\n\n".join(parts)
except Exception as e:
logger.debug("Honcho prefetch failed (non-fatal): %s", e)
return ""
def _honcho_save_user_observation(self, content: str) -> str:
"""Route a memory tool target=user add to Honcho.
Sends the content as a user peer message so Honcho's reasoning
model can incorporate it into the user representation.
"""
if not content or not content.strip():
return json.dumps({"success": False, "error": "Content cannot be empty."})
try:
session = self._honcho.get_or_create(self._honcho_session_key)
session.add_message("user", f"[observation] {content.strip()}")
self._honcho.save(session)
return json.dumps({
"success": True,
"target": "user",
"message": "Saved to Honcho user model.",
})
except Exception as e:
logger.debug("Honcho user observation failed: %s", e)
return json.dumps({"success": False, "error": f"Honcho save failed: {e}"})
def _honcho_sync(self, user_content: str, assistant_content: str) -> None:
"""Sync the user/assistant message pair to Honcho."""
if not self._honcho or not self._honcho_session_key:
return
try:
session = self._honcho.get_or_create(self._honcho_session_key)
session.add_message("user", user_content)
session.add_message("assistant", assistant_content)
self._honcho.save(session)
except Exception as e:
logger.debug("Honcho sync failed (non-fatal): %s", e)
def _build_system_prompt(self, system_message: str = None) -> str:
"""
Assemble the full system prompt from all layers.
@@ -1184,6 +1311,7 @@ class AIAgent:
mem_block = self._memory_store.format_for_system_prompt("memory")
if mem_block:
prompt_parts.append(mem_block)
# USER.md is always included when enabled -- Honcho prefetch is additive.
if self._user_profile_enabled:
user_block = self._memory_store.format_for_system_prompt("user")
if user_block:
@@ -1865,11 +1993,11 @@ class AIAgent:
"model": self.model,
"messages": api_messages,
"tools": self.tools if self.tools else None,
"timeout": 600.0,
"timeout": 900.0,
}
if self.max_tokens is not None:
api_kwargs["max_tokens"] = self.max_tokens
api_kwargs.update(self._max_tokens_param(self.max_tokens))
extra_body = {}
@@ -1994,7 +2122,8 @@ class AIAgent:
"[System: The session is being compressed. "
"Please save anything worth remembering to your memories.]"
)
flush_msg = {"role": "user", "content": flush_content}
_sentinel = f"__flush_{id(self)}_{time.monotonic()}"
flush_msg = {"role": "user", "content": flush_content, "_flush_sentinel": _sentinel}
messages.append(flush_msg)
try:
@@ -2023,50 +2152,50 @@ class AIAgent:
messages.pop() # remove flush msg
return
if self.api_mode == "codex_responses":
codex_kwargs = self._build_api_kwargs(api_messages)
codex_kwargs["tools"] = self._responses_tools([memory_tool_def])
response = self._run_codex_stream(codex_kwargs)
assistant_message, _ = self._normalize_codex_response(response)
else:
api_kwargs = {
"model": self.model,
"messages": api_messages,
"tools": [memory_tool_def],
"temperature": 0.3,
"max_tokens": 1024,
}
response = self.client.chat.completions.create(**api_kwargs, timeout=30.0)
if not response.choices:
assistant_message = None
else:
assistant_message = response.choices[0].message
api_kwargs = {
"model": self.model,
"messages": api_messages,
"tools": [memory_tool_def],
"temperature": 0.3,
**self._max_tokens_param(1024),
}
if assistant_message and assistant_message.tool_calls:
# Execute only memory tool calls
for tc in assistant_message.tool_calls:
if tc.function.name == "memory":
try:
args = json.loads(tc.function.arguments)
from tools.memory_tool import memory_tool as _memory_tool
_memory_tool(
action=args.get("action"),
target=args.get("target", "memory"),
content=args.get("content"),
old_text=args.get("old_text"),
store=self._memory_store,
)
if not self.quiet_mode:
print(f" 🧠 Memory flush: saved to {args.get('target', 'memory')}")
except Exception as e:
logger.debug("Memory flush tool call failed: %s", e)
response = self.client.chat.completions.create(**api_kwargs, timeout=30.0)
if response.choices:
assistant_message = response.choices[0].message
if assistant_message.tool_calls:
# Execute only memory tool calls
for tc in assistant_message.tool_calls:
if tc.function.name == "memory":
try:
args = json.loads(tc.function.arguments)
flush_target = args.get("target", "memory")
from tools.memory_tool import memory_tool as _memory_tool
result = _memory_tool(
action=args.get("action"),
target=flush_target,
content=args.get("content"),
old_text=args.get("old_text"),
store=self._memory_store,
)
# Also send user observations to Honcho when active
if self._honcho and flush_target == "user" and args.get("action") == "add":
self._honcho_save_user_observation(args.get("content", ""))
if not self.quiet_mode:
print(f" 🧠 Memory flush: saved to {args.get('target', 'memory')}")
except Exception as e:
logger.debug("Memory flush tool call failed: %s", e)
except Exception as e:
logger.debug("Memory flush API call failed: %s", e)
finally:
# Strip flush artifacts: remove everything from the flush message onward
while messages and messages[-1] is not flush_msg and len(messages) > 0:
# Strip flush artifacts: remove everything from the flush message onward.
# Use sentinel marker instead of identity check for robustness.
while messages and messages[-1].get("_flush_sentinel") != _sentinel:
messages.pop()
if messages and messages[-1] is flush_msg:
if not messages:
break
if messages and messages[-1].get("_flush_sentinel") == _sentinel:
messages.pop()
def _compress_context(self, messages: list, system_message: str, *, approx_tokens: int = None) -> tuple:
@@ -2163,26 +2292,33 @@ class AIAgent:
tool_duration = time.time() - tool_start_time
if self.quiet_mode:
print(f" {_get_cute_tool_message_impl('todo', function_args, tool_duration, result=function_result)}")
elif function_name == "session_search" and self._session_db:
from tools.session_search_tool import session_search as _session_search
function_result = _session_search(
query=function_args.get("query", ""),
role_filter=function_args.get("role_filter"),
limit=function_args.get("limit", 3),
db=self._session_db,
)
elif function_name == "session_search":
if not self._session_db:
function_result = json.dumps({"success": False, "error": "Session database not available."})
else:
from tools.session_search_tool import session_search as _session_search
function_result = _session_search(
query=function_args.get("query", ""),
role_filter=function_args.get("role_filter"),
limit=function_args.get("limit", 3),
db=self._session_db,
)
tool_duration = time.time() - tool_start_time
if self.quiet_mode:
print(f" {_get_cute_tool_message_impl('session_search', function_args, tool_duration, result=function_result)}")
elif function_name == "memory":
target = function_args.get("target", "memory")
from tools.memory_tool import memory_tool as _memory_tool
function_result = _memory_tool(
action=function_args.get("action"),
target=function_args.get("target", "memory"),
target=target,
content=function_args.get("content"),
old_text=function_args.get("old_text"),
store=self._memory_store,
)
# Also send user observations to Honcho when active
if self._honcho and target == "user" and function_args.get("action") == "add":
self._honcho_save_user_observation(function_args.get("content", ""))
tool_duration = time.time() - tool_start_time
if self.quiet_mode:
print(f" {_get_cute_tool_message_impl('memory', function_args, tool_duration, result=function_result)}")
@@ -2258,12 +2394,19 @@ class AIAgent:
try:
function_result = handle_function_call(function_name, function_args, effective_task_id)
_spinner_result = function_result
except Exception as tool_error:
function_result = f"Error executing tool '{function_name}': {tool_error}"
logger.error("handle_function_call raised for %s: %s", function_name, tool_error)
finally:
tool_duration = time.time() - tool_start_time
cute_msg = _get_cute_tool_message_impl(function_name, function_args, tool_duration, result=_spinner_result)
spinner.stop(cute_msg)
else:
function_result = handle_function_call(function_name, function_args, effective_task_id)
try:
function_result = handle_function_call(function_name, function_args, effective_task_id)
except Exception as tool_error:
function_result = f"Error executing tool '{function_name}': {tool_error}"
logger.error("handle_function_call raised for %s: %s", function_name, tool_error)
tool_duration = time.time() - tool_start_time
result_preview = function_result[:200] if len(function_result) > 200 else function_result
@@ -2350,12 +2493,19 @@ class AIAgent:
if _is_nous:
summary_extra_body["tags"] = ["product=hermes-agent"]
if self.api_mode == "codex_responses":
summary_kwargs = self._build_api_kwargs(api_messages)
summary_kwargs["tools"] = None
summary_response = self._run_codex_stream(summary_kwargs)
assistant_message, _ = self._normalize_codex_response(summary_response)
final_response = assistant_message.content or ""
summary_kwargs = {
"model": self.model,
"messages": api_messages,
}
if self.max_tokens is not None:
summary_kwargs.update(self._max_tokens_param(self.max_tokens))
if summary_extra_body:
summary_kwargs["extra_body"] = summary_extra_body
summary_response = self.client.chat.completions.create(**summary_kwargs)
if summary_response.choices and summary_response.choices[0].message.content:
final_response = summary_response.choices[0].message.content
if "<think>" in final_response:
final_response = re.sub(r'<think>.*?</think>\s*', '', final_response, flags=re.DOTALL).strip()
if final_response:
@@ -2435,6 +2585,10 @@ class AIAgent:
# Track user turns for memory flush and periodic nudge logic
self._user_turn_count += 1
# Preserve the original user message before nudge injection.
# Honcho should receive the actual user input, not system nudges.
original_user_message = user_message
# Periodic memory nudge: remind the model to consider saving memories.
# Counter resets whenever the memory tool is actually used.
if (self._memory_nudge_interval > 0
@@ -2459,6 +2613,14 @@ class AIAgent:
)
self._iters_since_skill = 0
# Honcho prefetch: retrieve user context for system prompt injection
self._honcho_context = ""
if self._honcho and self._honcho_session_key:
try:
self._honcho_context = self._honcho_prefetch(user_message)
except Exception as e:
logger.debug("Honcho prefetch failed (non-fatal): %s", e)
# Add user message
user_msg = {"role": "user", "content": user_message}
messages.append(user_msg)
@@ -2501,6 +2663,22 @@ class AIAgent:
api_call_count += 1
# Fire step_callback for gateway hooks (agent:step event)
if self.step_callback is not None:
try:
prev_tools = []
for _m in reversed(messages):
if _m.get("role") == "assistant" and _m.get("tool_calls"):
prev_tools = [
tc["function"]["name"]
for tc in _m["tool_calls"]
if isinstance(tc, dict)
]
break
self.step_callback(api_call_count, prev_tools)
except Exception as _step_err:
logger.debug("step_callback error (iteration %s): %s", api_call_count, _step_err)
# Track tool-calling iterations for skill nudge.
# Counter resets whenever skill_manage is actually used.
if (self._skill_nudge_interval > 0
@@ -2538,6 +2716,8 @@ class AIAgent:
effective_system = active_system_prompt or ""
if self.ephemeral_system_prompt:
effective_system = (effective_system + "\n\n" + self.ephemeral_system_prompt).strip()
if self._honcho_context:
effective_system = (effective_system + "\n\n" + self._honcho_context).strip()
if effective_system:
api_messages = [{"role": "system", "content": effective_system}] + api_messages
@@ -2587,7 +2767,7 @@ class AIAgent:
finish_reason = "stop"
while retry_count <= max_retries:
while retry_count < max_retries:
try:
api_kwargs = self._build_api_kwargs(api_messages)
if self.api_mode == "codex_responses":
@@ -2699,6 +2879,7 @@ class AIAgent:
if self._interrupt_requested:
print(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.")
self._persist_session(messages, conversation_history)
self.clear_interrupt()
return {
"final_response": "Operation interrupted.",
"messages": messages,
@@ -2837,6 +3018,7 @@ class AIAgent:
if self._interrupt_requested:
print(f"{self.log_prefix}⚡ Interrupt detected during error handling, aborting retries.")
self._persist_session(messages, conversation_history)
self.clear_interrupt()
return {
"final_response": "Operation interrupted.",
"messages": messages,
@@ -2845,10 +3027,45 @@ class AIAgent:
"interrupted": True,
}
# Check for 413 payload-too-large BEFORE generic 4xx handler.
# A 413 is a payload-size error — the correct response is to
# compress history and retry, not abort immediately.
status_code = getattr(api_error, "status_code", None)
is_payload_too_large = (
status_code == 413
or 'request entity too large' in error_msg
or 'payload too large' in error_msg
or 'error code: 413' in error_msg
)
if is_payload_too_large:
print(f"{self.log_prefix}⚠️ Request payload too large (413) - attempting compression...")
original_len = len(messages)
messages, active_system_prompt = self._compress_context(
messages, system_message, approx_tokens=approx_tokens
)
if len(messages) < original_len:
print(f"{self.log_prefix} 🗜️ Compressed {original_len}{len(messages)} messages, retrying...")
continue # Retry with compressed messages
else:
print(f"{self.log_prefix}❌ Payload too large and cannot compress further.")
logging.error(f"{self.log_prefix}413 payload too large. Cannot compress further.")
self._persist_session(messages, conversation_history)
return {
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": "Request payload too large (413). Cannot compress further.",
"partial": True
}
# Check for non-retryable client errors (4xx HTTP status codes).
# These indicate a problem with the request itself (bad model ID,
# invalid API key, forbidden, etc.) and will never succeed on retry.
is_client_status_error = isinstance(status_code, int) and 400 <= status_code < 500
# Note: 413 is excluded — it's handled above via compression.
is_client_status_error = isinstance(status_code, int) and 400 <= status_code < 500 and status_code != 413
is_client_error = is_client_status_error or any(phrase in error_msg for phrase in [
'error code: 400', 'error code: 401', 'error code: 403',
'error code: 404', 'error code: 422',
@@ -2856,7 +3073,7 @@ class AIAgent:
'invalid api key', 'invalid_api_key', 'authentication',
'unauthorized', 'forbidden', 'not found',
])
if is_client_error:
self._dump_api_request_debug(
api_kwargs, reason="non_retryable_client_error", error=api_error,
@@ -2876,8 +3093,9 @@ class AIAgent:
# Check for non-retryable errors (context length exceeded)
is_context_length_error = any(phrase in error_msg for phrase in [
'context length', 'maximum context', 'token limit',
'too many tokens', 'reduce the length', 'exceeds the limit'
'context length', 'maximum context', 'token limit',
'too many tokens', 'reduce the length', 'exceeds the limit',
'request entity too large', # OpenRouter/Nous 413 safety net
])
if is_context_length_error:
@@ -2912,9 +3130,10 @@ class AIAgent:
raise api_error
wait_time = min(2 ** retry_count, 60) # Exponential backoff: 2s, 4s, 8s, 16s, 32s, 60s, 60s
print(f"⚠️ OpenAI-compatible API call failed (attempt {retry_count}/{max_retries}): {str(api_error)[:100]}")
print(f"⏳ Retrying in {wait_time}s...")
logging.warning(f"API retry {retry_count}/{max_retries} after error: {api_error}")
if retry_count >= max_retries:
print(f"{self.log_prefix}⚠️ API call failed after {retry_count} attempts: {str(api_error)[:100]}")
print(f"{self.log_prefix}⏳ Final retry in {wait_time}s...")
# Sleep in small increments so we can respond to interrupts quickly
# instead of blocking the entire wait_time in one sleep() call
@@ -2923,6 +3142,7 @@ class AIAgent:
if self._interrupt_requested:
print(f"{self.log_prefix}⚡ Interrupt detected during retry wait, aborting.")
self._persist_session(messages, conversation_history)
self.clear_interrupt()
return {
"final_response": "Operation interrupted.",
"messages": messages,
@@ -3194,7 +3414,8 @@ class AIAgent:
tool_names.append(fn.get("name", "unknown"))
msg["content"] = f"Calling the {', '.join(tool_names)} tool{'s' if len(tool_names) > 1 else ''}..."
break
final_response = fallback
# Strip <think> blocks from fallback content for user display
final_response = self._strip_think_blocks(fallback).strip()
break
# No fallback -- append the empty message as-is
@@ -3253,6 +3474,9 @@ class AIAgent:
codex_ack_continuations = 0
# Strip <think> blocks from user-facing response (keep raw in messages for trajectory)
final_response = self._strip_think_blocks(final_response).strip()
final_msg = self._build_assistant_message(assistant_message, finish_reason)
messages.append(final_msg)
@@ -3327,7 +3551,11 @@ class AIAgent:
# Persist session to both JSON log and SQLite
self._persist_session(messages, conversation_history)
# Sync conversation to Honcho for user modeling
if final_response and not interrupted:
self._honcho_sync(original_user_message, final_response)
# Build result with interrupt info if applicable
result = {
"final_response": final_response,