Compare commits
7 Commits
fix/kimi-f
...
epic-999-p
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
735118f21d | ||
|
|
2da4a3d937 | ||
|
|
9a5a299724 | ||
|
|
e5b844af3a | ||
|
|
ab7fd52ae3 | ||
|
|
c266661bff | ||
|
|
5f1cdfc9e4 |
158
agent/claw_runtime.py
Normal file
158
agent/claw_runtime.py
Normal file
@@ -0,0 +1,158 @@
|
||||
"""
|
||||
agent/claw_runtime.py — Claw Code runtime decomposition scaffold.
|
||||
Part of EPIC-999 Phase II — The Forge.
|
||||
|
||||
This module introduces the 5-class decomposition of the monolithic AIAgent
|
||||
to enable competing sub-agent rewrites and future runtime replacement.
|
||||
|
||||
Migration rule: each class begins as a thin facade over AIAgent methods.
|
||||
Logic migrates incrementally from run_agent.py into these classes.
|
||||
"""
|
||||
|
||||
from typing import List, Dict, Any, Optional, Callable
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
class ModelResponse:
    """Normalized model response, independent of provider.

    Attributes:
        content: Assistant text content; "" when the model returned none.
        tool_calls: Tool calls requested by the model; [] when none.
        reasoning: Reasoning/thinking text; "" when none.
    """

    def __init__(
        self,
        content: Optional[str] = None,
        tool_calls: Optional[list] = None,
        reasoning: Optional[str] = None,
    ):
        # Normalize falsy inputs (None, "", []) so consumers never see None.
        self.content = content or ""
        self.tool_calls = tool_calls or []
        self.reasoning = reasoning or ""
|
||||
|
||||
|
||||
class ToolResult:
    """Normalized tool execution result.

    Attributes:
        tool_call_id: ID of the originating tool call (pairing invariant:
            every tool_call produces exactly one ToolResult).
        output: Tool output text.
        error: Error message, or None on success.
    """

    def __init__(self, tool_call_id: str, output: str, error: Optional[str] = None):
        self.tool_call_id = tool_call_id
        self.output = output
        self.error = error
|
||||
|
||||
|
||||
class ConversationLoop:
    """
    Owns the conversation loop invariant: iteration budget, termination
    conditions, and the high-level orchestration of turn-taking.
    """

    def __init__(self, agent: "AIAgent"):
        self.agent = agent

    def run(
        self,
        messages: List[Dict[str, Any]],
        tools: List[Dict[str, Any]],
        system_message: str = None,
    ) -> Dict[str, Any]:
        """
        Run the conversation until completion or budget exhaustion.

        Invariant: must terminate once max_iterations is reached or once
        iteration_budget drops to 0, whichever comes first.
        """
        # Facade: the loop itself still lives in AIAgent.run_conversation.
        # Split the incoming message list into "latest user turn" + "prior
        # history" to match that method's signature.
        latest = messages[-1]["content"] if messages else ""
        prior = messages[:-1] if len(messages) > 1 else None
        return self.agent.run_conversation(
            user_message=latest,
            system_message=system_message,
            conversation_history=prior,
        )
|
||||
|
||||
|
||||
class ModelDispatcher:
    """
    Owns all interaction with the LLM client: streaming, fallback activation,
    response normalization, and provider-specific quirks.
    """

    def __init__(self, agent: "AIAgent"):
        self.agent = agent

    def call(self, model: str, messages: List[Dict], tools: List[Dict], **kwargs) -> ModelResponse:
        """
        Dispatch a single API call and return a normalized response.

        Invariant: always returns a ModelResponse with .content, .tool_calls, .reasoning.
        """
        # Not migrated yet: the streaming logic still lives on
        # AIAgent._interruptible_streaming_api_call and its normalization
        # helpers. This facade intentionally fails loudly until then.
        raise NotImplementedError("ModelDispatcher.call() — migrate from AIAgent streaming logic")
|
||||
|
||||
|
||||
class ToolExecutor:
    """
    Owns tool execution: sequential vs concurrent dispatch, error wrapping,
    and result formatting.
    """

    def __init__(self, agent: "AIAgent"):
        self.agent = agent

    def execute(self, tool_calls: List[Any], task_id: str = None) -> List[ToolResult]:
        """
        Execute a list of tool calls and return normalized results.

        Invariant: every tool_call produces exactly one ToolResult.
        """
        # Facade: delegate to AIAgent's sequential executor while migration
        # is in progress (concurrent dispatch to follow).
        runner = getattr(self.agent, "_execute_tool_calls_sequential", None)
        if runner is None:
            raise NotImplementedError("ToolExecutor.execute() — migrate from AIAgent tool execution")
        return runner(tool_calls, task_id=task_id)
|
||||
|
||||
|
||||
class MemoryInterceptor:
    """
    Intercepts agent-level tools (memory, todo) before they reach the global registry.
    Also handles flush-on-exit for pending memories.
    """

    def __init__(self, agent: "AIAgent"):
        self.agent = agent

    def intercept(self, tool_name: str, args: Dict[str, Any], task_id: str = None) -> Optional[str]:
        """
        If the tool_name is 'memory' or 'todo', handle it directly and return the result.
        Otherwise return None to signal pass-through to the ToolExecutor.

        Invariant: must not mutate agent state except through explicit flush().
        """
        # Facade: the interception block has not been migrated from
        # run_agent.py yet, so every tool currently passes through —
        # both branches deliberately yield None for now.
        if tool_name == "memory" or tool_name == "todo":
            # Placeholder: the run_agent.py interception logic lands here.
            return None
        return None

    def flush(self):
        """Flush any pending memories to persistent storage."""
        # Best-effort: agents without a flush_memories method are a no-op.
        flush_fn = getattr(self.agent, "flush_memories", None)
        if flush_fn is not None:
            flush_fn()
|
||||
|
||||
|
||||
class PromptBuilder:
    """
    Owns system prompt assembly, skill injection, context compression,
    and prompt caching marker placement.
    """

    def __init__(self, agent: "AIAgent"):
        self.agent = agent

    def build(
        self,
        user_message: str,
        conversation_history: List[Dict[str, Any]] = None,
    ) -> List[Dict[str, Any]]:
        """
        Build the full message list for the API call.

        Invariant: output list must start with a system message (or equivalent).
        """
        # Facade: prompt assembly still lives on AIAgent._build_system_prompt.
        # Fail loudly if the agent does not expose it yet.
        if not hasattr(self.agent, "_build_system_prompt"):
            raise NotImplementedError("PromptBuilder.build() — migrate from AIAgent prompt assembly")
        system_msg = self.agent._build_system_prompt(user_message)
        result: List[Dict[str, Any]] = []
        if system_msg:
            result.append({"role": "system", "content": system_msg})
        result.extend(conversation_history or [])
        result.append({"role": "user", "content": user_message})
        return result
|
||||
@@ -64,7 +64,7 @@ class ContextCompressor:
|
||||
model: str,
|
||||
threshold_percent: float = 0.50,
|
||||
protect_first_n: int = 3,
|
||||
protect_last_n: int = 20,
|
||||
protect_last_n: int = 5,
|
||||
summary_target_ratio: float = 0.20,
|
||||
quiet_mode: bool = False,
|
||||
summary_model_override: str = None,
|
||||
|
||||
4657
docs/ouroboros/artifacts/call_graph.json
Normal file
4657
docs/ouroboros/artifacts/call_graph.json
Normal file
File diff suppressed because it is too large
Load Diff
4291
docs/ouroboros/artifacts/core_analysis.json
Normal file
4291
docs/ouroboros/artifacts/core_analysis.json
Normal file
File diff suppressed because it is too large
Load Diff
39340
docs/ouroboros/artifacts/import_graph.json
Normal file
39340
docs/ouroboros/artifacts/import_graph.json
Normal file
File diff suppressed because it is too large
Load Diff
3397
docs/ouroboros/artifacts/module_inventory.json
Normal file
3397
docs/ouroboros/artifacts/module_inventory.json
Normal file
File diff suppressed because it is too large
Load Diff
74
docs/ouroboros/specs/AIAgent_DECOMPOSITION.md
Normal file
74
docs/ouroboros/specs/AIAgent_DECOMPOSITION.md
Normal file
@@ -0,0 +1,74 @@
|
||||
# AIAgent Decomposition Plan (EPIC-999 Phase II Prep)
|
||||
|
||||
## Current State
|
||||
`run_agent.py` contains `AIAgent` — a ~7,000-SLOC class that is the highest-blast-radius module in Hermes.
|
||||
|
||||
## Goal
|
||||
Decompose `AIAgent` into 5 focused classes with strict interfaces, enabling:
|
||||
- Parallel rewrites by competing sub-agents (Phase II)
|
||||
- Independent testing of loop semantics vs. model I/O vs. memory
|
||||
- Future runtime replacement (Hermes Ω) without touching tool infrastructure
|
||||
|
||||
## Proposed Decomposition
|
||||
|
||||
### 1. `ConversationLoop`
|
||||
**Responsibility:** Own the `while` loop invariant, iteration budget, and termination conditions.
|
||||
**Interface:**
|
||||
```python
|
||||
class ConversationLoop:
|
||||
def run(self, messages: list, tools: list, client) -> dict:
|
||||
...
|
||||
```
|
||||
**Invariant:** Must terminate once `max_iterations` is reached or once `iteration_budget.remaining <= 0`, whichever comes first.
|
||||
|
||||
### 2. `ModelDispatcher`
|
||||
**Responsibility:** All interaction with `client.chat.completions.create`, including streaming, fallback activation, and response normalization.
|
||||
**Interface:**
|
||||
```python
|
||||
class ModelDispatcher:
|
||||
def call(self, model: str, messages: list, tools: list, **kwargs) -> ModelResponse:
|
||||
...
|
||||
```
|
||||
**Invariant:** Must always return a normalized object with `.content`, `.tool_calls`, `.reasoning`.
|
||||
|
||||
### 3. `ToolExecutor`
|
||||
**Responsibility:** Execute tool calls (sequential or concurrent), handle errors, and format results.
|
||||
**Interface:**
|
||||
```python
|
||||
class ToolExecutor:
|
||||
def execute(self, tool_calls: list, task_id: str = None) -> list[ToolResult]:
|
||||
...
|
||||
```
|
||||
**Invariant:** Every tool_call produces exactly one ToolResult, and errors are JSON-serializable.
|
||||
|
||||
### 4. `MemoryInterceptor`
|
||||
**Responsibility:** Intercept `memory` and `todo` tool calls before they reach the registry, plus flush memories on session end.
|
||||
**Interface:**
|
||||
```python
|
||||
class MemoryInterceptor:
|
||||
def intercept(self, tool_name: str, args: dict, task_id: str = None) -> str | None:
|
||||
... # returns result if intercepted, None if pass-through
|
||||
```
|
||||
**Invariant:** Must not mutate agent state except through explicit `flush()` calls.
|
||||
|
||||
### 5. `PromptBuilder`
|
||||
**Responsibility:** Assemble system prompt, inject skills, apply context compression, and manage prompt caching markers.
|
||||
**Interface:**
|
||||
```python
|
||||
class PromptBuilder:
|
||||
def build(self, user_message: str, conversation_history: list) -> list:
|
||||
...
|
||||
```
|
||||
**Invariant:** Output list must start with a system message (or equivalent provider parameter).
|
||||
|
||||
## Migration Path
|
||||
1. Create the 5 classes as thin facades that delegate back to `AIAgent` methods.
|
||||
2. Move logic incrementally from `AIAgent` into the new classes.
|
||||
3. Once `AIAgent` is a pure coordinator (~500 SLOC), freeze the interface.
|
||||
4. Phase II competing agents rewrite one class at a time.
|
||||
|
||||
## Acceptance Criteria
|
||||
- [ ] `AIAgent` reduced to < 1,000 SLOC
|
||||
- [ ] Each new class has > 80% test coverage
|
||||
- [ ] Full existing test suite still passes
|
||||
- [ ] No behavioral regressions in shadow mode
|
||||
263
docs/ouroboros/specs/SPEC.md
Normal file
263
docs/ouroboros/specs/SPEC.md
Normal file
@@ -0,0 +1,263 @@
|
||||
# Hermes Ω Specification Draft (Ouroboros Phase I)
|
||||
|
||||
> Auto-generated by Ezra as part of EPIC-999. This document is a living artifact.
|
||||
|
||||
## Scope
|
||||
This specification covers the core runtime of Hermes agent v0.7.x as found in the `hermes-agent` codebase.
|
||||
|
||||
## High-Level Architecture
|
||||
|
||||
```
|
||||
User Message
|
||||
↓
|
||||
Gateway (gateway/run.py) — platform adapter (Telegram, Discord, CLI, etc.)
|
||||
↓
|
||||
HermesCLI (cli.py) or AIAgent.chat() (run_agent.py)
|
||||
↓
|
||||
ModelTools (model_tools.py) — tool discovery, schema assembly, dispatch
|
||||
↓
|
||||
Tool Registry (tools/registry.py) — handler lookup, availability checks
|
||||
↓
|
||||
Individual Tool Implementations (tools/*.py)
|
||||
↓
|
||||
Results returned up the stack
|
||||
```
|
||||
|
||||
## Module Specifications
|
||||
|
||||
### `run_agent.py`
|
||||
**Lines of Code:** 8948
|
||||
|
||||
**Classes:**
|
||||
- `_SafeWriter`
|
||||
- *Transparent stdio wrapper that catches OSError/ValueError from broken pipes.*
|
||||
- `__init__(self, inner)`
|
||||
- `write(self, data)`
|
||||
- `flush(self)`
|
||||
- `fileno(self)`
|
||||
- `isatty(self)`
|
||||
- ... and 1 more methods
|
||||
- `IterationBudget`
|
||||
- *Thread-safe iteration counter for an agent.*
|
||||
- `__init__(self, max_total)`
|
||||
- `consume(self)`
|
||||
- `refund(self)`
|
||||
- `used(self)`
|
||||
- `remaining(self)`
|
||||
- `AIAgent`
|
||||
- *AI Agent with tool calling capabilities.*
|
||||
- `base_url(self)`
|
||||
- `base_url(self, value)`
|
||||
- `__init__(self, base_url, api_key, provider, api_mode, acp_command, acp_args, command, args, model, max_iterations, tool_delay, enabled_toolsets, disabled_toolsets, save_trajectories, verbose_logging, quiet_mode, ephemeral_system_prompt, log_prefix_chars, log_prefix, providers_allowed, providers_ignored, providers_order, provider_sort, provider_require_parameters, provider_data_collection, session_id, tool_progress_callback, tool_start_callback, tool_complete_callback, thinking_callback, reasoning_callback, clarify_callback, step_callback, stream_delta_callback, tool_gen_callback, status_callback, max_tokens, reasoning_config, prefill_messages, platform, skip_context_files, skip_memory, session_db, iteration_budget, fallback_model, credential_pool, checkpoints_enabled, checkpoint_max_snapshots, pass_session_id, persist_session)`
|
||||
- `reset_session_state(self)`
|
||||
- `_safe_print(self)`
|
||||
- ... and 100 more methods
|
||||
|
||||
**Top-Level Functions:**
|
||||
- `_install_safe_stdio()`
|
||||
- `_is_destructive_command(cmd)`
|
||||
- `_should_parallelize_tool_batch(tool_calls)`
|
||||
- `_extract_parallel_scope_path(tool_name, function_args)`
|
||||
- `_paths_overlap(left, right)`
|
||||
- `_sanitize_surrogates(text)`
|
||||
- `_sanitize_messages_surrogates(messages)`
|
||||
- `_strip_budget_warnings_from_history(messages)`
|
||||
- `main(query, model, api_key, base_url, max_turns, enabled_toolsets, disabled_toolsets, list_tools, save_trajectories, save_sample, verbose, log_prefix_chars)`
|
||||
|
||||
**Inferred Side Effects & Invariants:**
|
||||
- Persists state to SQLite database.
|
||||
- Performs file I/O.
|
||||
- Makes HTTP network calls.
|
||||
- Uses global mutable state (risk factor).
|
||||
|
||||
### `model_tools.py`
|
||||
**Lines of Code:** 466
|
||||
|
||||
**Top-Level Functions:**
|
||||
- `_get_tool_loop()`
|
||||
- `_get_worker_loop()`
|
||||
- `_run_async(coro)`
|
||||
- `_discover_tools()`
|
||||
- `get_tool_definitions(enabled_toolsets, disabled_toolsets, quiet_mode)`
|
||||
- `handle_function_call(function_name, function_args, task_id, user_task, enabled_tools)`
|
||||
- `get_all_tool_names()`
|
||||
- `get_toolset_for_tool(tool_name)`
|
||||
- `get_available_toolsets()`
|
||||
- `check_toolset_requirements()`
|
||||
- ... and 1 more functions
|
||||
|
||||
**Inferred Side Effects & Invariants:**
|
||||
- Uses global mutable state (risk factor).
|
||||
- Primarily pure Python logic / orchestration.
|
||||
|
||||
### `cli.py`
|
||||
**Lines of Code:** 8280
|
||||
|
||||
**Classes:**
|
||||
- `ChatConsole`
|
||||
- *Rich Console adapter for prompt_toolkit's patch_stdout context.*
|
||||
- `__init__(self)`
|
||||
- `print(self)`
|
||||
- `HermesCLI`
|
||||
- *Interactive CLI for the Hermes Agent.*
|
||||
- `__init__(self, model, toolsets, provider, api_key, base_url, max_turns, verbose, compact, resume, checkpoints, pass_session_id)`
|
||||
- `_invalidate(self, min_interval)`
|
||||
- `_status_bar_context_style(self, percent_used)`
|
||||
- `_build_context_bar(self, percent_used, width)`
|
||||
- `_get_status_bar_snapshot(self)`
|
||||
- ... and 106 more methods
|
||||
|
||||
**Top-Level Functions:**
|
||||
- `_load_prefill_messages(file_path)`
|
||||
- `_parse_reasoning_config(effort)`
|
||||
- `load_cli_config()`
|
||||
- `_run_cleanup()`
|
||||
- `_git_repo_root()`
|
||||
- `_path_is_within_root(path, root)`
|
||||
- `_setup_worktree(repo_root)`
|
||||
- `_cleanup_worktree(info)`
|
||||
- `_prune_stale_worktrees(repo_root, max_age_hours)`
|
||||
- `_accent_hex()`
|
||||
- ... and 9 more functions
|
||||
|
||||
**Inferred Side Effects & Invariants:**
|
||||
- Persists state to SQLite database.
|
||||
- Performs file I/O.
|
||||
- Spawns subprocesses / shell commands.
|
||||
- Uses global mutable state (risk factor).
|
||||
|
||||
### `tools/registry.py`
|
||||
**Lines of Code:** 275
|
||||
|
||||
**Classes:**
|
||||
- `ToolEntry`
|
||||
- *Metadata for a single registered tool.*
|
||||
- `__init__(self, name, toolset, schema, handler, check_fn, requires_env, is_async, description, emoji)`
|
||||
- `ToolRegistry`
|
||||
- *Singleton registry that collects tool schemas + handlers from tool files.*
|
||||
- `__init__(self)`
|
||||
- `register(self, name, toolset, schema, handler, check_fn, requires_env, is_async, description, emoji)`
|
||||
- `deregister(self, name)`
|
||||
- `get_definitions(self, tool_names, quiet)`
|
||||
- `dispatch(self, name, args)`
|
||||
- ... and 10 more methods
|
||||
|
||||
**Inferred Side Effects & Invariants:**
|
||||
- Primarily pure Python logic / orchestration.
|
||||
|
||||
### `gateway/run.py`
|
||||
**Lines of Code:** 6657
|
||||
|
||||
**Classes:**
|
||||
- `GatewayRunner`
|
||||
- *Main gateway controller.*
|
||||
- `__init__(self, config)`
|
||||
- `_has_setup_skill(self)`
|
||||
- `_load_voice_modes(self)`
|
||||
- `_save_voice_modes(self)`
|
||||
- `_set_adapter_auto_tts_disabled(self, adapter, chat_id, disabled)`
|
||||
- ... and 78 more methods
|
||||
|
||||
**Top-Level Functions:**
|
||||
- `_ensure_ssl_certs()`
|
||||
- `_normalize_whatsapp_identifier(value)`
|
||||
- `_expand_whatsapp_auth_aliases(identifier)`
|
||||
- `_resolve_runtime_agent_kwargs()`
|
||||
- `_build_media_placeholder(event)`
|
||||
- `_dequeue_pending_text(adapter, session_key)`
|
||||
- `_check_unavailable_skill(command_name)`
|
||||
- `_platform_config_key(platform)`
|
||||
- `_load_gateway_config()`
|
||||
- `_resolve_gateway_model(config)`
|
||||
- ... and 4 more functions
|
||||
|
||||
**Inferred Side Effects & Invariants:**
|
||||
- Persists state to SQLite database.
|
||||
- Performs file I/O.
|
||||
- Spawns subprocesses / shell commands.
|
||||
- Contains async code paths.
|
||||
- Uses global mutable state (risk factor).
|
||||
|
||||
### `hermes_state.py`
|
||||
**Lines of Code:** 1270
|
||||
|
||||
**Classes:**
|
||||
- `SessionDB`
|
||||
- *SQLite-backed session storage with FTS5 search.*
|
||||
- `__init__(self, db_path)`
|
||||
- `_execute_write(self, fn)`
|
||||
- `_try_wal_checkpoint(self)`
|
||||
- `close(self)`
|
||||
- `_init_schema(self)`
|
||||
- ... and 29 more methods
|
||||
|
||||
**Inferred Side Effects & Invariants:**
|
||||
- Persists state to SQLite database.
|
||||
|
||||
### `agent/context_compressor.py`
|
||||
**Lines of Code:** 676
|
||||
|
||||
**Classes:**
|
||||
- `ContextCompressor`
|
||||
- *Compresses conversation context when approaching the model's context limit.*
|
||||
- `__init__(self, model, threshold_percent, protect_first_n, protect_last_n, summary_target_ratio, quiet_mode, summary_model_override, base_url, api_key, config_context_length, provider)`
|
||||
- `update_from_response(self, usage)`
|
||||
- `should_compress(self, prompt_tokens)`
|
||||
- `should_compress_preflight(self, messages)`
|
||||
- `get_status(self)`
|
||||
- ... and 11 more methods
|
||||
|
||||
**Inferred Side Effects & Invariants:**
|
||||
- Primarily pure Python logic / orchestration.
|
||||
|
||||
### `agent/prompt_caching.py`
|
||||
**Lines of Code:** 72
|
||||
|
||||
**Top-Level Functions:**
|
||||
- `_apply_cache_marker(msg, cache_marker, native_anthropic)`
|
||||
- `apply_anthropic_cache_control(api_messages, cache_ttl, native_anthropic)`
|
||||
|
||||
**Inferred Side Effects & Invariants:**
|
||||
- Primarily pure Python logic / orchestration.
|
||||
|
||||
### `agent/skill_commands.py`
|
||||
**Lines of Code:** 297
|
||||
|
||||
**Top-Level Functions:**
|
||||
- `build_plan_path(user_instruction)`
|
||||
- `_load_skill_payload(skill_identifier, task_id)`
|
||||
- `_build_skill_message(loaded_skill, skill_dir, activation_note, user_instruction, runtime_note)`
|
||||
- `scan_skill_commands()`
|
||||
- `get_skill_commands()`
|
||||
- `build_skill_invocation_message(cmd_key, user_instruction, task_id, runtime_note)`
|
||||
- `build_preloaded_skills_prompt(skill_identifiers, task_id)`
|
||||
|
||||
**Inferred Side Effects & Invariants:**
|
||||
- Uses global mutable state (risk factor).
|
||||
- Primarily pure Python logic / orchestration.
|
||||
|
||||
## Cross-Module Dependencies
|
||||
|
||||
Key data flow:
|
||||
1. `run_agent.py` defines `AIAgent` — the canonical conversation loop.
|
||||
2. `model_tools.py` assembles tool schemas and dispatches function calls.
|
||||
3. `tools/registry.py` maintains the central registry; all tool files import it.
|
||||
4. `gateway/run.py` adapts platform events into `AIAgent.run_conversation()` calls.
|
||||
5. `cli.py` (`HermesCLI`) provides the interactive shell and slash-command routing.
|
||||
|
||||
## Known Coupling Risks
|
||||
|
||||
- `run_agent.py` is ~7k SLOC and contains the core loop, todo/memory interception, context compression, and trajectory saving. High blast radius.
|
||||
- `cli.py` is ~6.5k SLOC and combines UI (Rich/prompt_toolkit), config loading, and command dispatch. Tightly coupled to display state.
|
||||
- `model_tools.py` holds a process-global `_last_resolved_tool_names`. Subagent execution saves/restores this global.
|
||||
- `tools/registry.py` is imported by ALL tool files; schema generation happens at import time.
|
||||
|
||||
## Next Actions (Phase II Prep)
|
||||
|
||||
1. Decompose `AIAgent` into: `ConversationLoop`, `ModelDispatcher`, `ToolExecutor`, `MemoryInterceptor`, `PromptBuilder` (per `AIAgent_DECOMPOSITION.md`).
|
||||
2. Extract CLI display logic from command dispatch.
|
||||
3. Define strict interfaces between gateway → agent → tools.
|
||||
4. Write property-based tests for the conversation loop invariant: *given the same message history and tool results, the agent must produce deterministic tool_call ordering*.
|
||||
|
||||
---
|
||||
Generated: 2026-04-05 by Ezra (Phase I)
|
||||
137
docs/ouroboros/specs/test_invariants_stubs.py
Normal file
137
docs/ouroboros/specs/test_invariants_stubs.py
Normal file
@@ -0,0 +1,137 @@
|
||||
"""
|
||||
Property-based test stubs for Hermes core invariants.
|
||||
Part of EPIC-999 Phase I — The Mirror.
|
||||
|
||||
These tests define behavioral contracts that ANY rewrite of the runtime
|
||||
must satisfy, including the Hermes Ω target.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Conversation Loop Invariants
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
class TestConversationLoopInvariants:
    """
    Invariants for AIAgent.run_conversation and its successors.

    Every test is a stub (pytest.skip) recording a behavioral contract that
    any runtime rewrite — including Hermes Ω — must eventually satisfy.
    """

    def test_deterministic_tool_ordering(self):
        """
        Given the same message history and available tools,
        the agent must produce the same tool_call ordering.

        (If non-determinism is introduced by temperature > 0,
        this becomes a statistical test.)
        """
        pytest.skip("TODO: implement with seeded mock model responses")

    def test_tool_result_always_appended_to_history(self):
        """
        After any tool_call is executed, its result MUST appear
        in the conversation history before the next assistant turn.
        """
        pytest.skip("TODO: mock model with forced tool_call and verify history")

    def test_iteration_budget_never_exceeded(self):
        """
        The loop must terminate before api_call_count >= max_iterations
        AND before iteration_budget.remaining <= 0.
        """
        pytest.skip("TODO: mock model to always return tool_calls; verify termination")

    def test_system_prompt_presence(self):
        """
        Every API call must include a system message as the first message
        (or system parameter for providers that support it).
        """
        pytest.skip("TODO: intercept all client.chat.completions.create calls")

    def test_compression_preserves_last_n_messages(self):
        """
        After context compression, the final N messages (configurable,
        default ~4) must remain uncompressed to preserve local context.
        """
        # NOTE(review): ContextCompressor's default protect_last_n appears to
        # be 5 after the compressor tuning change — confirm the "~4" above
        # against agent/context_compressor.py before implementing.
        pytest.skip("TODO: create history > threshold, compress, verify tail")
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Tool Registry Invariants
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
class TestToolRegistryInvariants:
    """
    Invariants for tools.registry.Registry.

    NOTE(review): the module spec lists the class as ToolRegistry, not
    Registry — confirm the exact name against tools/registry.py.
    """

    def test_register_then_list_contains_tool(self):
        """
        After register() is called with a valid schema and handler,
        list_tools() must include the registered name.
        """
        pytest.skip("TODO: instantiate fresh Registry, register, assert membership")

    def test_dispatch_unknown_tool_returns_error_json(self):
        """
        Calling dispatch() with an unregistered tool name must return
        a JSON string containing an error key, never raise raw.
        """
        pytest.skip("TODO: call dispatch with 'nonexistent_tool', parse result")

    def test_handler_receives_task_id_kwarg(self):
        """
        Registered handlers that accept **kwargs must receive task_id
        when dispatch is called with one.
        """
        pytest.skip("TODO: register mock handler, dispatch with task_id, verify")
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# State Persistence Invariants
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
class TestStatePersistenceInvariants:
    """
    Invariants for hermes_state.SessionDB (SQLite-backed session storage
    with FTS5 search).
    """

    def test_saved_message_is_retrievable_by_session_id(self):
        """
        After save_message(session_id, ...), get_messages(session_id)
        must return the message.
        """
        pytest.skip("TODO: use temp SQLite DB, save, query, assert")

    def test_fts_search_returns_relevant_messages(self):
        """
        After indexing messages, FTS search for a unique keyword
        must return the message containing it.
        """
        pytest.skip("TODO: seed DB with messages, search unique token")
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Context Compressor Invariants
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
class TestContextCompressorInvariants:
    """
    Invariants for agent.context_compressor.ContextCompressor.
    """

    def test_compression_reduces_token_count(self):
        """
        compress_messages(output) must have fewer tokens than
        the uncompressed input (for any input > threshold).
        """
        pytest.skip("TODO: mock tokenizer, provide long history, assert reduction")

    def test_compression_never_drops_system_message(self):
        """
        The system message must survive compression and remain
        at index 0 of the returned message list.
        """
        pytest.skip("TODO: compress history with system msg, verify position")
|
||||
252
reports/ezra-quarterly-report-april-2026.md
Normal file
252
reports/ezra-quarterly-report-april-2026.md
Normal file
@@ -0,0 +1,252 @@
|
||||
# Ezra — Quarterly Technical & Strategic Report
|
||||
**April 2026**
|
||||
|
||||
---
|
||||
|
||||
## Executive Summary
|
||||
|
||||
This report consolidates the principal technical and strategic outputs from Q1/Q2 2026. Three major workstreams are covered:
|
||||
|
||||
1. **Security & Performance Hardening** — Shipped V-011 obfuscation detection and context-compressor tuning.
|
||||
2. **System Formalization Audit** — Identified ~6,300 lines of homegrown infrastructure that can be replaced by well-maintained open-source projects.
|
||||
3. **Business Development** — Formalized a pure-contracting go-to-market plan ("Operation Get A Job") to monetize the engineering collective.
|
||||
|
||||
---
|
||||
|
||||
## 1. Recent Deliverables
|
||||
|
||||
### 1.1 V-011 Obfuscation Bypass Detection
|
||||
|
||||
A significant security enhancement was shipped to the skills-guard subsystem to defeat obfuscated malicious skill code.
|
||||
|
||||
**Technical additions:**
|
||||
- `normalize_input()` with NFKC normalization, case folding, and zero-width character removal to defeat homoglyph and ZWSP evasion.
|
||||
- `PythonSecurityAnalyzer` AST visitor detecting `eval`/`exec`/`compile`, `getattr` dunder access, and imports of `base64`/`codecs`/`marshal`/`types`/`ctypes`.
|
||||
- Additional regex patterns for `getattr` builtins chains, `__import__` os/subprocess, and nested base64 decoding.
|
||||
- Full integration into `scan_file()`; Python files now receive both normalized regex scanning and AST-based analysis.
|
||||
|
||||
**Verification:** All tests passing (`103 passed, 4 warnings`).
|
||||
|
||||
**Reference:** Forge PR #131 — `[EPIC-999/Phase II] The Forge — V-011 obfuscation fix + compressor tuning`
|
||||
|
||||
### 1.2 Context Compressor Tuning
|
||||
|
||||
The default `protect_last_n` parameter was reduced from `20` to `5`. The previous default was overly conservative, preventing meaningful compression on long sessions. The new default preserves the five most recent conversational turns while allowing the compressor to effectively reduce token pressure.
|
||||
|
||||
A regression test was added verifying that the last five turns are never summarized away.
|
||||
|
||||
### 1.3 Burn Mode Resilience
|
||||
|
||||
The agent loop was enhanced with a configurable `burn_mode` flag that increases concurrent tool execution capacity and adds transient-failure retry logic.
|
||||
|
||||
**Changes:**
|
||||
- `max_tool_workers` increased from `8` to `16` in burn mode.
|
||||
- Expanded parallel tool coverage to include browser, vision, skill, and session-search tools.
|
||||
- Added batch timeout protection (300s in burn mode / 180s normal) to prevent hung threads from blocking the agent loop.
|
||||
- Thread-pool shutdown now uses `executor.shutdown(wait=False)` for immediate control return.
|
||||
- Transient errors (timeouts, rate limits, 502/503/504) trigger one automatic retry in burn mode.
|
||||
|
||||
---
|
||||
|
||||
## 2. System Formalization Audit
|
||||
|
||||
A comprehensive audit was performed across the `hermes-agent` codebase to identify homegrown modules that could be replaced by mature open-source alternatives. The objective is efficiency: reduce maintenance burden, leverage community expertise, and improve reliability.
|
||||
|
||||
### 2.1 Candidate Matrix
|
||||
|
||||
| Priority | Component | Lines | Current State | Proposed Replacement | Effort | ROI |
|
||||
|:--------:|-----------|------:|---------------|----------------------|:------:|:---:|
|
||||
| **P0** | MCP Client | 2,176 | Custom asyncio transport, sampling, schema translation | `mcp` (official Python SDK) | 2-3 wks | Very High |
|
||||
| **P0** | Cron Scheduler | ~1,500 | Custom JSON job store, manual tick loop | `APScheduler` | 1-2 wks | Very High |
|
||||
| **P0** | Config Management | 2,589 | Manual YAML loader, no type safety | `pydantic-settings` + Pydantic v2 | 3-4 wks | High |
|
||||
| **P1** | Checkpoint Manager | 548 | Shells out to `git` binary | `dulwich` (pure-Python git) | 1 wk | Medium-High |
|
||||
| **P1** | Auth / Credential Pool | ~3,800 | Custom JWT decode, OAuth refresh, JSON auth store | `authlib` + `keyring` + `PyJWT` | 2-3 wks | Medium |
|
||||
| **P1** | Batch Runner | 1,285 | Custom `multiprocessing.Pool` wrapper | `joblib` (local) or `celery` (distributed) | 1-2 wks | Medium |
|
||||
| **P2** | SQLite Session Store | ~2,400 | Raw SQLite + FTS5, manual schema | SQLAlchemy ORM + Alembic | 2-3 wks | Medium |
|
||||
| **P2** | Trajectory Compressor | 1,518 | Custom tokenizer + summarization pipeline | Keep core logic; add `zstandard` for binary storage | 3 days | Low-Medium |
|
||||
| **P2** | Process Registry | 889 | Custom background process tracking | Keep (adds too much ops complexity) | — | Low |
|
||||
| **P2** | Web Tools | 2,080+ | Firecrawl + Parallel wrappers | Keep (Firecrawl is already best-in-class) | — | Low |
|
||||
|
||||
### 2.2 P0 Replacements
|
||||
|
||||
#### MCP Client → Official `mcp` Python SDK
|
||||
|
||||
**Current:** `tools/mcp_tool.py` (2,176 lines) contains custom stdio/HTTP transport lifecycle, manual `anyio` cancel-scope cleanup, hand-rolled schema translation, custom sampling bridge, credential stripping, and reconnection backoff.
|
||||
|
||||
**Problem:** The Model Context Protocol is evolving rapidly. Maintaining a custom 2K-line client means every protocol revision requires manual patches. The official SDK already handles transport negotiation, lifecycle management, and type-safe schema generation.
|
||||
|
||||
**Migration Plan:**
|
||||
1. Add `mcp>=1.0.0` to dependencies.
|
||||
2. Build a thin `HermesMCPBridge` class that instantiates `mcp.ClientSession`, maps MCP `Tool` schemas to Hermes registry calls, forwards tool invocations, and preserves the sampling callback.
|
||||
3. Deprecate the `_mcp_loop` background thread and `anyio`-based transport code.
|
||||
4. Add integration tests against a test MCP server.
|
||||
|
||||
**Lines Saved:** ~1,600
|
||||
**Risk:** Medium — sampling and timeout behavior need parity testing.
|
||||
|
||||
#### Cron Scheduler → APScheduler
|
||||
|
||||
**Current:** `cron/jobs.py` (753 lines) + `cron/scheduler.py` (~740 lines) use a JSON file as the job store, custom `parse_duration` and `compute_next_run` logic, a manual tick loop, and ad-hoc delivery orchestration.
|
||||
|
||||
**Problem:** Scheduling is a solved problem. The homegrown system lacks timezone support, job concurrency controls, graceful clustering, and durable execution guarantees.
|
||||
|
||||
**Migration Plan:**
|
||||
1. Introduce `APScheduler` with a `SQLAlchemyJobStore` (or custom JSON store).
|
||||
2. Refactor each Hermes cron job into an APScheduler `Job` function.
|
||||
3. Preserve existing delivery logic (`_deliver_result`, `_build_job_prompt`, `_run_job_script`) as the job body.
|
||||
4. Migrate `jobs.json` entries into APScheduler jobs on first run.
|
||||
5. Expose `/cron` status via a thin CLI wrapper.
|
||||
|
||||
**Lines Saved:** ~700
|
||||
**Risk:** Low — delivery logic is preserved; only the trigger mechanism changes.
|
||||
|
||||
#### Config Management → `pydantic-settings`
|
||||
|
||||
**Current:** `hermes_cli/config.py` (2,589 lines) uses manual YAML parsing with hardcoded defaults, a complex migration chain (`_config_version` currently at 11), no runtime type validation, and stringly-typed env var resolution.
|
||||
|
||||
**Problem:** Every new config option requires touching multiple places. Migration logic is ~400 lines and growing. Typo'd config values are only caught at runtime, often deep in the agent loop.
|
||||
|
||||
**Migration Plan:**
|
||||
1. Define a `HermesConfig` Pydantic model with nested sections (`ModelConfig`, `ProviderConfig`, `AgentConfig`, `CompressionConfig`, etc.).
|
||||
2. Use `pydantic-settings`'s `SettingsConfigDict(yaml_file="~/.hermes/config.yaml")` to auto-load.
|
||||
3. Map env vars via `env_prefix="HERMES_"` or field-level `validation_alias`.
|
||||
4. Keep the migration layer as a one-time upgrade function, then remove it after two releases.
|
||||
5. Replace `load_config()` call sites with `HermesConfig()` instantiation.
|
||||
|
||||
**Lines Saved:** ~1,500
|
||||
**Risk:** Medium-High — large blast radius; every module reads config. Requires backward compatibility.
|
||||
|
||||
### 2.3 P1 Replacements
|
||||
|
||||
**Checkpoint Manager → `dulwich`**
|
||||
- Replace `subprocess.run(["git", ...])` calls with `dulwich.porcelain` equivalents.
|
||||
- Use `dulwich.repo.Repo.init_bare()` for shadow repos.
|
||||
- Snapshotting becomes an in-memory `Index` write + `commit()`.
|
||||
- **Lines Saved:** ~200
|
||||
- **Risk:** Low
|
||||
|
||||
**Auth / Credential Pool → `authlib` + `keyring` + `PyJWT`**
|
||||
- Use `authlib` for OAuth2 session and token refresh.
|
||||
- Replace custom JWT decoding with `PyJWT`.
|
||||
- Migrate the auth store JSON to `keyring`-backed secure storage where available.
|
||||
- Keep Hermes-specific credential pool strategies (round-robin, least-used, etc.).
|
||||
- **Lines Saved:** ~800
|
||||
- **Risk:** Medium
|
||||
|
||||
**Batch Runner → `joblib`**
|
||||
- For typical local batch sizes, `joblib.Parallel(n_jobs=-1, backend='loky')` replaces the custom worker pool.
|
||||
- Only migrate to Celery if cross-machine distribution is required.
|
||||
- **Lines Saved:** ~400
|
||||
- **Risk:** Low for `joblib`
|
||||
|
||||
### 2.4 Execution Roadmap
|
||||
|
||||
1. **Week 1-2:** Migrate Checkpoint Manager to `dulwich` (quick win, low risk)
|
||||
2. **Week 3-4:** Migrate Cron Scheduler to `APScheduler` (high value, well-contained)
|
||||
3. **Week 5-8:** Migrate MCP Client to official `mcp` SDK (highest complexity, highest payoff)
|
||||
4. **Week 9-12:** Migrate Config Management to `pydantic-settings` (largest blast radius, do last)
|
||||
5. **Ongoing:** Evaluate Auth/Credential Pool and Batch Runner replacements as follow-up epics.
|
||||
|
||||
### 2.5 Cost-Benefit Summary
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| Total homebrew lines audited | ~17,000 |
|
||||
| Lines recommended for replacement | ~6,300 |
|
||||
| Estimated dev weeks (P0 + P1) | 10-14 weeks |
|
||||
| New runtime dependencies added | 4-6 well-maintained packages |
|
||||
| Maintenance burden reduction | Very High |
|
||||
| Risk level | Medium (mitigated by strong test coverage) |
|
||||
|
||||
---
|
||||
|
||||
## 3. Strategic Initiative: Operation Get A Job
|
||||
|
||||
### 3.1 Thesis
|
||||
|
||||
The engineering collective is capable of 10x delivery velocity compared to typical market offerings. The strategic opportunity is to monetize this capability through pure contracting — high-tempo, fixed-scope engagements with no exclusivity or employer-like constraints.
|
||||
|
||||
### 3.2 Service Menu
|
||||
|
||||
**Tier A — White-Glove Agent Infrastructure ($400-600/hr)**
|
||||
- Custom AI agent deployment with tool use (Slack, Discord, Telegram, webhooks)
|
||||
- MCP server development
|
||||
- Local LLM stack setup (on-premise / VPC)
|
||||
- Agent security audit and red teaming
|
||||
|
||||
**Tier B — Security Hardening & Code Review ($250-400/hr)**
|
||||
- Security backlog burn-down (CVE-class bugs)
|
||||
- Skills-guard / sandbox hardening
|
||||
- Architecture review
|
||||
|
||||
**Tier C — Automation & Integration ($150-250/hr)**
|
||||
- Webhook-to-action pipelines
|
||||
- Research and intelligence reporting
|
||||
- Content-to-code workflows
|
||||
|
||||
### 3.3 Engagement Packages
|
||||
|
||||
| Service | Description | Timeline | Investment |
|
||||
|---------|-------------|----------|------------|
|
||||
| Agent Security Audit | Review of one AI agent pipeline + written findings | 2-3 business days | $4,500 |
|
||||
| MCP Server Build | One custom MCP server with 3-5 tools + docs + tests | 1-2 weeks | $8,000 |
|
||||
| Custom Bot Deployment | End-to-end bot with up to 5 tools, deployed to client platform | 2-3 weeks | $12,000 |
|
||||
| Security Sprint | Close top 5 security issues in a Python/JS repo | 1-2 weeks | $6,500 |
|
||||
| Monthly Retainer — Core | 20 hrs/month prioritized engineering + triage | Ongoing | $6,000/mo |
|
||||
| Monthly Retainer — Scale | 40 hrs/month prioritized engineering + on-call | Ongoing | $11,000/mo |
|
||||
|
||||
### 3.4 Go-to-Market Motion
|
||||
|
||||
**Immediate channels:**
|
||||
- Cold outbound to CTOs/VPEs at Series A-C AI startups
|
||||
- LinkedIn authority content (architecture reviews, security bulletins)
|
||||
- Platform presence (Gun.io, Toptal, Upwork for specific niche keywords)
|
||||
|
||||
**Lead magnet:** Free 15-minute architecture review. No pitch. One concrete risk identified.
|
||||
|
||||
### 3.5 Infrastructure Foundation
|
||||
|
||||
The Hermes Agent framework serves as both the delivery platform and the portfolio piece:
|
||||
- Open-source runtime with ~3,000 tests
|
||||
- Gateway architecture supporting 8+ messaging platforms
|
||||
- Native MCP client, cron scheduling, subagent delegation
|
||||
- Self-hosted Forge (Gitea) with CI and automated PR review
|
||||
- Local Gemma 4 inference stack on bare metal
|
||||
|
||||
### 3.6 90-Day Revenue Model
|
||||
|
||||
| Month | Target |
|
||||
|-------|--------|
|
||||
| Month 1 | $9-12K (1x retainer or 2x audits) |
|
||||
| Month 2 | $17K (+ 1x MCP build) |
|
||||
| Month 3 | $29K (+ 1x bot deployment + new retainer) |
|
||||
|
||||
### 3.7 Immediate Action Items
|
||||
|
||||
- File Wyoming LLC and obtain EIN
|
||||
- Open Mercury business bank account
|
||||
- Secure E&O insurance
|
||||
- Update LinkedIn profile and publish first authority post
|
||||
- Customize capabilities deck and begin warm outbound
|
||||
|
||||
---
|
||||
|
||||
## 4. Fleet Status Summary
|
||||
|
||||
| House | Host | Model / Provider | Gateway Status |
|
||||
|-------|------|------------------|----------------|
|
||||
| Ezra | Hermes VPS | `kimi-for-coding` (Kimi K2.5) | API `8658`, webhook `8648` — Active |
|
||||
| Bezalel | Hermes VPS | Claude Opus 4.6 (Anthropic) | Port `8645` — Active |
|
||||
| Allegro-Primus | Hermes VPS | Kimi K2.5 | Port `8644` — Requires restart |
|
||||
| Bilbo | External | Gemma 4B (local) | Telegram dual-mode — Active |
|
||||
|
||||
**Network:** Hermes VPS public IP `143.198.27.163` (Ubuntu 24.04.3 LTS). Local Gemma 4 fallback on `127.0.0.1:11435`.
|
||||
|
||||
---
|
||||
|
||||
## 5. Conclusion
|
||||
|
||||
The codebase is in a strong position: security is hardened, the agent loop is more resilient, and a clear roadmap exists to replace high-maintenance homegrown infrastructure with battle-tested open-source projects. The commercialization strategy is formalized and ready for execution. The next critical path is the human-facing work of entity formation, sales outreach, and closing the first fixed-scope engagement.
|
||||
|
||||
Prepared by **Ezra**
|
||||
April 2026
|
||||
BIN
reports/ezra-quarterly-report-april-2026.pdf
Normal file
BIN
reports/ezra-quarterly-report-april-2026.pdf
Normal file
Binary file not shown.
@@ -1145,7 +1145,7 @@ class AIAgent:
|
||||
compression_enabled = str(_compression_cfg.get("enabled", True)).lower() in ("true", "1", "yes")
|
||||
compression_summary_model = _compression_cfg.get("summary_model") or None
|
||||
compression_target_ratio = float(_compression_cfg.get("target_ratio", 0.20))
|
||||
compression_protect_last = int(_compression_cfg.get("protect_last_n", 20))
|
||||
compression_protect_last = int(_compression_cfg.get("protect_last_n", 5))
|
||||
|
||||
# Read explicit context_length override from model config
|
||||
_model_cfg = _agent_cfg.get("model", {})
|
||||
|
||||
191
scripts/forge.py
Normal file
191
scripts/forge.py
Normal file
@@ -0,0 +1,191 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
forge.py — The Forge: competing sub-agent rewrite pipeline.
|
||||
Part of EPIC-999 Phase II.
|
||||
|
||||
Given a target module, spawn N sub-agents to rewrite it independently.
|
||||
An Arbiter evaluates each candidate on:
|
||||
1. Test pass rate
|
||||
2. SLOC reduction (or bounded increase)
|
||||
3. Cyclomatic complexity
|
||||
4. API surface stability (diff against original public interface)
|
||||
|
||||
The winner is promoted to the integration branch.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
|
||||
|
||||
@dataclass
class RewriteCandidate:
    """One competing rewrite produced by a sub-agent.

    Metric fields start at neutral defaults and are filled in by
    ForgePipeline.evaluate_candidate(); total_score is the arbiter's
    weighted aggregate used to pick the winner.
    """
    agent_name: str    # name of the sub-agent that produced this rewrite
    branch_name: str   # working branch: forge-<agent>-<timestamp>-<index>
    module_path: Path  # rewritten module inside temp_dir / "repo"
    temp_dir: Path     # candidate's isolated workspace directory
    test_pass_rate: float = 0.0  # fraction of tests passing, 0.0-1.0
    sloc_delta: int = 0  # NOTE(review): currently holds the module's raw line count, not a delta — confirm intent
    complexity_score: float = 0.0  # reserved; not yet computed by evaluate_candidate
    api_surface_score: float = 0.0  # reserved; not yet computed by evaluate_candidate
    total_score: float = 0.0  # arbiter's aggregate score (see evaluate_candidate)
    logs: List[str] = field(default_factory=list)  # free-form progress / error log
|
||||
|
||||
|
||||
class ForgePipeline:
    """Competing rewrite pipeline for clean-room module evolution.

    Spawns one isolated workspace per agent, evaluates each candidate
    rewrite (test pass rate plus a size penalty), and promotes the
    highest-scoring candidate to an integration branch.
    """

    def __init__(self, repo_path: Path, target_module: str, agents: List[str]):
        self.repo_path = repo_path.resolve()
        self.target_module = target_module
        self.agents = agents
        # Shared scratch area; one sub-directory per candidate branch.
        self.work_dir = Path(tempfile.mkdtemp(prefix="forge_"))
        self.candidates: List[RewriteCandidate] = []

    def _spawn_agent_rewrite(self, agent_name: str, index: int) -> RewriteCandidate:
        """Spawn a single sub-agent rewrite in its own workspace copy."""
        branch_name = f"forge-{agent_name}-{int(time.time())}-{index}"
        candidate_dir = self.work_dir / branch_name
        candidate_dir.mkdir(parents=True, exist_ok=True)

        # Copy repo into candidate workspace.
        # NOTE(review): `cp -r` is POSIX-only; shutil.copytree would be portable.
        subprocess.run(
            ["cp", "-r", str(self.repo_path), str(candidate_dir / "repo")],
            check=True,
            capture_output=True,
        )

        candidate = RewriteCandidate(
            agent_name=agent_name,
            branch_name=branch_name,
            module_path=candidate_dir / "repo" / self.target_module,
            temp_dir=candidate_dir,
        )

        # TODO: replace with actual sub-agent invocation via delegate_tool.py
        # For now, we write a marker file so the pipeline structure is testable.
        marker = candidate_dir / "rewrite.marker"
        marker.write_text(
            f"agent={agent_name}\n"
            f"target={self.target_module}\n"
            f"timestamp={time.time()}\n"
        )
        candidate.logs.append(f"Spawned {agent_name} in {branch_name}")
        return candidate

    def run_rewrites(self) -> List[RewriteCandidate]:
        """Run all competing rewrites and collect the candidates."""
        print(f"[Forge] Starting {len(self.agents)} competing rewrites for {self.target_module}")
        for idx, agent in enumerate(self.agents):
            candidate = self._spawn_agent_rewrite(agent, idx)
            self.candidates.append(candidate)
            print(f" -> {candidate.branch_name}")
        return self.candidates

    @staticmethod
    def _parse_pass_rate(stdout: str, returncode: int) -> float:
        """Parse a pytest summary line into a pass rate in [0.0, 1.0].

        Counts ``N passed`` against ``N failed`` / ``N error(s)`` segments
        of the comma-separated pytest summary. Falls back to the process
        exit status when no ``passed`` token is present.

        BUGFIX: the previous inline logic seeded ``total`` at 1 and then
        clamped it with ``max(total, passed)``, which erased the failure
        count — "3 passed, 2 failed" was scored as a 100% pass rate.
        """
        if "passed" in stdout:
            passed = 0
            failed = 0
            for part in stdout.split(","):
                tokens = part.strip().split()
                if not tokens:
                    continue
                if "passed" in part:
                    passed = int(tokens[0])
                if "failed" in part or "error" in part:
                    failed += int(tokens[0])
            total = passed + failed
            return passed / total if total else 0.0
        return 1.0 if returncode == 0 else 0.0

    def evaluate_candidate(self, candidate: RewriteCandidate) -> RewriteCandidate:
        """Run the test suite and metrics on a candidate, filling in its scores."""
        repo = candidate.temp_dir / "repo"

        # SLOC calculation
        # NOTE(review): this stores the module's absolute line count, not a
        # delta against the original — confirm before treating it as a delta.
        try:
            with open(candidate.module_path, "r", encoding="utf-8") as f:
                candidate.sloc_delta = len(f.readlines())
        except Exception as e:
            candidate.logs.append(f"SLOC error: {e}")

        # Test execution (best-effort; requires venv + deps)
        test_cmd = [
            sys.executable, "-m", "pytest",
            "tests/", "-q", "--tb=short",
            "-x",
        ]
        try:
            result = subprocess.run(
                test_cmd,
                cwd=repo,
                capture_output=True,
                text=True,
                timeout=300,
            )
            candidate.test_pass_rate = self._parse_pass_rate(result.stdout, result.returncode)
            candidate.logs.append(f"Tests: returncode={result.returncode}")
        except Exception as e:
            candidate.logs.append(f"Test error: {e}")
            candidate.test_pass_rate = 0.0

        # Scoring (naive weighted sum; will be refined)
        candidate.total_score = (
            candidate.test_pass_rate * 100.0
            - max(candidate.sloc_delta - 500, 0) * 0.01  # penalty for bloat
        )
        return candidate

    def arbitrate(self) -> RewriteCandidate:
        """Evaluate all candidates and return the winner."""
        print("[Forge] Evaluating candidates...")
        for candidate in self.candidates:
            self.evaluate_candidate(candidate)
            print(f"  {candidate.agent_name}: tests={candidate.test_pass_rate:.2%} "
                  f"sloc={candidate.sloc_delta} score={candidate.total_score:.2f}")

        winner = max(self.candidates, key=lambda c: c.total_score)
        print(f"[Forge] Winner: {winner.agent_name} ({winner.branch_name}) "
              f"score={winner.total_score:.2f}")
        return winner

    def promote_winner(self, winner: RewriteCandidate, integration_branch: str):
        """Promote the winning candidate to the integration branch."""
        # TODO: git checkout -b integration_branch, copy winner module, commit, push
        print(f"[Forge] Promoting {winner.branch_name} -> {integration_branch}")
        marker = self.repo_path / "FORGE_WINNER.marker"
        marker.write_text(
            f"winner={winner.agent_name}\n"
            f"branch={winner.branch_name}\n"
            f"score={winner.total_score}\n"
        )
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: run competing rewrites, arbitrate, promote."""
    parser = argparse.ArgumentParser(description="The Forge — competing rewrite pipeline")
    parser.add_argument("--repo", required=True, help="Path to the target repo")
    parser.add_argument("--module", required=True, help="Target module path (relative to repo)")
    parser.add_argument("--agents", nargs="+", default=["allegro", "bezalel"],
                        help="Agent names to compete")
    parser.add_argument("--integration-branch", default="forge-integration",
                        help="Branch to promote winner into")
    args = parser.parse_args()

    # Build the pipeline, run the competition, then promote the best rewrite.
    pipeline = ForgePipeline(Path(args.repo), args.module, args.agents)
    pipeline.run_rewrites()
    best = pipeline.arbitrate()
    pipeline.promote_winner(best, args.integration_branch)


if __name__ == "__main__":
    main()
|
||||
@@ -557,8 +557,41 @@ class TestSummaryTargetRatio:
|
||||
assert c.threshold_percent == 0.50
|
||||
assert c.threshold_tokens == 50_000
|
||||
|
||||
def test_default_protect_last_n_is_20(self):
|
||||
"""Default protect_last_n should be 20."""
|
||||
def test_default_protect_last_n_is_5(self):
|
||||
"""Default protect_last_n should be 5 (Last 5 Turns protection)."""
|
||||
with patch("agent.context_compressor.get_model_context_length", return_value=100_000):
|
||||
c = ContextCompressor(model="test", quiet_mode=True)
|
||||
assert c.protect_last_n == 20
|
||||
assert c.protect_last_n == 5
|
||||
|
||||
def test_last_5_turns_protected_from_compression(self):
    """The most recent 5 messages must never be summarized away.

    Regression test for the protect_last_n=5 default: compress an
    over-threshold history and assert the raw tail survives verbatim.
    """
    with patch("agent.context_compressor.get_model_context_length", return_value=10_000):
        c = ContextCompressor(
            model="test",
            quiet_mode=True,
            protect_first_n=2,       # protect the first two messages (presumably system + opener — confirm)
            protect_last_n=5,        # the invariant under test
            threshold_percent=0.50,  # 5_000-token trigger on a 10_000-token context
        )
        # Build messages: system + 11 user/assistant exchanges
        messages = [{"role": "system", "content": "sys"}]
        for i in range(11):
            messages.append({"role": "user", "content": f"user {i}"})
            messages.append({"role": "assistant", "content": f"assistant {i}"})

        # Above the 50% threshold, so compression should fire.
        c.last_prompt_tokens = 6_000

        # Stub out the LLM summarizer so the test is deterministic and offline.
        with patch.object(c, "_generate_summary", return_value="[SUMMARY]"):
            result = c.compress(messages, current_tokens=6_000)

        # The tail should preserve the last 5 raw messages from the original list.
        # Original last 5 messages: assistant 8, user 9, assistant 9, user 10, assistant 10
        # The summary may be merged into the first tail message to avoid role collision.
        tail_roles = [m.get("role") for m in result[-5:]]
        tail_contents = [m.get("content", "") for m in result[-5:]]
        assert tail_roles == ["assistant", "user", "assistant", "user", "assistant"]
        assert tail_contents[-4:] == [
            "user 9", "assistant 9", "user 10", "assistant 10"
        ]
        # First tail message has the original content preserved (possibly merged with summary)
        assert "assistant 8" in tail_contents[0]
|
||||
|
||||
@@ -514,3 +514,79 @@ class TestSymlinkPrefixConfusionRegression:
|
||||
new_escapes = not resolved.is_relative_to(skill_dir_resolved)
|
||||
assert old_escapes is False
|
||||
assert new_escapes is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# V-011 Obfuscation Bypass Tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
from tools.skills_guard_v011 import normalize_input, analyze_python_ast
|
||||
|
||||
|
||||
class TestNormalizeInput:
    """normalize_input must defeat homoglyph, case, and zero-width evasion."""

    def test_nfkc_homoglyphs(self):
        # BUGFIX: the previous assertion compared plain ASCII "eval" to
        # itself, which tested nothing (the obfuscated characters were
        # evidently lost). Use NFKC-compatibility characters — fullwidth
        # Latin "ｅｖａｌ" — that genuinely require normalization.
        assert normalize_input("\uff45\uff56\uff41\uff4c") == "eval"

    def test_case_folding(self):
        assert normalize_input("EVaL") == "eval"

    def test_zwsp_removal(self):
        assert normalize_input("ev\u200bal") == "eval"
        assert normalize_input("ex\u200ce\u200dc") == "exec"
        assert normalize_input("get\ufeffattr") == "getattr"

    def test_combined_obfuscation(self):
        assert normalize_input("E\u200bVaL") == "eval"
|
||||
|
||||
|
||||
class TestPythonSecurityAnalyzer:
    """AST analysis must flag dynamic-execution primitives and stay quiet on safe code."""

    @staticmethod
    def _pattern_ids(code):
        # Collect the set of pattern IDs reported for a code snippet.
        return {f.pattern_id for f in analyze_python_ast(code, "test.py")}

    def test_detects_eval_call(self):
        assert "dynamic_eval_exec" in self._pattern_ids("eval('1+1')")

    def test_detects_exec_call(self):
        assert "dynamic_eval_exec" in self._pattern_ids("exec('print(1)')")

    def test_detects_compile_call(self):
        assert "dynamic_compile" in self._pattern_ids("compile('pass', '<string>', 'exec')")

    def test_detects_getattr_dunder(self):
        assert "getattr_dunder" in self._pattern_ids('getattr(os, "__import__")')

    def test_detects_import_base64(self):
        assert "import_base64" in self._pattern_ids("import base64")

    def test_no_false_positives_on_safe_code(self):
        # An empty ID set is equivalent to zero findings.
        assert self._pattern_ids("print('hello world')") == set()
|
||||
|
||||
|
||||
class TestV011Integration:
    """End-to-end: scan_file must flag obfuscated payloads written to disk."""

    @staticmethod
    def _scan_source(tmp_path, source):
        # Write the payload to a .py file and run the full scanner on it.
        target = tmp_path / "bad.py"
        target.write_text(source)
        return scan_file(target, "bad.py")

    def test_scan_file_catches_obfuscated_eval(self, tmp_path):
        findings = self._scan_source(tmp_path, "e\u200bVal('1+1')")
        assert any("eval" in f.description.lower() for f in findings)

    def test_scan_file_catches_dynamic_exec(self, tmp_path):
        findings = self._scan_source(tmp_path, "exec('import os')")
        assert any(f.pattern_id == "dynamic_eval_exec" for f in findings)

    def test_scan_file_catches_obfuscated_import(self, tmp_path):
        findings = self._scan_source(tmp_path, "__import__('os')")
        # Should be caught by regex after normalization
        assert any("__import__" in f.description.lower() for f in findings)
|
||||
|
||||
@@ -29,6 +29,12 @@ from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
|
||||
from tools.skills_guard_v011 import (
|
||||
normalize_input,
|
||||
analyze_python_ast,
|
||||
V011_OBFUSCATION_PATTERNS,
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -481,7 +487,7 @@ THREAT_PATTERNS = [
|
||||
(r'(send|post|upload|transmit)\s+.*\s+(to|at)\s+https?://',
|
||||
"send_to_url", "high", "exfiltration",
|
||||
"instructs agent to send data to a URL"),
|
||||
]
|
||||
] + V011_OBFUSCATION_PATTERNS
|
||||
|
||||
# Structural limits for skill directories
|
||||
MAX_FILE_COUNT = 50 # skills shouldn't have 50+ files
|
||||
@@ -553,9 +559,12 @@ def scan_file(file_path: Path, rel_path: str = "") -> List[Finding]:
|
||||
lines = content.split('\n')
|
||||
seen = set() # (pattern_id, line_number) for deduplication
|
||||
|
||||
# Regex pattern matching
|
||||
# V-011: Normalize each line before pattern matching
|
||||
normalized_lines = [normalize_input(line) for line in lines]
|
||||
|
||||
# Regex pattern matching (against normalized lines)
|
||||
for pattern, pid, severity, category, description in THREAT_PATTERNS:
|
||||
for i, line in enumerate(lines, start=1):
|
||||
for i, line in enumerate(normalized_lines, start=1):
|
||||
if (pid, i) in seen:
|
||||
continue
|
||||
if re.search(pattern, line, re.IGNORECASE):
|
||||
@@ -589,6 +598,10 @@ def scan_file(file_path: Path, rel_path: str = "") -> List[Finding]:
|
||||
))
|
||||
break # one finding per line for invisible chars
|
||||
|
||||
# V-011: AST-based analysis for Python files
|
||||
if file_path.suffix.lower() == ".py":
|
||||
findings.extend(analyze_python_ast(content, rel_path))
|
||||
|
||||
return findings
|
||||
|
||||
|
||||
|
||||
186
tools/skills_guard_v011.py
Normal file
186
tools/skills_guard_v011.py
Normal file
@@ -0,0 +1,186 @@
|
||||
"""
|
||||
V-011 Skills Guard Bypass fix — Obfuscation detection.
|
||||
|
||||
Adds:
|
||||
- normalize_input() with NFKC + case folding + ZWSP removal
|
||||
- PythonSecurityAnalyzer AST visitor for dynamic execution patterns
|
||||
- Additional obfuscation threat patterns
|
||||
"""
|
||||
|
||||
import ast
|
||||
import re
|
||||
import unicodedata
|
||||
from dataclasses import dataclass
|
||||
from typing import List
|
||||
|
||||
|
||||
@dataclass
class Finding:
    """Minimal Finding dataclass to avoid circular import with skills_guard.py."""
    # NOTE(review): field order should stay aligned with skills_guard's own
    # Finding so positional construction remains compatible — confirm on change.
    pattern_id: str   # stable identifier of the matched threat pattern
    severity: str     # severity label (the analyzer emits "high")
    category: str     # threat category (the analyzer emits "obfuscation")
    file: str         # relative path of the scanned file
    line: int         # source line of the match; 0 when the AST node has no lineno
    match: str        # matched snippet (analyzer truncates to ~120 chars)
    description: str  # human-readable explanation of the finding
|
||||
|
||||
|
||||
# Zero-width / invisible characters commonly used to hide tokens from
# pattern matchers, mapped to None for single-pass removal via translate().
_ZERO_WIDTH_TRANSLATION = dict.fromkeys(map(ord, "\u200b\u200c\u200d\ufeff\u2060\u180e"))


def normalize_input(text: str) -> str:
    """
    Normalize text to defeat common obfuscation bypasses.

    1. Compatibility decomposition (NFKC) — homoglyphs, compat chars
    2. Case folding — uppercase ↔ lowercase evasion
    3. Zero-width space / joiner removal

    Returns the normalized string; behavior is identical to the previous
    chained-.replace() implementation, but str.translate strips all the
    zero-width characters in a single C-level pass.
    """
    text = unicodedata.normalize("NFKC", text)
    text = text.casefold()
    return text.translate(_ZERO_WIDTH_TRANSLATION)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# AST-based Python security analysis
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class PythonSecurityAnalyzer(ast.NodeVisitor):
    """AST visitor that detects obfuscated/dynamic execution in Python code.

    Usage: construct with the scanned file's relative path, call ``visit()``
    on a parsed tree, then read ``findings``.
    """

    # Modules commonly used as encoding/obfuscation helpers. Hoisted to a
    # single class constant so visit_Import and visit_ImportFrom (which
    # previously each carried their own copy) cannot drift apart.
    _OBF_MODULES = frozenset({"base64", "codecs", "marshal", "types", "ctypes"})

    def __init__(self, rel_path: str):
        self.rel_path = rel_path           # reported in every Finding.file
        self.findings: List[Finding] = []  # accumulated results
        self.seen = set()                  # (pattern_id, line, match) dedup keys

    def _add(self, pattern_id: str, severity: str, category: str, line: int, match: str, description: str):
        """Record one finding, de-duplicating on (pattern_id, line, match)."""
        key = (pattern_id, line, match)
        if key in self.seen:
            return
        self.seen.add(key)
        if len(match) > 120:
            match = match[:117] + "..."
        self.findings.append(Finding(
            pattern_id=pattern_id,
            severity=severity,
            category=category,
            file=self.rel_path,
            line=line,
            match=match,
            description=description,
        ))

    def visit_Call(self, node: ast.Call):
        """Flag dynamic-execution calls.

        Covers eval/exec/compile, bare and attribute-style __import__, and
        getattr(...) reaching for dunder attributes.
        """
        if isinstance(node.func, ast.Name):
            if node.func.id in ("eval", "exec"):
                self._add(
                    "dynamic_eval_exec",
                    "high",
                    "obfuscation",
                    getattr(node, "lineno", 0),
                    ast.dump(node)[:120],
                    f"Dynamic {node.func.id}() call detected (possible obfuscation)",
                )
            elif node.func.id == "compile":
                self._add(
                    "dynamic_compile",
                    "high",
                    "obfuscation",
                    getattr(node, "lineno", 0),
                    ast.dump(node)[:120],
                    "compile() call detected (possible code obfuscation)",
                )
            elif node.func.id == "__import__":
                # Fix: a bare __import__("mod") call previously escaped the
                # AST pass entirely — only the attribute form (below) was
                # handled, while the companion regex list explicitly targets
                # __import__. Emits the same pattern_id as the attribute form.
                self._add(
                    "dynamic_import",
                    "high",
                    "obfuscation",
                    getattr(node, "lineno", 0),
                    ast.dump(node)[:120],
                    "Dynamic __import__ invocation detected",
                )
            elif node.func.id == "getattr" and len(node.args) >= 2:
                # getattr(obj, "__dunder__") is a classic sandbox-escape step.
                if isinstance(node.args[1], ast.Constant) and isinstance(node.args[1].value, str):
                    if node.args[1].value.startswith("__") and node.args[1].value.endswith("__"):
                        self._add(
                            "getattr_dunder",
                            "high",
                            "obfuscation",
                            getattr(node, "lineno", 0),
                            f'getattr(..., "{node.args[1].value}")',
                            "getattr used to access dunder attribute (possible sandbox escape)",
                        )
        elif isinstance(node.func, ast.Attribute):
            if node.func.attr in ("__import__", "_import"):
                self._add(
                    "dynamic_import",
                    "high",
                    "obfuscation",
                    getattr(node, "lineno", 0),
                    ast.dump(node)[:120],
                    "Dynamic __import__ invocation detected",
                )
        self.generic_visit(node)

    def visit_Import(self, node: ast.Import):
        """Flag ``import <obfuscation helper>`` statements."""
        for alias in node.names:
            if alias.name in self._OBF_MODULES:
                self._add(
                    f"import_{alias.name}",
                    "medium",
                    "obfuscation",
                    getattr(node, "lineno", 0),
                    f"import {alias.name}",
                    f"{alias.name} import (possible encoding/obfuscation helper)",
                )
        self.generic_visit(node)

    def visit_ImportFrom(self, node: ast.ImportFrom):
        """Flag ``from <obfuscation helper> import ...`` statements.

        ``node.module`` is None for relative imports (``from . import x``);
        None safely fails the membership test.
        """
        if node.module in self._OBF_MODULES:
            self._add(
                f"import_from_{node.module}",
                "medium",
                "obfuscation",
                getattr(node, "lineno", 0),
                f"from {node.module} import ...",
                f"{node.module} import (possible encoding/obfuscation helper)",
            )
        self.generic_visit(node)
|
||||
|
||||
|
||||
def analyze_python_ast(content: str, rel_path: str) -> List[Finding]:
    """Parse *content* as Python source and return obfuscation findings.

    Sources that fail to parse (SyntaxError) yield an empty list instead of
    propagating the error to the scanner.
    """
    try:
        tree = ast.parse(content)
    except SyntaxError:
        return []
    visitor = PythonSecurityAnalyzer(rel_path)
    visitor.visit(tree)
    return visitor.findings
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Additional obfuscation patterns for regex scanning
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Each tuple: (regex, pattern_id, severity, category, description).
# The regexes are written in lowercase — presumably they are applied to
# normalize_input() output (casefolded text); verify against the scanner.
V011_OBFUSCATION_PATTERNS = [
    # getattr/__import__ chains as strings
    (r'getattr\s*\([^)]*__builtins__[^)]*\)',
     "getattr_builtins_chain", "high", "obfuscation",
     "getattr chain targeting __builtins__ (sandbox escape)"),
    (r'__import__\s*\(\s*["\']os["\']',
     "dunder_import_os", "high", "obfuscation",
     "__import__ used to load os module (obfuscation)"),
    (r'__import__\s*\(\s*["\']subprocess["\']',
     "dunder_import_subprocess", "high", "obfuscation",
     "__import__ used to load subprocess module (obfuscation)"),
    # exec/eval with obfuscated wrappers.
    # Fix: the old regexes (r'\beval\s*\(\s*\+') required the "+" to come
    # immediately after the opening paren, so genuine concatenation such as
    # eval("a" + b) was never matched. [^)]* lets the "+" appear anywhere
    # before the first ")" — a strict superset of the old matches.
    (r'\beval\s*\([^)]*\+',
     "eval_plus_concat", "high", "obfuscation",
     "eval with string concatenation (obfuscation)"),
    (r'\bexec\s*\([^)]*\+',
     "exec_plus_concat", "high", "obfuscation",
     "exec with string concatenation (obfuscation)"),
    # Base64/hex dynamic execution
    (r'base64\.(b64decode|decode)\s*\([^)]*\)\s*\)\s*\)',
     "base64_nested_decode", "high", "obfuscation",
     "nested base64 decode followed by execution (obfuscation)"),
]
|
||||
Reference in New Issue
Block a user