feat: secure skill env setup on load (core #688)

When a skill declares required_environment_variables in its YAML frontmatter, missing env vars trigger a secure TUI prompt (identical to the sudo password widget) when the skill is loaded. Secrets flow directly to ~/.hermes/.env, never entering LLM context. Key changes: - New required_environment_variables frontmatter field for skills - Secure TUI widget (masked input, 120s timeout) - Gateway safety: messaging platforms show local setup guidance - Legacy prerequisites.env_vars normalized into new format - Remote backend handling: conservative setup_needed=True - Env var name validation, file permissions hardened to 0o600 - Redact patterns extended for secret-related JSON fields - 12 existing skills updated with prerequisites declarations - ~48 new tests covering skip, timeout, gateway, remote backends - Dynamic panel widget sizing (fixes hardcoded width from original PR) Cherry-picked from PR #723 by kshitijk4poor, rebased onto current main with conflict resolution. Fixes #688 Co-authored-by: kshitijk4poor <kshitijk4poor@users.noreply.github.com>
2026-03-13 03:14:04 -07:00
parent c097e56142
commit ccfbf42844
34 changed files with 2419 additions and 403 deletions
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@@ -9,19 +9,20 @@ import json
 import re
 import uuid
 from types import SimpleNamespace
-from unittest.mock import MagicMock, patch, PropertyMock
+from unittest.mock import MagicMock, patch

 import pytest

 from honcho_integration.client import HonchoClientConfig
 from run_agent import AIAgent
-from agent.prompt_builder import DEFAULT_AGENT_IDENTITY, PLATFORM_HINTS
+from agent.prompt_builder import DEFAULT_AGENT_IDENTITY


 # ---------------------------------------------------------------------------
 # Fixtures
 # ---------------------------------------------------------------------------

+
 def _make_tool_defs(*names: str) -> list:
    """Build minimal tool definition list accepted by AIAgent.__init__."""
    return [
@@ -41,7 +42,9 @@ def _make_tool_defs(*names: str) -> list:
 def agent():
    """Minimal AIAgent with mocked OpenAI client and tool loading."""
    with (
-        patch("run_agent.get_tool_definitions", return_value=_make_tool_defs("web_search")),
+        patch(
+            "run_agent.get_tool_definitions", return_value=_make_tool_defs("web_search")
+        ),
        patch("run_agent.check_toolset_requirements", return_value={}),
        patch("run_agent.OpenAI"),
    ):
@@ -59,7 +62,10 @@ def agent():
 def agent_with_memory_tool():
    """Agent whose valid_tool_names includes 'memory'."""
    with (
-        patch("run_agent.get_tool_definitions", return_value=_make_tool_defs("web_search", "memory")),
+        patch(
+            "run_agent.get_tool_definitions",
+            return_value=_make_tool_defs("web_search", "memory"),
+        ),
        patch("run_agent.check_toolset_requirements", return_value={}),
        patch("run_agent.OpenAI"),
    ):
@@ -77,6 +83,7 @@ def agent_with_memory_tool():
 # Helper to build mock assistant messages (API response objects)
 # ---------------------------------------------------------------------------

+
 def _mock_assistant_msg(
    content="Hello",
    tool_calls=None,
@@ -95,7 +102,7 @@ def _mock_assistant_msg(
    return msg


-def _mock_tool_call(name="web_search", arguments='{}', call_id=None):
+def _mock_tool_call(name="web_search", arguments="{}", call_id=None):
    """Return a SimpleNamespace mimicking a tool call object."""
    return SimpleNamespace(
        id=call_id or f"call_{uuid.uuid4().hex[:8]}",
@@ -104,8 +111,9 @@ def _mock_tool_call(name="web_search", arguments='{}', call_id=None):
    )


-def _mock_response(content="Hello", finish_reason="stop", tool_calls=None,
-                    reasoning=None, usage=None):
+def _mock_response(
+    content="Hello", finish_reason="stop", tool_calls=None, reasoning=None, usage=None
+):
    """Return a SimpleNamespace mimicking an OpenAI ChatCompletion response."""
    msg = _mock_assistant_msg(
        content=content,
@@ -137,7 +145,10 @@ class TestHasContentAfterThinkBlock:
        assert agent._has_content_after_think_block("<think>reasoning</think>") is False

    def test_content_after_think_returns_true(self, agent):
-        assert agent._has_content_after_think_block("<think>r</think> actual answer") is True
+        assert (
+            agent._has_content_after_think_block("<think>r</think> actual answer")
+            is True
+        )

    def test_no_think_block_returns_true(self, agent):
        assert agent._has_content_after_think_block("just normal content") is True
@@ -439,7 +450,11 @@ class TestHydrateTodoStore:
        history = [
            {"role": "user", "content": "plan"},
            {"role": "assistant", "content": "ok"},
-            {"role": "tool", "content": json.dumps({"todos": todos}), "tool_call_id": "c1"},
+            {
+                "role": "tool",
+                "content": json.dumps({"todos": todos}),
+                "tool_call_id": "c1",
+            },
        ]
        with patch("run_agent._set_interrupt"):
            agent._hydrate_todo_store(history)
@@ -447,7 +462,11 @@ class TestHydrateTodoStore:

    def test_skips_non_todo_tools(self, agent):
        history = [
-            {"role": "tool", "content": '{"result": "search done"}', "tool_call_id": "c1"},
+            {
+                "role": "tool",
+                "content": '{"result": "search done"}',
+                "tool_call_id": "c1",
+            },
        ]
        with patch("run_agent._set_interrupt"):
            agent._hydrate_todo_store(history)
@@ -455,7 +474,11 @@ class TestHydrateTodoStore:

    def test_invalid_json_skipped(self, agent):
        history = [
-            {"role": "tool", "content": 'not valid json "todos" oops', "tool_call_id": "c1"},
+            {
+                "role": "tool",
+                "content": 'not valid json "todos" oops',
+                "tool_call_id": "c1",
+            },
        ]
        with patch("run_agent._set_interrupt"):
            agent._hydrate_todo_store(history)
@@ -473,11 +496,13 @@ class TestBuildSystemPrompt:

    def test_memory_guidance_when_memory_tool_loaded(self, agent_with_memory_tool):
        from agent.prompt_builder import MEMORY_GUIDANCE
+
        prompt = agent_with_memory_tool._build_system_prompt()
        assert MEMORY_GUIDANCE in prompt

    def test_no_memory_guidance_without_tool(self, agent):
        from agent.prompt_builder import MEMORY_GUIDANCE
+
        prompt = agent._build_system_prompt()
        assert MEMORY_GUIDANCE not in prompt

@@ -571,7 +596,9 @@ class TestBuildAssistantMessage:
    def test_tool_call_extra_content_preserved(self, agent):
        """Gemini thinking models attach extra_content with thought_signature
        to tool calls. This must be preserved so subsequent API calls include it."""
-        tc = _mock_tool_call(name="get_weather", arguments='{"city":"NYC"}', call_id="c2")
+        tc = _mock_tool_call(
+            name="get_weather", arguments='{"city":"NYC"}', call_id="c2"
+        )
        tc.extra_content = {"google": {"thought_signature": "abc123"}}
        msg = _mock_assistant_msg(content="", tool_calls=[tc])
        result = agent._build_assistant_message(msg, "tool_calls")
@@ -581,7 +608,7 @@ class TestBuildAssistantMessage:

    def test_tool_call_without_extra_content(self, agent):
        """Standard tool calls (no thinking model) should not have extra_content."""
-        tc = _mock_tool_call(name="web_search", arguments='{}', call_id="c3")
+        tc = _mock_tool_call(name="web_search", arguments="{}", call_id="c3")
        msg = _mock_assistant_msg(content="", tool_calls=[tc])
        result = agent._build_assistant_message(msg, "tool_calls")
        assert "extra_content" not in result["tool_calls"][0]
@@ -618,7 +645,9 @@ class TestExecuteToolCalls:
        tc = _mock_tool_call(name="web_search", arguments='{"q":"test"}', call_id="c1")
        mock_msg = _mock_assistant_msg(content="", tool_calls=[tc])
        messages = []
-        with patch("run_agent.handle_function_call", return_value="search result") as mock_hfc:
+        with patch(
+            "run_agent.handle_function_call", return_value="search result"
+        ) as mock_hfc:
            agent._execute_tool_calls(mock_msg, messages, "task-1")
            # enabled_tools passes the agent's own valid_tool_names
            args, kwargs = mock_hfc.call_args
@@ -629,8 +658,8 @@ class TestExecuteToolCalls:
        assert "search result" in messages[0]["content"]

    def test_interrupt_skips_remaining(self, agent):
-        tc1 = _mock_tool_call(name="web_search", arguments='{}', call_id="c1")
-        tc2 = _mock_tool_call(name="web_search", arguments='{}', call_id="c2")
+        tc1 = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
+        tc2 = _mock_tool_call(name="web_search", arguments="{}", call_id="c2")
        mock_msg = _mock_assistant_msg(content="", tool_calls=[tc1, tc2])
        messages = []

@@ -640,10 +669,15 @@ class TestExecuteToolCalls:
        agent._execute_tool_calls(mock_msg, messages, "task-1")
        # Both calls should be skipped with cancellation messages
        assert len(messages) == 2
-        assert "cancelled" in messages[0]["content"].lower() or "interrupted" in messages[0]["content"].lower()
+        assert (
+            "cancelled" in messages[0]["content"].lower()
+            or "interrupted" in messages[0]["content"].lower()
+        )

    def test_invalid_json_args_defaults_empty(self, agent):
-        tc = _mock_tool_call(name="web_search", arguments="not valid json", call_id="c1")
+        tc = _mock_tool_call(
+            name="web_search", arguments="not valid json", call_id="c1"
+        )
        mock_msg = _mock_assistant_msg(content="", tool_calls=[tc])
        messages = []
        with patch("run_agent.handle_function_call", return_value="ok") as mock_hfc:
@@ -657,7 +691,7 @@ class TestExecuteToolCalls:
        assert messages[0]["tool_call_id"] == "c1"

    def test_result_truncation_over_100k(self, agent):
-        tc = _mock_tool_call(name="web_search", arguments='{}', call_id="c1")
+        tc = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
        mock_msg = _mock_assistant_msg(content="", tool_calls=[tc])
        messages = []
        big_result = "x" * 150_000
@@ -719,7 +753,7 @@ class TestRunConversation:

    def test_tool_calls_then_stop(self, agent):
        self._setup_agent(agent)
-        tc = _mock_tool_call(name="web_search", arguments='{}', call_id="c1")
+        tc = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
        resp1 = _mock_response(content="", finish_reason="tool_calls", tool_calls=[tc])
        resp2 = _mock_response(content="Done searching", finish_reason="stop")
        agent.client.chat.completions.create.side_effect = [resp1, resp2]
@@ -745,7 +779,9 @@ class TestRunConversation:
            patch.object(agent, "_save_trajectory"),
            patch.object(agent, "_cleanup_task_resources"),
            patch("run_agent._set_interrupt"),
-            patch.object(agent, "_interruptible_api_call", side_effect=interrupt_side_effect),
+            patch.object(
+                agent, "_interruptible_api_call", side_effect=interrupt_side_effect
+            ),
        ):
            result = agent.run_conversation("hello")
        assert result["interrupted"] is True
@@ -753,8 +789,10 @@ class TestRunConversation:
    def test_invalid_tool_name_retry(self, agent):
        """Model hallucinates an invalid tool name, agent retries and succeeds."""
        self._setup_agent(agent)
-        bad_tc = _mock_tool_call(name="nonexistent_tool", arguments='{}', call_id="c1")
-        resp_bad = _mock_response(content="", finish_reason="tool_calls", tool_calls=[bad_tc])
+        bad_tc = _mock_tool_call(name="nonexistent_tool", arguments="{}", call_id="c1")
+        resp_bad = _mock_response(
+            content="", finish_reason="tool_calls", tool_calls=[bad_tc]
+        )
        resp_good = _mock_response(content="Got it", finish_reason="stop")
        agent.client.chat.completions.create.side_effect = [resp_bad, resp_good]
        with (
@@ -776,7 +814,9 @@ class TestRunConversation:
        )
        # Return empty 3 times to exhaust retries
        agent.client.chat.completions.create.side_effect = [
-            empty_resp, empty_resp, empty_resp,
+            empty_resp,
+            empty_resp,
+            empty_resp,
        ]
        with (
            patch.object(agent, "_persist_session"),
@@ -804,7 +844,9 @@ class TestRunConversation:
            calls["api"] += 1
            if calls["api"] == 1:
                raise _UnauthorizedError()
-            return _mock_response(content="Recovered after remint", finish_reason="stop")
+            return _mock_response(
+                content="Recovered after remint", finish_reason="stop"
+            )

        def _fake_refresh(*, force=True):
            calls["refresh"] += 1
@@ -816,7 +858,9 @@ class TestRunConversation:
            patch.object(agent, "_save_trajectory"),
            patch.object(agent, "_cleanup_task_resources"),
            patch.object(agent, "_interruptible_api_call", side_effect=_fake_api_call),
-            patch.object(agent, "_try_refresh_nous_client_credentials", side_effect=_fake_refresh),
+            patch.object(
+                agent, "_try_refresh_nous_client_credentials", side_effect=_fake_refresh
+            ),
        ):
            result = agent.run_conversation("hello")

@@ -830,14 +874,16 @@ class TestRunConversation:
        self._setup_agent(agent)
        agent.compression_enabled = True

-        tc = _mock_tool_call(name="web_search", arguments='{}', call_id="c1")
+        tc = _mock_tool_call(name="web_search", arguments="{}", call_id="c1")
        resp1 = _mock_response(content="", finish_reason="tool_calls", tool_calls=[tc])
        resp2 = _mock_response(content="All done", finish_reason="stop")
        agent.client.chat.completions.create.side_effect = [resp1, resp2]

        with (
            patch("run_agent.handle_function_call", return_value="result"),
-            patch.object(agent.context_compressor, "should_compress", return_value=True),
+            patch.object(
+                agent.context_compressor, "should_compress", return_value=True
+            ),
            patch.object(agent, "_compress_context") as mock_compress,
            patch.object(agent, "_persist_session"),
            patch.object(agent, "_save_trajectory"),
@@ -931,7 +977,9 @@ class TestRetryExhaustion:
            patch("run_agent.time", self._make_fast_time_mock()),
        ):
            result = agent.run_conversation("hello")
-        assert result.get("completed") is False, f"Expected completed=False, got: {result}"
+        assert result.get("completed") is False, (
+            f"Expected completed=False, got: {result}"
+        )
        assert result.get("failed") is True
        assert "error" in result
        assert "Invalid API response" in result["error"]
@@ -954,6 +1002,7 @@ class TestRetryExhaustion:
 # Flush sentinel leak
 # ---------------------------------------------------------------------------

+
 class TestFlushSentinelNotLeaked:
    """_flush_sentinel must be stripped before sending messages to the API."""

@@ -995,6 +1044,7 @@ class TestFlushSentinelNotLeaked:
 # Conversation history mutation
 # ---------------------------------------------------------------------------

+
 class TestConversationHistoryNotMutated:
    """run_conversation must not mutate the caller's conversation_history list."""

@@ -1014,7 +1064,9 @@ class TestConversationHistoryNotMutated:
            patch.object(agent, "_save_trajectory"),
            patch.object(agent, "_cleanup_task_resources"),
        ):
-            result = agent.run_conversation("new question", conversation_history=history)
+            result = agent.run_conversation(
+                "new question", conversation_history=history
+            )

        # Caller's list must be untouched
        assert len(history) == original_len, (
@@ -1028,10 +1080,13 @@ class TestConversationHistoryNotMutated:
 # _max_tokens_param consistency
 # ---------------------------------------------------------------------------

+
 class TestNousCredentialRefresh:
    """Verify Nous credential refresh rebuilds the runtime client."""

-    def test_try_refresh_nous_client_credentials_rebuilds_client(self, agent, monkeypatch):
+    def test_try_refresh_nous_client_credentials_rebuilds_client(
+        self, agent, monkeypatch
+    ):
        agent.provider = "nous"
        agent.api_mode = "chat_completions"

@@ -1057,7 +1112,9 @@ class TestNousCredentialRefresh:
            rebuilt["kwargs"] = kwargs
            return _RebuiltClient()

-        monkeypatch.setattr("hermes_cli.auth.resolve_nous_runtime_credentials", _fake_resolve)
+        monkeypatch.setattr(
+            "hermes_cli.auth.resolve_nous_runtime_credentials", _fake_resolve
+        )

        agent.client = _ExistingClient()
        with patch("run_agent.OpenAI", side_effect=_fake_openai):
@@ -1067,7 +1124,9 @@ class TestNousCredentialRefresh:
        assert closed["value"] is True
        assert captured["force_mint"] is True
        assert rebuilt["kwargs"]["api_key"] == "new-nous-key"
-        assert rebuilt["kwargs"]["base_url"] == "https://inference-api.nousresearch.com/v1"
+        assert (
+            rebuilt["kwargs"]["base_url"] == "https://inference-api.nousresearch.com/v1"
+        )
        assert "default_headers" not in rebuilt["kwargs"]
        assert isinstance(agent.client, _RebuiltClient)