feat: add exit code context for common CLI tools in terminal results (#5144)

When commands like grep, diff, test, or find return non-zero exit codes that aren't actual errors (grep 1 = no matches, diff 1 = files differ), the model wastes turns investigating non-problems. This adds an exit_code_meaning field to the terminal JSON result that explains informational exit codes, so the agent can move on instead of debugging. Covers grep/rg/ag/ack (no matches), diff (files differ), find (partial access), test/[ (condition false), curl (timeouts, DNS, HTTP errors), and git (context-dependent). Correctly extracts the last command from pipelines and chains, strips full paths and env var assignments. The exit_code field itself is unchanged — this is purely additive context.
2026-04-04 16:57:24 -07:00
parent 5879b3ef82
commit aa475aef31
2 changed files with 233 additions and 1 deletions
--- a/tests/tools/test_terminal_exit_semantics.py
+++ b/tests/tools/test_terminal_exit_semantics.py
@@ -0,0 +1,152 @@
+"""Tests for terminal command exit code semantic interpretation."""
+
+import pytest
+
+from tools.terminal_tool import _interpret_exit_code
+
+
+class TestInterpretExitCode:
+    """Test _interpret_exit_code returns correct notes for known command semantics."""
+
+    # ---- exit code 0 always returns None ----
+
+    def test_success_returns_none(self):
+        assert _interpret_exit_code("grep foo bar", 0) is None
+        assert _interpret_exit_code("diff a b", 0) is None
+        assert _interpret_exit_code("test -f /etc/passwd", 0) is None
+
+    # ---- grep / rg family: exit 1 = no matches ----
+
+    @pytest.mark.parametrize("cmd", [
+        "grep 'pattern' file.txt",
+        "egrep 'pattern' file.txt",
+        "fgrep 'pattern' file.txt",
+        "rg 'foo' .",
+        "ag 'foo' .",
+        "ack 'foo' .",
+    ])
+    def test_grep_family_no_matches(self, cmd):
+        result = _interpret_exit_code(cmd, 1)
+        assert result is not None
+        assert "no matches" in result.lower()
+
+    def test_grep_real_error_no_note(self):
+        """grep exit 2+ is a real error — should return None."""
+        assert _interpret_exit_code("grep 'foo' bar", 2) is None
+        assert _interpret_exit_code("rg 'foo' .", 2) is None
+
+    # ---- diff: exit 1 = files differ ----
+
+    def test_diff_files_differ(self):
+        result = _interpret_exit_code("diff file1 file2", 1)
+        assert result is not None
+        assert "differ" in result.lower()
+
+    def test_colordiff_files_differ(self):
+        result = _interpret_exit_code("colordiff file1 file2", 1)
+        assert result is not None
+        assert "differ" in result.lower()
+
+    def test_diff_real_error_no_note(self):
+        assert _interpret_exit_code("diff a b", 2) is None
+
+    # ---- test / [: exit 1 = condition false ----
+
+    def test_test_condition_false(self):
+        result = _interpret_exit_code("test -f /nonexistent", 1)
+        assert result is not None
+        assert "false" in result.lower()
+
+    def test_bracket_condition_false(self):
+        result = _interpret_exit_code("[ -f /nonexistent ]", 1)
+        assert result is not None
+        assert "false" in result.lower()
+
+    # ---- find: exit 1 = partial success ----
+
+    def test_find_partial_success(self):
+        result = _interpret_exit_code("find . -name '*.py'", 1)
+        assert result is not None
+        assert "inaccessible" in result.lower()
+
+    # ---- curl: various informational codes ----
+
+    def test_curl_timeout(self):
+        result = _interpret_exit_code("curl https://example.com", 28)
+        assert result is not None
+        assert "timed out" in result.lower()
+
+    def test_curl_connection_refused(self):
+        result = _interpret_exit_code("curl http://localhost:99999", 7)
+        assert result is not None
+        assert "connect" in result.lower()
+
+    # ---- git: exit 1 is context-dependent ----
+
+    def test_git_diff_exit_1(self):
+        result = _interpret_exit_code("git diff HEAD~1", 1)
+        assert result is not None
+        assert "normal" in result.lower()
+
+    # ---- pipeline / chain handling ----
+
+    def test_pipeline_last_command(self):
+        """In a pipeline, the last command determines the exit code."""
+        result = _interpret_exit_code("ls -la | grep 'pattern'", 1)
+        assert result is not None
+        assert "no matches" in result.lower()
+
+    def test_and_chain_last_command(self):
+        result = _interpret_exit_code("cd /tmp && grep foo bar", 1)
+        assert result is not None
+        assert "no matches" in result.lower()
+
+    def test_semicolon_chain_last_command(self):
+        result = _interpret_exit_code("cat file; diff a b", 1)
+        assert result is not None
+        assert "differ" in result.lower()
+
+    def test_or_chain_last_command(self):
+        result = _interpret_exit_code("false || grep foo bar", 1)
+        assert result is not None
+        assert "no matches" in result.lower()
+
+    # ---- full paths ----
+
+    def test_full_path_command(self):
+        result = _interpret_exit_code("/usr/bin/grep 'foo' bar", 1)
+        assert result is not None
+        assert "no matches" in result.lower()
+
+    # ---- env var prefix ----
+
+    def test_env_var_prefix_stripped(self):
+        result = _interpret_exit_code("LANG=C grep 'foo' bar", 1)
+        assert result is not None
+        assert "no matches" in result.lower()
+
+    def test_multiple_env_vars(self):
+        result = _interpret_exit_code("FOO=1 BAR=2 grep 'foo' bar", 1)
+        assert result is not None
+        assert "no matches" in result.lower()
+
+    # ---- unknown commands return None ----
+
+    @pytest.mark.parametrize("cmd", [
+        "python3 script.py",
+        "rm -rf /tmp/test",
+        "npm test",
+        "make build",
+        "cargo build",
+    ])
+    def test_unknown_commands_return_none(self, cmd):
+        assert _interpret_exit_code(cmd, 1) is None
+
+    # ---- edge cases ----
+
+    def test_empty_command(self):
+        assert _interpret_exit_code("", 1) is None
+
+    def test_only_env_vars(self):
+        """Command with only env var assignments, no actual command."""
+        assert _interpret_exit_code("FOO=bar", 1) is None
--- a/tools/terminal_tool.py
+++ b/tools/terminal_tool.py
@@ -35,6 +35,7 @@ import json
 import logging
 import os
 import platform
+import re
 import time
 import threading
 import atexit
@@ -899,6 +900,78 @@ def _atexit_cleanup():
 atexit.register(_atexit_cleanup)


+# =============================================================================
+# Exit Code Context for Common CLI Tools
+# =============================================================================
+# Many Unix commands use non-zero exit codes for informational purposes, not
+# to indicate failure.  The model sees a raw exit_code=1 from `grep` and
+# wastes a turn investigating something that just means "no matches".
+# This lookup adds a human-readable note so the agent can move on.
+
+def _interpret_exit_code(command: str, exit_code: int) -> str | None:
+    """Return a human-readable note when a non-zero exit code is non-erroneous.
+
+    Returns None when the exit code is 0 or genuinely signals an error.
+    The note is appended to the tool result so the model doesn't waste
+    turns investigating expected exit codes.
+    """
+    if exit_code == 0:
+        return None
+
+    # Extract the last command in a pipeline/chain — that determines the
+    # exit code.  Handles  `cmd1 && cmd2`, `cmd1 | cmd2`, `cmd1; cmd2`.
+    # Deliberately simple: split on shell operators and take the last piece.
+    segments = re.split(r'\s*(?:\|\||&&|[|;])\s*', command)
+    last_segment = (segments[-1] if segments else command).strip()
+
+    # Get base command name (first word), stripping env var assignments
+    # like  VAR=val cmd ...
+    words = last_segment.split()
+    base_cmd = ""
+    for w in words:
+        if "=" in w and not w.startswith("-"):
+            continue  # skip VAR=val
+        base_cmd = w.split("/")[-1]  # handle /usr/bin/grep -> grep
+        break
+
+    if not base_cmd:
+        return None
+
+    # Command-specific semantics
+    semantics: dict[str, dict[int, str]] = {
+        # grep/rg/ag/ack: 1=no matches found (normal), 2+=real error
+        "grep":  {1: "No matches found (not an error)"},
+        "egrep": {1: "No matches found (not an error)"},
+        "fgrep": {1: "No matches found (not an error)"},
+        "rg":    {1: "No matches found (not an error)"},
+        "ag":    {1: "No matches found (not an error)"},
+        "ack":   {1: "No matches found (not an error)"},
+        # diff: 1=files differ (expected), 2+=real error
+        "diff":  {1: "Files differ (expected, not an error)"},
+        "colordiff": {1: "Files differ (expected, not an error)"},
+        # find: 1=some dirs inaccessible but results may still be valid
+        "find":  {1: "Some directories were inaccessible (partial results may still be valid)"},
+        # test/[: 1=condition is false (expected)
+        "test":  {1: "Condition evaluated to false (expected, not an error)"},
+        "[":     {1: "Condition evaluated to false (expected, not an error)"},
+        # curl: common non-error codes
+        "curl":  {
+            6: "Could not resolve host",
+            7: "Failed to connect to host",
+            22: "HTTP response code indicated error (e.g. 404, 500)",
+            28: "Operation timed out",
+        },
+        # git: 1 is context-dependent but often normal (e.g. git diff with changes)
+        "git":   {1: "Non-zero exit (often normal — e.g. 'git diff' returns 1 when files differ)"},
+    }
+
+    cmd_semantics = semantics.get(base_cmd)
+    if cmd_semantics and exit_code in cmd_semantics:
+        return cmd_semantics[exit_code]
+
+    return None
+
+
 def terminal_tool(
    command: str,
    background: bool = False,
@@ -1242,13 +1315,20 @@ def terminal_tool(
            from agent.redact import redact_sensitive_text
            output = redact_sensitive_text(output.strip()) if output else ""

+            # Interpret non-zero exit codes that aren't real errors
+            # (e.g. grep=1 means "no matches", diff=1 means "files differ")
+            exit_note = _interpret_exit_code(command, returncode)
+
            result_dict = {
                "output": output,
                "exit_code": returncode,
-                "error": None
+                "error": None,
            }
            if approval_note:
                result_dict["approval"] = approval_note
+            if exit_note:
+                result_dict["exit_code_meaning"] = exit_note
+
            return json.dumps(result_dict, ensure_ascii=False)

    except Exception as e: