Files
Timmy-time-dashboard/tests/test_self_modify.py
Alexander Payne 8fec9c41a5 feat: autonomous self-modifying agent with multi-backend LLM support
Adds SelfModifyLoop — an edit→validate→test→commit cycle that can read
its own failure reports, diagnose root causes, and restart autonomously.

Key capabilities:
- Multi-backend LLM: Anthropic Claude API, Ollama, or auto-detect
- Syntax validation via compile() before writing to disk
- Autonomous self-correction loop with configurable max cycles
- XML-based output format to avoid triple-quote delimiter conflicts
- Branch creation skipped by default to prevent container restarts
- CLI: self-modify run "instruction" --backend auto --autonomous
- 939 tests passing, 30 skipped

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-25 17:23:47 -05:00

451 lines
18 KiB
Python

"""Tests for the self-modification loop (self_modify/loop.py).
All tests are fully mocked — no Ollama, no real file I/O, no git.
"""
from unittest.mock import MagicMock, patch
from pathlib import Path
import pytest
from self_modify.loop import SelfModifyLoop, ModifyRequest, ModifyResult
# ── Dataclass tests ───────────────────────────────────────────────────────────
class TestModifyRequest:
    """Field-level checks for the ``ModifyRequest`` dataclass."""

    def test_defaults(self):
        """With only an instruction given, the other fields take their defaults."""
        request = ModifyRequest(instruction="Fix the bug")

        assert request.instruction == "Fix the bug"
        assert request.target_files == []
        assert request.dry_run is False

    def test_with_target_files(self):
        """Explicit ``target_files`` and ``dry_run`` values are stored as passed."""
        fields = {
            "instruction": "Add docstring",
            "target_files": ["src/foo.py"],
            "dry_run": True,
        }
        request = ModifyRequest(**fields)

        assert request.target_files == ["src/foo.py"]
        assert request.dry_run is True
class TestModifyResult:
    """Field-level checks for the ``ModifyResult`` dataclass."""

    def test_success_result(self):
        """A fully populated success result keeps its values; unset fields default."""
        fields = {
            "success": True,
            "files_changed": ["src/foo.py"],
            "test_passed": True,
            "commit_sha": "abc12345",
            "branch_name": "timmy/self-modify-123",
            "llm_response": "...",
            "attempts": 1,
        }
        outcome = ModifyResult(**fields)

        assert outcome.success
        assert outcome.commit_sha == "abc12345"
        # Fields that were not supplied fall back to their defaults.
        assert outcome.error is None
        assert outcome.autonomous_cycles == 0

    def test_failure_result(self):
        """A failure result carries the error text and an empty change list."""
        outcome = ModifyResult(success=False, error="something broke")

        assert not outcome.success
        assert outcome.error == "something broke"
        assert outcome.files_changed == []
# ── SelfModifyLoop unit tests ────────────────────────────────────────────────
class TestSelfModifyLoop:
    """Unit tests for ``SelfModifyLoop`` construction and ``run()`` outcomes.

    The ``run()`` tests patch ``self_modify.loop.settings`` and replace the
    loop's private collaborators with ``MagicMock`` stubs.
    """

    @staticmethod
    def _enable(settings_mock, max_retries):
        """Switch the feature on in the patched settings with the shared config."""
        settings_mock.self_modify_enabled = True
        settings_mock.self_modify_max_retries = max_retries
        settings_mock.self_modify_allowed_dirs = "src,tests"
        settings_mock.self_modify_backend = "ollama"

    @staticmethod
    def _stub_read_and_generate(loop):
        """Stub path validation, file reading and edit generation on *loop*."""
        loop._validate_paths = MagicMock()
        loop._read_files = MagicMock(return_value={"src/foo.py": "old content"})
        loop._generate_edits = MagicMock(
            return_value=({"src/foo.py": "x = 1\n"}, "llm raw")
        )

    # ── constructor ──────────────────────────────────────────────────────

    def test_init_defaults(self):
        """Without arguments the retry limit is 2."""
        assert SelfModifyLoop()._max_retries == 2

    def test_init_custom_retries(self):
        """``max_retries`` passed to the constructor is stored."""
        assert SelfModifyLoop(max_retries=5)._max_retries == 5

    def test_init_backend(self):
        """``backend`` passed to the constructor is stored."""
        assert SelfModifyLoop(backend="anthropic")._backend == "anthropic"

    def test_init_autonomous(self):
        """Autonomous mode and its cycle limit are stored."""
        loop = SelfModifyLoop(autonomous=True, max_autonomous_cycles=5)

        assert loop._autonomous is True
        assert loop._max_autonomous_cycles == 5

    # ── run() ────────────────────────────────────────────────────────────

    @patch("self_modify.loop.settings")
    def test_run_disabled(self, mock_settings):
        """``run()`` fails with a 'disabled' error when the feature flag is off."""
        mock_settings.self_modify_enabled = False

        outcome = SelfModifyLoop().run(ModifyRequest(instruction="test"))

        assert not outcome.success
        assert "disabled" in outcome.error.lower()

    @patch("self_modify.loop.os.environ", {"SELF_MODIFY_SKIP_BRANCH": "1"})
    @patch("self_modify.loop.settings")
    def test_run_no_target_files(self, mock_settings):
        """``run()`` fails when no target files are given and none are inferred."""
        self._enable(mock_settings, max_retries=0)
        loop = SelfModifyLoop()
        loop._infer_target_files = MagicMock(return_value=[])

        outcome = loop.run(ModifyRequest(instruction="do something vague"))

        assert not outcome.success
        assert "no target files" in outcome.error.lower()

    @patch("self_modify.loop.os.environ", {"SELF_MODIFY_SKIP_BRANCH": "1"})
    @patch("self_modify.loop.settings")
    def test_run_success_path(self, mock_settings):
        """Passing tests lead to one commit and a successful result."""
        self._enable(mock_settings, max_retries=2)
        loop = SelfModifyLoop()
        self._stub_read_and_generate(loop)
        loop._write_files = MagicMock(return_value=["src/foo.py"])
        loop._run_tests = MagicMock(return_value=(True, "5 passed"))
        loop._git_commit = MagicMock(return_value="abc12345")

        request = ModifyRequest(
            instruction="Add docstring", target_files=["src/foo.py"]
        )
        outcome = loop.run(request)

        assert outcome.success
        assert outcome.test_passed
        assert outcome.commit_sha == "abc12345"
        assert outcome.files_changed == ["src/foo.py"]
        loop._run_tests.assert_called_once()
        loop._git_commit.assert_called_once()

    @patch("self_modify.loop.os.environ", {"SELF_MODIFY_SKIP_BRANCH": "1"})
    @patch("self_modify.loop.settings")
    def test_run_test_failure_reverts(self, mock_settings):
        """Failing tests with no retries left trigger a revert and a failed result."""
        self._enable(mock_settings, max_retries=0)
        loop = SelfModifyLoop(max_retries=0)
        self._stub_read_and_generate(loop)
        loop._write_files = MagicMock(return_value=["src/foo.py"])
        loop._run_tests = MagicMock(return_value=(False, "1 failed"))
        loop._revert_files = MagicMock()

        request = ModifyRequest(instruction="Break it", target_files=["src/foo.py"])
        outcome = loop.run(request)

        assert not outcome.success
        assert not outcome.test_passed
        loop._revert_files.assert_called()

    @patch("self_modify.loop.os.environ", {"SELF_MODIFY_SKIP_BRANCH": "1"})
    @patch("self_modify.loop.settings")
    def test_dry_run(self, mock_settings):
        """A dry run succeeds and reports the files that would have changed."""
        self._enable(mock_settings, max_retries=2)
        loop = SelfModifyLoop()
        # Write, test and commit steps are deliberately left unstubbed here.
        self._stub_read_and_generate(loop)

        request = ModifyRequest(
            instruction="Add docstring",
            target_files=["src/foo.py"],
            dry_run=True,
        )
        outcome = loop.run(request)

        assert outcome.success
        assert outcome.files_changed == ["src/foo.py"]
# ── Syntax validation tests ─────────────────────────────────────────────────
class TestSyntaxValidation:
    """Tests for ``SelfModifyLoop._validate_syntax`` and its effect on ``run()``.

    ``_validate_syntax`` takes a mapping of path -> proposed file content and
    returns a mapping of path -> error text for the entries it rejects.
    """

    def test_valid_python_passes(self):
        """Well-formed Python produces no errors."""
        loop = SelfModifyLoop()
        errors = loop._validate_syntax({"src/foo.py": "x = 1\nprint(x)\n"})
        assert errors == {}

    def test_invalid_python_caught(self):
        """An unclosed parameter list is reported, with a line reference."""
        loop = SelfModifyLoop()
        errors = loop._validate_syntax({"src/foo.py": "def foo(\n"})
        assert "src/foo.py" in errors
        assert "line" in errors["src/foo.py"]

    def test_unterminated_string_caught(self):
        """A triple-quoted string that is never closed is reported."""
        loop = SelfModifyLoop()
        # The earlier version also validated a second snippet and threw the
        # result away; only the input that is actually asserted on is kept.
        broken = '"""\nunclosed string\n'
        errors = loop._validate_syntax({"src/foo.py": broken})
        assert "src/foo.py" in errors

    def test_non_python_files_skipped(self):
        """Non-``.py`` files are not syntax-checked, whatever their content."""
        loop = SelfModifyLoop()
        errors = loop._validate_syntax({"README.md": "this is not python {{{}"})
        assert errors == {}

    @patch("self_modify.loop.os.environ", {"SELF_MODIFY_SKIP_BRANCH": "1"})
    @patch("self_modify.loop.settings")
    def test_syntax_error_skips_write(self, mock_settings):
        """When LLM produces invalid syntax, we skip writing and retry."""
        mock_settings.self_modify_enabled = True
        mock_settings.self_modify_max_retries = 1
        mock_settings.self_modify_allowed_dirs = "src,tests"
        mock_settings.self_modify_backend = "ollama"

        loop = SelfModifyLoop(max_retries=1)
        loop._read_files = MagicMock(return_value={"src/foo.py": "x = 1\n"})
        # First call returns broken syntax, second returns valid
        loop._generate_edits = MagicMock(side_effect=[
            ({"src/foo.py": "def foo(\n"}, "bad llm"),
            ({"src/foo.py": "def foo():\n pass\n"}, "good llm"),
        ])
        loop._write_files = MagicMock(return_value=["src/foo.py"])
        loop._run_tests = MagicMock(return_value=(True, "passed"))
        loop._git_commit = MagicMock(return_value="abc123")
        loop._validate_paths = MagicMock()

        result = loop.run(
            ModifyRequest(instruction="Fix foo", target_files=["src/foo.py"])
        )

        assert result.success
        # _write_files should only be called once (for the valid attempt)
        loop._write_files.assert_called_once()
# ── Multi-backend tests ──────────────────────────────────────────────────────
class TestBackendResolution:
    """Tests for ``SelfModifyLoop._resolve_backend``."""

    def test_resolve_ollama(self):
        """An explicit ``ollama`` backend resolves to itself."""
        resolved = SelfModifyLoop(backend="ollama")._resolve_backend()
        assert resolved == "ollama"

    def test_resolve_anthropic(self):
        """An explicit ``anthropic`` backend resolves to itself."""
        resolved = SelfModifyLoop(backend="anthropic")._resolve_backend()
        assert resolved == "anthropic"

    def test_resolve_auto_with_key(self):
        """``auto`` resolves to ``anthropic`` when ``ANTHROPIC_API_KEY`` is set."""
        with patch.dict("os.environ", {"ANTHROPIC_API_KEY": "sk-test-123"}):
            resolved = SelfModifyLoop(backend="auto")._resolve_backend()
            assert resolved == "anthropic"

    def test_resolve_auto_without_key(self):
        """``auto`` resolves to ``ollama`` when the environment is empty."""
        # clear=True empties os.environ for the duration of the block.
        with patch.dict("os.environ", {}, clear=True):
            resolved = SelfModifyLoop(backend="auto")._resolve_backend()
            assert resolved == "ollama"
# ── Autonomous loop tests ────────────────────────────────────────────────────
class TestAutonomousLoop:
    """Tests for the autonomous self-correction cycle and ``_diagnose_failure``."""

    @patch("self_modify.loop.os.environ", {"SELF_MODIFY_SKIP_BRANCH": "1"})
    @patch("self_modify.loop.settings")
    def test_autonomous_retries_after_failure(self, mock_settings):
        """A failed first run is diagnosed once and the next cycle succeeds."""
        mock_settings.self_modify_enabled = True
        mock_settings.self_modify_max_retries = 0
        mock_settings.self_modify_allowed_dirs = "src,tests"
        mock_settings.self_modify_backend = "ollama"

        loop = SelfModifyLoop(max_retries=0, autonomous=True, max_autonomous_cycles=2)
        loop._validate_paths = MagicMock()
        loop._read_files = MagicMock(return_value={"src/foo.py": "x = 1\n"})

        # A real function is used as the side effect (instead of a plain
        # return_value) so that a call with unexpected argument names raises
        # TypeError rather than being silently accepted by the mock.
        def fake_generate(instruction, contents, prev_test_output=None, prev_syntax_errors=None):
            return ({"src/foo.py": "x = 2\n"}, "llm raw")

        loop._generate_edits = MagicMock(side_effect=fake_generate)
        loop._write_files = MagicMock(return_value=["src/foo.py"])
        loop._revert_files = MagicMock()

        # First call fails tests, second succeeds
        test_results = [(False, "FAILED"), (True, "PASSED")]
        loop._run_tests = MagicMock(side_effect=test_results)
        loop._git_commit = MagicMock(return_value="abc123")
        loop._diagnose_failure = MagicMock(return_value="Fix: do X instead of Y")

        result = loop.run(
            ModifyRequest(instruction="Fix foo", target_files=["src/foo.py"])
        )

        assert result.success
        assert result.autonomous_cycles == 1
        loop._diagnose_failure.assert_called_once()

    def test_diagnose_failure_reads_report(self, tmp_path):
        """The diagnosis returned for an existing report contains the LLM's answer."""
        report = tmp_path / "report.md"
        report.write_text("# Report\n**Error:** SyntaxError line 5\n")

        loop = SelfModifyLoop(backend="ollama")
        # Stub the LLM call so no backend is contacted.
        loop._call_llm = MagicMock(return_value="ROOT CAUSE: Missing closing paren")

        diagnosis = loop._diagnose_failure(report)

        assert "Missing closing paren" in diagnosis
        loop._call_llm.assert_called_once()

    def test_diagnose_failure_handles_missing_report(self, tmp_path):
        """A report path that does not exist yields ``None``."""
        loop = SelfModifyLoop(backend="ollama")
        result = loop._diagnose_failure(tmp_path / "nonexistent.md")
        assert result is None
# ── Path validation tests ─────────────────────────────────────────────────────
class TestPathValidation:
    """Tests for ``SelfModifyLoop._validate_paths``."""

    @staticmethod
    def _loop():
        """Build a loop rooted at the fixed repository path these tests use."""
        return SelfModifyLoop(repo_path=Path("/tmp/test-repo"))

    def test_rejects_path_outside_repo(self):
        """A path that climbs out of the repository raises ``ValueError``."""
        loop = self._loop()
        with pytest.raises(ValueError, match="escapes repository"):
            loop._validate_paths(["../../etc/passwd"])

    def test_rejects_path_outside_allowed_dirs(self):
        """A path under ``docs/`` raises ``ValueError``."""
        loop = self._loop()
        with pytest.raises(ValueError, match="not in allowed directories"):
            loop._validate_paths(["docs/secret.py"])

    def test_accepts_src_path(self):
        """A path under ``src/`` is accepted without raising."""
        candidate = ["src/some_module.py"]
        self._loop()._validate_paths(candidate)

    def test_accepts_tests_path(self):
        """A path under ``tests/`` is accepted without raising."""
        candidate = ["tests/test_something.py"]
        self._loop()._validate_paths(candidate)
# ── File inference tests ──────────────────────────────────────────────────────
class TestFileInference:
    """Tests for ``SelfModifyLoop._infer_target_files``."""

    def test_infer_explicit_py_path(self):
        """A ``.py`` path written in the instruction is picked up."""
        inferred = SelfModifyLoop()._infer_target_files(
            "fix bug in src/dashboard/app.py"
        )
        assert "src/dashboard/app.py" in inferred

    def test_infer_from_keyword_config(self):
        """An instruction about the config yields ``src/config.py``."""
        inferred = SelfModifyLoop()._infer_target_files(
            "update the config to add a new setting"
        )
        assert "src/config.py" in inferred

    def test_infer_from_keyword_agent(self):
        """An instruction about the agent yields ``src/timmy/agent.py``."""
        inferred = SelfModifyLoop()._infer_target_files("modify the agent prompt")
        assert "src/timmy/agent.py" in inferred

    def test_infer_returns_empty_for_vague(self):
        """An instruction with no recognisable target yields an empty list."""
        inferred = SelfModifyLoop()._infer_target_files("do something cool")
        assert inferred == []
# ── NLU intent tests ──────────────────────────────────────────────────────────
class TestCodeIntent:
    """Tests that ``voice.nlu.detect_intent`` classifies code-change requests."""

    @staticmethod
    def _intent_for(utterance):
        """Run *utterance* through ``detect_intent``, importing it lazily."""
        # Imported inside the call, as each test originally did.
        from voice.nlu import detect_intent

        return detect_intent(utterance)

    def test_detects_modify_code(self):
        """'modify the code ...' is classified as ``code``."""
        detected = self._intent_for("modify the code in config.py")
        assert detected.name == "code"

    def test_detects_self_modify(self):
        """'self-modify ...' is classified as ``code``."""
        detected = self._intent_for("self-modify to add a new endpoint")
        assert detected.name == "code"

    def test_detects_edit_source(self):
        """'edit the source ...' is classified as ``code``."""
        detected = self._intent_for("edit the source to fix the bug")
        assert detected.name == "code"

    def test_detects_update_your_code(self):
        """'update your code ...' is classified as ``code``."""
        detected = self._intent_for("update your code to handle errors")
        assert detected.name == "code"

    def test_detects_fix_function(self):
        """'fix the function ...' is classified as ``code``."""
        detected = self._intent_for("fix the function that calculates totals")
        assert detected.name == "code"

    def test_does_not_match_general_chat(self):
        """An unrelated question is classified as ``chat``."""
        detected = self._intent_for("tell me about the weather today")
        assert detected.name == "chat"

    def test_extracts_target_file_entity(self):
        """The file path in the utterance is exposed as the ``target_file`` entity."""
        detected = self._intent_for("modify file src/config.py to add debug flag")
        assert detected.entities.get("target_file") == "src/config.py"
# ── Route tests ───────────────────────────────────────────────────────────────
class TestSelfModifyRoutes:
    """Tests for the ``/self-modify`` HTTP routes.

    ``client`` is a fixture defined outside this file (presumably an HTTP test
    client for the dashboard app; confirm in conftest).
    """

    def test_status_endpoint(self, client):
        """``GET /self-modify/status`` returns 200 and reports ``enabled`` as False."""
        response = client.get("/self-modify/status")
        assert response.status_code == 200

        payload = response.json()
        assert "enabled" in payload
        assert payload["enabled"] is False  # Default

    def test_run_when_disabled(self, client):
        """``POST /self-modify/run`` answers 403 while the feature is disabled."""
        form = {"instruction": "test"}
        response = client.post("/self-modify/run", data=form)
        assert response.status_code == 403
# ── DirectToolExecutor integration ────────────────────────────────────────────
class TestDirectToolExecutor:
    """Integration checks for ``swarm.tool_executor.DirectToolExecutor``."""

    def test_code_task_falls_back_when_disabled(self):
        """A code-style task still returns a result dict when self-modify is off."""
        from swarm.tool_executor import DirectToolExecutor

        forge = DirectToolExecutor("forge", "forge-test-001")
        outcome = forge.execute_with_tools("modify the code to fix bug")

        # Should fall back to simulated since self_modify_enabled=False
        assert isinstance(outcome, dict)
        assert "result" in outcome or "success" in outcome

    def test_non_code_task_delegates_to_parent(self):
        """A non-code task returns a dict."""
        from swarm.tool_executor import DirectToolExecutor

        echo = DirectToolExecutor("echo", "echo-test-001")
        outcome = echo.execute_with_tools("search for information")

        assert isinstance(outcome, dict)