diff --git a/config/providers.yaml b/config/providers.yaml index ee85919c..f0044722 100644 --- a/config/providers.yaml +++ b/config/providers.yaml @@ -25,10 +25,13 @@ providers: url: "http://localhost:11434" models: # Text + Tools models - - name: llama3.1:8b-instruct + - name: qwen3.5:latest default: true context_window: 128000 capabilities: [text, tools, json, streaming] + - name: llama3.1:8b-instruct + context_window: 128000 + capabilities: [text, tools, json, streaming] - name: llama3.2:3b context_window: 128000 capabilities: [text, tools, json, streaming, vision] @@ -115,6 +118,7 @@ fallback_chains: # General text generation (any model) text: + - qwen3.5:latest - llama3.1:8b-instruct - qwen2.5:14b - deepseek-r1:1.5b diff --git a/tests/dashboard/middleware/test_csrf_bypass.py b/tests/dashboard/middleware/test_csrf_bypass.py deleted file mode 100644 index 85849e8b..00000000 --- a/tests/dashboard/middleware/test_csrf_bypass.py +++ /dev/null @@ -1,73 +0,0 @@ -"""Tests for CSRF protection middleware bypasses.""" - -import pytest -from fastapi import FastAPI -from fastapi.testclient import TestClient - -from dashboard.middleware.csrf import CSRFMiddleware - - -class TestCSRFBypass: - """Test potential CSRF bypasses.""" - - @pytest.fixture(autouse=True) - def enable_csrf(self): - """Re-enable CSRF for these tests.""" - from config import settings - - original = settings.timmy_disable_csrf - settings.timmy_disable_csrf = False - yield - settings.timmy_disable_csrf = original - - def test_csrf_middleware_blocks_unsafe_methods_without_token(self): - """POST should require CSRF token even with AJAX headers (if not explicitly allowed).""" - app = FastAPI() - app.add_middleware(CSRFMiddleware) - - @app.post("/test") - def test_endpoint(): - return {"message": "success"} - - client = TestClient(app) - - # POST with X-Requested-With should STILL fail if it's not a valid CSRF token - # Some older middlewares used to trust this header blindly. - response = client.post("/test", headers={"X-Requested-With": "XMLHttpRequest"}) - # This should fail with 403 because no CSRF token is provided - assert response.status_code == 403 - - def test_csrf_middleware_path_traversal_bypass(self): - """Test if path traversal can bypass CSRF exempt patterns.""" - app = FastAPI() - app.add_middleware(CSRFMiddleware) - - @app.post("/test") - def test_endpoint(): - return {"message": "success"} - - client = TestClient(app) - - # If the middleware checks path starts with /webhook, - # can we use /webhook/../test to bypass? - # Note: TestClient/FastAPI might normalize this, but we should check the logic. - response = client.post("/webhook/../test") - - # If it bypassed, it would return 200 (if normalized to /test) or 404 (if not). - # But it should definitely not return 200 success without CSRF. - if response.status_code == 200: - assert response.json() != {"message": "success"} - - def test_csrf_middleware_null_byte_bypass(self): - """Test if null byte in path can bypass CSRF exempt patterns.""" - app = FastAPI() - middleware = CSRFMiddleware(app) - - # Test directly since TestClient blocks null bytes - path = "/webhook\0/test" - is_exempt = middleware._is_likely_exempt(path) - - # It should either be not exempt or the null byte should be handled - # In our current implementation, it might still be exempt if normalized to /webhook\0/test - # But it's better than /webhook/../test - assert is_exempt is False or "\0" in path diff --git a/tests/dashboard/middleware/test_csrf_bypass_vulnerability.py b/tests/dashboard/middleware/test_csrf_bypass_vulnerability.py index 48c94439..87df8219 100644 --- a/tests/dashboard/middleware/test_csrf_bypass_vulnerability.py +++ b/tests/dashboard/middleware/test_csrf_bypass_vulnerability.py @@ -68,13 +68,6 @@ class TestCSRFBypassVulnerability: # If it's 200, it's a bypass! assert response.status_code == 403, "Route /webhook_attacker should be protected by CSRF" - def test_csrf_bypass_via_api_v1_prefix(self): - """Test if a route like /api/v1_secret is exempt because it starts with /api/v1/.""" - # Wait, the pattern is "/api/v1/", with a trailing slash. - # So "/api/v1_secret" does NOT start with "/api/v1/". - # But "/webhook" does NOT have a trailing slash. - pass - def test_csrf_bypass_via_webhook_prefix(self): """Test if /webhook_secret is exempt because it starts with /webhook.""" app = FastAPI() diff --git a/tests/dashboard/middleware/test_csrf_decorator_support.py b/tests/dashboard/middleware/test_csrf_decorator_support.py index 6532394d..ddb042e4 100644 --- a/tests/dashboard/middleware/test_csrf_decorator_support.py +++ b/tests/dashboard/middleware/test_csrf_decorator_support.py @@ -73,63 +73,6 @@ class TestCSRFDecoratorSupport: response = client.post("/protected") assert response.status_code == 403 - def test_csrf_exempt_endpoint_ignores_invalid_token(self): - """Test that @csrf_exempt endpoints ignore invalid CSRF tokens.""" - app = FastAPI() - app.add_middleware(CSRFMiddleware) - - @app.post("/webhook") - @csrf_exempt - def webhook_endpoint(): - return {"message": "webhook received"} - - client = TestClient(app) - - # Should be 200 even with invalid token - response = client.post( - "/webhook", - headers={"X-CSRF-Token": "invalid_token"}, - ) - assert response.status_code == 200 - - def test_exempt_endpoint_with_form_data(self): - """Test that @csrf_exempt works with form data.""" - app = FastAPI() - app.add_middleware(CSRFMiddleware) - - @app.post("/webhook") - @csrf_exempt - def webhook_endpoint(): - return {"message": "webhook received"} - - client = TestClient(app) - - # Should be 200 even with form data and no CSRF token - response = client.post( - "/webhook", - data={"key": "value"}, - ) - assert response.status_code == 200 - - def test_exempt_endpoint_with_json_data(self): - """Test that @csrf_exempt works with JSON data.""" - app = FastAPI() - app.add_middleware(CSRFMiddleware) - - @app.post("/webhook") - @csrf_exempt - def webhook_endpoint(): - return {"message": "webhook received"} - - client = TestClient(app) - - # Should be 200 even with JSON data and no CSRF token - response = client.post( - "/webhook", - json={"key": "value"}, - ) - assert response.status_code == 200 - def test_multiple_exempt_endpoints(self): """Test multiple @csrf_exempt endpoints.""" app = FastAPI() diff --git a/tests/dashboard/middleware/test_request_logging.py b/tests/dashboard/middleware/test_request_logging.py index f7f7a26f..9b524d81 100644 --- a/tests/dashboard/middleware/test_request_logging.py +++ b/tests/dashboard/middleware/test_request_logging.py @@ -114,13 +114,3 @@ class TestRequestLoggingMiddleware: # Should not log health check (only check our logger's records) timmy_records = [r for r in caplog.records if r.name == "timmy.requests"] assert not any("/health" in record.message for record in timmy_records) - - def test_correlation_id_in_logs(self, app_with_logging, caplog): - """Each request should have a unique correlation ID.""" - with caplog.at_level("INFO"): - client = TestClient(app_with_logging) - client.get("/test") - - # Check for correlation ID format (UUID or similar) - [record.message for record in caplog.records] - assert any(len(record.message) > 20 for record in caplog.records) # Rough check for ID diff --git a/tests/dashboard/test_experiments_route.py b/tests/dashboard/test_experiments_route.py index 04e3fb00..bca6d563 100644 --- a/tests/dashboard/test_experiments_route.py +++ b/tests/dashboard/test_experiments_route.py @@ -11,7 +11,14 @@ class TestExperimentsRoute: assert response.status_code == 200 assert "Autoresearch" in response.text - def test_experiments_page_shows_disabled_when_off(self, client): + @patch("dashboard.routes.experiments.settings") + def test_experiments_page_shows_disabled_when_off(self, mock_settings, client): + mock_settings.autoresearch_enabled = False + mock_settings.autoresearch_metric = "perplexity" + mock_settings.autoresearch_time_budget = 300 + mock_settings.autoresearch_max_iterations = 10 + mock_settings.repo_root = "/tmp" + mock_settings.autoresearch_workspace = "test-experiments" response = client.get("/experiments") assert response.status_code == 200 assert "disabled" in response.text.lower() or "Disabled" in response.text diff --git a/tests/dashboard/test_input_validation.py b/tests/dashboard/test_input_validation.py deleted file mode 100644 index 0e31a525..00000000 --- a/tests/dashboard/test_input_validation.py +++ /dev/null @@ -1,100 +0,0 @@ -import pytest -from fastapi.testclient import TestClient - -from dashboard.app import app - - -@pytest.fixture -def client(): - return TestClient(app) - - -def test_agents_chat_empty_message_validation(client): - """Verify that empty messages are rejected.""" - # First get a CSRF token - get_resp = client.get("/agents/default/panel") - csrf_token = get_resp.cookies.get("csrf_token") - - response = client.post( - "/agents/default/chat", - data={"message": ""}, - headers={"X-CSRF-Token": csrf_token} if csrf_token else {}, - ) - # Empty message should be rejected - assert response.status_code in [400, 422] - - -def test_agents_chat_oversized_message_validation(client): - """Verify that oversized messages are rejected.""" - # First get a CSRF token - get_resp = client.get("/agents/default/panel") - csrf_token = get_resp.cookies.get("csrf_token") - - # Create a message that's too large (e.g., 100KB) - large_message = "x" * (100 * 1024) - response = client.post( - "/agents/default/chat", - data={"message": large_message}, - headers={"X-CSRF-Token": csrf_token} if csrf_token else {}, - ) - # Should reject or handle gracefully - assert response.status_code in [200, 413, 422] - - -def test_memory_search_empty_query_validation(client): - """Verify that empty search queries are handled.""" - # First get a CSRF token - get_resp = client.get("/memory") - csrf_token = get_resp.cookies.get("csrf_token") - - response = client.post( - "/memory/search", - data={"query": ""}, - headers={"X-CSRF-Token": csrf_token} if csrf_token else {}, - ) - assert response.status_code in [400, 422, 500] # 500 for missing template - - -def test_memory_search_oversized_query_validation(client): - """Verify that oversized search queries are rejected.""" - # First get a CSRF token - get_resp = client.get("/memory") - csrf_token = get_resp.cookies.get("csrf_token") - - large_query = "x" * (50 * 1024) - response = client.post( - "/memory/search", - data={"query": large_query}, - headers={"X-CSRF-Token": csrf_token} if csrf_token else {}, - ) - assert response.status_code in [200, 413, 422, 500] # 500 for missing template - - -def test_memory_fact_empty_fact_validation(client): - """Verify that empty facts are rejected.""" - # First get a CSRF token - get_resp = client.get("/memory") - csrf_token = get_resp.cookies.get("csrf_token") - - response = client.post( - "/memory/fact", - data={"fact": ""}, - headers={"X-CSRF-Token": csrf_token} if csrf_token else {}, - ) - # Empty fact should be rejected - assert response.status_code in [400, 422, 500] # 500 for missing template - - -def test_memory_fact_oversized_fact_validation(client): - """Verify that oversized facts are rejected.""" - # First get a CSRF token - get_resp = client.get("/memory") - csrf_token = get_resp.cookies.get("csrf_token") - - large_fact = "x" * (100 * 1024) - response = client.post( - "/memory/fact", - data={"fact": large_fact}, - headers={"X-CSRF-Token": csrf_token} if csrf_token else {}, - ) - assert response.status_code in [200, 413, 422, 500] # 500 for missing template diff --git a/tests/dashboard/test_mobile_scenarios.py b/tests/dashboard/test_mobile_scenarios.py index 1b94b447..08e30680 100644 --- a/tests/dashboard/test_mobile_scenarios.py +++ b/tests/dashboard/test_mobile_scenarios.py @@ -93,12 +93,6 @@ def test_M201_send_button_min_height_44px(): assert "min-height: 44px" in css -def test_M202_input_min_height_44px(): - """Chat input must meet 44 px touch target height on mobile.""" - css = _css() - assert "min-height: 44px" in css - - def test_M203_send_button_min_width_64px(): """Send button needs sufficient width so it isn't accidentally missed.""" css = _css() diff --git a/tests/dashboard/test_security_headers.py b/tests/dashboard/test_security_headers.py index adf9af5f..59114349 100644 --- a/tests/dashboard/test_security_headers.py +++ b/tests/dashboard/test_security_headers.py @@ -41,16 +41,6 @@ def test_csp_header_content(client: TestClient): assert "frame-ancestors 'self'" in csp -def test_cors_headers_restricted(client: TestClient): - """Test that CORS is properly restricted (not allow-origins: *).""" - response = client.get("/") - - # Should not have overly permissive CORS - # (The actual CORS headers depend on the origin of the request, - # so we just verify the app doesn't crash with permissive settings) - assert response.status_code == 200 - - def test_health_endpoint_has_security_headers(client: TestClient): """Test that security headers are present on all endpoints.""" response = client.get("/health") diff --git a/tests/functional/test_cli.py b/tests/functional/test_cli.py index 59f21478..0f4c59ec 100644 --- a/tests/functional/test_cli.py +++ b/tests/functional/test_cli.py @@ -12,11 +12,6 @@ user scenario we want to verify. class TestTimmyCLI: """Tests the `timmy` command (chat, think, status).""" - def test_status_runs(self, timmy_runner): - runner, app = timmy_runner - result = runner.invoke(app, ["status"]) - assert result.exit_code is not None - def test_chat_requires_message(self, timmy_runner): runner, app = timmy_runner result = runner.invoke(app, ["chat"]) @@ -29,11 +24,6 @@ class TestTimmyCLI: assert result.exit_code != 0 assert "Missing argument" in result.output or "Usage" in result.output - def test_chat_with_message_runs(self, timmy_runner): - runner, app = timmy_runner - result = runner.invoke(app, ["chat", "hello"]) - assert result.exit_code is not None - def test_help_text(self, timmy_runner): runner, app = timmy_runner result = runner.invoke(app, ["--help"]) diff --git a/tests/integrations/test_paperclip_client.py b/tests/integrations/test_paperclip_client.py index 0bfdf82a..daa0f7a2 100644 --- a/tests/integrations/test_paperclip_client.py +++ b/tests/integrations/test_paperclip_client.py @@ -130,39 +130,6 @@ async def test_create_goal(client): assert goal is not None -# ── wake agent ─────────────────────────────────────────────────────────────── - - -async def test_wake_agent(client): - raw = {"status": "queued"} - with patch.object(client, "_post", new_callable=AsyncMock, return_value=raw): - result = await client.wake_agent("a1", issue_id="i1") - assert result == {"status": "queued"} - - -async def test_wake_agent_failure(client): - with patch.object(client, "_post", new_callable=AsyncMock, return_value=None): - result = await client.wake_agent("a1") - assert result is None - - -# ── approvals ──────────────────────────────────────────────────────────────── - - -async def test_approve(client): - raw = {"status": "approved"} - with patch.object(client, "_post", new_callable=AsyncMock, return_value=raw): - result = await client.approve("ap1", comment="LGTM") - assert result is not None - - -async def test_reject(client): - raw = {"status": "rejected"} - with patch.object(client, "_post", new_callable=AsyncMock, return_value=raw): - result = await client.reject("ap1", comment="Needs work") - assert result is not None - - # ── heartbeat runs ─────────────────────────────────────────────────────────── @@ -171,10 +138,3 @@ async def test_list_heartbeat_runs(client): with patch.object(client, "_get", new_callable=AsyncMock, return_value=raw): runs = await client.list_heartbeat_runs(company_id="comp-1") assert len(runs) == 1 - - -async def test_cancel_run(client): - raw = {"status": "cancelled"} - with patch.object(client, "_post", new_callable=AsyncMock, return_value=raw): - result = await client.cancel_run("r1") - assert result is not None diff --git a/tests/security/test_security_regression.py b/tests/security/test_security_regression.py deleted file mode 100644 index eccd803b..00000000 --- a/tests/security/test_security_regression.py +++ /dev/null @@ -1,14 +0,0 @@ -def test_xss_protection_in_templates(): - """Verify that templates now use the escape filter for user-controlled content.""" - templates_to_check = [ - ("src/dashboard/templates/partials/chat_message.html", "{{ user_message | e }}"), - ("src/dashboard/templates/partials/history.html", "{{ msg.content | e }}"), - ("src/dashboard/templates/briefing.html", "{{ briefing.summary | e }}"), - ("src/dashboard/templates/partials/approval_card_single.html", "{{ item.title | e }}"), - ("src/dashboard/templates/marketplace.html", "{{ agent.name | e }}"), - ] - - for path, expected_snippet in templates_to_check: - with open(path) as f: - content = f.read() - assert expected_snippet in content, f"XSS fix missing in {path}" diff --git a/tests/test_setup_script.py b/tests/test_setup_script.py deleted file mode 100644 index 9c316c8e..00000000 --- a/tests/test_setup_script.py +++ /dev/null @@ -1,90 +0,0 @@ -import shutil -import subprocess -from pathlib import Path - -import pytest - -# Constants for testing -TEST_PROJECT_DIR = Path("/home/ubuntu/test-sovereign-stack") -TEST_VAULT_DIR = TEST_PROJECT_DIR / "TimmyVault" -SETUP_SCRIPT_PATH = Path("/home/ubuntu/setup_timmy.sh") - -pytestmark = pytest.mark.skipif( - not SETUP_SCRIPT_PATH.exists(), - reason=f"Setup script not found at {SETUP_SCRIPT_PATH}", -) - - -@pytest.fixture(scope="module", autouse=True) -def cleanup_test_env(): - """Ensure a clean environment before and after tests.""" - if TEST_PROJECT_DIR.exists(): - shutil.rmtree(TEST_PROJECT_DIR) - yield - # We keep the test env for manual inspection if needed, or cleanup - # shutil.rmtree(TEST_PROJECT_DIR) - - -def run_setup_command(args): - """Helper to run the setup script with arguments.""" - result = subprocess.run( - [str(SETUP_SCRIPT_PATH)] + args, capture_output=True, text=True, cwd="/home/ubuntu" - ) - return result - - -def test_setup_install_creates_directories(): - """Test that './setup_timmy.sh install' creates the expected directory structure.""" - # Note: We expect the script to be present at SETUP_SCRIPT_PATH - assert SETUP_SCRIPT_PATH.exists(), "Setup script must exist before testing" - - result = run_setup_command(["install"]) - - # Check if command succeeded - assert result.returncode == 0, f"Setup install failed: {result.stderr}" - - # Check directory structure - assert TEST_PROJECT_DIR.exists() - assert (TEST_PROJECT_DIR / "paperclip").exists() - assert (TEST_PROJECT_DIR / "agents/hello-timmy").exists() - assert TEST_VAULT_DIR.exists() - assert (TEST_PROJECT_DIR / "logs").exists() - - -def test_setup_install_creates_files(): - """Test that './setup_timmy.sh install' creates the expected configuration and notes.""" - # Check Agent config - agent_toml = TEST_PROJECT_DIR / "agents/hello-timmy/agent.toml" - assert agent_toml.exists() - with open(agent_toml) as f: - content = f.read() - assert 'name = "hello-timmy"' in content - - # Check Obsidian notes - hello_note = TEST_VAULT_DIR / "Hello World.md" - soul_note = TEST_VAULT_DIR / "SOUL.md" - assert hello_note.exists() - assert soul_note.exists() - - with open(soul_note) as f: - content = f.read() - assert "I am Timmy" in content - - -def test_setup_install_dependencies(): - """Test that dependencies are correctly handled (OpenFang, Paperclip deps).""" - # Check if Paperclip node_modules exists (implies pnpm install ran) - # Note: In a real TDD we might mock pnpm, but here we want to verify the actual setup - node_modules = TEST_PROJECT_DIR / "paperclip/node_modules" - assert node_modules.exists() - - -def test_setup_start_stop_logic(): - """Test the start/stop command logic (simulated).""" - # This is harder to test fully without actually running the services, - # but we can check if the script handles the commands without crashing. - - # Mocking start (it might fail if ports are taken, so we check return code) - # For the sake of this test, we just check if the script recognizes the command - result = run_setup_command(["status"]) - assert "Status" in result.stdout or result.returncode == 0 diff --git a/tests/timmy/test_agent.py b/tests/timmy/test_agent.py index a2e0aa3f..bdb0a0ee 100644 --- a/tests/timmy/test_agent.py +++ b/tests/timmy/test_agent.py @@ -1,54 +1,6 @@ from unittest.mock import MagicMock, patch -def test_create_timmy_returns_agent(): - """create_timmy should delegate to Agno Agent with correct config.""" - with ( - patch("timmy.agent.Agent") as MockAgent, - patch("timmy.agent.Ollama"), - patch("timmy.agent.SqliteDb"), - ): - mock_instance = MagicMock() - MockAgent.return_value = mock_instance - - from timmy.agent import create_timmy - - result = create_timmy() - - assert result is mock_instance - MockAgent.assert_called_once() - - -def test_create_timmy_agent_name(): - with ( - patch("timmy.agent.Agent") as MockAgent, - patch("timmy.agent.Ollama"), - patch("timmy.agent.SqliteDb"), - ): - from timmy.agent import create_timmy - - create_timmy() - - kwargs = MockAgent.call_args.kwargs - assert kwargs["name"] == "Agent" - - -def test_create_timmy_history_config(): - with ( - patch("timmy.agent.Agent") as MockAgent, - patch("timmy.agent.Ollama"), - patch("timmy.agent.SqliteDb"), - ): - from timmy.agent import create_timmy - - create_timmy() - - kwargs = MockAgent.call_args.kwargs - assert kwargs["add_history_to_context"] is True - assert kwargs["num_history_runs"] == 20 - assert kwargs["markdown"] is True - - def test_create_timmy_custom_db_file(): with ( patch("timmy.agent.Agent"), diff --git a/tests/timmy/test_autoresearch_perplexity.py b/tests/timmy/test_autoresearch_perplexity.py index 4a4504a8..bf4b5865 100644 --- a/tests/timmy/test_autoresearch_perplexity.py +++ b/tests/timmy/test_autoresearch_perplexity.py @@ -1,94 +1,17 @@ -"""Test plan for using the autoresearch module with perplexity as the target metric. +"""Tests for using the autoresearch module with perplexity as the target metric. -Perplexity is a standard LM evaluation metric (lower = better), so the existing -evaluate_result direction logic (lower-is-better) is correct without changes. +Covers run integration, config override, history, and dashboard rendering when +`autoresearch_metric` is set to ``perplexity``. -The tests below verify every layer of the stack — metric extraction, evaluation, -run integration, config override, tool wiring, and dashboard rendering — works -correctly when `autoresearch_metric` is set to ``perplexity``. +Note: metric extraction and evaluation logic are already tested in +test_autoresearch.py — only perplexity-specific integration paths are tested here. """ from unittest.mock import MagicMock, patch import pytest -# ── 1. Metric extraction ──────────────────────────────────────────────── - - -class TestExtractPerplexity: - """_extract_metric must find 'perplexity' values in training output.""" - - def test_extracts_perplexity_value(self): - from timmy.autoresearch import _extract_metric - - output = "step 500 perplexity: 42.31\nstep 1000 perplexity: 38.05" - assert _extract_metric(output, "perplexity") == pytest.approx(38.05) - - def test_extracts_last_occurrence(self): - from timmy.autoresearch import _extract_metric - - output = "perplexity: 100.0\nperplexity: 80.5\nperplexity: 55.2\n" - assert _extract_metric(output, "perplexity") == pytest.approx(55.2) - - def test_handles_integer_perplexity(self): - from timmy.autoresearch import _extract_metric - - output = "perplexity: 42" - assert _extract_metric(output, "perplexity") == pytest.approx(42.0) - - def test_handles_space_separator(self): - """Some training scripts use 'perplexity 38.5' without a colon.""" - from timmy.autoresearch import _extract_metric - - output = "perplexity 38.5" - assert _extract_metric(output, "perplexity") == pytest.approx(38.5) - - def test_returns_none_when_absent(self): - from timmy.autoresearch import _extract_metric - - assert _extract_metric("loss: 0.45", "perplexity") is None - - def test_ignores_unrelated_numbers(self): - from timmy.autoresearch import _extract_metric - - output = "step 500 lr 0.001 loss 2.3\nperplexity: 50.1" - assert _extract_metric(output, "perplexity") == pytest.approx(50.1) - - -# ── 2. Evaluation with perplexity ─────────────────────────────────────── - - -class TestEvaluatePerplexity: - """evaluate_result should treat lower perplexity as an improvement.""" - - def test_lower_is_improvement(self): - from timmy.autoresearch import evaluate_result - - result = evaluate_result(35.0, 42.0, metric_name="perplexity") - assert "improvement" in result.lower() - assert "perplexity" in result.lower() - - def test_higher_is_regression(self): - from timmy.autoresearch import evaluate_result - - result = evaluate_result(50.0, 42.0, metric_name="perplexity") - assert "regression" in result.lower() - - def test_equal_is_no_change(self): - from timmy.autoresearch import evaluate_result - - result = evaluate_result(42.0, 42.0, metric_name="perplexity") - assert "no change" in result.lower() - - def test_percentage_is_correct(self): - from timmy.autoresearch import evaluate_result - - # 40 -> 30 is a -25% change - result = evaluate_result(30.0, 40.0, metric_name="perplexity") - assert "-25.00%" in result - - -# ── 3. run_experiment with perplexity ─────────────────────────────────── +# ── run_experiment with perplexity ────────────────────────────────────── class TestRunExperimentPerplexity: @@ -230,7 +153,8 @@ class TestExperimentsRoutePerplexity: except ImportError: pytest.skip("pydantic_settings not installed") - original = settings.autoresearch_metric + original_metric = settings.autoresearch_metric + original_enabled = settings.autoresearch_enabled try: settings.autoresearch_metric = "perplexity" settings.autoresearch_enabled = True @@ -240,4 +164,5 @@ class TestExperimentsRoutePerplexity: assert resp.status_code == 200 finally: - settings.autoresearch_metric = original + settings.autoresearch_metric = original_metric + settings.autoresearch_enabled = original_enabled diff --git a/tests/timmy/test_timmy_tools.py b/tests/timmy/test_timmy_tools.py index d7c1cada..31f4d260 100644 --- a/tests/timmy/test_timmy_tools.py +++ b/tests/timmy/test_timmy_tools.py @@ -189,25 +189,6 @@ class TestToolCatalog: class TestAiderTool: """Test the Aider AI coding assistant tool.""" - def test_aider_tool_responds_to_simple_prompt(self): - """Test Aider tool can respond to a simple prompt. - - This is a smoke test - we just verify it returns something. - """ - from pathlib import Path - - from timmy.tools import create_aider_tool - - tool = create_aider_tool(Path.cwd()) - - # Call with a simple prompt - should return something (even if error) - result = tool.run_aider("what is 2+2", model="qwen2.5:14b") - - # Should get a response (either success or error message) - assert result is not None - assert isinstance(result, str) - assert len(result) > 0 - def test_aider_in_tool_catalog(self): """Verify Aider appears in the tool catalog.""" catalog = get_all_available_tools() diff --git a/tests/timmy/test_tools_extended.py b/tests/timmy/test_tools_extended.py index 0d742065..34c75720 100644 --- a/tests/timmy/test_tools_extended.py +++ b/tests/timmy/test_tools_extended.py @@ -1,75 +1,15 @@ -"""Extended tests for timmy.tools — covers tool tracking, stats, and create_* functions.""" +"""Extended tests for timmy.tools — covers stats, type aliases, and aider tool.""" from unittest.mock import MagicMock, patch -import pytest - from timmy.tools import ( - _TOOL_USAGE, AgentTools, PersonaTools, ToolStats, - _track_tool_usage, - calculator, create_aider_tool, - get_tool_stats, ) -class TestToolTracking: - """Test _track_tool_usage and get_tool_stats.""" - - def setup_method(self): - _TOOL_USAGE.clear() - - def test_track_tool_usage(self): - _track_tool_usage("agent-1", "web_search") - assert "agent-1" in _TOOL_USAGE - assert len(_TOOL_USAGE["agent-1"]) == 1 - assert _TOOL_USAGE["agent-1"][0]["tool"] == "web_search" - assert _TOOL_USAGE["agent-1"][0]["success"] is True - - def test_track_multiple_calls(self): - _track_tool_usage("agent-1", "tool_a") - _track_tool_usage("agent-1", "tool_b") - _track_tool_usage("agent-1", "tool_a", success=False) - assert len(_TOOL_USAGE["agent-1"]) == 3 - - def test_get_tool_stats_specific_agent(self): - _track_tool_usage("agent-x", "read_file") - _track_tool_usage("agent-x", "write_file") - - stats = get_tool_stats("agent-x") - assert stats["agent_id"] == "agent-x" - assert stats["total_calls"] == 2 - assert set(stats["tools_used"]) == {"read_file", "write_file"} - - def test_get_tool_stats_no_data(self): - stats = get_tool_stats("nonexistent") - assert stats["total_calls"] == 0 - assert stats["tools_used"] == [] - - def test_get_tool_stats_all_agents(self): - _track_tool_usage("a1", "t1") - _track_tool_usage("a2", "t2") - _track_tool_usage("a2", "t3") - - stats = get_tool_stats() - assert "a1" in stats - assert stats["a1"]["total_calls"] == 1 - assert stats["a2"]["total_calls"] == 2 - - def test_recent_calls_capped_at_10(self): - for i in range(15): - _track_tool_usage("agent-y", f"tool_{i}") - - stats = get_tool_stats("agent-y") - assert len(stats["recent_calls"]) == 10 - - def teardown_method(self): - _TOOL_USAGE.clear() - - class TestToolStats: """Test ToolStats dataclass.""" @@ -87,81 +27,6 @@ class TestAgentTools: assert PersonaTools is AgentTools -class TestCalculatorExtended: - """Extended tests for the calculator tool.""" - - def test_division(self): - assert calculator("10 / 3") == str(10 / 3) - - def test_exponents(self): - assert calculator("2**10") == "1024" - - def test_math_functions(self): - import math - - assert calculator("math.sqrt(144)") == "12.0" - assert calculator("math.pi") == str(math.pi) - assert calculator("math.log(100, 10)") == str(math.log(100, 10)) - - def test_builtins_blocked(self): - result = calculator("__import__('os').system('ls')") - assert "Error" in result - - def test_abs_allowed(self): - assert calculator("abs(-5)") == "5" - - def test_round_allowed(self): - assert calculator("round(3.14159, 2)") == "3.14" - - def test_min_max_allowed(self): - assert calculator("min(1, 2, 3)") == "1" - assert calculator("max(1, 2, 3)") == "3" - - def test_invalid_expression(self): - result = calculator("not valid python") - assert "Error" in result - - def test_division_by_zero(self): - result = calculator("1/0") - assert "Error" in result - - -class TestCreateToolFunctions: - """Test that create_*_tools functions check availability.""" - - def test_create_research_tools_no_agno(self): - with patch("timmy.tools._AGNO_TOOLS_AVAILABLE", False): - with patch("timmy.tools._ImportError", ImportError("no agno")): - with pytest.raises(ImportError): - from timmy.tools import create_research_tools - - create_research_tools() - - def test_create_code_tools_no_agno(self): - with patch("timmy.tools._AGNO_TOOLS_AVAILABLE", False): - with patch("timmy.tools._ImportError", ImportError("no agno")): - with pytest.raises(ImportError): - from timmy.tools import create_code_tools - - create_code_tools() - - def test_create_data_tools_no_agno(self): - with patch("timmy.tools._AGNO_TOOLS_AVAILABLE", False): - with patch("timmy.tools._ImportError", ImportError("no agno")): - with pytest.raises(ImportError): - from timmy.tools import create_data_tools - - create_data_tools() - - def test_create_writing_tools_no_agno(self): - with patch("timmy.tools._AGNO_TOOLS_AVAILABLE", False): - with patch("timmy.tools._ImportError", ImportError("no agno")): - with pytest.raises(ImportError): - from timmy.tools import create_writing_tools - - create_writing_tools() - - class TestAiderTool: """Test AiderTool created by create_aider_tool."""