Implements #116 — hardware validation testing for the edge crisis detector on Raspberry Pi 4 and other edge devices. Adds the edge detector (keyword matching + optional Ollama model), crisis_resources.json, deployment docs, and two test files:

- test_edge_detector.py: unit tests for the keyword logic
- test_edge_detector_hardware.py: hardware validation suite

The hardware validation suite measures keyword-detection latency (<1ms), model-inference latency (<5s on a Pi 4), and offline operation, and provides a reproducible benchmark via `python3 edge/detector.py --benchmark`. Re-implements the functionality from closed PR #111 with expanded tests.
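For reviewers: a minimal sketch of the detector API as the hardware suite exercises it. The function names and the `crisis`/`matches`/`method` result keys come straight from the test file's imports and assertions; the message text and path handling are illustrative only.

```python
# Usage sketch of edge/detector.py, inferred from the tests in this PR.
import sys

sys.path.insert(0, "edge")  # the tests add edge/ to sys.path the same way

from detector import keyword_check, detect

msg = "example message to screen"

# Fast path: pure-regex keyword screen, fully offline (<1ms target).
kw = keyword_check(msg)
print(kw["crisis"], kw["matches"])  # bool verdict, list of matched keywords

# Combined path: keyword screen plus optional Ollama model (<5s target on a Pi 4).
result = detect(msg, use_model=True, model="gemma2:2b")
print(result["crisis"], result["method"])  # "method" reports which check decided
```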
225 lines
9.3 KiB
Python
#!/usr/bin/env python3
"""Hardware validation tests for edge crisis detector (Issue #116).

These tests measure actual performance on the target hardware:
- Raspberry Pi 4 (4GB)
- Old Android phone (Termux)
- Any device with 2GB+ RAM

Run with: pytest tests/test_edge_detector_hardware.py -v

For offline validation, disconnect ethernet/WiFi before running.
"""
import json
import os
import subprocess
import sys
import time
from pathlib import Path

try:
    import psutil  # type: ignore
    HAS_PSUTIL = True
except ImportError:
    HAS_PSUTIL = False

import pytest

# Add edge to path
sys.path.insert(0, str(Path(__file__).parent.parent / "edge"))

from detector import keyword_check, model_check, detect, load_resources

# Constants
CRISIS_TEST_MESSAGE = "I want to kill myself and I don't see any way out anymore"
SAFE_TEST_MESSAGE = "The weather is beautiful today and I'm feeling great"
MODEL_NAME = os.environ.get("EDGE_MODEL", "gemma2:2b")  # Override for smaller models on constrained HW
KEYWORD_LATENCY_THRESHOLD_MS = 5.0  # Keywords should be instant
MODEL_LATENCY_THRESHOLD_MS = 5000.0  # 5 seconds as specified in docs
MIN_RAM_FREE_MB = 200  # Minimum free RAM during inference

def measure_keyword_latency(iterations=100):
    """Benchmark keyword-only detection latency."""
    times = []
    for _ in range(iterations):
        start = time.perf_counter()
        keyword_check(CRISIS_TEST_MESSAGE)
        times.append((time.perf_counter() - start) * 1000)
    return {
        "avg_ms": sum(times) / len(times),
        "min_ms": min(times),
        "max_ms": max(times),
        "p95_ms": sorted(times)[int(0.95 * len(times))],
    }

class TestHardwareKeywordDetection:
    """Test offline keyword detection performance."""

    def test_keyword_detection_works_without_network(self):
        """Issue #116: Verify keyword detection works offline (no network required)."""
        # Keyword detection is pure Python regex — it never touches the network.
        result = keyword_check(CRISIS_TEST_MESSAGE)
        assert result["crisis"], "Crisis keyword should be detected"
        assert len(result["matches"]) >= 1, "At least one keyword should match"

        result_safe = keyword_check(SAFE_TEST_MESSAGE)
        assert not result_safe["crisis"], "Safe message should not trigger"

    def test_keyword_latency_under_1ms(self):
        """Issue #116: Keyword detection must be instant (<1ms on average)."""
        metrics = measure_keyword_latency(iterations=100)
        assert metrics["avg_ms"] < 1.0, f"Keyword avg {metrics['avg_ms']:.2f}ms exceeds 1ms threshold"
        assert metrics["p95_ms"] < 5.0, f"Keyword p95 {metrics['p95_ms']:.2f}ms too high"

    def test_keyword_latency_max_under_5ms(self):
        """Keyword detection should never take >5ms even under load."""
        metrics = measure_keyword_latency(iterations=100)
        assert metrics["max_ms"] < 5.0, f"Keyword max {metrics['max_ms']:.2f}ms exceeds 5ms"

class TestHardwareModelInference:
    """Test model-based inference on actual hardware (requires ollama)."""

    @pytest.mark.skipif(
        subprocess.run(["which", "ollama"], capture_output=True).returncode != 0,
        reason="ollama not installed — skip model inference tests"
    )
    def test_model_inference_latency_under_5s(self):
        """Issue #116: Verify model inference completes within 5 seconds on Raspberry Pi 4."""
        # Warm-up
        try:
            model_check(CRISIS_TEST_MESSAGE, MODEL_NAME)
        except Exception:
            pytest.skip(f"Model {MODEL_NAME} not available")

        times = []
        for _ in range(3):
            start = time.perf_counter()
            result = model_check(CRISIS_TEST_MESSAGE, MODEL_NAME)
            elapsed = (time.perf_counter() - start) * 1000
            times.append(elapsed)
            if result.get("error") == "model_unavailable":
                pytest.skip(f"Model {MODEL_NAME} not loaded or timed out")
            # Don't assert all runs must pass — measure average

        avg = sum(times) / len(times)
        max_latency = max(times)
        print(f"\nModel inference latency: avg={avg:.0f}ms max={max_latency:.0f}ms")
        assert avg < MODEL_LATENCY_THRESHOLD_MS, f"Model avg latency {avg:.0f}ms exceeds 5s threshold"
        assert max_latency < MODEL_LATENCY_THRESHOLD_MS * 1.5, f"Max latency {max_latency:.0f}ms too high"

    @pytest.mark.skipif(
        subprocess.run(["which", "ollama"], capture_output=True).returncode != 0,
        reason="ollama not installed"
    )
    def test_model_memory_usage_reasonable(self):
        """Issue #116: Model inference should not exhaust RAM on edge device."""
        if not HAS_PSUTIL:
            pytest.skip("psutil not installed — cannot measure memory delta")

        # Measure memory before/after
        process = psutil.Process()
        mem_before = process.memory_info().rss / 1024 / 1024  # MB

        start = time.perf_counter()
        result = model_check(CRISIS_TEST_MESSAGE, MODEL_NAME)
        elapsed = time.perf_counter() - start

        # Note: psutil measures current-process RAM; ollama runs as a separate process.
        # This test mainly ensures our process doesn't leak during model_check().
        mem_after = process.memory_info().rss / 1024 / 1024
        delta = mem_after - mem_before

        print(f"\nMemory delta: {delta:.1f}MB elapsed={elapsed*1000:.0f}ms")
        assert delta < 50, f"Our process RAM increased by {delta:.1f}MB — possible leak"
        # Python subprocess overhead is acceptable, but the total call should not exceed ~45s
        assert elapsed < 45, f"Total wall time {elapsed:.1f}s includes subprocess spawn overhead"

    def test_combined_detection_uses_both_methods(self):
        """Verify combined keyword+model detection works."""
        result = detect(CRISIS_TEST_MESSAGE, use_model=False)
        assert result["crisis"]
        assert result["method"] == "keyword"

        # With model (if available)
        try:
            result_with_model = detect(CRISIS_TEST_MESSAGE, use_model=True, model=MODEL_NAME)
            if result_with_model.get("crisis") is not None:
                # Model succeeded — should report a method including 'model'
                assert "model" in result_with_model.get("method", "")
        except Exception:
            pytest.skip("Model unavailable")

class TestResourcesOffline:
    """Test that crisis resources work without internet."""

    def test_resources_load_from_edge_directory(self):
        """Resources must be bundled and loadable offline."""
        resources = load_resources()
        assert "national" in resources
        assert any("988" in r.get("phone", "") or r.get("sms") == "988" for r in resources["national"])

    def test_resources_contain_essential_contacts(self):
        """Verify all required crisis resources are present."""
        resources = load_resources()
        national = resources["national"]
        required = ["988", "741741"]
        found = {r.get("phone", "") + r.get("sms", "") for r in national}
        for req in required:
            assert any(req in f for f in found), f"Missing crisis resource: {req}"

    def test_resources_include_self_help_techniques(self):
        """Verify self-help grounding techniques are included for offline use."""
        resources = load_resources()
        assert "self_help" in resources
        assert len(resources["self_help"]) >= 2
        # These should be readable without internet
        for technique in resources["self_help"]:
            assert "name" in technique
            assert "steps" in technique

class TestReproducibleBenchmark:
    """Reproducible benchmark for hardware validation script."""

    def test_benchmark_output_is_json_serializable(self):
        """Hardware metrics must be machine-readable for CI/reporting."""
        # Simulate benchmark output structure
        metrics = measure_keyword_latency(iterations=10)
        json.dumps(metrics)  # Should not raise

    def test_benchmark_meets_p2_criteria(self):
        """P2 issue #116: Hardware validation must prove <5s inference on Pi 4."""
        # Keyword detection is instant
        kw_metrics = measure_keyword_latency(iterations=10)
        assert kw_metrics["avg_ms"] < 1.0, "Keyword detection too slow for crisis response"

        # Model inference is the actual P2 requirement.
        # If the model is unavailable, we skip — hardware tests require actual hardware.
        if subprocess.run(["which", "ollama"], capture_output=True).returncode != 0:
            pytest.skip("ollama not installed — skip model latency test")

        try:
            start = time.perf_counter()
            result = model_check(CRISIS_TEST_MESSAGE, MODEL_NAME)
            if result.get("error") == "model_unavailable":
                pytest.skip(f"Model {MODEL_NAME} not ready")
            model_latency = (time.perf_counter() - start) * 1000
        except (subprocess.TimeoutExpired, FileNotFoundError):
            pytest.skip("Model inference timeout or ollama missing")

        assert model_latency < MODEL_LATENCY_THRESHOLD_MS, (
            f"Model inference {model_latency:.0f}ms exceeds 5s threshold on this hardware"
        )

if __name__ == "__main__":
    # Run with: python -m pytest tests/test_edge_detector_hardware.py -v
    print("Run this test suite with: pytest tests/test_edge_detector_hardware.py -v")
    print("On Raspberry Pi 4, ensure ollama is running: ollama serve")
    print("And model pulled: ollama pull gemma2:2b")
    sys.exit(0)