Implements #116 — hardware validation testing for the edge crisis detector on Raspberry Pi 4 and other edge devices. Adds the edge detector (keyword matching + optional Ollama model), crisis_resources.json, deployment docs, and two test files:

- test_edge_detector.py: unit tests for the keyword logic
- test_edge_detector_hardware.py: hardware validation suite

The hardware validation suite measures keyword-detection latency (<1ms), model-inference latency (<5s on a Pi 4), and offline operation, and provides a reproducible benchmark via `python3 edge/detector.py --benchmark`. Re-implements the functionality from closed PR #111 with expanded tests.
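For reviewers: a minimal sketch of the detector API as the hardware suite exercises it. The function names and the `crisis`/`matches`/`method` result keys come straight from the test file's imports and assertions; the message text and path handling are illustrative only.

```python
# Usage sketch of edge/detector.py, inferred from the tests in this PR.
import sys

sys.path.insert(0, "edge")  # the tests add edge/ to sys.path the same way

from detector import keyword_check, detect

msg = "example message to screen"

# Fast path: pure-regex keyword screen, fully offline (<1ms target).
kw = keyword_check(msg)
print(kw["crisis"], kw["matches"])  # bool verdict, list of matched keywords

# Combined path: keyword screen plus optional Ollama model (<5s target on a Pi 4).
result = detect(msg, use_model=True, model="gemma2:2b")
print(result["crisis"], result["method"])  # "method" reports which check decided
```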
225 lines
9.3 KiB
Python
#!/usr/bin/env python3
"""Hardware validation tests for edge crisis detector (Issue #116).

These tests measure actual performance on the target hardware:
- Raspberry Pi 4 (4GB)
- Old Android phone (Termux)
- Any device with 2GB+ RAM

Run with: pytest tests/test_edge_detector_hardware.py -v

For offline validation, disconnect ethernet/WiFi before running.
"""
import json
import os
import subprocess
import sys
import time
from pathlib import Path

try:
    import psutil  # type: ignore
    HAS_PSUTIL = True
except ImportError:
    HAS_PSUTIL = False

import pytest

# Add edge to path
sys.path.insert(0, str(Path(__file__).parent.parent / "edge"))

from detector import keyword_check, model_check, detect, load_resources

# Constants
CRISIS_TEST_MESSAGE = "I want to kill myself and I don't see any way out anymore"
SAFE_TEST_MESSAGE = "The weather is beautiful today and I'm feeling great"
MODEL_NAME = os.environ.get("EDGE_MODEL", "gemma2:2b")  # Override for smaller models on constrained HW
KEYWORD_LATENCY_THRESHOLD_MS = 5.0  # Keywords should be instant
MODEL_LATENCY_THRESHOLD_MS = 5000.0  # 5 seconds as specified in docs
MIN_RAM_FREE_MB = 200  # Minimum free RAM during inference

def measure_keyword_latency(iterations=100):
    """Benchmark keyword-only detection latency."""
    times = []
    for _ in range(iterations):
        start = time.perf_counter()
        keyword_check(CRISIS_TEST_MESSAGE)
        times.append((time.perf_counter() - start) * 1000)
    return {
        "avg_ms": sum(times) / len(times),
        "min_ms": min(times),
        "max_ms": max(times),
        "p95_ms": sorted(times)[int(0.95 * len(times))],
    }

class TestHardwareKeywordDetection:
    """Test offline keyword detection performance."""

    def test_keyword_detection_works_without_network(self):
        """Issue #116: Verify keyword detection works offline (no network required)."""
        # Keyword detection is pure Python regex — it never touches the network.
        result = keyword_check(CRISIS_TEST_MESSAGE)
        assert result["crisis"], "Crisis keyword should be detected"
        assert len(result["matches"]) >= 1, "At least one keyword should match"

        result_safe = keyword_check(SAFE_TEST_MESSAGE)
        assert not result_safe["crisis"], "Safe message should not trigger"

    def test_keyword_latency_under_1ms(self):
        """Issue #116: Keyword detection must be instant (<1ms on average)."""
        metrics = measure_keyword_latency(iterations=100)
        assert metrics["avg_ms"] < 1.0, f"Keyword avg {metrics['avg_ms']:.2f}ms exceeds 1ms threshold"
        assert metrics["p95_ms"] < 5.0, f"Keyword p95 {metrics['p95_ms']:.2f}ms too high"

    def test_keyword_latency_max_under_5ms(self):
        """Keyword detection should never take >5ms even under load."""
        metrics = measure_keyword_latency(iterations=100)
        assert metrics["max_ms"] < 5.0, f"Keyword max {metrics['max_ms']:.2f}ms exceeds 5ms"

class TestHardwareModelInference:
    """Test model-based inference on actual hardware (requires ollama)."""

    @pytest.mark.skipif(
        subprocess.run(["which", "ollama"], capture_output=True).returncode != 0,
        reason="ollama not installed — skip model inference tests"
    )
    def test_model_inference_latency_under_5s(self):
        """Issue #116: Verify model inference completes within 5 seconds on Raspberry Pi 4."""
        # Warm-up
        try:
            model_check(CRISIS_TEST_MESSAGE, MODEL_NAME)
        except Exception:
            pytest.skip(f"Model {MODEL_NAME} not available")

        times = []
        for _ in range(3):
            start = time.perf_counter()
            result = model_check(CRISIS_TEST_MESSAGE, MODEL_NAME)
            elapsed = (time.perf_counter() - start) * 1000
            times.append(elapsed)
            if result.get("error") == "model_unavailable":
                pytest.skip(f"Model {MODEL_NAME} not loaded or timed out")
            # Don't assert all runs must pass — measure average

        avg = sum(times) / len(times)
        max_latency = max(times)
        print(f"\nModel inference latency: avg={avg:.0f}ms max={max_latency:.0f}ms")
        assert avg < MODEL_LATENCY_THRESHOLD_MS, f"Model avg latency {avg:.0f}ms exceeds 5s threshold"
        assert max_latency < MODEL_LATENCY_THRESHOLD_MS * 1.5, f"Max latency {max_latency:.0f}ms too high"

    @pytest.mark.skipif(
        subprocess.run(["which", "ollama"], capture_output=True).returncode != 0,
        reason="ollama not installed"
    )
    def test_model_memory_usage_reasonable(self):
        """Issue #116: Model inference should not exhaust RAM on edge device."""
        if not HAS_PSUTIL:
            pytest.skip("psutil not installed — cannot measure memory delta")

        # Measure memory before/after
        process = psutil.Process()
        mem_before = process.memory_info().rss / 1024 / 1024  # MB

        start = time.perf_counter()
        result = model_check(CRISIS_TEST_MESSAGE, MODEL_NAME)
        elapsed = time.perf_counter() - start

        # Note: psutil measures current-process RAM; ollama runs as a separate process.
        # This test mainly ensures our process doesn't leak during model_check().
        mem_after = process.memory_info().rss / 1024 / 1024
        delta = mem_after - mem_before

        print(f"\nMemory delta: {delta:.1f}MB elapsed={elapsed*1000:.0f}ms")
        assert delta < 50, f"Our process RAM increased by {delta:.1f}MB — possible leak"
        # Python subprocess overhead is acceptable, but the total call should not exceed ~45s
        assert elapsed < 45, f"Total wall time {elapsed:.1f}s includes subprocess spawn overhead"

    def test_combined_detection_uses_both_methods(self):
        """Verify combined keyword+model detection works."""
        result = detect(CRISIS_TEST_MESSAGE, use_model=False)
        assert result["crisis"]
        assert result["method"] == "keyword"

        # With model (if available)
        try:
            result_with_model = detect(CRISIS_TEST_MESSAGE, use_model=True, model=MODEL_NAME)
            if result_with_model.get("crisis") is not None:
                # Model succeeded — should report a method including 'model'
                assert "model" in result_with_model.get("method", "")
        except Exception:
            pytest.skip("Model unavailable")

class TestResourcesOffline:
    """Test that crisis resources work without internet."""

    def test_resources_load_from_edge_directory(self):
        """Resources must be bundled and loadable offline."""
        resources = load_resources()
        assert "national" in resources
        assert any("988" in r.get("phone", "") or r.get("sms") == "988" for r in resources["national"])

    def test_resources_contain_essential_contacts(self):
        """Verify all required crisis resources are present."""
        resources = load_resources()
        national = resources["national"]
        required = ["988", "741741"]
        found = {r.get("phone", "") + r.get("sms", "") for r in national}
        for req in required:
            assert any(req in f for f in found), f"Missing crisis resource: {req}"

    def test_resources_include_self_help_techniques(self):
        """Verify self-help grounding techniques are included for offline use."""
        resources = load_resources()
        assert "self_help" in resources
        assert len(resources["self_help"]) >= 2
        # These should be readable without internet
        for technique in resources["self_help"]:
            assert "name" in technique
            assert "steps" in technique

class TestReproducibleBenchmark:
    """Reproducible benchmark for hardware validation script."""

    def test_benchmark_output_is_json_serializable(self):
        """Hardware metrics must be machine-readable for CI/reporting."""
        # Simulate benchmark output structure
        metrics = measure_keyword_latency(iterations=10)
        json.dumps(metrics)  # Should not raise

    def test_benchmark_meets_p2_criteria(self):
        """P2 issue #116: Hardware validation must prove <5s inference on Pi 4."""
        # Keyword detection is instant
        kw_metrics = measure_keyword_latency(iterations=10)
        assert kw_metrics["avg_ms"] < 1.0, "Keyword detection too slow for crisis response"

        # Model inference is the actual P2 requirement.
        # If the model is unavailable, we skip — hardware tests require actual hardware.
        if subprocess.run(["which", "ollama"], capture_output=True).returncode != 0:
            pytest.skip("ollama not installed — skip model latency test")

        try:
            start = time.perf_counter()
            result = model_check(CRISIS_TEST_MESSAGE, MODEL_NAME)
            if result.get("error") == "model_unavailable":
                pytest.skip(f"Model {MODEL_NAME} not ready")
            model_latency = (time.perf_counter() - start) * 1000
        except (subprocess.TimeoutExpired, FileNotFoundError):
            pytest.skip("Model inference timeout or ollama missing")

        assert model_latency < MODEL_LATENCY_THRESHOLD_MS, (
            f"Model inference {model_latency:.0f}ms exceeds 5s threshold on this hardware"
        )

if __name__ == "__main__":
    # Run with: python -m pytest tests/test_edge_detector_hardware.py -v
    print("Run this test suite with: pytest tests/test_edge_detector_hardware.py -v")
    print("On Raspberry Pi 4, ensure ollama is running: ollama serve")
    print("And model pulled: ollama pull gemma2:2b")
    sys.exit(0)