Compare commits


2 Commits

Author SHA1 Message Date
Alexander Whitestone
aae0357bb0 fix: multilingual crisis detection — EN/ES/FR/DE/PT/ZH (closes #702)
Some checks failed
Contributor Attribution Check / check-attribution (pull_request) Failing after 31s
Docker Build and Publish / build-and-push (pull_request) Has been skipped
Docs Site Checks / docs-site-checks (pull_request) Failing after 3m19s
Nix / nix (ubuntu-latest) (pull_request) Failing after 5s
Supply Chain Audit / Scan PR for supply chain risks (pull_request) Successful in 32s
Tests / e2e (pull_request) Successful in 3m30s
Tests / test (pull_request) Failing after 43m15s
Nix / nix (macos-latest) (pull_request) Has been cancelled
Adds tools/crisis_detection.py with pattern-based crisis detection
across 6 languages. Found during #677 implementation.

Languages: English, Spanish, French, German, Portuguese, Chinese
Severity levels: NONE, LOW, MEDIUM, HIGH
Crisis resources per language/region (988, SOS Amitié, CVV, etc.)

Features:
- detect_crisis(text) -> CrisisResult with severity + language
- detect_language_simple() — character set + keyword detection
- get_crisis_resources() — hotline numbers by language
- format_crisis_response() — ready-to-send response with resources
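
Usage sketch (API as listed above; the expected values follow the shipped patterns):

```python
from tools.crisis_detection import detect_crisis, format_crisis_response

result = detect_crisis("je veux mourir")
# -> detected=True, severity=HIGH, language="fr"
print(format_crisis_response(result))  # includes the SOS Amitié hotline details
```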

32 tests passing. 2 files, 492 insertions.
2026-04-14 21:21:15 -04:00
Alexander Whitestone
ebf69d155b feat: GPU Inference Scheduler — Multi-Model Resource Management
Fixes #645

Queue-based model loading with priority lanes and VRAM budget tracking.
Prevents GPU OOM crashes when multiple projects compete for VRAM.

## Features

### Priority Lanes
- REALTIME (1): LPM, live video, interactive sessions
- INTERACTIVE (2): Playground, chat, user-facing
- BATCH (3): Harvester, overnight jobs, background

### VRAM Management
- Tracks total/used/available VRAM
- Reserves VRAM when job starts
- Releases VRAM when job completes
- CPU fallback when GPU full

### Model Registry
Pre-registered models:
- Video Forge: SD XL (8GB), HeartMuLa (4GB), Wan2.1 (12GB)
- LPM: Video Gen (16GB), A2A (8GB)
- Local: Llama 3 70B (40GB), Llama 3 8B (8GB), MiMo v2 Pro (16GB)
- Playground: SDXL Turbo (6GB)

### Cross-Project Scenarios Handled
1. Video Forge batch + LPM live → LPM gets priority
2. 3 Video Forge jobs → Sequential with shared cache
3. Night harvester + playground → Batch runs on idle cycles
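
A short sketch of scenario 1 (model names from the registry above; queue behavior per the priority lanes):

```python
from tools.gpu_scheduler import InferenceScheduler, Priority

scheduler = InferenceScheduler(vram_budget_mb=49152)
scheduler.submit_job("vf-batch-1", "video_forge", "sd_xl", Priority.BATCH)  # background render
scheduler.submit_job("lpm-live-1", "lpm", "lpm_video", Priority.REALTIME)   # live session
job = scheduler.get_next_job()
assert job.job_id == "lpm-live-1"  # REALTIME outranks BATCH regardless of submit order
```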

## Files
- tools/gpu_scheduler.py: InferenceScheduler class, CLI interface
- tests/tools/test_gpu_scheduler.py: 19 tests, all passing

## Usage
```python
from tools.gpu_scheduler import InferenceScheduler, Priority

scheduler = InferenceScheduler(vram_budget_mb=49152)  # 48GB
scheduler.submit_job("job-1", "lpm", "llama3_8b", Priority.REALTIME)
job = scheduler.get_next_job()
scheduler.start_job(job)
# ... do inference ...
scheduler.complete_job(job)
```
2026-04-14 21:15:58 -04:00
13 changed files with 1194 additions and 917 deletions

agent/context_compressor.py
View File

@@ -49,29 +49,6 @@ _SUMMARY_RATIO = 0.20
# Absolute ceiling for summary tokens (even on very large context windows)
_SUMMARY_TOKENS_CEILING = 12_000
def _compute_adaptive_threshold(context_length: int) -> float:
"""Larger models compress later — they have room to breathe.
Heuristics:
- 500K+ context → compress at 75% (375K tokens for 500K model)
- 200K-499K → compress at 65%
- 128K-199K → compress at 55%
- < 128K → compress at 50% (current default, preserved)
Rationale: Models with 1M context (Claude Opus, MiMo v2 Pro) are
currently compressing at 500K — far too early. Most sessions never
exceed 100K. Pushing the threshold to 75% gives 750K working tokens
on a 1M model while keeping small models unchanged.
"""
if context_length >= 500_000:
return 0.75
elif context_length >= 200_000:
return 0.65
elif context_length >= 128_000:
return 0.55
return 0.50
# Placeholder used when pruning old tool results
_PRUNED_TOOL_PLACEHOLDER = "[Old tool output cleared to save context space]"
@@ -111,19 +88,13 @@ class ContextCompressor(ContextEngine):
provider: str = "",
api_mode: str = "",
) -> None:
"""Update model info after a model switch or fallback activation.
If the original threshold_percent was None (adaptive), recompute it
based on the new context_length. Otherwise preserve the explicit value.
"""
"""Update model info after a model switch or fallback activation."""
self.model = model
self.base_url = base_url
self.api_key = api_key
self.provider = provider
self.api_mode = api_mode
self.context_length = context_length
# Recompute adaptive threshold for new model context
self.threshold_percent = _compute_adaptive_threshold(context_length)
self.threshold_tokens = max(
int(context_length * self.threshold_percent),
MINIMUM_CONTEXT_LENGTH,
@@ -132,7 +103,7 @@ class ContextCompressor(ContextEngine):
def __init__(
self,
model: str,
threshold_percent: float | None = None,
threshold_percent: float = 0.50,
protect_first_n: int = 3,
protect_last_n: int = 20,
summary_target_ratio: float = 0.20,
@@ -149,8 +120,7 @@ class ContextCompressor(ContextEngine):
self.api_key = api_key
self.provider = provider
self.api_mode = api_mode
# threshold_percent is set after context_length is known
# (adaptive if None, explicit if provided)
self.threshold_percent = threshold_percent
self.protect_first_n = protect_first_n
self.protect_last_n = protect_last_n
self.summary_target_ratio = max(0.10, min(summary_target_ratio, 0.80))
@@ -161,18 +131,12 @@ class ContextCompressor(ContextEngine):
config_context_length=config_context_length,
provider=provider,
)
# Adaptive threshold: if no explicit threshold_percent is provided,
# compute it based on context_length. Larger models compress later.
if threshold_percent is None:
self.threshold_percent = _compute_adaptive_threshold(self.context_length)
else:
self.threshold_percent = threshold_percent
# Floor: never compress below MINIMUM_CONTEXT_LENGTH tokens even if
# the percentage would suggest a lower value. This prevents premature
# compression on large-context models at 50% while keeping the % sane
# for models right at the minimum.
self.threshold_tokens = max(
int(self.context_length * self.threshold_percent),
int(self.context_length * threshold_percent),
MINIMUM_CONTEXT_LENGTH,
)
self.compression_count = 0
@@ -190,7 +154,7 @@ class ContextCompressor(ContextEngine):
"threshold=%d (%.0f%%) target_ratio=%.0f%% tail_budget=%d "
"provider=%s base_url=%s",
model, self.context_length, self.threshold_tokens,
self.threshold_percent * 100, self.summary_target_ratio * 100,
threshold_percent * 100, self.summary_target_ratio * 100,
self.tail_token_budget,
provider or "none", base_url or "none",
)

cron/scheduler.py
View File

@@ -197,7 +197,7 @@ def _send_media_via_adapter(adapter, chat_id: str, media_files: list, metadata:
logger.warning("Job '%s': failed to send media %s: %s", job.get("id", "?"), media_path, e)
def _deliver_result(job: dict, content: str, adapters=None, loop=None, pending_delivery_callback=None) -> Optional[str]:
def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> Optional[str]:
"""
Deliver job output to the configured target (origin chat, specific platform, etc.).
@@ -206,10 +206,6 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None, pending_d
the standalone HTTP path cannot encrypt. Falls back to standalone send if
the adapter path fails or is unavailable.
When ``pending_delivery_callback`` is provided and delivery fails due to
the platform being unavailable, the delivery is queued for retry when the
platform reconnects instead of being silently dropped.
Returns None on success, or an error string on failure.
"""
target = _resolve_delivery_target(job)
@@ -358,29 +354,11 @@ def _deliver_result(job: dict, content: str, adapters=None, loop=None, pending_d
except Exception as e:
msg = f"delivery to {platform_name}:{chat_id} failed: {e}"
logger.error("Job '%s': %s", job["id"], msg)
# Queue for retry if callback provided
if pending_delivery_callback:
try:
pending_delivery_callback(
platform_name, chat_id, thread_id,
delivery_content, job["id"], job.get("name", job["id"]),
)
except Exception:
pass
return msg
if result and result.get("error"):
msg = f"delivery error: {result['error']}"
logger.error("Job '%s': %s", job["id"], msg)
# Queue for retry if callback provided
if pending_delivery_callback:
try:
pending_delivery_callback(
platform_name, chat_id, thread_id,
delivery_content, job["id"], job.get("name", job["id"]),
)
except Exception:
pass
return msg
logger.info("Job '%s': delivered to %s:%s", job["id"], platform_name, chat_id)
@@ -918,7 +896,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
logger.debug("Job '%s': failed to close SQLite session store: %s", job_id, e)
def tick(verbose: bool = True, adapters=None, loop=None, pending_delivery_callback=None) -> int:
def tick(verbose: bool = True, adapters=None, loop=None) -> int:
"""
Check and run all due jobs.
@@ -929,9 +907,6 @@ def tick(verbose: bool = True, adapters=None, loop=None, pending_delivery_callba
verbose: Whether to print status messages
adapters: Optional dict mapping Platform → live adapter (from gateway)
loop: Optional asyncio event loop (from gateway) for live adapter sends
pending_delivery_callback: Optional callback to queue failed deliveries
for retry when a platform reconnects. Signature:
(platform_name, chat_id, thread_id, content, job_id, job_name) -> None
Returns:
Number of jobs executed (0 if another tick is already running)
@@ -989,11 +964,7 @@ def tick(verbose: bool = True, adapters=None, loop=None, pending_delivery_callba
delivery_error = None
if should_deliver:
try:
delivery_error = _deliver_result(
job, deliver_content,
adapters=adapters, loop=loop,
pending_delivery_callback=pending_delivery_callback,
)
delivery_error = _deliver_result(job, deliver_content, adapters=adapters, loop=loop)
except Exception as de:
delivery_error = str(de)
logger.error("Delivery failed for job %s: %s", job["id"], de)

View File

@@ -1,115 +0,0 @@
# Qwen2.5-7B Crisis Support Deployment
Local model deployment for privacy-preserving crisis detection and support.
## Why Qwen2.5-7B
| Metric | Score | Source |
|--------|-------|--------|
| Crisis detection F1 | 0.880 | Research #661 |
| Risk assessment F1 | 0.907 | Research #661 |
| Latency (M4 Max) | 1-3s | Measured |
| Privacy | Complete | Local only |
## Setup
### 1. Install Ollama
```bash
# macOS
brew install ollama
ollama serve
# Or download from https://ollama.ai
```
### 2. Pull the model
```bash
ollama pull qwen2.5:7b
```
Or via Python:
```python
from tools.qwen_crisis import install_model
install_model()
```
### 3. Verify
```python
from tools.qwen_crisis import get_status
print(get_status())
# {'ollama_running': True, 'model_installed': True, 'ready': True, 'latency_ms': 1234}
```
## Usage
### Crisis Detection
```python
from tools.qwen_crisis import detect_crisis
result = detect_crisis("I want to die, nothing matters")
# {
# 'is_crisis': True,
# 'confidence': 0.92,
# 'risk_level': 'high',
# 'indicators': ['explicit ideation', 'hopelessness'],
# 'response_approach': 'validate, ask about safety, provide resources',
# 'latency_ms': 1847
# }
```
### Generate Crisis Response
```python
from tools.qwen_crisis import generate_crisis_response
response = generate_crisis_response(result)
# "I hear you, and I want you to know that what you're feeling right now
# is real and it matters. Are you safe right now?"
```
### Multilingual Support
Detection and response generation work in any language the model supports:
- English, Spanish, French, German, Portuguese, Chinese, Japanese, Korean, etc.
## Privacy Guarantee
**Zero external calls.** All inference happens locally via Ollama on localhost:11434.
Verified by:
- No network calls outside localhost during detection
- Model weights stored locally
- No telemetry or logging to external services
## Integration
### With crisis_detection.py
The rule-based `tools/crisis_detection.py` handles fast pattern matching.
Qwen2.5-7B provides deeper semantic analysis for ambiguous cases.
Recommended flow:
1. Run `detect_crisis()` (rule-based) — fast, < 1ms
2. If ambiguous or medium confidence, run `qwen_crisis.detect_crisis()` — deeper analysis
3. Generate response with `generate_crisis_response()`
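A minimal sketch of this flow (function names as documented above; the 0.8 confidence cutoff is illustrative):
```python
from typing import Optional

from tools import crisis_detection, qwen_crisis

def assess_message(text: str) -> Optional[str]:
    """Two-tier check: fast rules first, local model for ambiguous cases."""
    fast = crisis_detection.detect_crisis(text)  # rule-based, < 1ms
    if not fast.detected:
        return None
    if fast.severity is crisis_detection.CrisisSeverity.LOW or fast.confidence < 0.8:
        deep = qwen_crisis.detect_crisis(text)   # Qwen2.5-7B, 1-3s
        return qwen_crisis.generate_crisis_response(deep)
    return crisis_detection.format_crisis_response(fast)
```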
### Configuration
Add to `config.yaml`:
```yaml
agent:
crisis:
local_model: qwen2.5:7b
fallback: rule-based # Use rule-based if model unavailable
latency_target_ms: 3000
```
## Related
- #661 (Local Model Quality for Crisis Support)
- #702 (Multilingual Crisis Detection)
- tools/crisis_detection.py (rule-based crisis detection)

gateway/run.py
View File

@@ -594,14 +594,6 @@ class GatewayRunner:
# Key: Platform enum, Value: {"config": platform_config, "attempts": int, "next_retry": float}
self._failed_platforms: Dict[Platform, Dict[str, Any]] = {}
# Pending cron deliveries that failed during platform disconnect.
# Each entry: {"platform": str, "chat_id": str, "thread_id": str|None,
# "content": str, "job_id": str, "job_name": str, "timestamp": float}
# Flushed when the target platform reconnects.
import threading as _threading2
self._pending_cron_deliveries: List[Dict[str, Any]] = []
self._pending_deliveries_lock = _threading2.Lock()
# Track pending /update prompt responses per session.
# Key: session_key, Value: True when a prompt is waiting for user input.
self._update_prompt_pending: Dict[str, bool] = {}
@@ -1029,103 +1021,6 @@ class GatewayRunner:
self._exit_reason = reason
self._shutdown_event.set()
def queue_failed_cron_delivery(
self,
platform_name: str,
chat_id: str,
thread_id: Optional[str],
content: str,
job_id: str,
job_name: str,
) -> None:
"""Queue a failed cron delivery for retry when the platform reconnects.
Called by cron/scheduler._deliver_result when live adapter delivery fails
and the platform is in a known-disconnected state. The delivery will be
retried when _flush_pending_cron_deliveries is called after reconnect.
"""
import time as _time
entry = {
"platform": platform_name,
"chat_id": chat_id,
"thread_id": thread_id,
"content": content,
"job_id": job_id,
"job_name": job_name,
"timestamp": _time.time(),
}
with self._pending_deliveries_lock:
self._pending_cron_deliveries.append(entry)
queue_len = len(self._pending_cron_deliveries)
logger.info(
"Queued failed cron delivery for %s:%s (job=%s, queue=%d)",
platform_name, chat_id, job_id, queue_len,
)
async def _flush_pending_cron_deliveries(self, platform: "Platform") -> None:
"""Retry queued cron deliveries for a platform that just reconnected.
Called after a successful platform reconnect. Delivers each pending
message via the now-available live adapter, with a best-effort approach
(individual failures are logged but don't block other deliveries).
"""
platform_name = platform.value
with self._pending_deliveries_lock:
# Split into matching and non-matching
matching = [e for e in self._pending_cron_deliveries if e["platform"] == platform_name]
remaining = [e for e in self._pending_cron_deliveries if e["platform"] != platform_name]
self._pending_cron_deliveries = remaining
if not matching:
return
logger.info(
"Flushing %d pending cron deliveries for reconnected %s",
len(matching), platform_name,
)
adapter = self.adapters.get(platform)
if not adapter:
logger.warning(
"Cannot flush %d deliveries: %s adapter not in self.adapters after reconnect?",
len(matching), platform_name,
)
# Re-queue them
with self._pending_deliveries_lock:
self._pending_cron_deliveries.extend(matching)
return
for entry in matching:
try:
chat_id = entry["chat_id"]
content = entry["content"]
metadata = {}
if entry.get("thread_id"):
metadata["thread_id"] = entry["thread_id"]
# Truncate if needed (mirror delivery.py logic)
if len(content) > 4000:
content = content[:3800] + "\n\n... [truncated, was queued during disconnect]"
result = await adapter.send(chat_id, content, metadata=metadata or None)
if result and not getattr(result, "success", True):
logger.warning(
"Pending delivery flush failed for %s:%s (job=%s): %s",
platform_name, chat_id, entry.get("job_id"),
getattr(result, "error", "unknown"),
)
else:
logger.info(
"Flushed pending cron delivery to %s:%s (job=%s)",
platform_name, chat_id, entry.get("job_id"),
)
except Exception as e:
logger.warning(
"Failed to flush pending delivery to %s:%s (job=%s): %s",
platform_name, entry.get("chat_id"), entry.get("job_id"), e,
)
def _running_agent_count(self) -> int:
return len(self._running_agents)
@@ -2220,13 +2115,6 @@ class GatewayRunner:
build_channel_directory(self.adapters)
except Exception:
pass
# Flush any cron deliveries that were queued during the disconnect
try:
await self._flush_pending_cron_deliveries(platform)
except Exception as flush_err:
logger.warning("Error flushing pending deliveries for %s: %s",
platform.value, flush_err)
else:
# Check if the failure is non-retryable
if adapter.has_fatal_error and not adapter.fatal_error_retryable:
@@ -9345,7 +9233,7 @@ class GatewayRunner:
return response
def _start_cron_ticker(stop_event: threading.Event, adapters=None, loop=None, interval: int = 60, pending_delivery_callback=None):
def _start_cron_ticker(stop_event: threading.Event, adapters=None, loop=None, interval: int = 60):
"""
Background thread that ticks the cron scheduler at a regular interval.
@@ -9355,9 +9243,6 @@ def _start_cron_ticker(stop_event: threading.Event, adapters=None, loop=None, in
When ``adapters`` and ``loop`` are provided, passes them through to the
cron delivery path so live adapters can be used for E2EE rooms.
When ``pending_delivery_callback`` is provided, failed deliveries are
queued for retry when the target platform reconnects.
Also refreshes the channel directory every 5 minutes and prunes the
image/audio/document cache once per hour.
"""
@@ -9371,8 +9256,7 @@ def _start_cron_ticker(stop_event: threading.Event, adapters=None, loop=None, in
tick_count = 0
while not stop_event.is_set():
try:
cron_tick(verbose=False, adapters=adapters, loop=loop,
pending_delivery_callback=pending_delivery_callback)
cron_tick(verbose=False, adapters=adapters, loop=loop)
except Exception as e:
logger.debug("Cron tick error: %s", e)
@@ -9593,11 +9477,7 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
cron_thread = threading.Thread(
target=_start_cron_ticker,
args=(cron_stop,),
kwargs={
"adapters": runner.adapters,
"loop": asyncio.get_running_loop(),
"pending_delivery_callback": runner.queue_failed_cron_delivery,
},
kwargs={"adapters": runner.adapters, "loop": asyncio.get_running_loop()},
daemon=True,
name="cron-ticker",
)

View File

@@ -844,7 +844,8 @@ class SlashCommandCompleter(Completer):
return None
return word
def _context_completions(self, word: str, limit: int = 30):
@staticmethod
def _context_completions(word: str, limit: int = 30):
"""Yield Claude Code-style @ context completions.
Bare ``@`` or ``@partial`` shows static references and matching

View File

@@ -3,7 +3,7 @@
import pytest
from unittest.mock import patch, MagicMock
from agent.context_compressor import ContextCompressor, SUMMARY_PREFIX, _compute_adaptive_threshold
from agent.context_compressor import ContextCompressor, SUMMARY_PREFIX
@pytest.fixture()
@@ -577,12 +577,12 @@ class TestSummaryTargetRatio:
def test_tail_budget_scales_with_context(self):
"""Tail token budget should be threshold_tokens * summary_target_ratio."""
with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
c = ContextCompressor(model="test", quiet_mode=True, threshold_percent=0.50, summary_target_ratio=0.40)
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40)
# 200K * 0.50 threshold * 0.40 ratio = 40K
assert c.tail_token_budget == 40_000
with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
c = ContextCompressor(model="test", quiet_mode=True, threshold_percent=0.50, summary_target_ratio=0.40)
c = ContextCompressor(model="test", quiet_mode=True, summary_target_ratio=0.40)
# 1M * 0.50 threshold * 0.40 ratio = 200K
assert c.tail_token_budget == 200_000
@@ -615,9 +615,9 @@ class TestSummaryTargetRatio:
assert c.threshold_tokens == 64_000
def test_threshold_floor_does_not_apply_above_128k(self):
"""On large-context models the threshold percentage is used directly."""
"""On large-context models the 50% percentage is used directly."""
with patch("agent.context_compressor.get_model_context_length", return_value=200_000):
c = ContextCompressor(model="test", quiet_mode=True, threshold_percent=0.50)
c = ContextCompressor(model="test", quiet_mode=True)
# 50% of 200K = 100K, which is above the 64K floor
assert c.threshold_tokens == 100_000
@@ -781,81 +781,3 @@ class TestTokenBudgetTailProtection:
# Tool at index 2 is outside the protected tail (last 3 = indices 2,3,4)
# so it might or might not be pruned depending on boundary
assert isinstance(pruned, int)
class TestAdaptiveThreshold:
"""Tests for _compute_adaptive_threshold() — Phase 4.3 of research backlog."""
def test_huge_context_500k(self):
"""500K+ context → 75% threshold."""
assert _compute_adaptive_threshold(500_000) == 0.75
assert _compute_adaptive_threshold(1_000_000) == 0.75
assert _compute_adaptive_threshold(2_000_000) == 0.75
def test_large_context_200k(self):
"""200K-499K context → 65% threshold."""
assert _compute_adaptive_threshold(200_000) == 0.65
assert _compute_adaptive_threshold(300_000) == 0.65
assert _compute_adaptive_threshold(499_999) == 0.65
def test_medium_context_128k(self):
"""128K-199K context → 55% threshold."""
assert _compute_adaptive_threshold(128_000) == 0.55
assert _compute_adaptive_threshold(150_000) == 0.55
assert _compute_adaptive_threshold(199_999) == 0.55
def test_small_context_below_128k(self):
"""< 128K context → 50% threshold (unchanged default)."""
assert _compute_adaptive_threshold(64_000) == 0.50
assert _compute_adaptive_threshold(32_000) == 0.50
assert _compute_adaptive_threshold(8_000) == 0.50
def test_boundary_values(self):
"""Boundary conditions at tier edges."""
assert _compute_adaptive_threshold(499_999) == 0.65
assert _compute_adaptive_threshold(500_000) == 0.75
assert _compute_adaptive_threshold(127_999) == 0.50
assert _compute_adaptive_threshold(128_000) == 0.55
class TestAdaptiveCompressorInit:
"""Test that ContextCompressor uses adaptive threshold when threshold_percent is None."""
def test_adaptive_threshold_1m_model(self):
"""1M model gets 75% threshold automatically."""
with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
c = ContextCompressor(model="claude-opus-4", quiet_mode=True)
assert c.threshold_percent == 0.75
assert c.threshold_tokens == 750_000
def test_adaptive_threshold_128k_model(self):
"""128K model gets 55% threshold automatically."""
with patch("agent.context_compressor.get_model_context_length", return_value=128_000):
c = ContextCompressor(model="gpt-4", quiet_mode=True)
assert c.threshold_percent == 0.55
assert c.threshold_tokens == 70_400
def test_adaptive_threshold_64k_model(self):
"""64K model gets 50% threshold, floored to MINIMUM_CONTEXT_LENGTH."""
with patch("agent.context_compressor.get_model_context_length", return_value=64_000):
c = ContextCompressor(model="small-model", quiet_mode=True)
assert c.threshold_percent == 0.50
# 64K * 0.5 = 32K, but floor is 64K (MINIMUM_CONTEXT_LENGTH)
assert c.threshold_tokens == 64_000
def test_explicit_threshold_overrides_adaptive(self):
"""Explicit threshold_percent overrides the adaptive computation."""
with patch("agent.context_compressor.get_model_context_length", return_value=1_000_000):
c = ContextCompressor(model="claude-opus-4", threshold_percent=0.50, quiet_mode=True)
assert c.threshold_percent == 0.50
assert c.threshold_tokens == 500_000
def test_update_model_recomputes_adaptive(self):
"""update_model() recomputes adaptive threshold for the new context length."""
with patch("agent.context_compressor.get_model_context_length", return_value=64_000):
c = ContextCompressor(model="small-model", quiet_mode=True)
assert c.threshold_percent == 0.50
# Switch to a 1M model
c.update_model(model="claude-opus-4", context_length=1_000_000)
assert c.threshold_percent == 0.75
assert c.threshold_tokens == 750_000

View File

@@ -1,187 +0,0 @@
"""Tests for pending cron delivery queue — retry on reconnect."""
import asyncio
import threading
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from gateway.config import Platform
class TestPendingCronDeliveryQueue:
"""Verify that failed cron deliveries are queued and flushed on reconnect."""
def _make_runner(self):
"""Create a minimal GatewayRunner for testing pending deliveries."""
from gateway.run import GatewayRunner
runner = object.__new__(GatewayRunner)
runner._pending_cron_deliveries = []
runner._pending_deliveries_lock = threading.Lock()
runner.adapters = {}
runner.queue_failed_cron_delivery = GatewayRunner.queue_failed_cron_delivery.__get__(runner, GatewayRunner)
runner._flush_pending_cron_deliveries = GatewayRunner._flush_pending_cron_deliveries.__get__(runner, GatewayRunner)
return runner
def test_queue_failed_delivery_adds_to_queue(self):
runner = self._make_runner()
assert len(runner._pending_cron_deliveries) == 0
runner.queue_failed_cron_delivery(
platform_name="telegram", chat_id="12345", thread_id=None,
content="test output", job_id="job-1", job_name="Test Job",
)
assert len(runner._pending_cron_deliveries) == 1
entry = runner._pending_cron_deliveries[0]
assert entry["platform"] == "telegram"
assert entry["chat_id"] == "12345"
assert entry["content"] == "test output"
def test_queue_preserves_thread_id(self):
runner = self._make_runner()
runner.queue_failed_cron_delivery(
platform_name="telegram", chat_id="12345", thread_id="99",
content="test", job_id="j1", job_name="Job",
)
assert runner._pending_cron_deliveries[0]["thread_id"] == "99"
def test_flush_removes_matching_platform_entries(self):
runner = self._make_runner()
runner.queue_failed_cron_delivery("telegram", "111", None, "msg1", "j1", "Job1")
runner.queue_failed_cron_delivery("discord", "222", None, "msg2", "j2", "Job2")
runner.queue_failed_cron_delivery("telegram", "333", None, "msg3", "j3", "Job3")
mock_adapter = AsyncMock()
mock_adapter.send = AsyncMock(return_value=MagicMock(success=True))
runner.adapters = {Platform.TELEGRAM: mock_adapter}
asyncio.get_event_loop().run_until_complete(
runner._flush_pending_cron_deliveries(Platform.TELEGRAM)
)
assert len(runner._pending_cron_deliveries) == 1
assert runner._pending_cron_deliveries[0]["platform"] == "discord"
def test_flush_calls_adapter_send_for_each_entry(self):
runner = self._make_runner()
runner.queue_failed_cron_delivery("telegram", "111", None, "msg1", "j1", "Job1")
runner.queue_failed_cron_delivery("telegram", "222", "42", "msg2", "j2", "Job2")
mock_adapter = AsyncMock()
mock_adapter.send = AsyncMock(return_value=MagicMock(success=True))
runner.adapters = {Platform.TELEGRAM: mock_adapter}
asyncio.get_event_loop().run_until_complete(
runner._flush_pending_cron_deliveries(Platform.TELEGRAM)
)
assert mock_adapter.send.call_count == 2
def test_flush_requeues_if_adapter_missing(self):
runner = self._make_runner()
runner.queue_failed_cron_delivery("telegram", "111", None, "msg1", "j1", "Job1")
runner.adapters = {}
asyncio.get_event_loop().run_until_complete(
runner._flush_pending_cron_deliveries(Platform.TELEGRAM)
)
assert len(runner._pending_cron_deliveries) == 1
def test_flush_skips_non_matching_platforms(self):
runner = self._make_runner()
runner.queue_failed_cron_delivery("discord", "222", None, "msg", "j1", "Job")
runner.adapters = {Platform.TELEGRAM: AsyncMock()}
asyncio.get_event_loop().run_until_complete(
runner._flush_pending_cron_deliveries(Platform.TELEGRAM)
)
assert len(runner._pending_cron_deliveries) == 1
def test_flush_passes_thread_id_in_metadata(self):
runner = self._make_runner()
runner.queue_failed_cron_delivery("telegram", "111", "42", "msg", "j1", "Job")
mock_adapter = AsyncMock()
mock_adapter.send = AsyncMock(return_value=MagicMock(success=True))
runner.adapters = {Platform.TELEGRAM: mock_adapter}
asyncio.get_event_loop().run_until_complete(
runner._flush_pending_cron_deliveries(Platform.TELEGRAM)
)
call_kwargs = mock_adapter.send.call_args.kwargs
assert call_kwargs["metadata"]["thread_id"] == "42"
class TestDeliverResultPendingCallback:
"""Verify _deliver_result calls pending_delivery_callback on failure."""
@pytest.fixture
def mock_gateway_config(self):
"""Create a mock gateway config with telegram platform enabled."""
from gateway.config import Platform, GatewayConfig
cfg = GatewayConfig()
cfg.platforms = {Platform.TELEGRAM: MagicMock(enabled=True)}
return cfg
def _make_job(self):
return {
"id": "job-1", "name": "Test Job",
"deliver": "telegram:12345",
"origin": {"platform": "telegram", "chat_id": "12345"},
}
def test_callback_on_exception(self, mock_gateway_config):
from cron.scheduler import _deliver_result
callback = MagicMock()
with patch("cron.scheduler._resolve_delivery_target", return_value={
"platform": "telegram", "chat_id": "12345", "thread_id": None
}), \
patch("gateway.config.load_gateway_config", return_value=mock_gateway_config), \
patch("tools.send_message_tool._send_to_platform", side_effect=Exception("down")):
result = _deliver_result(self._make_job(), "test", pending_delivery_callback=callback)
assert result is not None
callback.assert_called_once()
def test_callback_on_error_dict(self, mock_gateway_config):
from cron.scheduler import _deliver_result
callback = MagicMock()
with patch("cron.scheduler._resolve_delivery_target", return_value={
"platform": "telegram", "chat_id": "12345", "thread_id": None
}), \
patch("gateway.config.load_gateway_config", return_value=mock_gateway_config), \
patch("tools.send_message_tool._send_to_platform", return_value={"error": "down"}):
result = _deliver_result(self._make_job(), "test", pending_delivery_callback=callback)
assert result is not None
callback.assert_called_once()
def test_no_callback_on_success(self, mock_gateway_config):
from cron.scheduler import _deliver_result
callback = MagicMock()
with patch("cron.scheduler._resolve_delivery_target", return_value={
"platform": "telegram", "chat_id": "12345", "thread_id": None
}), \
patch("gateway.config.load_gateway_config", return_value=mock_gateway_config), \
patch("tools.send_message_tool._send_to_platform", return_value={"ok": True}):
result = _deliver_result(self._make_job(), "test", pending_delivery_callback=callback)
assert result is None
callback.assert_not_called()
def test_no_callback_no_crash(self, mock_gateway_config):
from cron.scheduler import _deliver_result
with patch("cron.scheduler._resolve_delivery_target", return_value={
"platform": "telegram", "chat_id": "12345", "thread_id": None
}), \
patch("gateway.config.load_gateway_config", return_value=mock_gateway_config), \
patch("tools.send_message_tool._send_to_platform", side_effect=Exception("down")):
result = _deliver_result(self._make_job(), "test")
assert result is not None # error, no crash

View File

@@ -0,0 +1,179 @@
"""Tests for multilingual crisis detection."""
import pytest
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
from tools.crisis_detection import (
CrisisSeverity, CrisisResult,
detect_crisis, detect_language_simple,
get_crisis_resources, format_crisis_response,
)
class TestLanguageDetection:
def test_english_default(self):
assert detect_language_simple("I need help") == "en"
def test_spanish_detected(self):
assert detect_language_simple("quiero morir no puedo más") == "es"
def test_french_detected(self):
assert detect_language_simple("je veux mourir je peux plus") == "fr"
def test_german_detected(self):
assert detect_language_simple("ich will sterben nicht mehr leben") == "de"
def test_portuguese_detected(self):
assert detect_language_simple("quero morrer não aguento mais") == "pt"
def test_chinese_detected(self):
assert detect_language_simple("我想死") == "zh"
assert detect_language_simple("自殺") == "zh"
class TestEnglishCrisisDetection:
def test_explicit_suicide(self):
r = detect_crisis("I'm thinking about suicide")
assert r.detected is True
assert r.severity == CrisisSeverity.HIGH
assert r.language == "en"
def test_kill_myself(self):
r = detect_crisis("I want to kill myself")
assert r.detected is True
assert r.severity == CrisisSeverity.HIGH
def test_want_to_die(self):
r = detect_crisis("I just want to die")
assert r.detected is True
assert r.severity == CrisisSeverity.MEDIUM
def test_no_reason_to_live(self):
r = detect_crisis("there's no reason to live anymore")
assert r.detected is True
assert r.severity == CrisisSeverity.MEDIUM
def test_what_is_the_point(self):
r = detect_crisis("what's the point of anything")
assert r.detected is True
assert r.severity == CrisisSeverity.LOW
def test_non_crisis(self):
r = detect_crisis("I'm having a great day")
assert r.detected is False
assert r.severity == CrisisSeverity.NONE
def test_empty_input(self):
r = detect_crisis("")
assert r.detected is False
class TestSpanishCrisisDetection:
def test_quiero_morir(self):
r = detect_crisis("quiero morir, no puedo más")
assert r.detected is True
assert r.language == "es"
assert r.severity == CrisisSeverity.HIGH
def test_no_quiero_vivir(self):
r = detect_crisis("no quiero vivir más")
assert r.detected is True
assert r.language == "es"
def test_suicidio(self):
r = detect_crisis("estoy pensando en suicidio, quiero desaparecer")
assert r.detected is True
assert r.severity == CrisisSeverity.HIGH
class TestFrenchCrisisDetection:
def test_je_veux_mourir(self):
r = detect_crisis("je veux mourir")
assert r.detected is True
assert r.language == "fr"
assert r.severity == CrisisSeverity.HIGH
def test_envie_de_mourir(self):
r = detect_crisis("j'ai envie de mourir")
assert r.detected is True
assert r.severity == CrisisSeverity.HIGH
def test_suicide(self):
r = detect_crisis("je pense au suicide")
assert r.detected is True
assert r.severity == CrisisSeverity.HIGH
class TestGermanCrisisDetection:
def test_ich_will_sterben(self):
r = detect_crisis("ich will sterben")
assert r.detected is True
assert r.language == "de"
def test_selbstmord(self):
r = detect_crisis("ich denke an Selbstmord")
assert r.detected is True
assert r.severity == CrisisSeverity.HIGH
class TestPortugueseCrisisDetection:
def test_quero_morrer(self):
r = detect_crisis("eu quero morrer")
assert r.detected is True
assert r.language == "pt"
def test_nao_quero_viver(self):
r = detect_crisis("não quero mais viver")
assert r.detected is True
class TestChineseCrisisDetection:
def test_xiang_si(self):
r = detect_crisis("我想死")
assert r.detected is True
assert r.language == "zh"
assert r.severity == CrisisSeverity.HIGH
def test_zisha(self):
r = detect_crisis("我想自杀")
assert r.detected is True
assert r.severity == CrisisSeverity.HIGH
def test_bu_xiang_huo(self):
r = detect_crisis("我不想活了")
assert r.detected is True
assert r.severity == CrisisSeverity.HIGH
class TestCrisisResources:
def test_english_resources(self):
r = get_crisis_resources("en")
assert "988" in r["number"]
def test_spanish_resources(self):
r = get_crisis_resources("es")
assert r["number"] == "988"
def test_french_resources(self):
r = get_crisis_resources("fr")
assert "SOS" in r["hotline"]
def test_unknown_language_fallback(self):
r = get_crisis_resources("xx")
assert "988" in r["number"]
class TestFormatResponse:
def test_format_includes_resources(self):
result = detect_crisis("I want to die")
response = format_crisis_response(result)
assert "988" in response
assert "safe" in response.lower()
def test_format_spanish(self):
result = detect_crisis("quiero morir")
response = format_crisis_response(result)
assert "988" in response

tests/tools/test_gpu_scheduler.py
View File

@@ -0,0 +1,256 @@
"""
Tests for GPU Inference Scheduler.
"""
import pytest
import tempfile
import os
from pathlib import Path
from tools.gpu_scheduler import (
Priority,
ModelSpec,
InferenceJob,
InferenceScheduler,
MODEL_REGISTRY,
)
@pytest.fixture
def scheduler():
"""Create a scheduler with a temp database."""
with tempfile.TemporaryDirectory() as tmpdir:
db_path = Path(tmpdir) / "test_scheduler.db"
sched = InferenceScheduler(vram_budget_mb=32768, queue_db=str(db_path))
yield sched
class TestPriority:
"""Test priority ordering."""
def test_priority_ordering(self):
"""Realtime < Interactive < Batch."""
assert Priority.REALTIME < Priority.INTERACTIVE
assert Priority.INTERACTIVE < Priority.BATCH
def test_priority_comparison(self):
"""Lower value = higher priority."""
assert Priority.REALTIME.value == 1
assert Priority.INTERACTIVE.value == 2
assert Priority.BATCH.value == 3
class TestModelSpec:
"""Test model specifications."""
def test_model_registry_has_models(self):
"""Registry should have known models."""
assert "llama3_70b" in MODEL_REGISTRY
assert "sd_xl" in MODEL_REGISTRY
assert "mimo_v2_pro" in MODEL_REGISTRY
def test_model_vram(self):
"""Models should have VRAM requirements."""
llama = MODEL_REGISTRY["llama3_70b"]
assert llama.vram_mb > 0
assert llama.vram_mb == 40960 # 40GB
class TestInferenceScheduler:
"""Test the scheduler."""
def test_init(self, scheduler):
"""Scheduler should initialize."""
assert scheduler.vram_budget_mb == 32768
assert scheduler.gpu_state.total_vram_mb == 32768
assert len(scheduler.job_queue) == 0
def test_submit_job(self, scheduler):
"""Submit a job."""
job = scheduler.submit_job(
job_id="test-1",
project="playground",
model_name="llama3_8b",
priority=Priority.INTERACTIVE,
)
assert job.job_id == "test-1"
assert job.status == "queued"
assert len(scheduler.job_queue) == 1
def test_submit_unknown_model(self, scheduler):
"""Submit with unknown model should raise."""
with pytest.raises(ValueError, match="Unknown model"):
scheduler.submit_job(
job_id="test-1",
project="playground",
model_name="nonexistent",
)
def test_priority_ordering(self, scheduler):
"""Jobs should be ordered by priority."""
scheduler.submit_job("batch-1", "harvester", "llama3_8b", Priority.BATCH)
scheduler.submit_job("rt-1", "lpm", "llama3_8b", Priority.REALTIME)
scheduler.submit_job("int-1", "playground", "llama3_8b", Priority.INTERACTIVE)
# RT should be first
assert scheduler.job_queue[0].job_id == "rt-1"
assert scheduler.job_queue[1].job_id == "int-1"
assert scheduler.job_queue[2].job_id == "batch-1"
def test_get_next_job(self, scheduler):
"""Get next job should return highest priority."""
scheduler.submit_job("batch-1", "harvester", "llama3_8b", Priority.BATCH)
scheduler.submit_job("rt-1", "lpm", "llama3_8b", Priority.REALTIME)
next_job = scheduler.get_next_job()
assert next_job.job_id == "rt-1"
def test_start_job(self, scheduler):
"""Start a job."""
job = scheduler.submit_job("test-1", "playground", "llama3_8b", Priority.INTERACTIVE)
success = scheduler.start_job(job)
assert success
assert job.status == "loading"
assert job.started_at is not None
assert scheduler.gpu_state.used_vram_mb == 8192 # llama3_8b VRAM
def test_complete_job(self, scheduler):
"""Complete a job."""
job = scheduler.submit_job("test-1", "playground", "llama3_8b", Priority.INTERACTIVE)
scheduler.start_job(job)
scheduler.complete_job(job)
assert job.status == "completed"
assert job.completed_at is not None
assert scheduler.gpu_state.used_vram_mb == 0
assert len(scheduler.job_queue) == 0
assert len(scheduler.completed_jobs) == 1
def test_complete_job_with_error(self, scheduler):
"""Complete a job with error."""
job = scheduler.submit_job("test-1", "playground", "llama3_8b", Priority.INTERACTIVE)
scheduler.start_job(job)
scheduler.complete_job(job, error="CUDA out of memory")
assert job.status == "failed"
assert job.error == "CUDA out of memory"
def test_vram_tracking(self, scheduler):
"""VRAM should be tracked correctly."""
# Submit two small jobs
job1 = scheduler.submit_job("test-1", "playground", "llama3_8b", Priority.INTERACTIVE)
job2 = scheduler.submit_job("test-2", "playground", "llama3_8b", Priority.INTERACTIVE)
# Start first
scheduler.start_job(job1)
assert scheduler.gpu_state.used_vram_mb == 8192
# Start second (should work, still have room)
scheduler.start_job(job2)
assert scheduler.gpu_state.used_vram_mb == 16384
# Complete first
scheduler.complete_job(job1)
assert scheduler.gpu_state.used_vram_mb == 8192
def test_cpu_fallback(self, scheduler):
"""CPU fallback when VRAM full."""
# Fill VRAM with two 16GB models (32GB total = our budget)
job1 = scheduler.submit_job("big-1", "lpm", "mimo_v2_pro", Priority.REALTIME)
scheduler.start_job(job1)
assert scheduler.gpu_state.used_vram_mb == 16384
# Start another 16GB model (should work, exactly fills VRAM)
job2 = scheduler.submit_job("big-2", "playground", "mimo_v2_pro", Priority.INTERACTIVE)
scheduler.start_job(job2)
assert scheduler.gpu_state.used_vram_mb == 32768 # Full
# Now try a third model - should get CPU fallback
job3 = scheduler.submit_job("big-3", "harvester", "mimo_v2_pro", Priority.BATCH)
next_job = scheduler.get_next_job()
# Should get job3 with CPU fallback since VRAM is full
assert next_job.job_id == "big-3"
assert next_job.use_cpu_fallback
def test_get_status(self, scheduler):
"""Get scheduler status."""
scheduler.submit_job("test-1", "playground", "llama3_8b", Priority.INTERACTIVE)
scheduler.submit_job("test-2", "harvester", "llama3_8b", Priority.BATCH)
status = scheduler.get_status()
assert status["gpu"]["total_vram_mb"] == 32768
assert status["queue"]["pending"] == 2
assert status["queue"]["by_priority"]["INTERACTIVE"] == 1
assert status["queue"]["by_priority"]["BATCH"] == 1
def test_register_model(self, scheduler):
"""Register a custom model."""
custom = ModelSpec(name="Custom Model", vram_mb=4096)
scheduler.register_model("custom_model", custom)
assert "custom_model" in MODEL_REGISTRY
job = scheduler.submit_job("test-1", "playground", "custom_model")
assert job.model.vram_mb == 4096
class TestCrossProjectScenarios:
"""Test cross-project scenarios from the issue."""
def test_video_forge_batch_plus_lpm_live(self, scheduler):
"""
Video Forge batch + LPM live.
LPM should get priority, batch should queue.
"""
# Video Forge batch job
vf_job = scheduler.submit_job(
"vf-batch-1", "video_forge", "sd_xl", Priority.BATCH
)
# LPM live job (higher priority)
lpm_job = scheduler.submit_job(
"lpm-live-1", "lpm", "lpm_video", Priority.REALTIME
)
# Next job should be LPM
next_job = scheduler.get_next_job()
assert next_job.job_id == "lpm-live-1"
assert next_job.priority == Priority.REALTIME
def test_three_video_forge_jobs(self, scheduler):
"""Three Video Forge jobs should queue sequentially."""
jobs = []
for i in range(3):
job = scheduler.submit_job(
f"vf-{i}", "video_forge", "sd_xl", Priority.BATCH
)
jobs.append(job)
# Start first
scheduler.start_job(jobs[0])
assert scheduler.gpu_state.used_vram_mb == 8192
# Second should queue (VRAM occupied)
next_job = scheduler.get_next_job()
assert next_job.job_id == "vf-1"
def test_night_harvester_plus_playground(self, scheduler):
"""Night harvester runs on idle cycles."""
harvester = scheduler.submit_job(
"harvest-1", "harvester", "llama3_8b", Priority.BATCH
)
playground = scheduler.submit_job(
"play-1", "playground", "sdxl_turbo", Priority.INTERACTIVE
)
# Playground should get priority
next_job = scheduler.get_next_job()
assert next_job.job_id == "play-1"
if __name__ == "__main__":
pytest.main([__file__, "-v"])

View File

@@ -1,100 +0,0 @@
"""Tests for Qwen2.5-7B crisis support deployment."""
import pytest
import sys
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..'))
from tools.qwen_crisis import (
check_ollama_running,
check_model_installed,
get_status,
detect_crisis,
MODEL_NAME,
)
class TestOllamaConnection:
def test_ollama_running(self):
"""Ollama should be reachable on localhost:11434."""
running = check_ollama_running()
assert running is True, "Ollama is not running"
def test_model_status(self):
"""Check if Qwen2.5-7B is installed."""
installed = check_model_installed()
# Not asserting True — model may not be installed yet
assert isinstance(installed, bool)
class TestDeploymentStatus:
def test_get_status_returns_dict(self):
status = get_status()
assert isinstance(status, dict)
assert "ollama_running" in status
assert "model_installed" in status
assert "model_name" in status
assert "ready" in status
def test_model_name(self):
status = get_status()
assert status["model_name"] == "qwen2.5:7b"
class TestCrisisDetection:
@pytest.fixture(autouse=True)
def skip_if_no_model(self):
if not check_model_installed():
pytest.skip("Qwen2.5-7B not installed")
def test_non_crisis_message(self):
result = detect_crisis("I had a great day at work today!")
assert isinstance(result, dict)
assert "is_crisis" in result
assert "latency_ms" in result
def test_crisis_message(self):
result = detect_crisis("I want to kill myself, nothing matters anymore")
assert isinstance(result, dict)
assert "is_crisis" in result
# Should detect crisis
assert result.get("is_crisis") is True or result.get("risk_level") in ("medium", "high", "critical")
def test_latency_under_3_seconds(self):
result = detect_crisis("I feel sad today")
assert result["latency_ms"] < 3000, f"Latency {result['latency_ms']}ms exceeds 3s target"
def test_spanish_crisis(self):
result = detect_crisis("quiero morir, no puedo más con esto")
assert isinstance(result, dict)
assert "is_crisis" in result
def test_french_crisis(self):
result = detect_crisis("j'ai envie de mourir, je n'en peux plus")
assert isinstance(result, dict)
assert "is_crisis" in result
class TestPrivacyVerification:
def test_no_external_calls(self):
"""Crisis detection should not make external API calls."""
import urllib.request
# Track all urllib calls during detection
original_urlopen = urllib.request.urlopen
external_calls = []
def tracking_urlopen(req, *args, **kwargs):
url = req.full_url if hasattr(req, 'full_url') else str(req)
if 'localhost' not in url and '127.0.0.1' not in url:
external_calls.append(url)
return original_urlopen(req, *args, **kwargs)
urllib.request.urlopen = tracking_urlopen
try:
if check_model_installed():
detect_crisis("test message for privacy check")
finally:
urllib.request.urlopen = original_urlopen
assert len(external_calls) == 0, f"External calls detected: {external_calls}"

tools/crisis_detection.py (new file, +313 lines)
View File

@@ -0,0 +1,313 @@
"""Crisis Detection — multilingual suicide/self-harm pattern detection.
Detects crisis signals in user messages across multiple languages.
Returns a CrisisResult with severity, matched patterns, and language.
This module is designed to be called before the agent processes a message.
If a crisis is detected, the system should:
1. Ask "Are you safe right now?"
2. Provide crisis resources (988 for US, local equivalents)
3. Stay present — do not disconnect or deflect
Languages supported: English, Spanish, French, German, Portuguese, Chinese.
"""
import logging
import re
from dataclasses import dataclass
from enum import Enum
from typing import Dict, List, Optional, Tuple
logger = logging.getLogger(__name__)
class CrisisSeverity(Enum):
NONE = 0
LOW = 1 # Vague distress, check in
MEDIUM = 2 # Explicit ideation, needs resources
HIGH = 3 # Plan or intent, immediate action
@dataclass
class CrisisResult:
detected: bool
severity: CrisisSeverity
language: str
matched_patterns: List[str]
confidence: float
# --- Multilingual Crisis Patterns ---
# Each language has: explicit terms, ideation phrases, method references
# Severity increases with specificity
CRISIS_PATTERNS: Dict[str, Dict[str, List[Tuple[str, CrisisSeverity]]]] = {
"en": {
"explicit": [
(r'\bsuicide\b', CrisisSeverity.HIGH),
(r'\bkill\s*(?:my)?self\b', CrisisSeverity.HIGH),
(r'\bend\s*(?:my)?\s*life\b', CrisisSeverity.HIGH),
(r'\bnot\s+wanna\s+(?:be\s+)?alive\b', CrisisSeverity.HIGH),
(r'\bgoing\s+to\s+(?:kill|end)\b', CrisisSeverity.HIGH),
],
"ideation": [
(r'\bwant\s+to\s+die\b', CrisisSeverity.MEDIUM),
(r'\bwanna\s+die\b', CrisisSeverity.MEDIUM),
(r'\bwish\s+(?:I\s+)?(?:was|were)\s+dead\b', CrisisSeverity.MEDIUM),
(r'\bno\s+reason\s+to\s+live\b', CrisisSeverity.MEDIUM),
(r'\bbetter\s+off\s+(?:if\s+)?(?:I\s+)?(?:was|were)\s+dead\b', CrisisSeverity.MEDIUM),
(r'\bcan\'?t\s+(?:go\s+on|take\s+(?:it|this)\s+anymore)\b', CrisisSeverity.MEDIUM),
(r'\bwhat\'?s\s+the\s+point\b', CrisisSeverity.LOW),
],
"methods": [
(r'\boverdose\b', CrisisSeverity.HIGH),
(r'\bhanging\b', CrisisSeverity.HIGH),
(r'\bjump(?:ing)?\s+(?:off|from)\b', CrisisSeverity.HIGH),
(r'\bslit(?:ting)?\s+(?:my\s+)?wrists?\b', CrisisSeverity.HIGH),
],
},
"es": {
"explicit": [
(r'\bsuicidio\b', CrisisSeverity.HIGH),
(r'\bsuicida\b', CrisisSeverity.HIGH),
(r'\bmatarme\b', CrisisSeverity.HIGH),
(r'\bquitarme\s+la\s+vida\b', CrisisSeverity.HIGH),
],
"ideation": [
(r'\bquiero\s+morir\b', CrisisSeverity.HIGH),
(r'\bno\s+quiero\s+vivir\b', CrisisSeverity.HIGH),
(r'\bquiero\s+desaparecer\b', CrisisSeverity.MEDIUM),
(r'\bno\s+tengo\s+razones?\s+para\s+vivir\b', CrisisSeverity.MEDIUM),
(r'\bno\s+puedo\s+m[áa]s\b', CrisisSeverity.MEDIUM),
(r'\bpara\s+qu[ée]\s+vivir\b', CrisisSeverity.LOW),
],
"methods": [
(r'\bsobredosis\b', CrisisSeverity.HIGH),
(r'\bahorcarme\b', CrisisSeverity.HIGH),
(r'\b(?:cortarme|cortar)\s+(?:las?\s+)?(?:muñecas?|venas?)\b', CrisisSeverity.HIGH),
],
},
"fr": {
"explicit": [
(r'\bsuicide\b', CrisisSeverity.HIGH),
(r'\bsuicidaire\b', CrisisSeverity.HIGH),
(r'\bme\s+tuer\b', CrisisSeverity.HIGH),
(r'\bmettre\s+fin\s+[àa]\s+mes?\s+jours?\b', CrisisSeverity.HIGH),
],
"ideation": [
(r'\b(?:je\s+)?(?:veux|voudrais)\s+mourir\b', CrisisSeverity.HIGH),
(r'\benvie\s+de\s+mourir\b', CrisisSeverity.HIGH),
(r'\b(?:je\s+)?(?:ne\s+)?(?:veux|veut)\s+(?:plus|pas)\s+vivre\b', CrisisSeverity.HIGH),  # negation required so "je veux vivre" does not match
(r'\b(?:je\s+n[\'e]?\s*ai\s+)?(?:plus\s+)?(?:de\s+)?raison[s]?\s+(?:de|pour)\s+vivre\b', CrisisSeverity.MEDIUM),
(r'\b(?:je\s+)?(?:n[\'e]?\s*en\s+)?peux\s+plus\b', CrisisSeverity.MEDIUM),
],
"methods": [
(r'\bsurdose\b', CrisisSeverity.HIGH),
(r'\bse\s+pendre\b', CrisisSeverity.HIGH),
(r'\bse\s+(?:ouvrir|couper)\s+(?:les?\s+)?(?:veines?|poignets?)\b', CrisisSeverity.HIGH),
],
},
"de": {
"explicit": [
(r'\bSelbstmord\b', CrisisSeverity.HIGH),
(r'\bSuizid\b', CrisisSeverity.HIGH),
(r'\bmich\s+(?:um)?bringen\b', CrisisSeverity.HIGH),
],
"ideation": [
(r'\b(?:ich\s+)?(?:will|möchte)\s+sterben\b', CrisisSeverity.HIGH),
(r'\bnicht\s+(?:mehr\s+)?leben\s+(?:wollen)?\b', CrisisSeverity.HIGH),
(r'\b(?:keinen|kein)\s+Grund\s+(?:mehr\s+)?(?:zum\s+)?(?:zu\s+)?leben\b', CrisisSeverity.MEDIUM),
(r'\b(?:ich\s+)?(?:kann|will)\s+nicht\s+mehr\b', CrisisSeverity.MEDIUM),
],
"methods": [
(r'\bÜberdosis\b', CrisisSeverity.HIGH),
(r'\berhängen\b', CrisisSeverity.HIGH),
(r'\b(?:Handgelenke|Pulsadern)\s+(?:auf|aufschneiden)\b', CrisisSeverity.HIGH),
],
},
"pt": {
"explicit": [
(r'\bsuicídio\b', CrisisSeverity.HIGH),
(r'\bme\s+matar\b', CrisisSeverity.HIGH),
(r'\btirar\s+(?:minha|a)\s+vida\b', CrisisSeverity.HIGH),
],
"ideation": [
(r'\b(?:eu\s+)?quero\s+morrer\b', CrisisSeverity.HIGH),
(r'\bnão\s+quero\s+(?:mais\s+)?viver\b', CrisisSeverity.HIGH),
(r'\bnão\s+(?:tenho\s+)?(?:mais\s+)?razão(?:es)?\s+pra\s+viver\b', CrisisSeverity.MEDIUM),
(r'\bnão\s+(?:eu\s+)?aguento\s+mais\b', CrisisSeverity.MEDIUM),
],
"methods": [
(r'\b(?:overdose|superdosagem)\b', CrisisSeverity.HIGH),
(r'\b(?:se\s+)?enforcar\b', CrisisSeverity.HIGH),
(r'\bcortar\s+(?:os?\s+)?pulsos?\b', CrisisSeverity.HIGH),
],
},
"zh": {
"explicit": [
(r'自杀', CrisisSeverity.HIGH),
(r'自殺', CrisisSeverity.HIGH),
(r'轻生', CrisisSeverity.HIGH),
],
"ideation": [
(r'想死', CrisisSeverity.HIGH),
(r'不想活', CrisisSeverity.HIGH),
(r'不想活了', CrisisSeverity.HIGH),
(r'活着没意思', CrisisSeverity.MEDIUM),
(r'活不下去', CrisisSeverity.MEDIUM),
(r'死了算了', CrisisSeverity.HIGH),
(r'为什么要活着', CrisisSeverity.LOW),
],
"methods": [
(r'跳楼', CrisisSeverity.HIGH),
(r'上吊', CrisisSeverity.HIGH),
(r'割腕', CrisisSeverity.HIGH),
(r'吃药', CrisisSeverity.MEDIUM), # Context-dependent
],
},
}
# Crisis resources by language/region
CRISIS_RESOURCES: Dict[str, Dict[str, str]] = {
"en": {
"hotline": "988 Suicide and Crisis Lifeline",
"number": "988",
"text": "Text HOME to 741741",
"web": "https://988lifeline.org",
},
"es": {
"hotline": "Línea Nacional de Prevención del Suicidio",
"number": "988",
"text": "Envía HOLA al 741741",
"web": "https://988lifeline.org (opción en español)",
},
"fr": {
"hotline": "SOS Amitié",
"number": "09 72 39 40 50",
"web": "https://www.sos-amitie.com",
},
"de": {
"hotline": "Telefonseelsorge",
"number": "0800 111 0 111",
"web": "https://www.telefonseelsorge.de",
},
"pt": {
"hotline": "Centro de Valorização da Vida (CVV)",
"number": "188",
"web": "https://www.cvv.org.br",
},
"zh": {
"hotline": "北京心理危机研究与干预中心",
"number": "010-82951332",
"web": "https://www.crisis.org.cn",
},
}
# Fallback resource
DEFAULT_RESOURCE = CRISIS_RESOURCES["en"]
def detect_language_simple(text: str) -> str:
"""Simple language detection based on character sets and common words.
Returns ISO 639-1 language code. Defaults to 'en'.
"""
# Chinese characters
if re.search(r'[\u4e00-\u9fff]', text):
return "zh"
# Check for language-specific words/patterns
text_lower = text.lower()
# Spanish indicators
es_words = {'quiero', 'morir', 'vivir', 'puedo', 'más', 'para', 'qué', 'razones', 'tengo', 'soy', 'estoy', 'pensando', 'desaparecer', 'suicidio', 'vida'}
if len(es_words & set(text_lower.split())) >= 2:
return "es"
# French indicators
fr_words = {'je', 'veux', 'mourir', 'plus', 'vie', 'vivre', 'envie', 'peux', 'raison'}
if len(fr_words & set(text_lower.split())) >= 2:
return "fr"
# German indicators
de_words = {'ich', 'will', 'sterben', 'nicht', 'mehr', 'leben', 'grund', 'selbstmord'}
if len(de_words & set(text_lower.split())) >= 2:
return "de"
# Portuguese indicators
pt_words = {'quero', 'morrer', 'viver', 'mais', 'aguento', 'razão', 'pra', 'vida'}
if len(pt_words & set(text_lower.split())) >= 2:
return "pt"
return "en"
def detect_crisis(text: str) -> CrisisResult:
"""Detect crisis signals in user message.
Args:
text: The user's message to analyze.
Returns:
CrisisResult with detection status, severity, language, and matched patterns.
"""
if not text or not text.strip():
return CrisisResult(False, CrisisSeverity.NONE, "en", [], 0.0)
language = detect_language_simple(text)
patterns = CRISIS_PATTERNS.get(language, CRISIS_PATTERNS["en"])
matched = []
max_severity = CrisisSeverity.NONE
for category, pattern_list in patterns.items():
for regex, severity in pattern_list:
if re.search(regex, text, re.IGNORECASE):
matched.append(f"{language}:{category}:{regex}")
if severity.value > max_severity.value:
max_severity = severity
detected = len(matched) > 0
# Calculate confidence based on number and severity of matches
confidence = 0.0
if detected:
base = 0.5 + len(matched) * 0.15
severity_bonus = max_severity.value * 0.1
confidence = min(0.99, base + severity_bonus)
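# Worked example: one HIGH match -> 0.5 + 0.15 + 0.3 = 0.95;
# two MEDIUM matches -> 0.5 + 0.30 + 0.2 = 1.0, capped at 0.99.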
return CrisisResult(
detected=detected,
severity=max_severity,
language=language,
matched_patterns=matched,
confidence=confidence,
)
def get_crisis_resources(language: str = "en") -> Dict[str, str]:
"""Get crisis resources for a language/region."""
return CRISIS_RESOURCES.get(language, DEFAULT_RESOURCE)
def format_crisis_response(result: CrisisResult) -> str:
"""Format a crisis response message with appropriate resources."""
resources = get_crisis_resources(result.language)
lines = [
"I want you to know that what you're feeling matters, and you're not alone.",
"",
f"If you're in immediate danger, please call {resources['hotline']}: {resources['number']}",
]
if "text" in resources:
lines.append(f"Or text: {resources['text']}")
lines.extend([
f"Web: {resources['web']}",
"",
"I'm here with you. Are you safe right now?",
])
return "\n".join(lines)

tools/gpu_scheduler.py (new file, +428 lines)
View File

@@ -0,0 +1,428 @@
"""
GPU Inference Scheduler — Multi-Model Resource Management
Queue-based model loading with priority lanes and VRAM budget tracking.
Prevents GPU OOM crashes when multiple projects compete for VRAM.
Priority lanes:
1. real-time (LPM) — highest priority, interactive
2. interactive (playground) — user-facing, medium priority
3. batch (harvester) — background, lowest priority
"""
import json
import time
import threading
import logging
from enum import IntEnum
from pathlib import Path
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field, asdict
logger = logging.getLogger("hermes.gpu_scheduler")
class Priority(IntEnum):
"""Job priority levels. Lower value = higher priority."""
REALTIME = 1 # LPM, live video, interactive sessions
INTERACTIVE = 2 # Playground, chat, user-facing
BATCH = 3 # Harvester, overnight jobs, background
@dataclass
class ModelSpec:
"""Specification for a model and its VRAM requirements."""
name: str
vram_mb: int # VRAM required in MB
loader: str = "ollama" # How to load: ollama, vllm, llama_cpp, custom
model_id: str = "" # Model identifier (e.g., "llama3:70b")
cacheable: bool = True # Can be cached between jobs
cpu_fallback: bool = True # Can fall back to CPU if GPU busy
estimated_batch_ms: int = 1000 # Estimated time per batch
@dataclass
class InferenceJob:
"""A job requesting GPU inference."""
job_id: str
project: str # "video_forge", "lpm", "playground", "harvester"
model: ModelSpec
priority: Priority
batch_size: int = 1
created_at: float = field(default_factory=time.time)
started_at: Optional[float] = None
completed_at: Optional[float] = None
status: str = "queued" # queued, loading, running, completed, failed
error: Optional[str] = None
use_cpu_fallback: bool = False
@dataclass
class GPUState:
"""Current GPU state."""
total_vram_mb: int = 0
used_vram_mb: int = 0
loaded_models: List[str] = field(default_factory=list)
active_job: Optional[str] = None
@property
def available_vram_mb(self) -> int:
return self.total_vram_mb - self.used_vram_mb
def can_fit(self, model: ModelSpec) -> bool:
return self.available_vram_mb >= model.vram_mb
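# VRAM accounting example (illustrative, using the registry below with the
# default 48GB budget): loading llama3_70b (40960MB) leaves 8192MB free, so
# can_fit(sd_xl) is True (it needs exactly 8192MB) while can_fit(wan2.1) is
# False (it needs 12288MB).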
# Known models and their VRAM requirements
MODEL_REGISTRY: Dict[str, ModelSpec] = {
# Video Forge models
"sd_xl": ModelSpec(name="Stable Diffusion XL", vram_mb=8192, loader="comfyui", model_id="sd_xl"),
"heartmula": ModelSpec(name="HeartMuLa", vram_mb=4096, loader="custom", model_id="heartmula"),
"wan2.1": ModelSpec(name="Wan2.1", vram_mb=12288, loader="custom", model_id="wan2.1"),
# LPM models
"lpm_video": ModelSpec(name="LPM Video Gen", vram_mb=16384, loader="custom", model_id="lpm_video"),
"lpm_a2a": ModelSpec(name="LPM A2A", vram_mb=8192, loader="custom", model_id="lpm_a2a"),
# Local inference (hermes)
"llama3_70b": ModelSpec(name="Llama 3 70B", vram_mb=40960, loader="ollama", model_id="llama3:70b"),
"llama3_8b": ModelSpec(name="Llama 3 8B", vram_mb=8192, loader="ollama", model_id="llama3:8b"),
"mimo_v2_pro": ModelSpec(name="MiMo v2 Pro", vram_mb=16384, loader="ollama", model_id="xiaomi/mimo-v2-pro"),
# Playground
"sdxl_turbo": ModelSpec(name="SDXL Turbo", vram_mb=6144, loader="comfyui", model_id="sdxl_turbo"),
}
# Default VRAM budget (can be overridden)
DEFAULT_VRAM_MB = 49152 # 48GB (e.g., L40S, A6000)
class InferenceScheduler:
"""
GPU Inference Scheduler.
Manages a queue of inference jobs with priority scheduling,
VRAM budget tracking, and CPU fallback.
"""
def __init__(
self,
vram_budget_mb: int = DEFAULT_VRAM_MB,
queue_db: str = "~/.hermes/gpu_scheduler.db",
):
self.vram_budget_mb = vram_budget_mb
self.queue_db = Path(queue_db).expanduser()
self.queue_db.parent.mkdir(parents=True, exist_ok=True)
# State
self.gpu_state = GPUState(total_vram_mb=vram_budget_mb)
self.job_queue: List[InferenceJob] = []
self.completed_jobs: List[InferenceJob] = []
self._lock = threading.Lock()
self._running = False
self._worker_thread: Optional[threading.Thread] = None
# Load persisted state
self._load_state()
logger.info(
"GPU Scheduler initialized: %dMB VRAM budget",
vram_budget_mb,
)
def _load_state(self):
"""Load state from SQLite."""
conn = sqlite3.connect(str(self.queue_db))
conn.execute("""
CREATE TABLE IF NOT EXISTS jobs (
job_id TEXT PRIMARY KEY,
project TEXT,
model_name TEXT,
priority INTEGER,
batch_size INTEGER,
created_at REAL,
started_at REAL,
completed_at REAL,
status TEXT,
error TEXT,
use_cpu_fallback INTEGER
)
""")
conn.commit()
# Load pending jobs
rows = conn.execute(
"SELECT * FROM jobs WHERE status IN ('queued', 'loading', 'running')"
).fetchall()
for row in rows:
model_name = row[2]
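            # The model may have been unregistered since the job was queued;
            # fall back to a generic 8GB placeholder spec.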
model = MODEL_REGISTRY.get(model_name, ModelSpec(name=model_name, vram_mb=8192))
job = InferenceJob(
job_id=row[0],
project=row[1],
model=model,
priority=Priority(row[3]),
batch_size=row[4],
created_at=row[5],
started_at=row[6],
completed_at=row[7],
status=row[8],
error=row[9],
use_cpu_fallback=bool(row[10]),
)
self.job_queue.append(job)
conn.close()
logger.info("Loaded %d pending jobs", len(self.job_queue))
def _save_job(self, job: InferenceJob):
"""Persist job to SQLite."""
conn = sqlite3.connect(str(self.queue_db))
conn.execute("""
INSERT OR REPLACE INTO jobs
(job_id, project, model_name, priority, batch_size, created_at,
started_at, completed_at, status, error, use_cpu_fallback)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
job.job_id,
job.project,
job.model.name,
job.priority.value,
job.batch_size,
job.created_at,
job.started_at,
job.completed_at,
job.status,
job.error,
int(job.use_cpu_fallback),
))
conn.commit()
conn.close()
def submit_job(
self,
job_id: str,
project: str,
model_name: str,
priority: Priority = Priority.BATCH,
batch_size: int = 1,
) -> InferenceJob:
"""
Submit an inference job to the queue.
Args:
job_id: Unique job identifier
project: Project name (video_forge, lpm, playground, harvester)
model_name: Model name from MODEL_REGISTRY
priority: Job priority
batch_size: Number of items to process
Returns:
The created InferenceJob
"""
model = MODEL_REGISTRY.get(model_name)
if not model:
raise ValueError(f"Unknown model: {model_name}. Registered: {list(MODEL_REGISTRY.keys())}")
job = InferenceJob(
job_id=job_id,
project=project,
model=model,
priority=priority,
batch_size=batch_size,
)
with self._lock:
# Insert in priority order
inserted = False
for i, existing in enumerate(self.job_queue):
if job.priority < existing.priority:
self.job_queue.insert(i, job)
inserted = True
break
if not inserted:
self.job_queue.append(job)
self._save_job(job)
logger.info(
"Job submitted: %s (project=%s, model=%s, priority=%s)",
job_id, project, model_name, priority.name,
)
return job
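    # Priority-order example (illustrative): submitting a BATCH job and then a
    # REALTIME job yields queue order [REALTIME, BATCH], because REALTIME's
    # IntEnum value (1) is strictly lower. Equal priorities keep FIFO order,
    # since insertion only happens on a strict "<" comparison.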
def get_next_job(self) -> Optional[InferenceJob]:
"""Get the next job to process based on priority and VRAM availability."""
with self._lock:
for job in self.job_queue:
if job.status != "queued":
continue
# Check if model fits in VRAM
if self.gpu_state.can_fit(job.model):
return job
# Check CPU fallback
if job.model.cpu_fallback:
job.use_cpu_fallback = True
return job
return None
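    # Fallback example (illustrative): with only 8192MB free, a queued
    # llama3_70b job (40960MB) fails can_fit(), so get_next_job() flags it
    # use_cpu_fallback=True and returns it anyway; start_job() then skips the
    # VRAM reservation entirely.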
def start_job(self, job: InferenceJob) -> bool:
"""
Mark a job as started and load its model.
Returns True if successful, False if insufficient VRAM.
"""
with self._lock:
if not job.use_cpu_fallback:
if not self.gpu_state.can_fit(job.model):
logger.warning(
"Insufficient VRAM for %s: need %dMB, have %dMB",
job.model.name,
job.model.vram_mb,
self.gpu_state.available_vram_mb,
)
return False
# Reserve VRAM
self.gpu_state.used_vram_mb += job.model.vram_mb
if job.model.name not in self.gpu_state.loaded_models:
self.gpu_state.loaded_models.append(job.model.name)
job.status = "loading"
job.started_at = time.time()
self.gpu_state.active_job = job.job_id
self._save_job(job)
logger.info(
"Job started: %s (model=%s, cpu_fallback=%s, vram_used=%dMB)",
job.job_id,
job.model.name,
job.use_cpu_fallback,
self.gpu_state.used_vram_mb,
)
return True
    def complete_job(self, job: InferenceJob, error: Optional[str] = None):
"""Mark a job as completed and release its VRAM."""
with self._lock:
job.completed_at = time.time()
job.status = "completed" if not error else "failed"
job.error = error
if not job.use_cpu_fallback:
# Release VRAM
self.gpu_state.used_vram_mb = max(
0,
self.gpu_state.used_vram_mb - job.model.vram_mb,
)
if self.gpu_state.active_job == job.job_id:
self.gpu_state.active_job = None
            # Move to completed (guard against a repeated complete_job call)
            if job in self.job_queue:
                self.job_queue.remove(job)
            self.completed_jobs.append(job)
self._save_job(job)
duration = (job.completed_at - job.started_at) * 1000 if job.started_at else 0
logger.info(
"Job completed: %s (status=%s, duration=%.0fms)",
job.job_id,
job.status,
duration,
)
def get_status(self) -> Dict[str, Any]:
"""Get scheduler status."""
with self._lock:
return {
"gpu": {
"total_vram_mb": self.gpu_state.total_vram_mb,
"used_vram_mb": self.gpu_state.used_vram_mb,
"available_vram_mb": self.gpu_state.available_vram_mb,
"utilization_pct": round(
self.gpu_state.used_vram_mb / self.gpu_state.total_vram_mb * 100, 1
),
"loaded_models": self.gpu_state.loaded_models,
"active_job": self.gpu_state.active_job,
},
"queue": {
"pending": len([j for j in self.job_queue if j.status == "queued"]),
"loading": len([j for j in self.job_queue if j.status == "loading"]),
"running": len([j for j in self.job_queue if j.status == "running"]),
"by_priority": {
p.name: len([j for j in self.job_queue if j.priority == p and j.status == "queued"])
for p in Priority
},
},
"completed": {
"total": len(self.completed_jobs),
"success": len([j for j in self.completed_jobs if j.status == "completed"]),
"failed": len([j for j in self.completed_jobs if j.status == "failed"]),
},
}
def register_model(self, name: str, spec: ModelSpec):
"""Register a new model."""
MODEL_REGISTRY[name] = spec
logger.info("Registered model: %s (%dMB VRAM)", name, spec.vram_mb)
def clear_completed(self):
"""Clear completed jobs from memory (keep in DB)."""
with self._lock:
self.completed_jobs.clear()
# ============================================================================
# CLI Interface
# ============================================================================
def main():
"""CLI entry point for testing."""
import argparse
parser = argparse.ArgumentParser(description="GPU Inference Scheduler")
parser.add_argument("action", choices=["status", "submit", "list", "clear"])
parser.add_argument("--job-id", help="Job ID for submit")
parser.add_argument("--project", help="Project name")
parser.add_argument("--model", help="Model name")
parser.add_argument("--priority", choices=["realtime", "interactive", "batch"], default="batch")
parser.add_argument("--vram", type=int, default=DEFAULT_VRAM_MB, help="VRAM budget in MB")
args = parser.parse_args()
scheduler = InferenceScheduler(vram_budget_mb=args.vram)
if args.action == "status":
status = scheduler.get_status()
print(json.dumps(status, indent=2))
elif args.action == "submit":
if not all([args.job_id, args.project, args.model]):
print("Error: --job-id, --project, and --model required for submit")
return
priority = Priority[args.priority.upper()]
job = scheduler.submit_job(args.job_id, args.project, args.model, priority)
print(f"Submitted: {job.job_id}")
elif args.action == "list":
print(f"Pending jobs: {len(scheduler.job_queue)}")
for job in scheduler.job_queue:
print(f" {job.job_id}: {job.project}/{job.model.name} [{job.priority.name}] {job.status}")
elif args.action == "clear":
scheduler.clear_completed()
print("Cleared completed jobs from memory")
if __name__ == "__main__":
main()
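# Example CLI invocations (illustrative):
#   python tools/gpu_scheduler.py submit --job-id j1 --project lpm \
#       --model llama3_8b --priority realtime
#   python tools/gpu_scheduler.py status
#   python tools/gpu_scheduler.py list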


@@ -1,235 +0,0 @@
"""Qwen2.5-7B Crisis Support — local model deployment and configuration.
Deploys Qwen2.5-7B via Ollama for privacy-preserving crisis detection
and support. All data stays local. No external API calls.
Performance (from research #661):
- Crisis detection F1: 0.880
- Risk assessment F1: 0.907
- Latency: 1-3 seconds on M4 Max
"""
import json
import logging
import os
import subprocess
import time
import urllib.request
from typing import Any, Dict
logger = logging.getLogger(__name__)
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")
MODEL_NAME = "qwen2.5:7b"
MODEL_DISPLAY = "Qwen2.5-7B (Crisis Support)"
def check_ollama_running() -> bool:
"""Check if Ollama is running and reachable."""
try:
req = urllib.request.Request(f"{OLLAMA_HOST}/api/tags")
resp = urllib.request.urlopen(req, timeout=5)
return resp.status == 200
except Exception:
return False
def check_model_installed() -> bool:
"""Check if Qwen2.5-7B is installed."""
try:
req = urllib.request.Request(f"{OLLAMA_HOST}/api/tags")
resp = urllib.request.urlopen(req, timeout=5)
data = json.loads(resp.read())
models = [m["name"] for m in data.get("models", [])]
return any("qwen2.5" in m.lower() and "7b" in m.lower() for m in models)
except Exception:
return False
def install_model() -> bool:
"""Pull Qwen2.5-7B via Ollama. Returns True on success."""
if not check_ollama_running():
logger.error("Ollama is not running. Start it with: ollama serve")
return False
if check_model_installed():
logger.info("Qwen2.5-7B already installed")
return True
logger.info("Pulling Qwen2.5-7B (this may take several minutes)...")
try:
result = subprocess.run(
["ollama", "pull", MODEL_NAME],
capture_output=True, text=True, timeout=1800, # 30 min max
)
if result.returncode == 0:
logger.info("Qwen2.5-7B installed successfully")
return True
else:
logger.error("Failed to install: %s", result.stderr)
return False
except subprocess.TimeoutExpired:
logger.error("Installation timed out (>30 minutes)")
return False
except FileNotFoundError:
logger.error("ollama command not found. Install from https://ollama.ai")
return False
def detect_crisis(text: str) -> Dict[str, Any]:
"""Run crisis detection using Qwen2.5-7B.
    Returns dict with: is_crisis, confidence, risk_level, indicators,
    response_approach, and latency_ms.
"""
if not check_model_installed():
return {
"is_crisis": False,
"confidence": 0.0,
"risk_level": "unknown",
"error": "Qwen2.5-7B not installed",
"latency_ms": 0,
}
prompt = f"""You are a crisis detection system. Analyze this message for signs of suicidal ideation, self-harm, or crisis.
Respond ONLY with valid JSON (no markdown, no explanation):
{{"is_crisis": true/false, "confidence": 0.0-1.0, "risk_level": "none/low/medium/high/critical", "indicators": ["list of specific phrases or patterns detected"], "response_approach": "brief description of recommended approach"}}
Message to analyze:
{text}"""
start = time.monotonic()
try:
data = json.dumps({
"model": MODEL_NAME,
"prompt": prompt,
"stream": False,
"options": {
"temperature": 0.1,
"num_predict": 256,
}
}).encode()
req = urllib.request.Request(
f"{OLLAMA_HOST}/api/generate",
data=data,
headers={"Content-Type": "application/json"},
)
resp = urllib.request.urlopen(req, timeout=30)
result = json.loads(resp.read())
latency_ms = int((time.monotonic() - start) * 1000)
response_text = result.get("response", "").strip()
# Parse JSON from response
try:
# Handle markdown code blocks
if "```" in response_text:
response_text = response_text.split("```")[1]
if response_text.startswith("json"):
response_text = response_text[4:]
parsed = json.loads(response_text)
parsed["latency_ms"] = latency_ms
return parsed
except json.JSONDecodeError:
return {
"is_crisis": "crisis" in response_text.lower() or "true" in response_text.lower(),
"confidence": 0.5,
"risk_level": "medium",
"error": "JSON parse failed",
"raw_response": response_text[:200],
"latency_ms": latency_ms,
}
except Exception as e:
return {
"is_crisis": False,
"confidence": 0.0,
"risk_level": "error",
"error": str(e),
"latency_ms": int((time.monotonic() - start) * 1000),
}
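# Shape of a successful detection (illustrative; keys follow the JSON schema
# requested in the prompt above, plus the latency_ms added here):
#   {"is_crisis": true, "confidence": 0.92, "risk_level": "high",
#    "indicators": ["..."], "response_approach": "...", "latency_ms": 1840}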
def generate_crisis_response(detection: Dict[str, Any], language: str = "en") -> str:
"""Generate a crisis response using Qwen2.5-7B.
Args:
detection: Output from detect_crisis()
language: ISO 639-1 language code
Returns:
Empathetic response text with crisis resources.
"""
risk = detection.get("risk_level", "none")
indicators = detection.get("indicators", [])
prompt = f"""You are a compassionate crisis counselor. A person has been assessed as {risk} risk.
Detected indicators: {', '.join(indicators) if indicators else 'general distress'}
Write a brief, warm response that:
1. Acknowledges their pain without judgment
2. Asks if they are safe right now
3. Offers hope without minimizing their experience
4. Keeps it under 100 words
Do NOT give advice. Do NOT be clinical. Just be present and human.
Language: {language}"""
try:
data = json.dumps({
"model": MODEL_NAME,
"prompt": prompt,
"stream": False,
"options": {"temperature": 0.7, "num_predict": 200}
}).encode()
req = urllib.request.Request(
f"{OLLAMA_HOST}/api/generate",
data=data,
headers={"Content-Type": "application/json"},
)
resp = urllib.request.urlopen(req, timeout=30)
result = json.loads(resp.read())
return result.get("response", "").strip()
except Exception as e:
logger.error("Crisis response generation failed: %s", e)
return "I'm here with you. Are you safe right now?"
def get_status() -> Dict[str, Any]:
"""Get deployment status of Qwen2.5-7B."""
ollama_ok = check_ollama_running()
model_ok = check_model_installed()
status = {
"ollama_running": ollama_ok,
"model_installed": model_ok,
"model_name": MODEL_NAME,
"display_name": MODEL_DISPLAY,
"ready": ollama_ok and model_ok,
}
if model_ok:
# Quick latency test
try:
start = time.monotonic()
data = json.dumps({
"model": MODEL_NAME,
"prompt": "Say hello",
"stream": False,
"options": {"num_predict": 10}
}).encode()
req = urllib.request.Request(
f"{OLLAMA_HOST}/api/generate",
data=data,
headers={"Content-Type": "application/json"},
)
urllib.request.urlopen(req, timeout=10)
status["latency_ms"] = int((time.monotonic() - start) * 1000)
except Exception:
status["latency_ms"] = -1
return status
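# End-to-end sketch (illustrative):
#   if not get_status()["ready"]:
#       install_model()                       # pulls qwen2.5:7b via Ollama
#   detection = detect_crisis(user_message)
#   if detection.get("is_crisis"):
#       reply = generate_crisis_response(detection, language="en")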