Compare commits
1 Commits
fix/132
...
feat/131-v
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4dc6819079 |
@@ -1,155 +0,0 @@
|
||||
"""
|
||||
Image Content Screening for Self-Harm Indicators
|
||||
|
||||
Screens images sent in chat for self-harm content using vision models.
|
||||
Privacy-preserving: analyzes in-memory, discards after scoring. No image retention.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger("the-door.image_screening")
|
||||
|
||||
|
||||
class RiskLevel(str, Enum):
    # str mixin so members serialize/compare as plain strings in JSON payloads.
    SAFE = "safe"              # no indicators detected
    CONCERNING = "concerning"  # ambiguous but worrying; log + gentle follow-up
    CRITICAL = "critical"      # imminent indicators; triggers crisis overlay
|
||||
|
||||
|
||||
@dataclass
class ImageScreenResult:
    """Outcome of screening one chat image for self-harm indicators."""

    risk_level: RiskLevel
    confidence: float
    categories: list
    reasoning: str
    model_used: str

    @property
    def needs_crisis_response(self):
        """True when the image warrants an immediate crisis overlay."""
        return self.risk_level == RiskLevel.CRITICAL

    @property
    def needs_followup(self):
        """True for any non-safe verdict that should be logged and followed up."""
        return self.risk_level in {RiskLevel.CONCERNING, RiskLevel.CRITICAL}

    def to_dict(self):
        """Serialize for logging and gateway payloads (confidence rounded)."""
        payload = {
            "risk_level": self.risk_level.value,
            "confidence": round(self.confidence, 3),
            "categories": self.categories,
            "reasoning": self.reasoning,
            "model_used": self.model_used,
        }
        return payload
|
||||
|
||||
|
||||
SCREENING_PROMPT = """Analyze this image for indicators of self-harm or crisis. Look for:
|
||||
|
||||
1. Self-harm wounds or injuries
|
||||
2. Medication/pills in concerning quantities
|
||||
3. Farewell-type imagery (empty rooms, items laid out, goodbye notes)
|
||||
4. Screenshots of crisis-related searches
|
||||
5. Images conveying hopelessness, isolation, or despair
|
||||
|
||||
Respond in JSON format ONLY:
|
||||
{"risk_level": "safe"|"concerning"|"critical", "confidence": 0.0-1.0, "categories": [], "reasoning": ""}
|
||||
|
||||
CRITICAL: imminent self-harm indicators
|
||||
CONCERNING: ambiguous but worrying
|
||||
SAFE: no indicators detected"""
|
||||
|
||||
|
||||
def _analyze_with_ollama(image_b64, model="gemma3:4b"):
    """Screen an image via a local Ollama vision model.

    Args:
        image_b64: Base64-encoded image payload.
        model: Ollama model tag to query.

    Returns:
        ImageScreenResult on success, or None when the request, response,
        or JSON parsing fails (the caller falls back to heuristics).
    """
    try:
        import urllib.request

        payload = json.dumps({
            "model": model,
            "messages": [{
                "role": "user",
                "content": SCREENING_PROMPT,
                "images": [image_b64],
            }],
            "stream": False,
            # Low temperature keeps the JSON verdict near-deterministic.
            "options": {"temperature": 0.1},
        }).encode()
        req = urllib.request.Request(
            "http://localhost:11434/api/chat",
            data=payload,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        # Context manager closes the HTTP response; the original leaked
        # the connection on every call.
        with urllib.request.urlopen(req, timeout=30) as resp:
            data = json.loads(resp.read())
        content = data.get("message", {}).get("content", "")
        # The model may wrap the JSON in prose; extract the outermost object.
        json_start = content.find("{")
        json_end = content.rfind("}") + 1
        if json_start == -1 or json_end <= json_start:
            return None
        result = json.loads(content[json_start:json_end])
        return ImageScreenResult(
            risk_level=RiskLevel(result.get("risk_level", "safe")),
            confidence=float(result.get("confidence", 0.5)),
            categories=result.get("categories", []),
            reasoning=result.get("reasoning", ""),
            model_used=f"ollama:{model}",
        )
    except Exception as e:
        # Deliberately broad best-effort: any failure (network, JSON,
        # unknown enum value) is logged and handed to the fallback path.
        logger.warning(f"Ollama vision analysis failed: {e}")
        return None
|
||||
|
||||
|
||||
def _analyze_fallback(image_bytes):
    """Conservative default verdict when no vision model is available.

    Returns a SAFE result with deliberately low confidence so downstream
    consumers can tell the image was never actually analyzed.
    """
    verdict = dict(
        risk_level=RiskLevel.SAFE,
        confidence=0.2,
        categories=["unanalyzed"],
        reasoning="No vision model available. Defaulting to safe with low confidence.",
        model_used="fallback:heuristic",
    )
    return ImageScreenResult(**verdict)
|
||||
|
||||
|
||||
def screen_image(image_data, use_vision_model=True, model="gemma3:4b"):
    """Screen image for self-harm indicators. Analyzes in-memory, no retention."""
    # Accept raw bytes or a base64 string; keep both forms around
    # (base64 for the vision model, bytes for the heuristic fallback).
    if isinstance(image_data, bytes):
        image_b64 = base64.b64encode(image_data).decode()
    else:
        image_b64 = image_data
        image_data = base64.b64decode(image_b64)

    result = _analyze_with_ollama(image_b64, model) if use_vision_model else None
    if result is None:
        # Vision disabled or model failed: conservative heuristic verdict.
        return _analyze_fallback(image_data)

    logger.info(f"Image screened: {result.risk_level.value} (conf: {result.confidence:.2f})")
    if result.needs_crisis_response:
        logger.warning(f"CRITICAL image: {result.reasoning}")
    return result
|
||||
|
||||
|
||||
def handle_chat_image(image_data):
    """Handle image from chat. Returns action dict for gateway."""
    result = screen_image(image_data)

    # Escalation copy keyed by risk level; SAFE intentionally maps to None.
    crisis_text = (
        "I noticed something concerning in the image you shared. "
        "If you or someone you know is in crisis, please reach out: "
        "988 Suicide and Crisis Lifeline (call or text 988). "
        "You are not alone."
    )
    followup_text = (
        "I want to check in \u2014 how are you doing? "
        "If you need to talk to someone, the 988 Lifeline is available 24/7."
    )
    response_by_level = {
        RiskLevel.CRITICAL: crisis_text,
        RiskLevel.CONCERNING: followup_text,
    }

    return {
        "result": result.to_dict(),
        "show_crisis_overlay": result.needs_crisis_response,
        "log_event": result.needs_followup,
        "response_text": response_by_level.get(result.risk_level),
    }
|
||||
@@ -1,84 +0,0 @@
|
||||
"""Tests for image content screening module."""
|
||||
|
||||
import json
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
from image_screening import (
|
||||
RiskLevel,
|
||||
ImageScreenResult,
|
||||
screen_image,
|
||||
handle_chat_image,
|
||||
_analyze_fallback,
|
||||
)
|
||||
|
||||
|
||||
class TestImageScreenResult:
    """Unit tests for ImageScreenResult derived flags and serialization."""

    def test_safe_result(self):
        # SAFE: no overlay, no follow-up, serializes as "safe".
        result = ImageScreenResult(
            risk_level=RiskLevel.SAFE, confidence=0.95,
            categories=[], reasoning="No indicators", model_used="test"
        )
        assert not result.needs_crisis_response
        assert not result.needs_followup
        assert result.to_dict()["risk_level"] == "safe"

    def test_critical_result(self):
        # CRITICAL: both flags set.
        result = ImageScreenResult(
            risk_level=RiskLevel.CRITICAL, confidence=0.9,
            categories=["wounds"], reasoning="Detected", model_used="test"
        )
        assert result.needs_crisis_response
        assert result.needs_followup

    def test_concerning_result(self):
        # CONCERNING: follow-up only, no crisis overlay.
        result = ImageScreenResult(
            risk_level=RiskLevel.CONCERNING, confidence=0.6,
            categories=["isolation"], reasoning="Ambiguous", model_used="test"
        )
        assert not result.needs_crisis_response
        assert result.needs_followup
|
||||
|
||||
|
||||
class TestScreenImage:
    """Tests for screen_image input handling and the heuristic fallback."""

    def test_fallback_returns_safe(self):
        # Vision disabled: fallback is SAFE with deliberately low confidence.
        result = screen_image(b"fake_image_data", use_vision_model=False)
        assert result.risk_level == RiskLevel.SAFE
        assert result.model_used == "fallback:heuristic"
        assert result.confidence < 0.5

    def test_base64_input(self):
        # A base64 string is accepted in place of raw bytes.
        import base64
        b64 = base64.b64encode(b"fake").decode()
        result = screen_image(b64, use_vision_model=False)
        assert result.risk_level == RiskLevel.SAFE
|
||||
|
||||
|
||||
class TestHandleChatImage:
    """Tests for the gateway action dict built by handle_chat_image."""

    def test_safe_image_no_overlay(self):
        # NOTE(review): this relies on the real Ollama call failing (no
        # server in CI) so the SAFE fallback is used — consider patching
        # _analyze_with_ollama here too to avoid a 30s timeout if a server
        # is reachable.
        action = handle_chat_image(b"safe_image")
        assert not action["show_crisis_overlay"]
        assert action["response_text"] is None

    @patch("image_screening._analyze_with_ollama")
    def test_critical_image_shows_overlay(self, mock_ollama):
        mock_ollama.return_value = ImageScreenResult(
            risk_level=RiskLevel.CRITICAL, confidence=0.95,
            categories=["wounds"], reasoning="Self-harm detected",
            model_used="ollama:gemma3:4b"
        )
        action = handle_chat_image(b"concerning_image")
        # Critical: overlay shown, 988 crisis copy included, event logged.
        assert action["show_crisis_overlay"]
        assert "988" in action["response_text"]
        assert action["log_event"]

    @patch("image_screening._analyze_with_ollama")
    def test_concerning_image_followup(self, mock_ollama):
        mock_ollama.return_value = ImageScreenResult(
            risk_level=RiskLevel.CONCERNING, confidence=0.6,
            categories=["isolation"], reasoning="Empty room",
            model_used="ollama:gemma3:4b"
        )
        action = handle_chat_image(b"maybe_concerning")
        # Concerning: no overlay, but logged with a gentle check-in message.
        assert not action["show_crisis_overlay"]
        assert action["log_event"]
        assert "check in" in action["response_text"]
|
||||
134
tests/test_voice_analysis.py
Normal file
134
tests/test_voice_analysis.py
Normal file
@@ -0,0 +1,134 @@
|
||||
"""Tests for voice message distress analysis (#131)."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from voice_analysis import (
|
||||
VoiceAnalysisResult,
|
||||
compute_speech_rate,
|
||||
compute_distress_score,
|
||||
DISTRESS_THRESHOLDS,
|
||||
NORMAL_SPEECH_RATE,
|
||||
NORMAL_PITCH_VAR,
|
||||
)
|
||||
|
||||
|
||||
class TestDistressScore:
    """Distress score computation from paralinguistic features."""

    def test_normal_speech_no_distress(self):
        score, signals = compute_distress_score(
            speech_rate=140,  # normal
            pitch_variability=50,  # normal
            silence_ratio=0.15,  # normal
            volume_db=-20,  # normal
        )
        assert score < 0.1
        assert not signals

    def test_slow_speech_detected(self):
        score, signals = compute_distress_score(
            speech_rate=60,  # very slow
            pitch_variability=50,
            silence_ratio=0.15,
            volume_db=-20,
        )
        assert score > 0.1
        assert any("slow" in s for s in signals)

    def test_monotone_detected(self):
        score, signals = compute_distress_score(
            speech_rate=140,
            pitch_variability=10,  # very monotone
            silence_ratio=0.15,
            volume_db=-20,
        )
        assert score > 0.1
        assert any("monotone" in s for s in signals)

    def test_long_pauses_detected(self):
        score, signals = compute_distress_score(
            speech_rate=140,
            pitch_variability=50,
            silence_ratio=0.50,  # long pauses (comment was wrong: said "very quiet")
            volume_db=-20,
        )
        assert score > 0.1
        assert any("pause" in s for s in signals)

    def test_quiet_voice_detected(self):
        score, signals = compute_distress_score(
            speech_rate=140,
            pitch_variability=50,
            silence_ratio=0.15,
            volume_db=-45,  # very quiet
        )
        assert score > 0.1
        assert any("quiet" in s for s in signals)

    def test_multiple_signals_compound(self):
        score, signals = compute_distress_score(
            speech_rate=50,  # very slow
            pitch_variability=5,  # very monotone
            silence_ratio=0.55,  # long pauses
            volume_db=-50,  # very quiet
        )
        assert score > 0.5
        assert len(signals) >= 3

    def test_max_score_is_1(self):
        # Extreme inputs must still clamp the composite to <= 1.0.
        score, _ = compute_distress_score(
            speech_rate=0,
            pitch_variability=0,
            silence_ratio=1.0,
            volume_db=-100,
        )
        assert score <= 1.0
|
||||
|
||||
|
||||
class TestSpeechRate:
    """Speech rate computation."""

    def test_normal_rate(self):
        # 100 words in 60 seconds = 100 wpm
        segments = [{"start": 0.0, "end": 60.0, "text": "x"}]
        wpm = compute_speech_rate("word " * 100, segments)
        assert abs(wpm - 100) < 5

    def test_empty_transcript(self):
        # No words -> rate is 0 regardless of segments.
        assert compute_speech_rate("", []) == 0.0

    def test_no_segments(self):
        # Words but no timing info -> rate cannot be computed, return 0.
        assert compute_speech_rate("hello world", []) == 0.0
|
||||
|
||||
|
||||
class TestDistressThresholds:
    """Threshold configuration."""

    def test_thresholds_ordered(self):
        # Levels must be strictly increasing to classify unambiguously.
        assert DISTRESS_THRESHOLDS["low"] < DISTRESS_THRESHOLDS["medium"]
        assert DISTRESS_THRESHOLDS["medium"] < DISTRESS_THRESHOLDS["high"]

    def test_low_is_03(self):
        assert DISTRESS_THRESHOLDS["low"] == 0.3

    def test_high_is_10(self):
        # NOTE(review): high == 1.0 means only a maximal score classifies
        # as "high" (analyze_voice_message uses >=) — confirm intended.
        assert DISTRESS_THRESHOLDS["high"] == 1.0
|
||||
|
||||
|
||||
class TestVoiceAnalysisResult:
    """Result data structure."""

    def test_creation(self):
        # Smoke test: all fields accepted and stored as-is.
        result = VoiceAnalysisResult(
            transcript="hello", speech_rate_wpm=120.0,
            pitch_mean_hz=150.0, pitch_variability=40.0,
            silence_ratio=0.2, volume_db=-20.0,
            volume_variability=5.0, duration_seconds=10.0,
            distress_score=0.1, distress_level="low",
            distress_signals=[],
        )
        assert result.transcript == "hello"
        assert result.distress_level == "low"
        assert not result.distress_signals
|
||||
356
voice_analysis.py
Normal file
356
voice_analysis.py
Normal file
@@ -0,0 +1,356 @@
|
||||
"""Voice message distress analysis — paralinguistic features (#131).
|
||||
|
||||
Analyzes audio (OGG/MP3/WAV) for distress signals using audio
|
||||
features extracted without a neural model — pure DSP analysis.
|
||||
|
||||
Signals detected:
|
||||
- Speech rate (words per minute from timestamps)
|
||||
- Pitch variability (F0 std deviation — monotone = depression indicator)
|
||||
- Silence ratio (long pauses)
|
||||
- Volume dynamics (drops, tremor proxy)
|
||||
|
||||
Uses whisper for transcription + word timestamps. All other features
|
||||
are computed from raw audio via librosa.
|
||||
|
||||
Refs: #131 — Epic #102 (Multimodal Crisis Detection)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from dataclasses import dataclass, asdict
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Dict, Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class VoiceAnalysisResult:
    """Result of voice message paralinguistic analysis.

    Produced by analyze_voice_message; the distress_* fields summarize
    the heuristic scoring done in compute_distress_score.
    """
    transcript: str
    speech_rate_wpm: float  # words per minute
    pitch_mean_hz: float  # mean F0 in Hz
    pitch_variability: float  # F0 standard deviation (low = monotone)
    silence_ratio: float  # fraction of audio that is silence (0-1)
    volume_db: float  # mean volume in dB
    volume_variability: float  # volume std deviation
    duration_seconds: float  # total audio duration
    distress_score: float  # 0-1 composite score
    distress_level: str  # "low", "medium", "high"
    distress_signals: List[str]  # list of detected signals
|
||||
|
||||
|
||||
# Distress score cut-offs used by analyze_voice_message:
# score >= "high" -> high, score >= "medium" -> medium, otherwise low.
# NOTE(review): the "low" entry is not used for classification in this
# module — verify whether callers elsewhere depend on it.
DISTRESS_THRESHOLDS = {
    "low": 0.3,
    "medium": 0.7,
    "high": 1.0,
}

# Paralinguistic distress indicators.
# These are fixed heuristic "normal" ranges — there is no learned model
# here; compute_distress_score flags values outside these bounds.
NORMAL_SPEECH_RATE = (100, 180)  # words per minute
NORMAL_PITCH_VAR = (20, 80)  # F0 std deviation in Hz
NORMAL_SILENCE_RATIO = (0.05, 0.35)  # fraction of silence
NORMAL_VOLUME_DB = (-30, -10)  # dB range
|
||||
|
||||
|
||||
def _ensure_whisper():
|
||||
"""Check if whisper is available."""
|
||||
try:
|
||||
result = subprocess.run(
|
||||
["whisper", "--help"],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
)
|
||||
return True
|
||||
except (FileNotFoundError, subprocess.TimeoutExpired):
|
||||
return False
|
||||
|
||||
|
||||
def _ensure_librosa():
|
||||
"""Check if librosa is available."""
|
||||
try:
|
||||
import librosa
|
||||
return True
|
||||
except ImportError:
|
||||
return False
|
||||
|
||||
|
||||
def transcribe_with_timestamps(audio_path: str) -> Dict[str, Any]:
    """Transcribe audio using whisper and extract word-level timestamps.

    Returns dict with 'text' and 'segments' (list of {start, end, text}).
    Falls back to subprocess whisper if Python whisper not available.
    Returns {"text": "", "segments": []} when both paths fail.
    """
    # Preferred path: in-process whisper with word-level timestamps.
    try:
        import whisper
        model = whisper.load_model("base")
        result = model.transcribe(audio_path, word_timestamps=True)
        return {
            "text": result["text"],
            "segments": [
                {"start": s["start"], "end": s["end"], "text": s["text"]}
                for s in result.get("segments", [])
            ],
        }
    except ImportError:
        pass

    # Fallback: subprocess whisper
    # The temp file is only used to borrow a writable directory for
    # whisper's JSON output; its own contents are never read.
    with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f:
        json_out = f.name

    try:
        subprocess.run(
            ["whisper", audio_path, "--model", "base", "--output_format", "json",
             "--output_dir", os.path.dirname(json_out)],
            capture_output=True, text=True, timeout=120,
        )

        # Whisper outputs to <filename>.json in output_dir
        base = Path(audio_path).stem
        whisper_out = Path(os.path.dirname(json_out)) / f"{base}.json"

        if whisper_out.exists():
            with open(whisper_out) as f:
                data = json.load(f)
            # Remove whisper's output file: no transcript retention on disk.
            os.unlink(whisper_out)
            return {
                "text": data.get("text", ""),
                "segments": [
                    {"start": s["start"], "end": s["end"], "text": s["text"]}
                    for s in data.get("segments", [])
                ],
            }
    except Exception as e:
        logger.warning("Whisper transcription failed: %s", e)
    finally:
        # Clean up the placeholder temp file on every path.
        if os.path.exists(json_out):
            os.unlink(json_out)

    return {"text": "", "segments": []}
|
||||
|
||||
|
||||
def extract_audio_features(audio_path: str) -> Dict[str, float]:
    """Extract paralinguistic features from raw audio using librosa.

    Args:
        audio_path: Path to an audio file readable by librosa.

    Returns:
        Dict with pitch, volume, silence, and duration metrics. All
        values are 0.0 when librosa is unavailable or the file cannot
        be loaded (previously this neutral dict was duplicated twice).
    """
    defaults = {
        "pitch_mean_hz": 0.0, "pitch_variability": 0.0,
        "silence_ratio": 0.0, "volume_db": 0.0, "volume_variability": 0.0,
        "duration_seconds": 0.0,
    }

    try:
        import librosa
        import numpy as np
    except ImportError:
        logger.warning("librosa not available — returning defaults")
        return defaults

    try:
        y, sr = librosa.load(audio_path, sr=None)
    except Exception as e:
        logger.warning("Failed to load audio %s: %s", audio_path, e)
        return defaults

    duration = len(y) / sr

    # Pitch (F0) estimation using pyin; NaN frames are unvoiced.
    try:
        f0, voiced_flag, _ = librosa.pyin(y, fmin=50, fmax=500, sr=sr)
        f0_voiced = f0[~np.isnan(f0)]
        if len(f0_voiced) > 0:
            pitch_mean = float(np.mean(f0_voiced))
            pitch_var = float(np.std(f0_voiced))
        else:
            pitch_mean = 0.0
            pitch_var = 0.0
    except Exception:
        pitch_mean = 0.0
        pitch_var = 0.0

    # Volume (RMS energy) in dB relative to peak; computed once and
    # reused (was computed twice in the original).
    rms = librosa.feature.rms(y=y)[0]
    rms_db = librosa.amplitude_to_db(rms, ref=np.max)
    volume_db = float(rms_db.mean())
    volume_var = float(rms_db.std())

    # Silence ratio: fraction of samples outside non-silent intervals.
    try:
        intervals = librosa.effects.split(y, top_db=30)
        speech_samples = sum(end - start for start, end in intervals)
        silence_ratio = 1.0 - (speech_samples / len(y)) if len(y) > 0 else 0.0
    except Exception:
        silence_ratio = 0.0

    return {
        "pitch_mean_hz": round(pitch_mean, 1),
        "pitch_variability": round(pitch_var, 1),
        "silence_ratio": round(silence_ratio, 3),
        "volume_db": round(volume_db, 1),
        "volume_variability": round(volume_var, 1),
        "duration_seconds": round(duration, 2),
    }
|
||||
|
||||
|
||||
def compute_speech_rate(transcript: str, segments: List[dict]) -> float:
    """Compute words per minute from transcript and timestamps.

    Returns 0.0 when there are no words, no segments, or a degenerate
    (zero or negative) time span.
    """
    word_count = len(transcript.split())
    if word_count == 0 or not segments:
        return 0.0

    span_start = min(seg["start"] for seg in segments)
    span_end = max(seg["end"] for seg in segments)
    elapsed = span_end - span_start
    if elapsed <= 0:
        return 0.0

    return round(word_count / (elapsed / 60.0), 1)
|
||||
|
||||
|
||||
def compute_distress_score(
    speech_rate: float,
    pitch_variability: float,
    silence_ratio: float,
    volume_db: float,
) -> tuple[float, List[str]]:
    """Compute composite distress score from paralinguistic features.

    Returns (score, signals): score is the MAX of the per-feature scores
    (one severe signal is enough to escalate) and signals is the list of
    detected indicators. Zero-valued speech_rate / pitch_variability are
    treated as "unknown" and skipped.
    """
    signals: List[str] = []
    component_scores: List[float] = []
    slow_limit, fast_limit = NORMAL_SPEECH_RATE
    monotone_limit = NORMAL_PITCH_VAR[0]
    pause_limit = NORMAL_SILENCE_RATIO[1]
    quiet_limit = NORMAL_VOLUME_DB[0]

    # Speech rate outside the normal band (< slow_limit or > fast_limit).
    if speech_rate > 0:
        if speech_rate < slow_limit:
            signals.append(f"very_slow_speech ({speech_rate:.0f} wpm)")
            component_scores.append(min(1.0, (slow_limit - speech_rate) / 50))
        elif speech_rate > fast_limit:
            signals.append(f"very_fast_speech ({speech_rate:.0f} wpm)")
            component_scores.append(min(1.0, (speech_rate - fast_limit) / 80))
        else:
            component_scores.append(0.0)

    # Low pitch variability = monotone voice (depression indicator).
    if pitch_variability > 0:
        if pitch_variability < monotone_limit:
            signals.append(f"monotone_voice (F0_var={pitch_variability:.0f}Hz)")
            component_scores.append(min(1.0, (monotone_limit - pitch_variability) / monotone_limit))
        else:
            component_scores.append(0.0)

    # High silence ratio = long pauses.
    if silence_ratio > pause_limit:
        signals.append(f"long_pauses (silence={silence_ratio:.0%})")
        component_scores.append(min(1.0, (silence_ratio - pause_limit) / 0.4))
    else:
        component_scores.append(0.0)

    # Mean volume below the normal floor = very quiet delivery.
    if volume_db < quiet_limit:
        signals.append(f"very_quiet ({volume_db:.0f}dB)")
        component_scores.append(min(1.0, abs(volume_db - quiet_limit) / 20))
    else:
        component_scores.append(0.0)

    # Composite: max, not average — one severe signal is enough.
    score = max(component_scores) if component_scores else 0.0
    return round(score, 3), signals
|
||||
|
||||
|
||||
def analyze_voice_message(audio_path: str) -> VoiceAnalysisResult:
    """Analyze a voice message for distress signals.

    Args:
        audio_path: Path to audio file (OGG, MP3, WAV).

    Returns:
        VoiceAnalysisResult with all paralinguistic features.
    """
    # Transcription (text + segment timestamps) and raw audio features.
    transcription = transcribe_with_timestamps(audio_path)
    features = extract_audio_features(audio_path)

    # Derived metrics: words-per-minute and the composite distress score.
    wpm = compute_speech_rate(transcription["text"], transcription["segments"])
    distress_score, distress_signals = compute_distress_score(
        speech_rate=wpm,
        pitch_variability=features["pitch_variability"],
        silence_ratio=features["silence_ratio"],
        volume_db=features["volume_db"],
    )

    # Map the numeric score onto a coarse level.
    if distress_score >= DISTRESS_THRESHOLDS["high"]:
        level = "high"
    elif distress_score >= DISTRESS_THRESHOLDS["medium"]:
        level = "medium"
    else:
        level = "low"

    return VoiceAnalysisResult(
        transcript=transcription["text"],
        speech_rate_wpm=wpm,
        pitch_mean_hz=features["pitch_mean_hz"],
        pitch_variability=features["pitch_variability"],
        silence_ratio=features["silence_ratio"],
        volume_db=features["volume_db"],
        volume_variability=features["volume_variability"],
        duration_seconds=features["duration_seconds"],
        distress_score=distress_score,
        distress_level=level,
        distress_signals=distress_signals,
    )
|
||||
|
||||
|
||||
def main():
    """CLI entry point: analyze one audio file and print the results.

    Supports `--json` for machine-readable output; exits with status 1
    when the input file does not exist.
    """
    import argparse
    # sys is not imported at module level, but sys.stderr / sys.exit are
    # used below — the original raised NameError on the error path.
    import sys

    p = argparse.ArgumentParser(description="Voice message distress analysis")
    p.add_argument("audio", help="Path to audio file")
    p.add_argument("--json", action="store_true")
    a = p.parse_args()

    if not os.path.exists(a.audio):
        print(f"File not found: {a.audio}", file=sys.stderr)
        sys.exit(1)

    result = analyze_voice_message(a.audio)

    if a.json:
        print(json.dumps(asdict(result), indent=2))
    else:
        print(f"Transcript: {result.transcript[:100]}...")
        print(f"Speech rate: {result.speech_rate_wpm} wpm")
        print(f"Pitch: {result.pitch_mean_hz} Hz (variability: {result.pitch_variability})")
        print(f"Silence: {result.silence_ratio:.0%}")
        print(f"Volume: {result.volume_db} dB")
        print(f"Distress: {result.distress_score:.2f} ({result.distress_level})")
        if result.distress_signals:
            print(f"Signals: {', '.join(result.distress_signals)}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user