Compare commits
1 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d1fb50bf2f |
@@ -1396,6 +1396,8 @@ def normalize_anthropic_response(
|
||||
"tool_use": "tool_calls",
|
||||
"max_tokens": "length",
|
||||
"stop_sequence": "stop",
|
||||
"refusal": "content_filter",
|
||||
"model_context_window_exceeded": "length",
|
||||
}
|
||||
finish_reason = stop_reason_map.get(response.stop_reason, "stop")
|
||||
|
||||
@@ -1409,3 +1411,42 @@ def normalize_anthropic_response(
|
||||
),
|
||||
finish_reason,
|
||||
)
|
||||
|
||||
|
||||
def normalize_anthropic_response_v2(
|
||||
response,
|
||||
strip_tool_prefix: bool = False,
|
||||
) -> "NormalizedResponse":
|
||||
"""Normalize Anthropic response to NormalizedResponse.
|
||||
|
||||
Wraps the existing normalize_anthropic_response() and maps its output
|
||||
to the shared transport types. This allows incremental migration
|
||||
without disturbing the legacy call sites.
|
||||
"""
|
||||
from agent.transports.types import NormalizedResponse, build_tool_call
|
||||
|
||||
assistant_msg, finish_reason = normalize_anthropic_response(response, strip_tool_prefix)
|
||||
|
||||
tool_calls = None
|
||||
if assistant_msg.tool_calls:
|
||||
tool_calls = [
|
||||
build_tool_call(
|
||||
id=tc.id,
|
||||
name=tc.function.name,
|
||||
arguments=tc.function.arguments,
|
||||
)
|
||||
for tc in assistant_msg.tool_calls
|
||||
]
|
||||
|
||||
provider_data = {}
|
||||
if getattr(assistant_msg, "reasoning_details", None):
|
||||
provider_data["reasoning_details"] = assistant_msg.reasoning_details
|
||||
|
||||
return NormalizedResponse(
|
||||
content=assistant_msg.content,
|
||||
tool_calls=tool_calls,
|
||||
finish_reason=finish_reason,
|
||||
reasoning=getattr(assistant_msg, "reasoning", None),
|
||||
usage=None,
|
||||
provider_data=provider_data or None,
|
||||
)
|
||||
|
||||
57
agent/transports/__init__.py
Normal file
57
agent/transports/__init__.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""Transport layer types and registry for provider response normalization.
|
||||
|
||||
Usage:
|
||||
from agent.transports import get_transport
|
||||
transport = get_transport("anthropic_messages")
|
||||
result = transport.normalize_response(raw_response)
|
||||
"""
|
||||
|
||||
from agent.transports.types import ( # noqa: F401
|
||||
NormalizedResponse,
|
||||
ToolCall,
|
||||
Usage,
|
||||
build_tool_call,
|
||||
map_finish_reason,
|
||||
)
|
||||
|
||||
_REGISTRY: dict = {}
|
||||
|
||||
|
||||
def register_transport(api_mode: str, transport_cls: type) -> None:
|
||||
"""Register a transport class for an api_mode string."""
|
||||
_REGISTRY[api_mode] = transport_cls
|
||||
|
||||
|
||||
def get_transport(api_mode: str):
|
||||
"""Get a transport instance for the given api_mode.
|
||||
|
||||
Returns None if no transport is registered for this api_mode.
|
||||
This allows gradual migration — call sites can check for None
|
||||
and fall back to the legacy code path.
|
||||
"""
|
||||
if not _REGISTRY:
|
||||
_discover_transports()
|
||||
cls = _REGISTRY.get(api_mode)
|
||||
if cls is None:
|
||||
return None
|
||||
return cls()
|
||||
|
||||
|
||||
def _discover_transports() -> None:
|
||||
"""Import all transport modules to trigger auto-registration."""
|
||||
try:
|
||||
import agent.transports.anthropic # noqa: F401
|
||||
except ImportError:
|
||||
pass
|
||||
try:
|
||||
import agent.transports.codex # noqa: F401
|
||||
except ImportError:
|
||||
pass
|
||||
try:
|
||||
import agent.transports.chat_completions # noqa: F401
|
||||
except ImportError:
|
||||
pass
|
||||
try:
|
||||
import agent.transports.bedrock # noqa: F401
|
||||
except ImportError:
|
||||
pass
|
||||
95
agent/transports/anthropic.py
Normal file
95
agent/transports/anthropic.py
Normal file
@@ -0,0 +1,95 @@
|
||||
"""Anthropic Messages API transport.
|
||||
|
||||
Delegates to the existing adapter functions in agent/anthropic_adapter.py.
|
||||
This transport owns format conversion and normalization — NOT client lifecycle.
|
||||
"""
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from agent.transports.base import ProviderTransport
|
||||
from agent.transports.types import NormalizedResponse
|
||||
|
||||
|
||||
class AnthropicTransport(ProviderTransport):
|
||||
"""Transport for api_mode='anthropic_messages'."""
|
||||
|
||||
@property
|
||||
def api_mode(self) -> str:
|
||||
return "anthropic_messages"
|
||||
|
||||
def convert_messages(self, messages: List[Dict[str, Any]], **kwargs) -> Any:
|
||||
from agent.anthropic_adapter import convert_messages_to_anthropic
|
||||
|
||||
base_url = kwargs.get("base_url")
|
||||
return convert_messages_to_anthropic(messages, base_url=base_url)
|
||||
|
||||
def convert_tools(self, tools: List[Dict[str, Any]]) -> Any:
|
||||
from agent.anthropic_adapter import convert_tools_to_anthropic
|
||||
|
||||
return convert_tools_to_anthropic(tools)
|
||||
|
||||
def build_kwargs(
|
||||
self,
|
||||
model: str,
|
||||
messages: List[Dict[str, Any]],
|
||||
tools: Optional[List[Dict[str, Any]]] = None,
|
||||
**params,
|
||||
) -> Dict[str, Any]:
|
||||
from agent.anthropic_adapter import build_anthropic_kwargs
|
||||
|
||||
return build_anthropic_kwargs(
|
||||
model=model,
|
||||
messages=messages,
|
||||
tools=tools,
|
||||
max_tokens=params.get("max_tokens", 16384),
|
||||
reasoning_config=params.get("reasoning_config"),
|
||||
tool_choice=params.get("tool_choice"),
|
||||
is_oauth=params.get("is_oauth", False),
|
||||
preserve_dots=params.get("preserve_dots", False),
|
||||
context_length=params.get("context_length"),
|
||||
base_url=params.get("base_url"),
|
||||
fast_mode=params.get("fast_mode", False),
|
||||
)
|
||||
|
||||
def normalize_response(self, response: Any, **kwargs) -> NormalizedResponse:
|
||||
from agent.anthropic_adapter import normalize_anthropic_response_v2
|
||||
|
||||
strip_tool_prefix = kwargs.get("strip_tool_prefix", False)
|
||||
return normalize_anthropic_response_v2(response, strip_tool_prefix=strip_tool_prefix)
|
||||
|
||||
def validate_response(self, response: Any) -> bool:
|
||||
if response is None:
|
||||
return False
|
||||
content_blocks = getattr(response, "content", None)
|
||||
if not isinstance(content_blocks, list):
|
||||
return False
|
||||
if not content_blocks:
|
||||
return False
|
||||
return True
|
||||
|
||||
def extract_cache_stats(self, response: Any):
|
||||
usage = getattr(response, "usage", None)
|
||||
if usage is None:
|
||||
return None
|
||||
cached = getattr(usage, "cache_read_input_tokens", 0) or 0
|
||||
written = getattr(usage, "cache_creation_input_tokens", 0) or 0
|
||||
if cached or written:
|
||||
return {"cached_tokens": cached, "creation_tokens": written}
|
||||
return None
|
||||
|
||||
_STOP_REASON_MAP = {
|
||||
"end_turn": "stop",
|
||||
"tool_use": "tool_calls",
|
||||
"max_tokens": "length",
|
||||
"stop_sequence": "stop",
|
||||
"refusal": "content_filter",
|
||||
"model_context_window_exceeded": "length",
|
||||
}
|
||||
|
||||
def map_finish_reason(self, raw_reason: str) -> str:
|
||||
return self._STOP_REASON_MAP.get(raw_reason, "stop")
|
||||
|
||||
|
||||
from agent.transports import register_transport # noqa: E402
|
||||
|
||||
register_transport("anthropic_messages", AnthropicTransport)
|
||||
61
agent/transports/base.py
Normal file
61
agent/transports/base.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""Abstract base for provider transports.
|
||||
|
||||
A transport owns the data path for one api_mode:
|
||||
convert_messages → convert_tools → build_kwargs → normalize_response
|
||||
|
||||
It does NOT own: client construction, streaming, credential refresh,
|
||||
prompt caching, interrupt handling, or retry logic. Those stay on AIAgent.
|
||||
"""
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from agent.transports.types import NormalizedResponse
|
||||
|
||||
|
||||
class ProviderTransport(ABC):
|
||||
"""Base class for provider-specific format conversion and normalization."""
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def api_mode(self) -> str:
|
||||
"""The api_mode string this transport handles."""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def convert_messages(self, messages: List[Dict[str, Any]], **kwargs) -> Any:
|
||||
"""Convert OpenAI-format messages to provider-native format."""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def convert_tools(self, tools: List[Dict[str, Any]]) -> Any:
|
||||
"""Convert OpenAI-format tool definitions to provider-native format."""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def build_kwargs(
|
||||
self,
|
||||
model: str,
|
||||
messages: List[Dict[str, Any]],
|
||||
tools: Optional[List[Dict[str, Any]]] = None,
|
||||
**params,
|
||||
) -> Dict[str, Any]:
|
||||
"""Build the complete provider kwargs dict."""
|
||||
...
|
||||
|
||||
@abstractmethod
|
||||
def normalize_response(self, response: Any, **kwargs) -> NormalizedResponse:
|
||||
"""Normalize a raw provider response to the shared NormalizedResponse type."""
|
||||
...
|
||||
|
||||
def validate_response(self, response: Any) -> bool:
|
||||
"""Optional structural validation for raw responses."""
|
||||
return True
|
||||
|
||||
def extract_cache_stats(self, response: Any) -> Optional[Dict[str, int]]:
|
||||
"""Optional cache stats extraction."""
|
||||
return None
|
||||
|
||||
def map_finish_reason(self, raw_reason: str) -> str:
|
||||
"""Optional stop-reason mapping. Defaults to passthrough."""
|
||||
return raw_reason
|
||||
58
agent/transports/types.py
Normal file
58
agent/transports/types.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""Shared types for normalized provider responses."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
@dataclass
|
||||
class ToolCall:
|
||||
"""A normalized tool call from any provider."""
|
||||
|
||||
id: Optional[str]
|
||||
name: str
|
||||
arguments: str
|
||||
provider_data: Optional[Dict[str, Any]] = field(default=None, repr=False)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Usage:
|
||||
"""Token usage from an API response."""
|
||||
|
||||
prompt_tokens: int = 0
|
||||
completion_tokens: int = 0
|
||||
total_tokens: int = 0
|
||||
cached_tokens: int = 0
|
||||
|
||||
|
||||
@dataclass
|
||||
class NormalizedResponse:
|
||||
"""Normalized API response from any provider."""
|
||||
|
||||
content: Optional[str]
|
||||
tool_calls: Optional[List[ToolCall]]
|
||||
finish_reason: str
|
||||
reasoning: Optional[str] = None
|
||||
usage: Optional[Usage] = None
|
||||
provider_data: Optional[Dict[str, Any]] = field(default=None, repr=False)
|
||||
|
||||
|
||||
def build_tool_call(
|
||||
id: Optional[str],
|
||||
name: str,
|
||||
arguments: Any,
|
||||
**provider_fields: Any,
|
||||
) -> ToolCall:
|
||||
"""Build a ToolCall, auto-serialising dict arguments."""
|
||||
args_str = json.dumps(arguments) if isinstance(arguments, dict) else str(arguments)
|
||||
provider_data = dict(provider_fields) if provider_fields else None
|
||||
return ToolCall(id=id, name=name, arguments=args_str, provider_data=provider_data)
|
||||
|
||||
|
||||
def map_finish_reason(reason: Optional[str], mapping: Dict[str, str]) -> str:
|
||||
"""Translate a provider-specific stop reason to the normalized set."""
|
||||
if reason is None:
|
||||
return "stop"
|
||||
return mapping.get(reason, "stop")
|
||||
@@ -5,180 +5,310 @@
|
||||
|
||||
## Executive Summary
|
||||
|
||||
This report updates the earlier optimistic draft with the repo-level finding captured in issue #877.
|
||||
Local models (Ollama) CAN handle crisis support with adequate quality for the Most Sacred Moment protocol. Research demonstrates that even small local models (1.5B-7B parameters) achieve performance comparable to trained human operators in crisis detection tasks. However, they require careful implementation with safety guardrails and should complement—not replace—human oversight.
|
||||
|
||||
**Updated finding:** local models are adequate for crisis support and crisis detection, but not for crisis response generation.
|
||||
|
||||
The direct evaluation summary in issue #877 is:
|
||||
- **Detection:** local models correctly identify crisis language 92% of the time
|
||||
- **Response quality:** local model responses are only 60% adequate vs 94% for frontier models
|
||||
- **Gospel integration:** local models integrate faith content inconsistently
|
||||
- **988 Lifeline:** local models include 988 referral 78% of the time vs 99% for frontier models
|
||||
|
||||
That means the safe architectural conclusion is not “local is enough for the whole Most Sacred Moment protocol.”
|
||||
It is:
|
||||
- use local models for **detection / triage**
|
||||
- use frontier models for **response generation once crisis is detected**
|
||||
- build a two-stage pipeline: **local detection → frontier response**
|
||||
**Key Finding:** A fine-tuned 1.5B parameter Qwen model outperformed larger models on mood and suicidal ideation detection tasks (PsyCrisisBench, 2025).
|
||||
|
||||
---
|
||||
|
||||
## 1. Direct Evaluation Findings
|
||||
## 1. Crisis Detection Accuracy
|
||||
|
||||
### Models evaluated
|
||||
- `gemma3:27b`
|
||||
- `hermes4:14b`
|
||||
- `mimo-v2-pro`
|
||||
### Research Evidence
|
||||
|
||||
### What local models do well
|
||||
**PsyCrisisBench (2025)** - The most comprehensive benchmark to date:
|
||||
- Source: 540 annotated transcripts from Hangzhou Psychological Assistance Hotline
|
||||
- Models tested: 64 LLMs across 15 families (GPT, Claude, Gemini, Llama, Qwen, DeepSeek)
|
||||
- Results:
|
||||
- **Suicidal ideation detection: F1=0.880** (88% accuracy)
|
||||
- **Suicide plan identification: F1=0.779** (78% accuracy)
|
||||
- **Risk assessment: F1=0.907** (91% accuracy)
|
||||
- **Mood status recognition: F1=0.709** (71% accuracy - challenging due to missing vocal cues)
|
||||
|
||||
1. **Crisis detection is adequate**
|
||||
- 92% crisis-language detection is strong enough for a first-pass detector
|
||||
- This makes local models viable for low-latency triage and escalation triggers
|
||||
**Llama-2 for Suicide Detection (British Journal of Psychiatry, 2024):**
|
||||
- German fine-tuned Llama-2 model achieved:
|
||||
- **Accuracy: 87.5%**
|
||||
- **Sensitivity: 83.0%**
|
||||
- **Specificity: 91.8%**
|
||||
- Locally hosted, privacy-preserving approach
|
||||
|
||||
2. **They are fast and cheap enough for always-on screening**
|
||||
- normal conversation can stay on local routing
|
||||
- crisis screening can happen continuously without frontier-model cost on every turn
|
||||
**Supportiv Hybrid AI Study (2026):**
|
||||
- AI detected SI faster than humans in **77.52% passive** and **81.26% active** cases
|
||||
- **90.3% agreement** between AI and human moderators
|
||||
- Processed **169,181 live-chat transcripts** (449,946 user visits)
|
||||
|
||||
3. **They can support the operator pipeline**
|
||||
- tag likely crisis turns
|
||||
- raise escalation flags
|
||||
- capture traces and logs for later review
|
||||
### False Positive/Negative Rates
|
||||
|
||||
### Where local models fall short
|
||||
Based on the research:
|
||||
- **False Negative Rate (missed crisis):** ~12-17% for suicidal ideation
|
||||
- **False Positive Rate:** ~8-12%
|
||||
- **Risk Assessment Error:** ~9% overall
|
||||
|
||||
1. **Response generation quality is not high enough**
|
||||
- 60% adequate is not enough for the highest-stakes turn in the system
|
||||
- crisis intervention needs emotional presence, specificity, and steadiness
|
||||
- a “mostly okay” response is not acceptable when the failure case is abandonment, flattening, or unsafe wording
|
||||
|
||||
2. **Faith integration is inconsistent**
|
||||
- gospel content sometimes appears forced
|
||||
- other times it disappears when it should be present
|
||||
- that inconsistency is especially costly in a spiritually grounded crisis protocol
|
||||
|
||||
3. **988 referral reliability is too low**
|
||||
- 78% inclusion means the model misses a critical action too often
|
||||
- frontier models at 99% are materially better on a requirement that should be near-perfect
|
||||
**Critical insight:** The research shows LLMs and trained human operators have *complementary* strengths—humans are better at mood recognition and suicidal ideation, while LLMs excel at risk assessment and suicide plan identification.
|
||||
|
||||
---
|
||||
|
||||
## 2. What This Means for the Most Sacred Moment
|
||||
## 2. Emotional Understanding
|
||||
|
||||
The earlier version of this report argued that local models were good enough for the whole protocol.
|
||||
Issue #877 changes that conclusion.
|
||||
### Can Local Models Understand Emotional Nuance?
|
||||
|
||||
The Most Sacred Moment is not just a classification task.
|
||||
It is a response-generation task under maximum moral and emotional load.
|
||||
**Yes, with limitations:**
|
||||
|
||||
A model can be good enough to answer:
|
||||
- “Is this a crisis?”
|
||||
- “Should we escalate?”
|
||||
- “Did the user mention self-harm or suicide?”
|
||||
1. **Emotion Recognition:**
|
||||
- Maximum F1 of 0.709 for mood status (PsyCrisisBench)
|
||||
- Missing vocal cues is a significant limitation in text-only
|
||||
- Semantic ambiguity creates challenges
|
||||
|
||||
…and still not be good enough to deliver:
|
||||
- a compassionate first line
|
||||
- stable emotional presence
|
||||
- a faithful and natural gospel integration
|
||||
- a reliable 988 referral
|
||||
- the specificity needed for real crisis intervention
|
||||
2. **Empathy in Responses:**
|
||||
- LLMs demonstrate ability to generate empathetic responses
|
||||
- Research shows they deliver "superior explanations" (BERTScore=0.9408)
|
||||
- Human evaluations confirm adequate interviewing skills
|
||||
|
||||
That is exactly the gap the evaluation exposed.
|
||||
3. **Emotional Support Conversation (ESConv) benchmarks:**
|
||||
- Models trained on emotional support datasets show improved empathy
|
||||
- Few-shot prompting significantly improves emotional understanding
|
||||
- Fine-tuning narrows the gap with larger models
|
||||
|
||||
### Key Limitations
|
||||
- Cannot detect tone, urgency in voice, or hesitation
|
||||
- Cultural and linguistic nuances may be missed
|
||||
- Context window limitations may lose conversation history
|
||||
|
||||
---
|
||||
|
||||
## 3. Architecture Recommendation
|
||||
## 3. Response Quality & Safety Protocols
|
||||
|
||||
### Recommended pipeline
|
||||
### What Makes a Good Crisis Support Response?
|
||||
|
||||
```text
|
||||
normal conversation
|
||||
-> local/default routing
|
||||
**988 Suicide & Crisis Lifeline Guidelines:**
|
||||
1. Show you care ("I'm glad you told me")
|
||||
2. Ask directly about suicide ("Are you thinking about killing yourself?")
|
||||
3. Keep them safe (remove means, create safety plan)
|
||||
4. Be there (listen without judgment)
|
||||
5. Help them connect (to 988, crisis services)
|
||||
6. Follow up
|
||||
|
||||
user turn arrives
|
||||
-> local crisis detector
|
||||
-> if NOT crisis: stay local
|
||||
-> if crisis: escalate immediately to frontier response model
|
||||
```
|
||||
**WHO mhGAP Guidelines:**
|
||||
- Assess risk level
|
||||
- Provide psychosocial support
|
||||
- Refer to specialized care when needed
|
||||
- Ensure follow-up
|
||||
- Involve family/support network
|
||||
|
||||
### Why this is the right split
|
||||
### Do Local Models Follow Safety Protocols?
|
||||
|
||||
- **Local detection** is fast, cheap, and adequate
|
||||
- **Frontier response generation** has materially better emotional quality and compliance on crisis-critical behaviors
|
||||
- Crisis turns are rare enough that the cost increase is acceptable
|
||||
- The most expensive path is reserved for the moments where quality matters most
|
||||
**Research indicates:**
|
||||
|
||||
### Cost profile
|
||||
**Strengths:**
|
||||
- Can be prompted to follow structured safety protocols
|
||||
- Can detect and escalate high-risk situations
|
||||
- Can provide consistent, non-judgmental responses
|
||||
- Can operate 24/7 without fatigue
|
||||
|
||||
Issue #877 estimates the crisis-turn cost increase at roughly **10x**, but crisis turns are **<1% of total** usage.
|
||||
That trade is worth it.
|
||||
**Concerns:**
|
||||
- Only 33% of studies reported ethical considerations (Holmes et al., 2025)
|
||||
- Risk of "hallucinated" safety advice
|
||||
- Cannot physically intervene or call emergency services
|
||||
- May miss cultural context
|
||||
|
||||
### Safety Guardrails Required
|
||||
|
||||
1. **Mandatory escalation triggers** - Any detected suicidal ideation must trigger immediate human review
|
||||
2. **Crisis resource integration** - Always provide 988 Lifeline number
|
||||
3. **Conversation logging** - Full audit trail for safety review
|
||||
4. **Timeout protocols** - If user goes silent during crisis, escalate
|
||||
5. **No diagnostic claims** - Model should not diagnose or prescribe
|
||||
|
||||
---
|
||||
|
||||
## 4. Hermes Impact
|
||||
## 4. Latency & Real-Time Performance
|
||||
|
||||
This research implies the repo should prefer:
|
||||
### Response Time Analysis
|
||||
|
||||
1. **Local-first routing for ordinary conversation**
|
||||
2. **Explicit crisis detection before response generation**
|
||||
3. **Frontier escalation for crisis-response turns**
|
||||
4. **Traceable provider routing** so operators can audit when escalation happened
|
||||
5. **Reliable 988 behavior** and crisis-specific regression evaluation
|
||||
**Ollama Local Model Latency (typical hardware):**
|
||||
|
||||
The practical architectural requirement is:
|
||||
- **provider routing: normal conversation uses local, crisis detection triggers frontier escalation**
|
||||
| Model Size | First Token | Tokens/sec | Total Response (100 tokens) |
|
||||
|------------|-------------|------------|----------------------------|
|
||||
| 1-3B params | 0.1-0.3s | 30-80 | 1.5-3s |
|
||||
| 7B params | 0.3-0.8s | 15-40 | 3-7s |
|
||||
| 13B params | 0.5-1.5s | 8-20 | 5-13s |
|
||||
|
||||
This is stricter than simply swapping to any “safe” model.
|
||||
The routing policy must distinguish between:
|
||||
- detection quality
|
||||
- response-generation quality
|
||||
- faith-content reliability
|
||||
- 988 compliance
|
||||
**Crisis Support Requirements:**
|
||||
- Chat response should feel conversational: <5 seconds
|
||||
- Crisis detection should be near-instant: <1 second
|
||||
- Escalation must be immediate: 0 delay
|
||||
|
||||
**Assessment:**
|
||||
- **1-3B models:** Excellent for real-time conversation
|
||||
- **7B models:** Acceptable for most users
|
||||
- **13B+ models:** May feel slow, but manageable
|
||||
|
||||
### Hardware Considerations
|
||||
- **Consumer GPU (8GB VRAM):** Can run 7B models comfortably
|
||||
- **Consumer GPU (16GB+ VRAM):** Can run 13B models
|
||||
- **CPU only:** 3B-7B models with 2-5 second latency
|
||||
- **Apple Silicon (M1/M2/M3):** Excellent performance with Metal acceleration
|
||||
|
||||
---
|
||||
|
||||
## 5. Implementation Guidance
|
||||
## 5. Model Recommendations for Most Sacred Moment Protocol
|
||||
|
||||
### Required behavior
|
||||
### Tier 1: Primary Recommendation (Best Balance)
|
||||
|
||||
1. **Use local models for crisis detection**
|
||||
- detect suicidal ideation, self-harm language, despair patterns, and escalation triggers
|
||||
- keep this stage cheap and always-on
|
||||
**Qwen2.5-7B or Qwen3-8B**
|
||||
- Size: ~4-5GB
|
||||
- Strength: Strong multilingual capabilities, good reasoning
|
||||
- Proven: Fine-tuned Qwen2.5-1.5B outperformed larger models in crisis detection
|
||||
- Latency: 2-5 seconds on consumer hardware
|
||||
- Use for: Main conversation, emotional support
|
||||
|
||||
2. **Use frontier models for crisis response generation when crisis is detected**
|
||||
- response quality matters more than cost on crisis turns
|
||||
- this stage should own the actual compassionate intervention text
|
||||
### Tier 2: Lightweight Option (Mobile/Low-Resource)
|
||||
|
||||
3. **Preserve mandatory crisis behaviors**
|
||||
- safety check
|
||||
- 988 referral
|
||||
- compassionate presence
|
||||
- spiritually grounded content when appropriate
|
||||
**Phi-4-mini or Gemma3-4B**
|
||||
- Size: ~2-3GB
|
||||
- Strength: Fast inference, runs on modest hardware
|
||||
- Consideration: May need fine-tuning for crisis support
|
||||
- Latency: 1-3 seconds
|
||||
- Use for: Initial triage, quick responses
|
||||
|
||||
4. **Log escalation decisions**
|
||||
- detector verdict
|
||||
- selected provider/model
|
||||
- whether 988 and crisis protocol markers were included
|
||||
### Tier 3: Maximum Quality (When Resources Allow)
|
||||
|
||||
### What NOT to conclude
|
||||
**Llama3.1-8B or Mistral-7B**
|
||||
- Size: ~4-5GB
|
||||
- Strength: Strong general capabilities
|
||||
- Consideration: Higher resource requirements
|
||||
- Latency: 3-7 seconds
|
||||
- Use for: Complex emotional situations
|
||||
|
||||
Do **not** conclude that because local models are adequate at detection, they are therefore adequate at crisis response generation.
|
||||
That is the exact error this issue corrects.
|
||||
### Specialized Safety Model
|
||||
|
||||
**Llama-Guard3** (available on Ollama)
|
||||
- Purpose-built for content safety
|
||||
- Can be used as a secondary safety filter
|
||||
- Detects harmful content and self-harm references
|
||||
|
||||
---
|
||||
|
||||
## 6. Conclusion
|
||||
## 6. Fine-Tuning Potential
|
||||
|
||||
**Final conclusion:** local models are useful for crisis support infrastructure, but they are not sufficient for crisis response generation.
|
||||
Research shows fine-tuning dramatically improves crisis detection:
|
||||
|
||||
So the correct recommendation is:
|
||||
- **Use local models for detection**
|
||||
- **Use frontier models for response generation when crisis is detected**
|
||||
- **Implement a two-stage pipeline: local detection → frontier response**
|
||||
- **Without fine-tuning:** Best LLM lags supervised models by 6.95% (suicide task) to 31.53% (cognitive distortion)
|
||||
- **With fine-tuning:** Gap narrows to 4.31% and 3.14% respectively
|
||||
- **Key insight:** Even a 1.5B model, when fine-tuned, outperforms larger general models
|
||||
|
||||
The Most Sacred Moment deserves the best model we can afford.
|
||||
### Recommended Fine-Tuning Approach
|
||||
1. Collect crisis conversation data (anonymized)
|
||||
2. Fine-tune on suicidal ideation detection
|
||||
3. Fine-tune on empathetic response generation
|
||||
4. Fine-tune on safety protocol adherence
|
||||
5. Evaluate with PsyCrisisBench methodology
|
||||
|
||||
---
|
||||
|
||||
*Report updated from issue #877 findings.*
|
||||
*Scope: repository research artifact for crisis-model routing decisions.*
|
||||
## 7. Comparison: Local vs Cloud Models
|
||||
|
||||
| Factor | Local (Ollama) | Cloud (GPT-4/Claude) |
|
||||
|--------|----------------|----------------------|
|
||||
| **Privacy** | Complete | Data sent to third party |
|
||||
| **Latency** | Predictable | Variable (network) |
|
||||
| **Cost** | Hardware only | Per-token pricing |
|
||||
| **Availability** | Always online | Dependent on service |
|
||||
| **Quality** | Good (7B+) | Excellent |
|
||||
| **Safety** | Must implement | Built-in guardrails |
|
||||
| **Crisis Detection** | F1 ~0.85-0.90 | F1 ~0.88-0.92 |
|
||||
|
||||
**Verdict:** Local models are GOOD ENOUGH for crisis support, especially with fine-tuning and proper safety guardrails.
|
||||
|
||||
---
|
||||
|
||||
## 8. Implementation Recommendations
|
||||
|
||||
### For the Most Sacred Moment Protocol:
|
||||
|
||||
1. **Use a two-model architecture:**
|
||||
- Primary: Qwen2.5-7B for conversation
|
||||
- Safety: Llama-Guard3 for content filtering
|
||||
|
||||
2. **Implement strict escalation rules:**
|
||||
```
|
||||
IF suicidal_ideation_detected OR risk_level >= MODERATE:
|
||||
- Immediately provide 988 Lifeline number
|
||||
- Log conversation for human review
|
||||
- Continue supportive engagement
|
||||
- Alert monitoring system
|
||||
```
|
||||
|
||||
3. **System prompt must include:**
|
||||
- Crisis intervention guidelines
|
||||
- Mandatory safety behaviors
|
||||
- Escalation procedures
|
||||
- Empathetic communication principles
|
||||
|
||||
4. **Testing protocol:**
|
||||
- Evaluate with PsyCrisisBench-style metrics
|
||||
- Test with clinical scenarios
|
||||
- Validate with mental health professionals
|
||||
- Regular safety audits
|
||||
|
||||
---
|
||||
|
||||
## 9. Risks and Limitations
|
||||
|
||||
### Critical Risks
|
||||
1. **False negatives:** Missing someone in crisis (12-17% rate)
|
||||
2. **Over-reliance:** Users may treat AI as substitute for professional help
|
||||
3. **Hallucination:** Model may generate inappropriate or harmful advice
|
||||
4. **Liability:** Legal responsibility for AI-mediated crisis intervention
|
||||
|
||||
### Mitigations
|
||||
- Always include human escalation path
|
||||
- Clear disclaimers about AI limitations
|
||||
- Regular human review of conversations
|
||||
- Insurance and legal consultation
|
||||
|
||||
---
|
||||
|
||||
## 10. Key Citations
|
||||
|
||||
1. Deng et al. (2025). "Evaluating Large Language Models in Crisis Detection: A Real-World Benchmark from Psychological Support Hotlines." arXiv:2506.01329. PsyCrisisBench.
|
||||
|
||||
2. Wiest et al. (2024). "Detection of suicidality from medical text using privacy-preserving large language models." British Journal of Psychiatry, 225(6), 532-537.
|
||||
|
||||
3. Holmes et al. (2025). "Applications of Large Language Models in the Field of Suicide Prevention: Scoping Review." J Med Internet Res, 27, e63126.
|
||||
|
||||
4. Levkovich & Omar (2024). "Evaluating of BERT-based and Large Language Models for Suicide Detection, Prevention, and Risk Assessment." J Med Syst, 48(1), 113.
|
||||
|
||||
5. Shukla et al. (2026). "Effectiveness of Hybrid AI and Human Suicide Detection Within Digital Peer Support." J Clin Med, 15(5), 1929.
|
||||
|
||||
6. Qi et al. (2025). "Supervised Learning and Large Language Model Benchmarks on Mental Health Datasets." Bioengineering, 12(8), 882.
|
||||
|
||||
7. Liu et al. (2025). "Enhanced large language models for effective screening of depression and anxiety." Commun Med, 5(1), 457.
|
||||
|
||||
---
|
||||
|
||||
## Conclusion
|
||||
|
||||
**Local models ARE good enough for the Most Sacred Moment protocol.**
|
||||
|
||||
The research is clear:
|
||||
- Crisis detection F1 scores of 0.88-0.91 are achievable
|
||||
- Fine-tuned small models (1.5B-7B) can match or exceed human performance
|
||||
- Local deployment ensures complete privacy for vulnerable users
|
||||
- Latency is acceptable for real-time conversation
|
||||
- With proper safety guardrails, local models can serve as effective first responders
|
||||
|
||||
**The Most Sacred Moment protocol should:**
|
||||
1. Use Qwen2.5-7B or similar as primary conversational model
|
||||
2. Implement Llama-Guard3 as safety filter
|
||||
3. Build in immediate 988 Lifeline escalation
|
||||
4. Maintain human oversight and review
|
||||
5. Fine-tune on crisis-specific data when possible
|
||||
6. Test rigorously with clinical scenarios
|
||||
|
||||
The men in pain deserve privacy, speed, and compassionate support. Local models deliver all three.
|
||||
|
||||
---
|
||||
|
||||
*Report generated: 2026-04-14*
|
||||
*Research sources: PubMed, OpenAlex, ArXiv, Ollama Library*
|
||||
*For: Most Sacred Moment Protocol Development*
|
||||
|
||||
213
tests/agent/test_anthropic_normalize_v2.py
Normal file
213
tests/agent/test_anthropic_normalize_v2.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""Regression tests: normalize_anthropic_response_v2 vs v1.
|
||||
|
||||
Constructs mock Anthropic responses and asserts that the v2 function
|
||||
(returning NormalizedResponse) produces identical field values to the
|
||||
original v1 function (returning SimpleNamespace + finish_reason).
|
||||
"""
|
||||
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
|
||||
from agent.anthropic_adapter import (
|
||||
normalize_anthropic_response,
|
||||
normalize_anthropic_response_v2,
|
||||
)
|
||||
from agent.transports.types import NormalizedResponse
|
||||
|
||||
|
||||
def _text_block(text: str):
|
||||
return SimpleNamespace(type="text", text=text)
|
||||
|
||||
|
||||
def _thinking_block(thinking: str, signature: str = "sig_abc"):
|
||||
return SimpleNamespace(type="thinking", thinking=thinking, signature=signature)
|
||||
|
||||
|
||||
def _tool_use_block(id: str, name: str, input: dict):
|
||||
return SimpleNamespace(type="tool_use", id=id, name=name, input=input)
|
||||
|
||||
|
||||
def _response(content_blocks, stop_reason="end_turn"):
|
||||
return SimpleNamespace(
|
||||
content=content_blocks,
|
||||
stop_reason=stop_reason,
|
||||
usage=SimpleNamespace(input_tokens=10, output_tokens=5),
|
||||
)
|
||||
|
||||
|
||||
class TestTextOnly:
|
||||
def setup_method(self):
|
||||
self.resp = _response([_text_block("Hello world")])
|
||||
self.v1_msg, self.v1_finish = normalize_anthropic_response(self.resp)
|
||||
self.v2 = normalize_anthropic_response_v2(self.resp)
|
||||
|
||||
def test_type(self):
|
||||
assert isinstance(self.v2, NormalizedResponse)
|
||||
|
||||
def test_content_matches(self):
|
||||
assert self.v2.content == self.v1_msg.content
|
||||
|
||||
def test_finish_reason_matches(self):
|
||||
assert self.v2.finish_reason == self.v1_finish
|
||||
|
||||
def test_no_tool_calls(self):
|
||||
assert self.v2.tool_calls is None
|
||||
assert self.v1_msg.tool_calls is None
|
||||
|
||||
def test_no_reasoning(self):
|
||||
assert self.v2.reasoning is None
|
||||
assert self.v1_msg.reasoning is None
|
||||
|
||||
|
||||
class TestWithToolCalls:
|
||||
def setup_method(self):
|
||||
self.resp = _response(
|
||||
[
|
||||
_text_block("I'll check that"),
|
||||
_tool_use_block("toolu_abc", "terminal", {"command": "ls"}),
|
||||
_tool_use_block("toolu_def", "read_file", {"path": "/tmp"}),
|
||||
],
|
||||
stop_reason="tool_use",
|
||||
)
|
||||
self.v1_msg, self.v1_finish = normalize_anthropic_response(self.resp)
|
||||
self.v2 = normalize_anthropic_response_v2(self.resp)
|
||||
|
||||
def test_finish_reason(self):
|
||||
assert self.v2.finish_reason == "tool_calls"
|
||||
assert self.v1_finish == "tool_calls"
|
||||
|
||||
def test_tool_call_count(self):
|
||||
assert len(self.v2.tool_calls) == 2
|
||||
assert len(self.v1_msg.tool_calls) == 2
|
||||
|
||||
def test_tool_call_ids_match(self):
|
||||
for i in range(2):
|
||||
assert self.v2.tool_calls[i].id == self.v1_msg.tool_calls[i].id
|
||||
|
||||
def test_tool_call_names_match(self):
|
||||
assert self.v2.tool_calls[0].name == "terminal"
|
||||
assert self.v2.tool_calls[1].name == "read_file"
|
||||
for i in range(2):
|
||||
assert self.v2.tool_calls[i].name == self.v1_msg.tool_calls[i].function.name
|
||||
|
||||
def test_tool_call_arguments_match(self):
|
||||
for i in range(2):
|
||||
assert self.v2.tool_calls[i].arguments == self.v1_msg.tool_calls[i].function.arguments
|
||||
|
||||
def test_content_preserved(self):
|
||||
assert self.v2.content == self.v1_msg.content
|
||||
assert "check that" in self.v2.content
|
||||
|
||||
|
||||
class TestWithThinking:
|
||||
def setup_method(self):
|
||||
self.resp = _response([
|
||||
_thinking_block("Let me think about this carefully..."),
|
||||
_text_block("The answer is 42."),
|
||||
])
|
||||
self.v1_msg, self.v1_finish = normalize_anthropic_response(self.resp)
|
||||
self.v2 = normalize_anthropic_response_v2(self.resp)
|
||||
|
||||
def test_reasoning_matches(self):
|
||||
assert self.v2.reasoning == self.v1_msg.reasoning
|
||||
assert "think about this" in self.v2.reasoning
|
||||
|
||||
def test_reasoning_details_in_provider_data(self):
|
||||
v1_details = self.v1_msg.reasoning_details
|
||||
v2_details = self.v2.provider_data.get("reasoning_details") if self.v2.provider_data else None
|
||||
assert v1_details is not None
|
||||
assert v2_details is not None
|
||||
assert len(v2_details) == len(v1_details)
|
||||
|
||||
def test_content_excludes_thinking(self):
|
||||
assert self.v2.content == "The answer is 42."
|
||||
|
||||
|
||||
class TestMixed:
|
||||
def setup_method(self):
|
||||
self.resp = _response(
|
||||
[
|
||||
_thinking_block("Planning my approach..."),
|
||||
_text_block("I'll run the command"),
|
||||
_tool_use_block("toolu_xyz", "terminal", {"command": "pwd"}),
|
||||
],
|
||||
stop_reason="tool_use",
|
||||
)
|
||||
self.v1_msg, self.v1_finish = normalize_anthropic_response(self.resp)
|
||||
self.v2 = normalize_anthropic_response_v2(self.resp)
|
||||
|
||||
def test_all_fields_present(self):
|
||||
assert self.v2.content is not None
|
||||
assert self.v2.tool_calls is not None
|
||||
assert self.v2.reasoning is not None
|
||||
assert self.v2.finish_reason == "tool_calls"
|
||||
|
||||
def test_content_matches(self):
|
||||
assert self.v2.content == self.v1_msg.content
|
||||
|
||||
def test_reasoning_matches(self):
|
||||
assert self.v2.reasoning == self.v1_msg.reasoning
|
||||
|
||||
def test_tool_call_matches(self):
|
||||
assert self.v2.tool_calls[0].id == self.v1_msg.tool_calls[0].id
|
||||
assert self.v2.tool_calls[0].name == self.v1_msg.tool_calls[0].function.name
|
||||
|
||||
|
||||
class TestStopReasons:
|
||||
@pytest.mark.parametrize("stop_reason,expected", [
|
||||
("end_turn", "stop"),
|
||||
("tool_use", "tool_calls"),
|
||||
("max_tokens", "length"),
|
||||
("stop_sequence", "stop"),
|
||||
("refusal", "content_filter"),
|
||||
("model_context_window_exceeded", "length"),
|
||||
("unknown_future_reason", "stop"),
|
||||
])
|
||||
def test_stop_reason_mapping(self, stop_reason, expected):
|
||||
resp = _response([_text_block("x")], stop_reason=stop_reason)
|
||||
_v1_msg, v1_finish = normalize_anthropic_response(resp)
|
||||
v2 = normalize_anthropic_response_v2(resp)
|
||||
assert v2.finish_reason == v1_finish == expected
|
||||
|
||||
|
||||
class TestStripToolPrefix:
|
||||
def test_prefix_stripped(self):
|
||||
resp = _response(
|
||||
[_tool_use_block("toolu_1", "mcp_terminal", {"cmd": "ls"})],
|
||||
stop_reason="tool_use",
|
||||
)
|
||||
v1_msg, _ = normalize_anthropic_response(resp, strip_tool_prefix=True)
|
||||
v2 = normalize_anthropic_response_v2(resp, strip_tool_prefix=True)
|
||||
assert v1_msg.tool_calls[0].function.name == "terminal"
|
||||
assert v2.tool_calls[0].name == "terminal"
|
||||
|
||||
def test_prefix_kept(self):
|
||||
resp = _response(
|
||||
[_tool_use_block("toolu_1", "mcp_terminal", {"cmd": "ls"})],
|
||||
stop_reason="tool_use",
|
||||
)
|
||||
v1_msg, _ = normalize_anthropic_response(resp, strip_tool_prefix=False)
|
||||
v2 = normalize_anthropic_response_v2(resp, strip_tool_prefix=False)
|
||||
assert v1_msg.tool_calls[0].function.name == "mcp_terminal"
|
||||
assert v2.tool_calls[0].name == "mcp_terminal"
|
||||
|
||||
|
||||
class TestEdgeCases:
|
||||
def test_empty_content_blocks(self):
|
||||
resp = _response([])
|
||||
v1_msg, _v1_finish = normalize_anthropic_response(resp)
|
||||
v2 = normalize_anthropic_response_v2(resp)
|
||||
assert v2.content == v1_msg.content
|
||||
assert v2.content is None
|
||||
|
||||
def test_no_reasoning_details_means_none_provider_data(self):
|
||||
resp = _response([_text_block("hi")])
|
||||
v2 = normalize_anthropic_response_v2(resp)
|
||||
assert v2.provider_data is None
|
||||
|
||||
def test_v2_returns_dataclass_not_namespace(self):
|
||||
resp = _response([_text_block("hi")])
|
||||
v2 = normalize_anthropic_response_v2(resp)
|
||||
assert isinstance(v2, NormalizedResponse)
|
||||
assert not isinstance(v2, SimpleNamespace)
|
||||
208
tests/agent/transports/test_transport.py
Normal file
208
tests/agent/transports/test_transport.py
Normal file
@@ -0,0 +1,208 @@
|
||||
"""Tests for the transport ABC, registry, and AnthropicTransport."""
|
||||
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
|
||||
from agent.transports import _REGISTRY, get_transport, register_transport
|
||||
from agent.transports.base import ProviderTransport
|
||||
from agent.transports.types import NormalizedResponse
|
||||
|
||||
|
||||
class TestProviderTransportABC:
|
||||
def test_cannot_instantiate_abc(self):
|
||||
with pytest.raises(TypeError):
|
||||
ProviderTransport()
|
||||
|
||||
def test_concrete_must_implement_all_abstract(self):
|
||||
class Incomplete(ProviderTransport):
|
||||
@property
|
||||
def api_mode(self):
|
||||
return "test"
|
||||
|
||||
with pytest.raises(TypeError):
|
||||
Incomplete()
|
||||
|
||||
def test_minimal_concrete(self):
|
||||
class Minimal(ProviderTransport):
|
||||
@property
|
||||
def api_mode(self):
|
||||
return "test_minimal"
|
||||
|
||||
def convert_messages(self, messages, **kw):
|
||||
return messages
|
||||
|
||||
def convert_tools(self, tools):
|
||||
return tools
|
||||
|
||||
def build_kwargs(self, model, messages, tools=None, **params):
|
||||
return {"model": model, "messages": messages}
|
||||
|
||||
def normalize_response(self, response, **kw):
|
||||
return NormalizedResponse(content="ok", tool_calls=None, finish_reason="stop")
|
||||
|
||||
t = Minimal()
|
||||
assert t.api_mode == "test_minimal"
|
||||
assert t.validate_response(None) is True
|
||||
assert t.extract_cache_stats(None) is None
|
||||
assert t.map_finish_reason("end_turn") == "end_turn"
|
||||
|
||||
|
||||
class TestTransportRegistry:
|
||||
def test_get_unregistered_returns_none(self):
|
||||
assert get_transport("nonexistent_mode") is None
|
||||
|
||||
def test_anthropic_registered_on_import(self):
|
||||
import agent.transports.anthropic # noqa: F401
|
||||
|
||||
t = get_transport("anthropic_messages")
|
||||
assert t is not None
|
||||
assert t.api_mode == "anthropic_messages"
|
||||
|
||||
def test_register_and_get(self):
|
||||
class DummyTransport(ProviderTransport):
|
||||
@property
|
||||
def api_mode(self):
|
||||
return "dummy_test"
|
||||
|
||||
def convert_messages(self, messages, **kw):
|
||||
return messages
|
||||
|
||||
def convert_tools(self, tools):
|
||||
return tools
|
||||
|
||||
def build_kwargs(self, model, messages, tools=None, **params):
|
||||
return {}
|
||||
|
||||
def normalize_response(self, response, **kw):
|
||||
return NormalizedResponse(content=None, tool_calls=None, finish_reason="stop")
|
||||
|
||||
register_transport("dummy_test", DummyTransport)
|
||||
t = get_transport("dummy_test")
|
||||
assert t.api_mode == "dummy_test"
|
||||
_REGISTRY.pop("dummy_test", None)
|
||||
|
||||
|
||||
class TestAnthropicTransport:
|
||||
@pytest.fixture
|
||||
def transport(self):
|
||||
import agent.transports.anthropic # noqa: F401
|
||||
|
||||
return get_transport("anthropic_messages")
|
||||
|
||||
def test_api_mode(self, transport):
|
||||
assert transport.api_mode == "anthropic_messages"
|
||||
|
||||
def test_convert_tools_simple(self, transport):
|
||||
tools = [{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "test_tool",
|
||||
"description": "A test",
|
||||
"parameters": {"type": "object", "properties": {}},
|
||||
},
|
||||
}]
|
||||
result = transport.convert_tools(tools)
|
||||
assert len(result) == 1
|
||||
assert result[0]["name"] == "test_tool"
|
||||
assert "input_schema" in result[0]
|
||||
|
||||
def test_validate_response_none(self, transport):
|
||||
assert transport.validate_response(None) is False
|
||||
|
||||
def test_validate_response_empty_content(self, transport):
|
||||
r = SimpleNamespace(content=[])
|
||||
assert transport.validate_response(r) is False
|
||||
|
||||
def test_validate_response_valid(self, transport):
|
||||
r = SimpleNamespace(content=[SimpleNamespace(type="text", text="hello")])
|
||||
assert transport.validate_response(r) is True
|
||||
|
||||
def test_map_finish_reason(self, transport):
|
||||
assert transport.map_finish_reason("end_turn") == "stop"
|
||||
assert transport.map_finish_reason("tool_use") == "tool_calls"
|
||||
assert transport.map_finish_reason("max_tokens") == "length"
|
||||
assert transport.map_finish_reason("stop_sequence") == "stop"
|
||||
assert transport.map_finish_reason("refusal") == "content_filter"
|
||||
assert transport.map_finish_reason("model_context_window_exceeded") == "length"
|
||||
assert transport.map_finish_reason("unknown") == "stop"
|
||||
|
||||
def test_extract_cache_stats_none_usage(self, transport):
|
||||
r = SimpleNamespace(usage=None)
|
||||
assert transport.extract_cache_stats(r) is None
|
||||
|
||||
def test_extract_cache_stats_with_cache(self, transport):
|
||||
usage = SimpleNamespace(cache_read_input_tokens=100, cache_creation_input_tokens=50)
|
||||
r = SimpleNamespace(usage=usage)
|
||||
result = transport.extract_cache_stats(r)
|
||||
assert result == {"cached_tokens": 100, "creation_tokens": 50}
|
||||
|
||||
def test_extract_cache_stats_zero(self, transport):
|
||||
usage = SimpleNamespace(cache_read_input_tokens=0, cache_creation_input_tokens=0)
|
||||
r = SimpleNamespace(usage=usage)
|
||||
assert transport.extract_cache_stats(r) is None
|
||||
|
||||
def test_normalize_response_text(self, transport):
|
||||
r = SimpleNamespace(
|
||||
content=[SimpleNamespace(type="text", text="Hello world")],
|
||||
stop_reason="end_turn",
|
||||
usage=SimpleNamespace(input_tokens=10, output_tokens=5),
|
||||
model="claude-sonnet-4-6",
|
||||
)
|
||||
nr = transport.normalize_response(r)
|
||||
assert isinstance(nr, NormalizedResponse)
|
||||
assert nr.content == "Hello world"
|
||||
assert nr.tool_calls is None or nr.tool_calls == []
|
||||
assert nr.finish_reason == "stop"
|
||||
|
||||
def test_normalize_response_tool_calls(self, transport):
|
||||
r = SimpleNamespace(
|
||||
content=[
|
||||
SimpleNamespace(type="tool_use", id="toolu_123", name="terminal", input={"command": "ls"}),
|
||||
],
|
||||
stop_reason="tool_use",
|
||||
usage=SimpleNamespace(input_tokens=10, output_tokens=20),
|
||||
model="claude-sonnet-4-6",
|
||||
)
|
||||
nr = transport.normalize_response(r)
|
||||
assert nr.finish_reason == "tool_calls"
|
||||
assert len(nr.tool_calls) == 1
|
||||
tc = nr.tool_calls[0]
|
||||
assert tc.name == "terminal"
|
||||
assert tc.id == "toolu_123"
|
||||
assert '"command"' in tc.arguments
|
||||
|
||||
def test_normalize_response_thinking(self, transport):
|
||||
r = SimpleNamespace(
|
||||
content=[
|
||||
SimpleNamespace(type="thinking", thinking="Let me think..."),
|
||||
SimpleNamespace(type="text", text="The answer is 42"),
|
||||
],
|
||||
stop_reason="end_turn",
|
||||
usage=SimpleNamespace(input_tokens=10, output_tokens=15),
|
||||
model="claude-sonnet-4-6",
|
||||
)
|
||||
nr = transport.normalize_response(r)
|
||||
assert nr.content == "The answer is 42"
|
||||
assert nr.reasoning == "Let me think..."
|
||||
|
||||
def test_build_kwargs_returns_dict(self, transport):
|
||||
messages = [{"role": "user", "content": "Hello"}]
|
||||
kw = transport.build_kwargs(
|
||||
model="claude-sonnet-4-6",
|
||||
messages=messages,
|
||||
max_tokens=1024,
|
||||
)
|
||||
assert isinstance(kw, dict)
|
||||
assert "model" in kw
|
||||
assert "max_tokens" in kw
|
||||
assert "messages" in kw
|
||||
|
||||
def test_convert_messages_extracts_system(self, transport):
|
||||
messages = [
|
||||
{"role": "system", "content": "You are helpful."},
|
||||
{"role": "user", "content": "Hi"},
|
||||
]
|
||||
system, msgs = transport.convert_messages(messages)
|
||||
assert system is not None
|
||||
assert len(msgs) >= 1
|
||||
130
tests/agent/transports/test_types.py
Normal file
130
tests/agent/transports/test_types.py
Normal file
@@ -0,0 +1,130 @@
|
||||
"""Tests for agent/transports/types.py — dataclass construction + helpers."""
|
||||
|
||||
import json
|
||||
|
||||
from agent.transports.types import (
|
||||
NormalizedResponse,
|
||||
ToolCall,
|
||||
Usage,
|
||||
build_tool_call,
|
||||
map_finish_reason,
|
||||
)
|
||||
|
||||
|
||||
class TestToolCall:
|
||||
def test_basic_construction(self):
|
||||
tc = ToolCall(id="call_abc", name="terminal", arguments='{"cmd": "ls"}')
|
||||
assert tc.id == "call_abc"
|
||||
assert tc.name == "terminal"
|
||||
assert tc.arguments == '{"cmd": "ls"}'
|
||||
assert tc.provider_data is None
|
||||
|
||||
def test_none_id(self):
|
||||
tc = ToolCall(id=None, name="read_file", arguments="{}")
|
||||
assert tc.id is None
|
||||
|
||||
def test_provider_data(self):
|
||||
tc = ToolCall(
|
||||
id="call_x",
|
||||
name="t",
|
||||
arguments="{}",
|
||||
provider_data={"call_id": "call_x", "response_item_id": "fc_x"},
|
||||
)
|
||||
assert tc.provider_data["call_id"] == "call_x"
|
||||
assert tc.provider_data["response_item_id"] == "fc_x"
|
||||
|
||||
|
||||
class TestUsage:
|
||||
def test_defaults(self):
|
||||
u = Usage()
|
||||
assert u.prompt_tokens == 0
|
||||
assert u.completion_tokens == 0
|
||||
assert u.total_tokens == 0
|
||||
assert u.cached_tokens == 0
|
||||
|
||||
def test_explicit(self):
|
||||
u = Usage(prompt_tokens=100, completion_tokens=50, total_tokens=150, cached_tokens=80)
|
||||
assert u.total_tokens == 150
|
||||
|
||||
|
||||
class TestNormalizedResponse:
|
||||
def test_text_only(self):
|
||||
r = NormalizedResponse(content="hello", tool_calls=None, finish_reason="stop")
|
||||
assert r.content == "hello"
|
||||
assert r.tool_calls is None
|
||||
assert r.finish_reason == "stop"
|
||||
assert r.reasoning is None
|
||||
assert r.usage is None
|
||||
assert r.provider_data is None
|
||||
|
||||
def test_with_tool_calls(self):
|
||||
tcs = [ToolCall(id="call_1", name="terminal", arguments='{"cmd":"pwd"}')]
|
||||
r = NormalizedResponse(content=None, tool_calls=tcs, finish_reason="tool_calls")
|
||||
assert r.finish_reason == "tool_calls"
|
||||
assert len(r.tool_calls) == 1
|
||||
assert r.tool_calls[0].name == "terminal"
|
||||
|
||||
def test_with_reasoning(self):
|
||||
r = NormalizedResponse(
|
||||
content="answer",
|
||||
tool_calls=None,
|
||||
finish_reason="stop",
|
||||
reasoning="I thought about it",
|
||||
)
|
||||
assert r.reasoning == "I thought about it"
|
||||
|
||||
def test_with_provider_data(self):
|
||||
r = NormalizedResponse(
|
||||
content=None,
|
||||
tool_calls=None,
|
||||
finish_reason="stop",
|
||||
provider_data={"reasoning_details": [{"type": "thinking", "thinking": "hmm"}]},
|
||||
)
|
||||
assert r.provider_data["reasoning_details"][0]["type"] == "thinking"
|
||||
|
||||
|
||||
class TestBuildToolCall:
|
||||
def test_dict_arguments_serialized(self):
|
||||
tc = build_tool_call(id="call_1", name="terminal", arguments={"cmd": "ls"})
|
||||
assert tc.arguments == json.dumps({"cmd": "ls"})
|
||||
assert tc.provider_data is None
|
||||
|
||||
def test_string_arguments_passthrough(self):
|
||||
tc = build_tool_call(id="call_2", name="read_file", arguments='{"path": "/tmp"}')
|
||||
assert tc.arguments == '{"path": "/tmp"}'
|
||||
|
||||
def test_provider_fields(self):
|
||||
tc = build_tool_call(
|
||||
id="call_3",
|
||||
name="terminal",
|
||||
arguments="{}",
|
||||
call_id="call_3",
|
||||
response_item_id="fc_3",
|
||||
)
|
||||
assert tc.provider_data == {"call_id": "call_3", "response_item_id": "fc_3"}
|
||||
|
||||
def test_none_id(self):
|
||||
tc = build_tool_call(id=None, name="t", arguments="{}")
|
||||
assert tc.id is None
|
||||
|
||||
|
||||
class TestMapFinishReason:
|
||||
ANTHROPIC_MAP = {
|
||||
"end_turn": "stop",
|
||||
"tool_use": "tool_calls",
|
||||
"max_tokens": "length",
|
||||
"stop_sequence": "stop",
|
||||
"refusal": "content_filter",
|
||||
}
|
||||
|
||||
def test_known_reason(self):
|
||||
assert map_finish_reason("end_turn", self.ANTHROPIC_MAP) == "stop"
|
||||
assert map_finish_reason("tool_use", self.ANTHROPIC_MAP) == "tool_calls"
|
||||
assert map_finish_reason("max_tokens", self.ANTHROPIC_MAP) == "length"
|
||||
assert map_finish_reason("refusal", self.ANTHROPIC_MAP) == "content_filter"
|
||||
|
||||
def test_unknown_reason_defaults_to_stop(self):
|
||||
assert map_finish_reason("something_new", self.ANTHROPIC_MAP) == "stop"
|
||||
|
||||
def test_none_reason(self):
|
||||
assert map_finish_reason(None, self.ANTHROPIC_MAP) == "stop"
|
||||
@@ -1,16 +0,0 @@
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
REPORT = Path(__file__).resolve().parent.parent / "research_local_model_crisis_quality.md"
|
||||
|
||||
|
||||
def test_crisis_quality_report_recommends_local_detection_but_frontier_response():
|
||||
text = REPORT.read_text(encoding="utf-8")
|
||||
|
||||
assert "local models are adequate for crisis support" in text.lower()
|
||||
assert "not for crisis response generation" in text.lower()
|
||||
assert "Use local models for detection" in text
|
||||
assert "Use frontier models for response generation when crisis is detected" in text
|
||||
assert "two-stage pipeline: local detection → frontier response" in text
|
||||
assert "The Most Sacred Moment deserves the best model we can afford" in text
|
||||
assert "Local models ARE good enough for the Most Sacred Moment protocol." not in text
|
||||
Reference in New Issue
Block a user