Co-authored-by: Claude (Opus 4.6) <claude@hermes.local>
Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
This commit was merged in pull request #1089.
@@ -67,6 +67,29 @@ providers:
         capabilities: [text, creative, streaming]
         description: "Dolphin 3.0 8B with Morrowind system prompt and higher temperature"
 
+  # Secondary: vllm-mlx (OpenAI-compatible local backend, 25–50% faster than Ollama on Apple Silicon)
+  # Evaluation results (EuroMLSys '26 / M3 Ultra benchmarks):
+  #   - 21–87% higher throughput than llama.cpp across configurations
+  #   - +38% to +59% speed advantage vs Ollama on M3 Ultra for Qwen3-14B
+  #   - ~15% lower memory usage than Ollama
+  #   - Full OpenAI-compatible API — tool calling works identically
+  # Recommendation: use over Ollama when throughput matters and Apple Silicon is available.
+  # Stay on Ollama for broadest ecosystem compatibility and simpler setup.
+  # To enable: start the vllm-mlx server (`python -m vllm.entrypoints.openai.api_server
+  #   --model Qwen/Qwen2.5-14B-Instruct-MLX --port 8000`), then set enabled: true.
+  - name: vllm-mlx-local
+    type: vllm_mlx
+    enabled: false  # Enable when the vllm-mlx server is running
+    priority: 2
+    base_url: "http://localhost:8000/v1"
+    models:
+      - name: Qwen/Qwen2.5-14B-Instruct-MLX
+        default: true
+        context_window: 32000
+        capabilities: [text, tools, json, streaming]
+      - name: mlx-community/Qwen2.5-7B-Instruct-4bit
+        context_window: 32000
+        capabilities: [text, tools, json, streaming]
 
   # Tertiary: OpenAI (if API key available)
   - name: openai-backup
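Reviewer note: before flipping `enabled: true`, the two endpoints the router relies on can be probed by hand. A minimal pre-flight sketch, not part of this change; it assumes the server from the comment above is listening on localhost:8000, and that the standard OpenAI-compatible /v1/models listing is served:

    import requests

    BASE = "http://localhost:8000"

    # The health endpoint lives at the server root, not under /v1.
    print("health:", requests.get(f"{BASE}/health", timeout=5).status_code)  # expect 200

    # The OpenAI-compatible model list should include the configured model.
    data = requests.get(f"{BASE}/v1/models", timeout=5).json()
    print("served:", [m["id"] for m in data.get("data", [])])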
@@ -25,18 +25,17 @@ import logging
 import subprocess
 import urllib.request
 from dataclasses import dataclass
-from datetime import datetime, timezone
-from enum import Enum
-from typing import Optional
+from datetime import UTC, datetime
+from enum import StrEnum
 
 logger = logging.getLogger(__name__)
 
 
-class MetabolicTier(str, Enum):
+class MetabolicTier(StrEnum):
     """The three-tier metabolic protocol from the Timmy Time architecture."""
 
-    BURST = "burst"  # Cloud API (Claude/Groq) — expensive, best quality
-    ACTIVE = "active"  # Local 14B (Qwen3-14B) — free, good quality
+    BURST = "burst"      # Cloud API (Claude/Groq) — expensive, best quality
+    ACTIVE = "active"    # Local 14B (Qwen3-14B) — free, good quality
     RESTING = "resting"  # Local 8B (Qwen3-8B) — free, fast, adequate
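Reviewer note: the `(str, Enum)` to `StrEnum` migration preserves the equality comparisons this module performs, and it also changes stringification for the better. A quick sketch (Python 3.11+, where `StrEnum` was added):

    from enum import Enum, StrEnum

    class OldTier(str, Enum):
        BURST = "burst"

    class NewTier(StrEnum):
        BURST = "burst"

    assert OldTier.BURST == "burst"               # both compare equal to raw strings
    assert NewTier.BURST == "burst"
    assert str(NewTier.BURST) == "burst"          # StrEnum stringifies to the bare value
    assert str(OldTier.BURST) == "OldTier.BURST"  # (str, Enum) keeps the qualified name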
@@ -44,10 +43,10 @@ class MetabolicTier(str, Enum):
 class QuotaStatus:
     """Current Claude quota state."""
 
-    five_hour_utilization: float  # 0.0 to 1.0
-    five_hour_resets_at: Optional[str]
-    seven_day_utilization: float  # 0.0 to 1.0
-    seven_day_resets_at: Optional[str]
+    five_hour_utilization: float  # 0.0 to 1.0
+    five_hour_resets_at: str | None
+    seven_day_utilization: float  # 0.0 to 1.0
+    seven_day_resets_at: str | None
     raw_response: dict
     fetched_at: datetime
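Reviewer note: the `Optional[str]` to `str | None` rewrites throughout this file are pure spelling changes; PEP 604 unions compare equal to their `typing` counterparts at runtime:

    from typing import Optional

    assert (str | None) == Optional[str]  # same annotation, newer syntax (Python 3.10+)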
@@ -101,11 +100,11 @@ class QuotaMonitor:
     USER_AGENT = "claude-code/2.0.32"
 
     def __init__(self) -> None:
-        self._token: Optional[str] = None
-        self._last_status: Optional[QuotaStatus] = None
+        self._token: str | None = None
+        self._last_status: QuotaStatus | None = None
         self._cache_seconds = 30  # Don't hammer the API
 
-    def _get_token(self) -> Optional[str]:
+    def _get_token(self) -> str | None:
         """Extract OAuth token from macOS Keychain."""
         if self._token:
             return self._token
@@ -126,11 +125,16 @@ class QuotaMonitor:
             self._token = oauth.get("accessToken")
             return self._token
 
-        except (json.JSONDecodeError, KeyError, FileNotFoundError, subprocess.TimeoutExpired) as exc:
+        except (
+            json.JSONDecodeError,
+            KeyError,
+            FileNotFoundError,
+            subprocess.TimeoutExpired,
+        ) as exc:
             logger.warning("Could not read Claude Code credentials: %s", exc)
             return None
 
-    def check(self, force: bool = False) -> Optional[QuotaStatus]:
+    def check(self, force: bool = False) -> QuotaStatus | None:
         """
         Fetch current quota status.
@@ -139,7 +143,7 @@ class QuotaMonitor:
         """
         # Return cached if fresh
         if not force and self._last_status:
-            age = (datetime.now(timezone.utc) - self._last_status.fetched_at).total_seconds()
+            age = (datetime.now(UTC) - self._last_status.fetched_at).total_seconds()
             if age < self._cache_seconds:
                 return self._last_status
@@ -170,7 +174,7 @@ class QuotaMonitor:
             seven_day_utilization=float(seven_day.get("utilization", 0.0)),
             seven_day_resets_at=seven_day.get("resets_at"),
             raw_response=data,
-            fetched_at=datetime.now(timezone.utc),
+            fetched_at=datetime.now(UTC),
         )
         return self._last_status
@@ -195,13 +199,13 @@ class QuotaMonitor:
         tier = status.recommended_tier
 
         if tier == MetabolicTier.BURST and task_complexity == "high":
-            return "claude-sonnet-4-6"  # Cloud — best quality
+            return "claude-sonnet-4-6"  # Cloud — best quality
         elif tier == MetabolicTier.BURST and task_complexity == "medium":
-            return "qwen3:14b"  # Save cloud for truly hard tasks
+            return "qwen3:14b"          # Save cloud for truly hard tasks
         elif tier == MetabolicTier.ACTIVE:
-            return "qwen3:14b"  # Local 14B — good enough
+            return "qwen3:14b"          # Local 14B — good enough
         else:  # RESTING
-            return "qwen3:8b"  # Local 8B — conserve everything
+            return "qwen3:8b"           # Local 8B — conserve everything
 
     def should_use_cloud(self, task_value: str = "normal") -> bool:
         """
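Reviewer note: the decision table above is easiest to audit as a standalone function. A sketch of the same logic (the function name `pick_model` is hypothetical; the model strings are the ones in the diff):

    def pick_model(tier: MetabolicTier, task_complexity: str = "medium") -> str:
        # Mirrors the tier/complexity table above, illustrative only.
        if tier == MetabolicTier.BURST and task_complexity == "high":
            return "claude-sonnet-4-6"  # spend cloud quota only on genuinely hard tasks
        if tier == MetabolicTier.BURST and task_complexity == "medium":
            return "qwen3:14b"
        if tier == MetabolicTier.ACTIVE:
            return "qwen3:14b"
        return "qwen3:8b"  # RESTING (or low-value burst work): conserve everything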
@@ -224,14 +228,14 @@ class QuotaMonitor:
         return False  # Never waste cloud on routine
 
 
-def _time_remaining(reset_at: Optional[str]) -> str:
+def _time_remaining(reset_at: str | None) -> str:
     """Format time until reset as human-readable string."""
     if not reset_at or reset_at == "null":
         return "unknown"
 
     try:
         reset = datetime.fromisoformat(reset_at.replace("Z", "+00:00"))
-        now = datetime.now(timezone.utc)
+        now = datetime.now(UTC)
         diff = reset - now
 
         if diff.total_seconds() <= 0:
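Reviewer note: the `replace("Z", "+00:00")` shim predates Python 3.11; on 3.11+ (which this module now requires anyway, via `UTC` and `StrEnum`) `datetime.fromisoformat` accepts a trailing `Z` directly, so the shim is redundant but harmless. A quick check of the behavior the tests below rely on (stdlib only):

    from datetime import UTC, datetime

    reset = datetime.fromisoformat("2030-01-01T00:00:00Z".replace("Z", "+00:00"))
    assert reset.tzinfo is not None  # aware, so subtracting datetime.now(UTC) is valid

    minutes = int((reset - datetime.now(UTC)).total_seconds()) // 60
    hours, mins = divmod(minutes, 60)
    print(f"{hours}h {mins}m")  # same shape the helper formats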
@@ -249,7 +253,7 @@ def _time_remaining(reset_at: Optional[str]) -> str:
 
 
 # Module-level singleton
-_quota_monitor: Optional[QuotaMonitor] = None
+_quota_monitor: QuotaMonitor | None = None
 
 
 def get_quota_monitor() -> QuotaMonitor:
@@ -310,6 +310,22 @@ class CascadeRouter:
             logger.debug("Ollama provider check error: %s", exc)
             return False
 
+        elif provider.type == "vllm_mlx":
+            # Check if the local vllm-mlx server is running (OpenAI-compatible)
+            if requests is None:
+                return True
+            try:
+                base_url = provider.base_url or provider.url or "http://localhost:8000"
+                # Strip the /v1 suffix — the health endpoint is at the server root
+                server_root = base_url.rstrip("/")
+                if server_root.endswith("/v1"):
+                    server_root = server_root[:-3]
+                response = requests.get(f"{server_root}/health", timeout=5)
+                return response.status_code == 200
+            except Exception as exc:
+                logger.debug("vllm-mlx provider check error: %s", exc)
+                return False
+
         elif provider.type in ("openai", "anthropic", "grok"):
             # Check if API key is set
             return provider.api_key is not None and provider.api_key != ""
@@ -619,6 +635,14 @@ class CascadeRouter:
                 temperature=temperature,
                 max_tokens=max_tokens,
             )
+        elif provider.type == "vllm_mlx":
+            result = await self._call_vllm_mlx(
+                provider=provider,
+                messages=messages,
+                model=model or provider.get_default_model(),
+                temperature=temperature,
+                max_tokens=max_tokens,
+            )
         else:
             raise ValueError(f"Unknown provider type: {provider.type}")
@@ -815,6 +839,48 @@ class CascadeRouter:
             "model": response.model,
         }
 
+    async def _call_vllm_mlx(
+        self,
+        provider: Provider,
+        messages: list[dict],
+        model: str,
+        temperature: float,
+        max_tokens: int | None,
+    ) -> dict:
+        """Call vllm-mlx via its OpenAI-compatible API.
+
+        vllm-mlx exposes the same /v1/chat/completions endpoint as OpenAI,
+        so we reuse the OpenAI client pointed at the local server.
+        No API key is required for local deployments.
+        """
+        import openai
+
+        base_url = provider.base_url or provider.url or "http://localhost:8000"
+        # Ensure the base_url ends with /v1, as expected by the OpenAI client
+        if not base_url.rstrip("/").endswith("/v1"):
+            base_url = base_url.rstrip("/") + "/v1"
+
+        client = openai.AsyncOpenAI(
+            api_key=provider.api_key or "no-key-required",
+            base_url=base_url,
+            timeout=self.config.timeout_seconds,
+        )
+
+        kwargs: dict = {
+            "model": model,
+            "messages": messages,
+            "temperature": temperature,
+        }
+        if max_tokens:
+            kwargs["max_tokens"] = max_tokens
+
+        response = await client.chat.completions.create(**kwargs)
+
+        return {
+            "content": response.choices[0].message.content,
+            "model": response.model,
+        }
+
     def _record_success(self, provider: Provider, latency_ms: float) -> None:
         """Record a successful request."""
         provider.metrics.total_requests += 1
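Reviewer note: because `_call_vllm_mlx` is just the stock OpenAI client aimed at localhost, the integration can be smoke-tested outside the router. Minimal sketch, assuming the server and default model from providers.yaml above are running:

    import asyncio

    import openai

    async def main() -> None:
        # Same client setup as _call_vllm_mlx: local base_url, dummy API key.
        client = openai.AsyncOpenAI(
            api_key="no-key-required",
            base_url="http://localhost:8000/v1",
        )
        resp = await client.chat.completions.create(
            model="Qwen/Qwen2.5-14B-Instruct-MLX",
            messages=[{"role": "user", "content": "Say hi in five words."}],
            temperature=0.7,
        )
        print(resp.choices[0].message.content)

    asyncio.run(main())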
@@ -299,9 +299,7 @@ async def poll_kimi_issue(
                 "error": None,
             }
         else:
-            logger.warning(
-                "Poll issue #%s returned %s", issue_number, resp.status_code
-            )
+            logger.warning("Poll issue #%s returned %s", issue_number, resp.status_code)
 
     except Exception as exc:
         logger.warning("Poll error for issue #%s: %s", issue_number, exc)
@@ -332,7 +330,7 @@ def _extract_action_items(text: str) -> list[str]:
     items: list[str] = []
     patterns = [
         re.compile(r"^[-*]\s+\[ \]\s+(.+)", re.MULTILINE),  # - [ ] checkbox
-        re.compile(r"^\d+\.\s+(.+)", re.MULTILINE),  # 1. numbered list
+        re.compile(r"^\d+\.\s+(.+)", re.MULTILINE),         # 1. numbered list
         re.compile(r"^(?:Action|TODO|Next step):\s*(.+)", re.MULTILINE | re.IGNORECASE),
     ]
     seen: set[str] = set()
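Reviewer note: the three patterns cover checkbox, numbered-list, and keyword-prefixed action items. A quick demonstration against an invented report snippet:

    import re

    patterns = [
        re.compile(r"^[-*]\s+\[ \]\s+(.+)", re.MULTILINE),
        re.compile(r"^\d+\.\s+(.+)", re.MULTILINE),
        re.compile(r"^(?:Action|TODO|Next step):\s*(.+)", re.MULTILINE | re.IGNORECASE),
    ]

    report = "- [ ] benchmark vllm-mlx\n1. wire up the health check\nTODO: document the enable flag\n"
    found = [m for p in patterns for m in p.findall(report)]
    print(found)
    # ['benchmark vllm-mlx', 'wire up the health check', 'document the enable flag']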
@@ -54,9 +54,7 @@ class ActionItem:
             parts.append(f"- {url}")
 
         if source_issue:
-            parts.append(
-                f"\n### Origin\nExtracted from research in #{source_issue}"
-            )
+            parts.append(f"\n### Origin\nExtracted from research in #{source_issue}")
 
         parts.append("\n---\n*Auto-triaged from research findings by Timmy*")
         return "\n".join(parts)
@@ -123,7 +121,7 @@ def _validate_action_item(raw_item: dict[str, Any]) -> ActionItem | None:
 
     labels = raw_item.get("labels", [])
     if isinstance(labels, str):
-        labels = [l.strip() for l in labels.split(",") if l.strip()]
+        labels = [lbl.strip() for lbl in labels.split(",") if lbl.strip()]
     if not isinstance(labels, list):
         labels = []
@@ -303,7 +301,7 @@ async def _resolve_label_ids(
     if resp.status_code != 200:
         return []
 
-    existing = {l["name"]: l["id"] for l in resp.json()}
+    existing = {lbl["name"]: lbl["id"] for lbl in resp.json()}
     label_ids = []
 
     for name in label_names:
@@ -14,7 +14,9 @@ app = typer.Typer(help="Timmy Serve — sovereign AI agent API")
 def start(
     port: int = typer.Option(8402, "--port", "-p", help="Port for the serve API"),
     host: str = typer.Option("0.0.0.0", "--host", "-h", help="Host to bind to"),
-    price: int = typer.Option(None, "--price", help="Price per request in sats (default: from config)"),
+    price: int = typer.Option(
+        None, "--price", help="Price per request in sats (default: from config)"
+    ),
     dry_run: bool = typer.Option(False, "--dry-run", help="Print config and exit (for testing)"),
 ):
     """Start Timmy in serve mode."""
@@ -24,7 +24,6 @@ from dashboard.routes.health import (
     _generate_recommendations,
 )
 
-
 # ---------------------------------------------------------------------------
 # Pydantic models
 # ---------------------------------------------------------------------------
@@ -118,7 +117,9 @@ class TestGenerateRecommendations:
 
     def test_unavailable_service(self):
         deps = [
-            DependencyStatus(name="Ollama AI", status="unavailable", sovereignty_score=10, details={})
+            DependencyStatus(
+                name="Ollama AI", status="unavailable", sovereignty_score=10, details={}
+            )
         ]
         recs = _generate_recommendations(deps)
         assert any("Ollama AI is unavailable" in r for r in recs)
@@ -137,9 +138,7 @@ class TestGenerateRecommendations:
 
     def test_degraded_non_lightning(self):
         """Degraded non-Lightning dep produces no specific recommendation."""
-        deps = [
-            DependencyStatus(name="Redis", status="degraded", sovereignty_score=5, details={})
-        ]
+        deps = [DependencyStatus(name="Redis", status="degraded", sovereignty_score=5, details={})]
         recs = _generate_recommendations(deps)
         assert recs == ["System operating optimally - all dependencies healthy"]
@@ -379,7 +378,9 @@ class TestHealthEndpoint:
         assert response.status_code == 200
 
     def test_ok_when_ollama_up(self, client):
-        with patch("dashboard.routes.health.check_ollama", new_callable=AsyncMock, return_value=True):
+        with patch(
+            "dashboard.routes.health.check_ollama", new_callable=AsyncMock, return_value=True
+        ):
             data = client.get("/health").json()
 
         assert data["status"] == "ok"
@@ -415,7 +416,9 @@ class TestHealthStatusPanel:
         assert "text/html" in response.headers["content-type"]
 
     def test_shows_up_when_ollama_healthy(self, client):
-        with patch("dashboard.routes.health.check_ollama", new_callable=AsyncMock, return_value=True):
+        with patch(
+            "dashboard.routes.health.check_ollama", new_callable=AsyncMock, return_value=True
+        ):
             text = client.get("/health/status").text
 
         assert "UP" in text
@@ -1,9 +1,7 @@
 """Tests for Claude Quota Monitor and Metabolic Protocol."""
 
-from datetime import datetime, timedelta, timezone
-from unittest.mock import MagicMock, patch
-
-import pytest
+from datetime import UTC, datetime, timedelta
+from unittest.mock import patch
 
 from infrastructure.claude_quota import (
     MetabolicTier,
@@ -22,7 +20,7 @@ def _make_status(five_hour: float = 0.0, seven_day: float = 0.0) -> QuotaStatus:
         seven_day_utilization=seven_day,
         seven_day_resets_at=None,
         raw_response={},
-        fetched_at=datetime.now(timezone.utc),
+        fetched_at=datetime.now(UTC),
     )
@@ -104,25 +102,25 @@ class TestTimeRemaining:
         assert _time_remaining("") == "unknown"
 
     def test_past_time_returns_resetting_now(self):
-        past = (datetime.now(timezone.utc) - timedelta(hours=1)).isoformat()
+        past = (datetime.now(UTC) - timedelta(hours=1)).isoformat()
         assert _time_remaining(past) == "resetting now"
 
     def test_future_time_hours_and_minutes(self):
-        future = (datetime.now(timezone.utc) + timedelta(hours=2, minutes=15)).isoformat()
+        future = (datetime.now(UTC) + timedelta(hours=2, minutes=15)).isoformat()
         result = _time_remaining(future)
         assert "2h" in result
         # Minutes may vary ±1 due to test execution time
         assert "m" in result
 
     def test_future_time_minutes_only(self):
-        future = (datetime.now(timezone.utc) + timedelta(minutes=45)).isoformat()
+        future = (datetime.now(UTC) + timedelta(minutes=45)).isoformat()
         result = _time_remaining(future)
         assert "h" not in result
         # Minutes may vary ±1 due to test execution time
         assert "m" in result
 
     def test_z_suffix_handled(self):
-        future = (datetime.now(timezone.utc) + timedelta(hours=1)).strftime("%Y-%m-%dT%H:%M:%SZ")
+        future = (datetime.now(UTC) + timedelta(hours=1)).strftime("%Y-%m-%dT%H:%M:%SZ")
         result = _time_remaining(future)
         assert result != "unknown"
@@ -238,7 +236,7 @@ class TestQuotaMonitorCaching:
 
     def test_stale_cache_triggers_fetch(self):
         monitor = QuotaMonitor()
-        old_time = datetime.now(timezone.utc) - timedelta(seconds=60)
+        old_time = datetime.now(UTC) - timedelta(seconds=60)
         stale_status = QuotaStatus(
             five_hour_utilization=0.10,
             five_hour_resets_at=None,
@@ -489,6 +489,197 @@ class TestProviderAvailabilityCheck:
 
         assert router._check_provider_available(provider) is False
 
+    def test_check_vllm_mlx_without_requests(self):
+        """Test vllm-mlx returns True when requests not available (fallback)."""
+        router = CascadeRouter(config_path=Path("/nonexistent"))
+
+        provider = Provider(
+            name="vllm-mlx-local",
+            type="vllm_mlx",
+            enabled=True,
+            priority=2,
+            base_url="http://localhost:8000/v1",
+        )
+
+        import infrastructure.router.cascade as cascade_module
+
+        old_requests = cascade_module.requests
+        cascade_module.requests = None
+        try:
+            assert router._check_provider_available(provider) is True
+        finally:
+            cascade_module.requests = old_requests
+
+    def test_check_vllm_mlx_server_healthy(self):
+        """Test vllm-mlx when health check succeeds."""
+        from unittest.mock import MagicMock, patch
+
+        router = CascadeRouter(config_path=Path("/nonexistent"))
+
+        provider = Provider(
+            name="vllm-mlx-local",
+            type="vllm_mlx",
+            enabled=True,
+            priority=2,
+            base_url="http://localhost:8000/v1",
+        )
+
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+
+        with patch("infrastructure.router.cascade.requests") as mock_requests:
+            mock_requests.get.return_value = mock_response
+            result = router._check_provider_available(provider)
+
+        assert result is True
+        mock_requests.get.assert_called_once_with("http://localhost:8000/health", timeout=5)
+
+    def test_check_vllm_mlx_server_down(self):
+        """Test vllm-mlx when server is not running."""
+        from unittest.mock import patch
+
+        router = CascadeRouter(config_path=Path("/nonexistent"))
+
+        provider = Provider(
+            name="vllm-mlx-local",
+            type="vllm_mlx",
+            enabled=True,
+            priority=2,
+            base_url="http://localhost:8000/v1",
+        )
+
+        with patch("infrastructure.router.cascade.requests") as mock_requests:
+            mock_requests.get.side_effect = ConnectionRefusedError("Connection refused")
+            result = router._check_provider_available(provider)
+
+        assert result is False
+
+    def test_check_vllm_mlx_default_url(self):
+        """Test vllm-mlx uses default localhost:8000 when no URL configured."""
+        from unittest.mock import MagicMock, patch
+
+        router = CascadeRouter(config_path=Path("/nonexistent"))
+
+        provider = Provider(
+            name="vllm-mlx-local",
+            type="vllm_mlx",
+            enabled=True,
+            priority=2,
+        )
+
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+
+        with patch("infrastructure.router.cascade.requests") as mock_requests:
+            mock_requests.get.return_value = mock_response
+            router._check_provider_available(provider)
+
+        mock_requests.get.assert_called_once_with("http://localhost:8000/health", timeout=5)
+
+
+@pytest.mark.asyncio
+class TestVllmMlxProvider:
+    """Test vllm-mlx provider integration."""
+
+    async def test_complete_with_vllm_mlx(self):
+        """Test successful completion via vllm-mlx."""
+        router = CascadeRouter(config_path=Path("/nonexistent"))
+
+        provider = Provider(
+            name="vllm-mlx-local",
+            type="vllm_mlx",
+            enabled=True,
+            priority=2,
+            base_url="http://localhost:8000/v1",
+            models=[{"name": "Qwen/Qwen2.5-14B-Instruct-MLX", "default": True}],
+        )
+        router.providers = [provider]
+
+        with patch.object(router, "_call_vllm_mlx") as mock_call:
+            mock_call.return_value = {
+                "content": "MLX response",
+                "model": "Qwen/Qwen2.5-14B-Instruct-MLX",
+            }
+
+            result = await router.complete(
+                messages=[{"role": "user", "content": "Hi"}],
+            )
+
+        assert result["content"] == "MLX response"
+        assert result["provider"] == "vllm-mlx-local"
+        assert result["model"] == "Qwen/Qwen2.5-14B-Instruct-MLX"
+
+    async def test_vllm_mlx_base_url_normalization(self):
+        """Test _call_vllm_mlx appends /v1 when missing."""
+        from unittest.mock import AsyncMock, MagicMock, patch
+
+        router = CascadeRouter(config_path=Path("/nonexistent"))
+
+        provider = Provider(
+            name="vllm-mlx-local",
+            type="vllm_mlx",
+            enabled=True,
+            priority=2,
+            base_url="http://localhost:8000",  # No /v1
+            models=[{"name": "qwen-mlx", "default": True}],
+        )
+
+        mock_choice = MagicMock()
+        mock_choice.message.content = "hello"
+        mock_response = MagicMock()
+        mock_response.choices = [mock_choice]
+        mock_response.model = "qwen-mlx"
+
+        async def fake_create(**kwargs):
+            return mock_response
+
+        with patch("openai.AsyncOpenAI") as mock_openai_cls:
+            mock_client = MagicMock()
+            mock_client.chat.completions.create = AsyncMock(side_effect=fake_create)
+            mock_openai_cls.return_value = mock_client
+
+            await router._call_vllm_mlx(
+                provider=provider,
+                messages=[{"role": "user", "content": "hi"}],
+                model="qwen-mlx",
+                temperature=0.7,
+                max_tokens=None,
+            )
+
+        call_kwargs = mock_openai_cls.call_args
+        base_url_used = call_kwargs.kwargs.get("base_url") or call_kwargs[1].get("base_url")
+        assert base_url_used.endswith("/v1")
+
+    async def test_vllm_mlx_is_local_not_cloud(self):
+        """Confirm vllm_mlx is not subject to metabolic protocol cloud skip."""
+        router = CascadeRouter(config_path=Path("/nonexistent"))
+
+        provider = Provider(
+            name="vllm-mlx-local",
+            type="vllm_mlx",
+            enabled=True,
+            priority=2,
+            base_url="http://localhost:8000/v1",
+            models=[{"name": "qwen-mlx", "default": True}],
+        )
+        router.providers = [provider]
+
+        # Quota monitor returns False (block cloud) — vllm_mlx should still be tried
+        with patch("infrastructure.router.cascade._quota_monitor") as mock_qm:
+            mock_qm.check.return_value = object()
+            mock_qm.should_use_cloud.return_value = False
+
+            with patch.object(router, "_call_vllm_mlx") as mock_call:
+                mock_call.return_value = {
+                    "content": "Local MLX response",
+                    "model": "qwen-mlx",
+                }
+                result = await router.complete(
+                    messages=[{"role": "user", "content": "hi"}],
+                )
+
+        assert result["content"] == "Local MLX response"
+
 
 class TestCascadeRouterReload:
     """Test hot-reload of providers.yaml."""
@@ -175,9 +175,7 @@ async def test_bridge_run_simple_response():
     bridge = MCPBridge(include_gitea=False, include_shell=False)
 
     mock_resp = MagicMock()
-    mock_resp.json.return_value = {
-        "message": {"role": "assistant", "content": "Hello!"}
-    }
+    mock_resp.json.return_value = {"message": {"role": "assistant", "content": "Hello!"}}
    mock_resp.raise_for_status = MagicMock()
 
     mock_client = AsyncMock()
@@ -238,9 +236,7 @@ async def test_bridge_run_with_tool_call():
 
     # Round 2: model returns final text
     final_resp = MagicMock()
-    final_resp.json.return_value = {
-        "message": {"role": "assistant", "content": "Done with tools!"}
-    }
+    final_resp.json.return_value = {"message": {"role": "assistant", "content": "Done with tools!"}}
     final_resp.raise_for_status = MagicMock()
 
     mock_client = AsyncMock()
@@ -276,17 +272,13 @@ async def test_bridge_run_unknown_tool():
         "message": {
             "role": "assistant",
             "content": "",
-            "tool_calls": [
-                {"function": {"name": "nonexistent", "arguments": {}}}
-            ],
+            "tool_calls": [{"function": {"name": "nonexistent", "arguments": {}}}],
         }
     }
     tool_call_resp.raise_for_status = MagicMock()
 
     final_resp = MagicMock()
-    final_resp.json.return_value = {
-        "message": {"role": "assistant", "content": "OK"}
-    }
+    final_resp.json.return_value = {"message": {"role": "assistant", "content": "OK"}}
     final_resp.raise_for_status = MagicMock()
 
     mock_client = AsyncMock()
@@ -332,9 +324,7 @@ async def test_bridge_run_max_rounds():
         "message": {
             "role": "assistant",
             "content": "",
-            "tool_calls": [
-                {"function": {"name": "loop_tool", "arguments": {}}}
-            ],
+            "tool_calls": [{"function": {"name": "loop_tool", "arguments": {}}}],
         }
     }
     tool_call_resp.raise_for_status = MagicMock()
@@ -365,9 +355,7 @@ async def test_bridge_run_connection_error():
     bridge = MCPBridge(include_gitea=False, include_shell=False)
 
     mock_client = AsyncMock()
-    mock_client.post = AsyncMock(
-        side_effect=httpx.ConnectError("Connection refused")
-    )
+    mock_client.post = AsyncMock(side_effect=httpx.ConnectError("Connection refused"))
     mock_client.aclose = AsyncMock()
 
     bridge._client = mock_client
@@ -9,7 +9,6 @@ import pytest
 from timmy.research_triage import (
     ActionItem,
     _parse_llm_response,
-    _resolve_label_ids,
     _validate_action_item,
     create_gitea_issue,
     extract_action_items,
@@ -250,7 +249,9 @@ class TestCreateGiteaIssue:
 
         with (
             patch("timmy.research_triage.settings") as mock_settings,
-            patch("timmy.research_triage._resolve_label_ids", new_callable=AsyncMock, return_value=[1]),
+            patch(
+                "timmy.research_triage._resolve_label_ids", new_callable=AsyncMock, return_value=[1]
+            ),
             patch("timmy.research_triage.httpx.AsyncClient") as mock_cls,
         ):
             mock_settings.gitea_enabled = True
@@ -284,7 +285,9 @@ class TestCreateGiteaIssue:
 
         with (
             patch("timmy.research_triage.settings") as mock_settings,
-            patch("timmy.research_triage._resolve_label_ids", new_callable=AsyncMock, return_value=[]),
+            patch(
+                "timmy.research_triage._resolve_label_ids", new_callable=AsyncMock, return_value=[]
+            ),
             patch("timmy.research_triage.httpx.AsyncClient") as mock_cls,
         ):
             mock_settings.gitea_enabled = True
@@ -331,7 +334,9 @@ class TestTriageResearchReport:
 
         with (
             patch("timmy.research_triage.settings") as mock_settings,
-            patch("timmy.research_triage._resolve_label_ids", new_callable=AsyncMock, return_value=[]),
+            patch(
+                "timmy.research_triage._resolve_label_ids", new_callable=AsyncMock, return_value=[]
+            ),
            patch("timmy.research_triage.httpx.AsyncClient") as mock_cls,
         ):
             mock_settings.gitea_enabled = True
@@ -14,7 +14,6 @@ from timmy.kimi_delegation import (
     exceeds_local_capacity,
 )
 
-
 # ── Constants ─────────────────────────────────────────────────────────────────
 
 
@@ -455,9 +454,7 @@ class TestExtractAndCreateFollowups:
             patch("config.settings", mock_settings),
             patch("httpx.AsyncClient", return_value=async_ctx),
         ):
-            result = await extract_and_create_followups(
-                "1. Do the thing\n2. Do another thing", 10
-            )
+            result = await extract_and_create_followups("1. Do the thing\n2. Do another thing", 10)
 
         assert result["success"] is True
         assert 200 in result["created"]