Compare commits
1 Commits
whip/378-1
...
burn/255-1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cc9d7705b6 |
@@ -1,11 +1,10 @@
|
||||
"""Helpers for optional cheap-vs-strong and time-aware model routing."""
|
||||
"""Helpers for optional cheap-vs-strong model routing."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from utils import is_truthy_value
|
||||
|
||||
@@ -193,104 +192,3 @@ def resolve_turn_route(user_message: str, routing_config: Optional[Dict[str, Any
|
||||
tuple(runtime.get("args") or ()),
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# Time-aware cron model routing
|
||||
# =========================================================================
|
||||
#
|
||||
# Empirical finding: cron error rate peaks at 18:00 (9.4%) vs 4.0% at 09:00.
|
||||
# During high-error windows, route cron jobs to more capable models.
|
||||
#
|
||||
# Config (config.yaml):
|
||||
# cron_model_routing:
|
||||
# enabled: true
|
||||
# fallback_model: "anthropic/claude-sonnet-4"
|
||||
# fallback_provider: "openrouter"
|
||||
# windows:
|
||||
# - start_hour: 17
|
||||
# end_hour: 22
|
||||
# reason: "evening_error_peak"
|
||||
# - start_hour: 2
|
||||
# end_hour: 5
|
||||
# reason: "overnight_api_instability"
|
||||
# =========================================================================
|
||||
|
||||
def _hour_in_window(hour: int, start: int, end: int) -> bool:
|
||||
"""Check if hour falls in [start, end) window, handling midnight wrap."""
|
||||
if start <= end:
|
||||
return start <= hour < end
|
||||
else:
|
||||
# Wraps midnight: e.g., 22-06
|
||||
return hour >= start or hour < end
|
||||
|
||||
|
||||
def resolve_cron_model(
    base_model: str,
    routing_config: Optional[Dict[str, Any]],
    now: Optional[datetime] = None,
) -> Dict[str, Any]:
    """Apply time-aware model override for cron jobs.

    During configured high-error windows, returns a stronger model config.
    Outside windows, returns the base model unchanged.

    Args:
        base_model: The model string already resolved (from job/config/env).
        routing_config: The cron_model_routing dict from config.yaml.
        now: Override current time (for testing). Defaults to datetime.now().

    Returns:
        Dict with keys: model, provider, overridden, reason.
        - model: the effective model string to use
        - provider: provider override (empty string = use default)
        - overridden: True if time-based override was applied
        - reason: why override was applied (empty string if not)
    """

    def _passthrough() -> Dict[str, Any]:
        # Single definition of the "no override" result — previously this
        # literal dict was duplicated at four early-return sites.
        return {"model": base_model, "provider": "", "overridden": False, "reason": ""}

    cfg = routing_config or {}

    # Routing is opt-in; anything non-truthy disables it entirely.
    if not _coerce_bool(cfg.get("enabled"), False):
        return _passthrough()

    windows = cfg.get("windows") or []
    if not isinstance(windows, list) or not windows:
        return _passthrough()

    current_hour = (now or datetime.now()).hour

    # First matching window wins; malformed entries are skipped, not fatal.
    matched_window = None
    for window in windows:
        if not isinstance(window, dict):
            continue
        start = _coerce_int(window.get("start_hour"), -1)
        end = _coerce_int(window.get("end_hour"), -1)
        if start < 0 or end < 0:
            continue
        if _hour_in_window(current_hour, start, end):
            matched_window = window
            break

    if not matched_window:
        return _passthrough()

    # Window matched — prefer the per-window model/provider, falling back to
    # the globally configured fallback_model/fallback_provider.
    override_model = str(matched_window.get("model") or "").strip()
    override_provider = str(matched_window.get("provider") or "").strip()

    if not override_model:
        override_model = str(cfg.get("fallback_model") or "").strip()
    if not override_provider:
        override_provider = str(cfg.get("fallback_provider") or "").strip()

    # No usable override model configured — behave as if no window matched.
    if not override_model:
        return _passthrough()

    reason = str(matched_window.get("reason") or "time_window").strip()

    return {
        "model": override_model,
        "provider": override_provider,
        "overridden": True,
        "reason": f"cron_routing:{reason}(hour={current_hour})",
    }
|
||||
|
||||
192
cli.py
192
cli.py
@@ -3134,196 +3134,6 @@ class HermesCLI:
|
||||
print(f" Home: {display}")
|
||||
print()
|
||||
|
||||
def _handle_debug_command(self, command: str):
    """Generate a debug report with system info and logs, upload to paste service.

    Usage: ``debug [N]`` — N caps the number of trailing lines taken from
    each log file (default 50, hard-capped at 500). Every section is
    collected best-effort: a failure in one section degrades to a
    placeholder line rather than aborting the report.
    """
    import platform
    import sys
    import time as _time

    # Parse optional lines argument: "debug 200" -> tail 200 lines per log.
    parts = command.split(maxsplit=1)
    log_lines = 50
    if len(parts) > 1:
        try:
            # Hard cap keeps the uploaded report a reasonable size.
            log_lines = min(int(parts[1]), 500)
        except ValueError:
            pass  # non-numeric argument — keep the default

    _cprint(" Collecting debug info...")

    # Collect system info
    lines = []
    lines.append("=== HERMES DEBUG REPORT ===")
    lines.append(f"Generated: {_time.strftime('%Y-%m-%d %H:%M:%S %z')}")
    lines.append("")

    lines.append("--- System ---")
    lines.append(f"Python: {sys.version}")
    lines.append(f"Platform: {platform.platform()}")
    lines.append(f"Architecture: {platform.machine()}")
    lines.append(f"Hostname: {platform.node()}")
    lines.append("")

    # Hermes info — each lookup is wrapped so a missing module or attribute
    # yields "unknown" instead of killing the report.
    lines.append("--- Hermes ---")
    try:
        from hermes_constants import get_hermes_home, display_hermes_home
        lines.append(f"Home: {display_hermes_home()}")
    except Exception:
        lines.append("Home: unknown")

    try:
        from hermes_constants import __version__
        lines.append(f"Version: {__version__}")
    except Exception:
        lines.append("Version: unknown")

    lines.append(f"Profile: {getattr(self, '_profile_name', 'default')}")
    lines.append(f"Session: {self.session_id}")
    lines.append(f"Model: {self.model}")
    lines.append(f"Provider: {getattr(self, '_provider_name', 'unknown')}")

    try:
        # NOTE(review): relies on a module-level `os` import in this file —
        # not visible in this chunk, confirm it exists.
        lines.append(f"Working dir: {os.getcwd()}")
    except Exception:
        pass

    # Config (redacted) — only top-level keys are redacted; nested secrets
    # (if any) would pass through. Output is truncated to 2000 chars.
    lines.append("")
    lines.append("--- Config (redacted) ---")
    try:
        from hermes_constants import get_hermes_home
        config_path = get_hermes_home() / "config.yaml"
        if config_path.exists():
            import yaml
            with open(config_path) as f:
                cfg = yaml.safe_load(f) or {}
            # Redact secrets
            for key in ("api_key", "token", "secret", "password"):
                if key in cfg:
                    cfg[key] = "***REDACTED***"
            lines.append(yaml.dump(cfg, default_flow_style=False)[:2000])
        else:
            lines.append("(no config file found)")
    except Exception as e:
        lines.append(f"(error reading config: {e})")

    # Recent logs — tail of every *.log file under the Hermes logs dir.
    lines.append("")
    lines.append(f"--- Recent Logs (last {log_lines} lines) ---")
    try:
        from hermes_constants import get_hermes_home
        log_dir = get_hermes_home() / "logs"
        if log_dir.exists():
            for log_file in sorted(log_dir.glob("*.log")):
                try:
                    content = log_file.read_text(encoding="utf-8", errors="replace")
                    tail = content.strip().split("\n")[-log_lines:]
                    if tail:
                        lines.append(f"\n[{log_file.name}]")
                        lines.extend(tail)
                except Exception:
                    pass  # unreadable log file — skip it, keep the others
        else:
            lines.append("(no logs directory)")
    except Exception:
        lines.append("(error reading logs)")

    # Tool info
    lines.append("")
    lines.append("--- Enabled Toolsets ---")
    try:
        lines.append(", ".join(self.enabled_toolsets) if self.enabled_toolsets else "(none)")
    except Exception:
        lines.append("(unknown)")

    report = "\n".join(lines)
    report_size = len(report)

    # Try to upload to paste services — first one that returns a URL wins.
    paste_url = None
    services = [
        ("dpaste", _upload_dpaste),
        ("0x0.st", _upload_0x0st),
    ]

    for name, uploader in services:
        try:
            url = uploader(report)
            if url:
                paste_url = url
                break
        except Exception:
            continue  # service down/unreachable — try the next one

    print()
    if paste_url:
        _cprint(f" Debug report uploaded: {paste_url}")
        _cprint(f" Size: {report_size} bytes, {len(lines)} lines")
    else:
        # Fallback: save locally; if even that fails, dump to stdout so the
        # user can copy the report manually.
        try:
            from hermes_constants import get_hermes_home
            debug_path = get_hermes_home() / "debug-report.txt"
            debug_path.write_text(report, encoding="utf-8")
            _cprint(f" Paste services unavailable. Report saved to: {debug_path}")
            _cprint(f" Size: {report_size} bytes, {len(lines)} lines")
        except Exception as e:
            _cprint(f" Failed to save report: {e}")
            _cprint(f" Report ({report_size} bytes):")
            print(report)
    print()
|
||||
|
||||
|
||||
def _upload_dpaste(content: str) -> str | None:
    """POST *content* to dpaste.org; return the paste URL, or None if the
    service replies with something that is not a URL."""
    import urllib.parse
    import urllib.request

    form = {
        "content": content,
        "syntax": "text",
        "expiry_days": 7,
    }
    request = urllib.request.Request(
        "https://dpaste.org/api/",
        data=urllib.parse.urlencode(form).encode(),
        headers={"User-Agent": "hermes-agent/debug"},
    )
    with urllib.request.urlopen(request, timeout=10) as response:
        reply = response.read().decode().strip()
    # dpaste answers with the paste URL on success.
    return reply if reply.startswith("http") else None
|
||||
|
||||
|
||||
def _upload_0x0st(content: str) -> str | None:
    """Upload content to 0x0.st. Returns URL or None.

    0x0.st only accepts multipart/form-data with a ``file`` field, so the
    body is assembled by hand to avoid extra dependencies.
    """
    import urllib.request
    # Fixed boundary is fine for a debug report.
    # NOTE(review): if *content* ever contained the boundary marker the body
    # would be corrupted — acceptable here, but worth keeping in mind.
    boundary = "----HermesDebugBoundary"
    body = (
        f"--{boundary}\r\n"
        'Content-Disposition: form-data; name="file"; filename="debug.txt"\r\n'
        "Content-Type: text/plain\r\n\r\n"
        f"{content}\r\n"
        f"--{boundary}--\r\n"
    ).encode()
    req = urllib.request.Request(
        "https://0x0.st",
        data=body,
        headers={
            "Content-Type": f"multipart/form-data; boundary={boundary}",
            "User-Agent": "hermes-agent/debug",
        },
    )
    with urllib.request.urlopen(req, timeout=10) as resp:
        url = resp.read().decode().strip()
        # 0x0.st answers with the file URL on success.
        if url.startswith("http"):
            return url
    return None
|
||||
|
||||
|
||||
def show_config(self):
|
||||
"""Display current configuration with kawaii ASCII art."""
|
||||
# Get terminal config from environment (which was set from cli-config.yaml)
|
||||
@@ -4511,8 +4321,6 @@ def _upload_0x0st(content: str) -> str | None:
|
||||
self.show_help()
|
||||
elif canonical == "profile":
|
||||
self._handle_profile_command()
|
||||
elif canonical == "debug":
|
||||
self._handle_debug_command(cmd_original)
|
||||
elif canonical == "tools":
|
||||
self._handle_tools_command(cmd_original)
|
||||
elif canonical == "toolsets":
|
||||
|
||||
71
cron/jobs.py
71
cron/jobs.py
@@ -547,30 +547,20 @@ def resume_job(job_id: str) -> Optional[Dict[str, Any]]:
|
||||
|
||||
|
||||
def trigger_job(job_id: str) -> Optional[Dict[str, Any]]:
    """Schedule a job to run on the next scheduler tick.

    Clears stale error state when re-triggering a previously-failed job
    so the stale failure doesn't persist until the next tick completes.
    """
    job = get_job(job_id)
    if not job:
        return None

    updates: Dict[str, Any] = {
        "enabled": True,
        "state": "scheduled",
        "paused_at": None,
        "paused_reason": None,
        "next_run_at": _hermes_now().isoformat(),
    }

    # Re-triggering a failed job: wipe the stale error so the status
    # reads "retrying" rather than the old failure until the run completes.
    if job.get("last_status") == "error":
        updates["last_status"] = "retrying"
        updates["last_error"] = None
        updates["error_cleared_at"] = _hermes_now().isoformat()

    return update_job(job_id, updates)
|
||||
return update_job(
|
||||
job_id,
|
||||
{
|
||||
"enabled": True,
|
||||
"state": "scheduled",
|
||||
"paused_at": None,
|
||||
"paused_reason": None,
|
||||
"next_run_at": _hermes_now().isoformat(),
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def run_job_now(job_id: str) -> Optional[Dict[str, Any]]:
|
||||
@@ -628,7 +618,6 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
|
||||
|
||||
Updates last_run_at, last_status, increments completed count,
|
||||
computes next_run_at, and auto-deletes if repeat limit reached.
|
||||
Tracks health timestamps for error/success history.
|
||||
"""
|
||||
jobs = load_jobs()
|
||||
for i, job in enumerate(jobs):
|
||||
@@ -638,18 +627,6 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
|
||||
job["last_status"] = "ok" if success else "error"
|
||||
job["last_error"] = error if not success else None
|
||||
|
||||
# Track health timestamps
|
||||
if success:
|
||||
job["last_success_at"] = now
|
||||
# Clear stale error tracking on success
|
||||
if job.get("last_error_at"):
|
||||
job["error_resolved_at"] = now
|
||||
else:
|
||||
job["last_error_at"] = now
|
||||
# Clear resolved tracking on new error
|
||||
if job.get("error_resolved_at"):
|
||||
del job["error_resolved_at"]
|
||||
|
||||
# Increment completed count
|
||||
if job.get("repeat"):
|
||||
job["repeat"]["completed"] = job["repeat"].get("completed", 0) + 1
|
||||
@@ -679,32 +656,6 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
|
||||
save_jobs(jobs)
|
||||
|
||||
|
||||
|
||||
def clear_job_error(job_id: str) -> Optional[Dict[str, Any]]:
    """Reset a job's stale error state.

    Sets last_status back to 'ok', drops last_error, and stamps
    error_cleared_at. Useful after auth recovery when the job itself is
    healthy but a stale failure is still recorded.

    Returns:
        The updated job dict, or None when no job matches *job_id*.
    """
    jobs = load_jobs()
    target = next((j for j in jobs if j["id"] == job_id), None)
    if target is not None:
        target["last_status"] = "ok"
        target["last_error"] = None
        target["error_cleared_at"] = _hermes_now().isoformat()
    # Persist in both cases (matches prior behavior of always saving).
    save_jobs(jobs)
    return target
|
||||
|
||||
|
||||
|
||||
def advance_next_run(job_id: str) -> bool:
|
||||
"""Preemptively advance next_run_at for a recurring job before execution.
|
||||
|
||||
|
||||
@@ -13,7 +13,6 @@ import concurrent.futures
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
@@ -38,7 +37,6 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from hermes_constants import get_hermes_home
|
||||
from hermes_cli.config import load_config
|
||||
from hermes_time import now as _hermes_now
|
||||
from agent.model_metadata import is_local_endpoint
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -644,66 +642,6 @@ def _build_job_prompt(job: dict) -> str:
|
||||
return "\n".join(parts)
|
||||
|
||||
|
||||
# Well-known local inference endpoints to probe for prefer_local jobs.
# Each entry: name (used in log messages), base_url (OpenAI-compatible API
# root handed to the runtime), health (cheap GET used only to detect that
# the server is up). Probed in list order; the first responder wins.
_LOCAL_ENDPOINTS = [
    {"name": "ollama", "base_url": "http://localhost:11434/v1", "health": "http://localhost:11434/api/tags"},
    {"name": "llama-cpp", "base_url": "http://localhost:8080/v1", "health": "http://localhost:8080/health"},
    {"name": "vllm", "base_url": "http://localhost:8000/v1", "health": "http://localhost:8000/v1/models"},
]
|
||||
|
||||
|
||||
def _probe_local_endpoint(url: str, timeout: float = 2.0) -> bool:
|
||||
"""Quick probe to check if a local inference server is running."""
|
||||
import urllib.request
|
||||
try:
|
||||
req = urllib.request.Request(url)
|
||||
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
||||
return resp.status == 200
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _resolve_prefer_local(job: dict) -> tuple[Optional[str], Optional[str], str]:
|
||||
"""For jobs with prefer_local=true, find a running local inference server.
|
||||
|
||||
Returns (provider_override, base_url_override, status_message).
|
||||
None values mean "use default resolution".
|
||||
"""
|
||||
if not job.get("prefer_local"):
|
||||
return None, None, ""
|
||||
|
||||
# If the job already specifies an explicit base_url and it's local, honor it
|
||||
explicit_url = job.get("base_url", "")
|
||||
if explicit_url:
|
||||
from agent.model_metadata import is_local_endpoint
|
||||
if is_local_endpoint(explicit_url):
|
||||
return None, None, f"prefer_local: explicit base_url {explicit_url} is already local"
|
||||
|
||||
# Probe well-known local endpoints
|
||||
for ep in _LOCAL_ENDPOINTS:
|
||||
if _probe_local_endpoint(ep["health"]):
|
||||
logger.info(
|
||||
"Job '%s': prefer_local → found %s at %s",
|
||||
job.get("name", "?"), ep["name"], ep["base_url"],
|
||||
)
|
||||
return None, ep["base_url"], (
|
||||
f"prefer_local: using {ep['name']} at {ep['base_url']}"
|
||||
)
|
||||
|
||||
# No local server found — warn and fall back to default
|
||||
logger.warning(
|
||||
"Job '%s': prefer_local=true but no local inference server found "
|
||||
"(probed: %s). Falling back to default provider.",
|
||||
job.get("name", "?"),
|
||||
", ".join(ep["name"] for ep in _LOCAL_ENDPOINTS),
|
||||
)
|
||||
return None, None, (
|
||||
"prefer_local: no local server found (tried: "
|
||||
+ ", ".join(ep["name"] for ep in _LOCAL_ENDPOINTS)
|
||||
+ "). Using default provider."
|
||||
)
|
||||
|
||||
|
||||
def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
"""
|
||||
Execute a single cron job.
|
||||
@@ -779,22 +717,6 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
|
||||
# Reasoning config from env or config.yaml
|
||||
from hermes_constants import parse_reasoning_effort
|
||||
|
||||
# Time-aware cron model routing — override model during high-error windows
|
||||
try:
|
||||
from agent.smart_model_routing import resolve_cron_model
|
||||
_cron_routing_cfg = (_cfg.get("cron_model_routing") or {})
|
||||
_cron_route = resolve_cron_model(model, _cron_routing_cfg)
|
||||
if _cron_route["overridden"]:
|
||||
_original_model = model
|
||||
model = _cron_route["model"]
|
||||
logger.info(
|
||||
"Job '%s': cron model override %s -> %s (%s)",
|
||||
job_id, _original_model, model, _cron_route["reason"],
|
||||
)
|
||||
except Exception as _e:
|
||||
logger.debug("Job '%s': cron model routing skipped: %s", job_id, _e)
|
||||
|
||||
effort = os.getenv("HERMES_REASONING_EFFORT", "")
|
||||
if not effort:
|
||||
effort = str(_cfg.get("agent", {}).get("reasoning_effort", "")).strip()
|
||||
@@ -825,12 +747,6 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
pr = _cfg.get("provider_routing", {})
|
||||
smart_routing = _cfg.get("smart_model_routing", {}) or {}
|
||||
|
||||
# prefer_local: if the job declares prefer_local=true, probe for a
|
||||
# local inference server and override the base_url when found. (#378)
|
||||
_pl_provider, _pl_base_url, _pl_status = _resolve_prefer_local(job)
|
||||
if _pl_status:
|
||||
logger.info("Job '%s': %s", job_name, _pl_status)
|
||||
|
||||
from hermes_cli.runtime_provider import (
|
||||
resolve_runtime_provider,
|
||||
format_runtime_provider_error,
|
||||
@@ -839,10 +755,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
runtime_kwargs = {
|
||||
"requested": job.get("provider") or os.getenv("HERMES_INFERENCE_PROVIDER"),
|
||||
}
|
||||
# prefer_local override: use the discovered local endpoint
|
||||
if _pl_base_url:
|
||||
runtime_kwargs["explicit_base_url"] = _pl_base_url
|
||||
elif job.get("base_url"):
|
||||
if job.get("base_url"):
|
||||
runtime_kwargs["explicit_base_url"] = job.get("base_url")
|
||||
runtime = resolve_runtime_provider(**runtime_kwargs)
|
||||
except Exception as exc:
|
||||
@@ -864,29 +777,6 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
},
|
||||
)
|
||||
|
||||
# Build disabled toolsets — always exclude cronjob/messaging/clarify
|
||||
# for cron sessions. When the runtime endpoint is cloud (not local),
|
||||
# also disable terminal so the agent does not attempt SSH or shell
|
||||
# commands that require local infrastructure (keys, filesystem).
|
||||
# Jobs that declare requires_local_infra=true also get terminal
|
||||
# disabled on cloud endpoints regardless of this check. #379
|
||||
_cron_disabled = ["cronjob", "messaging", "clarify"]
|
||||
_runtime_base_url = turn_route["runtime"].get("base_url", "")
|
||||
_is_cloud = not is_local_endpoint(_runtime_base_url)
|
||||
if _is_cloud:
|
||||
_cron_disabled.append("terminal")
|
||||
logger.info(
|
||||
"Job '%s': cloud provider detected (%s), disabling terminal toolset",
|
||||
job_name,
|
||||
turn_route["runtime"].get("provider", "unknown"),
|
||||
)
|
||||
if job.get("requires_local_infra") and _is_cloud:
|
||||
logger.warning(
|
||||
"Job '%s': requires_local_infra=true but running on cloud provider — "
|
||||
"terminal-dependent steps will fail gracefully",
|
||||
job_name,
|
||||
)
|
||||
|
||||
_agent_kwargs = _safe_agent_kwargs({
|
||||
"model": turn_route["model"],
|
||||
"api_key": turn_route["runtime"].get("api_key"),
|
||||
@@ -894,7 +784,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
"provider": turn_route["runtime"].get("provider"),
|
||||
"api_mode": turn_route["runtime"].get("api_mode"),
|
||||
"acp_command": turn_route["runtime"].get("command"),
|
||||
"acp_args": list(turn_route["runtime"].get("args") or []),
|
||||
"acp_args": turn_route["runtime"].get("args"),
|
||||
"max_iterations": max_iterations,
|
||||
"reasoning_config": reasoning_config,
|
||||
"prefill_messages": prefill_messages,
|
||||
@@ -902,7 +792,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
|
||||
"providers_ignored": pr.get("ignore"),
|
||||
"providers_order": pr.get("order"),
|
||||
"provider_sort": pr.get("sort"),
|
||||
"disabled_toolsets": _cron_disabled,
|
||||
"disabled_toolsets": ["cronjob", "messaging", "clarify"],
|
||||
"tool_choice": "required",
|
||||
"quiet_mode": True,
|
||||
"skip_memory": True, # Cron system prompts would corrupt user representations
|
||||
|
||||
154
deploy-crons.py
154
deploy-crons.py
@@ -1,154 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
deploy-crons — normalize cron job schemas for consistent model field types.
|
||||
|
||||
This script ensures that the model field in jobs.json is always a dict when
|
||||
either model or provider is specified, preventing schema inconsistency.
|
||||
|
||||
Usage:
|
||||
python deploy-crons.py [--dry-run] [--jobs-file PATH]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
|
||||
def normalize_job(job: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize a job dict so the model field has a consistent type.

    Legacy jobs store ``model`` and ``provider`` as loose top-level strings;
    normalized jobs fold both into a single dict:

    - model only      -> model = {"model": "xxx"}
    - provider only   -> model = {"provider": "yyy"}
    - both            -> model = {"model": "xxx", "provider": "yyy"}
    - neither         -> model = None

    The top-level ``provider`` key is intentionally left in place for
    backward compatibility with code that still reads ``job["provider"]``.

    Returns:
        A shallow copy of *job* with the model field normalized; the
        input dict is never mutated.
    """
    job = dict(job)  # copy so the caller's dict is untouched

    model = job.get("model")
    provider = job.get("provider")

    # Already normalized (model is a dict) — nothing to do.
    if isinstance(model, dict):
        return job

    # Fold string-valued model/provider into one dict, trimming whitespace.
    # (isinstance alone suffices — the old `is not None and isinstance`
    # double-check was redundant, as was the trailing no-op pass block.)
    model_dict: Dict[str, Any] = {}
    if isinstance(model, str):
        model_dict["model"] = model.strip()
    if isinstance(provider, str):
        model_dict["provider"] = provider.strip()

    job["model"] = model_dict or None
    return job
|
||||
|
||||
|
||||
def normalize_jobs_file(jobs_file: Path, dry_run: bool = False) -> int:
    """
    Normalize all jobs in a jobs.json file.

    Args:
        jobs_file: Path to the jobs.json file to normalize.
        dry_run: When True, report what would change without writing.

    Returns:
        A process exit code: 0 on success (including "nothing to change"
        and dry-run), 1 on a missing file, invalid JSON, or a write
        failure. (Note: NOT the modified-job count — main() passes this
        straight to sys.exit().)
    """
    if not jobs_file.exists():
        print(f"Error: Jobs file not found: {jobs_file}", file=sys.stderr)
        return 1

    try:
        with open(jobs_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in {jobs_file}: {e}", file=sys.stderr)
        return 1

    jobs = data.get("jobs", [])
    if not jobs:
        print("No jobs found in file.")
        return 0

    modified_count = 0
    for i, job in enumerate(jobs):
        original_model = job.get("model")
        original_provider = job.get("provider")

        normalized_job = normalize_job(job)

        # Check if anything changed.
        # NOTE(review): normalize_job keeps the top-level provider intact,
        # so the provider comparison appears to always be False today;
        # kept as a guard against future normalization changes.
        if (normalized_job.get("model") != original_model or
                normalized_job.get("provider") != original_provider):
            jobs[i] = normalized_job
            modified_count += 1

            job_id = job.get("id", "?")
            job_name = job.get("name", "(unnamed)")
            print(f"Normalized job {job_id} ({job_name}):")
            print(f" model: {original_model!r} -> {normalized_job.get('model')!r}")
            print(f" provider: {original_provider!r} -> {normalized_job.get('provider')!r}")

    if modified_count == 0:
        print("All jobs already have consistent model field types.")
        return 0

    if dry_run:
        print(f"DRY RUN: Would normalize {modified_count} jobs.")
        return 0

    # Write back to file
    data["jobs"] = jobs
    try:
        with open(jobs_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"Normalized {modified_count} jobs in {jobs_file}")
        return 0
    except Exception as e:
        print(f"Error writing to {jobs_file}: {e}", file=sys.stderr)
        return 1
|
||||
|
||||
|
||||
def main():
    """CLI entry point: parse arguments and return the normalizer's exit code."""
    arg_parser = argparse.ArgumentParser(
        description="Normalize cron job schemas for consistent model field types."
    )
    arg_parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be changed without modifying the file.",
    )
    arg_parser.add_argument(
        "--jobs-file",
        type=Path,
        default=Path.home() / ".hermes" / "cron" / "jobs.json",
        help="Path to jobs.json file (default: ~/.hermes/cron/jobs.json)",
    )
    options = arg_parser.parse_args()

    # Announce dry-run mode up front so per-job output reads unambiguously.
    if options.dry_run:
        print("DRY RUN MODE — no changes will be made.")
        print()

    return normalize_jobs_file(options.jobs_file, options.dry_run)


if __name__ == "__main__":
    sys.exit(main())
|
||||
9
deploy/synapse/.gitignore
vendored
Normal file
9
deploy/synapse/.gitignore
vendored
Normal file
@@ -0,0 +1,9 @@
|
||||
# Secrets — never commit
|
||||
.env
|
||||
synapse-credentials.env
|
||||
|
||||
# Backups
|
||||
backups/
|
||||
|
||||
# Generated config backups
|
||||
homeserver.yaml.bak
|
||||
82
deploy/synapse/docker-compose.yml
Normal file
82
deploy/synapse/docker-compose.yml
Normal file
@@ -0,0 +1,82 @@
|
||||
# Synapse Homeserver — Docker Compose Stack
|
||||
# Matrix Phase 1: Deploy Synapse on Ezra VPS
|
||||
#
|
||||
# Usage:
|
||||
# cd deploy/synapse
|
||||
# ./setup.sh # first-time deploy (generates config + keys)
|
||||
# docker compose up -d # start
|
||||
# docker compose logs -f # follow logs
|
||||
# docker compose down # stop
|
||||
#
|
||||
# Secrets:
|
||||
# Never commit .env to version control.
|
||||
# setup.sh generates secrets automatically.
|
||||
|
||||
services:
|
||||
synapse-db:
|
||||
image: postgres:16-alpine
|
||||
container_name: synapse-db
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- synapse_db:/var/lib/postgresql/data
|
||||
environment:
|
||||
POSTGRES_USER: synapse
|
||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?Set POSTGRES_PASSWORD in .env}
|
||||
POSTGRES_INITDB_ARGS: "--encoding=UTF8 --lc-collate=C --lc-ctype=C"
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -U synapse"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
networks:
|
||||
- synapse_net
|
||||
logging:
|
||||
driver: "json-file"
|
||||
options:
|
||||
max-size: "20m"
|
||||
max-file: "3"
|
||||
|
||||
synapse:
|
||||
image: matrixdotorg/synapse:latest
|
||||
container_name: synapse
|
||||
restart: unless-stopped
|
||||
depends_on:
|
||||
synapse-db:
|
||||
condition: service_healthy
|
||||
volumes:
|
||||
- synapse_data:/data
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
SYNAPSE_CONFIG_PATH: /data/homeserver.yaml
|
||||
ports:
|
||||
- "127.0.0.1:8008:8008" # Client-server API (localhost only)
|
||||
- "8448:8448" # Federation (public)
|
||||
networks:
|
||||
- synapse_net
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-fSs", "http://localhost:8008/health"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
logging:
|
||||
driver: "json-file"
|
||||
options:
|
||||
max-size: "50m"
|
||||
max-file: "5"
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
cpus: "2.0"
|
||||
memory: 2G
|
||||
reservations:
|
||||
memory: 512M
|
||||
|
||||
volumes:
|
||||
synapse_data:
|
||||
synapse_db:
|
||||
|
||||
networks:
|
||||
synapse_net:
|
||||
driver: bridge
|
||||
101
deploy/synapse/homeserver.yaml
Normal file
101
deploy/synapse/homeserver.yaml
Normal file
@@ -0,0 +1,101 @@
|
||||
# Synapse Homeserver Configuration
|
||||
# Generated by setup.sh — edit with care.
|
||||
#
|
||||
# Docs: https://matrix-org.github.io/synapse/latest/usage/configuration/config_documentation.html
|
||||
|
||||
# Server name — your Matrix domain (e.g. matrix.example.com)
|
||||
server_name: "SERVER_NAME_PLACEHOLDER"
|
||||
|
||||
# Signing key — generated by setup.sh
|
||||
signing_key_path: "/data/signing.key"
|
||||
|
||||
# Trusted key servers (empty = trust only ourselves for our own keys)
|
||||
trusted_key_servers: []
|
||||
|
||||
# Report stats to matrix.org (no for sovereignty)
|
||||
report_stats: false
|
||||
|
||||
# Listeners
|
||||
listeners:
|
||||
- port: 8008
|
||||
tls: false
|
||||
type: http
|
||||
x_forwarded: true
|
||||
resources:
|
||||
- names: [client, federation]
|
||||
compress: false
|
||||
|
||||
# Database — PostgreSQL
|
||||
database:
|
||||
name: psycopg2
|
||||
args:
|
||||
user: synapse
|
||||
password: "${POSTGRES_PASSWORD}"
|
||||
database: synapse
|
||||
host: synapse-db
|
||||
cp_min: 5
|
||||
cp_max: 10
|
||||
|
||||
# Media store
|
||||
media_store_path: "/data/media_store"
|
||||
|
||||
# Upload limits
|
||||
max_upload_size: "50M"
|
||||
|
||||
# URL previews (disable to reduce attack surface)
|
||||
url_preview_enabled: false
|
||||
|
||||
# Enable room list publishing
|
||||
enable_room_list_search: true
|
||||
|
||||
# Turn off public registration by default (create users via admin API)
|
||||
enable_registration: false
|
||||
enable_registration_without_verification: false
|
||||
|
||||
# Rate limiting
|
||||
rc_message:
|
||||
per_second: 0.2
|
||||
burst_count: 10
|
||||
|
||||
rc_registration:
|
||||
per_second: 0.1
|
||||
burst_count: 3
|
||||
|
||||
rc_login:
|
||||
address:
|
||||
per_second: 0.05
|
||||
burst_count: 2
|
||||
account:
|
||||
per_second: 0.05
|
||||
burst_count: 2
|
||||
failed_attempts:
|
||||
per_second: 0.15
|
||||
burst_count: 3
|
||||
|
||||
# Retention — keep messages for 90 days by default
|
||||
retention:
|
||||
enabled: true
|
||||
default_policy:
|
||||
min_lifetime: 1d
|
||||
max_lifetime: 90d
|
||||
|
||||
# Logging
|
||||
log_config: "/data/log.config"
|
||||
|
||||
# Metrics (optional — enable if running Prometheus)
|
||||
enable_metrics: false
|
||||
|
||||
# Presence
|
||||
use_presence: true
|
||||
|
||||
# Federation
|
||||
federation_verify_certificates: true
|
||||
federation_sender_instances: 1
|
||||
|
||||
# Appservice config directory
|
||||
app_service_config_files: []
|
||||
|
||||
# Experimental features
|
||||
experimental_features:
|
||||
# MSC3440: Threading support
|
||||
msc3440_enabled: true
|
||||
33
deploy/synapse/log.config
Normal file
33
deploy/synapse/log.config
Normal file
@@ -0,0 +1,33 @@
|
||||
# Synapse logging configuration
|
||||
# https://matrix-org.github.io/synapse/latest/usage/configuration/config_documentation.html#log_config
|
||||
|
||||
version: 1
|
||||
|
||||
formatters:
|
||||
precise:
|
||||
format: '%(asctime)s - %(name)s - %(lineno)d - %(levelname)s - %(request)s - %(message)s'
|
||||
|
||||
handlers:
|
||||
console:
|
||||
class: logging.StreamHandler
|
||||
formatter: precise
|
||||
level: INFO
|
||||
stream: ext://sys.stdout
|
||||
|
||||
file:
|
||||
class: logging.handlers.RotatingFileHandler
|
||||
formatter: precise
|
||||
filename: /data/homeserver.log
|
||||
maxBytes: 104857600 # 100MB
|
||||
backupCount: 3
|
||||
level: INFO
|
||||
|
||||
loggers:
|
||||
synapse.storage.SQL:
|
||||
level: WARNING
|
||||
synapse.http.client:
|
||||
level: INFO
|
||||
|
||||
root:
|
||||
level: INFO
|
||||
handlers: [console, file]
|
||||
131
deploy/synapse/manage.sh
Executable file
131
deploy/synapse/manage.sh
Executable file
@@ -0,0 +1,131 @@
|
||||
#!/usr/bin/env bash
|
||||
# Synapse Homeserver — Management Utilities
|
||||
# Usage: ./manage.sh <command>
|
||||
#
|
||||
# Commands:
|
||||
# status Show container status and health
|
||||
# restart Restart Synapse (preserves data)
|
||||
# logs Tail Synapse logs
|
||||
# create-user <username> <password> [admin]
|
||||
# backup Create timestamped backup of data volumes
|
||||
# update Pull latest Synapse image and recreate
|
||||
# teardown Stop and remove everything (DESTRUCTIVE)
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
cd "$SCRIPT_DIR"
|
||||
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
CYAN='\033[0;36m'
|
||||
NC='\033[0m'
|
||||
|
||||
info() { echo -e "${GREEN}[MANAGE]${NC} $*"; }
|
||||
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||
error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; }
|
||||
|
||||
COMMAND="${1:-help}"
|
||||
|
||||
case "$COMMAND" in
|
||||
status)
|
||||
info "Container status:"
|
||||
docker compose ps
|
||||
echo ""
|
||||
info "Synapse health:"
|
||||
curl -sfS http://127.0.0.1:8008/health && echo "" || echo "Not responding"
|
||||
echo ""
|
||||
info "Disk usage:"
|
||||
docker system df -v 2>/dev/null | grep -E "synapse|VOLUME" || true
|
||||
;;
|
||||
|
||||
restart)
|
||||
info "Restarting Synapse..."
|
||||
docker compose restart synapse
|
||||
info "Waiting for health check..."
|
||||
sleep 5
|
||||
curl -sfS http://127.0.0.1:8008/health && echo "" && info "Synapse is healthy" || warn "Not responding yet"
|
||||
;;
|
||||
|
||||
logs)
|
||||
shift
|
||||
LINES="${1:-100}"
|
||||
info "Tailing Synapse logs (last $LINES lines)..."
|
||||
docker compose logs -f --tail="$LINES" synapse
|
||||
;;
|
||||
|
||||
create-user)
|
||||
USERNAME="${2:?Usage: manage.sh create-user <username> <password> [admin]}"
|
||||
PASSWORD="${3:?Usage: manage.sh create-user <username> <password> [admin]}"
|
||||
IS_ADMIN="${4:-false}"
|
||||
info "Creating user @$USERNAME..."
|
||||
ADMIN_FLAG=""
|
||||
if [ "$IS_ADMIN" = "admin" ] || [ "$IS_ADMIN" = "true" ]; then
|
||||
ADMIN_FLAG="--admin"
|
||||
fi
|
||||
docker compose exec -T synapse register_new_matrix_user \
|
||||
http://localhost:8008 \
|
||||
-c /data/homeserver.yaml \
|
||||
-u "$USERNAME" \
|
||||
-p "$PASSWORD" \
|
||||
$ADMIN_FLAG \
|
||||
--no-extra-prompt
|
||||
;;
|
||||
|
||||
backup)
|
||||
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
|
||||
BACKUP_DIR="./backups/${TIMESTAMP}"
|
||||
mkdir -p "$BACKUP_DIR"
|
||||
info "Backing up PostgreSQL..."
|
||||
docker compose exec -T synapse-db pg_dump -U synapse > "${BACKUP_DIR}/synapse_db.sql"
|
||||
info "Backing up Synapse data volume..."
|
||||
docker run --rm \
|
||||
-v synapse_data:/source:ro \
|
||||
-v "$(pwd)/${BACKUP_DIR}:/backup" \
|
||||
alpine tar czf /backup/synapse_data.tar.gz -C /source .
|
||||
info "Backup complete: $BACKUP_DIR"
|
||||
ls -lh "$BACKUP_DIR"
|
||||
;;
|
||||
|
||||
update)
|
||||
info "Pulling latest Synapse image..."
|
||||
docker compose pull synapse
|
||||
info "Recreating containers..."
|
||||
docker compose up -d --force-recreate synapse
|
||||
info "Waiting for health..."
|
||||
sleep 10
|
||||
curl -sfS http://127.0.0.1:8008/health && echo "" && info "Updated and healthy" || warn "Check logs"
|
||||
;;
|
||||
|
||||
teardown)
|
||||
echo -e "${RED}WARNING: This will stop and remove all Synapse containers and volumes.${NC}"
|
||||
echo -e "${RED}ALL DATA WILL BE LOST. This cannot be undone.${NC}"
|
||||
echo ""
|
||||
read -p "Type 'yes-delete-everything' to confirm: " CONFIRM
|
||||
if [ "$CONFIRM" = "yes-delete-everything" ]; then
|
||||
info "Stopping containers..."
|
||||
docker compose down -v
|
||||
info "Removing volumes..."
|
||||
docker volume rm synapse_data synapse_db 2>/dev/null || true
|
||||
info "Teardown complete."
|
||||
else
|
||||
info "Aborted."
|
||||
fi
|
||||
;;
|
||||
|
||||
help|*)
|
||||
echo "Synapse Homeserver Management"
|
||||
echo ""
|
||||
echo "Usage: ./manage.sh <command>"
|
||||
echo ""
|
||||
echo "Commands:"
|
||||
echo " status Show container status and health"
|
||||
echo " restart Restart Synapse"
|
||||
echo " logs [lines] Tail Synapse logs (default: 100)"
|
||||
echo " create-user <u> <p> [admin] Create a new Matrix user"
|
||||
echo " backup Backup database + data volume"
|
||||
echo " update Pull latest image and recreate"
|
||||
echo " teardown Stop and remove everything (DESTRUCTIVE)"
|
||||
;;
|
||||
esac
|
||||
211
deploy/synapse/setup.sh
Executable file
211
deploy/synapse/setup.sh
Executable file
@@ -0,0 +1,211 @@
|
||||
#!/usr/bin/env bash
|
||||
# Synapse Homeserver — One-Shot Setup Script
|
||||
# Matrix Phase 1: Deploy Synapse on Ezra VPS
|
||||
#
|
||||
# Usage:
|
||||
# ./setup.sh <server_name> [admin_user] [admin_password]
|
||||
#
|
||||
# Example:
|
||||
# ./setup.sh matrix.timmy-time.xyz hermes-bot 'secure-pass-123'
|
||||
#
|
||||
# What it does:
|
||||
# 1. Generates .env with secrets
|
||||
# 2. Prepares homeserver.yaml with correct server name
|
||||
# 3. Generates signing key
|
||||
# 4. Starts Synapse + PostgreSQL via Docker Compose
|
||||
# 5. Waits for Synapse to be healthy
|
||||
# 6. Registers admin user + bot account
|
||||
# 7. Outputs Matrix credentials for hermes-agent
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
cd "$SCRIPT_DIR"
|
||||
|
||||
# --- Colors ---
|
||||
RED='\033[0;31m'
|
||||
GREEN='\033[0;32m'
|
||||
YELLOW='\033[1;33m'
|
||||
CYAN='\033[0;36m'
|
||||
NC='\033[0m'
|
||||
|
||||
info() { echo -e "${GREEN}[SETUP]${NC} $*"; }
|
||||
warn() { echo -e "${YELLOW}[WARN]${NC} $*"; }
|
||||
error() { echo -e "${RED}[ERROR]${NC} $*"; exit 1; }
|
||||
|
||||
# --- Args ---
|
||||
SERVER_NAME="${1:?Usage: $0 <server_name> [admin_user] [admin_password]}"
|
||||
ADMIN_USER="${2:-timmy-admin}"
|
||||
ADMIN_PASS="${3:-$(openssl rand -hex 16)}"
|
||||
BOT_USER="${4:-hermes-bot}"
|
||||
BOT_PASS="${5:-$(openssl rand -hex 16)}"
|
||||
|
||||
echo -e "${CYAN}"
|
||||
echo "╔══════════════════════════════════════════════════╗"
|
||||
echo "║ Synapse Homeserver — Matrix Phase 1 Deploy ║"
|
||||
echo "╚══════════════════════════════════════════════════╝"
|
||||
echo -e "${NC}"
|
||||
info "Server name: $SERVER_NAME"
|
||||
info "Admin user: @$ADMIN_USER:$SERVER_NAME"
|
||||
info "Bot user: @$BOT_USER:$SERVER_NAME"
|
||||
echo ""
|
||||
|
||||
# --- Preflight ---
|
||||
info "Preflight checks..."
|
||||
command -v docker >/dev/null 2>&1 || error "docker not found. Install Docker first."
|
||||
command -v docker compose version >/dev/null 2>&1 || error "docker compose not found. Install Docker Compose plugin."
|
||||
info "Docker: $(docker --version | head -1)"
|
||||
info "Compose: $(docker compose version | head -1)"
|
||||
|
||||
# --- Generate .env ---
|
||||
info "Generating .env..."
|
||||
POSTGRES_PASSWORD=$(openssl rand -hex 24)
|
||||
REGISTRATION_SECRET=$(openssl rand -hex 16)
|
||||
|
||||
cat > .env <<EOF
|
||||
# Synapse deployment — generated $(date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||
# DO NOT COMMIT THIS FILE
|
||||
|
||||
POSTGRES_PASSWORD=${POSTGRES_PASSWORD}
|
||||
SYNAPSE_SERVER_NAME=${SERVER_NAME}
|
||||
SYNAPSE_REPORT_STATS=no
|
||||
REGISTRATION_SECRET=${REGISTRATION_SECRET}
|
||||
EOF
|
||||
chmod 600 .env
|
||||
info ".env written with secure permissions"
|
||||
|
||||
# --- Prepare homeserver.yaml ---
|
||||
info "Preparing homeserver.yaml..."
|
||||
sed -i.bak "s/SERVER_NAME_PLACEHOLDER/${SERVER_NAME}/g" homeserver.yaml
|
||||
rm -f homeserver.yaml.bak
|
||||
info "Server name set to: $SERVER_NAME"
|
||||
|
||||
# --- Generate signing key ---
|
||||
info "Generating signing key..."
|
||||
# Synapse will generate its own key on first run if missing
|
||||
# But we pre-create the data volume structure
|
||||
docker volume create synapse_data >/dev/null 2>&1 || true
|
||||
docker volume create synapse_db >/dev/null 2>&1 || true
|
||||
|
||||
# --- Start the stack ---
|
||||
info "Starting Synapse + PostgreSQL..."
|
||||
docker compose up -d
|
||||
|
||||
# --- Wait for Synapse to be healthy ---
|
||||
info "Waiting for Synapse to start (up to 120s)..."
|
||||
MAX_WAIT=120
|
||||
ELAPSED=0
|
||||
while [ $ELAPSED -lt $MAX_WAIT ]; do
|
||||
if curl -sfS http://127.0.0.1:8008/health >/dev/null 2>&1; then
|
||||
info "Synapse is healthy!"
|
||||
break
|
||||
fi
|
||||
sleep 3
|
||||
ELAPSED=$((ELAPSED + 3))
|
||||
if [ $((ELAPSED % 15)) -eq 0 ]; then
|
||||
info "Still waiting... (${ELAPSED}s)"
|
||||
fi
|
||||
done
|
||||
|
||||
if [ $ELAPSED -ge $MAX_WAIT ]; then
|
||||
warn "Synapse did not respond within ${MAX_WAIT}s. Check logs:"
|
||||
echo " docker compose logs synapse"
|
||||
error "Aborting registration."
|
||||
fi
|
||||
|
||||
# --- Register admin user ---
|
||||
info "Registering admin user @$ADMIN_USER:$SERVER_NAME..."
|
||||
docker compose exec -T synapse register_new_matrix_user \
|
||||
http://localhost:8008 \
|
||||
-c /data/homeserver.yaml \
|
||||
-u "$ADMIN_USER" \
|
||||
-p "$ADMIN_PASS" \
|
||||
--admin \
|
||||
--no-extra-prompt 2>&1 || {
|
||||
# User might already exist if re-running
|
||||
warn "Admin user registration returned non-zero (may already exist)"
|
||||
}
|
||||
|
||||
# --- Register bot user ---
|
||||
info "Registering bot user @$BOT_USER:$SERVER_NAME..."
|
||||
docker compose exec -T synapse register_new_matrix_user \
|
||||
http://localhost:8008 \
|
||||
-c /data/homeserver.yaml \
|
||||
-u "$BOT_USER" \
|
||||
-p "$BOT_PASS" \
|
||||
--no-admin \
|
||||
--no-extra-prompt 2>&1 || {
|
||||
warn "Bot user registration returned non-zero (may already exist)"
|
||||
}
|
||||
|
||||
# --- Get bot access token ---
|
||||
info "Acquiring bot access token..."
|
||||
BOT_TOKEN_RESPONSE=$(curl -sfS -X POST "http://127.0.0.1:8008/_matrix/client/v3/login" \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d "{
|
||||
\"type\": \"m.login.password\",
|
||||
\"identifier\": {
|
||||
\"type\": \"m.id.user\",
|
||||
\"user\": \"${BOT_USER}\"
|
||||
},
|
||||
\"password\": \"${BOT_PASS}\",
|
||||
\"device_name\": \"Hermes Agent\"
|
||||
}")
|
||||
|
||||
BOT_ACCESS_TOKEN=$(echo "$BOT_TOKEN_RESPONSE" | python3 -c "import sys,json; print(json.load(sys.stdin)['access_token'])" 2>/dev/null || echo "FAILED_TO_EXTRACT")
|
||||
BOT_DEVICE_ID=$(echo "$BOT_TOKEN_RESPONSE" | python3 -c "import sys,json; print(json.load(sys.stdin)['device_id'])" 2>/dev/null || echo "UNKNOWN")
|
||||
|
||||
if [ "$BOT_ACCESS_TOKEN" = "FAILED_TO_EXTRACT" ]; then
|
||||
warn "Could not extract bot access token automatically."
|
||||
warn "Login manually: curl -X POST http://127.0.0.1:8008/_matrix/client/v3/login ..."
|
||||
fi
|
||||
|
||||
# --- Write credentials file ---
|
||||
CREDENTIALS_FILE="synapse-credentials.env"
|
||||
cat > "$CREDENTIALS_FILE" <<EOF
|
||||
# Synapse Credentials — generated $(date -u +%Y-%m-%dT%H:%M:%SZ)
|
||||
# Add these to hermes-agent's ~/.hermes/.env
|
||||
|
||||
# Matrix integration
|
||||
MATRIX_HOMESERVER=http://${SERVER_NAME}:8008
|
||||
MATRIX_ACCESS_TOKEN=${BOT_ACCESS_TOKEN}
|
||||
MATRIX_USER_ID=@${BOT_USER}:${SERVER_NAME}
|
||||
MATRIX_DEVICE_ID=${BOT_DEVICE_ID}
|
||||
MATRIX_ENCRYPTION=true
|
||||
|
||||
# Admin credentials (for user management)
|
||||
SYNAPSE_ADMIN_USER=@${ADMIN_USER}:${SERVER_NAME}
|
||||
SYNAPSE_ADMIN_PASSWORD=${ADMIN_PASS}
|
||||
|
||||
# Bot credentials
|
||||
SYNAPSE_BOT_USER=@${BOT_USER}:${SERVER_NAME}
|
||||
SYNAPSE_BOT_PASSWORD=${BOT_PASS}
|
||||
EOF
|
||||
chmod 600 "$CREDENTIALS_FILE"
|
||||
info "Credentials written to: $CREDENTIALS_FILE"
|
||||
|
||||
# --- Summary ---
|
||||
echo ""
|
||||
echo -e "${GREEN}╔══════════════════════════════════════════════════╗${NC}"
|
||||
echo -e "${GREEN}║ Synapse Deployed Successfully! ║${NC}"
|
||||
echo -e "${GREEN}╚══════════════════════════════════════════════════╝${NC}"
|
||||
echo ""
|
||||
echo -e " Server: ${CYAN}https://${SERVER_NAME}${NC}"
|
||||
echo -e " Client API: ${CYAN}http://127.0.0.1:8008${NC}"
|
||||
echo -e " Federation: ${CYAN}https://${SERVER_NAME}:8448${NC}"
|
||||
echo ""
|
||||
echo -e " Admin: ${YELLOW}@${ADMIN_USER}:${SERVER_NAME}${NC}"
|
||||
echo -e " Bot: ${YELLOW}@${BOT_USER}:${SERVER_NAME}${NC}"
|
||||
echo -e " Bot Token: ${YELLOW}${BOT_ACCESS_TOKEN:0:20}...${NC}"
|
||||
echo ""
|
||||
echo -e " Credentials: ${CYAN}${SCRIPT_DIR}/${CREDENTIALS_FILE}${NC}"
|
||||
echo ""
|
||||
echo -e "${GREEN}Next steps:${NC}"
|
||||
echo " 1. Point DNS: ${SERVER_NAME} → $(curl -s ifconfig.me 2>/dev/null || echo '<VPS_IP>')"
|
||||
echo " 2. Set up TLS: nginx/certbot reverse proxy for :8008 and :8448"
|
||||
echo " 3. Copy credentials to hermes-agent: cp ${CREDENTIALS_FILE} ~/.hermes/.env"
|
||||
echo " 4. Start hermes: hermes gateway --platform matrix"
|
||||
echo ""
|
||||
echo " Manage: docker compose logs -f | docker compose restart | docker compose down"
|
||||
echo " Users: docker compose exec synapse register_new_matrix_user http://localhost:8008 -c /data/homeserver.yaml -u <user> -p <pass>"
|
||||
echo ""
|
||||
@@ -1,170 +0,0 @@
|
||||
# Honcho Memory Integration Evaluation (#322)
|
||||
|
||||
## Executive Summary
|
||||
|
||||
**Status:** Integration already implemented and production-ready.
|
||||
**Recommendation:** KEEP — well-gated, zero overhead when disabled, supports self-hosted.
|
||||
|
||||
## Decision: Cloud vs Local
|
||||
|
||||
### The Question
|
||||
"Do we want a cloud-dependent memory layer, or keep everything local?"
|
||||
|
||||
### Answer: BOTH — User's Choice
|
||||
|
||||
Honcho supports both deployment modes:
|
||||
|
||||
| Mode | Configuration | Data Location | Use Case |
|
||||
|------|--------------|---------------|----------|
|
||||
| Cloud | `HONCHO_API_KEY` | Honcho servers | Quick start, no infrastructure |
|
||||
| Self-hosted | `HONCHO_BASE_URL=http://localhost:8000` | Your servers | Full sovereignty |
|
||||
| Disabled | No config | N/A | Pure local (holographic fact_store only) |
|
||||
|
||||
### Why Keep It
|
||||
|
||||
1. **Opt-in Architecture**
|
||||
- No Honcho config → zero overhead (cron guard, lazy init)
|
||||
- Memory provider system allows switching between providers
|
||||
- `hermes memory off` disables completely
|
||||
|
||||
2. **Zero Runtime Cost When Disabled**
|
||||
```python
|
||||
if not cfg.enabled or not (cfg.api_key or cfg.base_url):
|
||||
return "" # No HTTP calls, no overhead
|
||||
```
|
||||
|
||||
3. **Cross-Session User Modeling**
|
||||
- Holographic fact_store lacks persistent user modeling
|
||||
- Honcho provides: peer cards, dialectic Q&A, semantic search
|
||||
- Complements (not replaces) local memory
|
||||
|
||||
4. **Self-Hosted Option**
|
||||
- Set `HONCHO_BASE_URL=http://localhost:8000`
|
||||
- Run Honcho server locally via Docker
|
||||
- Full data sovereignty
|
||||
|
||||
5. **Production-Grade Implementation**
|
||||
- 3 components, ~700 lines of code
|
||||
- 7 tests passing
|
||||
- Async prefetch (zero-latency context injection)
|
||||
- Configurable recall modes (hybrid/context/tools)
|
||||
- Write frequency control (async/turn/session/N-turns)
|
||||
|
||||
## Architecture
|
||||
|
||||
### Components (Already Implemented)
|
||||
|
||||
```
|
||||
plugins/memory/honcho/
|
||||
├── client.py # Config resolution (API key, base_url, profiles)
|
||||
├── session.py # Session management, async prefetch, dialectic queries
|
||||
├── __init__.py # MemoryProvider interface, 4 tool schemas
|
||||
├── cli.py # CLI commands (setup, status, sessions, map, peer, mode)
|
||||
├── plugin.yaml # Plugin metadata
|
||||
└── README.md # Documentation
|
||||
```
|
||||
|
||||
### Integration Points
|
||||
|
||||
1. **System Prompt**: Context injected on first turn (cached for prompt caching)
|
||||
2. **Tool Registry**: 4 tools available when `recall_mode != "context"`
|
||||
3. **Session End**: Messages flushed to Honcho
|
||||
4. **Cron Guard**: Fully inactive in cron context
|
||||
|
||||
### Tools Available
|
||||
|
||||
| Tool | Cost | Speed | Purpose |
|
||||
|------|------|-------|---------|
|
||||
| `honcho_profile` | Free | Fast | Quick factual snapshot (peer card) |
|
||||
| `honcho_search` | Free | Fast | Semantic search (raw excerpts) |
|
||||
| `honcho_context` | Paid | Slow | Dialectic Q&A (synthesized answers) |
|
||||
| `honcho_conclude` | Free | Fast | Save persistent facts about user |
|
||||
|
||||
## Configuration Guide
|
||||
|
||||
### Option 1: Cloud (Quick Start)
|
||||
```bash
|
||||
# Get API key from https://app.honcho.dev
|
||||
export HONCHO_API_KEY="your-api-key"
|
||||
hermes chat
|
||||
```
|
||||
|
||||
### Option 2: Self-Hosted (Full Sovereignty)
|
||||
```bash
|
||||
# Run Honcho server locally
|
||||
docker run -p 8000:8000 honcho/server
|
||||
|
||||
# Configure Hermes
|
||||
export HONCHO_BASE_URL="http://localhost:8000"
|
||||
hermes chat
|
||||
```
|
||||
|
||||
### Option 3: CLI Setup
|
||||
```bash
|
||||
hermes honcho setup
|
||||
```
|
||||
|
||||
### Option 4: Disabled (Pure Local)
|
||||
```bash
|
||||
# Don't set any Honcho config
|
||||
hermes memory off # If previously enabled
|
||||
hermes chat
|
||||
```
|
||||
|
||||
## Memory Modes
|
||||
|
||||
| Mode | Context Injection | Tools | Cost | Use Case |
|
||||
|------|------------------|-------|------|----------|
|
||||
| hybrid | Yes | Yes | Medium | Default — auto-inject + on-demand |
|
||||
| context | Yes | No | Low | Budget mode — auto-inject only |
|
||||
| tools | No | Yes | Variable | Full control — agent decides |
|
||||
|
||||
## Risk Assessment
|
||||
|
||||
| Risk | Mitigation | Status |
|
||||
|------|------------|--------|
|
||||
| Cloud dependency | Self-hosted option available | ✅ |
|
||||
| Cost from LLM calls | Recall mode "context" or "tools" reduces calls | ✅ |
|
||||
| Data privacy | Self-hosted keeps data on your servers | ✅ |
|
||||
| Performance overhead | Cron guard + lazy init + async prefetch | ✅ |
|
||||
| Vendor lock-in | MemoryProvider interface allows swapping | ✅ |
|
||||
|
||||
## Comparison with Alternatives
|
||||
|
||||
| Feature | Honcho | Holographic | Mem0 | Hindsight |
|
||||
|---------|--------|-------------|------|-----------|
|
||||
| Cross-session modeling | ✅ | ❌ | ✅ | ✅ |
|
||||
| Dialectic Q&A | ✅ | ❌ | ❌ | ❌ |
|
||||
| Self-hosted | ✅ | N/A | ❌ | ❌ |
|
||||
| Local-only option | ✅ | ✅ | ❌ | ✅ |
|
||||
| Cost | Free/Paid | Free | Paid | Free |
|
||||
|
||||
## Conclusion
|
||||
|
||||
**Keep Honcho integration.** It provides unique cross-session user modeling capabilities that complement the local holographic fact_store. The integration is:
|
||||
|
||||
- Well-gated (opt-in, zero overhead when disabled)
|
||||
- Flexible (cloud or self-hosted)
|
||||
- Production-ready (7 tests passing, async prefetch, configurable)
|
||||
- Non-exclusive (works alongside other memory providers)
|
||||
|
||||
### To Enable
|
||||
|
||||
```bash
|
||||
# Cloud
|
||||
hermes honcho setup
|
||||
|
||||
# Self-hosted
|
||||
export HONCHO_BASE_URL="http://localhost:8000"
|
||||
hermes chat
|
||||
```
|
||||
|
||||
### To Disable
|
||||
|
||||
```bash
|
||||
hermes memory off
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
*Evaluated by SANDALPHON — Cron/Ops lane*
|
||||
251
docs/synapse-deployment.md
Normal file
251
docs/synapse-deployment.md
Normal file
@@ -0,0 +1,251 @@
|
||||
# Synapse Homeserver Deployment Guide
|
||||
|
||||
## Matrix Phase 1: Deploy Synapse on Ezra VPS
|
||||
|
||||
Part of [Epic #269: Matrix Integration — Sovereign Messaging for Timmy](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/269).
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────┐
|
||||
│ Ezra VPS (143.198.27.163) │
|
||||
│ │
|
||||
│ ┌──────────┐ ┌─────────────────────────┐ │
|
||||
│ │ Nginx │────▶│ Synapse (Docker) │ │
|
||||
│ │ :443→8008│ │ Client API: localhost:8008│ │
|
||||
│ │ :8448→8448│ │ Federation: 0.0.0.0:8448│ │
|
||||
│ └──────────┘ └──────────┬──────────────┘ │
|
||||
│ │ │
|
||||
│ ┌────────▼──────────┐ │
|
||||
│ │ PostgreSQL 16 │ │
|
||||
│ │ (Docker volume) │ │
|
||||
│ └───────────────────┘ │
|
||||
│ │
|
||||
│ ┌──────────────────────────────────────────┐ │
|
||||
│ │ hermes-agent (gateway) │ │
|
||||
│ │ MATRIX_HOMESERVER=http://localhost:8008 │ │
|
||||
│ └──────────────────────────────────────────┘ │
|
||||
└─────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- Docker + Docker Compose plugin on Ezra VPS
|
||||
- SSH access: `ssh root@143.198.27.163`
|
||||
- DNS A record pointing to the VPS IP
|
||||
- (Recommended) Nginx + Certbot for TLS termination
|
||||
|
||||
## Quick Start
|
||||
|
||||
```bash
|
||||
# SSH into Ezra
|
||||
ssh root@143.198.27.163
|
||||
|
||||
# Clone hermes-agent (if not present)
|
||||
cd /root
|
||||
git clone https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent.git
|
||||
cd hermes-agent/deploy/synapse
|
||||
|
||||
# Deploy Synapse
|
||||
chmod +x setup.sh
|
||||
./setup.sh matrix.timmy-time.xyz
|
||||
|
||||
# This will:
|
||||
# 1. Generate .env with database password
|
||||
# 2. Prepare homeserver.yaml
|
||||
# 3. Start Synapse + PostgreSQL via Docker Compose
|
||||
# 4. Wait for health
|
||||
# 5. Register admin + bot accounts
|
||||
# 6. Acquire bot access token
|
||||
# 7. Write synapse-credentials.env
|
||||
```
|
||||
|
||||
## Step-by-Step
|
||||
|
||||
### 1. DNS Configuration
|
||||
|
||||
Point your Matrix domain to Ezra's IP:
|
||||
|
||||
```
|
||||
Type Name Value
|
||||
A matrix 143.198.27.163
|
||||
```
|
||||
|
||||
Federation uses SRV records for port discovery, but direct `:8448` works without them.
|
||||
|
||||
### 2. Deploy Synapse
|
||||
|
||||
```bash
|
||||
cd /root/hermes-agent/deploy/synapse
|
||||
./setup.sh matrix.timmy-time.xyz hermes-bot 'your-secure-password'
|
||||
```
|
||||
|
||||
Arguments:
|
||||
| Arg | Default | Description |
|
||||
|-----|---------|-------------|
|
||||
| `server_name` | (required) | Matrix domain (e.g., `matrix.timmy-time.xyz`) |
|
||||
| `admin_user` | `timmy-admin` | Admin account username |
|
||||
| `admin_password` | (random) | Admin account password |
|
||||
| `bot_user` | `hermes-bot` | Bot account username |
|
||||
| `bot_password` | (random) | Bot account password |
|
||||
|
||||
### 3. TLS Termination (Nginx)
|
||||
|
||||
Install Nginx + Certbot:
|
||||
|
||||
```bash
|
||||
apt install -y nginx certbot python3-certbot-nginx
|
||||
|
||||
# Client-server API
|
||||
cat > /etc/nginx/sites-available/matrix <<'EOF'
|
||||
server {
|
||||
listen 443 ssl http2;
|
||||
server_name matrix.timmy-time.xyz;
|
||||
|
||||
ssl_certificate /etc/letsencrypt/live/matrix.timmy-time.xyz/fullchain.pem;
|
||||
ssl_certificate_key /etc/letsencrypt/live/matrix.timmy-time.xyz/privkey.pem;
|
||||
|
||||
location / {
|
||||
proxy_pass http://127.0.0.1:8008;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
client_max_body_size 50M;
|
||||
}
|
||||
}
|
||||
|
||||
server {
|
||||
listen 8448 ssl http2;
|
||||
server_name matrix.timmy-time.xyz;
|
||||
|
||||
ssl_certificate /etc/letsencrypt/live/matrix.timmy-time.xyz/fullchain.pem;
|
||||
ssl_certificate_key /etc/letsencrypt/live/matrix.timmy-time.xyz/privkey.pem;
|
||||
|
||||
location / {
|
||||
proxy_pass http://127.0.0.1:8008;
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
}
|
||||
}
|
||||
EOF
|
||||
|
||||
ln -sf /etc/nginx/sites-available/matrix /etc/nginx/sites-enabled/
|
||||
nginx -t && systemctl reload nginx
|
||||
|
||||
# Get cert
|
||||
certbot --nginx -d matrix.timmy-time.xyz
|
||||
```
|
||||
|
||||
### 4. Wire Hermes Agent
|
||||
|
||||
Copy the generated credentials to hermes-agent's environment:
|
||||
|
||||
```bash
|
||||
# From synapse-credentials.env, add to ~/.hermes/.env:
|
||||
MATRIX_HOMESERVER=https://matrix.timmy-time.xyz
|
||||
MATRIX_ACCESS_TOKEN=<from synapse-credentials.env>
|
||||
MATRIX_USER_ID=@hermes-bot:matrix.timmy-time.xyz
|
||||
MATRIX_DEVICE_ID=<from synapse-credentials.env>
|
||||
MATRIX_ENCRYPTION=true
|
||||
```
|
||||
|
||||
Then start the gateway:
|
||||
|
||||
```bash
|
||||
hermes gateway --platform matrix
|
||||
```
|
||||
|
||||
### 5. Verify
|
||||
|
||||
```bash
|
||||
# Check Synapse health
|
||||
curl -s https://matrix.timmy-time.xyz/_matrix/client/versions
|
||||
|
||||
# Check federation
|
||||
curl -s https://matrix.timmy-time.xyz:8448/_matrix/federation/v1/version
|
||||
|
||||
# Check bot is connected
|
||||
# (should appear online in Element or any Matrix client)
|
||||
```
|
||||
|
||||
## Management
|
||||
|
||||
Use the management script for day-to-day operations:
|
||||
|
||||
```bash
|
||||
cd /root/hermes-agent/deploy/synapse
|
||||
|
||||
./manage.sh status # container health
|
||||
./manage.sh logs # tail logs
|
||||
./manage.sh restart # restart Synapse
|
||||
./manage.sh backup # backup DB + data
|
||||
./manage.sh update # pull latest image
|
||||
./manage.sh create-user alice 'password123'
|
||||
./manage.sh create-user admin 'secret' admin
|
||||
```
|
||||
|
||||
## Backups
|
||||
|
||||
```bash
|
||||
./manage.sh backup
|
||||
# Creates: backups/YYYYMMDD_HHMMSS/
|
||||
# ├── synapse_db.sql (PostgreSQL dump)
|
||||
# └── synapse_data.tar.gz (media store + keys)
|
||||
```
|
||||
|
||||
Automate with cron:
|
||||
|
||||
```bash
|
||||
# Daily backup at 3 AM
|
||||
0 3 * * * cd /root/hermes-agent/deploy/synapse && ./manage.sh backup >> /var/log/synapse-backup.log 2>&1
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Synapse won't start
|
||||
```bash
|
||||
docker compose logs synapse
|
||||
# Common: PostgreSQL not ready. Wait for healthcheck.
|
||||
```
|
||||
|
||||
### Bot can't connect
|
||||
```bash
|
||||
# Verify token is valid
|
||||
curl -H "Authorization: Bearer $MATRIX_ACCESS_TOKEN" \
|
||||
https://matrix.timmy-time.xyz/_matrix/client/v3/account/whoami
|
||||
```
|
||||
|
||||
### Federation not working
|
||||
```bash
|
||||
# Check port 8448 is open
|
||||
ss -tlnp | grep 8448
|
||||
# Check firewall
|
||||
ufw status
|
||||
```
|
||||
|
||||
### High memory usage
|
||||
```bash
|
||||
# Check resource limits in docker-compose.yml
|
||||
docker stats synapse
|
||||
# Tune in homeserver.yaml: event_cache_size, caches
|
||||
```
|
||||
|
||||
## Security Notes
|
||||
|
||||
- Registration is disabled by default (`enable_registration: false`)
|
||||
- Rate limiting is enforced on login, registration, and messages
|
||||
- Federation certificate verification is enabled
|
||||
- `.env` and `synapse-credentials.env` are `chmod 600`
|
||||
- Client API binds to `127.0.0.1` only (use Nginx for public access)
|
||||
- Consider: firewall rules, fail2ban, regular backups
|
||||
|
||||
## References
|
||||
|
||||
- [Synapse Documentation](https://matrix-org.github.io/synapse/latest/)
|
||||
- [Matrix Spec](https://spec.matrix.org/)
|
||||
- [Epic #269: Matrix Integration](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/269)
|
||||
- [Issue #272: Deploy Synapse on Ezra](https://forge.alexanderwhitestone.com/Timmy_Foundation/hermes-agent/issues/272)
|
||||
- [Hermes Matrix Setup Guide](docs/matrix-setup.md)
|
||||
@@ -412,52 +412,6 @@ class GatewayConfig:
|
||||
return self.unauthorized_dm_behavior
|
||||
|
||||
|
||||
def _validate_fallback_providers() -> None:
|
||||
"""Validate fallback_providers from config.yaml at gateway startup.
|
||||
|
||||
Checks that each entry has 'provider' and 'model' fields and logs
|
||||
warnings for malformed entries. This catches broken fallback chains
|
||||
before they silently degrade into no-fallback mode.
|
||||
"""
|
||||
try:
|
||||
_home = get_hermes_home()
|
||||
_config_path = _home / "config.yaml"
|
||||
if not _config_path.exists():
|
||||
return
|
||||
import yaml
|
||||
with open(_config_path, encoding="utf-8") as _f:
|
||||
_cfg = yaml.safe_load(_f) or {}
|
||||
fbp = _cfg.get("fallback_providers")
|
||||
if not fbp:
|
||||
return
|
||||
if not isinstance(fbp, list):
|
||||
logger.warning(
|
||||
"fallback_providers should be a YAML list, got %s. "
|
||||
"Fallback chain will be disabled.",
|
||||
type(fbp).__name__,
|
||||
)
|
||||
return
|
||||
for i, entry in enumerate(fbp):
|
||||
if not isinstance(entry, dict):
|
||||
logger.warning(
|
||||
"fallback_providers[%d] is not a dict (got %s). Skipping entry.",
|
||||
i, type(entry).__name__,
|
||||
)
|
||||
continue
|
||||
if not entry.get("provider"):
|
||||
logger.warning(
|
||||
"fallback_providers[%d] missing 'provider' field. Skipping entry.",
|
||||
i,
|
||||
)
|
||||
if not entry.get("model"):
|
||||
logger.warning(
|
||||
"fallback_providers[%d] missing 'model' field. Skipping entry.",
|
||||
i,
|
||||
)
|
||||
except Exception:
|
||||
pass # Non-fatal; validation is advisory
|
||||
|
||||
|
||||
def load_gateway_config() -> GatewayConfig:
|
||||
"""
|
||||
Load gateway configuration from multiple sources.
|
||||
@@ -691,19 +645,6 @@ def load_gateway_config() -> GatewayConfig:
|
||||
platform.value, env_name,
|
||||
)
|
||||
|
||||
# Warn about API Server enabled without a key (unauthenticated endpoint)
|
||||
if Platform.API_SERVER in config.platforms:
|
||||
api_cfg = config.platforms[Platform.API_SERVER]
|
||||
if api_cfg.enabled and not api_cfg.extra.get("key"):
|
||||
logger.warning(
|
||||
"api_server is enabled but API_SERVER_KEY is not set. "
|
||||
"The API endpoint will run unauthenticated. "
|
||||
"Set API_SERVER_KEY in ~/.hermes/.env to secure it.",
|
||||
)
|
||||
|
||||
# Validate fallback_providers structure from config.yaml
|
||||
_validate_fallback_providers()
|
||||
|
||||
return config
|
||||
|
||||
|
||||
|
||||
@@ -1026,16 +1026,6 @@ class GatewayRunner:
|
||||
cfg = _y.safe_load(_f) or {}
|
||||
fb = cfg.get("fallback_providers") or cfg.get("fallback_model") or None
|
||||
if fb:
|
||||
# Treat empty dict / disabled fallback as "not configured"
|
||||
if isinstance(fb, dict):
|
||||
_enabled = fb.get("enabled")
|
||||
if _enabled is False or (
|
||||
isinstance(_enabled, str)
|
||||
and _enabled.strip().lower() in ("false", "0", "no", "off")
|
||||
):
|
||||
return None
|
||||
if not fb.get("provider") and not fb.get("model"):
|
||||
return None
|
||||
return fb
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -1338,11 +1338,6 @@ _KNOWN_ROOT_KEYS = {
|
||||
"fallback_providers", "credential_pool_strategies", "toolsets",
|
||||
"agent", "terminal", "display", "compression", "delegation",
|
||||
"auxiliary", "custom_providers", "memory", "gateway",
|
||||
"session_reset", "browser", "checkpoints", "smart_model_routing",
|
||||
"voice", "stt", "tts", "human_delay", "security", "privacy",
|
||||
"cron", "logging", "approvals", "command_allowlist", "quick_commands",
|
||||
"personalities", "skills", "honcho", "timezone", "discord",
|
||||
"whatsapp", "prefill_messages_file", "file_read_max_chars",
|
||||
}
|
||||
|
||||
# Valid fields inside a custom_providers list entry
|
||||
@@ -1426,7 +1421,6 @@ def validate_config_structure(config: Optional[Dict[str, Any]] = None) -> List["
|
||||
))
|
||||
|
||||
# ── fallback_model must be a top-level dict with provider + model ────
|
||||
# Blank or explicitly disabled fallback is intentional — skip validation.
|
||||
fb = config.get("fallback_model")
|
||||
if fb is not None:
|
||||
if not isinstance(fb, dict):
|
||||
@@ -1436,40 +1430,21 @@ def validate_config_structure(config: Optional[Dict[str, Any]] = None) -> List["
|
||||
"Change to:\n"
|
||||
" fallback_model:\n"
|
||||
" provider: openrouter\n"
|
||||
" model: anthropic/claude-sonnet-4\n"
|
||||
"Or disable with:\n"
|
||||
" fallback_model:\n"
|
||||
" enabled: false",
|
||||
" model: anthropic/claude-sonnet-4",
|
||||
))
|
||||
elif fb:
|
||||
# Skip warnings when fallback is explicitly disabled (enabled: false)
|
||||
_enabled = fb.get("enabled")
|
||||
if _enabled is False or (isinstance(_enabled, str) and _enabled.strip().lower() in ("false", "0", "no", "off")):
|
||||
pass # intentionally disabled — no warnings
|
||||
else:
|
||||
# Check if both fields are blank (intentional disable)
|
||||
provider = fb.get("provider")
|
||||
model = fb.get("model")
|
||||
provider_blank = not provider or (isinstance(provider, str) and not provider.strip())
|
||||
model_blank = not model or (isinstance(model, str) and not model.strip())
|
||||
|
||||
# Only warn if at least one field is set (user might be trying to configure)
|
||||
# If both are blank, treat as intentionally disabled
|
||||
if not provider_blank or not model_blank:
|
||||
if provider_blank:
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
"fallback_model is missing 'provider' field — fallback will be disabled",
|
||||
"Add: provider: openrouter (or another provider)\n"
|
||||
"Or disable with: enabled: false",
|
||||
))
|
||||
if model_blank:
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
"fallback_model is missing 'model' field — fallback will be disabled",
|
||||
"Add: model: anthropic/claude-sonnet-4 (or another model)\n"
|
||||
"Or disable with: enabled: false",
|
||||
))
|
||||
if not fb.get("provider"):
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
"fallback_model is missing 'provider' field — fallback will be disabled",
|
||||
"Add: provider: openrouter (or another provider)",
|
||||
))
|
||||
if not fb.get("model"):
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
"fallback_model is missing 'model' field — fallback will be disabled",
|
||||
"Add: model: anthropic/claude-sonnet-4 (or another model)",
|
||||
))
|
||||
|
||||
# ── Check for fallback_model accidentally nested inside custom_providers ──
|
||||
if isinstance(cp, dict) and "fallback_model" not in config and "fallback_model" in (cp or {}):
|
||||
@@ -1503,72 +1478,6 @@ def validate_config_structure(config: Optional[Dict[str, Any]] = None) -> List["
|
||||
f"Move '{key}' under the appropriate section",
|
||||
))
|
||||
|
||||
# ── fallback_providers must be a list of dicts with provider + model ─
|
||||
fbp = config.get("fallback_providers")
|
||||
if fbp is not None:
|
||||
if not isinstance(fbp, list):
|
||||
issues.append(ConfigIssue(
|
||||
"error",
|
||||
f"fallback_providers should be a YAML list, got {type(fbp).__name__}",
|
||||
"Change to:\n"
|
||||
" fallback_providers:\n"
|
||||
" - provider: openrouter\n"
|
||||
" model: google/gemini-3-flash-preview",
|
||||
))
|
||||
elif fbp:
|
||||
for i, entry in enumerate(fbp):
|
||||
if not isinstance(entry, dict):
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
f"fallback_providers[{i}] is not a dict (got {type(entry).__name__})",
|
||||
"Each entry needs at minimum: provider, model",
|
||||
))
|
||||
continue
|
||||
if not entry.get("provider"):
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
f"fallback_providers[{i}] is missing 'provider' field — this fallback will be skipped",
|
||||
"Add: provider: openrouter (or another provider name)",
|
||||
))
|
||||
if not entry.get("model"):
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
f"fallback_providers[{i}] is missing 'model' field — this fallback will be skipped",
|
||||
"Add: model: google/gemini-3-flash-preview (or another model slug)",
|
||||
))
|
||||
|
||||
# ── session_reset validation ─────────────────────────────────────────
|
||||
session_reset = config.get("session_reset", {})
|
||||
if isinstance(session_reset, dict):
|
||||
idle_minutes = session_reset.get("idle_minutes")
|
||||
if idle_minutes is not None:
|
||||
if not isinstance(idle_minutes, (int, float)) or idle_minutes <= 0:
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
f"session_reset.idle_minutes={idle_minutes} is invalid (must be a positive number)",
|
||||
"Set to a positive integer, e.g. 1440 (24 hours). Using 0 causes immediate resets.",
|
||||
))
|
||||
at_hour = session_reset.get("at_hour")
|
||||
if at_hour is not None:
|
||||
if not isinstance(at_hour, (int, float)) or not (0 <= at_hour <= 23):
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
f"session_reset.at_hour={at_hour} is invalid (must be 0-23)",
|
||||
"Set to an hour between 0 and 23, e.g. 4 for 4am",
|
||||
))
|
||||
|
||||
# ── API Server key check ─────────────────────────────────────────────
|
||||
# If api_server is enabled via env, but no key is set, warn.
|
||||
# This catches the "API_SERVER_KEY not configured" error from gateway logs.
|
||||
api_server_enabled = os.getenv("API_SERVER_ENABLED", "").lower() in ("true", "1", "yes")
|
||||
api_server_key = os.getenv("API_SERVER_KEY", "").strip()
|
||||
if api_server_enabled and not api_server_key:
|
||||
issues.append(ConfigIssue(
|
||||
"warning",
|
||||
"API_SERVER is enabled but API_SERVER_KEY is not set — the API server will run unauthenticated",
|
||||
"Set API_SERVER_KEY in ~/.hermes/.env to secure the API endpoint",
|
||||
))
|
||||
|
||||
return issues
|
||||
|
||||
|
||||
|
||||
@@ -93,39 +93,6 @@ def cron_list(show_all: bool = False):
|
||||
script = job.get("script")
|
||||
if script:
|
||||
print(f" Script: {script}")
|
||||
|
||||
# Show health status
|
||||
last_status = job.get("last_status")
|
||||
last_error = job.get("last_error")
|
||||
last_error_at = job.get("last_error_at")
|
||||
last_success_at = job.get("last_success_at")
|
||||
error_cleared_at = job.get("error_cleared_at")
|
||||
error_resolved_at = job.get("error_resolved_at")
|
||||
|
||||
if last_status == "error" and last_error:
|
||||
if error_cleared_at or error_resolved_at:
|
||||
# Error was cleared/resolved
|
||||
cleared_time = error_cleared_at or error_resolved_at
|
||||
print(color(f" Status: ok (error cleared)", Colors.GREEN))
|
||||
print(color(f" Last error: {last_error[:80]}...", Colors.DIM))
|
||||
print(color(f" Resolved: {cleared_time}", Colors.DIM))
|
||||
else:
|
||||
# Current error
|
||||
print(color(f" Status: ERROR", Colors.RED))
|
||||
print(color(f" Error: {last_error[:80]}...", Colors.RED))
|
||||
if last_error_at:
|
||||
print(color(f" Since: {last_error_at}", Colors.RED))
|
||||
elif last_status == "retrying":
|
||||
print(color(f" Status: retrying (error cleared)", Colors.YELLOW))
|
||||
elif last_status == "ok":
|
||||
if last_success_at:
|
||||
print(color(f" Status: ok (last success: {last_success_at})", Colors.GREEN))
|
||||
elif last_status:
|
||||
print(f" Status: {last_status}")
|
||||
|
||||
# Show success history if available
|
||||
if last_success_at and last_status != "error":
|
||||
print(f" Last ok: {last_success_at}")
|
||||
print()
|
||||
|
||||
from hermes_cli.gateway import find_gateway_pids
|
||||
@@ -255,18 +222,7 @@ def cron_edit(args):
|
||||
|
||||
|
||||
def _job_action(action: str, job_id: str, success_verb: str, now: bool = False) -> int:
|
||||
if action == "clear_error":
|
||||
result = _cron_api(action="clear_error", job_id=job_id)
|
||||
if not result.get("success"):
|
||||
print(color(f"Failed to clear error: {result.get('error', 'unknown error')}", Colors.RED))
|
||||
return 1
|
||||
job = result.get("job", {})
|
||||
name = job.get("name", job_id)
|
||||
print(color(f"Cleared stale error state for job '{name}'", Colors.GREEN))
|
||||
if job.get("error_cleared_at"):
|
||||
print(f" Cleared at: {job['error_cleared_at']}")
|
||||
return 0
|
||||
if action == "run" and now:
|
||||
if action == "run" and now:
|
||||
# Synchronous execution — run job immediately and show result
|
||||
result = _cron_api(action="run_now", job_id=job_id)
|
||||
if not result.get("success"):
|
||||
@@ -336,13 +292,9 @@ def cron_command(args):
|
||||
now = getattr(args, 'now', False)
|
||||
return _job_action("run", args.job_id, "Triggered", now=now)
|
||||
|
||||
|
||||
if subcmd == "clear-error":
|
||||
return _job_action("clear_error", args.job_id, "Cleared")
|
||||
|
||||
if subcmd in {"remove", "rm", "delete"}:
|
||||
return _job_action("remove", args.job_id, "Removed")
|
||||
|
||||
print(f"Unknown cron command: {subcmd}")
|
||||
print("Usage: hermes cron [list|create|edit|pause|resume|run|remove|clear-error|status|tick]")
|
||||
print("Usage: hermes cron [list|create|edit|pause|resume|run|remove|status|tick]")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -4576,9 +4576,6 @@ For more help on a command:
|
||||
cron_run.add_argument("job_id", help="Job ID to trigger")
|
||||
cron_run.add_argument("--now", action="store_true", help="Execute immediately and wait for result (clears stale errors)")
|
||||
|
||||
cron_clear_error = cron_subparsers.add_parser("clear-error", help="Clear stale error state for a job")
|
||||
cron_clear_error.add_argument("job_id", help="Job ID to clear error for")
|
||||
|
||||
cron_remove = cron_subparsers.add_parser("remove", aliases=["rm", "delete"], help="Remove a scheduled job")
|
||||
cron_remove.add_argument("job_id", help="Job ID to remove")
|
||||
|
||||
@@ -5008,7 +5005,7 @@ For more help on a command:
|
||||
# =========================================================================
|
||||
sessions_parser = subparsers.add_parser(
|
||||
"sessions",
|
||||
help="Manage session history (list, rename, export, prune, gc, delete)",
|
||||
help="Manage session history (list, rename, export, prune, delete)",
|
||||
description="View and manage the SQLite session store"
|
||||
)
|
||||
sessions_subparsers = sessions_parser.add_subparsers(dest="sessions_action")
|
||||
@@ -5031,14 +5028,6 @@ For more help on a command:
|
||||
sessions_prune.add_argument("--source", help="Only prune sessions from this source")
|
||||
sessions_prune.add_argument("--yes", "-y", action="store_true", help="Skip confirmation")
|
||||
|
||||
sessions_gc = sessions_subparsers.add_parser("gc", help="Garbage-collect empty/trivial sessions")
|
||||
sessions_gc.add_argument("--empty-hours", type=int, default=24, help="Delete empty (0-msg) sessions older than N hours (default: 24)")
|
||||
sessions_gc.add_argument("--trivial-days", type=int, default=7, help="Delete trivial (1-5 msg) sessions older than N days (default: 7)")
|
||||
sessions_gc.add_argument("--trivial-max", type=int, default=5, help="Max messages to consider trivial (default: 5)")
|
||||
sessions_gc.add_argument("--source", help="Only GC sessions from this source")
|
||||
sessions_gc.add_argument("--dry-run", action="store_true", help="Show what would be deleted without deleting")
|
||||
sessions_gc.add_argument("--yes", "-y", action="store_true", help="Skip confirmation")
|
||||
|
||||
sessions_stats = sessions_subparsers.add_parser("stats", help="Show session store statistics")
|
||||
|
||||
sessions_rename = sessions_subparsers.add_parser("rename", help="Set or change a session's title")
|
||||
@@ -5208,49 +5197,6 @@ For more help on a command:
|
||||
size_mb = os.path.getsize(db_path) / (1024 * 1024)
|
||||
print(f"Database size: {size_mb:.1f} MB")
|
||||
|
||||
elif action == "gc":
|
||||
dry_run = getattr(args, "dry_run", False)
|
||||
if dry_run:
|
||||
counts = db.garbage_collect(
|
||||
empty_older_than_hours=args.empty_hours,
|
||||
trivial_max_messages=args.trivial_max,
|
||||
trivial_older_than_days=args.trivial_days,
|
||||
source=args.source,
|
||||
dry_run=True,
|
||||
)
|
||||
print(f"[dry-run] Would delete {counts['total']} session(s):")
|
||||
print(f" Empty (0 msgs, >{args.empty_hours}h old): {counts['empty']}")
|
||||
print(f" Trivial (<={args.trivial_max} msgs, >{args.trivial_days}d old): {counts['trivial']}")
|
||||
else:
|
||||
# Preview first
|
||||
preview = db.garbage_collect(
|
||||
empty_older_than_hours=args.empty_hours,
|
||||
trivial_max_messages=args.trivial_max,
|
||||
trivial_older_than_days=args.trivial_days,
|
||||
source=args.source,
|
||||
dry_run=True,
|
||||
)
|
||||
if preview["total"] == 0:
|
||||
print("Nothing to collect.")
|
||||
else:
|
||||
if not args.yes:
|
||||
if not _confirm_prompt(
|
||||
f"Delete {preview['total']} session(s) "
|
||||
f"({preview['empty']} empty, {preview['trivial']} trivial)? [y/N] "
|
||||
):
|
||||
print("Cancelled.")
|
||||
return
|
||||
counts = db.garbage_collect(
|
||||
empty_older_than_hours=args.empty_hours,
|
||||
trivial_max_messages=args.trivial_max,
|
||||
trivial_older_than_days=args.trivial_days,
|
||||
source=args.source,
|
||||
dry_run=False,
|
||||
)
|
||||
print(f"Collected {counts['total']} session(s):")
|
||||
print(f" Empty: {counts['empty']}")
|
||||
print(f" Trivial: {counts['trivial']}")
|
||||
|
||||
else:
|
||||
sessions_parser.print_help()
|
||||
|
||||
|
||||
179
hermes_state.py
179
hermes_state.py
@@ -32,7 +32,7 @@ T = TypeVar("T")
|
||||
|
||||
DEFAULT_DB_PATH = get_hermes_home() / "state.db"
|
||||
|
||||
SCHEMA_VERSION = 7
|
||||
SCHEMA_VERSION = 6
|
||||
|
||||
SCHEMA_SQL = """
|
||||
CREATE TABLE IF NOT EXISTS schema_version (
|
||||
@@ -66,7 +66,6 @@ CREATE TABLE IF NOT EXISTS sessions (
|
||||
cost_source TEXT,
|
||||
pricing_version TEXT,
|
||||
title TEXT,
|
||||
profile TEXT,
|
||||
FOREIGN KEY (parent_session_id) REFERENCES sessions(id)
|
||||
);
|
||||
|
||||
@@ -87,7 +86,6 @@ CREATE TABLE IF NOT EXISTS messages (
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_sessions_source ON sessions(source);
|
||||
CREATE INDEX IF NOT EXISTS idx_sessions_profile ON sessions(profile);
|
||||
CREATE INDEX IF NOT EXISTS idx_sessions_parent ON sessions(parent_session_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_sessions_started ON sessions(started_at DESC);
|
||||
CREATE INDEX IF NOT EXISTS idx_messages_session ON messages(session_id, timestamp);
|
||||
@@ -332,19 +330,6 @@ class SessionDB:
|
||||
except sqlite3.OperationalError:
|
||||
pass # Column already exists
|
||||
cursor.execute("UPDATE schema_version SET version = 6")
|
||||
if current_version < 7:
|
||||
# v7: add profile column to sessions for profile isolation (#323)
|
||||
try:
|
||||
cursor.execute('ALTER TABLE sessions ADD COLUMN "profile" TEXT')
|
||||
except sqlite3.OperationalError:
|
||||
pass # Column already exists
|
||||
try:
|
||||
cursor.execute(
|
||||
"CREATE INDEX IF NOT EXISTS idx_sessions_profile ON sessions(profile)"
|
||||
)
|
||||
except sqlite3.OperationalError:
|
||||
pass
|
||||
cursor.execute("UPDATE schema_version SET version = 7")
|
||||
|
||||
# Unique title index — always ensure it exists (safe to run after migrations
|
||||
# since the title column is guaranteed to exist at this point)
|
||||
@@ -377,19 +362,13 @@ class SessionDB:
|
||||
system_prompt: str = None,
|
||||
user_id: str = None,
|
||||
parent_session_id: str = None,
|
||||
profile: str = None,
|
||||
) -> str:
|
||||
"""Create a new session record. Returns the session_id.
|
||||
|
||||
Args:
|
||||
profile: Profile name for session isolation. When set, sessions
|
||||
are tagged so queries can filter by profile. (#323)
|
||||
"""
|
||||
"""Create a new session record. Returns the session_id."""
|
||||
def _do(conn):
|
||||
conn.execute(
|
||||
"""INSERT OR IGNORE INTO sessions (id, source, user_id, model, model_config,
|
||||
system_prompt, parent_session_id, profile, started_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
||||
system_prompt, parent_session_id, started_at)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
|
||||
(
|
||||
session_id,
|
||||
source,
|
||||
@@ -398,7 +377,6 @@ class SessionDB:
|
||||
json.dumps(model_config) if model_config else None,
|
||||
system_prompt,
|
||||
parent_session_id,
|
||||
profile,
|
||||
time.time(),
|
||||
),
|
||||
)
|
||||
@@ -527,23 +505,19 @@ class SessionDB:
|
||||
session_id: str,
|
||||
source: str = "unknown",
|
||||
model: str = None,
|
||||
profile: str = None,
|
||||
) -> None:
|
||||
"""Ensure a session row exists, creating it with minimal metadata if absent.
|
||||
|
||||
Used by _flush_messages_to_session_db to recover from a failed
|
||||
create_session() call (e.g. transient SQLite lock at agent startup).
|
||||
INSERT OR IGNORE is safe to call even when the row already exists.
|
||||
|
||||
Args:
|
||||
profile: Profile name for session isolation. (#323)
|
||||
"""
|
||||
def _do(conn):
|
||||
conn.execute(
|
||||
"""INSERT OR IGNORE INTO sessions
|
||||
(id, source, model, profile, started_at)
|
||||
VALUES (?, ?, ?, ?, ?)""",
|
||||
(session_id, source, model, profile, time.time()),
|
||||
(id, source, model, started_at)
|
||||
VALUES (?, ?, ?, ?)""",
|
||||
(session_id, source, model, time.time()),
|
||||
)
|
||||
self._execute_write(_do)
|
||||
|
||||
@@ -814,7 +788,6 @@ class SessionDB:
|
||||
limit: int = 20,
|
||||
offset: int = 0,
|
||||
include_children: bool = False,
|
||||
profile: str = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""List sessions with preview (first user message) and last active timestamp.
|
||||
|
||||
@@ -826,10 +799,6 @@ class SessionDB:
|
||||
|
||||
By default, child sessions (subagent runs, compression continuations)
|
||||
are excluded. Pass ``include_children=True`` to include them.
|
||||
|
||||
Args:
|
||||
profile: Filter sessions to this profile name. Pass None to see all.
|
||||
(#323)
|
||||
"""
|
||||
where_clauses = []
|
||||
params = []
|
||||
@@ -844,9 +813,6 @@ class SessionDB:
|
||||
placeholders = ",".join("?" for _ in exclude_sources)
|
||||
where_clauses.append(f"s.source NOT IN ({placeholders})")
|
||||
params.extend(exclude_sources)
|
||||
if profile:
|
||||
where_clauses.append("s.profile = ?")
|
||||
params.append(profile)
|
||||
|
||||
where_sql = f"WHERE {' AND '.join(where_clauses)}" if where_clauses else ""
|
||||
query = f"""
|
||||
@@ -1192,52 +1158,34 @@ class SessionDB:
|
||||
source: str = None,
|
||||
limit: int = 20,
|
||||
offset: int = 0,
|
||||
profile: str = None,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""List sessions, optionally filtered by source and profile.
|
||||
|
||||
Args:
|
||||
profile: Filter sessions to this profile name. Pass None to see all.
|
||||
(#323)
|
||||
"""
|
||||
where_clauses = []
|
||||
params = []
|
||||
if source:
|
||||
where_clauses.append("source = ?")
|
||||
params.append(source)
|
||||
if profile:
|
||||
where_clauses.append("profile = ?")
|
||||
params.append(profile)
|
||||
|
||||
where_sql = f"WHERE {' AND '.join(where_clauses)}" if where_clauses else ""
|
||||
query = f"SELECT * FROM sessions {where_sql} ORDER BY started_at DESC LIMIT ? OFFSET ?"
|
||||
params.extend([limit, offset])
|
||||
"""List sessions, optionally filtered by source."""
|
||||
with self._lock:
|
||||
cursor = self._conn.execute(query, params)
|
||||
if source:
|
||||
cursor = self._conn.execute(
|
||||
"SELECT * FROM sessions WHERE source = ? ORDER BY started_at DESC LIMIT ? OFFSET ?",
|
||||
(source, limit, offset),
|
||||
)
|
||||
else:
|
||||
cursor = self._conn.execute(
|
||||
"SELECT * FROM sessions ORDER BY started_at DESC LIMIT ? OFFSET ?",
|
||||
(limit, offset),
|
||||
)
|
||||
return [dict(row) for row in cursor.fetchall()]
|
||||
|
||||
# =========================================================================
|
||||
# Utility
|
||||
# =========================================================================
|
||||
|
||||
def session_count(self, source: str = None, profile: str = None) -> int:
|
||||
"""Count sessions, optionally filtered by source and profile.
|
||||
|
||||
Args:
|
||||
profile: Filter to this profile name. Pass None to count all. (#323)
|
||||
"""
|
||||
where_clauses = []
|
||||
params = []
|
||||
if source:
|
||||
where_clauses.append("source = ?")
|
||||
params.append(source)
|
||||
if profile:
|
||||
where_clauses.append("profile = ?")
|
||||
params.append(profile)
|
||||
|
||||
where_sql = f"WHERE {' AND '.join(where_clauses)}" if where_clauses else ""
|
||||
def session_count(self, source: str = None) -> int:
|
||||
"""Count sessions, optionally filtered by source."""
|
||||
with self._lock:
|
||||
cursor = self._conn.execute(f"SELECT COUNT(*) FROM sessions {where_sql}", params)
|
||||
if source:
|
||||
cursor = self._conn.execute(
|
||||
"SELECT COUNT(*) FROM sessions WHERE source = ?", (source,)
|
||||
)
|
||||
else:
|
||||
cursor = self._conn.execute("SELECT COUNT(*) FROM sessions")
|
||||
return cursor.fetchone()[0]
|
||||
|
||||
def message_count(self, session_id: str = None) -> int:
|
||||
@@ -1355,78 +1303,3 @@ class SessionDB:
|
||||
return len(session_ids)
|
||||
|
||||
return self._execute_write(_do)
|
||||
|
||||
def garbage_collect(
|
||||
self,
|
||||
empty_older_than_hours: int = 24,
|
||||
trivial_max_messages: int = 5,
|
||||
trivial_older_than_days: int = 7,
|
||||
source: str = None,
|
||||
dry_run: bool = False,
|
||||
) -> Dict[str, int]:
|
||||
"""Delete empty and trivial sessions based on age.
|
||||
|
||||
Policy (matches #315):
|
||||
- Empty sessions (0 messages) older than ``empty_older_than_hours``
|
||||
- Trivial sessions (1..``trivial_max_messages`` msgs) older than
|
||||
``trivial_older_than_days``
|
||||
- Sessions with more than ``trivial_max_messages`` are kept indefinitely
|
||||
- Active (not ended) sessions are never deleted
|
||||
|
||||
Returns a dict with counts: ``empty``, ``trivial``, ``total``.
|
||||
"""
|
||||
now = time.time()
|
||||
empty_cutoff = now - (empty_older_than_hours * 3600)
|
||||
trivial_cutoff = now - (trivial_older_than_days * 86400)
|
||||
|
||||
def _do(conn):
|
||||
# --- Find empty sessions ---
|
||||
empty_q = (
|
||||
"SELECT id FROM sessions "
|
||||
"WHERE message_count = 0 AND started_at < ? AND ended_at IS NOT NULL"
|
||||
)
|
||||
params = [empty_cutoff]
|
||||
if source:
|
||||
empty_q += " AND source = ?"
|
||||
params.append(source)
|
||||
empty_ids = [r[0] for r in conn.execute(empty_q, params).fetchall()]
|
||||
|
||||
# --- Find trivial sessions ---
|
||||
trivial_q = (
|
||||
"SELECT id FROM sessions "
|
||||
"WHERE message_count BETWEEN 1 AND ? AND started_at < ? AND ended_at IS NOT NULL"
|
||||
)
|
||||
t_params = [trivial_max_messages, trivial_cutoff]
|
||||
if source:
|
||||
trivial_q += " AND source = ?"
|
||||
t_params.append(source)
|
||||
trivial_ids = [r[0] for r in conn.execute(trivial_q, t_params).fetchall()]
|
||||
|
||||
all_ids = set(empty_ids) | set(trivial_ids)
|
||||
|
||||
if dry_run:
|
||||
return {"empty": len(empty_ids), "trivial": len(trivial_ids),
|
||||
"total": len(all_ids)}
|
||||
|
||||
# --- Collect child sessions to delete first (FK constraint) ---
|
||||
child_ids = set()
|
||||
for sid in all_ids:
|
||||
for r in conn.execute(
|
||||
"SELECT id FROM sessions WHERE parent_session_id = ?", (sid,)
|
||||
).fetchall():
|
||||
child_ids.add(r[0])
|
||||
|
||||
# Delete children
|
||||
for cid in child_ids:
|
||||
conn.execute("DELETE FROM messages WHERE session_id = ?", (cid,))
|
||||
conn.execute("DELETE FROM sessions WHERE id = ?", (cid,))
|
||||
|
||||
# Delete targets
|
||||
for sid in all_ids:
|
||||
conn.execute("DELETE FROM messages WHERE session_id = ?", (sid,))
|
||||
conn.execute("DELETE FROM sessions WHERE id = ?", (sid,))
|
||||
|
||||
return {"empty": len(empty_ids), "trivial": len(trivial_ids),
|
||||
"total": len(all_ids)}
|
||||
|
||||
return self._execute_write(_do)
|
||||
|
||||
@@ -1,286 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Model Watchdog — monitors tmux panes for model drift.
|
||||
Checks all hermes TUI sessions in dev and timmy tmux sessions.
|
||||
If any pane is running a non-mimo model, kills and restarts it.
|
||||
|
||||
Usage: python3 ~/.hermes/bin/model-watchdog.py [--fix]
|
||||
--fix Actually restart drifted panes (default: dry-run)
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
import re
|
||||
import time
|
||||
import os
|
||||
|
||||
ALLOWED_MODEL = "mimo-v2-pro"
|
||||
|
||||
# Profile -> expected model. If a pane is running this profile with this model, it's healthy.
|
||||
# Profiles not in this map are checked against ALLOWED_MODEL.
|
||||
PROFILE_MODELS = {
|
||||
"default": "mimo-v2-pro",
|
||||
"timmy-sprint": "mimo-v2-pro",
|
||||
"fenrir": "mimo-v2-pro",
|
||||
"bezalel": "gpt-5.4",
|
||||
"burn": "mimo-v2-pro",
|
||||
"creative": "claude-sonnet",
|
||||
"research": "claude-sonnet",
|
||||
"review": "claude-sonnet",
|
||||
}
|
||||
|
||||
TMUX_SESSIONS = ["dev", "timmy"]
|
||||
LOG_FILE = os.path.expanduser("~/.hermes/logs/model-watchdog.log")
|
||||
|
||||
def log(msg):
|
||||
os.makedirs(os.path.dirname(LOG_FILE), exist_ok=True)
|
||||
ts = time.strftime("%Y-%m-%d %H:%M:%S")
|
||||
line = f"[{ts}] {msg}"
|
||||
print(line)
|
||||
with open(LOG_FILE, "a") as f:
|
||||
f.write(line + "\n")
|
||||
|
||||
def run(cmd):
|
||||
r = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=10)
|
||||
return r.stdout.strip(), r.returncode
|
||||
|
||||
def get_panes(session):
|
||||
"""Get all pane info from ALL windows in a tmux session."""
|
||||
# First get all windows
|
||||
win_out, win_rc = run(f"tmux list-windows -t {session} -F '#{{window_name}}' 2>/dev/null")
|
||||
if win_rc != 0:
|
||||
return []
|
||||
|
||||
panes = []
|
||||
for window_name in win_out.split("\n"):
|
||||
if not window_name.strip():
|
||||
continue
|
||||
target = f"{session}:{window_name}"
|
||||
out, rc = run(f"tmux list-panes -t {target} -F '#{{pane_index}}|#{{pane_pid}}|#{{pane_tty}}' 2>/dev/null")
|
||||
if rc != 0:
|
||||
continue
|
||||
for line in out.split("\n"):
|
||||
if "|" in line:
|
||||
idx, pid, tty = line.split("|")
|
||||
panes.append({
|
||||
"session": session,
|
||||
"window": window_name,
|
||||
"index": int(idx),
|
||||
"pid": int(pid),
|
||||
"tty": tty,
|
||||
})
|
||||
return panes
|
||||
|
||||
def get_hermes_pid_for_tty(tty):
|
||||
"""Find hermes process running on a specific TTY."""
|
||||
out, _ = run(f"ps aux | grep '{tty}' | grep '[h]ermes' | grep -v 'gateway' | grep -v 'node' | awk '{{print $2}}'")
|
||||
if out:
|
||||
return int(out.split("\n")[0])
|
||||
return None
|
||||
|
||||
def get_model_from_pane(session, pane_idx, window=None):
|
||||
"""Capture the pane and extract the model from the status bar."""
|
||||
target = f"{session}:{window}.{pane_idx}" if window else f"{session}.{pane_idx}"
|
||||
out, _ = run(f"tmux capture-pane -t {target} -p 2>/dev/null | tail -30")
|
||||
# Look for model in status bar: ⚕ model-name │
|
||||
matches = re.findall(r'⚕\s+(\S+)\s+│', out)
|
||||
if matches:
|
||||
return matches[0]
|
||||
return None
|
||||
|
||||
def check_session_meta(session_id):
|
||||
"""Check what model a hermes session was last using from its session file."""
|
||||
import json
|
||||
session_file = os.path.expanduser(f"~/.hermes/sessions/session_{session_id}.json")
|
||||
if os.path.exists(session_file):
|
||||
try:
|
||||
with open(session_file) as f:
|
||||
data = json.load(f)
|
||||
return data.get("model"), data.get("provider")
|
||||
except:
|
||||
pass
|
||||
# Try jsonl
|
||||
jsonl_file = os.path.expanduser(f"~/.hermes/sessions/{session_id}.jsonl")
|
||||
if os.path.exists(jsonl_file):
|
||||
try:
|
||||
with open(jsonl_file) as f:
|
||||
for line in f:
|
||||
d = json.loads(line.strip())
|
||||
if d.get("role") == "session_meta":
|
||||
return d.get("model"), d.get("provider")
|
||||
break
|
||||
except:
|
||||
pass
|
||||
return None, None
|
||||
|
||||
def is_drifted(model_name, profile=None):
|
||||
"""Check if a model name indicates drift from the expected model for this profile."""
|
||||
if model_name is None:
|
||||
return False, "no-model-detected"
|
||||
|
||||
# If we know the profile, check against its expected model
|
||||
if profile and profile in PROFILE_MODELS:
|
||||
expected = PROFILE_MODELS[profile]
|
||||
if expected in model_name:
|
||||
return False, model_name
|
||||
return True, model_name
|
||||
|
||||
# No profile known — fall back to ALLOWED_MODEL
|
||||
if ALLOWED_MODEL in model_name:
|
||||
return False, model_name
|
||||
return True, model_name
|
||||
|
||||
def get_profile_from_pane(tty):
    """Detect which hermes profile a pane is running by inspecting its process args.

    Args:
        tty: full tty device path as reported by tmux (e.g. /dev/ttys031).

    Returns:
        The profile name passed via `-p <profile>`, or None when no matching
        hermes process (or no -p flag) is found.
    """
    # ps shows short TTY (s031) not full path (/dev/ttys031).
    # The original chained a second .replace("/dev/ttys", "") which was a
    # no-op (the first replace already removed every occurrence) — dropped.
    # NOTE(review): assumes macOS-style tty names; a Linux /dev/pts/N path
    # would pass through unchanged — confirm if this ever runs off-macOS.
    short_tty = tty.replace("/dev/ttys", "s")
    out, _ = run(f"ps aux | grep '{short_tty}' | grep '[h]ermes' | grep -v 'gateway' | grep -v 'node' | grep -v cron")
    if not out:
        return None
    # Look for -p <profile> in the command line
    match = re.search(r'-p\s+(\S+)', out)
    if match:
        return match.group(1)
    return None
|
||||
|
||||
def kill_and_restart(session, pane_idx, window=None):
    """Kill the hermes process in a pane and restart it with the same profile.

    Returns:
        True when the pane comes back up on ALLOWED_MODEL, False otherwise.
    """
    target = f"{session}:{window}.{pane_idx}" if window else f"{session}.{pane_idx}"

    # Get the pane's TTY
    out, _ = run(f"tmux list-panes -t {target} -F '#{{pane_tty}}'")
    tty = out.strip()

    # Detect which profile was running
    profile = get_profile_from_pane(tty)

    # Find and kill hermes on that TTY
    hermes_pid = get_hermes_pid_for_tty(tty)
    if hermes_pid:
        log(f"Killing hermes PID {hermes_pid} on {target} (tty={tty}, profile={profile})")
        run(f"kill {hermes_pid}")
        time.sleep(2)  # give the process a moment to exit before poking the pane

    # Send Ctrl+C to clear any state
    run(f"tmux send-keys -t {target} C-c")
    time.sleep(1)

    # Restart hermes with the same profile
    if profile:
        cmd = f"hermes -p {profile} chat"
    else:
        cmd = "hermes chat"
    # NOTE(review): cmd is interpolated into a single-quoted tmux argument; a
    # profile name containing a quote or space would break this — verify inputs.
    run(f"tmux send-keys -t {target} '{cmd}' Enter")
    log(f"Restarted hermes in {target} with: {cmd}")

    # Wait and verify
    time.sleep(8)  # startup delay before the status bar shows the model name
    new_model = get_model_from_pane(session, pane_idx, window)
    if new_model and ALLOWED_MODEL in new_model:
        log(f"✓ {target} now on {new_model}")
        return True
    else:
        log(f"⚠ {target} model after restart: {new_model}")
        return False
|
||||
|
||||
def verify_expected_model(provider_yaml, expected):
    """Compare actual provider in a YAML config against expected value."""
    actual = provider_yaml.strip()
    wanted = expected.strip()
    return actual == wanted
|
||||
|
||||
def _extract_model_provider(content):
    """Best-effort extraction of `model.provider` from raw YAML text.

    Prefers a real YAML parse; when PyYAML is unavailable, falls back to an
    indentation-aware line scan so only a provider: key nested under the
    top-level model: block is picked up (not the first provider: anywhere).

    Returns:
        The provider string, or "" when it cannot be determined.
        May raise (e.g. yaml.YAMLError) — callers handle errors.
    """
    try:
        import yaml
        cfg = yaml.safe_load(content) or {}
        return (cfg.get("model") or {}).get("provider", "") or ""
    except ImportError:
        pass
    in_model = False
    for line in content.split("\n"):
        stripped = line.strip()
        indent = len(line) - len(line.lstrip())
        if stripped.startswith("model:") and indent == 0:
            in_model = True
            continue
        if in_model and indent == 0 and stripped:
            # A new top-level key ends the model: block.
            in_model = False
        if in_model and stripped.startswith("provider:"):
            return stripped.split(":", 1)[1].strip()
    return ""


def check_config_drift():
    """Scan all relevant config.yaml files for provider drift. Does NOT modify anything.

    Returns:
        List of human-readable drift/error strings; empty when all configs
        are missing or match their expected provider. Never raises.
    """
    issues = []
    configs = {
        "main_config": (os.path.expanduser("~/.hermes/config.yaml"), "nous"),
        "fenrir": (os.path.expanduser("~/.hermes/profiles/fenrir/config.yaml"), "nous"),
        "timmy_sprint": (os.path.expanduser("~/.hermes/profiles/timmy-sprint/config.yaml"), "nous"),
        "default_profile": (os.path.expanduser("~/.hermes/profiles/default/config.yaml"), "nous"),
    }
    for name, (path, expected_provider) in configs.items():
        if not os.path.exists(path):
            continue
        try:
            with open(path, "r") as f:
                content = f.read()
            actual = _extract_model_provider(content)
            if actual and expected_provider and actual != expected_provider:
                issues.append(f"CONFIG DRIFT [{name}]: provider is '{actual}' (expected '{expected_provider}')")
        except Exception as e:
            # Parse/read failures are reported, not raised — this runs from cron.
            issues.append(f"CONFIG CHECK ERROR [{name}]: {e}")
    return issues
|
||||
|
||||
def main():
    """Scan tmux panes and config files for model/provider drift.

    With --fix on the command line, drifted panes are killed and restarted
    on the expected model. Config checks are always read-only.

    Returns:
        0 when everything is healthy, 1 when any drift or error was found.
    """
    fix_mode = "--fix" in sys.argv
    drift_found = False
    issues = []

    # Always check config files for provider drift (read-only, never writes)
    config_drift_issues = check_config_drift()
    for issue in config_drift_issues:
        # Fix: each issue string already carries its own "CONFIG DRIFT [...]"
        # or "CONFIG CHECK ERROR [...]" prefix; the old f"CONFIG DRIFT: {issue}"
        # produced doubled prefixes in the log.
        log(issue)

    for session in TMUX_SESSIONS:
        panes = get_panes(session)
        for pane in panes:
            window = pane.get("window")
            target = f"{session}:{window}.{pane['index']}" if window else f"{session}.{pane['index']}"

            # Detect profile from running process
            out, _ = run(f"tmux list-panes -t {target} -F '#{{pane_tty}}'")
            tty = out.strip()
            profile = get_profile_from_pane(tty)

            model = get_model_from_pane(session, pane["index"], window)
            drifted, model_name = is_drifted(model, profile)

            if drifted:
                drift_found = True
                issues.append(f"{target}: {model_name} (profile={profile})")
                log(f"DRIFT DETECTED: {target} is on '{model_name}' (profile={profile}, expected='{PROFILE_MODELS.get(profile, ALLOWED_MODEL)}')")

                if fix_mode:
                    log(f"Auto-fixing {target}...")
                    success = kill_and_restart(session, pane["index"], window)
                    if not success:
                        issues.append(f" ↳ RESTART FAILED for {target}")

    if not drift_found:
        total = sum(len(get_panes(s)) for s in TMUX_SESSIONS)
        log(f"All {total} panes healthy (on {ALLOWED_MODEL})")

    # Print summary for cron output
    if issues or config_drift_issues:
        print("\n=== MODEL DRIFT REPORT ===")
        for issue in issues:
            print(f" [PANE] {issue}")
        for issue in config_drift_issues:
            print(f" [CONFIG] {issue}")
        if not fix_mode:
            print("\nRun with --fix to auto-restart drifted panes.")
        return 1
    return 0
|
||||
|
||||
if __name__ == "__main__":
    # Propagate main()'s status (0 = healthy, 1 = drift found) to the shell/cron.
    sys.exit(main())
|
||||
@@ -136,83 +136,6 @@ class TestFallbackModelValidation:
|
||||
fb_issues = [i for i in issues if "fallback" in i.message.lower()]
|
||||
assert len(fb_issues) == 0
|
||||
|
||||
    def test_blank_fallback_fields_no_issues(self):
        """Blank fallback_model fields (both empty) should not trigger warnings."""
        issues = validate_config_structure({
            "fallback_model": {
                "provider": "",
                "model": "",
            },
        })
        # Only issues that mention the fallback feature count for this test.
        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
        assert len(fb_issues) == 0
|
||||
|
||||
    def test_blank_fallback_fields_with_whitespace_no_issues(self):
        """Blank fallback_model fields with whitespace should not trigger warnings."""
        issues = validate_config_structure({
            "fallback_model": {
                "provider": " ",  # whitespace-only must be treated as blank
                "model": " ",
            },
        })
        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
        assert len(fb_issues) == 0
|
||||
|
||||
    def test_none_fallback_fields_no_issues(self):
        """None fallback_model fields should not trigger warnings."""
        issues = validate_config_structure({
            "fallback_model": {
                "provider": None,  # explicit nulls behave like blanks
                "model": None,
            },
        })
        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
        assert len(fb_issues) == 0
|
||||
|
||||
    def test_enabled_false_no_issues(self):
        """enabled: false should suppress warnings."""
        issues = validate_config_structure({
            "fallback_model": {
                "enabled": False,  # feature off — incomplete config is fine
            },
        })
        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
        assert len(fb_issues) == 0
assert len(fb_issues) == 0
|
||||
|
||||
    def test_enabled_false_string_no_issues(self):
        """enabled: 'false' (string) should suppress warnings."""
        issues = validate_config_structure({
            "fallback_model": {
                "enabled": "false",  # YAML authors sometimes quote booleans
            },
        })
        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
        assert len(fb_issues) == 0
|
||||
|
||||
    def test_partial_blank_fallback_warns(self):
        """Partial blank fallback (only one field blank) should warn."""
        issues = validate_config_structure({
            "fallback_model": {
                "provider": "",  # blank while model is set → inconsistent
                "model": "anthropic/claude-sonnet-4",
            },
        })
        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
        assert len(fb_issues) == 1
        # The warning must name the missing field.
        assert "provider" in fb_issues[0].message
|
||||
|
||||
    def test_valid_fallback_with_enabled_true(self):
        """Valid fallback with enabled: true should not warn."""
        issues = validate_config_structure({
            "fallback_model": {
                "enabled": True,
                "provider": "openrouter",
                "model": "anthropic/claude-sonnet-4",
            },
        })
        fb_issues = [i for i in issues if "fallback" in i.message.lower()]
        assert len(fb_issues) == 0
|
||||
|
||||
|
||||
class TestMissingModelSection:
|
||||
"""Warn when custom_providers exists but model section is missing."""
|
||||
@@ -249,111 +172,3 @@ class TestConfigIssueDataclass:
|
||||
a = ConfigIssue("error", "msg", "hint")
|
||||
b = ConfigIssue("error", "msg", "hint")
|
||||
assert a == b
|
||||
|
||||
|
||||
class TestFallbackProvidersValidation:
    """fallback_providers must be a list of dicts with provider + model."""

    # --- wrong container types are hard errors ---

    def test_non_list(self):
        """fallback_providers as string should error."""
        issues = validate_config_structure({
            "fallback_providers": "openrouter:google/gemini-3-flash-preview",
        })
        errors = [i for i in issues if i.severity == "error"]
        assert any("fallback_providers" in i.message and "list" in i.message for i in errors)

    def test_dict_instead_of_list(self):
        """fallback_providers as dict should error."""
        issues = validate_config_structure({
            "fallback_providers": {"provider": "openrouter", "model": "test"},
        })
        errors = [i for i in issues if i.severity == "error"]
        assert any("fallback_providers" in i.message and "dict" in i.message for i in errors)

    # --- malformed entries inside the list are warnings ---

    def test_entry_missing_provider(self):
        """Entry without provider should warn."""
        issues = validate_config_structure({
            "fallback_providers": [{"model": "google/gemini-3-flash-preview"}],
        })
        assert any("missing 'provider'" in i.message for i in issues)

    def test_entry_missing_model(self):
        """Entry without model should warn."""
        issues = validate_config_structure({
            "fallback_providers": [{"provider": "openrouter"}],
        })
        assert any("missing 'model'" in i.message for i in issues)

    def test_entry_not_dict(self):
        """Non-dict entries should warn."""
        issues = validate_config_structure({
            "fallback_providers": ["not-a-dict"],
        })
        assert any("not a dict" in i.message for i in issues)

    # --- well-formed configurations are clean ---

    def test_valid_entries(self):
        """Valid fallback_providers should produce no fallback-related issues."""
        issues = validate_config_structure({
            "fallback_providers": [
                {"provider": "openrouter", "model": "google/gemini-3-flash-preview"},
                {"provider": "gemini", "model": "gemini-2.5-flash"},
            ],
        })
        fb_issues = [i for i in issues if "fallback_providers" in i.message]
        assert len(fb_issues) == 0

    def test_empty_list_no_issues(self):
        """Empty list is valid (fallback disabled)."""
        issues = validate_config_structure({
            "fallback_providers": [],
        })
        fb_issues = [i for i in issues if "fallback_providers" in i.message]
        assert len(fb_issues) == 0
|
||||
|
||||
|
||||
class TestSessionResetValidation:
    """session_reset.idle_minutes must be positive."""

    def test_zero_idle_minutes(self):
        """idle_minutes=0 should warn."""
        issues = validate_config_structure({
            "session_reset": {"idle_minutes": 0},
        })
        assert any("idle_minutes=0" in i.message for i in issues)

    def test_negative_idle_minutes(self):
        """idle_minutes=-5 should warn."""
        issues = validate_config_structure({
            "session_reset": {"idle_minutes": -5},
        })
        assert any("idle_minutes=-5" in i.message for i in issues)

    def test_string_idle_minutes(self):
        """idle_minutes as string should warn."""
        issues = validate_config_structure({
            "session_reset": {"idle_minutes": "abc"},
        })
        # Non-numeric values must still surface an idle_minutes warning.
        assert any("idle_minutes=" in i.message for i in issues)

    def test_valid_idle_minutes(self):
        """Valid idle_minutes should not warn."""
        issues = validate_config_structure({
            "session_reset": {"idle_minutes": 1440},
        })
        idle_issues = [i for i in issues if "idle_minutes" in i.message]
        assert len(idle_issues) == 0

    def test_invalid_at_hour(self):
        """at_hour=25 should warn."""
        issues = validate_config_structure({
            "session_reset": {"at_hour": 25},
        })
        assert any("at_hour=25" in i.message for i in issues)

    def test_valid_at_hour(self):
        """Valid at_hour should not warn."""
        issues = validate_config_structure({
            "session_reset": {"at_hour": 4},
        })
        hour_issues = [i for i in issues if "at_hour" in i.message]
        assert len(hour_issues) == 0
|
||||
|
||||
@@ -1,73 +0,0 @@
|
||||
"""Tests for cron scheduler cloud-provider terminal disabling (#379).
|
||||
|
||||
When a cron job runs on a cloud inference endpoint (Nous, OpenRouter, etc.),
|
||||
the terminal toolset must be disabled because SSH keys don't exist on cloud
|
||||
servers. Only local endpoints (localhost, 127.0.0.1, RFC-1918) retain
|
||||
terminal access.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from agent.model_metadata import is_local_endpoint
|
||||
|
||||
|
||||
class TestIsLocalEndpoint:
    """Verify is_local_endpoint correctly classifies endpoints."""

    # --- loopback / wildcard hosts are local ---

    def test_localhost(self):
        assert is_local_endpoint("http://localhost:11434/v1") is True

    def test_127_loopback(self):
        assert is_local_endpoint("http://127.0.0.1:8080/v1") is True

    def test_0_0_0_0(self):
        assert is_local_endpoint("http://0.0.0.0:11434/v1") is True

    # --- RFC-1918 private ranges are local ---

    def test_rfc1918_10(self):
        assert is_local_endpoint("http://10.0.0.5:8080/v1") is True

    def test_rfc1918_192(self):
        assert is_local_endpoint("http://192.168.1.100:11434/v1") is True

    def test_rfc1918_172(self):
        assert is_local_endpoint("http://172.16.0.1:8080/v1") is True

    # --- public cloud endpoints are not local ---

    def test_cloud_openrouter(self):
        assert is_local_endpoint("https://openrouter.ai/api/v1") is False

    def test_cloud_nous(self):
        assert is_local_endpoint("https://inference-api.nousresearch.com/v1") is False

    def test_cloud_anthropic(self):
        assert is_local_endpoint("https://api.anthropic.com") is False

    # --- degenerate inputs default to non-local (safe side) ---

    def test_empty_url(self):
        assert is_local_endpoint("") is False

    def test_none_url(self):
        assert is_local_endpoint(None) is False
|
||||
|
||||
|
||||
class TestCronDisabledToolsetsLogic:
    """Verify the disabled_toolsets logic matches scheduler expectations."""

    def _build_disabled(self, base_url, job=None):
        """Mirror the scheduler's disabled_toolsets logic."""
        # job is accepted but unused here — presumably reserved for per-job
        # overrides in the mirrored scheduler code; confirm against scheduler.
        from agent.model_metadata import is_local_endpoint
        cron_disabled = ["cronjob", "messaging", "clarify"]
        if not is_local_endpoint(base_url):
            # Cloud endpoint: SSH keys don't exist there, so no terminal.
            cron_disabled.append("terminal")
        return cron_disabled

    def test_local_keeps_terminal(self):
        disabled = self._build_disabled("http://localhost:11434/v1")
        assert "terminal" not in disabled
        assert "cronjob" in disabled

    def test_cloud_disables_terminal(self):
        disabled = self._build_disabled("https://openrouter.ai/api/v1")
        assert "terminal" in disabled
        assert "cronjob" in disabled

    def test_empty_url_disables_terminal(self):
        disabled = self._build_disabled("")
        assert "terminal" in disabled
|
||||
@@ -1,128 +0,0 @@
|
||||
"""Tests for time-aware cron model routing — Issue #317."""
|
||||
|
||||
import pytest
|
||||
from datetime import datetime
|
||||
|
||||
from agent.smart_model_routing import resolve_cron_model, _hour_in_window
|
||||
|
||||
|
||||
class TestHourInWindow:
    """Hour-in-window detection including midnight wrap."""

    def test_normal_window(self):
        # Start is inclusive, end is exclusive.
        assert _hour_in_window(18, 17, 22) is True
        assert _hour_in_window(16, 17, 22) is False
        assert _hour_in_window(22, 17, 22) is False

    def test_midnight_wrap(self):
        # start > end means the window wraps past midnight (22:00-06:00).
        assert _hour_in_window(23, 22, 6) is True
        assert _hour_in_window(3, 22, 6) is True
        assert _hour_in_window(10, 22, 6) is False

    def test_edge_cases(self):
        # 0-24 covers the whole day; wrap boundaries keep start-inclusive /
        # end-exclusive semantics.
        assert _hour_in_window(0, 0, 24) is True
        assert _hour_in_window(23, 0, 24) is True
        assert _hour_in_window(0, 22, 6) is True
        assert _hour_in_window(5, 22, 6) is True
        assert _hour_in_window(6, 22, 6) is False
|
||||
|
||||
|
||||
class TestResolveCronModel:
    """Time-aware model resolution for cron jobs."""

    def _config(self, **overrides):
        # Baseline config: one evening window routing to a stronger cloud model.
        base = {
            "enabled": True,
            "fallback_model": "anthropic/claude-sonnet-4",
            "fallback_provider": "openrouter",
            "windows": [
                {"start_hour": 17, "end_hour": 22, "reason": "evening_error_peak"},
            ],
        }
        base.update(overrides)
        return base

    # --- routing stays off unless fully configured ---

    def test_disabled_returns_base(self):
        result = resolve_cron_model("mimo", {"enabled": False}, now=datetime(2026, 4, 12, 18, 0))
        assert result["model"] == "mimo"
        assert result["overridden"] is False

    def test_no_config_returns_base(self):
        result = resolve_cron_model("mimo", None)
        assert result["model"] == "mimo"
        assert result["overridden"] is False

    def test_no_windows_returns_base(self):
        result = resolve_cron_model("mimo", {"enabled": True, "windows": []}, now=datetime(2026, 4, 12, 18, 0))
        assert result["overridden"] is False

    # --- window matching ---

    def test_evening_window_overrides(self):
        result = resolve_cron_model("mimo", self._config(), now=datetime(2026, 4, 12, 18, 0))
        assert result["model"] == "anthropic/claude-sonnet-4"
        assert result["provider"] == "openrouter"
        assert result["overridden"] is True
        # The reason string must carry both the window reason and the hour.
        assert "evening_error_peak" in result["reason"]
        assert "hour=18" in result["reason"]

    def test_outside_window_keeps_base(self):
        result = resolve_cron_model("mimo", self._config(), now=datetime(2026, 4, 12, 9, 0))
        assert result["model"] == "mimo"
        assert result["overridden"] is False

    def test_window_boundary_start_inclusive(self):
        result = resolve_cron_model("mimo", self._config(), now=datetime(2026, 4, 12, 17, 0))
        assert result["overridden"] is True

    def test_window_boundary_end_exclusive(self):
        result = resolve_cron_model("mimo", self._config(), now=datetime(2026, 4, 12, 22, 0))
        assert result["overridden"] is False

    def test_midnight_window(self):
        config = self._config(windows=[{"start_hour": 22, "end_hour": 6, "reason": "overnight"}])
        assert resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 23, 0))["overridden"] is True
        assert resolve_cron_model("mimo", config, now=datetime(2026, 4, 13, 3, 0))["overridden"] is True
        assert resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 10, 0))["overridden"] is False

    # --- per-window overrides and precedence ---

    def test_per_window_model_override(self):
        # A window may carry its own model/provider, beating the global fallback.
        config = self._config(windows=[{
            "start_hour": 17, "end_hour": 22,
            "model": "anthropic/claude-opus-4-6", "provider": "anthropic", "reason": "peak",
        }])
        result = resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 18, 0))
        assert result["model"] == "anthropic/claude-opus-4-6"
        assert result["provider"] == "anthropic"

    def test_first_matching_window_wins(self):
        config = self._config(windows=[
            {"start_hour": 17, "end_hour": 20, "model": "strong-1", "provider": "p1", "reason": "w1"},
            {"start_hour": 19, "end_hour": 22, "model": "strong-2", "provider": "p2", "reason": "w2"},
        ])
        result = resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 19, 0))
        assert result["model"] == "strong-1"

    # --- defensive behavior ---

    def test_no_fallback_model_keeps_base(self):
        config = {"enabled": True, "windows": [{"start_hour": 17, "end_hour": 22, "reason": "test"}]}
        result = resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 18, 0))
        assert result["overridden"] is False
        assert result["model"] == "mimo"

    def test_malformed_windows_skipped(self):
        # Bad entries must be ignored, not crash the resolver.
        config = self._config(windows=[
            "not-a-dict",
            {"start_hour": 17},
            {"end_hour": 22},
            {"start_hour": "bad", "end_hour": "bad"},
            {"start_hour": 17, "end_hour": 22, "reason": "valid"},
        ])
        result = resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 18, 0))
        assert result["overridden"] is True
        assert "valid" in result["reason"]

    def test_multiple_windows_coverage(self):
        config = self._config(windows=[
            {"start_hour": 17, "end_hour": 22, "reason": "evening"},
            {"start_hour": 2, "end_hour": 5, "reason": "overnight"},
        ])
        assert resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 20, 0))["overridden"] is True
        assert resolve_cron_model("mimo", config, now=datetime(2026, 4, 13, 3, 0))["overridden"] is True
        assert resolve_cron_model("mimo", config, now=datetime(2026, 4, 12, 10, 0))["overridden"] is False
|
||||
@@ -1,116 +0,0 @@
|
||||
"""Tests for cron prefer_local auto-routing (#378).
|
||||
|
||||
Jobs with prefer_local=true should automatically route to a local inference
|
||||
server (Ollama, llama.cpp, vllm) when one is available, instead of falling
|
||||
through to the cloud default.
|
||||
"""
|
||||
|
||||
import re
|
||||
import pytest
|
||||
|
||||
|
||||
# Patterns mirrored from scheduler for test isolation
|
||||
_LOCAL_ENDPOINTS = [
    # name: label used in status messages; base_url: OpenAI-compatible API root;
    # health: cheap GET URL probed to decide whether the server is up.
    {"name": "ollama", "base_url": "http://localhost:11434/v1", "health": "http://localhost:11434/api/tags"},
    {"name": "llama-cpp", "base_url": "http://localhost:8080/v1", "health": "http://localhost:8080/health"},
    {"name": "vllm", "base_url": "http://localhost:8000/v1", "health": "http://localhost:8000/v1/models"},
]
|
||||
|
||||
|
||||
def _probe_local_endpoint(url: str, timeout: float = 2.0) -> bool:
|
||||
import urllib.request
|
||||
try:
|
||||
req = urllib.request.Request(url)
|
||||
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
||||
return resp.status == 200
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _is_local_endpoint(base_url: str) -> bool:
|
||||
if not base_url:
|
||||
return False
|
||||
from urllib.parse import urlparse
|
||||
parsed = urlparse(base_url)
|
||||
host = (parsed.hostname or "").lower()
|
||||
return host in ("localhost", "127.0.0.1", "0.0.0.0") or (
|
||||
host.startswith("10.") or host.startswith("192.168.") or
|
||||
any(host.startswith(f"172.{i}.") for i in range(16, 32))
|
||||
)
|
||||
|
||||
|
||||
def _resolve_prefer_local(job: dict):
|
||||
if not job.get("prefer_local"):
|
||||
return None, None, ""
|
||||
explicit_url = job.get("base_url", "")
|
||||
if explicit_url and _is_local_endpoint(explicit_url):
|
||||
return None, None, f"prefer_local: explicit base_url {explicit_url} is already local"
|
||||
for ep in _LOCAL_ENDPOINTS:
|
||||
if _probe_local_endpoint(ep["health"], timeout=0.5):
|
||||
return None, ep["base_url"], f"prefer_local: using {ep['name']} at {ep['base_url']}"
|
||||
return None, None, "prefer_local: no local server found"
|
||||
|
||||
|
||||
class TestProbeLocalEndpoint:
    """Health-probe behavior against unreachable or invalid endpoints."""

    def test_unreachable_returns_false(self):
        """A port with nothing listening should return False."""
        assert _probe_local_endpoint("http://localhost:19999/api/tags", timeout=0.5) is False

    def test_invalid_url_returns_false(self):
        # Malformed URLs are swallowed by the probe's broad except.
        assert _probe_local_endpoint("not-a-url", timeout=0.5) is False
|
||||
|
||||
|
||||
class TestResolvePreferLocal:
    """End-to-end behavior of the prefer_local resolution helper."""

    def test_no_prefer_local(self):
        """When prefer_local is not set, return empty overrides."""
        job = {"name": "test", "prompt": "hello"}
        prov, url, status = _resolve_prefer_local(job)
        assert prov is None
        assert url is None
        assert status == ""

    def test_prefer_local_with_explicit_local_url(self):
        """When base_url is already local, skip probing."""
        job = {"name": "test", "prefer_local": True, "base_url": "http://localhost:11434/v1"}
        prov, url, status = _resolve_prefer_local(job)
        assert prov is None
        assert url is None  # Don't override — already local
        assert "already local" in status

    def test_prefer_local_no_server_found(self):
        """When no local server is running, status indicates fallback."""
        job = {"name": "test", "prefer_local": True}
        prov, url, status = _resolve_prefer_local(job)
        # Unless Ollama happens to be running, this should fail
        if url is None:
            assert "no local server" in status

    def test_prefer_local_false(self):
        """prefer_local=false should act like unset."""
        job = {"name": "test", "prefer_local": False}
        prov, url, status = _resolve_prefer_local(job)
        assert prov is None
        assert url is None
        assert status == ""
|
||||
|
||||
|
||||
class TestLocalEndpointsConfig:
    """Verify the well-known endpoints list covers expected servers."""

    def test_ollama_in_endpoints(self):
        names = [ep["name"] for ep in _LOCAL_ENDPOINTS]
        assert "ollama" in names

    def test_llama_cpp_in_endpoints(self):
        names = [ep["name"] for ep in _LOCAL_ENDPOINTS]
        assert "llama-cpp" in names

    def test_all_endpoints_have_health(self):
        # Every entry needs a probe-able HTTP health URL.
        for ep in _LOCAL_ENDPOINTS:
            assert "health" in ep
            assert ep["health"].startswith("http")

    def test_all_endpoints_have_base_url(self):
        # Every entry needs an OpenAI-compatible /v1 API root.
        for ep in _LOCAL_ENDPOINTS:
            assert "base_url" in ep
            assert "/v1" in ep["base_url"]
|
||||
@@ -665,127 +665,6 @@ class TestPruneSessions:
|
||||
|
||||
|
||||
# =========================================================================
|
||||
# =========================================================================
|
||||
# Garbage Collect
|
||||
# =========================================================================
|
||||
|
||||
class TestGarbageCollect:
    """Session GC: deletes old empty/trivial sessions, keeps everything else."""

    def test_gc_deletes_empty_old_sessions(self, db):
        """Empty sessions (0 messages) older than 24h should be deleted."""
        db.create_session(session_id="empty_old", source="cli")
        db.end_session("empty_old", end_reason="done")
        # Backdate past the empty-session cutoff by writing started_at directly.
        db._conn.execute(
            "UPDATE sessions SET started_at = ? WHERE id = ?",
            (time.time() - 48 * 3600, "empty_old"),  # 48 hours ago
        )
        db._conn.commit()

        # Recent empty session should be kept
        db.create_session(session_id="empty_new", source="cli")
        db.end_session("empty_new", end_reason="done")

        result = db.garbage_collect()
        assert result["empty"] == 1
        assert result["trivial"] == 0
        assert result["total"] == 1
        assert db.get_session("empty_old") is None
        assert db.get_session("empty_new") is not None

    def test_gc_deletes_trivial_old_sessions(self, db):
        """Sessions with 1-5 messages older than 7 days should be deleted."""
        db.create_session(session_id="trivial_old", source="cli")
        for i in range(3):
            db.append_message("trivial_old", role="user", content=f"msg {i}")
        db.end_session("trivial_old", end_reason="done")
        db._conn.execute(
            "UPDATE sessions SET started_at = ? WHERE id = ?",
            (time.time() - 10 * 86400, "trivial_old"),  # 10 days ago
        )
        db._conn.commit()

        result = db.garbage_collect()
        assert result["trivial"] == 1
        assert result["total"] == 1
        assert db.get_session("trivial_old") is None

    def test_gc_keeps_active_sessions(self, db):
        """Active (not ended) sessions should never be deleted."""
        db.create_session(session_id="active_old", source="cli")
        # Backdate but don't end
        db._conn.execute(
            "UPDATE sessions SET started_at = ? WHERE id = ?",
            (time.time() - 48 * 3600, "active_old"),
        )
        db._conn.commit()

        result = db.garbage_collect()
        assert result["total"] == 0
        assert db.get_session("active_old") is not None

    def test_gc_keeps_substantial_sessions(self, db):
        """Sessions with >5 messages should never be deleted."""
        db.create_session(session_id="big_old", source="cli")
        for i in range(10):
            db.append_message("big_old", role="user", content=f"msg {i}")
        db.end_session("big_old", end_reason="done")
        db._conn.execute(
            "UPDATE sessions SET started_at = ? WHERE id = ?",
            (time.time() - 365 * 86400, "big_old"),  # 1 year ago
        )
        db._conn.commit()

        result = db.garbage_collect()
        assert result["total"] == 0
        assert db.get_session("big_old") is not None

    def test_gc_dry_run_does_not_delete(self, db):
        """dry_run=True should return counts but not delete anything."""
        db.create_session(session_id="empty_old", source="cli")
        db.end_session("empty_old", end_reason="done")
        db._conn.execute(
            "UPDATE sessions SET started_at = ? WHERE id = ?",
            (time.time() - 48 * 3600, "empty_old"),
        )
        db._conn.commit()

        result = db.garbage_collect(dry_run=True)
        assert result["total"] == 1
        assert db.get_session("empty_old") is not None  # Still exists

    def test_gc_with_source_filter(self, db):
        """--source should only GC sessions from that source."""
        for sid, src in [("old_cli", "cli"), ("old_tg", "telegram")]:
            db.create_session(session_id=sid, source=src)
            db.end_session(sid, end_reason="done")
            db._conn.execute(
                "UPDATE sessions SET started_at = ? WHERE id = ?",
                (time.time() - 48 * 3600, sid),
            )
        db._conn.commit()

        result = db.garbage_collect(source="cli")
        assert result["total"] == 1
        assert db.get_session("old_cli") is None
        assert db.get_session("old_tg") is not None

    def test_gc_handles_child_sessions(self, db):
        """Child sessions should be deleted when parent is GC'd."""
        db.create_session(session_id="parent_old", source="cli")
        db.end_session("parent_old", end_reason="done")
        db._conn.execute(
            "UPDATE sessions SET started_at = ? WHERE id = ?",
            (time.time() - 48 * 3600, "parent_old"),
        )
        # Create child session
        db.create_session(session_id="child", source="cli", parent_session_id="parent_old")
        db.end_session("child", end_reason="done")
        db._conn.commit()

        # Only the parent is counted; the child goes with it (cascade).
        result = db.garbage_collect()
        assert result["total"] == 1
        assert db.get_session("parent_old") is None
        assert db.get_session("child") is None
|
||||
|
||||
# Schema and WAL mode
|
||||
# =========================================================================
|
||||
|
||||
|
||||
@@ -201,17 +201,6 @@ def _format_job(job: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"paused_at": job.get("paused_at"),
|
||||
"paused_reason": job.get("paused_reason"),
|
||||
}
|
||||
# Health timestamps
|
||||
if job.get("last_error_at"):
|
||||
result["last_error_at"] = job["last_error_at"]
|
||||
if job.get("last_success_at"):
|
||||
result["last_success_at"] = job["last_success_at"]
|
||||
if job.get("error_resolved_at"):
|
||||
result["error_resolved_at"] = job["error_resolved_at"]
|
||||
if job.get("error_cleared_at"):
|
||||
result["error_cleared_at"] = job["error_cleared_at"]
|
||||
|
||||
|
||||
if job.get("script"):
|
||||
result["script"] = job["script"]
|
||||
return result
|
||||
@@ -337,13 +326,6 @@ def cronjob(
|
||||
if result is None:
|
||||
return json.dumps({"success": False, "error": "Job not found"}, indent=2)
|
||||
return json.dumps(result, indent=2)
|
||||
if normalized == "clear_error":
|
||||
from cron.jobs import clear_job_error
|
||||
job = clear_job_error(job_id)
|
||||
if job is None:
|
||||
return json.dumps({"success": False, "error": "Job not found"}, indent=2)
|
||||
return json.dumps({"success": True, "job": _format_job(job)}, indent=2)
|
||||
|
||||
|
||||
if normalized == "update":
|
||||
updates: Dict[str, Any] = {}
|
||||
|
||||
Reference in New Issue
Block a user