Files
hermes-agent/cron/scheduler.py
Myeongwon Choi ea16949422 fix(cron): suppress delivery when [SILENT] appears anywhere in response
Previously the scheduler checked startswith('[SILENT]'), so agents that
appended [SILENT] after an explanation (e.g. 'N items filtered.\n\n[SILENT]')
would still trigger delivery.

Change the check to 'in' so the marker is caught regardless of position.
Add test_silent_trailing_suppresses_delivery to cover this case.
2026-04-06 16:49:40 -07:00

851 lines
32 KiB
Python

"""
Cron job scheduler - executes due jobs.
Provides tick() which checks for due jobs and runs them. The gateway
calls this every 60 seconds from a background thread.
Uses a file-based lock (~/.hermes/cron/.tick.lock) so only one tick
runs at a time if multiple processes overlap.
"""
import asyncio
import concurrent.futures
import json
import logging
import os
import subprocess
import sys
# fcntl is Unix-only; on Windows use msvcrt for file locking
try:
import fcntl
except ImportError:
fcntl = None
try:
import msvcrt
except ImportError:
msvcrt = None
import time
from pathlib import Path
from typing import Optional
# Add parent directory to path for imports BEFORE repo-level imports.
# Without this, standalone invocations (e.g. after `hermes update` reloads
# the module) fail with ModuleNotFoundError for hermes_time et al.
sys.path.insert(0, str(Path(__file__).parent.parent))
from hermes_constants import get_hermes_home
from hermes_cli.config import load_config
from hermes_time import now as _hermes_now
logger = logging.getLogger(__name__)
# Valid delivery platforms — used to validate user-supplied platform names
# in cron delivery targets, preventing env var enumeration via crafted names.
_KNOWN_DELIVERY_PLATFORMS = frozenset({
"telegram", "discord", "slack", "whatsapp", "signal",
"matrix", "mattermost", "homeassistant", "dingtalk", "feishu",
"wecom", "sms", "email", "webhook",
})
from cron.jobs import get_due_jobs, mark_job_run, save_job_output, advance_next_run
# Sentinel: when a cron agent has nothing new to report, it can start its
# response with this marker to suppress delivery. Output is still saved
# locally for audit.
SILENT_MARKER = "[SILENT]"
# Resolve Hermes home directory (respects HERMES_HOME override)
_hermes_home = get_hermes_home()
# File-based lock prevents concurrent ticks from gateway + daemon + systemd timer
_LOCK_DIR = _hermes_home / "cron"
_LOCK_FILE = _LOCK_DIR / ".tick.lock"
def _resolve_origin(job: dict) -> Optional[dict]:
"""Extract origin info from a job, preserving any extra routing metadata."""
origin = job.get("origin")
if not origin:
return None
platform = origin.get("platform")
chat_id = origin.get("chat_id")
if platform and chat_id:
return origin
return None
def _resolve_delivery_target(job: dict) -> Optional[dict]:
"""Resolve the concrete auto-delivery target for a cron job, if any."""
deliver = job.get("deliver", "local")
origin = _resolve_origin(job)
if deliver == "local":
return None
if deliver == "origin":
if origin:
return {
"platform": origin["platform"],
"chat_id": str(origin["chat_id"]),
"thread_id": origin.get("thread_id"),
}
# Origin missing (e.g. job created via API/script) — try each
# platform's home channel as a fallback instead of silently dropping.
for platform_name in ("matrix", "telegram", "discord", "slack"):
chat_id = os.getenv(f"{platform_name.upper()}_HOME_CHANNEL", "")
if chat_id:
logger.info(
"Job '%s' has deliver=origin but no origin; falling back to %s home channel",
job.get("name", job.get("id", "?")),
platform_name,
)
return {
"platform": platform_name,
"chat_id": chat_id,
"thread_id": None,
}
return None
if ":" in deliver:
platform_name, rest = deliver.split(":", 1)
platform_key = platform_name.lower()
from tools.send_message_tool import _parse_target_ref
parsed_chat_id, parsed_thread_id, is_explicit = _parse_target_ref(platform_key, rest)
if is_explicit:
chat_id, thread_id = parsed_chat_id, parsed_thread_id
else:
chat_id, thread_id = rest, None
# Resolve human-friendly labels like "Alice (dm)" to real IDs.
try:
from gateway.channel_directory import resolve_channel_name
resolved = resolve_channel_name(platform_key, chat_id)
if resolved:
parsed_chat_id, parsed_thread_id, resolved_is_explicit = _parse_target_ref(platform_key, resolved)
if resolved_is_explicit:
chat_id, thread_id = parsed_chat_id, parsed_thread_id
else:
chat_id = resolved
except Exception:
pass
return {
"platform": platform_name,
"chat_id": chat_id,
"thread_id": thread_id,
}
platform_name = deliver
if origin and origin.get("platform") == platform_name:
return {
"platform": platform_name,
"chat_id": str(origin["chat_id"]),
"thread_id": origin.get("thread_id"),
}
if platform_name.lower() not in _KNOWN_DELIVERY_PLATFORMS:
return None
chat_id = os.getenv(f"{platform_name.upper()}_HOME_CHANNEL", "")
if not chat_id:
return None
return {
"platform": platform_name,
"chat_id": chat_id,
"thread_id": None,
}
def _deliver_result(job: dict, content: str, adapters=None, loop=None) -> None:
"""
Deliver job output to the configured target (origin chat, specific platform, etc.).
When ``adapters`` and ``loop`` are provided (gateway is running), tries to
use the live adapter first — this supports E2EE rooms (e.g. Matrix) where
the standalone HTTP path cannot encrypt. Falls back to standalone send if
the adapter path fails or is unavailable.
"""
target = _resolve_delivery_target(job)
if not target:
if job.get("deliver", "local") != "local":
logger.warning(
"Job '%s' deliver=%s but no concrete delivery target could be resolved",
job["id"],
job.get("deliver", "local"),
)
return
platform_name = target["platform"]
chat_id = target["chat_id"]
thread_id = target.get("thread_id")
from tools.send_message_tool import _send_to_platform
from gateway.config import load_gateway_config, Platform
platform_map = {
"telegram": Platform.TELEGRAM,
"discord": Platform.DISCORD,
"slack": Platform.SLACK,
"whatsapp": Platform.WHATSAPP,
"signal": Platform.SIGNAL,
"matrix": Platform.MATRIX,
"mattermost": Platform.MATTERMOST,
"homeassistant": Platform.HOMEASSISTANT,
"dingtalk": Platform.DINGTALK,
"feishu": Platform.FEISHU,
"wecom": Platform.WECOM,
"email": Platform.EMAIL,
"sms": Platform.SMS,
}
platform = platform_map.get(platform_name.lower())
if not platform:
logger.warning("Job '%s': unknown platform '%s' for delivery", job["id"], platform_name)
return
try:
config = load_gateway_config()
except Exception as e:
logger.error("Job '%s': failed to load gateway config for delivery: %s", job["id"], e)
return
pconfig = config.platforms.get(platform)
if not pconfig or not pconfig.enabled:
logger.warning("Job '%s': platform '%s' not configured/enabled", job["id"], platform_name)
return
# Optionally wrap the content with a header/footer so the user knows this
# is a cron delivery. Wrapping is on by default; set cron.wrap_response: false
# in config.yaml for clean output.
wrap_response = True
try:
user_cfg = load_config()
wrap_response = user_cfg.get("cron", {}).get("wrap_response", True)
except Exception:
pass
if wrap_response:
task_name = job.get("name", job["id"])
delivery_content = (
f"Cronjob Response: {task_name}\n"
f"-------------\n\n"
f"{content}\n\n"
f"Note: The agent cannot see this message, and therefore cannot respond to it."
)
else:
delivery_content = content
# Extract MEDIA: tags so attachments are forwarded as files, not raw text
from gateway.platforms.base import BasePlatformAdapter
media_files, cleaned_delivery_content = BasePlatformAdapter.extract_media(delivery_content)
# Prefer the live adapter when the gateway is running — this supports E2EE
# rooms (e.g. Matrix) where the standalone HTTP path cannot encrypt.
runtime_adapter = (adapters or {}).get(platform)
if runtime_adapter is not None and loop is not None and getattr(loop, "is_running", lambda: False)():
send_metadata = {"thread_id": thread_id} if thread_id else None
try:
future = asyncio.run_coroutine_threadsafe(
runtime_adapter.send(chat_id, delivery_content, metadata=send_metadata),
loop,
)
send_result = future.result(timeout=60)
if send_result and not getattr(send_result, "success", True):
err = getattr(send_result, "error", "unknown")
logger.warning(
"Job '%s': live adapter send to %s:%s failed (%s), falling back to standalone",
job["id"], platform_name, chat_id, err,
)
else:
logger.info("Job '%s': delivered to %s:%s via live adapter", job["id"], platform_name, chat_id)
return
except Exception as e:
logger.warning(
"Job '%s': live adapter delivery to %s:%s failed (%s), falling back to standalone",
job["id"], platform_name, chat_id, e,
)
# Standalone path: run the async send in a fresh event loop (safe from any thread)
coro = _send_to_platform(platform, pconfig, chat_id, cleaned_delivery_content, thread_id=thread_id, media_files=media_files)
try:
result = asyncio.run(coro)
except RuntimeError:
# asyncio.run() checks for a running loop before awaiting the coroutine;
# when it raises, the original coro was never started — close it to
# prevent "coroutine was never awaited" RuntimeWarning, then retry in a
# fresh thread that has no running loop.
coro.close()
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
future = pool.submit(asyncio.run, _send_to_platform(platform, pconfig, chat_id, cleaned_delivery_content, thread_id=thread_id, media_files=media_files))
result = future.result(timeout=30)
except Exception as e:
logger.error("Job '%s': delivery to %s:%s failed: %s", job["id"], platform_name, chat_id, e)
return
if result and result.get("error"):
logger.error("Job '%s': delivery error: %s", job["id"], result["error"])
else:
logger.info("Job '%s': delivered to %s:%s", job["id"], platform_name, chat_id)
_SCRIPT_TIMEOUT = 120 # seconds
def _run_job_script(script_path: str) -> tuple[bool, str]:
"""Execute a cron job's data-collection script and capture its output.
Scripts must reside within HERMES_HOME/scripts/. Both relative and
absolute paths are resolved and validated against this directory to
prevent arbitrary script execution via path traversal or absolute
path injection.
Args:
script_path: Path to a Python script. Relative paths are resolved
against HERMES_HOME/scripts/. Absolute and ~-prefixed paths
are also validated to ensure they stay within the scripts dir.
Returns:
(success, output) — on failure *output* contains the error message so the
LLM can report the problem to the user.
"""
from hermes_constants import get_hermes_home
scripts_dir = get_hermes_home() / "scripts"
scripts_dir.mkdir(parents=True, exist_ok=True)
scripts_dir_resolved = scripts_dir.resolve()
raw = Path(script_path).expanduser()
if raw.is_absolute():
path = raw.resolve()
else:
path = (scripts_dir / raw).resolve()
# Guard against path traversal, absolute path injection, and symlink
# escape — scripts MUST reside within HERMES_HOME/scripts/.
try:
path.relative_to(scripts_dir_resolved)
except ValueError:
return False, (
f"Blocked: script path resolves outside the scripts directory "
f"({scripts_dir_resolved}): {script_path!r}"
)
if not path.exists():
return False, f"Script not found: {path}"
if not path.is_file():
return False, f"Script path is not a file: {path}"
try:
result = subprocess.run(
[sys.executable, str(path)],
capture_output=True,
text=True,
timeout=_SCRIPT_TIMEOUT,
cwd=str(path.parent),
)
stdout = (result.stdout or "").strip()
stderr = (result.stderr or "").strip()
if result.returncode != 0:
parts = [f"Script exited with code {result.returncode}"]
if stderr:
parts.append(f"stderr:\n{stderr}")
if stdout:
parts.append(f"stdout:\n{stdout}")
return False, "\n".join(parts)
# Redact any secrets that may appear in script output before
# they are injected into the LLM prompt context.
try:
from agent.redact import redact_sensitive_text
stdout = redact_sensitive_text(stdout)
except Exception:
pass
return True, stdout
except subprocess.TimeoutExpired:
return False, f"Script timed out after {_SCRIPT_TIMEOUT}s: {path}"
except Exception as exc:
return False, f"Script execution failed: {exc}"
def _build_job_prompt(job: dict) -> str:
"""Build the effective prompt for a cron job, optionally loading one or more skills first."""
prompt = job.get("prompt", "")
skills = job.get("skills")
# Run data-collection script if configured, inject output as context.
script_path = job.get("script")
if script_path:
success, script_output = _run_job_script(script_path)
if success:
if script_output:
prompt = (
"## Script Output\n"
"The following data was collected by a pre-run script. "
"Use it as context for your analysis.\n\n"
f"```\n{script_output}\n```\n\n"
f"{prompt}"
)
else:
prompt = (
"[Script ran successfully but produced no output.]\n\n"
f"{prompt}"
)
else:
prompt = (
"## Script Error\n"
"The data-collection script failed. Report this to the user.\n\n"
f"```\n{script_output}\n```\n\n"
f"{prompt}"
)
# Always prepend cron execution guidance so the agent knows how
# delivery works and can suppress delivery when appropriate.
cron_hint = (
"[SYSTEM: You are running as a scheduled cron job. "
"DELIVERY: Your final response will be automatically delivered "
"to the user — do NOT use send_message or try to deliver "
"the output yourself. Just produce your report/output as your "
"final response and the system handles the rest. "
"SILENT: If there is genuinely nothing new to report, respond "
"with exactly \"[SILENT]\" (nothing else) to suppress delivery. "
"Never combine [SILENT] with content — either report your "
"findings normally, or say [SILENT] and nothing more.]\n\n"
)
prompt = cron_hint + prompt
if skills is None:
legacy = job.get("skill")
skills = [legacy] if legacy else []
skill_names = [str(name).strip() for name in skills if str(name).strip()]
if not skill_names:
return prompt
from tools.skills_tool import skill_view
parts = []
skipped: list[str] = []
for skill_name in skill_names:
loaded = json.loads(skill_view(skill_name))
if not loaded.get("success"):
error = loaded.get("error") or f"Failed to load skill '{skill_name}'"
logger.warning("Cron job '%s': skill not found, skipping — %s", job.get("name", job.get("id")), error)
skipped.append(skill_name)
continue
content = str(loaded.get("content") or "").strip()
if parts:
parts.append("")
parts.extend(
[
f'[SYSTEM: The user has invoked the "{skill_name}" skill, indicating they want you to follow its instructions. The full skill content is loaded below.]',
"",
content,
]
)
if skipped:
notice = (
f"[SYSTEM: The following skill(s) were listed for this job but could not be found "
f"and were skipped: {', '.join(skipped)}. "
f"Start your response with a brief notice so the user is aware, e.g.: "
f"'⚠️ Skill(s) not found and skipped: {', '.join(skipped)}']"
)
parts.insert(0, notice)
if prompt:
parts.extend(["", f"The user has provided the following instruction alongside the skill invocation: {prompt}"])
return "\n".join(parts)
def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
"""
Execute a single cron job.
Returns:
Tuple of (success, full_output_doc, final_response, error_message)
"""
from run_agent import AIAgent
# Initialize SQLite session store so cron job messages are persisted
# and discoverable via session_search (same pattern as gateway/run.py).
_session_db = None
try:
from hermes_state import SessionDB
_session_db = SessionDB()
except Exception as e:
logger.debug("Job '%s': SQLite session store not available: %s", job.get("id", "?"), e)
job_id = job["id"]
job_name = job["name"]
prompt = _build_job_prompt(job)
origin = _resolve_origin(job)
_cron_session_id = f"cron_{job_id}_{_hermes_now().strftime('%Y%m%d_%H%M%S')}"
logger.info("Running job '%s' (ID: %s)", job_name, job_id)
logger.info("Prompt: %s", prompt[:100])
try:
# Inject origin context so the agent's send_message tool knows the chat.
# Must be INSIDE the try block so the finally cleanup always runs.
if origin:
os.environ["HERMES_SESSION_PLATFORM"] = origin["platform"]
os.environ["HERMES_SESSION_CHAT_ID"] = str(origin["chat_id"])
if origin.get("chat_name"):
os.environ["HERMES_SESSION_CHAT_NAME"] = origin["chat_name"]
# Re-read .env and config.yaml fresh every run so provider/key
# changes take effect without a gateway restart.
from dotenv import load_dotenv
try:
load_dotenv(str(_hermes_home / ".env"), override=True, encoding="utf-8")
except UnicodeDecodeError:
load_dotenv(str(_hermes_home / ".env"), override=True, encoding="latin-1")
delivery_target = _resolve_delivery_target(job)
if delivery_target:
os.environ["HERMES_CRON_AUTO_DELIVER_PLATFORM"] = delivery_target["platform"]
os.environ["HERMES_CRON_AUTO_DELIVER_CHAT_ID"] = str(delivery_target["chat_id"])
if delivery_target.get("thread_id") is not None:
os.environ["HERMES_CRON_AUTO_DELIVER_THREAD_ID"] = str(delivery_target["thread_id"])
model = job.get("model") or os.getenv("HERMES_MODEL") or ""
# Load config.yaml for model, reasoning, prefill, toolsets, provider routing
_cfg = {}
try:
import yaml
_cfg_path = str(_hermes_home / "config.yaml")
if os.path.exists(_cfg_path):
with open(_cfg_path) as _f:
_cfg = yaml.safe_load(_f) or {}
_model_cfg = _cfg.get("model", {})
if not job.get("model"):
if isinstance(_model_cfg, str):
model = _model_cfg
elif isinstance(_model_cfg, dict):
model = _model_cfg.get("default", model)
except Exception as e:
logger.warning("Job '%s': failed to load config.yaml, using defaults: %s", job_id, e)
# Reasoning config from env or config.yaml
from hermes_constants import parse_reasoning_effort
effort = os.getenv("HERMES_REASONING_EFFORT", "")
if not effort:
effort = str(_cfg.get("agent", {}).get("reasoning_effort", "")).strip()
reasoning_config = parse_reasoning_effort(effort)
# Prefill messages from env or config.yaml
prefill_messages = None
prefill_file = os.getenv("HERMES_PREFILL_MESSAGES_FILE", "") or _cfg.get("prefill_messages_file", "")
if prefill_file:
import json as _json
pfpath = Path(prefill_file).expanduser()
if not pfpath.is_absolute():
pfpath = _hermes_home / pfpath
if pfpath.exists():
try:
with open(pfpath, "r", encoding="utf-8") as _pf:
prefill_messages = _json.load(_pf)
if not isinstance(prefill_messages, list):
prefill_messages = None
except Exception as e:
logger.warning("Job '%s': failed to parse prefill messages file '%s': %s", job_id, pfpath, e)
prefill_messages = None
# Max iterations
max_iterations = _cfg.get("agent", {}).get("max_turns") or _cfg.get("max_turns") or 90
# Provider routing
pr = _cfg.get("provider_routing", {})
smart_routing = _cfg.get("smart_model_routing", {}) or {}
from hermes_cli.runtime_provider import (
resolve_runtime_provider,
format_runtime_provider_error,
)
try:
runtime_kwargs = {
"requested": job.get("provider") or os.getenv("HERMES_INFERENCE_PROVIDER"),
}
if job.get("base_url"):
runtime_kwargs["explicit_base_url"] = job.get("base_url")
runtime = resolve_runtime_provider(**runtime_kwargs)
except Exception as exc:
message = format_runtime_provider_error(exc)
raise RuntimeError(message) from exc
from agent.smart_model_routing import resolve_turn_route
turn_route = resolve_turn_route(
prompt,
smart_routing,
{
"model": model,
"api_key": runtime.get("api_key"),
"base_url": runtime.get("base_url"),
"provider": runtime.get("provider"),
"api_mode": runtime.get("api_mode"),
"command": runtime.get("command"),
"args": list(runtime.get("args") or []),
},
)
agent = AIAgent(
model=turn_route["model"],
api_key=turn_route["runtime"].get("api_key"),
base_url=turn_route["runtime"].get("base_url"),
provider=turn_route["runtime"].get("provider"),
api_mode=turn_route["runtime"].get("api_mode"),
acp_command=turn_route["runtime"].get("command"),
acp_args=turn_route["runtime"].get("args"),
max_iterations=max_iterations,
reasoning_config=reasoning_config,
prefill_messages=prefill_messages,
providers_allowed=pr.get("only"),
providers_ignored=pr.get("ignore"),
providers_order=pr.get("order"),
provider_sort=pr.get("sort"),
disabled_toolsets=["cronjob", "messaging", "clarify"],
quiet_mode=True,
skip_memory=True, # Cron system prompts would corrupt user representations
platform="cron",
session_id=_cron_session_id,
session_db=_session_db,
)
# Run the agent with an *inactivity*-based timeout: the job can run
# for hours if it's actively calling tools / receiving stream tokens,
# but a hung API call or stuck tool with no activity for the configured
# duration is caught and killed. Default 600s (10 min inactivity);
# override via HERMES_CRON_TIMEOUT env var. 0 = unlimited.
#
# Uses the agent's built-in activity tracker (updated by
# _touch_activity() on every tool call, API call, and stream delta).
_cron_timeout = float(os.getenv("HERMES_CRON_TIMEOUT", 600))
_cron_inactivity_limit = _cron_timeout if _cron_timeout > 0 else None
_POLL_INTERVAL = 5.0
_cron_pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
_cron_future = _cron_pool.submit(agent.run_conversation, prompt)
_inactivity_timeout = False
try:
if _cron_inactivity_limit is None:
# Unlimited — just wait for the result.
result = _cron_future.result()
else:
result = None
while True:
done, _ = concurrent.futures.wait(
{_cron_future}, timeout=_POLL_INTERVAL,
)
if done:
result = _cron_future.result()
break
# Agent still running — check inactivity.
_idle_secs = 0.0
if hasattr(agent, "get_activity_summary"):
try:
_act = agent.get_activity_summary()
_idle_secs = _act.get("seconds_since_activity", 0.0)
except Exception:
pass
if _idle_secs >= _cron_inactivity_limit:
_inactivity_timeout = True
break
except Exception:
_cron_pool.shutdown(wait=False, cancel_futures=True)
raise
finally:
_cron_pool.shutdown(wait=False)
if _inactivity_timeout:
# Build diagnostic summary from the agent's activity tracker.
_activity = {}
if hasattr(agent, "get_activity_summary"):
try:
_activity = agent.get_activity_summary()
except Exception:
pass
_last_desc = _activity.get("last_activity_desc", "unknown")
_secs_ago = _activity.get("seconds_since_activity", 0)
_cur_tool = _activity.get("current_tool")
_iter_n = _activity.get("api_call_count", 0)
_iter_max = _activity.get("max_iterations", 0)
logger.error(
"Job '%s' idle for %.0fs (inactivity limit %.0fs) "
"| last_activity=%s | iteration=%s/%s | tool=%s",
job_name, _secs_ago, _cron_inactivity_limit,
_last_desc, _iter_n, _iter_max,
_cur_tool or "none",
)
if hasattr(agent, "interrupt"):
agent.interrupt("Cron job timed out (inactivity)")
raise TimeoutError(
f"Cron job '{job_name}' idle for "
f"{int(_secs_ago)}s (limit {int(_cron_inactivity_limit)}s) "
f"— last activity: {_last_desc}"
)
final_response = result.get("final_response", "") or ""
# Use a separate variable for log display; keep final_response clean
# for delivery logic (empty response = no delivery).
logged_response = final_response if final_response else "(No response generated)"
output = f"""# Cron Job: {job_name}
**Job ID:** {job_id}
**Run Time:** {_hermes_now().strftime('%Y-%m-%d %H:%M:%S')}
**Schedule:** {job.get('schedule_display', 'N/A')}
## Prompt
{prompt}
## Response
{logged_response}
"""
logger.info("Job '%s' completed successfully", job_name)
return True, output, final_response, None
except Exception as e:
error_msg = f"{type(e).__name__}: {str(e)}"
logger.exception("Job '%s' failed: %s", job_name, error_msg)
output = f"""# Cron Job: {job_name} (FAILED)
**Job ID:** {job_id}
**Run Time:** {_hermes_now().strftime('%Y-%m-%d %H:%M:%S')}
**Schedule:** {job.get('schedule_display', 'N/A')}
## Prompt
{prompt}
## Error
```
{error_msg}
```
"""
return False, output, "", error_msg
finally:
# Clean up injected env vars so they don't leak to other jobs
for key in (
"HERMES_SESSION_PLATFORM",
"HERMES_SESSION_CHAT_ID",
"HERMES_SESSION_CHAT_NAME",
"HERMES_CRON_AUTO_DELIVER_PLATFORM",
"HERMES_CRON_AUTO_DELIVER_CHAT_ID",
"HERMES_CRON_AUTO_DELIVER_THREAD_ID",
):
os.environ.pop(key, None)
if _session_db:
try:
_session_db.end_session(_cron_session_id, "cron_complete")
except (Exception, KeyboardInterrupt) as e:
logger.debug("Job '%s': failed to end session: %s", job_id, e)
try:
_session_db.close()
except (Exception, KeyboardInterrupt) as e:
logger.debug("Job '%s': failed to close SQLite session store: %s", job_id, e)
def tick(verbose: bool = True, adapters=None, loop=None) -> int:
"""
Check and run all due jobs.
Uses a file lock so only one tick runs at a time, even if the gateway's
in-process ticker and a standalone daemon or manual tick overlap.
Args:
verbose: Whether to print status messages
adapters: Optional dict mapping Platform → live adapter (from gateway)
loop: Optional asyncio event loop (from gateway) for live adapter sends
Returns:
Number of jobs executed (0 if another tick is already running)
"""
_LOCK_DIR.mkdir(parents=True, exist_ok=True)
# Cross-platform file locking: fcntl on Unix, msvcrt on Windows
lock_fd = None
try:
lock_fd = open(_LOCK_FILE, "w")
if fcntl:
fcntl.flock(lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
elif msvcrt:
msvcrt.locking(lock_fd.fileno(), msvcrt.LK_NBLCK, 1)
except (OSError, IOError):
logger.debug("Tick skipped — another instance holds the lock")
if lock_fd is not None:
lock_fd.close()
return 0
try:
due_jobs = get_due_jobs()
if verbose and not due_jobs:
logger.info("%s - No jobs due", _hermes_now().strftime('%H:%M:%S'))
return 0
if verbose:
logger.info("%s - %s job(s) due", _hermes_now().strftime('%H:%M:%S'), len(due_jobs))
executed = 0
for job in due_jobs:
try:
# For recurring jobs (cron/interval), advance next_run_at to the
# next future occurrence BEFORE execution. This way, if the
# process crashes mid-run, the job won't re-fire on restart.
# One-shot jobs are left alone so they can retry on restart.
advance_next_run(job["id"])
success, output, final_response, error = run_job(job)
output_file = save_job_output(job["id"], output)
if verbose:
logger.info("Output saved to: %s", output_file)
# Deliver the final response to the origin/target chat.
# If the agent responded with [SILENT], skip delivery (but
# output is already saved above). Failed jobs always deliver.
deliver_content = final_response if success else f"⚠️ Cron job '{job.get('name', job['id'])}' failed:\n{error}"
should_deliver = bool(deliver_content)
if should_deliver and success and SILENT_MARKER in deliver_content.strip().upper():
logger.info("Job '%s': agent returned %s — skipping delivery", job["id"], SILENT_MARKER)
should_deliver = False
if should_deliver:
try:
_deliver_result(job, deliver_content, adapters=adapters, loop=loop)
except Exception as de:
logger.error("Delivery failed for job %s: %s", job["id"], de)
mark_job_run(job["id"], success, error)
executed += 1
except Exception as e:
logger.error("Error processing job %s: %s", job['id'], e)
mark_job_run(job["id"], False, str(e))
return executed
finally:
if fcntl:
fcntl.flock(lock_fd, fcntl.LOCK_UN)
elif msvcrt:
try:
msvcrt.locking(lock_fd.fileno(), msvcrt.LK_UNLCK, 1)
except (OSError, IOError):
pass
lock_fd.close()
if __name__ == "__main__":
tick(verbose=True)