Previously crash recovery recreated detached sessions as if they were fully managed, so polls and kills could lie about liveness and the checkpoint could forget recovered jobs after the next restart. This commit refreshes recovered host-backed sessions from real PID state, keeps checkpoint data durable, and preserves notify watcher metadata while treating sandbox-only PIDs as non-recoverable. - Persist `pid_scope` in `tools/process_registry.py` and skip recovering sandbox-backed entries without a host-visible PID handle - Refresh detached sessions on access so `get`/`poll`/`wait` and active session queries observe exited processes instead of hanging forever - Allow recovered host PIDs to be terminated honestly and requeue `notify_on_complete` watchers during checkpoint recovery - Add regression tests for durable checkpoints, detached exit/kill behavior, sandbox skip logic, and recovered notify watchers
511 lines
18 KiB
Python
511 lines
18 KiB
Python
"""Tests for tools/process_registry.py — ProcessRegistry query methods, pruning, checkpoint."""
|
|
|
|
import json
|
|
import os
|
|
import signal
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import pytest
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
from tools.environments.local import _HERMES_PROVIDER_ENV_FORCE_PREFIX
|
|
from tools.process_registry import (
|
|
ProcessRegistry,
|
|
ProcessSession,
|
|
MAX_OUTPUT_CHARS,
|
|
FINISHED_TTL_SECONDS,
|
|
MAX_PROCESSES,
|
|
)
|
|
|
|
|
|
@pytest.fixture()
|
|
def registry():
|
|
"""Create a fresh ProcessRegistry."""
|
|
return ProcessRegistry()
|
|
|
|
|
|
def _make_session(
|
|
sid="proc_test123",
|
|
command="echo hello",
|
|
task_id="t1",
|
|
exited=False,
|
|
exit_code=None,
|
|
output="",
|
|
started_at=None,
|
|
) -> ProcessSession:
|
|
"""Helper to create a ProcessSession for testing."""
|
|
s = ProcessSession(
|
|
id=sid,
|
|
command=command,
|
|
task_id=task_id,
|
|
started_at=started_at or time.time(),
|
|
exited=exited,
|
|
exit_code=exit_code,
|
|
output_buffer=output,
|
|
)
|
|
return s
|
|
|
|
|
|
def _spawn_python_sleep(seconds: float) -> subprocess.Popen:
|
|
"""Spawn a portable short-lived Python sleep process."""
|
|
return subprocess.Popen(
|
|
[sys.executable, "-c", f"import time; time.sleep({seconds})"],
|
|
)
|
|
|
|
|
|
def _wait_until(predicate, timeout: float = 5.0, interval: float = 0.05) -> bool:
|
|
"""Poll a predicate until it returns truthy or the timeout elapses."""
|
|
deadline = time.monotonic() + timeout
|
|
while time.monotonic() < deadline:
|
|
if predicate():
|
|
return True
|
|
time.sleep(interval)
|
|
return False
|
|
|
|
|
|
# =========================================================================
|
|
# Get / Poll
|
|
# =========================================================================
|
|
|
|
class TestGetAndPoll:
|
|
def test_get_not_found(self, registry):
|
|
assert registry.get("nonexistent") is None
|
|
|
|
def test_get_running(self, registry):
|
|
s = _make_session()
|
|
registry._running[s.id] = s
|
|
assert registry.get(s.id) is s
|
|
|
|
def test_get_finished(self, registry):
|
|
s = _make_session(exited=True, exit_code=0)
|
|
registry._finished[s.id] = s
|
|
assert registry.get(s.id) is s
|
|
|
|
def test_poll_not_found(self, registry):
|
|
result = registry.poll("nonexistent")
|
|
assert result["status"] == "not_found"
|
|
|
|
def test_poll_running(self, registry):
|
|
s = _make_session(output="some output here")
|
|
registry._running[s.id] = s
|
|
result = registry.poll(s.id)
|
|
assert result["status"] == "running"
|
|
assert "some output" in result["output_preview"]
|
|
assert result["command"] == "echo hello"
|
|
|
|
def test_poll_exited(self, registry):
|
|
s = _make_session(exited=True, exit_code=0, output="done")
|
|
registry._finished[s.id] = s
|
|
result = registry.poll(s.id)
|
|
assert result["status"] == "exited"
|
|
assert result["exit_code"] == 0
|
|
|
|
|
|
# =========================================================================
|
|
# Read log
|
|
# =========================================================================
|
|
|
|
class TestReadLog:
|
|
def test_not_found(self, registry):
|
|
result = registry.read_log("nonexistent")
|
|
assert result["status"] == "not_found"
|
|
|
|
def test_read_full_log(self, registry):
|
|
lines = "\n".join([f"line {i}" for i in range(50)])
|
|
s = _make_session(output=lines)
|
|
registry._running[s.id] = s
|
|
result = registry.read_log(s.id)
|
|
assert result["total_lines"] == 50
|
|
|
|
def test_read_with_limit(self, registry):
|
|
lines = "\n".join([f"line {i}" for i in range(100)])
|
|
s = _make_session(output=lines)
|
|
registry._running[s.id] = s
|
|
result = registry.read_log(s.id, limit=10)
|
|
# Default: last 10 lines
|
|
assert "10 lines" in result["showing"]
|
|
|
|
def test_read_with_offset(self, registry):
|
|
lines = "\n".join([f"line {i}" for i in range(100)])
|
|
s = _make_session(output=lines)
|
|
registry._running[s.id] = s
|
|
result = registry.read_log(s.id, offset=10, limit=5)
|
|
assert "5 lines" in result["showing"]
|
|
|
|
|
|
# =========================================================================
|
|
# List sessions
|
|
# =========================================================================
|
|
|
|
class TestListSessions:
|
|
def test_empty(self, registry):
|
|
assert registry.list_sessions() == []
|
|
|
|
def test_lists_running_and_finished(self, registry):
|
|
s1 = _make_session(sid="proc_1", task_id="t1")
|
|
s2 = _make_session(sid="proc_2", task_id="t1", exited=True, exit_code=0)
|
|
registry._running[s1.id] = s1
|
|
registry._finished[s2.id] = s2
|
|
result = registry.list_sessions()
|
|
assert len(result) == 2
|
|
|
|
def test_filter_by_task_id(self, registry):
|
|
s1 = _make_session(sid="proc_1", task_id="t1")
|
|
s2 = _make_session(sid="proc_2", task_id="t2")
|
|
registry._running[s1.id] = s1
|
|
registry._running[s2.id] = s2
|
|
result = registry.list_sessions(task_id="t1")
|
|
assert len(result) == 1
|
|
assert result[0]["session_id"] == "proc_1"
|
|
|
|
def test_list_entry_fields(self, registry):
|
|
s = _make_session(output="preview text")
|
|
registry._running[s.id] = s
|
|
entry = registry.list_sessions()[0]
|
|
assert "session_id" in entry
|
|
assert "command" in entry
|
|
assert "status" in entry
|
|
assert "pid" in entry
|
|
assert "output_preview" in entry
|
|
|
|
|
|
# =========================================================================
|
|
# Active process queries
|
|
# =========================================================================
|
|
|
|
class TestActiveQueries:
|
|
def test_has_active_processes(self, registry):
|
|
s = _make_session(task_id="t1")
|
|
registry._running[s.id] = s
|
|
assert registry.has_active_processes("t1") is True
|
|
assert registry.has_active_processes("t2") is False
|
|
|
|
def test_has_active_for_session(self, registry):
|
|
s = _make_session()
|
|
s.session_key = "gw_session_1"
|
|
registry._running[s.id] = s
|
|
assert registry.has_active_for_session("gw_session_1") is True
|
|
assert registry.has_active_for_session("other") is False
|
|
|
|
def test_exited_not_active(self, registry):
|
|
s = _make_session(task_id="t1", exited=True, exit_code=0)
|
|
registry._finished[s.id] = s
|
|
assert registry.has_active_processes("t1") is False
|
|
|
|
|
|
# =========================================================================
|
|
# Pruning
|
|
# =========================================================================
|
|
|
|
class TestPruning:
|
|
def test_prune_expired_finished(self, registry):
|
|
old_session = _make_session(
|
|
sid="proc_old",
|
|
exited=True,
|
|
started_at=time.time() - FINISHED_TTL_SECONDS - 100,
|
|
)
|
|
registry._finished[old_session.id] = old_session
|
|
registry._prune_if_needed()
|
|
assert "proc_old" not in registry._finished
|
|
|
|
def test_prune_keeps_recent(self, registry):
|
|
recent = _make_session(sid="proc_recent", exited=True)
|
|
registry._finished[recent.id] = recent
|
|
registry._prune_if_needed()
|
|
assert "proc_recent" in registry._finished
|
|
|
|
def test_prune_over_max_removes_oldest(self, registry):
|
|
# Fill up to MAX_PROCESSES
|
|
for i in range(MAX_PROCESSES):
|
|
s = _make_session(
|
|
sid=f"proc_{i}",
|
|
exited=True,
|
|
started_at=time.time() - i, # older as i increases
|
|
)
|
|
registry._finished[s.id] = s
|
|
|
|
# Add one more running to trigger prune
|
|
s = _make_session(sid="proc_new")
|
|
registry._running[s.id] = s
|
|
registry._prune_if_needed()
|
|
|
|
total = len(registry._running) + len(registry._finished)
|
|
assert total <= MAX_PROCESSES
|
|
|
|
|
|
# =========================================================================
|
|
# Spawn env sanitization
|
|
# =========================================================================
|
|
|
|
class TestSpawnEnvSanitization:
|
|
def test_spawn_local_strips_blocked_vars_from_background_env(self, registry):
|
|
captured = {}
|
|
|
|
def fake_popen(cmd, **kwargs):
|
|
captured["env"] = kwargs["env"]
|
|
proc = MagicMock()
|
|
proc.pid = 4321
|
|
proc.stdout = iter([])
|
|
proc.stdin = MagicMock()
|
|
proc.poll.return_value = None
|
|
return proc
|
|
|
|
fake_thread = MagicMock()
|
|
|
|
with patch.dict(os.environ, {
|
|
"PATH": "/usr/bin:/bin",
|
|
"HOME": "/home/user",
|
|
"USER": "tester",
|
|
"TELEGRAM_BOT_TOKEN": "bot-secret",
|
|
"FIRECRAWL_API_KEY": "fc-secret",
|
|
}, clear=True), \
|
|
patch("tools.process_registry._find_shell", return_value="/bin/bash"), \
|
|
patch("subprocess.Popen", side_effect=fake_popen), \
|
|
patch("threading.Thread", return_value=fake_thread), \
|
|
patch.object(registry, "_write_checkpoint"):
|
|
registry.spawn_local(
|
|
"echo hello",
|
|
cwd="/tmp",
|
|
env_vars={
|
|
"MY_CUSTOM_VAR": "keep-me",
|
|
"TELEGRAM_BOT_TOKEN": "drop-me",
|
|
f"{_HERMES_PROVIDER_ENV_FORCE_PREFIX}TELEGRAM_BOT_TOKEN": "forced-bot-token",
|
|
},
|
|
)
|
|
|
|
env = captured["env"]
|
|
assert env["MY_CUSTOM_VAR"] == "keep-me"
|
|
assert env["TELEGRAM_BOT_TOKEN"] == "forced-bot-token"
|
|
assert "FIRECRAWL_API_KEY" not in env
|
|
assert f"{_HERMES_PROVIDER_ENV_FORCE_PREFIX}TELEGRAM_BOT_TOKEN" not in env
|
|
assert env["PYTHONUNBUFFERED"] == "1"
|
|
|
|
|
|
# =========================================================================
|
|
# Checkpoint
|
|
# =========================================================================
|
|
|
|
class TestCheckpoint:
|
|
def test_write_checkpoint(self, registry, tmp_path):
|
|
with patch("tools.process_registry.CHECKPOINT_PATH", tmp_path / "procs.json"):
|
|
s = _make_session()
|
|
registry._running[s.id] = s
|
|
registry._write_checkpoint()
|
|
|
|
data = json.loads((tmp_path / "procs.json").read_text())
|
|
assert len(data) == 1
|
|
assert data[0]["session_id"] == s.id
|
|
|
|
def test_recover_no_file(self, registry, tmp_path):
|
|
with patch("tools.process_registry.CHECKPOINT_PATH", tmp_path / "missing.json"):
|
|
assert registry.recover_from_checkpoint() == 0
|
|
|
|
def test_recover_dead_pid(self, registry, tmp_path):
|
|
checkpoint = tmp_path / "procs.json"
|
|
checkpoint.write_text(json.dumps([{
|
|
"session_id": "proc_dead",
|
|
"command": "sleep 999",
|
|
"pid": 999999999, # almost certainly not running
|
|
"task_id": "t1",
|
|
}]))
|
|
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
|
|
recovered = registry.recover_from_checkpoint()
|
|
assert recovered == 0
|
|
|
|
def test_write_checkpoint_includes_watcher_metadata(self, registry, tmp_path):
|
|
with patch("tools.process_registry.CHECKPOINT_PATH", tmp_path / "procs.json"):
|
|
s = _make_session()
|
|
s.watcher_platform = "telegram"
|
|
s.watcher_chat_id = "999"
|
|
s.watcher_thread_id = "42"
|
|
s.watcher_interval = 60
|
|
registry._running[s.id] = s
|
|
registry._write_checkpoint()
|
|
|
|
data = json.loads((tmp_path / "procs.json").read_text())
|
|
assert len(data) == 1
|
|
assert data[0]["watcher_platform"] == "telegram"
|
|
assert data[0]["watcher_chat_id"] == "999"
|
|
assert data[0]["watcher_thread_id"] == "42"
|
|
assert data[0]["watcher_interval"] == 60
|
|
|
|
def test_recover_enqueues_watchers(self, registry, tmp_path):
|
|
checkpoint = tmp_path / "procs.json"
|
|
checkpoint.write_text(json.dumps([{
|
|
"session_id": "proc_live",
|
|
"command": "sleep 999",
|
|
"pid": os.getpid(), # current process — guaranteed alive
|
|
"task_id": "t1",
|
|
"session_key": "sk1",
|
|
"watcher_platform": "telegram",
|
|
"watcher_chat_id": "123",
|
|
"watcher_thread_id": "42",
|
|
"watcher_interval": 60,
|
|
}]))
|
|
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
|
|
recovered = registry.recover_from_checkpoint()
|
|
assert recovered == 1
|
|
assert len(registry.pending_watchers) == 1
|
|
w = registry.pending_watchers[0]
|
|
assert w["session_id"] == "proc_live"
|
|
assert w["platform"] == "telegram"
|
|
assert w["chat_id"] == "123"
|
|
assert w["thread_id"] == "42"
|
|
assert w["check_interval"] == 60
|
|
|
|
def test_recover_skips_watcher_when_no_interval(self, registry, tmp_path):
|
|
checkpoint = tmp_path / "procs.json"
|
|
checkpoint.write_text(json.dumps([{
|
|
"session_id": "proc_live",
|
|
"command": "sleep 999",
|
|
"pid": os.getpid(),
|
|
"task_id": "t1",
|
|
"watcher_interval": 0,
|
|
}]))
|
|
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
|
|
recovered = registry.recover_from_checkpoint()
|
|
assert recovered == 1
|
|
assert len(registry.pending_watchers) == 0
|
|
|
|
def test_recovery_keeps_live_checkpoint_entries(self, registry, tmp_path):
|
|
checkpoint = tmp_path / "procs.json"
|
|
checkpoint.write_text(json.dumps([{
|
|
"session_id": "proc_live",
|
|
"command": "sleep 999",
|
|
"pid": os.getpid(),
|
|
"task_id": "t1",
|
|
"session_key": "sk1",
|
|
}]))
|
|
|
|
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
|
|
recovered = registry.recover_from_checkpoint()
|
|
assert recovered == 1
|
|
assert registry.get("proc_live") is not None
|
|
|
|
data = json.loads(checkpoint.read_text())
|
|
assert len(data) == 1
|
|
assert data[0]["session_id"] == "proc_live"
|
|
assert data[0]["pid"] == os.getpid()
|
|
assert data != []
|
|
|
|
def test_recovery_skips_explicit_sandbox_backed_entries(self, registry, tmp_path):
|
|
checkpoint = tmp_path / "procs.json"
|
|
original = [{
|
|
"session_id": "proc_remote",
|
|
"command": "sleep 999",
|
|
"pid": os.getpid(),
|
|
"task_id": "t1",
|
|
"pid_scope": "sandbox",
|
|
}]
|
|
checkpoint.write_text(json.dumps(original))
|
|
|
|
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
|
|
recovered = registry.recover_from_checkpoint()
|
|
assert recovered == 0
|
|
assert registry.get("proc_remote") is None
|
|
|
|
data = json.loads(checkpoint.read_text())
|
|
assert data == []
|
|
|
|
def test_detached_recovered_process_eventually_exits(self, registry, tmp_path):
|
|
proc = _spawn_python_sleep(0.4)
|
|
checkpoint = tmp_path / "procs.json"
|
|
checkpoint.write_text(json.dumps([{
|
|
"session_id": "proc_live",
|
|
"command": "python -c 'import time; time.sleep(0.4)'",
|
|
"pid": proc.pid,
|
|
"task_id": "t1",
|
|
"session_key": "sk1",
|
|
}]))
|
|
|
|
try:
|
|
with patch("tools.process_registry.CHECKPOINT_PATH", checkpoint):
|
|
recovered = registry.recover_from_checkpoint()
|
|
assert recovered == 1
|
|
|
|
session = registry.get("proc_live")
|
|
assert session is not None
|
|
assert session.detached is True
|
|
|
|
proc.wait(timeout=5)
|
|
|
|
assert _wait_until(
|
|
lambda: registry.get("proc_live") is not None
|
|
and registry.get("proc_live").exited,
|
|
timeout=5,
|
|
)
|
|
|
|
poll_result = registry.poll("proc_live")
|
|
assert poll_result["status"] == "exited"
|
|
|
|
wait_result = registry.wait("proc_live", timeout=1)
|
|
assert wait_result["status"] == "exited"
|
|
finally:
|
|
if proc.poll() is None:
|
|
proc.terminate()
|
|
try:
|
|
proc.wait(timeout=5)
|
|
except Exception:
|
|
proc.kill()
|
|
proc.wait(timeout=5)
|
|
|
|
|
|
# =========================================================================
|
|
# Kill process
|
|
# =========================================================================
|
|
|
|
class TestKillProcess:
|
|
def test_kill_not_found(self, registry):
|
|
result = registry.kill_process("nonexistent")
|
|
assert result["status"] == "not_found"
|
|
|
|
def test_kill_already_exited(self, registry):
|
|
s = _make_session(exited=True, exit_code=0)
|
|
registry._finished[s.id] = s
|
|
result = registry.kill_process(s.id)
|
|
assert result["status"] == "already_exited"
|
|
|
|
def test_kill_detached_session_uses_host_pid(self, registry):
|
|
s = _make_session(sid="proc_detached", command="sleep 999")
|
|
s.pid = 424242
|
|
s.detached = True
|
|
registry._running[s.id] = s
|
|
|
|
calls = []
|
|
|
|
def fake_kill(pid, sig):
|
|
calls.append((pid, sig))
|
|
|
|
try:
|
|
with patch("tools.process_registry.os.kill", side_effect=fake_kill):
|
|
result = registry.kill_process(s.id)
|
|
|
|
assert result["status"] == "killed"
|
|
assert (424242, 0) in calls
|
|
assert (424242, signal.SIGTERM) in calls
|
|
finally:
|
|
registry._running.pop(s.id, None)
|
|
|
|
|
|
# =========================================================================
|
|
# Tool handler
|
|
# =========================================================================
|
|
|
|
class TestProcessToolHandler:
|
|
def test_list_action(self):
|
|
from tools.process_registry import _handle_process
|
|
result = json.loads(_handle_process({"action": "list"}))
|
|
assert "processes" in result
|
|
|
|
def test_poll_missing_session_id(self):
|
|
from tools.process_registry import _handle_process
|
|
result = json.loads(_handle_process({"action": "poll"}))
|
|
assert "error" in result
|
|
|
|
def test_unknown_action(self):
|
|
from tools.process_registry import _handle_process
|
|
result = json.loads(_handle_process({"action": "unknown_action"}))
|
|
assert "error" in result
|