Refactor Singularity environment for persistent container management

- Updated the _SingularityEnvironment class to utilize a persistent Apptainer instance, allowing state (files, installs, environment changes) to persist across commands.
- Enhanced the initialization process to start a background instance with full isolation and writable filesystem.
- Modified the execute method to connect to the running instance, ensuring commands run within the same container context.
- Implemented cleanup functionality to stop the persistent instance on cleanup or destruction, improving resource management.
- Updated class documentation to reflect new features and usage of the persistent environment.
This commit is contained in:
teknium
2026-02-10 06:49:58 +00:00
parent 1b1307d0d1
commit e8343f2d87

View File

@@ -637,15 +637,21 @@ class _LocalEnvironment:
class _SingularityEnvironment:
"""
Custom Singularity/Apptainer environment with better space management.
Persistent Singularity/Apptainer container environment.
- Automatically builds/caches SIF images from docker:// URLs
- Builds sandbox in /scratch (if available) or configurable location
- Binds a large working directory into the container
- Keeps container isolated from host filesystem
Uses `apptainer instance` to create a long-running container that persists
state (files, installs, env changes) across all commands within a task.
The model experiences this as a real Linux VM.
Features:
- Persistent filesystem: files created in one command are visible in the next
- Package installs persist: pip/apt installs survive across tool calls
- Full isolation: --containall gives PID, IPC, and environment isolation
- Writable tmpfs overlay: full root filesystem is writable (RAM-backed)
- Automatic SIF caching: docker:// images converted to SIF once, reused forever
"""
def __init__(self, image: str, cwd: str = "/workspace", timeout: int = 60):
def __init__(self, image: str, cwd: str = "/root", timeout: int = 60):
self.cwd = cwd
self.timeout = timeout
@@ -655,48 +661,60 @@ class _SingularityEnvironment:
# Get or build SIF from docker:// URL (fast if already cached)
self.image = _get_or_build_sif(image, self.executable)
# Get scratch directory for working files
self.scratch_dir = _get_scratch_dir()
# Create unique instance name (must be alphanumeric + underscores)
self.instance_id = f"hermes_{uuid.uuid4().hex[:12]}"
self._instance_started = False
# Create unique ID for this environment
self.sandbox_id = f"hermes-{uuid.uuid4().hex[:12]}"
# Start the persistent instance
self._start_instance()
def _start_instance(self):
"""Start a persistent apptainer instance.
# Create a working directory that will be bound into the container
# This persists files across commands within the same task
self.work_dir = self.scratch_dir / f"{self.sandbox_id}-work"
self.work_dir.mkdir(parents=True, exist_ok=True)
The instance runs as a background process. All subsequent execute() calls
run commands inside this same instance, so state persists across calls.
"""
cmd = [
self.executable, "instance", "start",
"--writable-tmpfs", # RAM-backed writable overlay on read-only SIF
"--containall", # Full isolation: PID, IPC, environment, filesystem
str(self.image),
self.instance_id,
]
# No sandbox build needed! We use --writable-tmpfs which runs directly
# from the read-only SIF with an in-memory writable overlay.
# This is instant (vs minutes for sandbox build) and scales to any
# number of concurrent workers since they all share the same SIF.
print(f"[Singularity] Environment {self.sandbox_id} ready (tmpfs overlay, no build needed)", flush=True)
try:
result = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=120, # 2 min for instance startup
)
if result.returncode != 0:
raise RuntimeError(f"Failed to start instance: {result.stderr}")
self._instance_started = True
print(f"[Singularity] Instance {self.instance_id} started (persistent container)", flush=True)
except subprocess.TimeoutExpired:
raise RuntimeError("Instance start timed out")
def execute(self, command: str, cwd: str = "", *, timeout: int | None = None) -> dict:
"""Execute a command in the Singularity container."""
"""Execute a command in the persistent Singularity instance.
All commands run in the same container, so files, installs, and
environment changes persist between calls.
"""
if not self._instance_started:
return {"output": "Instance not started", "returncode": -1}
cmd = [self.executable, "exec"]
# Isolation flags - contain filesystem, clean env, isolate PID namespace
# --contain: isolate filesystem (no host mounts except explicit binds)
# --cleanenv: don't inherit host environment variables
# --pid: isolate PID namespace (ps aux only shows container processes)
# --writable-tmpfs: RAM-backed writable overlay on top of read-only SIF
# (instant startup, no sandbox build, scales to 250+ concurrent workers)
cmd.extend(["--contain", "--cleanenv", "--pid", "--writable-tmpfs"])
# Bind the working directory into the container at /workspace
# This gives the container access to a large writable space on disk
cmd.extend(["--bind", f"{self.work_dir}:/workspace"])
# Also bind it to /tmp inside container for pip cache etc.
cmd.extend(["--bind", f"{self.work_dir}:/tmp"])
# Set working directory
work_dir = cwd or self.cwd
cmd.extend(["--pwd", work_dir])
# Use the SIF directly (read-only, shared by all workers)
cmd.append(str(self.image))
# Connect to the running instance
cmd.append(f"instance://{self.instance_id}")
# Transform sudo commands if SUDO_PASSWORD is available
exec_command = _transform_sudo_command(command)
@@ -720,8 +738,19 @@ class _SingularityEnvironment:
return {"output": f"Command timed out after {timeout or self.timeout}s", "returncode": 124}
def cleanup(self):
"""Clean up working directory (no sandbox to remove with tmpfs overlay)."""
shutil.rmtree(self.work_dir, ignore_errors=True)
"""Stop the persistent instance and clean up."""
if self._instance_started:
try:
subprocess.run(
[self.executable, "instance", "stop", self.instance_id],
capture_output=True,
text=True,
timeout=30,
)
print(f"[Singularity] Instance {self.instance_id} stopped", flush=True)
except Exception as e:
print(f"[Singularity] Warning: failed to stop instance {self.instance_id}: {e}", flush=True)
self._instance_started = False
def stop(self):
"""Alias for cleanup."""
@@ -729,7 +758,10 @@ class _SingularityEnvironment:
def __del__(self):
"""Cleanup on destruction."""
self.cleanup()
try:
self.cleanup()
except:
pass
class _SSHEnvironment:
@@ -1016,6 +1048,44 @@ _env_lock = threading.Lock()
_cleanup_thread = None
_cleanup_running = False
# Per-task environment overrides registry.
# Allows environments (e.g., TerminalBench2Env) to specify a custom Docker/Modal
# image for a specific task_id BEFORE the agent loop starts. When the terminal or
# file tools create a new sandbox for that task_id, they check this registry first
# and fall back to the TERMINAL_MODAL_IMAGE (etc.) env var if no override is set.
#
# This is never exposed to the model -- only infrastructure code calls it.
# Thread-safe because each task_id is unique per rollout.
_task_env_overrides: Dict[str, Dict[str, Any]] = {}
def register_task_env_overrides(task_id: str, overrides: Dict[str, Any]):
"""
Register environment overrides for a specific task/rollout.
Called by Atropos environments before the agent loop to configure
per-task sandbox settings (e.g., a custom Dockerfile for the Modal image).
Supported override keys:
- modal_image: str -- Path to Dockerfile or Docker Hub image name
- docker_image: str -- Docker image name
- cwd: str -- Working directory inside the sandbox
Args:
task_id: The rollout's unique task identifier
overrides: Dict of config keys to override
"""
_task_env_overrides[task_id] = overrides
def clear_task_env_overrides(task_id: str):
"""
Clear environment overrides for a task after rollout completes.
Called during cleanup to avoid stale entries accumulating.
"""
_task_env_overrides.pop(task_id, None)
# Configuration from environment variables
def _get_env_config() -> Dict[str, Any]:
"""Get terminal environment configuration from environment variables."""
@@ -1027,10 +1097,10 @@ def _get_env_config() -> Dict[str, Any]:
# - local/ssh: current working directory (CLI resolves "." before we get here)
# - docker/singularity: /tmp inside the container (singularity bind-mounts /scratch there)
# - modal: /root (ephemeral cloud container, full filesystem access)
if env_type == "modal":
if env_type in ("modal", "singularity"):
default_cwd = "/root"
elif env_type in ("docker", "singularity"):
default_cwd = "/tmp"
elif env_type == "docker":
default_cwd = "/"
else:
default_cwd = os.getcwd()
@@ -1313,24 +1383,28 @@ def terminal_tool(
# Get configuration
config = _get_env_config()
env_type = config["env_type"]
# Select image based on env type
if env_type == "docker":
image = config["docker_image"]
elif env_type == "singularity":
image = config["singularity_image"]
elif env_type == "modal":
image = config["modal_image"]
else:
image = ""
cwd = config["cwd"]
default_timeout = config["timeout"]
effective_timeout = timeout or default_timeout
# Use task_id for environment isolation
effective_task_id = task_id or "default"
# Check per-task overrides (set by environments like TerminalBench2Env)
# before falling back to global env var config
overrides = _task_env_overrides.get(effective_task_id, {})
# Select image based on env type, with per-task override support
if env_type == "docker":
image = overrides.get("docker_image") or config["docker_image"]
elif env_type == "singularity":
image = overrides.get("singularity_image") or config["singularity_image"]
elif env_type == "modal":
image = overrides.get("modal_image") or config["modal_image"]
else:
image = ""
cwd = overrides.get("cwd") or config["cwd"]
default_timeout = config["timeout"]
effective_timeout = timeout or default_timeout
# For local environment in batch mode, create a unique subdirectory per task
# This prevents parallel tasks from overwriting each other's files
# In CLI mode (HERMES_QUIET), use the cwd directly without subdirectories