Update RL tools and enhance configuration management

- Modified `model_tools.py` to update default model IDs and add new RL function `rl_test_inference`.
- Enhanced `README.md` with installation instructions for submodules and updated API key usage.
- Improved `rl_cli.py` to load configuration from `~/.hermes/config.yaml` and set terminal working directory for RL tools.
- Updated `run_agent.py` to handle empty string arguments as empty objects for better JSON validation.
- Refined installation scripts to ensure submodules are cloned and installed correctly, enhancing setup experience.
This commit is contained in:
teknium1
2026-02-04 13:57:59 -08:00
parent 12bbca95ec
commit 3c0d0dba49
7 changed files with 274 additions and 56 deletions

View File

@@ -15,7 +15,7 @@ irm https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/ins
```
The installer will:
- Clone to `~/.hermes-agent`
- Clone to `~/.hermes-agent` (with submodules: mini-swe-agent, tinker-atropos)
- Create a virtual environment
- Install all dependencies
- Run the interactive setup wizard
@@ -281,18 +281,10 @@ Train language models with reinforcement learning using the Tinker API and Atrop
```bash
TINKER_API_KEY=your-tinker-key # Get from https://tinker-console.thinkingmachines.ai/keys
WANDB_API_KEY=your-wandb-key # Get from https://wandb.ai/authorize
OPENROUTER_API_KEY=your-key # Optional: for rl_test_inference
```
2. **Install tinker-atropos:** (in a separate directory)
```bash
cd ~/tinker-atropos
pip install -e .
```
3. **Start the RL API server:**
```bash
rl-server # Runs on port 8080 by default
```
2. **That's it!** tinker-atropos is included as a submodule - no separate installation needed.
#### Using RL Tools
@@ -313,10 +305,12 @@ Agent: I'll set up an RL training run on the GSM8k environment...
| `rl_select_environment` | Select an environment for training |
| `rl_get_current_config` | View all configurable options |
| `rl_edit_config` | Change a configuration value |
| `rl_test_inference` | Test environment with OpenRouter (pre-training validation) |
| `rl_start_training` | Start a training run |
| `rl_check_status` | Check training progress |
| `rl_stop_training` | Stop a running training |
| `rl_get_results` | Fetch WandB metrics |
| `rl_list_runs` | List active training runs |
#### Dedicated RL CLI
@@ -434,7 +428,7 @@ skills/
If you prefer not to use the installer:
```bash
# Clone the repository
# Clone the repository (with submodules)
git clone --recurse-submodules https://github.com/NousResearch/hermes-agent.git
cd hermes-agent
@@ -445,6 +439,11 @@ cd hermes-agent
python3 -m venv venv
source venv/bin/activate
pip install -e ".[all]"
# Install submodules (required for terminal and RL tools)
pip install -e "./mini-swe-agent" # Terminal tool backend
pip install -e "./tinker-atropos" # RL training backend
hermes setup
```

View File

@@ -665,7 +665,7 @@ def get_rl_tool_definitions() -> List[Dict[str, Any]]:
"models": {
"type": "array",
"items": {"type": "string"},
"description": "Optional list of OpenRouter model IDs. Default: qwen/qwen3-8b, zhipu-ai/glm-4-flash, minimax/minimax-m1"
"description": "Optional list of OpenRouter model IDs. Default: qwen/qwen3-8b, z-ai/glm-4.7-flash, minimax/minimax-m2.1"
}
},
"required": []
@@ -730,7 +730,7 @@ def get_all_tool_names() -> List[str]:
"rl_get_current_config", "rl_edit_config",
"rl_start_training", "rl_check_status",
"rl_stop_training", "rl_get_results",
"rl_list_runs"
"rl_list_runs", "rl_test_inference"
])
return tool_names
@@ -898,7 +898,7 @@ def get_tool_definitions(
"rl_get_current_config", "rl_edit_config",
"rl_start_training", "rl_check_status",
"rl_stop_training", "rl_get_results",
"rl_list_runs"
"rl_list_runs", "rl_test_inference"
]
}
legacy_tools = legacy_map.get(toolset_name, [])
@@ -950,7 +950,7 @@ def get_tool_definitions(
"rl_get_current_config", "rl_edit_config",
"rl_start_training", "rl_check_status",
"rl_stop_training", "rl_get_results",
"rl_list_runs"
"rl_list_runs", "rl_test_inference"
]
}
legacy_tools = legacy_map.get(toolset_name, [])
@@ -1407,7 +1407,7 @@ def handle_function_call(
"rl_get_current_config", "rl_edit_config",
"rl_start_training", "rl_check_status",
"rl_stop_training", "rl_get_results",
"rl_list_runs"
"rl_list_runs", "rl_test_inference"
]:
return handle_rl_function_call(function_name, function_args)

View File

@@ -25,14 +25,34 @@ import sys
from pathlib import Path
import fire
import yaml
# Load environment variables from .env file
from dotenv import load_dotenv
env_path = Path(__file__).parent / '.env'
if env_path.exists():
load_dotenv(dotenv_path=env_path)
print(f"✅ Loaded environment variables from {env_path}")
# Load from ~/.hermes/.env first, then local .env
hermes_env_path = Path.home() / '.hermes' / '.env'
local_env_path = Path(__file__).parent / '.env'
if hermes_env_path.exists():
load_dotenv(dotenv_path=hermes_env_path)
print(f"✅ Loaded environment variables from {hermes_env_path}")
elif local_env_path.exists():
load_dotenv(dotenv_path=local_env_path)
print(f"✅ Loaded environment variables from {local_env_path}")
# Set terminal working directory to tinker-atropos submodule
# This ensures terminal commands run in the right context for RL work
tinker_atropos_dir = Path(__file__).parent / 'tinker-atropos'
if tinker_atropos_dir.exists():
os.environ['TERMINAL_CWD'] = str(tinker_atropos_dir)
os.environ['HERMES_QUIET'] = '1' # Disable temp subdirectory creation
print(f"📂 Terminal working directory: {tinker_atropos_dir}")
else:
# Fall back to hermes-agent directory if submodule not found
os.environ['TERMINAL_CWD'] = str(Path(__file__).parent)
os.environ['HERMES_QUIET'] = '1'
print(f"⚠️ tinker-atropos submodule not found, using: {Path(__file__).parent}")
# Import agent and tools
from run_agent import AIAgent
@@ -40,6 +60,50 @@ from model_tools import get_tool_definitions, check_toolset_requirements
from tools.rl_training_tool import check_rl_api_keys, get_missing_keys
# ============================================================================
# Config Loading
# ============================================================================
# Fallbacks used when ~/.hermes/config.yaml is absent or incomplete.
DEFAULT_MODEL = "anthropic/claude-opus-4.5"
DEFAULT_BASE_URL = "https://openrouter.ai/api/v1"


def load_hermes_config() -> dict:
    """
    Load agent configuration from ``~/.hermes/config.yaml``.

    A missing file, unreadable YAML, or an unexpected document shape all
    fall back to the module defaults rather than raising, so callers can
    rely on the returned keys always being present.

    Returns:
        dict: Configuration containing at least ``model`` and ``base_url``.
    """
    config_path = Path.home() / '.hermes' / 'config.yaml'
    config = {
        "model": DEFAULT_MODEL,
        "base_url": DEFAULT_BASE_URL,
    }

    if config_path.exists():
        try:
            with open(config_path, "r") as f:
                file_config = yaml.safe_load(f) or {}

            # A YAML document can legally be a scalar or a list; only a
            # mapping is meaningful here. Discard anything else explicitly
            # instead of letting `"model" in <str>` do a substring match
            # and then tripping the broad except with a TypeError.
            if not isinstance(file_config, dict):
                file_config = {}

            # "model" may be a plain string or a mapping with a "default" key.
            if "model" in file_config:
                if isinstance(file_config["model"], str):
                    config["model"] = file_config["model"]
                elif isinstance(file_config["model"], dict):
                    config["model"] = file_config["model"].get("default", DEFAULT_MODEL)

            # Optional override of the API endpoint.
            if "base_url" in file_config:
                config["base_url"] = file_config["base_url"]
        except Exception as e:
            # Best-effort: a malformed config must not prevent startup.
            print(f"⚠️ Warning: Failed to load config.yaml: {e}")

    return config
# ============================================================================
# RL-Specific Configuration
# ============================================================================
@@ -108,7 +172,7 @@ When asked to train a model, follow this workflow:
"""
# Toolsets to enable for RL workflows
RL_TOOLSETS = ["base", "terminal", "web", "rl"]
RL_TOOLSETS = ["terminal", "web", "rl"]
# ============================================================================
@@ -172,9 +236,9 @@ def list_environments_sync():
def main(
task: str = None,
model: str = "anthropic/claude-sonnet-4-20250514",
model: str = None,
api_key: str = None,
base_url: str = "https://openrouter.ai/api/v1",
base_url: str = None,
max_iterations: int = RL_MAX_ITERATIONS,
interactive: bool = False,
list_environments: bool = False,
@@ -187,9 +251,9 @@ def main(
Args:
task: The training task/goal (e.g., "Train a model on GSM8k for math")
model: Model to use for the agent (default: claude-sonnet-4)
model: Model to use for the agent (reads from ~/.hermes/config.yaml if not provided)
api_key: OpenRouter API key (uses OPENROUTER_API_KEY env var if not provided)
base_url: API base URL (default: OpenRouter)
base_url: API base URL (reads from config or defaults to OpenRouter)
max_iterations: Maximum agent iterations (default: 200 for long workflows)
interactive: Run in interactive mode (multiple conversations)
list_environments: Just list available RL environments and exit
@@ -210,6 +274,15 @@ def main(
# Check server status
python rl_cli.py --check-server
"""
# Load config from ~/.hermes/config.yaml
config = load_hermes_config()
# Use config values if not explicitly provided
if model is None:
model = config["model"]
if base_url is None:
base_url = config["base_url"]
print("🎯 RL Training Agent")
print("=" * 60)

View File

@@ -1764,10 +1764,16 @@ class AIAgent:
self._invalid_tool_retries = 0
# Validate tool call arguments are valid JSON
# Handle empty strings as empty objects (common model quirk)
invalid_json_args = []
for tc in assistant_message.tool_calls:
args = tc.function.arguments
# Treat empty/whitespace strings as empty object
if not args or not args.strip():
tc.function.arguments = "{}"
continue
try:
json.loads(tc.function.arguments)
json.loads(args)
except json.JSONDecodeError as e:
invalid_json_args.append((tc.function.name, str(e)))

View File

@@ -150,14 +150,15 @@ function Install-Repository {
}
} else {
# Try SSH first (for private repo access), fall back to HTTPS
# Use --recurse-submodules to also clone mini-swe-agent and tinker-atropos
Write-Info "Trying SSH clone..."
$sshResult = git clone --branch $Branch $RepoUrlSsh $InstallDir 2>&1
$sshResult = git clone --branch $Branch --recurse-submodules $RepoUrlSsh $InstallDir 2>&1
if ($LASTEXITCODE -eq 0) {
Write-Success "Cloned via SSH"
} else {
Write-Info "SSH failed, trying HTTPS..."
$httpsResult = git clone --branch $Branch $RepoUrlHttps $InstallDir 2>&1
$httpsResult = git clone --branch $Branch --recurse-submodules $RepoUrlHttps $InstallDir 2>&1
if ($LASTEXITCODE -eq 0) {
Write-Success "Cloned via HTTPS"
@@ -171,6 +172,13 @@ function Install-Repository {
}
}
# Ensure submodules are initialized and updated (for existing installs or if --recurse failed)
Write-Info "Initializing submodules (mini-swe-agent, tinker-atropos)..."
Push-Location $InstallDir
git submodule update --init --recursive
Pop-Location
Write-Success "Submodules ready"
Write-Success "Repository ready"
}
@@ -208,15 +216,43 @@ function Install-Dependencies {
& .\venv\Scripts\Activate.ps1
}
# Install main package
try {
pip install -e ".[all]" 2>&1 | Out-Null
} catch {
pip install -e "." | Out-Null
}
Write-Success "Main package installed"
# Install submodules
Write-Info "Installing mini-swe-agent (terminal tool backend)..."
if (Test-Path "mini-swe-agent\pyproject.toml") {
try {
pip install -e ".\mini-swe-agent" 2>&1 | Out-Null
Write-Success "mini-swe-agent installed"
} catch {
Write-Warning "mini-swe-agent install failed (terminal tools may not work)"
}
} else {
Write-Warning "mini-swe-agent not found (run: git submodule update --init)"
}
Write-Info "Installing tinker-atropos (RL training backend)..."
if (Test-Path "tinker-atropos\pyproject.toml") {
try {
pip install -e ".\tinker-atropos" 2>&1 | Out-Null
Write-Success "tinker-atropos installed"
} catch {
Write-Warning "tinker-atropos install failed (RL tools may not work)"
}
} else {
Write-Warning "tinker-atropos not found (run: git submodule update --init)"
}
Pop-Location
Write-Success "Dependencies installed"
Write-Success "All dependencies installed"
}
function Set-PathVariable {

View File

@@ -292,12 +292,13 @@ clone_repo() {
fi
else
# Try SSH first (for private repo access), fall back to HTTPS
# Use --recurse-submodules to also clone mini-swe-agent and tinker-atropos
log_info "Trying SSH clone..."
if git clone --branch "$BRANCH" "$REPO_URL_SSH" "$INSTALL_DIR" 2>/dev/null; then
if git clone --branch "$BRANCH" --recurse-submodules "$REPO_URL_SSH" "$INSTALL_DIR" 2>/dev/null; then
log_success "Cloned via SSH"
else
log_info "SSH failed, trying HTTPS..."
if git clone --branch "$BRANCH" "$REPO_URL_HTTPS" "$INSTALL_DIR"; then
if git clone --branch "$BRANCH" --recurse-submodules "$REPO_URL_HTTPS" "$INSTALL_DIR"; then
log_success "Cloned via HTTPS"
else
log_error "Failed to clone repository"
@@ -310,6 +311,12 @@ clone_repo() {
fi
cd "$INSTALL_DIR"
# Ensure submodules are initialized and updated (for existing installs or if --recurse failed)
log_info "Initializing submodules (mini-swe-agent, tinker-atropos)..."
git submodule update --init --recursive
log_success "Submodules ready"
log_success "Repository ready"
}
@@ -343,10 +350,29 @@ install_deps() {
source venv/bin/activate
fi
# Install the package in editable mode with all extras
# Install the main package in editable mode with all extras
pip install -e ".[all]" > /dev/null 2>&1 || pip install -e "." > /dev/null
log_success "Dependencies installed"
log_success "Main package installed"
# Install submodules
log_info "Installing mini-swe-agent (terminal tool backend)..."
if [ -d "mini-swe-agent" ] && [ -f "mini-swe-agent/pyproject.toml" ]; then
pip install -e "./mini-swe-agent" > /dev/null 2>&1 || log_warn "mini-swe-agent install failed (terminal tools may not work)"
log_success "mini-swe-agent installed"
else
log_warn "mini-swe-agent not found (run: git submodule update --init)"
fi
log_info "Installing tinker-atropos (RL training backend)..."
if [ -d "tinker-atropos" ] && [ -f "tinker-atropos/pyproject.toml" ]; then
pip install -e "./tinker-atropos" > /dev/null 2>&1 || log_warn "tinker-atropos install failed (RL tools may not work)"
log_success "tinker-atropos installed"
else
log_warn "tinker-atropos not found (run: git submodule update --init)"
fi
log_success "All dependencies installed"
}
setup_path() {

View File

@@ -37,6 +37,7 @@ import subprocess
import sys
import time
import uuid
from datetime import datetime
import yaml
from dataclasses import dataclass, field
from pathlib import Path
@@ -84,6 +85,7 @@ LOCKED_FIELDS = {
"weight": 1.0,
"num_requests_for_eval": 256,
"timeout": 3600,
"server_type": "sglang", # Tinker uses sglang for actual training
}
],
"tinker": {
@@ -211,6 +213,9 @@ def _scan_environments() -> List[EnvironmentInfo]:
def _get_env_config_fields(env_file_path: str) -> Dict[str, Dict[str, Any]]:
"""
Dynamically import an environment and extract its config fields.
Uses config_init() to get the actual config class, with fallback to
directly importing BaseEnvConfig if config_init fails.
"""
try:
# Load the environment module
@@ -230,15 +235,38 @@ def _get_env_config_fields(env_file_path: str) -> Dict[str, Dict[str, Any]]:
if not env_class:
return {}
# Call config_init to get the actual config
# Try calling config_init to get the actual config class
config_class = None
try:
env_config, server_configs = env_class.config_init()
config_class = type(env_config)
except Exception as config_error:
# Fallback: try to import BaseEnvConfig directly from atroposlib
print(f"Note: config_init failed ({config_error}), using BaseEnvConfig defaults")
try:
from atroposlib.envs.base import BaseEnvConfig
config_class = BaseEnvConfig
except ImportError:
return {}
if not config_class:
return {}
# Helper to make values JSON-serializable (handle enums, etc.)
def make_serializable(val):
if val is None:
return None
if hasattr(val, 'value'): # Enum
return val.value
if hasattr(val, 'name') and hasattr(val, '__class__') and 'Enum' in str(type(val)):
return val.name
return val
# Extract fields from the Pydantic model
fields = {}
for field_name, field_info in config_class.model_fields.items():
field_type = field_info.annotation
default = field_info.default
default = make_serializable(field_info.default)
description = field_info.description or ""
is_locked = field_name in LOCKED_FIELD_NAMES
@@ -248,12 +276,15 @@ def _get_env_config_fields(env_file_path: str) -> Dict[str, Dict[str, Any]]:
if hasattr(field_type, "__origin__"):
type_name = str(field_type)
locked_value = LOCKED_FIELDS.get("env", {}).get(field_name, default)
current_value = make_serializable(locked_value) if is_locked else default
fields[field_name] = {
"type": type_name,
"default": default if default is not None else None,
"default": default,
"description": description,
"locked": is_locked,
"current_value": LOCKED_FIELDS.get("env", {}).get(field_name, default) if is_locked else default,
"current_value": current_value,
}
return fields
@@ -315,7 +346,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
trainer_log_file = open(trainer_log, "w")
run_state.trainer_process = subprocess.Popen(
["python", "launch_training.py", "--config", str(config_path)],
[sys.executable, "launch_training.py", "--config", str(config_path)],
stdout=trainer_log_file,
stderr=subprocess.STDOUT,
cwd=str(TINKER_ATROPOS_ROOT),
@@ -355,7 +386,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
env_log_file = open(env_log, "w")
run_state.env_process = subprocess.Popen(
["python", str(env_info.file_path), "serve", "--config", str(config_path)],
[sys.executable, str(env_info.file_path), "serve", "--config", str(config_path)],
stdout=env_log_file,
stderr=subprocess.STDOUT,
cwd=str(TINKER_ATROPOS_ROOT),
@@ -543,17 +574,14 @@ async def rl_select_environment(name: str) -> str:
if not field_info.get("locked", False):
_current_config[field_name] = field_info.get("default")
configurable_count = sum(1 for f in config_fields.values() if not f.get("locked", False))
locked_count = sum(1 for f in config_fields.values() if f.get("locked", False))
# Auto-set wandb_name to "{env_name}-DATETIME" to avoid overlaps
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
_current_config["wandb_name"] = f"{name}-{timestamp}"
return json.dumps({
"message": f"Selected environment: {name}",
"environment": name,
"file_path": env_info.file_path,
"configurable_fields": configurable_count,
"locked_fields": locked_count,
"config": _current_config,
"tip": f"Use rl_get_current_config() to see all {configurable_count} configurable fields.",
}, indent=2)
@@ -961,10 +989,11 @@ async def rl_list_runs() -> str:
# ============================================================================
# Test models at different scales for robustness testing
# These are cheap, capable models on OpenRouter for testing parsing/scoring
TEST_MODELS = [
{"id": "qwen/qwen3-8b", "name": "Qwen3 8B", "scale": "small"},
{"id": "zhipu-ai/glm-4-flash", "name": "GLM-4 Flash", "scale": "medium"},
{"id": "minimax/minimax-m1", "name": "MiniMax M1", "scale": "large"},
{"id": "z-ai/glm-4.7-flash", "name": "GLM-4.7 Flash", "scale": "medium"},
{"id": "minimax/minimax-m2.1", "name": "MiniMax M2.1", "scale": "large"},
]
# Default test parameters - quick but representative
@@ -1066,18 +1095,35 @@ async def rl_test_inference(
# Build the process command using Atropos's built-in CLI
# This runs the environment's actual code with OpenRouter as the inference backend
# We pass our locked settings + test-specific overrides via CLI args
cmd = [
"python", env_info.file_path, "process",
sys.executable, env_info.file_path, "process",
# Test-specific overrides
"--env.total_steps", str(num_steps),
"--env.group_size", str(group_size),
"--env.use_wandb", "false",
"--env.use_wandb", "false", # No wandb for quick tests
"--env.data_path_to_save_groups", str(output_file),
# Use locked settings from our config
"--env.tokenizer_name", LOCKED_FIELDS["env"]["tokenizer_name"],
"--env.max_token_length", str(LOCKED_FIELDS["env"]["max_token_length"]),
"--env.max_num_workers", str(LOCKED_FIELDS["env"]["max_num_workers"]),
"--env.max_batches_offpolicy", str(LOCKED_FIELDS["env"]["max_batches_offpolicy"]),
# OpenRouter config for inference testing
# IMPORTANT: Use server_type=openai for OpenRouter (not sglang)
# sglang is only for actual training with Tinker's inference server
"--openai.base_url", "https://openrouter.ai/api/v1",
"--openai.api_key", api_key,
"--openai.model_name", model_id,
"--openai.server_type", "openai", # OpenRouter is OpenAI-compatible
"--openai.health_check", "false", # OpenRouter doesn't have health endpoint
]
print(f"Running: python {Path(env_info.file_path).name} process ...")
# Debug: Print the full command
cmd_str = " ".join(str(c) for c in cmd)
# Hide API key in printed output
cmd_display = cmd_str.replace(api_key, "***API_KEY***")
print(f"Command: {cmd_display}")
print(f"Working dir: {TINKER_ATROPOS_ROOT}")
print(f" {num_steps} steps × {group_size} completions = {total_rollouts_per_model} rollouts")
model_results = {
@@ -1105,12 +1151,44 @@ async def rl_test_inference(
timeout=600, # 10 minute timeout per model
)
# Decode output
stdout_text = stdout.decode() if stdout else ""
stderr_text = stderr.decode() if stderr else ""
# Write logs to files for inspection outside CLI
log_file = test_output_dir / f"test_{_current_env}_{model_safe_name}.log"
with open(log_file, "w") as f:
f.write(f"Command: {cmd_display}\n")
f.write(f"Working dir: {TINKER_ATROPOS_ROOT}\n")
f.write(f"Return code: {process.returncode}\n")
f.write(f"\n{'='*60}\n")
f.write(f"STDOUT:\n{'='*60}\n")
f.write(stdout_text or "(empty)\n")
f.write(f"\n{'='*60}\n")
f.write(f"STDERR:\n{'='*60}\n")
f.write(stderr_text or "(empty)\n")
print(f" Log file: {log_file}")
# Print to console for immediate debugging
if stdout_text.strip():
print(f"\n--- STDOUT ---")
print(stdout_text[-2000:]) # Last 2000 chars
if stderr_text.strip():
print(f"\n--- STDERR ---")
print(stderr_text[-2000:]) # Last 2000 chars
if process.returncode != 0:
model_results["error"] = f"Process exited with code {process.returncode}"
model_results["stderr"] = stderr.decode()[-1000:]
print(f" Error: {model_results['error']}")
model_results["stderr"] = stderr_text[-1000:]
model_results["stdout"] = stdout_text[-1000:]
model_results["log_file"] = str(log_file)
print(f"\n ❌ Error: {model_results['error']}")
else:
print(f" Process completed successfully")
print(f"\n Process completed successfully")
print(f" Output file: {output_file}")
print(f" File exists: {output_file.exists()}")
# Parse the output JSONL file
if output_file.exists():