hermes-agent/environments/agentic_opd_env.py

"""
AgenticOPDEnv — On-Policy Distillation for Agentic Tool-Calling Tasks
=====================================================================

First Atropos environment to populate the distill_token_ids / distill_logprobs
fields on ScoredDataGroup, enabling on-policy distillation (OPD) training.

Key idea (from OpenClaw-RL, Princeton 2026):
  Every time an agent receives a next-state signal (tool result, error trace,
  test verdict), that signal contains hindsight information about how the
  agent's PREVIOUS response could have been better. This environment:

  1. Runs standard agentic rollouts (tool-calling agent loop)
  2. Walks the conversation to find (assistant_turn, next_state) pairs
  3. Uses an LLM judge to extract "hints" from next-state signals
  4. Builds an enhanced prompt (original context + hint)
  5. Scores the student's response tokens under the enhanced distribution
     using VLLM's prompt_logprobs (via Atropos's get_logprobs API)
  6. Packages the teacher's top-K predictions as distill_token_ids /
     distill_logprobs on the ScoredDataGroup

The trainer then computes per-token advantages:
  A_t = teacher_logprob(token_t) - student_logprob(token_t)
  Positive → teacher approves this token (upweight)
  Negative → teacher disapproves (downweight)

This gives dense, token-level training signal from every tool interaction,
instead of just a scalar reward at the end of the trajectory.

Task: Coding tasks with test verification (rich next-state signals from
test results, error messages, terminal output). Falls back to built-in
coding problems if no HuggingFace dataset is configured.

Requirements:
  - VLLM backend (server_type: vllm) — needed for prompt logprob scoring
  - Phase 2 mode (ManagedServer) — needed for token-level tracking

Usage:
    # Process mode (offline data generation with OPD)
    python environments/agentic_opd_env.py process \\
        --env.total_steps 10 --env.group_size 2 \\
        --env.data_path_to_save_groups output.jsonl \\
        --openai.base_url http://localhost:8000/v1 \\
        --openai.model_name Qwen/Qwen3-4B

    # Serve mode (connected to Atropos trainer)
    python environments/agentic_opd_env.py serve \\
        --openai.base_url http://localhost:8000/v1 \\
        --openai.model_name Qwen/Qwen3-4B

    # Evaluate mode
    python environments/agentic_opd_env.py evaluate \\
        --env.eval_size 10 \\
        --openai.base_url http://localhost:8000/v1 \\
        --openai.model_name Qwen/Qwen3-4B

Reference: Wang et al., "OpenClaw-RL: Train Any Agent Simply by Talking"
           arXiv:2603.10165, March 2026
"""

from __future__ import annotations

import asyncio
import copy
import json
import logging
import os
import random
import re
import sys
import time
import uuid
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple, Union

from pydantic import Field

# Ensure hermes-agent root is on path
_repo_root = Path(__file__).resolve().parent.parent
if str(_repo_root) not in sys.path:
    sys.path.insert(0, str(_repo_root))

from atroposlib.envs.base import ScoredDataGroup, ScoredDataItem
from atroposlib.envs.server_handling.server_manager import APIServerConfig
from atroposlib.type_definitions import Item

from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
from environments.agent_loop import AgentResult, HermesAgentLoop
from environments.tool_context import ToolContext

logger = logging.getLogger(__name__)


# ═══════════════════════════════════════════════════════════════════════
# Built-in coding tasks (fallback when no HF dataset is configured)
# ═══════════════════════════════════════════════════════════════════════

BUILTIN_CODING_TASKS = [
    {
        "task": "Write a Python function `fizzbuzz(n)` that returns a list of strings from 1 to n. "
        "For multiples of 3 return 'Fizz', for multiples of 5 return 'Buzz', "
        "for multiples of both return 'FizzBuzz', otherwise the number as a string.",
        "test_code": (
            "from solution import fizzbuzz\n"
            "assert fizzbuzz(15) == ['1','2','Fizz','4','Buzz','Fizz','7','8','Fizz','Buzz','11','Fizz','13','14','FizzBuzz']\n"
            "assert fizzbuzz(1) == ['1']\n"
            "assert fizzbuzz(0) == []\n"
            "print('All tests passed!')\n"
        ),
        "difficulty": "easy",
    },
    {
        "task": "Write a Python function `is_palindrome(s)` that checks if a string is a palindrome, "
        "ignoring case and non-alphanumeric characters. Return True or False.",
        "test_code": (
            "from solution import is_palindrome\n"
            "assert is_palindrome('A man, a plan, a canal: Panama') == True\n"
            "assert is_palindrome('race a car') == False\n"
            "assert is_palindrome('') == True\n"
            "assert is_palindrome('Was it a car or a cat I saw?') == True\n"
            "print('All tests passed!')\n"
        ),
        "difficulty": "easy",
    },
    {
        "task": "Write a Python function `two_sum(nums, target)` that returns the indices of the two "
        "numbers in `nums` that add up to `target`. Assume exactly one solution exists. "
        "Return a list of two indices [i, j] where i < j.",
        "test_code": (
            "from solution import two_sum\n"
            "assert two_sum([2, 7, 11, 15], 9) == [0, 1]\n"
            "assert two_sum([3, 2, 4], 6) == [1, 2]\n"
            "assert two_sum([3, 3], 6) == [0, 1]\n"
            "print('All tests passed!')\n"
        ),
        "difficulty": "easy",
    },
    {
        "task": "Write a Python function `flatten(lst)` that takes an arbitrarily nested list and "
        "returns a flat list of all elements. For example, flatten([1, [2, [3, 4], 5]]) "
        "should return [1, 2, 3, 4, 5].",
        "test_code": (
            "from solution import flatten\n"
            "assert flatten([1, [2, [3, 4], 5]]) == [1, 2, 3, 4, 5]\n"
            "assert flatten([]) == []\n"
            "assert flatten([1, 2, 3]) == [1, 2, 3]\n"
            "assert flatten([[[[1]]]]) == [1]\n"
            "assert flatten([1, [2], [[3]], [[[4]]]]) == [1, 2, 3, 4]\n"
            "print('All tests passed!')\n"
        ),
        "difficulty": "medium",
    },
    {
        "task": "Write a Python function `longest_common_prefix(strs)` that finds the longest "
        "common prefix string amongst a list of strings. If there is no common prefix, "
        "return an empty string.",
        "test_code": (
            "from solution import longest_common_prefix\n"
            "assert longest_common_prefix(['flower', 'flow', 'flight']) == 'fl'\n"
            "assert longest_common_prefix(['dog', 'racecar', 'car']) == ''\n"
            "assert longest_common_prefix(['interspecies', 'interstellar', 'interstate']) == 'inters'\n"
            "assert longest_common_prefix(['a']) == 'a'\n"
            "assert longest_common_prefix([]) == ''\n"
            "print('All tests passed!')\n"
        ),
        "difficulty": "easy",
    },
    {
        "task": "Write a Python function `group_anagrams(strs)` that groups anagrams together. "
        "Return a list of lists, where each inner list contains strings that are anagrams of "
        "each other. The order of groups and strings within groups does not matter.",
        "test_code": (
            "from solution import group_anagrams\n"
            "result = group_anagrams(['eat', 'tea', 'tan', 'ate', 'nat', 'bat'])\n"
            "result_sorted = sorted([sorted(g) for g in result])\n"
            "assert result_sorted == [['ate', 'eat', 'tea'], ['bat'], ['nat', 'tan']]\n"
            "assert group_anagrams([]) == []\n"
            "assert group_anagrams(['a']) == [['a']]\n"
            "print('All tests passed!')\n"
        ),
        "difficulty": "medium",
    },
    {
        "task": "Write a Python function `valid_parentheses(s)` that determines if a string "
        "containing just '(', ')', '{', '}', '[' and ']' is valid. A string is valid if "
        "open brackets are closed by the same type and in the correct order.",
        "test_code": (
            "from solution import valid_parentheses\n"
            "assert valid_parentheses('()') == True\n"
            "assert valid_parentheses('()[]{}') == True\n"
            "assert valid_parentheses('(]') == False\n"
            "assert valid_parentheses('([)]') == False\n"
            "assert valid_parentheses('{[]}') == True\n"
            "assert valid_parentheses('') == True\n"
            "print('All tests passed!')\n"
        ),
        "difficulty": "easy",
    },
    {
        "task": "Write a Python function `merge_intervals(intervals)` that merges overlapping "
        "intervals. Each interval is a list [start, end]. Return the merged intervals sorted "
        "by start time.",
        "test_code": (
            "from solution import merge_intervals\n"
            "assert merge_intervals([[1,3],[2,6],[8,10],[15,18]]) == [[1,6],[8,10],[15,18]]\n"
            "assert merge_intervals([[1,4],[4,5]]) == [[1,5]]\n"
            "assert merge_intervals([[1,4],[0,4]]) == [[0,4]]\n"
            "assert merge_intervals([]) == []\n"
            "assert merge_intervals([[1,2]]) == [[1,2]]\n"
            "print('All tests passed!')\n"
        ),
        "difficulty": "medium",
    },
]


# ═══════════════════════════════════════════════════════════════════════
# Hint extraction prompts (adapted from OpenClaw-RL)
# ═══════════════════════════════════════════════════════════════════════

_HINT_JUDGE_SYSTEM = (
    "You are a process reward model used for hindsight hint extraction.\n"
    "You are given:\n"
    "1) The assistant response at turn t.\n"
    "2) The next state at turn t+1, along with its **role**.\n\n"
    "## Understanding the next state's role\n"
    "- role='user': A reply from the user (follow-up, correction, new request, etc.).\n"
    "- role='tool': The return value of a tool the assistant invoked. "
    "This content was NOT available before the assistant's action — "
    "it exists BECAUSE the assistant called the tool. "
    "A successful, non-error tool output generally means the assistant's "
    "action was appropriate; do NOT treat it as information the assistant "
    "should have already known.\n\n"
    "Your goal is to decide whether the next state reveals useful hindsight information\n"
    "that could have helped improve the assistant response at turn t.\n\n"
    "Output format rules (strict):\n"
    "- You MUST include exactly one final decision token: \\boxed{1} or \\boxed{-1}.\n"
    "- If and only if decision is \\boxed{1}, provide a concise, information-dense hint in 1-3 sentences,\n"
    "  wrapped between [HINT_START] and [HINT_END].\n"
    "- If decision is \\boxed{-1}, do not provide a hint block.\n"
    "- Hint must be concrete and actionable for improving the previous response."
)

_BOXED_RE = re.compile(r"\\boxed\{(-?\d+)\}")
_HINT_RE = re.compile(r"\[HINT_START\](.*?)\[HINT_END\]", re.DOTALL)


def _build_hint_judge_messages(
    response_text: str, next_state_text: str, next_state_role: str = "tool"
) -> list[dict]:
    """Build messages for the hint extraction judge."""
    user = (
        f"## Assistant response (turn t)\n{response_text}\n\n"
        f"## Next state (turn t+1) [role: {next_state_role}]\n{next_state_text}\n\n"
        "Now output your decision and (if positive) the hint in the required format."
    )
    return [
        {"role": "system", "content": _HINT_JUDGE_SYSTEM},
        {"role": "user", "content": user},
    ]


def _parse_hint_result(text: str) -> tuple[int | None, str]:
    """Parse the judge's boxed decision and hint text."""
    boxed = _BOXED_RE.findall(text)
    score = int(boxed[-1]) if boxed else None
    if score not in (1, -1):
        score = None
    hint_matches = _HINT_RE.findall(text)
    hint = hint_matches[-1].strip() if hint_matches else ""
    return score, hint


def _select_best_hint(votes: list[dict]) -> dict | None:
    """Select the best hint from majority-voted judge results."""
    good = [
        v
        for v in votes
        if v.get("score") == 1
        and isinstance(v.get("hint"), str)
        and len(v["hint"].strip()) > 10
    ]
    if not good:
        return None
    return max(good, key=lambda v: len(v["hint"].strip()))


def _append_hint_to_messages(messages: list[dict], hint: str) -> list[dict]:
    """Clone messages and append hint to the last user message."""
    cloned = copy.deepcopy(messages)
    if not cloned:
        return [{"role": "user", "content": f"[user's hint / instruction]\n{hint}"}]

    # Find last user message
    target_idx = None
    for i in range(len(cloned) - 1, -1, -1):
        if cloned[i].get("role") == "user":
            target_idx = i
            break
    if target_idx is None:
        target_idx = len(cloned) - 1

    content = cloned[target_idx].get("content", "")
    if isinstance(content, list):
        content = " ".join(
            c.get("text", "") if isinstance(c, dict) else str(c) for c in content
        )
    suffix = f"\n\n[user's hint / instruction]\n{hint.strip()}"
    cloned[target_idx]["content"] = (content + suffix).strip()
    return cloned


# ═══════════════════════════════════════════════════════════════════════
# Configuration
# ═══════════════════════════════════════════════════════════════════════


class AgenticOPDConfig(HermesAgentEnvConfig):
    """Configuration for the agentic OPD environment."""

    # --- OPD settings ---
    opd_enabled: bool = Field(
        default=True,
        description="Enable on-policy distillation pipeline. When disabled, "
        "the environment behaves like a standard agentic env (no distill fields).",
    )
    distill_topk: int = Field(
        default=50,
        description="Number of top-K teacher logprobs per position for distillation.",
    )
    prm_votes: int = Field(
        default=3,
        description="Number of independent judge queries for majority-voted hint extraction.",
    )
    hint_max_next_state_chars: int = Field(
        default=4000,
        description="Maximum characters of next-state text to include in the hint judge prompt. "
        "Tool results can be very long — truncating prevents judge context overflow.",
    )

    # --- Reward settings ---
    correctness_weight: float = Field(
        default=0.7,
        description="Weight for test pass/fail in reward.",
    )
    efficiency_weight: float = Field(
        default=0.15,
        description="Weight for efficiency (fewer turns = better).",
    )
    tool_usage_weight: float = Field(
        default=0.15,
        description="Weight for appropriate tool usage signal.",
    )

    # --- Dataset ---
    dataset_name: Optional[str] = Field(
        default=None,
        description="HuggingFace dataset with coding tasks. "
        "Expected fields: 'task' (problem description) and 'test_code' (pytest/assert tests). "
        "Falls back to built-in tasks if not set or unavailable.",
    )

    # --- Eval ---
    eval_size: int = Field(
        default=10,
        description="Number of held-out items for evaluation.",
    )
    eval_split_ratio: float = Field(
        default=0.15,
        description="Fraction of dataset to hold out for evaluation.",
    )


# ═══════════════════════════════════════════════════════════════════════
# Environment
# ═══════════════════════════════════════════════════════════════════════


class AgenticOPDEnv(HermesAgentBaseEnv):
    """
    RL environment with on-policy distillation from next-state signals.

    Runs coding tasks where the agent writes code and runs tests.
    Tool results (test pass/fail, error traces) serve as next-state signals
    for hint extraction and teacher logprob scoring.

    This is the first Atropos environment to populate distill_token_ids
    and distill_logprobs on ScoredDataGroup for OPD training.
    """

    name = "agentic-opd"
    env_config_cls = AgenticOPDConfig

    # Default toolsets: terminal for running code, file for writing it
    default_toolsets = ["terminal", "file"]

    @classmethod
    def config_init(cls) -> Tuple[AgenticOPDConfig, List[APIServerConfig]]:
        """Default configuration."""
        env_config = AgenticOPDConfig(
            # Toolsets
            enabled_toolsets=["terminal", "file"],
            # Agent loop
            max_agent_turns=15,
            agent_temperature=1.0,
            system_prompt=(
                "You are a skilled Python programmer. When given a coding task:\n"
                "1. Write the solution to a file called 'solution.py'\n"
                "2. Write the test code to a file called 'test_solution.py'\n"
                "3. Run the tests with: python test_solution.py\n"
                "4. If tests fail, read the error output carefully, fix your code, and re-run\n"
                "5. Once all tests pass, report success\n\n"
                "Be efficient — write clean code and fix errors methodically."
            ),
            # OPD
            opd_enabled=True,
            distill_topk=50,
            prm_votes=3,
            # Training
            group_size=4,
            total_steps=500,
            steps_per_eval=50,
            use_wandb=True,
            wandb_name="agentic-opd",
        )

        server_configs = [
            APIServerConfig(
                base_url="http://localhost:8000/v1",
                model_name="Qwen/Qwen3-4B",
                server_type="vllm",
            )
        ]

        return env_config, server_configs

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._items: list[dict] = []
        self._eval_items: list[dict] = []
        self._index: int = 0

        # Metric buffers
        self._reward_buffer: list[float] = []
        self._correctness_buffer: list[float] = []
        self._efficiency_buffer: list[float] = []
        self._tool_usage_buffer: list[float] = []
        self._hints_extracted_buffer: list[int] = []
        self._opd_turns_scored_buffer: list[int] = []

    # ═══════════════════════════════════════════════════════════════════
    # 1. setup — load dataset
    # ═══════════════════════════════════════════════════════════════════

    async def setup(self) -> None:
        """Load coding tasks from HuggingFace or use built-in set."""
        if self.config.dataset_name:
            try:
                from datasets import load_dataset

                logger.info(
                    "Loading dataset '%s'...", self.config.dataset_name
                )
                ds = load_dataset(
                    self.config.dataset_name, split=self.config.dataset_split
                )
                task_field = self.config.prompt_field
                self._items = [
                    {
                        "task": row.get(task_field, row.get("task", "")),
                        "test_code": row.get("test_code", row.get("tests", "")),
                        "difficulty": row.get("difficulty", "unknown"),
                    }
                    for row in ds
                    if row.get(task_field, row.get("task", ""))
                ]
                if self._items:
                    random.shuffle(self._items)
                    eval_size = max(
                        self.config.eval_size,
                        int(len(self._items) * self.config.eval_split_ratio),
                    )
                    self._eval_items = self._items[:eval_size]
                    self._items = self._items[eval_size:]
                    logger.info(
                        "Loaded %d train / %d eval items from '%s'",
                        len(self._items),
                        len(self._eval_items),
                        self.config.dataset_name,
                    )
                    return
            except Exception as e:
                logger.warning(
                    "Could not load dataset '%s': %s. Using built-in tasks.",
                    self.config.dataset_name,
                    e,
                )

        # Fallback to built-in tasks
        items = copy.deepcopy(BUILTIN_CODING_TASKS)
        random.shuffle(items)
        split = max(1, len(items) * 85 // 100)
        self._items = items[:split]
        self._eval_items = items[split:]
        logger.info(
            "Using built-in coding tasks: %d train / %d eval items",
            len(self._items),
            len(self._eval_items),
        )

    # ═══════════════════════════════════════════════════════════════════
    # 2. get_next_item
    # ═══════════════════════════════════════════════════════════════════

    async def get_next_item(self) -> dict:
        """Return the next coding task, cycling through the dataset."""
        if not self._items:
            raise RuntimeError("Dataset is empty. Did you call setup()?")
        item = self._items[self._index % len(self._items)]
        self._index += 1
        return item

    # ═══════════════════════════════════════════════════════════════════
    # 3. format_prompt
    # ═══════════════════════════════════════════════════════════════════

    def format_prompt(self, item: dict) -> str:
        """Format the coding task as a user prompt."""
        prompt = (
            f"Solve the following coding task.\n\n"
            f"## Task\n{item['task']}\n\n"
        )
        if item.get("test_code"):
            prompt += (
                f"## Tests\nThe following test code will be used to verify your solution:\n"
                f"```python\n{item['test_code']}```\n\n"
            )
        prompt += (
            "## Instructions\n"
            "1. Write your solution to `solution.py`\n"
            "2. Write the test code to `test_solution.py`\n"
            "3. Run `python test_solution.py` to verify\n"
            "4. Fix any failures and re-run until all tests pass\n"
        )
        return prompt

    # ═══════════════════════════════════════════════════════════════════
    # 4. compute_reward
    # ═══════════════════════════════════════════════════════════════════

    async def compute_reward(
        self,
        item: dict,
        result: AgentResult,
        ctx: ToolContext,
    ) -> float:
        """
        Multi-signal reward:
          - correctness (0.7): Did the tests pass?
          - efficiency (0.15): Fewer turns = better
          - tool_usage (0.15): Did the agent actually write + run code?
        """
        cfg = self.config

        # ---- Signal 1: Test correctness ----
        # Check if test_solution.py exists and passes in the agent's sandbox
        correctness = 0.0
        try:
            test_result = ctx.terminal("python test_solution.py 2>&1", timeout=30)
            output = test_result.get("output", "")
            exit_code = test_result.get("exit_code", 1)
            if exit_code == 0 and "passed" in output.lower():
                correctness = 1.0
            elif exit_code == 0:
                correctness = 0.8  # Ran without error but no explicit "passed"
            elif "assert" in output.lower() and "error" in output.lower():
                correctness = 0.2  # Partial — code runs but assertions fail
            else:
                correctness = 0.1  # Code errors out entirely
        except Exception as e:
            logger.debug("Test execution failed in reward: %s", e)
            correctness = 0.0

        # ---- Signal 2: Efficiency ----
        max_turns = cfg.max_agent_turns
        turns_used = result.turns_used
        if turns_used <= 3:
            efficiency = 1.0
        elif turns_used <= max_turns // 2:
            efficiency = 0.8
        elif turns_used <= max_turns * 3 // 4:
            efficiency = 0.5
        else:
            efficiency = 0.2

        # ---- Signal 3: Tool usage ----
        tools_used = set()
        for msg in result.messages:
            if msg.get("role") == "assistant" and msg.get("tool_calls"):
                for tc in msg["tool_calls"]:
                    fn = tc.get("function", {}) if isinstance(tc, dict) else {}
                    name = fn.get("name", "")
                    if name:
                        tools_used.add(name)

        # Good: used both terminal and file tools
        if "terminal" in tools_used and ("write_file" in tools_used or "patch" in tools_used):
            tool_usage = 1.0
        elif "terminal" in tools_used:
            tool_usage = 0.6
        elif tools_used:
            tool_usage = 0.3
        else:
            tool_usage = 0.0

        # ---- Combine ----
        reward = (
            cfg.correctness_weight * correctness
            + cfg.efficiency_weight * efficiency
            + cfg.tool_usage_weight * tool_usage
        )
        reward = min(1.0, max(0.0, reward))

        # Track metrics
        self._reward_buffer.append(reward)
        self._correctness_buffer.append(correctness)
        self._efficiency_buffer.append(efficiency)
        self._tool_usage_buffer.append(tool_usage)

        logger.debug(
            "Reward: correctness=%.2f, efficiency=%.2f, tool_usage=%.2f → %.3f",
            correctness,
            efficiency,
            tool_usage,
            reward,
        )
        return reward

    # ═══════════════════════════════════════════════════════════════════
    # 5. collect_trajectories — OPD pipeline
    # ═══════════════════════════════════════════════════════════════════

    async def collect_trajectories(
        self, item: Item
    ) -> Tuple[
        Union[Optional[ScoredDataGroup], List[Optional[ScoredDataGroup]]],
        List[Item],
    ]:
        """
        Override collect_trajectories to add the OPD pipeline.

        1. Run standard rollouts via super() → ScoredDataGroup with tokens/masks/scores
        2. For each rollout, extract hints from next-state signals
        3. Score student tokens under enhanced (hint-augmented) distribution
        4. Add distill_token_ids / distill_logprobs to the ScoredDataGroup
        """
        # Step 1: Run standard rollouts
        scored_group, backlog = await super().collect_trajectories(item)

        # Step 2: OPD pipeline (only if enabled and we have VLLM server)
        if (
            self.config.opd_enabled
            and scored_group is not None
            and isinstance(scored_group, dict)
            and self._use_managed_server()
        ):
            await self._apply_opd_pipeline(scored_group)

        return scored_group, backlog

    async def _apply_opd_pipeline(self, group: ScoredDataGroup) -> None:
        """
        Apply on-policy distillation to each rollout in the group.

        For each rollout's messages:
        1. Find (assistant, next_state) turn pairs
        2. Extract hints via LLM judge with majority voting
        3. Build enhanced prompt (original + hint)
        4. Score student tokens under enhanced distribution via get_logprobs
        5. Add distill_token_ids / distill_logprobs to the group
        """
        messages_list = group.get("messages", [])
        tokens_list = group.get("tokens", [])

        if not messages_list or not tokens_list:
            logger.debug("OPD: No messages or tokens to process")
            return

        all_distill_token_ids: List[Optional[List[List[int]]]] = []
        all_distill_logprobs: List[Optional[List[List[float]]]] = []

        for seq_idx, (messages, student_tokens) in enumerate(
            zip(messages_list, tokens_list)
        ):
            try:
                distill_ids, distill_lps = await self._opd_for_sequence(
                    messages, student_tokens
                )
                all_distill_token_ids.append(distill_ids)
                all_distill_logprobs.append(distill_lps)
            except Exception as e:
                logger.warning(
                    "OPD failed for sequence %d: %s", seq_idx, e
                )
                all_distill_token_ids.append(None)
                all_distill_logprobs.append(None)

        # Only set distill fields if at least one sequence succeeded
        any_succeeded = any(d is not None for d in all_distill_token_ids)
        if any_succeeded:
            # Replace None entries with zero-padded arrays matching token length
            for i in range(len(all_distill_token_ids)):
                if all_distill_token_ids[i] is None and i < len(tokens_list):
                    seq_len = len(tokens_list[i])
                    k = self.config.distill_topk
                    all_distill_token_ids[i] = [[0] * k] * seq_len
                    all_distill_logprobs[i] = [[0.0] * k] * seq_len

            group["distill_token_ids"] = all_distill_token_ids
            group["distill_logprobs"] = all_distill_logprobs
            logger.info(
                "OPD: Set distill fields on %d/%d sequences",
                sum(1 for d in all_distill_token_ids if d is not None),
                len(all_distill_token_ids),
            )

    async def _opd_for_sequence(
        self, messages: List[Dict], student_tokens: List[int]
    ) -> Tuple[List[List[int]], List[List[float]]]:
        """
        Run OPD for a single rollout sequence.

        1. Walk conversation to find (assistant, next_state) pairs
        2. Extract hints from next-state signals
        3. For each hint-augmented turn, score student tokens via get_logprobs
        4. Merge per-turn teacher logprobs into a full-sequence distill array

        Returns:
            (distill_token_ids, distill_logprobs) each of shape [seq_len][top_k]
        """
        k = self.config.distill_topk
        seq_len = len(student_tokens)

        # Initialize with zeros (no distill info = neutral)
        distill_token_ids: List[List[int]] = [[0] * k for _ in range(seq_len)]
        distill_logprobs: List[List[float]] = [[0.0] * k for _ in range(seq_len)]

        # Find (assistant, next_state) turn pairs
        turn_pairs = self._extract_turn_pairs(messages)
        if not turn_pairs:
            return distill_token_ids, distill_logprobs

        hints_extracted = 0
        turns_scored = 0

        for pair in turn_pairs:
            try:
                hint = await self._extract_hint(
                    pair["assistant_text"],
                    pair["next_state_text"],
                    pair["next_state_role"],
                )
                if not hint:
                    continue

                hints_extracted += 1

                # Build enhanced prompt with hint
                enhanced_messages = _append_hint_to_messages(
                    pair["context_messages"], hint
                )

                # Tokenize the enhanced prompt
                if not self.tokenizer:
                    logger.warning("OPD: No tokenizer available, skipping scoring")
                    continue

                enhanced_prompt = self.tokenizer.apply_chat_template(
                    enhanced_messages,
                    tokenize=False,
                    add_generation_prompt=True,
                )

                # Tokenize the assistant response to score
                response_text = pair["assistant_text"]
                enhanced_full_text = enhanced_prompt + response_text
                enhanced_ids = self.tokenizer(
                    enhanced_full_text, add_special_tokens=False
                )["input_ids"]

                response_ids = self.tokenizer(
                    response_text, add_special_tokens=False
                )["input_ids"]
                response_len = len(response_ids)

                if response_len == 0:
                    continue

                # Score via get_logprobs — teacher scoring the student's tokens
                # under the enhanced (hint-augmented) distribution
                try:
                    logprob_result = await self.server.get_logprobs(
                        input_ids=enhanced_ids,
                        top_k=k,
                        split="eval",  # Use eval semaphore to not block training
                    )
                except Exception as e:
                    logger.debug("get_logprobs failed: %s", e)
                    continue

                teacher_topk_ids = logprob_result.get("prompt_topk_token_ids", [])
                teacher_topk_lps = logprob_result.get("prompt_topk_logprobs", [])

                if not teacher_topk_ids:
                    continue

                # Extract only the response positions (last response_len entries)
                if len(teacher_topk_ids) >= response_len:
                    resp_topk_ids = teacher_topk_ids[-response_len:]
                    resp_topk_lps = teacher_topk_lps[-response_len:]
                else:
                    # Pad from the left if the response was shorter than expected
                    pad_len = response_len - len(teacher_topk_ids)
                    resp_topk_ids = [[0] * k] * pad_len + teacher_topk_ids
                    resp_topk_lps = [[0.0] * k] * pad_len + teacher_topk_lps

                # Map these back to the student's full sequence positions
                # Find where this assistant turn's tokens appear in the full sequence
                turn_start = self._find_token_span(
                    student_tokens, response_ids
                )
                if turn_start is not None:
                    for j in range(min(response_len, seq_len - turn_start)):
                        pos = turn_start + j
                        if pos < seq_len and j < len(resp_topk_ids):
                            # Pad/truncate to exactly k entries
                            ids = resp_topk_ids[j][:k]
                            lps = resp_topk_lps[j][:k]
                            while len(ids) < k:
                                ids.append(0)
                                lps.append(0.0)
                            distill_token_ids[pos] = ids
                            distill_logprobs[pos] = lps
                    turns_scored += 1

            except Exception as e:
                logger.debug("OPD turn processing failed: %s", e)
                continue

        # Track OPD metrics
        self._hints_extracted_buffer.append(hints_extracted)
        self._opd_turns_scored_buffer.append(turns_scored)

        logger.debug(
            "OPD sequence: %d turn pairs, %d hints extracted, %d turns scored",
            len(turn_pairs),
            hints_extracted,
            turns_scored,
        )
        return distill_token_ids, distill_logprobs

    def _extract_turn_pairs(
        self, messages: List[Dict]
    ) -> List[Dict[str, Any]]:
        """
        Walk conversation messages to find (assistant, next_state) pairs.

        A "turn pair" is an assistant message with content (the response)
        followed by one or more tool results or a user reply (the next state).

        Returns list of dicts:
          {
            "context_messages": messages up to (not including) the assistant turn,
            "assistant_text": the assistant's response text,
            "next_state_text": the next state content (tool result or user reply),
            "next_state_role": "tool" or "user",
          }
        """
        pairs = []
        i = 0
        while i < len(messages):
            msg = messages[i]
            if msg.get("role") == "assistant" and msg.get("content"):
                # Found an assistant message with content
                assistant_text = msg["content"]
                context = messages[:i]  # Everything before this turn

                # Look ahead for next state
                j = i + 1
                # Skip tool_calls-only assistant messages and collect tool results
                next_states = []
                while j < len(messages):
                    next_msg = messages[j]
                    if next_msg.get("role") == "tool":
                        next_states.append(next_msg)
                        j += 1
                    elif next_msg.get("role") == "user":
                        next_states.append(next_msg)
                        break
                    else:
                        break

                if next_states:
                    # Combine all next-state content
                    next_text_parts = []
                    next_role = next_states[0].get("role", "tool")
                    for ns in next_states:
                        content = ns.get("content", "")
                        if content:
                            # Truncate very long tool outputs
                            max_chars = self.config.hint_max_next_state_chars
                            if len(content) > max_chars:
                                content = content[:max_chars] + "\n...[truncated]"
                            next_text_parts.append(content)

                    next_text = "\n---\n".join(next_text_parts)
                    if next_text.strip():
                        pairs.append(
                            {
                                "context_messages": context,
                                "assistant_text": assistant_text,
                                "next_state_text": next_text,
                                "next_state_role": next_role,
                            }
                        )
            i += 1
        return pairs

    async def _extract_hint(
        self,
        assistant_text: str,
        next_state_text: str,
        next_state_role: str,
    ) -> Optional[str]:
        """
        Extract a hindsight hint from a next-state signal using majority-voted LLM judge.

        Returns the hint string if the judge votes positively, None otherwise.
        """
        judge_messages = _build_hint_judge_messages(
            response_text=assistant_text,
            next_state_text=next_state_text,
            next_state_role=next_state_role,
        )

        # Majority voting across multiple judge queries
        votes = []
        tasks = []
        for _ in range(self.config.prm_votes):
            tasks.append(
                self.server.chat_completion(
                    messages=judge_messages,
                    n=1,
                    max_tokens=500,
                    temperature=0.7,
                    split="eval",
                )
            )

        results = await asyncio.gather(*tasks, return_exceptions=True)

        for result in results:
            if isinstance(result, Exception):
                logger.debug("Hint judge call failed: %s", result)
                votes.append({"score": None, "hint": ""})
                continue
            try:
                text = result.choices[0].message.content or ""
                score, hint = _parse_hint_result(text)
                votes.append({"score": score, "hint": hint})
            except Exception as e:
                logger.debug("Hint parse failed: %s", e)
                votes.append({"score": None, "hint": ""})

        selected = _select_best_hint(votes)
        if selected is None:
            return None
        return selected["hint"]

    @staticmethod
    def _find_token_span(
        full_tokens: List[int], sub_tokens: List[int]
    ) -> Optional[int]:
        """
        Find where sub_tokens appears in full_tokens.
        Returns the start index, or None if not found.

        Uses a sliding window search. For long sequences, searches
        from the end since assistant responses are typically at the end.
        """
        if not sub_tokens or not full_tokens:
            return None
        sub_len = len(sub_tokens)
        full_len = len(full_tokens)
        if sub_len > full_len:
            return None

        # Search backwards (assistant responses are usually near the end)
        for i in range(full_len - sub_len, -1, -1):
            if full_tokens[i : i + sub_len] == sub_tokens:
                return i
        return None

    # ═══════════════════════════════════════════════════════════════════
    # 6. evaluate
    # ═══════════════════════════════════════════════════════════════════

    async def evaluate(self, *args, **kwargs) -> None:
        """
        Evaluate on held-out coding tasks using the full agent loop.
        No OPD during eval — just standard agentic evaluation.
        """
        if not self._eval_items:
            logger.warning("No eval items available.")
            return

        eval_size = min(self.config.eval_size, len(self._eval_items))
        eval_items = self._eval_items[:eval_size]

        logger.info("Running eval on %d coding tasks...", len(eval_items))
        start_time = time.time()
        samples = []

        tools, valid_names = self._resolve_tools_for_group()

        for i, item in enumerate(eval_items):
            task_id = str(uuid.uuid4())
            logger.info(
                "Eval [%d/%d]: %s...", i + 1, len(eval_items), item["task"][:60]
            )

            try:
                messages: List[Dict[str, Any]] = []
                if self.config.system_prompt:
                    messages.append(
                        {"role": "system", "content": self.config.system_prompt}
                    )
                messages.append(
                    {"role": "user", "content": self.format_prompt(item)}
                )

                agent = HermesAgentLoop(
                    server=self.server,
                    tool_schemas=tools,
                    valid_tool_names=valid_names,
                    max_turns=self.config.max_agent_turns,
                    task_id=task_id,
                    temperature=0.0,
                    max_tokens=self.config.max_token_length,
                    extra_body=self.config.extra_body,
                )
                result = await agent.run(messages)

                # Compute reward (track buffer lengths to rollback eval pollution)
                buf_len = len(self._correctness_buffer)
                ctx = ToolContext(task_id)
                try:
                    reward = await self.compute_reward(item, result, ctx)
                finally:
                    ctx.cleanup()

                # Extract correctness and rollback training buffers
                correctness = (
                    self._correctness_buffer[buf_len]
                    if len(self._correctness_buffer) > buf_len
                    else 0.0
                )
                for buf in (
                    self._reward_buffer,
                    self._correctness_buffer,
                    self._efficiency_buffer,
                    self._tool_usage_buffer,
                ):
                    if len(buf) > buf_len:
                        buf.pop()

                # Also rollback OPD buffers if they were touched
                for buf in (
                    self._hints_extracted_buffer,
                    self._opd_turns_scored_buffer,
                ):
                    if len(buf) > buf_len:
                        buf.pop()

                # Extract final response
                final_response = ""
                for msg in reversed(result.messages):
                    if (
                        msg.get("role") == "assistant"
                        and msg.get("content")
                        and not final_response
                    ):
                        final_response = msg["content"]
                        break

                samples.append(
                    {
                        "prompt": item["task"][:200],
                        "response": final_response[:500],
                        "correctness": correctness,
                        "reward": reward,
                        "turns": result.turns_used,
                    }
                )

                logger.info(
                    "  → correctness=%.2f, reward=%.3f, turns=%d",
                    correctness,
                    reward,
                    result.turns_used,
                )

            except Exception as e:
                logger.error("Eval error: %s", e)
                samples.append(
                    {
                        "prompt": item["task"][:200],
                        "response": f"ERROR: {e}",
                        "correctness": 0.0,
                        "reward": 0.0,
                        "turns": 0,
                    }
                )

        end_time = time.time()

        correctness_scores = [s["correctness"] for s in samples]
        rewards = [s["reward"] for s in samples]
        n = len(samples)

        eval_metrics = {
            "eval/mean_correctness": sum(correctness_scores) / n if n else 0.0,
            "eval/mean_reward": sum(rewards) / n if n else 0.0,
            "eval/pass_rate": (
                sum(1 for c in correctness_scores if c >= 0.8) / n if n else 0.0
            ),
            "eval/n_items": n,
        }

        logger.info(
            "Eval complete — correctness=%.3f, reward=%.3f, pass_rate=%.0f%%",
            eval_metrics["eval/mean_correctness"],
            eval_metrics["eval/mean_reward"],
            eval_metrics["eval/pass_rate"] * 100,
        )

        await self.evaluate_log(
            metrics=eval_metrics,
            samples=samples,
            start_time=start_time,
            end_time=end_time,
        )

    # ═══════════════════════════════════════════════════════════════════
    # 7. wandb_log — custom OPD metrics
    # ═══════════════════════════════════════════════════════════════════

    async def wandb_log(self, wandb_metrics: Optional[Dict] = None) -> None:
        """Log reward breakdown and OPD-specific metrics to wandb."""
        if wandb_metrics is None:
            wandb_metrics = {}

        if self._reward_buffer:
            n = len(self._reward_buffer)
            wandb_metrics["train/mean_reward"] = sum(self._reward_buffer) / n
            wandb_metrics["train/mean_correctness"] = (
                sum(self._correctness_buffer) / n
            )
            wandb_metrics["train/mean_efficiency"] = (
                sum(self._efficiency_buffer) / n
            )
            wandb_metrics["train/mean_tool_usage"] = (
                sum(self._tool_usage_buffer) / n
            )
            wandb_metrics["train/pass_rate"] = (
                sum(1 for c in self._correctness_buffer if c >= 0.8) / n
            )
            wandb_metrics["train/total_rollouts"] = n

            self._reward_buffer.clear()
            self._correctness_buffer.clear()
            self._efficiency_buffer.clear()
            self._tool_usage_buffer.clear()

        # OPD-specific metrics
        if self._hints_extracted_buffer:
            n = len(self._hints_extracted_buffer)
            wandb_metrics["opd/mean_hints_per_rollout"] = (
                sum(self._hints_extracted_buffer) / n
            )
            wandb_metrics["opd/mean_turns_scored"] = (
                sum(self._opd_turns_scored_buffer) / n
            )
            wandb_metrics["opd/hint_rate"] = (
                sum(1 for h in self._hints_extracted_buffer if h > 0) / n
            )
            wandb_metrics["opd/total_hints"] = sum(self._hints_extracted_buffer)
            wandb_metrics["opd/total_scored_turns"] = sum(
                self._opd_turns_scored_buffer
            )

            self._hints_extracted_buffer.clear()
            self._opd_turns_scored_buffer.clear()

        await super().wandb_log(wandb_metrics)


# ═══════════════════════════════════════════════════════════════════════
# Entry point
# ═══════════════════════════════════════════════════════════════════════

if __name__ == "__main__":
    AgenticOPDEnv.cli()