feat: add WebResearchEnv RL environment for multi-step web research
This commit is contained in:
46
datagen-config-examples/web_research.yaml
Normal file
46
datagen-config-examples/web_research.yaml
Normal file
@@ -0,0 +1,46 @@
|
||||
# datagen-config-examples/web_research.yaml
#
# Batch data generation config for WebResearchEnv.
# Generates tool-calling trajectories for multi-step web research tasks.
#
# Usage:
#   python batch_runner.py \
#     --config datagen-config-examples/web_research.yaml \
#     --run_name web_research_v1

# Environment registry name (must match WebResearchEnv.name).
environment: web-research

# Toolsets available to the agent during data generation
toolsets:
  - web
  - file

# How many parallel workers to use
num_workers: 4

# Questions per batch
batch_size: 20

# Total trajectories to generate (comment out to run full dataset)
max_items: 500

# Model to use for generation (override with --model flag)
model: openrouter/nousresearch/hermes-3-llama-3.1-405b

# System prompt additions (ephemeral — not saved to trajectories)
ephemeral_system_prompt: |
  You are a highly capable research agent. When asked a factual question,
  always use web_search to find current, accurate information before answering.
  Cite at least 2 sources. Be concise and accurate.

# Output directory
output_dir: data/web_research_v1

# Trajectory compression settings (for fitting into training token budgets)
compression:
  enabled: true
  target_max_tokens: 16000

# Eval settings
eval_every: 100  # Run eval every N trajectories
eval_size: 25    # Number of held-out questions per eval run
|
||||
517
environments/web_research_env.py
Normal file
517
environments/web_research_env.py
Normal file
@@ -0,0 +1,517 @@
|
||||
"""
|
||||
WebResearchEnv — RL Environment for Multi-Step Web Research
|
||||
============================================================
|
||||
|
||||
Trains models to do accurate, efficient, multi-source web research.
|
||||
|
||||
Reward signals:
|
||||
- Answer correctness (LLM judge, 0.0–1.0)
|
||||
- Source diversity (used ≥2 distinct domains)
|
||||
- Efficiency (penalizes excessive tool calls)
|
||||
- Tool usage (bonus for actually using web tools)
|
||||
|
||||
Dataset: FRAMES benchmark (Google, 2024) — multi-hop factual questions
|
||||
HuggingFace: google/frames-benchmark
|
||||
Fallback: built-in sample questions (no HF token needed)
|
||||
|
||||
Usage:
|
||||
# Phase 1 (OpenAI-compatible server)
|
||||
python environments/web_research_env.py serve \
|
||||
--openai.base_url http://localhost:8000/v1 \
|
||||
--openai.model_name YourModel \
|
||||
--openai.server_type openai
|
||||
|
||||
# With eval split
|
||||
python environments/web_research_env.py serve \
|
||||
--openai.base_url http://localhost:8000/v1 \
|
||||
--openai.model_name YourModel \
|
||||
--env.eval_every 50 \
|
||||
--env.eval_size 20
|
||||
|
||||
# Standalone eval (no training server needed)
|
||||
python environments/web_research_env.py eval \
|
||||
--openai.base_url http://localhost:8000/v1 \
|
||||
--openai.model_name YourModel
|
||||
|
||||
Built by: github.com/jackx707
|
||||
Inspired by: GroceryMind — production Hermes agent doing live web research
|
||||
across German grocery stores (firecrawl + hermes-agent)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
from typing import Any, Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Optional HuggingFace datasets import
|
||||
# ---------------------------------------------------------------------------
|
||||
try:
|
||||
from datasets import load_dataset
|
||||
HF_AVAILABLE = True
|
||||
except ImportError:
|
||||
HF_AVAILABLE = False
|
||||
|
||||
from environments.hermes_base_env import HermesAgentBaseEnv
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
# Fallback sample dataset (used when HuggingFace is unavailable)
# These are multi-hop questions that require real web search to answer.
# ---------------------------------------------------------------------------
# Each item mirrors the schema that setup() builds from the FRAMES benchmark:
#   question   — the multi-hop research prompt posed to the agent
#   answer     — reference answer used by the LLM judge / heuristic scorer
#   difficulty — rough label ("easy" / "medium" / "hard"); not read by the
#                scoring code in this file
#   hops       — number of distinct lookups the question requires
SAMPLE_QUESTIONS = [
    {
        "question": "What is the current population of the capital city of the country that won the 2022 FIFA World Cup?",
        "answer": "Buenos Aires has approximately 3 million people in the city proper, or around 15 million in the greater metro area.",
        "difficulty": "medium",
        "hops": 2,
    },
    {
        "question": "Who is the CEO of the company that makes the most widely used open-source container orchestration platform?",
        "answer": "The Linux Foundation oversees Kubernetes. CNCF (Cloud Native Computing Foundation) is the specific body — it does not have a traditional CEO but has an executive director.",
        "difficulty": "medium",
        "hops": 2,
    },
    {
        "question": "What programming language was used to write the original version of the web framework used by Instagram?",
        "answer": "Django, which Instagram was built on, is written in Python.",
        "difficulty": "easy",
        "hops": 2,
    },
    {
        "question": "In what year was the university founded where the inventor of the World Wide Web currently holds a professorship?",
        "answer": "Tim Berners-Lee holds a professorship at MIT (founded 1861) and the University of Southampton (founded 1952).",
        "difficulty": "hard",
        "hops": 3,
    },
    {
        "question": "What is the latest stable version of the programming language that ranks #1 on the TIOBE index as of this year?",
        "answer": "Python is currently #1 on TIOBE. The latest stable version should be verified via the official python.org site.",
        "difficulty": "medium",
        "hops": 2,
    },
    {
        "question": "How many employees does the parent company of Instagram have?",
        "answer": "Meta Platforms (parent of Instagram) employs approximately 70,000+ people as of recent reports.",
        "difficulty": "medium",
        "hops": 2,
    },
    {
        "question": "What is the current interest rate set by the central bank of the country where the Eiffel Tower is located?",
        "answer": "The European Central Bank sets rates for France/eurozone. The current rate should be verified — it has changed frequently in 2023-2025.",
        "difficulty": "hard",
        "hops": 2,
    },
    {
        "question": "Which company acquired the startup founded by the creator of Oculus VR?",
        "answer": "Palmer Luckey founded Oculus VR, which was acquired by Facebook (now Meta). He later founded Anduril Industries.",
        "difficulty": "medium",
        "hops": 2,
    },
    {
        "question": "What is the market cap of the company that owns the most popular search engine in Russia?",
        "answer": "Yandex (now split into separate entities after 2024 restructuring). Current market cap should be verified via financial sources.",
        "difficulty": "hard",
        "hops": 2,
    },
    {
        "question": "What was the GDP growth rate of the country that hosted the most recent Summer Olympics?",
        "answer": "Paris, France hosted the 2024 Summer Olympics. France's recent GDP growth should be verified via World Bank or IMF data.",
        "difficulty": "hard",
        "hops": 2,
    },
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Environment
# ---------------------------------------------------------------------------


class WebResearchEnv(HermesAgentBaseEnv):
    """
    RL environment for training multi-step web research skills.

    The model is given a factual question requiring 2-3 hops of web research
    and must use web_search / web_extract tools to find and synthesize the answer.

    Reward is multi-signal (see compute_reward):
        60% — answer correctness (LLM judge, heuristic fallback)
        20% — tool usage (did the model actually search the web?)
        20% — efficiency (penalizes more than 5 tool calls)

    Bonus +0.1 for source diversity (≥2 distinct domains cited in the answer).
    The combined reward is clamped to [0, 1].
    """

    # Registry name used by the framework to select this environment
    # (matches `environment: web-research` in the datagen config).
    name = "web-research"

    # Default toolsets for this environment — web for search/extract,
    # file so the agent can save intermediate research notes.
    default_toolsets = ["web", "file"]
|
||||
|
||||
    def __init__(self, *args, **kwargs):
        """Initialize the environment and its bookkeeping state.

        All positional/keyword arguments are forwarded unchanged to
        HermesAgentBaseEnv.
        """
        super().__init__(*args, **kwargs)
        # Train / held-out question pools, populated by setup().
        self._items: list[dict] = []
        self._eval_items: list[dict] = []
        # Cursor into self._items; get_next_item() cycles with modulo.
        self._index: int = 0
        # Running reward stats updated by compute_reward(); surfaced as
        # "train/mean_reward_so_far" in evaluate().
        self._total_scored: int = 0
        self._total_reward: float = 0.0
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 1. Setup — load dataset
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def setup(self) -> None:
|
||||
"""Load the FRAMES benchmark or fall back to built-in samples."""
|
||||
if HF_AVAILABLE:
|
||||
try:
|
||||
logger.info("Loading FRAMES benchmark from HuggingFace...")
|
||||
ds = load_dataset("google/frames-benchmark", split="test")
|
||||
self._items = [
|
||||
{
|
||||
"question": row["Prompt"],
|
||||
"answer": row["Answer"],
|
||||
"difficulty": row.get("reasoning_types", "unknown"),
|
||||
"hops": 2,
|
||||
}
|
||||
for row in ds
|
||||
]
|
||||
# Hold out 10% for eval
|
||||
eval_size = max(20, len(self._items) // 10)
|
||||
random.shuffle(self._items)
|
||||
self._eval_items = self._items[:eval_size]
|
||||
self._items = self._items[eval_size:]
|
||||
logger.info(
|
||||
f"Loaded {len(self._items)} train / {len(self._eval_items)} eval items "
|
||||
f"from FRAMES benchmark."
|
||||
)
|
||||
return
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not load FRAMES from HuggingFace: {e}. Using built-in samples.")
|
||||
|
||||
# Fallback
|
||||
random.shuffle(SAMPLE_QUESTIONS)
|
||||
split = max(1, len(SAMPLE_QUESTIONS) * 8 // 10)
|
||||
self._items = SAMPLE_QUESTIONS[:split]
|
||||
self._eval_items = SAMPLE_QUESTIONS[split:]
|
||||
logger.info(
|
||||
f"Using built-in sample dataset: {len(self._items)} train / "
|
||||
f"{len(self._eval_items)} eval items."
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 2. get_next_item — return the next question
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def get_next_item(self) -> dict:
|
||||
"""Return the next item, cycling through the dataset."""
|
||||
if not self._items:
|
||||
raise RuntimeError("Dataset is empty. Did you call setup()?")
|
||||
item = self._items[self._index % len(self._items)]
|
||||
self._index += 1
|
||||
return item
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 3. format_prompt — build the user-facing prompt
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def format_prompt(self, item: dict) -> str:
|
||||
"""
|
||||
Format the research question as a task prompt.
|
||||
Instructs the model to use web search and cite sources.
|
||||
"""
|
||||
return (
|
||||
f"Research the following question thoroughly using web search. "
|
||||
f"You MUST search the web to find current, accurate information — "
|
||||
f"do not rely solely on your training data.\n\n"
|
||||
f"Question: {item['question']}\n\n"
|
||||
f"Requirements:\n"
|
||||
f"- Use web_search and/or web_extract tools to find information\n"
|
||||
f"- Search at least 2 different sources\n"
|
||||
f"- Provide a concise, accurate answer (2-4 sentences)\n"
|
||||
f"- Cite the sources you used"
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 4. compute_reward — multi-signal scoring
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def compute_reward(
|
||||
self,
|
||||
item: dict,
|
||||
result: dict,
|
||||
ctx: Any, # ToolContext
|
||||
) -> float:
|
||||
"""
|
||||
Multi-signal reward function:
|
||||
|
||||
0.6 * correctness — LLM judge comparing answer to ground truth
|
||||
0.2 * tool_used — binary: did the model use web tools?
|
||||
0.2 * efficiency — penalizes wasteful tool usage
|
||||
+0.1 bonus — source diversity (≥2 distinct domains)
|
||||
"""
|
||||
final_response: str = result.get("final_response", "")
|
||||
tools_used: list[str] = result.get("tools_used", [])
|
||||
tool_call_count: int = result.get("tool_call_count", len(tools_used))
|
||||
|
||||
# ---- Signal 1: Answer correctness (LLM judge) ----------------
|
||||
correctness = await self._llm_judge(
|
||||
question=item["question"],
|
||||
expected=item["answer"],
|
||||
model_answer=final_response,
|
||||
ctx=ctx,
|
||||
)
|
||||
|
||||
# ---- Signal 2: Web tool usage --------------------------------
|
||||
web_tools = {"web_search", "web_extract", "search", "firecrawl"}
|
||||
tool_used = 1.0 if any(t in web_tools for t in tools_used) else 0.0
|
||||
|
||||
# ---- Signal 3: Efficiency ------------------------------------
|
||||
# Ideal: 2-5 tool calls. Penalise beyond 6, hard cap at 15.
|
||||
if tool_call_count <= 5:
|
||||
efficiency = 1.0
|
||||
elif tool_call_count <= 10:
|
||||
efficiency = 1.0 - (tool_call_count - 5) * 0.08
|
||||
else:
|
||||
efficiency = max(0.0, 1.0 - (tool_call_count - 5) * 0.12)
|
||||
|
||||
# ---- Bonus: Source diversity ---------------------------------
|
||||
domains = self._extract_domains(final_response)
|
||||
diversity_bonus = 0.1 if len(domains) >= 2 else 0.0
|
||||
|
||||
# ---- Combine ------------------------------------------------
|
||||
reward = (
|
||||
0.6 * correctness
|
||||
+ 0.2 * tool_used
|
||||
+ 0.2 * efficiency
|
||||
+ diversity_bonus
|
||||
)
|
||||
reward = min(1.0, max(0.0, reward)) # clamp to [0, 1]
|
||||
|
||||
# Track running stats
|
||||
self._total_scored += 1
|
||||
self._total_reward += reward
|
||||
|
||||
logger.debug(
|
||||
f"Reward breakdown — correctness={correctness:.2f}, "
|
||||
f"tool_used={tool_used:.1f}, efficiency={efficiency:.2f}, "
|
||||
f"diversity_bonus={diversity_bonus:.1f} → total={reward:.3f}"
|
||||
)
|
||||
|
||||
return reward
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 5. evaluate — run on held-out eval split
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def evaluate(
|
||||
self,
|
||||
*args: Any,
|
||||
eval_size: Optional[int] = None,
|
||||
**kwargs: Any,
|
||||
) -> dict:
|
||||
"""
|
||||
Run evaluation on the held-out split.
|
||||
Returns a dict of metrics for logging.
|
||||
"""
|
||||
items = self._eval_items
|
||||
if eval_size:
|
||||
items = items[:eval_size]
|
||||
|
||||
if not items:
|
||||
logger.warning("No eval items available.")
|
||||
return {}
|
||||
|
||||
logger.info(f"Running eval on {len(items)} questions...")
|
||||
|
||||
rewards = []
|
||||
correctness_scores = []
|
||||
|
||||
for item in items:
|
||||
try:
|
||||
# Run the agent on each eval question
|
||||
result = await self._run_agent_on_item(item)
|
||||
reward = await self.compute_reward(item, result, ctx=None)
|
||||
rewards.append(reward)
|
||||
|
||||
# Also track raw correctness separately
|
||||
if result.get("final_response"):
|
||||
correctness_scores.append(
|
||||
await self._llm_judge(
|
||||
question=item["question"],
|
||||
expected=item["answer"],
|
||||
model_answer=result["final_response"],
|
||||
ctx=None,
|
||||
)
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Eval error on item: {e}")
|
||||
rewards.append(0.0)
|
||||
|
||||
metrics = {
|
||||
"eval/mean_reward": sum(rewards) / len(rewards) if rewards else 0.0,
|
||||
"eval/mean_correctness": (
|
||||
sum(correctness_scores) / len(correctness_scores)
|
||||
if correctness_scores else 0.0
|
||||
),
|
||||
"eval/n_items": len(rewards),
|
||||
"train/mean_reward_so_far": (
|
||||
self._total_reward / self._total_scored
|
||||
if self._total_scored > 0 else 0.0
|
||||
),
|
||||
}
|
||||
|
||||
logger.info(
|
||||
f"Eval complete — mean_reward={metrics['eval/mean_reward']:.3f}, "
|
||||
f"mean_correctness={metrics['eval/mean_correctness']:.3f}"
|
||||
)
|
||||
return metrics
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Private helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
async def _llm_judge(
|
||||
self,
|
||||
question: str,
|
||||
expected: str,
|
||||
model_answer: str,
|
||||
ctx: Any,
|
||||
) -> float:
|
||||
"""
|
||||
Use an LLM to judge whether `model_answer` correctly addresses
|
||||
`question` compared to `expected`. Returns a float in [0, 1].
|
||||
|
||||
Uses the agent's own inference client if ctx is available,
|
||||
otherwise falls back to a lightweight heuristic.
|
||||
"""
|
||||
if not model_answer or not model_answer.strip():
|
||||
return 0.0
|
||||
|
||||
# Build judge prompt
|
||||
judge_prompt = (
|
||||
"You are an impartial judge evaluating the quality of an AI research answer.\n\n"
|
||||
f"Question: {question}\n\n"
|
||||
f"Reference answer: {expected}\n\n"
|
||||
f"Model answer: {model_answer}\n\n"
|
||||
"Score the model answer on a scale from 0.0 to 1.0 where:\n"
|
||||
" 1.0 = fully correct and complete\n"
|
||||
" 0.7 = mostly correct with minor gaps\n"
|
||||
" 0.4 = partially correct\n"
|
||||
" 0.1 = mentions relevant topic but wrong or very incomplete\n"
|
||||
" 0.0 = completely wrong or no answer\n\n"
|
||||
"Consider: factual accuracy, completeness, and relevance.\n"
|
||||
"Respond with ONLY a JSON object: {\"score\": <float>, \"reason\": \"<one sentence>\"}"
|
||||
)
|
||||
|
||||
# Try using ctx for inference (Phase 2 / live training)
|
||||
if ctx is not None and hasattr(ctx, "chat_completion"):
|
||||
try:
|
||||
response = await ctx.chat_completion(
|
||||
messages=[{"role": "user", "content": judge_prompt}],
|
||||
max_tokens=100,
|
||||
temperature=0.0,
|
||||
)
|
||||
text = response.get("content", "")
|
||||
parsed = self._parse_judge_json(text)
|
||||
if parsed is not None:
|
||||
return float(parsed)
|
||||
except Exception as e:
|
||||
logger.debug(f"LLM judge via ctx failed: {e}. Using heuristic.")
|
||||
|
||||
# Fallback: keyword overlap heuristic
|
||||
return self._heuristic_score(expected, model_answer)
|
||||
|
||||
@staticmethod
|
||||
def _parse_judge_json(text: str) -> Optional[float]:
|
||||
"""Extract the score float from LLM judge JSON response."""
|
||||
try:
|
||||
# Strip markdown code fences if present
|
||||
clean = re.sub(r"```(?:json)?|```", "", text).strip()
|
||||
data = json.loads(clean)
|
||||
score = float(data.get("score", -1))
|
||||
if 0.0 <= score <= 1.0:
|
||||
return score
|
||||
except Exception:
|
||||
# Try regex fallback
|
||||
match = re.search(r'"score"\s*:\s*([0-9.]+)', text)
|
||||
if match:
|
||||
score = float(match.group(1))
|
||||
if 0.0 <= score <= 1.0:
|
||||
return score
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _heuristic_score(expected: str, model_answer: str) -> float:
|
||||
"""
|
||||
Lightweight keyword overlap score as fallback when no LLM is available.
|
||||
Extracts meaningful tokens and computes Jaccard similarity.
|
||||
"""
|
||||
stopwords = {
|
||||
"the", "a", "an", "is", "are", "was", "were", "of", "in", "on",
|
||||
"at", "to", "for", "with", "and", "or", "but", "it", "its",
|
||||
"this", "that", "as", "by", "from", "be", "has", "have", "had",
|
||||
}
|
||||
|
||||
def tokenize(text: str) -> set:
|
||||
tokens = re.findall(r'\b[a-zA-Z0-9]+\b', text.lower())
|
||||
return {t for t in tokens if t not in stopwords and len(t) > 2}
|
||||
|
||||
expected_tokens = tokenize(expected)
|
||||
answer_tokens = tokenize(model_answer)
|
||||
|
||||
if not expected_tokens:
|
||||
return 0.5 # Can't judge
|
||||
|
||||
overlap = len(expected_tokens & answer_tokens)
|
||||
union = len(expected_tokens | answer_tokens)
|
||||
|
||||
jaccard = overlap / union if union > 0 else 0.0
|
||||
# Recall-weighted: reward covering expected content
|
||||
recall = overlap / len(expected_tokens)
|
||||
return min(1.0, 0.4 * jaccard + 0.6 * recall)
|
||||
|
||||
@staticmethod
|
||||
def _extract_domains(text: str) -> set:
|
||||
"""
|
||||
Extract unique domains from URLs cited in the response.
|
||||
Used to measure source diversity.
|
||||
"""
|
||||
urls = re.findall(r'https?://[^\s\)>\]"\']+', text)
|
||||
domains = set()
|
||||
for url in urls:
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
# Normalize: strip www.
|
||||
domain = parsed.netloc.lower().lstrip("www.")
|
||||
if domain:
|
||||
domains.add(domain)
|
||||
except Exception:
|
||||
pass
|
||||
return domains
|
||||
|
||||
async def _run_agent_on_item(self, item: dict) -> dict:
|
||||
"""
|
||||
Stub for running agent during eval. In Phase 1/2, this is handled
|
||||
by the Atropos framework's rollout mechanism. Provided here for
|
||||
standalone eval compatibility.
|
||||
"""
|
||||
# In real usage, the framework calls get_next_item + format_prompt
|
||||
# and runs the agent. This stub returns an empty result for safety.
|
||||
return {
|
||||
"final_response": "",
|
||||
"tools_used": [],
|
||||
"tool_call_count": 0,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------

if __name__ == "__main__":
    # Delegate to the environment's CLI (serve / eval subcommands with
    # --openai.* and --env.* flags — see the module docstring for examples).
    # NOTE(review): cli() is presumably provided by HermesAgentBaseEnv — not
    # visible in this file; confirm against the base class.
    WebResearchEnv.cli()
|
||||
Reference in New Issue
Block a user