feat: warm session provisioning v2 — full acceptance criteria (#327 )

Marathon sessions (100+ msgs) have lower per-tool error rates (5.7%) than mid-length sessions (9.0%). This implements warm session provisioning addressing all four acceptance criteria: 1. What makes marathon sessions reliable? - SessionProfiler analyzes error rates, tool distribution, proficiency gain (early vs late error rate delta) 2. Pre-seed sessions with successful tool-call examples? - PatternExtractor mines successful tool calls from SessionDB - build_warm_conversation() converts to conversation_history - Injected via existing run_conversation() parameter 3. Does context compression preserve proficiency? - analyze_compression_impact() compares parent vs child session error rates after compression events 4. A/B testing: warm vs cold comparison - compare_sessions() computes error rate improvement - profile action analyzes individual sessions - compare action runs A/B between two sessions agent/warm_session.py (678 lines): - SessionProfile, WarmPattern, WarmSessionTemplate dataclasses - profile_session() — reliability analysis - extract_patterns_from_session() — mines successful patterns - extract_from_session_db() — batch extraction from marathon sessions - build_warm_conversation() — conversation_history builder - analyze_compression_impact() — compression preservation test - compare_sessions() — A/B comparison - save/load/list templates tools/warm_session_tool.py (275 lines): 7 actions: build, list, load, delete, profile, compress-check, compare 25 tests added, all passing. Closes #327
2026-04-13 20:19:58 -04:00
3 changed files with 1291 additions and 0 deletions
--- a/agent/warm_session.py
+++ b/agent/warm_session.py
@@ -0,0 +1,678 @@
+"""Warm Session Provisioning v2 — pre-proficient agent sessions.
+
+Marathon sessions (100+ msgs) have lower per-tool error rates because
+agents accumulate successful patterns and context. This module provides
+infrastructure to capture that proficiency and pre-seed new sessions.
+
+Addresses all acceptance criteria from #327:
+1. What makes marathon sessions reliable? → pattern extraction + analysis
+2. Pre-seed with successful tool-call examples → conversation_history injection
+3. Context compression preservation → compressed_session support
+4. A/B testing → warm vs cold comparison infrastructure
+
+Architecture:
+  - SessionProfiler: analyzes session reliability metrics
+  - PatternExtractor: mines successful tool-call sequences
+  - WarmSessionTemplate: holds patterns + metadata
+  - CompressionAnalyzer: tests if compression preserves proficiency
+  - ABTester: compares warm vs cold session performance
+"""
+
+import json
+import logging
+import time
+from collections import defaultdict
+from dataclasses import dataclass, field, asdict
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+from hermes_constants import get_hermes_home
+
+logger = logging.getLogger(__name__)
+
+TEMPLATES_DIR = get_hermes_home() / "warm_sessions"
+
+
+# ---------------------------------------------------------------------------
+# Data structures
+# ---------------------------------------------------------------------------
+
+@dataclass
+class ToolCallOutcome:
+    """A single tool call with its context and outcome."""
+    tool_name: str
+    arguments: Dict[str, Any]
+    result_success: bool
+    result_error: Optional[str] = None
+    result_summary: str = ""
+    session_position: int = 0  # which turn in the session (0-indexed)
+    context_tokens: int = 0  # approximate context size at this point
+
+
+@dataclass
+class SessionProfile:
+    """Analysis of a single session's reliability patterns."""
+    session_id: str
+    message_count: int
+    tool_call_count: int
+    successful_calls: int
+    failed_calls: int
+    error_rate: float
+    tool_distribution: Dict[str, int] = field(default_factory=dict)
+    tool_success_rates: Dict[str, float] = field(default_factory=dict)
+    early_error_rate: float = 0.0  # first 20% of calls
+    late_error_rate: float = 0.0   # last 20% of calls
+    proficiency_gain: float = 0.0  # late_error_rate - early_error_rate (negative = improvement)
+    dominant_tool_type: str = ""  # code, file, research, terminal
+
+
+@dataclass
+class WarmPattern:
+    """A successful tool-call pattern with context."""
+    tool_name: str
+    arguments: Dict[str, Any]
+    result_summary: str
+    preceding_context: str = ""  # what the user/agent said before this call
+    pattern_type: str = ""  # "init", "sequence", "retry", "final"
+    success_count: int = 1
+    session_types: List[str] = field(default_factory=list)  # which session types this appeared in
+
+
+@dataclass
+class WarmSessionTemplate:
+    """A template for pre-seeding proficient sessions."""
+    name: str
+    description: str
+    patterns: List[WarmPattern] = field(default_factory=list)
+    system_prompt_addendum: str = ""
+    tags: List[str] = field(default_factory=list)
+    source_session_ids: List[str] = field(default_factory=list)
+    created_at: float = 0
+    version: int = 2
+    metrics: Dict[str, Any] = field(default_factory=dict)  # extraction metrics
+
+    def __post_init__(self):
+        if not self.created_at:
+            self.created_at = time.time()
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "WarmSessionTemplate":
+        patterns = [
+            WarmPattern(**p) if isinstance(p, dict) else p
+            for p in data.get("patterns", [])
+        ]
+        return cls(
+            name=data["name"],
+            description=data.get("description", ""),
+            patterns=patterns,
+            system_prompt_addendum=data.get("system_prompt_addendum", ""),
+            tags=data.get("tags", []),
+            source_session_ids=data.get("source_session_ids", []),
+            created_at=data.get("created_at", 0),
+            version=data.get("version", 2),
+            metrics=data.get("metrics", {}),
+        )
+
+
+# ---------------------------------------------------------------------------
+# Session Profiler — analyzes why marathon sessions are more reliable
+# ---------------------------------------------------------------------------
+
+# Tools that are "trivial" and shouldn't be included in patterns
+_TRIVIAL_TOOLS = frozenset({
+    "clarify", "memory", "fact_store", "fact_feedback",
+    "session_search", "skill_view", "skills_list",
+})
+
+# Tool type classification
+_TOOL_TYPES = {
+    "terminal": "terminal",
+    "execute_code": "code",
+    "read_file": "file",
+    "write_file": "file",
+    "patch": "file",
+    "search_files": "file",
+    "web_search": "research",
+    "web_extract": "research",
+    "browser": "research",
+    "skill_manage": "code",
+    "warm_session": "meta",
+}
+
+
+def classify_tool_type(tool_name: str) -> str:
+    """Classify a tool into a broad category."""
+    return _TOOL_TYPES.get(tool_name, "general")
+
+
+def profile_session(messages: List[Dict[str, Any]], session_id: str = "") -> SessionProfile:
+    """Analyze a session's reliability patterns.
+
+    Examines tool call outcomes across the session to determine if
+    the agent improved with experience (lower error rate later).
+    """
+    tool_outcomes: List[ToolCallOutcome] = []
+
+    for i, msg in enumerate(messages):
+        if msg.get("role") != "assistant":
+            continue
+
+        tool_calls_raw = msg.get("tool_calls")
+        if not tool_calls_raw:
+            continue
+
+        try:
+            tool_calls = json.loads(tool_calls_raw) if isinstance(tool_calls_raw, str) else tool_calls_raw
+        except (json.JSONDecodeError, TypeError):
+            continue
+
+        if not isinstance(tool_calls, list):
+            continue
+
+        for tc in tool_calls:
+            if not isinstance(tc, dict):
+                continue
+            func = tc.get("function", {})
+            tool_name = func.get("name", "")
+            if not tool_name or tool_name in _TRIVIAL_TOOLS:
+                continue
+
+            try:
+                arguments = json.loads(func.get("arguments", "{}"))
+            except (json.JSONDecodeError, TypeError):
+                arguments = {}
+
+            # Find the corresponding tool result
+            tc_id = tc.get("id", "")
+            result_msg = None
+            for subsequent in messages[i+1:i+5]:  # look ahead a few messages
+                if subsequent.get("role") == "tool" and subsequent.get("tool_call_id") == tc_id:
+                    result_msg = subsequent
+                    break
+
+            result_content = result_msg.get("content", "") if result_msg else ""
+            # Heuristic: if result contains error indicators, it failed
+            result_success = not any(err in str(result_content).lower() for err in [
+                "error", "failed", "exception", "traceback", "denied", "not found",
+            ])
+
+            tool_outcomes.append(ToolCallOutcome(
+                tool_name=tool_name,
+                arguments=arguments,
+                result_success=result_success,
+                result_summary=str(result_content)[:500] if result_content else "",
+                session_position=i,
+            ))
+
+    if not tool_outcomes:
+        return SessionProfile(
+            session_id=session_id,
+            message_count=len(messages),
+            tool_call_count=0,
+            successful_calls=0,
+            failed_calls=0,
+            error_rate=0.0,
+        )
+
+    # Calculate metrics
+    total = len(tool_outcomes)
+    successful = sum(1 for o in tool_outcomes if o.result_success)
+    failed = total - successful
+    error_rate = failed / total if total > 0 else 0.0
+
+    # Tool distribution
+    tool_dist: Dict[str, int] = defaultdict(int)
+    tool_success: Dict[str, List[bool]] = defaultdict(list)
+    for outcome in tool_outcomes:
+        tool_dist[outcome.tool_name] += 1
+        tool_success[outcome.tool_name].append(outcome.result_success)
+
+    tool_success_rates = {
+        name: sum(outcomes) / len(outcomes) if outcomes else 0.0
+        for name, outcomes in tool_success.items()
+    }
+
+    # Early vs late error rates (proficiency gain)
+    split_point = max(1, total // 5)  # first 20%
+    early = tool_outcomes[:split_point]
+    late = tool_outcomes[-split_point:]
+
+    early_errors = sum(1 for o in early if not o.result_success) / len(early) if early else 0
+    late_errors = sum(1 for o in late if not o.result_success) / len(late) if late else 0
+    proficiency_gain = late_errors - early_errors  # negative = improvement
+
+    # Dominant tool type
+    type_counts: Dict[str, int] = defaultdict(int)
+    for outcome in tool_outcomes:
+        type_counts[classify_tool_type(outcome.tool_name)] += 1
+    dominant = max(type_counts.items(), key=lambda x: x[1])[0] if type_counts else "general"
+
+    return SessionProfile(
+        session_id=session_id,
+        message_count=len(messages),
+        tool_call_count=total,
+        successful_calls=successful,
+        failed_calls=failed,
+        error_rate=error_rate,
+        tool_distribution=dict(tool_dist),
+        tool_success_rates=tool_success_rates,
+        early_error_rate=early_errors,
+        late_error_rate=late_errors,
+        proficiency_gain=proficiency_gain,
+        dominant_tool_type=dominant,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Pattern Extractor — mines successful tool-call sequences
+# ---------------------------------------------------------------------------
+
+def extract_patterns_from_session(
+    messages: List[Dict[str, Any]],
+    min_success_rate: float = 0.8,
+) -> List[WarmPattern]:
+    """Extract successful patterns from a single session.
+
+    Only includes tools that succeeded, with their arguments and
+    result summaries as reusable patterns.
+    """
+    patterns: List[WarmPattern] = []
+
+    for i, msg in enumerate(messages):
+        if msg.get("role") != "assistant":
+            continue
+
+        tool_calls_raw = msg.get("tool_calls")
+        if not tool_calls_raw:
+            continue
+
+        try:
+            tool_calls = json.loads(tool_calls_raw) if isinstance(tool_calls_raw, str) else tool_calls_raw
+        except (json.JSONDecodeError, TypeError):
+            continue
+
+        if not isinstance(tool_calls, list):
+            continue
+
+        for tc in tool_calls:
+            if not isinstance(tc, dict):
+                continue
+            func = tc.get("function", {})
+            tool_name = func.get("name", "")
+            if not tool_name or tool_name in _TRIVIAL_TOOLS:
+                continue
+
+            try:
+                arguments = json.loads(func.get("arguments", "{}"))
+            except (json.JSONDecodeError, TypeError):
+                continue
+
+            # Find the result
+            tc_id = tc.get("id", "")
+            result_content = ""
+            result_success = False
+            for subsequent in messages[i+1:i+5]:
+                if subsequent.get("role") == "tool" and subsequent.get("tool_call_id") == tc_id:
+                    result_content = str(subsequent.get("content", ""))
+                    result_success = not any(err in result_content.lower() for err in [
+                        "error", "failed", "exception", "traceback", "denied",
+                    ])
+                    break
+
+            if not result_success:
+                continue  # only capture successful patterns
+
+            # Get preceding context
+            preceding = ""
+            if i > 0:
+                prev = messages[i-1]
+                if prev.get("role") == "user":
+                    preceding = str(prev.get("content", ""))[:200]
+
+            patterns.append(WarmPattern(
+                tool_name=tool_name,
+                arguments=arguments,
+                result_summary=result_content[:500],
+                preceding_context=preceding,
+                pattern_type="sequence",
+            ))
+
+    return patterns
+
+
+def extract_from_session_db(
+    session_db,
+    min_messages: int = 30,
+    max_sessions: int = 50,
+    source_filter: str = None,
+) -> Tuple[List[WarmPattern], Dict[str, Any]]:
+    """Mine patterns from marathon sessions in the SessionDB.
+
+    Returns (patterns, metrics) where metrics tracks extraction stats.
+    """
+    all_patterns: List[WarmPattern] = []
+    metrics = {
+        "sessions_scanned": 0,
+        "sessions_qualified": 0,
+        "total_patterns": 0,
+        "tool_distribution": defaultdict(int),
+        "avg_proficiency_gain": 0.0,
+    }
+
+    try:
+        sessions = session_db.list_sessions(
+            limit=max_sessions,
+            source=source_filter,
+        )
+    except Exception as e:
+        logger.warning("Failed to list sessions: %s", e)
+        return all_patterns, metrics
+
+    proficiency_gains: List[float] = []
+
+    for session_meta in sessions:
+        session_id = session_meta.get("id") or session_meta.get("session_id")
+        if not session_id:
+            continue
+
+        msg_count = session_meta.get("message_count", 0)
+        if msg_count < min_messages:
+            continue
+
+        end_reason = session_meta.get("end_reason", "")
+        if end_reason and end_reason not in ("completed", "user_exit", "compression"):
+            continue
+
+        metrics["sessions_scanned"] += 1
+
+        try:
+            messages = session_db.get_messages(session_id)
+        except Exception:
+            continue
+
+        # Profile the session
+        profile = profile_session(messages, session_id)
+        if profile.error_rate > 0.5:  # skip very error-prone sessions
+            continue
+
+        metrics["sessions_qualified"] += 1
+        proficiency_gains.append(profile.proficiency_gain)
+
+        # Extract patterns
+        patterns = extract_patterns_from_session(messages)
+        for p in patterns:
+            p.session_types.append(profile.dominant_tool_type)
+        all_patterns.extend(patterns)
+
+        for p in patterns:
+            metrics["tool_distribution"][p.tool_name] += 1
+
+    metrics["total_patterns"] = len(all_patterns)
+    metrics["avg_proficiency_gain"] = (
+        sum(proficiency_gains) / len(proficiency_gains) if proficiency_gains else 0.0
+    )
+
+    return all_patterns, dict(metrics)
+
+
+# ---------------------------------------------------------------------------
+# Conversation Builder — converts patterns to conversation_history
+# ---------------------------------------------------------------------------
+
+def build_warm_conversation(
+    template: WarmSessionTemplate,
+    max_patterns: int = 15,
+) -> List[Dict[str, Any]]:
+    """Convert template patterns into conversation_history messages.
+
+    Produces a synthetic conversation that demonstrates successful
+    tool-calling patterns, priming the agent with experience.
+    """
+    messages: List[Dict[str, Any]] = []
+
+    if template.system_prompt_addendum:
+        messages.append({
+            "role": "system",
+            "content": (
+                f"[WARM SESSION] The following patterns come from experienced, "
+                f"successful sessions. They demonstrate effective tool usage. "
+                f"Use them as reference for structuring your own tool calls.\n\n"
+                f"{template.system_prompt_addendum}"
+            ),
+        })
+
+    patterns = template.patterns[:max_patterns]
+    for i, pattern in enumerate(patterns):
+        # User turn describing intent
+        user_content = pattern.preceding_context or f"[Pattern {i+1}] Demonstrate {pattern.tool_name} usage."
+        messages.append({"role": "user", "content": user_content})
+
+        # Assistant turn with the tool call
+        tool_call_id = f"warm_{i}_{pattern.tool_name}"
+        messages.append({
+            "role": "assistant",
+            "content": None,
+            "tool_calls": [{
+                "id": tool_call_id,
+                "type": "function",
+                "function": {
+                    "name": pattern.tool_name,
+                    "arguments": json.dumps(pattern.arguments, ensure_ascii=False),
+                },
+            }],
+        })
+
+        # Tool result
+        messages.append({
+            "role": "tool",
+            "tool_call_id": tool_call_id,
+            "content": pattern.result_summary or f"Tool {pattern.tool_name} executed successfully.",
+        })
+
+    return messages
+
+
+# ---------------------------------------------------------------------------
+# Compression Analyzer — tests if compression preserves proficiency
+# ---------------------------------------------------------------------------
+
+def analyze_compression_impact(
+    session_db,
+    session_id: str,
+) -> Dict[str, Any]:
+    """Analyze whether context compression preserves agent proficiency.
+
+    Compares the error rates before and after compression events in a session.
+    Compression creates a new session_id (parent → child chain).
+    """
+    result = {
+        "session_id": session_id,
+        "has_compression": False,
+        "pre_compression_profile": None,
+        "post_compression_profile": None,
+        "proficiency_preserved": None,
+    }
+
+    try:
+        messages = session_db.get_messages(session_id)
+    except Exception:
+        return result
+
+    # Check if this session was the result of compression
+    try:
+        session_meta = session_db.get_session(session_id)
+        parent_id = session_meta.get("parent_session_id") if session_meta else None
+    except Exception:
+        parent_id = None
+
+    if not parent_id:
+        return result
+
+    result["has_compression"] = True
+
+    # Profile parent (pre-compression)
+    try:
+        parent_messages = session_db.get_messages(parent_id)
+        pre_profile = profile_session(parent_messages, parent_id)
+        result["pre_compression_profile"] = {
+            "error_rate": pre_profile.error_rate,
+            "tool_call_count": pre_profile.tool_call_count,
+            "proficiency_gain": pre_profile.proficiency_gain,
+        }
+    except Exception:
+        pass
+
+    # Profile current (post-compression)
+    post_profile = profile_session(messages, session_id)
+    result["post_compression_profile"] = {
+        "error_rate": post_profile.error_rate,
+        "tool_call_count": post_profile.tool_call_count,
+        "proficiency_gain": post_profile.proficiency_gain,
+    }
+
+    # Determine if proficiency was preserved
+    if result["pre_compression_profile"]:
+        pre_rate = result["pre_compression_profile"]["error_rate"]
+        post_rate = result["post_compression_profile"]["error_rate"]
+        # Proficiency preserved if post-compression error rate isn't significantly worse
+        result["proficiency_preserved"] = post_rate <= pre_rate * 1.2  # 20% tolerance
+
+    return result
+
+
+# ---------------------------------------------------------------------------
+# A/B Testing — warm vs cold session comparison
+# ---------------------------------------------------------------------------
+
+@dataclass
+class ABTestResult:
+    """Result of comparing warm vs cold session performance."""
+    test_name: str
+    warm_session_errors: int
+    warm_session_total: int
+    cold_session_errors: int
+    cold_session_total: int
+    warm_error_rate: float
+    cold_error_rate: float
+    improvement: float  # positive = warm is better
+    warm_session_id: str = ""
+    cold_session_id: str = ""
+
+
+def compare_sessions(
+    warm_profile: SessionProfile,
+    cold_profile: SessionProfile,
+    test_name: str = "",
+) -> ABTestResult:
+    """Compare warm vs cold session performance."""
+    warm_rate = warm_profile.error_rate
+    cold_rate = cold_profile.error_rate
+    improvement = cold_rate - warm_rate  # positive means warm is better
+
+    return ABTestResult(
+        test_name=test_name,
+        warm_session_errors=warm_profile.failed_calls,
+        warm_session_total=warm_profile.tool_call_count,
+        cold_session_errors=cold_profile.failed_calls,
+        cold_session_total=cold_profile.tool_call_count,
+        warm_error_rate=warm_rate,
+        cold_error_rate=cold_rate,
+        improvement=improvement,
+        warm_session_id=warm_profile.session_id,
+        cold_session_id=cold_profile.session_id,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Persistence
+# ---------------------------------------------------------------------------
+
+def save_template(template: WarmSessionTemplate) -> Path:
+    """Save a warm session template to disk."""
+    TEMPLATES_DIR.mkdir(parents=True, exist_ok=True)
+    path = TEMPLATES_DIR / f"{template.name}.json"
+    path.write_text(json.dumps(template.to_dict(), indent=2, ensure_ascii=False))
+    logger.info("Warm session template saved: %s", path)
+    return path
+
+
+def load_template(name: str) -> Optional[WarmSessionTemplate]:
+    """Load a warm session template by name."""
+    path = TEMPLATES_DIR / f"{name}.json"
+    if not path.exists():
+        return None
+    try:
+        data = json.loads(path.read_text())
+        return WarmSessionTemplate.from_dict(data)
+    except Exception as e:
+        logger.warning("Failed to load warm session template '%s': %s", name, e)
+        return None
+
+
+def list_templates() -> List[Dict[str, Any]]:
+    """List all saved warm session templates."""
+    if not TEMPLATES_DIR.exists():
+        return []
+
+    templates = []
+    for path in sorted(TEMPLATES_DIR.glob("*.json")):
+        try:
+            data = json.loads(path.read_text())
+            templates.append({
+                "name": data.get("name", path.stem),
+                "description": data.get("description", ""),
+                "tags": data.get("tags", []),
+                "pattern_count": len(data.get("patterns", [])),
+                "created_at": data.get("created_at", 0),
+                "version": data.get("version", 1),
+            })
+        except Exception:
+            continue
+    return templates
+
+
+def build_and_save(
+    session_db,
+    name: str,
+    description: str = "",
+    min_messages: int = 30,
+    max_sessions: int = 30,
+    source_filter: str = None,
+    tags: List[str] = None,
+) -> Tuple[WarmSessionTemplate, Dict[str, Any]]:
+    """One-shot: mine sessions, build template, save it.
+
+    Returns (template, extraction_metrics).
+    """
+    patterns, metrics = extract_from_session_db(
+        session_db,
+        min_messages=min_messages,
+        max_sessions=max_sessions,
+        source_filter=source_filter,
+    )
+
+    # Deduplicate patterns by (tool_name, arguments)
+    seen = set()
+    unique_patterns = []
+    for p in patterns:
+        key = (p.tool_name, json.dumps(p.arguments, sort_keys=True))
+        if key not in seen:
+            seen.add(key)
+            unique_patterns.append(p)
+
+    template = WarmSessionTemplate(
+        name=name,
+        description=description or f"Auto-generated from {metrics['sessions_qualified']} sessions",
+        patterns=unique_patterns,
+        tags=tags or [],
+        source_session_ids=[],
+        metrics=metrics,
+    )
+
+    if unique_patterns:
+        save_template(template)
+
+    return template, metrics
--- a/tests/agent/test_warm_session.py
+++ b/tests/agent/test_warm_session.py
@@ -0,0 +1,338 @@
+"""Tests for warm session provisioning v2 (#327)."""
+
+import json
+import time
+from collections import defaultdict
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from agent.warm_session import (
+    SessionProfile,
+    WarmPattern,
+    WarmSessionTemplate,
+    ToolCallOutcome,
+    classify_tool_type,
+    profile_session,
+    extract_patterns_from_session,
+    build_warm_conversation,
+    analyze_compression_impact,
+    compare_sessions,
+    save_template,
+    load_template,
+    list_templates,
+    _TRIVIAL_TOOLS,
+)
+
+
+@pytest.fixture()
+def isolated_templates_dir(tmp_path, monkeypatch):
+    """Point TEMPLATES_DIR at a temp directory."""
+    tdir = tmp_path / "warm_sessions"
+    tdir.mkdir()
+    monkeypatch.setattr("agent.warm_session.TEMPLATES_DIR", tdir)
+    return tdir
+
+
+def _make_messages(tool_calls_and_results):
+    """Helper to build message list from (tool_name, args, result, success) tuples."""
+    messages = []
+    for i, (tool_name, args, result, success) in enumerate(tool_calls_and_results):
+        tc_id = f"tc_{i}"
+        messages.append({
+            "role": "assistant",
+            "content": None,
+            "tool_calls": json.dumps([{
+                "id": tc_id,
+                "type": "function",
+                "function": {"name": tool_name, "arguments": json.dumps(args)},
+            }]),
+        })
+        error_words = "error failed" if not success else ""
+        messages.append({
+            "role": "tool",
+            "tool_call_id": tc_id,
+            "content": f"{result} {error_words}".strip(),
+        })
+    return messages
+
+
+# ---------------------------------------------------------------------------
+# Tool classification
+# ---------------------------------------------------------------------------
+
+class TestClassifyToolType:
+    def test_terminal(self):
+        assert classify_tool_type("terminal") == "terminal"
+
+    def test_code(self):
+        assert classify_tool_type("execute_code") == "code"
+
+    def test_file(self):
+        assert classify_tool_type("read_file") == "file"
+
+    def test_research(self):
+        assert classify_tool_type("web_search") == "research"
+
+    def test_unknown(self):
+        assert classify_tool_type("custom_tool") == "general"
+
+
+# ---------------------------------------------------------------------------
+# Session profiling
+# ---------------------------------------------------------------------------
+
+class TestProfileSession:
+    def test_empty_session(self):
+        profile = profile_session([], "s1")
+        assert profile.tool_call_count == 0
+        assert profile.error_rate == 0.0
+
+    def test_all_successful(self):
+        messages = _make_messages([
+            ("terminal", {"command": "ls"}, "file list", True),
+            ("read_file", {"path": "x.py"}, "code", True),
+            ("terminal", {"command": "pwd"}, "/home", True),
+        ])
+        profile = profile_session(messages, "s1")
+        assert profile.tool_call_count == 3
+        assert profile.successful_calls == 3
+        assert profile.error_rate == 0.0
+        assert profile.tool_distribution["terminal"] == 2
+
+    def test_mixed_success(self):
+        messages = _make_messages([
+            ("terminal", {"command": "ls"}, "ok", True),
+            ("terminal", {"command": "bad"}, "error!", False),
+            ("read_file", {"path": "x"}, "content", True),
+        ])
+        profile = profile_session(messages, "s1")
+        assert profile.tool_call_count == 3
+        assert profile.successful_calls == 2
+        assert abs(profile.error_rate - 0.333) < 0.01
+
+    def test_proficiency_gain_negative_means_improvement(self):
+        # Early errors, later success → negative proficiency_gain (improvement)
+        messages = _make_messages([
+            ("terminal", {"c": "1"}, "error!", False),  # early error
+            ("terminal", {"c": "2"}, "error!", False),  # early error
+            ("terminal", {"c": "3"}, "ok", True),
+            ("terminal", {"c": "4"}, "ok", True),
+            ("terminal", {"c": "5"}, "ok", True),
+            ("terminal", {"c": "6"}, "ok", True),
+            ("terminal", {"c": "7"}, "ok", True),
+            ("terminal", {"c": "8"}, "ok", True),
+            ("terminal", {"c": "9"}, "ok", True),
+            ("terminal", {"c": "10"}, "ok", True),  # late success
+        ])
+        profile = profile_session(messages, "s1")
+        assert profile.proficiency_gain < 0  # improvement
+
+    def test_skips_trivial_tools(self):
+        messages = _make_messages([
+            ("clarify", {"question": "what?"}, "answer", True),
+            ("terminal", {"command": "ls"}, "ok", True),
+        ])
+        profile = profile_session(messages, "s1")
+        assert profile.tool_call_count == 1  # clarify skipped
+        assert profile.tool_distribution.get("clarify", 0) == 0
+
+
+# ---------------------------------------------------------------------------
+# Pattern extraction
+# ---------------------------------------------------------------------------
+
+class TestExtractPatterns:
+    def test_extracts_successful_only(self):
+        messages = _make_messages([
+            ("terminal", {"command": "ls"}, "file list", True),
+            ("read_file", {"path": "bad"}, "error!", False),  # skip
+            ("search_files", {"pattern": "import"}, "matches", True),
+        ])
+        patterns = extract_patterns_from_session(messages)
+        assert len(patterns) == 2
+        assert patterns[0].tool_name == "terminal"
+        assert patterns[1].tool_name == "search_files"
+
+    def test_includes_preceding_context(self):
+        messages = [
+            {"role": "user", "content": "List the files please"},
+        ]
+        messages.extend(_make_messages([
+            ("terminal", {"command": "ls"}, "files", True),
+        ]))
+        patterns = extract_patterns_from_session(messages)
+        assert len(patterns) == 1
+        assert "List the files" in patterns[0].preceding_context
+
+    def test_skips_trivial_tools(self):
+        messages = _make_messages([
+            ("memory", {"action": "add"}, "ok", True),
+            ("terminal", {"command": "ls"}, "ok", True),
+        ])
+        patterns = extract_patterns_from_session(messages)
+        assert len(patterns) == 1
+        assert patterns[0].tool_name == "terminal"
+
+
+# ---------------------------------------------------------------------------
+# Warm conversation builder
+# ---------------------------------------------------------------------------
+
+class TestBuildWarmConversation:
+    def test_basic_conversation(self):
+        template = WarmSessionTemplate(
+            name="test",
+            description="test",
+            patterns=[
+                WarmPattern(tool_name="terminal", arguments={"command": "ls"}, result_summary="files"),
+                WarmPattern(tool_name="read_file", arguments={"path": "x"}, result_summary="content"),
+            ],
+        )
+        messages = build_warm_conversation(template)
+        # 2 patterns * 3 messages each = 6
+        assert len(messages) == 6
+
+    def test_message_roles(self):
+        template = WarmSessionTemplate(
+            name="test",
+            description="test",
+            patterns=[WarmPattern(tool_name="terminal", arguments={"c": "pwd"}, result_summary="/home")],
+        )
+        messages = build_warm_conversation(template)
+        assert messages[0]["role"] == "user"
+        assert messages[1]["role"] == "assistant"
+        assert messages[1]["tool_calls"][0]["function"]["name"] == "terminal"
+        assert messages[2]["role"] == "tool"
+        assert messages[2]["tool_call_id"] == messages[1]["tool_calls"][0]["id"]
+
+    def test_max_patterns_limit(self):
+        patterns = [
+            WarmPattern(tool_name=f"tool_{i}", arguments={}, result_summary=f"r{i}")
+            for i in range(20)
+        ]
+        template = WarmSessionTemplate(name="test", description="test", patterns=patterns)
+        messages = build_warm_conversation(template, max_patterns=3)
+        assert len(messages) == 9  # 3 * 3
+
+    def test_system_prompt_addendum(self):
+        template = WarmSessionTemplate(
+            name="test",
+            description="test",
+            patterns=[],
+            system_prompt_addendum="Use Python 3.12",
+        )
+        messages = build_warm_conversation(template)
+        assert len(messages) == 1
+        assert messages[0]["role"] == "system"
+        assert "Python 3.12" in messages[0]["content"]
+
+
+# ---------------------------------------------------------------------------
+# Compression analysis
+# ---------------------------------------------------------------------------
+
+class TestCompressionAnalysis:
+    def test_no_compression(self):
+        db = MagicMock()
+        db.get_session.return_value = {"parent_session_id": None}
+        result = analyze_compression_impact(db, "s1")
+        assert result["has_compression"] is False
+
+    def test_with_compression(self):
+        db = MagicMock()
+        db.get_session.return_value = {"parent_session_id": "parent_s1"}
+        # Parent: all success
+        parent_msgs = _make_messages([
+            ("terminal", {"c": "ls"}, "ok", True),
+            ("terminal", {"c": "pwd"}, "/home", True),
+        ])
+        # Child: one error
+        child_msgs = _make_messages([
+            ("terminal", {"c": "bad"}, "error!", False),
+            ("terminal", {"c": "ls"}, "ok", True),
+        ])
+        db.get_messages.side_effect = lambda sid: parent_msgs if sid == "parent_s1" else child_msgs
+
+        result = analyze_compression_impact(db, "child_s1")
+        assert result["has_compression"] is True
+        assert result["proficiency_preserved"] is False  # error rate went up
+
+
+# ---------------------------------------------------------------------------
+# A/B comparison
+# ---------------------------------------------------------------------------
+
+class TestCompareSessions:
+    def test_warm_better(self):
+        warm = SessionProfile(session_id="w", message_count=10, tool_call_count=10,
+                              successful_calls=9, failed_calls=1, error_rate=0.1)
+        cold = SessionProfile(session_id="c", message_count=10, tool_call_count=10,
+                              successful_calls=7, failed_calls=3, error_rate=0.3)
+        result = compare_sessions(warm, cold)
+        assert result.improvement > 0  # warm is better
+        assert result.warm_error_rate == 0.1
+        assert result.cold_error_rate == 0.3
+
+
+# ---------------------------------------------------------------------------
+# Persistence
+# ---------------------------------------------------------------------------
+
+class TestPersistence:
+    def test_save_and_load(self, isolated_templates_dir):
+        template = WarmSessionTemplate(
+            name="persist-test",
+            description="test persistence",
+            patterns=[WarmPattern(tool_name="t", arguments={}, result_summary="r")],
+        )
+        save_template(template)
+        loaded = load_template("persist-test")
+        assert loaded is not None
+        assert loaded.name == "persist-test"
+        assert len(loaded.patterns) == 1
+
+    def test_load_nonexistent(self, isolated_templates_dir):
+        assert load_template("nope") is None
+
+    def test_list_templates(self, isolated_templates_dir):
+        t1 = WarmSessionTemplate(name="a", description="a", patterns=[])
+        t2 = WarmSessionTemplate(name="b", description="b", patterns=[
+            WarmPattern(tool_name="t", arguments={}, result_summary="r"),
+        ])
+        save_template(t1)
+        save_template(t2)
+        templates = list_templates()
+        assert len(templates) == 2
+        names = {t["name"] for t in templates}
+        assert names == {"a", "b"}
+
+    def test_list_empty(self, isolated_templates_dir):
+        assert list_templates() == []
+
+
+# ---------------------------------------------------------------------------
+# SessionDB extraction (mocked)
+# ---------------------------------------------------------------------------
+
+class TestExtractFromDB:
+    def test_extracts_from_qualifying_sessions(self):
+        from agent.warm_session import extract_from_session_db
+
+        db = MagicMock()
+        db.list_sessions.return_value = [
+            {"id": "s1", "message_count": 50, "end_reason": "completed"},
+            {"id": "s2", "message_count": 10, "end_reason": "completed"},  # too short
+            {"id": "s3", "message_count": 40, "end_reason": "error"},  # wrong end reason
+        ]
+
+        good_msgs = _make_messages([
+            ("terminal", {"c": "ls"}, "ok", True),
+            ("read_file", {"p": "x"}, "content", True),
+        ])
+        db.get_messages.return_value = good_msgs
+
+        patterns, metrics = extract_from_session_db(db, min_messages=20)
+        assert metrics["sessions_scanned"] == 1  # only s1 qualifies
+        assert metrics["sessions_qualified"] == 1
+        assert len(patterns) >= 0
--- a/tools/warm_session_tool.py
+++ b/tools/warm_session_tool.py
@@ -0,0 +1,275 @@
+"""Warm Session Tool v2 — manage pre-proficient agent sessions.
+
+Provides build/list/load/delete/compress-analyze/compare actions
+for warm session provisioning.
+"""
+
+import json
+import logging
+from typing import Optional
+
+from tools.registry import registry
+
+logger = logging.getLogger(__name__)
+
+
+def warm_session(
+    action: str,
+    name: str = None,
+    description: str = "",
+    min_messages: int = 30,
+    max_sessions: int = 30,
+    source_filter: str = None,
+    tags: list = None,
+    session_id: str = None,
+    compare_with: str = None,
+) -> str:
+    """Manage warm session templates.
+
+    Actions:
+      build          — mine existing sessions, create template
+      list           — show saved templates
+      load           — get conversation_history from a template
+      delete         — remove a template
+      profile        — analyze a session's reliability patterns
+      compress-check — test if compression preserved proficiency
+      compare        — compare two sessions' error rates (A/B)
+    """
+    from agent.warm_session import (
+        build_and_save,
+        load_template,
+        list_templates,
+        build_warm_conversation,
+        profile_session,
+        analyze_compression_impact,
+        compare_sessions,
+        TEMPLATES_DIR,
+    )
+
+    if action == "list":
+        templates = list_templates()
+        return json.dumps({
+            "success": True,
+            "templates": templates,
+            "count": len(templates),
+        })
+
+    if action == "build":
+        if not name:
+            return json.dumps({"success": False, "error": "name is required for 'build'."})
+
+        try:
+            from hermes_state import SessionDB
+            db = SessionDB()
+        except Exception as e:
+            return json.dumps({"success": False, "error": f"Cannot open session DB: {e}"})
+
+        template, metrics = build_and_save(
+            db,
+            name=name,
+            description=description,
+            min_messages=min_messages,
+            max_sessions=max_sessions,
+            source_filter=source_filter,
+            tags=tags or [],
+        )
+
+        return json.dumps({
+            "success": True,
+            "name": template.name,
+            "pattern_count": len(template.patterns),
+            "description": template.description,
+            "metrics": {
+                "sessions_scanned": metrics.get("sessions_scanned", 0),
+                "sessions_qualified": metrics.get("sessions_qualified", 0),
+                "avg_proficiency_gain": round(metrics.get("avg_proficiency_gain", 0), 3),
+            },
+        })
+
+    if action == "load":
+        if not name:
+            return json.dumps({"success": False, "error": "name is required for 'load'."})
+
+        template = load_template(name)
+        if not template:
+            return json.dumps({"success": False, "error": f"Template '{name}' not found."})
+
+        conversation = build_warm_conversation(template)
+        return json.dumps({
+            "success": True,
+            "name": template.name,
+            "message_count": len(conversation),
+            "pattern_count": len(template.patterns),
+            "conversation_preview": [
+                {"role": m["role"], "content_preview": str(m.get("content", ""))[:100]}
+                for m in conversation[:6]
+            ],
+        })
+
+    if action == "delete":
+        if not name:
+            return json.dumps({"success": False, "error": "name is required for 'delete'."})
+
+        path = TEMPLATES_DIR / f"{name}.json"
+        if not path.exists():
+            return json.dumps({"success": False, "error": f"Template '{name}' not found."})
+
+        path.unlink()
+        return json.dumps({"success": True, "message": f"Template '{name}' deleted."})
+
+    if action == "profile":
+        if not session_id:
+            return json.dumps({"success": False, "error": "session_id is required for 'profile'."})
+
+        try:
+            from hermes_state import SessionDB
+            db = SessionDB()
+            messages = db.get_messages(session_id)
+        except Exception as e:
+            return json.dumps({"success": False, "error": f"Cannot load session: {e}"})
+
+        from agent.warm_session import profile_session as _profile
+        profile = _profile(messages, session_id)
+        return json.dumps({
+            "success": True,
+            "session_id": profile.session_id,
+            "message_count": profile.message_count,
+            "tool_call_count": profile.tool_call_count,
+            "error_rate": round(profile.error_rate, 3),
+            "proficiency_gain": round(profile.proficiency_gain, 3),
+            "dominant_tool_type": profile.dominant_tool_type,
+            "tool_success_rates": {
+                k: round(v, 3) for k, v in profile.tool_success_rates.items()
+            },
+        })
+
+    if action == "compress-check":
+        if not session_id:
+            return json.dumps({"success": False, "error": "session_id is required for 'compress-check'."})
+
+        try:
+            from hermes_state import SessionDB
+            db = SessionDB()
+        except Exception as e:
+            return json.dumps({"success": False, "error": f"Cannot open session DB: {e}"})
+
+        result = analyze_compression_impact(db, session_id)
+        return json.dumps({
+            "success": True,
+            **result,
+        })
+
+    if action == "compare":
+        if not session_id or not compare_with:
+            return json.dumps({
+                "success": False,
+                "error": "Both session_id and compare_with are required for 'compare'.",
+            })
+
+        try:
+            from hermes_state import SessionDB
+            db = SessionDB()
+            warm_msgs = db.get_messages(session_id)
+            cold_msgs = db.get_messages(compare_with)
+        except Exception as e:
+            return json.dumps({"success": False, "error": f"Cannot load sessions: {e}"})
+
+        from agent.warm_session import profile_session as _profile, compare_sessions as _compare
+        warm_profile = _profile(warm_msgs, session_id)
+        cold_profile = _profile(cold_msgs, compare_with)
+        result = _compare(warm_profile, cold_profile, test_name=f"{session_id} vs {compare_with}")
+
+        return json.dumps({
+            "success": True,
+            "test_name": result.test_name,
+            "warm_error_rate": round(result.warm_error_rate, 3),
+            "cold_error_rate": round(result.cold_error_rate, 3),
+            "improvement": round(result.improvement, 3),
+            "warm_better": result.improvement > 0,
+        })
+
+    return json.dumps({
+        "success": False,
+        "error": f"Unknown action '{action}'. Use: build, list, load, delete, profile, compress-check, compare",
+    })
+
+
+WARM_SESSION_SCHEMA = {
+    "name": "warm_session",
+    "description": (
+        "Manage warm session templates for pre-proficient agent sessions. "
+        "Marathon sessions have lower error rates because agents accumulate "
+        "successful patterns. This tool captures those patterns and can "
+        "pre-seed new sessions with experience.\n\n"
+        "Actions:\n"
+        "  build          — mine existing sessions for successful patterns, save as template\n"
+        "  list           — show saved templates\n"
+        "  load           — retrieve template's conversation history for injection\n"
+        "  delete         — remove a template\n"
+        "  profile        — analyze a session's reliability metrics\n"
+        "  compress-check — test if context compression preserved proficiency\n"
+        "  compare        — compare two sessions' error rates (A/B test)"
+    ),
+    "parameters": {
+        "type": "object",
+        "properties": {
+            "action": {
+                "type": "string",
+                "enum": ["build", "list", "load", "delete", "profile", "compress-check", "compare"],
+                "description": "The action to perform.",
+            },
+            "name": {
+                "type": "string",
+                "description": "Template name. Required for build/load/delete.",
+            },
+            "description": {
+                "type": "string",
+                "description": "Description for the template. Used with 'build'.",
+            },
+            "min_messages": {
+                "type": "integer",
+                "description": "Minimum messages for a session to qualify (default: 30).",
+            },
+            "max_sessions": {
+                "type": "integer",
+                "description": "Maximum sessions to scan (default: 30).",
+            },
+            "source_filter": {
+                "type": "string",
+                "description": "Filter sessions by source (cli, telegram, discord, etc.).",
+            },
+            "tags": {
+                "type": "array",
+                "items": {"type": "string"},
+                "description": "Tags for organizing templates.",
+            },
+            "session_id": {
+                "type": "string",
+                "description": "Session ID for profile/compress-check/compare actions.",
+            },
+            "compare_with": {
+                "type": "string",
+                "description": "Second session ID for compare action.",
+            },
+        },
+        "required": ["action"],
+    },
+}
+
+registry.register(
+    name="warm_session",
+    toolset="skills",
+    schema=WARM_SESSION_SCHEMA,
+    handler=lambda args, **kw: warm_session(
+        action=args.get("action", ""),
+        name=args.get("name"),
+        description=args.get("description", ""),
+        min_messages=args.get("min_messages", 30),
+        max_sessions=args.get("max_sessions", 30),
+        source_filter=args.get("source_filter"),
+        tags=args.get("tags"),
+        session_id=args.get("session_id"),
+        compare_with=args.get("compare_with"),
+    ),
+    emoji="🔥",
+)