feat: Phase 3.5 — DPO training pair generation from Deep Dive pipeline
Wire arXiv relevance filter output directly into DPO pair generation,
closing the loop between research synthesis and overnight training data.
New module: dpo_generator.py
- DPOPairGenerator class with 3 pair strategies:
* summarize: paper → fleet-grounded analysis (chosen) vs generic (rejected)
* relevance: 'what matters to Hermes?' → scored context vs vague
* implication: 'what should we do?' → actionable insight vs platitude
- Extracts synthesis excerpts matched to each ranked item
- Outputs to ~/.timmy/training-data/dpo-pairs/deepdive_{timestamp}.jsonl
- Format: {prompt, chosen, rejected, task_type, evidence_ids,
source_session, safety_flags, metadata}
Pipeline changes (pipeline.py):
- Import DPOPairGenerator with graceful degradation
- Initialize from config deepdive.training.dpo section
- Execute as Phase 3.5 between synthesis and audio
- DPO results included in pipeline return dict
- Wrapped in try/except — DPO failure never blocks delivery
Config changes (config.yaml):
- New deepdive.training.dpo section with:
enabled, output_dir, min_score, max_pairs_per_run, pair_types
Integration tested: 2 mock items × 3 pair types = 6 valid JSONL pairs.
Chosen responses consistently richer than rejected (assert-verified).
2026-04-13 02:24:04 +00:00
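The JSONL record format listed above can be illustrated with a minimal sketch. The field names follow the format in the commit message; the field values here are purely hypothetical, not taken from a real run:

```python
import json

# Hypothetical example of one exported DPO pair record.
# Field names match the documented format; values are illustrative only.
pair = {
    "prompt": "Summarize the following research paper ...",
    "chosen": "Fleet-grounded analysis with a relevance score ...",
    "rejected": "This paper presents research findings ...",
    "task_type": "summarize",
    "evidence_ids": ["a1b2c3d4e5f6"],
    "source_session": {"pipeline": "deepdive", "phase": "3.5_dpo"},
    "safety_flags": ["auto-generated", "deepdive-pipeline"],
    "metadata": {"score": 2.4},
}

# JSONL: one JSON object per line; a record round-trips losslessly.
line = json.dumps(pair)
assert json.loads(line) == pair
```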

#!/usr/bin/env python3
"""Deep Dive DPO Training Pair Generator — Phase 3.5

Transforms ranked research items + synthesis output into DPO preference
pairs for overnight Hermes training. Closes the loop between arXiv
intelligence gathering and sovereign model improvement.

Pair strategy:
1. summarize — "Summarize this paper" → fleet-grounded analysis (chosen) vs generic abstract (rejected)
2. relevance — "What's relevant to Hermes?" → scored relevance analysis (chosen) vs vague (rejected)
3. implication — "What are the implications?" → actionable insight (chosen) vs platitude (rejected)

Output format matches timmy-home training-data convention:
{"prompt", "chosen", "rejected", "source_session", "task_type", "evidence_ids", "safety_flags"}
"""

import hashlib
import json
import logging
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

feat: DPO pair quality validator — gate before overnight training
Add DPOQualityValidator that catches bad training pairs before they
enter the tightening loop. Wired into DPOPairGenerator between
generate() and export() as an automatic quality gate.
New module: dpo_quality.py
- 5 single-pair quality checks:
1. Field length minimums (prompt ≥40, chosen ≥80, rejected ≥30 chars)
2. Chosen/rejected length ratio (chosen must be ≥1.3x longer)
3. Chosen≈rejected similarity (Jaccard ≤0.70 — catches low-contrast)
4. Vocabulary diversity in chosen (unique word ratio ≥0.30)
5. Substance markers in chosen (≥2 fleet/training/action terms)
- 2 cross-pair quality checks:
6. Near-duplicate prompts within batch (Jaccard ≤0.85)
7. Cross-run dedup against recent JSONL history files
- Two modes: 'drop' (filter out bad pairs) or 'flag' (export with warning)
- BatchReport with per-pair diagnostics, pass rates, and warnings
- Standalone CLI: python3 dpo_quality.py <file.jsonl> [--strict] [--json]
Modified: dpo_generator.py
- Imports DPOQualityValidator with graceful degradation
- Initializes from config validation section (enabled by default)
- Validates between generate() and export() in run()
- Quality report included in pipeline result dict
- Validator failure never blocks — falls back to unvalidated export
Modified: config.yaml
- New deepdive.training.dpo.validation section with all tunable knobs:
enabled, flagged_pair_action, similarity thresholds, length minimums,
dedup_history_files
Integration tested — 6 test cases covering:
✓ Good pairs pass (3/3 accepted)
✓ Bad pairs caught: too-short, high-similarity, inverted signal (0/3)
✓ Near-duplicate prompt detection (1/2 deduped)
✓ Flag mode preserves pairs with warnings (3/3 flagged)
✓ Cross-run deduplication against history (1 dupe caught)
✓ Full generator→validator→export pipeline (6/6 validated)
2026-04-13 02:46:50 +00:00
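The similarity checks above (chosen≈rejected Jaccard ≤0.70, near-duplicate prompts ≤0.85) can be sketched with a word-level Jaccard similarity. This is an illustrative sketch of the metric only, not the validator's exact tokenization:

```python
def jaccard(a: str, b: str) -> float:
    """Word-level Jaccard similarity between two strings (0.0–1.0)."""
    wa, wb = set(a.lower().split()), set(b.lower().split())
    if not wa and not wb:
        return 0.0
    return len(wa & wb) / len(wa | wb)

# Low-contrast pair: chosen and rejected share almost all words,
# so similarity exceeds the 0.70 threshold and the pair would be caught.
chosen = "this paper is relevant to the fleet training loop"
rejected = "this paper is relevant to the fleet training queue"
assert jaccard(chosen, rejected) > 0.70

# Genuinely distinct texts stay under the threshold.
assert jaccard("grpo reward shaping", "jsonl export path") <= 0.70
```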

# Quality validation gate
try:
    from dpo_quality import DPOQualityValidator

    HAS_DPO_QUALITY = True
except ImportError:
    HAS_DPO_QUALITY = False
    DPOQualityValidator = None

logger = logging.getLogger("deepdive.dpo_generator")

@dataclass
class DPOPair:
    """Single DPO training pair."""

    prompt: str
    chosen: str
    rejected: str
    task_type: str
    evidence_ids: List[str] = field(default_factory=list)
    source_session: Dict[str, Any] = field(default_factory=dict)
    safety_flags: List[str] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        return {
            "prompt": self.prompt,
            "chosen": self.chosen,
            "rejected": self.rejected,
            "task_type": self.task_type,
            "evidence_ids": self.evidence_ids,
            "source_session": self.source_session,
            "safety_flags": self.safety_flags,
            "metadata": self.metadata,
        }

class DPOPairGenerator:
    """Generate DPO training pairs from Deep Dive pipeline output.

    Sits between Phase 3 (Synthesis) and Phase 4 (Audio) as Phase 3.5.
    Takes ranked items + synthesis briefing and produces training pairs
    that teach Hermes to produce fleet-grounded research analysis.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        cfg = config or {}
        self.output_dir = Path(
            cfg.get("output_dir", str(Path.home() / ".timmy" / "training-data" / "dpo-pairs"))
        )
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.min_score = cfg.get("min_score", 0.5)
        self.max_pairs_per_run = cfg.get("max_pairs_per_run", 30)
        self.pair_types = cfg.get("pair_types", ["summarize", "relevance", "implication"])

        # Quality validator
        self.validator = None
        validation_cfg = cfg.get("validation", {})
        if HAS_DPO_QUALITY and validation_cfg.get("enabled", True):
            self.validator = DPOQualityValidator(
                config=validation_cfg,
                output_dir=self.output_dir,
            )
            logger.info("DPO quality validator enabled")
        elif not HAS_DPO_QUALITY:
            logger.info("DPO quality validator not available (dpo_quality module not found)")
        else:
            logger.info("DPO quality validator disabled in config")

        logger.info(
            f"DPOPairGenerator: output_dir={self.output_dir}, "
            f"pair_types={self.pair_types}, max_pairs={self.max_pairs_per_run}"
        )

    def _content_hash(self, text: str) -> str:
        return hashlib.sha256(text.encode()).hexdigest()[:12]

    def _build_summarize_pair(self, item, score: float,
                              synthesis_excerpt: str) -> DPOPair:
        """Type 1: 'Summarize this paper' → fleet-grounded analysis vs generic abstract."""
        prompt = (
            f"Summarize the following research paper and explain its significance "
            f"for a team building sovereign LLM agents:\n\n"
            f"Title: {item.title}\n"
            f"Abstract: {item.summary[:500]}\n"
            f"Source: {item.source}\n"
            f"URL: {item.url}"
        )

        chosen = (
            f"{synthesis_excerpt}\n\n"
            f"Relevance score: {score:.2f}/5.0 — "
            f"This work directly impacts our agent architecture and training pipeline."
        )

        # Rejected: generic, unhelpful summary without fleet context
        rejected = (
            f"This paper titled \"{item.title}\" presents research findings in the area "
            f"of artificial intelligence. The authors discuss various methods and present "
            f"results. This may be of interest to researchers in the field."
        )

        return DPOPair(
            prompt=prompt,
            chosen=chosen,
            rejected=rejected,
            task_type="summarize",
            evidence_ids=[self._content_hash(item.url or item.title)],
            source_session={
                "pipeline": "deepdive",
                "phase": "3.5_dpo",
                "relevance_score": score,
                "source_url": item.url,
            },
            safety_flags=["auto-generated", "deepdive-pipeline"],
            metadata={
                "source_feed": item.source,
                "item_title": item.title,
                "score": score,
            },
        )

    def _build_relevance_pair(self, item, score: float,
                              fleet_context_text: str) -> DPOPair:
        """Type 2: 'What's relevant to Hermes?' → scored analysis vs vague response."""
        prompt = (
            f"Analyze this research for relevance to the Hermes agent fleet — "
            f"a sovereign AI system using local Gemma models, Ollama inference, "
            f"and GRPO/DPO training:\n\n"
            f"Title: {item.title}\n"
            f"Abstract: {item.summary[:400]}"
        )

        # Build keyword match explanation
        keywords_matched = []
        text_lower = f"{item.title} {item.summary}".lower()
        relevance_terms = [
            "agent", "tool use", "function calling", "reinforcement learning",
            "RLHF", "GRPO", "fine-tuning", "LoRA", "quantization", "inference",
            "reasoning", "chain of thought", "transformer", "local"
        ]
        for term in relevance_terms:
            if term.lower() in text_lower:
                keywords_matched.append(term)

        keyword_str = ", ".join(keywords_matched[:5]) if keywords_matched else "general AI/ML"

        chosen = (
            f"**Relevance: {score:.2f}/5.0**\n\n"
            f"This paper is relevant to our fleet because it touches on: {keyword_str}.\n\n"
        )
        if fleet_context_text:
            chosen += (
                f"In the context of our current fleet state:\n"
                f"{fleet_context_text[:300]}\n\n"
            )
        chosen += (
            "**Actionable takeaway:** Review this work for techniques applicable to "
            "our overnight training loop and agent architecture improvements."
        )

        rejected = (
            "This paper might be relevant. It discusses some AI topics. "
            "It could potentially be useful for various AI projects. "
            "Further reading may be needed to determine its applicability."
        )

        return DPOPair(
            prompt=prompt,
            chosen=chosen,
            rejected=rejected,
            task_type="relevance",
            evidence_ids=[self._content_hash(item.url or item.title)],
            source_session={
                "pipeline": "deepdive",
                "phase": "3.5_dpo",
                "relevance_score": score,
                "keywords_matched": keywords_matched,
            },
            safety_flags=["auto-generated", "deepdive-pipeline"],
            metadata={
                "source_feed": item.source,
                "item_title": item.title,
                "score": score,
            },
        )

    def _build_implication_pair(self, item, score: float,
                                synthesis_excerpt: str) -> DPOPair:
        """Type 3: 'What are the implications?' → actionable insight vs platitude."""
        prompt = (
            f"What are the practical implications of this research for a team "
            f"running sovereign LLM agents with local training infrastructure?\n\n"
            f"Title: {item.title}\n"
            f"Summary: {item.summary[:400]}"
        )

        chosen = (
            f"**Immediate implications for our fleet:**\n\n"
            f"1. **Training pipeline:** {synthesis_excerpt[:200] if synthesis_excerpt else 'This work suggests improvements to our GRPO/DPO training approach.'}\n\n"
            f"2. **Agent architecture:** Techniques described here could enhance "
            f"our tool-use and reasoning capabilities in Hermes agents.\n\n"
            f"3. **Deployment consideration:** With a relevance score of {score:.2f}, "
            f"this should be flagged for the next tightening cycle. "
            f"Consider adding these techniques to the overnight R&D queue.\n\n"
            f"**Priority:** {'HIGH — review before next deploy' if score >= 2.0 else 'MEDIUM — queue for weekly review'}"
        )

        rejected = (
            "This research has some implications for AI development. "
            "Teams working on AI projects should be aware of these developments. "
            "The field is moving quickly and it's important to stay up to date."
        )

        return DPOPair(
            prompt=prompt,
            chosen=chosen,
            rejected=rejected,
            task_type="implication",
            evidence_ids=[self._content_hash(item.url or item.title)],
            source_session={
                "pipeline": "deepdive",
                "phase": "3.5_dpo",
                "relevance_score": score,
            },
            safety_flags=["auto-generated", "deepdive-pipeline"],
            metadata={
                "source_feed": item.source,
                "item_title": item.title,
                "score": score,
            },
        )

    def generate(
        self,
        ranked_items: List[tuple],
        briefing: Dict[str, Any],
        fleet_context_text: str = "",
    ) -> List[DPOPair]:
        """Generate DPO pairs from ranked items and synthesis output.

        Args:
            ranked_items: List of (FeedItem, score) tuples from Phase 2
            briefing: Structured briefing dict from Phase 3
            fleet_context_text: Optional fleet context markdown string

        Returns:
            List of DPOPair objects
        """
        if not ranked_items:
            logger.info("No ranked items — skipping DPO generation")
            return []

        synthesis_text = briefing.get("briefing", "")
        pairs: List[DPOPair] = []

        for item, score in ranked_items:
            if score < self.min_score:
                continue

            # Extract a synthesis excerpt relevant to this item
            excerpt = self._extract_relevant_excerpt(synthesis_text, item.title)

            if "summarize" in self.pair_types:
                pairs.append(self._build_summarize_pair(item, score, excerpt))

            if "relevance" in self.pair_types:
                pairs.append(self._build_relevance_pair(item, score, fleet_context_text))

            if "implication" in self.pair_types:
                pairs.append(self._build_implication_pair(item, score, excerpt))

            if len(pairs) >= self.max_pairs_per_run:
                break

        logger.info(f"Generated {len(pairs)} DPO pairs from {len(ranked_items)} ranked items")
        return pairs

    def _extract_relevant_excerpt(self, synthesis_text: str, title: str) -> str:
        """Extract the portion of synthesis most relevant to a given item title."""
        if not synthesis_text:
            return ""

        # Try to find a paragraph mentioning key words from the title
        title_words = [w.lower() for w in title.split() if len(w) > 4]
        paragraphs = synthesis_text.split("\n\n")

        best_para = ""
        best_overlap = 0

        for para in paragraphs:
            para_lower = para.lower()
            overlap = sum(1 for w in title_words if w in para_lower)
            if overlap > best_overlap:
                best_overlap = overlap
                best_para = para

        if best_overlap > 0:
            return best_para.strip()[:500]

        # Fallback: first substantive paragraph
        for para in paragraphs:
            stripped = para.strip()
            if len(stripped) > 100 and not stripped.startswith("#"):
                return stripped[:500]

        return synthesis_text[:500]

    def export(self, pairs: List[DPOPair], session_id: Optional[str] = None) -> Path:
        """Write DPO pairs to JSONL file.

        Args:
            pairs: List of DPOPair objects
            session_id: Optional session identifier for the filename

        Returns:
            Path to the written JSONL file
        """
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        suffix = f"_{session_id}" if session_id else ""
        filename = f"deepdive_{timestamp}{suffix}.jsonl"
        output_path = self.output_dir / filename

        written = 0
        with open(output_path, "w") as f:
            for pair in pairs:
                f.write(json.dumps(pair.to_dict()) + "\n")
                written += 1

        logger.info(f"Exported {written} DPO pairs to {output_path}")
        return output_path
def run(
|
|
|
|
|
self,
|
|
|
|
|
ranked_items: List[tuple],
|
|
|
|
|
briefing: Dict[str, Any],
|
|
|
|
|
fleet_context_text: str = "",
|
|
|
|
|
session_id: Optional[str] = None,
|
|
|
|
|
) -> Dict[str, Any]:
|
feat: DPO pair quality validator — gate before overnight training
Add DPOQualityValidator that catches bad training pairs before they
enter the tightening loop. Wired into DPOPairGenerator between
generate() and export() as an automatic quality gate.
New module: dpo_quality.py
- 5 single-pair quality checks:
1. Field length minimums (prompt ≥40, chosen ≥80, rejected ≥30 chars)
2. Chosen/rejected length ratio (chosen must be ≥1.3x longer)
3. Chosen≈rejected similarity (Jaccard ≤0.70 — catches low-contrast)
4. Vocabulary diversity in chosen (unique word ratio ≥0.30)
5. Substance markers in chosen (≥2 fleet/training/action terms)
- 2 cross-pair quality checks:
6. Near-duplicate prompts within batch (Jaccard ≤0.85)
7. Cross-run dedup against recent JSONL history files
- Two modes: 'drop' (filter out bad pairs) or 'flag' (export with warning)
- BatchReport with per-pair diagnostics, pass rates, and warnings
- Standalone CLI: python3 dpo_quality.py <file.jsonl> [--strict] [--json]
Modified: dpo_generator.py
- Imports DPOQualityValidator with graceful degradation
- Initializes from config validation section (enabled by default)
- Validates between generate() and export() in run()
- Quality report included in pipeline result dict
- Validator failure never blocks — falls back to unvalidated export
Modified: config.yaml
- New deepdive.training.dpo.validation section with all tunable knobs:
enabled, flagged_pair_action, similarity thresholds, length minimums,
dedup_history_files
Integration tested — 6 test cases covering:
✓ Good pairs pass (3/3 accepted)
✓ Bad pairs caught: too-short, high-similarity, inverted signal (0/3)
✓ Near-duplicate prompt detection (1/2 deduped)
✓ Flag mode preserves pairs with warnings (3/3 flagged)
✓ Cross-run deduplication against history (1 dupe caught)
✓ Full generator→validator→export pipeline (6/6 validated)
2026-04-13 02:46:50 +00:00
|
|
|
"""Full Phase 3.5: generate → validate → export DPO pairs.
|
feat: Phase 3.5 — DPO training pair generation from Deep Dive pipeline
Wire arXiv relevance filter output directly into DPO pair generation,
closing the loop between research synthesis and overnight training data.
New module: dpo_generator.py
- DPOPairGenerator class with 3 pair strategies:
* summarize: paper → fleet-grounded analysis (chosen) vs generic (rejected)
* relevance: 'what matters to Hermes?' → scored context vs vague
* implication: 'what should we do?' → actionable insight vs platitude
- Extracts synthesis excerpts matched to each ranked item
- Outputs to ~/.timmy/training-data/dpo-pairs/deepdive_{timestamp}.jsonl
- Format: {prompt, chosen, rejected, task_type, evidence_ids,
source_session, safety_flags, metadata}
Pipeline changes (pipeline.py):
- Import DPOPairGenerator with graceful degradation
- Initialize from config deepdive.training.dpo section
- Execute as Phase 3.5 between synthesis and audio
- DPO results included in pipeline return dict
- Wrapped in try/except — DPO failure never blocks delivery
Config changes (config.yaml):
- New deepdive.training.dpo section with:
enabled, output_dir, min_score, max_pairs_per_run, pair_types
Integration tested: 2 mock items × 3 pair types = 6 valid JSONL pairs.
Chosen responses consistently richer than rejected (assert-verified).
2026-04-13 02:24:04 +00:00
|
|
|
|
|
|
|
|
Returns summary dict for pipeline result aggregation.
|
|
|
|
|
"""
|
|
|
|
|
pairs = self.generate(ranked_items, briefing, fleet_context_text)
|
|
|
|
|
|
|
|
|
|
if not pairs:
|
|
|
|
|
return {
|
|
|
|
|
"status": "skipped",
|
|
|
|
|
"pairs_generated": 0,
|
feat: DPO pair quality validator — gate before overnight training
Add DPOQualityValidator that catches bad training pairs before they
enter the tightening loop. Wired into DPOPairGenerator between
generate() and export() as an automatic quality gate.
New module: dpo_quality.py
- 5 single-pair quality checks:
1. Field length minimums (prompt ≥40, chosen ≥80, rejected ≥30 chars)
2. Chosen/rejected length ratio (chosen must be ≥1.3x longer)
3. Chosen≈rejected similarity (Jaccard ≤0.70 — catches low-contrast)
4. Vocabulary diversity in chosen (unique word ratio ≥0.30)
5. Substance markers in chosen (≥2 fleet/training/action terms)
- 2 cross-pair quality checks:
6. Near-duplicate prompts within batch (Jaccard ≤0.85)
7. Cross-run dedup against recent JSONL history files
- Two modes: 'drop' (filter out bad pairs) or 'flag' (export with warning)
- BatchReport with per-pair diagnostics, pass rates, and warnings
- Standalone CLI: python3 dpo_quality.py <file.jsonl> [--strict] [--json]
Modified: dpo_generator.py
- Imports DPOQualityValidator with graceful degradation
- Initializes from config validation section (enabled by default)
- Validates between generate() and export() in run()
- Quality report included in pipeline result dict
- Validator failure never blocks — falls back to unvalidated export
Modified: config.yaml
- New deepdive.training.dpo.validation section with all tunable knobs:
enabled, flagged_pair_action, similarity thresholds, length minimums,
dedup_history_files
Integration tested — 6 test cases covering:
✓ Good pairs pass (3/3 accepted)
✓ Bad pairs caught: too-short, high-similarity, inverted signal (0/3)
✓ Near-duplicate prompt detection (1/2 deduped)
✓ Flag mode preserves pairs with warnings (3/3 flagged)
✓ Cross-run deduplication against history (1 dupe caught)
✓ Full generator→validator→export pipeline (6/6 validated)
2026-04-13 02:46:50 +00:00
|
|
|
"pairs_validated": 0,
|
feat: Phase 3.5 — DPO training pair generation from Deep Dive pipeline
Wire arXiv relevance filter output directly into DPO pair generation,
closing the loop between research synthesis and overnight training data.
New module: dpo_generator.py
- DPOPairGenerator class with 3 pair strategies:
* summarize: paper → fleet-grounded analysis (chosen) vs generic (rejected)
* relevance: 'what matters to Hermes?' → scored context vs vague
* implication: 'what should we do?' → actionable insight vs platitude
- Extracts synthesis excerpts matched to each ranked item
- Outputs to ~/.timmy/training-data/dpo-pairs/deepdive_{timestamp}.jsonl
- Format: {prompt, chosen, rejected, task_type, evidence_ids,
source_session, safety_flags, metadata}
Pipeline changes (pipeline.py):
- Import DPOPairGenerator with graceful degradation
- Initialize from config deepdive.training.dpo section
- Execute as Phase 3.5 between synthesis and audio
- DPO results included in pipeline return dict
- Wrapped in try/except — DPO failure never blocks delivery
Config changes (config.yaml):
- New deepdive.training.dpo section with:
enabled, output_dir, min_score, max_pairs_per_run, pair_types
Integration tested: 2 mock items × 3 pair types = 6 valid JSONL pairs.
Chosen responses consistently richer than rejected (assert-verified).
2026-04-13 02:24:04 +00:00
|
|
|
"output_path": None,
|
|
|
|
|
}
|
|
|
|
|
|
feat: DPO pair quality validator — gate before overnight training
Add DPOQualityValidator that catches bad training pairs before they
enter the tightening loop. Wired into DPOPairGenerator between
generate() and export() as an automatic quality gate.
New module: dpo_quality.py
- 5 single-pair quality checks:
1. Field length minimums (prompt ≥40, chosen ≥80, rejected ≥30 chars)
2. Chosen/rejected length ratio (chosen must be ≥1.3x longer)
3. Chosen≈rejected similarity (Jaccard ≤0.70 — catches low-contrast)
4. Vocabulary diversity in chosen (unique word ratio ≥0.30)
5. Substance markers in chosen (≥2 fleet/training/action terms)
- 2 cross-pair quality checks:
6. Near-duplicate prompts within batch (Jaccard ≤0.85)
7. Cross-run dedup against recent JSONL history files
- Two modes: 'drop' (filter out bad pairs) or 'flag' (export with warning)
- BatchReport with per-pair diagnostics, pass rates, and warnings
- Standalone CLI: python3 dpo_quality.py <file.jsonl> [--strict] [--json]
Modified: dpo_generator.py
- Imports DPOQualityValidator with graceful degradation
- Initializes from config validation section (enabled by default)
- Validates between generate() and export() in run()
- Quality report included in pipeline result dict
- Validator failure never blocks — falls back to unvalidated export
Modified: config.yaml
- New deepdive.training.dpo.validation section with all tunable knobs:
enabled, flagged_pair_action, similarity thresholds, length minimums,
dedup_history_files
Integration tested — 6 test cases covering:
✓ Good pairs pass (3/3 accepted)
✓ Bad pairs caught: too-short, high-similarity, inverted signal (0/3)
✓ Near-duplicate prompt detection (1/2 deduped)
✓ Flag mode preserves pairs with warnings (3/3 flagged)
✓ Cross-run deduplication against history (1 dupe caught)
✓ Full generator→validator→export pipeline (6/6 validated)
2026-04-13 02:46:50 +00:00
        # Quality gate: validate before export
        quality_report = None
        if self.validator:
            pair_dicts = [p.to_dict() for p in pairs]
            filtered_dicts, quality_report = self.validator.validate(pair_dicts)
            logger.info(
                f"Quality gate: {quality_report.passed_pairs}/{quality_report.total_pairs} "
                f"passed, {quality_report.dropped_pairs} dropped, "
                f"{quality_report.flagged_pairs} flagged"
            )

            if not filtered_dicts:
                return {
                    "status": "all_filtered",
                    "pairs_generated": len(pairs),
                    "pairs_validated": 0,
                    "output_path": None,
                    "quality": quality_report.to_dict(),
                }

            # Rebuild DPOPair objects from filtered dicts
            pairs = [
                DPOPair(
                    prompt=d["prompt"],
                    chosen=d["chosen"],
                    rejected=d["rejected"],
                    task_type=d.get("task_type", "unknown"),
                    evidence_ids=d.get("evidence_ids", []),
                    source_session=d.get("source_session", {}),
                    safety_flags=d.get("safety_flags", []),
                    metadata=d.get("metadata", {}),
                )
                for d in filtered_dicts
            ]

        output_path = self.export(pairs, session_id)
        # Register exported hashes in the persistent dedup index
        if self.validator:
            try:
                exported_dicts = [p.to_dict() for p in pairs]
                self.validator.register_exported_hashes(
                    exported_dicts, output_path.name
                )
            except Exception as e:
                logger.warning(f"Failed to register hashes in dedup index: {e}")

        # Summary by task type
        type_counts = {}
        for p in pairs:
            type_counts[p.task_type] = type_counts.get(p.task_type, 0) + 1

        result = {
            "status": "success",
            "pairs_generated": len(pairs) + (quality_report.dropped_pairs if quality_report else 0),
            "pairs_validated": len(pairs),
            "output_path": str(output_path),
            "pair_types": type_counts,
            "output_dir": str(self.output_dir),
        }
        if quality_report:
            result["quality"] = quality_report.to_dict()
        return result