#!/usr/bin/env python3 """Deep Dive DPO Training Pair Generator — Phase 3.5 Transforms ranked research items + synthesis output into DPO preference pairs for overnight Hermes training. Closes the loop between arXiv intelligence gathering and sovereign model improvement. Pair strategy: 1. summarize — "Summarize this paper" → fleet-grounded analysis (chosen) vs generic abstract (rejected) 2. relevance — "What's relevant to Hermes?" → scored relevance analysis (chosen) vs vague (rejected) 3. implication — "What are the implications?" → actionable insight (chosen) vs platitude (rejected) Output format matches timmy-home training-data convention: {"prompt", "chosen", "rejected", "source_session", "task_type", "evidence_ids", "safety_flags"} """ import hashlib import json import logging from dataclasses import dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Optional # Quality validation gate try: from dpo_quality import DPOQualityValidator HAS_DPO_QUALITY = True except ImportError: HAS_DPO_QUALITY = False DPOQualityValidator = None logger = logging.getLogger("deepdive.dpo_generator") @dataclass class DPOPair: """Single DPO training pair.""" prompt: str chosen: str rejected: str task_type: str evidence_ids: List[str] = field(default_factory=list) source_session: Dict[str, Any] = field(default_factory=dict) safety_flags: List[str] = field(default_factory=list) metadata: Dict[str, Any] = field(default_factory=dict) def to_dict(self) -> Dict[str, Any]: return { "prompt": self.prompt, "chosen": self.chosen, "rejected": self.rejected, "task_type": self.task_type, "evidence_ids": self.evidence_ids, "source_session": self.source_session, "safety_flags": self.safety_flags, "metadata": self.metadata, } class DPOPairGenerator: """Generate DPO training pairs from Deep Dive pipeline output. Sits between Phase 3 (Synthesis) and Phase 4 (Audio) as Phase 3.5. Takes ranked items + synthesis briefing and produces training pairs that teach Hermes to produce fleet-grounded research analysis. """ def __init__(self, config: Optional[Dict[str, Any]] = None): cfg = config or {} self.output_dir = Path( cfg.get("output_dir", str(Path.home() / ".timmy" / "training-data" / "dpo-pairs")) ) self.output_dir.mkdir(parents=True, exist_ok=True) self.min_score = cfg.get("min_score", 0.5) self.max_pairs_per_run = cfg.get("max_pairs_per_run", 30) self.pair_types = cfg.get("pair_types", ["summarize", "relevance", "implication"]) # Quality validator self.validator = None validation_cfg = cfg.get("validation", {}) if HAS_DPO_QUALITY and validation_cfg.get("enabled", True): self.validator = DPOQualityValidator( config=validation_cfg, output_dir=self.output_dir, ) logger.info("DPO quality validator enabled") elif not HAS_DPO_QUALITY: logger.info("DPO quality validator not available (dpo_quality module not found)") else: logger.info("DPO quality validator disabled in config") logger.info( f"DPOPairGenerator: output_dir={self.output_dir}, " f"pair_types={self.pair_types}, max_pairs={self.max_pairs_per_run}" ) def _content_hash(self, text: str) -> str: return hashlib.sha256(text.encode()).hexdigest()[:12] def _build_summarize_pair(self, item, score: float, synthesis_excerpt: str) -> DPOPair: """Type 1: 'Summarize this paper' → fleet-grounded analysis vs generic abstract.""" prompt = ( f"Summarize the following research paper and explain its significance " f"for a team building sovereign LLM agents:\n\n" f"Title: {item.title}\n" f"Abstract: {item.summary[:500]}\n" f"Source: {item.source}\n" f"URL: {item.url}" ) chosen = ( f"{synthesis_excerpt}\n\n" f"Relevance score: {score:.2f}/5.0 — " f"This work directly impacts our agent architecture and training pipeline." ) # Rejected: generic, unhelpful summary without fleet context rejected = ( f"This paper titled \"{item.title}\" presents research findings in the area " f"of artificial intelligence. The authors discuss various methods and present " f"results. This may be of interest to researchers in the field." ) return DPOPair( prompt=prompt, chosen=chosen, rejected=rejected, task_type="summarize", evidence_ids=[self._content_hash(item.url or item.title)], source_session={ "pipeline": "deepdive", "phase": "3.5_dpo", "relevance_score": score, "source_url": item.url, }, safety_flags=["auto-generated", "deepdive-pipeline"], metadata={ "source_feed": item.source, "item_title": item.title, "score": score, }, ) def _build_relevance_pair(self, item, score: float, fleet_context_text: str) -> DPOPair: """Type 2: 'What's relevant to Hermes?' → scored analysis vs vague response.""" prompt = ( f"Analyze this research for relevance to the Hermes agent fleet — " f"a sovereign AI system using local Gemma models, Ollama inference, " f"and GRPO/DPO training:\n\n" f"Title: {item.title}\n" f"Abstract: {item.summary[:400]}" ) # Build keyword match explanation keywords_matched = [] text_lower = f"{item.title} {item.summary}".lower() relevance_terms = [ "agent", "tool use", "function calling", "reinforcement learning", "RLHF", "GRPO", "fine-tuning", "LoRA", "quantization", "inference", "reasoning", "chain of thought", "transformer", "local" ] for term in relevance_terms: if term.lower() in text_lower: keywords_matched.append(term) keyword_str = ", ".join(keywords_matched[:5]) if keywords_matched else "general AI/ML" chosen = ( f"**Relevance: {score:.2f}/5.0**\n\n" f"This paper is relevant to our fleet because it touches on: {keyword_str}.\n\n" ) if fleet_context_text: chosen += ( f"In the context of our current fleet state:\n" f"{fleet_context_text[:300]}\n\n" ) chosen += ( f"**Actionable takeaway:** Review this work for techniques applicable to " f"our overnight training loop and agent architecture improvements." ) rejected = ( f"This paper might be relevant. It discusses some AI topics. " f"It could potentially be useful for various AI projects. " f"Further reading may be needed to determine its applicability." ) return DPOPair( prompt=prompt, chosen=chosen, rejected=rejected, task_type="relevance", evidence_ids=[self._content_hash(item.url or item.title)], source_session={ "pipeline": "deepdive", "phase": "3.5_dpo", "relevance_score": score, "keywords_matched": keywords_matched, }, safety_flags=["auto-generated", "deepdive-pipeline"], metadata={ "source_feed": item.source, "item_title": item.title, "score": score, }, ) def _build_implication_pair(self, item, score: float, synthesis_excerpt: str) -> DPOPair: """Type 3: 'What are the implications?' → actionable insight vs platitude.""" prompt = ( f"What are the practical implications of this research for a team " f"running sovereign LLM agents with local training infrastructure?\n\n" f"Title: {item.title}\n" f"Summary: {item.summary[:400]}" ) chosen = ( f"**Immediate implications for our fleet:**\n\n" f"1. **Training pipeline:** {synthesis_excerpt[:200] if synthesis_excerpt else 'This work suggests improvements to our GRPO/DPO training approach.'}\n\n" f"2. **Agent architecture:** Techniques described here could enhance " f"our tool-use and reasoning capabilities in Hermes agents.\n\n" f"3. **Deployment consideration:** With a relevance score of {score:.2f}, " f"this should be flagged for the next tightening cycle. " f"Consider adding these techniques to the overnight R&D queue.\n\n" f"**Priority:** {'HIGH — review before next deploy' if score >= 2.0 else 'MEDIUM — queue for weekly review'}" ) rejected = ( f"This research has some implications for AI development. " f"Teams working on AI projects should be aware of these developments. " f"The field is moving quickly and it's important to stay up to date." ) return DPOPair( prompt=prompt, chosen=chosen, rejected=rejected, task_type="implication", evidence_ids=[self._content_hash(item.url or item.title)], source_session={ "pipeline": "deepdive", "phase": "3.5_dpo", "relevance_score": score, }, safety_flags=["auto-generated", "deepdive-pipeline"], metadata={ "source_feed": item.source, "item_title": item.title, "score": score, }, ) def generate( self, ranked_items: List[tuple], briefing: Dict[str, Any], fleet_context_text: str = "", ) -> List[DPOPair]: """Generate DPO pairs from ranked items and synthesis output. Args: ranked_items: List of (FeedItem, score) tuples from Phase 2 briefing: Structured briefing dict from Phase 3 fleet_context_text: Optional fleet context markdown string Returns: List of DPOPair objects """ if not ranked_items: logger.info("No ranked items — skipping DPO generation") return [] synthesis_text = briefing.get("briefing", "") pairs: List[DPOPair] = [] for item, score in ranked_items: if score < self.min_score: continue # Extract a synthesis excerpt relevant to this item excerpt = self._extract_relevant_excerpt(synthesis_text, item.title) if "summarize" in self.pair_types: pairs.append(self._build_summarize_pair(item, score, excerpt)) if "relevance" in self.pair_types: pairs.append(self._build_relevance_pair(item, score, fleet_context_text)) if "implication" in self.pair_types: pairs.append(self._build_implication_pair(item, score, excerpt)) if len(pairs) >= self.max_pairs_per_run: break logger.info(f"Generated {len(pairs)} DPO pairs from {len(ranked_items)} ranked items") return pairs def _extract_relevant_excerpt(self, synthesis_text: str, title: str) -> str: """Extract the portion of synthesis most relevant to a given item title.""" if not synthesis_text: return "" # Try to find a paragraph mentioning key words from the title title_words = [w.lower() for w in title.split() if len(w) > 4] paragraphs = synthesis_text.split("\n\n") best_para = "" best_overlap = 0 for para in paragraphs: para_lower = para.lower() overlap = sum(1 for w in title_words if w in para_lower) if overlap > best_overlap: best_overlap = overlap best_para = para if best_overlap > 0: return best_para.strip()[:500] # Fallback: first substantive paragraph for para in paragraphs: stripped = para.strip() if len(stripped) > 100 and not stripped.startswith("#"): return stripped[:500] return synthesis_text[:500] def export(self, pairs: List[DPOPair], session_id: Optional[str] = None) -> Path: """Write DPO pairs to JSONL file. Args: pairs: List of DPOPair objects session_id: Optional session identifier for the filename Returns: Path to the written JSONL file """ timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") suffix = f"_{session_id}" if session_id else "" filename = f"deepdive_{timestamp}{suffix}.jsonl" output_path = self.output_dir / filename written = 0 with open(output_path, "w") as f: for pair in pairs: f.write(json.dumps(pair.to_dict()) + "\n") written += 1 logger.info(f"Exported {written} DPO pairs to {output_path}") return output_path def run( self, ranked_items: List[tuple], briefing: Dict[str, Any], fleet_context_text: str = "", session_id: Optional[str] = None, ) -> Dict[str, Any]: """Full Phase 3.5: generate → validate → export DPO pairs. Returns summary dict for pipeline result aggregation. """ pairs = self.generate(ranked_items, briefing, fleet_context_text) if not pairs: return { "status": "skipped", "pairs_generated": 0, "pairs_validated": 0, "output_path": None, } # Quality gate: validate before export quality_report = None if self.validator: pair_dicts = [p.to_dict() for p in pairs] filtered_dicts, quality_report = self.validator.validate(pair_dicts) logger.info( f"Quality gate: {quality_report.passed_pairs}/{quality_report.total_pairs} " f"passed, {quality_report.dropped_pairs} dropped, " f"{quality_report.flagged_pairs} flagged" ) if not filtered_dicts: return { "status": "all_filtered", "pairs_generated": len(pairs), "pairs_validated": 0, "output_path": None, "quality": quality_report.to_dict(), } # Rebuild DPOPair objects from filtered dicts pairs = [ DPOPair( prompt=d["prompt"], chosen=d["chosen"], rejected=d["rejected"], task_type=d.get("task_type", "unknown"), evidence_ids=d.get("evidence_ids", []), source_session=d.get("source_session", {}), safety_flags=d.get("safety_flags", []), metadata=d.get("metadata", {}), ) for d in filtered_dicts ] output_path = self.export(pairs, session_id) # Register exported hashes in the persistent dedup index if self.validator: try: exported_dicts = [p.to_dict() for p in pairs] self.validator.register_exported_hashes( exported_dicts, output_path.name ) except Exception as e: logger.warning(f"Failed to register hashes in dedup index: {e}") # Summary by task type type_counts = {} for p in pairs: type_counts[p.task_type] = type_counts.get(p.task_type, 0) + 1 result = { "status": "success", "pairs_generated": len(pairs) + (quality_report.dropped_pairs if quality_report else 0), "pairs_validated": len(pairs), "output_path": str(output_path), "pair_types": type_counts, "output_dir": str(self.output_dir), } if quality_report: result["quality"] = quality_report.to_dict() return result