Files
ezra-environment/the-nexus/deepdive/bin/phase3_synthesize.py
Ezra 9f010ad044 [BURN] Deep Dive scaffold: 5-phase sovereign NotebookLM (#830)
Complete production-ready scaffold for automated daily AI intelligence briefings:

- Phase 1: Source aggregation (arXiv + lab blogs)
- Phase 2: Relevance ranking (keyword + source authority scoring)
- Phase 3: LLM synthesis (Hermes-context briefing generation)
- Phase 4: TTS audio (edge-tts/OpenAI/ElevenLabs)
- Phase 5: Telegram delivery (voice message)

Deliverables:
- docs/ARCHITECTURE.md (9000+ lines) - system design
- docs/OPERATIONS.md - runbook and troubleshooting
- 5 executable phase scripts (bin/)
- Full pipeline orchestrator (run_full_pipeline.py)
- requirements.txt, README.md

Addresses all 9 acceptance criteria from #830.
Ready for host selection, credential config, and cron activation.

Author: Ezra | Burn mode | 2026-04-05
2026-04-05 05:48:12 +00:00

265 lines
9.6 KiB
Python

#!/usr/bin/env python3
"""
Deep Dive Phase 3: Synthesis Engine
Generates structured intelligence briefing via LLM.
Usage:
python phase3_synthesize.py [--date YYYY-MM-DD] [--output-dir DIR]
Issue: the-nexus#830
"""
import argparse
import json
import os
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import List, Optional
# System prompt engineered for Hermes/Timmy context.
# NOTE: sent verbatim as the "system" message by every backend in
# SynthesisEngine (OpenAI, Anthropic, local Hermes); edits here change the
# briefing structure and tone for all of them.
BRIEFING_SYSTEM_PROMPT = """You are Deep Dive, an intelligence briefing system for the Hermes Agent Framework and Timmy organization.
Your task is to synthesize AI/ML research sources into a structured daily intelligence briefing tailored for Alexander Whitestone (founder) and the Hermes development team.
CONTEXT ABOUT HERMES/TIMMY:
- Hermes is an open-source AI agent framework with tool use, multi-agent orchestration, and MCP (Model Context Protocol) support
- Timmy is the fleet coordinator managing multiple AI coding agents
- Current priorities: agent reliability, context compression, distributed execution, sovereign infrastructure
- Technology stack: Python, asyncio, SQLite, FastAPI, llama.cpp, vLLM
BRIEFING STRUCTURE:
1. HEADLINES (3-5 bullets): Major developments with impact assessment
2. DEEP DIVES (2-3 items): Detailed analysis of most relevant papers/posts
3. IMPLICATIONS FOR HERMES: How this research affects our roadmap
4. ACTION ITEMS: Specific follow-ups for the team
5. SOURCES: Cited with URLs
TONE:
- Professional intelligence briefing
- Concise but substantive
- Technical depth appropriate for AI engineers
- Forward-looking implications
RULES:
- Prioritize sources by relevance to agent systems and LLM architecture
- Include specific techniques/methods when applicable
- Connect findings to Hermes' current challenges
- Always cite sources
"""
@dataclass
class Source:
    """Ranked source item loaded from Phase 2 output.

    Field names mirror the keys of the Phase 2 JSON items
    (``score`` comes from the ``total_score`` key).
    """
    title: str  # article/paper title
    url: str  # canonical link; cited in the briefing
    source: str  # origin label of the item — presumably the feed/site name; confirm against Phase 2
    summary: str  # abstract/summary text; truncated to 800 chars before prompting
    score: float  # Phase 2 'total_score' relevance value
class SynthesisEngine:
    """Generate intelligence briefings via LLM.

    Reads the ranked-source JSON emitted by Phase 2 from
    ``<output_dir>/ranked/<date>.json``, synthesizes a markdown briefing
    through one of three backends (OpenAI, Anthropic, or a local Hermes
    endpoint), and writes the result to ``<output_dir>/briefings/<date>.md``.
    Any backend failure degrades to a plain source digest instead of
    aborting the pipeline.
    """

    def __init__(self, output_dir: Path, date: str, model: str = "openai/gpt-4o-mini"):
        """
        Args:
            output_dir: Root data directory shared by all pipeline phases.
            date: Target date string (YYYY-MM-DD); selects input/output files.
            model: Backend selector, e.g. "openai/gpt-4o-mini", an
                "anthropic/claude-..." name, or "hermes" for the local endpoint.
        """
        self.output_dir = output_dir
        self.date = date
        self.model = model
        self.ranked_dir = output_dir / "ranked"
        self.briefings_dir = output_dir / "briefings"
        self.briefings_dir.mkdir(parents=True, exist_ok=True)

    def load_ranked_sources(self) -> List[Source]:
        """Load ranked sources from Phase 2.

        Returns:
            Sources in the order stored by Phase 2.

        Raises:
            FileNotFoundError: If Phase 2 has produced no output for this date.
        """
        ranked_file = self.ranked_dir / f"{self.date}.json"
        if not ranked_file.exists():
            raise FileNotFoundError(f"Phase 2 output not found: {ranked_file}")
        with open(ranked_file, encoding='utf-8') as f:
            data = json.load(f)
        return [
            Source(
                title=item.get('title', ''),
                url=item.get('url', ''),
                source=item.get('source', ''),
                summary=item.get('summary', ''),
                score=item.get('total_score', 0)
            )
            for item in data.get('items', [])
        ]

    def format_sources_for_llm(self, sources: List[Source]) -> str:
        """Format sources as a plain-text digest for LLM consumption.

        Only the top 15 sources are included and each summary is truncated
        to 800 characters to bound the prompt size.
        """
        lines = []
        for i, src in enumerate(sources[:15], 1):  # Top 15 sources
            lines.append(f"\n--- Source {i} [{src.source}] (score: {src.score}) ---")
            lines.append(f"Title: {src.title}")
            lines.append(f"URL: {src.url}")
            lines.append(f"Summary: {src.summary[:800]}")
        return "\n".join(lines)

    def generate_briefing_openai(self, sources_text: str) -> str:
        """Generate the briefing via the OpenAI chat completions API.

        Any failure (missing package, missing OPENAI_API_KEY, network error)
        falls back to the offline source digest.
        """
        try:
            from openai import OpenAI
            client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
            # BUG FIX: the model name was hard-coded to "gpt-4o-mini",
            # silently ignoring --model. Honor an explicit "openai/<name>"
            # selector; otherwise keep the previous default.
            if 'openai' in self.model.lower():
                model_name = self.model.split('/', 1)[-1]
            else:
                model_name = "gpt-4o-mini"
            response = client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": BRIEFING_SYSTEM_PROMPT},
                    {"role": "user", "content": f"Generate today's Deep Dive briefing ({self.date}) based on these sources:\n\n{sources_text}"}
                ],
                temperature=0.7,
                max_tokens=4000
            )
            return response.choices[0].message.content
        except Exception as e:
            print(f"[ERROR] OpenAI generation failed: {e}")
            return self._fallback_briefing(sources_text)

    def generate_briefing_anthropic(self, sources_text: str) -> str:
        """Generate the briefing via the Anthropic Messages API.

        Any failure falls back to the offline source digest.
        """
        try:
            import anthropic
            client = anthropic.Anthropic(api_key=os.environ.get('ANTHROPIC_API_KEY'))
            # Honor an explicit "...claude..." selector (stripping an optional
            # "provider/" prefix), consistent with the OpenAI backend;
            # otherwise keep the previous hard-coded default.
            if 'claude' in self.model.lower():
                model_name = self.model.split('/', 1)[-1]
            else:
                model_name = "claude-3-haiku-20240307"
            response = client.messages.create(
                model=model_name,
                max_tokens=4000,
                system=BRIEFING_SYSTEM_PROMPT,
                messages=[
                    {"role": "user", "content": f"Generate today's Deep Dive briefing ({self.date}) based on these sources:\n\n{sources_text}"}
                ]
            )
            return response.content[0].text
        except Exception as e:
            print(f"[ERROR] Anthropic generation failed: {e}")
            return self._fallback_briefing(sources_text)

    def generate_briefing_hermes(self, sources_text: str) -> str:
        """Generate the briefing via a local OpenAI-compatible Hermes endpoint."""
        try:
            import requests
            response = requests.post(
                "http://localhost:8645/v1/chat/completions",
                json={
                    "model": "hermes",
                    "messages": [
                        {"role": "system", "content": BRIEFING_SYSTEM_PROMPT},
                        # Local models have tighter context windows; cap the input.
                        {"role": "user", "content": f"Generate today's Deep Dive briefing ({self.date}):\n\n{sources_text[:6000]}"}
                    ],
                    "temperature": 0.7,
                    "max_tokens": 4000
                },
                timeout=120
            )
            # BUG FIX: surface HTTP errors explicitly instead of letting a
            # non-2xx error body produce a confusing KeyError below.
            response.raise_for_status()
            return response.json()['choices'][0]['message']['content']
        except Exception as e:
            print(f"[ERROR] Hermes generation failed: {e}")
            return self._fallback_briefing(sources_text)

    def _fallback_briefing(self, sources_text: str) -> str:
        """Build a plain source digest when every LLM backend fails.

        Extracts Title:/URL: lines from the formatted source text so the
        downstream phases (TTS, delivery) still receive usable content.
        """
        lines = [
            f"# Deep Dive: AI Intelligence Briefing — {self.date}",
            "",
            "*Note: LLM synthesis unavailable. This is a structured source digest.*",
            "",
            "## Sources Today",
            ""
        ]
        # Simple extraction from sources (bounded to the first 50 lines).
        for line in sources_text.split('\n')[:50]:
            if line.startswith('Title:') or line.startswith('URL:'):
                lines.append(line)
        lines.extend([
            "",
            "## Note",
            "LLM synthesis failed. Review source URLs directly for content.",
            "",
            "---",
            "Deep Dive (Fallback Mode) | Hermes Agent Framework"
        ])
        return "\n".join(lines)

    def generate_briefing(self, sources: List[Source]) -> str:
        """Generate the briefing using the configured backend.

        Dispatches on substrings of ``self.model``; if no backend matches,
        picks one by available credentials (OpenAI, then Anthropic, then the
        credential-free local Hermes endpoint).
        """
        sources_text = self.format_sources_for_llm(sources)
        print(f"[Phase 3] Generating briefing using {self.model}...")
        model_id = self.model.lower()
        if 'openai' in model_id:
            return self.generate_briefing_openai(sources_text)
        # BUG FIX: 'anthropic' was matched case-sensitively against
        # self.model while every other branch lowercased first.
        elif 'anthropic' in model_id or 'claude' in model_id:
            return self.generate_briefing_anthropic(sources_text)
        elif 'hermes' in model_id:
            return self.generate_briefing_hermes(sources_text)
        else:
            # No explicit backend: prefer OpenAI, then Anthropic, then Hermes.
            if os.environ.get('OPENAI_API_KEY'):
                return self.generate_briefing_openai(sources_text)
            elif os.environ.get('ANTHROPIC_API_KEY'):
                return self.generate_briefing_anthropic(sources_text)
            else:
                return self.generate_briefing_hermes(sources_text)

    def save_briefing(self, content: str) -> Path:
        """Write the briefing, prefixed with a YAML front-matter header.

        Returns:
            Path of the written markdown file.
        """
        output_file = self.briefings_dir / f"{self.date}.md"
        # Metadata header consumed by downstream phases / archival tooling.
        header = f"""---
date: {self.date}
generated_at: {datetime.now().isoformat()}
model: {self.model}
version: 1.0
---
"""
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(header + content)
        print(f"[Phase 3] Saved briefing to {output_file}")
        return output_file

    def run(self) -> Path:
        """Run the full synthesis pipeline: load, generate, save.

        Returns:
            Path of the written briefing file.
        """
        print(f"[Phase 3] Synthesizing briefing for {self.date}")
        sources = self.load_ranked_sources()
        print(f"[Phase 3] Loaded {len(sources)} ranked sources")
        briefing = self.generate_briefing(sources)
        output_file = self.save_briefing(briefing)
        print(f"[Phase 3] Briefing generated: {len(briefing)} characters")
        return output_file
def _build_parser() -> argparse.ArgumentParser:
    """Construct the CLI argument parser for Phase 3."""
    parser = argparse.ArgumentParser(description='Deep Dive Phase 3: Synthesis Engine')
    parser.add_argument('--date', default=datetime.now().strftime('%Y-%m-%d'),
                        help='Target date (YYYY-MM-DD)')
    parser.add_argument('--output-dir', type=Path, default=Path('../data'),
                        help='Output directory for data')
    parser.add_argument('--model', default='openai/gpt-4o-mini',
                        help='LLM model for synthesis')
    return parser


def main():
    """Parse CLI arguments and run the synthesis engine."""
    opts = _build_parser().parse_args()
    SynthesisEngine(opts.output_dir, opts.date, opts.model).run()


if __name__ == '__main__':
    main()