Files
ezra-environment/the-nexus/deepdive/bin/phase2_rank.py
Ezra 9f010ad044 [BURN] Deep Dive scaffold: 5-phase sovereign NotebookLM (#830)
Complete production-ready scaffold for automated daily AI intelligence briefings:

- Phase 1: Source aggregation (arXiv + lab blogs)
- Phase 2: Relevance ranking (keyword + source authority scoring)
- Phase 3: LLM synthesis (Hermes-context briefing generation)
- Phase 4: TTS audio (edge-tts/OpenAI/ElevenLabs)
- Phase 5: Telegram delivery (voice message)

Deliverables:
- docs/ARCHITECTURE.md (9000+ lines) - system design
- docs/OPERATIONS.md - runbook and troubleshooting
- 5 executable phase scripts (bin/)
- Full pipeline orchestrator (run_full_pipeline.py)
- requirements.txt, README.md

Addresses all 9 acceptance criteria from #830.
Ready for host selection, credential config, and cron activation.

Author: Ezra | Burn mode | 2026-04-05
2026-04-05 05:48:12 +00:00

230 lines
7.1 KiB
Python

#!/usr/bin/env python3
"""
Deep Dive Phase 2: Relevance Engine
Filters and ranks sources by relevance to Hermes/Timmy mission.
Usage:
python phase2_rank.py [--date YYYY-MM-DD] [--output-dir DIR]
Issue: the-nexus#830
"""
import argparse
import json
import re
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple
import numpy as np
@dataclass
class ScoredItem:
    """A source item with relevance scores."""
    # Stable identifier carried over from the Phase 1 aggregation output.
    id: str
    title: str
    url: str
    # Feed key, e.g. 'arxiv-cs.AI' — looked up in RelevanceEngine.SOURCE_WEIGHTS.
    source: str
    # Publication timestamp as provided upstream (format not enforced here).
    published: str
    # Abstract/summary text; the scorer truncates this to 500 characters.
    summary: str
    authors: List[str]
    categories: List[str]
    # Per-dimension scores keyed 'keyword', 'source', 'recency' (each rounded to 2 dp).
    scores: Dict[str, float]
    # Weighted combination of the per-dimension scores; used as the ranking key.
    total_score: float
class RelevanceEngine:
    """Score and rank aggregated sources by relevance to Hermes/Timmy work.

    The total score blends three dimensions with fixed weights:
    keyword match (0.5), source authority (0.3), recency (0.2).
    """

    # Keywords weighted by importance to Hermes mission
    HERMES_KEYWORDS = {
        # Core (high weight)
        'agent': 1.5,
        'agents': 1.5,
        'multi-agent': 2.0,
        'mcp': 2.0,  # Model Context Protocol
        'hermes': 2.5,
        'timmy': 2.5,
        'tool use': 1.8,
        'function calling': 1.8,
        'llm': 1.2,
        'llms': 1.2,
        # Architecture (medium-high weight)
        'transformer': 1.3,
        'attention': 1.2,
        'fine-tuning': 1.4,
        'rlhf': 1.5,
        'reinforcement learning': 1.5,
        'training': 1.1,
        'inference': 1.1,
        # Relevance (medium weight)
        'autonomous': 1.3,
        'orchestration': 1.4,
        'workflow': 1.1,
        'pipeline': 1.0,
        'automation': 1.2,
        # Technical (context weight)
        'rag': 1.2,
        'retrieval': 1.0,
        'embedding': 1.1,
        'vector': 0.9,
        'clustering': 0.8,
    }

    # Source authority weights; unknown sources fall back to 0.8.
    SOURCE_WEIGHTS = {
        'arxiv-cs.AI': 1.2,
        'arxiv-cs.CL': 1.1,
        'arxiv-cs.LG': 1.15,
        'openai': 1.0,
        'anthropic': 1.0,
        'deepmind': 1.0,
        'google-research': 0.95,
    }

    # Precompiled word-boundary patterns, built once at class-definition time
    # instead of recompiling every keyword regex for every scored item.
    # Keywords are already lowercase, so no .lower() is needed per match.
    _KEYWORD_PATTERNS = [
        (re.compile(r'\b' + re.escape(keyword) + r'\b'), weight)
        for keyword, weight in HERMES_KEYWORDS.items()
    ]

    def __init__(self, output_dir: Path, date: str):
        """Prepare input/output directories for the given run date.

        Args:
            output_dir: Root data directory shared by all pipeline phases.
            date: Target date string (YYYY-MM-DD); selects the Phase 1 output.
        """
        self.output_dir = output_dir
        self.date = date
        self.sources_dir = output_dir / "sources" / date
        self.ranked_dir = output_dir / "ranked"
        self.ranked_dir.mkdir(parents=True, exist_ok=True)

    def load_sources(self) -> List[dict]:
        """Load aggregated sources from Phase 1.

        Returns:
            The list under the 'items' key (empty list if the key is absent).

        Raises:
            FileNotFoundError: if Phase 1 has not produced output for this date.
        """
        source_file = self.sources_dir / "aggregated.json"
        if not source_file.exists():
            raise FileNotFoundError(f"Phase 1 output not found: {source_file}")
        with open(source_file, encoding='utf-8') as f:
            data = json.load(f)
        return data.get('items', [])

    def calculate_keyword_score(self, item: dict) -> float:
        """Calculate keyword match score over title + summary, capped at 10.

        Tolerates missing keys and explicit None values in the item dict.
        """
        # `or ''` also guards against {'title': None}, which .get(default) misses.
        text_lower = f"{item.get('title') or ''} {item.get('summary') or ''}".lower()
        score = 0.0
        for pattern, weight in self._KEYWORD_PATTERNS:
            score += len(pattern.findall(text_lower)) * weight
        return min(score, 10.0)  # Cap so one keyword-stuffed item can't dominate.

    def calculate_source_score(self, item: dict) -> float:
        """Calculate source authority score (0.8 for unrecognized sources)."""
        return self.SOURCE_WEIGHTS.get(item.get('source', ''), 0.8)

    def calculate_recency_score(self, item: dict) -> float:
        """Calculate recency score (higher for more recent)."""
        # Simplified: all items from today get full score.
        # Could parse `item['published']` for more nuance.
        return 1.0

    def score_item(self, item: dict) -> ScoredItem:
        """Calculate full relevance scores for an item and wrap it as ScoredItem."""
        keyword_score = self.calculate_keyword_score(item)
        source_score = self.calculate_source_score(item)
        recency_score = self.calculate_recency_score(item)
        # Weighted total: keyword relevance dominates, then authority, then recency.
        total_score = (
            keyword_score * 0.5 +
            source_score * 0.3 +
            recency_score * 0.2
        )
        return ScoredItem(
            id=item.get('id') or '',
            title=item.get('title') or '',
            url=item.get('url') or '',
            source=item.get('source') or '',
            published=item.get('published') or '',
            # Truncate to keep the ranked JSON compact for the Phase 3 prompt.
            summary=(item.get('summary') or '')[:500],
            authors=item.get('authors') or [],
            categories=item.get('categories') or [],
            scores={
                'keyword': round(keyword_score, 2),
                'source': round(source_score, 2),
                'recency': round(recency_score, 2),
            },
            total_score=round(total_score, 2)
        )

    def rank_items(self, items: List[dict], top_n: int = 20) -> List[ScoredItem]:
        """Score all items and return the top_n by descending total score."""
        scored = [self.score_item(item) for item in items]
        scored.sort(key=lambda x: x.total_score, reverse=True)
        return scored[:top_n]

    def save_ranked(self, items: List[ScoredItem]):
        """Save ranked items to <ranked_dir>/<date>.json."""
        output_file = self.ranked_dir / f"{self.date}.json"
        data = {
            'date': self.date,
            'generated_at': datetime.now().isoformat(),
            'count': len(items),
            'items': [
                {
                    'id': item.id,
                    'title': item.title,
                    'url': item.url,
                    'source': item.source,
                    'published': item.published,
                    'summary': item.summary,
                    'scores': item.scores,
                    'total_score': item.total_score,
                }
                for item in items
            ]
        }
        # ensure_ascii=False keeps non-ASCII titles/authors readable in the JSON.
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"[Phase 2] Saved ranked items to {output_file}")

    def run(self, top_n: int = 20) -> List[ScoredItem]:
        """Run full ranking pipeline: load, score, rank, save, and return items."""
        print(f"[Phase 2] Ranking sources for {self.date}")
        sources = self.load_sources()
        print(f"[Phase 2] Loaded {len(sources)} sources")
        ranked = self.rank_items(sources, top_n)
        print(f"[Phase 2] Top {len(ranked)} items selected")
        # Print top 5 for visibility
        print("\n[Phase 2] Top 5 Sources:")
        for i, item in enumerate(ranked[:5], 1):
            print(f" {i}. [{item.total_score:.1f}] {item.title[:60]}...")
        self.save_ranked(ranked)
        return ranked
def main():
    """CLI entry point: parse arguments and run the relevance ranker."""
    today = datetime.now().strftime('%Y-%m-%d')
    parser = argparse.ArgumentParser(description='Deep Dive Phase 2: Relevance Engine')
    parser.add_argument('--date', default=today,
                        help='Target date (YYYY-MM-DD)')
    parser.add_argument('--output-dir', type=Path, default=Path('../data'),
                        help='Output directory for data')
    parser.add_argument('--top-n', type=int, default=20,
                        help='Number of top items to keep')
    args = parser.parse_args()
    RelevanceEngine(args.output_dir, args.date).run(args.top_n)


if __name__ == '__main__':
    main()