[ezra] Phase 2: Relevance scoring for Deep Dive (#830)
Some checks failed
Deploy Nexus / deploy (push) Has been cancelled

This commit is contained in:
2026-04-05 05:16:33 +00:00
parent 2b06e179d1
commit cbf05e1fc8

246
bin/deepdive_filter.py Normal file
View File

@@ -0,0 +1,246 @@
#!/usr/bin/env python3
"""
Deep Dive Phase 2: Relevance Filtering
Scores and filters entries by Hermes/Timmy relevance.
Usage:
deepdive_filter.py --input PATH --output PATH [--top-n N]
"""
import argparse
import json
import re
from collections import Counter
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Tuple

try:
    from sentence_transformers import SentenceTransformer, util
    EMBEDDINGS_AVAILABLE = True
except ImportError:
    EMBEDDINGS_AVAILABLE = False
    print("[WARN] sentence-transformers not available, keyword-only mode")
@dataclass
class ScoredEntry:
    """A feed entry together with its relevance-scoring breakdown.

    Fields mirror the JSONL record schema written out by main().
    """
    entry: dict                   # original aggregator entry, passed through untouched
    relevance_score: float        # weighted blend of keyword + embedding scores
    keyword_score: float          # keyword-tier score, clamped to a 0-10 scale
    embedding_score: float = 0.0  # cosine similarity rescaled to 0-10 (0 when disabled)
    # default_factory instead of `= None`: the fields are annotated List[str] and
    # consumers (JSONL serialization) expect lists, so the default should be a
    # fresh list per instance rather than None.
    keywords_matched: List[str] = field(default_factory=list)
    reasons: List[str] = field(default_factory=list)
class KeywordScorer:
    """Scores entries by matching tiered keyword lists against title + summary.

    Each matched keyword contributes its tier's weight; the total is
    clamped to a rough 0-10 scale.
    """

    # Weight contributed by one match in each tier.
    WEIGHTS = {
        "high": 3.0,
        "medium": 1.5,
        "low": 0.5
    }
    KEYWORDS = {
        "high": [
            "hermes", "timmy", "timmy foundation",
            "langchain", "llm agent", "agent framework",
            "multi-agent", "agent orchestration",
            "reinforcement learning", "RLHF", "DPO", "GRPO",
            "tool use", "tool calling", "function calling",
            "chain-of-thought", "reasoning", "planning",
            "fine-tuning", "instruction tuning",
            "alignment", "safety"
        ],
        "medium": [
            "llm", "large language model", "transformer",
            "inference optimization", "quantization", "distillation",
            "rag", "retrieval augmented", "vector database",
            "context window", "prompt engineering",
            "mcp", "model context protocol",
            "openai", "anthropic", "claude", "gpt",
            "training", "foundation model"
        ],
        "low": [
            "ai", "artificial intelligence",
            "machine learning", "deep learning",
            "neural network"
        ]
    }

    def score(self, entry: dict) -> Tuple[float, List[str], List[str]]:
        """Return (score, matched_keywords, reasons) for one entry.

        Matching is case-insensitive and anchored on word boundaries so
        short keywords such as "ai" or "rag" do not fire on substrings
        of unrelated words ("maintain", "storage", ...).
        """
        text = f"{entry.get('title', '')} {entry.get('summary', '')}".lower()
        matched = []
        reasons = []
        total_score = 0.0
        for tier, keywords in self.KEYWORDS.items():
            weight = self.WEIGHTS[tier]
            for keyword in keywords:
                # \b anchors fix the false positives of the previous plain
                # substring check (e.g. "ai" matched inside "maintain").
                if re.search(rf"\b{re.escape(keyword.lower())}\b", text):
                    matched.append(keyword)
                    total_score += weight
                    if len(reasons) < 3:  # Limit reasons
                        reasons.append(f"Keyword '{keyword}' ({tier} priority)")
        # Bonus for arXiv AI/CL/LG papers
        if entry.get('source', '').startswith('arxiv'):
            total_score += 0.5
            reasons.append("arXiv AI paper (category bonus)")
        # Normalize score (roughly 0-10 scale)
        normalized = min(10.0, total_score)
        return normalized, matched, reasons
class EmbeddingScorer:
    """Scores entries by embedding similarity against a fixed Hermes context."""

    # Reference sentences describing the Hermes/Timmy problem space; an
    # entry's score is its best cosine similarity to any one of these.
    HERMES_CONTEXT = [
        "Hermes agent framework for autonomous AI systems",
        "Tool calling and function use in LLMs",
        "Multi-agent orchestration and communication",
        "Reinforcement learning from human feedback",
        "LLM fine-tuning and alignment",
        "Model context protocol and agent tools",
        "Open source AI agent systems",
    ]

    def __init__(self):
        # Degrade gracefully: without sentence-transformers this scorer
        # stays inert and score() always reports 0.0.
        if not EMBEDDINGS_AVAILABLE:
            self.model = None
            self.context_embeddings = None
            return
        print("[INFO] Loading embedding model...")
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.context_embeddings = self.model.encode(
            self.HERMES_CONTEXT, convert_to_tensor=True
        )

    def score(self, entry: dict) -> float:
        """Return the best cosine similarity (0-1) of the entry vs HERMES_CONTEXT."""
        if not EMBEDDINGS_AVAILABLE or not self.model:
            return 0.0
        document = f"{entry.get('title', '')}. {entry.get('summary', '')}"
        if not document.strip():
            # Nothing to embed (no title or summary).
            return 0.0
        doc_vector = self.model.encode(document, convert_to_tensor=True)
        sims = util.cos_sim(doc_vector, self.context_embeddings)
        return float(sims.max())
class RelevanceFilter:
    """Main filtering orchestrator: combines keyword and embedding scores."""

    def __init__(self, use_embeddings: bool = True):
        self.keyword_scorer = KeywordScorer()
        # Embedding scorer is optional; when absent, only keywords count
        # (the embedding term contributes 0 to the weighted sum).
        self.embedding_scorer = EmbeddingScorer() if use_embeddings else None
        # Relative weights of the two score components in the combined score.
        self.weights = {
            "keyword": 0.6,
            "embedding": 0.4
        }

    def rank_entries(self, entries: List[dict]) -> List[ScoredEntry]:
        """Score every entry and return them sorted by relevance, best first."""
        scored = []
        for entry in entries:
            kw_score, keywords, reasons = self.keyword_scorer.score(entry)
            emb_score = 0.0
            if self.embedding_scorer:
                emb_score = self.embedding_scorer.score(entry)
                # Rescale cosine similarity (0-1) to the keyword 0-10 scale.
                emb_score = emb_score * 10
            # Weighted blend of both components.
            combined = (
                self.weights["keyword"] * kw_score +
                self.weights["embedding"] * emb_score
            )
            scored.append(ScoredEntry(
                entry=entry,
                relevance_score=combined,
                keyword_score=kw_score,
                embedding_score=emb_score,
                keywords_matched=keywords,
                reasons=reasons
            ))
        # Sort by relevance (descending)
        scored.sort(key=lambda x: x.relevance_score, reverse=True)
        return scored

    def filter_top_n(self, entries: List[dict], n: int = 15, threshold: float = 2.0) -> List[ScoredEntry]:
        """Return the top *n* entries whose relevance score meets *threshold*."""
        scored = self.rank_entries(entries)
        above_threshold = [s for s in scored if s.relevance_score >= threshold]
        result = above_threshold[:n]
        # NOTE: the original f-string had a garbled arrow character, which
        # printed the two counts run together; restored as "->".
        print(f"[INFO] Filtered {len(entries)} -> {len(result)} (threshold={threshold})")
        return result
def main():
    """CLI entry point: load JSONL entries, filter by relevance, write results."""
    parser = argparse.ArgumentParser(description="Deep Dive: Relevance Filtering")
    parser.add_argument("--input", "-i", type=Path, required=True, help="Input JSONL from aggregator")
    parser.add_argument("--output", "-o", type=Path, required=True, help="Output JSONL with scores")
    parser.add_argument("--top-n", "-n", type=int, default=15, help="Number of top entries to keep")
    parser.add_argument("--threshold", "-t", type=float, default=2.0, help="Minimum relevance score")
    parser.add_argument("--no-embeddings", action="store_true", help="Disable embedding scoring")
    args = parser.parse_args()
    print(f"[Deep Dive] Phase 2: Filtering relevance from {args.input}")
    # Load entries; skip blank lines so a trailing newline (or accidental
    # empty line) in the JSONL doesn't crash json.loads("").
    entries = []
    with open(args.input, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                entries.append(json.loads(line))
    print(f"[INFO] Loaded {len(entries)} entries")
    # Filter
    filter_engine = RelevanceFilter(use_embeddings=not args.no_embeddings)
    filtered = filter_engine.filter_top_n(entries, n=args.top_n, threshold=args.threshold)
    # Save results as JSONL, one scored record per line.
    args.output.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        for item in filtered:
            f.write(json.dumps({
                "entry": item.entry,
                "relevance_score": item.relevance_score,
                "keyword_score": item.keyword_score,
                "embedding_score": item.embedding_score,
                "keywords_matched": item.keywords_matched,
                "reasons": item.reasons
            }) + "\n")
    print(f"[SUCCESS] Phase 2 complete: {len(filtered)} entries written to {args.output}")
    # Show top 5
    print("\nTop 5 entries:")
    for item in filtered[:5]:
        title = item.entry.get('title', 'Unknown')[:60]
        print(f"  [{item.relevance_score:.1f}] {title}...")


if __name__ == "__main__":
    main()