[DEEP-DIVE] Scaffold component — #830

2026-04-05 07:42:27 +00:00
parent e18174975a
commit cec82bf991
1 changed files with 98 additions and 0 deletions
--- a/scaffold/deep-dive/relevance/relevance_engine.py
+++ b/scaffold/deep-dive/relevance/relevance_engine.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+"""
+Relevance Engine for Deep Dive
+Filters and ranks content by Hermes/Timmy relevance
+"""
+
+import chromadb
+from chromadb.utils import embedding_functions
+from typing import List, Dict, Any
+import json
+from dataclasses import asdict
+
+# Short natural-language descriptions of Hermes topics used as the reference
+# corpus for similarity comparison. RelevanceEngine._seed_context() stores
+# them in the Chroma collection under ids "ctx_0", "ctx_1", ... and
+# RelevanceEngine.rank_items() scores items by their distance to them.
+HERMES_CONTEXT = [
+    "Hermes agent system with tool calling and conversation loop",
+    "LLM inference with tool orchestration",
+    "Retrieval augmented generation RAG architecture",
+    "Multi-agent orchestration and delegation",
+    "Reinforcement learning RL for agent training",
+    "Model quantization and efficient inference",
+    "Vector database Chroma for embeddings",
+    "MCP Model Context Protocol integration",
+    "Gateway pattern for messaging platforms",
+    "Agent trajectory logging and replay",
+]
+
+class RelevanceEngine:
+    def __init__(self, collection_name: str = "deep_dive"):
+        self.client = chromadb.PersistentClient(path="./chroma_db")
+        self.embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
+            model_name="all-MiniLM-L6-v2"
+        )
+        
+        # Get or create collection
+        try:
+            self.collection = self.client.get_collection(
+                name=collection_name,
+                embedding_function=self.embedding_fn
+            )
+        except:
+            self.collection = self.client.create_collection(
+                name=collection_name,
+                embedding_function=self.embedding_fn
+            )
+            self._seed_context()
+    
+    def _seed_context(self):
+        """Seed the collection with Hermes context."""
+        self.collection.add(
+            documents=HERMES_CONTEXT,
+            ids=[f"ctx_{i}" for i in range(len(HERMES_CONTEXT))],
+            metadatas=[{"type": "context"} for _ in HERMES_CONTEXT]
+        )
+    
+    def rank_items(self, items: List[Any], text_fn, top_k: int = 10) -> List[tuple]:
+        """Rank items by similarity to Hermes context."""
+        texts = [text_fn(item) for item in items]
+        
+        # Query against context
+        results = self.collection.query(
+            query_texts=texts,
+            n_results=3,
+            include=["distances"]
+        )
+        
+        # Calculate relevance scores (inverse distance, averaged)
+        scored = []
+        for item, distances in zip(items, results["distances"]):
+            avg_similarity = sum(1/(1+d) for d in distances) / len(distances)
+            scored.append((item, avg_similarity))
+        
+        # Sort by score descending
+        scored.sort(key=lambda x: x[1], reverse=True)
+        return scored[:top_k]
+    
+    def filter_by_keywords(self, items: List[Any], text_fn, keywords: List[str]) -> List[Any]:
+        """Filter items that match at least one keyword."""
+        filtered = []
+        for item in items:
+            text = text_fn(item).lower()
+            if any(kw.lower() in text for kw in keywords):
+                filtered.append(item)
+        return filtered
+
+def rank_papers(papers: List[Any], top_k: int = 10) -> List[tuple]:
+    """Convenience function for paper ranking."""
+    engine = RelevanceEngine()
+    return engine.rank_items(
+        papers,
+        text_fn=lambda p: f"{p.title} {p.abstract}",
+        top_k=top_k
+    )
+
+if __name__ == "__main__":
+    # Manual smoke check: build an engine with the default settings and
+    # report how many documents its collection currently holds. No sample
+    # items are ranked here.
+    engine = RelevanceEngine()
+    print("Relevance engine initialized")
+    print(f"Collection count: {engine.collection.count()}")