#!/usr/bin/env python3
"""
Relevance Engine for Deep Dive
Filters and ranks content by Hermes/Timmy relevance
"""

from typing import Any, Callable, List, Tuple

import chromadb
from chromadb.utils import embedding_functions

# Hermes codebase snippets for similarity comparison. These seed documents
# form the reference corpus that candidate items are scored against.
HERMES_CONTEXT = [
    "Hermes agent system with tool calling and conversation loop",
    "LLM inference with tool orchestration",
    "Retrieval augmented generation RAG architecture",
    "Multi-agent orchestration and delegation",
    "Reinforcement learning RL for agent training",
    "Model quantization and efficient inference",
    "Vector database Chroma for embeddings",
    "MCP Model Context Protocol integration",
    "Gateway pattern for messaging platforms",
    "Agent trajectory logging and replay",
]


class RelevanceEngine:
    """Scores and filters arbitrary items by semantic similarity to the
    Hermes context corpus stored in a persistent Chroma collection."""

    def __init__(self, collection_name: str = "deep_dive",
                 persist_path: str = "./chroma_db"):
        """Open (or create) the Chroma collection and seed it if empty.

        Args:
            collection_name: Name of the Chroma collection to use.
            persist_path: Directory for Chroma's on-disk storage
                (previously hard-coded; default preserves old behavior).
        """
        self.client = chromadb.PersistentClient(path=persist_path)
        self.embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )
        # get_or_create_collection replaces the original bare `except:`
        # get/create fallback, which swallowed every exception (including
        # KeyboardInterrupt) and raced between the two calls.
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_fn,
        )
        # Seed only once: with get_or_create semantics this runs on the
        # first use; re-seeding an existing collection would add duplicate
        # IDs / double-count the context documents.
        if self.collection.count() == 0:
            self._seed_context()

    def _seed_context(self) -> None:
        """Seed the collection with the Hermes context documents."""
        self.collection.add(
            documents=HERMES_CONTEXT,
            ids=[f"ctx_{i}" for i in range(len(HERMES_CONTEXT))],
            metadatas=[{"type": "context"} for _ in HERMES_CONTEXT],
        )

    def rank_items(self, items: List[Any], text_fn: Callable[[Any], str],
                   top_k: int = 10) -> List[Tuple[Any, float]]:
        """Rank items by similarity to Hermes context.

        Args:
            items: Arbitrary objects to rank.
            text_fn: Maps an item to the text used for embedding.
            top_k: Maximum number of (item, score) pairs to return.

        Returns:
            Up to ``top_k`` ``(item, score)`` tuples, highest score first.
            Scores are mean inverse distances in (0, 1]; higher is closer.
        """
        # Guard: chroma rejects an empty query_texts list, and the scoring
        # loop below would be a no-op anyway.
        if not items:
            return []

        texts = [text_fn(item) for item in items]

        # One batched query; results["distances"] holds one distance list
        # per query text, aligned with `items`.
        results = self.collection.query(
            query_texts=texts,
            n_results=3,
            include=["distances"],
        )

        # Relevance score = average of 1/(1+d) over the nearest neighbors.
        scored: List[Tuple[Any, float]] = []
        for item, distances in zip(items, results["distances"]):
            if distances:
                avg_similarity = sum(1 / (1 + d) for d in distances) / len(distances)
            else:
                # No neighbors returned (e.g. empty collection) — score 0
                # instead of dividing by zero.
                avg_similarity = 0.0
            scored.append((item, avg_similarity))

        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:top_k]

    def filter_by_keywords(self, items: List[Any], text_fn: Callable[[Any], str],
                           keywords: List[str]) -> List[Any]:
        """Return the items whose text contains at least one keyword
        (case-insensitive substring match)."""
        # Lowercase the keywords once instead of once per item.
        lowered = [kw.lower() for kw in keywords]
        return [
            item for item in items
            if any(kw in text_fn(item).lower() for kw in lowered)
        ]


def rank_papers(papers: List[Any], top_k: int = 10) -> List[Tuple[Any, float]]:
    """Convenience wrapper: rank paper-like objects (need `.title` and
    `.abstract` attributes) against the Hermes context."""
    engine = RelevanceEngine()
    return engine.rank_items(
        papers,
        text_fn=lambda p: f"{p.title} {p.abstract}",
        top_k=top_k,
    )


if __name__ == "__main__":
    # Smoke test: initialize the engine and report the collection size.
    engine = RelevanceEngine()
    print("Relevance engine initialized")
    print(f"Collection count: {engine.collection.count()}")