99 lines
3.4 KiB
Python
99 lines
3.4 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Relevance Engine for Deep Dive
|
|
Filters and ranks content by Hermes/Timmy relevance
|
|
"""
|
|
|
|
import chromadb
|
|
from chromadb.utils import embedding_functions
|
|
from typing import List, Dict, Any
|
|
import json
|
|
from dataclasses import asdict
|
|
|
|
# Hermes codebase snippets for similarity comparison
|
|
HERMES_CONTEXT = [
    # Each string is one topical reference snippet. All ten are embedded and
    # stored in the Chroma collection (see RelevanceEngine._seed_context);
    # candidate items are scored by their embedding distance to these.
    "Hermes agent system with tool calling and conversation loop",
    "LLM inference with tool orchestration",
    "Retrieval augmented generation RAG architecture",
    "Multi-agent orchestration and delegation",
    "Reinforcement learning RL for agent training",
    "Model quantization and efficient inference",
    "Vector database Chroma for embeddings",
    "MCP Model Context Protocol integration",
    "Gateway pattern for messaging platforms",
    "Agent trajectory logging and replay",
]
|
|
|
|
class RelevanceEngine:
    """Rank and filter content by semantic similarity to the Hermes context.

    Maintains a persistent Chroma collection seeded with the HERMES_CONTEXT
    snippets; items are scored by embedding distance to those snippets using
    the all-MiniLM-L6-v2 sentence-transformer model.
    """

    def __init__(self, collection_name: str = "deep_dive"):
        """Open (or create) the Chroma collection and seed it if empty.

        Args:
            collection_name: Name of the Chroma collection to use.
        """
        self.client = chromadb.PersistentClient(path="./chroma_db")
        self.embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name="all-MiniLM-L6-v2"
        )

        # get_or_create_collection replaces the previous bare try/except
        # around get_collection, which also swallowed KeyboardInterrupt
        # and SystemExit and could mask unrelated errors.
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_fn,
        )

        # Seed only when the collection is empty: re-adding the same
        # ctx_* ids to a populated collection would be a duplicate insert.
        if self.collection.count() == 0:
            self._seed_context()

    def _seed_context(self):
        """Seed the collection with the Hermes context snippets."""
        self.collection.add(
            documents=HERMES_CONTEXT,
            ids=[f"ctx_{i}" for i in range(len(HERMES_CONTEXT))],
            metadatas=[{"type": "context"} for _ in HERMES_CONTEXT],
        )

    def rank_items(self, items: List[Any], text_fn, top_k: int = 10) -> List[tuple]:
        """Rank items by similarity to the Hermes context.

        Args:
            items: Arbitrary objects to rank.
            text_fn: Callable mapping an item to the text to embed.
            top_k: Maximum number of (item, score) pairs to return.

        Returns:
            List of (item, score) tuples sorted by score descending. The
            score is the mean inverse distance 1/(1+d) over the 3 nearest
            context snippets, so it lies in (0, 1] with higher = more
            relevant.
        """
        # Chroma rejects an empty query_texts list; short-circuit so an
        # empty input returns an empty ranking instead of raising.
        if not items:
            return []

        texts = [text_fn(item) for item in items]

        # One batched query: one row of distances per input text.
        results = self.collection.query(
            query_texts=texts,
            n_results=3,
            include=["distances"],
        )

        # Average inverse distance -> similarity score in (0, 1].
        scored = []
        for item, distances in zip(items, results["distances"]):
            avg_similarity = sum(1 / (1 + d) for d in distances) / len(distances)
            scored.append((item, avg_similarity))

        # Highest-scoring items first.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return scored[:top_k]

    def filter_by_keywords(self, items: List[Any], text_fn, keywords: List[str]) -> List[Any]:
        """Keep items whose text contains at least one keyword.

        Matching is case-insensitive substring containment.

        Args:
            items: Objects to filter.
            text_fn: Callable mapping an item to its searchable text.
            keywords: Keywords; an item passes if any appears in its text.

        Returns:
            The matching items, in their original order.
        """
        filtered = []
        for item in items:
            text = text_fn(item).lower()
            if any(kw.lower() in text for kw in keywords):
                filtered.append(item)
        return filtered
|
|
|
|
def rank_papers(papers: List[Any], top_k: int = 10) -> List[tuple]:
    """Convenience wrapper: rank papers by Hermes relevance.

    Builds a fresh RelevanceEngine and scores each paper on the
    concatenation of its title and abstract.
    """

    def paper_text(p) -> str:
        # Title + abstract is the text that gets embedded and compared.
        return f"{p.title} {p.abstract}"

    return RelevanceEngine().rank_items(papers, text_fn=paper_text, top_k=top_k)
|
|
|
|
def _main() -> None:
    """Manual smoke test: build the engine and report the collection size."""
    engine = RelevanceEngine()
    print("Relevance engine initialized")
    print(f"Collection count: {engine.collection.count()}")


if __name__ == "__main__":
    _main()
|