diff --git a/research/deep-dive/IMPLEMENTATION.md b/research/deep-dive/IMPLEMENTATION.md
new file mode 100644
index 0000000..c8c1a59
--- /dev/null
+++ b/research/deep-dive/IMPLEMENTATION.md
@@ -0,0 +1,248 @@
# Deep Dive Implementation Guide

> Quick-start path from architecture to running system

---

## Phase 1 Quick Win: ArXiv Text Digest (2-3 hours)

This minimal implementation proves value without Phase 2/4 complexity.

### Step 1: Dependencies

```bash
pip install feedparser requests python-telegram-bot
```

### Step 2: Basic Fetcher

```python
#!/usr/bin/env python3
# scripts/arxiv-fetch.py
"""Fetch recent arXiv RSS entries and keep only keyword-relevant papers."""
import feedparser
import json
import os
from datetime import datetime

FEEDS = {
    "cs.AI": "http://export.arxiv.org/rss/cs.AI",
    "cs.CL": "http://export.arxiv.org/rss/cs.CL",
    "cs.LG": "http://export.arxiv.org/rss/cs.LG",
}

# Matching is case-insensitive (see score_item), so mixed-case entries
# like "LLM" and "RLHF" are normalized at comparison time.
KEYWORDS = [
    "transformer", "attention", "LLM", "large language model",
    "agent", "multi-agent", "reasoning", "chain-of-thought",
    "RLHF", "fine-tuning", "RAG", "retrieval augmented",
    "vector database", "embedding", "tool use", "function calling"
]

def score_item(title, abstract):
    """Return a relevance score in [0.0, 1.0] based on keyword hits."""
    text = f"{title} {abstract}".lower()
    # FIX: the text is lowercased, so keywords must be lowercased too —
    # previously "LLM", "RLHF", "RAG" etc. could never match.
    matches = sum(1 for kw in KEYWORDS if kw.lower() in text)
    return min(matches / 3, 1.0)  # Cap at 1.0

def fetch_and_score():
    """Pull every feed, score entries, and return the top 10 by relevance."""
    items = []
    for category, url in FEEDS.items():
        feed = feedparser.parse(url)
        for entry in feed.entries[:20]:  # Top 20 per category
            score = score_item(entry.title, entry.get("summary", ""))
            if score > 0.2:  # Minimum relevance threshold
                items.append({
                    "category": category,
                    "title": entry.title,
                    "url": entry.link,
                    "score": score,
                    "abstract": entry.get("summary", "")[:300]
                })

    # Sort by score
    items.sort(key=lambda x: x["score"], reverse=True)
    return items[:10]  # Top 10

if __name__ == "__main__":
    items = fetch_and_score()
    date = datetime.now().strftime("%Y-%m-%d")

    # FIX: create the output directory so a standalone run (outside
    # run-daily.sh, which does its own mkdir) does not crash.
    os.makedirs("data/raw", exist_ok=True)
    with open(f"data/raw/{date}-arxiv.json", "w") as f:
        json.dump(items, f, indent=2)

    print(f"Fetched {len(items)} relevant papers")
```

### Step 3: Synthesis (Text Only)

```python
#!/usr/bin/env python3
# scripts/text-digest.py
"""Render the day's scored arXiv items as a plain-text digest."""
import json
import os
from datetime import datetime

def generate_digest(items):
    """Format the top five items as a numbered, human-readable list."""
    lines = [f"📚 Deep Dive — {datetime.now().strftime('%Y-%m-%d')}", ""]

    for i, item in enumerate(items[:5], 1):
        lines.append(f"{i}. {item['title']}")
        lines.append(f"   {item['url']}")
        lines.append(f"   Relevance: {item['score']:.2f}")
        lines.append("")

    return "\n".join(lines)

# Load and generate
date = datetime.now().strftime("%Y-%m-%d")
with open(f"data/raw/{date}-arxiv.json") as f:
    items = json.load(f)

digest = generate_digest(items)
print(digest)

# Save (create the directory first so a standalone run works)
os.makedirs("data/briefings", exist_ok=True)
with open(f"data/briefings/{date}-digest.txt", "w") as f:
    f.write(digest)
```

### Step 4: Telegram Delivery

```python
#!/usr/bin/env python3
# scripts/telegram-send.py
"""Send today's text digest to the configured Telegram channel."""
import os
import asyncio
from datetime import datetime  # FIX: was missing — datetime.now() raised NameError
from telegram import Bot

async def send_digest():
    """Read the saved digest for today and deliver it via the bot API."""
    bot = Bot(token=os.environ["TELEGRAM_BOT_TOKEN"])
    chat_id = os.environ["TELEGRAM_HOME_CHANNEL"]

    date = datetime.now().strftime("%Y-%m-%d")
    with open(f"data/briefings/{date}-digest.txt") as f:
        text = f.read()

    # Telegram caps messages at 4096 chars; truncate defensively.
    await bot.send_message(chat_id=chat_id, text=text[:4000])

if __name__ == "__main__":
    asyncio.run(send_digest())
```

### Step 5: Cron Setup

```bash
# crontab -e
0 6 * * * cd /path/to/deep-dive && ./scripts/run-daily.sh
```

```bash
#!/bin/bash
# scripts/run-daily.sh
set -e

DATE=$(date +%Y-%m-%d)
mkdir -p "data/raw" "data/briefings"

python3 scripts/arxiv-fetch.py
python3 scripts/text-digest.py
python3 scripts/telegram-send.py

echo "✅ Deep Dive completed for $DATE"
```

---

## Phase 2: Embedding-Based Relevance (Add Day 2)

```python
# scripts/rank-embeddings.py
from sentence_transformers import SentenceTransformer
import chromadb
import json

# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize Chroma (persistent)
client = 
chromadb.PersistentClient(path="data/chroma")
collection = client.get_or_create_collection("hermes-codebase")

# Load today's fetched items.
# FIX: was a literal "YYYY-MM-DD" placeholder that could never match a real
# file — compute the date the same way arxiv-fetch.py names its output.
from datetime import datetime
date = datetime.now().strftime("%Y-%m-%d")
with open(f"data/raw/{date}-arxiv.json") as f:
    items = json.load(f)

# Score using embeddings
def embedding_score(item):
    """Return a similarity score in [0.0, 1.0]; higher = closer to the codebase."""
    item_emb = model.encode(item['title'] + " " + item['abstract'])
    # Query similar docs from codebase
    results = collection.query(query_embeddings=[item_emb.tolist()], n_results=5)
    distances = results['distances'][0]
    if not distances:
        # Empty collection — no codebase affinity signal.
        return 0.0
    # FIX: Chroma returns *distances* (lower = more similar). The original
    # averaged raw distances and used them as a positive score, so the LEAST
    # relevant papers ranked highest. Convert to a similarity in (0, 1].
    avg_dist = sum(distances) / len(distances)
    return 1.0 / (1.0 + avg_dist)

# Re-rank: blend keyword score (30%) with codebase similarity (70%)
for item in items:
    item['embedding_score'] = embedding_score(item)
    item['final_score'] = (item['score'] * 0.3) + (item['embedding_score'] * 0.7)

items.sort(key=lambda x: x['final_score'], reverse=True)
```

---

## Phase 4: Piper TTS Integration (Add Day 3)

```bash
# Install Piper
pip install piper-tts

# Download voice
mkdir -p voices
wget -P voices/ https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx
wget -P voices/ https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx.json
```

```python
#!/usr/bin/env python3
# scripts/generate-audio.py
"""Render the daily briefing to WAV with Piper, then transcode to MP3."""
import os
import subprocess
from datetime import datetime

date = datetime.now().strftime("%Y-%m-%d")

# Read briefing
with open(f"data/briefings/{date}-briefing.md") as f:
    text = f.read()

# Preprocess for TTS (strip markdown, limit length)
# ...

# FIX: make sure the output directory exists before Piper writes into it.
os.makedirs("data/audio", exist_ok=True)

# Generate audio. check=True so a Piper failure stops the pipeline loudly
# instead of handing ffmpeg a missing/partial WAV.
subprocess.run([
    "piper",
    "--model", "voices/en_US-lessac-high.onnx",
    "--output_file", f"data/audio/{date}-deep-dive.wav",
    "--length_scale", "1.1"
], input=text[:5000].encode(), check=True)  # First 5K chars

# Convert to MP3 (check=True: fail fast on transcode errors)
subprocess.run([
    "ffmpeg", "-y", "-i", f"data/audio/{date}-deep-dive.wav",
    "-codec:a", "libmp3lame", "-q:a", "4",
    f"data/audio/{date}-deep-dive.mp3"
], check=True)
```

---

## Testing Checklist

- [ ] Phase 1: Manual run produces valid JSON
- [ ] Phase 1: Keyword filter returns relevant results only
- [ ] Phase 2: Embeddings load without error
- [ ] Phase 2: Chroma collection queries return matches
- [ ] Phase 3: LLM generates coherent briefing
- [ ] Phase 4: Piper produces audible WAV
- [ ] Phase 4: MP3 conversion works
- [ ] Phase 5: Telegram text message delivers
- [ ] Phase 5: Telegram voice message delivers
- [ ] End-to-end: Cron completes without error

---

*Implementation guide version 1.0*