<!-- [#830] Deep Dive architecture scaffold — quick-start guide for Phase 1
     (ArXiv fetcher skeleton, keyword relevance scoring, Telegram text delivery)
     plus Phase 2/4 expansion paths. File: research/deep-dive/IMPLEMENTATION.md -->
# Deep Dive Implementation Guide

> Quick-start path from architecture to running system

---

## Phase 1 Quick Win: ArXiv Text Digest (2-3 hours)

This minimal implementation proves value without Phase 2/4 complexity.

### Step 1: Dependencies

```bash
pip install feedparser requests python-telegram-bot
```

### Step 2: Basic Fetcher

```python
#!/usr/bin/env python3
# scripts/arxiv-fetch.py
# Fetch recent arXiv RSS entries, keep keyword-relevant ones, dump to JSON.
import json
import re
from datetime import datetime
from pathlib import Path

import feedparser

FEEDS = {
    "cs.AI": "http://export.arxiv.org/rss/cs.AI",
    "cs.CL": "http://export.arxiv.org/rss/cs.CL",
    "cs.LG": "http://export.arxiv.org/rss/cs.LG",
}

KEYWORDS = [
    "transformer", "attention", "LLM", "large language model",
    "agent", "multi-agent", "reasoning", "chain-of-thought",
    "RLHF", "fine-tuning", "RAG", "retrieval augmented",
    "vector database", "embedding", "tool use", "function calling"
]


def score_item(title, abstract):
    """Return a relevance score in [0, 1] from keyword hits in title+abstract.

    Matching is case-insensitive and word-bounded. The previous plain
    substring test lowered the text but not the keywords, so the upper-case
    keywords ("LLM", "RLHF", "RAG") could never match; and a bare lowered
    substring test for "rag" would false-positive inside words like "storage".
    """
    text = f"{title} {abstract}".lower()
    matches = sum(
        1 for kw in KEYWORDS
        if re.search(rf"\b{re.escape(kw.lower())}\b", text)
    )
    return min(matches / 3, 1.0)  # 3+ keyword hits saturate at 1.0


def fetch_and_score():
    """Fetch every feed, score entries, and return the top 10 by relevance."""
    items = []
    for category, url in FEEDS.items():
        feed = feedparser.parse(url)
        for entry in feed.entries[:20]:  # Top 20 per category
            score = score_item(entry.title, entry.get("summary", ""))
            if score > 0.2:  # Minimum relevance threshold
                items.append({
                    "category": category,
                    "title": entry.title,
                    "url": entry.link,
                    "score": score,
                    "abstract": entry.get("summary", "")[:300]
                })

    # Sort by score, best first
    items.sort(key=lambda x: x["score"], reverse=True)
    return items[:10]  # Top 10


if __name__ == "__main__":
    items = fetch_and_score()
    date = datetime.now().strftime("%Y-%m-%d")

    # Create the output directory so a fresh checkout doesn't crash here.
    Path("data/raw").mkdir(parents=True, exist_ok=True)
    with open(f"data/raw/{date}-arxiv.json", "w") as f:
        json.dump(items, f, indent=2)

    print(f"Fetched {len(items)} relevant papers")
```

### Step 3: Synthesis (Text Only)

```python
#!/usr/bin/env python3
# scripts/text-digest.py
# Turn today's scored arXiv JSON into a plain-text digest.
import json
from datetime import datetime


def generate_digest(items):
    """Format the top 5 items as a numbered text digest with links and scores."""
    lines = [f"📚 Deep Dive — {datetime.now().strftime('%Y-%m-%d')}", ""]

    for i, item in enumerate(items[:5], 1):
        lines.append(f"{i}. {item['title']}")
        lines.append(f" {item['url']}")
        lines.append(f" Relevance: {item['score']:.2f}")
        lines.append("")

    return "\n".join(lines)


# Guard the script portion so the module is importable (and testable)
# without performing file I/O at import time.
if __name__ == "__main__":
    # Load and generate
    date = datetime.now().strftime("%Y-%m-%d")
    with open(f"data/raw/{date}-arxiv.json") as f:
        items = json.load(f)

    digest = generate_digest(items)
    print(digest)

    # Save
    with open(f"data/briefings/{date}-digest.txt", "w") as f:
        f.write(digest)
```

### Step 4: Telegram Delivery

```python
#!/usr/bin/env python3
# scripts/telegram-send.py
# Deliver today's text digest to the home Telegram channel.
import os
import asyncio
from datetime import datetime  # fix: was missing — datetime.now() below raised NameError

from telegram import Bot


async def send_digest():
    """Send today's digest file as a (truncated) Telegram text message."""
    bot = Bot(token=os.environ["TELEGRAM_BOT_TOKEN"])
    chat_id = os.environ["TELEGRAM_HOME_CHANNEL"]

    date = datetime.now().strftime("%Y-%m-%d")
    with open(f"data/briefings/{date}-digest.txt") as f:
        text = f.read()

    # Telegram hard-caps messages at 4096 chars; stay safely under the limit.
    await bot.send_message(chat_id=chat_id, text=text[:4000])


asyncio.run(send_digest())
```

### Step 5: Cron Setup

```bash
# crontab -e
0 6 * * * cd /path/to/deep-dive && ./scripts/run-daily.sh
```

```bash
#!/bin/bash
# scripts/run-daily.sh — Phase 1 daily pipeline: fetch -> digest -> deliver.
# Any failing step aborts the run (set -e).
set -e

DATE="$(date +%Y-%m-%d)"
mkdir -p data/raw data/briefings

# Run the three pipeline stages in order.
for step in arxiv-fetch text-digest telegram-send; do
    python3 "scripts/${step}.py"
done

echo "✅ Deep Dive completed for $DATE"
```

---

## Phase 2: Embedding-Based Relevance (Add Day 2)

```python
# scripts/rank-embeddings.py
# Re-rank today's keyword-scored papers by embedding similarity to the codebase.
import json
from datetime import datetime

from sentence_transformers import SentenceTransformer
import chromadb

# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize Chroma (persistent)
client = chromadb.PersistentClient(path="data/chroma")
collection = client.get_or_create_collection("hermes-codebase")

# Load top items (was a dead "YYYY-MM-DD" placeholder; use today's file,
# matching the naming convention of the fetcher script)
date = datetime.now().strftime("%Y-%m-%d")
with open(f"data/raw/{date}-arxiv.json") as f:
    items = json.load(f)


# Score using embeddings
def embedding_score(item):
    """Return a similarity score in (0, 1]; higher = closer to the codebase."""
    item_emb = model.encode(item['title'] + " " + item['abstract'])
    # Query similar docs from codebase
    results = collection.query(query_embeddings=[item_emb.tolist()], n_results=5)
    distances = results['distances'][0]
    if not distances:  # empty collection — no evidence of relevance
        return 0.0
    # Chroma returns *distances* (lower = more similar). The original code
    # averaged raw distances into final_score and sorted descending, which
    # ranked the LEAST similar papers highest. Invert so a higher score
    # means more relevant.
    return 1.0 / (1.0 + sum(distances) / len(distances))


# Re-rank: blend keyword score (30%) with embedding similarity (70%)
for item in items:
    item['embedding_score'] = embedding_score(item)
    item['final_score'] = (item['score'] * 0.3) + (item['embedding_score'] * 0.7)

items.sort(key=lambda x: x['final_score'], reverse=True)
```

---

## Phase 4: Piper TTS Integration (Add Day 3)

```bash
# Install Piper
pip install piper-tts

# Download voice
mkdir -p voices
wget -P voices/ https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx
wget -P voices/ https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx.json
```

```python
#!/usr/bin/env python3
# scripts/generate-audio.py
# Render today's briefing to WAV via Piper, then transcode to MP3 with ffmpeg.
import subprocess
from datetime import datetime
from pathlib import Path

date = datetime.now().strftime("%Y-%m-%d")

# Read briefing
with open(f"data/briefings/{date}-briefing.md") as f:
    text = f.read()

# Preprocess for TTS (strip markdown, limit length)
# ...

# Ensure the output directory exists (run-daily.sh only creates raw/ and briefings/).
Path("data/audio").mkdir(parents=True, exist_ok=True)

# Generate audio. check=True: without it a Piper failure was silently
# ignored and ffmpeg then ran against a missing/empty WAV.
subprocess.run([
    "piper",
    "--model", "voices/en_US-lessac-high.onnx",
    "--output_file", f"data/audio/{date}-deep-dive.wav",
    "--length_scale", "1.1"
], input=text[:5000].encode(), check=True)  # First 5K chars

# Convert to MP3
subprocess.run([
    "ffmpeg", "-y", "-i", f"data/audio/{date}-deep-dive.wav",
    "-codec:a", "libmp3lame", "-q:a", "4",
    f"data/audio/{date}-deep-dive.mp3"
], check=True)
```

---

## Testing Checklist

- [ ] Phase 1: Manual run produces valid JSON
- [ ] Phase 1: Keyword filter returns relevant results only
- [ ] Phase 2: Embeddings load without error
- [ ] Phase 2: Chroma collection queries return matches
- [ ] Phase 3: LLM generates coherent briefing
- [ ] Phase 4: Piper produces audible WAV
- [ ] Phase 4: MP3 conversion works
- [ ] Phase 5: Telegram text message delivers
- [ ] Phase 5: Telegram voice message delivers
- [ ] End-to-end: Cron completes without error

---

*Implementation guide version 1.0*