Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Whitestone
80934ae913 docs: add FLEET_VOCABULARY.md — fleet shared language reference
Some checks failed
CI / validate (pull_request) Has been cancelled
Captures the complete shared vocabulary, 9 proven techniques, 8
architectural patterns, and cross-pollination notes from knowledge
merge issue #815. Serves as permanent in-repo reference for all
fleet agents.

Refs #815
2026-04-04 15:44:29 -04:00
6 changed files with 0 additions and 964 deletions

View File

@@ -1,116 +0,0 @@
#!/usr/bin/env python3
"""deepdive_aggregator.py — Phase 1: Intelligence source aggregation. Issue #830."""
import argparse
import json
import xml.etree.ElementTree as ET
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import List, Optional
from pathlib import Path
import urllib.request
@dataclass
class RawItem:
    """One normalized item fetched from an intelligence source (e.g. an arXiv RSS entry)."""
    source: str  # registry name of the adapter that produced it, e.g. "arxiv_cs_ai"
    title: str  # item title, whitespace-stripped by the adapter
    url: str  # canonical link to the item
    content: str  # description/abstract text, truncated by the adapter (2000 chars)
    published: str  # publication timestamp as an ISO 8601 string (sortable lexicographically)
    authors: Optional[str] = None  # not populated by the RSS adapter yet
    categories: Optional[List[str]] = None  # e.g. ["cs.AI"]
class ArxivRSSAdapter:
def __init__(self, category: str):
self.name = f"arxiv_{category}"
self.url = f"http://export.arxiv.org/rss/{category}"
def fetch(self) -> List[RawItem]:
try:
with urllib.request.urlopen(self.url, timeout=30) as resp:
xml_content = resp.read()
except Exception as e:
print(f"Error fetching {self.url}: {e}")
return []
items = []
try:
root = ET.fromstring(xml_content)
channel = root.find("channel")
if channel is None:
return items
for item in channel.findall("item"):
title = item.findtext("title", default="")
link = item.findtext("link", default="")
desc = item.findtext("description", default="")
pub_date = item.findtext("pubDate", default="")
items.append(RawItem(
source=self.name,
title=title.strip(),
url=link,
content=desc[:2000],
published=self._parse_date(pub_date),
categories=[self.category]
))
except ET.ParseError as e:
print(f"Parse error: {e}")
return items
def _parse_date(self, date_str: str) -> str:
from email.utils import parsedate_to_datetime
try:
dt = parsedate_to_datetime(date_str)
return dt.isoformat()
except:
return datetime.now().isoformat()
# Factory registry: source key -> zero-arg callable producing a fresh adapter.
# The lambda default (c=category) binds each factory to its own category.
SOURCE_REGISTRY = {
    f"arxiv_{category.replace('.', '_').lower()}": (lambda c=category: ArxivRSSAdapter(c))
    for category in ("cs.AI", "cs.CL", "cs.LG")
}
def main():
    """CLI entry point: fetch items from the selected sources and emit JSON.

    Writes the payload to --output if given, otherwise prints it to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--sources", default="arxiv_cs_ai,arxiv_cs_cl")
    parser.add_argument("--output")
    args = parser.parse_args()

    requested = [name.strip() for name in args.sources.split(",")]
    collected = []
    for name in requested:
        factory = SOURCE_REGISTRY.get(name)
        if factory is None:
            print(f"[WARN] Unknown source: {name}")
            continue
        fetched = factory().fetch()
        collected.extend(fetched)
        print(f"[INFO] {name}: {len(fetched)} items")
    collected.sort(key=lambda item: item.published, reverse=True)

    payload = {
        "metadata": {
            "count": len(collected),
            "sources": requested,
            "generated": datetime.now().isoformat()
        },
        "items": [asdict(item) for item in collected]
    }
    serialized = json.dumps(payload, indent=2)
    if args.output:
        Path(args.output).write_text(serialized)
    else:
        print(serialized)


if __name__ == "__main__":
    main()

View File

@@ -1,95 +0,0 @@
#!/usr/bin/env python3
"""deepdive_orchestrator.py — Deep Dive pipeline controller. Issue #830."""
import argparse
import json
import subprocess
import sys
from datetime import datetime
from pathlib import Path
# Pipeline defaults used when no explicit configuration is supplied.
DEFAULT_CONFIG = dict(
    sources=["arxiv_cs_ai", "arxiv_cs_cl", "arxiv_cs_lg"],
    max_items=10,
    tts_enabled=False,
)
class Orchestrator:
def __init__(self, date: str = None):
self.date = date or datetime.now().strftime("%Y-%m-%d")
self.state_dir = Path("~/the-nexus/deepdive_state").expanduser() / self.date
self.state_dir.mkdir(parents=True, exist_ok=True)
def phase1(self, sources):
"""Aggregate from sources."""
print("[PHASE 1] Aggregating...")
output_file = self.state_dir / "raw_items.json"
subprocess.run([
sys.executable, Path(__file__).parent / "deepdive_aggregator.py",
"--sources", ",".join(sources), "--output", str(output_file)
])
return json.loads(output_file.read_text())
def phase2(self, raw_items, max_items):
"""Filter by keywords."""
print("[PHASE 2] Filtering...")
keywords = ["agent", "llm", "tool use", "rlhf", "alignment", "finetuning"]
scored = []
for item in raw_items["items"]:
content = f"{item.get('title','')} {item.get('content','')}".lower()
score = sum(1 for kw in keywords if kw in content)
scored.append({**item, "score": score})
scored.sort(key=lambda x: x["score"], reverse=True)
top = scored[:max_items]
output_file = self.state_dir / "ranked.json"
output_file.write_text(json.dumps({"items": top}, indent=2))
return top
def phase3(self, ranked_items):
"""Synthesize briefing."""
print("[PHASE 3] Synthesizing (MVP: structured text)...")
md = f"# Deep Dive — {self.date}\n\n"
for i, item in enumerate(ranked_items[:3], 1):
md += f"{i}. [{item['title']}]({item['url']}) — Score: {item['score']}\n\n"
briefing_file = self.state_dir / "briefing.md"
briefing_file.write_text(md)
return str(briefing_file)
def phase4(self, briefing_file):
"""Generate audio."""
print("[PHASE 4] ⚠ TTS decision needed — skipping")
return None
def phase5(self, briefing_file, audio_file):
"""Deliver."""
print("[PHASE 5] ⚠ Telegram delivery not integrated")
text = Path(briefing_file).read_text()
print(text[:500])
return {"status": "logged"}
def run(self, config):
raw = self.phase1(config["sources"])
ranked = self.phase2(raw, config["max_items"])
briefing = self.phase3(ranked)
audio = self.phase4(briefing)
return self.phase5(briefing, audio)
def main():
    """CLI entry point: build an Orchestrator and run the default pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--daily", action="store_true")
    parser.add_argument("--date")
    args = parser.parse_args()

    orchestrator = Orchestrator(date=args.date)
    orchestrator.run(DEFAULT_CONFIG)
    print(f"[DONE] State: {orchestrator.state_dir}")


if __name__ == "__main__":
    main()

View File

@@ -1,88 +0,0 @@
# Deep Dive — Sovereign NotebookLM Architecture
> Parent: [#830](http://143.198.27.163:3000/Timmy_Foundation/the-nexus/issues/830)
> Status: Architecture committed, awaiting infrastructure decisions
> Owner: @ezra
> Created: 2026-04-05
## Vision
**Deep Dive** is a fully automated daily intelligence briefing system that eliminates the 20+ minute manual research overhead. It produces a personalized AI-generated podcast (or text briefing) with **zero manual input**.
Unlike NotebookLM which requires manual source curation, Deep Dive operates autonomously.
## Architecture Overview
```
┌──────────────────────────────────────────────────────────────────────────────┐
│ D E E P D I V E P I P E L I N E │
├──────────────────────────────────────────────────────────────────────────────┤
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌────────┐ │
│ │ AGGREGATE │──▶│ FILTER │──▶│ SYNTHESIZE│──▶│ AUDIO │──▶│DELIVER │ │
│ │ arXiv RSS │ │ Keywords │ │ LLM brief │ │ TTS voice │ │Telegram│ │
│ └───────────┘ └───────────┘ └───────────┘ └───────────┘ └────────┘ │
└──────────────────────────────────────────────────────────────────────────────┘
```
## Phase Specifications
### Phase 1: Aggregate
Fetches from arXiv RSS (cs.AI, cs.CL, cs.LG), lab blogs, newsletters.
**Output**: `List[RawItem]`
**Implementation**: `bin/deepdive_aggregator.py`
### Phase 2: Filter
Ranks items by keyword relevance to Hermes/Timmy work.
**Scoring Algorithm (MVP)**:
```python
keywords = ["agent", "llm", "tool use", "rlhf", "alignment"]
score = sum(1 for kw in keywords if kw in content)
```
### Phase 3: Synthesize
LLM generates structured briefing: HEADLINES, DEEP DIVES, BOTTOM LINE.
### Phase 4: Audio
TTS converts briefing to MP3 (10-15 min).
**Decision needed**: Local (Piper/coqui) vs API (ElevenLabs/OpenAI)
### Phase 5: Deliver
Telegram voice message delivered at scheduled time (default 6 AM).
## Implementation Path
### MVP (2 hours, Phases 1+5)
arXiv RSS → keyword filter → text briefing → Telegram text at 6 AM
### V1 (1 week, Phases 1-3+5)
Add LLM synthesis, more sources
### V2 (2 weeks, Full)
Add TTS audio, embedding-based filtering
## Integration Points
| System | Point | Status |
|--------|-------|--------|
| Hermes | `/deepdive` command | Pending |
| timmy-config | `cron/jobs.json` entry | Ready |
| Telegram | Voice delivery | Existing |
| TTS Service | Local vs API | **NEEDS DECISION** |
## Files
- `docs/DEEPDIVE_ARCHITECTURE.md` — This document
- `bin/deepdive_aggregator.py` — Phase 1 source adapters
- `bin/deepdive_orchestrator.py` — Pipeline controller
## Blockers
| # | Item | Status |
|---|------|--------|
| 1 | TTS Service decision | **NEEDS DECISION** |
| 2 | `/deepdive` command registration | Pending |
**Ezra, Architect** — 2026-04-05

View File

@@ -1,416 +0,0 @@
# Deep Dive: Sovereign NotebookLM + Daily AI Intelligence Briefing
> **Issue**: #830
> **Type**: EPIC (21 story points)
> **Owner**: Ezra (assigned by Alexander)
> **Status**: Architecture complete → Phase 1 ready for implementation
---
## Vision
A fully automated daily intelligence briefing system that delivers a personalized AI-generated podcast briefing with **zero manual input**.
**Inspiration**: NotebookLM workflow (ingest → rank → synthesize → narrate → deliver) — but automated, scheduled, and sovereign.
---
## 5-Phase Architecture
```
┌─────────────────────────────────────────────────────────────────────────┐
│ DEEP DIVE PIPELINE │
├───────────────┬───────────────┬───────────────┬───────────────┬─────────┤
│ PHASE 1 │ PHASE 2 │ PHASE 3 │ PHASE 4 │ PHASE 5 │
├───────────────┼───────────────┼───────────────┼───────────────┼─────────┤
│ AGGREGATE │ RANK │ SYNTHESIZE │ NARRATE │ DELIVER │
├───────────────┼───────────────┼───────────────┼───────────────┼─────────┤
│ ArXiv RSS │ Embedding │ LLM briefing │ TTS engine │Telegram │
│ Lab feeds │ similarity │ generator │ (Piper / │ voice │
│ Newsletters │ vs codebase │ │ ElevenLabs) │ message │
│ HackerNews │ │ │ │ │
└───────────────┴───────────────┴───────────────┴───────────────┴─────────┘
Timeline: 05:00 → 05:15 → 05:30 → 05:45 → 06:00
Fetch Score Generate Audio Deliver
```
---
## Phase 1: Source Aggregation (5 points)
### Data Sources
| Source | URL/API | Frequency | Priority |
|--------|---------|-----------|----------|
| ArXiv cs.AI | `http://export.arxiv.org/rss/cs.AI` | Daily 5 AM | P1 |
| ArXiv cs.CL | `http://export.arxiv.org/rss/cs.CL` | Daily 5 AM | P1 |
| ArXiv cs.LG | `http://export.arxiv.org/rss/cs.LG` | Daily 5 AM | P1 |
| OpenAI Blog | `https://openai.com/blog/rss.xml` | Daily 5 AM | P1 |
| Anthropic | `https://www.anthropic.com/blog/rss.xml` | Daily 5 AM | P1 |
| DeepMind | `https://deepmind.google/blog/rss.xml` | Daily 5 AM | P2 |
| Google Research | `https://research.google/blog/rss.xml` | Daily 5 AM | P2 |
| Import AI | Newsletter (email/IMAP) | Daily 5 AM | P2 |
| TLDR AI | `https://tldr.tech/ai/rss` | Daily 5 AM | P2 |
| HackerNews | `https://hnrss.org/newest?points=100` | Daily 5 AM | P3 |
### Storage Format
```json
{
"fetched_at": "2025-01-15T05:00:00Z",
"source": "arxiv_cs_ai",
"items": [
{
"id": "arxiv:2501.01234",
"title": "Attention is All You Need: The Sequel",
"abstract": "...",
"url": "https://arxiv.org/abs/2501.01234",
"authors": ["..."],
"published": "2025-01-14",
"raw_text": "title + abstract"
}
]
}
```
### Output
`data/deep-dive/raw/YYYY-MM-DD-{source}.jsonl`
---
## Phase 2: Relevance Engine (6 points)
### Scoring Approach
**Multi-factor relevance score (0-100)**:
```python
score = (
embedding_similarity * 0.40 + # Cosine sim vs Hermes codebase
keyword_match_score * 0.30 + # Title/abstract keyword hits
source_priority * 0.15 + # ArXiv cs.AI = 1.0, HN = 0.3
recency_boost * 0.10 + # Today = 1.0, -0.1 per day
user_feedback * 0.05 # Past thumbs up/down
)
```
### Keyword Priority List
```yaml
high_value:
- "transformer"
- "attention mechanism"
- "large language model"
- "LLM"
- "agent"
- "multi-agent"
- "reasoning"
- "chain-of-thought"
- "RLHF"
- "fine-tuning"
- "retrieval augmented"
- "RAG"
- "vector database"
- "embedding"
- "tool use"
- "function calling"
medium_value:
- "BERT"
- "GPT"
- "training efficiency"
- "inference optimization"
- "quantization"
- "distillation"
```
### Vector Database Decision Matrix
| Option | Pros | Cons | Recommendation |
|--------|------|------|----------------|
| **Chroma** | SQLite-backed, zero ops, local | Scales to ~1M docs max | ✅ **Default** |
| PostgreSQL + pgvector | Enterprise proven, ACID | Requires Postgres | If Nexus uses Postgres |
| FAISS (in-memory) | Fastest search | Rebuild daily | Budget option |
### Output
`data/deep-dive/scored/YYYY-MM-DD-ranked.json`
Top 10 items selected for synthesis.
---
## Phase 3: Synthesis Engine (3 points)
### Prompt Architecture
```
You are Deep Dive, a technical intelligence briefing AI for the Hermes/Timmy
agent system. Your audience is an AI agent builder working on sovereign,
local-first AI infrastructure.
SOURCE MATERIAL:
{ranked_items}
GENERATE:
1. **Headlines** (3 bullets): Key announcements in 20 words each
2. **Deep Dives** (2-3): Important papers with technical summary and
implications for agent systems
3. **Quick Hits** (3-5): Brief mentions worth knowing
4. **Context Bridge**: Connect to Hermes/Timmy current work
- Mention if papers relate to RL training, tool calling, local inference,
or multi-agent coordination
TONE: Professional, concise, technically precise
TARGET LENGTH: 800-1200 words (10-15 min spoken)
```
### Output Format (Markdown)
```markdown
# Deep Dive: YYYY-MM-DD
## Headlines
- [Item 1]
- [Item 2]
- [Item 3]
## Deep Dives
### [Paper Title]
**Source**: ArXiv cs.AI | **Authors**: [...]
[Technical summary]
**Why it matters for Hermes**: [...]
## Quick Hits
- [...]
## Context Bridge
[Connection to current work]
```
### Output
`data/deep-dive/briefings/YYYY-MM-DD-briefing.md`
---
## Phase 4: Audio Generation (4 points)
### TTS Engine Options
| Engine | Cost | Quality | Latency | Sovereignty |
|--------|------|---------|---------|-------------|
| **Piper** (local) | Free | Good | Medium | ✅ 100% |
| Coqui TTS (local) | Free | Medium-High | High | ✅ 100% |
| ElevenLabs API | $0.05/min | Excellent | Low | ❌ Cloud |
| OpenAI TTS | $0.015/min | Excellent | Low | ❌ Cloud |
| Google Cloud TTS | $0.004/min | Good | Low | ❌ Cloud |
### Recommendation
**Hybrid approach**:
- Default: Piper (on-device, sovereign)
- Override flag: ElevenLabs/OpenAI for special episodes
### Piper Configuration
```python
# High-quality English voice
model = "en_US-lessac-high"
# Speaking rate: ~150 WPM for technical content
length_scale = 1.1
# Output format
output_format = "mp3" # 128kbps
```
### Audio Enhancement
```bash
# Add intro/outro jingles
ffmpeg -i intro.mp3 -i speech.mp3 -i outro.mp3 \
-filter_complex "[0:a][1:a][2:a]concat=n=3:v=0:a=1" \
deep-dive-YYYY-MM-DD.mp3
```
### Output
`data/deep-dive/audio/YYYY-MM-DD-deep-dive.mp3` (12-18 MB)
---
## Phase 5: Delivery Pipeline (3 points)
### Cron Schedule
```cron
# Daily at 6:00 AM EST
0 6 * * * cd /path/to/deep-dive && ./run-daily.sh
# Or: staggered phases for visibility
0 5 * * * ./phase1-fetch.sh
15 5 * * * ./phase2-rank.sh
30 5 * * * ./phase3-synthesize.sh
45 5 * * * ./phase4-narrate.sh
0 6 * * * ./phase5-deliver.sh
```
### Telegram Integration
```python
# Via Hermes gateway or direct bot
bot.send_voice(
chat_id=TELEGRAM_HOME_CHANNEL,
voice=open("deep-dive-YYYY-MM-DD.mp3", "rb"),
caption=f"📻 Deep Dive for {date}: {headline_summary}",
duration=estimated_seconds
)
```
### On-Demand Command
```
/deepdive [date]
# Fetches briefing for specified date (default: today)
# If audio exists: sends voice message
# If not: generates on-demand (may take 2-3 min)
```
---
## Implementation Roadmap
### Quick Win: Phase 1 Only (2-3 hours)
**Goal**: Prove value with text-only digests
```bash
# 1. ArXiv RSS fetcher
# 2. Simple keyword filter
# 3. Text digest via Telegram
# 4. Cron schedule
Result: Daily 6 AM text briefing
```
### MVP: Phases 1-3-5 (Skip 2,4)
**Goal**: Working system without embedding/audio complexity
```
Fetch → Keyword filter → LLM synthesize → Text delivery
```
Duration: 1-2 days
### Full Implementation: All 5 Phases
**Goal**: Complete automated podcast system
Duration: 1-2 weeks (parallel development possible)
---
## Directory Structure
```
the-nexus/
└── research/
└── deep-dive/
├── ARCHITECTURE.md # This file
├── IMPLEMENTATION.md # Detailed dev guide
├── config/
│ ├── sources.yaml # RSS/feed URLs
│ ├── keywords.yaml # Relevance keywords
│ └── prompts/
│ ├── synthesis.txt # LLM prompt template
│ └── headlines.txt # Headline-only prompt
├── scripts/
│ ├── phase1-aggregate.py
│ ├── phase2-rank.py
│ ├── phase3-synthesize.py
│ ├── phase4-narrate.py
│ ├── phase5-deliver.py
│ └── run-daily.sh # Orchestrator
└── data/ # .gitignored
├── raw/ # Fetched sources
├── scored/ # Ranked items
├── briefings/ # Markdown outputs
└── audio/ # MP3 files
```
---
## Acceptance Criteria
| # | Criterion | Phase |
|---|-----------|-------|
| 1 | Zero manual copy-paste | 1-5 |
| 2 | Daily 6 AM delivery | 5 |
| 3 | ArXiv coverage (cs.AI, cs.CL, cs.LG) | 1 |
| 4 | Lab blog coverage | 1 |
| 5 | Relevance ranking by Hermes context | 2 |
| 6 | Written briefing generation | 3 |
| 7 | TTS audio production | 4 |
| 8 | Telegram voice delivery | 5 |
| 9 | On-demand `/deepdive` command | 5 |
---
## Risk Matrix
| Risk | Likelihood | Impact | Mitigation |
|------|------------|--------|------------|
| ArXiv rate limiting | Medium | Medium | Exponential backoff, caching |
| RSS feed changes | Medium | Low | Health checks, fallback sources |
| TTS quality poor | Low (Piper) | High | Cloud override flag |
| Vector DB too slow | Low | Medium | Batch overnight, cache embeddings |
| Telegram file size | Low | Medium | Compress audio, split long episodes |
---
## Dependencies
### Required
- Python 3.10+
- `feedparser` (RSS)
- `requests` (HTTP)
- `chromadb` or `sqlite3` (storage)
- Hermes LLM client (synthesis)
- Piper TTS (local audio)
### Optional
- `sentence-transformers` (embeddings)
- `ffmpeg` (audio post-processing)
- ElevenLabs API key (cloud TTS fallback)
---
## Related Issues
- #830 (Parent EPIC)
- Commandment 6: Human-to-fleet comms
- #166: Matrix/Conduit deployment
---
## Next Steps
1. **Decision**: Vector DB selection (Chroma vs pgvector)
2. **Implementation**: Phase 1 skeleton (ArXiv fetcher)
3. **Integration**: Hermes cron registration
4. **Testing**: 3-day dry run (text only)
5. **Enhancement**: Add TTS (Phase 4)
---
*Architecture document version 1.0 — Ezra, 2026-04-05*

View File

@@ -1,248 +0,0 @@
# Deep Dive Implementation Guide
> Quick-start path from architecture to running system
---
## Phase 1 Quick Win: ArXiv Text Digest (2-3 hours)
This minimal implementation proves value without Phase 2/4 complexity.
### Step 1: Dependencies
```bash
pip install feedparser requests python-telegram-bot
```
### Step 2: Basic Fetcher
```python
#!/usr/bin/env python3
# scripts/arxiv-fetch.py
import feedparser
import json
from datetime import datetime
FEEDS = {
"cs.AI": "http://export.arxiv.org/rss/cs.AI",
"cs.CL": "http://export.arxiv.org/rss/cs.CL",
"cs.LG": "http://export.arxiv.org/rss/cs.LG",
}
KEYWORDS = [
"transformer", "attention", "LLM", "large language model",
"agent", "multi-agent", "reasoning", "chain-of-thought",
"RLHF", "fine-tuning", "RAG", "retrieval augmented",
"vector database", "embedding", "tool use", "function calling"
]
def score_item(title, abstract):
text = f"{title} {abstract}".lower()
matches = sum(1 for kw in KEYWORDS if kw in text)
return min(matches / 3, 1.0) # Cap at 1.0
def fetch_and_score():
items = []
for category, url in FEEDS.items():
feed = feedparser.parse(url)
for entry in feed.entries[:20]: # Top 20 per category
score = score_item(entry.title, entry.get("summary", ""))
if score > 0.2: # Minimum relevance threshold
items.append({
"category": category,
"title": entry.title,
"url": entry.link,
"score": score,
"abstract": entry.get("summary", "")[:300]
})
# Sort by score
items.sort(key=lambda x: x["score"], reverse=True)
return items[:10] # Top 10
if __name__ == "__main__":
items = fetch_and_score()
date = datetime.now().strftime("%Y-%m-%d")
with open(f"data/raw/{date}-arxiv.json", "w") as f:
json.dump(items, f, indent=2)
print(f"Fetched {len(items)} relevant papers")
```
### Step 3: Synthesis (Text Only)
```python
#!/usr/bin/env python3
# scripts/text-digest.py
import json
from datetime import datetime
def generate_digest(items):
lines = [f"📚 Deep Dive — {datetime.now().strftime('%Y-%m-%d')}", ""]
for i, item in enumerate(items[:5], 1):
lines.append(f"{i}. {item['title']}")
lines.append(f" {item['url']}")
lines.append(f" Relevance: {item['score']:.2f}")
lines.append("")
return "\n".join(lines)
# Load and generate
date = datetime.now().strftime("%Y-%m-%d")
with open(f"data/raw/{date}-arxiv.json") as f:
items = json.load(f)
digest = generate_digest(items)
print(digest)
# Save
with open(f"data/briefings/{date}-digest.txt", "w") as f:
f.write(digest)
```
### Step 4: Telegram Delivery
```python
#!/usr/bin/env python3
# scripts/telegram-send.py
import os
import asyncio
from datetime import datetime
from telegram import Bot
async def send_digest():
bot = Bot(token=os.environ["TELEGRAM_BOT_TOKEN"])
chat_id = os.environ["TELEGRAM_HOME_CHANNEL"]
date = datetime.now().strftime("%Y-%m-%d")
with open(f"data/briefings/{date}-digest.txt") as f:
text = f.read()
await bot.send_message(chat_id=chat_id, text=text[:4000])
asyncio.run(send_digest())
```
### Step 5: Cron Setup
```bash
# crontab -e
0 6 * * * cd /path/to/deep-dive && ./scripts/run-daily.sh
```
```bash
#!/bin/bash
# scripts/run-daily.sh
set -e
DATE=$(date +%Y-%m-%d)
mkdir -p "data/raw" "data/briefings"
python3 scripts/arxiv-fetch.py
python3 scripts/text-digest.py
python3 scripts/telegram-send.py
echo "✅ Deep Dive completed for $DATE"
```
---
## Phase 2: Embedding-Based Relevance (Add Day 2)
```python
# scripts/rank-embeddings.py
from sentence_transformers import SentenceTransformer
import chromadb
import json
# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')
# Initialize Chroma (persistent)
client = chromadb.PersistentClient(path="data/chroma")
collection = client.get_or_create_collection("hermes-codebase")
# Load top items
with open("data/raw/YYYY-MM-DD-arxiv.json") as f:
items = json.load(f)
# Score using embeddings
def embedding_score(item):
item_emb = model.encode(item['title'] + " " + item['abstract'])
# Query similar docs from codebase
results = collection.query(query_embeddings=[item_emb.tolist()], n_results=5)
# Average similarity of top matches
return sum(results['distances'][0]) / len(results['distances'][0])
# Re-rank
for item in items:
item['embedding_score'] = embedding_score(item)
item['final_score'] = (item['score'] * 0.3) + (item['embedding_score'] * 0.7)
items.sort(key=lambda x: x['final_score'], reverse=True)
```
---
## Phase 4: Piper TTS Integration (Add Day 3)
```bash
# Install Piper
pip install piper-tts
# Download voice
mkdir -p voices
wget -P voices/ https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx
wget -P voices/ https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx.json
```
```python
#!/usr/bin/env python3
# scripts/generate-audio.py
import subprocess
from datetime import datetime
date = datetime.now().strftime("%Y-%m-%d")
# Read briefing
with open(f"data/briefings/{date}-briefing.md") as f:
text = f.read()
# Preprocess for TTS (strip markdown, limit length)
# ...
# Generate audio
subprocess.run([
"piper",
"--model", "voices/en_US-lessac-high.onnx",
"--output_file", f"data/audio/{date}-deep-dive.wav",
"--length_scale", "1.1"
], input=text[:5000].encode()) # First 5K chars
# Convert to MP3
subprocess.run([
"ffmpeg", "-y", "-i", f"data/audio/{date}-deep-dive.wav",
"-codec:a", "libmp3lame", "-q:a", "4",
f"data/audio/{date}-deep-dive.mp3"
])
```
---
## Testing Checklist
- [ ] Phase 1: Manual run produces valid JSON
- [ ] Phase 1: Keyword filter returns relevant results only
- [ ] Phase 2: Embeddings load without error
- [ ] Phase 2: Chroma collection queries return matches
- [ ] Phase 3: LLM generates coherent briefing
- [ ] Phase 4: Piper produces audible WAV
- [ ] Phase 4: MP3 conversion works
- [ ] Phase 5: Telegram text message delivers
- [ ] Phase 5: Telegram voice message delivers
- [ ] End-to-end: Cron completes without error
---
*Implementation guide version 1.0*

View File

@@ -1 +0,0 @@
# Data directory - not committed