Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Whitestone
80934ae913 docs: add FLEET_VOCABULARY.md — fleet shared language reference
Some checks failed
CI / validate (pull_request) Has been cancelled
Captures the complete shared vocabulary, 9 proven techniques, 8
architectural patterns, and cross-pollination notes from knowledge
merge issue #815. Serves as permanent in-repo reference for all
fleet agents.

Refs #815
2026-04-04 15:44:29 -04:00
6 changed files with 0 additions and 964 deletions

View File

@@ -1,116 +0,0 @@
#!/usr/bin/env python3
"""deepdive_aggregator.py — Phase 1: Intelligence source aggregation. Issue #830."""
import argparse
import json
import xml.etree.ElementTree as ET
from dataclasses import dataclass, asdict
from datetime import datetime
from typing import List, Optional
from pathlib import Path
import urllib.request
@dataclass
class RawItem:
    """One normalized item fetched from an intelligence source (e.g. an arXiv RSS entry)."""
    source: str  # registry name of the adapter that produced it, e.g. "arxiv_cs_ai"
    title: str  # item title, whitespace-stripped by the adapter
    url: str  # canonical link to the item
    content: str  # description/abstract text, truncated by the adapter (2000 chars)
    published: str  # publication timestamp as an ISO 8601 string (sortable lexicographically)
    authors: Optional[str] = None  # not populated by the RSS adapter yet
    categories: Optional[List[str]] = None  # e.g. ["cs.AI"]
class ArxivRSSAdapter:
def __init__(self, category: str):
self.name = f"arxiv_{category}"
self.url = f"http://export.arxiv.org/rss/{category}"
def fetch(self) -> List[RawItem]:
try:
with urllib.request.urlopen(self.url, timeout=30) as resp:
xml_content = resp.read()
except Exception as e:
print(f"Error fetching {self.url}: {e}")
return []
items = []
try:
root = ET.fromstring(xml_content)
channel = root.find("channel")
if channel is None:
return items
for item in channel.findall("item"):
title = item.findtext("title", default="")
link = item.findtext("link", default="")
desc = item.findtext("description", default="")
pub_date = item.findtext("pubDate", default="")
items.append(RawItem(
source=self.name,
title=title.strip(),
url=link,
content=desc[:2000],
published=self._parse_date(pub_date),
categories=[self.category]
))
except ET.ParseError as e:
print(f"Parse error: {e}")
return items
def _parse_date(self, date_str: str) -> str:
from email.utils import parsedate_to_datetime
try:
dt = parsedate_to_datetime(date_str)
return dt.isoformat()
except:
return datetime.now().isoformat()
# Factory registry: source key -> zero-arg callable producing a fresh adapter.
# The lambda default (c=category) binds each factory to its own category.
SOURCE_REGISTRY = {
    f"arxiv_{category.replace('.', '_').lower()}": (lambda c=category: ArxivRSSAdapter(c))
    for category in ("cs.AI", "cs.CL", "cs.LG")
}
def main():
    """CLI entry point: fetch items from the selected sources and emit JSON.

    Writes the payload to --output if given, otherwise prints it to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--sources", default="arxiv_cs_ai,arxiv_cs_cl")
    parser.add_argument("--output")
    args = parser.parse_args()

    requested = [name.strip() for name in args.sources.split(",")]
    collected = []
    for name in requested:
        factory = SOURCE_REGISTRY.get(name)
        if factory is None:
            print(f"[WARN] Unknown source: {name}")
            continue
        fetched = factory().fetch()
        collected.extend(fetched)
        print(f"[INFO] {name}: {len(fetched)} items")
    collected.sort(key=lambda item: item.published, reverse=True)

    payload = {
        "metadata": {
            "count": len(collected),
            "sources": requested,
            "generated": datetime.now().isoformat()
        },
        "items": [asdict(item) for item in collected]
    }
    serialized = json.dumps(payload, indent=2)
    if args.output:
        Path(args.output).write_text(serialized)
    else:
        print(serialized)


if __name__ == "__main__":
    main()

View File

@@ -1,95 +0,0 @@
#!/usr/bin/env python3
"""deepdive_orchestrator.py — Deep Dive pipeline controller. Issue #830."""
import argparse
import json
import subprocess
import sys
from datetime import datetime
from pathlib import Path
# Pipeline defaults used when no explicit configuration is supplied.
DEFAULT_CONFIG = dict(
    sources=["arxiv_cs_ai", "arxiv_cs_cl", "arxiv_cs_lg"],
    max_items=10,
    tts_enabled=False,
)
class Orchestrator:
def __init__(self, date: str = None):
self.date = date or datetime.now().strftime("%Y-%m-%d")
self.state_dir = Path("~/the-nexus/deepdive_state").expanduser() / self.date
self.state_dir.mkdir(parents=True, exist_ok=True)
def phase1(self, sources):
"""Aggregate from sources."""
print("[PHASE 1] Aggregating...")
output_file = self.state_dir / "raw_items.json"
subprocess.run([
sys.executable, Path(__file__).parent / "deepdive_aggregator.py",
"--sources", ",".join(sources), "--output", str(output_file)
])
return json.loads(output_file.read_text())
def phase2(self, raw_items, max_items):
"""Filter by keywords."""
print("[PHASE 2] Filtering...")
keywords = ["agent", "llm", "tool use", "rlhf", "alignment", "finetuning"]
scored = []
for item in raw_items["items"]:
content = f"{item.get('title','')} {item.get('content','')}".lower()
score = sum(1 for kw in keywords if kw in content)
scored.append({**item, "score": score})
scored.sort(key=lambda x: x["score"], reverse=True)
top = scored[:max_items]
output_file = self.state_dir / "ranked.json"
output_file.write_text(json.dumps({"items": top}, indent=2))
return top
def phase3(self, ranked_items):
"""Synthesize briefing."""
print("[PHASE 3] Synthesizing (MVP: structured text)...")
md = f"# Deep Dive — {self.date}\n\n"
for i, item in enumerate(ranked_items[:3], 1):
md += f"{i}. [{item['title']}]({item['url']}) — Score: {item['score']}\n\n"
briefing_file = self.state_dir / "briefing.md"
briefing_file.write_text(md)
return str(briefing_file)
def phase4(self, briefing_file):
"""Generate audio."""
print("[PHASE 4] ⚠ TTS decision needed — skipping")
return None
def phase5(self, briefing_file, audio_file):
"""Deliver."""
print("[PHASE 5] ⚠ Telegram delivery not integrated")
text = Path(briefing_file).read_text()
print(text[:500])
return {"status": "logged"}
def run(self, config):
raw = self.phase1(config["sources"])
ranked = self.phase2(raw, config["max_items"])
briefing = self.phase3(ranked)
audio = self.phase4(briefing)
return self.phase5(briefing, audio)
def main():
    """CLI entry point: build an Orchestrator and run the default pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--daily", action="store_true")
    parser.add_argument("--date")
    args = parser.parse_args()

    orchestrator = Orchestrator(date=args.date)
    orchestrator.run(DEFAULT_CONFIG)
    print(f"[DONE] State: {orchestrator.state_dir}")


if __name__ == "__main__":
    main()

View File

@@ -1,88 +0,0 @@
# Deep Dive — Sovereign NotebookLM Architecture
> Parent: [#830](http://143.198.27.163:3000/Timmy_Foundation/the-nexus/issues/830)
> Status: Architecture committed, awaiting infrastructure decisions
> Owner: @ezra
> Created: 2026-04-05
## Vision
**Deep Dive** is a fully automated daily intelligence briefing system that eliminates the 20+ minute manual research overhead. It produces a personalized AI-generated podcast (or text briefing) with **zero manual input**.
Unlike NotebookLM which requires manual source curation, Deep Dive operates autonomously.
## Architecture Overview
```
┌──────────────────────────────────────────────────────────────────────────────┐
│ D E E P D I V E P I P E L I N E │
├──────────────────────────────────────────────────────────────────────────────┤
│ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌────────┐ │
│ │ AGGREGATE │──▶│ FILTER │──▶│ SYNTHESIZE│──▶│ AUDIO │──▶│DELIVER │ │
│ │ arXiv RSS │ │ Keywords │ │ LLM brief │ │ TTS voice │ │Telegram│ │
│ └───────────┘ └───────────┘ └───────────┘ └───────────┘ └────────┘ │
└──────────────────────────────────────────────────────────────────────────────┘
```
## Phase Specifications
### Phase 1: Aggregate
Fetches from arXiv RSS (cs.AI, cs.CL, cs.LG), lab blogs, newsletters.
**Output**: `List[RawItem]`
**Implementation**: `bin/deepdive_aggregator.py`
### Phase 2: Filter
Ranks items by keyword relevance to Hermes/Timmy work.
**Scoring Algorithm (MVP)**:
```python
keywords = ["agent", "llm", "tool use", "rlhf", "alignment"]
score = sum(1 for kw in keywords if kw in content)
```
### Phase 3: Synthesize
LLM generates structured briefing: HEADLINES, DEEP DIVES, BOTTOM LINE.
### Phase 4: Audio
TTS converts briefing to MP3 (10-15 min).
**Decision needed**: Local (Piper/coqui) vs API (ElevenLabs/OpenAI)
### Phase 5: Deliver
Telegram voice message delivered at scheduled time (default 6 AM).
## Implementation Path
### MVP (2 hours, Phases 1+5)
arXiv RSS → keyword filter → text briefing → Telegram text at 6 AM
### V1 (1 week, Phases 1-3+5)
Add LLM synthesis, more sources
### V2 (2 weeks, Full)
Add TTS audio, embedding-based filtering
## Integration Points
| System | Point | Status |
|--------|-------|--------|
| Hermes | `/deepdive` command | Pending |
| timmy-config | `cron/jobs.json` entry | Ready |
| Telegram | Voice delivery | Existing |
| TTS Service | Local vs API | **NEEDS DECISION** |
## Files
- `docs/DEEPDIVE_ARCHITECTURE.md` — This document
- `bin/deepdive_aggregator.py` — Phase 1 source adapters
- `bin/deepdive_orchestrator.py` — Pipeline controller
## Blockers
| # | Item | Status |
|---|------|--------|
| 1 | TTS Service decision | **NEEDS DECISION** |
| 2 | `/deepdive` command registration | Pending |
**Ezra, Architect** — 2026-04-05

View File

@@ -1,416 +0,0 @@
# Deep Dive: Sovereign NotebookLM + Daily AI Intelligence Briefing
> **Issue**: #830
> **Type**: EPIC (21 story points)
> **Owner**: Ezra (assigned by Alexander)
> **Status**: Architecture complete → Phase 1 ready for implementation
---
## Vision
A fully automated daily intelligence briefing system that delivers a personalized AI-generated podcast briefing with **zero manual input**.
**Inspiration**: NotebookLM workflow (ingest → rank → synthesize → narrate → deliver) — but automated, scheduled, and sovereign.
---
## 5-Phase Architecture
```
┌─────────────────────────────────────────────────────────────────────────┐
│ DEEP DIVE PIPELINE │
├───────────────┬───────────────┬───────────────┬───────────────┬─────────┤
│ PHASE 1 │ PHASE 2 │ PHASE 3 │ PHASE 4 │ PHASE 5 │
├───────────────┼───────────────┼───────────────┼───────────────┼─────────┤
│ AGGREGATE │ RANK │ SYNTHESIZE │ NARRATE │ DELIVER │
├───────────────┼───────────────┼───────────────┼───────────────┼─────────┤
│ ArXiv RSS │ Embedding │ LLM briefing │ TTS engine │Telegram │
│ Lab feeds │ similarity │ generator │ (Piper / │ voice │
│ Newsletters │ vs codebase │ │ ElevenLabs) │ message │
│ HackerNews │ │ │ │ │
└───────────────┴───────────────┴───────────────┴───────────────┴─────────┘
Timeline: 05:00 → 05:15 → 05:30 → 05:45 → 06:00
Fetch Score Generate Audio Deliver
```
---
## Phase 1: Source Aggregation (5 points)
### Data Sources
| Source | URL/API | Frequency | Priority |
|--------|---------|-----------|----------|
| ArXiv cs.AI | `http://export.arxiv.org/rss/cs.AI` | Daily 5 AM | P1 |
| ArXiv cs.CL | `http://export.arxiv.org/rss/cs.CL` | Daily 5 AM | P1 |
| ArXiv cs.LG | `http://export.arxiv.org/rss/cs.LG` | Daily 5 AM | P1 |
| OpenAI Blog | `https://openai.com/blog/rss.xml` | Daily 5 AM | P1 |
| Anthropic | `https://www.anthropic.com/blog/rss.xml` | Daily 5 AM | P1 |
| DeepMind | `https://deepmind.google/blog/rss.xml` | Daily 5 AM | P2 |
| Google Research | `https://research.google/blog/rss.xml` | Daily 5 AM | P2 |
| Import AI | Newsletter (email/IMAP) | Daily 5 AM | P2 |
| TLDR AI | `https://tldr.tech/ai/rss` | Daily 5 AM | P2 |
| HackerNews | `https://hnrss.org/newest?points=100` | Daily 5 AM | P3 |
### Storage Format
```json
{
"fetched_at": "2025-01-15T05:00:00Z",
"source": "arxiv_cs_ai",
"items": [
{
"id": "arxiv:2501.01234",
"title": "Attention is All You Need: The Sequel",
"abstract": "...",
"url": "https://arxiv.org/abs/2501.01234",
"authors": ["..."],
"published": "2025-01-14",
"raw_text": "title + abstract"
}
]
}
```
### Output
`data/deep-dive/raw/YYYY-MM-DD-{source}.jsonl`
---
## Phase 2: Relevance Engine (6 points)
### Scoring Approach
**Multi-factor relevance score (0-100)**:
```python
score = (
embedding_similarity * 0.40 + # Cosine sim vs Hermes codebase
keyword_match_score * 0.30 + # Title/abstract keyword hits
source_priority * 0.15 + # ArXiv cs.AI = 1.0, HN = 0.3
recency_boost * 0.10 + # Today = 1.0, -0.1 per day
user_feedback * 0.05 # Past thumbs up/down
)
```
### Keyword Priority List
```yaml
high_value:
- "transformer"
- "attention mechanism"
- "large language model"
- "LLM"
- "agent"
- "multi-agent"
- "reasoning"
- "chain-of-thought"
- "RLHF"
- "fine-tuning"
- "retrieval augmented"
- "RAG"
- "vector database"
- "embedding"
- "tool use"
- "function calling"
medium_value:
- "BERT"
- "GPT"
- "training efficiency"
- "inference optimization"
- "quantization"
- "distillation"
```
### Vector Database Decision Matrix
| Option | Pros | Cons | Recommendation |
|--------|------|------|----------------|
| **Chroma** | SQLite-backed, zero ops, local | Scales to ~1M docs max | ✅ **Default** |
| PostgreSQL + pgvector | Enterprise proven, ACID | Requires Postgres | If Nexus uses Postgres |
| FAISS (in-memory) | Fastest search | Rebuild daily | Budget option |
### Output
`data/deep-dive/scored/YYYY-MM-DD-ranked.json`
Top 10 items selected for synthesis.
---
## Phase 3: Synthesis Engine (3 points)
### Prompt Architecture
```
You are Deep Dive, a technical intelligence briefing AI for the Hermes/Timmy
agent system. Your audience is an AI agent builder working on sovereign,
local-first AI infrastructure.
SOURCE MATERIAL:
{ranked_items}
GENERATE:
1. **Headlines** (3 bullets): Key announcements in 20 words each
2. **Deep Dives** (2-3): Important papers with technical summary and
implications for agent systems
3. **Quick Hits** (3-5): Brief mentions worth knowing
4. **Context Bridge**: Connect to Hermes/Timmy current work
- Mention if papers relate to RL training, tool calling, local inference,
or multi-agent coordination
TONE: Professional, concise, technically precise
TARGET LENGTH: 800-1200 words (10-15 min spoken)
```
### Output Format (Markdown)
```markdown
# Deep Dive: YYYY-MM-DD
## Headlines
- [Item 1]
- [Item 2]
- [Item 3]
## Deep Dives
### [Paper Title]
**Source**: ArXiv cs.AI | **Authors**: [...]
[Technical summary]
**Why it matters for Hermes**: [...]
## Quick Hits
- [...]
## Context Bridge
[Connection to current work]
```
### Output
`data/deep-dive/briefings/YYYY-MM-DD-briefing.md`
---
## Phase 4: Audio Generation (4 points)
### TTS Engine Options
| Engine | Cost | Quality | Latency | Sovereignty |
|--------|------|---------|---------|-------------|
| **Piper** (local) | Free | Good | Medium | ✅ 100% |
| Coqui TTS (local) | Free | Medium-High | High | ✅ 100% |
| ElevenLabs API | $0.05/min | Excellent | Low | ❌ Cloud |
| OpenAI TTS | $0.015/min | Excellent | Low | ❌ Cloud |
| Google Cloud TTS | $0.004/min | Good | Low | ❌ Cloud |
### Recommendation
**Hybrid approach**:
- Default: Piper (on-device, sovereign)
- Override flag: ElevenLabs/OpenAI for special episodes
### Piper Configuration
```python
# High-quality English voice
model = "en_US-lessac-high"
# Speaking rate: ~150 WPM for technical content
length_scale = 1.1
# Output format
output_format = "mp3" # 128kbps
```
### Audio Enhancement
```bash
# Add intro/outro jingles
ffmpeg -i intro.mp3 -i speech.mp3 -i outro.mp3 \
-filter_complex "[0:a][1:a][2:a]concat=n=3:v=0:a=1" \
deep-dive-YYYY-MM-DD.mp3
```
### Output
`data/deep-dive/audio/YYYY-MM-DD-deep-dive.mp3` (12-18 MB)
---
## Phase 5: Delivery Pipeline (3 points)
### Cron Schedule
```cron
# Daily at 6:00 AM EST
0 6 * * * cd /path/to/deep-dive && ./run-daily.sh
# Or: staggered phases for visibility
0 5 * * * ./phase1-fetch.sh
15 5 * * * ./phase2-rank.sh
30 5 * * * ./phase3-synthesize.sh
45 5 * * * ./phase4-narrate.sh
0 6 * * * ./phase5-deliver.sh
```
### Telegram Integration
```python
# Via Hermes gateway or direct bot
bot.send_voice(
chat_id=TELEGRAM_HOME_CHANNEL,
voice=open("deep-dive-YYYY-MM-DD.mp3", "rb"),
caption=f"📻 Deep Dive for {date}: {headline_summary}",
duration=estimated_seconds
)
```
### On-Demand Command
```
/deepdive [date]
# Fetches briefing for specified date (default: today)
# If audio exists: sends voice message
# If not: generates on-demand (may take 2-3 min)
```
---
## Implementation Roadmap
### Quick Win: Phase 1 Only (2-3 hours)
**Goal**: Prove value with text-only digests
```bash
# 1. ArXiv RSS fetcher
# 2. Simple keyword filter
# 3. Text digest via Telegram
# 4. Cron schedule
Result: Daily 6 AM text briefing
```
### MVP: Phases 1-3-5 (Skip 2,4)
**Goal**: Working system without embedding/audio complexity
```
Fetch → Keyword filter → LLM synthesize → Text delivery
```
Duration: 1-2 days
### Full Implementation: All 5 Phases
**Goal**: Complete automated podcast system
Duration: 1-2 weeks (parallel development possible)
---
## Directory Structure
```
the-nexus/
└── research/
└── deep-dive/
├── ARCHITECTURE.md # This file
├── IMPLEMENTATION.md # Detailed dev guide
├── config/
│ ├── sources.yaml # RSS/feed URLs
│ ├── keywords.yaml # Relevance keywords
│ └── prompts/
│ ├── synthesis.txt # LLM prompt template
│ └── headlines.txt # Headline-only prompt
├── scripts/
│ ├── phase1-aggregate.py
│ ├── phase2-rank.py
│ ├── phase3-synthesize.py
│ ├── phase4-narrate.py
│ ├── phase5-deliver.py
│ └── run-daily.sh # Orchestrator
└── data/ # .gitignored
├── raw/ # Fetched sources
├── scored/ # Ranked items
├── briefings/ # Markdown outputs
└── audio/ # MP3 files
```
---
## Acceptance Criteria
| # | Criterion | Phase |
|---|-----------|-------|
| 1 | Zero manual copy-paste | 1-5 |
| 2 | Daily 6 AM delivery | 5 |
| 3 | ArXiv coverage (cs.AI, cs.CL, cs.LG) | 1 |
| 4 | Lab blog coverage | 1 |
| 5 | Relevance ranking by Hermes context | 2 |
| 6 | Written briefing generation | 3 |
| 7 | TTS audio production | 4 |
| 8 | Telegram voice delivery | 5 |
| 9 | On-demand `/deepdive` command | 5 |
---
## Risk Matrix
| Risk | Likelihood | Impact | Mitigation |
|------|------------|--------|------------|
| ArXiv rate limiting | Medium | Medium | Exponential backoff, caching |
| RSS feed changes | Medium | Low | Health checks, fallback sources |
| TTS quality poor | Low (Piper) | High | Cloud override flag |
| Vector DB too slow | Low | Medium | Batch overnight, cache embeddings |
| Telegram file size | Low | Medium | Compress audio, split long episodes |
---
## Dependencies
### Required
- Python 3.10+
- `feedparser` (RSS)
- `requests` (HTTP)
- `chromadb` or `sqlite3` (storage)
- Hermes LLM client (synthesis)
- Piper TTS (local audio)
### Optional
- `sentence-transformers` (embeddings)
- `ffmpeg` (audio post-processing)
- ElevenLabs API key (cloud TTS fallback)
---
## Related Issues
- #830 (Parent EPIC)
- Commandment 6: Human-to-fleet comms
- #166: Matrix/Conduit deployment
---
## Next Steps
1. **Decision**: Vector DB selection (Chroma vs pgvector)
2. **Implementation**: Phase 1 skeleton (ArXiv fetcher)
3. **Integration**: Hermes cron registration
4. **Testing**: 3-day dry run (text only)
5. **Enhancement**: Add TTS (Phase 4)
---
*Architecture document version 1.0 — Ezra, 2026-04-05*

View File

@@ -1,248 +0,0 @@
# Deep Dive Implementation Guide
> Quick-start path from architecture to running system
---
## Phase 1 Quick Win: ArXiv Text Digest (2-3 hours)
This minimal implementation proves value without Phase 2/4 complexity.
### Step 1: Dependencies
```bash
pip install feedparser requests python-telegram-bot
```
### Step 2: Basic Fetcher
```python
#!/usr/bin/env python3
# scripts/arxiv-fetch.py
import feedparser
import json
from datetime import datetime
FEEDS = {
"cs.AI": "http://export.arxiv.org/rss/cs.AI",
"cs.CL": "http://export.arxiv.org/rss/cs.CL",
"cs.LG": "http://export.arxiv.org/rss/cs.LG",
}
KEYWORDS = [
"transformer", "attention", "LLM", "large language model",
"agent", "multi-agent", "reasoning", "chain-of-thought",
"RLHF", "fine-tuning", "RAG", "retrieval augmented",
"vector database", "embedding", "tool use", "function calling"
]
def score_item(title, abstract):
text = f"{title} {abstract}".lower()
matches = sum(1 for kw in KEYWORDS if kw in text)
return min(matches / 3, 1.0) # Cap at 1.0
def fetch_and_score():
items = []
for category, url in FEEDS.items():
feed = feedparser.parse(url)
for entry in feed.entries[:20]: # Top 20 per category
score = score_item(entry.title, entry.get("summary", ""))
if score > 0.2: # Minimum relevance threshold
items.append({
"category": category,
"title": entry.title,
"url": entry.link,
"score": score,
"abstract": entry.get("summary", "")[:300]
})
# Sort by score
items.sort(key=lambda x: x["score"], reverse=True)
return items[:10] # Top 10
if __name__ == "__main__":
items = fetch_and_score()
date = datetime.now().strftime("%Y-%m-%d")
with open(f"data/raw/{date}-arxiv.json", "w") as f:
json.dump(items, f, indent=2)
print(f"Fetched {len(items)} relevant papers")
```
### Step 3: Synthesis (Text Only)
```python
#!/usr/bin/env python3
# scripts/text-digest.py
import json
from datetime import datetime
def generate_digest(items):
lines = [f"📚 Deep Dive — {datetime.now().strftime('%Y-%m-%d')}", ""]
for i, item in enumerate(items[:5], 1):
lines.append(f"{i}. {item['title']}")
lines.append(f" {item['url']}")
lines.append(f" Relevance: {item['score']:.2f}")
lines.append("")
return "\n".join(lines)
# Load and generate
date = datetime.now().strftime("%Y-%m-%d")
with open(f"data/raw/{date}-arxiv.json") as f:
items = json.load(f)
digest = generate_digest(items)
print(digest)
# Save
with open(f"data/briefings/{date}-digest.txt", "w") as f:
f.write(digest)
```
### Step 4: Telegram Delivery
```python
#!/usr/bin/env python3
# scripts/telegram-send.py
import os
import asyncio
from datetime import datetime
from telegram import Bot
async def send_digest():
bot = Bot(token=os.environ["TELEGRAM_BOT_TOKEN"])
chat_id = os.environ["TELEGRAM_HOME_CHANNEL"]
date = datetime.now().strftime("%Y-%m-%d")
with open(f"data/briefings/{date}-digest.txt") as f:
text = f.read()
await bot.send_message(chat_id=chat_id, text=text[:4000])
asyncio.run(send_digest())
```
### Step 5: Cron Setup
```bash
# crontab -e
0 6 * * * cd /path/to/deep-dive && ./scripts/run-daily.sh
```
```bash
#!/bin/bash
# scripts/run-daily.sh
set -e
DATE=$(date +%Y-%m-%d)
mkdir -p "data/raw" "data/briefings"
python3 scripts/arxiv-fetch.py
python3 scripts/text-digest.py
python3 scripts/telegram-send.py
echo "✅ Deep Dive completed for $DATE"
```
---
## Phase 2: Embedding-Based Relevance (Add Day 2)
```python
# scripts/rank-embeddings.py
from sentence_transformers import SentenceTransformer
import chromadb
import json
# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')
# Initialize Chroma (persistent)
client = chromadb.PersistentClient(path="data/chroma")
collection = client.get_or_create_collection("hermes-codebase")
# Load top items
with open("data/raw/YYYY-MM-DD-arxiv.json") as f:
items = json.load(f)
# Score using embeddings
def embedding_score(item):
item_emb = model.encode(item['title'] + " " + item['abstract'])
# Query similar docs from codebase
results = collection.query(query_embeddings=[item_emb.tolist()], n_results=5)
# Average similarity of top matches
return sum(results['distances'][0]) / len(results['distances'][0])
# Re-rank
for item in items:
item['embedding_score'] = embedding_score(item)
item['final_score'] = (item['score'] * 0.3) + (item['embedding_score'] * 0.7)
items.sort(key=lambda x: x['final_score'], reverse=True)
```
---
## Phase 4: Piper TTS Integration (Add Day 3)
```bash
# Install Piper
pip install piper-tts
# Download voice
mkdir -p voices
wget -P voices/ https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx
wget -P voices/ https://huggingface.co/rhasspy/piper-voices/resolve/main/en/en_US/lessac/high/en_US-lessac-high.onnx.json
```
```python
#!/usr/bin/env python3
# scripts/generate-audio.py
import subprocess
from datetime import datetime
date = datetime.now().strftime("%Y-%m-%d")
# Read briefing
with open(f"data/briefings/{date}-briefing.md") as f:
text = f.read()
# Preprocess for TTS (strip markdown, limit length)
# ...
# Generate audio
subprocess.run([
"piper",
"--model", "voices/en_US-lessac-high.onnx",
"--output_file", f"data/audio/{date}-deep-dive.wav",
"--length_scale", "1.1"
], input=text[:5000].encode()) # First 5K chars
# Convert to MP3
subprocess.run([
"ffmpeg", "-y", "-i", f"data/audio/{date}-deep-dive.wav",
"-codec:a", "libmp3lame", "-q:a", "4",
f"data/audio/{date}-deep-dive.mp3"
])
```
---
## Testing Checklist
- [ ] Phase 1: Manual run produces valid JSON
- [ ] Phase 1: Keyword filter returns relevant results only
- [ ] Phase 2: Embeddings load without error
- [ ] Phase 2: Chroma collection queries return matches
- [ ] Phase 3: LLM generates coherent briefing
- [ ] Phase 4: Piper produces audible WAV
- [ ] Phase 4: MP3 conversion works
- [ ] Phase 5: Telegram text message delivers
- [ ] Phase 5: Telegram voice message delivers
- [ ] End-to-end: Cron completes without error
---
*Implementation guide version 1.0*

View File

@@ -1 +0,0 @@
# Data directory - not committed