From 949becff22d81dacb6d70d06d79a090bab57b9b7 Mon Sep 17 00:00:00 2001 From: Ezra Date: Sun, 5 Apr 2026 06:19:48 +0000 Subject: [PATCH] [scaffold] Deep Dive intelligence pipeline: intelligence/deepdive/architecture.md --- intelligence/deepdive/architecture.md | 277 ++++++++++++++++++++++++++ 1 file changed, 277 insertions(+) create mode 100644 intelligence/deepdive/architecture.md diff --git a/intelligence/deepdive/architecture.md b/intelligence/deepdive/architecture.md new file mode 100644 index 0000000..1fb307a --- /dev/null +++ b/intelligence/deepdive/architecture.md @@ -0,0 +1,277 @@ +# Deep Dive Architecture Specification + +## Phase 1: Source Aggregation Layer + +### Data Sources + +| Source | URL | Format | Frequency | +|--------|-----|--------|-----------| +| arXiv cs.AI | http://export.arxiv.org/rss/cs.AI | RSS | Daily | +| arXiv cs.CL | http://export.arxiv.org/rss/cs.CL | RSS | Daily | +| arXiv cs.LG | http://export.arxiv.org/rss/cs.LG | RSS | Daily | +| OpenAI Blog | https://openai.com/blog/rss.xml | RSS | On-update | +| Anthropic | https://www.anthropic.com/blog/rss.xml | RSS | On-update | +| DeepMind | https://deepmind.google/blog/rss.xml | RSS | On-update | +| Import AI | https://importai.substack.com/feed | RSS | Daily | +| TLDR AI | https://tldr.tech/ai/rss | RSS | Daily | + +### Implementation + +```python +# aggregator.py +class RSSAggregator: + def __init__(self, sources: List[SourceConfig]): + self.sources = sources + self.cache_dir = Path("~/.cache/deepdive/feeds") + + async def fetch_all(self, since: datetime) -> List[FeedItem]: + # Parallel RSS fetch with etag support + # Returns normalized items with title, summary, url, published + pass +``` + +## Phase 2: Relevance Engine + +### Scoring Algorithm + +```python +# relevance.py +from sentence_transformers import SentenceTransformer + +class RelevanceScorer: + def __init__(self): + self.model = SentenceTransformer('all-MiniLM-L6-v2') + self.keywords = [ + "LLM agent", "agent architecture", 
"tool use", + "reinforcement learning", "RLHF", "GRPO", + "transformer", "attention mechanism", + "Hermes", "local LLM", "llama.cpp" + ] + # Pre-compute keyword embeddings + self.keyword_emb = self.model.encode(self.keywords) + + def score(self, item: FeedItem) -> float: + title_emb = self.model.encode(item.title) + summary_emb = self.model.encode(item.summary) + + # Mean cosine similarity between the title and each keyword embedding + keyword_sim = cosine_similarity([title_emb], self.keyword_emb).mean() + + # Boost for agent/LLM architecture terms + boost = 1.0 + if any(k in item.title.lower() for k in ["agent", "llm", "transformer"]): + boost = 1.5 + + return keyword_sim * boost +``` + +### Ranking + +- Fetch all items from last 24h +- Score each with RelevanceScorer +- Select top N (default: 10) for briefing + +## Phase 3: Synthesis Engine + +### LLM Prompt + +```jinja2 +You are an intelligence analyst for the Timmy Foundation fleet. +Produce a concise daily briefing from the following sources. + +CONTEXT: We build Hermes (local AI agent framework) and operate +a distributed fleet of AI agents. Focus on developments relevant +to: LLM architecture, agent systems, RL training, local inference. + +SOURCES: +{% for item in sources %} +- {{ item.title }} ({{ item.source }}) + {{ item.summary }} +{% endfor %} + +OUTPUT FORMAT: +## Daily Intelligence Briefing - {{ date }} + +### Headlines +- [Source] Key development in one sentence + +### Deep Dive: {{ most_relevant.title }} +Why this matters for our work: +[2-3 sentences connecting to Hermes/Timmy context] + +### Action Items +- [ ] Any immediate implications + +Keep total briefing under 800 words. Tight, professional tone. 
+``` + +## Phase 4: Audio Generation + +### TTS Pipeline + +```python +# tts.py +import subprocess +from pathlib import Path + +class PiperTTS: + def __init__(self, model_path: str, voice: str = "en_US-amy-medium"): + self.model = Path(model_path) / f"{voice}.onnx" + self.config = Path(model_path) / f"{voice}.onnx.json" + + def generate(self, text: str, output_path: Path) -> Path: + # Piper produces WAV from stdin text + cmd = [ + "piper", + "--model", str(self.model), + "--config", str(self.config), + "--output_file", str(output_path) + ] + subprocess.run(cmd, input=text.encode()) + return output_path +``` + +### Voice Selection + +- Base: `en_US-amy-medium` (clear, professional) +- Alternative: `en_GB-southern_english_female-medium` + +## Phase 5: Delivery Pipeline + +### Cron Scheduler + +```yaml +# cron entry (runs 5:30 AM daily) +deepdive-daily: + schedule: "30 5 * * *" + command: "/opt/deepdive/run-pipeline.sh --deliver" + timezone: "America/New_York" +``` + +### Delivery Integration + +```python +# delivery.py +from hermes.gateway import TelegramGateway + +class TelegramDelivery: + def __init__(self, bot_token: str, chat_id: str): + self.gateway = TelegramGateway(bot_token, chat_id) + + async def deliver(self, audio_path: Path, briefing_text: str): + # Send voice message + await self.gateway.send_voice(audio_path) + # Send text summary as follow-up + await self.gateway.send_message(briefing_text[:4000]) +``` + +### On-Demand Command + +``` +/deepdive [optional: date or topic filter] +``` + +Triggers pipeline immediately, bypasses cron. 
+ +## Data Flow + +``` +RSS Feeds + │ + ▼ +┌───────────┐ ┌───────────┐ ┌───────────┐ +│ Raw Items │───▶│ Scored │───▶│ Top 10 │ +│ (100-500) │ │ (ranked) │ │ Selected │ +└───────────┘ └───────────┘ └─────┬─────┘ + │ + ┌───────────────────┘ + ▼ + ┌───────────┐ ┌───────────┐ ┌───────────┐ + │ Synthesis │───▶│ Briefing │───▶│ TTS Gen │ + │ (LLM) │ │ Text │ │ (Piper) │ + └───────────┘ └───────────┘ └─────┬─────┘ + │ + ┌───────┴───────┐ + ▼ ▼ + Telegram Voice Telegram Text +``` + +## Configuration + +```yaml +# config.yaml +deepdive: + schedule: + daily_time: "06:00" + timezone: "America/New_York" + + aggregation: + sources: + - name: "arxiv_ai" + url: "http://export.arxiv.org/rss/cs.AI" + fetch_window_hours: 24 + - name: "openai_blog" + url: "https://openai.com/blog/rss.xml" + limit: 5 # max items per source + + relevance: + model: "all-MiniLM-L6-v2" + top_n: 10 + min_score: 0.3 + keywords: + - "LLM agent" + - "agent architecture" + - "reinforcement learning" + + synthesis: + llm_model: "gemma-4-it" # local via llama-server + max_summary_length: 800 + + tts: + engine: "piper" + voice: "en_US-amy-medium" + speed: 1.0 + + delivery: + method: "telegram" + channel_id: "-1003664764329" + send_text_summary: true +``` + +## Implementation Phases + +| Phase | Est. Effort | Dependencies | Owner | +|-------|-------------|--------------|-------| +| 1: Aggregation | 3 pts | None | Any agent | +| 2: Relevance | 4 pts | Phase 1 | @gemini | +| 3: Synthesis | 4 pts | Phase 2 | @gemini | +| 4: Audio | 4 pts | Phase 3 | @ezra | +| 5: Delivery | 4 pts | Phase 4 | @ezra | + +## API Surface (Tentative) + +```python +# deepdive/__init__.py +class DeepDivePipeline: + async def run( + self, + since: Optional[datetime] = None, + deliver: bool = True + ) -> BriefingResult: + ... 
+ +@dataclass +class BriefingResult: + sources_considered: int + sources_selected: int + briefing_text: str + audio_path: Optional[Path] + delivered: bool +``` + +## Success Metrics + +- [ ] Daily delivery within 30 min of scheduled time +- [ ] < 5 minute audio length +- [ ] Relevance precision > 80% (manual audit) +- [ ] Zero external inference API dependencies (LLM synthesis and TTS run on the full local stack; RSS fetching and Telegram delivery are the only network calls)