Files
ezra-environment/the-nexus/deepdive/bin/run_full_pipeline.py
Ezra 9f010ad044 [BURN] Deep Dive scaffold: 5-phase sovereign NotebookLM (#830)
Complete production-ready scaffold for automated daily AI intelligence briefings:

- Phase 1: Source aggregation (arXiv + lab blogs)
- Phase 2: Relevance ranking (keyword + source authority scoring)
- Phase 3: LLM synthesis (Hermes-context briefing generation)
- Phase 4: TTS audio (edge-tts/OpenAI/ElevenLabs)
- Phase 5: Telegram delivery (voice message)

Deliverables:
- docs/ARCHITECTURE.md (9000+ lines) - system design
- docs/OPERATIONS.md - runbook and troubleshooting
- 5 executable phase scripts (bin/)
- Full pipeline orchestrator (run_full_pipeline.py)
- requirements.txt, README.md

Addresses all 9 acceptance criteria from #830.
Ready for host selection, credential config, and cron activation.

Author: Ezra | Burn mode | 2026-04-05
2026-04-05 05:48:12 +00:00

196 lines
6.3 KiB
Python

#!/usr/bin/env python3
"""
Deep Dive: Full Pipeline Orchestrator
Runs all 5 phases: Aggregate → Rank → Synthesize → Audio → Deliver
Usage:
./run_full_pipeline.py [--date YYYY-MM-DD] [--phases PHASES] [--dry-run]
Issue: the-nexus#830
"""
import argparse
import asyncio
import sys
from datetime import datetime
from pathlib import Path
# Import phase modules
sys.path.insert(0, str(Path(__file__).parent))
import phase1_aggregate
import phase2_rank
import phase3_synthesize
import phase4_generate_audio
import phase5_deliver
class PipelineOrchestrator:
    """Orchestrate the full Deep Dive pipeline.

    Runs the selected phases in order:
    aggregate → rank → synthesize → audio → deliver.
    Each phase is delegated to its phase module; a per-phase result
    summary is printed when the run completes.
    """

    # Phase number → (short name, implementing module).
    PHASES = {
        1: ('aggregate', phase1_aggregate),
        2: ('rank', phase2_rank),
        3: ('synthesize', phase3_synthesize),
        4: ('audio', phase4_generate_audio),
        5: ('deliver', phase5_deliver),
    }

    # Human-readable success labels for phases whose runner returns a
    # bare truthy flag (phases 3 and 4 record str(result) instead).
    _SUCCESS_LABELS = {1: 'aggregated', 2: 'ranked', 5: 'delivered'}

    def __init__(self, date: str, output_dir: Path, phases: list, dry_run: bool = False):
        """
        Args:
            date: Target date string (YYYY-MM-DD) passed to each phase.
            output_dir: Directory phase artifacts are read from / written to.
            phases: Phase numbers (1-5) to run, in order.
            dry_run: If True, only report what would run; no phase executes.
        """
        self.date = date
        self.output_dir = output_dir
        self.phases = phases
        self.dry_run = dry_run

    def run_phase1(self):
        """Run the source-aggregation phase; returns its (truthy on success) result."""
        print("=" * 60)
        print("PHASE 1: SOURCE AGGREGATION")
        print("=" * 60)
        aggregator = phase1_aggregate.SourceAggregator(self.output_dir, self.date)
        # Aggregator is async; drive it to completion on a fresh event loop.
        return asyncio.run(aggregator.run())

    def run_phase2(self):
        """Run the relevance-ranking phase; keeps the top 20 items."""
        print("\n" + "=" * 60)
        print("PHASE 2: RELEVANCE RANKING")
        print("=" * 60)
        engine = phase2_rank.RelevanceEngine(self.output_dir, self.date)
        return engine.run(top_n=20)

    def run_phase3(self):
        """Run the LLM synthesis phase; returns the engine's result."""
        print("\n" + "=" * 60)
        print("PHASE 3: SYNTHESIS")
        print("=" * 60)
        engine = phase3_synthesize.SynthesisEngine(self.output_dir, self.date)
        return engine.run()

    def run_phase4(self):
        """Run the TTS audio-generation phase; returns the generator's result."""
        print("\n" + "=" * 60)
        print("PHASE 4: AUDIO GENERATION")
        print("=" * 60)
        generator = phase4_generate_audio.AudioGenerator(self.output_dir, self.date)
        return generator.run()

    def run_phase5(self):
        """Run the Telegram delivery phase; returns its (truthy on success) result."""
        print("\n" + "=" * 60)
        print("PHASE 5: DELIVERY")
        print("=" * 60)
        pipeline = phase5_deliver.DeliveryPipeline(self.output_dir, self.date)
        # Delivery pipeline is async; drive it to completion.
        return asyncio.run(pipeline.run())

    def run(self):
        """Run the selected phases in order and print a summary.

        A failed phase is recorded as "failed" but does not stop the
        remaining phases (preserves the original best-effort behavior).

        Returns:
            bool: True iff every executed phase succeeded.
        """
        print("🎙️ DEEP DIVE — Daily AI Intelligence Briefing")
        print(f"Date: {self.date}")
        print(f"Phases: {', '.join(str(p) for p in self.phases)}")
        print(f"Output: {self.output_dir}")
        if self.dry_run:
            print("[DRY RUN] No actual API calls or deliveries")
        print()
        results = {}
        try:
            for phase in self.phases:
                if self.dry_run:
                    print(f"[DRY RUN] Would run phase {phase}")
                    continue
                # BUG FIX: the original called run_phase3()/run_phase4()
                # twice each — once for the truthiness check and again
                # inside str(...) — executing those phases twice per run.
                # Call each runner exactly once and reuse the result.
                outcome = getattr(self, f'run_phase{phase}')()
                if not outcome:
                    results[phase] = "failed"
                elif phase in (3, 4):
                    # Synthesis/audio return a result worth recording verbatim.
                    results[phase] = str(outcome)
                else:
                    results[phase] = self._SUCCESS_LABELS[phase]
            print("\n" + "=" * 60)
            print("PIPELINE COMPLETE")
            print("=" * 60)
            for phase, result in results.items():
                # NOTE(review): the original status glyphs were empty strings
                # in both branches (emoji apparently lost in transit), making
                # the conditional dead; restored as check/cross marks — confirm
                # against the repo copy.
                status = "✓" if result != "failed" else "✗"
                print(f"{status} Phase {phase}: {result}")
            return all(r != "failed" for r in results.values())
        except Exception as e:
            # Top-level boundary: report, print traceback, signal failure.
            print(f"\n[ERROR] Pipeline failed: {e}")
            import traceback
            traceback.print_exc()
            return False
def main():
    """CLI entry point: parse arguments, run the orchestrator, exit.

    Exits with status 0 on full success, 1 on any phase failure or on
    invalid --phases input.
    """
    parser = argparse.ArgumentParser(
        description='Deep Dive: Full Pipeline Orchestrator'
    )
    parser.add_argument('--date', default=datetime.now().strftime('%Y-%m-%d'),
                        help='Target date (YYYY-MM-DD)')
    parser.add_argument('--output-dir', type=Path,
                        default=Path(__file__).parent.parent / 'data',
                        help='Output directory for data')
    parser.add_argument('--phases', default='1,2,3,4,5',
                        help='Comma-separated phase numbers to run (e.g., 1,2,3)')
    parser.add_argument('--dry-run', action='store_true',
                        help='Dry run (no API calls)')
    # One --phaseN-only convenience flag per phase (same CLI as before).
    phase_labels = {1: 'aggregate', 2: 'rank', 3: 'synthesize',
                    4: 'audio', 5: 'deliver'}
    for n, label in phase_labels.items():
        parser.add_argument(f'--phase{n}-only', action='store_true',
                            help=f'Run only Phase {n} ({label})')
    args = parser.parse_args()

    # Phase-specific flags take precedence, lowest phase number first
    # (matches the original if/elif precedence).
    phases = None
    for n in phase_labels:
        if getattr(args, f'phase{n}_only'):
            phases = [n]
            break
    if phases is None:
        # ROBUSTNESS FIX: a malformed --phases value (e.g. "1,x") previously
        # escaped as an uncaught ValueError traceback; route it through the
        # script's own [ERROR] exit path instead.
        try:
            phases = [int(p) for p in args.phases.split(',')]
        except ValueError:
            print(f"[ERROR] Invalid --phases value: {args.phases}")
            sys.exit(1)

    # Validate phase numbers before running anything.
    for p in phases:
        if p not in range(1, 6):
            print(f"[ERROR] Invalid phase: {p}")
            sys.exit(1)

    # Deduplicate and run in ascending order.
    phases = sorted(set(phases))
    orchestrator = PipelineOrchestrator(
        date=args.date,
        output_dir=args.output_dir,
        phases=phases,
        dry_run=args.dry_run
    )
    success = orchestrator.run()
    sys.exit(0 if success else 1)
# Script entry point: main() parses argv and calls sys.exit itself.
if __name__ == '__main__':
    main()