Add DPOQualityValidator that catches bad training pairs before they enter the tightening loop. Wired into DPOPairGenerator between generate() and export() as an automatic quality gate. New module: dpo_quality.py - 5 single-pair quality checks: 1. Field length minimums (prompt ≥40, chosen ≥80, rejected ≥30 chars) 2. Chosen/rejected length ratio (chosen must be ≥1.3x longer) 3. Chosen≈rejected similarity (Jaccard ≤0.70 — catches low-contrast) 4. Vocabulary diversity in chosen (unique word ratio ≥0.30) 5. Substance markers in chosen (≥2 fleet/training/action terms) - 2 cross-pair quality checks: 6. Near-duplicate prompts within batch (Jaccard ≤0.85) 7. Cross-run dedup against recent JSONL history files - Two modes: 'drop' (filter out bad pairs) or 'flag' (export with warning) - BatchReport with per-pair diagnostics, pass rates, and warnings - Standalone CLI: python3 dpo_quality.py <file.jsonl> [--strict] [--json] Modified: dpo_generator.py - Imports DPOQualityValidator with graceful degradation - Initializes from config validation section (enabled by default) - Validates between generate() and export() in run() - Quality report included in pipeline result dict - Validator failure never blocks — falls back to unvalidated export Modified: config.yaml - New deepdive.training.dpo.validation section with all tunable knobs: enabled, flagged_pair_action, similarity thresholds, length minimums, dedup_history_files Integration tested — 6 test cases covering: ✓ Good pairs pass (3/3 accepted) ✓ Bad pairs caught: too-short, high-similarity, inverted signal (0/3) ✓ Near-duplicate prompt detection (1/2 deduped) ✓ Flag mode preserves pairs with warnings (3/3 flagged) ✓ Cross-run deduplication against history (1 dupe caught) ✓ Full generator→validator→export pipeline (6/6 validated)
134 lines
3.8 KiB
YAML
---
# Deep Dive Configuration
# Copy to config.yaml and customize

deepdive:
  # Schedule
  schedule:
    daily_time: "06:00"
    timezone: "America/New_York"

  # Phase 1: Aggregation
  sources:
    - name: "arxiv_cs_ai"
      url: "http://export.arxiv.org/rss/cs.AI"
      type: "rss"
      fetch_window_hours: 24
      max_items: 50

    - name: "arxiv_cs_cl"
      url: "http://export.arxiv.org/rss/cs.CL"
      type: "rss"
      fetch_window_hours: 24
      max_items: 50

    - name: "arxiv_cs_lg"
      url: "http://export.arxiv.org/rss/cs.LG"
      type: "rss"
      fetch_window_hours: 24
      max_items: 50

    - name: "openai_blog"
      url: "https://openai.com/blog/rss.xml"
      type: "rss"
      fetch_window_hours: 48
      max_items: 5

    - name: "anthropic_blog"
      url: "https://www.anthropic.com/blog/rss.xml"
      type: "rss"
      fetch_window_hours: 48
      max_items: 5

    - name: "deepmind_blog"
      url: "https://deepmind.google/blog/rss.xml"
      type: "rss"
      fetch_window_hours: 48
      max_items: 5

  # Phase 2: Relevance
  relevance:
    model: "all-MiniLM-L6-v2"  # ~80MB embeddings model
    top_n: 10  # Items selected for briefing
    min_score: 0.25  # Hard cutoff
    keywords:
      - "LLM agent"
      - "agent architecture"
      - "tool use"
      - "function calling"
      - "chain of thought"
      - "reasoning"
      - "reinforcement learning"
      - "RLHF"
      - "GRPO"
      - "PPO"
      - "fine-tuning"
      - "transformer"
      - "attention mechanism"
      - "inference optimization"
      - "quantization"
      - "local LLM"
      - "llama.cpp"
      - "ollama"
      - "vLLM"
      - "Hermes"
      - "open source AI"

  # Phase 3: Synthesis
  synthesis:
    llm_endpoint: "http://localhost:4000/v1"  # Local llama-server
    llm_model: "gemma4:12b"
    max_summary_length: 800
    temperature: 0.7

  # Phase 4: Audio
  tts:
    engine: "piper"
    model_path: "~/.local/share/piper/models"
    voice: "en_US-amy-medium"
    speed: 1.0
    output_format: "mp3"  # piper outputs WAV, convert for Telegram

  # Phase 3.5: DPO Training Pair Generation
  training:
    dpo:
      enabled: true
      output_dir: "~/.timmy/training-data/dpo-pairs"
      min_score: 0.5  # Only generate pairs from items above this relevance score
      max_pairs_per_run: 30  # Cap pairs per pipeline execution
      pair_types:  # Which pair strategies to use
        - "summarize"  # Paper summary → fleet-grounded analysis
        - "relevance"  # Relevance analysis → scored fleet context
        - "implication"  # Implications → actionable insight
      validation:
        enabled: true
        flagged_pair_action: "drop"  # "drop" = remove bad pairs, "flag" = export with warning
        min_prompt_chars: 40  # Minimum prompt length
        min_chosen_chars: 80  # Minimum chosen response length
        min_rejected_chars: 30  # Minimum rejected response length
        min_chosen_rejected_ratio: 1.3  # Chosen must be ≥1.3x longer than rejected
        max_chosen_rejected_similarity: 0.70  # Max Jaccard overlap between chosen/rejected
        max_prompt_prompt_similarity: 0.85  # Max Jaccard overlap between prompts (dedup)
        dedup_history_files: 5  # How many recent JSONL files to scan for cross-run dedup

  # Phase 0: Fleet Context Grounding
  fleet_context:
    enabled: true
    gitea_url: "https://forge.alexanderwhitestone.com"
    token: "${GITEA_TOKEN}"  # From environment
    owner: "Timmy_Foundation"
    repos:
      - "timmy-config"
      - "the-nexus"
      - "timmy-home"
      - "hermes-agent"

  # Phase 5: Delivery
  delivery:
    method: "telegram"
    bot_token: "${TELEGRAM_BOT_TOKEN}"  # From env
    channel_id: "-1003664764329"
    send_text_summary: true

  output_dir: "~/briefings"
  log_level: "INFO"