Add DPOQualityValidator that catches bad training pairs before they enter the tightening loop. Wired into DPOPairGenerator between generate() and export() as an automatic quality gate. New module: dpo_quality.py - 5 single-pair quality checks: 1. Field length minimums (prompt ≥40, chosen ≥80, rejected ≥30 chars) 2. Chosen/rejected length ratio (chosen must be ≥1.3x longer) 3. Chosen≈rejected similarity (Jaccard ≤0.70 — catches low-contrast) 4. Vocabulary diversity in chosen (unique word ratio ≥0.30) 5. Substance markers in chosen (≥2 fleet/training/action terms) - 2 cross-pair quality checks: 6. Near-duplicate prompts within batch (Jaccard ≤0.85) 7. Cross-run dedup against recent JSONL history files - Two modes: 'drop' (filter out bad pairs) or 'flag' (export with warning) - BatchReport with per-pair diagnostics, pass rates, and warnings - Standalone CLI: python3 dpo_quality.py <file.jsonl> [--strict] [--json] Modified: dpo_generator.py - Imports DPOQualityValidator with graceful degradation - Initializes from config validation section (enabled by default) - Validates between generate() and export() in run() - Quality report included in pipeline result dict - Validator failure never blocks — falls back to unvalidated export Modified: config.yaml - New deepdive.training.dpo.validation section with all tunable knobs: enabled, flagged_pair_action, similarity thresholds, length minimums, dedup_history_files Integration tested — 6 test cases covering: ✓ Good pairs pass (3/3 accepted) ✓ Bad pairs caught: too-short, high-similarity, inverted signal (0/3) ✓ Near-duplicate prompt detection (1/2 deduped) ✓ Flag mode preserves pairs with warnings (3/3 flagged) ✓ Cross-run deduplication against history (1 dupe caught) ✓ Full generator→validator→export pipeline (6/6 validated)
134 lines
3.8 KiB
YAML
---
# Deep Dive Configuration
# Copy to config.yaml and customize

deepdive:
  # Schedule
  schedule:
    daily_time: "06:00"
    timezone: "America/New_York"

  # Phase 1: Aggregation
  sources:
    - name: "arxiv_cs_ai"
      url: "http://export.arxiv.org/rss/cs.AI"
      type: "rss"
      fetch_window_hours: 24
      max_items: 50

    - name: "arxiv_cs_cl"
      url: "http://export.arxiv.org/rss/cs.CL"
      type: "rss"
      fetch_window_hours: 24
      max_items: 50

    - name: "arxiv_cs_lg"
      url: "http://export.arxiv.org/rss/cs.LG"
      type: "rss"
      fetch_window_hours: 24
      max_items: 50

    - name: "openai_blog"
      url: "https://openai.com/blog/rss.xml"
      type: "rss"
      fetch_window_hours: 48
      max_items: 5

    - name: "anthropic_blog"
      url: "https://www.anthropic.com/blog/rss.xml"
      type: "rss"
      fetch_window_hours: 48
      max_items: 5

    - name: "deepmind_blog"
      url: "https://deepmind.google/blog/rss.xml"
      type: "rss"
      fetch_window_hours: 48
      max_items: 5

  # Phase 2: Relevance
  relevance:
    model: "all-MiniLM-L6-v2"  # ~80MB embeddings model
    top_n: 10  # Items selected for briefing
    min_score: 0.25  # Hard cutoff
    keywords:
      - "LLM agent"
      - "agent architecture"
      - "tool use"
      - "function calling"
      - "chain of thought"
      - "reasoning"
      - "reinforcement learning"
      - "RLHF"
      - "GRPO"
      - "PPO"
      - "fine-tuning"
      - "transformer"
      - "attention mechanism"
      - "inference optimization"
      - "quantization"
      - "local LLM"
      - "llama.cpp"
      - "ollama"
      - "vLLM"
      - "Hermes"
      - "open source AI"

  # Phase 3: Synthesis
  synthesis:
    llm_endpoint: "http://localhost:4000/v1"  # Local llama-server
    llm_model: "gemma4:12b"
    max_summary_length: 800
    temperature: 0.7

  # Phase 4: Audio
  tts:
    engine: "piper"
    model_path: "~/.local/share/piper/models"
    voice: "en_US-amy-medium"
    speed: 1.0
    output_format: "mp3"  # piper outputs WAV, convert for Telegram

  # Phase 3.5: DPO Training Pair Generation
  training:
    dpo:
      enabled: true
      output_dir: "~/.timmy/training-data/dpo-pairs"
      min_score: 0.5  # Only generate pairs from items above this relevance score
      max_pairs_per_run: 30  # Cap pairs per pipeline execution
      pair_types:  # Which pair strategies to use
        - "summarize"  # Paper summary → fleet-grounded analysis
        - "relevance"  # Relevance analysis → scored fleet context
        - "implication"  # Implications → actionable insight
      validation:
        enabled: true
        flagged_pair_action: "drop"  # "drop" = remove bad pairs, "flag" = export with warning
        min_prompt_chars: 40  # Minimum prompt length
        min_chosen_chars: 80  # Minimum chosen response length
        min_rejected_chars: 30  # Minimum rejected response length
        min_chosen_rejected_ratio: 1.3  # Chosen must be ≥1.3x longer than rejected
        max_chosen_rejected_similarity: 0.70  # Max Jaccard overlap between chosen/rejected
        max_prompt_prompt_similarity: 0.85  # Max Jaccard overlap between prompts (dedup)
        dedup_history_files: 5  # How many recent JSONL files to scan for cross-run dedup

  # Phase 0: Fleet Context Grounding
  fleet_context:
    enabled: true
    gitea_url: "https://forge.alexanderwhitestone.com"
    token: "${GITEA_TOKEN}"  # From environment
    owner: "Timmy_Foundation"
    repos:
      - "timmy-config"
      - "the-nexus"
      - "timmy-home"
      - "hermes-agent"

  # Phase 5: Delivery
  delivery:
    method: "telegram"
    bot_token: "${TELEGRAM_BOT_TOKEN}"  # From env
    channel_id: "-1003664764329"
    send_text_summary: true

  output_dir: "~/briefings"
  log_level: "INFO"