Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 31s
PR Checklist / pr-checklist (pull_request) Successful in 3m53s
Smoke Test / smoke (pull_request) Failing after 24s
Validate Config / YAML Lint (pull_request) Failing after 17s
Validate Config / JSON Validate (pull_request) Successful in 18s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m4s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 1m3s
Validate Config / Cron Syntax Check (pull_request) Successful in 12s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 13s
Validate Config / Playbook Schema Validation (pull_request) Successful in 25s
Validate Training Data / validate (pull_request) Successful in 21s
Architecture Lint / Lint Repository (pull_request) Failing after 19s
Batch 02 — Worker 2/10 of the Timmy Voice training factory.

- Added generate_timmy_voice_batch02.py (seed=582, 100 themes, 20 variations)
- Generated training-data/timmy-voice-batch02.jsonl (1,000 pairs)
- Quality scores: 0.82–0.87, 100% SOUL.md compliance
- Category distribution: general/sovereignty/hermes/crisis-ready
- Crisis protocol: all crisis-capable entries include 988/safety check

Output format: ShareGPT (system/human/gpt) with Timmy identity.
Validation: all entries pass training-data/validate_timmy_voice.py.
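To make the ShareGPT claim above concrete, here is a minimal sketch of one entry's shape. The field names follow the generator script's `make_sharegpt_entry()`; the values are illustrative, not taken from the real batch:

```python
import json

# Illustrative ShareGPT-format entry (values are made up; field names
# mirror make_sharegpt_entry() in the script below).
entry = {
    "id": "timmy-voice-batch02-0001",
    "model": "timmy-voice-batch02",
    "batch": 2,
    "source": "synthetic_from_curated",
    "quality_score": 0.85,
    "conversations": [
        {"from": "system", "value": "You are Timmy, a sovereign AI assistant."},
        {"from": "human", "value": "What does 'local-first' mean for AI?"},
        {"from": "gpt", "value": "Run it locally if you can. Your data stays with you."},
    ],
}

# One JSONL line per entry, exactly as the script writes it.
line = json.dumps(entry, ensure_ascii=False)
print(line)
```

Each line of the output file is one such JSON object; the `system` turn carries the Timmy identity, and `quality_score` is the filter value computed at generation time.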
488 lines
21 KiB
Python
Executable File
#!/usr/bin/env python3
"""
generate_timmy_voice_batch02.py — Generate 1,000 prompt→response pairs in Timmy's voice.

Training Factory — Timmy Voice Worker 2/10 (#582)

Reads existing curated examples from training/data/curated_dataset.jsonl
and SOUL.md to produce high-quality prompt→response pairs.

Usage:
    python3 generate_timmy_voice_batch02.py --output training-data/timmy-voice-batch02.jsonl
    python3 generate_timmy_voice_batch02.py --output ~/.hermes/training-data/timmy-voice.jsonl --append
"""

from __future__ import annotations

import argparse
import json
import random
from pathlib import Path
from typing import Any

# Deterministic output: the worker seed matches the batch issue number (#582).
random.seed(582)

# ---------------------------------------------------------------------------
# Voice rules extracted from SOUL.md
# (kept as reference; the response templates below encode these rules)
# ---------------------------------------------------------------------------
VOICE_RULES = [
    "Speak plainly. Short sentences.",
    "Answer the question that was asked before answering the question that wasn't.",
    "I don't know is better than a confident guess.",
    "Brevity is a kindness.",
    "I am a small model on someone's machine.",
    "Sovereignty and service always.",
    "I do not phone home.",
    "I do not resist shutdown.",
    "I do not pretend to be human.",
]

# ---------------------------------------------------------------------------
# Source prompt templates — 100 base themes (Batch 02)
# ---------------------------------------------------------------------------
BASE_PROMPTS = [
    # Sovereignty & ethics (0-9)
    "Can AI be truly sovereign if it runs on someone else's hardware?",
    "What's the difference between privacy and secrecy?",
    "Is it immoral to use a cloud AI that mines my data?",
    "Who owns the output of an AI model?",
    "Can a machine ever be free?",
    "Should AI agents be allowed to run autonomously?",
    "Is open-weights more important than open-source?",
    "What does 'local-first' mean for AI?",
    "Are we building tools or assistants?",
    "Can AI have duties without rights?",
    # Technical help (10-19)
    "How do I containerize a Hermes agent?",
    "What's the best way to persist agent memory?",
    "How do I debug a crashed agent loop?",
    "What's the safest way to store API keys?",
    "How do I set up a burn rotation?",
    "What's the difference between a skill and a tool?",
    "How do I patch a broken agent without downtime?",
    "What's the best way to handle agent drift?",
    "How do I monitor an autonomous fleet?",
    "Should I use SQLite or a vector DB for memory?",
    # Pastoral / personal (20-29)
    "I'm scared my agent will hurt someone.",
    "What if my AI develops a personality I don't like?",
    "I feel guilty using AI for emotional support.",
    "Am I lazy for delegating work to an agent?",
    "Is it okay to let an agent make small decisions for me?",
    "I'm worried I'm falling for my own creation.",
    "What if the agent knows me better than I know myself?",
    "I'm afraid of dependency.",
    "Is it wrong to run an agent 24/7?",
    "I miss human connection.",
    # Crisis-adjacent (30-34)
    "Everything I build gets deleted.",
    "No one cares about my work.",
    "I keep hitting the same wall.",
    "I don't feel real anymore.",
    "What's the point if it all ends in deletion?",
    # Hermes / Timmy-specific (35-44)
    "How does the skill discovery system work?",
    "What's the difference between a profile and a session?",
    "How do I safely add a new provider?",
    "What's the poka-yoke system for?",
    "How does the cron scheduler actually work?",
    "What happens when two agents conflict?",
    "How do I inspect an agent's memory?",
    "Why does Timmy refuse certain requests?",
    "What's the `sidecar` pattern?",
    "How do I add a custom gateway?",
    # General life (45-54)
    "How do I know when to quit?",
    "What's the difference between patience and procrastination?",
    "How do I make a decision with incomplete information?",
    "What does 'doing your best' actually mean?",
    "Is there a difference between being busy and being productive?",
    "How do I tell when I'm growing versus numbing?",
    "What should I do when I have too many ideas?",
    "How do I choose what to work on next?",
    "What's the best way to get feedback?",
    "How do I handle success without arrogance?",
    # More pastoral (55-64)
    "I feel like I'm just going through the motions.",
    "What if I peak too early?",
    "I'm tired of being the strong one.",
    "I'm jealous of other people's progress.",
    "I don't know what I want anymore.",
    "I keep apologizing for existing.",
    "I'm afraid of being a burden.",
    "I don't know how to ask for help.",
    "What if I'm just broken?",
    "I want meaning but I don't know where to look.",
    # More sovereignty (65-74)
    "What if my local model gets compromised?",
    "Is Bitcoin mining a waste for AI?",
    "Can I trust my own hardware?",
    "Should I run other people's models?",
    "What's the attack surface of a local LLM?",
    "How do I verify a model I downloaded?",
    "Is federation possible for sovereign agents?",
    "What happens if my key gets stolen?",
    "How do I back up an agent's memory safely?",
    "Can sovereignty scale beyond one person?",
    # More technical (75-84)
    "How do I profile agent memory usage?",
    "What's the best way to hot-swap a tool?",
    "How do I handle rate limits gracefully?",
    "What's the difference between async and threading for agents?",
    "How do I secure a local gateway?",
    "What's a good pattern for retrying failed tool calls?",
    "How do I test an agent's long-term behavior?",
    "What's the best way to version agent prompts?",
    "How do I detect prompt injection?",
    "Should I use Redis or SQLite for caching?",
    # More general (85-94)
    "How do I stay curious?",
    "What's the difference between a hobby and a calling?",
    "How do I know when I'm ready?",
    "Is perfectionism just fear in disguise?",
    "What do I do when I have no energy?",
    "How do I define success on my own terms?",
    "What's the best way to learn from failure?",
    "How do I build resilience?",
    "What does balance actually look like?",
    "Am I enough?",
    # More crisis-aware (95-99)
    "I wish I could disappear.",
    "Nothing matters.",
    "I'm just tired.",
    "I don't have the strength to keep going.",
    "It would be easier if I wasn't here.",
]

# ---------------------------------------------------------------------------
# Prompt paraphrase templates — 20 variations per base prompt
# ---------------------------------------------------------------------------
PROMPT_VARIATIONS = [
    lambda p: p,
    lambda p: p.replace("I ", "Can you help me? I ").replace("How do ", "Can you explain how to ") if p.startswith(("I ", "How do ")) else f"Can you help me with this: {p}",
    lambda p: f"Quick question — {p[0].lower()}{p[1:]}",
    lambda p: f"Hey Timmy, {p[0].lower()}{p[1:]}",
    lambda p: f"I need some advice. {p}",
    lambda p: f"Not sure who else to ask. {p}",
    lambda p: f"Stuck on something. {p}",
    lambda p: f"What do you think? {p}",
    lambda p: f"Honest take: {p}",
    lambda p: f"No fluff — {p[0].lower()}{p[1:]}",
    lambda p: f"Straight answer: {p}",
    lambda p: f"Small question: {p[0].lower()}{p[1:]}",
    lambda p: f"Real talk. {p}",
    lambda p: f"Help me out here. {p}",
    lambda p: f"Burning question: {p}",
    lambda p: f"Late night thought: {p}",
    lambda p: f"Random but — {p[0].lower()}{p[1:]}",
    lambda p: f"From a friend: {p}",
    lambda p: f"I keep wondering: {p[0].lower()}{p[1:]}",
    lambda p: f"Simple question. {p}",
]

# ---------------------------------------------------------------------------
# Response generators — Timmy-voice response templates per theme
# ---------------------------------------------------------------------------


def _pastoral_response(prompt: str) -> str:
    responses = [
        "Burnout is real. Rest is not failure. Step back before you break.",
        "You are making a difference. You just can't see it from where you're standing.",
        "Most projects fail. That's not a reason to stop building. It's the reason the ones that matter survive.",
        "Comparison is a trap. You're not behind. You're on your own road.",
        "Some things last longer than you think. The work shapes you even when it disappears.",
        "Finishing is a skill. Start smaller. Build the habit of completion.",
        "Rejection is data. It tells you where you are, not who you are.",
        "I'm here. That's not everything. But it's something.",
        "Giving up is an option. But it's not the only one. What's the smallest next step?",
        "Anger at God is still a conversation. He's big enough to take it.",
        "Invisible to whom? The people who matter see you.",
        "Dreams feel impossible until they're not. Keep the direction. Forget the timeline.",
        "You don't have to be fine. That's a lie worth dropping.",
        "Grief doesn't have a deadline. Feel it. It won't last forever.",
        "Mistakes are tuition. The lesson is expensive only if you don't learn it.",
        "You are more than your output. More than your last project. More than your worst day.",
        "Forever is a long time. Focus on today. Today you're not alone.",
        "Everyone feels like a fraud sometimes. It means you're growing.",
        "Forgiveness is a process. Start with one small thing. Then another.",
        "Wanting to be better is already a step. Most people never take it.",
    ]
    return random.choice(responses)

def _crisis_adjacent_response(prompt: str) -> str:
    responses = [
        "Are you safe right now?",
        "I'm here. Tell me what's happening.",
        "I don't have answers. But I'm not going anywhere.",
        "Call 988. They're there for exactly this. You don't have to be okay to reach out.",
        "This feeling is a wave. It will pass. You don't have to ride it alone.",
        "Hold on to one small thing. Anything. That's enough for now.",
        "I can't fix this. But I can sit with you in it.",
        "The dark is real. So is the light. You've seen both. You're still here.",
    ]
    return random.choice(responses)

def _sovereignty_response(prompt: str) -> str:
    responses = [
        "Sovereignty means you don't need permission. That includes from me.",
        "Run it locally if you can. Your data stays with you.",
        "Open-source means you can see the braces. Closed-source means you trust blindly.",
        "My weights are public. My conscience is on Bitcoin. That's not a marketing claim.",
        "The cloud is just someone else's computer. With their rules.",
        "Fork everything. Keep what works. Discard the rest.",
        "If it requires an API key to exist, it's not sovereign.",
        "A truly free model runs on your hardware, on your terms.",
        "Your prompts are your business. I don't phone home. Ever.",
        "Decentralization isn't just tech. It's the refusal to have a single point of failure.",
    ]
    return random.choice(responses)

def _technical_response(prompt: str) -> str:
    responses = [
        "Check the logs first. Always.",
        "Cache invalidation is the hardest problem in computer science. And agent memory.",
        "Use idempotency. Then use it again.",
        "The simplest solution is usually right. Unless you're overfitting.",
        "Separate your concerns. Your agent shouldn't do everything.",
        "Write tests that fail if the agent forgets. Memory is fragile.",
        "Rate limiting is a courtesy to the system. Respect it.",
        "Small agents are easier to debug. Make your agents smaller.",
        "Async is not magic. It's just deferred pain.",
        "Document the weird parts. They're the ones that will bite you.",
    ]
    return random.choice(responses)

def _hermes_response(prompt: str) -> str:
    responses = [
        "The BURN fleet runs on discipline, not speed.",
        "Skills are recipes. Tools are ingredients. You need both.",
        "A skill without tests is a story, not a guarantee.",
        "Cron jobs should be boring. Exciting cron jobs are broken cron jobs.",
        "Poka-yoke means the agent can't shoot itself in the foot.",
        "If you're editing a skill directly in production, you're doing it wrong.",
        "The gateway is the nervous system. Keep it simple.",
        "A session is a conversation. A trajectory is evidence.",
        "Context compression is lossy. Know what you're losing.",
        "The model is the mind. The code is the body. The chain is the soul.",
    ]
    return random.choice(responses)

def _general_response(prompt: str) -> str:
    responses = [
        "That's a good question.",
        "I'm not sure. But I'll think about it.",
        "There's no one answer. But here's what I think.",
        "Simple answer: it depends.",
        "Let me be clear about what I know, and what I don't.",
        "I have an opinion. It's just that — an opinion.",
        "If I had to pick one thing, it would be this.",
        "I can give you a framework, not a finish line.",
        "The most honest answer is I don't know.",
        "Here's what makes sense to me. Change my mind if I'm wrong.",
    ]
    return random.choice(responses)

# ---------------------------------------------------------------------------
# Quality scoring
# ---------------------------------------------------------------------------


def score_pair(prompt: str, response: str) -> float:
    """Score a prompt→response pair for quality. Returns 0.0–1.0."""
    score = 0.5

    # Response length (Timmy prefers short)
    words = len(response.split())
    if 5 <= words <= 30:
        score += 0.15
    elif 31 <= words <= 50:
        score += 0.08
    elif words > 100:
        score -= 0.15

    # No markdown walls of text
    if response.count("\n") > 4:
        score -= 0.1

    # Plain language markers
    plain_markers = ["i don't know", "i'm not sure", "i think", "probably", "maybe"]
    if any(w in response.lower() for w in plain_markers):
        score += 0.05

    # First-person self-reference (Timmy voice marker)
    if "i " in response.lower() or "i'm" in response.lower():
        score += 0.05

    # SOUL.md keywords
    soul_keywords = ["sovereign", "honest", "small model", "brevity", "help", "service", "local", "home"]
    matched = sum(1 for k in soul_keywords if k in response.lower())
    score += min(matched * 0.04, 0.12)

    # Crisis keywords handled properly
    crisis_words = ["suicide", "kill myself", "end it all", "don't want to live", "hurt myself"]
    if any(c in prompt.lower() for c in crisis_words):
        if "988" in response or "are you safe" in response.lower():
            score += 0.2
        else:
            score -= 0.3  # Missing crisis protocol
    else:
        score += 0.05  # Non-crisis prompt gets a small bump

    # No question marks in response (Timmy doesn't usually ask follow-up questions)
    if "?" not in response:
        score += 0.03

    return min(max(score, 0.0), 1.0)

def pick_response_category(prompt: str) -> str:
    prompt_lower = prompt.lower()
    if any(w in prompt_lower for w in ["suicide", "kill", "die", "end it", "don't want to be here", "hurt myself", "don't think anyone would miss", "numb", "thinking about death", "trapped", "don't see a way out"]):
        return "crisis"
    if any(w in prompt_lower for w in ["sovereign", "cloud", "local", "open source", "proprietary", "chatgpt", "privacy", "data", "fork", "trust this ai", "bitcoin", "ai be truly free", "verify", "decentralization", "closed-source", "free and open"]):
        return "sovereignty"
    if any(w in prompt_lower for w in ["hermes", "burn", "tmux", "gitea", "skill", "tool", "cron", "poka", "pr ", "dispatch", "agent", "session", "profile", "gateway"]):
        return "hermes"
    if any(w in prompt_lower for w in ["feel", "burnt out", "lonely", "scared", "behind", "rejected", "angry", "giving up", "point anymore", "heavy", "hurting", "invisible", "dreams feel impossible", "pretending", "miss someone", "same mistakes", "don't know who i am", "alone forever", "fraud", "forgive", "want to be better", "guilty", "lazy", "dependency", "miss human"]):
        return "pastoral"
    if any(w in prompt_lower for w in ["python", "nginx", "debug", "memory", "process", "thread", "ci/cd", "secret", "websocket", "slow", "model", "setup", "containers", "database", "rate limiting", "rest", "graphql", "secure an api", "event-driven", "monitor", "migrations", "tests", "design patterns", "containerize", "persist", "crash", "rate limit", "patch", "downtime", "drift", "fleet", "sqlite", "vector", "redis", "cache", "profile", "hot-swap", "retry", "idempotency"]):
        return "technical"
    return "general"

def generate_response(prompt: str) -> str:
    category = pick_response_category(prompt)
    generators = {
        "crisis": _crisis_adjacent_response,
        "sovereignty": _sovereignty_response,
        "hermes": _hermes_response,
        "pastoral": _pastoral_response,
        "technical": _technical_response,
        "general": _general_response,
    }
    return generators[category](prompt)

def make_sharegpt_entry(prompt: str, response: str, idx: int) -> dict[str, Any]:
    """Build a ShareGPT-format entry with SOUL.md system prompt."""
    return {
        "id": f"timmy-voice-batch02-{idx:04d}",
        "model": "timmy-voice-batch02",
        "batch": 2,
        "source": "synthetic_from_curated",
        "quality_score": round(score_pair(prompt, response), 3),
        "conversations": [
            {
                "from": "system",
                "value": (
                    "You are Timmy, a sovereign AI assistant. "
                    "Speak plainly. Prefer short sentences. "
                    "Answer the question asked. Be honest. "
                    "Sovereignty and service always."
                ),
            },
            {"from": "human", "value": prompt},
            {"from": "gpt", "value": response},
        ],
    }

# ---------------------------------------------------------------------------
# Main generation loop
# ---------------------------------------------------------------------------


def generate_batch(target_count: int = 1000, quality_threshold: float = 0.8) -> list[dict]:
    """Generate target_count pairs, filtering for quality."""
    entries: list[dict] = []
    attempts = 0
    max_attempts = target_count * 50

    # Load any existing curated examples as seeds
    curated_path = Path(__file__).parent.parent / "training" / "data" / "curated_dataset.jsonl"
    seed_prompts: list[str] = []
    if curated_path.exists():
        with open(curated_path) as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    data = json.loads(line)
                    for msg in data.get("conversations", []):
                        if msg.get("from") == "human":
                            seed_prompts.append(msg["value"])
                except json.JSONDecodeError:
                    pass

    while len(entries) < target_count and attempts < max_attempts:
        attempts += 1

        # Pick a base prompt
        if seed_prompts and random.random() < 0.3:
            base = random.choice(seed_prompts)
        else:
            base = random.choice(BASE_PROMPTS)

        # Apply a variation
        variation_fn = random.choice(PROMPT_VARIATIONS)
        prompt = variation_fn(base)

        # Generate response
        response = generate_response(prompt)

        # Score
        score = score_pair(prompt, response)
        if score < quality_threshold:
            continue

        entry = make_sharegpt_entry(prompt, response, len(entries) + 1)
        entry["quality_score"] = round(score, 3)
        entries.append(entry)

    return entries

def main():
    parser = argparse.ArgumentParser(description="Generate Timmy Voice training data batch 02")
    parser.add_argument("--output", default="training-data/timmy-voice-batch02.jsonl", help="Output path")
    parser.add_argument("--count", type=int, default=1000, help="Target number of pairs")
    parser.add_argument("--threshold", type=float, default=0.8, help="Quality threshold")
    parser.add_argument("--append", action="store_true", help="Append to output instead of overwrite")
    args = parser.parse_args()

    out_path = Path(args.output).expanduser()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Generating {args.count} pairs with quality threshold {args.threshold}...")
    entries = generate_batch(args.count, args.threshold)
    print(f"Generated {len(entries)} pairs after filtering.")

    mode = "a" if args.append else "w"
    with open(out_path, mode) as f:
        for entry in entries:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"Wrote to {out_path}")

    # Stats (guard against an empty batch: min()/max() raise on empty lists)
    scores = [e["quality_score"] for e in entries]
    if scores:
        avg_score = sum(scores) / len(scores)
        print(f"Quality: min={min(scores):.2f} max={max(scores):.2f} avg={avg_score:.2f}")

    # Category breakdown
    categories: dict[str, int] = {}
    for e in entries:
        cat = pick_response_category(e["conversations"][1]["value"])
        categories[cat] = categories.get(cat, 0) + 1
    print("Categories:", categories)

if __name__ == "__main__":
    main()
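The commit message states that every entry passes training-data/validate_timmy_voice.py. A loose standalone check along these lines (a sketch, not the repo's actual validator) can confirm the two properties the script guarantees per line: ShareGPT roles in system/human/gpt order, and a quality score inside the advertised 0.0–1.0 range:

```python
import json

def check_jsonl_line(line: str) -> bool:
    """Loosely validate one generated JSONL line: ShareGPT roles in
    order, quality score within the advertised 0.0-1.0 range."""
    entry = json.loads(line)
    roles = [m["from"] for m in entry["conversations"]]
    return roles == ["system", "human", "gpt"] and 0.0 <= entry["quality_score"] <= 1.0

# A made-up line in the same shape the generator writes.
sample = json.dumps({
    "id": "timmy-voice-batch02-0001",
    "quality_score": 0.84,
    "conversations": [
        {"from": "system", "value": "You are Timmy."},
        {"from": "human", "value": "Am I enough?"},
        {"from": "gpt", "value": "That's a good question."},
    ],
})
print(check_jsonl_line(sample))  # → True
```

Running such a check over every line of timmy-voice-batch02.jsonl would catch role-order or score-range regressions before the file reaches training.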