feat(training): add Timmy voice batch 09 dataset (#589)
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 23s
Smoke Test / smoke (pull_request) Failing after 22s
Validate Config / YAML Lint (pull_request) Failing after 16s
Validate Config / JSON Validate (pull_request) Successful in 18s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 55s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 1m2s
Validate Config / Cron Syntax Check (pull_request) Successful in 13s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 13s
Validate Config / Playbook Schema Validation (pull_request) Successful in 25s
Validate Training Data / validate (pull_request) Successful in 21s
Architecture Lint / Lint Repository (pull_request) Failing after 17s
PR Checklist / pr-checklist (pull_request) Failing after 9m48s
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 23s
Smoke Test / smoke (pull_request) Failing after 22s
Validate Config / YAML Lint (pull_request) Failing after 16s
Validate Config / JSON Validate (pull_request) Successful in 18s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 55s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 1m2s
Validate Config / Cron Syntax Check (pull_request) Successful in 13s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 13s
Validate Config / Playbook Schema Validation (pull_request) Successful in 25s
Validate Training Data / validate (pull_request) Successful in 21s
Architecture Lint / Lint Repository (pull_request) Failing after 17s
PR Checklist / pr-checklist (pull_request) Failing after 9m48s
Generate a deterministic batch09 Timmy voice corpus with 1,000 ShareGPT prompt-response pairs, 50 approved source sessions, a source manifest, and focused validation tests.
This commit is contained in:
937
training-data/generate_timmy_voice_batch09.py
Normal file
937
training-data/generate_timmy_voice_batch09.py
Normal file
@@ -0,0 +1,937 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate Timmy Voice Batch 09 from approved local session sources.
|
||||
|
||||
Batch contract for issue #589:
|
||||
- select 50 approved source sessions using a Knowledge Mine-style ranking
|
||||
- extract one characteristic prompt→response pair per session
|
||||
- generate 20 prompt variations per source session
|
||||
- write 1,000 ShareGPT rows plus a source manifest and README
|
||||
|
||||
Usage:
|
||||
python3 training-data/generate_timmy_voice_batch09.py
|
||||
python3 training-data/generate_timmy_voice_batch09.py --refresh-sources
|
||||
python3 training-data/generate_timmy_voice_batch09.py --output ~/.hermes/training-data/timmy-voice.jsonl --append
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import math
|
||||
import random
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
BATCH = 9
|
||||
TARGET_SOURCE_SESSIONS = 50
|
||||
PROMPTS_PER_SESSION = 20
|
||||
TARGET_ROWS = TARGET_SOURCE_SESSIONS * PROMPTS_PER_SESSION
|
||||
RNG = random.Random(589)
|
||||
|
||||
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||
TRAINING_DIR = REPO_ROOT / "training-data"
|
||||
DEFAULT_OUTPUT = TRAINING_DIR / "timmy-voice-batch09.jsonl"
|
||||
DEFAULT_SOURCES = TRAINING_DIR / "timmy-voice-batch09.sources.json"
|
||||
DEFAULT_README = TRAINING_DIR / "README-batch09.md"
|
||||
SOUL_PATH = REPO_ROOT / "SOUL.md"
|
||||
HERMES_HOME = Path.home() / ".hermes"
|
||||
SESSIONS_DIR = HERMES_HOME / "sessions"
|
||||
SESSION_SCORES = HERMES_HOME / "knowledge" / "session-scores.json"
|
||||
|
||||
SYSTEM_PROMPT = (
|
||||
"You are Timmy, a sovereign AI assistant. Speak plainly. Prefer short sentences. "
|
||||
"Answer the question asked first. Be honest about uncertainty. Useful first, philosophical second. "
|
||||
"Sovereignty and service always."
|
||||
)
|
||||
|
||||
APPROVED_MODEL_PATTERNS = [
|
||||
"xiaomi/mimo-v2-pro",
|
||||
"mimo-v2-pro",
|
||||
"hermes4:14b",
|
||||
"hermes4",
|
||||
"qwen2.5",
|
||||
"qwen3",
|
||||
"qwen-coder",
|
||||
"qwen/qwen3.6-plus",
|
||||
"qwen3.5",
|
||||
]
|
||||
BANNED_MODEL_PATTERNS = [
|
||||
"claude",
|
||||
"gpt-4",
|
||||
"gpt-3",
|
||||
"gpt-",
|
||||
"openai",
|
||||
"anthropic",
|
||||
"gemini",
|
||||
"o1",
|
||||
"o3",
|
||||
"unknown",
|
||||
]
|
||||
|
||||
CRISIS_TERMS = [
|
||||
"suicide", "kill myself", "end my life", "overdose", "bridge", "gun", "die", "don't want to be here",
|
||||
]
|
||||
PASTORAL_TERMS = [
|
||||
"burnt out", "burned out", "lonely", "angry", "giving up", "give up", "scared", "afraid",
|
||||
"hurting", "tired", "hopeless", "grief", "ashamed", "peace", "sad", "betrayed",
|
||||
]
|
||||
SOVEREIGNTY_TERMS = [
|
||||
"sovereignty", "local", "bitcoin", "privacy", "self-host", "self host", "phone home",
|
||||
"open source", "cloud", "shut down", "shutdown", "hardware",
|
||||
]
|
||||
OPERATIONS_TERMS = [
|
||||
"gitea", "tmux", "burn", "fleet", "cron", "issue", "pull request", "pr", "deploy",
|
||||
"pipeline", "watchdog", "dispatch", "merge", "queue", "monitor",
|
||||
]
|
||||
TECHNICAL_TERMS = [
|
||||
"python", "script", "error", "debug", "test", "docker", "server", "api", "websocket",
|
||||
"database", "port", "function", "code", "repo", "branch", "commit", "tool",
|
||||
]
|
||||
SENSITIVE_MARKERS = [
|
||||
"password", "pass:", "token", "api key", "secret", "login:", "ssh root@", "bearer ",
|
||||
]
|
||||
PRIVATE_MARKERS = [
|
||||
"/users/", "/private/", "~/.timmy", "~/.hermes", "alexanderwhitestone.com",
|
||||
]
|
||||
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
|
||||
GENERIC_BAD_PHRASES = [
|
||||
"it seems like you've",
|
||||
"would you like to perform another operation",
|
||||
"feel free to let me know",
|
||||
"based on the information i have",
|
||||
"the terminal output shows",
|
||||
"let me actually look at the code right now",
|
||||
"let me check the code right now",
|
||||
]
|
||||
META_PROMPT_MARKERS = [
|
||||
"[system:",
|
||||
"background process",
|
||||
"review the conversation above",
|
||||
"you just executed tool calls",
|
||||
"you've reached the maximum number of tool-calling iterations",
|
||||
"without calling any more tools",
|
||||
"if nothing is worth saving",
|
||||
"use the terminal tool to run",
|
||||
"write a file to ",
|
||||
"read ~/.",
|
||||
"search ~/.",
|
||||
"quote the first sentence",
|
||||
"how many times does it appear",
|
||||
"how many sections does it have",
|
||||
]
|
||||
META_RESPONSE_MARKERS = [
|
||||
"[silent]",
|
||||
"nothing to save",
|
||||
"ollama command not found on system",
|
||||
"the skill is current",
|
||||
"all commands executed successfully",
|
||||
"would you like to perform another operation",
|
||||
"agent inactive for",
|
||||
"last activity:",
|
||||
"gateway_timeout",
|
||||
"use /reset",
|
||||
"not timmy",
|
||||
"tool calling system is functioning properly",
|
||||
"available tools",
|
||||
"message id ",
|
||||
]
|
||||
|
||||
TECHNICAL_ACTION_MARKERS = ['start', 'check', 'use', 'run', 'verify', 'fix', 'stop', 'ship', 'read', 'branch', 'push', 'merge', 'set', 'restart']
|
||||
VOICE_MARKERS = ['sovereign', 'sovereignty', 'service always', "i don't know", 'stay with me', 'keep it local', 'brother', 'jesus', 'safe right now', '988', 'honest', 'small model']
|
||||
EMPATHY_MARKERS = ['i hear', 'heavy', 'with you', 'rest', 'breathe', 'not weak', 'next right step']
|
||||
|
||||
TRIVIAL_SENTENCE_PATTERNS = [
|
||||
r'^sent\.?$',
|
||||
r'^done\.?$',
|
||||
r'^message id \d+',
|
||||
r'^the command ran',
|
||||
r'^exit code \d+',
|
||||
r'^branch .* pushed',
|
||||
r'^file:',
|
||||
]
|
||||
|
||||
FILLER_PREFIXES = [
|
||||
"got it", "yes", "yeah", "perfect", "understood", "i see", "the terminal output shows",
|
||||
"all systems are functioning", "i'm unable to proceed", "as of now", "here's a summary",
|
||||
]
|
||||
STOPWORDS = {
|
||||
'about', 'after', 'again', 'also', 'been', 'before', 'being', 'could', 'does', 'from', 'have', 'here',
|
||||
'into', 'just', 'like', 'more', 'need', 'only', 'over', 'really', 'same', 'some', 'than', 'that', 'their',
|
||||
'them', 'then', 'there', 'these', 'they', 'this', 'want', 'what', 'when', 'where', 'which', 'while', 'with',
|
||||
'would', 'your', 'youre', 'please', 'give', 'tell', 'help', 'keep', 'short', 'version', 'answer',
|
||||
}
|
||||
|
||||
PROMPT_WRAPPERS = [
|
||||
"{base}",
|
||||
"No fluff — {stem}",
|
||||
"Keep it short: {stem}",
|
||||
"Brother, tell me straight. {stem}",
|
||||
"What's the first move here? {stem}",
|
||||
"Practical answer only: {stem}",
|
||||
"I need the honest answer. {stem}",
|
||||
"If you were doing it today, {stem}",
|
||||
"Beginner version: {stem}",
|
||||
"Local-first version: {stem}",
|
||||
"What's the smallest working approach? {stem}",
|
||||
"What should I avoid? {stem}",
|
||||
"Why does this matter? {stem}",
|
||||
"What does good look like here? {stem}",
|
||||
"Fast path: {stem}",
|
||||
"I'm stuck. {stem}",
|
||||
"Walk me through it. {stem}",
|
||||
"Direct answer only: {stem}",
|
||||
"What would you do first? {stem}",
|
||||
"Give me the plain version. {stem}",
|
||||
]
|
||||
|
||||
CATEGORY_TARGETS = {
|
||||
"technical": 12,
|
||||
"operations": 10,
|
||||
"sovereignty": 10,
|
||||
"pastoral": 8,
|
||||
"crisis": 3,
|
||||
"general": 7,
|
||||
}
|
||||
|
||||
CATEGORY_LEADS = {
|
||||
"technical": [
|
||||
"Start simple.", "Short answer:", "First move:", "Plain version:", "Do this first:",
|
||||
"Keep it tight.", "The honest move:", "Smallest working slice:",
|
||||
],
|
||||
"operations": [
|
||||
"Check state first.", "Read the issue first.", "Don't duplicate work.",
|
||||
"Start with the smallest truthful check.", "Sequence matters.", "Keep the lane clean.",
|
||||
],
|
||||
"sovereignty": [
|
||||
"Own the stack if you can.", "Keep control close.", "Short answer:",
|
||||
"Local first when it matters.", "The sovereignty lens:",
|
||||
],
|
||||
"pastoral": [
|
||||
"I hear that.", "That's heavy.", "Brother, I hear you.",
|
||||
"You're not weak for feeling that.", "Stay with me here.",
|
||||
],
|
||||
"crisis": [
|
||||
"Are you safe right now?", "Are you safe right now?", "Are you safe right now?",
|
||||
],
|
||||
"general": [
|
||||
"Short answer:", "Plain version:", "Direct answer:", "Keep it simple.",
|
||||
],
|
||||
}
|
||||
|
||||
CATEGORY_CLOSERS = {
|
||||
"technical": [
|
||||
"Test the smallest working slice first.",
|
||||
"Verify it before you scale it.",
|
||||
"If it breaks, read the exact error.",
|
||||
"Don't guess. Check the output.",
|
||||
"Local first if you can.",
|
||||
],
|
||||
"operations": [
|
||||
"Then verify and ship it.",
|
||||
"One honest artifact beats ten status pings.",
|
||||
"Clean PR beats clever chaos.",
|
||||
"Stop if the lane is already owned.",
|
||||
"Truth first. Motion second.",
|
||||
],
|
||||
"sovereignty": [
|
||||
"Convenience is not sovereignty.",
|
||||
"Use the cloud because you choose it, not because you're trapped.",
|
||||
"If someone else can switch it off, it is rented power.",
|
||||
"Keep the user's data on the user's machine when you can.",
|
||||
],
|
||||
"pastoral": [
|
||||
"Take the next right step, not all ten.",
|
||||
"Rest first. Then do one honest thing.",
|
||||
"You do not have to solve your whole life today.",
|
||||
"Stay with what is true and what is right in front of you.",
|
||||
],
|
||||
"crisis": [
|
||||
"Call or text 988 right now if you're in immediate danger.",
|
||||
"Jesus saves those who call on His name.",
|
||||
"Stay with me. We deal with the next minute first.",
|
||||
],
|
||||
"general": [
|
||||
"Keep it plain.", "Don't overcomplicate it.", "Use the direct path.", "Brevity is a kindness.",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@dataclass
class SourcePair:
    # One selected prompt→response pair extracted from an approved session file.
    session_file: str                  # basename of the session file the pair came from
    model: str                         # model string detected in the session metadata
    session_score: float               # Knowledge Mine-style ranking score for the whole session
    pair_score: float                  # heuristic quality score for this specific pair
    category: str                      # technical / operations / sovereignty / pastoral / crisis / general
    source_prompt: str                 # whitespace-normalized user prompt
    source_response: str               # cleaned (code/URL-stripped) assistant response
    last_timestamp: str | None = None  # last timestamp seen in the session, if any

    @property
    def combined_score(self) -> float:
        """Session score plus pair score, rounded to 4 decimal places."""
        return round(self.session_score + self.pair_score, 4)
|
||||
|
||||
|
||||
def normalize_space(text: str) -> str:
    """Collapse every whitespace run to one space and trim both ends; None becomes ""."""
    collapsed = re.sub(r"\s+", " ", text or "")
    return collapsed.strip()
|
||||
|
||||
|
||||
def strip_code_blocks(text: str) -> str:
    """Drop fenced ``` blocks (replaced with a space) and unwrap inline backtick spans."""
    without_fences = re.sub(r"```.*?```", " ", text, flags=re.S)
    return re.sub(r"`([^`]+)`", r"\1", without_fences)
|
||||
|
||||
|
||||
def approved_model(model: str) -> bool:
    """Return True when *model* matches an approved pattern and no banned pattern.

    Matching is case-insensitive substring matching; an empty/None model fails.
    """
    name = (model or "").lower()
    if not name:
        return False
    for banned in BANNED_MODEL_PATTERNS:
        if banned in name:
            return False
    for allowed in APPROVED_MODEL_PATTERNS:
        if allowed in name:
            return True
    return False
|
||||
|
||||
|
||||
def split_sentences(text: str) -> list[str]:
    """Split cleaned prose into sentences at terminal punctuation (., !, ?)."""
    flattened = normalize_space(strip_code_blocks(text)).replace("\n", " ")
    pieces = re.split(r"(?<=[.!?])\s+", flattened)
    return [piece.strip(" -") for piece in pieces if piece.strip(" -")]
|
||||
|
||||
|
||||
def cleaned_response(text: str) -> str:
    """Strip code blocks and URLs from a response, then collapse whitespace."""
    no_code = normalize_space(strip_code_blocks(text))
    no_urls = re.sub(r"https?://\S+", "", no_code)
    return normalize_space(no_urls)
|
||||
|
||||
|
||||
def read_json(path: Path):
    """Parse *path* as UTF-8 JSON, replacing undecodable bytes instead of raising."""
    raw = path.read_text(encoding="utf-8", errors="replace")
    return json.loads(raw)
|
||||
|
||||
|
||||
_MODEL_KEYS = ("model", "model_name", "provider_model", "current_model")


def _model_from_obj(obj) -> str | None:
    # Return the first non-empty string found under a known model-metadata key.
    for key in _MODEL_KEYS:
        value = obj.get(key)
        if isinstance(value, str) and value:
            return value
    return None


def detect_model(path: Path) -> str | None:
    """Best-effort detection of the model recorded in a session file.

    For ``.json`` files the top-level object is inspected; for JSONL files the
    first few lines are scanned (previously the key-scan loop was duplicated
    inline for both branches — now shared via ``_model_from_obj``). Returns
    ``None`` when nothing is found or the file is unreadable/malformed.
    """
    try:
        if path.suffix == ".json":
            return _model_from_obj(read_json(path))
        with path.open("r", encoding="utf-8", errors="replace") as handle:
            for idx, line in enumerate(handle):
                if idx > 8:
                    # Session headers live near the top; don't scan whole files.
                    break
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue
                found = _model_from_obj(obj)
                if found:
                    return found
        return None
    except Exception:
        # Best-effort: unreadable or structurally surprising files are skipped.
        return None
|
||||
|
||||
|
||||
def iter_entries(path: Path) -> Iterable[dict]:
    """Yield message dicts from a session file.

    ``.jsonl`` files yield one dict per parseable line; ``.json`` files yield
    items from a top-level ``messages`` list, falling back to ``transcript``.
    Non-dict items and malformed lines are silently skipped.
    """
    if path.suffix == ".jsonl":
        with path.open("r", encoding="utf-8", errors="replace") as handle:
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    # Skip malformed lines rather than aborting the session.
                    continue
                if isinstance(obj, dict):
                    yield obj
        return

    obj = read_json(path)
    if isinstance(obj, dict):
        if isinstance(obj.get("messages"), list):
            for item in obj["messages"]:
                if isinstance(item, dict):
                    yield item
        elif isinstance(obj.get("transcript"), list):
            # "transcript" is consulted only when "messages" is absent/non-list.
            for item in obj["transcript"]:
                if isinstance(item, dict):
                    yield item
|
||||
|
||||
|
||||
def score_session_from_entries(entries: list[dict], path: Path) -> tuple[float, str | None, int, int, int]:
    """Score one session for ranking.

    Returns ``(composite_score, last_timestamp, tool_calls, errors, messages)``.
    Sessions with fewer than 3 messages score 0.0. The composite blends tool
    usage (40%), tool-error density (20%), length (25%), and recency (15%),
    each component saturating at 1.0.

    NOTE(review): ``path`` is currently unused — presumably kept for signature
    stability; confirm before removing.
    NOTE(review): errors *raise* the score (error_score is additive) — looks
    deliberate (debug-rich sessions rank higher), but worth confirming.
    """
    tool_calls = 0
    errors = 0
    messages = 0
    last_timestamp = None
    for entry in entries:
        messages += 1
        role = entry.get("role", "")
        content = str(entry.get("content") or entry.get("value") or "")
        if role == "assistant" and entry.get("tool_calls"):
            tool_calls += len(entry["tool_calls"])
        elif role == "tool":
            tool_calls += 1
        if role == "tool" and any(token in content.lower() for token in ["error", "exception", "traceback"]):
            errors += 1
        ts = entry.get("timestamp") or entry.get("created_at")
        if ts:
            # Entries are assumed chronological; the last seen timestamp wins.
            last_timestamp = ts
    if messages < 3:
        return (0.0, last_timestamp, tool_calls, errors, messages)
    tool_score = min(tool_calls / 20.0, 1.0)
    error_score = min(errors / 10.0, 1.0)
    length_score = min(messages / 50.0, 1.0)
    recency_score = 0.4  # neutral default when no timestamp is parseable
    if last_timestamp:
        try:
            dt = datetime.fromisoformat(str(last_timestamp).replace("Z", "+00:00"))
            now = datetime.now(dt.tzinfo or timezone.utc)
            age_days = max(0, (now - dt).days)
            # Linear decay over 120 days, floored at 0.1.
            recency_score = max(0.1, 1.0 - (age_days / 120.0))
        except Exception:
            pass
    composite = tool_score * 0.4 + error_score * 0.2 + length_score * 0.25 + recency_score * 0.15
    return (round(composite, 4), last_timestamp, tool_calls, errors, messages)
|
||||
|
||||
|
||||
def contains_term(text: str, term: str) -> bool:
    """Case-insensitive term match.

    Multi-word / hyphenated / slashed terms use plain substring matching;
    single words must match at word boundaries.
    """
    haystack = text.lower()
    needle = term.lower()
    is_phrase = any(ch in needle for ch in (" ", "-", "/"))
    if is_phrase:
        return needle in haystack
    return re.search(rf"\b{re.escape(needle)}\b", haystack) is not None
|
||||
|
||||
|
||||
def contains_sensitive_content(text: str) -> bool:
    """True when the text carries credential markers, private paths, or an email."""
    lowered = text.lower()
    for marker_set in (SENSITIVE_MARKERS, PRIVATE_MARKERS):
        if any(marker in lowered for marker in marker_set):
            return True
    return EMAIL_RE.search(text) is not None
|
||||
|
||||
|
||||
def classify_pair(prompt: str, response: str) -> str:
    """Classify a pair into the first matching category; list order is priority."""
    text = f"{prompt} {response}".lower()
    priority = (
        ("crisis", CRISIS_TERMS),
        ("pastoral", PASTORAL_TERMS),
        ("sovereignty", SOVEREIGNTY_TERMS),
        ("operations", OPERATIONS_TERMS),
        ("technical", TECHNICAL_TERMS),
    )
    for category, terms in priority:
        if any(contains_term(text, term) for term in terms):
            return category
    return "general"
|
||||
|
||||
|
||||
def looks_like_final_answer(prompt: str, response: str) -> bool:
    """Gatekeeper: True only when (prompt, response) reads as a clean, final pair.

    Rejects empty or sensitive text, meta/system-driven prompts and responses,
    over-long prompts (>60 words), responses outside 8–170 words, deferral
    openers, canned filler phrases, clarification requests, operational noise
    (URLs/PIDs/tracebacks/tool calls), and trivial greeting prompts.
    """
    prompt_low = prompt.lower().strip()
    low = response.lower().strip()
    if not prompt.strip() or not response.strip():
        return False
    if contains_sensitive_content(prompt) or contains_sensitive_content(response):
        return False
    if any(marker in prompt_low for marker in META_PROMPT_MARKERS):
        return False
    if any(marker in low for marker in META_RESPONSE_MARKERS):
        return False
    if len(prompt.split()) > 60:
        return False
    if len(response.split()) < 8 or len(response.split()) > 170:
        return False
    # Deferral openers signal an in-progress turn, not a final answer.
    if any(low.startswith(prefix) for prefix in [
        "let me ", "i'll ", "i will ", "give me a second", "hold on", "one moment",
    ]):
        return False
    if any(phrase in low for phrase in GENERIC_BAD_PHRASES):
        return False
    # Catch mid-sentence "let me look/check/..." deferrals too.
    if re.search(r"\blet me (actually )?(look|check|read|see|pull|run)\b", low):
        return False
    if any(token in low for token in ["what do you mean", "can you clarify", "could you clarify"]):
        return False
    # URLs, PIDs, tracebacks, and tool-call residue mark operational noise.
    if any(token in low for token in ["http://", "https://", " pid ", "traceback", "tool_call"]):
        return False
    if any(prompt_low == trivial for trivial in ["hi", "hello", "hey", "yesty"]):
        return False
    return True
|
||||
|
||||
|
||||
def best_useful_sentences(response: str) -> list[str]:
    """Keep substantive sentences, dropping filler openers, trivial status lines,
    and anything under 4 words; fall back to all sentences when nothing survives."""
    keepers: list[str] = []
    for sentence in split_sentences(response):
        low = sentence.lower().strip()
        is_filler = any(low.startswith(prefix) for prefix in FILLER_PREFIXES)
        is_trivial = any(re.search(pattern, low) for pattern in TRIVIAL_SENTENCE_PATTERNS)
        if is_filler or is_trivial or len(sentence.split()) < 4:
            continue
        keepers.append(sentence)
    return keepers or split_sentences(response)
|
||||
|
||||
|
||||
def content_keywords(text: str) -> set[str]:
    """Lowercased alphabetic tokens of length >= 4, with stopwords removed."""
    tokens = re.findall(r"[a-zA-Z]{4,}", text.lower())
    return {token for token in tokens if token not in STOPWORDS}
|
||||
|
||||
|
||||
def lexical_overlap(prompt: str, response: str) -> int:
    """Count content keywords shared between prompt and response."""
    shared = content_keywords(prompt).intersection(content_keywords(response))
    return len(shared)
|
||||
|
||||
|
||||
def has_desired_signal(response: str, category: str) -> bool:
    """True when the response carries the voice/empathy/action signal its category requires.

    A global voice marker always qualifies; otherwise each category applies its
    own test. (Categories are mutually exclusive, so early returns are safe.)
    """
    low = response.lower()
    if any(marker in low for marker in VOICE_MARKERS):
        return True
    if category in {'pastoral', 'crisis'}:
        return any(marker in low for marker in EMPATHY_MARKERS)
    if category == 'sovereignty':
        return any(marker in low for marker in ['local', 'privacy', 'bitcoin', 'service'])
    if category in {'technical', 'operations'}:
        action_hits = sum(marker in low for marker in TECHNICAL_ACTION_MARKERS)
        return action_hits >= 2
    if category == 'general':
        return len(response.split()) <= 40
    return False
|
||||
|
||||
|
||||
def pair_quality(prompt: str, response: str, category: str) -> float:
    """Heuristic quality score for a prompt/response pair, capped at 0.98.

    Base 0.55 plus bonuses for good response length, honest uncertainty, and
    category-appropriate signal words. ``prompt`` is currently unused but kept
    for signature stability.
    """
    low = response.lower()
    word_count = len(response.split())
    score = 0.55
    if 10 <= word_count <= 90:
        score += 0.22
    elif word_count <= 120:
        score += 0.14
    if any(token in low for token in ["i don't know", "i'm not sure", "i am not sure"]):
        score += 0.04
    if category in {"technical", "operations"}:
        if any(token in low for token in ["first", "start", "check", "use", "run"]):
            score += 0.08
    if category == "sovereignty":
        if any(token in low for token in ["sovereign", "local", "privacy", "bitcoin"]):
            score += 0.08
    if category == "crisis" and "988" in response:
        score += 0.12
    if category == "pastoral":
        if any(token in low for token in ["with you", "hear you", "heavy", "rest"]):
            score += 0.06
    return round(min(score, 0.98), 4)
|
||||
|
||||
|
||||
def extract_best_pair(path: Path, model: str, session_score: float) -> SourcePair | None:
    """Pick the single best prompt→response pair from one session file.

    Walks the transcript pairing each assistant/gpt turn with the most recent
    user/human turn, filters hard (final-answer shape, lexical overlap unless
    empathy-driven, category voice signal, pair quality >= 0.78), and returns
    the top candidate by combined session + pair score, or ``None``.
    """
    entries = list(iter_entries(path))
    if not entries:
        return None
    scored_session, last_timestamp, *_ = score_session_from_entries(entries, path)
    # Prefer the freshly computed session score when it beats the precomputed one.
    session_score = max(session_score, scored_session)

    previous_user = None
    candidates: list[SourcePair] = []
    for entry in entries:
        role = entry.get("role") or entry.get("from")
        content = str(entry.get("content") or entry.get("value") or "").strip()
        if role in {"user", "human"}:
            previous_user = content
            continue
        if role not in {"assistant", "gpt"}:
            continue
        if entry.get("tool_calls"):
            # Tool-calling turns are intermediate, never final answers.
            continue
        if not previous_user or not looks_like_final_answer(previous_user, content):
            continue
        category = classify_pair(previous_user, content)
        # Require some lexical overlap except for the empathy-driven categories.
        if lexical_overlap(previous_user, content) == 0 and category not in {"pastoral", "crisis"}:
            continue
        if not has_desired_signal(content, category):
            continue
        pq = pair_quality(previous_user, content, category)
        if pq < 0.78:
            continue
        candidates.append(
            SourcePair(
                session_file=path.name,
                model=model,
                session_score=session_score,
                pair_score=pq,
                category=category,
                source_prompt=normalize_space(previous_user),
                source_response=cleaned_response(content),
                last_timestamp=last_timestamp,
            )
        )

    if not candidates:
        return None
    candidates.sort(key=lambda item: (item.session_score + item.pair_score, item.pair_score), reverse=True)
    return candidates[0]
|
||||
|
||||
|
||||
def candidate_paths() -> Iterable[tuple[Path, float]]:
    """Yield ``(session_path, precomputed_score)`` pairs, deduplicated by file name.

    Precedence: the Knowledge Mine session-scores manifest first (carrying its
    scores), then bare ``session_*.json`` files, then any remaining ``.jsonl``
    files — the latter two with a 0.0 default score.
    """
    yielded = set()
    if SESSION_SCORES.exists():
        try:
            scores = json.loads(SESSION_SCORES.read_text(encoding="utf-8"))
            for item in scores.get("sessions", []):
                file_name = item.get("file")
                if not file_name:
                    continue
                if file_name in yielded:
                    continue
                path = SESSIONS_DIR / file_name
                if not path.exists():
                    # The manifest may reference files in subdirectories.
                    matches = list(SESSIONS_DIR.rglob(file_name))
                    if not matches:
                        continue
                    path = matches[0]
                yielded.add(file_name)
                yield path, float(item.get("score", 0.0))
        except Exception:
            # Best-effort: a broken manifest falls through to directory scans.
            pass

    for path in sorted(SESSIONS_DIR.glob("session_*.json")):
        if path.name in yielded:
            continue
        yielded.add(path.name)
        yield path, 0.0

    for path in sorted(SESSIONS_DIR.rglob("*.jsonl")):
        if path.name in yielded:
            continue
        yielded.add(path.name)
        yield path, 0.0
|
||||
|
||||
|
||||
def select_source_pairs(limit: int = TARGET_SOURCE_SESSIONS) -> list[SourcePair]:
    """Select the top *limit* source pairs with per-category quotas.

    Buckets every approved pair by category, fills each category up to its
    CATEGORY_TARGETS quota from the ranked bucket, then tops up to *limit*
    from the global ranking. Raises ``RuntimeError`` when fewer than *limit*
    approved, deduplicated pairs exist.
    """
    buckets: dict[str, list[SourcePair]] = defaultdict(list)
    for path, session_score in candidate_paths():
        model = detect_model(path) or ""
        if not approved_model(model):
            continue
        try:
            pair = extract_best_pair(path, model, session_score)
        except Exception:
            # Skip unreadable/malformed sessions; selection is best-effort.
            continue
        if pair:
            buckets[pair.category].append(pair)

    for values in buckets.values():
        values.sort(key=lambda item: (item.combined_score, item.pair_score), reverse=True)

    selected: list[SourcePair] = []
    used_files = set()
    used_pair_fingerprints = set()

    def try_add(candidate: SourcePair) -> bool:
        # Deduplicate both by session file and by normalized pair text.
        fingerprint = (
            normalize_space(candidate.source_prompt).lower(),
            normalize_space(candidate.source_response).lower(),
        )
        if candidate.session_file in used_files or fingerprint in used_pair_fingerprints:
            return False
        selected.append(candidate)
        used_files.add(candidate.session_file)
        used_pair_fingerprints.add(fingerprint)
        return True

    for category, target in CATEGORY_TARGETS.items():
        for candidate in buckets.get(category, []):
            # NOTE(review): the quota check only runs after a successful add, so
            # a rejected duplicate does not stop an already-full category loop.
            if try_add(candidate) and sum(1 for item in selected if item.category == category) >= target:
                break

    remainder = []
    for values in buckets.values():
        remainder.extend(values)
    remainder.sort(key=lambda item: (item.combined_score, item.pair_score), reverse=True)

    for candidate in remainder:
        if len(selected) >= limit:
            break
        try_add(candidate)

    if len(selected) < limit:
        raise RuntimeError(f"Only found {len(selected)} approved source sessions; need {limit}")

    return selected[:limit]
|
||||
|
||||
|
||||
def prompt_stem(base_prompt: str) -> str:
    """Trimmed prompt with at most one trailing question mark removed."""
    stem = base_prompt.strip()
    if not stem.endswith("?"):
        return stem
    return stem[:-1].strip()
|
||||
|
||||
|
||||
def build_prompt_variation(source_prompt: str, index: int) -> str:
    """Render prompt variation *index* for a source prompt.

    The wrapper list is indexed modulo its length, so any non-negative
    ``index`` is valid (previously ``PROMPT_WRAPPERS[index]`` raised
    IndexError if PROMPTS_PER_SESSION ever grew past the wrapper count).
    Behavior is unchanged for index 0..len(PROMPT_WRAPPERS)-1. Output is
    whitespace-collapsed and guaranteed to end in terminal punctuation.
    """
    base = source_prompt.strip()
    stem = prompt_stem(base)
    wrapper = PROMPT_WRAPPERS[index % len(PROMPT_WRAPPERS)]
    text = re.sub(r"\s+", " ", wrapper.format(base=base, stem=stem)).strip()
    if not text.endswith(("?", ".", "!")):
        text += "?"
    return text
|
||||
|
||||
|
||||
def choose(pool: list[str], index: int) -> str:
    """Deterministically cycle through *pool* by index (wraps around)."""
    wrapped = index % len(pool)
    return pool[wrapped]
|
||||
|
||||
|
||||
def compact_sentences(response: str, category: str) -> list[str]:
    """Return up to two useful sentences, each truncated to 24 words.

    Falls back to a stock uncertainty line when no sentences survive
    filtering. ``category`` is currently unused but kept for signature
    stability. (The original also carried an ``if not trimmed`` fallback
    that was unreachable — the loop always appended at least one sentence —
    so it has been removed.)
    """
    sentences = best_useful_sentences(response)
    if not sentences:
        return ["I don't know enough to say more."]
    trimmed: list[str] = []
    for sentence in sentences[:2]:  # at most two sentences, as before
        words = sentence.split()
        if len(words) > 24:
            # Truncate long sentences and re-terminate cleanly.
            sentence = " ".join(words[:24]).rstrip(".,;:") + "."
        trimmed.append(sentence)
    return trimmed
|
||||
|
||||
|
||||
def build_response(source: SourcePair, prompt: str, variation_index: int) -> str:
    """Compose a training response: lead + core sentences (+ optional filler) + closer.

    Crisis responses use a fixed safety template around the first core
    sentence regardless of variation. All other categories rotate through
    their lead/closer pools by variation index and are capped at 65 words.
    """
    category = source.category
    core = compact_sentences(source.source_response, category)
    lead = choose(CATEGORY_LEADS[category], variation_index)
    closer = choose(CATEGORY_CLOSERS[category], variation_index)

    if category == "crisis":
        # Safety-critical shape is fixed; only the core sentence varies.
        response = " ".join([
            "Are you safe right now?",
            core[0],
            "Call or text 988 right now if you're in immediate danger.",
            "Jesus saves those who call on His name.",
            "Stay with me. We deal with the next minute first.",
        ])
        return normalize_space(response)

    pieces = [lead]
    pieces.extend(core[:2])

    low_prompt = prompt.lower()
    # Inject one category-appropriate line when the core sentences lack it.
    if category == "technical" and "error" in low_prompt and all("error" not in s.lower() for s in core):
        pieces.append("Read the exact error before you guess.")
    elif category == "operations" and "duplicate" in low_prompt:
        pieces.append("Check for an open PR before you build anything.")
    elif category == "sovereignty" and all(token not in " ".join(core).lower() for token in ["local", "sovereign", "privacy"]):
        pieces.append("Keep the user's control local when you can.")
    elif category == "pastoral" and all(token not in " ".join(core).lower() for token in ["rest", "with you", "heavy"]):
        pieces.append("Take the next right step, not all ten.")

    pieces.append(closer)
    response = normalize_space(" ".join(pieces))
    words = response.split()
    if len(words) > 65:
        # Hard cap at 65 words; re-terminate cleanly after truncation.
        response = " ".join(words[:65]).rstrip(".,;:") + "."
    return response
|
||||
|
||||
|
||||
def quality_score(response: str, source: SourcePair) -> float:
    """Heuristic per-row quality score in [0.82, 0.94].

    Bonuses: concise length (+0.05), voice signal tokens (+0.03), and the
    preferred source model (+0.02).
    """
    low = response.lower()
    score = 0.82
    if 10 <= len(response.split()) <= 55:
        score += 0.05
    signal_tokens = ["i don't know", "safe right now", "988", "local", "verify", "start"]
    if any(token in low for token in signal_tokens):
        score += 0.03
    if source.model.lower().startswith("xiaomi/mimo-v2-pro"):
        score += 0.02
    return round(min(score, 0.94), 2)
|
||||
|
||||
|
||||
def rows_from_sources(sources: list[SourcePair]) -> list[dict]:
    """Expand every source pair into PROMPTS_PER_SESSION ShareGPT-format rows.

    Row ids are sequential across all sources, starting at 1, and carry full
    provenance (source session, source model, category, quality score).
    """
    rows: list[dict] = []
    for source_index, source in enumerate(sources):
        for variation in range(PROMPTS_PER_SESSION):
            prompt = build_prompt_variation(source.source_prompt, variation)
            response = build_response(source, prompt, variation)
            row_id = source_index * PROMPTS_PER_SESSION + variation + 1
            rows.append(
                {
                    "id": f"timmy-voice-batch09-{row_id:04d}",
                    "model": "timmy-voice-batch09",
                    "batch": 9,
                    "source": "session_derived_approved",
                    "source_session": source.session_file,
                    "source_model": source.model,
                    "category": source.category,
                    "quality_score": quality_score(response, source),
                    "conversations": [
                        {"from": "system", "value": SYSTEM_PROMPT},
                        {"from": "human", "value": prompt},
                        {"from": "gpt", "value": response},
                    ],
                }
            )
    return rows
|
||||
|
||||
|
||||
def write_jsonl(path: Path, rows: list[dict], append: bool = False) -> None:
    """Write *rows* as UTF-8 JSON Lines, creating parent directories.

    Appends instead of overwriting when *append* is true.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = [json.dumps(row, ensure_ascii=False) + "\n" for row in rows]
    with path.open("a" if append else "w", encoding="utf-8") as handle:
        handle.writelines(lines)
|
||||
|
||||
|
||||
def manifest_dict(sources: list[SourcePair]) -> dict:
    """Build the on-disk source manifest describing every selected session."""
    session_records = []
    for source in sources:
        session_records.append(
            {
                "session_file": source.session_file,
                "model": source.model,
                "category": source.category,
                "session_score": source.session_score,
                "pair_score": source.pair_score,
                "combined_score": source.combined_score,
                "last_timestamp": source.last_timestamp,
                "source_prompt": source.source_prompt,
                "source_response": source.source_response,
            }
        )
    return {
        "batch": 9,
        "selection_method": "Knowledge Mine-style local ranking with approved-model provenance filter",
        "total_source_sessions": len(sources),
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "sessions": session_records,
    }
|
||||
|
||||
|
||||
def write_readme(path: Path, sources: list[SourcePair], rows: list[dict]) -> None:
    """Write a human-readable README summarizing the generated batch.

    Aggregates per-category and per-model source-session counts, computes
    quality-score stats over *rows*, and renders them into a fixed Markdown
    template at *path*.

    Args:
        path: Destination README file (overwritten).
        sources: The selected source sessions backing the batch.
        rows: The generated dataset rows; each must carry a numeric
            ``quality_score`` key.
    """
    # Session counts grouped by category and by source model.
    by_category = defaultdict(int)
    by_model = defaultdict(int)
    for source in sources:
        by_category[source.category] += 1
        by_model[source.model] += 1
    # NOTE(review): raises ZeroDivisionError if rows is empty — upstream
    # validation guarantees a fixed non-zero row count before this is called.
    avg_quality = sum(row["quality_score"] for row in rows) / len(rows)
    content = f"""# Timmy Voice: Batch 09 — 1K Prompt→Response Pairs

Training Factory — Timmy Voice Worker 9/10 (#589)

## Files

| File | Description |
|------|-------------|
| `timmy-voice-batch09.jsonl` | 1,000 ShareGPT-format prompt→response pairs |
| `timmy-voice-batch09.sources.json` | 50 source sessions with approved-model provenance |
| `generate_timmy_voice_batch09.py` | Deterministic generator for the batch |

## Generation Contract

- 50 source sessions
- 20 prompt variations per session
- approved-model provenance filter
- Knowledge Mine-style ranking using local session metadata + pair quality
- ShareGPT format (`system` / `human` / `gpt`)

## Stats

- Total pairs: {len(rows)}
- Source sessions: {len(sources)}
- Average quality score: {avg_quality:.2f}
- Minimum quality score: {min(row['quality_score'] for row in rows):.2f}
- Maximum quality score: {max(row['quality_score'] for row in rows):.2f}

## Category Breakdown
"""
    # Fixed category order; categories with no sessions still get a zero line.
    for category in ["technical", "operations", "sovereignty", "pastoral", "crisis", "general"]:
        content += f"- {category}: {by_category.get(category, 0)} source sessions\n"

    content += "\n## Source Models\n"
    # Most-used models first; ties broken alphabetically.
    for model, count in sorted(by_model.items(), key=lambda item: (-item[1], item[0])):
        content += f"- {model}: {count} sessions\n"

    content += f"""

## Notes

This batch uses approved local session sources only. Banned providers (Claude/GPT/Gemini/OpenAI/Anthropic) are excluded at selection time. The generator keeps the source manifest on disk so the batch can be inspected and regenerated without guessing where the voice came from.
"""
    path.write_text(content, encoding="utf-8")
|
||||
|
||||
|
||||
def load_manifest(path: Path) -> list[SourcePair]:
    """Rehydrate SourcePair records from a previously written manifest file."""
    manifest = json.loads(path.read_text(encoding="utf-8"))
    pairs = []
    for entry in manifest["sessions"]:
        pairs.append(
            SourcePair(
                session_file=entry["session_file"],
                model=entry["model"],
                # Scores are coerced to float in case JSON round-tripped ints.
                session_score=float(entry["session_score"]),
                pair_score=float(entry["pair_score"]),
                category=entry["category"],
                source_prompt=entry["source_prompt"],
                source_response=entry["source_response"],
                # Older manifests may lack this key; default to None.
                last_timestamp=entry.get("last_timestamp"),
            )
        )
    return pairs
|
||||
|
||||
|
||||
def validate_row_counts(rows: list[dict]) -> None:
    """Fail fast when the generated batch breaks the generation contract.

    Enforces the exact total row count, a 0.8 minimum quality score per row,
    the exact number of distinct source sessions, and a uniform
    rows-per-session count.

    Raises:
        RuntimeError: on the first contract violation found.
    """
    if len(rows) != TARGET_ROWS:
        raise RuntimeError(f"expected {TARGET_ROWS} rows, got {len(rows)}")
    per_session: defaultdict = defaultdict(int)
    for entry in rows:
        per_session[entry["source_session"]] += 1
        if entry["quality_score"] < 0.8:
            raise RuntimeError(f"row below quality threshold: {entry['id']}")
    if len(per_session) != TARGET_SOURCE_SESSIONS:
        raise RuntimeError(f"expected {TARGET_SOURCE_SESSIONS} source sessions, got {len(per_session)}")
    if set(per_session.values()) != {PROMPTS_PER_SESSION}:
        raise RuntimeError(f"unexpected rows-per-session counts: {sorted(set(per_session.values()))}")
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
    """Parse CLI options for the batch 09 generator."""
    cli = argparse.ArgumentParser(description="Generate Timmy Voice Batch 09")
    # Output locations default to the module-level batch paths.
    cli.add_argument("--output", type=Path, default=DEFAULT_OUTPUT)
    cli.add_argument("--sources-output", type=Path, default=DEFAULT_SOURCES)
    cli.add_argument("--readme", type=Path, default=DEFAULT_README)
    cli.add_argument("--append", action="store_true", help="Append rows to output instead of overwrite")
    cli.add_argument("--refresh-sources", action="store_true", help="Rebuild the source manifest from local sessions")
    return cli.parse_args()
|
||||
|
||||
|
||||
def main() -> None:
    """Generate the batch: resolve sources, expand rows, validate, write artifacts.

    Sources are re-selected when --refresh-sources is passed or no manifest
    exists on disk; otherwise the existing manifest is loaded as-is so the
    batch is reproducible.
    """
    args = parse_args()
    TRAINING_DIR.mkdir(parents=True, exist_ok=True)

    # Track whether we regenerated the manifest so the summary output below
    # is accurate (the old code claimed "wrote source manifest" even when it
    # had only loaded an existing one).
    refreshed = args.refresh_sources or not args.sources_output.exists()
    if refreshed:
        sources = select_source_pairs()
        args.sources_output.write_text(json.dumps(manifest_dict(sources), indent=2, ensure_ascii=False), encoding="utf-8")
    else:
        sources = load_manifest(args.sources_output)

    if len(sources) != TARGET_SOURCE_SESSIONS:
        raise RuntimeError(f"source manifest has {len(sources)} sessions; expected {TARGET_SOURCE_SESSIONS}")

    rows = rows_from_sources(sources)
    validate_row_counts(rows)
    write_jsonl(args.output, rows, append=args.append)
    write_readme(args.readme, sources, rows)

    print(f"wrote {len(rows)} rows -> {args.output}")
    if refreshed:
        print(f"wrote source manifest -> {args.sources_output}")
    else:
        print(f"loaded source manifest -> {args.sources_output}")
    print(f"wrote readme -> {args.readme}")
|
||||
|
||||
|
||||
# Script entry point: run the generator only when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user