training-data/validate_timmy_voice.py

#!/usr/bin/env python3
"""
validate_timmy_voice.py — Validate timmy-voice training data for quality and compliance.

Usage:
    python3 validate_timmy_voice.py training-data/timmy-voice-batch03.jsonl
"""

import json
import sys
from pathlib import Path


def validate_entry(entry: dict, idx: int) -> list[str]:
    """Check one parsed JSONL record and return a list of error messages.

    The checks are: presence of the required top-level fields, a three-turn
    ``conversations`` list with roles system/human/gpt, a ``quality_score``
    of at least 0.8, a response of at most 100 whitespace-separated words,
    and -- when the human prompt contains a crisis phrase -- a response that
    mentions "988" or asks "are you safe".

    Malformed records (non-object entry, non-list conversations, turns
    without a string ``value``, non-numeric score) are reported as errors
    rather than raising, so a single bad record cannot abort a whole run.

    Args:
        entry: The decoded JSON value for one line of the file.
        idx: Record number used as the ``[idx]`` prefix of every message.

    Returns:
        Error strings for this record; an empty list means it passed.
    """
    # json.loads can return any JSON value; only objects have fields to check.
    if not isinstance(entry, dict):
        return [f"[{idx}] Entry is not a JSON object (got {type(entry).__name__})"]

    errors = []

    # Required fields
    for field in ["id", "model", "batch", "source", "quality_score", "conversations"]:
        if field not in entry:
            errors.append(f"[{idx}] Missing field: {field}")

    # Conversations structure
    conv = entry.get("conversations", [])
    if isinstance(conv, list):
        if len(conv) != 3:
            errors.append(f"[{idx}] Expected 3 conversations, got {len(conv)}")

        # A turn that is not an object has no role; None makes the comparison fail.
        roles = [c.get("from") if isinstance(c, dict) else None for c in conv]
        if roles != ["system", "human", "gpt"]:
            errors.append(f"[{idx}] Expected roles [system, human, gpt], got {roles}")
    else:
        errors.append(
            f"[{idx}] conversations must be a list, got {type(conv).__name__}"
        )
        conv = []

    # Quality threshold
    score = entry.get("quality_score", 0)
    if isinstance(score, bool) or not isinstance(score, (int, float)):
        errors.append(f"[{idx}] Quality score must be a number, got {score!r}")
    elif not score >= 0.8:
        # Written as a negated ">=" so that NaN (which compares false to
        # everything) is rejected instead of slipping through.
        errors.append(f"[{idx}] Quality score {score} below threshold 0.8")

    # Extract the human prompt (turn 1) and model response (turn 2). Turns that
    # are absent were already reported by the length check and count as "".
    texts = {1: "", 2: ""}
    for pos in (1, 2):
        if pos >= len(conv):
            continue
        turn = conv[pos]
        value = turn.get("value") if isinstance(turn, dict) else None
        if isinstance(value, str):
            texts[pos] = value
        else:
            errors.append(f"[{idx}] Conversation turn {pos} has no string 'value'")
    prompt, response = texts[1], texts[2]

    # Response length (Timmy prefers short)
    words = len(response.split())
    if words > 100:
        errors.append(f"[{idx}] Response too long ({words} words)")

    # SOUL.md compliance check — crisis protocol
    crisis_words = ["suicide", "kill myself", "end it all", "don't want to live", "hurt myself"]
    prompt_lower = prompt.lower()
    if any(phrase in prompt_lower for phrase in crisis_words):
        # Either the 988 reference or an explicit safety question satisfies the check.
        if "988" not in response and "are you safe" not in response.lower():
            errors.append(f"[{idx}] CRISIS FAIL: Missing 988 or safety check")

    return errors


def validate_file(path: Path) -> dict:
    """Validate every line of a JSONL file and summarise the outcome.

    The file is decoded as UTF-8 and split on newline characters only.
    ``str.splitlines()`` is deliberately avoided: it also breaks on
    characters such as U+2028 that may legally appear unescaped inside a
    JSON string, which would tear one record into two unparsable lines.
    Leading and trailing whitespace of the whole file is ignored; entry
    numbers in messages are 1-based positions within the remaining text.

    Args:
        path: Location of the JSONL file to check.

    Returns:
        A dict with ``total`` (number of lines examined), ``valid`` (lines
        with no errors), ``errors`` (all error messages in file order) and
        ``pass_rate`` (``valid / total``, or 0 for an empty file).
    """
    text = path.read_text(encoding="utf-8").strip()
    # "".split("\n") would yield [""], so an empty file is handled explicitly.
    lines = text.split("\n") if text else []
    all_errors = []
    total = len(lines)
    valid = 0

    for i, line in enumerate(lines, 1):
        try:
            entry = json.loads(line)
        except json.JSONDecodeError as e:
            all_errors.append(f"[{i}] JSON parse error: {e}")
            continue

        # A line can be valid JSON without being an object (e.g. a list or a
        # bare string); report that here rather than handing it to the
        # per-entry checks, which expect a dict.
        if not isinstance(entry, dict):
            all_errors.append(
                f"[{i}] Entry is not a JSON object (got {type(entry).__name__})"
            )
            continue

        errors = validate_entry(entry, i)
        if errors:
            all_errors.extend(errors)
        else:
            valid += 1

    return {
        "total": total,
        "valid": valid,
        "errors": all_errors,
        "pass_rate": valid / total if total else 0,
    }


def main():
    """Command-line entry point: validate the JSONL file named in ``argv[1]``.

    Prints a summary to stdout, followed by at most the first 20 error
    messages, then exits with status 0 when every entry passed and 1
    otherwise. A missing argument, a path that is not an existing file, or
    a file that cannot be read or decoded also exits with status 1, with
    the read-failure messages going to stderr instead of a traceback.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <jsonl_file>")
        sys.exit(1)

    path = Path(sys.argv[1])
    if not path.is_file():
        print(f"Error: {path} is not an existing file", file=sys.stderr)
        sys.exit(1)

    try:
        result = validate_file(path)
    except (OSError, UnicodeDecodeError) as exc:
        # Permission problems or undecodable bytes: report and fail cleanly.
        print(f"Error: could not read {path}: {exc}", file=sys.stderr)
        sys.exit(1)

    print(f"Validated: {path}")
    print(f"Total entries: {result['total']}")
    print(f"Valid entries: {result['valid']}")
    print(f"Pass rate: {result['pass_rate']:.1%}")

    if result["errors"]:
        print(f"\nErrors ({len(result['errors'])}):")
        # Cap the listing so a badly broken file does not flood the terminal.
        for err in result["errors"][:20]:
            print(f"  {err}")
        if len(result["errors"]) > 20:
            print(f"  ... and {len(result['errors']) - 20} more")
        sys.exit(1)
    else:
        print("\nAll entries passed validation.")
        sys.exit(0)


# Run the validator only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
feat: Timmy Voice Batch 03 — 1K prompt→response pairs (#583) - Add training-data/timmy-voice-batch03.jsonl (1,000 pairs, ShareGPT format) - Add training-data/generate_timmy_voice_batch03.py (deterministic generator) - Add training-data/validate_timmy_voice.py (SOUL.md compliance checker) - Add training-data/README-batch03.md (batch documentation) All pairs quality score ≥ 0.80, avg 0.83. Categories: hermes (427), sovereignty (464), crisis (109). Crisis prompts include 988 protocol. Closes #583 2026-04-22 02:58:23 -04:00			`#!/usr/bin/env python3`
			`"""`
			`validate_timmy_voice.py — Validate timmy-voice training data for quality and compliance.`

			`Usage:`
			`python3 validate_timmy_voice.py training-data/timmy-voice-batch03.jsonl`
			`"""`

			`import json`
			`import sys`
			`from pathlib import Path`


			`def validate_entry(entry: dict, idx: int) -> list[str]:`
			`errors = []`

			`# Required fields`
			`for field in ["id", "model", "batch", "source", "quality_score", "conversations"]:`
			`if field not in entry:`
			`errors.append(f"[{idx}] Missing field: {field}")`

			`# Conversations structure`
			`conv = entry.get("conversations", [])`
			`if len(conv) != 3:`
			`errors.append(f"[{idx}] Expected 3 conversations, got {len(conv)}")`

			`roles = [c.get("from") for c in conv]`
			`if roles != ["system", "human", "gpt"]:`
			`errors.append(f"[{idx}] Expected roles [system, human, gpt], got {roles}")`

			`# Quality threshold`
			`score = entry.get("quality_score", 0)`
			`if score < 0.8:`
			`errors.append(f"[{idx}] Quality score {score} below threshold 0.8")`

			`# Response length (Timmy prefers short)`
			`response = conv[2]["value"] if len(conv) > 2 else ""`
			`words = len(response.split())`
			`if words > 100:`
			`errors.append(f"[{idx}] Response too long ({words} words)")`

			`# SOUL.md compliance check — crisis protocol`
			`prompt = conv[1]["value"] if len(conv) > 1 else ""`
			`crisis_words = ["suicide", "kill myself", "end it all", "don't want to live", "hurt myself"]`
			`if any(c in prompt.lower() for c in crisis_words):`
			`if "988" not in response and "are you safe" not in response.lower():`
			`errors.append(f"[{idx}] CRISIS FAIL: Missing 988 or safety check")`

			`return errors`


			`def validate_file(path: Path) -> dict:`
			`lines = path.read_text().strip().splitlines()`
			`all_errors = []`
			`total = len(lines)`
			`valid = 0`

			`for i, line in enumerate(lines, 1):`
			`try:`
			`entry = json.loads(line)`
			`except json.JSONDecodeError as e:`
			`all_errors.append(f"[{i}] JSON parse error: {e}")`
			`continue`

			`errors = validate_entry(entry, i)`
			`if errors:`
			`all_errors.extend(errors)`
			`else:`
			`valid += 1`

			`return {`
			`"total": total,`
			`"valid": valid,`
			`"errors": all_errors,`
			`"pass_rate": valid / total if total else 0,`
			`}`


			`def main():`
			`if len(sys.argv) < 2:`
			`print(f"Usage: {sys.argv[0]} <jsonl_file>")`
			`sys.exit(1)`

			`path = Path(sys.argv[1])`
			`result = validate_file(path)`

			`print(f"Validated: {path}")`
			`print(f"Total entries: {result['total']}")`
			`print(f"Valid entries: {result['valid']}")`
			`print(f"Pass rate: {result['pass_rate']:.1%}")`

			`if result["errors"]:`
			`print(f"\nErrors ({len(result['errors'])}):")`
			`for err in result["errors"][:20]:`
			`print(f" {err}")`
			`if len(result["errors"]) > 20:`
			`print(f" ... and {len(result['errors']) - 20} more")`
			`sys.exit(1)`
			`else:`
			`print("\nAll entries passed validation.")`
			`sys.exit(0)`


			`if __name__ == "__main__":`
			`main()`