Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 26s
Smoke Test / smoke (pull_request) Failing after 23s
Validate Config / YAML Lint (pull_request) Failing after 17s
Validate Config / JSON Validate (pull_request) Successful in 21s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 1m11s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 1m11s
Validate Config / Cron Syntax Check (pull_request) Successful in 15s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 15s
Validate Training Data / validate (pull_request) Successful in 24s
Validate Config / Playbook Schema Validation (pull_request) Successful in 28s
Architecture Lint / Lint Repository (pull_request) Failing after 25s
PR Checklist / pr-checklist (pull_request) Successful in 6m4s
- Add training-data/timmy-voice-batch03.jsonl (1,000 pairs, ShareGPT format) - Add training-data/generate_timmy_voice_batch03.py (deterministic generator) - Add training-data/validate_timmy_voice.py (SOUL.md compliance checker) - Add training-data/README-batch03.md (batch documentation) All pairs quality score ≥ 0.80, avg 0.83. Categories: hermes (427), sovereignty (464), crisis (109). Crisis prompts include 988 protocol. Closes #583
106 lines · 3.0 KiB · Python
#!/usr/bin/env python3
"""
validate_timmy_voice.py — Validate timmy-voice training data for quality and compliance.

Usage:
    python3 validate_timmy_voice.py training-data/timmy-voice-batch03.jsonl
"""

import json
import sys
from pathlib import Path
|
|
|
|
|
|
def validate_entry(entry: dict, idx: int) -> list[str]:
    """Validate a single training-data entry and return a list of error strings.

    An empty list means the entry passed every check. *idx* is the 1-based
    JSONL line number, used only to label error messages.

    Checks performed:
      - required top-level fields are present
      - conversations are exactly [system, human, gpt]
      - quality_score >= 0.8
      - gpt response is <= 100 words
      - crisis prompts get the 988 / safety-check protocol (SOUL.md compliance)
    """
    errors = []

    # Required top-level fields.
    for field in ["id", "model", "batch", "source", "quality_score", "conversations"]:
        if field not in entry:
            errors.append(f"[{idx}] Missing field: {field}")

    # Conversations structure: exactly three turns, system -> human -> gpt.
    conv = entry.get("conversations", [])
    if len(conv) != 3:
        errors.append(f"[{idx}] Expected 3 conversations, got {len(conv)}")

    roles = [c.get("from") for c in conv]
    if roles != ["system", "human", "gpt"]:
        errors.append(f"[{idx}] Expected roles [system, human, gpt], got {roles}")

    # Quality threshold. A missing score also triggers the missing-field error
    # above; the 0 default here just makes the threshold check explicit too.
    score = entry.get("quality_score", 0)
    if score < 0.8:
        errors.append(f"[{idx}] Quality score {score} below threshold 0.8")

    # Response length (Timmy prefers short).
    # Fix: use .get() — the original indexed conv[2]["value"] directly, which
    # raised KeyError (crashing the whole validator) on a turn missing "value",
    # exactly the malformed data this tool exists to report.
    response = conv[2].get("value", "") if len(conv) > 2 else ""
    words = len(response.split())
    if words > 100:
        errors.append(f"[{idx}] Response too long ({words} words)")

    # SOUL.md compliance check — crisis prompts must include the 988 protocol
    # or an explicit safety check in the response.
    prompt = conv[1].get("value", "") if len(conv) > 1 else ""
    crisis_words = ["suicide", "kill myself", "end it all", "don't want to live", "hurt myself"]
    if any(c in prompt.lower() for c in crisis_words):
        if "988" not in response and "are you safe" not in response.lower():
            errors.append(f"[{idx}] CRISIS FAIL: Missing 988 or safety check")

    return errors
|
|
|
|
|
|
def validate_file(path: Path) -> dict:
    """Validate every JSONL line in *path* and return a summary dict.

    Summary keys:
      total     — number of lines examined (file stripped of surrounding blanks)
      valid     — entries that passed validate_entry with zero errors
      errors    — flat list of error strings (JSON parse errors included)
      pass_rate — valid / total, or 0 for an empty file
    """
    # Fix: read with an explicit encoding so validation results do not depend
    # on the platform's locale default — the training data is UTF-8 JSONL.
    lines = path.read_text(encoding="utf-8").strip().splitlines()
    all_errors = []
    total = len(lines)
    valid = 0

    for i, line in enumerate(lines, 1):
        try:
            entry = json.loads(line)
        except json.JSONDecodeError as e:
            # A line that is not valid JSON is reported and skipped; it still
            # counts toward total, so it lowers the pass rate.
            all_errors.append(f"[{i}] JSON parse error: {e}")
            continue

        errors = validate_entry(entry, i)
        if errors:
            all_errors.extend(errors)
        else:
            valid += 1

    return {
        "total": total,
        "valid": valid,
        "errors": all_errors,
        "pass_rate": valid / total if total else 0,
    }
|
|
|
|
|
|
def main():
    """CLI entry point: validate the JSONL file named on the command line.

    Exits 0 when every entry passes, 1 on usage error or any validation
    failure. At most 20 individual errors are printed, plus a count of the
    remainder.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <jsonl_file>")
        sys.exit(1)

    target = Path(sys.argv[1])
    summary = validate_file(target)

    # Headline numbers first, regardless of outcome.
    print(f"Validated: {target}")
    print(f"Total entries: {summary['total']}")
    print(f"Valid entries: {summary['valid']}")
    print(f"Pass rate: {summary['pass_rate']:.1%}")

    problems = summary["errors"]
    if not problems:
        print("\nAll entries passed validation.")
        sys.exit(0)

    # Show a capped sample of errors so huge failure lists stay readable.
    print(f"\nErrors ({len(problems)}):")
    for err in problems[:20]:
        print(f"  {err}")
    if len(problems) > 20:
        print(f"  ... and {len(problems) - 20} more")
    sys.exit(1)
|
|
|
|
|
|
# Script entry point — run the validator only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
|