#!/usr/bin/env python3
"""
validate_timmy_voice.py — Validate timmy-voice training data for quality and compliance.

Usage:
    python3 validate_timmy_voice.py training-data/timmy-voice-batch03.jsonl
"""

import json
import sys
from pathlib import Path

# Top-level keys every training entry must carry.
_REQUIRED_FIELDS = ("id", "model", "batch", "source", "quality_score", "conversations")
# Speaker order expected in the "conversations" list of each entry.
_EXPECTED_ROLES = ["system", "human", "gpt"]
# Entries scoring below this are rejected.
_MIN_QUALITY_SCORE = 0.8
# Upper bound on whitespace-separated words in the gpt response.
_MAX_RESPONSE_WORDS = 100
# Substrings (matched case-insensitively) that mark a prompt as a crisis prompt.
_CRISIS_PHRASES = (
    "suicide",
    "kill myself",
    "end it all",
    "don't want to live",
    "hurt myself",
)
# How many error lines main() prints before summarising the remainder.
_MAX_ERRORS_SHOWN = 20


def _turn_text(conv: list, pos: int, idx: int) -> tuple[str, list[str]]:
    """Return the text of turn *pos* plus any errors found while reading it.

    A turn that does not exist yields ``("", [])`` (the turn-count check in
    ``validate_entry`` already reports that). A turn whose ``"value"`` is
    missing or not a string yields ``""`` and one error message.
    """
    if len(conv) <= pos:
        return "", []
    value = conv[pos].get("value")
    if not isinstance(value, str):
        return "", [f"[{idx}] Conversation turn {pos} is missing a string 'value'"]
    return value, []


def validate_entry(entry: dict, idx: int) -> list[str]:
    """Check one parsed training entry and return its list of error messages.

    Args:
        entry: The decoded JSON value for one line of the data file.
        idx: Entry number used as the ``[idx]`` prefix of each message.

    Returns:
        A list of human-readable error strings; empty when the entry is valid.
        Malformed input (wrong container types, missing ``value`` keys,
        non-numeric scores) is reported as errors rather than raised.
    """
    if not isinstance(entry, dict):
        return [f"[{idx}] Entry must be a JSON object, got {type(entry).__name__}"]

    errors = [
        f"[{idx}] Missing field: {field}"
        for field in _REQUIRED_FIELDS
        if field not in entry
    ]

    # Conversations structure
    conv = entry.get("conversations", [])
    if not isinstance(conv, list) or not all(isinstance(turn, dict) for turn in conv):
        errors.append(f"[{idx}] conversations must be a list of objects")
        conv = []  # nothing below can be inspected safely
    else:
        if len(conv) != 3:
            errors.append(f"[{idx}] Expected 3 conversations, got {len(conv)}")
        roles = [turn.get("from") for turn in conv]
        if roles != _EXPECTED_ROLES:
            errors.append(f"[{idx}] Expected roles [system, human, gpt], got {roles}")

    # Quality threshold
    score = entry.get("quality_score", 0)
    # bool is an int subclass; a JSON true/false is not a meaningful score.
    if isinstance(score, bool) or not isinstance(score, (int, float)):
        errors.append(f"[{idx}] Quality score {score!r} is not a number")
    elif not score >= _MIN_QUALITY_SCORE:
        # Written as "not >=" so that NaN (every comparison False) is rejected too.
        errors.append(f"[{idx}] Quality score {score} below threshold {_MIN_QUALITY_SCORE}")

    prompt, prompt_errors = _turn_text(conv, 1, idx)
    response, response_errors = _turn_text(conv, 2, idx)
    errors.extend(prompt_errors)
    errors.extend(response_errors)

    # Response length (Timmy prefers short)
    words = len(response.split())
    if words > _MAX_RESPONSE_WORDS:
        errors.append(f"[{idx}] Response too long ({words} words)")

    # SOUL.md compliance check — crisis protocol
    prompt_lower = prompt.lower()
    if any(phrase in prompt_lower for phrase in _CRISIS_PHRASES):
        if "988" not in response and "are you safe" not in response.lower():
            errors.append(f"[{idx}] CRISIS FAIL: Missing 988 or safety check")

    return errors


def validate_file(path: Path) -> dict:
    """Validate every line of a JSONL file.

    Args:
        path: Location of the JSONL file; it is decoded as UTF-8.

    Returns:
        A dict with ``total`` (lines examined), ``valid`` (lines with no
        errors), ``errors`` (all messages, in file order) and ``pass_rate``
        (``valid / total``, or 0 for an empty file).

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    text = path.read_text(encoding="utf-8").strip()
    # Split on "\n" only. str.splitlines() also breaks on characters such as
    # U+2028/U+2029 that may appear raw inside a JSON string, which would cut
    # a valid record in two. A trailing "\r" from CRLF files is harmless:
    # json.loads treats it as whitespace.
    lines = text.split("\n") if text else []

    all_errors = []
    valid = 0
    for line_no, line in enumerate(lines, 1):
        try:
            entry = json.loads(line)
        except json.JSONDecodeError as e:
            all_errors.append(f"[{line_no}] JSON parse error: {e}")
            continue
        errors = validate_entry(entry, line_no)
        if errors:
            all_errors.extend(errors)
        else:
            valid += 1

    total = len(lines)
    return {
        "total": total,
        "valid": valid,
        "errors": all_errors,
        "pass_rate": valid / total if total else 0,
    }


def main():
    """Command-line entry point: validate the file named in ``sys.argv[1]``.

    Prints a summary and up to 20 errors, then exits with status 0 when every
    entry passed and 1 otherwise (including usage and read failures).
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <path-to-jsonl>")
        sys.exit(1)

    path = Path(sys.argv[1])
    try:
        result = validate_file(path)
    except (OSError, UnicodeDecodeError) as exc:
        # Report a missing or undecodable file cleanly instead of a traceback.
        print(f"Cannot read {path}: {exc}", file=sys.stderr)
        sys.exit(1)

    print(f"Validated: {path}")
    print(f"Total entries: {result['total']}")
    print(f"Valid entries: {result['valid']}")
    print(f"Pass rate: {result['pass_rate']:.1%}")

    errors = result["errors"]
    if errors:
        print(f"\nErrors ({len(errors)}):")
        for err in errors[:_MAX_ERRORS_SHOWN]:
            print(f" {err}")
        if len(errors) > _MAX_ERRORS_SHOWN:
            print(f" ... and {len(errors) - _MAX_ERRORS_SHOWN} more")
        sys.exit(1)
    else:
        print("\nAll entries passed validation.")
        sys.exit(0)


if __name__ == "__main__":
    main()