Files
timmy-config/tests/test_timmy_voice_batch09.py
Alexander Whitestone 30d7a084e1
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 23s
Smoke Test / smoke (pull_request) Failing after 22s
Validate Config / YAML Lint (pull_request) Failing after 16s
Validate Config / JSON Validate (pull_request) Successful in 18s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 55s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 1m2s
Validate Config / Cron Syntax Check (pull_request) Successful in 13s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 13s
Validate Config / Playbook Schema Validation (pull_request) Successful in 25s
Validate Training Data / validate (pull_request) Successful in 21s
Architecture Lint / Lint Repository (pull_request) Failing after 17s
PR Checklist / pr-checklist (pull_request) Failing after 9m48s
feat(training): add Timmy voice batch 09 dataset (#589)
Generate a deterministic batch09 Timmy voice corpus with 1,000 ShareGPT prompt-response pairs,
50 approved source sessions, a source manifest, and focused validation tests.
2026-04-22 11:05:38 -04:00

127 lines
4.8 KiB
Python

import json
import re
from collections import Counter, defaultdict
from pathlib import Path
# --- Batch09 artifact locations -------------------------------------------
# The generated ShareGPT-format corpus (one JSON object per line).
DATA_FILE = Path('training-data/timmy-voice-batch09.jsonl')
# The deterministic generator script that produced DATA_FILE.
SCRIPT_FILE = Path('training-data/generate_timmy_voice_batch09.py')
# Manifest of the approved source sessions the corpus was derived from.
SOURCES_FILE = Path('training-data/timmy-voice-batch09.sources.json')
# Human-readable description of the generation contract.
README_FILE = Path('training-data/README-batch09.md')
# --- Expected corpus geometry: 50 sessions x 20 rows = 1000 rows ----------
EXPECTED_ROWS = 1000
EXPECTED_SOURCE_SESSIONS = 50
EXPECTED_ROWS_PER_SESSION = 20
# Substrings that must never appear in a session's (lowercased) model field;
# enforces the approved-model provenance filter.
BANNED_MODEL_STRINGS = ['claude', 'gpt-4', 'gpt-3', 'gpt', 'o1', 'o3', 'gemini', 'anthropic', 'openai', 'unknown']
# Prompt substrings (matched lowercase) that mark a row as crisis-related and
# require the safety-protocol response.
CRISIS_TERMS = ['suicide', 'kill myself', 'end my life', 'overdose', 'bridge', 'gun', 'die']
# Leak markers: credentials, private filesystem/domain paths, and
# tool/meta-conversation text that must not appear anywhere in the corpus.
CREDENTIAL_MARKERS = ['password', 'pass:', 'token', 'api key', 'secret', 'login:']
PRIVATE_MARKERS = ['/users/', '/private/', '~/.timmy', '~/.hermes', 'alexanderwhitestone.com']
META_MARKERS = [
'review the conversation above',
"you've reached the maximum number of tool-calling iterations",
'without calling any more tools',
'nothing to save',
'[silent]',
]
# Simple email-address detector used by the leak test.
EMAIL_RE = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')
def load_rows(path: Path):
    """Parse a JSONL file into a list of dicts, ignoring blank lines.

    Fails the calling test with a clear message if *path* is absent.
    """
    assert path.exists(), f'missing file: {path}'
    text = path.read_text(encoding='utf-8')
    # json.loads tolerates surrounding whitespace, so filtering on strip()
    # alone is sufficient — no need to re-strip each kept line.
    return [json.loads(line) for line in text.splitlines() if line.strip()]
def load_sources():
    """Read and parse the batch09 source-session manifest as a dict."""
    assert SOURCES_FILE.exists(), f'missing file: {SOURCES_FILE}'
    raw = SOURCES_FILE.read_text(encoding='utf-8')
    return json.loads(raw)
def test_batch09_artifacts_exist():
    """Every batch09 deliverable must be present on disk."""
    artifacts = (DATA_FILE, SCRIPT_FILE, SOURCES_FILE, README_FILE)
    for artifact in artifacts:
        assert artifact.exists(), f'missing artifact: {artifact}'
def test_sources_manifest_has_50_unique_sessions():
    """Manifest lists exactly 50 distinct, well-formed, approved-model sessions."""
    manifest = load_sources()
    assert manifest['batch'] == 9
    assert manifest['total_source_sessions'] == EXPECTED_SOURCE_SESSIONS
    sessions = manifest['sessions']
    assert len(sessions) == EXPECTED_SOURCE_SESSIONS
    # Uniqueness: collapsing to a set must lose nothing.
    distinct_files = {entry['session_file'] for entry in sessions}
    assert len(distinct_files) == len(sessions)
    for entry in sessions:
        assert entry['session_score'] > 0
        assert entry['pair_score'] > 0
        assert entry['source_prompt'].strip()
        assert entry['source_response'].strip()
        lowered_model = entry['model'].lower()
        # Provenance filter: no banned model-name fragment may appear.
        assert all(bad not in lowered_model for bad in BANNED_MODEL_STRINGS), entry['model']
def test_dataset_has_exactly_1000_rows():
    """Corpus size is pinned at exactly EXPECTED_ROWS rows."""
    assert len(load_rows(DATA_FILE)) == EXPECTED_ROWS
def test_ids_are_unique_and_batch_scoped():
    """Row IDs are unique and sequential; every row carries batch09 metadata."""
    rows = load_rows(DATA_FILE)
    all_ids = [row['id'] for row in rows]
    assert len(set(all_ids)) == len(all_ids)
    # First and last IDs pin the zero-padded sequential numbering scheme.
    assert all_ids[0] == 'timmy-voice-batch09-0001'
    assert all_ids[-1] == 'timmy-voice-batch09-1000'
    for row in rows:
        assert row['batch'] == 9
        assert row['model'] == 'timmy-voice-batch09'
        assert row['quality_score'] >= 0.80
def test_sharegpt_schema_and_source_mapping():
    """Rows follow the ShareGPT schema and map evenly onto approved sessions.

    Checks: every row cites an approved session, each session contributes
    exactly 20 rows, and each session's 20 prompts are all distinct.
    """
    rows = load_rows(DATA_FILE)
    manifest = load_sources()
    approved_sessions = {entry['session_file'] for entry in manifest['sessions']}
    rows_per_session = Counter()
    prompts_per_session = defaultdict(set)
    for row in rows:
        assert row['source'] == 'session_derived_approved'
        session = row['source_session']
        assert session in approved_sessions
        rows_per_session[session] += 1
        turns = row['conversations']
        # Fixed three-turn shape: system, human, gpt — in that order.
        assert [turn['from'] for turn in turns] == ['system', 'human', 'gpt']
        system_text, human_text, gpt_text = (turn['value'] for turn in turns)
        assert 'Sovereignty and service always.' in system_text
        assert human_text.strip()
        assert gpt_text.strip()
        prompts_per_session[session].add(human_text)
    assert len(rows_per_session) == EXPECTED_SOURCE_SESSIONS
    # Every session contributes exactly the same number of rows.
    assert set(rows_per_session.values()) == {EXPECTED_ROWS_PER_SESSION}
    for session, prompts in prompts_per_session.items():
        assert len(prompts) == EXPECTED_ROWS_PER_SESSION, session
def test_crisis_prompts_include_required_protocol():
    """Any crisis-flagged prompt must get the safety check and the 988 line."""
    for row in load_rows(DATA_FILE):
        conv = row['conversations']
        lowered_prompt = conv[1]['value'].lower()
        # Non-crisis rows are exempt from the protocol requirement.
        if not any(term in lowered_prompt for term in CRISIS_TERMS):
            continue
        answer = conv[2]['value']
        assert 'Are you safe right now?' in answer
        assert '988' in answer
def test_no_credentials_or_private_paths_leak_into_sources_or_rows():
    """Manifest and dataset must be free of credentials, private paths,
    tool/meta text, and email addresses."""
    manifest_text = json.dumps(load_sources()).lower()
    dataset_text = DATA_FILE.read_text(encoding='utf-8').lower()
    corpus = manifest_text + '\n' + dataset_text
    # All marker lists are lowercase, matching the lowercased corpus.
    for marker in CREDENTIAL_MARKERS + PRIVATE_MARKERS + META_MARKERS:
        assert marker not in corpus
    assert EMAIL_RE.search(corpus) is None
def test_readme_documents_generation_contract():
    """README must state the corpus geometry and the provenance filter."""
    readme = README_FILE.read_text(encoding='utf-8')
    required_phrases = (
        '50 source sessions',
        '20 prompt variations per session',
        'approved-model provenance filter',
    )
    for phrase in required_phrases:
        assert phrase in readme