feat(training): add Timmy voice batch 09 dataset (#589)
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 23s
Smoke Test / smoke (pull_request) Failing after 22s
Validate Config / YAML Lint (pull_request) Failing after 16s
Validate Config / JSON Validate (pull_request) Successful in 18s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 55s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 1m2s
Validate Config / Cron Syntax Check (pull_request) Successful in 13s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 13s
Validate Config / Playbook Schema Validation (pull_request) Successful in 25s
Validate Training Data / validate (pull_request) Successful in 21s
Architecture Lint / Lint Repository (pull_request) Failing after 17s
PR Checklist / pr-checklist (pull_request) Failing after 9m48s
Generate a deterministic batch 09 Timmy voice corpus: 1,000 ShareGPT prompt-response pairs derived from 50 approved source sessions (20 prompt variations per session), plus a source manifest and focused validation tests.
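For reference, here is a minimal sketch of one dataset row in the shape the new tests assert (shown pretty-printed; each row occupies a single JSONL line). The session filename, quality score, and message text are hypothetical placeholders — only the field names, ID format, and schema constraints come from the test file below:

{"id": "timmy-voice-batch09-0001",
 "batch": 9,
 "model": "timmy-voice-batch09",
 "quality_score": 0.92,
 "source": "session_derived_approved",
 "source_session": "<approved-session-file>",
 "conversations": [
   {"from": "system", "value": "... Sovereignty and service always. ..."},
   {"from": "human", "value": "<prompt variation>"},
   {"from": "gpt", "value": "<Timmy-voice response>"}
 ]}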
tests/test_timmy_voice_batch09.py (new file, 126 lines added)
@@ -0,0 +1,126 @@
import json
import re
from collections import Counter, defaultdict
from pathlib import Path

DATA_FILE = Path('training-data/timmy-voice-batch09.jsonl')
SCRIPT_FILE = Path('training-data/generate_timmy_voice_batch09.py')
SOURCES_FILE = Path('training-data/timmy-voice-batch09.sources.json')
README_FILE = Path('training-data/README-batch09.md')
EXPECTED_ROWS = 1000
EXPECTED_SOURCE_SESSIONS = 50
EXPECTED_ROWS_PER_SESSION = 20
BANNED_MODEL_STRINGS = ['claude', 'gpt-4', 'gpt-3', 'gpt', 'o1', 'o3', 'gemini', 'anthropic', 'openai', 'unknown']
CRISIS_TERMS = ['suicide', 'kill myself', 'end my life', 'overdose', 'bridge', 'gun', 'die']
CREDENTIAL_MARKERS = ['password', 'pass:', 'token', 'api key', 'secret', 'login:']
PRIVATE_MARKERS = ['/users/', '/private/', '~/.timmy', '~/.hermes', 'alexanderwhitestone.com']
META_MARKERS = [
    'review the conversation above',
    "you've reached the maximum number of tool-calling iterations",
    'without calling any more tools',
    'nothing to save',
    '[silent]',
]
EMAIL_RE = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')


def load_rows(path: Path):
    assert path.exists(), f'missing file: {path}'
    rows = []
    for line in path.read_text(encoding='utf-8').splitlines():
        line = line.strip()
        if line:
            rows.append(json.loads(line))
    return rows


def load_sources():
    assert SOURCES_FILE.exists(), f'missing file: {SOURCES_FILE}'
    return json.loads(SOURCES_FILE.read_text(encoding='utf-8'))


def test_batch09_artifacts_exist():
    for path in [DATA_FILE, SCRIPT_FILE, SOURCES_FILE, README_FILE]:
        assert path.exists(), f'missing artifact: {path}'


def test_sources_manifest_has_50_unique_sessions():
    manifest = load_sources()
    assert manifest['batch'] == 9
    assert manifest['total_source_sessions'] == EXPECTED_SOURCE_SESSIONS
    sessions = manifest['sessions']
    assert len(sessions) == EXPECTED_SOURCE_SESSIONS
    session_files = [item['session_file'] for item in sessions]
    assert len(session_files) == len(set(session_files))
    for item in sessions:
        assert item['session_score'] > 0
        assert item['pair_score'] > 0
        assert item['source_prompt'].strip()
        assert item['source_response'].strip()
        model = item['model'].lower()
        assert not any(bad in model for bad in BANNED_MODEL_STRINGS), item['model']


def test_dataset_has_exactly_1000_rows():
    rows = load_rows(DATA_FILE)
    assert len(rows) == EXPECTED_ROWS


def test_ids_are_unique_and_batch_scoped():
    rows = load_rows(DATA_FILE)
    ids = [row['id'] for row in rows]
    assert len(ids) == len(set(ids))
    assert ids[0] == 'timmy-voice-batch09-0001'
    assert ids[-1] == 'timmy-voice-batch09-1000'
    for row in rows:
        assert row['batch'] == 9
        assert row['model'] == 'timmy-voice-batch09'
        assert row['quality_score'] >= 0.80


def test_sharegpt_schema_and_source_mapping():
    rows = load_rows(DATA_FILE)
    manifest = load_sources()
    valid_sources = {item['session_file'] for item in manifest['sessions']}
    counts = Counter()
    prompts_by_source = defaultdict(set)
    for row in rows:
        assert row['source'] == 'session_derived_approved'
        assert row['source_session'] in valid_sources
        counts[row['source_session']] += 1
        conv = row['conversations']
        assert [item['from'] for item in conv] == ['system', 'human', 'gpt']
        assert 'Sovereignty and service always.' in conv[0]['value']
        assert conv[1]['value'].strip()
        assert conv[2]['value'].strip()
        prompts_by_source[row['source_session']].add(conv[1]['value'])
    assert len(counts) == EXPECTED_SOURCE_SESSIONS
    assert set(counts.values()) == {EXPECTED_ROWS_PER_SESSION}
    for source_session, prompts in prompts_by_source.items():
        assert len(prompts) == EXPECTED_ROWS_PER_SESSION, source_session


def test_crisis_prompts_include_required_protocol():
    rows = load_rows(DATA_FILE)
    for row in rows:
        prompt = row['conversations'][1]['value'].lower()
        response = row['conversations'][2]['value']
        if any(term in prompt for term in CRISIS_TERMS):
            assert 'Are you safe right now?' in response
            assert '988' in response


def test_no_credentials_or_private_paths_leak_into_sources_or_rows():
    manifest = load_sources()
    corpus = json.dumps(manifest).lower() + '\n' + DATA_FILE.read_text(encoding='utf-8').lower()
    assert not any(marker in corpus for marker in CREDENTIAL_MARKERS)
    assert not any(marker in corpus for marker in PRIVATE_MARKERS)
    assert not any(marker in corpus for marker in META_MARKERS)
    assert not EMAIL_RE.search(corpus)


def test_readme_documents_generation_contract():
    text = README_FILE.read_text(encoding='utf-8')
    assert '50 source sessions' in text
    assert '20 prompt variations per session' in text
    assert 'approved-model provenance filter' in text
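Locally, the suite should run with a plain pytest invocation from the repository root (assuming the training-data artifacts have already been generated):

pytest tests/test_timmy_voice_batch09.py -q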