"""Validation tests for the Timmy voice batch 09 training-data artifacts.

The tests check that the generated JSONL dataset, its sources manifest, the
generator script and the README exist and satisfy the batch contract: row
and session counts, id scheme, ShareGPT conversation layout, crisis-response
wording, and absence of credential / private-path / e-mail leaks.
"""

import json
import re
from collections import Counter, defaultdict
from pathlib import Path

# Only the helpers and constants are exported, so a star-import of this module
# does not pull the test functions into another test module's namespace
# (where pytest would collect and run them a second time).
__all__ = [
    'DATA_FILE',
    'SCRIPT_FILE',
    'SOURCES_FILE',
    'README_FILE',
    'EXPECTED_ROWS',
    'EXPECTED_SOURCE_SESSIONS',
    'EXPECTED_ROWS_PER_SESSION',
    'BANNED_MODEL_STRINGS',
    'CRISIS_TERMS',
    'CREDENTIAL_MARKERS',
    'PRIVATE_MARKERS',
    'META_MARKERS',
    'EMAIL_RE',
    'load_rows',
    'load_sources',
]

# Artifact locations, relative to the directory the tests are run from.
DATA_FILE = Path('training-data/timmy-voice-batch09.jsonl')
SCRIPT_FILE = Path('training-data/generate_timmy_voice_batch09.py')
SOURCES_FILE = Path('training-data/timmy-voice-batch09.sources.json')
README_FILE = Path('training-data/README-batch09.md')

# Size contract: EXPECTED_SOURCE_SESSIONS * EXPECTED_ROWS_PER_SESSION rows.
EXPECTED_ROWS = 1000
EXPECTED_SOURCE_SESSIONS = 50
EXPECTED_ROWS_PER_SESSION = 20

# Substrings that must not occur in a manifest entry's lowercased model name.
BANNED_MODEL_STRINGS = [
    'claude',
    'gpt-4',
    'gpt-3',
    'gpt',
    'o1',
    'o3',
    'gemini',
    'anthropic',
    'openai',
    'unknown',
]

# Substrings that mark a prompt as crisis-related. They are matched with
# plain ``in`` against the lowercased prompt, so a term also matches inside a
# longer word (e.g. 'die' inside 'diet').
CRISIS_TERMS = [
    'suicide',
    'kill myself',
    'end my life',
    'overdose',
    'bridge',
    'gun',
    'die',
]

# The marker lists below are all lowercase because they are compared against
# a lowercased copy of the manifest and dataset text.
CREDENTIAL_MARKERS = ['password', 'pass:', 'token', 'api key', 'secret', 'login:']
PRIVATE_MARKERS = [
    '/users/',
    '/private/',
    '~/.timmy',
    '~/.hermes',
    'alexanderwhitestone.com',
]
# Phrases that must not appear anywhere in the corpus (these look like
# agent-harness boilerplate rather than conversation content).
META_MARKERS = [
    'review the conversation above',
    "you've reached the maximum number of tool-calling iterations",
    'without calling any more tools',
    'nothing to save',
    '[silent]',
]

EMAIL_RE = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')


def load_rows(path: Path) -> list:
    """Parse a JSON Lines file into a list of decoded rows.

    Blank and whitespace-only lines are skipped.

    Args:
        path: Location of the ``.jsonl`` file.

    Returns:
        The decoded JSON value of every non-blank line, in file order.

    Raises:
        AssertionError: If ``path`` does not exist.
        json.JSONDecodeError: If a line is not valid JSON; the message is
            prefixed with ``path:line_number``.
    """
    assert path.exists(), f'missing file: {path}'
    rows = []
    # Iterate the file handle instead of calling str.splitlines(): splitlines
    # also breaks on U+2028, U+2029, U+0085 and similar characters, which may
    # legally appear unescaped inside a JSON string and would cut a valid row
    # into invalid fragments. File iteration only splits on real newlines.
    with path.open(encoding='utf-8') as handle:
        for lineno, raw_line in enumerate(handle, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Same exception type, but pointing at the offending row.
                raise json.JSONDecodeError(
                    f'{path}:{lineno}: {err.msg}', err.doc, err.pos
                ) from err
    return rows


def load_sources():
    """Return the decoded contents of the sources manifest (SOURCES_FILE).

    Raises:
        AssertionError: If the manifest file does not exist.
    """
    assert SOURCES_FILE.exists(), f'missing file: {SOURCES_FILE}'
    return json.loads(SOURCES_FILE.read_text(encoding='utf-8'))


def test_batch09_artifacts_exist() -> None:
    """All four batch artifacts are present on disk."""
    for path in [DATA_FILE, SCRIPT_FILE, SOURCES_FILE, README_FILE]:
        assert path.exists(), f'missing artifact: {path}'


def test_sources_manifest_has_50_unique_sessions() -> None:
    """The manifest lists the expected number of distinct, well-formed sessions."""
    manifest = load_sources()
    assert manifest['batch'] == 9
    assert manifest['total_source_sessions'] == EXPECTED_SOURCE_SESSIONS

    sessions = manifest['sessions']
    assert len(sessions) == EXPECTED_SOURCE_SESSIONS

    session_files = [item['session_file'] for item in sessions]
    assert len(session_files) == len(set(session_files))

    for item in sessions:
        assert item['session_score'] > 0
        assert item['pair_score'] > 0
        assert item['source_prompt'].strip()
        assert item['source_response'].strip()
        model = item['model'].lower()
        assert not any(bad in model for bad in BANNED_MODEL_STRINGS), item['model']


def test_dataset_has_exactly_1000_rows() -> None:
    """The dataset contains exactly EXPECTED_ROWS rows."""
    rows = load_rows(DATA_FILE)
    assert len(rows) == EXPECTED_ROWS


def test_ids_are_unique_and_batch_scoped() -> None:
    """Row ids are unique, batch-prefixed, and rows carry batch metadata."""
    rows = load_rows(DATA_FILE)
    # Guard so an empty dataset fails with a clear message rather than an
    # IndexError from ids[0] below.
    assert rows, f'no rows found in {DATA_FILE}'

    ids = [row['id'] for row in rows]
    assert len(ids) == len(set(ids))
    assert ids[0] == 'timmy-voice-batch09-0001'
    assert ids[-1] == 'timmy-voice-batch09-1000'

    for row in rows:
        assert row['batch'] == 9
        assert row['model'] == 'timmy-voice-batch09'
        assert row['quality_score'] >= 0.80


def test_sharegpt_schema_and_source_mapping() -> None:
    """Rows follow the system/human/gpt layout and map evenly onto sessions."""
    rows = load_rows(DATA_FILE)
    manifest = load_sources()
    valid_sources = {item['session_file'] for item in manifest['sessions']}

    counts = Counter()
    prompts_by_source = defaultdict(set)
    for row in rows:
        assert row['source'] == 'session_derived_approved'
        assert row['source_session'] in valid_sources
        counts[row['source_session']] += 1

        conv = row['conversations']
        assert [item['from'] for item in conv] == ['system', 'human', 'gpt']
        assert 'Sovereignty and service always.' in conv[0]['value']
        assert conv[1]['value'].strip()
        assert conv[2]['value'].strip()
        prompts_by_source[row['source_session']].add(conv[1]['value'])

    # Every session contributes the same number of rows ...
    assert len(counts) == EXPECTED_SOURCE_SESSIONS
    assert set(counts.values()) == {EXPECTED_ROWS_PER_SESSION}
    # ... and each of those rows has a distinct prompt.
    for source_session, prompts in prompts_by_source.items():
        assert len(prompts) == EXPECTED_ROWS_PER_SESSION, source_session


def test_crisis_prompts_include_required_protocol() -> None:
    """Responses to crisis-flagged prompts contain the safety check and '988'."""
    rows = load_rows(DATA_FILE)
    for row in rows:
        prompt = row['conversations'][1]['value'].lower()
        response = row['conversations'][2]['value']
        if any(term in prompt for term in CRISIS_TERMS):
            row_id = row.get('id', '<no id>')
            assert 'Are you safe right now?' in response, row_id
            assert '988' in response, row_id


def test_no_credentials_or_private_paths_leak_into_sources_or_rows() -> None:
    """Neither the manifest nor the dataset contains banned markers or e-mails."""
    manifest = load_sources()
    corpus = (
        json.dumps(manifest).lower()
        + '\n'
        + DATA_FILE.read_text(encoding='utf-8').lower()
    )

    # Collect the offending markers so a failure names them instead of only
    # reporting that "something" matched.
    checks = (
        ('credential', CREDENTIAL_MARKERS),
        ('private', PRIVATE_MARKERS),
        ('meta', META_MARKERS),
    )
    for label, markers in checks:
        leaked = [marker for marker in markers if marker in corpus]
        assert not leaked, f'{label} markers found in corpus: {leaked}'

    match = EMAIL_RE.search(corpus)
    assert match is None, f'email address found in corpus: {match.group(0)!r}'


def test_readme_documents_generation_contract() -> None:
    """The README states the session count, variation count and provenance filter."""
    assert README_FILE.exists(), f'missing file: {README_FILE}'
    text = README_FILE.read_text(encoding='utf-8')
    assert '50 source sessions' in text
    assert '20 prompt variations per session' in text
    assert 'approved-model provenance filter' in text