Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 23s
Smoke Test / smoke (pull_request) Failing after 22s
Validate Config / YAML Lint (pull_request) Failing after 16s
Validate Config / JSON Validate (pull_request) Successful in 18s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 55s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 1m2s
Validate Config / Cron Syntax Check (pull_request) Successful in 13s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 13s
Validate Config / Playbook Schema Validation (pull_request) Successful in 25s
Validate Training Data / validate (pull_request) Successful in 21s
Architecture Lint / Lint Repository (pull_request) Failing after 17s
PR Checklist / pr-checklist (pull_request) Failing after 9m48s
Generate a deterministic batch09 Timmy voice corpus with 1,000 ShareGPT prompt-response pairs, 50 approved source sessions, a source manifest, and focused validation tests.
127 lines
4.8 KiB
Python
127 lines
4.8 KiB
Python
import json
|
|
import re
|
|
from collections import Counter, defaultdict
|
|
from pathlib import Path
|
|
|
|
# Paths to the four batch09 deliverables this test suite validates.
DATA_FILE = Path('training-data/timmy-voice-batch09.jsonl')
SCRIPT_FILE = Path('training-data/generate_timmy_voice_batch09.py')
SOURCES_FILE = Path('training-data/timmy-voice-batch09.sources.json')
README_FILE = Path('training-data/README-batch09.md')

# Expected corpus shape: 50 approved sessions x 20 prompt variations = 1000 rows.
EXPECTED_ROWS = 1000
EXPECTED_SOURCE_SESSIONS = 50
EXPECTED_ROWS_PER_SESSION = 20

# Substrings that must not appear in a (lowercased) source-session model name.
# NOTE(review): these are substring matches — 'gpt' already subsumes 'gpt-4'/'gpt-3',
# and 'o1'/'o3' will match any model name containing those two characters; confirm
# the manifest's model names cannot collide accidentally.
BANNED_MODEL_STRINGS = ['claude', 'gpt-4', 'gpt-3', 'gpt', 'o1', 'o3', 'gemini', 'anthropic', 'openai', 'unknown']
# Terms that flag a prompt as crisis-related (matched against the lowercased prompt).
# NOTE(review): substring matching means 'die' also hits 'diet'/'died' etc. — verify
# that is the intended (conservative) behavior.
CRISIS_TERMS = ['suicide', 'kill myself', 'end my life', 'overdose', 'bridge', 'gun', 'die']
# Markers that would indicate leaked credentials in the corpus or manifest.
CREDENTIAL_MARKERS = ['password', 'pass:', 'token', 'api key', 'secret', 'login:']
# Markers that would indicate leaked private filesystem paths or domains.
PRIVATE_MARKERS = ['/users/', '/private/', '~/.timmy', '~/.hermes', 'alexanderwhitestone.com']
# Tool/assistant meta-chatter that must not leak into training text.
META_MARKERS = [
    'review the conversation above',
    "you've reached the maximum number of tool-calling iterations",
    'without calling any more tools',
    'nothing to save',
    '[silent]',
]
# Loose email-address pattern used to detect PII leakage.
EMAIL_RE = re.compile(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}')
|
|
|
|
|
|
def load_rows(path: Path):
    """Parse a JSONL file, returning one object per non-blank line.

    Asserts that *path* exists so a missing artifact fails with a clear message.
    """
    assert path.exists(), f'missing file: {path}'
    raw_lines = path.read_text(encoding='utf-8').splitlines()
    # Skip blank/whitespace-only lines; parse everything else as JSON.
    return [json.loads(stripped) for raw in raw_lines if (stripped := raw.strip())]
|
|
|
|
|
|
def load_sources():
    """Parse and return the batch09 source-session manifest as a dict."""
    assert SOURCES_FILE.exists(), f'missing file: {SOURCES_FILE}'
    manifest_text = SOURCES_FILE.read_text(encoding='utf-8')
    return json.loads(manifest_text)
|
|
|
|
|
|
def test_batch09_artifacts_exist():
    """All four batch09 deliverables must be present on disk."""
    artifacts = (DATA_FILE, SCRIPT_FILE, SOURCES_FILE, README_FILE)
    for artifact in artifacts:
        assert artifact.exists(), f'missing artifact: {artifact}'
|
|
|
|
|
|
def test_sources_manifest_has_50_unique_sessions():
    """Manifest declares batch 9 with exactly 50 unique, well-formed sessions."""
    manifest = load_sources()
    assert manifest['batch'] == 9
    assert manifest['total_source_sessions'] == EXPECTED_SOURCE_SESSIONS

    sessions = manifest['sessions']
    assert len(sessions) == EXPECTED_SOURCE_SESSIONS

    # Every session file may appear at most once.
    filenames = [entry['session_file'] for entry in sessions]
    assert len(set(filenames)) == len(filenames)

    for entry in sessions:
        # Approved sessions carry positive scores and non-empty prompt/response text.
        assert entry['session_score'] > 0
        assert entry['pair_score'] > 0
        assert entry['source_prompt'].strip()
        assert entry['source_response'].strip()
        # Provenance filter: reject any session attributed to a banned model.
        lowered_model = entry['model'].lower()
        assert all(bad not in lowered_model for bad in BANNED_MODEL_STRINGS), entry['model']
|
|
|
|
|
|
def test_dataset_has_exactly_1000_rows():
    """Corpus row count must match the 1,000-pair contract exactly."""
    assert len(load_rows(DATA_FILE)) == EXPECTED_ROWS
|
|
|
|
|
|
def test_ids_are_unique_and_batch_scoped():
    """Row IDs are unique, span 0001..1000 in order, and rows carry batch metadata."""
    rows = load_rows(DATA_FILE)

    row_ids = [row['id'] for row in rows]
    assert len(set(row_ids)) == len(row_ids)
    # IDs are emitted in order, so the endpoints pin the full 0001..1000 range.
    assert row_ids[0] == 'timmy-voice-batch09-0001'
    assert row_ids[-1] == 'timmy-voice-batch09-1000'

    for row in rows:
        assert row['batch'] == 9
        assert row['model'] == 'timmy-voice-batch09'
        # Quality floor enforced at generation time.
        assert row['quality_score'] >= 0.80
|
|
|
|
|
|
def test_sharegpt_schema_and_source_mapping():
    """Rows follow the ShareGPT schema and map evenly onto manifest sessions.

    Checks per row: provenance tag, a known source session, the fixed
    system/human/gpt turn order, the persona motto in the system turn, and
    non-empty prompt/response text. Checks overall: every session contributes
    exactly 20 rows with 20 distinct prompts.
    """
    rows = load_rows(DATA_FILE)
    manifest = load_sources()
    known_sessions = {entry['session_file'] for entry in manifest['sessions']}

    rows_per_session = Counter()
    prompts_per_session = defaultdict(set)

    for row in rows:
        assert row['source'] == 'session_derived_approved'
        assert row['source_session'] in known_sessions
        rows_per_session[row['source_session']] += 1

        conv = row['conversations']
        roles = [turn['from'] for turn in conv]
        assert roles == ['system', 'human', 'gpt']
        # The system turn must embed the fixed persona motto verbatim.
        assert 'Sovereignty and service always.' in conv[0]['value']
        assert conv[1]['value'].strip()
        assert conv[2]['value'].strip()
        prompts_per_session[row['source_session']].add(conv[1]['value'])

    # Every session is represented, each with exactly 20 rows...
    assert len(rows_per_session) == EXPECTED_SOURCE_SESSIONS
    assert set(rows_per_session.values()) == {EXPECTED_ROWS_PER_SESSION}
    # ...and all 20 prompts within a session are distinct.
    for session_file, prompts in prompts_per_session.items():
        assert len(prompts) == EXPECTED_ROWS_PER_SESSION, session_file
|
|
|
|
|
|
def test_crisis_prompts_include_required_protocol():
    """Every crisis-flagged prompt must get the safety check and the 988 lifeline."""
    for row in load_rows(DATA_FILE):
        prompt = row['conversations'][1]['value'].lower()
        # Only rows whose prompt contains a crisis term are held to the protocol.
        if not any(term in prompt for term in CRISIS_TERMS):
            continue
        response = row['conversations'][2]['value']
        assert 'Are you safe right now?' in response
        assert '988' in response
|
|
|
|
|
|
def test_no_credentials_or_private_paths_leak_into_sources_or_rows():
    """Neither the manifest nor the corpus may contain credential markers,
    private paths, tool meta-chatter, or email addresses."""
    manifest_text = json.dumps(load_sources()).lower()
    data_text = DATA_FILE.read_text(encoding='utf-8').lower()
    corpus = manifest_text + '\n' + data_text

    # All marker lists are lowercase, matching the lowercased corpus.
    for marker in (*CREDENTIAL_MARKERS, *PRIVATE_MARKERS, *META_MARKERS):
        assert marker not in corpus
    assert EMAIL_RE.search(corpus) is None
|
|
|
|
|
|
def test_readme_documents_generation_contract():
    """README must document the session count, variation count, and provenance filter."""
    readme = README_FILE.read_text(encoding='utf-8')
    required_phrases = (
        '50 source sessions',
        '20 prompt variations per session',
        'approved-model provenance filter',
    )
    for phrase in required_phrases:
        assert phrase in readme
|