From c07201298323e357428a72860df8f9b92858a491 Mon Sep 17 00:00:00 2001
From: Alexander Whitestone
Date: Wed, 22 Apr 2026 10:25:06 -0400
Subject: [PATCH] docs: verify issue #600 visual scenes dataset

Add a verification note and regression test showing that
training/data/prompt-enhancement/visual-scenes-500.jsonl already exists
on main with 500 valid records.
---
Review note: line breaks and indentation were reconstructed from a
newline-collapsed copy of this patch and the added files were lightly
edited (explicit UTF-8 and missing-key tolerance in the verification
script, docstrings and comments in the test module). The blob ids on the
`index` lines below were not regenerated.

 docs/issue-600-verification.md               | 41 ++++++++++++++
 .../test_prompt_enhancement_visual_scenes.py | 55 +++++++++++++++++++
 2 files changed, 96 insertions(+)
 create mode 100644 docs/issue-600-verification.md
 create mode 100644 tests/test_prompt_enhancement_visual_scenes.py

diff --git a/docs/issue-600-verification.md b/docs/issue-600-verification.md
new file mode 100644
index 00000000..71774eaa
--- /dev/null
+++ b/docs/issue-600-verification.md
@@ -0,0 +1,41 @@
+# Issue #600 Verification
+
+Status: already implemented on `main`.
+
+Issue: Prompt Enhancement: Visual Scenes — 500 Terse→Rich Pairs
+
+What is already present on `main`
+- `training/data/prompt-enhancement/visual-scenes-500.jsonl`
+- 500 JSONL records
+- every record includes `terse`, `rich`, and `domain`
+- every `domain` value is `visual scenes`
+- 500/500 full records are unique
+
+Evidence gathered from a fresh clone
+- `python3` validation against `training/data/prompt-enhancement/visual-scenes-500.jsonl` returned:
+  - `count = 500`
+  - `unique_records = 500`
+  - `unique_terse = 435`
+  - `domains = ['visual scenes']`
+  - `missing_keys = 0`
+- remote branch `fix/600` still exists from closed PR #731 (`feat: 500 visual scene prompt enhancement pairs (#600)`)
+- SHA-256 of `training/data/prompt-enhancement/visual-scenes-500.jsonl` on `origin/main` matches the same file on `fix/600`, which shows the requested dataset is already present on `main`
+
+Verification commands
+```bash
+python3 - <<'PY'
+import json
+from pathlib import Path
+path = Path('training/data/prompt-enhancement/visual-scenes-500.jsonl')
+records = [json.loads(line) for line in path.read_text(encoding='utf-8').splitlines() if line.strip()]
+print('count', len(records))
+print('unique_records', len({json.dumps(r, sort_keys=True) for r in records}))
+print('unique_terse', len({r.get('terse') for r in records}))
+print('domains', sorted({str(r.get('domain')) for r in records}))
+print('missing_keys', sum(any(k not in r or not str(r[k]).strip() for k in ('terse', 'rich', 'domain')) for r in records))
+PY
+```
+
+Recommendation
+- Close issue #600 as already implemented on `main`.
+- This branch only adds a durable verification note and regression test so the zombie issue can be closed cleanly without regenerating duplicate training data.
diff --git a/tests/test_prompt_enhancement_visual_scenes.py b/tests/test_prompt_enhancement_visual_scenes.py
new file mode 100644
index 00000000..2eccf9f4
--- /dev/null
+++ b/tests/test_prompt_enhancement_visual_scenes.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python3
+"""Verification tests for timmy-config issue #600."""
+
+import json
+import unittest
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parent.parent
+DATA_PATH = ROOT / "training" / "data" / "prompt-enhancement" / "visual-scenes-500.jsonl"
+DOC_PATH = ROOT / "docs" / "issue-600-verification.md"
+
+
+def load_records():
+    """Return the parsed JSON value from every non-blank line of DATA_PATH."""
+    with DATA_PATH.open("r", encoding="utf-8") as handle:
+        return [json.loads(line) for line in handle if line.strip()]
+
+
+class TestVisualScenesPromptEnhancement(unittest.TestCase):
+    """Checks the visual-scenes dataset and its verification note."""
+
+    def test_dataset_exists(self):
+        self.assertTrue(DATA_PATH.exists(), f"Missing dataset: {DATA_PATH}")
+
+    def test_dataset_has_500_records(self):
+        self.assertEqual(len(load_records()), 500)
+
+    def test_schema_and_domain(self):
+        # idx counts non-blank records, so it equals the file line number
+        # only while the dataset contains no blank lines.
+        for idx, record in enumerate(load_records(), start=1):
+            for key in ("terse", "rich", "domain"):
+                self.assertIn(key, record, f"line {idx}: missing {key}")
+                self.assertIsInstance(record[key], str, f"line {idx}: {key} must be a string")
+                self.assertTrue(record[key].strip(), f"line {idx}: {key} is empty")
+            self.assertEqual(record["domain"], "visual scenes", f"line {idx}: wrong domain")
+
+    def test_records_are_unique(self):
+        # Serialize with sorted keys so that records differing only in key
+        # order still count as duplicates.
+        records = load_records()
+        serialized = [json.dumps(record, sort_keys=True) for record in records]
+        self.assertEqual(len(serialized), len(set(serialized)), "duplicate full records found")
+
+    def test_verification_doc_exists_and_mentions_evidence(self):
+        self.assertTrue(DOC_PATH.exists(), f"Missing verification doc: {DOC_PATH}")
+        text = DOC_PATH.read_text(encoding="utf-8")
+        self.assertIn("Issue #600 Verification", text)
+        self.assertIn("training/data/prompt-enhancement/visual-scenes-500.jsonl", text)
+        self.assertIn("PR #731", text)
+        self.assertIn("already implemented on `main`", text)
+
+
+if __name__ == "__main__":
+    unittest.main()