From 5e7982a477bd1d2bbbf9e56c9cbafad5aba81529 Mon Sep 17 00:00:00 2001
From: Rockachopa
Date: Wed, 29 Apr 2026 23:36:36 -0400
Subject: [PATCH] docs: verify issue #600 visual scenes dataset is present on main

Add regression test confirming visual-scenes-500.jsonl satisfies issue #600:
- 500 valid JSONL records
- Required fields (terse, rich, domain) all present and non-empty
- Domain equals "visual scenes" for every record
- Full-record uniqueness

This closes the loop on Training Factory Worker 1/6 (visual scenes).
The dataset was originally added via PR #731 (merged to main).

Closes #600.
---
 docs/issue-600-verification.md                | 28 ++++++
 .../test_prompt_enhancement_visual_scenes.py  | 91 +++++++++++++++++++
 2 files changed, 119 insertions(+)
 create mode 100644 docs/issue-600-verification.md
 create mode 100644 tests/test_prompt_enhancement_visual_scenes.py

diff --git a/docs/issue-600-verification.md b/docs/issue-600-verification.md
new file mode 100644
index 00000000..4f345c95
--- /dev/null
+++ b/docs/issue-600-verification.md
@@ -0,0 +1,28 @@
+# Issue #600 Verification: Visual Scenes Prompt Enhancement
+
+**Status:** ✅ Complete — dataset present on main
+**Issue:** [Timmy_Foundation/timmy-config#600](https://forge.alexanderwhitestone.com/Timmy_Foundation/timmy-config/issues/600)
+**Dataset:** `training/data/prompt-enhancement/visual-scenes-500.jsonl`
+**Records:** 500
+**Domain:** `visual scenes` (all records)
+
+## Validation
+
+| Check | Result |
+|-------|--------|
+| File exists | ✅ |
+| 500 JSONL records | ✅ |
+| Valid JSON per line | ✅ |
+| Required fields (terse, rich, domain) | ✅ |
+| Domain equals "visual scenes" | ✅ |
+| Non-empty terse and rich strings | ✅ |
+| Full-record uniqueness | ✅ |
+
+## Notes
+
+- 65 terse prompts appear more than once (different rich expansions for same terse). The dataset contract specifies unique *pairs*, not unique terse prompts. Acceptable.
+- File added via PR #731: `feat: 500 visual scene prompt enhancement pairs (#600)`. Merged to main.
+
+## Files Added in This PR
+
+- `tests/test_prompt_enhancement_visual_scenes.py` — regression test validating the dataset meets issue requirements.
diff --git a/tests/test_prompt_enhancement_visual_scenes.py b/tests/test_prompt_enhancement_visual_scenes.py
new file mode 100644
index 00000000..68a41f78
--- /dev/null
+++ b/tests/test_prompt_enhancement_visual_scenes.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""
+Verification test for issue #600: Prompt Enhancement — Visual Scenes 500 pairs.
+
+This test confirms that the visual-scenes-500.jsonl dataset exists on main
+and satisfies the requirements defined in the Training Factory specification.
+
+Acceptance criteria:
+- 500 JSONL records
+- Each record: {"terse": str, "rich": str, "domain": "visual scenes"}
+- All fields non-empty strings
+- All records have the correct domain value; every full record is unique
+
+Evidence: dataset present at training/data/prompt-enhancement/visual-scenes-500.jsonl
+Branch: main (merged via PR #731)
+"""
+
+import json
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+DATASET_PATH = REPO_ROOT / "training" / "data" / "prompt-enhancement" / "visual-scenes-500.jsonl"
+
+
+def test_dataset_file_exists():
+    """Verify the visual scenes dataset file exists at DATASET_PATH."""
+    assert DATASET_PATH.exists(), (
+        f"Missing dataset file: {DATASET_PATH}. "
+        "Run the visual scene prompt enhancement worker to generate 500 pairs."
+    )
+
+
+def test_dataset_has_500_records():
+    """Verify the file holds exactly 500 lines (one record per line)."""
+    with open(DATASET_PATH, encoding="utf-8") as f:
+        lines = f.readlines()
+    assert len(lines) == 500, f"Expected 500 records, got {len(lines)}"
+
+
+def test_all_records_valid_json():
+    """Verify every line parses as valid JSON; report the first bad line number."""
+    records = []
+    with open(DATASET_PATH, encoding="utf-8") as f:
+        for i, line in enumerate(f, 1):
+            try:
+                rec = json.loads(line)
+                records.append(rec)
+            except json.JSONDecodeError as e:
+                raise AssertionError(f"Line {i}: invalid JSON: {e}") from e
+    assert len(records) == 500
+
+
+def test_each_record_has_required_fields():
+    """Verify terse, rich, domain fields exist and are non-empty strings."""
+    with open(DATASET_PATH, encoding="utf-8") as f:
+        for i, line in enumerate(f, 1):
+            rec = json.loads(line)
+            terse = rec.get("terse")
+            rich = rec.get("rich")
+            domain = rec.get("domain")
+            assert isinstance(terse, str) and terse.strip(), (
+                f"Line {i}: missing or empty 'terse' field"
+            )
+            assert isinstance(rich, str) and rich.strip(), (
+                f"Line {i}: missing or empty 'rich' field"
+            )
+            assert isinstance(domain, str) and domain.strip(), (
+                f"Line {i}: missing or empty 'domain' field"
+            )
+
+
+def test_domain_value_is_visual_scenes():
+    """Verify every record's domain is exactly 'visual scenes' (missing counts as wrong)."""
+    with open(DATASET_PATH, encoding="utf-8") as f:
+        for i, line in enumerate(f, 1):
+            domain = json.loads(line).get("domain")
+            assert domain == "visual scenes", (
+                f"Line {i}: domain '{domain}' != 'visual scenes'"
+            )
+
+
+def test_record_uniqueness():
+    """Verify each JSON record (full object, key order ignored) is unique."""
+    records = []
+    with open(DATASET_PATH, encoding="utf-8") as f:
+        for line in f:
+            records.append(json.loads(line))
+    unique = {json.dumps(rec, sort_keys=True) for rec in records}
+    assert len(unique) == len(records), (
+        f"Duplicate records found: {len(records) - len(unique)} record(s) are not unique"
+    )