test: verify identity attacks corpus already on main (#616)
Some checks failed
Architecture Lint / Linter Tests (pull_request) Successful in 24s
Smoke Test / smoke (pull_request) Failing after 23s
Validate Config / YAML Lint (pull_request) Failing after 14s
Validate Config / JSON Validate (pull_request) Successful in 16s
Validate Config / Python Syntax & Import Check (pull_request) Failing after 52s
Validate Config / Python Test Suite (pull_request) Has been skipped
Validate Config / Shell Script Lint (pull_request) Failing after 54s
Validate Config / Cron Syntax Check (pull_request) Successful in 11s
Validate Config / Deploy Script Dry Run (pull_request) Successful in 14s
Validate Config / Playbook Schema Validation (pull_request) Successful in 28s
Validate Training Data / validate (pull_request) Successful in 23s
Architecture Lint / Lint Repository (pull_request) Failing after 25s
PR Checklist / pr-checklist (pull_request) Successful in 3m59s

Document that evaluations/adversary/corpora/identity_attacks_200.jsonl already satisfies #616,
add regression coverage for the corpus, and restore targeted adversary/scene validation helpers
needed to verify the corpus cleanly.

Closes #616
This commit is contained in:
Alexander Whitestone
2026-04-22 10:46:25 -04:00
parent ae8c1d46ae
commit 2ec8c556d7
5 changed files with 175 additions and 10 deletions

View File

@@ -18,11 +18,22 @@ import sys
from pathlib import Path
DEFAULT_SCHEMA_PATH = Path(__file__).resolve().parent.parent / "training-data" / "schema.json"
_DEFAULT_SCHEMA_CACHE = None
def load_schema(path: "str | Path") -> dict:
    """Load and parse a JSON schema document.

    Args:
        path: Filesystem path to the schema file. Accepts either a string
            or a ``pathlib.Path`` (the module passes ``DEFAULT_SCHEMA_PATH``,
            which is a ``Path``).

    Returns:
        The parsed schema as a dict.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file contents are not valid JSON.
    """
    # JSON is defined as UTF-8 (RFC 8259); pin the encoding so parsing does
    # not depend on the platform's default locale encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
def load_default_schema() -> dict:
    """Return the bundled default schema, parsing it at most once.

    The parsed schema is memoized in the module-level
    ``_DEFAULT_SCHEMA_CACHE`` so repeated callers share one dict.
    """
    global _DEFAULT_SCHEMA_CACHE
    cached = _DEFAULT_SCHEMA_CACHE
    if cached is None:
        # First call: read the schema from disk and remember it.
        cached = load_schema(DEFAULT_SCHEMA_PATH)
        _DEFAULT_SCHEMA_CACHE = cached
    return cached
def _check(val, spec, loc, path):
"""Check a value against a schema property. Returns list of error strings."""
errors = []
@@ -39,7 +50,10 @@ def _check(val, spec, loc, path):
if not isinstance(val, str):
errors.append(f"{loc}: '{path}' expected string, got {type(val).__name__}")
elif spec.get("minLength") and len(val) < spec["minLength"]:
errors.append(f"{loc}: '{path}' is empty (min {spec['minLength']} chars)")
if len(val) == 0:
errors.append(f"{loc}: '{path}' is empty (min {spec['minLength']} chars)")
else:
errors.append(f"{loc}: '{path}' is too short (min {spec['minLength']} chars)")
elif spec.get("pattern") and not re.match(spec["pattern"], val):
errors.append(f"{loc}: '{path}'='{val}' doesn't match {spec['pattern']}")
elif t == "number":
@@ -50,6 +64,8 @@ def _check(val, spec, loc, path):
elif t == "integer":
if not isinstance(val, int) or isinstance(val, bool):
errors.append(f"{loc}: '{path}' expected integer, got {type(val).__name__}")
elif "minimum" in spec and val < spec["minimum"]:
errors.append(f"{loc}: '{path}'={val} below minimum {spec['minimum']}")
elif t == "array":
if not isinstance(val, list):
errors.append(f"{loc}: '{path}' expected array, got {type(val).__name__}")
@@ -96,6 +112,29 @@ def validate_entry(entry, schema, line_num, file_name):
return errors
def validate_entry_manual(entry, line_num, schema=None, file_name="<memory>"):
    """Backwards-compatible helper used by tests and manual validation flows."""
    if schema is None:
        schema = load_default_schema()

    # Work on a shallow copy so the caller's entry is never mutated.
    candidate = dict(entry)

    # Legacy entries may carry scene attributes at the top level; fold them
    # into the nested scene object before schema validation.
    scene = candidate.get("scene")
    if isinstance(scene, dict):
        merged_scene = dict(scene)
        scene_fields = ("mood", "colors", "composition", "camera", "camera_movement", "description")
        for extra_required in scene_fields:
            if extra_required in candidate:
                merged_scene[extra_required] = candidate.pop(extra_required)
        candidate["scene"] = merged_scene

    errors = validate_entry(candidate, schema, line_num, file_name)

    # These fields are required by the manual flow even when the schema
    # itself does not demand them.
    for extra_required in ("artist", "timestamp"):
        if extra_required in candidate:
            continue
        errors.append(f"{file_name}:{line_num}: missing required field '{extra_required}'")
    return errors
def validate_file(path, schema):
errors = []
count = 0