#!/usr/bin/env python3
"""
Validate knowledge files and index.json against the schema.

Usage:
    python scripts/validate_knowledge.py

Exits with status 0 when validation passes and 1 when it fails.
"""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
# Schema constants used by validate_fact().
VALID_CATEGORIES = {"fact", "pitfall", "pattern", "tool-quirk", "question"}
REQUIRED_FACT_FIELDS = {"id", "fact", "category", "domain", "confidence"}
MAX_FACT_LENGTH = 280


def validate_fact(fact, source=""):
    """Check a single fact entry and return a list of error messages.

    Args:
        fact: The decoded fact entry; expected to be a dict.
        source: Label prefixed to every message (e.g. ``"facts[3]"``).

    Returns:
        A list of human-readable error strings; empty when the entry is valid.
        Malformed values are reported as errors rather than raised.
    """
    if not isinstance(fact, dict):
        # Nothing else can be checked on a non-mapping entry.
        return [f"{source}: fact must be an object, got {type(fact).__name__}"]

    errors = []

    # Sorted so the messages come out in the same order on every run
    # (iteration order of a set of strings is not stable across processes).
    for field in sorted(REQUIRED_FACT_FIELDS):
        if field not in fact:
            errors.append(f"{source}: missing required field '{field}'")

    if "fact" in fact:
        text = fact["fact"]
        if not isinstance(text, str) or not text.strip():
            errors.append(f"{source}: 'fact' must be non-empty string")
        elif len(text) > MAX_FACT_LENGTH:
            errors.append(f"{source}: 'fact' exceeds {MAX_FACT_LENGTH} chars")

    if "category" in fact:
        category = fact["category"]
        # The isinstance check keeps unhashable values (lists, dicts) from
        # raising TypeError in the set-membership test.
        if not isinstance(category, str) or category not in VALID_CATEGORIES:
            errors.append(f"{source}: invalid category '{category}'")

    if "confidence" in fact:
        confidence = fact["confidence"]
        # bool is a subclass of int, so JSON true/false must be excluded
        # explicitly or it would pass as 1/0.
        is_number = isinstance(confidence, (int, float)) and not isinstance(
            confidence, bool
        )
        if not is_number:
            errors.append(f"{source}: 'confidence' must be a number")
        elif not (0.0 <= confidence <= 1.0):
            errors.append(f"{source}: 'confidence' must be 0.0-1.0")

    if "id" in fact:
        fact_id = fact["id"]
        # A non-string id has no parts, so it is reported as a format error
        # instead of crashing on .split().
        parts = fact_id.split(":") if isinstance(fact_id, str) else []
        if len(parts) != 3:
            errors.append(f"{source}: 'id' must be domain:category:sequence")
        elif parts[1] not in VALID_CATEGORIES:
            errors.append(f"{source}: id category '{parts[1]}' invalid")

    return errors
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """Validate ``knowledge/index.json`` and exit with a status code.

    The index is located relative to this script (``<repo>/knowledge/index.json``,
    where ``<repo>`` is the parent of the script's directory). All problems
    found are printed to stdout.

    Exits:
        0 when the index passes validation, 1 when the file is missing,
        unreadable, not valid JSON, or fails any schema check.
    """
    # resolve() first: a relative __file__ such as "validate_knowledge.py"
    # would otherwise make .parent.parent collapse to ".".
    repo_root = Path(__file__).resolve().parent.parent
    index_path = repo_root / "knowledge" / "index.json"

    if not index_path.exists():
        print(f"VALIDATION FAILED: index.json not found at {index_path}")
        sys.exit(1)

    try:
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"VALIDATION FAILED: could not read index.json: {exc}")
        sys.exit(1)

    all_errors = []
    facts = []
    version = "?"

    if not isinstance(data, dict):
        all_errors.append("index.json: top-level value must be an object")
    else:
        if "version" not in data:
            all_errors.append("index.json: missing 'version'")
        else:
            version = data["version"]

        if isinstance(data.get("facts"), list):
            facts = data["facts"]
        else:
            # Do not iterate a non-list 'facts'; it is already reported here.
            all_errors.append("index.json: missing or invalid 'facts'")

    seen_ids = set()
    for i, fact in enumerate(facts):
        if not isinstance(fact, dict):
            all_errors.append(f"facts[{i}]: fact must be an object")
            continue

        all_errors.extend(validate_fact(fact, f"facts[{i}]"))

        # Only string ids take part in duplicate detection; other types may
        # be unhashable and cannot be stored in the set.
        fact_id = fact.get("id")
        if isinstance(fact_id, str):
            if fact_id in seen_ids:
                all_errors.append(f"index.json: duplicate id '{fact_id}'")
            seen_ids.add(fact_id)

    if all_errors:
        print(f"VALIDATION FAILED - {len(all_errors)} error(s):\n")
        for e in all_errors:
            print(f" x {e}")
        sys.exit(1)

    print(f"VALIDATION PASSED - {len(facts)} facts, schema v{version}")
    sys.exit(0)
|
||
|
|
|
||
|
|
|
||
|
|
# Run the validator only when this file is executed as a script.
if __name__ == "__main__":
    main()
|