compounding-intelligence/scripts/validate_knowledge.py

#!/usr/bin/env python3
"""
Validate knowledge files and index.json against the schema.

Usage:
    python scripts/validate_knowledge.py
"""

import json
import sys
from pathlib import Path

# Allowed values for a fact's 'category' and for the middle part of its 'id'.
VALID_CATEGORIES = {"fact", "pitfall", "pattern", "tool-quirk", "question"}
# Keys every fact entry must carry.
REQUIRED_FACT_FIELDS = {"id", "fact", "category", "domain", "confidence"}
# Upper bound on the length of the 'fact' text, in characters.
MAX_FACT_LENGTH = 280


def validate_fact(fact, source=""):
    """Validate a single fact entry and return a list of error messages.

    Args:
        fact: The parsed JSON value for one entry; expected to be a dict.
        source: Label prefixed to every message (e.g. ``"facts[3]"``).

    Returns:
        A list of human-readable error strings; empty when the entry is
        valid. Malformed input is always reported as an error string
        rather than raising.
    """
    # Anything but a mapping cannot be inspected field by field; `in` on a
    # string would silently do substring matching and on a number would raise.
    if not isinstance(fact, dict):
        return [f"{source}: fact must be an object, got {type(fact).__name__}"]

    # Sorted so the report order is stable between runs (set iteration order
    # of strings varies with hash randomization).
    errors = [
        f"{source}: missing required field '{field}'"
        for field in sorted(REQUIRED_FACT_FIELDS)
        if field not in fact
    ]

    if "fact" in fact:
        text = fact["fact"]
        if not isinstance(text, str) or not text.strip():
            errors.append(f"{source}: 'fact' must be non-empty string")
        elif len(text) > MAX_FACT_LENGTH:
            errors.append(f"{source}: 'fact' exceeds {MAX_FACT_LENGTH} chars")

    if "category" in fact:
        category = fact["category"]
        # The isinstance guard keeps unhashable values (lists, dicts) from
        # raising TypeError in the set membership test.
        if not isinstance(category, str) or category not in VALID_CATEGORIES:
            errors.append(f"{source}: invalid category '{category}'")

    if "confidence" in fact:
        confidence = fact["confidence"]
        # bool is a subclass of int, so JSON true/false must be excluded
        # explicitly.
        if isinstance(confidence, bool) or not isinstance(confidence, (int, float)):
            errors.append(f"{source}: 'confidence' must be a number")
        elif not (0.0 <= confidence <= 1.0):
            errors.append(f"{source}: 'confidence' must be 0.0-1.0")

    if "id" in fact:
        fact_id = fact["id"]
        if not isinstance(fact_id, str):
            errors.append(f"{source}: 'id' must be a string")
        else:
            parts = fact_id.split(":")
            if len(parts) != 3:
                errors.append(f"{source}: 'id' must be domain:category:sequence")
            elif parts[1] not in VALID_CATEGORIES:
                errors.append(f"{source}: id category '{parts[1]}' invalid")

    return errors


def _check_index(data):
    """Collect schema errors for the parsed contents of index.json."""
    if not isinstance(data, dict):
        return ["index.json: top-level value must be an object"]

    errors = []
    if "version" not in data:
        errors.append("index.json: missing 'version'")

    facts = data.get("facts")
    if not isinstance(facts, list):
        errors.append("index.json: missing or invalid 'facts'")
        # Nothing sensible to iterate: a string would yield characters and a
        # dict would yield keys.
        return errors

    seen_ids = set()
    for i, fact in enumerate(facts):
        # Per-entry validation and the id lookup below both need a mapping,
        # so reject anything else here.
        if not isinstance(fact, dict):
            errors.append(
                f"facts[{i}]: fact must be an object, got {type(fact).__name__}"
            )
            continue
        errors.extend(validate_fact(fact, f"facts[{i}]"))
        fact_id = fact.get("id")
        # Only string ids take part in duplicate detection; other types are
        # not valid ids and may be unhashable.
        if isinstance(fact_id, str):
            if fact_id in seen_ids:
                errors.append(f"index.json: duplicate id '{fact_id}'")
            seen_ids.add(fact_id)
    return errors


def main():
    """Validate knowledge/index.json and exit with a status code.

    The index is located relative to this script, at
    ``<script dir>/../knowledge/index.json``. A report is printed to stdout
    and the process exits with status 0 when the index is valid, or 1 when
    the file is missing, unreadable, not valid JSON, or fails validation.
    """
    repo_root = Path(__file__).parent.parent
    index_path = repo_root / "knowledge" / "index.json"

    if not index_path.exists():
        print(f"VALIDATION FAILED: index.json not found at {index_path}")
        sys.exit(1)

    try:
        # Explicit UTF-8 so the result does not depend on the platform's
        # default locale encoding.
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"VALIDATION FAILED: could not read {index_path}: {exc}")
        sys.exit(1)

    all_errors = _check_index(data)
    if all_errors:
        print(f"VALIDATION FAILED - {len(all_errors)} error(s):\n")
        for e in all_errors:
            print(f"  x {e}")
        sys.exit(1)

    # Reaching this point means 'facts' is a list and 'version' is present.
    print(f"VALIDATION PASSED - {len(data['facts'])} facts, schema v{data['version']}")
    sys.exit(0)


# Run the validation only when executed as a script; main() always ends the
# process via sys.exit().
if __name__ == "__main__":
    main()