Files
compounding-intelligence/scripts/validate_knowledge.py

156 lines
5.0 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Validate knowledge files and index.json against the schema.
Usage:
python scripts/validate_knowledge.py [--fix]
Without --fix: reports errors and exits non-zero if any found.
With --fix: intended to auto-generate missing IDs and update index.json
(not yet implemented — the flag is accepted but nothing is modified).
"""
import json
import sys
import os
from pathlib import Path
from datetime import datetime
# Categories a fact may declare; also the allowed middle segment of an id.
VALID_CATEGORIES = {"fact", "pitfall", "pattern", "tool-quirk", "question"}
# Keys every fact dict must contain.
REQUIRED_FACT_FIELDS = {"id", "fact", "category", "domain", "confidence"}
# Maximum allowed length of the 'fact' text, in characters.
MAX_FACT_LENGTH = 280


def validate_fact(fact: dict, source: str = "") -> list[str]:
    """Validate a single fact dict and return the list of problems found.

    Args:
        fact: The candidate fact. Anything that is not a dict is reported
            as a single error instead of raising.
        source: Label prefixed to every error message (e.g. a file name
            plus index) so the offending entry can be located.

    Returns:
        A list of human-readable error strings; empty when the fact is valid.
        The function never raises on malformed input.
    """
    if not isinstance(fact, dict):
        return [f"{source}: fact must be an object, got {type(fact).__name__}"]

    errors = []

    # Sorted so the report order is stable across runs (set order is not).
    for field in sorted(REQUIRED_FACT_FIELDS):
        if field not in fact:
            errors.append(f"{source}: missing required field '{field}'")

    if "fact" in fact:
        text = fact["fact"]
        if not isinstance(text, str) or not text.strip():
            errors.append(f"{source}: 'fact' must be non-empty string")
        elif len(text) > MAX_FACT_LENGTH:
            errors.append(f"{source}: 'fact' exceeds {MAX_FACT_LENGTH} chars ({len(text)})")

    if "category" in fact:
        category = fact["category"]
        # Type check first: an unhashable value (list/dict) would raise
        # TypeError on the set membership test.
        if not isinstance(category, str) or category not in VALID_CATEGORIES:
            errors.append(f"{source}: invalid category '{category}' — must be one of {VALID_CATEGORIES}")

    if "confidence" in fact:
        confidence = fact["confidence"]
        # bool is a subclass of int, so exclude it explicitly: JSON
        # true/false is not a meaningful confidence score.
        if isinstance(confidence, bool) or not isinstance(confidence, (int, float)):
            errors.append(f"{source}: 'confidence' must be a number")
        elif not (0.0 <= confidence <= 1.0):
            errors.append(f"{source}: 'confidence' must be between 0.0 and 1.0, got {confidence}")

    if "id" in fact:
        fact_id = fact["id"]
        if not isinstance(fact_id, str):
            # Without this guard a numeric/null id would crash on .split().
            errors.append(f"{source}: 'id' must be a string, got {type(fact_id).__name__}")
        else:
            parts = fact_id.split(":")
            if len(parts) != 3:
                errors.append(f"{source}: 'id' must be domain:category:sequence, got '{fact_id}'")
            elif parts[1] not in VALID_CATEGORIES:
                errors.append(f"{source}: id category '{parts[1]}' not in {VALID_CATEGORIES}")

    if "tags" in fact:
        tags = fact["tags"]
        if not isinstance(tags, list):
            errors.append(f"{source}: 'tags' must be a list")
        else:
            for tag in tags:
                # NOTE(review): the check accepts uppercase letters and
                # underscores although the message says "lowercase
                # alphanumeric+hyphens"; left as-is so existing data is not
                # newly rejected — confirm the intended rule.
                if not isinstance(tag, str) or not tag.replace("-", "").replace("_", "").isalnum():
                    errors.append(f"{source}: tag '{tag}' must be lowercase alphanumeric+hyphens")

    return errors
def validate_index(index_path: Path) -> list[str]:
    """Validate the structure and contents of index.json.

    Checks that the file exists, parses as JSON, is a JSON object with a
    'version' field and a list-valued 'facts' field, that every entry in
    'facts' passes validate_fact, and that no id occurs twice.

    Args:
        index_path: Location of index.json.

    Returns:
        A list of error strings; empty when the index is valid. Missing
        file, undecodable/invalid JSON, and a non-object top-level value
        are each reported as a single error and end validation early.
    """
    if not index_path.exists():
        return [f"index.json not found at {index_path}"]

    try:
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        return [f"index.json: invalid JSON — {e}"]
    except UnicodeDecodeError as e:
        return [f"index.json: not valid UTF-8 — {e}"]

    if not isinstance(data, dict):
        # A top-level list/string/number has no fields to inspect.
        return [f"index.json: top-level value must be an object, got {type(data).__name__}"]

    errors = []
    if "version" not in data:
        errors.append("index.json: missing 'version' field")

    facts = []
    if "facts" not in data:
        errors.append("index.json: missing 'facts' field")
    elif not isinstance(data["facts"], list):
        # Report and skip per-fact checks: iterating a non-list would
        # either raise or walk unrelated values (dict keys, characters).
        errors.append("index.json: 'facts' must be a list")
    else:
        facts = data["facts"]

    seen_ids = set()
    for i, fact in enumerate(facts):
        errors.extend(validate_fact(fact, source=f"index.json facts[{i}]"))
        # Only string ids take part in duplicate detection; other types are
        # validate_fact's concern and may not even be hashable.
        fact_id = fact.get("id") if isinstance(fact, dict) else None
        if isinstance(fact_id, str):
            if fact_id in seen_ids:
                errors.append(f"index.json: duplicate id '{fact_id}'")
            seen_ids.add(fact_id)

    return errors
def validate_yaml_facts(facts: list[dict], source: str) -> list[str]:
    """Validate a sequence of facts extracted from a YAML file.

    Each entry is checked with validate_fact, and ids that occur more than
    once within this sequence are reported.

    Args:
        facts: The facts to check. None is treated as an empty sequence
            (an empty YAML document parses to None).
        source: Label used as the prefix of every error message.

    Returns:
        A list of error strings; empty when every fact is valid and all
        ids are unique.
    """
    errors = []
    seen_ids = set()
    for i, fact in enumerate(facts or []):
        errors.extend(validate_fact(fact, source=f"{source}[{i}]"))
        # Only string ids take part in duplicate detection; other types are
        # validate_fact's concern and may not even be hashable.
        fact_id = fact.get("id") if isinstance(fact, dict) else None
        if isinstance(fact_id, str):
            if fact_id in seen_ids:
                errors.append(f"{source}: duplicate id '{fact_id}'")
            seen_ids.add(fact_id)
    return errors
def main():
    """Run all validations, print a report, and exit.

    Validates knowledge/index.json and checks that the expected
    sub-directories of knowledge/ exist. The knowledge directory is located
    relative to this script (two levels up from the script file).

    Exits with status 1 and a list of errors if anything failed, otherwise
    prints a short summary and exits with status 0.
    """
    fix_mode = "--fix" in sys.argv
    if fix_mode:
        # The flag is advertised in the module docstring but no fixing logic
        # exists; tell the user instead of silently ignoring it.
        print("warning: --fix is not implemented; running validation only", file=sys.stderr)

    repo_root = Path(__file__).parent.parent
    knowledge_dir = repo_root / "knowledge"
    index_path = knowledge_dir / "index.json"

    # Validate index.json
    all_errors = list(validate_index(index_path))

    # Validate YAML directories (existence check only — full YAML parsing
    # requires pyyaml).
    for dir_name in ("global", "repos", "agents"):
        if not (knowledge_dir / dir_name).exists():
            all_errors.append(f"knowledge/{dir_name}/ directory not found")

    # Report
    if all_errors:
        print(f"VALIDATION FAILED — {len(all_errors)} error(s):\n")
        for err in all_errors:
            print(f"{err}")
        sys.exit(1)

    # Count facts for the summary. This is best-effort: a failure here must
    # not turn a passed validation into a crash, but it must also not
    # swallow SystemExit/KeyboardInterrupt the way a bare except would.
    try:
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
        fact_count = len(data.get("facts", []))
    except (OSError, ValueError, AttributeError, TypeError):
        fact_count = 0

    print("VALIDATION PASSED")
    print(f" index.json: {fact_count} facts")
    print(" schema: v1")
    sys.exit(0)


if __name__ == "__main__":
    main()