Files
compounding-intelligence/scripts/validate_knowledge.py

156 lines
5.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Validate knowledge files and index.json against the schema.
Usage:
python scripts/validate_knowledge.py [--fix]
Without --fix: reports errors and exits non-zero if any found.
With --fix: auto-generates missing IDs and updates index.json.
"""
import json
import sys
import os
from pathlib import Path
from datetime import datetime
# Closed set of categories a fact (and the middle segment of its id) may use.
VALID_CATEGORIES = {"fact", "pitfall", "pattern", "tool-quirk", "question"}
# Keys every fact entry must carry.
REQUIRED_FACT_FIELDS = {"id", "fact", "category", "domain", "confidence"}
# Upper bound on the length of the 'fact' text, in characters.
MAX_FACT_LENGTH = 280


def validate_fact(fact: dict, source: str = "") -> list[str]:
    """Validate a single fact dict and return a list of error messages.

    The function never raises on malformed input: wrong types are reported
    as validation errors so one bad entry cannot abort the whole run.

    Args:
        fact: The parsed fact entry (normally a dict loaded from JSON/YAML).
        source: Label prefixed to every message, e.g. ``index.json facts[3]``.

    Returns:
        A list of human-readable error strings; empty when the fact is valid.
    """
    if not isinstance(fact, dict):
        return [f"{source}: fact must be an object, got {type(fact).__name__}"]

    errors = []

    # Sorted so the order of "missing field" messages is stable between runs
    # (iteration order of a set of strings varies with hash randomization).
    for field in sorted(REQUIRED_FACT_FIELDS):
        if field not in fact:
            errors.append(f"{source}: missing required field '{field}'")

    if "fact" in fact:
        text = fact["fact"]
        if not isinstance(text, str) or not text.strip():
            errors.append(f"{source}: 'fact' must be non-empty string")
        elif len(text) > MAX_FACT_LENGTH:
            errors.append(
                f"{source}: 'fact' exceeds {MAX_FACT_LENGTH} chars ({len(text)})"
            )

    if "category" in fact:
        category = fact["category"]
        # Type check first: an unhashable value (list/dict) would raise
        # TypeError on set membership instead of producing an error message.
        if not isinstance(category, str) or category not in VALID_CATEGORIES:
            errors.append(
                f"{source}: invalid category '{category}' — must be one of {VALID_CATEGORIES}"
            )

    if "confidence" in fact:
        confidence = fact["confidence"]
        # bool is a subclass of int, so True/False must be rejected explicitly.
        if isinstance(confidence, bool) or not isinstance(confidence, (int, float)):
            errors.append(f"{source}: 'confidence' must be a number")
        elif not (0.0 <= confidence <= 1.0):
            errors.append(
                f"{source}: 'confidence' must be between 0.0 and 1.0, got {confidence}"
            )

    if "id" in fact:
        fact_id = fact["id"]
        if not isinstance(fact_id, str):
            errors.append(
                f"{source}: 'id' must be a string, got {type(fact_id).__name__}"
            )
        else:
            parts = fact_id.split(":")
            if len(parts) != 3:
                errors.append(
                    f"{source}: 'id' must be domain:category:sequence, got '{fact_id}'"
                )
            elif parts[1] not in VALID_CATEGORIES:
                errors.append(
                    f"{source}: id category '{parts[1]}' not in {VALID_CATEGORIES}"
                )

    if "tags" in fact:
        tags = fact["tags"]
        if not isinstance(tags, list):
            errors.append(f"{source}: 'tags' must be a list")
        else:
            for tag in tags:
                # NOTE(review): the message says "lowercase ... +hyphens" but
                # the check also accepts uppercase and underscores. Confirm
                # the intended rule before tightening either side.
                if not isinstance(tag, str) or not tag.replace("-", "").replace("_", "").isalnum():
                    errors.append(
                        f"{source}: tag '{tag}' must be lowercase alphanumeric+hyphens"
                    )

    return errors
def validate_index(index_path: Path) -> list[str]:
    """Validate index.json and return a list of error messages.

    Checks that the file exists and parses as JSON, that the top-level value
    is an object with ``version`` and a list-valued ``facts``, that every
    fact passes ``validate_fact``, and that no id occurs twice.

    Args:
        index_path: Location of the index.json file.

    Returns:
        A list of human-readable error strings; empty when the index is valid.
        Structural problems (unreadable file, wrong top-level type, missing or
        non-list ``facts``) end validation early, since per-fact checks cannot
        run meaningfully without a list of facts.
    """
    if not index_path.exists():
        return [f"index.json not found at {index_path}"]

    try:
        # Explicit UTF-8 so the result does not depend on the platform's
        # locale default encoding.
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        return [f"index.json: invalid JSON — {e}"]
    except (OSError, UnicodeDecodeError) as e:
        return [f"index.json: could not read file — {e}"]

    if not isinstance(data, dict):
        return [
            f"index.json: top-level value must be an object, got {type(data).__name__}"
        ]

    errors = []
    if "version" not in data:
        errors.append("index.json: missing 'version' field")

    if "facts" not in data:
        errors.append("index.json: missing 'facts' field")
        return errors

    facts = data["facts"]
    if not isinstance(facts, list):
        # Do not iterate a non-list: a dict would yield its keys and a string
        # its characters, producing misleading per-fact errors or a crash.
        errors.append("index.json: 'facts' must be a list")
        return errors

    seen_ids = set()
    for i, fact in enumerate(facts):
        source = f"index.json facts[{i}]"
        if not isinstance(fact, dict):
            errors.append(
                f"{source}: fact must be an object, got {type(fact).__name__}"
            )
            continue

        errors.extend(validate_fact(fact, source=source))

        # Only string ids take part in duplicate detection; other types may
        # be unhashable and cannot be tracked in a set.
        fact_id = fact.get("id")
        if isinstance(fact_id, str):
            if fact_id in seen_ids:
                errors.append(f"index.json: duplicate id '{fact_id}'")
            seen_ids.add(fact_id)

    return errors
def validate_yaml_facts(facts: list[dict], source: str) -> list[str]:
    """Validate facts extracted from a YAML file.

    Each entry is checked with ``validate_fact`` and string ids are checked
    for duplicates within this one list. Malformed input (a non-list
    container or non-mapping entries) is reported as an error, not raised.

    Args:
        facts: The fact entries parsed from the YAML file.
        source: Label for the file, used as the prefix of every message.

    Returns:
        A list of human-readable error strings; empty when all facts are valid.
    """
    if not isinstance(facts, list):
        return [f"{source}: facts must be a list, got {type(facts).__name__}"]

    errors = []
    seen_ids = set()
    for i, fact in enumerate(facts):
        entry = f"{source}[{i}]"
        if not isinstance(fact, dict):
            errors.append(
                f"{entry}: fact must be a mapping, got {type(fact).__name__}"
            )
            continue

        errors.extend(validate_fact(fact, source=entry))

        # Only string ids take part in duplicate detection; other types may
        # be unhashable and cannot be tracked in a set.
        fact_id = fact.get("id")
        if isinstance(fact_id, str):
            if fact_id in seen_ids:
                errors.append(f"{source}: duplicate id '{fact_id}'")
            seen_ids.add(fact_id)

    return errors
def main():
    """Run all validations, print a report, and exit.

    Validates ``knowledge/index.json`` and checks that the expected
    ``knowledge/`` subdirectories exist, relative to the repository root
    (the parent of this script's directory).

    Exits with status 1 when any error was found, otherwise 0.
    """
    fix_mode = "--fix" in sys.argv
    if fix_mode:
        # The flag is documented but has no implementation; say so instead of
        # silently behaving like a plain validation run.
        print(
            "warning: --fix is not implemented yet; running in report-only mode",
            file=sys.stderr,
        )

    repo_root = Path(__file__).parent.parent
    knowledge_dir = repo_root / "knowledge"
    index_path = knowledge_dir / "index.json"

    all_errors = []

    # Validate index.json
    all_errors.extend(validate_index(index_path))

    # Validate YAML files (basic existence check — full YAML parsing requires pyyaml)
    for dir_name in ("global", "repos", "agents"):
        if not (knowledge_dir / dir_name).exists():
            all_errors.append(f"knowledge/{dir_name}/ directory not found")

    # Report
    if all_errors:
        print(f"VALIDATION FAILED — {len(all_errors)} error(s):\n")
        for err in all_errors:
            print(f"{err}")
        sys.exit(1)

    # Count facts for the summary. Best-effort: fall back to 0 on an
    # unreadable or malformed file, without a bare except that would also
    # swallow KeyboardInterrupt/SystemExit.
    try:
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
        fact_count = len(data.get("facts", []))
    except (OSError, ValueError, AttributeError, TypeError):
        fact_count = 0

    print("VALIDATION PASSED")
    print(f" index.json: {fact_count} facts")
    print(" schema: v1")
    sys.exit(0)
# Run the validator only when this file is executed as a script, so that
# importing the module (e.g. to reuse the validate_* functions) has no side
# effects.
if __name__ == "__main__":
    main()