#!/usr/bin/env python3
"""
Validate knowledge files and index.json against the schema.

Usage:
    python scripts/validate_knowledge.py [--fix]

Without --fix: reports errors and exits non-zero if any are found.
With --fix: intended to auto-generate missing IDs and update index.json
(not implemented yet — the flag currently does not change what is validated).
"""
|
||
|
||
import json
import os
import sys
from datetime import datetime
from pathlib import Path
|
||
|
||
# Values accepted for a fact's 'category' field and for the middle segment
# of its 'id'.
VALID_CATEGORIES = {"fact", "pitfall", "pattern", "tool-quirk", "question"}

# Keys that every fact dict must contain.
REQUIRED_FACT_FIELDS = {"id", "fact", "category", "domain", "confidence"}

# Maximum number of characters allowed in the text stored under 'fact'.
MAX_FACT_LENGTH = 280
|
||
|
||
|
||
def validate_fact(fact: dict, source: str = "") -> list[str]:
    """Validate a single fact dict and return a list of error messages.

    Checks performed:
      * every key in ``REQUIRED_FACT_FIELDS`` is present;
      * ``fact`` is a non-blank string of at most ``MAX_FACT_LENGTH`` chars;
      * ``category`` is one of ``VALID_CATEGORIES``;
      * ``confidence`` is a number in the closed range 0.0-1.0;
      * ``id`` is a string shaped ``domain:category:sequence`` whose middle
        segment is one of ``VALID_CATEGORIES``;
      * ``tags``, when present, is a list of strings that are alphanumeric
        once hyphens and underscores are removed.

    Malformed input (a non-dict entry, a non-string id, an unhashable
    category, ...) is reported as an error rather than raising.

    Args:
        fact: The parsed fact entry to check.
        source: Label prefixed to every message so the caller can tell which
            entry an error belongs to.

    Returns:
        Human-readable error strings; an empty list when the fact is valid.
    """
    if not isinstance(fact, dict):
        return [f"{source}: fact must be an object, got {type(fact).__name__}"]

    errors = []

    # sorted() keeps the report order stable; set iteration order is not.
    for field in sorted(REQUIRED_FACT_FIELDS):
        if field not in fact:
            errors.append(f"{source}: missing required field '{field}'")

    if "fact" in fact:
        text = fact["fact"]
        if not isinstance(text, str) or not text.strip():
            errors.append(f"{source}: 'fact' must be non-empty string")
        elif len(text) > MAX_FACT_LENGTH:
            errors.append(f"{source}: 'fact' exceeds {MAX_FACT_LENGTH} chars ({len(text)})")

    if "category" in fact:
        category = fact["category"]
        # The isinstance test also keeps unhashable values (list/dict) away
        # from the set lookup, which would otherwise raise TypeError.
        if not isinstance(category, str) or category not in VALID_CATEGORIES:
            errors.append(f"{source}: invalid category '{category}' — must be one of {VALID_CATEGORIES}")

    if "confidence" in fact:
        confidence = fact["confidence"]
        # bool is a subclass of int, so true/false must be rejected explicitly.
        if isinstance(confidence, bool) or not isinstance(confidence, (int, float)):
            errors.append(f"{source}: 'confidence' must be a number")
        elif not (0.0 <= confidence <= 1.0):
            errors.append(f"{source}: 'confidence' must be 0.0–1.0, got {confidence}")

    if "id" in fact:
        fact_id = fact["id"]
        # A non-string id has no segments; it falls into the shape error below
        # instead of crashing on .split().
        parts = fact_id.split(":") if isinstance(fact_id, str) else []
        if len(parts) != 3:
            errors.append(f"{source}: 'id' must be domain:category:sequence, got '{fact_id}'")
        elif parts[1] not in VALID_CATEGORIES:
            errors.append(f"{source}: id category '{parts[1]}' not in {VALID_CATEGORIES}")

    if "tags" in fact:
        tags = fact["tags"]
        if not isinstance(tags, list):
            errors.append(f"{source}: 'tags' must be a list")
        else:
            for tag in tags:
                # NOTE(review): the message says "lowercase ... +hyphens" but
                # this check also accepts uppercase letters and underscores.
                # Confirm which one the schema intends before tightening.
                if not isinstance(tag, str) or not tag.replace("-", "").replace("_", "").isalnum():
                    errors.append(f"{source}: tag '{tag}' must be lowercase alphanumeric+hyphens")

    return errors
|
||
|
||
|
||
def validate_index(index_path: Path) -> list[str]:
    """Validate the structure and contents of index.json.

    The file must exist, parse as JSON, and hold an object with a 'version'
    key and a 'facts' list. Each entry of 'facts' is checked with
    ``validate_fact`` and string ids are checked for duplicates.

    Args:
        index_path: Location of the index.json file.

    Returns:
        Human-readable error strings; an empty list when the index is valid.
        A missing file, unparsable JSON, or a non-object top level each
        yield a single error because nothing further can be checked.
    """
    if not index_path.exists():
        return [f"index.json not found at {index_path}"]

    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        return [f"index.json: invalid JSON — {e}"]

    if not isinstance(data, dict):
        return [f"index.json: top-level value must be an object, got {type(data).__name__}"]

    errors = []

    if "version" not in data:
        errors.append("index.json: missing 'version' field")

    if "facts" not in data:
        errors.append("index.json: missing 'facts' field")
        return errors

    facts = data["facts"]
    if not isinstance(facts, list):
        # Iterating a dict or string here would feed keys/characters to the
        # per-fact validator, so stop after reporting the type error.
        errors.append("index.json: 'facts' must be a list")
        return errors

    seen_ids = set()
    for i, fact in enumerate(facts):
        errors.extend(validate_fact(fact, source=f"index.json facts[{i}]"))

        # Only string ids take part in duplicate detection; other values may
        # be unhashable and cannot be valid ids anyway.
        fact_id = fact.get("id") if isinstance(fact, dict) else None
        if not isinstance(fact_id, str):
            continue
        if fact_id in seen_ids:
            errors.append(f"index.json: duplicate id '{fact_id}'")
        seen_ids.add(fact_id)

    return errors
|
||
|
||
|
||
def validate_yaml_facts(facts: list[dict], source: str) -> list[str]:
    """Validate a list of facts extracted from a YAML file.

    Each entry is checked with ``validate_fact`` and string ids are checked
    for duplicates within the list.

    Args:
        facts: Fact entries already parsed from the YAML file.
        source: Label for the file, used as the prefix of every message;
            per-entry messages use ``source[index]``.

    Returns:
        Human-readable error strings; an empty list when all facts are valid.
    """
    errors = []
    seen_ids = set()

    for i, fact in enumerate(facts):
        errors.extend(validate_fact(fact, source=f"{source}[{i}]"))

        # Only string ids take part in duplicate detection; other values may
        # be unhashable (e.g. a YAML list) and would break the set lookup.
        fact_id = fact.get("id") if isinstance(fact, dict) else None
        if not isinstance(fact_id, str):
            continue
        if fact_id in seen_ids:
            errors.append(f"{source}: duplicate id '{fact_id}'")
        seen_ids.add(fact_id)

    return errors
|
||
|
||
|
||
def main():
    """Validate the knowledge directory and exit with a status code.

    Looks for ``knowledge/`` two levels above this script, validates
    ``knowledge/index.json`` and checks that the ``global``, ``repos`` and
    ``agents`` sub-directories exist. Prints a report and exits with status
    1 when any error was found, 0 otherwise.
    """
    fix_mode = "--fix" in sys.argv
    repo_root = Path(__file__).parent.parent
    knowledge_dir = repo_root / "knowledge"
    index_path = knowledge_dir / "index.json"

    if fix_mode:
        # No repair logic exists in this script; say so instead of silently
        # ignoring a flag the usage text advertises.
        print("warning: --fix is not implemented; running validation only", file=sys.stderr)

    all_errors = list(validate_index(index_path))

    # YAML contents are not parsed here (that would require pyyaml); only the
    # presence of the expected directories is checked.
    for dir_name in ("global", "repos", "agents"):
        if not (knowledge_dir / dir_name).is_dir():
            all_errors.append(f"knowledge/{dir_name}/ directory not found")

    if all_errors:
        print(f"VALIDATION FAILED — {len(all_errors)} error(s):\n")
        for err in all_errors:
            print(f" ✗ {err}")
        sys.exit(1)

    # Re-read the index only to report how many facts it holds. This is
    # best-effort: any read/parse/shape problem falls back to a count of 0.
    try:
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
        fact_count = len(data.get("facts", []))
    except (OSError, ValueError, AttributeError, TypeError):
        fact_count = 0

    print("VALIDATION PASSED")
    print(f" index.json: {fact_count} facts")
    print(" schema: v1")
    sys.exit(0)
|
||
|
||
|
||
# Run the validator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|