#!/usr/bin/env python3
"""
Validate knowledge files and index.json against the schema.

Usage:
    python scripts/validate_knowledge.py [--fix]

Without --fix: reports errors and exits non-zero if any found.
With --fix: auto-generates missing IDs and updates index.json.
"""
|
|||
|
|
|
|||
|
|
import json
import sys
import os  # NOTE(review): appears unused in this view — confirm before removing
from pathlib import Path
from datetime import datetime  # NOTE(review): appears unused in this view — confirm before removing
# Schema constraints shared by all validators in this script.
VALID_CATEGORIES = {"fact", "pitfall", "pattern", "tool-quirk", "question"}
REQUIRED_FACT_FIELDS = {"id", "fact", "category", "domain", "confidence"}
MAX_FACT_LENGTH = 280  # hard cap on the 'fact' text, in characters


def validate_fact(fact: dict, source: str = "") -> list[str]:
    """Validate a single fact dict against the schema.

    Args:
        fact: Candidate fact mapping (parsed from JSON/YAML).
        source: Label prefixed to every error message (e.g. file + index).

    Returns:
        List of human-readable error strings; empty if the fact is valid.
    """
    errors = []

    # Presence check for all mandatory schema fields.
    for field in REQUIRED_FACT_FIELDS:
        if field not in fact:
            errors.append(f"{source}: missing required field '{field}'")

    # 'fact' must be a non-empty string within the length budget.
    if "fact" in fact:
        if not isinstance(fact["fact"], str) or len(fact["fact"].strip()) == 0:
            errors.append(f"{source}: 'fact' must be non-empty string")
        elif len(fact["fact"]) > MAX_FACT_LENGTH:
            errors.append(f"{source}: 'fact' exceeds {MAX_FACT_LENGTH} chars ({len(fact['fact'])})")

    if "category" in fact and fact["category"] not in VALID_CATEGORIES:
        errors.append(f"{source}: invalid category '{fact['category']}' — must be one of {VALID_CATEGORIES}")

    # Confidence must be a number in [0.0, 1.0]. (bool is a subclass of int,
    # so True/False slip through the isinstance check — same as before.)
    if "confidence" in fact:
        if not isinstance(fact["confidence"], (int, float)):
            errors.append(f"{source}: 'confidence' must be a number")
        elif not (0.0 <= fact["confidence"] <= 1.0):
            errors.append(f"{source}: 'confidence' must be 0.0–1.0, got {fact['confidence']}")

    # IDs follow the domain:category:sequence convention.
    # BUGFIX: a non-string id (e.g. an int parsed from YAML) previously
    # crashed the validator with AttributeError on .split(); report it as a
    # schema error instead.
    if "id" in fact:
        if not isinstance(fact["id"], str):
            errors.append(f"{source}: 'id' must be a string, got {type(fact['id']).__name__}")
        else:
            parts = fact["id"].split(":")
            if len(parts) != 3:
                errors.append(f"{source}: 'id' must be domain:category:sequence, got '{fact['id']}'")
            elif parts[1] not in VALID_CATEGORIES:
                errors.append(f"{source}: id category '{parts[1]}' not in {VALID_CATEGORIES}")

    # Tags: list of slug-like strings (alphanumeric plus '-'/'_').
    # BUGFIX: the old message claimed "lowercase alphanumeric+hyphens" but the
    # check never enforced case and also allows underscores; message now
    # matches the actual rule.
    if "tags" in fact:
        if not isinstance(fact["tags"], list):
            errors.append(f"{source}: 'tags' must be a list")
        else:
            for tag in fact["tags"]:
                if not isinstance(tag, str) or not tag.replace("-", "").replace("_", "").isalnum():
                    errors.append(f"{source}: tag '{tag}' must be alphanumeric with hyphens/underscores")

    return errors
|
|||
|
|
|
|||
|
|
|
|||
|
|
def validate_index(index_path: Path) -> list[str]:
    """Validate index.json: top-level structure, per-fact schema, id uniqueness."""
    if not index_path.exists():
        return [f"index.json not found at {index_path}"]

    try:
        with open(index_path) as handle:
            data = json.load(handle)
    except json.JSONDecodeError as e:
        return [f"index.json: invalid JSON — {e}"]

    problems: list[str] = []

    # Required top-level keys.
    if "version" not in data:
        problems.append("index.json: missing 'version' field")
    if "facts" not in data:
        problems.append("index.json: missing 'facts' field")
    elif not isinstance(data["facts"], list):
        problems.append("index.json: 'facts' must be a list")

    # Per-fact validation plus cross-fact duplicate-id detection.
    ids_seen = set()
    for pos, entry in enumerate(data.get("facts", [])):
        problems.extend(validate_fact(entry, source=f"index.json facts[{pos}]"))

        if "id" in entry:
            if entry["id"] in ids_seen:
                problems.append(f"index.json: duplicate id '{entry['id']}'")
            ids_seen.add(entry["id"])

    return problems
def validate_yaml_facts(facts: list[dict], source: str) -> list[str]:
    """Validate facts extracted from a YAML file."""
    problems: list[str] = []
    ids_seen = set()

    for idx, entry in enumerate(facts):
        # Schema check for the individual fact.
        problems.extend(validate_fact(entry, source=f"{source}[{idx}]"))

        # Duplicate-id detection within this file.
        if "id" in entry:
            fid = entry["id"]
            if fid in ids_seen:
                problems.append(f"{source}: duplicate id '{fid}'")
            ids_seen.add(fid)

    return problems
def main():
    """Entry point: validate index.json and knowledge dirs, exit 0/1."""
    fix_mode = "--fix" in sys.argv
    repo_root = Path(__file__).parent.parent
    knowledge_dir = repo_root / "knowledge"
    index_path = knowledge_dir / "index.json"

    # BUGFIX: --fix was parsed but silently ignored, contradicting the module
    # docstring. Warn until auto-fixing is actually implemented.
    if fix_mode:
        print("warning: --fix is not implemented yet; running validation only", file=sys.stderr)

    all_errors = []

    # Validate index.json
    all_errors.extend(validate_index(index_path))

    # Validate YAML files (basic existence check — full YAML parsing requires pyyaml)
    yaml_dirs = ["global", "repos", "agents"]
    for dir_name in yaml_dirs:
        if not (knowledge_dir / dir_name).exists():
            all_errors.append(f"knowledge/{dir_name}/ directory not found")

    # Report
    if all_errors:
        print(f"VALIDATION FAILED — {len(all_errors)} error(s):\n")
        for err in all_errors:
            print(f"  ✗ {err}")
        sys.exit(1)
    else:
        # Count facts for the success summary.
        # BUGFIX: narrowed the bare `except:` — it also swallowed SystemExit
        # and KeyboardInterrupt. OSError covers read failures, ValueError
        # covers JSONDecodeError, AttributeError covers a non-dict payload.
        try:
            with open(index_path) as f:
                data = json.load(f)
            fact_count = len(data.get("facts", []))
        except (OSError, ValueError, AttributeError):
            fact_count = 0

        print("VALIDATION PASSED")
        print(f"  index.json: {fact_count} facts")
        print("  schema: v1")
        sys.exit(0)
# Script entry point — keeps the module importable without side effects.
if __name__ == "__main__":
    main()