#!/usr/bin/env python3
"""
Validate knowledge files and index.json against the schema.

Usage:
    python scripts/validate_knowledge.py [--fix]

Without --fix: reports errors and exits non-zero if any found.
With --fix: auto-generates missing IDs and updates index.json.

NOTE: --fix is currently only parsed; no fixing logic is implemented in this
script, so a warning is printed and plain validation runs instead.
"""

import json
import sys
import os
from pathlib import Path
from datetime import datetime

VALID_CATEGORIES = {"fact", "pitfall", "pattern", "tool-quirk", "question"}
REQUIRED_FACT_FIELDS = {"id", "fact", "category", "domain", "confidence"}
MAX_FACT_LENGTH = 280


def validate_fact(fact: dict, source: str = "") -> list[str]:
    """Validate a single fact dict.

    Args:
        fact: The fact entry to check. Anything that is not a dict is
            reported as an error instead of raising.
        source: Label prefixed to every error message (e.g. a file name
            plus index).

    Returns:
        A list of human-readable error strings; empty when the fact is valid.
    """
    if not isinstance(fact, dict):
        return [f"{source}: fact must be an object, got {type(fact).__name__}"]

    errors: list[str] = []
    # Sorted so messages are stable between runs (set order is not).
    allowed = ", ".join(sorted(VALID_CATEGORIES))

    for field in sorted(REQUIRED_FACT_FIELDS):
        if field not in fact:
            errors.append(f"{source}: missing required field '{field}'")

    if "fact" in fact:
        text = fact["fact"]
        if not isinstance(text, str) or not text.strip():
            errors.append(f"{source}: 'fact' must be non-empty string")
        elif len(text) > MAX_FACT_LENGTH:
            errors.append(
                f"{source}: 'fact' exceeds {MAX_FACT_LENGTH} chars ({len(text)})"
            )

    if "category" in fact:
        category = fact["category"]
        # The isinstance guard keeps unhashable values (lists, dicts) from
        # raising TypeError in the set membership test.
        if not isinstance(category, str) or category not in VALID_CATEGORIES:
            errors.append(
                f"{source}: invalid category '{category}' — must be one of {allowed}"
            )

    if "confidence" in fact:
        confidence = fact["confidence"]
        # bool is a subclass of int, so it has to be excluded explicitly.
        if isinstance(confidence, bool) or not isinstance(confidence, (int, float)):
            errors.append(f"{source}: 'confidence' must be a number")
        elif not (0.0 <= confidence <= 1.0):
            errors.append(
                f"{source}: 'confidence' must be 0.0–1.0, got {confidence}"
            )

    if "id" in fact:
        fact_id = fact["id"]
        if not isinstance(fact_id, str):
            errors.append(f"{source}: 'id' must be a string, got {fact_id!r}")
        else:
            parts = fact_id.split(":")
            if len(parts) != 3:
                errors.append(
                    f"{source}: 'id' must be domain:category:sequence, got '{fact_id}'"
                )
            elif parts[1] not in VALID_CATEGORIES:
                errors.append(
                    f"{source}: id category '{parts[1]}' not in {allowed}"
                )

    if "tags" in fact:
        tags = fact["tags"]
        if not isinstance(tags, list):
            errors.append(f"{source}: 'tags' must be a list")
        else:
            for tag in tags:
                # NOTE(review): the message says "lowercase ... hyphens", but
                # this check also accepts uppercase letters and underscores.
                # Left as-is; confirm against the schema before tightening.
                if (
                    not isinstance(tag, str)
                    or not tag.replace("-", "").replace("_", "").isalnum()
                ):
                    errors.append(
                        f"{source}: tag '{tag}' must be lowercase alphanumeric+hyphens"
                    )

    return errors


def _validate_fact_list(facts: list, item_prefix: str, dup_prefix: str) -> list[str]:
    """Validate every fact in *facts* and report duplicate string ids.

    Args:
        facts: The entries to validate.
        item_prefix: Label for per-item errors; the item index is appended
            as ``[i]``.
        dup_prefix: Label for duplicate-id errors.

    Returns:
        All collected error strings, in input order.
    """
    errors: list[str] = []
    seen_ids: set[str] = set()
    for i, fact in enumerate(facts):
        errors.extend(validate_fact(fact, source=f"{item_prefix}[{i}]"))
        if not isinstance(fact, dict):
            continue
        fact_id = fact.get("id")
        # Non-string ids are already reported by validate_fact and may be
        # unhashable, so they are not tracked for duplicates.
        if not isinstance(fact_id, str):
            continue
        if fact_id in seen_ids:
            errors.append(f"{dup_prefix}: duplicate id '{fact_id}'")
        seen_ids.add(fact_id)
    return errors


def validate_index(index_path: Path) -> list[str]:
    """Validate index.json.

    Args:
        index_path: Location of the index file.

    Returns:
        A list of error strings; empty when the file exists, parses as a
        JSON object, has 'version' and a list-valued 'facts', and every fact
        passes validate_fact with no duplicate ids.
    """
    if not index_path.exists():
        return [f"index.json not found at {index_path}"]

    try:
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        return [f"index.json: invalid JSON — {e}"]
    except (OSError, UnicodeDecodeError) as e:
        return [f"index.json: could not be read — {e}"]

    if not isinstance(data, dict):
        return ["index.json: top-level value must be an object"]

    errors: list[str] = []
    if "version" not in data:
        errors.append("index.json: missing 'version' field")

    if "facts" not in data:
        errors.append("index.json: missing 'facts' field")
        return errors

    facts = data["facts"]
    if not isinstance(facts, list):
        # Do not iterate a non-list: a dict would yield its keys and a
        # scalar would raise.
        errors.append("index.json: 'facts' must be a list")
        return errors

    errors.extend(_validate_fact_list(facts, "index.json facts", "index.json"))
    return errors


def validate_yaml_facts(facts: list[dict], source: str) -> list[str]:
    """Validate facts extracted from a YAML file.

    Args:
        facts: Already-parsed fact entries.
        source: Label used as the prefix of every error message.

    Returns:
        A list of error strings, including duplicate-id errors.
    """
    return _validate_fact_list(facts, source, source)


def main() -> None:
    """Run all checks, print a report, and exit 1 on errors or 0 on success."""
    fix_mode = "--fix" in sys.argv
    if fix_mode:
        print(
            "warning: --fix is not implemented; running validation only",
            file=sys.stderr,
        )

    # resolve() keeps the parent lookups correct if __file__ is relative.
    repo_root = Path(__file__).resolve().parent.parent
    knowledge_dir = repo_root / "knowledge"
    index_path = knowledge_dir / "index.json"

    all_errors: list[str] = []

    # Validate index.json
    all_errors.extend(validate_index(index_path))

    # Validate YAML files (basic existence check — full YAML parsing requires pyyaml)
    for dir_name in ("global", "repos", "agents"):
        if not (knowledge_dir / dir_name).exists():
            all_errors.append(f"knowledge/{dir_name}/ directory not found")

    # Report
    if all_errors:
        print(f"VALIDATION FAILED — {len(all_errors)} error(s):\n")
        for err in all_errors:
            print(f" ✗ {err}")
        sys.exit(1)

    # Count facts. Validation already passed, so this should not fail; fall
    # back to 0 if the file changed or vanished in the meantime.
    try:
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
        fact_count = len(data.get("facts", []))
    except (OSError, ValueError):
        fact_count = 0

    print("VALIDATION PASSED")
    print(f" index.json: {fact_count} facts")
    print(" schema: v1")
    sys.exit(0)


if __name__ == "__main__":
    main()