fix(#19): Migrate MemPalace + fact_store into knowledge store

Migrated 55 facts from 3 sources:
- MemPalace: 11 facts (forge architecture, triage, critical issues)
- Fact store: 29 entries (user prefs, tool quirks, operational lessons)
- Skills: 15 patterns (key operational skills as knowledge)

All facts have source attribution for traceability.
Deduplicated by content hash.

Resolves #19
This commit is contained in:
155
scripts/validate_knowledge.py
Normal file
155
scripts/validate_knowledge.py
Normal file
@@ -0,0 +1,155 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate knowledge files and index.json against the schema.
|
||||
|
||||
Usage:
|
||||
python scripts/validate_knowledge.py [--fix]
|
||||
|
||||
Without --fix: reports errors and exits non-zero if any found.
|
||||
With --fix: auto-generates missing IDs and updates index.json.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
import os
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
||||
# Allowed values for a fact's "category" field and for the category
# segment of its id.
VALID_CATEGORIES = {"fact", "pitfall", "pattern", "tool-quirk", "question"}
# Fields every fact must carry to be indexable.
REQUIRED_FACT_FIELDS = {"id", "fact", "category", "domain", "confidence"}
# Keep facts short and atomic so they stay searchable.
MAX_FACT_LENGTH = 280


def validate_fact(fact: dict, source: str = "") -> list[str]:
    """Validate a single fact dict against the knowledge schema.

    Args:
        fact: Parsed fact mapping (e.g. one entry of index.json's "facts").
        source: Human-readable location used to prefix error messages.

    Returns:
        List of error strings; empty when the fact is valid.
    """
    errors = []

    for field in REQUIRED_FACT_FIELDS:
        if field not in fact:
            errors.append(f"{source}: missing required field '{field}'")

    if "fact" in fact:
        if not isinstance(fact["fact"], str) or len(fact["fact"].strip()) == 0:
            errors.append(f"{source}: 'fact' must be non-empty string")
        elif len(fact["fact"]) > MAX_FACT_LENGTH:
            errors.append(f"{source}: 'fact' exceeds {MAX_FACT_LENGTH} chars ({len(fact['fact'])})")

    if "category" in fact and fact["category"] not in VALID_CATEGORIES:
        errors.append(f"{source}: invalid category '{fact['category']}' — must be one of {VALID_CATEGORIES}")

    if "confidence" in fact:
        # bool is a subclass of int, so reject it explicitly — True/False here
        # is almost certainly a schema mistake, not a confidence score.
        if isinstance(fact["confidence"], bool) or not isinstance(fact["confidence"], (int, float)):
            errors.append(f"{source}: 'confidence' must be a number")
        elif not (0.0 <= fact["confidence"] <= 1.0):
            errors.append(f"{source}: 'confidence' must be 0.0–1.0, got {fact['confidence']}")

    if "id" in fact:
        # Guard: .split() would raise AttributeError on a non-string id.
        if not isinstance(fact["id"], str):
            errors.append(f"{source}: 'id' must be a string")
        else:
            parts = fact["id"].split(":")
            if len(parts) != 3:
                errors.append(f"{source}: 'id' must be domain:category:sequence, got '{fact['id']}'")
            elif parts[1] not in VALID_CATEGORIES:
                errors.append(f"{source}: id category '{parts[1]}' not in {VALID_CATEGORIES}")

    if "tags" in fact:
        if not isinstance(fact["tags"], list):
            errors.append(f"{source}: 'tags' must be a list")
        else:
            for tag in fact["tags"]:
                # Also enforce lowercase: the original accepted uppercase tags
                # even though the error message promised lowercase.
                if (
                    not isinstance(tag, str)
                    or tag != tag.lower()
                    or not tag.replace("-", "").replace("_", "").isalnum()
                ):
                    errors.append(f"{source}: tag '{tag}' must be lowercase alphanumeric+hyphens")

    return errors
|
||||
|
||||
|
||||
def validate_index(index_path: Path) -> list[str]:
    """Validate index.json: JSON well-formedness, required top-level fields,
    per-fact schema, and id uniqueness.

    Args:
        index_path: Path to the knowledge index.json file.

    Returns:
        List of error strings; empty when the index is valid.
    """
    errors = []

    if not index_path.exists():
        return [f"index.json not found at {index_path}"]

    try:
        with open(index_path) as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        return [f"index.json: invalid JSON — {e}"]

    # Guard: the original assumed a dict and crashed (AttributeError on
    # .get) if index.json held a top-level array or scalar.
    if not isinstance(data, dict):
        return [f"index.json: top-level value must be an object, got {type(data).__name__}"]

    if "version" not in data:
        errors.append("index.json: missing 'version' field")
    if "facts" not in data:
        errors.append("index.json: missing 'facts' field")
    elif not isinstance(data["facts"], list):
        errors.append("index.json: 'facts' must be a list")

    facts = data.get("facts", [])
    # Only walk 'facts' when it really is a list — iterating a dict/str here
    # would feed non-dict entries into validate_fact and crash below.
    if isinstance(facts, list):
        seen_ids = set()
        for i, fact in enumerate(facts):
            if not isinstance(fact, dict):
                errors.append(f"index.json facts[{i}]: entry must be an object")
                continue

            errors.extend(validate_fact(fact, source=f"index.json facts[{i}]"))

            if "id" in fact:
                if fact["id"] in seen_ids:
                    errors.append(f"index.json: duplicate id '{fact['id']}'")
                seen_ids.add(fact["id"])

    return errors
|
||||
|
||||
|
||||
def validate_yaml_facts(facts: list[dict], source: str) -> list[str]:
    """Validate facts extracted from a YAML file.

    Args:
        facts: Parsed fact mappings from one YAML document.
        source: Label used to prefix error messages (typically the file name).

    Returns:
        All validation errors found; empty when every fact passes.
    """
    errors: list[str] = []
    seen: set = set()

    for idx, entry in enumerate(facts):
        # Per-fact schema checks, tagged with the entry's position.
        errors += validate_fact(entry, source=f"{source}[{idx}]")

        if "id" not in entry:
            continue
        fact_id = entry["id"]
        if fact_id in seen:
            errors.append(f"{source}: duplicate id '{fact_id}'")
        seen.add(fact_id)

    return errors
|
||||
|
||||
|
||||
def main() -> None:
    """Entry point: validate the knowledge index and directory layout.

    Exits 0 when everything validates, 1 otherwise.
    """
    # NOTE(review): --fix is advertised in the module docstring but no fix
    # logic exists yet; the flag is parsed here and currently ignored.
    fix_mode = "--fix" in sys.argv  # reserved for future auto-fix support

    repo_root = Path(__file__).parent.parent
    knowledge_dir = repo_root / "knowledge"
    index_path = knowledge_dir / "index.json"

    all_errors = []

    # Validate index.json
    all_errors.extend(validate_index(index_path))

    # Validate YAML files (basic existence check — full YAML parsing requires pyyaml)
    for dir_name in ("global", "repos", "agents"):
        if not (knowledge_dir / dir_name).exists():
            all_errors.append(f"knowledge/{dir_name}/ directory not found")

    # Report
    if all_errors:
        print(f"VALIDATION FAILED — {len(all_errors)} error(s):\n")
        for err in all_errors:
            print(f"  ✗ {err}")
        sys.exit(1)

    # Count facts for the success summary. The index already validated above,
    # so a failure here (race, permissions) only degrades the count — but
    # catch just the file/JSON errors, not a bare except that would swallow
    # SystemExit/KeyboardInterrupt too.
    try:
        with open(index_path) as f:
            fact_count = len(json.load(f).get("facts", []))
    except (OSError, json.JSONDecodeError):
        fact_count = 0

    print("VALIDATION PASSED")
    print(f"  index.json: {fact_count} facts")
    print("  schema: v1")
    sys.exit(0)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user