Implements issue #14: 7 metrics that prove knowledge compounding.

Metrics:
- Knowledge velocity: new facts/day (from index.json)
- Knowledge coverage: % domains with 10+ facts (from YAML files)
- Hit rate: % sessions referencing bootstrap knowledge
- Error recurrence: same errors across sessions (should decrease)
- Task completion: % sessions with successful end_reason
- First-try success: actions without backtracking (tool/msg ratio)
- Knowledge age: staleness of facts (freshness score)

Data sources:
- knowledge/index.json + YAML files for fact metrics
- ~/.hermes/state.db sessions + messages tables

Features:
- JSON and markdown output formats
- --since, --repo, --format flags
- 7-day trend tracking via snapshot persistence
- Runs in 33ms on 11.9K sessions / 192K messages
- Dashboard auto-generation with --save-snapshot

Closes #14
81 lines
2.7 KiB
Python
81 lines
2.7 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Validate knowledge files and index.json against the schema.
|
|
|
|
Usage:
|
|
python scripts/validate_knowledge.py
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Allowed values for a fact's "category" field and for the middle segment of
# its "id" (domain:category:sequence).
VALID_CATEGORIES = {"fact", "pitfall", "pattern", "tool-quirk", "question"}
# Keys every fact entry must carry.
REQUIRED_FACT_FIELDS = {"id", "fact", "category", "domain", "confidence"}
# Upper bound on the length of the "fact" text, in characters.
MAX_FACT_LENGTH = 280


def validate_fact(fact, source=""):
    """Check a single fact entry and return a list of error messages.

    Args:
        fact: The decoded fact entry; expected to be a dict.
        source: Label prefixed to every message (e.g. ``"facts[3]"``) so the
            caller can tell which entry an error belongs to.

    Returns:
        A list of human-readable error strings; empty when the entry is valid.
        Malformed values are reported as errors rather than raising.
    """
    # Anything but a dict cannot be inspected field by field; `in` on a str
    # would do substring matching and on an int would raise TypeError.
    if not isinstance(fact, dict):
        return [f"{source}: fact must be an object, got {type(fact).__name__}"]

    errors = []

    # Sorted so the report order is stable between runs (set iteration order
    # of strings varies with hash randomization).
    for field in sorted(REQUIRED_FACT_FIELDS):
        if field not in fact:
            errors.append(f"{source}: missing required field '{field}'")

    if "fact" in fact:
        text = fact["fact"]
        if not isinstance(text, str) or not text.strip():
            errors.append(f"{source}: 'fact' must be non-empty string")
        elif len(text) > MAX_FACT_LENGTH:
            errors.append(f"{source}: 'fact' exceeds {MAX_FACT_LENGTH} chars")

    if "category" in fact:
        category = fact["category"]
        # The isinstance guard keeps unhashable values (list, dict) from
        # raising TypeError in the set membership test.
        if not isinstance(category, str) or category not in VALID_CATEGORIES:
            errors.append(f"{source}: invalid category '{category}'")

    if "confidence" in fact:
        confidence = fact["confidence"]
        # bool is a subclass of int, so it must be excluded explicitly;
        # JSON true/false is not a confidence score.
        if isinstance(confidence, bool) or not isinstance(confidence, (int, float)):
            errors.append(f"{source}: 'confidence' must be a number")
        elif not (0.0 <= confidence <= 1.0):
            errors.append(f"{source}: 'confidence' must be 0.0-1.0")

    if "id" in fact:
        fact_id = fact["id"]
        # A non-string id has no segments and falls into the format error
        # below instead of crashing on .split().
        parts = fact_id.split(":") if isinstance(fact_id, str) else []
        if len(parts) != 3:
            errors.append(f"{source}: 'id' must be domain:category:sequence")
        elif parts[1] not in VALID_CATEGORIES:
            errors.append(f"{source}: id category '{parts[1]}' invalid")

    return errors
|
|
|
|
|
|
def main(index_path=None):
    """Validate the knowledge index file and exit with a status code.

    Loads the index, checks the top-level ``version`` and ``facts`` keys,
    runs ``validate_fact`` on every entry, and reports duplicate ids.
    Prints a summary and terminates via ``sys.exit``: status 0 when no
    errors were found, status 1 otherwise (including a missing, unreadable,
    or malformed index file).

    Args:
        index_path: Optional path to the index file. Defaults to
            ``knowledge/index.json`` under the directory two levels above
            this script.
    """
    if index_path is None:
        repo_root = Path(__file__).parent.parent
        index_path = repo_root / "knowledge" / "index.json"
    else:
        index_path = Path(index_path)

    if not index_path.exists():
        print(f"VALIDATION FAILED: index.json not found at {index_path}")
        sys.exit(1)

    # json.JSONDecodeError and UnicodeDecodeError are both ValueError
    # subclasses; report them as a validation failure, not a traceback.
    try:
        with open(index_path, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        print(f"VALIDATION FAILED: could not read {index_path}: {exc}")
        sys.exit(1)

    if not isinstance(data, dict):
        print("VALIDATION FAILED: index.json: top level must be a JSON object")
        sys.exit(1)

    all_errors = []

    if "version" not in data:
        all_errors.append("index.json: missing 'version'")

    facts = data.get("facts")
    if not isinstance(facts, list):
        all_errors.append("index.json: missing or invalid 'facts'")
        # Do not iterate a wrong-typed value (str characters, dict keys, None).
        facts = []

    seen_ids = set()
    for i, fact in enumerate(facts):
        if not isinstance(fact, dict):
            all_errors.append(f"facts[{i}]: fact must be an object")
            continue
        all_errors.extend(validate_fact(fact, f"facts[{i}]"))
        fact_id = fact.get("id")
        # Only string ids take part in duplicate detection; other types are
        # not valid ids and may be unhashable.
        if isinstance(fact_id, str):
            if fact_id in seen_ids:
                all_errors.append(f"index.json: duplicate id '{fact_id}'")
            seen_ids.add(fact_id)

    if all_errors:
        print(f"VALIDATION FAILED - {len(all_errors)} error(s):\n")
        for e in all_errors:
            print(f" x {e}")
        sys.exit(1)
    else:
        print(f"VALIDATION PASSED - {len(facts)} facts, schema v{data.get('version', '?')}")
        sys.exit(0)


if __name__ == "__main__":
    main()
|