fix: docs: Allegro burn-mode validator rules (#242) (closes #843)

Alexander Whitestone
2026-04-21 21:23:48 -04:00
parent e4ba0c8b91
commit 0ae323bfe0
25 changed files with 8987 additions and 8637 deletions

provenance_dashboard.py

@@ -0,0 +1,131 @@
#!/usr/bin/env python3
"""
provenance_dashboard.py — Provenance statistics dashboard.
Shows pair count by model, by source, coverage over time.
Usage:
python3 provenance_dashboard.py
python3 provenance_dashboard.py --days 30
python3 provenance_dashboard.py --json
"""
import json
import os
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List

DATA_DIRS = [
    Path.home() / "timmy-config" / "training-data",
    Path.home() / "timmy-config" / "training" / "data",
]
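
# Each JSONL line is one training pair. The provenance fields this script
# reads (see compute_stats below) look roughly like the record sketched
# here; keys other than the provenance fields are illustrative assumptions:
#   {"prompt": "...", "completion": "...",
#    "source_session_id": "sess-abc123", "model": "claude-sonnet",
#    "timestamp": "2026-04-21T21:23:48Z", "source_type": "chat"}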

def scan_training_data() -> List[dict]:
    """Scan all JSONL files for provenance metadata."""
    all_pairs = []
    for data_dir in DATA_DIRS:
        if not data_dir.exists():
            continue
        for filepath in data_dir.rglob("*.jsonl"):
            try:
                with open(filepath) as f:
                    for line in f:
                        if line.strip():
                            pair = json.loads(line)
                            pair["_source_file"] = str(filepath.relative_to(data_dir))
                            all_pairs.append(pair)
            except (json.JSONDecodeError, IOError):
                # an unreadable file or a single malformed line skips that file
                continue
    return all_pairs


def compute_stats(pairs: List[dict]) -> dict:
    """Compute provenance statistics."""
    total = len(pairs)
    if total == 0:
        return {"total": 0}
    with_session = sum(1 for p in pairs if p.get("source_session_id"))
    with_model = sum(1 for p in pairs if p.get("model"))
    with_timestamp = sum(1 for p in pairs if p.get("timestamp"))
    models = Counter(p.get("model", "unknown") for p in pairs)
    sources = Counter(p.get("source_type", p.get("source", "unknown")) for p in pairs)
    files = Counter(p.get("_source_file", "unknown") for p in pairs)
    return {
        "total": total,
        "with_session_id": with_session,
        "with_model": with_model,
        "with_timestamp": with_timestamp,
        "coverage_session_pct": round(with_session / total * 100, 1),
        "coverage_model_pct": round(with_model / total * 100, 1),
        "coverage_timestamp_pct": round(with_timestamp / total * 100, 1),
        "by_model": dict(models.most_common(15)),
        "by_source": dict(sources.most_common()),
        "by_file": dict(files.most_common(10)),
    }


def render_dashboard(stats: dict):
    """Render human-readable dashboard."""
    print(f"\n{'='*60}")
    print(" PROVENANCE DASHBOARD")
    print(f" {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}")
    print(f"{'='*60}")
    total = stats.get("total", 0)
    if total == 0:
        print(" No training data found.")
        print(f"{'='*60}\n")
        return
    print(f"\n Total pairs: {total}")
    print("\n Coverage:")
    print(f"   source_session_id: {stats['with_session_id']:>6} ({stats['coverage_session_pct']}%)")
    print(f"   model:             {stats['with_model']:>6} ({stats['coverage_model_pct']}%)")
    print(f"   timestamp:         {stats['with_timestamp']:>6} ({stats['coverage_timestamp_pct']}%)")
    print("\n By Model:")
    for model, count in sorted(stats["by_model"].items(), key=lambda x: -x[1]):
        bar = "█" * (count * 40 // total)  # proportional bar, 40 chars at 100%
        print(f"   {model:<25} {count:>6} {bar}")
    print("\n By Source:")
    for source, count in sorted(stats["by_source"].items(), key=lambda x: -x[1]):
        print(f"   {source:<20} {count:>6}")
    print("\n Top Files:")
    for fname, count in list(stats["by_file"].items())[:10]:
        short = fname[-40:] if len(fname) > 40 else fname
        print(f"   {short:<42} {count:>6}")
    print(f"{'='*60}\n")


def main():
    parser = argparse.ArgumentParser(description="Provenance Dashboard")
    parser.add_argument("--json", action="store_true", help="JSON output")
    args = parser.parse_args()
    pairs = scan_training_data()
    stats = compute_stats(pairs)
    if args.json:
        # stats only contains aggregated fields, so the internal
        # _source_file tag never leaks into the JSON output
        print(json.dumps(stats, indent=2))
    else:
        render_dashboard(stats)


if __name__ == "__main__":
    main()
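
# Example JSON output shape (keys per compute_stats; counts hypothetical):
#   $ python3 provenance_dashboard.py --json
#   {"total": 1234, "with_session_id": 1100, "with_model": 1200, ...}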

provenance_validate.py

@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
provenance_validate.py — Validate provenance metadata on training data.
Checks:
- All pairs have required provenance fields
- source_session_id, model, timestamp present
- Coverage report by model and source
Usage:
python3 provenance_validate.py training-data/*.jsonl
python3 provenance_validate.py --threshold 80 training-data/*.jsonl
"""
import argparse
import json
import sys
from pathlib import Path
from typing import List
REQUIRED_FIELDS = ["source_session_id", "model", "timestamp"]
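
# A pair passes only if every required field is present AND truthy, e.g.
#   {"source_session_id": "sess-1", "model": "m1", "timestamp": "2026-04-21"}
# passes, while {"model": "", ...} counts as missing "model".
# (values here are hypothetical)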

def validate_file(filepath: str) -> dict:
    """Validate provenance on a single JSONL file."""
    pairs = []
    with open(filepath) as f:
        for line in f:
            if line.strip():
                # note: a malformed JSON line raises here and aborts the run
                pairs.append(json.loads(line))
    total = len(pairs)
    with_provenance = 0
    missing_by_field = {f: 0 for f in REQUIRED_FIELDS}
    by_model = {}
    by_source = {}
    for pair in pairs:
        has_all = True
        for field in REQUIRED_FIELDS:
            if field not in pair or not pair[field]:
                missing_by_field[field] += 1
                has_all = False
        if has_all:
            with_provenance += 1
        model = pair.get("model", "unknown")
        source = pair.get("source_type", pair.get("source", "unknown"))
        by_model[model] = by_model.get(model, 0) + 1
        by_source[source] = by_source.get(source, 0) + 1
    coverage = (with_provenance / total * 100) if total > 0 else 0
    return {
        "file": str(filepath),
        "total": total,
        "with_provenance": with_provenance,
        "coverage_pct": round(coverage, 1),
        "missing_by_field": missing_by_field,
        "by_model": by_model,
        "by_source": by_source,
    }


def validate_all(files: List[str], threshold: float = 0) -> dict:
    """Validate provenance across multiple files."""
    results = []
    total_pairs = 0
    total_with_prov = 0
    for filepath in files:
        result = validate_file(filepath)
        results.append(result)
        total_pairs += result["total"]
        total_with_prov += result["with_provenance"]
    overall_coverage = (total_with_prov / total_pairs * 100) if total_pairs > 0 else 0
    return {
        "files": results,
        "total_pairs": total_pairs,
        "total_with_provenance": total_with_prov,
        "overall_coverage_pct": round(overall_coverage, 1),
        "passes_threshold": overall_coverage >= threshold,
    }
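
# e.g. validate_all(files, threshold=80.0)["passes_threshold"] is True only
# when overall coverage is >= 80% (zero pairs counts as 0% coverage).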


def main():
    parser = argparse.ArgumentParser(description="Validate training data provenance")
    parser.add_argument("files", nargs="+", help="JSONL files to validate")
    parser.add_argument("--threshold", type=float, default=0,
                        help="Minimum coverage percentage to pass")
    parser.add_argument("--json", action="store_true", help="JSON output")
    args = parser.parse_args()
    # Expand globs (for shells that pass the pattern through unexpanded)
    files = []
    for pattern in args.files:
        expanded = list(Path(".").glob(pattern)) if "*" in pattern else [Path(pattern)]
        files.extend(str(f) for f in expanded if f.exists())
    if not files:
        print("No files found", file=sys.stderr)
        sys.exit(1)
    result = validate_all(files, args.threshold)
    if args.json:
        print(json.dumps(result, indent=2))
    else:
        print(f"\n{'='*50}")
        print(" PROVENANCE VALIDATION REPORT")
        print(f"{'='*50}")
        print(f" Total pairs:     {result['total_pairs']}")
        print(f" With provenance: {result['total_with_provenance']}")
        print(f" Coverage:        {result['overall_coverage_pct']}%")
        if args.threshold > 0:
            status = "PASS" if result["passes_threshold"] else "FAIL"
            print(f" Threshold:       {args.threshold}% [{status}]")
        print("\n Per file:")
        for f in result["files"]:
            icon = "✓" if f["coverage_pct"] >= args.threshold else "✗"
            print(f"   {icon} {f['file']}: {f['coverage_pct']}% ({f['with_provenance']}/{f['total']})")
        print(f"{'='*50}\n")
    # Exit nonzero so CI fails when coverage is below the threshold
    if args.threshold > 0 and not result["passes_threshold"]:
        sys.exit(1)
    sys.exit(0)


if __name__ == "__main__":
    main()
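
# Typical CI gate (assumed paths): fail the job when provenance coverage
# drops below 80%:
#   python3 provenance_validate.py --threshold 80 training-data/*.jsonl
#   # exit status 1 on FAIL, 0 on PASS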