fix: docs: Allegro burn-mode validator rules (#242) (closes #843)

Alexander Whitestone
2026-04-21 21:23:48 -04:00
parent e4ba0c8b91
commit 0ae323bfe0
25 changed files with 8987 additions and 8637 deletions

provenance_dashboard.py

@@ -0,0 +1,131 @@
#!/usr/bin/env python3
"""
provenance_dashboard.py — Provenance statistics dashboard.
Shows pair count by model, by source, coverage over time.
Usage:
python3 provenance_dashboard.py
python3 provenance_dashboard.py --days 30
python3 provenance_dashboard.py --json
"""
import json
import os
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List

DATA_DIRS = [
    Path.home() / "timmy-config" / "training-data",
    Path.home() / "timmy-config" / "training" / "data",
]
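
# Each JSONL line is one training pair. The provenance fields this script
# reads (see compute_stats below) look roughly like the record sketched
# here; keys other than the provenance fields are illustrative assumptions:
#   {"prompt": "...", "completion": "...",
#    "source_session_id": "sess-abc123", "model": "claude-sonnet",
#    "timestamp": "2026-04-21T21:23:48Z", "source_type": "chat"}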

def scan_training_data() -> List[dict]:
    """Scan all JSONL files for provenance metadata."""
    all_pairs = []
    for data_dir in DATA_DIRS:
        if not data_dir.exists():
            continue
        for filepath in data_dir.rglob("*.jsonl"):
            try:
                with open(filepath) as f:
                    for line in f:
                        if line.strip():
                            pair = json.loads(line)
                            pair["_source_file"] = str(filepath.relative_to(data_dir))
                            all_pairs.append(pair)
            except (json.JSONDecodeError, IOError):
                # an unreadable file or a single malformed line skips that file
                continue
    return all_pairs


def compute_stats(pairs: List[dict]) -> dict:
    """Compute provenance statistics."""
    total = len(pairs)
    if total == 0:
        return {"total": 0}
    with_session = sum(1 for p in pairs if p.get("source_session_id"))
    with_model = sum(1 for p in pairs if p.get("model"))
    with_timestamp = sum(1 for p in pairs if p.get("timestamp"))
    models = Counter(p.get("model", "unknown") for p in pairs)
    sources = Counter(p.get("source_type", p.get("source", "unknown")) for p in pairs)
    files = Counter(p.get("_source_file", "unknown") for p in pairs)
    return {
        "total": total,
        "with_session_id": with_session,
        "with_model": with_model,
        "with_timestamp": with_timestamp,
        "coverage_session_pct": round(with_session / total * 100, 1),
        "coverage_model_pct": round(with_model / total * 100, 1),
        "coverage_timestamp_pct": round(with_timestamp / total * 100, 1),
        "by_model": dict(models.most_common(15)),
        "by_source": dict(sources.most_common()),
        "by_file": dict(files.most_common(10)),
    }


def render_dashboard(stats: dict):
    """Render human-readable dashboard."""
    print(f"\n{'='*60}")
    print(" PROVENANCE DASHBOARD")
    print(f" {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC')}")
    print(f"{'='*60}")
    total = stats.get("total", 0)
    if total == 0:
        print(" No training data found.")
        print(f"{'='*60}\n")
        return
    print(f"\n Total pairs: {total}")
    print("\n Coverage:")
    print(f"   source_session_id: {stats['with_session_id']:>6} ({stats['coverage_session_pct']}%)")
    print(f"   model:             {stats['with_model']:>6} ({stats['coverage_model_pct']}%)")
    print(f"   timestamp:         {stats['with_timestamp']:>6} ({stats['coverage_timestamp_pct']}%)")
    print("\n By Model:")
    for model, count in sorted(stats["by_model"].items(), key=lambda x: -x[1]):
        bar = "█" * (count * 40 // total)  # proportional bar, 40 chars at 100%
        print(f"   {model:<25} {count:>6} {bar}")
    print("\n By Source:")
    for source, count in sorted(stats["by_source"].items(), key=lambda x: -x[1]):
        print(f"   {source:<20} {count:>6}")
    print("\n Top Files:")
    for fname, count in list(stats["by_file"].items())[:10]:
        short = fname[-40:] if len(fname) > 40 else fname
        print(f"   {short:<42} {count:>6}")
    print(f"{'='*60}\n")


def main():
    parser = argparse.ArgumentParser(description="Provenance Dashboard")
    parser.add_argument("--json", action="store_true", help="JSON output")
    args = parser.parse_args()
    pairs = scan_training_data()
    stats = compute_stats(pairs)
    if args.json:
        # stats only contains aggregated fields, so the internal
        # _source_file tag never leaks into the JSON output
        print(json.dumps(stats, indent=2))
    else:
        render_dashboard(stats)


if __name__ == "__main__":
    main()
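
# Example JSON output shape (keys per compute_stats; counts hypothetical):
#   $ python3 provenance_dashboard.py --json
#   {"total": 1234, "with_session_id": 1100, "with_model": 1200, ...}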

provenance_validate.py

@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""
provenance_validate.py — Validate provenance metadata on training data.
Checks:
- All pairs have required provenance fields
- source_session_id, model, timestamp present
- Coverage report by model and source
Usage:
python3 provenance_validate.py training-data/*.jsonl
python3 provenance_validate.py --threshold 80 training-data/*.jsonl
"""
import argparse
import json
import sys
from pathlib import Path
from typing import List
REQUIRED_FIELDS = ["source_session_id", "model", "timestamp"]
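
# A pair passes only if every required field is present AND truthy, e.g.
#   {"source_session_id": "sess-1", "model": "m1", "timestamp": "2026-04-21"}
# passes, while {"model": "", ...} counts as missing "model".
# (values here are hypothetical)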

def validate_file(filepath: str) -> dict:
    """Validate provenance on a single JSONL file."""
    pairs = []
    with open(filepath) as f:
        for line in f:
            if line.strip():
                # note: a malformed JSON line raises here and aborts the run
                pairs.append(json.loads(line))
    total = len(pairs)
    with_provenance = 0
    missing_by_field = {f: 0 for f in REQUIRED_FIELDS}
    by_model = {}
    by_source = {}
    for pair in pairs:
        has_all = True
        for field in REQUIRED_FIELDS:
            if field not in pair or not pair[field]:
                missing_by_field[field] += 1
                has_all = False
        if has_all:
            with_provenance += 1
        model = pair.get("model", "unknown")
        source = pair.get("source_type", pair.get("source", "unknown"))
        by_model[model] = by_model.get(model, 0) + 1
        by_source[source] = by_source.get(source, 0) + 1
    coverage = (with_provenance / total * 100) if total > 0 else 0
    return {
        "file": str(filepath),
        "total": total,
        "with_provenance": with_provenance,
        "coverage_pct": round(coverage, 1),
        "missing_by_field": missing_by_field,
        "by_model": by_model,
        "by_source": by_source,
    }


def validate_all(files: List[str], threshold: float = 0) -> dict:
    """Validate provenance across multiple files."""
    results = []
    total_pairs = 0
    total_with_prov = 0
    for filepath in files:
        result = validate_file(filepath)
        results.append(result)
        total_pairs += result["total"]
        total_with_prov += result["with_provenance"]
    overall_coverage = (total_with_prov / total_pairs * 100) if total_pairs > 0 else 0
    return {
        "files": results,
        "total_pairs": total_pairs,
        "total_with_provenance": total_with_prov,
        "overall_coverage_pct": round(overall_coverage, 1),
        "passes_threshold": overall_coverage >= threshold,
    }
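
# e.g. validate_all(files, threshold=80.0)["passes_threshold"] is True only
# when overall coverage is >= 80% (zero pairs counts as 0% coverage).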


def main():
    parser = argparse.ArgumentParser(description="Validate training data provenance")
    parser.add_argument("files", nargs="+", help="JSONL files to validate")
    parser.add_argument("--threshold", type=float, default=0,
                        help="Minimum coverage percentage to pass")
    parser.add_argument("--json", action="store_true", help="JSON output")
    args = parser.parse_args()
    # Expand globs (for shells that pass the pattern through unexpanded)
    files = []
    for pattern in args.files:
        expanded = list(Path(".").glob(pattern)) if "*" in pattern else [Path(pattern)]
        files.extend(str(f) for f in expanded if f.exists())
    if not files:
        print("No files found", file=sys.stderr)
        sys.exit(1)
    result = validate_all(files, args.threshold)
    if args.json:
        print(json.dumps(result, indent=2))
    else:
        print(f"\n{'='*50}")
        print(" PROVENANCE VALIDATION REPORT")
        print(f"{'='*50}")
        print(f" Total pairs:     {result['total_pairs']}")
        print(f" With provenance: {result['total_with_provenance']}")
        print(f" Coverage:        {result['overall_coverage_pct']}%")
        if args.threshold > 0:
            status = "PASS" if result["passes_threshold"] else "FAIL"
            print(f" Threshold:       {args.threshold}% [{status}]")
        print("\n Per file:")
        for f in result["files"]:
            icon = "✓" if f["coverage_pct"] >= args.threshold else "✗"
            print(f"   {icon} {f['file']}: {f['coverage_pct']}% ({f['with_provenance']}/{f['total']})")
        print(f"{'='*50}\n")
    # Exit nonzero so CI fails when coverage is below the threshold
    if args.threshold > 0 and not result["passes_threshold"]:
        sys.exit(1)
    sys.exit(0)


if __name__ == "__main__":
    main()
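
# Typical CI gate (assumed paths): fail the job when provenance coverage
# drops below 80%:
#   python3 provenance_validate.py --threshold 80 training-data/*.jsonl
#   # exit status 1 on FAIL, 0 on PASS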