Files
timmy-config/training/provenance_dashboard.py

135 lines
4.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
Provenance Dashboard
Shows statistics about training data provenance:
- Pair count by model over time
- Pair count by source
- Exclusion statistics
- Provenance coverage
Usage:
python provenance_dashboard.py data/merged_training_data.jsonl
python provenance_dashboard.py data/ --all
"""
import argparse
import json
import sys
from pathlib import Path
from collections import defaultdict
from datetime import datetime
try:
from training_pair_provenance import get_provenance_stats
except ImportError:
sys.path.insert(0, str(Path(__file__).parent))
from training_pair_provenance import get_provenance_stats
def load_jsonl(path: Path) -> list[dict]:
    """Load a JSONL file into a list of dicts.

    Blank lines are skipped; every other line must be one JSON document.

    Args:
        path: Path to the .jsonl file.

    Returns:
        Decoded JSON objects, in file order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so parsing doesn't depend on the platform locale encoding.
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
def analyze_temporal(pairs: list[dict]) -> dict:
    """Bucket pairs by the calendar date of their provenance timestamp.

    Args:
        pairs: Training pairs, each optionally carrying a "provenance" dict
            with an ISO-8601 "timestamp".

    Returns:
        Mapping of "YYYY-MM-DD" (or "unknown" when no timestamp) to
        {"total": n, "with_provenance": n} counters.
    """
    buckets: dict = defaultdict(lambda: {"total": 0, "with_provenance": 0})
    for entry in pairs:
        meta = entry.get("provenance", {})
        stamp = meta.get("timestamp", "")
        # First 10 chars of an ISO timestamp are the date portion.
        day = stamp[:10] if stamp else "unknown"
        bucket = buckets[day]
        bucket["total"] += 1
        if meta:
            bucket["with_provenance"] += 1
    return dict(buckets)
def print_dashboard(pairs: list[dict], title: str = "Training Data Provenance"):
    """Print comprehensive provenance dashboard.

    Sections: overall provenance coverage, breakdown by source type,
    breakdown by model (each with a text histogram bar), the 10 most
    recent dates, and quality indicators.

    Args:
        pairs: Training pairs, each optionally carrying a "provenance" dict.
        title: Heading printed at the top of the dashboard.
    """
    stats = get_provenance_stats(pairs)
    temporal = analyze_temporal(pairs)
    print(f"\n{'=' * 60}")
    print(f" {title}")
    print(f"{'=' * 60}")
    # Coverage
    print(f"\nPROVENANCE COVERAGE")
    print(f" Total pairs: {stats['total_pairs']}")
    print(f" With provenance: {stats['with_provenance']}")
    print(f" Coverage: {stats['coverage_pct']}%")
    print(f" Excluded: {stats['excluded']}")
    # By source type
    print(f"\nBY SOURCE TYPE")
    for st, count in sorted(stats["by_source_type"].items()):
        pct = round(count / stats['total_pairs'] * 100, 1) if stats['total_pairs'] else 0
        # BUG FIX: bar glyph was lost in transit ("" * n is always empty);
        # restore the block character so the histogram actually renders.
        # One glyph per 2 percentage points, so 100% -> 50-char bar.
        bar = "█" * int(pct / 2)
        print(f" {st:15} {count:6} ({pct:5.1f}%) {bar}")
    # By model
    print(f"\nBY MODEL")
    for model, count in sorted(stats["by_model"].items()):
        pct = round(count / stats['total_pairs'] * 100, 1) if stats['total_pairs'] else 0
        # Same fix as above: restore the lost bar glyph.
        bar = "█" * int(pct / 2)
        print(f" {model:20} {count:6} ({pct:5.1f}%) {bar}")
    # Temporal
    if temporal:
        print(f"\nBY DATE (last 10)")
        sorted_dates = sorted(temporal.keys())[-10:]
        for date in sorted_dates:
            d = temporal[date]
            print(f" {date} total={d['total']:4} provenance={d['with_provenance']:4}")
    # Quality indicators
    print(f"\nQUALITY INDICATORS")
    if stats['coverage_pct'] == 100:
        print(" ✓ Full provenance coverage")
    elif stats['coverage_pct'] >= 90:
        print(" ⚠ Near-full coverage — some pairs missing provenance")
    else:
        print(" ✗ Low coverage — many pairs missing provenance")
    if stats['excluded'] > 0:
        print(f" {stats['excluded']} pairs excluded from training")
    print()
def main():
    """CLI entry point: render the dashboard for one file, or every JSONL
    in a directory when --all is given; exit 1 on an unusable path."""
    parser = argparse.ArgumentParser(description="Provenance Dashboard")
    parser.add_argument("input", type=str, help="Path to JSONL or directory")
    parser.add_argument("--all", action="store_true", help="Process all JSONL in directory")
    opts = parser.parse_args()
    target = Path(opts.input)
    if target.is_file():
        print_dashboard(load_jsonl(target), f"Provenance: {target.name}")
        return
    if target.is_dir() and opts.all:
        # One dashboard per file, in deterministic (sorted) order.
        for candidate in sorted(target.glob("*.jsonl")):
            print_dashboard(load_jsonl(candidate), f"Provenance: {candidate.name}")
        return
    print(f"ERROR: {target} is not a file (use --all for directories)")
    sys.exit(1)
if __name__ == "__main__":
main()