Files
timmy-config/training/provenance_dashboard.py

135 lines
4.0 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Provenance Dashboard
Shows statistics about training data provenance:
- Pair count by model over time
- Pair count by source
- Exclusion statistics
- Provenance coverage
Usage:
python provenance_dashboard.py data/merged_training_data.jsonl
python provenance_dashboard.py data/ --all
"""
import argparse
import json
import sys
from pathlib import Path
from collections import defaultdict
from datetime import datetime
try:
from training_pair_provenance import get_provenance_stats
except ImportError:
sys.path.insert(0, str(Path(__file__).parent))
from training_pair_provenance import get_provenance_stats
def load_jsonl(path: Path) -> list[dict]:
    """Load a JSONL file into a list of dicts.

    Args:
        path: Path to a JSONL file (one JSON object per line).

    Returns:
        List of parsed entries; blank lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    entries = []
    # Explicit encoding: JSONL is UTF-8 by convention. Without it, open()
    # uses the platform locale encoding and non-ASCII data can fail to
    # decode (or decode wrongly) on e.g. Windows.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                entries.append(json.loads(line))
    return entries
def analyze_temporal(pairs: list[dict]) -> dict:
    """Group training pairs by the calendar date of their provenance timestamp.

    Pairs whose provenance lacks a timestamp are bucketed under "unknown".
    Each bucket tracks the total pair count and how many of those pairs
    carry any provenance record at all.
    """
    buckets: dict[str, dict] = {}
    for entry in pairs:
        provenance = entry.get("provenance", {})
        timestamp = provenance.get("timestamp", "")
        # The first 10 chars of an ISO-8601 timestamp are the YYYY-MM-DD date.
        day = timestamp[:10] if timestamp else "unknown"
        bucket = buckets.setdefault(day, {"total": 0, "with_provenance": 0})
        bucket["total"] += 1
        if provenance:
            bucket["with_provenance"] += 1
    return buckets
def print_dashboard(pairs: list[dict], title: str = "Training Data Provenance"):
    """Print a comprehensive provenance dashboard to stdout.

    Sections: coverage summary, breakdown by source type, breakdown by
    model, per-date counts (last 10 dates), and quality indicators.

    Args:
        pairs: Training pairs, each optionally carrying a "provenance" dict.
        title: Heading printed at the top of the dashboard.
    """
    stats = get_provenance_stats(pairs)
    temporal = analyze_temporal(pairs)
    print(f"\n{'=' * 60}")
    print(f" {title}")
    print(f"{'=' * 60}")
    # Coverage
    print(f"\nPROVENANCE COVERAGE")
    print(f"  Total pairs: {stats['total_pairs']}")
    print(f"  With provenance: {stats['with_provenance']}")
    print(f"  Coverage: {stats['coverage_pct']}%")
    print(f"  Excluded: {stats['excluded']}")
    # By source type
    print(f"\nBY SOURCE TYPE")
    for st, count in sorted(stats["by_source_type"].items()):
        # Guard against division by zero on an empty dataset.
        pct = round(count / stats['total_pairs'] * 100, 1) if stats['total_pairs'] else 0
        # Fix: the bar glyph had been lost ("" * n is always empty); one
        # block character per 2% gives a 50-char-max histogram bar.
        bar = "█" * int(pct / 2)
        print(f"  {st:15} {count:6} ({pct:5.1f}%) {bar}")
    # By model
    print(f"\nBY MODEL")
    for model, count in sorted(stats["by_model"].items()):
        pct = round(count / stats['total_pairs'] * 100, 1) if stats['total_pairs'] else 0
        bar = "█" * int(pct / 2)
        print(f"  {model:20} {count:6} ({pct:5.1f}%) {bar}")
    # Temporal
    if temporal:
        print(f"\nBY DATE (last 10)")
        sorted_dates = sorted(temporal.keys())[-10:]
        for date in sorted_dates:
            d = temporal[date]
            print(f"  {date}  total={d['total']:4}  provenance={d['with_provenance']:4}")
    # Quality indicators
    print(f"\nQUALITY INDICATORS")
    if stats['coverage_pct'] == 100:
        print("  ✓ Full provenance coverage")
    elif stats['coverage_pct'] >= 90:
        print("  ⚠ Near-full coverage — some pairs missing provenance")
    else:
        print("  ✗ Low coverage — many pairs missing provenance")
    if stats['excluded'] > 0:
        print(f"  {stats['excluded']} pairs excluded from training")
    print()
def main():
    """CLI entry point: render a dashboard for one JSONL file, or for every
    JSONL file in a directory when --all is given."""
    parser = argparse.ArgumentParser(description="Provenance Dashboard")
    parser.add_argument("input", type=str, help="Path to JSONL or directory")
    parser.add_argument("--all", action="store_true", help="Process all JSONL in directory")
    args = parser.parse_args()

    target = Path(args.input)
    if target.is_file():
        print_dashboard(load_jsonl(target), f"Provenance: {target.name}")
    elif target.is_dir() and args.all:
        # Sorted so repeated runs report files in a stable order.
        for candidate in sorted(target.glob("*.jsonl")):
            print_dashboard(load_jsonl(candidate), f"Provenance: {candidate.name}")
    else:
        print(f"ERROR: {target} is not a file (use --all for directories)")
        sys.exit(1)


if __name__ == "__main__":
    main()