135 lines
4.0 KiB
Python
135 lines
4.0 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Provenance Dashboard

Shows statistics about training data provenance:
- Pair count by model over time
- Pair count by source
- Exclusion statistics
- Provenance coverage

Usage:
    python provenance_dashboard.py data/merged_training_data.jsonl
    python provenance_dashboard.py data/ --all
|
||
"""
|
||
|
||
import argparse
|
||
import json
|
||
import sys
|
||
from pathlib import Path
|
||
from collections import defaultdict
|
||
from datetime import datetime
|
||
|
||
try:
|
||
from training_pair_provenance import get_provenance_stats
|
||
except ImportError:
|
||
sys.path.insert(0, str(Path(__file__).parent))
|
||
from training_pair_provenance import get_provenance_stats
|
||
|
||
|
||
def load_jsonl(path: Path) -> list[dict]:
    """Load a JSONL file into a list of parsed records.

    Every line is stripped of surrounding whitespace; lines that are empty
    after stripping are skipped, and each remaining line is parsed with
    ``json.loads``.

    Args:
        path: Location of the JSONL file to read.

    Returns:
        The parsed value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding (which is what a bare open() would use).
    with open(path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [json.loads(line) for line in stripped_lines if line]
|
||
|
||
|
||
def analyze_temporal(pairs: list[dict]) -> dict:
    """Count pairs per date, based on each pair's provenance timestamp.

    The date key is the first ten characters of
    ``pair["provenance"]["timestamp"]`` (the ``YYYY-MM-DD`` prefix of the
    timestamp string). Pairs whose provenance is missing or ``None``, or
    whose timestamp is missing, empty, or not a string, are counted under
    the key ``"unknown"``.

    Args:
        pairs: Training-pair records; each is a dict that may carry a
            ``"provenance"`` dict.

    Returns:
        A plain dict mapping each date key to
        ``{"total": int, "with_provenance": int}``, where ``total`` counts
        every pair for that key and ``with_provenance`` counts those whose
        provenance is non-empty.
    """
    by_date = defaultdict(lambda: {"total": 0, "with_provenance": 0})

    for pair in pairs:
        # ``or {}`` also covers an explicit ``"provenance": null`` record,
        # which ``.get("provenance", {})`` would hand back as None and then
        # crash on the ``.get`` below.
        prov = pair.get("provenance") or {}
        ts = prov.get("timestamp")

        # Only a non-empty string can be sliced down to a date prefix;
        # anything else (None, numbers, ...) is bucketed as unknown.
        date = ts[:10] if isinstance(ts, str) and ts else "unknown"

        bucket = by_date[date]
        bucket["total"] += 1
        if prov:
            bucket["with_provenance"] += 1

    return dict(by_date)
|
||
|
||
|
||
def print_dashboard(pairs: list[dict], title: str = "Training Data Provenance"):
    """Write the provenance report for ``pairs`` to standard output.

    The report has a title banner followed by these sections: coverage
    totals, a breakdown by source type, a breakdown by model, per-date
    counts for the last ten date keys (in sorted order), and quality
    indicators derived from the coverage percentage and exclusion count.

    Args:
        pairs: Training-pair records to summarise.
        title: Heading shown inside the banner.

    The statistics come from ``get_provenance_stats(pairs)``; this function
    reads its ``total_pairs``, ``with_provenance``, ``coverage_pct``,
    ``excluded``, ``by_source_type`` and ``by_model`` entries. Per-date
    counts come from ``analyze_temporal(pairs)``.
    """
    stats = get_provenance_stats(pairs)
    temporal = analyze_temporal(pairs)

    # Title banner.
    rule = "=" * 60
    print(f"\n{rule}")
    print(f" {title}")
    print(rule)

    # Coverage totals.
    print("\nPROVENANCE COVERAGE")
    total = stats["total_pairs"]
    print(f" Total pairs: {total}")
    print(f" With provenance: {stats['with_provenance']}")
    print(f" Coverage: {stats['coverage_pct']}%")
    print(f" Excluded: {stats['excluded']}")

    # The two breakdowns differ only in heading, stats key and label width.
    breakdowns = (
        ("\nBY SOURCE TYPE", "by_source_type", 15),
        ("\nBY MODEL", "by_model", 20),
    )
    for heading, key, width in breakdowns:
        print(heading)
        for label, count in sorted(stats[key].items()):
            if total:
                pct = round(count / total * 100, 1)
            else:
                pct = 0
            # One bar character per two percentage points.
            bar = "█" * int(pct / 2)
            print(f" {label:{width}} {count:6} ({pct:5.1f}%) {bar}")

    # Per-date counts, limited to the ten greatest keys in sort order.
    if temporal:
        print("\nBY DATE (last 10)")
        for date in sorted(temporal)[-10:]:
            counts = temporal[date]
            print(f" {date} total={counts['total']:4} provenance={counts['with_provenance']:4}")

    # Quality indicators.
    print("\nQUALITY INDICATORS")
    coverage = stats["coverage_pct"]
    if coverage == 100:
        verdict = " ✓ Full provenance coverage"
    elif coverage >= 90:
        verdict = " ⚠ Near-full coverage — some pairs missing provenance"
    else:
        verdict = " ✗ Low coverage — many pairs missing provenance"
    print(verdict)

    excluded = stats["excluded"]
    if excluded > 0:
        print(f" ℹ {excluded} pairs excluded from training")

    print()
|
||
|
||
|
||
def main():
    """Command-line entry point: print a dashboard for a file or directory.

    Parses ``input`` (a path) and the ``--all`` flag from the command line.

    * If ``input`` is a file, it is loaded as JSONL and one dashboard is
      printed for it.
    * If ``input`` is a directory and ``--all`` is given, one dashboard is
      printed for each ``*.jsonl`` file directly inside it, in sorted
      order. A warning is written to stderr when there are none.
    * Otherwise an error is written to stderr and the process exits with
      status 1.
    """
    parser = argparse.ArgumentParser(description="Provenance Dashboard")
    parser.add_argument("input", type=str, help="Path to JSONL or directory")
    parser.add_argument("--all", action="store_true", help="Process all JSONL in directory")
    args = parser.parse_args()

    input_path = Path(args.input)

    if input_path.is_file():
        pairs = load_jsonl(input_path)
        print_dashboard(pairs, f"Provenance: {input_path.name}")
        return

    if input_path.is_dir() and args.all:
        jsonl_files = sorted(input_path.glob("*.jsonl"))
        if not jsonl_files:
            # Make an empty run visible instead of exiting silently.
            print(f"WARNING: no .jsonl files found in {input_path}", file=sys.stderr)
        for jsonl_file in jsonl_files:
            pairs = load_jsonl(jsonl_file)
            print_dashboard(pairs, f"Provenance: {jsonl_file.name}")
        return

    # Diagnostics go to stderr so they never mix with dashboard output on
    # stdout (e.g. when the report is piped or redirected).
    if not input_path.exists():
        message = f"ERROR: {input_path} does not exist"
    else:
        message = f"ERROR: {input_path} is not a file (use --all for directories)"
    print(message, file=sys.stderr)
    sys.exit(1)
|
||
|
||
|
||
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|