135 lines
4.0 KiB
Python
135 lines
4.0 KiB
Python
|
|
#!/usr/bin/env python3
"""
Provenance Dashboard

Shows statistics about training data provenance:
- Pair count by model over time
- Pair count by source
- Exclusion statistics
- Provenance coverage

Usage:
    python provenance_dashboard.py data/merged_training_data.jsonl
    python provenance_dashboard.py data/ --all
"""
|
|||
|
|
|
|||
|
|
import argparse
|
|||
|
|
import json
|
|||
|
|
import sys
|
|||
|
|
from pathlib import Path
|
|||
|
|
from collections import defaultdict
|
|||
|
|
from datetime import datetime
|
|||
|
|
|
|||
|
|
try:
|
|||
|
|
from training_pair_provenance import get_provenance_stats
|
|||
|
|
except ImportError:
|
|||
|
|
sys.path.insert(0, str(Path(__file__).parent))
|
|||
|
|
from training_pair_provenance import get_provenance_stats
|
|||
|
|
|
|||
|
|
|
|||
|
|
def load_jsonl(path: Path) -> list[dict]:
    """Load a JSON-Lines file, skipping blank lines.

    Args:
        path: Path to a ``.jsonl`` file (one JSON object per line).

    Returns:
        Parsed objects in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8: JSON is UTF-8 by spec, and relying on the platform
    # default locale encoding breaks on Windows.
    with open(path, encoding="utf-8") as f:
        # json.loads tolerates surrounding whitespace, so only the
        # blank-line check needs the stripped form.
        return [json.loads(line) for line in f if line.strip()]
|
|||
|
|
|
|||
|
|
|
|||
|
|
def analyze_temporal(pairs: list[dict]) -> dict:
    """Bucket training pairs by provenance date.

    Args:
        pairs: Training pairs; each may carry a "provenance" dict with an
            ISO-8601 "timestamp" field (assumed format — TODO confirm with
            the producer of these records).

    Returns:
        Mapping of date string ("YYYY-MM-DD", or "unknown" when no
        timestamp is present) to ``{"total": int, "with_provenance": int}``.
    """
    buckets = defaultdict(lambda: {"total": 0, "with_provenance": 0})

    for entry in pairs:
        provenance = entry.get("provenance", {})
        stamp = provenance.get("timestamp", "")
        # The first 10 characters of an ISO timestamp are the calendar date.
        key = stamp[:10] if stamp else "unknown"

        bucket = buckets[key]
        bucket["total"] += 1
        if provenance:
            bucket["with_provenance"] += 1

    return dict(buckets)
|
|||
|
|
|
|||
|
|
|
|||
|
|
def print_dashboard(pairs: list[dict], title: str = "Training Data Provenance"):
    """Print a comprehensive provenance dashboard to stdout.

    Sections: coverage totals, histograms by source type and by model,
    per-date counts for the 10 most recent dates, and quality indicators.

    Args:
        pairs: Training pairs as loaded from JSONL.
        title: Heading displayed at the top of the dashboard.
    """
    stats = get_provenance_stats(pairs)
    temporal = analyze_temporal(pairs)

    print(f"\n{'=' * 60}")
    print(f" {title}")
    print(f"{'=' * 60}")

    # Coverage
    print("\nPROVENANCE COVERAGE")
    print(f" Total pairs: {stats['total_pairs']}")
    print(f" With provenance: {stats['with_provenance']}")
    print(f" Coverage: {stats['coverage_pct']}%")
    print(f" Excluded: {stats['excluded']}")

    # By source type (15-char label column)
    print("\nBY SOURCE TYPE")
    _print_histogram(stats["by_source_type"], stats["total_pairs"], 15)

    # By model (20-char label column)
    print("\nBY MODEL")
    _print_histogram(stats["by_model"], stats["total_pairs"], 20)

    # Temporal: only the 10 most recent dates, to keep output compact
    if temporal:
        print("\nBY DATE (last 10)")
        for date in sorted(temporal.keys())[-10:]:
            d = temporal[date]
            print(f" {date} total={d['total']:4} provenance={d['with_provenance']:4}")

    # Quality indicators
    print("\nQUALITY INDICATORS")
    if stats['coverage_pct'] == 100:
        print(" ✓ Full provenance coverage")
    elif stats['coverage_pct'] >= 90:
        print(" ⚠ Near-full coverage — some pairs missing provenance")
    else:
        print(" ✗ Low coverage — many pairs missing provenance")

    if stats['excluded'] > 0:
        print(f" ℹ {stats['excluded']} pairs excluded from training")

    print()


def _print_histogram(counts: dict, total: int, width: int) -> None:
    """Print one 'name count (pct%) bar' line per key, sorted by key.

    The bar renders one block character per 2 percentage points. ``total``
    of zero yields 0% for every row rather than dividing by zero.
    """
    for name, count in sorted(counts.items()):
        pct = round(count / total * 100, 1) if total else 0
        bar = "█" * int(pct / 2)
        print(f" {name:{width}} {count:6} ({pct:5.1f}%) {bar}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """Parse CLI arguments and render dashboards for the selected input.

    Accepts either a single JSONL file, or a directory together with
    ``--all`` to process every ``*.jsonl`` file inside it. Exits with
    status 1 on any other combination.
    """
    parser = argparse.ArgumentParser(description="Provenance Dashboard")
    parser.add_argument("input", type=str, help="Path to JSONL or directory")
    parser.add_argument("--all", action="store_true", help="Process all JSONL in directory")
    args = parser.parse_args()

    target = Path(args.input)

    # Single-file mode.
    if target.is_file():
        print_dashboard(load_jsonl(target), f"Provenance: {target.name}")
        return

    # Directory mode: one dashboard per JSONL file, in sorted order.
    if target.is_dir() and args.all:
        for candidate in sorted(target.glob("*.jsonl")):
            print_dashboard(load_jsonl(candidate), f"Provenance: {candidate.name}")
        return

    print(f"ERROR: {target} is not a file (use --all for directories)")
    sys.exit(1)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Entry-point guard: run the CLI only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
|