Compare commits

..

1 Commits

Author SHA1 Message Date
Alexander Payne
646588a5c8 [BURN #197] feat: provenance chain — add source_session, source_model, source_provider, timestamp to facts
Some checks failed
Test / pytest (pull_request) Failing after 30s
- Adds schemas/provenance.json defining provenance object schema
- Updates harvester.py write_knowledge() to attach provenance to every fact
- Adds extract_provider() helper to infer provider from API base URL
- Updates SCHEMA.md documenting provenance field and object
- Provenance fields: source_session, source_model, source_provider,
  timestamp (harvested_at), extraction_method, confidence, verified
- Does NOT retroactively modify existing facts in index.json
Closes #197
2026-04-25 20:56:13 -04:00
4 changed files with 139 additions and 483 deletions

View File

@@ -43,9 +43,26 @@ The harvester writes to both. The bootstrapper reads from index.json. Humans edi
| `last_confirmed` | date | no | ISO-8601 date last seen in a session |
| `expires` | date | no | Optional. After this date, fact is stale |
| `related` | string[] | no | IDs of related facts |
| `provenance` | object | no | Provenance metadata — see Provenance Object section below |
### ID Format: `{domain}:{category}:{sequence}`
### Provenance Object
Every fact may include a [`provenance`](#fact-object) field that tracks its origin.
| Field | Type | Required | Description |
|-------|------|----------|-------------|
| `source_session` | string | yes | Session ID / file path where this fact was extracted |
| `source_model` | string | yes | Model name used for extraction (e.g., `xiaomi/mimo-v2-pro`) |
| `source_provider` | string | yes | Provider name (`nous`, `openrouter`, `anthropic`, `openai`, etc.) |
| `timestamp` | date-time | yes | Extraction timestamp (ISO-8601 UTC) |
| `extraction_method` | enum | yes | `llm_extraction`, `manual`, or `retroactive_harvest` |
| `confidence` | float | yes | Confidence at extraction time (0.01.0) |
| `verified` | boolean | yes | `true` if fact has been manually reviewed, else `false` |
### Categories
| Category | Definition |
@@ -85,6 +102,35 @@ knowledge/
└── {agent-type}.yaml
```
### Provenance Object (added via `write_knowledge()` and harvester)
```json
{
"source_session": "string — session ID or file path",
"source_model": "string — model used for extraction",
"source_provider": "string — provider name (nous, openrouter, etc.)",
"timestamp": "string — ISO-8601 UTC extraction time",
"extraction_method": "string — llm_extraction|manual|retroactive_harvest",
"confidence": "float — 0.01.0 confidence from extraction",
"verified": "boolean — whether fact has been manually verified"
}
```
The `provenance` field is attached to every fact harvested via `write_knowledge()`. It provides traceability: which session produced this fact, which model/provider extracted it, when, and with what confidence.
| Provenance Field | Type | Required | Description |
|------------------|------|----------|-------------|
| `source_session` | string | yes | Session ID / file path where extracted |
| `source_model` | string | yes | Model name (e.g., `xiaomi/mimo-v2-pro`) |
| `source_provider` | string | yes | Provider (`nous`, `openrouter`, `anthropic`, `openai`) |
| `timestamp` | date-time | yes | Extraction timestamp (ISO-8601) |
| `extraction_method` | enum | yes | `llm_extraction`, `manual`, or `retroactive_harvest` |
| `confidence` | float | yes | Confidence score (0.01.0) at extraction time |
| `verified` | boolean | yes | `true` if manually reviewed, else `false` |
## YAML File Format
YAML files use frontmatter for metadata, then markdown sections with fact entries:

52
schemas/provenance.json Normal file
View File

@@ -0,0 +1,52 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Knowledge Provenance",
"description": "Provenance metadata attached to every knowledge fact",
"type": "object",
"required": [
"source_session",
"source_model",
"source_provider",
"timestamp"
],
"properties": {
"source_session": {
"type": "string",
"description": "Session ID or file path where this fact was extracted"
},
"source_model": {
"type": "string",
"description": "Model used for extraction (e.g., 'xiaomi/mimo-v2-pro')"
},
"source_provider": {
"type": "string",
"description": "Provider name (nous, openrouter, anthropic, etc.)"
},
"timestamp": {
"type": "string",
"format": "date-time",
"description": "UTC ISO-8601 timestamp when this fact was extracted"
},
"extraction_method": {
"type": "string",
"description": "How the fact was extracted (llm_extraction, manual, retroactive_harvest)",
"enum": [
"llm_extraction",
"manual",
"retroactive_harvest"
],
"default": "llm_extraction"
},
"confidence": {
"type": "number",
"minimum": 0,
"maximum": 1,
"description": "Confidence assigned during extraction (copied from top-level fact)"
},
"verified": {
"type": "boolean",
"description": "Whether this fact has been manually verified",
"default": false
}
}
}

View File

@@ -27,6 +27,22 @@ sys.path.insert(0, str(SCRIPT_DIR))
from session_reader import read_session, extract_conversation, truncate_for_context, messages_to_text
def extract_provider(api_base: str) -> str:
"""Infer provider name from API base URL."""
url = api_base.lower()
if 'nousresearch' in url or 'nous' in url:
return 'nous'
if 'openrouter' in url:
return 'openrouter'
if 'anthropic' in url:
return 'anthropic'
if 'openai' in url:
return 'openai'
# Fallback: try to extract hostname
from urllib.parse import urlparse
host = urlparse(api_base).netloc
return host.split('.')[0] if host else 'unknown'
# --- Configuration ---
DEFAULT_API_BASE = os.environ.get("HARVESTER_API_BASE", "https://api.nousresearch.com/v1")
@@ -229,15 +245,34 @@ def validate_fact(fact: dict) -> bool:
return True
def write_knowledge(index: dict, new_facts: list[dict], knowledge_dir: str, source_session: str = ""):
"""Write new facts to the knowledge store."""
def write_knowledge(index: dict, new_facts: list[dict], knowledge_dir: str, source_session: str = "", model: str = "", provider: str = ""):
"""Write new facts to the knowledge store.
Adds provenance metadata to each fact. If model/provider are empty, tries to
infer from environment or defaults.
"""
kdir = Path(knowledge_dir)
kdir.mkdir(parents=True, exist_ok=True)
# Add source tracking to each fact
# Determine model/provider defaults if not provided
model = model or os.environ.get("HARVESTER_MODEL", "xiaomi/mimo-v2-pro")
provider = provider or os.environ.get("HARVESTER_PROVIDER", "nous")
timestamp = datetime.now(timezone.utc).isoformat()
# Add provenance to each fact
for fact in new_facts:
fact['source_session'] = source_session
fact['harvested_at'] = datetime.now(timezone.utc).isoformat()
provenance = {
'source_session': source_session,
'source_model': model,
'source_provider': provider,
'timestamp': timestamp,
'extraction_method': 'llm_extraction',
'confidence': fact.get('confidence', 0.5),
'verified': False
}
fact['provenance'] = provenance
fact['harvested_at'] = timestamp
# Update index
index['facts'].extend(new_facts)
@@ -330,7 +365,7 @@ def harvest_session(session_path: str, knowledge_dir: str, api_base: str, api_ke
# 8. Write (unless dry run)
if new_facts and not dry_run:
write_knowledge(existing_index, new_facts, knowledge_dir, source_session=session_path)
write_knowledge(existing_index, new_facts, knowledge_dir, source_session=session_path, model=model, provider=extract_provider(api_base))
stats['elapsed_seconds'] = round(time.time() - start_time, 2)
return stats

View File

@@ -1,477 +0,0 @@
#!/usr/bin/env python3
"""
Progress Tracker — Pipeline 10.8
Track improvement metrics over time. Are we getting better?
Metrics tracked:
1. Test coverage — % of Python functions with associated tests (test:source file ratio + line coverage if available)
2. Doc coverage — % of Python callables with docstrings (AST-based)
3. Issue close rate — closed / (opened + closed) per week (Gitea API)
4. Dep freshness — % of requirements pinned vs outdated (pip list --outdated)
Output:
- metrics/snapshots/YYYY-MM-DD.json — one snapshot per run
- metrics/TRENDS.md — cumulative markdown table
- stdout summary
Usage:
python3 scripts/progress_tracker.py
python3 scripts/progress_tracker.py --json
python3 scripts/progress_tracker.py --output metrics/TRENDS.md
Weekly cron:
0 9 * * 1 cd /path/to/compounding-intelligence && python3 scripts/progress_tracker.py
"""
import argparse
import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from datetime import datetime, timezone, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# ── Configuration ──────────────────────────────────────────────────────────
SCRIPT_DIR = Path(__file__).resolve().parent
REPO_ROOT = SCRIPT_DIR.parent
METRICS_DIR = REPO_ROOT / "metrics"
SNAPSHOTS_DIR = METRICS_DIR / "snapshots"
TOKEN_PATH = Path.home() / ".config" / "gitea" / "token"
GITEA_API_BASE = "https://forge.alexanderwhitestone.com/api/v1"
ORG = "Timmy_Foundation"
# Ensure paths exist
SNAPSHOTS_DIR.mkdir(parents=True, exist_ok=True)
# ── Helpers ─────────────────────────────────────────────────────────────────
def run_cmd(cmd: List[str], cwd: Path = REPO_ROOT) -> str:
"""Run a shell command and return stdout (stderr merged)."""
result = subprocess.run(
cmd, capture_output=True, text=True, cwd=cwd, timeout=30
)
if result.returncode != 0:
return ""
return result.stdout.strip()
def slugify_date(dt: datetime) -> str:
return dt.strftime("%Y-%m-%d")
def snapshot_path(dt: datetime) -> Path:
return SNAPSHOTS_DIR / f"{slugify_date(dt)}.json"
def load_snapshots() -> List[Dict[str, Any]]:
"""Load all existing snapshots sorted by date."""
snapshots = []
for f in sorted(SNAPSHOTS_DIR.glob("*.json")):
try:
with open(f) as fp:
snapshots.append(json.load(fp))
except Exception:
continue
return snapshots
# ── Metric 1: Test Coverage ─────────────────────────────────────────────────
def collect_test_coverage() -> Dict[str, Any]:
"""
Compute test coverage metrics.
Counts test_*.py and *_test.py files vs non-test .py source files.
Also attempts to read .coverage if present.
"""
all_py = list(REPO_ROOT.rglob("*.py"))
source_files = []
test_files = []
for p in all_py:
try:
rel_parts = p.relative_to(REPO_ROOT).parts
except ValueError:
continue
# Skip hidden/cache/temp dirs (check only relative parts)
if any(part.startswith('.') or part.startswith('__') for part in rel_parts):
continue
if any(part in ('node_modules', 'venv', '.venv', 'env', '.pytest_cache') for part in rel_parts):
continue
if p.name.startswith("test_") or p.name.endswith("_test.py"):
test_files.append(p)
else:
source_files.append(p)
# Try to get line coverage from .coverage
coverage_percent = None
coverage_tool = None
coverage_file = REPO_ROOT / ".coverage"
if coverage_file.exists():
try:
import coverage # type: ignore
# Use coverage API if available
cov = coverage.Coverage(data_file=str(coverage_file))
cov.load()
total = cov.report()
coverage_percent = total if isinstance(total, float) else None
coverage_tool = "coverage"
except Exception:
# Fallback: parse `coverage report` output
out = run_cmd(["coverage", "report", "--skip-empty"])
if out:
for line in out.splitlines():
if "TOTAL" in line:
parts = line.split()
if len(parts) >= 2:
try:
coverage_percent = float(parts[-1].rstrip('%'))
coverage_tool = "coverage"
break
except ValueError:
pass
return {
"test_files": len(test_files),
"source_files": len(source_files),
"test_to_source_ratio": round(len(test_files) / len(source_files), 4) if source_files else 0.0,
"coverage_tool": coverage_tool,
"coverage_percent": coverage_percent,
}
# ── Metric 2: Doc Coverage ──────────────────────────────────────────────────
def collect_doc_coverage() -> Dict[str, Any]:
"""
Check AST of Python files for docstrings.
Returns: callables_total, callables_with_doc, doc_coverage_percent
"""
import ast
all_py = list(REPO_ROOT.rglob("*.py"))
source_files = []
test_files = []
for p in all_py:
try:
rel_parts = p.relative_to(REPO_ROOT).parts
except ValueError:
continue
if any(part.startswith('.') or part.startswith('__') for part in rel_parts):
continue
if any(part in ('node_modules', 'venv', '.venv', 'env', '.pytest_cache') for part in rel_parts):
continue
if p.name.startswith("test_") or p.name.endswith("_test.py"):
test_files.append(p)
else:
source_files.append(p)
total_callables = 0
with_doc = 0
for p in source_files + test_files:
try:
with open(p) as f:
tree = ast.parse(f.read(), filename=str(p))
for node in ast.walk(tree):
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
total_callables += 1
doc = ast.get_docstring(node)
if doc and doc.strip():
with_doc += 1
except Exception:
continue
return {
"callables_total": total_callables,
"callables_with_doc": with_doc,
"doc_coverage_percent": round((with_doc / total_callables * 100) if total_callables else 0.0, 2),
}
# ── Metric 3: Issue Close Rate ──────────────────────────────────────────────
def collect_issue_metrics() -> Dict[str, Any]:
"""
Use Gitea API to get issue open/close stats for the last 7 days.
Returns counts and close rate.
"""
token = ""
if TOKEN_PATH.exists():
token = TOKEN_PATH.read_text().strip()
if not token:
return {
"opened_last_7d": None,
"closed_last_7d": None,
"close_rate": None,
"total_open": None,
"note": "Gitea token not available"
}
try:
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError
except ImportError:
return {"error": "urllib not available"}
now = datetime.now(timezone.utc)
week_ago = now - timedelta(days=7)
since = week_ago.strftime("%Y-%m-%d")
headers = {"Authorization": f"token {token}"}
base_url = f"{GITEA_API_BASE}/repos/{ORG}/compounding-intelligence/issues"
try:
# Get issues from last 7 days
url = f"{base_url}?state=all&since={since}&per_page=100"
req = Request(url, headers=headers)
with urlopen(req, timeout=15) as resp:
issues = json.loads(resp.read())
opened = 0
closed = 0
for issue in issues:
created = datetime.fromisoformat(issue["created_at"].replace("Z", "+00:00"))
if created >= week_ago:
opened += 1
if issue.get("state") == "closed":
closed_at_str = issue.get("closed_at")
if closed_at_str:
closed_at = datetime.fromisoformat(closed_at_str.replace("Z", "+00:00"))
if closed_at >= week_ago:
closed += 1
# Total open issues
req2 = Request(f"{base_url}?state=open&per_page=1", headers=headers)
with urlopen(req2, timeout=15) as resp:
total_open = int(resp.headers.get("X-Total-Count", "0"))
total = opened + closed
close_rate = closed / total if total > 0 else 0.0
return {
"opened_last_7d": opened,
"closed_last_7d": closed,
"close_rate": round(close_rate, 4),
"total_open": total_open,
}
except Exception as e:
return {
"opened_last_7d": None,
"closed_last_7d": None,
"close_rate": None,
"total_open": None,
"error": str(e)[:100],
"note": "Gitea API unavailable"
}
# ── Metric 4: Dependency Freshness ─────────────────────────────────────────
def collect_dep_freshness() -> Dict[str, Any]:
"""
Check requirements.txt for outdated dependencies using pip list --outdated.
Returns freshness percentage and outdated list.
"""
req_file = REPO_ROOT / "requirements.txt"
if not req_file.exists():
return {
"total_deps": 0,
"outdated_deps": 0,
"freshness_percent": 100.0,
"outdated_list": [],
"note": "requirements.txt not found"
}
# Parse requirements (very simple: take name before comparison op)
reqs = []
with open(req_file) as f:
for line in f:
line = line.strip()
if not line or line.startswith("#"):
continue
m = re.match(r"^([a-zA-Z0-9_.-]+)", line)
if m:
reqs.append(m.group(1))
if not reqs:
return {"total_deps": 0, "outdated_deps": 0, "freshness_percent": 100.0, "outdated_list": []}
# Query pip for outdated packages (may fail if pip not available)
outdated_names = set()
try:
out = run_cmd(["pip", "list", "--outdated", "--format=json"])
if out:
data = json.loads(out)
outdated_names = {item["name"].lower() for item in data}
except Exception:
pass
outdated = [p for p in reqs if p.lower() in outdated_names]
total = len(reqs)
outdated_count = len(outdated)
freshness = round(((total - outdated_count) / total * 100) if total else 100.0, 1)
return {
"total_deps": total,
"outdated_deps": outdated_count,
"freshness_percent": freshness,
"outdated_list": outdated,
}
# ── Snapshot & Trends ───────────────────────────────────────────────────────
def take_snapshot() -> Dict[str, Any]:
"""Collect all metrics and return a snapshot dict."""
now = datetime.now(timezone.utc)
test_cov = collect_test_coverage()
doc_cov = collect_doc_coverage()
issues = collect_issue_metrics()
deps = collect_dep_freshness()
return {
"timestamp": now.isoformat(),
"date": slugify_date(now),
"metrics": {
"test_coverage": test_cov,
"doc_coverage": doc_cov,
"issues": issues,
"dependencies": deps,
}
}
def save_snapshot(snapshot: Dict[str, Any]) -> Path:
path = snapshot_path(datetime.fromisoformat(snapshot["timestamp"]))
with open(path, "w") as f:
json.dump(snapshot, f, indent=2)
return path
def generate_trends(snapshots: List[Dict[str, Any]], output_path: Optional[Path] = None) -> str:
"""Generate markdown trends table; optionally write to file."""
if not snapshots:
msg = "# Progress Tracker — Trends\n\nNo snapshots yet. Run `progress_tracker.py` to create the first snapshot."
if output_path:
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(msg)
return msg
lines = [
"# Progress Tracker — Trends",
f"\nLast updated: {datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M UTC')}",
f"\nSnapshots: {len(snapshots)}\n",
"| Date | Test Files → Source | Doc Coverage | Issues Closed/Opened (7d) | Dep Freshness |",
"|------|---------------------|--------------|---------------------------|---------------|",
]
for snap in reversed(snapshots): # chronological
date = snap["date"]
m = snap["metrics"]
tc = m["test_coverage"]
test_str = f"{tc['test_files']}/{tc['source_files']} ({tc['test_to_source_ratio']:.2f})"
doc_str = f"{m['doc_coverage']['doc_coverage_percent']:.1f}%"
issues_str = f"{m['issues'].get('closed_last_7d','-')}/{m['issues'].get('opened_last_7d','-')}"
dep_str = f"{m['dependencies'].get('freshness_percent','?')}%"
lines.append(f"| {date} | {test_str} | {doc_str} | {issues_str} | {dep_str} |")
# Current snapshot summary
cur = snapshots[-1]
cm = cur["metrics"]
lines.append(f"\n## Current Snapshot ({cur['date']})\n")
tc = cm["test_coverage"]
cov_line = f"- Test coverage: {tc['coverage_percent']:.1f}% (via {tc['coverage_tool']})\n" if tc["coverage_percent"] else "- Test coverage: (pytest-cov not configured)\n"
lines.append(cov_line)
lines.append(f"- Doc coverage: {cm['doc_coverage']['doc_coverage_percent']:.1f}%")
im = cm["issues"]
if im.get("close_rate") is not None:
lines.append(f"- Issue close rate (7d): {im['close_rate']*100:.1f}% ({im['closed_last_7d']} closed, {im['opened_last_7d']} opened)")
else:
lines.append(f"- Issue metrics: {im.get('note','unavailable')}")
dd = cm["dependencies"]
lines.append(f"- Dep freshness: {dd.get('freshness_percent','?')}% outdated ({dd.get('outdated_deps',0)}/{dd.get('total_deps',0)} deps)")
if dd.get('outdated_list'):
lines.append(f" Outdated: {', '.join(dd['outdated_list'][:5])}")
content = "\n".join(lines) + "\n"
if output_path:
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(content)
return content
# ── Main ─────────────────────────────────────────────────────────────────────
def main() -> int:
parser = argparse.ArgumentParser(description="Progress Tracker — 10.8")
parser.add_argument("--json", action="store_true", help="Emit snapshot as JSON only")
parser.add_argument("--output", type=Path, default=METRICS_DIR / "TRENDS.md",
help="Write trends markdown to this file")
args = parser.parse_args()
snapshot = take_snapshot()
all_snapshots = load_snapshots()
path_written = save_snapshot(snapshot)
if args.json:
print(json.dumps(snapshot, indent=2))
return 0
trends = generate_trends(all_snapshots + [snapshot], output_path=args.output)
# Print current snapshot summary
print(f"Snapshot saved: {path_written}\n")
print(f"Progress Tracker — {snapshot['date']}")
print("=" * 50)
m = snapshot["metrics"]
tc = m["test_coverage"]
print(f"Test files: {tc['test_files']} | Source files: {tc['source_files']} | Ratio: {tc['test_to_source_ratio']:.3f}")
if tc["coverage_percent"] is not None:
print(f"Line coverage: {tc['coverage_percent']:.1f}% (via {tc['coverage_tool']})")
else:
print("Line coverage: (not available — run `pytest --cov`)")
print()
dc = m["doc_coverage"]
print(f"Callables with docstrings: {dc['callables_with_doc']}/{dc['callables_total']} ({dc['doc_coverage_percent']:.1f}%)")
print()
im = m["issues"]
if im.get("close_rate") is not None:
print(f"Issues (7d): {im['closed_last_7d']} closed / {im['opened_last_7d']} opened → close rate: {im['close_rate']*100:.1f}%")
print(f"Total open: {im['total_open']}")
else:
print(f"Issues: {im.get('note','unavailable')}")
print()
dd = m["dependencies"]
print(f"Dependencies: {dd.get('total_deps',0)} total, {dd.get('outdated_deps',0)} outdated")
if dd.get('outdated_list'):
shown = dd['outdated_list'][:5]
print(f"Outdated: {', '.join(shown)}" + ("..." if len(dd['outdated_list']) > 5 else ""))
print(f"\nTrends written to: {args.output}")
return 0
if __name__ == "__main__":
sys.exit(main())