From 646588a5c89355e59133803be124e5a665a7eac4 Mon Sep 17 00:00:00 2001 From: Alexander Payne Date: Sat, 25 Apr 2026 20:56:13 -0400 Subject: [PATCH] =?UTF-8?q?[BURN=20#197]=20feat:=20provenance=20chain=20?= =?UTF-8?q?=E2=80=94=20add=20source=5Fsession,=20source=5Fmodel,=20source?= =?UTF-8?q?=5Fprovider,=20timestamp=20to=20facts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Adds schemas/provenance.json defining provenance object schema - Updates harvester.py write_knowledge() to attach provenance to every fact - Adds extract_provider() helper to infer provider from API base URL - Updates SCHEMA.md documenting provenance field and object - Provenance fields: source_session, source_model, source_provider, timestamp (harvested_at), extraction_method, confidence, verified - Does NOT retroactively modify existing facts in index.json Closes #197 --- knowledge/SCHEMA.md | 46 ++++++++++++++++++++++++++++++++++++ schemas/provenance.json | 52 +++++++++++++++++++++++++++++++++++++++++ scripts/harvester.py | 47 ++++++++++++++++++++++++++++++++----- 3 files changed, 139 insertions(+), 6 deletions(-) create mode 100644 schemas/provenance.json diff --git a/knowledge/SCHEMA.md b/knowledge/SCHEMA.md index 31b1640..f44eeb2 100644 --- a/knowledge/SCHEMA.md +++ b/knowledge/SCHEMA.md @@ -43,9 +43,26 @@ The harvester writes to both. The bootstrapper reads from index.json. Humans edi | `last_confirmed` | date | no | ISO-8601 date last seen in a session | | `expires` | date | no | Optional. After this date, fact is stale | | `related` | string[] | no | IDs of related facts | +| `provenance` | object | no | Provenance metadata — see Provenance Object section below | ### ID Format: `{domain}:{category}:{sequence}` + + +### Provenance Object + +Every fact may include a [`provenance`](#fact-object) field that tracks its origin. 
+ +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `source_session` | string | yes | Session ID / file path where this fact was extracted | +| `source_model` | string | yes | Model name used for extraction (e.g., `xiaomi/mimo-v2-pro`) | +| `source_provider` | string | yes | Provider name (`nous`, `openrouter`, `anthropic`, `openai`, etc.) | +| `timestamp` | date-time | yes | Extraction timestamp (ISO-8601 UTC) | +| `extraction_method` | enum | no | `llm_extraction`, `manual`, or `retroactive_harvest`; defaults to `llm_extraction` | +| `confidence` | float | no | Confidence at extraction time (0.0–1.0) | +| `verified` | boolean | no | `true` if fact has been manually reviewed; defaults to `false` | + ### Categories | Category | Definition | @@ -85,6 +102,35 @@ knowledge/ └── {agent-type}.yaml ``` + + +### Provenance Object (added via `write_knowledge()` and harvester) + +```json +{ + "source_session": "string — session ID or file path", + "source_model": "string — model used for extraction", + "source_provider": "string — provider name (nous, openrouter, etc.)", + "timestamp": "string — ISO-8601 UTC extraction time", + "extraction_method": "string — llm_extraction|manual|retroactive_harvest", + "confidence": "float — 0.0–1.0 confidence from extraction", + "verified": "boolean — whether fact has been manually verified" +} +``` + +The `provenance` field is attached to every fact harvested via `write_knowledge()`. It provides traceability: which session produced this fact, which model/provider extracted it, when, and with what confidence.
+ +| Provenance Field | Type | Required | Description | +|------------------|------|----------|-------------| +| `source_session` | string | yes | Session ID / file path where extracted | +| `source_model` | string | yes | Model name (e.g., `xiaomi/mimo-v2-pro`) | +| `source_provider` | string | yes | Provider (`nous`, `openrouter`, `anthropic`, `openai`) | +| `timestamp` | date-time | yes | Extraction timestamp (ISO-8601) | +| `extraction_method` | enum | no | `llm_extraction`, `manual`, or `retroactive_harvest`; defaults to `llm_extraction` | +| `confidence` | float | no | Confidence score (0.0–1.0) at extraction time | +| `verified` | boolean | no | `true` if manually reviewed; defaults to `false` | + + ## YAML File Format YAML files use frontmatter for metadata, then markdown sections with fact entries: diff --git a/schemas/provenance.json b/schemas/provenance.json new file mode 100644 index 0000000..e05b520 --- /dev/null +++ b/schemas/provenance.json @@ -0,0 +1,52 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Knowledge Provenance", + "description": "Provenance metadata attached to every knowledge fact", + "type": "object", + "required": [ + "source_session", + "source_model", + "source_provider", + "timestamp" + ], + "properties": { + "source_session": { + "type": "string", + "description": "Session ID or file path where this fact was extracted" + }, + "source_model": { + "type": "string", + "description": "Model used for extraction (e.g., 'xiaomi/mimo-v2-pro')" + }, + "source_provider": { + "type": "string", + "description": "Provider name (nous, openrouter, anthropic, etc.)" + }, + "timestamp": { + "type": "string", + "format": "date-time", + "description": "UTC ISO-8601 timestamp when this fact was extracted" + }, + "extraction_method": { + "type": "string", + "description": "How the fact was extracted (llm_extraction, manual, retroactive_harvest)", + "enum": [ + "llm_extraction", + "manual", + "retroactive_harvest" + ], + "default": "llm_extraction" + },
"confidence": { + "type": "number", + "minimum": 0, + "maximum": 1, + "description": "Confidence assigned during extraction (copied from top-level fact)" + }, + "verified": { + "type": "boolean", + "description": "Whether this fact has been manually verified", + "default": false + } + } +} \ No newline at end of file diff --git a/scripts/harvester.py b/scripts/harvester.py index 9b85b64..871302b 100644 --- a/scripts/harvester.py +++ b/scripts/harvester.py @@ -27,6 +27,22 @@ sys.path.insert(0, str(SCRIPT_DIR)) from session_reader import read_session, extract_conversation, truncate_for_context, messages_to_text +def extract_provider(api_base: str) -> str: + """Infer provider name from API base URL.""" + url = api_base.lower() + if 'nousresearch' in url or 'nous' in url: + return 'nous' + if 'openrouter' in url: + return 'openrouter' + if 'anthropic' in url: + return 'anthropic' + if 'openai' in url: + return 'openai' + # Fallback: try to extract hostname + from urllib.parse import urlparse + host = urlparse(api_base).netloc + return host.split('.')[0] if host else 'unknown' + # --- Configuration --- DEFAULT_API_BASE = os.environ.get("HARVESTER_API_BASE", "https://api.nousresearch.com/v1") @@ -229,15 +245,34 @@ def validate_fact(fact: dict) -> bool: return True -def write_knowledge(index: dict, new_facts: list[dict], knowledge_dir: str, source_session: str = ""): - """Write new facts to the knowledge store.""" +def write_knowledge(index: dict, new_facts: list[dict], knowledge_dir: str, source_session: str = "", model: str = "", provider: str = ""): + """Write new facts to the knowledge store. + + Adds provenance metadata to each fact. If model/provider are empty, tries to + infer from environment or defaults. 
+ """ kdir = Path(knowledge_dir) kdir.mkdir(parents=True, exist_ok=True) - # Add source tracking to each fact + # Determine model/provider defaults if not provided + model = model or os.environ.get("HARVESTER_MODEL", "xiaomi/mimo-v2-pro") + provider = provider or os.environ.get("HARVESTER_PROVIDER", "nous") + + timestamp = datetime.now(timezone.utc).isoformat() + + # Add provenance to each fact for fact in new_facts: - fact['source_session'] = source_session - fact['harvested_at'] = datetime.now(timezone.utc).isoformat() + provenance = { + 'source_session': source_session, + 'source_model': model, + 'source_provider': provider, + 'timestamp': timestamp, + 'extraction_method': 'llm_extraction', + 'confidence': fact.get('confidence', 0.5), + 'verified': False + } + fact['provenance'] = provenance + fact['harvested_at'] = timestamp # Update index index['facts'].extend(new_facts) @@ -330,7 +365,7 @@ def harvest_session(session_path: str, knowledge_dir: str, api_base: str, api_ke # 8. Write (unless dry run) if new_facts and not dry_run: - write_knowledge(existing_index, new_facts, knowledge_dir, source_session=session_path) + write_knowledge(existing_index, new_facts, knowledge_dir, source_session=session_path, model=model, provider=extract_provider(api_base)) stats['elapsed_seconds'] = round(time.time() - start_time, 2) return stats -- 2.43.0