139 lines
5.0 KiB
Python
139 lines
5.0 KiB
Python
|
|
#!/usr/bin/env python3
|
||
|
|
# audit_trail.py - Local logging of inputs, sources, and confidence.
|
||
|
|
# Implements SOUL.md "What Honesty Requires" - The Audit Trail.
|
||
|
|
# Logs are stored locally. Never sent anywhere. The user owns them.
|
||
|
|
# Part of #794
|
||
|
|
|
||
|
|
import json
|
||
|
|
import hashlib
|
||
|
|
import os
|
||
|
|
import time
|
||
|
|
from datetime import datetime, timezone
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Any, Dict, List, Optional
|
||
|
|
from dataclasses import dataclass, field, asdict
|
||
|
|
|
||
|
|
# Root directory for audit logs: $HERMES_HOME/audit-trail, defaulting to
# ~/.hermes/audit-trail. Logs stay local — nothing is ever transmitted.
AUDIT_DIR = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) / "audit-trail"
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass
class AuditEntry:
    """One immutable record of a single model interaction.

    Field order mirrors the JSONL schema that AuditTrail writes to disk.
    """

    id: str                 # short content hash; see generate_id()
    ts: str                 # ISO-8601 UTC timestamp
    input_text: str         # user input (truncated by the writer)
    sources: List[str]      # citations backing the answer
    confidence: float       # self-reported confidence score
    output_text: str        # model output (truncated by the writer)
    model: str              # model identifier
    provider: str = ""      # hosting provider, optional
    session_id: str = ""    # conversation/session grouping key
    source_types: List[str] = field(default_factory=list)  # kinds of sources cited

    @staticmethod
    def generate_id(input_text: str, output_text: str, ts: str) -> str:
        """Derive a stable 16-hex-char id from timestamp, input, and output."""
        payload = ":".join((ts, input_text, output_text))
        digest = hashlib.sha256(payload.encode())
        return digest.hexdigest()[:16]
|
||
|
|
|
||
|
|
|
||
|
|
class AuditTrail:
    """Append-only, local-only JSONL log of model interactions.

    Entries are written to ``<audit_dir>/trail.jsonl``, one JSON object per
    line. The log never leaves the machine; the user owns the files.
    """

    def __init__(self, audit_dir: Optional[Path] = None):
        """Ensure the audit directory exists and locate the log file.

        Args:
            audit_dir: override for the log directory; defaults to AUDIT_DIR.
        """
        self.audit_dir = audit_dir or AUDIT_DIR
        self.audit_dir.mkdir(parents=True, exist_ok=True)
        self._log_file = self.audit_dir / "trail.jsonl"

    def log_response(self, input_text, sources, confidence, output_text,
                     model="", provider="", session_id="", source_types=None):
        """Append one interaction to the trail and return its AuditEntry.

        Long fields are truncated to keep the log compact: input 1000 chars,
        output 2000 chars, each source 200 chars, at most 10 sources.
        """
        ts = datetime.now(timezone.utc).isoformat()
        entry = AuditEntry(
            id=AuditEntry.generate_id(input_text, output_text, ts),
            ts=ts,
            input_text=input_text[:1000],
            sources=[s[:200] for s in sources[:10]],
            confidence=round(confidence, 3),
            output_text=output_text[:2000],
            model=model, provider=provider, session_id=session_id,
            source_types=source_types or [],
        )
        # utf-8 is forced so the log is portable regardless of system locale.
        with open(self._log_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(asdict(entry)) + "\n")
        return entry

    def _iter_records(self):
        """Yield decoded JSON dicts from the log, skipping blank/corrupt lines.

        Shared by query/get_stats/get_by_session, which previously each
        duplicated this read loop.
        """
        if not self._log_file.exists():
            return
        with open(self._log_file, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    yield json.loads(line)
                except (json.JSONDecodeError, ValueError):
                    continue

    @staticmethod
    def _entry_from_dict(data: Dict[str, Any]) -> "AuditEntry":
        """Rebuild an AuditEntry from a decoded record, tolerating missing keys.

        Fix: the previous inline reconstruction defaulted *missing* keys to
        0.0 regardless of field type — ``data.get(k)`` is None for an absent
        key, which is neither str nor list, so the float fallback fired and
        produced e.g. ``provider=0.0`` for rows written before a field was
        added. Missing fields now fall back to a value of the correct type.
        """
        kwargs = {}
        for name in AuditEntry.__dataclass_fields__:
            if name == "confidence":
                fallback: Any = 0.0
            elif name in ("sources", "source_types"):
                fallback = []
            else:
                fallback = ""
            kwargs[name] = data.get(name, fallback)
        return AuditEntry(**kwargs)

    def query(self, search_text, limit=10, min_confidence=0.0):
        """Case-insensitive substring search over inputs, outputs, and sources.

        Returns up to *limit* AuditEntry objects with confidence >=
        *min_confidence*, in log (chronological) order.
        """
        results: List[AuditEntry] = []
        search_lower = search_text.lower()
        for data in self._iter_records():
            if data.get("confidence", 0) < min_confidence:
                continue
            searchable = (data.get("input_text", "") + " " +
                          data.get("output_text", "") + " " +
                          " ".join(data.get("sources", []))).lower()
            if search_lower in searchable:
                results.append(self._entry_from_dict(data))
                if len(results) >= limit:
                    break
        return results

    def get_stats(self):
        """Summarize the trail: entry count, mean confidence, source-type counts."""
        if not self._log_file.exists():
            return {"total": 0, "avg_confidence": 0, "sources_breakdown": {}}
        total = 0
        confidence_sum = 0.0
        source_types: Dict[str, int] = {}
        for data in self._iter_records():
            total += 1
            confidence_sum += data.get("confidence", 0)
            for st in data.get("source_types", []):
                source_types[st] = source_types.get(st, 0) + 1
        # max(total, 1) avoids division by zero when every line was corrupt.
        return {"total": total,
                "avg_confidence": round(confidence_sum / max(total, 1), 3),
                "sources_breakdown": source_types}

    def get_by_session(self, session_id, limit=50):
        """Return up to *limit* entries matching *session_id*, in log order."""
        results: List[AuditEntry] = []
        for data in self._iter_records():
            if data.get("session_id") == session_id:
                results.append(self._entry_from_dict(data))
                if len(results) >= limit:
                    break
        return results
|
||
|
|
|
||
|
|
|
||
|
|
# Lazily-created process-wide AuditTrail shared by the module-level helpers.
_default_trail = None


def get_trail():
    """Return the shared AuditTrail, creating it on first use."""
    global _default_trail
    trail = _default_trail
    if trail is None:
        trail = _default_trail = AuditTrail()
    return trail
|
||
|
|
|
||
|
|
def log_response(**kwargs):
    """Record one response on the process-wide default AuditTrail."""
    trail = get_trail()
    return trail.log_response(**kwargs)
|
||
|
|
|
||
|
|
def query(search_text, **kwargs):
    """Search the process-wide default AuditTrail; see AuditTrail.query."""
    trail = get_trail()
    return trail.query(search_text, **kwargs)
|