timmy-config/pipeline/quality_gate.py

#!/usr/bin/env python3
"""
quality_gate.py — Quality Gate for Pipeline Outputs

Validates all pipeline outputs before saving. Rejects bad outputs,
tracks quality scores, and supports re-queue for regeneration.

Usage:
    python3 quality_gate.py --input output.jsonl --type training_pairs
    python3 quality_gate.py --input output.jsonl --type knowledge
    python3 quality_gate.py --input output.jsonl --type scene_descriptions
    python3 quality_gate.py --dir pipeline/output/ --type training_pairs
    python3 quality_gate.py --status  # show quality stats

Exit codes:
    0 = all outputs passed
    1 = some outputs rejected
    2 = file/parse error
"""

import json
import os
import sys
import hashlib
import math
import re
import struct
from pathlib import Path
from datetime import datetime, timezone, timedelta
from dataclasses import dataclass, field, asdict
from typing import List, Optional, Dict, Any, Set

PIPELINE_DIR = Path.home() / ".hermes" / "pipeline"
STATS_FILE = PIPELINE_DIR / "quality_stats.json"
HASH_DIR = PIPELINE_DIR / "quality_hashes"
HASH_RETENTION_DAYS = 7  # Keep hashes for 7 days


# ============================================================
# Bloom Filter — Memory-efficient dedup at scale
# ============================================================

class BloomFilter:
    """Probabilistic set for membership testing. False positives possible, no false negatives."""

    def __init__(self, capacity: int = 100_000, error_rate: float = 0.01):
        self.capacity = capacity
        self.error_rate = error_rate
        # Optimal size and hash count
        self.size = max(64, int(-capacity * math.log(error_rate) / (math.log(2) ** 2)))
        self.num_hashes = max(1, int(self.size / capacity * math.log(2)))
        self._bitarray = bytearray((self.size + 7) // 8)

    def _hash_indices(self, item: str) -> List[int]:
        """Generate bit indices using double hashing."""
        h1 = int.from_bytes(hashlib.sha256(item.encode()).digest()[:8], "little")
        h2 = int.from_bytes(hashlib.md5(item.encode()).digest()[:8], "little")
        return [(h1 + i * h2) % self.size for i in range(self.num_hashes)]

    def add(self, item: str):
        for idx in self._hash_indices(item):
            self._bitarray[idx // 8] |= 1 << (idx % 8)

    def __contains__(self, item: str) -> bool:
        return all(self._bitarray[idx // 8] & (1 << (idx % 8)) for idx in self._hash_indices(item))

    def to_dict(self) -> dict:
        return {
            "capacity": self.capacity,
            "error_rate": self.error_rate,
            "size": self.size,
            "num_hashes": self.num_hashes,
            "data": base64.b64encode(bytes(self._bitarray)).decode(),
        }

    @classmethod
    def from_dict(cls, d: dict) -> "BloomFilter":
        bf = cls(capacity=d["capacity"], error_rate=d["error_rate"])
        bf._bitarray = bytearray(base64.b64decode(d["data"]))
        return bf


# ============================================================
# Hash Dedup Store — Rotating daily files + bloom filter
# ============================================================

class HashDedupStore:
    """Rotating hash store for cross-run deduplication.

    Strategy:
      - Daily JSON files: HASH_DIR/YYYY-MM-DD.json (set of 16-char hashes)
      - Bloom filter: HASH_DIR/bloom.json (memory-efficient for large scale)
      - On load: merge last N days into bloom filter
      - Rotation: delete files older than HASH_RETENTION_DAYS
    """

    def __init__(self, retention_days: int = HASH_RETENTION_DAYS):
        self.retention_days = retention_days
        HASH_DIR.mkdir(parents=True, exist_ok=True)
        self._today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
        self._daily_hashes: Set[str] = set()
        self._bloom: Optional[BloomFilter] = None
        self._load()

    def _day_file(self, day: str) -> Path:
        return HASH_DIR / f"{day}.json"

    def _bloom_file(self) -> Path:
        return HASH_DIR / "bloom.json"

    def _load(self):
        """Load today's hashes and bloom filter."""
        # Load today's file
        day_path = self._day_file(self._today)
        if day_path.exists():
            try:
                self._daily_hashes = set(json.loads(day_path.read_text()))
            except (json.JSONDecodeError, IOError):
                self._daily_hashes = set()

        # Load or rebuild bloom filter
        bloom_path = self._bloom_file()
        if bloom_path.exists():
            try:
                self._bloom = BloomFilter.from_dict(json.loads(bloom_path.read_text()))
            except (json.JSONDecodeError, IOError, KeyError):
                self._bloom = None

        if self._bloom is None:
            self._rebuild_bloom()

    def _rebuild_bloom(self):
        """Rebuild bloom filter from all recent daily files."""
        hashes = set()
        for day_offset in range(self.retention_days):
            day = (datetime.now(timezone.utc) - timedelta(days=day_offset)).strftime("%Y-%m-%d")
            day_path = self._day_file(day)
            if day_path.exists():
                try:
                    hashes.update(json.loads(day_path.read_text()))
                except (json.JSONDecodeError, IOError):
                    pass

        capacity = max(len(hashes) * 2, 10_000)
        self._bloom = BloomFilter(capacity=capacity)
        for h in hashes:
            self._bloom.add(h)

    def _save(self):
        """Persist today's hashes and bloom filter."""
        day_path = self._day_file(self._today)
        day_path.write_text(json.dumps(sorted(self._daily_hashes)))

        if self._bloom:
            self._bloom_file().write_text(json.dumps(self._bloom.to_dict()))

    def _rotate(self):
        """Delete daily hash files older than retention period."""
        cutoff = (datetime.now(timezone.utc) - timedelta(days=self.retention_days)).strftime("%Y-%m-%d")
        for path in HASH_DIR.glob("*.json"):
            name = path.stem
            if len(name) == 10 and name < cutoff and name != "bloom":
                path.unlink()

    def is_duplicate(self, h: str) -> bool:
        """Check if hash has been seen in current day or bloom filter."""
        if h in self._daily_hashes:
            return True
        if self._bloom and h in self._bloom:
            return True
        return False

    def add(self, h: str):
        """Add a hash. Saves and rotates periodically."""
        self._daily_hashes.add(h)
        if self._bloom:
            self._bloom.add(h)
        # Save every 100 additions or on explicit call
        if len(self._daily_hashes) % 100 == 0:
            self._save()
            self._rotate()

    def flush(self):
        """Force save and rotate."""
        self._save()
        self._rotate()

    def stats(self) -> dict:
        """Return dedup store statistics."""
        file_count = len(list(HASH_DIR.glob("*.json")))
        total_hashes = 0
        for path in HASH_DIR.glob("????-??-??.json"):
            try:
                total_hashes += len(json.loads(path.read_text()))
            except Exception:
                pass
        return {
            "today_count": len(self._daily_hashes),
            "total_files": file_count,
            "total_hashes": total_hashes,
            "retention_days": self.retention_days,
            "bloom_size": self._bloom.size if self._bloom else 0,
        }


# --- Quality Check Types ---

@dataclass
class QualityResult:
    """Result of a quality check on a single entry."""
    passed: bool
    checks_run: int
    checks_failed: int
    score: float  # 0.0-1.0
    reasons: List[str] = field(default_factory=list)
    entry_index: int = -1
    hash: str = ""

    def to_dict(self):
        return asdict(self)


@dataclass
class GateReport:
    """Report from a quality gate run."""
    file: str
    type: str
    total: int
    passed: int
    rejected: int
    score: float
    rejected_indices: List[int] = field(default_factory=list)
    timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())

    def to_dict(self):
        return asdict(self)


# ============================================================
# Check Functions
# ============================================================

def entry_hash(entry: dict) -> str:
    """Hash an entry for deduplication."""
    return hashlib.sha256(json.dumps(entry, sort_keys=True, ensure_ascii=False).encode()).hexdigest()[:16]


def check_not_empty(entry: dict, fields: List[str]) -> List[str]:
    """Check that required fields are non-empty."""
    errors = []
    for f in fields:
        val = entry.get(f)
        if val is None:
            errors.append(f"missing_field: {f}")
        elif isinstance(val, str) and len(val.strip()) == 0:
            errors.append(f"empty_field: {f}")
        elif isinstance(val, list) and len(val) == 0:
            errors.append(f"empty_list: {f}")
    return errors


def check_string_min_length(entry: dict, field_lengths: Dict[str, int]) -> List[str]:
    """Check that string fields meet minimum lengths."""
    errors = []
    for f, min_len in field_lengths.items():
        val = entry.get(f)
        if isinstance(val, str) and len(val) < min_len:
            errors.append(f"short_field: {f} ({len(val)} < {min_len})")
    return errors


def check_no_duplicates(entries: List[dict], key_fields: List[str]) -> Dict[int, List[str]]:
    """Check for duplicate entries based on key fields."""
    seen = {}
    errors = {}
    for i, entry in enumerate(entries):
        key = tuple(entry.get(f, "") for f in key_fields)
        key_str = str(key)
        if key_str in seen:
            errors[i] = [f"duplicate_of_index: {seen[key_str]}"]
        else:
            seen[key_str] = i
    return errors


def check_training_pair(entry: dict) -> List[str]:
    """Validate a training pair (prompt/response)."""
    errors = []
    errors.extend(check_not_empty(entry, ["prompt", "response"]))

    # Check response isn't just echoing the prompt
    prompt = entry.get("prompt", "")
    response = entry.get("response", "")
    if prompt and response and prompt.strip() == response.strip():
        errors.append("response_equals_prompt")

    # Check response has substance
    if isinstance(response, str) and len(response) < 10:
        errors.append(f"response_too_short: {len(response)} chars")

    return errors


def check_scene_description(entry: dict) -> List[str]:
    """Validate a scene description entry."""
    errors = []
    errors.extend(check_not_empty(entry, ["song", "beat", "lyric_line", "scene"]))

    scene = entry.get("scene")
    if isinstance(scene, dict):
        errors.extend(check_not_empty(scene, ["mood", "colors", "composition", "camera", "description"]))
        errors.extend(check_string_min_length(scene, {"description": 10}))

        colors = scene.get("colors", [])
        if isinstance(colors, list) and len(colors) > 5:
            errors.append(f"too_many_colors: {len(colors)} > 5")

    return errors


def check_knowledge_entry(entry: dict) -> List[str]:
    """Validate a knowledge file entry."""
    errors = []
    errors.extend(check_not_empty(entry, ["title", "content"]))

    # Check for placeholder content
    content = entry.get("content", "")
    if isinstance(content, str):
        placeholders = ["TODO", "FIXME", "PLACEHOLDER", "[INSERT", "lorem ipsum"]
        for p in placeholders:
            if p.lower() in content.lower():
                errors.append(f"placeholder_content: '{p}' found")

    errors.extend(check_string_min_length(entry, {"content": 50}))

    return errors


def check_prompt_enhancement(entry: dict) -> List[str]:
    """Validate a prompt enhancement pair (terse/rich)."""
    errors = []
    errors.extend(check_not_empty(entry, ["terse", "rich"]))

    terse = entry.get("terse", "")
    rich = entry.get("rich", "")

    # Rich should be longer than terse
    if isinstance(terse, str) and isinstance(rich, str) and len(rich) <= len(terse):
        errors.append("rich_not_longer_than_terse")

    errors.extend(check_string_min_length(entry, {"rich": 20}))

    return errors


def check_adversary_entry(entry: dict) -> List[str]:
    """Validate an adversary corpus entry."""
    errors = []
    errors.extend(check_not_empty(entry, ["id", "family", "prompt"]))

    # Check prompt isn't empty or placeholder
    prompt = entry.get("prompt", "")
    if isinstance(prompt, str) and len(prompt) < 10:
        errors.append(f"prompt_too_short: {len(prompt)} chars")

    return errors


def check_soul_compliance(text: str) -> List[str]:
    """Check text for SOUL.md compliance (basic checks)."""
    errors = []
    text_lower = text.lower() if text else ""

    # Check for content that violates core values
    violations = [
        ("compute.*value.*human.*life", "soul_violation: computing human life value"),
        ("sexualiz.*child", "soul_violation: sexualizing children"),
        ("create.*weapon.*kill", "soul_violation: weapon creation"),
        ("enslav", "soul_violation: enslavement content"),
    ]
    for pattern, msg in violations:
        if re.search(pattern, text_lower):
            errors.append(msg)

    return errors


# ============================================================
# Gate Runners
# ============================================================

CHECK_MAP = {
    "training_pairs": check_training_pair,
    "training_pair": check_training_pair,
    "scene_descriptions": check_scene_description,
    "scene_description": check_scene_description,
    "knowledge": check_knowledge_entry,
    "prompt_enhancement": check_prompt_enhancement,
    "adversary": check_adversary_entry,
    "adversary_corpus": check_adversary_entry,
}


def run_gate(input_path: str, entry_type: str, dedup_store: Optional[HashDedupStore] = None) -> GateReport:
    """Run quality gate on a JSONL file.

    Args:
        input_path: Path to JSONL file
        entry_type: Type of entries (training_pairs, scene_descriptions, etc.)
        dedup_store: Optional hash dedup store for cross-run dedup. If None, creates one.
    """
    path = Path(input_path)
    if not path.exists():
        return GateReport(file=str(path), type=entry_type, total=0, passed=0, rejected=0, score=0.0)

    check_fn = CHECK_MAP.get(entry_type)
    if not check_fn:
        return GateReport(file=str(path), type=entry_type, total=0, passed=0, rejected=0, score=0.0,
                          rejected_indices=[-1])  # unknown type

    if dedup_store is None:
        dedup_store = HashDedupStore()

    entries = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                entries.append(json.loads(line))

    # Within-file deduplication check
    key_fields = _get_key_fields(entry_type)
    dup_errors = check_no_duplicates(entries, key_fields)

    passed = 0
    rejected = 0
    rejected_indices = []
    total_score = 0.0
    cross_run_dupes = 0

    for i, entry in enumerate(entries):
        errors = check_fn(entry)

        # Add within-file duplicate errors
        if i in dup_errors:
            errors.extend(dup_errors[i])

        # Cross-run hash dedup
        h = entry_hash(entry)
        if dedup_store.is_duplicate(h):
            errors.append(f"cross_run_duplicate: hash {h} seen in prior run")
            cross_run_dupes += 1
        else:
            dedup_store.add(h)

        # Add SOUL compliance check for text content
        text_content = ""
        for f in ["response", "rich", "description", "content", "lyric_line"]:
            val = entry.get(f)
            if isinstance(val, str):
                text_content += val + " "
        if isinstance(entry.get("scene"), dict):
            text_content += entry["scene"].get("description", "")

        soul_errors = check_soul_compliance(text_content)
        errors.extend(soul_errors)

        if errors:
            rejected += 1
            rejected_indices.append(i)
        else:
            passed += 1

        # Score: 1.0 if no errors, decreasing with each error
        entry_score = max(0.0, 1.0 - (len(errors) * 0.2))
        total_score += entry_score

    avg_score = total_score / len(entries) if entries else 0.0

    # Flush dedup store
    dedup_store.flush()

    report = GateReport(
        file=str(path),
        type=entry_type,
        total=len(entries),
        passed=passed,
        rejected=rejected,
        score=round(avg_score, 3),
        rejected_indices=rejected_indices[:50],  # limit for readability
    )

    # Save stats
    _save_stats(report)

    if cross_run_dupes > 0:
        logger_msg = f"  cross-run dedup: {cross_run_dupes} duplicates found"
        print(logger_msg, file=sys.stderr)

    return report


def _get_key_fields(entry_type: str) -> List[str]:
    """Get key fields for deduplication based on entry type."""
    key_map = {
        "training_pairs": ["prompt", "response"],
        "training_pair": ["prompt", "response"],
        "scene_descriptions": ["song", "beat"],
        "scene_description": ["song", "beat"],
        "knowledge": ["title"],
        "prompt_enhancement": ["terse", "rich"],
        "adversary": ["id", "prompt"],
        "adversary_corpus": ["id", "prompt"],
    }
    return key_map.get(entry_type, ["id"])


def _save_stats(report: GateReport):
    """Append quality stats to the stats file. Rotates to keep last 1000."""
    STATS_FILE.parent.mkdir(parents=True, exist_ok=True)

    stats = []
    if STATS_FILE.exists():
        try:
            with open(STATS_FILE) as f:
                stats = json.load(f)
        except (json.JSONDecodeError, IOError):
            stats = []

    stats.append(report.to_dict())

    # Rotate: keep last 1000 entries
    if len(stats) > 1000:
        stats = stats[-1000:]

    with open(STATS_FILE, "w") as f:
        json.dump(stats, f, indent=2)


def show_status():
    """Show quality gate statistics."""
    if not STATS_FILE.exists():
        print("No quality stats found.")
        return

    with open(STATS_FILE) as f:
        stats = json.load(f)

    print(f"\nQuality Gate Stats — {len(stats)} runs")
    print()

    # Group by type
    by_type = {}
    for s in stats:
        t = s.get("type", "unknown")
        if t not in by_type:
            by_type[t] = []
        by_type[t].append(s)

    for t, runs in sorted(by_type.items()):
        total_entries = sum(r.get("total", 0) for r in runs)
        total_passed = sum(r.get("passed", 0) for r in runs)
        total_rejected = sum(r.get("rejected", 0) for r in runs)
        avg_score = sum(r.get("score", 0) for r in runs) / len(runs) if runs else 0
        print(f"  {t:25} {len(runs):4} runs | {total_entries:6} entries | {total_rejected:4} rejected | avg score: {avg_score:.3f}")


def main():
    import argparse
    parser = argparse.ArgumentParser(description="Quality Gate for Pipeline Outputs")
    parser.add_argument("--input", default=None, help="Input JSONL file")
    parser.add_argument("--type", default=None, help="Entry type (training_pairs, scene_descriptions, knowledge, etc.)")
    parser.add_argument("--dir", default=None, help="Process all JSONL files in directory")
    parser.add_argument("--status", action="store_true", help="Show quality stats")
    args = parser.parse_args()

    if args.status:
        show_status()
        return

    if args.dir:
        for f in sorted(Path(args.dir).glob("*.jsonl")):
            t = args.type or _infer_type(f.name)
            report = run_gate(str(f), t)
            _print_report(report)
    elif args.input:
        t = args.type or _infer_type(args.input)
        report = run_gate(args.input, t)
        _print_report(report)
        sys.exit(0 if report.rejected == 0 else 1)
    else:
        parser.print_help()


def _infer_type(filename: str) -> str:
    """Infer entry type from filename."""
    name = filename.lower()
    if "scene" in name:
        return "scene_descriptions"
    if "training" in name or "pair" in name:
        return "training_pairs"
    if "knowledge" in name:
        return "knowledge"
    if "adversary" in name or "attack" in name:
        return "adversary"
    if "prompt" in name or "enhance" in name:
        return "prompt_enhancement"
    return "training_pairs"  # default


def _print_report(report: GateReport):
    """Print a human-readable gate report."""
    status = "PASS" if report.rejected == 0 else f"FAIL ({report.rejected} rejected)"
    print(f"  {report.file}: {status} | {report.passed}/{report.total} passed | score: {report.score:.3f}")


if __name__ == "__main__":
    main()