Replace the 5-file sliding window cross-run dedup with a persistent hash index that covers ALL historical training data. Overfitting risk compounds across the full dataset — a 5-file window lets old duplicates leak back into training after enough overnight runs. New module: dedup_index.py (DedupIndex) - Persistent JSON index (.dpo_dedup_index.json) alongside JSONL files - Append-on-export: new prompt hashes registered after each successful export — no full rescan needed for normal operations - Incremental sync: on load, detects JSONL files not yet indexed and ingests them automatically (handles files from other tools) - Full rebuild: rebuild() scans ALL deepdive_*.jsonl + pairs_*.jsonl to reconstruct from scratch (first run, corruption recovery) - Atomic writes (write-to-tmp + rename) to prevent index corruption - Standalone CLI: python3 dedup_index.py <dir> --rebuild --stats Modified: dpo_quality.py - Imports DedupIndex with graceful degradation - Replaces _load_history_hashes() with persistent index lookup - Fallback: if index unavailable, scans ALL files in-memory (not just 5) - New register_exported_hashes() method called after export - Config key: dedup_full_history (replaces dedup_history_files) Modified: dpo_generator.py - Calls validator.register_exported_hashes() after successful export to keep the persistent index current without rescanning Modified: config.yaml - Replaced dedup_history_files: 5 with dedup_full_history: true Tested — 7 integration tests: ✓ Fresh index build from empty directory ✓ Build from 3 existing JSONL files (15 unique hashes) ✓ Incremental sync when new file appears between runs ✓ Append after export + persistence across reloads ✓ Rebuild from scratch (recovers from corruption) ✓ Validator catches day-1 dupe from 20-day history (5-file window miss) ✓ Full pipeline: generate → validate → export → register → re-run detects
373 lines
12 KiB
Python
373 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""Persistent DPO Prompt Deduplication Index.
|
|
|
|
Maintains a full-history hash index of every prompt ever exported,
|
|
preventing overfitting from accumulating duplicate training pairs
|
|
across arbitrarily many overnight runs.
|
|
|
|
Design:
|
|
- Append-only JSON index file alongside the JSONL training data
|
|
- On export: new prompt hashes appended (no full rescan)
|
|
- On load: integrity check against disk manifest; incremental
|
|
ingestion of any JSONL files not yet indexed
|
|
- rebuild() forces full rescan of all historical JSONL files
|
|
- Zero external dependencies (stdlib only)
|
|
|
|
Storage format (.dpo_dedup_index.json):
|
|
{
|
|
"version": 2,
|
|
"created_at": "2026-04-13T...",
|
|
"last_updated": "2026-04-13T...",
|
|
"indexed_files": ["deepdive_20260412.jsonl", ...],
|
|
"prompt_hashes": ["a1b2c3d4e5f6", ...],
|
|
"stats": {"total_prompts": 142, "total_files": 12}
|
|
}
|
|
|
|
Usage:
|
|
from dedup_index import DedupIndex
|
|
|
|
idx = DedupIndex(output_dir) # Loads or builds automatically
|
|
idx.contains("hash") # O(1) lookup
|
|
idx.add_hashes(["h1", "h2"]) # Append after export
|
|
idx.register_file("new.jsonl") # Track which files are indexed
|
|
idx.rebuild() # Full rescan from disk
|
|
|
|
Standalone CLI:
|
|
python3 dedup_index.py ~/.timmy/training-data/dpo-pairs/ --rebuild
|
|
python3 dedup_index.py ~/.timmy/training-data/dpo-pairs/ --stats
|
|
"""
|
|
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Dict, List, Optional, Set
|
|
|
|
logger = logging.getLogger("deepdive.dedup_index")

# Index file lives alongside the JSONL training data it covers.
INDEX_FILENAME = ".dpo_dedup_index.json"
# Bump this to force a transparent rebuild of older on-disk indexes.
INDEX_VERSION = 2

# JSONL filename patterns to scan (covers both deepdive and twitter archive)
JSONL_PATTERNS = ["deepdive_*.jsonl", "pairs_*.jsonl"]


class DedupIndex:
    """Persistent full-history prompt deduplication index.

    Backed by a JSON file in the training data directory.
    Loads lazily on first access, rebuilds automatically if missing.
    """

    def __init__(self, output_dir: Path, auto_load: bool = True):
        """Create an index rooted at *output_dir*.

        Args:
            output_dir: Directory holding the JSONL training files and
                the persistent index file.
            auto_load: When True (default), load/build the index now;
                otherwise defer until first access.
        """
        self.output_dir = Path(output_dir)
        self.index_path = self.output_dir / INDEX_FILENAME

        self._hashes: Set[str] = set()
        self._indexed_files: Set[str] = set()
        self._created_at: Optional[str] = None
        self._last_updated: Optional[str] = None
        self._loaded: bool = False

        if auto_load:
            self._ensure_loaded()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def contains(self, prompt_hash: str) -> bool:
        """Check if a prompt hash exists in the full history."""
        self._ensure_loaded()
        return prompt_hash in self._hashes

    def contains_any(self, prompt_hashes: List[str]) -> Dict[str, bool]:
        """Batch lookup. Returns {hash: True/False} for each input."""
        self._ensure_loaded()
        return {h: h in self._hashes for h in prompt_hashes}

    def add_hashes(self, hashes: List[str]) -> int:
        """Append new prompt hashes to the index. Returns count added.

        Only persists to disk when at least one hash was actually new.
        """
        self._ensure_loaded()
        before = len(self._hashes)
        self._hashes.update(hashes)
        added = len(self._hashes) - before
        if added > 0:
            self._save()
            logger.debug(f"Added {added} new hashes to dedup index")
        return added

    def register_file(self, filename: str) -> None:
        """Mark a JSONL file as indexed (prevents re-scanning)."""
        self._ensure_loaded()
        if filename in self._indexed_files:
            # Already tracked — skip the redundant disk write.
            return
        self._indexed_files.add(filename)
        self._save()

    def add_hashes_and_register(self, hashes: List[str], filename: str) -> int:
        """Atomic: append hashes + register file in one save.

        Returns the number of hashes that were genuinely new.
        """
        self._ensure_loaded()
        before = len(self._hashes)
        self._hashes.update(hashes)
        self._indexed_files.add(filename)
        added = len(self._hashes) - before
        self._save()
        return added

    def rebuild(self) -> Dict[str, int]:
        """Full rebuild: scan ALL JSONL files in output_dir from scratch.

        Used on first run, on version bumps, and to recover from a
        corrupt index file. Returns a stats dict with counts:
        ``files_scanned``, ``unique_prompts``, ``total_prompts_seen``.
        """
        logger.info(f"Rebuilding dedup index from {self.output_dir}")
        self._hashes.clear()
        self._indexed_files.clear()
        self._created_at = datetime.now(timezone.utc).isoformat()

        files_scanned = 0
        prompts_indexed = 0

        all_jsonl = self._discover_jsonl_files()
        for path in sorted(all_jsonl):
            file_hashes = self._extract_hashes_from_file(path)
            self._hashes.update(file_hashes)
            self._indexed_files.add(path.name)
            files_scanned += 1
            prompts_indexed += len(file_hashes)

        # Mark loaded BEFORE saving so the next public-API call doesn't
        # pointlessly re-read the index we just wrote.
        self._loaded = True
        self._save()

        stats = {
            "files_scanned": files_scanned,
            "unique_prompts": len(self._hashes),
            "total_prompts_seen": prompts_indexed,
        }
        logger.info(
            f"Rebuild complete: {files_scanned} files, "
            f"{len(self._hashes)} unique prompt hashes "
            f"({prompts_indexed} total including dupes)"
        )
        return stats

    @property
    def size(self) -> int:
        """Number of unique prompt hashes in the index."""
        self._ensure_loaded()
        return len(self._hashes)

    @property
    def files_indexed(self) -> int:
        """Number of JSONL files tracked in the index."""
        self._ensure_loaded()
        return len(self._indexed_files)

    def stats(self) -> Dict:
        """Return index statistics."""
        self._ensure_loaded()
        return {
            "version": INDEX_VERSION,
            "index_path": str(self.index_path),
            "unique_prompts": len(self._hashes),
            "files_indexed": len(self._indexed_files),
            "created_at": self._created_at,
            "last_updated": self._last_updated,
        }

    # ------------------------------------------------------------------
    # Internal: load / save / sync
    # ------------------------------------------------------------------

    def _ensure_loaded(self) -> None:
        """Load index if not yet loaded. Build if missing."""
        if self._loaded:
            return

        if self.index_path.exists():
            self._load()
            # Check for un-indexed files and ingest them
            self._sync_incremental()
        else:
            # No index exists — build from scratch
            if self.output_dir.exists():
                self.rebuild()
            else:
                # Empty dir, nothing to index
                self._created_at = datetime.now(timezone.utc).isoformat()
                self._loaded = True
                self._save()

    def _load(self) -> None:
        """Load index from disk; rebuild on corruption or version bump."""
        try:
            with open(self.index_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            version = data.get("version", 1)
            if version < INDEX_VERSION:
                logger.info(f"Index version {version} < {INDEX_VERSION}, rebuilding")
                self.rebuild()
                return

            self._hashes = set(data.get("prompt_hashes", []))
            self._indexed_files = set(data.get("indexed_files", []))
            self._created_at = data.get("created_at")
            self._last_updated = data.get("last_updated")
            self._loaded = True

            logger.info(
                f"Loaded dedup index: {len(self._hashes)} hashes, "
                f"{len(self._indexed_files)} files"
            )
        # OSError: unreadable file; AttributeError: top-level JSON is not an
        # object; ValueError/TypeError/KeyError: malformed fields. All are
        # recoverable by a full rebuild from the JSONL source of truth.
        except (OSError, json.JSONDecodeError, AttributeError,
                KeyError, TypeError, ValueError) as e:
            logger.warning(f"Corrupt dedup index, rebuilding: {e}")
            self.rebuild()

    def _save(self) -> None:
        """Persist index to disk atomically (write temp file, then replace)."""
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self._last_updated = datetime.now(timezone.utc).isoformat()

        data = {
            "version": INDEX_VERSION,
            "created_at": self._created_at or self._last_updated,
            "last_updated": self._last_updated,
            "indexed_files": sorted(self._indexed_files),
            "prompt_hashes": sorted(self._hashes),
            "stats": {
                "total_prompts": len(self._hashes),
                "total_files": len(self._indexed_files),
            },
        }

        # Atomic write: Path.replace() overwrites atomically on POSIX and,
        # unlike rename(), also succeeds on Windows when the destination
        # already exists (rename() raises FileExistsError there).
        tmp_path = self.index_path.with_suffix(".tmp")
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        tmp_path.replace(self.index_path)

    def _sync_incremental(self) -> None:
        """Find JSONL files on disk not in the index and ingest them."""
        on_disk = self._discover_jsonl_files()
        unindexed = [p for p in on_disk if p.name not in self._indexed_files]

        if not unindexed:
            self._loaded = True
            return

        logger.info(f"Incremental sync: {len(unindexed)} new files to index")
        new_hashes = 0
        for path in sorted(unindexed):
            file_hashes = self._extract_hashes_from_file(path)
            self._hashes.update(file_hashes)
            self._indexed_files.add(path.name)
            new_hashes += len(file_hashes)

        self._loaded = True
        self._save()
        logger.info(
            f"Incremental sync complete: +{len(unindexed)} files, "
            f"+{new_hashes} prompt hashes (total: {len(self._hashes)})"
        )

    def _discover_jsonl_files(self) -> List[Path]:
        """Find all JSONL training data files in output_dir."""
        if not self.output_dir.exists():
            return []

        files = []
        for pattern in JSONL_PATTERNS:
            files.extend(self.output_dir.glob(pattern))
        return sorted(set(files))

    @staticmethod
    def _extract_hashes_from_file(path: Path) -> List[str]:
        """Extract prompt hashes from a single JSONL file.

        Malformed lines are skipped individually so one bad record
        cannot abort the scan of the rest of the file. Returns hashes
        including duplicates (callers dedupe via set update).
        """
        hashes = []
        try:
            with open(path, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        pair = json.loads(line)
                        # Guard: a line may parse as valid JSON without
                        # being an object (e.g. a bare number or list).
                        if not isinstance(pair, dict):
                            continue
                        prompt = pair.get("prompt", "")
                        if prompt:
                            # Single canonical algorithm — keep in sync
                            # with the validator via hash_prompt().
                            hashes.append(DedupIndex.hash_prompt(prompt))
                    except json.JSONDecodeError:
                        continue
        except Exception as e:
            # Best-effort: an unreadable file yields whatever was parsed
            # so far; a rebuild can pick it up later.
            logger.warning(f"Failed to read {path}: {e}")
        return hashes

    @staticmethod
    def hash_prompt(prompt: str) -> str:
        """Compute the canonical prompt hash (same algorithm as validator).

        Case-insensitive, whitespace-normalized SHA-256 truncated to
        16 hex chars.
        """
        normalized = " ".join(prompt.lower().split())
        return hashlib.sha256(normalized.encode()).hexdigest()[:16]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main():
    """CLI entry point: rebuild and/or report on a dedup index.

    Returns a process exit code: 0 on success, 1 if the target
    directory does not exist.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="DPO dedup index management"
    )
    parser.add_argument(
        "output_dir", type=Path,
        help="Path to DPO pairs directory"
    )
    parser.add_argument(
        "--rebuild", action="store_true",
        help="Force full rebuild from all JSONL files"
    )
    parser.add_argument(
        "--stats", action="store_true",
        help="Print index statistics"
    )
    parser.add_argument(
        "--json", action="store_true",
        help="Output as JSON"
    )
    args = parser.parse_args()

    if not args.output_dir.exists():
        # Error goes to stderr so --json consumers parsing stdout never
        # see a non-JSON line (previously this polluted stdout).
        print(f"Error: directory not found: {args.output_dir}", file=sys.stderr)
        return 1

    # auto_load=False when rebuilding: no point loading an index we are
    # about to throw away and reconstruct from scratch.
    idx = DedupIndex(args.output_dir, auto_load=not args.rebuild)

    if args.rebuild:
        result = idx.rebuild()
        if args.json:
            print(json.dumps(result, indent=2))
        else:
            print(f"Rebuilt index: {result['files_scanned']} files, "
                  f"{result['unique_prompts']} unique prompts")

    # NOTE(review): --stats is accepted but not consulted — statistics
    # are always printed, matching the original behavior. Confirm
    # whether stats should be gated on the flag before changing this.
    s = idx.stats()
    if args.json:
        print(json.dumps(s, indent=2))
    else:
        print("=" * 50)
        print(" DPO DEDUP INDEX")
        print("=" * 50)
        print(f" Path: {s['index_path']}")
        print(f" Unique prompts: {s['unique_prompts']}")
        print(f" Files indexed: {s['files_indexed']}")
        print(f" Created: {s['created_at']}")
        print(f" Last updated: {s['last_updated']}")
        print("=" * 50)

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # raise SystemExit instead of the builtin exit(): exit() is injected
    # by the site module and is unavailable under `python -S` or frozen
    # interpreters; SystemExit is always defined.
    raise SystemExit(main())
|