#!/usr/bin/env python3
"""Persistent DPO Prompt Deduplication Index.

Maintains a full-history hash index of every prompt ever exported,
preventing overfitting from accumulating duplicate training pairs across
arbitrarily many overnight runs.

Design:
- Append-only JSON index file alongside the JSONL training data
- On export: new prompt hashes appended (no full rescan)
- On load: integrity check against disk manifest; incremental ingestion
  of any JSONL files not yet indexed
- rebuild() forces full rescan of all historical JSONL files
- Zero external dependencies (stdlib only)

Storage format (.dpo_dedup_index.json):
{
    "version": 2,
    "created_at": "2026-04-13T...",
    "last_updated": "2026-04-13T...",
    "indexed_files": ["deepdive_20260412.jsonl", ...],
    "prompt_hashes": ["a1b2c3d4e5f6a7b8", ...],
    "stats": {"total_prompts": 142, "total_files": 12}
}

Usage:
    from dedup_index import DedupIndex

    idx = DedupIndex(output_dir)      # Loads or builds automatically
    idx.contains("hash")              # O(1) lookup
    idx.add_hashes(["h1", "h2"])      # Append after export
    idx.register_file("new.jsonl")    # Track which files are indexed
    idx.rebuild()                     # Full rescan from disk

Standalone CLI:
    python3 dedup_index.py ~/.timmy/training-data/dpo-pairs/ --rebuild
    python3 dedup_index.py ~/.timmy/training-data/dpo-pairs/ --stats
"""

import hashlib
import json
import logging
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Optional, Set

logger = logging.getLogger("deepdive.dedup_index")

INDEX_FILENAME = ".dpo_dedup_index.json"
INDEX_VERSION = 2

# JSONL filename patterns to scan (covers both deepdive and twitter archive)
JSONL_PATTERNS = ["deepdive_*.jsonl", "pairs_*.jsonl"]


class DedupIndex:
    """Persistent full-history prompt deduplication index.

    Backed by a JSON file in the training data directory. Loads lazily on
    first access, rebuilds automatically if missing.
    """

    def __init__(self, output_dir: Path, auto_load: bool = True):
        self.output_dir = Path(output_dir)
        self.index_path = self.output_dir / INDEX_FILENAME
        self._hashes: Set[str] = set()
        self._indexed_files: Set[str] = set()
        self._created_at: Optional[str] = None
        self._last_updated: Optional[str] = None
        self._loaded: bool = False
        if auto_load:
            self._ensure_loaded()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def contains(self, prompt_hash: str) -> bool:
        """Check if a prompt hash exists in the full history."""
        self._ensure_loaded()
        return prompt_hash in self._hashes

    def contains_any(self, prompt_hashes: List[str]) -> Dict[str, bool]:
        """Batch lookup. Returns {hash: True/False} for each input."""
        self._ensure_loaded()
        return {h: h in self._hashes for h in prompt_hashes}
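
    # Illustrative pre-export check (a sketch, not part of the class API).
    # Assumes `idx` is a DedupIndex as in the module docstring; the names
    # `candidate_pairs` and `write_pair` are hypothetical exporter helpers.
    #
    #   for pair in candidate_pairs:
    #       h = DedupIndex.hash_prompt(pair["prompt"])
    #       if not idx.contains(h):
    #           write_pair(pair)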

    def add_hashes(self, hashes: List[str]) -> int:
        """Append new prompt hashes to the index. Returns count added."""
        self._ensure_loaded()
        before = len(self._hashes)
        self._hashes.update(hashes)
        added = len(self._hashes) - before
        if added > 0:
            self._save()
            logger.debug(f"Added {added} new hashes to dedup index")
        return added

    def register_file(self, filename: str) -> None:
        """Mark a JSONL file as indexed (prevents re-scanning)."""
        self._ensure_loaded()
        self._indexed_files.add(filename)
        self._save()

    def add_hashes_and_register(self, hashes: List[str], filename: str) -> int:
        """Atomic: append hashes + register file in one save."""
        self._ensure_loaded()
        before = len(self._hashes)
        self._hashes.update(hashes)
        self._indexed_files.add(filename)
        added = len(self._hashes) - before
        self._save()
        return added

    def rebuild(self) -> Dict[str, int]:
        """Full rebuild: scan ALL JSONL files in output_dir from scratch.

        Returns stats dict with counts.
        """
        logger.info(f"Rebuilding dedup index from {self.output_dir}")
        self._hashes.clear()
        self._indexed_files.clear()
        self._created_at = datetime.now(timezone.utc).isoformat()

        files_scanned = 0
        prompts_indexed = 0
        all_jsonl = self._discover_jsonl_files()
        for path in sorted(all_jsonl):
            file_hashes = self._extract_hashes_from_file(path)
            self._hashes.update(file_hashes)
            self._indexed_files.add(path.name)
            files_scanned += 1
            prompts_indexed += len(file_hashes)

        # Mark as loaded so a later _ensure_loaded() does not reload from disk
        self._loaded = True
        self._save()
        stats = {
            "files_scanned": files_scanned,
            "unique_prompts": len(self._hashes),
            "total_prompts_seen": prompts_indexed,
        }
        logger.info(
            f"Rebuild complete: {files_scanned} files, "
            f"{len(self._hashes)} unique prompt hashes "
            f"({prompts_indexed} total including dupes)"
        )
        return stats

    @property
    def size(self) -> int:
        """Number of unique prompt hashes in the index."""
        self._ensure_loaded()
        return len(self._hashes)

    @property
    def files_indexed(self) -> int:
        """Number of JSONL files tracked in the index."""
        self._ensure_loaded()
        return len(self._indexed_files)

    def stats(self) -> Dict:
        """Return index statistics."""
        self._ensure_loaded()
        return {
            "version": INDEX_VERSION,
            "index_path": str(self.index_path),
            "unique_prompts": len(self._hashes),
            "files_indexed": len(self._indexed_files),
            "created_at": self._created_at,
            "last_updated": self._last_updated,
        }
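
    # Sketch of a complete export step (illustrative; `idx`, `pairs`, and the
    # filename are assumptions, only the DedupIndex calls are real): hash the
    # candidate prompts, drop known ones, write the survivors to a new JSONL
    # file, then record hashes and filename in a single save.
    #
    #   fname = "deepdive_20260413.jsonl"
    #   fresh = [DedupIndex.hash_prompt(p["prompt"]) for p in pairs]
    #   fresh = [h for h in fresh if not idx.contains(h)]
    #   # ... write the corresponding pairs to fname ...
    #   idx.add_hashes_and_register(fresh, fname)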

    # ------------------------------------------------------------------
    # Internal: load / save / sync
    # ------------------------------------------------------------------

    def _ensure_loaded(self) -> None:
        """Load index if not yet loaded. Build if missing."""
        if self._loaded:
            return
        if self.index_path.exists():
            self._load()
            # Check for un-indexed files and ingest them
            self._sync_incremental()
        else:
            # No index exists: build from scratch
            if self.output_dir.exists():
                self.rebuild()
            else:
                # Empty dir, nothing to index
                self._created_at = datetime.now(timezone.utc).isoformat()
                self._loaded = True
                self._save()

    def _load(self) -> None:
        """Load index from disk."""
        try:
            with open(self.index_path, "r") as f:
                data = json.load(f)
            version = data.get("version", 1)
            if version < INDEX_VERSION:
                logger.info(f"Index version {version} < {INDEX_VERSION}, rebuilding")
                self.rebuild()
                return
            self._hashes = set(data.get("prompt_hashes", []))
            self._indexed_files = set(data.get("indexed_files", []))
            self._created_at = data.get("created_at")
            self._last_updated = data.get("last_updated")
            self._loaded = True
            logger.info(
                f"Loaded dedup index: {len(self._hashes)} hashes, "
                f"{len(self._indexed_files)} files"
            )
        except (json.JSONDecodeError, KeyError, TypeError) as e:
            logger.warning(f"Corrupt dedup index, rebuilding: {e}")
            self.rebuild()

    def _save(self) -> None:
        """Persist index to disk."""
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self._last_updated = datetime.now(timezone.utc).isoformat()
        data = {
            "version": INDEX_VERSION,
            "created_at": self._created_at or self._last_updated,
            "last_updated": self._last_updated,
            "indexed_files": sorted(self._indexed_files),
            "prompt_hashes": sorted(self._hashes),
            "stats": {
                "total_prompts": len(self._hashes),
                "total_files": len(self._indexed_files),
            },
        }
        # Atomic write: dump to a temp file, then replace the target
        # (Path.replace overwrites atomically, unlike rename on Windows)
        tmp_path = self.index_path.with_suffix(".tmp")
        with open(tmp_path, "w") as f:
            json.dump(data, f, indent=2)
        tmp_path.replace(self.index_path)

    def _sync_incremental(self) -> None:
        """Find JSONL files on disk not in the index and ingest them."""
        on_disk = self._discover_jsonl_files()
        unindexed = [p for p in on_disk if p.name not in self._indexed_files]
        if not unindexed:
            self._loaded = True
            return

        logger.info(f"Incremental sync: {len(unindexed)} new files to index")
        new_hashes = 0
        for path in sorted(unindexed):
            file_hashes = self._extract_hashes_from_file(path)
            self._hashes.update(file_hashes)
            self._indexed_files.add(path.name)
            new_hashes += len(file_hashes)

        self._loaded = True
        self._save()
        logger.info(
            f"Incremental sync complete: +{len(unindexed)} files, "
            f"+{new_hashes} prompt hashes (total: {len(self._hashes)})"
        )

    def _discover_jsonl_files(self) -> List[Path]:
        """Find all JSONL training data files in output_dir."""
        if not self.output_dir.exists():
            return []
        files = []
        for pattern in JSONL_PATTERNS:
            files.extend(self.output_dir.glob(pattern))
        return sorted(set(files))

    @staticmethod
    def _extract_hashes_from_file(path: Path) -> List[str]:
        """Extract prompt hashes from a single JSONL file."""
        hashes = []
        try:
            with open(path) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        pair = json.loads(line)
                        prompt = pair.get("prompt", "")
                        if prompt:
                            normalized = " ".join(prompt.lower().split())
                            h = hashlib.sha256(normalized.encode()).hexdigest()[:16]
                            hashes.append(h)
                    except json.JSONDecodeError:
                        continue
        except Exception as e:
            logger.warning(f"Failed to read {path}: {e}")
        return hashes

    @staticmethod
    def hash_prompt(prompt: str) -> str:
        """Compute the canonical prompt hash (same algorithm as validator)."""
        normalized = " ".join(prompt.lower().split())
        return hashlib.sha256(normalized.encode()).hexdigest()[:16]
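
# Incremental sync in practice (illustrative; the path and filename are
# assumptions): a JSONL file matching JSONL_PATTERNS that appears in the
# directory after the index was last written is hashed on the next
# construction, with no full rebuild.
#
#   idx = DedupIndex(Path("~/.timmy/training-data/dpo-pairs").expanduser())
#   # ... another run writes pairs_20260413.jsonl into the directory ...
#   idx2 = DedupIndex(idx.output_dir)   # ingests only the new file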

# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def main():
    import argparse

    parser = argparse.ArgumentParser(
        description="DPO dedup index management"
    )
    parser.add_argument(
        "output_dir", type=Path, help="Path to DPO pairs directory"
    )
    parser.add_argument(
        "--rebuild", action="store_true",
        help="Force full rebuild from all JSONL files"
    )
    parser.add_argument(
        "--stats", action="store_true", help="Print index statistics"
    )
    parser.add_argument(
        "--json", action="store_true", help="Output as JSON"
    )
    args = parser.parse_args()

    if not args.output_dir.exists():
        print(f"Error: directory not found: {args.output_dir}")
        return 1

    idx = DedupIndex(args.output_dir, auto_load=not args.rebuild)

    if args.rebuild:
        result = idx.rebuild()
        if args.json:
            print(json.dumps(result, indent=2))
        else:
            print(f"Rebuilt index: {result['files_scanned']} files, "
                  f"{result['unique_prompts']} unique prompts")

    s = idx.stats()
    if args.json:
        print(json.dumps(s, indent=2))
    else:
        print("=" * 50)
        print("  DPO DEDUP INDEX")
        print("=" * 50)
        print(f"  Path:           {s['index_path']}")
        print(f"  Unique prompts: {s['unique_prompts']}")
        print(f"  Files indexed:  {s['files_indexed']}")
        print(f"  Created:        {s['created_at']}")
        print(f"  Last updated:   {s['last_updated']}")
        print("=" * 50)
    return 0


if __name__ == "__main__":
    sys.exit(main())