Files
timmy-config/scripts/hash_dedup.py
2026-04-16 04:58:55 +00:00

265 lines
8.1 KiB
Python

#!/usr/bin/env python3
"""
hash_dedup.py — Deduplication with bounded hash storage.
Stores SHA-256 hashes of validated content in date-stamped files.
Rotates daily. Keeps only last N days. Prevents unbounded growth.
Usage:
from hash_dedup import HashDedup
dedup = HashDedup("/path/to/.hashes")
if dedup.is_duplicate("some content"):
print("Already seen")
else:
dedup.add("some content")
print("New content")
# Cleanup old hashes
dedup.cleanup(keep_days=7)
"""
import hashlib
import json
import os
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Optional, Set
class HashDedup:
    """
    Bounded hash-based deduplication with daily rotation.

    SHA-256 digests of seen content are stored in one JSON file per UTC day;
    old files are pruned with cleanup(), so storage stays bounded.

    Storage layout:
        .hashes/
            2026-04-15.json   (one file per day)
            2026-04-14.json
            ...
    """

    def __init__(self, storage_dir: str, max_hashes_per_file: int = 100000):
        """
        Args:
            storage_dir: Directory holding the per-day hash files
                (created if missing).
            max_hashes_per_file: Cap on hashes persisted per daily file.
        """
        self.storage_dir = Path(storage_dir)
        self.max_hashes_per_file = max_hashes_per_file
        self.storage_dir.mkdir(parents=True, exist_ok=True)
        # Cache of today's hash set, tagged with the date it was loaded for.
        # The tag lets a UTC-midnight rollover invalidate the cache; the
        # previous version cached the Path/set forever, so a long-running
        # process kept writing to a stale day file after midnight.
        self._today_hashes: Optional[Set[str]] = None
        self._today_date: Optional[str] = None

    @staticmethod
    def _utcnow() -> datetime:
        """Naive UTC 'now' (datetime.utcnow() is deprecated since 3.12)."""
        return datetime.now(timezone.utc).replace(tzinfo=None)

    def _today(self) -> str:
        """Current UTC date string, e.g. '2026-04-16'."""
        return self._utcnow().strftime("%Y-%m-%d")

    def _date_file(self, date_str: str) -> Path:
        """Path to the hash file for a given date string."""
        return self.storage_dir / f"{date_str}.json"

    @property
    def today_file(self) -> Path:
        """Path to today's hash file, recomputed so day rollover is honored."""
        return self._date_file(self._today())

    def _load_today(self) -> Set[str]:
        """Load (and cache) today's hash set; empty on missing/corrupt file."""
        today = self._today()
        if self._today_hashes is not None and self._today_date == today:
            return self._today_hashes
        hashes: Set[str] = set()
        path = self._date_file(today)
        if path.exists():
            try:
                with open(path, encoding="utf-8") as f:
                    hashes = set(json.load(f).get("hashes", []))
            except (json.JSONDecodeError, IOError):
                # Corrupt or unreadable file: start fresh rather than crash.
                pass
        self._today_date = today
        self._today_hashes = hashes
        return hashes

    def _save_today(self) -> None:
        """Persist today's hashes atomically, enforcing the per-file cap."""
        hashes = self._load_today()
        if len(hashes) > self.max_hashes_per_file:
            # Deterministic truncation (the old code sliced an arbitrarily
            # ordered list) and keep the in-memory cache consistent with disk.
            hashes = set(sorted(hashes)[: self.max_hashes_per_file])
            self._today_hashes = hashes
        data = {
            "date": self._today_date,
            "count": len(hashes),
            "hashes": sorted(hashes),
        }
        path = self._date_file(self._today_date)
        # Write to a sibling temp file then rename, so a crash mid-write
        # cannot leave a truncated/corrupt day file behind.
        tmp = path.with_suffix(".tmp")
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(data, f)
        os.replace(tmp, path)

    @staticmethod
    def compute_hash(content: str) -> str:
        """Compute SHA-256 hex digest of content (UTF-8 encoded)."""
        return hashlib.sha256(content.encode("utf-8")).hexdigest()

    def is_duplicate(self, content: str) -> bool:
        """Return True if content's hash is already in today's file."""
        return self.compute_hash(content) in self._load_today()

    def is_duplicate_any(self, content: str, lookback_days: int = 7) -> bool:
        """Return True if content's hash appears today or within lookback_days."""
        h = self.compute_hash(content)
        if h in self._load_today():
            return True
        now = self._utcnow()
        for i in range(1, lookback_days + 1):
            path = self._date_file((now - timedelta(days=i)).strftime("%Y-%m-%d"))
            if not path.exists():
                continue
            try:
                with open(path, encoding="utf-8") as f:
                    data = json.load(f)
                if h in set(data.get("hashes", [])):
                    return True
            except (json.JSONDecodeError, IOError):
                continue
        return False

    def add(self, content: str) -> bool:
        """Add content's hash. Returns True if added (new), False if duplicate."""
        h = self.compute_hash(content)
        hashes = self._load_today()
        if h in hashes:
            return False
        hashes.add(h)
        self._save_today()
        return True

    def add_batch(self, contents: list) -> int:
        """Add many contents with a single save. Returns count of new hashes."""
        hashes = self._load_today()
        new_count = 0
        for content in contents:
            h = self.compute_hash(content)
            if h not in hashes:
                hashes.add(h)
                new_count += 1
        if new_count > 0:
            self._save_today()
        return new_count

    def cleanup(self, keep_days: int = 7) -> int:
        """
        Remove hash files older than keep_days.
        Non-date-named *.json files are left alone.
        Returns count of files removed.
        """
        removed = 0
        cutoff = self._utcnow() - timedelta(days=keep_days)
        for path in self.storage_dir.glob("*.json"):
            try:
                file_date = datetime.strptime(path.stem, "%Y-%m-%d")
            except ValueError:
                continue  # not a date-named file, skip
            if file_date < cutoff:
                path.unlink()
                removed += 1
        return removed

    def stats(self) -> dict:
        """Aggregate statistics across all stored (readable) hash files."""
        total_hashes = 0
        file_count = 0
        oldest: Optional[datetime] = None
        newest: Optional[datetime] = None
        for path in self.storage_dir.glob("*.json"):
            try:
                file_date = datetime.strptime(path.stem, "%Y-%m-%d")
                with open(path, encoding="utf-8") as f:
                    data = json.load(f)
            except (ValueError, json.JSONDecodeError, IOError):
                # Count a file only after it is both date-named and readable
                # (the old code counted it before attempting the read).
                continue
            file_count += 1
            if oldest is None or file_date < oldest:
                oldest = file_date
            if newest is None or file_date > newest:
                newest = file_date
            total_hashes += data.get("count", 0)
        return {
            "file_count": file_count,
            "total_hashes": total_hashes,
            "oldest_file": oldest.strftime("%Y-%m-%d") if oldest else None,
            "newest_file": newest.strftime("%Y-%m-%d") if newest else None,
            "today_count": len(self._load_today()),
        }
def main():
    """CLI for hash_dedup operations: cleanup, stats, check, add, bulk add."""
    import argparse

    parser = argparse.ArgumentParser(description="Hash dedup with bounded storage")
    parser.add_argument("--dir", default=".hashes", help="Hash storage directory")
    parser.add_argument("--cleanup", type=int, metavar="DAYS",
                        help="Remove files older than N days")
    parser.add_argument("--stats", action="store_true", help="Show statistics")
    parser.add_argument("--check", type=str, help="Check if content hash exists")
    parser.add_argument("--add", type=str, help="Add content hash")
    parser.add_argument("--file", type=str, help="Add hashes from file (one per line)")
    args = parser.parse_args()

    dedup = HashDedup(args.dir)

    if args.cleanup is not None:
        removed = dedup.cleanup(keep_days=args.cleanup)
        print(f"Cleaned up {removed} files older than {args.cleanup} days")

    if args.stats:
        stats = dedup.stats()
        print("Hash Dedup Statistics:")
        print(f" Files: {stats['file_count']}")
        print(f" Total hashes: {stats['total_hashes']}")
        print(f" Today: {stats['today_count']}")
        print(f" Date range: {stats['oldest_file']} to {stats['newest_file']}")

    # Compare against None, not truthiness: the previous truthiness checks
    # silently ignored empty-string content (--check "" / --add "").
    if args.check is not None:
        print("DUPLICATE" if dedup.is_duplicate_any(args.check) else "NEW")

    if args.add is not None:
        if dedup.add(args.add):
            print(f"Added: {dedup.compute_hash(args.add)}")
        else:
            print("Already exists")

    if args.file:
        with open(args.file, encoding="utf-8") as f:
            lines = [line.strip() for line in f if line.strip()]
        added = dedup.add_batch(lines)
        print(f"Added {added}/{len(lines)} new hashes")


if __name__ == "__main__":
    main()