""" Tests for scripts/hash_dedup.py — Bounded hash deduplication. """ import json import os import shutil import tempfile import unittest from datetime import datetime, timedelta from pathlib import Path import sys sys.path.insert(0, str(Path(__file__).parent.parent / "scripts")) from hash_dedup import HashDedup class TestHashDedup(unittest.TestCase): def setUp(self): self.tmpdir = tempfile.mkdtemp() self.dedup = HashDedup(self.tmpdir) def tearDown(self): shutil.rmtree(self.tmpdir) def test_compute_hash(self): h = HashDedup.compute_hash("test content") self.assertEqual(len(h), 64) # SHA-256 hex self.assertTrue(all(c in '0123456789abcdef' for c in h)) def test_same_content_same_hash(self): h1 = HashDedup.compute_hash("hello") h2 = HashDedup.compute_hash("hello") self.assertEqual(h1, h2) def test_different_content_different_hash(self): h1 = HashDedup.compute_hash("hello") h2 = HashDedup.compute_hash("world") self.assertNotEqual(h1, h2) def test_add_new(self): result = self.dedup.add("new content") self.assertTrue(result) def test_add_duplicate(self): self.dedup.add("content") result = self.dedup.add("content") self.assertFalse(result) def test_is_duplicate_false(self): self.assertFalse(self.dedup.is_duplicate("unknown")) def test_is_duplicate_true(self): self.dedup.add("known content") self.assertTrue(self.dedup.is_duplicate("known content")) def test_add_batch(self): items = ["a", "b", "c"] added = self.dedup.add_batch(items) self.assertEqual(added, 3) def test_add_batch_deduplicates(self): items = ["a", "b", "a", "c", "b"] added = self.dedup.add_batch(items) self.assertEqual(added, 3) def test_creates_date_file(self): self.dedup.add("test") today = datetime.utcnow().strftime("%Y-%m-%d") path = Path(self.tmpdir) / f"{today}.json" self.assertTrue(path.exists()) def test_file_format(self): self.dedup.add("test") today = datetime.utcnow().strftime("%Y-%m-%d") path = Path(self.tmpdir) / f"{today}.json" with open(path) as f: data = json.load(f) self.assertEqual(data["date"], today) self.assertEqual(data["count"], 1) self.assertEqual(len(data["hashes"]), 1) def test_cleanup_removes_old(self): # Create fake old file old_date = (datetime.utcnow() - timedelta(days=10)).strftime("%Y-%m-%d") old_path = Path(self.tmpdir) / f"{old_date}.json" with open(old_path, 'w') as f: json.dump({"date": old_date, "count": 0, "hashes": []}, f) removed = self.dedup.cleanup(keep_days=7) self.assertEqual(removed, 1) self.assertFalse(old_path.exists()) def test_cleanup_keeps_recent(self): recent_date = (datetime.utcnow() - timedelta(days=3)).strftime("%Y-%m-%d") recent_path = Path(self.tmpdir) / f"{recent_date}.json" with open(recent_path, 'w') as f: json.dump({"date": recent_date, "count": 0, "hashes": []}, f) removed = self.dedup.cleanup(keep_days=7) self.assertEqual(removed, 0) self.assertTrue(recent_path.exists()) def test_cleanup_ignores_non_date_files(self): junk = Path(self.tmpdir) / "not-a-date.json" with open(junk, 'w') as f: f.write("{}") removed = self.dedup.cleanup(keep_days=1) self.assertEqual(removed, 0) self.assertTrue(junk.exists()) def test_stats_empty(self): stats = self.dedup.stats() self.assertEqual(stats["file_count"], 0) self.assertEqual(stats["total_hashes"], 0) def test_stats_with_data(self): self.dedup.add("one") self.dedup.add("two") stats = self.dedup.stats() self.assertEqual(stats["file_count"], 1) self.assertEqual(stats["total_hashes"], 2) self.assertEqual(stats["today_count"], 2) def test_max_hashes_per_file(self): dedup = HashDedup(self.tmpdir, max_hashes_per_file=3) for i in range(10): dedup.add(f"content-{i}") today = datetime.utcnow().strftime("%Y-%m-%d") path = Path(self.tmpdir) / f"{today}.json" with open(path) as f: data = json.load(f) self.assertLessEqual(len(data["hashes"]), 3) if __name__ == "__main__": unittest.main()