feat: knowledge deduplication — content hash + token similarity (#196)

Dedup module for knowledge entries with: - SHA256 content hashing for exact duplicates - Token Jaccard similarity for near-duplicates (default 0.95) - Quality-based merge: keeps higher confidence/source_count - Metadata merging: tags, related, source_count - Dry-run mode - 30 tests passing - Built-in --test mode with generated duplicates Usage: python scripts/dedup.py --input knowledge/index.json python scripts/dedup.py --input knowledge/index.json --dry-run python scripts/dedup.py --test Closes #196.
2026-04-21 07:58:09 -04:00
parent fe8a70adc1
commit cc215e3ed7
2 changed files with 524 additions and 0 deletions
--- a/tests/test_dedup.py
+++ b/tests/test_dedup.py
@@ -0,0 +1,207 @@
+"""Tests for knowledge deduplication module (Issue #196)."""
+
+import json
+import sys
+from pathlib import Path
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
+
+from dedup import (
+    normalize_text,
+    content_hash,
+    tokenize,
+    token_similarity,
+    quality_score,
+    merge_facts,
+    dedup_facts,
+    generate_test_duplicates,
+)
+
+
+class TestNormalize:
+    def test_lowercases(self):
+        assert normalize_text("Hello World") == "hello world"
+
+    def test_collapses_whitespace(self):
+        assert normalize_text("  hello   world  ") == "hello world"
+
+    def test_strips(self):
+        assert normalize_text("  text  ") == "text"
+
+
+class TestContentHash:
+    def test_deterministic(self):
+        h1 = content_hash("Hello World")
+        h2 = content_hash("hello world")
+        h3 = content_hash("  Hello   World  ")
+        assert h1 == h2 == h3
+
+    def test_different_texts(self):
+        h1 = content_hash("Hello")
+        h2 = content_hash("World")
+        assert h1 != h2
+
+    def test_returns_hex(self):
+        h = content_hash("test")
+        assert len(h) == 64  # SHA256
+        assert all(c in '0123456789abcdef' for c in h)
+
+
+class TestTokenize:
+    def test_extracts_words(self):
+        tokens = tokenize("Hello World Test")
+        assert "hello" in tokens
+        assert "world" in tokens
+        assert "test" in tokens
+
+    def test_skips_short_words(self):
+        tokens = tokenize("a to is the hello")
+        assert "a" not in tokens
+        assert "to" not in tokens
+        assert "hello" in tokens
+
+    def test_returns_set(self):
+        tokens = tokenize("hello hello world")
+        assert isinstance(tokens, set)
+        assert len(tokens) == 2
+
+
+class TestTokenSimilarity:
+    def test_identical(self):
+        assert token_similarity("hello world", "hello world") == 1.0
+
+    def test_no_overlap(self):
+        assert token_similarity("alpha beta", "gamma delta") == 0.0
+
+    def test_partial_overlap(self):
+        sim = token_similarity("hello world test", "hello universe test")
+        assert 0.3 < sim < 0.7
+
+    def test_empty(self):
+        assert token_similarity("", "hello") == 0.0
+        assert token_similarity("hello", "") == 0.0
+
+    def test_symmetric(self):
+        a = "hello world test"
+        b = "hello universe test"
+        assert token_similarity(a, b) == token_similarity(b, a)
+
+
+class TestQualityScore:
+    def test_high_confidence(self):
+        fact = {"confidence": 0.95, "source_count": 5, "tags": ["test"], "related": ["x"]}
+        score = quality_score(fact)
+        assert score > 0.7
+
+    def test_low_confidence(self):
+        fact = {"confidence": 0.3, "source_count": 1}
+        score = quality_score(fact)
+        assert score < 0.5
+
+    def test_defaults(self):
+        score = quality_score({})
+        assert 0 < score < 1
+
+
+class TestMergeFacts:
+    def test_merges_tags(self):
+        keep = {"id": "a", "fact": "test", "tags": ["git"], "confidence": 0.9}
+        drop = {"id": "b", "fact": "test", "tags": ["python"], "confidence": 0.8}
+        merged = merge_facts(keep, drop)
+        assert "git" in merged["tags"]
+        assert "python" in merged["tags"]
+
+    def test_merges_source_count(self):
+        keep = {"id": "a", "fact": "test", "source_count": 3}
+        drop = {"id": "b", "fact": "test", "source_count": 2}
+        merged = merge_facts(keep, drop)
+        assert merged["source_count"] == 5
+
+    def test_keeps_higher_confidence(self):
+        keep = {"id": "a", "fact": "test", "confidence": 0.7}
+        drop = {"id": "b", "fact": "test", "confidence": 0.9}
+        merged = merge_facts(keep, drop)
+        assert merged["confidence"] == 0.9
+
+    def test_tracks_merged_from(self):
+        keep = {"id": "a", "fact": "test"}
+        drop = {"id": "b", "fact": "test"}
+        merged = merge_facts(keep, drop)
+        assert "b" in merged["_merged_from"]
+
+
+class TestDedupFacts:
+    def test_removes_exact_dupes(self):
+        facts = [
+            {"id": "1", "fact": "Always use git rebase"},
+            {"id": "2", "fact": "Always use git rebase"},  # exact dupe
+            {"id": "3", "fact": "Check logs first"},
+        ]
+        deduped, stats = dedup_facts(facts)
+        assert stats["exact_dupes"] == 1
+        assert stats["unique"] == 2
+
+    def test_removes_near_dupes(self):
+        facts = [
+            {"id": "1", "fact": "Always check logs before deploying to production server"},
+            {"id": "2", "fact": "Always check logs before deploying to production environment"},
+            {"id": "3", "fact": "Use docker compose for local development environments"},
+        ]
+        deduped, stats = dedup_facts(facts, near_threshold=0.5)
+        assert stats["near_dupes"] >= 1
+        assert stats["unique"] == 2
+
+    def test_preserves_unique(self):
+        facts = [
+            {"id": "1", "fact": "Use git rebase for clean history"},
+            {"id": "2", "fact": "Docker containers should be stateless"},
+            {"id": "3", "fact": "Always write tests before code"},
+        ]
+        deduped, stats = dedup_facts(facts)
+        assert stats["unique"] == 3
+        assert stats["removed"] == 0
+
+    def test_empty_input(self):
+        deduped, stats = dedup_facts([])
+        assert stats["total"] == 0
+        assert stats["unique"] == 0
+
+    def test_keeps_higher_quality_near_dup(self):
+        facts = [
+            {"id": "1", "fact": "Check logs before deploying to production server", "confidence": 0.5, "source_count": 1},
+            {"id": "2", "fact": "Check logs before deploying to production environment", "confidence": 0.9, "source_count": 5, "tags": ["ops"]},
+        ]
+        deduped, stats = dedup_facts(facts, near_threshold=0.5)
+        assert stats["unique"] == 1
+        # Higher quality fact should be kept
+        assert deduped[0]["confidence"] == 0.9
+
+    def test_dry_run_does_not_modify(self):
+        facts = [
+            {"id": "1", "fact": "Same text"},
+            {"id": "2", "fact": "Same text"},
+        ]
+        deduped, stats = dedup_facts(facts, dry_run=True)
+        assert stats["exact_dupes"] == 1
+        # In dry_run, merge_facts is skipped so facts aren't modified
+        assert len(deduped) == 1
+
+
+class TestGenerateTestDuplicates:
+    def test_generates_correct_count(self):
+        facts = generate_test_duplicates(20)
+        assert len(facts) > 20  # 20 unique + duplicates
+
+    def test_has_exact_dupes(self):
+        facts = generate_test_duplicates(20)
+        hashes = [content_hash(f["fact"]) for f in facts]
+        # Should have some duplicate hashes
+        assert len(hashes) != len(set(hashes))
+
+    def test_dedup_removes_dupes(self):
+        facts = generate_test_duplicates(20)
+        deduped, stats = dedup_facts(facts)
+        assert stats["unique"] <= 20
+        assert stats["removed"] > 0