208 lines
6.7 KiB
Python
208 lines
6.7 KiB
Python
|
|
"""Tests for knowledge deduplication module (Issue #196)."""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
|
||
|
|
|
||
|
|
from dedup import (
|
||
|
|
normalize_text,
|
||
|
|
content_hash,
|
||
|
|
tokenize,
|
||
|
|
token_similarity,
|
||
|
|
quality_score,
|
||
|
|
merge_facts,
|
||
|
|
dedup_facts,
|
||
|
|
generate_test_duplicates,
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
class TestNormalize:
    """normalize_text should lowercase, collapse whitespace runs, and strip ends."""

    def test_lowercases(self):
        result = normalize_text("Hello World")
        assert result == "hello world"

    def test_collapses_whitespace(self):
        result = normalize_text("  hello   world  ")
        assert result == "hello world"

    def test_strips(self):
        result = normalize_text("  text  ")
        assert result == "text"
|
||
|
|
|
||
|
|
|
||
|
|
class TestContentHash:
    """content_hash should be a normalization-insensitive SHA256 hex digest."""

    def test_deterministic(self):
        # Case and surrounding whitespace must not change the digest.
        digests = {
            content_hash("Hello World"),
            content_hash("hello world"),
            content_hash("  Hello World  "),
        }
        assert len(digests) == 1

    def test_different_texts(self):
        assert content_hash("Hello") != content_hash("World")

    def test_returns_hex(self):
        digest = content_hash("test")
        assert len(digest) == 64  # SHA256 hex digest length
        assert set(digest) <= set("0123456789abcdef")
|
||
|
|
|
||
|
|
|
||
|
|
class TestTokenize:
    """tokenize should return a set of lowercased words, dropping very short ones."""

    def test_extracts_words(self):
        tokens = tokenize("Hello World Test")
        assert {"hello", "world", "test"} <= tokens

    def test_skips_short_words(self):
        tokens = tokenize("a to is the hello")
        assert "a" not in tokens
        assert "to" not in tokens
        assert "hello" in tokens

    def test_returns_set(self):
        # Repeated words collapse: a set is returned, not a list.
        tokens = tokenize("hello hello world")
        assert isinstance(tokens, set)
        assert len(tokens) == 2
|
||
|
|
|
||
|
|
|
||
|
|
class TestTokenSimilarity:
    """token_similarity: symmetric score, 1.0 for identical, 0.0 for disjoint/empty."""

    def test_identical(self):
        assert token_similarity("hello world", "hello world") == 1.0

    def test_no_overlap(self):
        assert token_similarity("alpha beta", "gamma delta") == 0.0

    def test_partial_overlap(self):
        # Two of three tokens shared -> score strictly between the extremes.
        sim = token_similarity("hello world test", "hello universe test")
        assert 0.3 < sim < 0.7

    def test_empty(self):
        for left, right in (("", "hello"), ("hello", "")):
            assert token_similarity(left, right) == 0.0

    def test_symmetric(self):
        first = "hello world test"
        second = "hello universe test"
        assert token_similarity(first, second) == token_similarity(second, first)
|
||
|
|
|
||
|
|
|
||
|
|
class TestQualityScore:
    """quality_score should rank richer, higher-confidence facts above sparse ones."""

    def test_high_confidence(self):
        fact = {"confidence": 0.95, "source_count": 5, "tags": ["test"], "related": ["x"]}
        assert quality_score(fact) > 0.7

    def test_low_confidence(self):
        fact = {"confidence": 0.3, "source_count": 1}
        assert quality_score(fact) < 0.5

    def test_defaults(self):
        # Missing fields fall back to defaults; score stays strictly inside (0, 1).
        assert 0 < quality_score({}) < 1
|
||
|
|
|
||
|
|
|
||
|
|
class TestMergeFacts:
    """merge_facts should fold the dropped fact's metadata into the kept one."""

    def test_merges_tags(self):
        keep = {"id": "a", "fact": "test", "tags": ["git"], "confidence": 0.9}
        drop = {"id": "b", "fact": "test", "tags": ["python"], "confidence": 0.8}
        merged = merge_facts(keep, drop)
        # Tags from both sides survive the merge.
        assert {"git", "python"} <= set(merged["tags"])

    def test_merges_source_count(self):
        keep = {"id": "a", "fact": "test", "source_count": 3}
        drop = {"id": "b", "fact": "test", "source_count": 2}
        assert merge_facts(keep, drop)["source_count"] == 5

    def test_keeps_higher_confidence(self):
        keep = {"id": "a", "fact": "test", "confidence": 0.7}
        drop = {"id": "b", "fact": "test", "confidence": 0.9}
        # The larger confidence wins even when it comes from the dropped fact.
        assert merge_facts(keep, drop)["confidence"] == 0.9

    def test_tracks_merged_from(self):
        merged = merge_facts({"id": "a", "fact": "test"}, {"id": "b", "fact": "test"})
        assert "b" in merged["_merged_from"]
|
||
|
|
|
||
|
|
|
||
|
|
class TestDedupFacts:
    """End-to-end behaviour of dedup_facts over small in-memory fact lists."""

    def test_removes_exact_dupes(self):
        facts = [
            {"id": "1", "fact": "Always use git rebase"},
            {"id": "2", "fact": "Always use git rebase"},  # byte-identical duplicate
            {"id": "3", "fact": "Check logs first"},
        ]
        _, stats = dedup_facts(facts)
        assert stats["exact_dupes"] == 1
        assert stats["unique"] == 2

    def test_removes_near_dupes(self):
        facts = [
            {"id": "1", "fact": "Always check logs before deploying to production server"},
            {"id": "2", "fact": "Always check logs before deploying to production environment"},
            {"id": "3", "fact": "Use docker compose for local development environments"},
        ]
        _, stats = dedup_facts(facts, near_threshold=0.5)
        assert stats["near_dupes"] >= 1
        assert stats["unique"] == 2

    def test_preserves_unique(self):
        facts = [
            {"id": "1", "fact": "Use git rebase for clean history"},
            {"id": "2", "fact": "Docker containers should be stateless"},
            {"id": "3", "fact": "Always write tests before code"},
        ]
        _, stats = dedup_facts(facts)
        assert stats["unique"] == 3
        assert stats["removed"] == 0

    def test_empty_input(self):
        _, stats = dedup_facts([])
        assert stats["total"] == 0
        assert stats["unique"] == 0

    def test_keeps_higher_quality_near_dup(self):
        low = {"id": "1", "fact": "Check logs before deploying to production server", "confidence": 0.5, "source_count": 1}
        high = {"id": "2", "fact": "Check logs before deploying to production environment", "confidence": 0.9, "source_count": 5, "tags": ["ops"]}
        deduped, stats = dedup_facts([low, high], near_threshold=0.5)
        assert stats["unique"] == 1
        # The surviving fact must be the higher-quality one.
        assert deduped[0]["confidence"] == 0.9

    def test_dry_run_does_not_modify(self):
        facts = [
            {"id": "1", "fact": "Same text"},
            {"id": "2", "fact": "Same text"},
        ]
        deduped, stats = dedup_facts(facts, dry_run=True)
        assert stats["exact_dupes"] == 1
        # dry_run skips merge_facts, so the kept fact is left untouched.
        assert len(deduped) == 1
|
||
|
|
|
||
|
|
|
||
|
|
class TestGenerateTestDuplicates:
    """generate_test_duplicates should emit unique facts plus seeded duplicates."""

    def test_generates_correct_count(self):
        # 20 unique facts plus injected duplicates -> strictly more than 20 entries.
        assert len(generate_test_duplicates(20)) > 20

    def test_has_exact_dupes(self):
        facts = generate_test_duplicates(20)
        hashes = [content_hash(item["fact"]) for item in facts]
        # A hash collision proves at least one exact duplicate was generated.
        assert len(set(hashes)) < len(hashes)

    def test_dedup_removes_dupes(self):
        _, stats = dedup_facts(generate_test_duplicates(20))
        assert stats["unique"] <= 20
        assert stats["removed"] > 0
|