"""Tests for knowledge deduplication module (Issue #196).""" import json import sys from pathlib import Path import pytest sys.path.insert(0, str(Path(__file__).parent.parent / "scripts")) from dedup import ( normalize_text, content_hash, tokenize, token_similarity, quality_score, merge_facts, dedup_facts, generate_test_duplicates, ) class TestNormalize: def test_lowercases(self): assert normalize_text("Hello World") == "hello world" def test_collapses_whitespace(self): assert normalize_text(" hello world ") == "hello world" def test_strips(self): assert normalize_text(" text ") == "text" class TestContentHash: def test_deterministic(self): h1 = content_hash("Hello World") h2 = content_hash("hello world") h3 = content_hash(" Hello World ") assert h1 == h2 == h3 def test_different_texts(self): h1 = content_hash("Hello") h2 = content_hash("World") assert h1 != h2 def test_returns_hex(self): h = content_hash("test") assert len(h) == 64 # SHA256 assert all(c in '0123456789abcdef' for c in h) class TestTokenize: def test_extracts_words(self): tokens = tokenize("Hello World Test") assert "hello" in tokens assert "world" in tokens assert "test" in tokens def test_skips_short_words(self): tokens = tokenize("a to is the hello") assert "a" not in tokens assert "to" not in tokens assert "hello" in tokens def test_returns_set(self): tokens = tokenize("hello hello world") assert isinstance(tokens, set) assert len(tokens) == 2 class TestTokenSimilarity: def test_identical(self): assert token_similarity("hello world", "hello world") == 1.0 def test_no_overlap(self): assert token_similarity("alpha beta", "gamma delta") == 0.0 def test_partial_overlap(self): sim = token_similarity("hello world test", "hello universe test") assert 0.3 < sim < 0.7 def test_empty(self): assert token_similarity("", "hello") == 0.0 assert token_similarity("hello", "") == 0.0 def test_symmetric(self): a = "hello world test" b = "hello universe test" assert token_similarity(a, b) == token_similarity(b, a) 
class TestQualityScore:
    """quality_score should rank richer, higher-confidence facts above weak ones."""

    def test_high_confidence(self):
        result = quality_score(
            {"confidence": 0.95, "source_count": 5, "tags": ["test"], "related": ["x"]}
        )
        assert result > 0.7

    def test_low_confidence(self):
        result = quality_score({"confidence": 0.3, "source_count": 1})
        assert result < 0.5

    def test_defaults(self):
        # A bare fact still scores strictly between the extremes.
        result = quality_score({})
        assert 0 < result
        assert result < 1


class TestMergeFacts:
    """merge_facts should union metadata and keep the strongest signals."""

    def test_merges_tags(self):
        primary = {"id": "a", "fact": "test", "tags": ["git"], "confidence": 0.9}
        secondary = {"id": "b", "fact": "test", "tags": ["python"], "confidence": 0.8}
        combined = merge_facts(primary, secondary)
        assert "git" in combined["tags"]
        assert "python" in combined["tags"]

    def test_merges_source_count(self):
        primary = {"id": "a", "fact": "test", "source_count": 3}
        secondary = {"id": "b", "fact": "test", "source_count": 2}
        # Source counts add up rather than being replaced.
        assert merge_facts(primary, secondary)["source_count"] == 5

    def test_keeps_higher_confidence(self):
        primary = {"id": "a", "fact": "test", "confidence": 0.7}
        secondary = {"id": "b", "fact": "test", "confidence": 0.9}
        # The dropped fact's higher confidence wins.
        assert merge_facts(primary, secondary)["confidence"] == 0.9

    def test_tracks_merged_from(self):
        combined = merge_facts({"id": "a", "fact": "test"}, {"id": "b", "fact": "test"})
        assert "b" in combined["_merged_from"]


class TestDedupFacts:
    """dedup_facts should drop exact and near duplicates while keeping unique facts."""

    def test_removes_exact_dupes(self):
        entries = [
            {"id": "1", "fact": "Always use git rebase"},
            {"id": "2", "fact": "Always use git rebase"},  # byte-identical duplicate
            {"id": "3", "fact": "Check logs first"},
        ]
        _, stats = dedup_facts(entries)
        assert stats["exact_dupes"] == 1
        assert stats["unique"] == 2

    def test_removes_near_dupes(self):
        entries = [
            {"id": "1", "fact": "Always check logs before deploying to production server"},
            {"id": "2", "fact": "Always check logs before deploying to production environment"},
            {"id": "3", "fact": "Use docker compose for local development environments"},
        ]
        _, stats = dedup_facts(entries, near_threshold=0.5)
        assert stats["near_dupes"] >= 1
        assert stats["unique"] == 2

    def test_preserves_unique(self):
        entries = [
            {"id": "1", "fact": "Use git rebase for clean history"},
            {"id": "2", "fact": "Docker containers should be stateless"},
            {"id": "3", "fact": "Always write tests before code"},
        ]
        _, stats = dedup_facts(entries)
        assert stats["unique"] == 3
        assert stats["removed"] == 0

    def test_empty_input(self):
        _, stats = dedup_facts([])
        assert stats["total"] == 0
        assert stats["unique"] == 0

    def test_keeps_higher_quality_near_dup(self):
        weak = {
            "id": "1",
            "fact": "Check logs before deploying to production server",
            "confidence": 0.5,
            "source_count": 1,
        }
        strong = {
            "id": "2",
            "fact": "Check logs before deploying to production environment",
            "confidence": 0.9,
            "source_count": 5,
            "tags": ["ops"],
        }
        remaining, stats = dedup_facts([weak, strong], near_threshold=0.5)
        assert stats["unique"] == 1
        # The survivor must be the higher-quality fact.
        assert remaining[0]["confidence"] == 0.9

    def test_dry_run_does_not_modify(self):
        entries = [
            {"id": "1", "fact": "Same text"},
            {"id": "2", "fact": "Same text"},
        ]
        remaining, stats = dedup_facts(entries, dry_run=True)
        assert stats["exact_dupes"] == 1
        # dry_run skips merge_facts, so surviving facts are untouched.
        assert len(remaining) == 1


class TestGenerateTestDuplicates:
    """generate_test_duplicates should build a corpus with planted duplicates."""

    def test_generates_correct_count(self):
        # 20 unique facts plus injected duplicates.
        assert len(generate_test_duplicates(20)) > 20

    def test_has_exact_dupes(self):
        digests = [content_hash(item["fact"]) for item in generate_test_duplicates(20)]
        # At least one digest repeats if exact duplicates were planted.
        assert len(set(digests)) < len(digests)

    def test_dedup_removes_dupes(self):
        _, stats = dedup_facts(generate_test_duplicates(20))
        assert stats["unique"] <= 20
        assert stats["removed"] > 0