208 lines
6.7 KiB
Python
208 lines
6.7 KiB
Python
|
|
"""Tests for knowledge deduplication module (Issue #196)."""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
|
||
|
|
|
||
|
|
from dedup import (
|
||
|
|
normalize_text,
|
||
|
|
content_hash,
|
||
|
|
tokenize,
|
||
|
|
token_similarity,
|
||
|
|
quality_score,
|
||
|
|
merge_facts,
|
||
|
|
dedup_facts,
|
||
|
|
generate_test_duplicates,
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
class TestNormalize:
    """normalize_text should lowercase, collapse whitespace runs, and strip ends."""

    def test_lowercases(self):
        result = normalize_text("Hello World")
        assert result == "hello world"

    def test_collapses_whitespace(self):
        result = normalize_text("  hello   world  ")
        assert result == "hello world"

    def test_strips(self):
        result = normalize_text("  text  ")
        assert result == "text"
|
||
|
|
|
||
|
|
|
||
|
|
class TestContentHash:
    """content_hash should be a normalization-insensitive SHA256 hex digest."""

    def test_deterministic(self):
        # Case and surrounding whitespace must not change the digest.
        digests = {
            content_hash("Hello World"),
            content_hash("hello world"),
            content_hash("  Hello World  "),
        }
        assert len(digests) == 1

    def test_different_texts(self):
        assert content_hash("Hello") != content_hash("World")

    def test_returns_hex(self):
        digest = content_hash("test")
        assert len(digest) == 64  # SHA256 hex digest length
        assert set(digest) <= set("0123456789abcdef")
|
||
|
|
|
||
|
|
|
||
|
|
class TestTokenize:
    """tokenize should return a set of lowercased words, dropping very short ones."""

    def test_extracts_words(self):
        tokens = tokenize("Hello World Test")
        assert {"hello", "world", "test"} <= tokens

    def test_skips_short_words(self):
        tokens = tokenize("a to is the hello")
        assert "a" not in tokens
        assert "to" not in tokens
        assert "hello" in tokens

    def test_returns_set(self):
        # Repeated words collapse: a set is returned, not a list.
        tokens = tokenize("hello hello world")
        assert isinstance(tokens, set)
        assert len(tokens) == 2
|
||
|
|
|
||
|
|
|
||
|
|
class TestTokenSimilarity:
    """token_similarity: symmetric score, 1.0 for identical, 0.0 for disjoint/empty."""

    def test_identical(self):
        assert token_similarity("hello world", "hello world") == 1.0

    def test_no_overlap(self):
        assert token_similarity("alpha beta", "gamma delta") == 0.0

    def test_partial_overlap(self):
        # Two of three tokens shared -> score strictly between the extremes.
        sim = token_similarity("hello world test", "hello universe test")
        assert 0.3 < sim < 0.7

    def test_empty(self):
        for left, right in (("", "hello"), ("hello", "")):
            assert token_similarity(left, right) == 0.0

    def test_symmetric(self):
        first = "hello world test"
        second = "hello universe test"
        assert token_similarity(first, second) == token_similarity(second, first)
|
||
|
|
|
||
|
|
|
||
|
|
class TestQualityScore:
    """quality_score should rank richer, higher-confidence facts above sparse ones."""

    def test_high_confidence(self):
        fact = {"confidence": 0.95, "source_count": 5, "tags": ["test"], "related": ["x"]}
        assert quality_score(fact) > 0.7

    def test_low_confidence(self):
        fact = {"confidence": 0.3, "source_count": 1}
        assert quality_score(fact) < 0.5

    def test_defaults(self):
        # Missing fields fall back to defaults; score stays strictly inside (0, 1).
        assert 0 < quality_score({}) < 1
|
||
|
|
|
||
|
|
|
||
|
|
class TestMergeFacts:
    """merge_facts should fold the dropped fact's metadata into the kept one."""

    def test_merges_tags(self):
        keep = {"id": "a", "fact": "test", "tags": ["git"], "confidence": 0.9}
        drop = {"id": "b", "fact": "test", "tags": ["python"], "confidence": 0.8}
        merged = merge_facts(keep, drop)
        # Tags from both sides survive the merge.
        assert {"git", "python"} <= set(merged["tags"])

    def test_merges_source_count(self):
        keep = {"id": "a", "fact": "test", "source_count": 3}
        drop = {"id": "b", "fact": "test", "source_count": 2}
        assert merge_facts(keep, drop)["source_count"] == 5

    def test_keeps_higher_confidence(self):
        keep = {"id": "a", "fact": "test", "confidence": 0.7}
        drop = {"id": "b", "fact": "test", "confidence": 0.9}
        # The larger confidence wins even when it comes from the dropped fact.
        assert merge_facts(keep, drop)["confidence"] == 0.9

    def test_tracks_merged_from(self):
        merged = merge_facts({"id": "a", "fact": "test"}, {"id": "b", "fact": "test"})
        assert "b" in merged["_merged_from"]
|
||
|
|
|
||
|
|
|
||
|
|
class TestDedupFacts:
    """End-to-end behaviour of dedup_facts over small in-memory fact lists."""

    def test_removes_exact_dupes(self):
        facts = [
            {"id": "1", "fact": "Always use git rebase"},
            {"id": "2", "fact": "Always use git rebase"},  # byte-identical duplicate
            {"id": "3", "fact": "Check logs first"},
        ]
        _, stats = dedup_facts(facts)
        assert stats["exact_dupes"] == 1
        assert stats["unique"] == 2

    def test_removes_near_dupes(self):
        facts = [
            {"id": "1", "fact": "Always check logs before deploying to production server"},
            {"id": "2", "fact": "Always check logs before deploying to production environment"},
            {"id": "3", "fact": "Use docker compose for local development environments"},
        ]
        _, stats = dedup_facts(facts, near_threshold=0.5)
        assert stats["near_dupes"] >= 1
        assert stats["unique"] == 2

    def test_preserves_unique(self):
        facts = [
            {"id": "1", "fact": "Use git rebase for clean history"},
            {"id": "2", "fact": "Docker containers should be stateless"},
            {"id": "3", "fact": "Always write tests before code"},
        ]
        _, stats = dedup_facts(facts)
        assert stats["unique"] == 3
        assert stats["removed"] == 0

    def test_empty_input(self):
        _, stats = dedup_facts([])
        assert stats["total"] == 0
        assert stats["unique"] == 0

    def test_keeps_higher_quality_near_dup(self):
        low = {"id": "1", "fact": "Check logs before deploying to production server", "confidence": 0.5, "source_count": 1}
        high = {"id": "2", "fact": "Check logs before deploying to production environment", "confidence": 0.9, "source_count": 5, "tags": ["ops"]}
        deduped, stats = dedup_facts([low, high], near_threshold=0.5)
        assert stats["unique"] == 1
        # The surviving fact must be the higher-quality one.
        assert deduped[0]["confidence"] == 0.9

    def test_dry_run_does_not_modify(self):
        facts = [
            {"id": "1", "fact": "Same text"},
            {"id": "2", "fact": "Same text"},
        ]
        deduped, stats = dedup_facts(facts, dry_run=True)
        assert stats["exact_dupes"] == 1
        # dry_run skips merge_facts, so the kept fact is left untouched.
        assert len(deduped) == 1
|
||
|
|
|
||
|
|
|
||
|
|
class TestGenerateTestDuplicates:
    """generate_test_duplicates should emit unique facts plus seeded duplicates."""

    def test_generates_correct_count(self):
        # 20 unique facts plus injected duplicates -> strictly more than 20 entries.
        assert len(generate_test_duplicates(20)) > 20

    def test_has_exact_dupes(self):
        facts = generate_test_duplicates(20)
        hashes = [content_hash(item["fact"]) for item in facts]
        # A hash collision proves at least one exact duplicate was generated.
        assert len(set(hashes)) < len(hashes)

    def test_dedup_removes_dupes(self):
        _, stats = dedup_facts(generate_test_duplicates(20))
        assert stats["unique"] <= 20
        assert stats["removed"] > 0
|