Compare commits


1 Commit

Author SHA1 Message Date
00d97242b8 feat: implement refactoring_opportunity_finder API (#210)
Some checks failed
Test / pytest (pull_request) Failing after 28s
- compute_file_complexity(): AST-based cyclomatic complexity
- calculate_refactoring_score(): weighted scoring (complexity, size, churn, coverage)
- FileMetrics dataclass with all expected fields
- generate_proposals(): scan codebase and rank files

Closes #210
2026-04-21 11:32:10 +00:00
3 changed files with 218 additions and 546 deletions
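For reviewers, a minimal sketch of how the new API surface composes, assuming scripts/ is importable; the target path and values here are illustrative only:

    import sys
    sys.path.insert(0, "scripts")  # assumption: the module lives in scripts/
    from refactoring_opportunity_finder import (
        FileMetrics,
        calculate_refactoring_score,
        compute_file_complexity,
        generate_proposals,
    )

    target = "scripts/refactoring_opportunity_finder.py"  # any Python file works
    avg, max_c, funcs, classes, lines = compute_file_complexity(target)
    metrics = FileMetrics(path=target, lines=lines, complexity=avg,
                          max_complexity=max_c, functions=funcs, classes=classes)
    metrics.refactoring_score = calculate_refactoring_score(metrics)  # 0-100
    proposals = generate_proposals(root=".", min_score=30.0)  # ranked list of dicts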

scripts/dedup.py

View File

@@ -1,317 +0,0 @@
#!/usr/bin/env python3
"""
dedup.py — Knowledge deduplication: content hash + semantic similarity.
Deduplicates harvested knowledge entries to avoid training on duplicates.
Uses content hashing for exact matches and token overlap for near-duplicates.
Usage:
python3 dedup.py --input knowledge/index.json --output knowledge/index_deduped.json
python3 dedup.py --input knowledge/index.json --dry-run
python3 dedup.py --test # Run built-in dedup test
"""
import argparse
import hashlib
import json
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Optional, Tuple
def normalize_text(text: str) -> str:
"""Normalize text for hashing: lowercase, collapse whitespace, strip."""
text = text.lower().strip()
text = re.sub(r'\s+', ' ', text)
return text
def content_hash(text: str) -> str:
"""SHA256 hash of normalized text for exact dedup."""
normalized = normalize_text(text)
return hashlib.sha256(normalized.encode('utf-8')).hexdigest()
def tokenize(text: str) -> set:
"""Simple tokenizer: lowercase words, 3+ chars."""
words = re.findall(r'[a-z0-9_]{3,}', text.lower())
return set(words)
def token_similarity(a: str, b: str) -> float:
"""Token-based Jaccard similarity (0.0-1.0).
Fast local alternative to embedding similarity.
Good enough for near-duplicate detection.
"""
tokens_a = tokenize(a)
tokens_b = tokenize(b)
if not tokens_a or not tokens_b:
return 0.0
intersection = tokens_a & tokens_b
union = tokens_a | tokens_b
return len(intersection) / len(union)
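# Worked example (illustrative): token_similarity("use git rebase", "use git merge")
# tokenizes to {"use", "git", "rebase"} vs {"use", "git", "merge"};
# Jaccard = |intersection| / |union| = 2 / 4 = 0.5.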
def quality_score(fact: dict) -> float:
"""Compute quality score for merge ranking.
Higher is better. Factors:
- confidence (0-1)
- source_count (more confirmations = better)
- has tags (richer metadata)
"""
confidence = fact.get('confidence', 0.5)
source_count = fact.get('source_count', 1)
has_tags = 1.0 if fact.get('tags') else 0.0
has_related = 1.0 if fact.get('related') else 0.0
# Weighted composite
score = (
confidence * 0.5 +
min(source_count / 10, 1.0) * 0.3 +
has_tags * 0.1 +
has_related * 0.1
)
return round(score, 4)
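# Worked example (illustrative): confidence=0.9, source_count=5, tags and
# related both present: 0.9*0.5 + min(5/10, 1.0)*0.3 + 0.1 + 0.1 = 0.8.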
def merge_facts(keep: dict, drop: dict) -> dict:
"""Merge two near-duplicate facts, keeping higher-quality fields.
The 'keep' fact is enriched with metadata from 'drop'.
"""
# Merge tags (union)
keep_tags = set(keep.get('tags', []))
drop_tags = set(drop.get('tags', []))
keep['tags'] = sorted(keep_tags | drop_tags)
# Merge related (union)
keep_related = set(keep.get('related', []))
drop_related = set(drop.get('related', []))
keep['related'] = sorted(keep_related | drop_related)
# Update source_count (sum)
keep['source_count'] = keep.get('source_count', 1) + drop.get('source_count', 1)
# Update confidence (max — we've now seen it from multiple sources)
keep['confidence'] = max(keep.get('confidence', 0), drop.get('confidence', 0))
# Track that we merged
if '_merged_from' not in keep:
keep['_merged_from'] = []
keep['_merged_from'].append(drop.get('id', 'unknown'))
return keep
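# Worked example (illustrative): merging {"tags": ["git"], "source_count": 3,
# "confidence": 0.7} with {"tags": ["python"], "source_count": 2, "confidence": 0.9}
# yields tags=["git", "python"], source_count=5, confidence=0.9.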
def dedup_facts(
facts: List[dict],
exact_threshold: float = 1.0,
near_threshold: float = 0.95,
dry_run: bool = False,
) -> Tuple[List[dict], dict]:
"""Deduplicate a list of knowledge facts.
Args:
facts: List of fact dicts (from index.json)
        exact_threshold: Currently unused; exact duplicates are detected by content hash
near_threshold: Token similarity above this = near-duplicate
dry_run: If True, don't modify, just report
Returns:
(deduped_facts, stats_dict)
"""
if not facts:
return [], {"total": 0, "exact_dupes": 0, "near_dupes": 0, "unique": 0}
# Phase 1: Exact dedup by content hash
hash_seen = {} # hash -> index in deduped list
exact_dupes = 0
deduped = []
for fact in facts:
text = fact.get('fact', '')
h = content_hash(text)
if h in hash_seen:
# Exact duplicate — merge metadata into existing
existing_idx = hash_seen[h]
if not dry_run:
deduped[existing_idx] = merge_facts(deduped[existing_idx], fact)
exact_dupes += 1
else:
hash_seen[h] = len(deduped)
deduped.append(fact)
# Phase 2: Near-dup by token similarity
near_dupes = 0
i = 0
while i < len(deduped):
j = i + 1
while j < len(deduped):
sim = token_similarity(deduped[i].get('fact', ''), deduped[j].get('fact', ''))
            if sim >= near_threshold:
                # Near-duplicate: keep the higher-quality fact
                near_dupes += 1
                q_i = quality_score(deduped[i])
                q_j = quality_score(deduped[j])
                if q_i >= q_j:
                    if not dry_run:
                        deduped[i] = merge_facts(deduped[i], deduped[j])
                    deduped.pop(j)
                else:
                    # j is higher quality: merge i into j, then remove i
                    if not dry_run:
                        deduped[j] = merge_facts(deduped[j], deduped[i])
                    deduped.pop(i)
                    i -= 1  # offset the i += 1 below so the shifted element is re-checked
                    break
            else:
                j += 1
        i += 1
stats = {
"total": len(facts),
"exact_dupes": exact_dupes,
"near_dupes": near_dupes,
"unique": len(deduped),
"removed": len(facts) - len(deduped),
}
return deduped, stats
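# Worked example (illustrative): "Always check logs before deploying to production
# server" vs "... to production environment" share 6 of 8 tokens (Jaccard 0.75),
# so they merge at near_threshold=0.7 but both survive the default 0.95.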
def dedup_index_file(
input_path: str,
output_path: Optional[str] = None,
near_threshold: float = 0.95,
dry_run: bool = False,
) -> dict:
"""Deduplicate an index.json file.
Args:
input_path: Path to index.json
output_path: Where to write deduped file (default: overwrite input)
near_threshold: Token similarity threshold for near-dupes
dry_run: Report only, don't write
Returns stats dict.
"""
path = Path(input_path)
if not path.exists():
raise FileNotFoundError(f"Index file not found: {input_path}")
with open(path) as f:
data = json.load(f)
facts = data.get('facts', [])
deduped, stats = dedup_facts(facts, near_threshold=near_threshold, dry_run=dry_run)
if not dry_run:
data['facts'] = deduped
data['total_facts'] = len(deduped)
        data['last_dedup'] = datetime.now(timezone.utc).isoformat()
out_path = Path(output_path) if output_path else path
with open(out_path, 'w') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
return stats
def generate_test_duplicates(n: int = 20) -> List[dict]:
"""Generate test facts with intentional duplicates for testing.
Creates n unique facts plus n/4 exact dupes and n/4 near-dupes.
"""
import random
random.seed(42)
unique_facts = []
for i in range(n):
topic = random.choice(["git", "python", "docker", "rust", "nginx"])
tip = random.choice(["use verbose flags", "check logs first", "restart service", "clear cache", "update config"])
unique_facts.append({
"id": f"test:fact:{i:03d}",
"fact": f"When working with {topic}, always {tip} before deploying.",
"category": "fact",
"domain": "test",
"confidence": round(random.uniform(0.5, 1.0), 2),
"source_count": random.randint(1, 5),
"tags": [topic, "test"],
})
# Add exact duplicates (same text, different IDs)
duped = list(unique_facts)
for i in range(n // 4):
original = unique_facts[i]
dupe = dict(original)
dupe["id"] = f"test:fact:dup{i:03d}"
dupe["confidence"] = round(random.uniform(0.3, 0.8), 2)
duped.append(dupe)
# Add near-duplicates (slightly different phrasing)
for i in range(n // 4):
original = unique_facts[i]
near = dict(original)
near["id"] = f"test:fact:near{i:03d}"
near["fact"] = original["fact"].replace("always", "should").replace("before deploying", "prior to deployment")
near["confidence"] = round(random.uniform(0.4, 0.9), 2)
duped.append(near)
return duped
def main():
parser = argparse.ArgumentParser(description="Knowledge deduplication")
parser.add_argument("--input", help="Path to index.json")
parser.add_argument("--output", help="Output path (default: overwrite input)")
parser.add_argument("--threshold", type=float, default=0.95,
help="Near-dup similarity threshold (default: 0.95)")
parser.add_argument("--dry-run", action="store_true", help="Report only, don't write")
parser.add_argument("--test", action="store_true", help="Run built-in dedup test")
parser.add_argument("--json", action="store_true", help="JSON output")
args = parser.parse_args()
if args.test:
test_facts = generate_test_duplicates(20)
print(f"Generated {len(test_facts)} test facts (20 unique + dupes)")
deduped, stats = dedup_facts(test_facts, near_threshold=args.threshold)
print(f"\nDedup results:")
print(f" Total input: {stats['total']}")
print(f" Exact dupes: {stats['exact_dupes']}")
print(f" Near dupes: {stats['near_dupes']}")
print(f" Unique output: {stats['unique']}")
print(f" Removed: {stats['removed']}")
# Verify: should have ~20 unique (some merged)
assert stats['unique'] <= 20, f"Too many unique: {stats['unique']} > 20"
assert stats['unique'] >= 15, f"Too few unique: {stats['unique']} < 15"
assert stats['removed'] > 0, "No duplicates removed"
print("\nOK: Dedup test passed")
return
if not args.input:
print("ERROR: Provide --input or --test")
sys.exit(1)
stats = dedup_index_file(args.input, args.output, args.threshold, args.dry_run)
if args.json:
print(json.dumps(stats, indent=2))
else:
print(f"Dedup results:")
print(f" Total input: {stats['total']}")
print(f" Exact dupes: {stats['exact_dupes']}")
print(f" Near dupes: {stats['near_dupes']}")
print(f" Unique output: {stats['unique']}")
print(f" Removed: {stats['removed']}")
if args.dry_run:
print(" (dry run — no changes written)")
if __name__ == "__main__":
main()
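The module is also usable as a library; a minimal sketch, assuming scripts/ is on sys.path (the fact values are made up for illustration):

    from dedup import dedup_facts

    facts = [
        {"id": "1", "fact": "Always use git rebase", "confidence": 0.7},
        {"id": "2", "fact": "always   use git REBASE", "confidence": 0.9},
    ]
    deduped, stats = dedup_facts(facts)
    # Normalization makes these an exact hash match: stats["unique"] == 1,
    # and the surviving fact carries confidence 0.9 and source_count 2.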

scripts/refactoring_opportunity_finder.py

View File

@@ -1,44 +1,240 @@
#!/usr/bin/env python3
"""
Refactoring Opportunity Finder
Engine ID: 10.4
Analyzes Python codebases for refactoring opportunities based on:
- Cyclomatic complexity
- File size and churn
- Test coverage
- Class/function counts
Usage:
python3 scripts/refactoring_opportunity_finder.py --root . --output proposals.json
python3 scripts/refactoring_opportunity_finder.py --root . --output proposals.json --dry-run
"""
import argparse
import ast
import json
import math
import os
import sys
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Optional, Tuple
@dataclass
class FileMetrics:
"""Metrics for a single file."""
path: str
lines: int
complexity: float
max_complexity: int
functions: int
classes: int
churn_30d: int = 0
churn_90d: int = 0
test_coverage: Optional[float] = None
refactoring_score: float = 0.0
def _compute_function_complexity(node: ast.AST) -> int:
"""Compute cyclomatic complexity of a single function."""
complexity = 1 # Base complexity
for child in ast.walk(node):
if isinstance(child, (ast.If, ast.While, ast.For)):
complexity += 1
elif isinstance(child, ast.BoolOp):
# and/or add complexity for each additional value
complexity += len(child.values) - 1
elif isinstance(child, ast.ExceptHandler):
complexity += 1
elif isinstance(child, ast.Assert):
complexity += 1
elif isinstance(child, ast.comprehension):
complexity += 1
complexity += len(child.ifs)
return complexity
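# Worked example (illustrative): a function containing one `if`, one `a and b`
# expression, and one `except` handler scores 1 + 1 + 1 + 1 = 4.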
def compute_file_complexity(filepath: str) -> Tuple[float, int, int, int, int]:
"""
Compute complexity metrics for a Python file.
Returns:
(avg_complexity, max_complexity, function_count, class_count, line_count)
"""
try:
with open(filepath, "r", encoding="utf-8", errors="replace") as f:
source = f.read()
except (OSError, IOError):
return 0.0, 0, 0, 0, 0
lines = source.count("\n") + 1
try:
tree = ast.parse(source, filename=filepath)
except SyntaxError:
return 0.0, 0, 0, 0, lines
functions = []
classes = []
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef):
classes.append(node)
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):  # count async defs too
functions.append(node)
if not functions:
return 0.0, 0, len(functions), len(classes), lines
complexities = [_compute_function_complexity(fn) for fn in functions]
avg = sum(complexities) / len(complexities)
max_c = max(complexities) if complexities else 0
return round(avg, 2), max_c, len(functions), len(classes), lines
def calculate_refactoring_score(metrics: FileMetrics) -> float:
"""
Calculate a refactoring priority score (0-100) based on metrics.
Higher score = more urgent refactoring candidate.
Components:
- Complexity (0-30): weighted by avg and max complexity
- Size (0-20): larger files score higher
- Churn (0-25): frequently changed files score higher
- Coverage (0-15): low/no coverage scores higher
- Density (0-10): many functions/classes in small space
"""
score = 0.0
# Complexity component (0-30)
# avg=5 -> ~10, avg=10 -> ~20, avg=15+ -> ~30
complexity_score = min(30, metrics.complexity * 2)
# Bonus for high max complexity
if metrics.max_complexity > 10:
complexity_score = min(30, complexity_score + (metrics.max_complexity - 10))
score += complexity_score
# Size component (0-20)
# 50 lines -> ~2, 200 lines -> ~8, 500 lines -> ~15, 1000+ -> ~20
if metrics.lines > 0:
size_score = min(20, math.log2(max(1, metrics.lines)) * 2.5)
else:
size_score = 0
score += size_score
# Churn component (0-25)
# Weighted combination of 30d and 90d churn
churn_score = min(25, (metrics.churn_30d * 1.5) + (metrics.churn_90d * 0.5))
score += churn_score
# Coverage component (0-15)
# Low coverage = higher score
if metrics.test_coverage is None:
# No data -> assume medium risk
score += 5
elif metrics.test_coverage < 0.3:
score += 15
elif metrics.test_coverage < 0.5:
score += 10
elif metrics.test_coverage < 0.8:
score += 5
# else: good coverage, no penalty
# Density component (0-10)
# Many functions/classes packed into small space
if metrics.lines > 0:
density = (metrics.functions + metrics.classes * 3) / (metrics.lines / 100)
density_score = min(10, density * 2)
else:
density_score = 0
score += density_score
return round(min(100, max(0, score)), 2)
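# Worked example (illustrative): avg complexity 8, max 14, 400 lines,
# churn 4 (30d) / 10 (90d), coverage unknown, 20 functions + 2 classes:
# complexity min(30, 8*2 + (14-10)) = 20, size min(20, log2(400)*2.5) = 20,
# churn 4*1.5 + 10*0.5 = 11, coverage (None) = 5,
# density min(10, (20 + 2*3) / (400/100) * 2) = 10  ->  total 66.0.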
def analyze_file(filepath: str, root: str = ".") -> Optional[FileMetrics]:
"""Analyze a single Python file and return metrics."""
try:
rel_path = os.path.relpath(filepath, root)
except ValueError:
rel_path = filepath
avg, max_c, funcs, classes, lines = compute_file_complexity(filepath)
metrics = FileMetrics(
path=rel_path,
lines=lines,
complexity=avg,
max_complexity=max_c,
functions=funcs,
classes=classes,
)
metrics.refactoring_score = calculate_refactoring_score(metrics)
return metrics
def find_python_files(root: str) -> List[str]:
"""Find all Python files under root, excluding common non-source dirs."""
skip_dirs = {".git", "__pycache__", ".tox", ".eggs", "node_modules", ".venv", "venv", "env"}
files = []
for dirpath, dirnames, filenames in os.walk(root):
dirnames[:] = [d for d in dirnames if d not in skip_dirs]
for fn in filenames:
if fn.endswith(".py"):
files.append(os.path.join(dirpath, fn))
return sorted(files)
def generate_proposals(root: str = ".", min_score: float = 30.0) -> List[dict]:
"""Generate refactoring proposals for the codebase."""
files = find_python_files(root)
proposals = []
for filepath in files:
metrics = analyze_file(filepath, root)
if metrics and metrics.refactoring_score >= min_score:
proposals.append({
"title": f"Refactor {metrics.path} (score: {metrics.refactoring_score})",
"description": (
f"File has complexity avg={metrics.complexity:.1f} max={metrics.max_complexity}, "
f"{metrics.functions} functions, {metrics.classes} classes, {metrics.lines} lines."
),
"impact": min(10, int(metrics.refactoring_score / 10)),
"effort": min(10, max(1, int(metrics.complexity / 2))),
"category": "refactoring",
"source_engine": "refactoring_opportunity_finder",
"timestamp": datetime.now(timezone.utc).isoformat(),
"metrics": {
"path": metrics.path,
"complexity": metrics.complexity,
"max_complexity": metrics.max_complexity,
"lines": metrics.lines,
"refactoring_score": metrics.refactoring_score,
}
})
# Sort by score descending
proposals.sort(key=lambda p: p.get("metrics", {}).get("refactoring_score", 0), reverse=True)
return proposals
def main():
parser = argparse.ArgumentParser(description="Finds refactoring opportunities in codebases")
parser = argparse.ArgumentParser(description="Find refactoring opportunities")
parser.add_argument("--root", default=".", help="Root directory to scan")
parser.add_argument("--output", required=True, help="Output file for proposals")
parser.add_argument("--dry-run", action="store_true", help="Don't write output file")
parser.add_argument("--min-score", type=float, default=30.0, help="Minimum score threshold")
args = parser.parse_args()
proposals = generate_proposals(args.root, args.min_score)
if not args.dry_run:
with open(args.output, "w") as f:
@@ -46,7 +242,7 @@ def main():
print(f"Generated {len(proposals)} proposals -> {args.output}")
else:
print(f"Would generate {len(proposals)} proposals")
for p in proposals[:10]:
print(f" - {p['title']}")

tests/test_dedup.py

View File

@@ -1,207 +0,0 @@
"""Tests for knowledge deduplication module (Issue #196)."""
import json
import sys
from pathlib import Path
import pytest
sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
from dedup import (
normalize_text,
content_hash,
tokenize,
token_similarity,
quality_score,
merge_facts,
dedup_facts,
generate_test_duplicates,
)
class TestNormalize:
def test_lowercases(self):
assert normalize_text("Hello World") == "hello world"
def test_collapses_whitespace(self):
assert normalize_text(" hello world ") == "hello world"
def test_strips(self):
assert normalize_text(" text ") == "text"
class TestContentHash:
def test_deterministic(self):
h1 = content_hash("Hello World")
h2 = content_hash("hello world")
h3 = content_hash(" Hello World ")
assert h1 == h2 == h3
def test_different_texts(self):
h1 = content_hash("Hello")
h2 = content_hash("World")
assert h1 != h2
def test_returns_hex(self):
h = content_hash("test")
assert len(h) == 64 # SHA256
assert all(c in '0123456789abcdef' for c in h)
class TestTokenize:
def test_extracts_words(self):
tokens = tokenize("Hello World Test")
assert "hello" in tokens
assert "world" in tokens
assert "test" in tokens
def test_skips_short_words(self):
tokens = tokenize("a to is the hello")
assert "a" not in tokens
assert "to" not in tokens
assert "hello" in tokens
def test_returns_set(self):
tokens = tokenize("hello hello world")
assert isinstance(tokens, set)
assert len(tokens) == 2
class TestTokenSimilarity:
def test_identical(self):
assert token_similarity("hello world", "hello world") == 1.0
def test_no_overlap(self):
assert token_similarity("alpha beta", "gamma delta") == 0.0
def test_partial_overlap(self):
sim = token_similarity("hello world test", "hello universe test")
assert 0.3 < sim < 0.7
def test_empty(self):
assert token_similarity("", "hello") == 0.0
assert token_similarity("hello", "") == 0.0
def test_symmetric(self):
a = "hello world test"
b = "hello universe test"
assert token_similarity(a, b) == token_similarity(b, a)
class TestQualityScore:
def test_high_confidence(self):
fact = {"confidence": 0.95, "source_count": 5, "tags": ["test"], "related": ["x"]}
score = quality_score(fact)
assert score > 0.7
def test_low_confidence(self):
fact = {"confidence": 0.3, "source_count": 1}
score = quality_score(fact)
assert score < 0.5
def test_defaults(self):
score = quality_score({})
assert 0 < score < 1
class TestMergeFacts:
def test_merges_tags(self):
keep = {"id": "a", "fact": "test", "tags": ["git"], "confidence": 0.9}
drop = {"id": "b", "fact": "test", "tags": ["python"], "confidence": 0.8}
merged = merge_facts(keep, drop)
assert "git" in merged["tags"]
assert "python" in merged["tags"]
def test_merges_source_count(self):
keep = {"id": "a", "fact": "test", "source_count": 3}
drop = {"id": "b", "fact": "test", "source_count": 2}
merged = merge_facts(keep, drop)
assert merged["source_count"] == 5
def test_keeps_higher_confidence(self):
keep = {"id": "a", "fact": "test", "confidence": 0.7}
drop = {"id": "b", "fact": "test", "confidence": 0.9}
merged = merge_facts(keep, drop)
assert merged["confidence"] == 0.9
def test_tracks_merged_from(self):
keep = {"id": "a", "fact": "test"}
drop = {"id": "b", "fact": "test"}
merged = merge_facts(keep, drop)
assert "b" in merged["_merged_from"]
class TestDedupFacts:
def test_removes_exact_dupes(self):
facts = [
{"id": "1", "fact": "Always use git rebase"},
{"id": "2", "fact": "Always use git rebase"}, # exact dupe
{"id": "3", "fact": "Check logs first"},
]
deduped, stats = dedup_facts(facts)
assert stats["exact_dupes"] == 1
assert stats["unique"] == 2
def test_removes_near_dupes(self):
facts = [
{"id": "1", "fact": "Always check logs before deploying to production server"},
{"id": "2", "fact": "Always check logs before deploying to production environment"},
{"id": "3", "fact": "Use docker compose for local development environments"},
]
deduped, stats = dedup_facts(facts, near_threshold=0.5)
assert stats["near_dupes"] >= 1
assert stats["unique"] == 2
def test_preserves_unique(self):
facts = [
{"id": "1", "fact": "Use git rebase for clean history"},
{"id": "2", "fact": "Docker containers should be stateless"},
{"id": "3", "fact": "Always write tests before code"},
]
deduped, stats = dedup_facts(facts)
assert stats["unique"] == 3
assert stats["removed"] == 0
def test_empty_input(self):
deduped, stats = dedup_facts([])
assert stats["total"] == 0
assert stats["unique"] == 0
def test_keeps_higher_quality_near_dup(self):
facts = [
{"id": "1", "fact": "Check logs before deploying to production server", "confidence": 0.5, "source_count": 1},
{"id": "2", "fact": "Check logs before deploying to production environment", "confidence": 0.9, "source_count": 5, "tags": ["ops"]},
]
deduped, stats = dedup_facts(facts, near_threshold=0.5)
assert stats["unique"] == 1
# Higher quality fact should be kept
assert deduped[0]["confidence"] == 0.9
def test_dry_run_does_not_modify(self):
facts = [
{"id": "1", "fact": "Same text"},
{"id": "2", "fact": "Same text"},
]
deduped, stats = dedup_facts(facts, dry_run=True)
assert stats["exact_dupes"] == 1
# In dry_run, merge_facts is skipped so facts aren't modified
assert len(deduped) == 1
class TestGenerateTestDuplicates:
def test_generates_correct_count(self):
facts = generate_test_duplicates(20)
assert len(facts) > 20 # 20 unique + duplicates
def test_has_exact_dupes(self):
facts = generate_test_duplicates(20)
hashes = [content_hash(f["fact"]) for f in facts]
# Should have some duplicate hashes
assert len(hashes) != len(set(hashes))
def test_dedup_removes_dupes(self):
facts = generate_test_duplicates(20)
deduped, stats = dedup_facts(facts)
assert stats["unique"] <= 20
assert stats["removed"] > 0