feat: permissive block_anchor thresholds and unicode normalization (#1539)

Salvaged from PR #1528 by an420eth. Closes #517.

Improves _strategy_block_anchor in fuzzy_match.py:
- Add unicode normalization (smart quotes, em/en-dashes, ellipsis,
  non-breaking spaces → ASCII) so LLM-produced unicode artifacts
  don't break anchor line matching
- Lower thresholds: 0.10 for unique matches (was 0.70), 0.30 for
  multiple candidates — if first/last lines match exactly, the
  block is almost certainly correct
- Use original (non-normalized) content for offset calculation to
  preserve correct character positions

Tested: 3 new scenarios fixed (em-dash anchors, non-breaking space
anchors, very-low-similarity unique matches), zero regressions on
all 9 existing fuzzy match tests.

Co-authored-by: an420eth <an420eth@users.noreply.github.com>
This commit is contained in:
Teknium
2026-03-16 05:29:25 -07:00
committed by GitHub
parent b411b979cb
commit 352980311b

View File

@@ -32,6 +32,19 @@ import re
from typing import Tuple, Optional, List, Callable
from difflib import SequenceMatcher
# Mapping of common Unicode typographic characters to the plain-ASCII
# equivalents they stand in for; used by _unicode_normalize so that
# LLM-produced "smart" punctuation does not defeat exact line matching.
UNICODE_MAP = {
    "\u201c": '"',    # left smart double quote
    "\u201d": '"',    # right smart double quote
    "\u2018": "'",    # left smart single quote
    "\u2019": "'",    # right smart single quote
    "\u2014": "--",   # em dash
    "\u2013": "-",    # en dash
    "\u2026": "...",  # horizontal ellipsis
    "\u00a0": " ",    # non-breaking space
}
def _unicode_normalize(text: str) -> str:
    """Return *text* with typographic Unicode characters mapped to ASCII.

    Every key in UNICODE_MAP is a single character, so a translation table
    performs all substitutions in one pass — equivalent to chaining
    str.replace for each entry.
    """
    return text.translate(str.maketrans(UNICODE_MAP))
def fuzzy_find_and_replace(content: str, old_string: str, new_string: str,
replace_all: bool = False) -> Tuple[str, int, Optional[str]]:
@@ -253,42 +266,52 @@ def _strategy_trimmed_boundary(content: str, pattern: str) -> List[Tuple[int, in
def _strategy_block_anchor(content: str, pattern: str) -> List[Tuple[int, int]]:
    """
    Strategy 7: Match by anchoring on the pattern's first and last lines.

    Both ``content`` and ``pattern`` are unicode-normalized before comparison
    so that smart quotes, em/en dashes, ellipses, and non-breaking spaces do
    not prevent an otherwise-exact anchor-line match.  Thresholds are
    permissive: when exactly one candidate region has matching first/last
    lines, a middle-section similarity of 0.10 suffices (a unique anchor
    match is almost certainly the intended block); with multiple candidates
    the bar rises to 0.30 so similarity can disambiguate between them.

    Returns a list of ``(start, end)`` character offsets into the ORIGINAL
    ``content``.  Normalization never inserts or removes newlines, so line
    indices agree between normalized and original text, but per-line lengths
    can differ (e.g. "\u2014" -> "--") — hence character offsets are always
    computed from the original lines.
    """
    norm_pattern = _unicode_normalize(pattern)
    norm_content = _unicode_normalize(content)

    pattern_lines = norm_pattern.split('\n')
    if len(pattern_lines) < 2:
        return []  # Need at least two lines to anchor on first + last.

    first_line = pattern_lines[0].strip()
    last_line = pattern_lines[-1].strip()

    # Match against normalized lines; keep the original lines around solely
    # for offset arithmetic (see docstring).
    norm_content_lines = norm_content.split('\n')
    orig_content_lines = content.split('\n')
    pattern_line_count = len(pattern_lines)

    # First pass: collect every starting line index whose first and last
    # lines both anchor exactly (after stripping surrounding whitespace).
    potential_matches = [
        i
        for i in range(len(norm_content_lines) - pattern_line_count + 1)
        if (norm_content_lines[i].strip() == first_line and
            norm_content_lines[i + pattern_line_count - 1].strip() == last_line)
    ]

    # Permissive thresholding: 0.10 for a unique candidate, 0.30 otherwise.
    threshold = 0.10 if len(potential_matches) == 1 else 0.30

    # Loop-invariant: the pattern's middle section never changes, so join it
    # once rather than on every candidate.
    pattern_middle = '\n'.join(pattern_lines[1:-1])

    matches = []
    for i in potential_matches:
        if pattern_line_count <= 2:
            # Only the two anchor lines exist, and both already matched.
            similarity = 1.0
        else:
            content_middle = '\n'.join(
                norm_content_lines[i + 1:i + pattern_line_count - 1])
            similarity = SequenceMatcher(None, content_middle,
                                         pattern_middle).ratio()
        if similarity >= threshold:
            # Offsets from ORIGINAL lines; the "+ 1" accounts for each '\n'.
            start_pos = sum(len(line) + 1 for line in orig_content_lines[:i])
            end_pos = sum(len(line) + 1
                          for line in orig_content_lines[:i + pattern_line_count]) - 1
            # The final line of content has no trailing '\n', so end_pos can
            # overshoot by one; clamp to the content length.
            matches.append((start_pos, min(end_pos, len(content))))
    return matches