From 352980311b3ac224cacaec89ecdfd0b5cf43d722 Mon Sep 17 00:00:00 2001
From: Teknium <127238744+teknium1@users.noreply.github.com>
Date: Mon, 16 Mar 2026 05:29:25 -0700
Subject: [PATCH] feat: permissive block_anchor thresholds and unicode
 normalization (#1539)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Salvaged from PR #1528 by an420eth. Closes #517.

Improves _strategy_block_anchor in fuzzy_match.py:
- Add unicode normalization (smart quotes, em/en-dashes, ellipsis,
  non-breaking spaces → ASCII) so LLM-produced unicode artifacts
  don't break anchor line matching
- Lower thresholds: 0.10 for unique matches (was 0.70), 0.30 for
  multiple candidates — if first/last lines match exactly, the
  block is almost certainly correct
- Use original (non-normalized) content for offset calculation to
  preserve correct character positions

Tested: 3 new scenarios fixed (em-dash anchors, non-breaking space
anchors, very-low-similarity unique matches), zero regressions on
all 9 existing fuzzy match tests.

Co-authored-by: an420eth <an420eth@users.noreply.github.com>
---
 tools/fuzzy_match.py | 75 +++++++++++++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 26 deletions(-)

diff --git a/tools/fuzzy_match.py b/tools/fuzzy_match.py
index bc8e34403..f53451c63 100644
--- a/tools/fuzzy_match.py
+++ b/tools/fuzzy_match.py
@@ -32,6 +32,19 @@ import re
 from typing import Tuple, Optional, List, Callable
 from difflib import SequenceMatcher
 
+UNICODE_MAP = {
+    "\u201c": '"', "\u201d": '"',  # smart double quotes -> straight double quote
+    "\u2018": "'", "\u2019": "'",  # smart single quotes -> apostrophe
+    "\u2014": "--", "\u2013": "-", # em dash (grows to 2 chars) and en dash
+    "\u2026": "...", "\u00a0": " ", # ellipsis (grows to 3 chars) and non-breaking space
+}
+
+def _unicode_normalize(text: str) -> str:
+    """Return text with every character listed in UNICODE_MAP replaced by its ASCII substitute."""
+    for char, repl in UNICODE_MAP.items():
+        text = text.replace(char, repl)
+    return text
+
 
 def fuzzy_find_and_replace(content: str, old_string: str, new_string: str,
                             replace_all: bool = False) -> Tuple[str, int, Optional[str]]:
@@ -253,42 +266,52 @@ def _strategy_trimmed_boundary(content: str, pattern: str) -> List[Tuple[int, in
 def _strategy_block_anchor(content: str, pattern: str) -> List[Tuple[int, int]]:
     """
     Strategy 7: Match by anchoring on first and last lines.
-    
-    If first and last lines match exactly, accept middle with 70% similarity.
+    Anchors are unicode-normalized first; the middle needs ratio >= 0.10 (sole candidate) or 0.30.
     """
-    pattern_lines = pattern.split('\n')
+    # Match on normalized text but keep the original: replacements such as em dash -> "--" change string length
+    norm_pattern = _unicode_normalize(pattern)
+    norm_content = _unicode_normalize(content)
+    
+    pattern_lines = norm_pattern.split('\n')
     if len(pattern_lines) < 2:
-        return []  # Need at least 2 lines for anchoring
+        return []  # Need at least 2 lines for anchoring
     
     first_line = pattern_lines[0].strip()
     last_line = pattern_lines[-1].strip()
     
-    content_lines = content.split('\n')
-    matches = []
+    # Use normalized lines for matching logic
+    norm_content_lines = norm_content.split('\n')
+    # UNICODE_MAP never adds or removes '\n', so index i names the same line in both lists; lengths come from these
+    orig_content_lines = content.split('\n')
     
     pattern_line_count = len(pattern_lines)
     
-    for i in range(len(content_lines) - pattern_line_count + 1):
-        # Check if first and last lines match
-        if (content_lines[i].strip() == first_line and 
-            content_lines[i + pattern_line_count - 1].strip() == last_line):
+    potential_matches = []
+    for i in range(len(norm_content_lines) - pattern_line_count + 1):
+        if (norm_content_lines[i].strip() == first_line and 
+            norm_content_lines[i + pattern_line_count - 1].strip() == last_line):
+            potential_matches.append(i)
             
-            # Check middle similarity
-            if pattern_line_count <= 2:
-                # Only first and last, they match
-                similarity = 1.0
-            else:
-                content_middle = '\n'.join(content_lines[i+1:i+pattern_line_count-1])
-                pattern_middle = '\n'.join(pattern_lines[1:-1])
-                similarity = SequenceMatcher(None, content_middle, pattern_middle).ratio()
-            
-            if similarity >= 0.70:
-                # Calculate positions
-                start_pos = sum(len(line) + 1 for line in content_lines[:i])
-                end_pos = sum(len(line) + 1 for line in content_lines[:i + pattern_line_count]) - 1
-                if end_pos >= len(content):
-                    end_pos = len(content)
-                matches.append((start_pos, end_pos))
+    matches = []
+    candidate_count = len(potential_matches)
+    
+    # A lone anchor match is accepted at ratio >= 0.10; when several spans match the anchors, each needs 0.30
+    threshold = 0.10 if candidate_count == 1 else 0.30
+
+    for i in potential_matches:
+        if pattern_line_count <= 2:
+            similarity = 1.0  # Anchors only: there are no middle lines to compare
+        else:
+            # Score only the normalized lines strictly between the two anchor lines
+            content_middle = '\n'.join(norm_content_lines[i+1:i+pattern_line_count-1])
+            pattern_middle = '\n'.join(pattern_lines[1:-1])
+            similarity = SequenceMatcher(None, content_middle, pattern_middle).ratio()
+        
+        if similarity >= threshold:
+            # Offsets use ORIGINAL lines (+1 per line for the '\n' that split() removed) so they index into content
+            start_pos = sum(len(line) + 1 for line in orig_content_lines[:i])
+            end_pos = sum(len(line) + 1 for line in orig_content_lines[:i + pattern_line_count]) - 1
+            matches.append((start_pos, min(end_pos, len(content))))
     
     return matches