1. browser_tool.py: Replace **args spread on browser_click, browser_type, and browser_scroll handlers with explicit parameter extraction. The **args pattern passed all dict keys as keyword arguments, causing TypeError if the LLM sent unexpected parameters. Now extracts only the expected params (ref, text, direction) with safe defaults. 2. fuzzy_match.py: Update module docstring to match actual strategy order in code. Block anchor was listed as #3 but is actually #7. Multi-occurrence is not a separate strategy but a flag. Updated count from 9 to 8.
488 lines
17 KiB
Python
488 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fuzzy Matching Module for File Operations
|
|
|
|
Implements a multi-strategy matching chain to robustly find and replace text,
|
|
accommodating variations in whitespace, indentation, and escaping common
|
|
in LLM-generated code.
|
|
|
|
The 8-strategy chain (inspired by OpenCode), tried in order:
|
|
1. Exact match - Direct string comparison
|
|
2. Line-trimmed - Strip leading/trailing whitespace per line
|
|
3. Whitespace normalized - Collapse multiple spaces/tabs to single space
|
|
4. Indentation flexible - Ignore indentation differences entirely
|
|
5. Escape normalized - Convert \\n literals to actual newlines
|
|
6. Trimmed boundary - Trim first/last line whitespace only
|
|
7. Block anchor - Match first+last lines, use similarity for middle
|
|
8. Context-aware - 50% line similarity threshold
|
|
|
|
Multi-occurrence matching is handled via the replace_all flag.
|
|
|
|
Usage:
|
|
from tools.fuzzy_match import fuzzy_find_and_replace
|
|
|
|
new_content, match_count, error = fuzzy_find_and_replace(
|
|
content="def foo():\\n pass",
|
|
old_string="def foo():",
|
|
new_string="def bar():",
|
|
replace_all=False
|
|
)
|
|
"""
|
|
|
|
import re
|
|
from typing import Tuple, Optional, List, Callable
|
|
from difflib import SequenceMatcher
|
|
|
|
UNICODE_MAP = {
|
|
"\u201c": '"', "\u201d": '"', # smart double quotes
|
|
"\u2018": "'", "\u2019": "'", # smart single quotes
|
|
"\u2014": "--", "\u2013": "-", # em/en dashes
|
|
"\u2026": "...", "\u00a0": " ", # ellipsis and non-breaking space
|
|
}
|
|
|
|
def _unicode_normalize(text: str) -> str:
|
|
"""Normalizes Unicode characters to their standard ASCII equivalents."""
|
|
for char, repl in UNICODE_MAP.items():
|
|
text = text.replace(char, repl)
|
|
return text
|
|
|
|
|
|
def fuzzy_find_and_replace(content: str, old_string: str, new_string: str,
                           replace_all: bool = False) -> Tuple[str, int, Optional[str]]:
    """
    Find and replace text using a chain of increasingly fuzzy matching strategies.

    Strategies are tried strictest-first; the first one that yields any
    matches decides the outcome (no later strategy is consulted).

    Args:
        content: The file content to search in
        old_string: The text to find
        new_string: The replacement text
        replace_all: If True, replace all occurrences; if False, require uniqueness

    Returns:
        Tuple of (new_content, match_count, error_message)
        - If successful: (modified_content, number_of_replacements, None)
        - If failed: (original_content, 0, error_description)
    """
    # Reject degenerate inputs up front.
    if not old_string:
        return content, 0, "old_string cannot be empty"
    if old_string == new_string:
        return content, 0, "old_string and new_string are identical"

    # Ordered strictest to fuzziest; names kept for debuggability.
    strategies: List[Tuple[str, Callable]] = [
        ("exact", _strategy_exact),
        ("line_trimmed", _strategy_line_trimmed),
        ("whitespace_normalized", _strategy_whitespace_normalized),
        ("indentation_flexible", _strategy_indentation_flexible),
        ("escape_normalized", _strategy_escape_normalized),
        ("trimmed_boundary", _strategy_trimmed_boundary),
        ("block_anchor", _strategy_block_anchor),
        ("context_aware", _strategy_context_aware),
    ]

    for _strategy_name, find_matches in strategies:
        hits = find_matches(content, old_string)
        if not hits:
            continue  # try the next, fuzzier strategy

        # Ambiguity without replace_all is an error rather than a guess.
        if len(hits) > 1 and not replace_all:
            return content, 0, (
                f"Found {len(hits)} matches for old_string. "
                f"Provide more context to make it unique, or use replace_all=True."
            )

        return _apply_replacements(content, hits, new_string), len(hits), None

    # Every strategy came up empty.
    return content, 0, "Could not find a match for old_string in the file"
|
|
|
|
|
|
def _apply_replacements(content: str, matches: List[Tuple[int, int]], new_string: str) -> str:
|
|
"""
|
|
Apply replacements at the given positions.
|
|
|
|
Args:
|
|
content: Original content
|
|
matches: List of (start, end) positions to replace
|
|
new_string: Replacement text
|
|
|
|
Returns:
|
|
Content with replacements applied
|
|
"""
|
|
# Sort matches by position (descending) to replace from end to start
|
|
# This preserves positions of earlier matches
|
|
sorted_matches = sorted(matches, key=lambda x: x[0], reverse=True)
|
|
|
|
result = content
|
|
for start, end in sorted_matches:
|
|
result = result[:start] + new_string + result[end:]
|
|
|
|
return result
|
|
|
|
|
|
# =============================================================================
|
|
# Matching Strategies
|
|
# =============================================================================
|
|
|
|
def _strategy_exact(content: str, pattern: str) -> List[Tuple[int, int]]:
|
|
"""Strategy 1: Exact string match."""
|
|
matches = []
|
|
start = 0
|
|
while True:
|
|
pos = content.find(pattern, start)
|
|
if pos == -1:
|
|
break
|
|
matches.append((pos, pos + len(pattern)))
|
|
start = pos + 1
|
|
return matches
|
|
|
|
|
|
def _strategy_line_trimmed(content: str, pattern: str) -> List[Tuple[int, int]]:
    """
    Strategy 2: Match with line-by-line whitespace trimming.

    Both pattern and content lines are stripped of leading/trailing
    whitespace before comparison; reported positions refer to the
    original, untrimmed content.
    """
    # Trim every pattern line, then rebuild the normalized pattern.
    trimmed_pattern = '\n'.join(part.strip() for part in pattern.split('\n'))

    original_lines = content.split('\n')
    trimmed_lines = [ln.strip() for ln in original_lines]

    # The helper does the window search and maps hits back to original offsets.
    return _find_normalized_matches(
        content, original_lines, trimmed_lines,
        pattern, trimmed_pattern
    )
|
|
|
|
|
|
def _strategy_whitespace_normalized(content: str, pattern: str) -> List[Tuple[int, int]]:
    """
    Strategy 3: Collapse runs of spaces/tabs to a single space.

    Newlines are preserved. Hits found in the normalized text are
    mapped back to character positions in the original content.
    """
    # Squeeze horizontal whitespace only; '\n' stays intact.
    squeezed_pattern = re.sub(r'[ \t]+', ' ', pattern)
    squeezed_content = re.sub(r'[ \t]+', ' ', content)

    hits = _strategy_exact(squeezed_content, squeezed_pattern)
    if not hits:
        return []

    # Translate offsets in the squeezed string back into the original.
    return _map_normalized_positions(content, squeezed_content, hits)
|
|
|
|
|
|
def _strategy_indentation_flexible(content: str, pattern: str) -> List[Tuple[int, int]]:
    """
    Strategy 4: Ignore indentation differences entirely.

    Strips all leading whitespace from every line of both pattern and
    content before matching; trailing whitespace is preserved (unlike
    the line_trimmed strategy, which strips both sides).

    Args:
        content: The file content to search in.
        pattern: The text to locate.

    Returns:
        List of (start, end) positions in the original content.
    """
    # Left-strip each pattern line. (The previous version also built an
    # unused `pattern_stripped` via a dead `strip_indent` helper; both
    # have been removed.)
    pattern_lines = [line.lstrip() for line in pattern.split('\n')]

    content_lines = content.split('\n')
    content_stripped_lines = [line.lstrip() for line in content_lines]

    return _find_normalized_matches(
        content, content_lines, content_stripped_lines,
        pattern, '\n'.join(pattern_lines)
    )
|
|
|
|
|
|
def _strategy_escape_normalized(content: str, pattern: str) -> List[Tuple[int, int]]:
    """
    Strategy 5: Convert escape sequences to actual characters.

    Turns literal \\n, \\t and \\r in the pattern into real newline,
    tab, and carriage-return characters, then retries an exact search.
    """
    # Unescape the common sequences one by one.
    unescaped = pattern
    for literal, actual in (('\\n', '\n'), ('\\t', '\t'), ('\\r', '\r')):
        unescaped = unescaped.replace(literal, actual)

    # No escapes present: the exact strategy already covered this pattern.
    if unescaped == pattern:
        return []

    return _strategy_exact(content, unescaped)
|
|
|
|
|
|
def _strategy_trimmed_boundary(content: str, pattern: str) -> List[Tuple[int, int]]:
    """
    Strategy 6: Trim whitespace from first and last lines only.

    Interior lines must match exactly; only the boundary lines of the
    block may differ in leading/trailing whitespace.
    """
    def trim_edges(lines):
        # Strip just the first and last entries; interior lines untouched.
        edited = list(lines)
        edited[0] = edited[0].strip()
        if len(edited) > 1:
            edited[-1] = edited[-1].strip()
        return edited

    pattern_lines = trim_edges(pattern.split('\n'))
    target = '\n'.join(pattern_lines)
    window = len(pattern_lines)

    content_lines = content.split('\n')
    matches = []

    # Slide a window of `window` lines over the content, trimming each
    # candidate's boundaries the same way before comparing.
    for start_line in range(len(content_lines) - window + 1):
        candidate = trim_edges(content_lines[start_line:start_line + window])
        if '\n'.join(candidate) == target:
            span = _calculate_line_positions(
                content_lines, start_line, start_line + window, len(content)
            )
            matches.append(span)

    return matches
|
|
|
|
|
|
def _strategy_block_anchor(content: str, pattern: str) -> List[Tuple[int, int]]:
    """
    Strategy 7: Match by anchoring on first and last lines.

    Candidate windows are found where the (trimmed) first and last lines
    of the pattern match; the middle section is then accepted by fuzzy
    similarity. Uses Unicode normalization with permissive thresholds.

    Args:
        content: The file content to search in.
        pattern: The text to locate; must span at least two lines.

    Returns:
        List of (start, end) positions in the ORIGINAL content.
    """
    # Normalize both strings for comparison only. UNICODE_MAP values contain
    # no '\n', so normalization never changes line counts -- line indices
    # computed on normalized lines remain valid against the original lines.
    norm_pattern = _unicode_normalize(pattern)
    norm_content = _unicode_normalize(content)

    pattern_lines = norm_pattern.split('\n')
    # Single-line patterns have no distinct first/last anchor; skip.
    if len(pattern_lines) < 2:
        return []

    first_line = pattern_lines[0].strip()
    last_line = pattern_lines[-1].strip()

    # Use normalized lines for matching logic
    norm_content_lines = norm_content.split('\n')
    # BUT use original lines for calculating start/end positions to prevent index shift
    # (normalization can change CHARACTER counts, e.g. em-dash -> "--").
    orig_content_lines = content.split('\n')

    pattern_line_count = len(pattern_lines)

    # Pass 1: collect every window whose boundary lines match the anchors.
    potential_matches = []
    for i in range(len(norm_content_lines) - pattern_line_count + 1):
        if (norm_content_lines[i].strip() == first_line and
            norm_content_lines[i + pattern_line_count - 1].strip() == last_line):
            potential_matches.append(i)

    matches = []
    candidate_count = len(potential_matches)

    # Thresholding logic: 0.10 for unique matches (max flexibility), 0.30 for
    # multiple candidates (stricter, to discriminate between them).
    threshold = 0.10 if candidate_count == 1 else 0.30

    # Pass 2: score each candidate's middle section against the pattern's.
    for i in potential_matches:
        if pattern_line_count <= 2:
            # No middle section exists; anchors alone are sufficient.
            similarity = 1.0
        else:
            # Compare normalized middle sections
            content_middle = '\n'.join(norm_content_lines[i+1:i+pattern_line_count-1])
            pattern_middle = '\n'.join(pattern_lines[1:-1])
            similarity = SequenceMatcher(None, content_middle, pattern_middle).ratio()

        if similarity >= threshold:
            # Calculate positions using ORIGINAL lines to ensure correct character offsets in the file
            start_pos, end_pos = _calculate_line_positions(
                orig_content_lines, i, i + pattern_line_count, len(content)
            )
            matches.append((start_pos, end_pos))

    return matches
|
|
|
|
|
|
def _strategy_context_aware(content: str, pattern: str) -> List[Tuple[int, int]]:
    """
    Strategy 8: Line-by-line similarity with 50% threshold.

    Slides a window the size of the pattern over the content and accepts
    a block when at least 50% of its lines are >= 80% similar (after
    whitespace trimming) to the corresponding pattern lines.

    Args:
        content: The file content to search in.
        pattern: The (possibly fuzzy) text to locate.

    Returns:
        List of non-overlapping (start, end) positions in the content.
    """
    pattern_lines = pattern.split('\n')
    content_lines = content.split('\n')

    matches = []
    pattern_line_count = len(pattern_lines)
    # Need at least 50% of lines to have high similarity.
    required = pattern_line_count * 0.5

    i = 0
    while i <= len(content_lines) - pattern_line_count:
        block_lines = content_lines[i:i + pattern_line_count]

        # Count lines that are highly similar (>= 0.80) after trimming.
        high_similarity_count = 0
        for p_line, c_line in zip(pattern_lines, block_lines):
            sim = SequenceMatcher(None, p_line.strip(), c_line.strip()).ratio()
            if sim >= 0.80:
                high_similarity_count += 1

        if high_similarity_count >= required:
            start_pos, end_pos = _calculate_line_positions(
                content_lines, i, i + pattern_line_count, len(content)
            )
            matches.append((start_pos, end_pos))
            # Skip past the accepted block. The previous version stepped by
            # one line and could emit OVERLAPPING spans (adjacent windows
            # both passing the 50% bar), which corrupted the output when
            # _apply_replacements spliced overlapping ranges.
            i += pattern_line_count
        else:
            i += 1

    return matches
|
|
|
|
|
|
# =============================================================================
|
|
# Helper Functions
|
|
# =============================================================================
|
|
|
|
def _calculate_line_positions(content_lines: List[str], start_line: int,
|
|
end_line: int, content_length: int) -> Tuple[int, int]:
|
|
"""Calculate start and end character positions from line indices.
|
|
|
|
Args:
|
|
content_lines: List of lines (without newlines)
|
|
start_line: Starting line index (0-based)
|
|
end_line: Ending line index (exclusive, 0-based)
|
|
content_length: Total length of the original content string
|
|
|
|
Returns:
|
|
Tuple of (start_pos, end_pos) in the original content
|
|
"""
|
|
start_pos = sum(len(line) + 1 for line in content_lines[:start_line])
|
|
end_pos = sum(len(line) + 1 for line in content_lines[:end_line]) - 1
|
|
if end_pos >= content_length:
|
|
end_pos = content_length
|
|
return start_pos, end_pos
|
|
|
|
|
|
def _find_normalized_matches(content: str, content_lines: List[str],
                             content_normalized_lines: List[str],
                             pattern: str, pattern_normalized: str) -> List[Tuple[int, int]]:
    """
    Find matches in normalized content and map back to original positions.

    Slides a fixed-size line window over the normalized content; every
    window equal to the normalized pattern is reported as a span in the
    ORIGINAL content.

    Args:
        content: Original content string
        content_lines: Original content split by lines
        content_normalized_lines: Normalized content lines
        pattern: Original pattern (unused; kept for interface symmetry)
        pattern_normalized: Normalized pattern

    Returns:
        List of (start, end) positions in the original content
    """
    window_size = len(pattern_normalized.split('\n'))
    last_start = len(content_normalized_lines) - window_size

    spans = []
    for line_idx in range(last_start + 1):
        candidate = content_normalized_lines[line_idx:line_idx + window_size]
        if '\n'.join(candidate) == pattern_normalized:
            # Positions are computed against the ORIGINAL lines so that
            # whitespace removed by normalization doesn't shift offsets.
            spans.append(_calculate_line_positions(
                content_lines, line_idx, line_idx + window_size, len(content)
            ))

    return spans
|
|
|
|
|
|
def _map_normalized_positions(original: str, normalized: str,
                              normalized_matches: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """
    Map positions from normalized string back to original.

    This is a best-effort mapping that works for whitespace normalization
    (runs of spaces/tabs collapsed to a single space); it walks both
    strings in lockstep to build a character-level correspondence.

    Args:
        original: The original, unnormalized string.
        normalized: The whitespace-normalized form of `original`.
        normalized_matches: (start, end) spans found in `normalized`.

    Returns:
        The spans translated into (start, end) positions in `original`.
    """
    if not normalized_matches:
        return []

    # Build character mapping from normalized to original
    orig_to_norm = []  # orig_to_norm[i] = position in normalized

    orig_idx = 0
    norm_idx = 0

    # Walk both strings together; each original char gets a normalized index.
    while orig_idx < len(original) and norm_idx < len(normalized):
        if original[orig_idx] == normalized[norm_idx]:
            # Characters line up 1:1 -- advance both cursors.
            orig_to_norm.append(norm_idx)
            orig_idx += 1
            norm_idx += 1
        elif original[orig_idx] in ' \t' and normalized[norm_idx] == ' ':
            # Original has space/tab, normalized collapsed to space
            orig_to_norm.append(norm_idx)
            orig_idx += 1
            # Don't advance norm_idx yet - wait until all whitespace consumed
            if orig_idx < len(original) and original[orig_idx] not in ' \t':
                norm_idx += 1
        elif original[orig_idx] in ' \t':
            # Extra whitespace in original
            orig_to_norm.append(norm_idx)
            orig_idx += 1
        else:
            # Mismatch - shouldn't happen with our normalization
            orig_to_norm.append(norm_idx)
            orig_idx += 1

    # Fill remaining: trailing original chars map past the normalized end.
    while orig_idx < len(original):
        orig_to_norm.append(len(normalized))
        orig_idx += 1

    # Reverse mapping: for each normalized position, find original range
    norm_to_orig_start = {}
    norm_to_orig_end = {}

    for orig_pos, norm_pos in enumerate(orig_to_norm):
        if norm_pos not in norm_to_orig_start:
            # First original char that maps to this normalized position.
            norm_to_orig_start[norm_pos] = orig_pos
        # Last original char that maps here (kept updating as we go).
        norm_to_orig_end[norm_pos] = orig_pos

    # Map matches
    original_matches = []
    for norm_start, norm_end in normalized_matches:
        # Find original start
        if norm_start in norm_to_orig_start:
            orig_start = norm_to_orig_start[norm_start]
        else:
            # Find nearest
            # NOTE(review): min() raises ValueError if no mapped position is
            # >= norm_start (i.e. the match lies past every mapped char).
            # Presumably unreachable for spans produced by our own
            # normalization -- TODO confirm.
            orig_start = min(i for i, n in enumerate(orig_to_norm) if n >= norm_start)

        # Find original end
        if norm_end - 1 in norm_to_orig_end:
            orig_end = norm_to_orig_end[norm_end - 1] + 1
        else:
            # Fall back to the normalized span length from the mapped start.
            orig_end = orig_start + (norm_end - norm_start)

        # Expand to include trailing whitespace that was normalized
        while orig_end < len(original) and original[orig_end] in ' \t':
            orig_end += 1

        original_matches.append((orig_start, min(orig_end, len(original))))

    return original_matches
|