perf: use ripgrep for file search (200x faster than find)
search_files(target='files') now uses rg --files -g instead of find. Ripgrep respects .gitignore, excludes hidden dirs by default, and has parallel directory traversal — ~200x faster on wide trees (0.14s vs 34s benchmarked on 164-repo tree). Falls back to find when rg is unavailable, preserving hidden-dir exclusion and BSD find compatibility. Salvaged from PR #1464 by @light-merlin-dark (Merlin) — adapted to preserve hidden-dir exclusion added since the original PR.
This commit is contained in:
@@ -867,53 +867,85 @@ class ShellFileOperations(FileOperations):
|
||||
|
||||
def _search_files(self, pattern: str, path: str, limit: int, offset: int) -> SearchResult:
|
||||
"""Search for files by name pattern (glob-like)."""
|
||||
# Check if find is available (not on Windows without Git Bash/WSL)
|
||||
if not self._has_command('find'):
|
||||
return SearchResult(
|
||||
error="File search requires 'find' command. "
|
||||
"On Windows, use Git Bash, WSL, or install Unix tools."
|
||||
)
|
||||
|
||||
# Auto-prepend **/ for recursive search if not already present
|
||||
if not pattern.startswith('**/') and '/' not in pattern:
|
||||
search_pattern = pattern
|
||||
else:
|
||||
search_pattern = pattern.split('/')[-1]
|
||||
|
||||
|
||||
# Prefer ripgrep: respects .gitignore, excludes hidden dirs by
|
||||
# default, and has parallel directory traversal (~200x faster than
|
||||
# find on wide trees). Mirrors _search_content which already uses rg.
|
||||
if self._has_command('rg'):
|
||||
return self._search_files_rg(search_pattern, path, limit, offset)
|
||||
|
||||
# Fallback: find (slower, no .gitignore awareness)
|
||||
if not self._has_command('find'):
|
||||
return SearchResult(
|
||||
error="File search requires 'rg' (ripgrep) or 'find'. "
|
||||
"Install ripgrep for best results: "
|
||||
"https://github.com/BurntSushi/ripgrep#installation"
|
||||
)
|
||||
|
||||
# Exclude hidden directories (matching ripgrep's default behavior).
|
||||
# This prevents the agent from discovering internal cache files
|
||||
# (e.g. .hub/index-cache/) that may contain unvetted content.
|
||||
hidden_exclude = "-not -path '*/.*'"
|
||||
|
||||
# Use find with modification time sorting
|
||||
# -printf '%T@ %p\n' outputs: timestamp path
|
||||
# sort -rn sorts by timestamp descending (newest first)
|
||||
|
||||
cmd = f"find {self._escape_shell_arg(path)} {hidden_exclude} -type f -name {self._escape_shell_arg(search_pattern)} " \
|
||||
f"-printf '%T@ %p\\n' 2>/dev/null | sort -rn | tail -n +{offset + 1} | head -n {limit}"
|
||||
|
||||
f"-printf '%T@ %p\\\\n' 2>/dev/null | sort -rn | tail -n +{offset + 1} | head -n {limit}"
|
||||
|
||||
result = self._exec(cmd, timeout=60)
|
||||
|
||||
|
||||
if not result.stdout.strip():
|
||||
# Try without -printf (BSD find compatibility -- macOS)
|
||||
cmd_simple = f"find {self._escape_shell_arg(path)} {hidden_exclude} -type f -name {self._escape_shell_arg(search_pattern)} " \
|
||||
f"2>/dev/null | head -n {limit + offset} | tail -n +{offset + 1}"
|
||||
result = self._exec(cmd_simple, timeout=60)
|
||||
|
||||
|
||||
files = []
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
if not line:
|
||||
continue
|
||||
# Parse "timestamp path" format
|
||||
parts = line.split(' ', 1)
|
||||
if len(parts) == 2 and parts[0].replace('.', '').isdigit():
|
||||
files.append(parts[1])
|
||||
else:
|
||||
files.append(line)
|
||||
|
||||
|
||||
return SearchResult(
|
||||
files=files,
|
||||
total_count=len(files)
|
||||
)
|
||||
|
||||
def _search_files_rg(self, pattern: str, path: str, limit: int, offset: int) -> SearchResult:
|
||||
"""Search for files by name using ripgrep's --files mode.
|
||||
|
||||
rg --files respects .gitignore and excludes hidden directories by
|
||||
default, and uses parallel directory traversal for ~200x speedup
|
||||
over find on wide trees.
|
||||
"""
|
||||
# rg --files -g uses glob patterns; wrap bare names so they match
|
||||
# at any depth (equivalent to find -name).
|
||||
if '/' not in pattern and not pattern.startswith('*'):
|
||||
glob_pattern = f"*{pattern}"
|
||||
else:
|
||||
glob_pattern = pattern
|
||||
|
||||
fetch_limit = limit + offset
|
||||
cmd = (
|
||||
f"rg --files -g {self._escape_shell_arg(glob_pattern)} "
|
||||
f"{self._escape_shell_arg(path)} 2>/dev/null "
|
||||
f"| head -n {fetch_limit}"
|
||||
)
|
||||
result = self._exec(cmd, timeout=60)
|
||||
|
||||
all_files = [f for f in result.stdout.strip().split('\n') if f]
|
||||
page = all_files[offset:offset + limit]
|
||||
|
||||
return SearchResult(
|
||||
files=page,
|
||||
total_count=len(all_files),
|
||||
truncated=len(all_files) >= fetch_limit,
|
||||
)
|
||||
|
||||
def _search_content(self, pattern: str, path: str, file_glob: Optional[str],
|
||||
limit: int, offset: int, output_mode: str, context: int) -> SearchResult:
|
||||
|
||||
Reference in New Issue
Block a user