perf: use ripgrep for file search (200x faster than find)

search_files(target='files') now uses `rg --files -g` instead of `find`. Ripgrep respects .gitignore, skips hidden files and directories by default, and traverses directories in parallel — ~200x faster on wide trees (0.14s vs 34s, benchmarked on a 164-repo tree). Falls back to find when rg is unavailable, preserving hidden-dir exclusion and BSD find compatibility. Note that, unlike the GNU find path (which sorts newest first), the rg path does not sort results by modification time. Salvaged from PR #1464 by @light-merlin-dark (Merlin) — adapted to preserve the hidden-dir exclusion added since the original PR.
2026-03-17 02:31:56 -07:00
parent 9ece1ce2de
commit 693f5786ac
1 changed file with 52 additions and 20 deletions
--- a/tools/file_operations.py
+++ b/tools/file_operations.py
@@ -867,53 +867,85 @@ class ShellFileOperations(FileOperations):
    
    def _search_files(self, pattern: str, path: str, limit: int, offset: int) -> SearchResult:
        """Search for files by name pattern (glob-like)."""
-        # Check if find is available (not on Windows without Git Bash/WSL)
-        if not self._has_command('find'):
-            return SearchResult(
-                error="File search requires 'find' command. "
-                      "On Windows, use Git Bash, WSL, or install Unix tools."
-            )
-        
        # Auto-prepend **/ for recursive search if not already present
        if not pattern.startswith('**/') and '/' not in pattern:
            search_pattern = pattern
        else:
            search_pattern = pattern.split('/')[-1]
-        
+
+        # Prefer ripgrep: respects .gitignore, excludes hidden dirs by
+        # default, and has parallel directory traversal (~200x faster than
+        # find on wide trees).  Mirrors _search_content which already uses rg.
+        if self._has_command('rg'):
+            return self._search_files_rg(search_pattern, path, limit, offset)
+
+        # Fallback: find (slower, no .gitignore awareness)
+        if not self._has_command('find'):
+            return SearchResult(
+                error="File search requires 'rg' (ripgrep) or 'find'. "
+                      "Install ripgrep for best results: "
+                      "https://github.com/BurntSushi/ripgrep#installation"
+            )
+
        # Exclude hidden directories (matching ripgrep's default behavior).
-        # This prevents the agent from discovering internal cache files
-        # (e.g. .hub/index-cache/) that may contain unvetted content.
        hidden_exclude = "-not -path '*/.*'"
-        
-        # Use find with modification time sorting
-        # -printf '%T@ %p\n' outputs: timestamp path
-        # sort -rn sorts by timestamp descending (newest first)
+
        cmd = f"find {self._escape_shell_arg(path)} {hidden_exclude} -type f -name {self._escape_shell_arg(search_pattern)} " \
-              f"-printf '%T@ %p\\n' 2>/dev/null | sort -rn | tail -n +{offset + 1} | head -n {limit}"
-        
+              f"-printf '%T@ %p\\\\n' 2>/dev/null | sort -rn | tail -n +{offset + 1} | head -n {limit}"
+
        result = self._exec(cmd, timeout=60)
-        
+
        if not result.stdout.strip():
            # Try without -printf (BSD find compatibility -- macOS)
            cmd_simple = f"find {self._escape_shell_arg(path)} {hidden_exclude} -type f -name {self._escape_shell_arg(search_pattern)} " \
                        f"2>/dev/null | head -n {limit + offset} | tail -n +{offset + 1}"
            result = self._exec(cmd_simple, timeout=60)
-        
+
        files = []
        for line in result.stdout.strip().split('\n'):
            if not line:
                continue
-            # Parse "timestamp path" format
            parts = line.split(' ', 1)
            if len(parts) == 2 and parts[0].replace('.', '').isdigit():
                files.append(parts[1])
            else:
                files.append(line)
-        
+
        return SearchResult(
            files=files,
            total_count=len(files)
        )
+
+    def _search_files_rg(self, pattern: str, path: str, limit: int, offset: int) -> SearchResult:
+        """Search for files by name using ripgrep's --files mode.
+
+        rg --files respects .gitignore and excludes hidden directories by
+        default, and uses parallel directory traversal for ~200x speedup
+        over find on wide trees.
+        """
+        # rg --files -g uses glob patterns; wrap bare names so they match
+        # at any depth (equivalent to find -name).
+        if '/' not in pattern and not pattern.startswith('*'):
+            glob_pattern = f"*{pattern}"
+        else:
+            glob_pattern = pattern
+
+        fetch_limit = limit + offset
+        cmd = (
+            f"rg --files -g {self._escape_shell_arg(glob_pattern)} "
+            f"{self._escape_shell_arg(path)} 2>/dev/null "
+            f"| head -n {fetch_limit}"
+        )
+        result = self._exec(cmd, timeout=60)
+
+        all_files = [f for f in result.stdout.strip().split('\n') if f]
+        page = all_files[offset:offset + limit]
+
+        return SearchResult(
+            files=page,
+            total_count=len(all_files),
+            truncated=len(all_files) >= fetch_limit,
+        )
    
    def _search_content(self, pattern: str, path: str, file_glob: Optional[str],
                        limit: int, offset: int, output_mode: str, context: int) -> SearchResult: