From fba73a60e36a84e94736d309d6777bf2247a23eb Mon Sep 17 00:00:00 2001 From: Teknium <127238744+teknium1@users.noreply.github.com> Date: Wed, 25 Mar 2026 10:48:18 -0700 Subject: [PATCH] fix(skills): use Git Trees API to prevent silent subdirectory loss during install (#2995) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix(skills): use Git Trees API to prevent silent subdirectory loss during install Refactors _download_directory() to use the Git Trees API (single call for the entire repo tree) as the primary path, falling back to the recursive Contents API when the tree endpoint is unavailable or truncated. Prevents silent subdirectory loss caused by per-directory rate limiting or transient failures. Cherry-picked from PR #2981 by tugrulguner. Fixes #2940. * fix: simplify tree API — use branch name directly as tree-ish Eliminates an extra git/ref/heads API call by passing the branch name directly to git/trees/{branch}?recursive=1, matching the pattern already used by _find_skill_in_repo_tree. --------- Co-authored-by: tugrulguner --- tests/tools/test_skills_hub.py | 150 +++++++++++++++++++++++++++++++++ tools/skills_hub.py | 70 ++++++++++++++- 2 files changed, 218 insertions(+), 2 deletions(-) diff --git a/tests/tools/test_skills_hub.py b/tests/tools/test_skills_hub.py index 778a77ba..ddd1585c 100644 --- a/tests/tools/test_skills_hub.py +++ b/tests/tools/test_skills_hub.py @@ -4,6 +4,8 @@ import json from pathlib import Path from unittest.mock import patch, MagicMock +import httpx + from tools.skills_hub import ( GitHubAuth, GitHubSource, @@ -1039,3 +1041,151 @@ class TestQuarantineBundleBinaryAssets: assert (q_path / "SKILL.md").read_text(encoding="utf-8").startswith("---") assert (q_path / "assets" / "neutts-cli" / "samples" / "jo.wav").read_bytes() == b"RIFF\x00\x01fakewav" + + +# --------------------------------------------------------------------------- +# GitHubSource._download_directory — tree API + fallback (#2940) +# --------------------------------------------------------------------------- + + +class TestDownloadDirectoryViaTree: + """Tests for the Git Trees API path in _download_directory.""" + + def _source(self): + auth = MagicMock(spec=GitHubAuth) + auth.get_headers.return_value = {} + return GitHubSource(auth=auth) + + @patch.object(GitHubSource, "_fetch_file_content") + @patch("tools.skills_hub.httpx.get") + def test_tree_api_downloads_subdirectories(self, mock_get, mock_fetch): + """Tree API returns files from nested subdirectories.""" + repo_resp = MagicMock(status_code=200, json=lambda: {"default_branch": "main"}) + tree_resp = MagicMock(status_code=200, json=lambda: { + "truncated": False, + "tree": [ + {"type": "blob", "path": "skills/my-skill/SKILL.md"}, + {"type": "blob", "path": "skills/my-skill/scripts/run.py"}, + {"type": "blob", "path": "skills/my-skill/references/api.md"}, + {"type": "tree", "path": "skills/my-skill/scripts"}, + {"type": "blob", "path": "other/file.txt"}, + ], + }) + mock_get.side_effect = [repo_resp, tree_resp] + mock_fetch.side_effect = lambda repo, path: f"content-of-{path}" + + src = self._source() + files = src._download_directory("owner/repo", "skills/my-skill") + + assert "SKILL.md" in files + assert "scripts/run.py" in files + assert "references/api.md" in files + assert "other/file.txt" not in files # outside target path + assert len(files) == 3 + + @patch.object(GitHubSource, "_download_directory_recursive", return_value={"SKILL.md": "# ok"}) + @patch("tools.skills_hub.httpx.get") + def test_falls_back_on_truncated_tree(self, mock_get, mock_fallback): + """When tree is truncated, fall back to recursive Contents API.""" + repo_resp = MagicMock(status_code=200, json=lambda: {"default_branch": "main"}) + tree_resp = MagicMock(status_code=200, json=lambda: {"truncated": True, "tree": []}) + mock_get.side_effect = [repo_resp, tree_resp] + + src = self._source() + files = src._download_directory("owner/repo", "skills/my-skill") + + assert files == {"SKILL.md": "# ok"} + mock_fallback.assert_called_once_with("owner/repo", "skills/my-skill") + + @patch.object(GitHubSource, "_download_directory_recursive", return_value={"SKILL.md": "# ok"}) + @patch("tools.skills_hub.httpx.get") + def test_falls_back_on_repo_api_failure(self, mock_get, mock_fallback): + """When the repo endpoint returns non-200, fall back to Contents API.""" + mock_get.return_value = MagicMock(status_code=404) + + src = self._source() + files = src._download_directory("owner/repo", "skills/my-skill") + + assert files == {"SKILL.md": "# ok"} + mock_fallback.assert_called_once() + + @patch.object(GitHubSource, "_fetch_file_content") + @patch("tools.skills_hub.httpx.get") + def test_tree_api_skips_failed_file_fetches(self, mock_get, mock_fetch): + """Files that fail to fetch are skipped, not fatal.""" + repo_resp = MagicMock(status_code=200, json=lambda: {"default_branch": "main"}) + tree_resp = MagicMock(status_code=200, json=lambda: { + "truncated": False, + "tree": [ + {"type": "blob", "path": "skills/my-skill/SKILL.md"}, + {"type": "blob", "path": "skills/my-skill/scripts/run.py"}, + ], + }) + mock_get.side_effect = [repo_resp, tree_resp] + mock_fetch.side_effect = lambda repo, path: ( + "# Skill" if path.endswith("SKILL.md") else None + ) + + src = self._source() + files = src._download_directory("owner/repo", "skills/my-skill") + + assert "SKILL.md" in files + assert "scripts/run.py" not in files + + @patch.object(GitHubSource, "_download_directory_recursive", return_value={}) + @patch("tools.skills_hub.httpx.get") + def test_falls_back_on_network_error(self, mock_get, mock_fallback): + """Network errors in tree API trigger fallback.""" + mock_get.side_effect = httpx.ConnectError("connection refused") + + src = self._source() + src._download_directory("owner/repo", "skills/my-skill") + + mock_fallback.assert_called_once() + + +class TestDownloadDirectoryRecursive: + """Tests for the Contents API fallback path.""" + + def _source(self): + auth = MagicMock(spec=GitHubAuth) + auth.get_headers.return_value = {} + return GitHubSource(auth=auth) + + @patch.object(GitHubSource, "_fetch_file_content") + @patch("tools.skills_hub.httpx.get") + def test_recursive_downloads_subdirectories(self, mock_get, mock_fetch): + """Contents API recursion includes subdirectories.""" + root_resp = MagicMock(status_code=200, json=lambda: [ + {"name": "SKILL.md", "type": "file", "path": "skill/SKILL.md"}, + {"name": "scripts", "type": "dir", "path": "skill/scripts"}, + ]) + sub_resp = MagicMock(status_code=200, json=lambda: [ + {"name": "run.py", "type": "file", "path": "skill/scripts/run.py"}, + ]) + mock_get.side_effect = [root_resp, sub_resp] + mock_fetch.side_effect = lambda repo, path: f"content-of-{path}" + + src = self._source() + files = src._download_directory_recursive("owner/repo", "skill") + + assert "SKILL.md" in files + assert "scripts/run.py" in files + + @patch.object(GitHubSource, "_fetch_file_content") + @patch("tools.skills_hub.httpx.get") + def test_recursive_handles_subdir_failure(self, mock_get, mock_fetch): + """Subdirectory 403/rate-limit returns empty but doesn't crash.""" + root_resp = MagicMock(status_code=200, json=lambda: [ + {"name": "SKILL.md", "type": "file", "path": "skill/SKILL.md"}, + {"name": "scripts", "type": "dir", "path": "skill/scripts"}, + ]) + sub_resp = MagicMock(status_code=403) + mock_get.side_effect = [root_resp, sub_resp] + mock_fetch.return_value = "content" + + src = self._source() + files = src._download_directory_recursive("owner/repo", "skill") + + assert "SKILL.md" in files + assert "scripts/run.py" not in files # lost due to rate limit diff --git a/tools/skills_hub.py b/tools/skills_hub.py index 8ae65ed7..782b0ce1 100644 --- a/tools/skills_hub.py +++ b/tools/skills_hub.py @@ -404,11 +404,75 @@ class GitHubSource(SkillSource): return skills def _download_directory(self, repo: str, path: str) -> Dict[str, str]: - """Recursively download all text files from a GitHub directory.""" + """Recursively download all text files from a GitHub directory. + + Uses the Git Trees API first (single call for the entire tree) to + avoid per-directory rate limiting that causes silent subdirectory + loss. Falls back to the recursive Contents API when the tree + endpoint is unavailable or the response is truncated. + """ + files = self._download_directory_via_tree(repo, path) + if files is not None: + return files + logger.debug("Tree API unavailable for %s/%s, falling back to Contents API", repo, path) + return self._download_directory_recursive(repo, path) + + def _download_directory_via_tree(self, repo: str, path: str) -> Optional[Dict[str, str]]: + """Download an entire directory using the Git Trees API (single request).""" + path = path.rstrip("/") + headers = self.auth.get_headers() + + # Resolve the default branch via the repo endpoint + try: + repo_url = f"https://api.github.com/repos/{repo}" + resp = httpx.get(repo_url, headers=headers, timeout=15, follow_redirects=True) + if resp.status_code != 200: + return None + default_branch = resp.json().get("default_branch", "main") + except (httpx.HTTPError, ValueError): + return None + + # Fetch the full recursive tree (branch name works as tree-ish) + try: + tree_url = f"https://api.github.com/repos/{repo}/git/trees/{default_branch}" + resp = httpx.get( + tree_url, params={"recursive": "1"}, + headers=headers, timeout=30, follow_redirects=True, + ) + if resp.status_code != 200: + return None + tree_data = resp.json() + if tree_data.get("truncated"): + logger.debug("Git tree truncated for %s, falling back to Contents API", repo) + return None + except (httpx.HTTPError, ValueError): + return None + + # Filter to blobs under our target path and fetch content + prefix = f"{path}/" + files: Dict[str, str] = {} + for item in tree_data.get("tree", []): + if item.get("type") != "blob": + continue + item_path = item.get("path", "") + if not item_path.startswith(prefix): + continue + rel_path = item_path[len(prefix):] + content = self._fetch_file_content(repo, item_path) + if content is not None: + files[rel_path] = content + else: + logger.debug("Skipped file (fetch failed): %s/%s", repo, item_path) + + return files if files else None + + def _download_directory_recursive(self, repo: str, path: str) -> Dict[str, str]: + """Recursively download via Contents API (fallback).""" url = f"https://api.github.com/repos/{repo}/contents/{path.rstrip('/')}" try: resp = httpx.get(url, headers=self.auth.get_headers(), timeout=15, follow_redirects=True) if resp.status_code != 200: + logger.debug("Contents API returned %d for %s/%s", resp.status_code, repo, path) return {} except httpx.HTTPError: return {} @@ -428,7 +492,9 @@ class GitHubSource(SkillSource): rel_path = name files[rel_path] = content elif entry_type == "dir": - sub_files = self._download_directory(repo, entry.get("path", "")) + sub_files = self._download_directory_recursive(repo, entry.get("path", "")) + if not sub_files: + logger.debug("Empty or failed subdirectory: %s/%s", repo, entry.get("path", "")) for sub_name, sub_content in sub_files.items(): files[f"{name}/{sub_name}"] = sub_content