fix(skills): use Git Trees API to prevent silent subdirectory loss during install (#2995)

* fix(skills): use Git Trees API to prevent silent subdirectory loss during install

Refactors _download_directory() to use the Git Trees API (single call
for the entire repo tree) as the primary path, falling back to the
recursive Contents API when the tree endpoint is unavailable or
truncated.  Prevents silent subdirectory loss caused by per-directory
rate limiting or transient failures.

Cherry-picked from PR #2981 by tugrulguner.
Fixes #2940.

* fix: simplify tree API — use branch name directly as tree-ish

Eliminates an extra git/ref/heads API call by passing the branch name
directly to git/trees/{branch}?recursive=1, matching the pattern
already used by _find_skill_in_repo_tree.

---------

Co-authored-by: tugrulguner <tugrulguner@users.noreply.github.com>
This commit is contained in:
Teknium
2026-03-25 10:48:18 -07:00
committed by GitHub
parent 114e636b7d
commit fba73a60e3
2 changed files with 218 additions and 2 deletions

View File

@@ -4,6 +4,8 @@ import json
from pathlib import Path
from unittest.mock import patch, MagicMock
import httpx
from tools.skills_hub import (
GitHubAuth,
GitHubSource,
@@ -1039,3 +1041,151 @@ class TestQuarantineBundleBinaryAssets:
assert (q_path / "SKILL.md").read_text(encoding="utf-8").startswith("---")
assert (q_path / "assets" / "neutts-cli" / "samples" / "jo.wav").read_bytes() == b"RIFF\x00\x01fakewav"
# ---------------------------------------------------------------------------
# GitHubSource._download_directory — tree API + fallback (#2940)
# ---------------------------------------------------------------------------
class TestDownloadDirectoryViaTree:
"""Tests for the Git Trees API path in _download_directory."""
def _source(self):
auth = MagicMock(spec=GitHubAuth)
auth.get_headers.return_value = {}
return GitHubSource(auth=auth)
@patch.object(GitHubSource, "_fetch_file_content")
@patch("tools.skills_hub.httpx.get")
def test_tree_api_downloads_subdirectories(self, mock_get, mock_fetch):
"""Tree API returns files from nested subdirectories."""
repo_resp = MagicMock(status_code=200, json=lambda: {"default_branch": "main"})
tree_resp = MagicMock(status_code=200, json=lambda: {
"truncated": False,
"tree": [
{"type": "blob", "path": "skills/my-skill/SKILL.md"},
{"type": "blob", "path": "skills/my-skill/scripts/run.py"},
{"type": "blob", "path": "skills/my-skill/references/api.md"},
{"type": "tree", "path": "skills/my-skill/scripts"},
{"type": "blob", "path": "other/file.txt"},
],
})
mock_get.side_effect = [repo_resp, tree_resp]
mock_fetch.side_effect = lambda repo, path: f"content-of-{path}"
src = self._source()
files = src._download_directory("owner/repo", "skills/my-skill")
assert "SKILL.md" in files
assert "scripts/run.py" in files
assert "references/api.md" in files
assert "other/file.txt" not in files # outside target path
assert len(files) == 3
@patch.object(GitHubSource, "_download_directory_recursive", return_value={"SKILL.md": "# ok"})
@patch("tools.skills_hub.httpx.get")
def test_falls_back_on_truncated_tree(self, mock_get, mock_fallback):
"""When tree is truncated, fall back to recursive Contents API."""
repo_resp = MagicMock(status_code=200, json=lambda: {"default_branch": "main"})
tree_resp = MagicMock(status_code=200, json=lambda: {"truncated": True, "tree": []})
mock_get.side_effect = [repo_resp, tree_resp]
src = self._source()
files = src._download_directory("owner/repo", "skills/my-skill")
assert files == {"SKILL.md": "# ok"}
mock_fallback.assert_called_once_with("owner/repo", "skills/my-skill")
@patch.object(GitHubSource, "_download_directory_recursive", return_value={"SKILL.md": "# ok"})
@patch("tools.skills_hub.httpx.get")
def test_falls_back_on_repo_api_failure(self, mock_get, mock_fallback):
"""When the repo endpoint returns non-200, fall back to Contents API."""
mock_get.return_value = MagicMock(status_code=404)
src = self._source()
files = src._download_directory("owner/repo", "skills/my-skill")
assert files == {"SKILL.md": "# ok"}
mock_fallback.assert_called_once()
@patch.object(GitHubSource, "_fetch_file_content")
@patch("tools.skills_hub.httpx.get")
def test_tree_api_skips_failed_file_fetches(self, mock_get, mock_fetch):
"""Files that fail to fetch are skipped, not fatal."""
repo_resp = MagicMock(status_code=200, json=lambda: {"default_branch": "main"})
tree_resp = MagicMock(status_code=200, json=lambda: {
"truncated": False,
"tree": [
{"type": "blob", "path": "skills/my-skill/SKILL.md"},
{"type": "blob", "path": "skills/my-skill/scripts/run.py"},
],
})
mock_get.side_effect = [repo_resp, tree_resp]
mock_fetch.side_effect = lambda repo, path: (
"# Skill" if path.endswith("SKILL.md") else None
)
src = self._source()
files = src._download_directory("owner/repo", "skills/my-skill")
assert "SKILL.md" in files
assert "scripts/run.py" not in files
@patch.object(GitHubSource, "_download_directory_recursive", return_value={})
@patch("tools.skills_hub.httpx.get")
def test_falls_back_on_network_error(self, mock_get, mock_fallback):
"""Network errors in tree API trigger fallback."""
mock_get.side_effect = httpx.ConnectError("connection refused")
src = self._source()
src._download_directory("owner/repo", "skills/my-skill")
mock_fallback.assert_called_once()
class TestDownloadDirectoryRecursive:
"""Tests for the Contents API fallback path."""
def _source(self):
auth = MagicMock(spec=GitHubAuth)
auth.get_headers.return_value = {}
return GitHubSource(auth=auth)
@patch.object(GitHubSource, "_fetch_file_content")
@patch("tools.skills_hub.httpx.get")
def test_recursive_downloads_subdirectories(self, mock_get, mock_fetch):
"""Contents API recursion includes subdirectories."""
root_resp = MagicMock(status_code=200, json=lambda: [
{"name": "SKILL.md", "type": "file", "path": "skill/SKILL.md"},
{"name": "scripts", "type": "dir", "path": "skill/scripts"},
])
sub_resp = MagicMock(status_code=200, json=lambda: [
{"name": "run.py", "type": "file", "path": "skill/scripts/run.py"},
])
mock_get.side_effect = [root_resp, sub_resp]
mock_fetch.side_effect = lambda repo, path: f"content-of-{path}"
src = self._source()
files = src._download_directory_recursive("owner/repo", "skill")
assert "SKILL.md" in files
assert "scripts/run.py" in files
@patch.object(GitHubSource, "_fetch_file_content")
@patch("tools.skills_hub.httpx.get")
def test_recursive_handles_subdir_failure(self, mock_get, mock_fetch):
"""Subdirectory 403/rate-limit returns empty but doesn't crash."""
root_resp = MagicMock(status_code=200, json=lambda: [
{"name": "SKILL.md", "type": "file", "path": "skill/SKILL.md"},
{"name": "scripts", "type": "dir", "path": "skill/scripts"},
])
sub_resp = MagicMock(status_code=403)
mock_get.side_effect = [root_resp, sub_resp]
mock_fetch.return_value = "content"
src = self._source()
files = src._download_directory_recursive("owner/repo", "skill")
assert "SKILL.md" in files
assert "scripts/run.py" not in files # lost due to rate limit

View File

@@ -404,11 +404,75 @@ class GitHubSource(SkillSource):
return skills
def _download_directory(self, repo: str, path: str) -> Dict[str, str]:
"""Recursively download all text files from a GitHub directory."""
"""Recursively download all text files from a GitHub directory.
Uses the Git Trees API first (single call for the entire tree) to
avoid per-directory rate limiting that causes silent subdirectory
loss. Falls back to the recursive Contents API when the tree
endpoint is unavailable or the response is truncated.
"""
files = self._download_directory_via_tree(repo, path)
if files is not None:
return files
logger.debug("Tree API unavailable for %s/%s, falling back to Contents API", repo, path)
return self._download_directory_recursive(repo, path)
def _download_directory_via_tree(self, repo: str, path: str) -> Optional[Dict[str, str]]:
"""Download an entire directory using the Git Trees API (single request)."""
path = path.rstrip("/")
headers = self.auth.get_headers()
# Resolve the default branch via the repo endpoint
try:
repo_url = f"https://api.github.com/repos/{repo}"
resp = httpx.get(repo_url, headers=headers, timeout=15, follow_redirects=True)
if resp.status_code != 200:
return None
default_branch = resp.json().get("default_branch", "main")
except (httpx.HTTPError, ValueError):
return None
# Fetch the full recursive tree (branch name works as tree-ish)
try:
tree_url = f"https://api.github.com/repos/{repo}/git/trees/{default_branch}"
resp = httpx.get(
tree_url, params={"recursive": "1"},
headers=headers, timeout=30, follow_redirects=True,
)
if resp.status_code != 200:
return None
tree_data = resp.json()
if tree_data.get("truncated"):
logger.debug("Git tree truncated for %s, falling back to Contents API", repo)
return None
except (httpx.HTTPError, ValueError):
return None
# Filter to blobs under our target path and fetch content
prefix = f"{path}/"
files: Dict[str, str] = {}
for item in tree_data.get("tree", []):
if item.get("type") != "blob":
continue
item_path = item.get("path", "")
if not item_path.startswith(prefix):
continue
rel_path = item_path[len(prefix):]
content = self._fetch_file_content(repo, item_path)
if content is not None:
files[rel_path] = content
else:
logger.debug("Skipped file (fetch failed): %s/%s", repo, item_path)
return files if files else None
def _download_directory_recursive(self, repo: str, path: str) -> Dict[str, str]:
"""Recursively download via Contents API (fallback)."""
url = f"https://api.github.com/repos/{repo}/contents/{path.rstrip('/')}"
try:
resp = httpx.get(url, headers=self.auth.get_headers(), timeout=15, follow_redirects=True)
if resp.status_code != 200:
logger.debug("Contents API returned %d for %s/%s", resp.status_code, repo, path)
return {}
except httpx.HTTPError:
return {}
@@ -428,7 +492,9 @@ class GitHubSource(SkillSource):
rel_path = name
files[rel_path] = content
elif entry_type == "dir":
sub_files = self._download_directory(repo, entry.get("path", ""))
sub_files = self._download_directory_recursive(repo, entry.get("path", ""))
if not sub_files:
logger.debug("Empty or failed subdirectory: %s/%s", repo, entry.get("path", ""))
for sub_name, sub_content in sub_files.items():
files[f"{name}/{sub_name}"] = sub_content