From df9020dfa315d0cdfa1a1d129e4bb75106887e57 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 22:31:09 -0700 Subject: [PATCH 1/2] fix: harden clawhub skill search exact matches --- tests/tools/test_skills_hub_clawhub.py | 142 ++++++++++++++++++++++--- tools/skills_hub.py | 128 +++++++++++++++++++--- 2 files changed, 243 insertions(+), 27 deletions(-) diff --git a/tests/tools/test_skills_hub_clawhub.py b/tests/tools/test_skills_hub_clawhub.py index 98611d8d1..3601881ca 100644 --- a/tests/tools/test_skills_hub_clawhub.py +++ b/tests/tools/test_skills_hub_clawhub.py @@ -24,19 +24,26 @@ class TestClawHubSource(unittest.TestCase): @patch("tools.skills_hub._read_index_cache", return_value=None) @patch("tools.skills_hub.httpx.get") def test_search_uses_new_endpoint_and_parses_items(self, mock_get, _mock_read_cache, _mock_write_cache): - mock_get.return_value = _MockResponse( - status_code=200, - json_data={ - "items": [ - { - "slug": "caldav-calendar", - "displayName": "CalDAV Calendar", - "summary": "Calendar integration", - "tags": ["calendar", "productivity"], - } - ] - }, - ) + def side_effect(url, *args, **kwargs): + if url.endswith("/skills"): + return _MockResponse( + status_code=200, + json_data={ + "items": [ + { + "slug": "caldav-calendar", + "displayName": "CalDAV Calendar", + "summary": "Calendar integration", + "tags": ["calendar", "productivity"], + } + ] + }, + ) + if url.endswith("/skills/caldav"): + return _MockResponse(status_code=404, json_data={}) + return _MockResponse(status_code=404, json_data={}) + + mock_get.side_effect = side_effect results = self.src.search("caldav", limit=5) @@ -45,11 +52,93 @@ class TestClawHubSource(unittest.TestCase): self.assertEqual(results[0].name, "CalDAV Calendar") self.assertEqual(results[0].description, "Calendar integration") - mock_get.assert_called_once() - args, kwargs = mock_get.call_args + first_call = mock_get.call_args_list[0] + args, kwargs = first_call self.assertTrue(args[0].endswith("/skills")) self.assertEqual(kwargs["params"], {"search": "caldav", "limit": 5}) + @patch("tools.skills_hub._write_index_cache") + @patch("tools.skills_hub._read_index_cache", return_value=None) + @patch("tools.skills_hub.httpx.get") + def test_search_falls_back_to_exact_slug_when_search_results_are_irrelevant( + self, mock_get, _mock_read_cache, _mock_write_cache + ): + def side_effect(url, *args, **kwargs): + if url.endswith("/skills"): + return _MockResponse( + status_code=200, + json_data={ + "items": [ + { + "slug": "apple-music-dj", + "displayName": "Apple Music DJ", + "summary": "Unrelated result", + } + ] + }, + ) + if url.endswith("/skills/self-improving-agent"): + return _MockResponse( + status_code=200, + json_data={ + "skill": { + "slug": "self-improving-agent", + "displayName": "self-improving-agent", + "summary": "Captures learnings and errors for continuous improvement.", + "tags": {"latest": "3.0.2", "automation": "3.0.2"}, + }, + "latestVersion": {"version": "3.0.2"}, + }, + ) + return _MockResponse(status_code=404, json_data={}) + + mock_get.side_effect = side_effect + + results = self.src.search("self-improving-agent", limit=5) + + self.assertEqual(len(results), 1) + self.assertEqual(results[0].identifier, "self-improving-agent") + self.assertEqual(results[0].name, "self-improving-agent") + self.assertIn("continuous improvement", results[0].description) + + @patch("tools.skills_hub.httpx.get") + @patch( + "tools.skills_hub._read_index_cache", + return_value=[ + { + "name": "Apple Music DJ", + "description": "Unrelated cached result", + "source": "clawhub", + "identifier": "apple-music-dj", + "trust_level": "community", + "repo": None, + "path": None, + "tags": [], + "extra": {}, + } + ], + ) + def test_search_repairs_poisoned_cache_with_exact_slug_lookup(self, _mock_read_cache, mock_get): + mock_get.return_value = _MockResponse( + status_code=200, + json_data={ + "skill": { + "slug": "self-improving-agent", + "displayName": "self-improving-agent", + "summary": "Captures learnings and errors for continuous improvement.", + "tags": {"latest": "3.0.2", "automation": "3.0.2"}, + }, + "latestVersion": {"version": "3.0.2"}, + }, + ) + + results = self.src.search("self-improving-agent", limit=5) + + self.assertEqual(len(results), 1) + self.assertEqual(results[0].identifier, "self-improving-agent") + mock_get.assert_called_once() + self.assertTrue(mock_get.call_args.args[0].endswith("/skills/self-improving-agent")) + @patch("tools.skills_hub.httpx.get") def test_inspect_maps_display_name_and_summary(self, mock_get): mock_get.return_value = _MockResponse( @@ -69,6 +158,29 @@ class TestClawHubSource(unittest.TestCase): self.assertEqual(meta.description, "Calendar integration") self.assertEqual(meta.identifier, "caldav-calendar") + @patch("tools.skills_hub.httpx.get") + def test_inspect_handles_nested_skill_payload(self, mock_get): + mock_get.return_value = _MockResponse( + status_code=200, + json_data={ + "skill": { + "slug": "self-improving-agent", + "displayName": "self-improving-agent", + "summary": "Captures learnings and errors for continuous improvement.", + "tags": {"latest": "3.0.2", "automation": "3.0.2"}, + }, + "latestVersion": {"version": "3.0.2"}, + }, + ) + + meta = self.src.inspect("self-improving-agent") + + self.assertIsNotNone(meta) + self.assertEqual(meta.name, "self-improving-agent") + self.assertIn("continuous improvement", meta.description) + self.assertEqual(meta.identifier, "self-improving-agent") + self.assertEqual(meta.tags, ["automation"]) + @patch("tools.skills_hub.httpx.get") def test_fetch_resolves_latest_version_and_downloads_raw_files(self, mock_get): def side_effect(url, *args, **kwargs): diff --git a/tools/skills_hub.py b/tools/skills_hub.py index 94845fe92..3c055183e 100644 --- a/tools/skills_hub.py +++ b/tools/skills_hub.py @@ -1156,11 +1156,118 @@ class ClawHubSource(SkillSource): def trust_level_for(self, identifier: str) -> str: return "community" + @staticmethod + def _normalize_tags(tags: Any) -> List[str]: + if isinstance(tags, list): + return [str(t) for t in tags] + if isinstance(tags, dict): + return [str(k) for k in tags.keys() if str(k) != "latest"] + return [] + + @staticmethod + def _coerce_skill_payload(data: Any) -> Optional[Dict[str, Any]]: + if not isinstance(data, dict): + return None + nested = data.get("skill") + if isinstance(nested, dict): + merged = dict(nested) + latest_version = data.get("latestVersion") + if latest_version is not None and "latestVersion" not in merged: + merged["latestVersion"] = latest_version + return merged + return data + + @staticmethod + def _query_terms(query: str) -> List[str]: + return [term for term in re.split(r"[^a-z0-9]+", query.lower()) if term] + + @classmethod + def _search_score(cls, query: str, meta: SkillMeta) -> int: + query_norm = query.strip().lower() + if not query_norm: + return 1 + + identifier = (meta.identifier or "").lower() + name = (meta.name or "").lower() + description = (meta.description or "").lower() + score = 0 + + if query_norm == identifier: + score += 100 + if query_norm == name: + score += 95 + if query_norm in identifier: + score += 40 + if query_norm in name: + score += 35 + if query_norm in description: + score += 10 + + for term in cls._query_terms(query_norm): + if term in identifier: + score += 15 + if term in name: + score += 12 + if term in description: + score += 3 + + return score + + @staticmethod + def _dedupe_results(results: List[SkillMeta]) -> List[SkillMeta]: + seen: set[str] = set() + deduped: List[SkillMeta] = [] + for result in results: + key = (result.identifier or result.name).lower() + if key in seen: + continue + seen.add(key) + deduped.append(result) + return deduped + + def _exact_slug_meta(self, query: str) -> Optional[SkillMeta]: + slug = query.strip().split("/")[-1] + if not slug or not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", slug): + return None + return self.inspect(slug) + + def _finalize_search_results(self, query: str, results: List[SkillMeta], limit: int) -> List[SkillMeta]: + query_norm = query.strip() + if not query_norm: + return self._dedupe_results(results)[:limit] + + filtered = [meta for meta in results if self._search_score(query_norm, meta) > 0] + filtered.sort( + key=lambda meta: ( + -self._search_score(query_norm, meta), + meta.name.lower(), + meta.identifier.lower(), + ) + ) + filtered = self._dedupe_results(filtered) + + exact = self._exact_slug_meta(query_norm) + if exact: + filtered = [meta for meta in filtered if self._search_score(query_norm, meta) >= 20] + filtered = self._dedupe_results([exact] + filtered) + + if filtered: + return filtered[:limit] + + if re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._/-]*", query_norm): + return [] + + return self._dedupe_results(results)[:limit] + def search(self, query: str, limit: int = 10) -> List[SkillMeta]: cache_key = f"clawhub_search_{hashlib.md5(query.encode()).hexdigest()}" cached = _read_index_cache(cache_key) if cached is not None: - return [SkillMeta(**s) for s in cached][:limit] + return self._finalize_search_results( + query, + [SkillMeta(**s) for s in cached], + limit, + ) try: resp = httpx.get( @@ -1185,20 +1292,19 @@ class ClawHubSource(SkillSource): continue display_name = item.get("displayName") or item.get("name") or slug summary = item.get("summary") or item.get("description") or "" - tags = item.get("tags", []) - if not isinstance(tags, list): - tags = [] + tags = self._normalize_tags(item.get("tags", [])) results.append(SkillMeta( name=display_name, description=summary, source="clawhub", identifier=slug, trust_level="community", - tags=[str(t) for t in tags], + tags=tags, )) - _write_index_cache(cache_key, [_skill_meta_to_dict(s) for s in results]) - return results + final_results = self._finalize_search_results(query, results, limit) + _write_index_cache(cache_key, [_skill_meta_to_dict(s) for s in final_results]) + return final_results def fetch(self, identifier: str) -> Optional[SkillBundle]: slug = identifier.split("/")[-1] @@ -1244,13 +1350,11 @@ class ClawHubSource(SkillSource): def inspect(self, identifier: str) -> Optional[SkillMeta]: slug = identifier.split("/")[-1] - data = self._get_json(f"{self.BASE_URL}/skills/{slug}") + data = self._coerce_skill_payload(self._get_json(f"{self.BASE_URL}/skills/{slug}")) if not isinstance(data, dict): return None - tags = data.get("tags", []) - if not isinstance(tags, list): - tags = [] + tags = self._normalize_tags(data.get("tags", [])) return SkillMeta( name=data.get("displayName") or data.get("name") or data.get("slug") or slug, @@ -1258,7 +1362,7 @@ class ClawHubSource(SkillSource): source="clawhub", identifier=data.get("slug") or slug, trust_level="community", - tags=[str(t) for t in tags], + tags=tags, ) def _get_json(self, url: str, timeout: int = 20) -> Optional[Any]: From 8ccd14a0d4c1788af7f71b5987ec80c6c417e122 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 23:15:04 -0700 Subject: [PATCH 2/2] fix: improve clawhub skill search matching --- tests/tools/test_skills_hub_clawhub.py | 68 ++++++++---- tools/skills_hub.py | 140 +++++++++++++++++++++++-- 2 files changed, 177 insertions(+), 31 deletions(-) diff --git a/tests/tools/test_skills_hub_clawhub.py b/tests/tools/test_skills_hub_clawhub.py index 3601881ca..2318ec80e 100644 --- a/tests/tools/test_skills_hub_clawhub.py +++ b/tests/tools/test_skills_hub_clawhub.py @@ -3,7 +3,7 @@ import unittest from unittest.mock import patch -from tools.skills_hub import ClawHubSource +from tools.skills_hub import ClawHubSource, SkillMeta class _MockResponse: @@ -22,8 +22,11 @@ class TestClawHubSource(unittest.TestCase): @patch("tools.skills_hub._write_index_cache") @patch("tools.skills_hub._read_index_cache", return_value=None) + @patch.object(ClawHubSource, "_load_catalog_index", return_value=[]) @patch("tools.skills_hub.httpx.get") - def test_search_uses_new_endpoint_and_parses_items(self, mock_get, _mock_read_cache, _mock_write_cache): + def test_search_uses_listing_endpoint_as_fallback( + self, mock_get, _mock_load_catalog, _mock_read_cache, _mock_write_cache + ): def side_effect(url, *args, **kwargs): if url.endswith("/skills"): return _MockResponse( @@ -52,16 +55,21 @@ class TestClawHubSource(unittest.TestCase): self.assertEqual(results[0].name, "CalDAV Calendar") self.assertEqual(results[0].description, "Calendar integration") - first_call = mock_get.call_args_list[0] - args, kwargs = first_call + self.assertGreaterEqual(mock_get.call_count, 2) + args, kwargs = mock_get.call_args_list[0] self.assertTrue(args[0].endswith("/skills")) self.assertEqual(kwargs["params"], {"search": "caldav", "limit": 5}) @patch("tools.skills_hub._write_index_cache") @patch("tools.skills_hub._read_index_cache", return_value=None) + @patch.object( + ClawHubSource, + "_load_catalog_index", + return_value=[], + ) @patch("tools.skills_hub.httpx.get") def test_search_falls_back_to_exact_slug_when_search_results_are_irrelevant( - self, mock_get, _mock_read_cache, _mock_write_cache + self, mock_get, _mock_load_catalog, _mock_read_cache, _mock_write_cache ): def side_effect(url, *args, **kwargs): if url.endswith("/skills"): @@ -102,23 +110,7 @@ class TestClawHubSource(unittest.TestCase): self.assertIn("continuous improvement", results[0].description) @patch("tools.skills_hub.httpx.get") - @patch( - "tools.skills_hub._read_index_cache", - return_value=[ - { - "name": "Apple Music DJ", - "description": "Unrelated cached result", - "source": "clawhub", - "identifier": "apple-music-dj", - "trust_level": "community", - "repo": None, - "path": None, - "tags": [], - "extra": {}, - } - ], - ) - def test_search_repairs_poisoned_cache_with_exact_slug_lookup(self, _mock_read_cache, mock_get): + def test_search_repairs_poisoned_cache_with_exact_slug_lookup(self, mock_get): mock_get.return_value = _MockResponse( status_code=200, json_data={ @@ -132,13 +124,43 @@ class TestClawHubSource(unittest.TestCase): }, ) - results = self.src.search("self-improving-agent", limit=5) + poisoned = [ + SkillMeta( + name="Apple Music DJ", + description="Unrelated cached result", + source="clawhub", + identifier="apple-music-dj", + trust_level="community", + tags=[], + ) + ] + results = self.src._finalize_search_results("self-improving-agent", poisoned, 5) self.assertEqual(len(results), 1) self.assertEqual(results[0].identifier, "self-improving-agent") mock_get.assert_called_once() self.assertTrue(mock_get.call_args.args[0].endswith("/skills/self-improving-agent")) + @patch.object( + ClawHubSource, + "_exact_slug_meta", + return_value=SkillMeta( + name="self-improving-agent", + description="Captures learnings and errors for continuous improvement.", + source="clawhub", + identifier="self-improving-agent", + trust_level="community", + tags=["automation"], + ), + ) + def test_search_matches_space_separated_query_to_hyphenated_slug( + self, _mock_exact_slug + ): + results = self.src.search("self improving", limit=5) + + self.assertEqual(len(results), 1) + self.assertEqual(results[0].identifier, "self-improving-agent") + @patch("tools.skills_hub.httpx.get") def test_inspect_maps_display_name_and_summary(self, mock_get): mock_get.return_value = _MockResponse( diff --git a/tools/skills_hub.py b/tools/skills_hub.py index 3c055183e..52472656c 100644 --- a/tools/skills_hub.py +++ b/tools/skills_hub.py @@ -1190,12 +1190,29 @@ class ClawHubSource(SkillSource): identifier = (meta.identifier or "").lower() name = (meta.name or "").lower() description = (meta.description or "").lower() + normalized_identifier = " ".join(cls._query_terms(identifier)) + normalized_name = " ".join(cls._query_terms(name)) + query_terms = cls._query_terms(query_norm) + identifier_terms = cls._query_terms(identifier) + name_terms = cls._query_terms(name) score = 0 if query_norm == identifier: - score += 100 + score += 140 if query_norm == name: + score += 130 + if normalized_identifier == query_norm: + score += 125 + if normalized_name == query_norm: + score += 120 + if normalized_identifier.startswith(query_norm): score += 95 + if normalized_name.startswith(query_norm): + score += 90 + if query_terms and identifier_terms[: len(query_terms)] == query_terms: + score += 70 + if query_terms and name_terms[: len(query_terms)] == query_terms: + score += 65 if query_norm in identifier: score += 40 if query_norm in name: @@ -1203,10 +1220,10 @@ class ClawHubSource(SkillSource): if query_norm in description: score += 10 - for term in cls._query_terms(query_norm): - if term in identifier: + for term in query_terms: + if term in identifier_terms: score += 15 - if term in name: + if term in name_terms: score += 12 if term in description: score += 3 @@ -1227,9 +1244,36 @@ class ClawHubSource(SkillSource): def _exact_slug_meta(self, query: str) -> Optional[SkillMeta]: slug = query.strip().split("/")[-1] - if not slug or not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", slug): - return None - return self.inspect(slug) + query_terms = self._query_terms(query) + candidates: List[str] = [] + + if slug and re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", slug): + candidates.append(slug) + + if query_terms: + base_slug = "-".join(query_terms) + if len(query_terms) >= 2: + candidates.extend([ + f"{base_slug}-agent", + f"{base_slug}-skill", + f"{base_slug}-tool", + f"{base_slug}-assistant", + f"{base_slug}-playbook", + base_slug, + ]) + else: + candidates.append(base_slug) + + seen: set[str] = set() + for candidate in candidates: + if candidate in seen: + continue + seen.add(candidate) + meta = self.inspect(candidate) + if meta: + return meta + + return None def _finalize_search_results(self, query: str, results: List[SkillMeta], limit: int) -> List[SkillMeta]: query_norm = query.strip() @@ -1260,7 +1304,21 @@ class ClawHubSource(SkillSource): return self._dedupe_results(results)[:limit] def search(self, query: str, limit: int = 10) -> List[SkillMeta]: - cache_key = f"clawhub_search_{hashlib.md5(query.encode()).hexdigest()}" + query = query.strip() + + if query: + query_terms = self._query_terms(query) + if len(query_terms) >= 2: + direct = self._exact_slug_meta(query) + if direct: + return [direct] + + results = self._search_catalog(query, limit=limit) + if results: + return results + + # Empty query or catalog fallback failure: use the lightweight listing API. + cache_key = f"clawhub_search_listing_v1_{hashlib.md5(query.encode()).hexdigest()}_{limit}" cached = _read_index_cache(cache_key) if cached is not None: return self._finalize_search_results( @@ -1365,6 +1423,72 @@ class ClawHubSource(SkillSource): tags=tags, ) + def _search_catalog(self, query: str, limit: int = 10) -> List[SkillMeta]: + cache_key = f"clawhub_search_catalog_v1_{hashlib.md5(f'{query}|{limit}'.encode()).hexdigest()}" + cached = _read_index_cache(cache_key) + if cached is not None: + return [SkillMeta(**s) for s in cached][:limit] + + catalog = self._load_catalog_index() + if not catalog: + return [] + + results = self._finalize_search_results(query, catalog, limit) + _write_index_cache(cache_key, [_skill_meta_to_dict(s) for s in results]) + return results + + def _load_catalog_index(self) -> List[SkillMeta]: + cache_key = "clawhub_catalog_v1" + cached = _read_index_cache(cache_key) + if cached is not None: + return [SkillMeta(**s) for s in cached] + + cursor: Optional[str] = None + results: List[SkillMeta] = [] + seen: set[str] = set() + max_pages = 50 + + for _ in range(max_pages): + params: Dict[str, Any] = {"limit": 200} + if cursor: + params["cursor"] = cursor + + try: + resp = httpx.get(f"{self.BASE_URL}/skills", params=params, timeout=30) + if resp.status_code != 200: + break + data = resp.json() + except (httpx.HTTPError, json.JSONDecodeError): + break + + items = data.get("items", []) if isinstance(data, dict) else [] + if not isinstance(items, list) or not items: + break + + for item in items: + slug = item.get("slug") + if not isinstance(slug, str) or not slug or slug in seen: + continue + seen.add(slug) + display_name = item.get("displayName") or item.get("name") or slug + summary = item.get("summary") or item.get("description") or "" + tags = self._normalize_tags(item.get("tags", [])) + results.append(SkillMeta( + name=display_name, + description=summary, + source="clawhub", + identifier=slug, + trust_level="community", + tags=tags, + )) + + cursor = data.get("nextCursor") if isinstance(data, dict) else None + if not isinstance(cursor, str) or not cursor: + break + + _write_index_cache(cache_key, [_skill_meta_to_dict(s) for s in results]) + return results + def _get_json(self, url: str, timeout: int = 20) -> Optional[Any]: try: resp = httpx.get(url, timeout=timeout)