From 8ccd14a0d4c1788af7f71b5987ec80c6c417e122 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 23:15:04 -0700 Subject: [PATCH] fix: improve clawhub skill search matching --- tests/tools/test_skills_hub_clawhub.py | 68 ++++++++---- tools/skills_hub.py | 140 +++++++++++++++++++++++-- 2 files changed, 177 insertions(+), 31 deletions(-) diff --git a/tests/tools/test_skills_hub_clawhub.py b/tests/tools/test_skills_hub_clawhub.py index 3601881c..2318ec80 100644 --- a/tests/tools/test_skills_hub_clawhub.py +++ b/tests/tools/test_skills_hub_clawhub.py @@ -3,7 +3,7 @@ import unittest from unittest.mock import patch -from tools.skills_hub import ClawHubSource +from tools.skills_hub import ClawHubSource, SkillMeta class _MockResponse: @@ -22,8 +22,11 @@ class TestClawHubSource(unittest.TestCase): @patch("tools.skills_hub._write_index_cache") @patch("tools.skills_hub._read_index_cache", return_value=None) + @patch.object(ClawHubSource, "_load_catalog_index", return_value=[]) @patch("tools.skills_hub.httpx.get") - def test_search_uses_new_endpoint_and_parses_items(self, mock_get, _mock_read_cache, _mock_write_cache): + def test_search_uses_listing_endpoint_as_fallback( + self, mock_get, _mock_load_catalog, _mock_read_cache, _mock_write_cache + ): def side_effect(url, *args, **kwargs): if url.endswith("/skills"): return _MockResponse( @@ -52,16 +55,21 @@ class TestClawHubSource(unittest.TestCase): self.assertEqual(results[0].name, "CalDAV Calendar") self.assertEqual(results[0].description, "Calendar integration") - first_call = mock_get.call_args_list[0] - args, kwargs = first_call + self.assertGreaterEqual(mock_get.call_count, 2) + args, kwargs = mock_get.call_args_list[0] self.assertTrue(args[0].endswith("/skills")) self.assertEqual(kwargs["params"], {"search": "caldav", "limit": 5}) @patch("tools.skills_hub._write_index_cache") @patch("tools.skills_hub._read_index_cache", return_value=None) + @patch.object( + ClawHubSource, + "_load_catalog_index", + return_value=[], + ) @patch("tools.skills_hub.httpx.get") def test_search_falls_back_to_exact_slug_when_search_results_are_irrelevant( - self, mock_get, _mock_read_cache, _mock_write_cache + self, mock_get, _mock_load_catalog, _mock_read_cache, _mock_write_cache ): def side_effect(url, *args, **kwargs): if url.endswith("/skills"): @@ -102,23 +110,7 @@ class TestClawHubSource(unittest.TestCase): self.assertIn("continuous improvement", results[0].description) @patch("tools.skills_hub.httpx.get") - @patch( - "tools.skills_hub._read_index_cache", - return_value=[ - { - "name": "Apple Music DJ", - "description": "Unrelated cached result", - "source": "clawhub", - "identifier": "apple-music-dj", - "trust_level": "community", - "repo": None, - "path": None, - "tags": [], - "extra": {}, - } - ], - ) - def test_search_repairs_poisoned_cache_with_exact_slug_lookup(self, _mock_read_cache, mock_get): + def test_search_repairs_poisoned_cache_with_exact_slug_lookup(self, mock_get): mock_get.return_value = _MockResponse( status_code=200, json_data={ @@ -132,13 +124,43 @@ class TestClawHubSource(unittest.TestCase): }, ) - results = self.src.search("self-improving-agent", limit=5) + poisoned = [ + SkillMeta( + name="Apple Music DJ", + description="Unrelated cached result", + source="clawhub", + identifier="apple-music-dj", + trust_level="community", + tags=[], + ) + ] + results = self.src._finalize_search_results("self-improving-agent", poisoned, 5) self.assertEqual(len(results), 1) self.assertEqual(results[0].identifier, "self-improving-agent") mock_get.assert_called_once() self.assertTrue(mock_get.call_args.args[0].endswith("/skills/self-improving-agent")) + @patch.object( + ClawHubSource, + "_exact_slug_meta", + return_value=SkillMeta( + name="self-improving-agent", + description="Captures learnings and errors for continuous improvement.", + source="clawhub", + identifier="self-improving-agent", + trust_level="community", + tags=["automation"], + ), + ) + def test_search_matches_space_separated_query_to_hyphenated_slug( + self, _mock_exact_slug + ): + results = self.src.search("self improving", limit=5) + + self.assertEqual(len(results), 1) + self.assertEqual(results[0].identifier, "self-improving-agent") + @patch("tools.skills_hub.httpx.get") def test_inspect_maps_display_name_and_summary(self, mock_get): mock_get.return_value = _MockResponse( diff --git a/tools/skills_hub.py b/tools/skills_hub.py index 3c055183..52472656 100644 --- a/tools/skills_hub.py +++ b/tools/skills_hub.py @@ -1190,12 +1190,29 @@ class ClawHubSource(SkillSource): identifier = (meta.identifier or "").lower() name = (meta.name or "").lower() description = (meta.description or "").lower() + normalized_identifier = " ".join(cls._query_terms(identifier)) + normalized_name = " ".join(cls._query_terms(name)) + query_terms = cls._query_terms(query_norm) + identifier_terms = cls._query_terms(identifier) + name_terms = cls._query_terms(name) score = 0 if query_norm == identifier: - score += 100 + score += 140 if query_norm == name: + score += 130 + if normalized_identifier == query_norm: + score += 125 + if normalized_name == query_norm: + score += 120 + if normalized_identifier.startswith(query_norm): score += 95 + if normalized_name.startswith(query_norm): + score += 90 + if query_terms and identifier_terms[: len(query_terms)] == query_terms: + score += 70 + if query_terms and name_terms[: len(query_terms)] == query_terms: + score += 65 if query_norm in identifier: score += 40 if query_norm in name: @@ -1203,10 +1220,10 @@ class ClawHubSource(SkillSource): if query_norm in description: score += 10 - for term in cls._query_terms(query_norm): - if term in identifier: + for term in query_terms: + if term in identifier_terms: score += 15 - if term in name: + if term in name_terms: score += 12 if term in description: score += 3 @@ -1227,9 +1244,36 @@ class ClawHubSource(SkillSource): def _exact_slug_meta(self, query: str) -> Optional[SkillMeta]: slug = query.strip().split("/")[-1] - if not slug or not re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", slug): - return None - return self.inspect(slug) + query_terms = self._query_terms(query) + candidates: List[str] = [] + + if slug and re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", slug): + candidates.append(slug) + + if query_terms: + base_slug = "-".join(query_terms) + if len(query_terms) >= 2: + candidates.extend([ + f"{base_slug}-agent", + f"{base_slug}-skill", + f"{base_slug}-tool", + f"{base_slug}-assistant", + f"{base_slug}-playbook", + base_slug, + ]) + else: + candidates.append(base_slug) + + seen: set[str] = set() + for candidate in candidates: + if candidate in seen: + continue + seen.add(candidate) + meta = self.inspect(candidate) + if meta: + return meta + + return None def _finalize_search_results(self, query: str, results: List[SkillMeta], limit: int) -> List[SkillMeta]: query_norm = query.strip() @@ -1260,7 +1304,21 @@ class ClawHubSource(SkillSource): return self._dedupe_results(results)[:limit] def search(self, query: str, limit: int = 10) -> List[SkillMeta]: - cache_key = f"clawhub_search_{hashlib.md5(query.encode()).hexdigest()}" + query = query.strip() + + if query: + query_terms = self._query_terms(query) + if len(query_terms) >= 2: + direct = self._exact_slug_meta(query) + if direct: + return [direct] + + results = self._search_catalog(query, limit=limit) + if results: + return results + + # Empty query or catalog fallback failure: use the lightweight listing API. + cache_key = f"clawhub_search_listing_v1_{hashlib.md5(query.encode()).hexdigest()}_{limit}" cached = _read_index_cache(cache_key) if cached is not None: return self._finalize_search_results( @@ -1365,6 +1423,72 @@ class ClawHubSource(SkillSource): tags=tags, ) + def _search_catalog(self, query: str, limit: int = 10) -> List[SkillMeta]: + cache_key = f"clawhub_search_catalog_v1_{hashlib.md5(f'{query}|{limit}'.encode()).hexdigest()}" + cached = _read_index_cache(cache_key) + if cached is not None: + return [SkillMeta(**s) for s in cached][:limit] + + catalog = self._load_catalog_index() + if not catalog: + return [] + + results = self._finalize_search_results(query, catalog, limit) + _write_index_cache(cache_key, [_skill_meta_to_dict(s) for s in results]) + return results + + def _load_catalog_index(self) -> List[SkillMeta]: + cache_key = "clawhub_catalog_v1" + cached = _read_index_cache(cache_key) + if cached is not None: + return [SkillMeta(**s) for s in cached] + + cursor: Optional[str] = None + results: List[SkillMeta] = [] + seen: set[str] = set() + max_pages = 50 + + for _ in range(max_pages): + params: Dict[str, Any] = {"limit": 200} + if cursor: + params["cursor"] = cursor + + try: + resp = httpx.get(f"{self.BASE_URL}/skills", params=params, timeout=30) + if resp.status_code != 200: + break + data = resp.json() + except (httpx.HTTPError, json.JSONDecodeError): + break + + items = data.get("items", []) if isinstance(data, dict) else [] + if not isinstance(items, list) or not items: + break + + for item in items: + slug = item.get("slug") + if not isinstance(slug, str) or not slug or slug in seen: + continue + seen.add(slug) + display_name = item.get("displayName") or item.get("name") or slug + summary = item.get("summary") or item.get("description") or "" + tags = self._normalize_tags(item.get("tags", [])) + results.append(SkillMeta( + name=display_name, + description=summary, + source="clawhub", + identifier=slug, + trust_level="community", + tags=tags, + )) + + cursor = data.get("nextCursor") if isinstance(data, dict) else None + if not isinstance(cursor, str) or not cursor: + break + + _write_index_cache(cache_key, [_skill_meta_to_dict(s) for s in results]) + return results + def _get_json(self, url: str, timeout: int = 20) -> Optional[Any]: try: resp = httpx.get(url, timeout=timeout)