fix: improve clawhub skill search matching

This commit is contained in:
teknium1
2026-03-14 23:15:04 -07:00
parent df9020dfa3
commit 8ccd14a0d4
2 changed files with 177 additions and 31 deletions

View File

@@ -3,7 +3,7 @@
import unittest
from unittest.mock import patch
from tools.skills_hub import ClawHubSource
from tools.skills_hub import ClawHubSource, SkillMeta
class _MockResponse:
@@ -22,8 +22,11 @@ class TestClawHubSource(unittest.TestCase):
@patch("tools.skills_hub._write_index_cache")
@patch("tools.skills_hub._read_index_cache", return_value=None)
@patch.object(ClawHubSource, "_load_catalog_index", return_value=[])
@patch("tools.skills_hub.httpx.get")
def test_search_uses_new_endpoint_and_parses_items(self, mock_get, _mock_read_cache, _mock_write_cache):
def test_search_uses_listing_endpoint_as_fallback(
self, mock_get, _mock_load_catalog, _mock_read_cache, _mock_write_cache
):
def side_effect(url, *args, **kwargs):
if url.endswith("/skills"):
return _MockResponse(
@@ -52,16 +55,21 @@ class TestClawHubSource(unittest.TestCase):
self.assertEqual(results[0].name, "CalDAV Calendar")
self.assertEqual(results[0].description, "Calendar integration")
first_call = mock_get.call_args_list[0]
args, kwargs = first_call
self.assertGreaterEqual(mock_get.call_count, 2)
args, kwargs = mock_get.call_args_list[0]
self.assertTrue(args[0].endswith("/skills"))
self.assertEqual(kwargs["params"], {"search": "caldav", "limit": 5})
@patch("tools.skills_hub._write_index_cache")
@patch("tools.skills_hub._read_index_cache", return_value=None)
@patch.object(
ClawHubSource,
"_load_catalog_index",
return_value=[],
)
@patch("tools.skills_hub.httpx.get")
def test_search_falls_back_to_exact_slug_when_search_results_are_irrelevant(
self, mock_get, _mock_read_cache, _mock_write_cache
self, mock_get, _mock_load_catalog, _mock_read_cache, _mock_write_cache
):
def side_effect(url, *args, **kwargs):
if url.endswith("/skills"):
@@ -102,23 +110,7 @@ class TestClawHubSource(unittest.TestCase):
self.assertIn("continuous improvement", results[0].description)
@patch("tools.skills_hub.httpx.get")
@patch(
"tools.skills_hub._read_index_cache",
return_value=[
{
"name": "Apple Music DJ",
"description": "Unrelated cached result",
"source": "clawhub",
"identifier": "apple-music-dj",
"trust_level": "community",
"repo": None,
"path": None,
"tags": [],
"extra": {},
}
],
)
def test_search_repairs_poisoned_cache_with_exact_slug_lookup(self, _mock_read_cache, mock_get):
def test_search_repairs_poisoned_cache_with_exact_slug_lookup(self, mock_get):
mock_get.return_value = _MockResponse(
status_code=200,
json_data={
@@ -132,13 +124,43 @@ class TestClawHubSource(unittest.TestCase):
},
)
results = self.src.search("self-improving-agent", limit=5)
poisoned = [
SkillMeta(
name="Apple Music DJ",
description="Unrelated cached result",
source="clawhub",
identifier="apple-music-dj",
trust_level="community",
tags=[],
)
]
results = self.src._finalize_search_results("self-improving-agent", poisoned, 5)
self.assertEqual(len(results), 1)
self.assertEqual(results[0].identifier, "self-improving-agent")
mock_get.assert_called_once()
self.assertTrue(mock_get.call_args.args[0].endswith("/skills/self-improving-agent"))
@patch.object(
    ClawHubSource,
    "_exact_slug_meta",
    # Stub a successful exact-slug resolution so this test isolates the
    # query-normalization path in search(); no network or catalog access
    # happens. NOTE(review): patch.object injects the mock positionally,
    # so the parameter order below must match the decorator order.
    return_value=SkillMeta(
        name="self-improving-agent",
        description="Captures learnings and errors for continuous improvement.",
        source="clawhub",
        identifier="self-improving-agent",
        trust_level="community",
        tags=["automation"],
    ),
)
def test_search_matches_space_separated_query_to_hyphenated_slug(
    self, _mock_exact_slug
):
    """A space-separated multi-word query resolves to its hyphenated slug.

    "self improving" should surface exactly one result whose identifier
    is the hyphen-joined slug "self-improving-agent".
    """
    results = self.src.search("self improving", limit=5)
    self.assertEqual(len(results), 1)
    self.assertEqual(results[0].identifier, "self-improving-agent")
@patch("tools.skills_hub.httpx.get")
def test_inspect_maps_display_name_and_summary(self, mock_get):
mock_get.return_value = _MockResponse(

View File

@@ -1190,12 +1190,29 @@ class ClawHubSource(SkillSource):
identifier = (meta.identifier or "").lower()
name = (meta.name or "").lower()
description = (meta.description or "").lower()
normalized_identifier = " ".join(cls._query_terms(identifier))
normalized_name = " ".join(cls._query_terms(name))
query_terms = cls._query_terms(query_norm)
identifier_terms = cls._query_terms(identifier)
name_terms = cls._query_terms(name)
score = 0
if query_norm == identifier:
score += 100
score += 140
if query_norm == name:
score += 130
if normalized_identifier == query_norm:
score += 125
if normalized_name == query_norm:
score += 120
if normalized_identifier.startswith(query_norm):
score += 95
if normalized_name.startswith(query_norm):
score += 90
if query_terms and identifier_terms[: len(query_terms)] == query_terms:
score += 70
if query_terms and name_terms[: len(query_terms)] == query_terms:
score += 65
if query_norm in identifier:
score += 40
if query_norm in name:
@@ -1203,10 +1220,10 @@ class ClawHubSource(SkillSource):
if query_norm in description:
score += 10
for term in cls._query_terms(query_norm):
if term in identifier:
for term in query_terms:
if term in identifier_terms:
score += 15
if term in name:
if term in name_terms:
score += 12
if term in description:
score += 3
@@ -1227,9 +1244,36 @@ class ClawHubSource(SkillSource):
def _exact_slug_meta(self, query: str) -> Optional[SkillMeta]:
    """Resolve *query* to a skill via exact-slug lookups.

    Tries the literal trailing path segment of the query first, then
    hyphen-joined query terms with a few common slug suffixes (e.g.
    ``-agent``) so a space-separated query like ``"self improving"`` can
    match a published slug such as ``self-improving-agent``.

    Args:
        query: Raw user query; may be a slug, a URL-ish path, or words.

    Returns:
        The first candidate that ``self.inspect`` resolves, or ``None``.
    """
    # BUGFIX: merged-diff residue left early returns here that made the
    # candidate-generation logic below unreachable dead code.
    slug = query.strip().split("/")[-1]
    query_terms = self._query_terms(query)
    candidates: List[str] = []
    # Only accept slugs built from URL-safe characters.
    if slug and re.fullmatch(r"[A-Za-z0-9][A-Za-z0-9._-]*", slug):
        candidates.append(slug)
    if query_terms:
        base_slug = "-".join(query_terms)
        if len(query_terms) >= 2:
            # Suffixed guesses first; the bare joined form last so the
            # more specific candidates win when several would resolve.
            candidates.extend([
                f"{base_slug}-agent",
                f"{base_slug}-skill",
                f"{base_slug}-tool",
                f"{base_slug}-assistant",
                f"{base_slug}-playbook",
                base_slug,
            ])
        else:
            candidates.append(base_slug)
    seen: set[str] = set()
    for candidate in candidates:
        if candidate in seen:
            continue  # de-duplicate while preserving candidate priority
        seen.add(candidate)
        meta = self.inspect(candidate)
        if meta:
            return meta
    return None
def _finalize_search_results(self, query: str, results: List[SkillMeta], limit: int) -> List[SkillMeta]:
query_norm = query.strip()
@@ -1260,7 +1304,21 @@ class ClawHubSource(SkillSource):
return self._dedupe_results(results)[:limit]
def search(self, query: str, limit: int = 10) -> List[SkillMeta]:
cache_key = f"clawhub_search_{hashlib.md5(query.encode()).hexdigest()}"
query = query.strip()
if query:
query_terms = self._query_terms(query)
if len(query_terms) >= 2:
direct = self._exact_slug_meta(query)
if direct:
return [direct]
results = self._search_catalog(query, limit=limit)
if results:
return results
# Empty query or catalog fallback failure: use the lightweight listing API.
cache_key = f"clawhub_search_listing_v1_{hashlib.md5(query.encode()).hexdigest()}_{limit}"
cached = _read_index_cache(cache_key)
if cached is not None:
return self._finalize_search_results(
@@ -1365,6 +1423,72 @@ class ClawHubSource(SkillSource):
tags=tags,
)
def _search_catalog(self, query: str, limit: int = 10) -> List[SkillMeta]:
    """Search the full catalog index, caching scored results per (query, limit).

    Returns an empty list when the catalog index is unavailable, which
    lets the caller fall back to the listing API.
    """
    digest = hashlib.md5(f"{query}|{limit}".encode()).hexdigest()
    cache_key = f"clawhub_search_catalog_v1_{digest}"
    cached = _read_index_cache(cache_key)
    if cached is not None:
        # Cache stores plain dicts; rehydrate and honor the limit.
        return [SkillMeta(**entry) for entry in cached][:limit]
    catalog = self._load_catalog_index()
    if not catalog:
        return []
    scored = self._finalize_search_results(query, catalog, limit)
    _write_index_cache(cache_key, [_skill_meta_to_dict(meta) for meta in scored])
    return scored
def _load_catalog_index(self) -> List[SkillMeta]:
    """Fetch (and cache) the full ClawHub skill catalog.

    Pages through the listing endpoint until the server stops returning a
    cursor, up to a safety cap of 50 pages. Entries are de-duplicated by
    slug. Network or decode errors end pagination early, returning
    whatever was collected so far (best-effort).

    Returns:
        All known skills as SkillMeta; empty list on total failure.
    """
    cache_key = "clawhub_catalog_v1"
    cached = _read_index_cache(cache_key)
    if cached is not None:
        return [SkillMeta(**s) for s in cached]
    cursor: Optional[str] = None
    results: List[SkillMeta] = []
    seen: set[str] = set()
    max_pages = 50  # safety cap against a runaway/looping cursor
    for _ in range(max_pages):
        params: Dict[str, Any] = {"limit": 200}
        if cursor:
            params["cursor"] = cursor
        try:
            resp = httpx.get(f"{self.BASE_URL}/skills", params=params, timeout=30)
            if resp.status_code != 200:
                break
            data = resp.json()
        except (httpx.HTTPError, json.JSONDecodeError):
            break  # best-effort: keep whatever pages we already fetched
        items = data.get("items", []) if isinstance(data, dict) else []
        if not isinstance(items, list) or not items:
            break
        for item in items:
            slug = item.get("slug")
            if not isinstance(slug, str) or not slug or slug in seen:
                continue
            seen.add(slug)
            display_name = item.get("displayName") or item.get("name") or slug
            summary = item.get("summary") or item.get("description") or ""
            tags = self._normalize_tags(item.get("tags", []))
            results.append(SkillMeta(
                name=display_name,
                description=summary,
                source="clawhub",
                identifier=slug,
                trust_level="community",
                tags=tags,
            ))
        cursor = data.get("nextCursor") if isinstance(data, dict) else None
        if not isinstance(cursor, str) or not cursor:
            break
    # BUGFIX: only cache non-empty catalogs. A transient failure on the
    # first page previously cached [] under this key; since the cache
    # read returns [] (not None), every later call served the empty
    # catalog until expiry with no retry.
    if results:
        _write_index_cache(cache_key, [_skill_meta_to_dict(s) for s in results])
    return results
def _get_json(self, url: str, timeout: int = 20) -> Optional[Any]:
try:
resp = httpx.get(url, timeout=timeout)