Files
Timmy-time-dashboard/tests/unit/test_skill_discovery.py
Alexander Whitestone 92c677f029
Some checks failed
Tests / lint (pull_request) Successful in 13s
Tests / test (pull_request) Failing after 6m25s
feat: add automated skill discovery pipeline
Implements a background process that monitors session logs for successful
agent action sequences, uses the LLM router to extract reusable skill
templates, and stores them in a SQLite database. Discovered skills are
surfaced via dashboard notifications (push + WebSocket + event bus) and
a new /skills page with HTMX polling. Users can confirm, reject, or
archive discovered skills.

- src/timmy/skill_discovery.py: Core engine with LLM analysis + heuristic fallback
- src/dashboard/routes/skills.py: CRUD routes for skill management
- src/dashboard/templates/skills.html: Main skills page
- src/dashboard/templates/partials/skills_list.html: HTMX partial
- Background scheduler in app.py runs every 10 minutes
- 31 unit tests covering DB ops, clustering, parsing, dedup, and scan

Fixes #1011

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-22 19:56:35 -04:00

411 lines
14 KiB
Python

"""Unit tests for the skill discovery pipeline.
Tests the discovery engine's core logic: action clustering, skill extraction,
database persistence, deduplication, and status management.
"""
from __future__ import annotations
import json
from datetime import datetime, timedelta
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from timmy.skill_discovery import (
DiscoveredSkill,
SkillDiscoveryEngine,
)
@pytest.fixture
def engine():
    """Hand each requesting test its own newly constructed SkillDiscoveryEngine.

    The engine is built with a 0.7 confidence threshold and a minimum of
    two actions.
    """
    fresh_engine = SkillDiscoveryEngine(min_actions=2, confidence_threshold=0.7)
    return fresh_engine
@pytest.fixture(autouse=True)
def temp_db(tmp_path, monkeypatch):
    """Point ``timmy.skill_discovery.DB_PATH`` at a per-test scratch file.

    Declared ``autouse`` so it applies to every test in this module without
    being requested explicitly. Returns the path that was patched in.
    """
    scratch_db = tmp_path.joinpath("skills.db")
    monkeypatch.setattr("timmy.skill_discovery.DB_PATH", scratch_db)
    return scratch_db
# ---------------------------------------------------------------------------
# DiscoveredSkill dataclass
# ---------------------------------------------------------------------------
class TestDiscoveredSkill:
    """Checks on the DiscoveredSkill dataclass itself; no engine is involved."""

    def test_defaults(self):
        """A skill built without arguments carries the expected default values."""
        blank = DiscoveredSkill()
        assert blank.name == ""
        assert blank.status == "discovered"
        assert blank.confidence == 0.0
        # Generated identifiers are expected to carry the "skill_" prefix.
        assert blank.id.startswith("skill_")

    def test_to_dict(self):
        """``to_dict`` exposes the given fields plus ``id`` and ``created_at``."""
        payload = DiscoveredSkill(name="Test Skill", confidence=0.85).to_dict()
        assert payload["name"] == "Test Skill"
        assert payload["confidence"] == 0.85
        for required_key in ("id", "created_at"):
            assert required_key in payload

    def test_custom_fields(self):
        """Explicitly supplied fields are stored on the instance."""
        field_values = {
            "name": "Code Review",
            "category": "coding",
            "confidence": 0.92,
            "template": "Step 1: Read code\nStep 2: Analyze",
        }
        review = DiscoveredSkill(**field_values)
        assert review.category == "coding"
        assert "Step 1" in review.template
# ---------------------------------------------------------------------------
# Database operations
# ---------------------------------------------------------------------------
class TestDatabase:
    """Persistence round-trips through the engine's save/list/get/update API."""

    def test_save_and_list(self, engine):
        """A saved skill comes back from ``list_skills`` with its fields intact."""
        engine._save_skill(
            DiscoveredSkill(
                name="Git Workflow",
                description="Automates git operations",
                category="devops",
                confidence=0.88,
            )
        )
        stored = engine.list_skills()
        assert len(stored) == 1
        only_row = stored[0]
        assert only_row["name"] == "Git Workflow"
        assert only_row["category"] == "devops"

    def test_list_by_status(self, engine):
        """Filtering by status returns only the skills in that status."""
        name_by_status = (("discovered", "Skill A"), ("confirmed", "Skill B"))
        pending = [
            DiscoveredSkill(name=name, status=status) for status, name in name_by_status
        ]
        for skill in pending:
            engine._save_skill(skill)

        for status, expected_name in name_by_status:
            matches = engine.list_skills(status=status)
            assert len(matches) == 1
            assert matches[0]["name"] == expected_name

    def test_get_skill(self, engine):
        """A saved skill can be fetched back by its id."""
        saved = DiscoveredSkill(name="Find Me")
        engine._save_skill(saved)
        row = engine.get_skill(saved.id)
        assert row is not None
        assert row["name"] == "Find Me"

    def test_get_skill_not_found(self, engine):
        """Looking up an unknown id yields ``None``."""
        missing = engine.get_skill("nonexistent")
        assert missing is None

    def test_update_status(self, engine):
        """A valid status change is reported as truthy and is visible on re-read."""
        target = DiscoveredSkill(name="Status Test")
        engine._save_skill(target)
        changed = engine.update_status(target.id, "confirmed")
        assert changed
        reloaded = engine.get_skill(target.id)
        assert reloaded["status"] == "confirmed"

    def test_update_invalid_status(self, engine):
        """An unrecognised status value is refused (falsy return)."""
        target = DiscoveredSkill(name="Invalid Status")
        engine._save_skill(target)
        accepted = engine.update_status(target.id, "bogus")
        assert not accepted

    def test_skill_count(self, engine):
        """``skill_count`` tallies saved skills per status."""
        seeds = (("A", "discovered"), ("B", "discovered"), ("C", "confirmed"))
        for name, status in seeds:
            engine._save_skill(DiscoveredSkill(name=name, status=status))
        tally = engine.skill_count()
        assert tally["discovered"] == 2
        assert tally["confirmed"] == 1

    def test_list_limit(self, engine):
        """``limit`` caps the number of rows returned by ``list_skills``."""
        for index in range(5):
            engine._save_skill(DiscoveredSkill(name=f"Skill {index}"))
        limited = engine.list_skills(limit=3)
        assert len(limited) == 3
# ---------------------------------------------------------------------------
# Action clustering
# ---------------------------------------------------------------------------
class TestActionClustering:
    """Grouping of log entries into sequences by ``_cluster_action_sequences``."""

    def test_empty_entries(self, engine):
        """No entries in, no sequences out."""
        clusters = engine._cluster_action_sequences([])
        assert clusters == []

    def test_single_sequence(self, engine):
        """Two tool calls 30 seconds apart end up in one sequence."""
        start = datetime.now()
        follow_up = start + timedelta(seconds=30)
        entries = [
            {"type": "tool_call", "tool": "read", "timestamp": start.isoformat()},
            {"type": "tool_call", "tool": "write", "timestamp": follow_up.isoformat()},
        ]
        clusters = engine._cluster_action_sequences(entries)
        assert len(clusters) == 1
        assert len(clusters[0]) == 2

    def test_split_by_gap(self, engine):
        """A 600-second gap with a 300-second limit yields two sequences."""
        start = datetime.now()
        much_later = start + timedelta(seconds=600)
        entries = [
            {"type": "tool_call", "tool": "read", "timestamp": start.isoformat()},
            {"type": "tool_call", "tool": "write", "timestamp": much_later.isoformat()},
        ]
        clusters = engine._cluster_action_sequences(entries, max_gap_seconds=300)
        assert len(clusters) == 2

    def test_bad_timestamps(self, engine):
        """Unparseable timestamps must not prevent sequences from being produced."""
        garbage = (("read", "not-a-date"), ("write", "also-bad"))
        entries = [
            {"type": "tool_call", "tool": tool, "timestamp": stamp}
            for tool, stamp in garbage
        ]
        clusters = engine._cluster_action_sequences(entries)
        # Only a lower bound is asserted: how the entries are split on a
        # failed parse is left to the engine.
        assert len(clusters) >= 1
# ---------------------------------------------------------------------------
# LLM response parsing
# ---------------------------------------------------------------------------
class TestLLMParsing:
    """Turning raw LLM text into a skill via ``_parse_llm_response``."""

    def test_parse_valid_json(self, engine):
        """A well-formed JSON object is mapped onto the skill's fields."""
        payload = {
            "name": "API Search",
            "description": "Searches APIs efficiently",
            "category": "research",
            "template": "1. Identify API\n2. Call endpoint",
            "confidence": 0.85,
        }
        parsed = engine._parse_llm_response(json.dumps(payload), [])
        assert parsed is not None
        assert parsed.name == "API Search"
        assert parsed.confidence == 0.85
        assert parsed.category == "research"

    def test_parse_with_markdown_fences(self, engine):
        """JSON wrapped in a Markdown ``json`` code fence is still parsed."""
        fenced_reply = '```json\n{"name": "Fenced", "confidence": 0.9}\n```'
        parsed = engine._parse_llm_response(fenced_reply, [])
        assert parsed is not None
        assert parsed.name == "Fenced"

    def test_parse_invalid_json(self, engine):
        """Text that is not JSON produces ``None``."""
        parsed = engine._parse_llm_response("not json", [])
        assert parsed is None

    def test_parse_empty(self, engine):
        """An empty response produces ``None``."""
        parsed = engine._parse_llm_response("", [])
        assert parsed is None
# ---------------------------------------------------------------------------
# Heuristic extraction
# ---------------------------------------------------------------------------
class TestHeuristicExtraction:
    """Behaviour of ``_heuristic_extraction``, exercised here without any LLM."""

    def test_extract_from_tool_calls(self, engine):
        """Repeated tool calls yield a skill named after the tool, at 0.5 confidence."""
        tools_used = ("git_commit", "git_push", "git_commit")
        actions = [
            {"type": "tool_call", "tool": tool, "result": "ok"} for tool in tools_used
        ]
        extracted = engine._heuristic_extraction(actions)
        assert extracted is not None
        assert "Git Commit" in extracted.name
        assert extracted.confidence == 0.5

    def test_extract_no_tool_calls(self, engine):
        """A sequence containing only a chat message yields no skill."""
        chatter = [{"type": "message", "role": "user", "content": "hello"}]
        extracted = engine._heuristic_extraction(chatter)
        assert extracted is None
# ---------------------------------------------------------------------------
# Deduplication
# ---------------------------------------------------------------------------
class TestDeduplication:
    """Name-based duplicate detection via ``_is_duplicate``."""

    def test_not_duplicate(self, engine):
        """With nothing saved, a new skill is not a duplicate."""
        candidate = DiscoveredSkill(name="Unique Skill")
        flagged = engine._is_duplicate(candidate)
        assert not flagged

    def test_is_duplicate(self, engine):
        """A second skill sharing a saved skill's name is flagged."""
        engine._save_skill(DiscoveredSkill(name="Duplicate Check"))
        candidate = DiscoveredSkill(name="Duplicate Check")
        flagged = engine._is_duplicate(candidate)
        assert flagged

    def test_rejected_not_duplicate(self, engine):
        """A saved skill in ``rejected`` status does not block the same name."""
        engine._save_skill(DiscoveredSkill(name="Rejected Skill", status="rejected"))
        candidate = DiscoveredSkill(name="Rejected Skill")
        flagged = engine._is_duplicate(candidate)
        assert not flagged
# ---------------------------------------------------------------------------
# Format actions
# ---------------------------------------------------------------------------
class TestFormatActions:
    """Rendering of action entries to text by ``_format_actions``."""

    def test_format_tool_call(self, engine):
        """A tool call's tool name and result both appear in the output."""
        actions = [{"type": "tool_call", "tool": "shell", "result": "output text"}]
        rendered = engine._format_actions(actions)
        for fragment in ("shell", "output text"):
            assert fragment in rendered

    def test_format_message(self, engine):
        """A message's content appears in the output."""
        actions = [{"type": "message", "role": "timmy", "content": "I analyzed the code"}]
        rendered = engine._format_actions(actions)
        assert "I analyzed the code" in rendered

    def test_format_decision(self, engine):
        """A decision's text appears in the output."""
        actions = [{"type": "decision", "decision": "Use async"}]
        rendered = engine._format_actions(actions)
        assert "Use async" in rendered
# ---------------------------------------------------------------------------
# Scan integration (mocked)
# ---------------------------------------------------------------------------
class TestScan:
    """Checks of ``engine.scan()`` with its collaborators mocked out.

    Log loading, the LLM router, notifications and skill-file writing are
    replaced with mocks so that no test in this class touches real services.
    """

    @staticmethod
    def _tool_calls(*steps):
        """Build ``tool_call`` log entries spaced ten seconds apart.

        Args:
            *steps: ``(tool, result)`` pairs in the order they should occur.

        Returns:
            A list of entry dicts whose ISO-format timestamps start at the
            current time and advance by ten seconds per entry.
        """
        start = datetime.now()
        return [
            {
                "type": "tool_call",
                "tool": tool,
                "result": result,
                "timestamp": (start + timedelta(seconds=10 * index)).isoformat(),
            }
            for index, (tool, result) in enumerate(steps)
        ]

    @staticmethod
    def _router(**complete_kwargs):
        """Return a stand-in router whose async ``complete`` is an AsyncMock.

        Args:
            **complete_kwargs: Forwarded to ``AsyncMock`` (for example
                ``return_value=...`` or ``side_effect=...``).
        """
        return MagicMock(complete=AsyncMock(**complete_kwargs))

    @pytest.mark.asyncio
    async def test_scan_too_few_actions(self, engine):
        """With no recent actions, the scan returns an empty list."""
        with patch.object(engine, "_load_recent_successful_actions", return_value=[]):
            result = await engine.scan()
        assert result == []

    @pytest.mark.asyncio
    async def test_scan_discovers_skill(self, engine):
        """A confident LLM answer is returned as a discovered skill."""
        entries = self._tool_calls(
            ("search", "found results"),
            ("analyze", "analysis complete"),
            ("report", "report generated"),
        )
        llm_response = json.dumps(
            {
                "name": "Research Pipeline",
                "description": "Search, analyze, and report",
                "category": "research",
                "template": "1. Search\n2. Analyze\n3. Report",
                "confidence": 0.9,
            }
        )
        with (
            patch.object(engine, "_load_recent_successful_actions", return_value=entries),
            patch(
                "infrastructure.router.cascade.get_router",
                return_value=self._router(return_value={"content": llm_response}),
            ),
            patch.object(engine, "_notify", new_callable=AsyncMock),
            patch.object(engine, "_write_skill_file"),
        ):
            result = await engine.scan()
        assert len(result) == 1
        assert result[0].name == "Research Pipeline"
        assert result[0].confidence == 0.9

    @pytest.mark.asyncio
    async def test_scan_skips_low_confidence(self, engine):
        """A skill scored below the engine threshold is dropped, with no side effects."""
        entries = self._tool_calls(("a", "ok"), ("b", "ok"))
        llm_response = json.dumps(
            {"name": "Low Conf", "confidence": 0.3, "category": "general", "template": "..."}
        )
        with (
            patch.object(engine, "_load_recent_successful_actions", return_value=entries),
            patch(
                "infrastructure.router.cascade.get_router",
                return_value=self._router(return_value={"content": llm_response}),
            ),
            # Patched like the sibling scan tests so that a regression which
            # accepted this skill could not trigger real notifications or
            # write a real skill file from a unit test.
            patch.object(engine, "_notify", new_callable=AsyncMock) as notify,
            patch.object(engine, "_write_skill_file") as write_skill_file,
        ):
            result = await engine.scan()
        assert result == []
        # NOTE(review): assumes scan() neither notifies nor writes a file for
        # a skill it skips - confirm against the engine implementation.
        notify.assert_not_awaited()
        write_skill_file.assert_not_called()

    @pytest.mark.asyncio
    async def test_scan_falls_back_to_heuristic(self, engine):
        """When the router raises, the heuristic extractor supplies the skill."""
        # Lowered so the heuristic result (asserted below as 0.5) is accepted.
        engine.confidence_threshold = 0.4
        entries = self._tool_calls(("deploy", "ok"), ("deploy", "ok"))
        with (
            patch.object(engine, "_load_recent_successful_actions", return_value=entries),
            patch(
                "infrastructure.router.cascade.get_router",
                return_value=self._router(side_effect=Exception("LLM unavailable")),
            ),
            patch.object(engine, "_notify", new_callable=AsyncMock),
            patch.object(engine, "_write_skill_file"),
        ):
            result = await engine.scan()
        assert len(result) == 1
        assert "Deploy" in result[0].name
        assert result[0].confidence == 0.5