Co-authored-by: Claude (Opus 4.6) <claude@hermes.local> Co-committed-by: Claude (Opus 4.6) <claude@hermes.local>
368 lines
12 KiB
Python
368 lines
12 KiB
Python
"""Research triage — extract action items from research reports and file Gitea issues.
|
|
|
|
Closes the loop: research → knowledge → actionable engineering work.
|
|
|
|
The LLM extracts action items during synthesis (not post-processed), then
|
|
each item is filed as a Gitea issue with appropriate labels, source links,
|
|
and evidence from the original research.
|
|
|
|
Usage::
|
|
|
|
from timmy.research_triage import triage_research_report
|
|
|
|
results = await triage_research_report(
|
|
report="## Findings\\n...",
|
|
source_issue=946,
|
|
)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from typing import Any
|
|
|
|
import httpx
|
|
|
|
from config import settings
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
|
|
# Regex to strip markdown code fences from LLM output
|
|
# Matches an opening/closing ``` fence (optionally tagged "json") at the start
# of any line, plus trailing whitespace and one optional newline.
# NOTE(review): no function in the visible file references this pattern —
# confirm whether it is still needed before relying on or removing it.
_FENCE_RE = re.compile(r"^```(?:json)?\s*\n?", re.MULTILINE)
|
|
|
|
|
|
@dataclass
class ActionItem:
    """One engineering task distilled from a research report."""

    # Issue title.
    title: str
    # Markdown text that opens the rendered issue body.
    body: str
    # Label names to attach to the issue; empty by default.
    labels: list[str] = field(default_factory=list)
    # Priority string; defaults to "medium".
    priority: str = "medium"
    # URLs listed under "Source Evidence" in the rendered issue body.
    source_urls: list[str] = field(default_factory=list)

    def to_issue_body(self, source_issue: int | None = None) -> str:
        """Render the markdown body for the Gitea issue.

        The item's own body comes first, followed by an optional
        "Source Evidence" bullet list (one bullet per source URL), an
        optional "Origin" section pointing back at ``source_issue``, and a
        fixed auto-triage footer.

        Args:
            source_issue: Number of the issue the research came from. A
                falsy value (``None`` or ``0``) omits the "Origin" section.

        Returns:
            The sections joined with newlines.
        """
        sections = [self.body]

        if self.source_urls:
            sections.append("\n### Source Evidence")
            sections.extend(f"- {url}" for url in self.source_urls)

        if source_issue:
            sections.append(f"\n### Origin\nExtracted from research in #{source_issue}")

        sections.append("\n---\n*Auto-triaged from research findings by Timmy*")
        return "\n".join(sections)
|
|
|
|
|
|
def _build_extraction_prompt(report: str) -> str:
    """Assemble the prompt asking the LLM for a JSON array of action items.

    The prompt has four parts, concatenated in order: the task statement,
    the selection rules (including the allowed label set), the per-item
    field schema, and finally the report text itself.

    Args:
        report: Research report text, embedded verbatim in the prompt.

    Returns:
        The complete prompt string, ending with ``"JSON array:"``.
    """
    task = (
        "You are triaging a research report for actionable engineering work.\n"
        "Extract 0-5 CONCRETE action items — bugs to fix, features to build,\n"
        "infrastructure to set up, or investigations to run.\n\n"
    )
    rules = (
        "Rules:\n"
        "- Only include items that map to real engineering tasks\n"
        "- Skip vague recommendations or philosophical observations\n"
        "- Each item should be specific enough to become a Gitea issue\n"
        "- Include evidence/URLs from the report in source_urls\n"
        "- Priority: high (blocking or critical), medium (important), low (nice-to-have)\n"
        "- Labels: pick from [actionable, research, bug, feature, infrastructure, "
        "performance, security, kimi-ready]\n"
        " - 'kimi-ready' means a well-scoped task suitable for an AI agent\n"
        " - 'actionable' should be on every item (these are all actionable)\n\n"
    )
    schema = (
        "For each item return:\n"
        '- "title": Clear, specific title with area prefix '
        '(e.g. "[MCP] Restore tool server with FastMCP")\n'
        '- "body": Detailed markdown body with:\n'
        " **What:** What needs to be done\n"
        " **Why:** Why this matters (link to research finding)\n"
        " **Suggested approach:** How to implement\n"
        " **Acceptance criteria:** How to verify\n"
        '- "labels": Array of label strings\n'
        '- "priority": One of high, medium, low\n'
        '- "source_urls": Array of URLs referenced in the research\n\n'
        "Return ONLY a JSON array of objects. Return [] if nothing is actionable.\n\n"
    )
    tail = f"Research report:\n{report}\n\nJSON array:"
    return task + rules + schema + tail
|
|
|
|
|
|
def _parse_llm_response(raw: str) -> list[dict[str, Any]]:
    """Parse the LLM's reply into a list of raw action-item objects.

    Handles the reply shapes models commonly produce:

    * a bare JSON array;
    * a JSON array inside a markdown code fence whose opening fence sits on
      its own line;
    * anything else that still contains an array (a fence written on a single
      line, or prose before/after the JSON) — recovered by parsing the text
      between the first ``[`` and the last ``]``.

    Args:
        raw: The LLM's text output.

    Returns:
        The decoded list, or ``[]`` when the reply is valid JSON but not a
        list (e.g. a single object).

    Raises:
        json.JSONDecodeError: If no JSON can be recovered from the reply.
    """
    cleaned = raw.strip()

    # Fenced reply: drop the opening fence line and everything from the last
    # closing fence onwards.
    if cleaned.startswith("```"):
        cleaned = cleaned.split("\n", 1)[-1].rsplit("```", 1)[0].strip()

    try:
        items = json.loads(cleaned)
    except json.JSONDecodeError:
        # The simple path failed; salvage the outermost bracketed span so a
        # chatty or oddly fenced reply does not discard every item.
        start = cleaned.find("[")
        end = cleaned.rfind("]")
        if start == -1 or end <= start:
            raise
        items = json.loads(cleaned[start : end + 1])

    if not isinstance(items, list):
        return []
    return items
|
|
|
|
|
|
def _validate_action_item(raw_item: dict[str, Any]) -> ActionItem | None:
    """Validate one decoded LLM object and convert it to an ActionItem.

    The input comes straight from ``json.loads`` on model output, so every
    field may be missing, ``null``, or of the wrong type. Nothing here raises
    for such input: unusable required fields reject the item, unusable
    optional fields fall back to defaults. The input dict is never mutated.

    Args:
        raw_item: A decoded JSON value, expected to be a dict with ``title``,
            ``body``, ``labels``, ``priority`` and ``source_urls`` keys.

    Returns:
        An ActionItem, or ``None`` when the value is not a dict, the title is
        not a string of at least 10 characters, or the body is not a string
        of at least 20 characters (both measured after stripping).
    """
    if not isinstance(raw_item, dict):
        return None

    title = raw_item.get("title")
    body = raw_item.get("body")
    # A JSON null/number/object here is not usable text; reject the item
    # instead of crashing on .strip().
    if not isinstance(title, str) or not isinstance(body, str):
        return None

    title = title.strip()
    body = body.strip()
    if len(title) < 10 or len(body) < 20:
        return None

    # Labels may arrive as a list or as one comma-separated string.
    raw_labels = raw_item.get("labels", [])
    if isinstance(raw_labels, str):
        raw_labels = raw_labels.split(",")
    elif not isinstance(raw_labels, list):
        raw_labels = []
    # Building a new list keeps the caller's data untouched; non-string and
    # blank entries are dropped.
    labels = [lbl.strip() for lbl in raw_labels if isinstance(lbl, str) and lbl.strip()]

    # Ensure 'actionable' label is always present
    if "actionable" not in labels:
        labels.insert(0, "actionable")
    # Remove duplicates while keeping first-seen order.
    labels = list(dict.fromkeys(labels))

    priority = raw_item.get("priority", "medium")
    priority = priority.strip().lower() if isinstance(priority, str) else "medium"
    if priority not in ("high", "medium", "low"):
        priority = "medium"

    # Accept a single URL string as well as a list of URLs.
    raw_urls = raw_item.get("source_urls", [])
    if isinstance(raw_urls, str):
        raw_urls = [raw_urls]
    elif not isinstance(raw_urls, list):
        raw_urls = []
    source_urls = [url.strip() for url in raw_urls if isinstance(url, str) and url.strip()]

    return ActionItem(
        title=title,
        body=body,
        labels=labels,
        priority=priority,
        source_urls=source_urls,
    )
|
|
|
|
|
|
async def extract_action_items(
    report: str,
    llm_caller: Any | None = None,
) -> list[ActionItem]:
    """Extract actionable engineering items from a research report.

    Builds the extraction prompt, sends it to the LLM, parses the JSON reply
    and validates each returned object. Every failure mode (LLM error,
    unparsable reply, malformed item) is logged and degrades to fewer or no
    items rather than raising.

    Args:
        report: The research report text (markdown).
        llm_caller: Optional async callable(prompt) -> str for LLM.
            When omitted, ``_call_llm`` is used.

    Returns:
        List of validated ActionItem objects (0-5 items). The cap counts
        valid items, so invalid entries early in the reply do not crowd out
        valid ones that follow.
    """
    max_items = 5  # Safety cap on how many issues one report can produce.

    if not report or not report.strip():
        return []

    prompt = _build_extraction_prompt(report)

    try:
        if llm_caller is not None:
            raw = await llm_caller(prompt)
        else:
            raw = await _call_llm(prompt)
    except Exception as exc:
        logger.warning("LLM extraction failed: %s", exc)
        return []

    if not raw:
        return []
    if not isinstance(raw, str):
        # A custom llm_caller may hand back something other than text.
        logger.warning("LLM returned %s instead of text", type(raw).__name__)
        return []
    if not raw.strip():
        return []

    try:
        raw_items = _parse_llm_response(raw)
    except (json.JSONDecodeError, ValueError) as exc:
        logger.warning("Failed to parse LLM action items: %s", exc)
        return []

    items: list[ActionItem] = []
    for raw_item in raw_items:
        if len(items) >= max_items:
            break
        try:
            item = _validate_action_item(raw_item)
        except (AttributeError, TypeError, ValueError) as exc:
            # One malformed entry must not discard the rest of the batch.
            logger.warning("Skipping malformed action item: %s", exc)
            continue
        if item is not None:
            items.append(item)

    logger.info("Extracted %d action items from research report", len(items))
    return items
|
|
|
|
|
|
async def _call_llm(prompt: str) -> str:
    """Send *prompt* to the cascade router and return the completion text.

    The router is imported inside the function, so the import happens on each
    call rather than when this module is loaded. No error handling happens
    here: a failed import or a failing ``complete`` call propagates to the
    caller.

    Args:
        prompt: Sent as a single user message with temperature 0.1.

    Returns:
        The ``"content"`` value when the router returns a dict, otherwise
        ``str(result)``. A missing or ``None`` result/content yields ``""`` so
        callers always receive a string.
    """
    from infrastructure.router import get_router

    router = get_router()
    messages = [{"role": "user", "content": prompt}]
    result = await router.complete(messages=messages, temperature=0.1)

    if result is None:
        return ""
    if isinstance(result, dict):
        content = result.get("content")
        if content is None:
            return ""
        return content if isinstance(content, str) else str(content)
    return str(result)
|
|
|
|
|
|
async def create_gitea_issue(
    item: ActionItem,
    source_issue: int | None = None,
) -> dict[str, Any] | None:
    """Create a Gitea issue from an ActionItem via the REST API.

    Every failure path returns ``None`` and logs instead of raising: Gitea
    disabled or no token, a repo setting that is not ``owner/repo``, a
    transport error, or a non-2xx response. Labels are resolved to IDs first;
    if none resolve, the issue is filed without labels.

    Args:
        item: The action item to file.
        source_issue: Parent research issue number to link back to.

    Returns:
        The created issue dict from Gitea API, or None on failure.
    """
    if not settings.gitea_enabled or not settings.gitea_token:
        logger.debug("Gitea not configured — skipping issue creation")
        return None

    # partition() never raises, so a malformed setting is reported here
    # instead of escaping as an unpacking ValueError.
    owner, sep, repo = str(settings.gitea_repo or "").partition("/")
    if not (sep and owner and repo):
        logger.warning(
            "Invalid Gitea repo setting %r (expected 'owner/repo')",
            settings.gitea_repo,
        )
        return None

    # Tolerate a trailing slash on the configured base URL.
    base_url = str(settings.gitea_url).rstrip("/")
    api_url = f"{base_url}/api/v1/repos/{owner}/{repo}/issues"

    payload: dict[str, Any] = {
        "title": item.title,
        "body": item.to_issue_body(source_issue=source_issue),
    }

    # Resolve label names to IDs
    label_ids = await _resolve_label_ids(item.labels, owner, repo)
    if label_ids:
        payload["labels"] = label_ids

    try:
        async with httpx.AsyncClient(timeout=15) as client:
            resp = await client.post(
                api_url,
                headers={
                    "Authorization": f"token {settings.gitea_token}",
                    "Content-Type": "application/json",
                },
                json=payload,
            )

        if resp.status_code in (200, 201):
            issue_data = resp.json()
            logger.info(
                "Created Gitea issue #%s: %s",
                issue_data.get("number", "?"),
                item.title[:60],
            )
            return issue_data

        logger.warning(
            "Gitea issue creation failed (HTTP %s): %s",
            resp.status_code,
            resp.text[:200],
        )
        return None

    except (httpx.TransportError, ConnectionError) as exc:
        # TransportError covers connect/read failures and timeouts alike.
        logger.warning("Gitea connection failed: %s", exc)
        return None
    except Exception as exc:
        logger.error("Unexpected error creating Gitea issue: %s", exc)
        return None
|
|
|
|
|
|
async def _resolve_label_ids(
    label_names: list[str],
    owner: str,
    repo: str,
) -> list[int]:
    """Resolve label names to Gitea label IDs, creating missing labels.

    The repository's label listing is read page by page until every requested
    name has been found or a page adds nothing new; reading only the first
    page would make labels on later pages look missing. Names still unknown
    after that are created with a default colour. A label whose creation
    fails is skipped.

    Args:
        label_names: Label names to resolve, in the order wanted.
        owner: Repository owner.
        repo: Repository name.

    Returns:
        Label IDs for the issue payload, without duplicates. ``[]`` when no
        names are given, when listing labels fails, or on any exception —
        label resolution is best-effort and must not block issue creation.
    """
    if not label_names:
        return []

    page_size = 50
    max_pages = 20  # Hard stop so a misbehaving server cannot loop us forever.

    # Tolerate a trailing slash on the configured base URL.
    base_url = str(settings.gitea_url).rstrip("/")
    labels_url = f"{base_url}/api/v1/repos/{owner}/{repo}/labels"
    headers = {
        "Authorization": f"token {settings.gitea_token}",
        "Content-Type": "application/json",
    }

    try:
        async with httpx.AsyncClient(timeout=10) as client:
            # Fetch existing labels
            existing: dict[str, int] = {}
            for page in range(1, max_pages + 1):
                resp = await client.get(
                    labels_url,
                    headers=headers,
                    params={"page": page, "limit": page_size},
                )
                if resp.status_code != 200:
                    return []

                known_before = len(existing)
                existing.update({lbl["name"]: lbl["id"] for lbl in resp.json()})

                # Stop when this page added nothing (empty or repeated page)
                # or when everything we need is already known.
                if len(existing) == known_before:
                    break
                if all(name in existing for name in label_names):
                    break

            label_ids: list[int] = []
            for name in label_names:
                if name not in existing:
                    # Auto-create missing labels with a default color
                    create_resp = await client.post(
                        labels_url,
                        headers=headers,
                        json={"name": name, "color": "#0075ca"},
                    )
                    if create_resp.status_code not in (200, 201):
                        logger.debug(
                            "Could not create label %r (HTTP %s)",
                            name,
                            create_resp.status_code,
                        )
                        continue
                    # Remember it so a repeated name is not created twice.
                    existing[name] = create_resp.json()["id"]

                label_id = existing[name]
                if label_id not in label_ids:
                    label_ids.append(label_id)

            return label_ids

    except Exception as exc:
        logger.debug("Label resolution failed: %s", exc)
        return []
|
|
|
|
|
|
async def triage_research_report(
    report: str,
    source_issue: int | None = None,
    llm_caller: Any | None = None,
    dry_run: bool = False,
) -> list[dict[str, Any]]:
    """Run the full research → backlog pipeline for one report.

    Extracts action items from the report, then files one Gitea issue per
    item, one after another. A summary line is logged at the end, including
    for dry runs.

    Args:
        report: Research report text (markdown).
        source_issue: The Gitea issue number that produced this research.
        llm_caller: Optional async callable(prompt) -> str for LLM calls.
        dry_run: If True, extract items but don't create issues.

    Returns:
        One dict per extracted item with keys 'action_item' (the ActionItem)
        and 'gitea_issue' (the created issue dict, or None when the item was
        not filed). Empty list when nothing was extracted.
    """
    action_items = await extract_action_items(report, llm_caller=llm_caller)
    if not action_items:
        logger.info("No action items extracted from research report")
        return []

    outcomes: list[dict[str, Any]] = []
    for action in action_items:
        # Dry runs report what would be filed without contacting Gitea.
        filed = None if dry_run else await create_gitea_issue(action, source_issue=source_issue)
        outcomes.append({"action_item": action, "gitea_issue": filed})

    filed_total = len([entry for entry in outcomes if entry["gitea_issue"] is not None])
    logger.info(
        "Research triage complete: %d items extracted, %d issues created",
        len(outcomes),
        filed_total,
    )
    return outcomes
|