1
0

feat: code quality audit + autoresearch integration + infra hardening (#150)

This commit is contained in:
Alexander Whitestone
2026-03-08 12:50:44 -04:00
committed by GitHub
parent fd0ede0d51
commit ae3bb1cc21
186 changed files with 5129 additions and 3289 deletions

View File

@@ -227,11 +227,7 @@ def create_aider_tool(base_path: Path):
)
if result.returncode == 0:
return (
result.stdout
if result.stdout
else "Code changes applied successfully"
)
return result.stdout if result.stdout else "Code changes applied successfully"
else:
return f"Aider error: {result.stderr}"
except FileNotFoundError:
@@ -354,7 +350,7 @@ def consult_grok(query: str) -> str:
Grok's response text, or an error/status message.
"""
from config import settings
from timmy.backends import grok_available, get_grok_backend
from timmy.backends import get_grok_backend, grok_available
if not grok_available():
return (
@@ -385,9 +381,7 @@ def consult_grok(query: str) -> str:
ln = get_ln_backend()
sats = min(settings.grok_max_sats_per_query, 100)
inv = ln.create_invoice(sats, f"Grok query: {query[:50]}")
invoice_info = (
f"\n[Lightning invoice: {sats} sats — {inv.payment_request[:40]}...]"
)
invoice_info = f"\n[Lightning invoice: {sats} sats — {inv.payment_request[:40]}...]"
except Exception:
pass
@@ -447,7 +441,7 @@ def create_full_toolkit(base_dir: str | Path | None = None):
# Memory search and write — persistent recall across all channels
try:
from timmy.semantic_memory import memory_search, memory_write, memory_read
from timmy.semantic_memory import memory_read, memory_search, memory_write
toolkit.register(memory_search, name="memory_search")
toolkit.register(memory_write, name="memory_write")
@@ -473,6 +467,7 @@ def create_full_toolkit(base_dir: str | Path | None = None):
Task ID and confirmation that background execution has started.
"""
import asyncio
task_id = None
async def _launch():
@@ -502,11 +497,7 @@ def create_full_toolkit(base_dir: str | Path | None = None):
# System introspection - query runtime environment (sovereign self-knowledge)
try:
from timmy.tools_intro import (
get_system_info,
check_ollama_health,
get_memory_status,
)
from timmy.tools_intro import check_ollama_health, get_memory_status, get_system_info
toolkit.register(get_system_info, name="get_system_info")
toolkit.register(check_ollama_health, name="check_ollama_health")
@@ -526,6 +517,60 @@ def create_full_toolkit(base_dir: str | Path | None = None):
return toolkit
def create_experiment_tools(base_dir: str | Path | None = None):
    """Build the Lab agent's toolkit for autoresearch ML experiments.

    Registers prepare_experiment / run_experiment / evaluate_result wrappers,
    plus shell and file tools so Lab can edit training code (e.g. train.py).

    Args:
        base_dir: Optional workspace root; falls back to the configured
            repo root (joined with the autoresearch workspace for experiments).

    Returns:
        A populated ``Toolkit`` named "experiment".

    Raises:
        ImportError: If the Agno tool dependencies are unavailable.
    """
    if not _AGNO_TOOLS_AVAILABLE:
        raise ImportError(f"Agno tools not available: {_ImportError}")

    from config import settings
    from timmy.autoresearch import evaluate_result, prepare_experiment, run_experiment

    tk = Toolkit(name="experiment")

    # Experiment workspace: explicit base_dir wins, otherwise the configured
    # autoresearch workspace under the repo root.
    if base_dir:
        work_dir = Path(base_dir)
    else:
        work_dir = Path(settings.repo_root) / settings.autoresearch_workspace

    def _prepare(repo_url: str = "https://github.com/karpathy/autoresearch.git") -> str:
        """Clone and prepare an autoresearch experiment workspace."""
        return prepare_experiment(work_dir, repo_url)

    def _run(timeout: int = 0) -> str:
        """Run a single training experiment with wall-clock timeout."""
        # timeout == 0 means "use the configured time budget".
        budget = timeout or settings.autoresearch_time_budget
        outcome = run_experiment(
            work_dir, timeout=budget, metric_name=settings.autoresearch_metric
        )
        if outcome["success"] and outcome["metric"] is not None:
            return f"{settings.autoresearch_metric}: {outcome['metric']:.4f} ({outcome['duration_s']}s)"
        return outcome.get("error") or "Experiment failed"

    def _evaluate(current: float, baseline: float) -> str:
        """Compare current metric against baseline."""
        return evaluate_result(current, baseline, metric_name=settings.autoresearch_metric)

    for fn, alias in (
        (_prepare, "prepare_experiment"),
        (_run, "run_experiment"),
        (_evaluate, "evaluate_result"),
    ):
        tk.register(fn, name=alias)

    # Also give Lab access to file + shell tools for editing train.py
    sh = ShellTools()
    tk.register(sh.run_shell_command, name="shell")

    root = Path(base_dir) if base_dir else Path(settings.repo_root)
    fs = FileTools(base_dir=root)
    for method, alias in (
        (fs.read_file, "read_file"),
        (fs.save_file, "write_file"),
        (fs.list_files, "list_files"),
    ):
        tk.register(method, name=alias)

    return tk
# Mapping of agent IDs to their toolkits
AGENT_TOOLKITS: dict[str, Callable[[], Toolkit]] = {
"echo": create_research_tools,
@@ -534,6 +579,7 @@ AGENT_TOOLKITS: dict[str, Callable[[], Toolkit]] = {
"seer": create_data_tools,
"forge": create_code_tools,
"quill": create_writing_tools,
"lab": create_experiment_tools,
"pixel": lambda base_dir=None: _create_stub_toolkit("pixel"),
"lyra": lambda base_dir=None: _create_stub_toolkit("lyra"),
"reel": lambda base_dir=None: _create_stub_toolkit("reel"),
@@ -553,9 +599,7 @@ def _create_stub_toolkit(name: str):
return toolkit
def get_tools_for_agent(
agent_id: str, base_dir: str | Path | None = None
) -> Toolkit | None:
def get_tools_for_agent(agent_id: str, base_dir: str | Path | None = None) -> Toolkit | None:
"""Get the appropriate toolkit for an agent.
Args:
@@ -643,6 +687,21 @@ def get_all_available_tools() -> dict[str, dict]:
"description": "Local AI coding assistant using Ollama (qwen2.5:14b or deepseek-coder)",
"available_in": ["forge", "orchestrator"],
},
"prepare_experiment": {
"name": "Prepare Experiment",
"description": "Clone autoresearch repo and run data preparation for ML experiments",
"available_in": ["lab", "orchestrator"],
},
"run_experiment": {
"name": "Run Experiment",
"description": "Execute a time-boxed ML training experiment and capture metrics",
"available_in": ["lab", "orchestrator"],
},
"evaluate_result": {
"name": "Evaluate Result",
"description": "Compare experiment metric against baseline to assess improvement",
"available_in": ["lab", "orchestrator"],
},
}
# ── Git tools ─────────────────────────────────────────────────────────────