# timmy-config/evaluations/crewai/poc_crew.py

#!/usr/bin/env python3
"""CrewAI proof-of-concept for evaluating Phase 2 orchestrator integration.

Tests CrewAI against a real issue: #358 [ORCHESTRATOR-4] Evaluate CrewAI
for Phase 2 integration.
"""

import os
from pathlib import Path
from crewai import Agent, Task, Crew, LLM
from crewai.tools import BaseTool

# ── Configuration ─────────────────────────────────────────────────────

# The OpenRouter credential is read from the environment only. A real key must
# never be committed as a fallback default: anyone who can read the repository
# could use it.
# NOTE(review): the key that used to be hard-coded here should be treated as
# leaked and revoked.
# When the variable is unset the value is an empty string, so this name is
# always a ``str``; calls made without a key are presumably rejected by the API.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")

# Shared model handle passed to the agents in this file: Gemini 2.0 Flash
# addressed through the OpenRouter endpoint.
_LLM_SETTINGS = {
    "model": "openrouter/google/gemini-2.0-flash-001",
    "base_url": "https://openrouter.ai/api/v1",
    "api_key": OPENROUTER_API_KEY,
}
llm = LLM(**_LLM_SETTINGS)

# Directory that relative paths given to the file-reading helper are resolved
# against: the third ancestor of this script's resolved location (two levels
# above the directory that contains it).
_THIS_FILE = Path(__file__).resolve()
REPO_ROOT = _THIS_FILE.parents[2]


def _slurp(relpath: str, max_lines: int = 150) -> str:
    """Return the leading lines of a repository file, prefixed with a header.

    Args:
        relpath: Path of the file, relative to ``REPO_ROOT``.
        max_lines: Maximum number of lines to include in the result.

    Returns:
        A header naming the file, its total line count and the number of
        lines shown, followed by those lines joined with newlines. If
        ``relpath`` does not name a regular file, the placeholder
        ``[FILE NOT FOUND: <relpath>]`` is returned instead of raising.
    """
    target = REPO_ROOT / relpath
    # is_file() rather than exists(): a directory passes exists() and would
    # then make read_text() raise.
    if not target.is_file():
        return f"[FILE NOT FOUND: {relpath}]"
    # Decode explicitly as UTF-8 so the result does not depend on the platform
    # locale; undecodable bytes are replaced instead of aborting the run.
    lines = target.read_text(encoding="utf-8", errors="replace").splitlines()
    shown = lines[:max_lines]
    # Report the number of lines actually shown so that a short file is not
    # described as if it had been truncated to ``max_lines``.
    header = f"=== {relpath} ({len(lines)} lines total, showing first {len(shown)}) ===\n"
    return header + "\n".join(shown)


# ── Tools ─────────────────────────────────────────────────────────────

class ReadOrchestratorFilesTool(BaseTool):
    """Tool that returns excerpts of the custom orchestrator's files."""

    # Identifier and human-readable summary declared for the tool.
    name: str = "read_orchestrator_files"
    description: str = (
        "Reads the current custom orchestrator implementation files "
        "(orchestration.py, tasks.py, timmy-orchestrator.sh, coordinator-first-protocol.md) "
        "and returns their contents for analysis."
    )

    def _run(self) -> str:
        """Return the four file excerpts, separated by blank lines."""
        # (relative path, extra keyword arguments for _slurp); an empty dict
        # leaves _slurp's own default line limit in effect.
        targets = (
            ("orchestration.py", {}),
            ("tasks.py", {"max_lines": 120}),
            ("bin/timmy-orchestrator.sh", {"max_lines": 120}),
            ("docs/coordinator-first-protocol.md", {"max_lines": 120}),
        )
        excerpts = [_slurp(path, **options) for path, options in targets]
        return "\n\n".join(excerpts)


class ReadIssueTool(BaseTool):
    """Tool that returns a fixed, embedded copy of the issue under evaluation."""

    # Identifier and human-readable summary declared for the tool.
    name: str = "read_issue_358"
    description: str = "Returns the text of Gitea issue #358 that we are evaluating."

    def _run(self) -> str:
        """Return the issue title and body as one string (no network access)."""
        title = "Title: [ORCHESTRATOR-4] Evaluate CrewAI for Phase 2 integration\n"
        body = (
            "Body:\n"
            "Part of Epic: #354\n\n"
            "Install CrewAI, build a proof-of-concept crew with 2 agents, "
            "test on a real issue. Evaluate: does it add value over our custom orchestrator? Document findings."
        )
        return title + body


# ── Agents ────────────────────────────────────────────────────────────

# First agent: equipped with the two read-only tools defined in this file so
# it can inspect the orchestrator sources and the issue text.
_RESEARCHER_BACKSTORY = (
    "You are a systems architect who specializes in evaluating orchestration frameworks. "
    "You read code carefully, extract facts, and avoid speculation. "
    "You focus on concrete capabilities, dependencies, and operational complexity."
)
_RESEARCHER_TOOLS = [ReadOrchestratorFilesTool(), ReadIssueTool()]

researcher = Agent(
    role="Orchestration Researcher",
    goal="Gather a complete understanding of the current custom orchestrator and how CrewAI compares to it.",
    backstory=_RESEARCHER_BACKSTORY,
    tools=_RESEARCHER_TOOLS,
    llm=llm,
    verbose=True,
)

# Second agent: configured without tools of its own.
_EVALUATOR_BACKSTORY = (
    "You are a pragmatic engineering lead who values sovereignty, simplicity, and observable state. "
    "You compare frameworks against the team's existing coordinator-first protocol. "
    "You produce structured recommendations with explicit trade-offs."
)

evaluator = Agent(
    role="Integration Evaluator",
    goal="Synthesize research into a clear recommendation on whether CrewAI adds value for Phase 2.",
    backstory=_EVALUATOR_BACKSTORY,
    llm=llm,
    verbose=True,
)

# ── Tasks ─────────────────────────────────────────────────────────────

# Research task prompt: an instruction line, five numbered report sections and
# a closing style hint, one entry per line of the final prompt.
_RESEARCH_PROMPT_LINES = [
    "Read the current custom orchestrator files and issue #358. "
    "Produce a structured research report covering:",
    "1. Current stack summary (Huey + tasks.py + timmy-orchestrator.sh)",
    "2. Current strengths (sovereignty, local-first, Gitea as truth, simplicity)",
    "3. Current gaps or limitations (if any)",
    "4. What CrewAI offers (agent roles, tasks, crews, tools, memory/RAG)",
    "5. CrewAI's dependencies and operational footprint (what you observed during installation)",
    "Be factual and concise.",
]

task_research = Task(
    description="\n".join(_RESEARCH_PROMPT_LINES),
    expected_output="A structured markdown research report with the 5 sections above.",
    agent=researcher,
)

# Evaluation task prompt: the framing paragraph, a blank line, then the list
# of fields the final answer must contain, one entry per line of the prompt.
_EVALUATION_PROMPT_LINES = [
    "Using the research report, evaluate whether CrewAI should be adopted for Phase 2 integration. "
    "Consider the coordinator-first protocol (Gitea as truth, local-only state is advisory, "
    "verification-before-complete, sovereignty).",
    "",
    "Produce a final evaluation with:",
    "- VERDICT: Adopt / Reject / Defer",
    "- Confidence: High / Medium / Low",
    "- Key trade-offs (3-5 bullets)",
    "- Risks if adopted",
    "- Recommended next step",
]

task_evaluate = Task(
    description="\n".join(_EVALUATION_PROMPT_LINES),
    expected_output="A structured markdown evaluation with verdict, confidence, trade-offs, risks, and recommendation.",
    agent=evaluator,
    # The research task is supplied as context for this task.
    context=[task_research],
)

# ── Crew ──────────────────────────────────────────────────────────────

# Assemble the two agents and their two tasks; the research task is listed
# before the evaluation task.
_CREW_AGENTS = [researcher, evaluator]
_CREW_TASKS = [task_research, task_evaluate]

crew = Crew(
    tasks=_CREW_TASKS,
    agents=_CREW_AGENTS,
    verbose=True,
)

if __name__ == "__main__":
    # Script entry point: print a banner, run the crew, then print the raw
    # text of its result under a second banner.
    rule = "=" * 70
    print(rule)
    print("CrewAI PoC — Evaluating CrewAI for Phase 2 Integration")
    print(rule)
    outcome = crew.kickoff()
    print(f"\n{rule}")
    print("FINAL OUTPUT")
    print(rule)
    print(outcome.raw)