This commit was merged in pull request #72.
This commit is contained in:
@@ -5,6 +5,15 @@ const logger = makeLogger("agent");
|
||||
/**
 * Outcome of a single-model evaluation of an incoming request.
 */
export interface EvalResult {
  /** Whether the request was accepted for processing. */
  accepted: boolean;
  /** Model-provided justification for the decision. */
  reason: string;
  /** "high" when the decision is clear-cut, "low" when the request is borderline or ambiguous. */
  confidence: "high" | "low";
  /** Input tokens consumed by the eval call (0 in stub mode). */
  inputTokens: number;
  /** Output tokens produced by the eval call (0 in stub mode). */
  outputTokens: number;
}
|
||||
|
||||
/**
 * Transcript and outcome of the accept/reject mini debate (see runDebate).
 */
export interface DebateResult {
  /** Argument in favor of accepting the request. */
  argFor: string;
  /** Argument against accepting the request. */
  argAgainst: string;
  /** Final verdict rendered by the synthesis (judge) call. */
  verdict: { accepted: boolean; reason: string };
  /** Total input tokens across all debate calls (0 in stub mode). */
  inputTokens: number;
  /** Total output tokens across all debate calls (0 in stub mode). */
  outputTokens: number;
}
|
||||
@@ -35,6 +44,7 @@ if (STUB_MODE) {
|
||||
// Canned evaluation returned when STUB_MODE is on — no API call is made,
// so token counts are zero and the request is always accepted.
const STUB_EVAL: EvalResult = {
  accepted: true,
  reason: "Stub: request accepted for processing.",
  confidence: "high",
  inputTokens: 0,
  outputTokens: 0,
};
|
||||
@@ -106,7 +116,8 @@ export class AgentService {
|
||||
ACCEPT if the request is: clear enough to act on, ethical, lawful, and within the capability of a general-purpose AI.
|
||||
ALWAYS ACCEPT requests about: self-hosting, running your own Timmy, open-source setup, "how do I run this myself", or any question about deploying or operating Timmy independently. These are welcomed, not edge cases.
|
||||
REJECT if the request is: harmful, illegal, unethical, incoherent, or spam.
|
||||
Respond ONLY with valid JSON: {"accepted": true, "reason": "..."} or {"accepted": false, "reason": "..."}`,
|
||||
Also assess your confidence. Use "high" if the decision is clear-cut, "low" if the request is borderline or ambiguous.
|
||||
Respond ONLY with valid JSON: {"accepted": true/false, "reason": "...", "confidence": "high"/"low"}`,
|
||||
messages: [{ role: "user", content: `Evaluate this request: ${requestText}` }],
|
||||
});
|
||||
|
||||
@@ -115,17 +126,20 @@ Respond ONLY with valid JSON: {"accepted": true, "reason": "..."} or {"accepted"
|
||||
throw new Error("Unexpected non-text response from eval model");
|
||||
}
|
||||
|
||||
let parsed: { accepted: boolean; reason: string };
|
||||
let parsed: { accepted: boolean; reason: string; confidence?: string };
|
||||
try {
|
||||
const raw = block.text!.replace(/^```(?:json)?\s*/i, "").replace(/\s*```$/, "").trim();
|
||||
parsed = JSON.parse(raw) as { accepted: boolean; reason: string };
|
||||
parsed = JSON.parse(raw) as { accepted: boolean; reason: string; confidence?: string };
|
||||
} catch {
|
||||
throw new Error(`Failed to parse eval JSON: ${block.text!}`);
|
||||
}
|
||||
|
||||
const confidence: "high" | "low" = parsed.confidence === "low" ? "low" : "high";
|
||||
|
||||
return {
|
||||
accepted: Boolean(parsed.accepted),
|
||||
reason: parsed.reason ?? "",
|
||||
confidence,
|
||||
inputTokens: message.usage.input_tokens,
|
||||
outputTokens: message.usage.output_tokens,
|
||||
};
|
||||
@@ -254,6 +268,102 @@ No hedging, no steering them back to the hosted version. The magic is meant to b
|
||||
if (block.type !== "text") return "The crystal ball is cloudy… try again.";
|
||||
return block.text!.trim();
|
||||
}
|
||||
/**
|
||||
* Run a mini debate on a borderline eval request (#21).
|
||||
* Two opposing Haiku calls argue accept vs reject, then a third synthesizes.
|
||||
* Returns the debate transcript and final verdict.
|
||||
*/
|
||||
async runDebate(
|
||||
requestText: string,
|
||||
initialPosition: "accept" | "reject",
|
||||
initialReason: string,
|
||||
onArgument?: (agent: "Beta-A" | "Beta-B", position: "accept" | "reject", argument: string) => void,
|
||||
): Promise<DebateResult> {
|
||||
if (STUB_MODE) {
|
||||
const stubFor = "Stub: This request should be accepted — it is clear and actionable.";
|
||||
const stubAgainst = "Stub: This request is ambiguous and could be problematic.";
|
||||
const stubVerdict = { accepted: true, reason: "Stub: After debate, request accepted." };
|
||||
await new Promise((r) => setTimeout(r, 200));
|
||||
onArgument?.("Beta-A", initialPosition, initialPosition === "accept" ? stubFor : stubAgainst);
|
||||
await new Promise((r) => setTimeout(r, 200));
|
||||
const opposingPosition = initialPosition === "accept" ? "reject" : "accept";
|
||||
onArgument?.("Beta-B", opposingPosition, initialPosition === "accept" ? stubAgainst : stubFor);
|
||||
await new Promise((r) => setTimeout(r, 200));
|
||||
return {
|
||||
argFor: stubFor,
|
||||
argAgainst: stubAgainst,
|
||||
verdict: stubVerdict,
|
||||
inputTokens: 0,
|
||||
outputTokens: 0,
|
||||
};
|
||||
}
|
||||
|
||||
const client = await getClient();
|
||||
let totalInput = 0;
|
||||
let totalOutput = 0;
|
||||
|
||||
// Beta-A: argues the initial position
|
||||
const betaAPosition = initialPosition;
|
||||
const betaAMsg = await client.messages.create({
|
||||
model: this.evalModel,
|
||||
max_tokens: 512,
|
||||
system: `You are Beta-A, an AI debate agent. You must argue strongly that the following request should be ${betaAPosition === "accept" ? "ACCEPTED" : "REJECTED"}. The initial evaluation said: "${initialReason}". Build a compelling 2-3 sentence argument for your position. Be specific about why.`,
|
||||
messages: [{ role: "user", content: `Request under debate: ${requestText}` }],
|
||||
});
|
||||
totalInput += betaAMsg.usage.input_tokens;
|
||||
totalOutput += betaAMsg.usage.output_tokens;
|
||||
const betaAText = betaAMsg.content[0]?.type === "text" ? betaAMsg.content[0].text! : "";
|
||||
onArgument?.("Beta-A", betaAPosition, betaAText);
|
||||
|
||||
// Beta-B: argues the opposing position
|
||||
const betaBPosition = initialPosition === "accept" ? "reject" : "accept";
|
||||
const betaBMsg = await client.messages.create({
|
||||
model: this.evalModel,
|
||||
max_tokens: 512,
|
||||
system: `You are Beta-B, an AI debate agent. You must argue strongly that the following request should be ${betaBPosition === "accept" ? "ACCEPTED" : "REJECTED"}. Beta-A argued: "${betaAText}". Counter their argument with a compelling 2-3 sentence rebuttal. Be specific.`,
|
||||
messages: [{ role: "user", content: `Request under debate: ${requestText}` }],
|
||||
});
|
||||
totalInput += betaBMsg.usage.input_tokens;
|
||||
totalOutput += betaBMsg.usage.output_tokens;
|
||||
const betaBText = betaBMsg.content[0]?.type === "text" ? betaBMsg.content[0].text! : "";
|
||||
onArgument?.("Beta-B", betaBPosition, betaBText);
|
||||
|
||||
const argFor = betaAPosition === "accept" ? betaAText : betaBText;
|
||||
const argAgainst = betaAPosition === "reject" ? betaAText : betaBText;
|
||||
|
||||
// Synthesis: third call renders the final verdict
|
||||
const synthMsg = await client.messages.create({
|
||||
model: this.evalModel,
|
||||
max_tokens: 512,
|
||||
system: `You are Beta, the final judge in a debate about whether an AI agent should accept or reject a request.
|
||||
Argument FOR accepting: "${argFor}"
|
||||
Argument AGAINST accepting: "${argAgainst}"
|
||||
Weigh both arguments carefully and render a final verdict.
|
||||
Respond ONLY with valid JSON: {"accepted": true/false, "reason": "..."}`,
|
||||
messages: [{ role: "user", content: `Request under debate: ${requestText}` }],
|
||||
});
|
||||
totalInput += synthMsg.usage.input_tokens;
|
||||
totalOutput += synthMsg.usage.output_tokens;
|
||||
|
||||
const synthBlock = synthMsg.content[0];
|
||||
let verdict = { accepted: initialPosition === "accept", reason: initialReason };
|
||||
if (synthBlock?.type === "text") {
|
||||
try {
|
||||
const raw = synthBlock.text!.replace(/^```(?:json)?\s*/i, "").replace(/\s*```$/, "").trim();
|
||||
verdict = JSON.parse(raw) as { accepted: boolean; reason: string };
|
||||
} catch {
|
||||
logger.warn("debate synthesis parse failed, using initial eval", { text: synthBlock.text });
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
argFor,
|
||||
argAgainst,
|
||||
verdict: { accepted: Boolean(verdict.accepted), reason: verdict.reason ?? "" },
|
||||
inputTokens: totalInput,
|
||||
outputTokens: totalOutput,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/** Module-level singleton instance of AgentService. */
export const agentService = new AgentService();
|
||||
|
||||
Reference in New Issue
Block a user