[claude] Agent debate on borderline eval requests (#21) (#72)

This commit was merged in pull request #72.
This commit is contained in:
2026-03-23 01:07:52 +00:00
parent 5954a2fdc0
commit 609acc8f66
9 changed files with 269 additions and 7 deletions

View File

@@ -214,6 +214,39 @@ function translateEvent(ev: BusEvent): object | null {
}
return null;
// ── Debate events (#21) ────────────────────────────────────────────────
case "debate:argument": {
void logWorldEvent(
"debate:argument",
`${ev.agent} argues to ${ev.position}: ${ev.argument.slice(0, 80)}`,
"beta",
ev.jobId,
);
return {
type: "agent_debate",
jobId: ev.jobId,
agent: ev.agent,
position: ev.position,
argument: ev.argument,
};
}
case "debate:verdict": {
void logWorldEvent(
"debate:verdict",
`Verdict: ${ev.accepted ? "accepted" : "rejected"}${ev.reason.slice(0, 80)}`,
"beta",
ev.jobId,
);
return {
type: "agent_debate",
jobId: ev.jobId,
agent: "Beta",
position: "verdict",
argument: `Final verdict: ${ev.accepted ? "ACCEPTED" : "REJECTED"}${ev.reason}`,
accepted: ev.accepted,
};
}
default:
return null;
}

View File

@@ -1,6 +1,6 @@
import { Router, type Request, type Response } from "express";
import { randomUUID, createHash } from "crypto";
import { db, jobs, invoices, type Job } from "@workspace/db";
import { db, jobs, invoices, jobDebates, type Job } from "@workspace/db";
import { eq, and } from "drizzle-orm";
import { CreateJobBody, GetJobParams } from "@workspace/api-zod";
import { lnbitsService } from "../lib/lnbits.js";
@@ -41,17 +41,74 @@ async function runEvalInBackground(
): Promise<void> {
const evalStart = Date.now();
try {
const evalResult = await agentService.evaluateRequest(request);
let evalResult = await agentService.evaluateRequest(request);
latencyHistogram.record("eval_phase", Date.now() - evalStart);
logger.info("eval result", {
jobId,
accepted: evalResult.accepted,
reason: evalResult.reason,
confidence: evalResult.confidence,
inputTokens: evalResult.inputTokens,
outputTokens: evalResult.outputTokens,
});
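// ── Borderline debate (#21) ─────────────────────────────────────────
// When the eval model reports low confidence, run a short agent debate on
// the request. The debate's verdict then replaces the original accept/reject
// decision, and the debate's token usage is added to the eval's totals.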
if (evalResult.confidence === "low") {
logger.info("borderline eval — starting debate", { jobId });
eventBus.publish({ type: "job:state", jobId, state: "evaluating" }); // keep beta thinking
const initialPosition = evalResult.accepted ? "accept" : "reject";
const debateResult = await agentService.runDebate(
request,
initialPosition as "accept" | "reject",
evalResult.reason,
(agent, position, argument) => {
eventBus.publish({ type: "debate:argument", jobId, agent, position, argument });
},
);
// Publish the final verdict event
eventBus.publish({
type: "debate:verdict",
jobId,
accepted: debateResult.verdict.accepted,
reason: debateResult.verdict.reason,
});
// Store debate transcript
try {
await db.insert(jobDebates).values({
id: randomUUID(),
jobId,
argFor: debateResult.argFor,
argAgainst: debateResult.argAgainst,
verdict: JSON.stringify(debateResult.verdict),
verdictAccepted: String(debateResult.verdict.accepted),
verdictReason: debateResult.verdict.reason,
});
} catch (dbErr) {
logger.warn("failed to store debate transcript", { jobId, err: String(dbErr) });
}
// Override the eval result with the debate's verdict
evalResult = {
accepted: debateResult.verdict.accepted,
reason: debateResult.verdict.reason,
confidence: "high", // post-debate, confidence is resolved
inputTokens: evalResult.inputTokens + debateResult.inputTokens,
outputTokens: evalResult.outputTokens + debateResult.outputTokens,
};
logger.info("debate concluded", {
jobId,
accepted: evalResult.accepted,
reason: evalResult.reason,
});
}
if (evalResult.accepted) {
const { estimatedInputTokens, estimatedOutputTokens } = pricingService.estimateRequestCost(request, agentService.workModel);
const breakdown = await pricingService.calculateWorkFeeSats(