// timmy-tower/artifacts/api-server/src/lib/moderation.ts

/**
 * moderation.ts — Event moderation queue + Timmy AI review
 *
 * Every Nostr event from a non-elite whitelisted account is held in
 * relay_event_queue with status "pending". Timmy (Claude haiku) reviews
 * pending events in a background poll loop and either auto_approves them
 * (injecting into strfry) or flags them for admin review.
 *
 * Elite accounts bypass this queue — their events are injected directly
 * from the relay policy handler.
 */

import { db, relayEventQueue, type QueueReviewer } from "@workspace/db";
import { and, eq, isNull } from "drizzle-orm";
import { makeLogger } from "./logger.js";
import { injectEvent } from "./strfry.js";

const logger = makeLogger("moderation");

// ── Stub mode (mirrors agent.ts) ─────────────────────────────────────────────

// Both the API key and the base URL must be configured for real AI review;
// if either is absent the module runs in stub mode and approves everything.
const anthropicApiKey = process.env["AI_INTEGRATIONS_ANTHROPIC_API_KEY"];
const anthropicBaseUrl = process.env["AI_INTEGRATIONS_ANTHROPIC_BASE_URL"];

const STUB_MODE = !(anthropicApiKey && anthropicBaseUrl);

// Emitted once at module load so the degraded mode is visible in the logs.
if (STUB_MODE) {
  logger.warn("no Anthropic key — moderation running in STUB mode (auto-approve all)");
}

// ── Anthropic lazy client (reuse from agent.ts pattern) ──────────────────────

/** One content block of a model reply; `text` is optional on the type. */
interface AnthropicContentBlock {
  type: string;
  text?: string;
}

/** Token accounting reported alongside a reply. */
interface AnthropicUsage {
  input_tokens: number;
  output_tokens: number;
}

/** The subset of a Messages API reply that this module reads. */
interface AnthropicMessage {
  content: AnthropicContentBlock[];
  usage: AnthropicUsage;
}

/**
 * Minimal structural type for the Anthropic client: only the
 * `messages.create` call used by this module is described.
 */
interface AnthropicLike {
  messages: {
    create(params: Record<string, unknown>): Promise<AnthropicMessage>;
  };
}

// Module-level cache: populated by the first getClient() call, reused after.
let cachedClient: AnthropicLike | null = null;

/**
 * Return the shared Anthropic client, dynamically importing the integration
 * package on first use so the import only happens when a real review is made.
 *
 * @returns The `anthropic` export of `@workspace/integrations-anthropic-ai`.
 */
async function getClient(): Promise<AnthropicLike> {
  if (!cachedClient) {
    // @ts-expect-error -- integrations-anthropic-ai exports src directly
    const { anthropic } = (await import("@workspace/integrations-anthropic-ai")) as { anthropic: AnthropicLike };
    cachedClient = anthropic;
  }
  return cachedClient;
}

// ── Moderation prompt ─────────────────────────────────────────────────────────

/**
 * System prompt for the moderation model. Assembled line by line; the joined
 * result is the exact text sent as the `system` parameter.
 */
const MODERATION_SYSTEM = [
  "You are moderating events on a sovereign Nostr relay. Your job is to approve benign content and flag anything harmful.",
  "",
  "APPROVE if the event is: a standard text note, profile update, reaction, encrypted DM, relay list, metadata update, or other typical Nostr activity.",
  "FLAG if the event is: spam, harassment, illegal content, NSFW without appropriate warnings, coordinated abuse, or clearly malicious.",
  "",
  `Respond ONLY with valid JSON: {"decision": "approve", "reason": "..."} or {"decision": "flag", "reason": "..."}`,
].join("\n");

/** Verdict of an AI review: publish the event, or hold it for an admin. */
type ModerationDecision = "flag" | "approve";

/** A verdict together with the human-readable justification for it. */
type ModerationResult = {
  decision: ModerationDecision;
  reason: string;
};

/**
 * Ask the moderation model for a verdict on a single event.
 *
 * In stub mode every event is approved without any network call. Otherwise
 * the event kind and the first 2000 characters of its content are sent to the
 * model and the JSON reply is parsed. The function fails closed: anything
 * other than an explicit `"approve"` — an unexpected reply shape, a missing
 * text body, unparseable JSON, a non-object verdict — yields `"flag"`.
 *
 * @param kind - Nostr event kind, included in the prompt.
 * @param content - Event content; only the first 2000 characters are sent.
 * @returns The decision plus the model's reason (or a synthetic reason when
 *   the reply could not be used).
 * @throws Propagates errors from loading the client or from
 *   `messages.create`; reply-parsing problems never throw.
 */
async function callClaude(kind: number, content: string): Promise<ModerationResult> {
  if (STUB_MODE) {
    return { decision: "approve", reason: "Stub: auto-approved (no Anthropic key)" };
  }

  const client = await getClient();
  // NOTE(review): `content` is untrusted relay input interpolated directly
  // into the prompt, so an event can try to talk the model into approving it.
  // Flagged events still require an admin, but approvals are automatic.
  const message = await client.messages.create({
    model: process.env["MODERATION_MODEL"] ?? "claude-haiku-4-5",
    max_tokens: 256,
    system: MODERATION_SYSTEM,
    messages: [
      {
        role: "user",
        content: `Nostr event kind ${kind}. Content: ${content.slice(0, 2000)}`,
      },
    ],
  });

  // Narrow once: a usable reply is a first block of type "text" that actually
  // carries a string body. `text` is optional on the type, so a text block
  // without it must be treated as unexpected rather than dereferenced.
  const block = message.content[0];
  const text = block?.type === "text" ? block.text : undefined;
  if (typeof text !== "string") {
    return { decision: "flag", reason: "AI returned unexpected response" };
  }

  try {
    // Trim first so a leading newline does not defeat the ^``` anchor, then
    // strip an optional Markdown code fence around the JSON.
    const raw = text
      .trim()
      .replace(/^```(?:json)?\s*/i, "")
      .replace(/\s*```$/, "")
      .trim();

    const parsed: unknown = JSON.parse(raw);
    if (typeof parsed !== "object" || parsed === null) {
      throw new TypeError("moderation verdict is not a JSON object");
    }

    const verdict = parsed as { decision?: unknown; reason?: unknown };
    // Only an explicit "approve" approves; everything else is a flag.
    const decision: ModerationDecision = verdict.decision === "approve" ? "approve" : "flag";
    // The reason is returned to the caller as a string, so never pass through a non-string value.
    const reason = typeof verdict.reason === "string" ? verdict.reason : "";
    return { decision, reason };
  } catch {
    logger.warn("moderation: failed to parse Claude response", {
      text: text.slice(0, 100),
    });
    return { decision: "flag", reason: "Failed to parse AI response" };
  }
}

// ── ModerationService ─────────────────────────────────────────────────────────

/**
 * Operations on the moderation queue (`relayEventQueue`): enqueueing events,
 * AI review of pending events, recording decisions, and the background poll.
 */
export class ModerationService {
  /**
   * Insert an event into the moderation queue with "pending" status.
   * Idempotent: if the event_id already exists, the insert is silently skipped.
   *
   * @param event - Event id, author pubkey, kind, and the raw event JSON that
   *   is later handed to strfry on approval.
   */
  async enqueue(event: {
    id: string;
    pubkey: string;
    kind: number;
    rawJson: string;
  }): Promise<void> {
    await db
      .insert(relayEventQueue)
      .values({
        eventId: event.id,
        pubkey: event.pubkey,
        kind: event.kind,
        rawEvent: event.rawJson,
        status: "pending",
      })
      .onConflictDoNothing();

    // Only id/pubkey prefixes are logged to keep log lines short.
    logger.info("moderation: event enqueued", {
      eventId: event.id.slice(0, 8),
      pubkey: event.pubkey.slice(0, 8),
      kind: event.kind,
    });
  }

  /**
   * Review a single pending event with Claude.
   * Returns "approve" (event is injected into strfry + status → auto_approved)
   * or "flag" (status stays pending — admin must decide).
   *
   * A failed AI call is treated as a flag, never as an approval. If the event
   * does not exist or is no longer pending, nothing is written and "flag" is
   * returned.
   *
   * @param eventId - Id of the queued event to review.
   * @returns The decision that was applied.
   */
  async autoReview(eventId: string): Promise<ModerationDecision> {
    const rows = await db
      .select()
      .from(relayEventQueue)
      .where(
        and(
          eq(relayEventQueue.eventId, eventId),
          eq(relayEventQueue.status, "pending"),
        ),
      )
      .limit(1);

    const row = rows[0];
    if (!row) {
      logger.warn("moderation: autoReview called on non-pending event", { eventId });
      return "flag";
    }

    // Extract the text to moderate. Malformed JSON, a non-object payload, or a
    // non-string `content` all degrade to an empty string, so the review still
    // runs on the event kind alone instead of failing on a bad payload.
    let content = "";
    try {
      const parsed = JSON.parse(row.rawEvent) as { content?: unknown } | null;
      if (typeof parsed?.content === "string") {
        content = parsed.content;
      }
    } catch {
      content = "";
    }

    let result: ModerationResult;
    try {
      result = await callClaude(row.kind, content);
    } catch (err) {
      logger.error("moderation: Claude call failed — flagging for admin review", {
        eventId: eventId.slice(0, 8),
        err,
      });
      result = { decision: "flag", reason: "AI review failed — admin review required" };
    }

    if (result.decision === "approve") {
      await this.decide(eventId, "auto_approved", result.reason, "timmy_ai");
    } else {
      // Record the reason but leave status as "pending" for the admin. The
      // status guard matters because the AI call above can take seconds: if an
      // admin decided the event in the meantime, their reviewer/reason must not
      // be overwritten by this late write.
      await db
        .update(relayEventQueue)
        .set({ reviewReason: result.reason, reviewedBy: "timmy_ai" })
        .where(
          and(
            eq(relayEventQueue.eventId, eventId),
            eq(relayEventQueue.status, "pending"),
          ),
        );

      logger.info("moderation: event flagged for admin review", {
        eventId: eventId.slice(0, 8),
        reason: result.reason,
      });
    }

    return result.decision;
  }

  /**
   * Apply a moderation decision (approve/auto_approved/rejected).
   * On approval: inject the event into strfry.
   *
   * The status is written before injection is attempted. A failed injection
   * is logged but neither rolls the status back nor throws, so callers are
   * not told about it.
   *
   * @param eventId - Id of the queued event.
   * @param status - New status to record.
   * @param reason - Stored as the review reason.
   * @param reviewedBy - Who made the decision.
   */
  async decide(
    eventId: string,
    status: "approved" | "rejected" | "auto_approved",
    reason: string,
    reviewedBy: QueueReviewer,
  ): Promise<void> {
    await db
      .update(relayEventQueue)
      .set({
        status,
        reviewedBy,
        reviewReason: reason,
        decidedAt: new Date(),
      })
      .where(eq(relayEventQueue.eventId, eventId));

    logger.info("moderation: decision recorded", {
      eventId: eventId.slice(0, 8),
      status,
      reviewedBy,
    });

    if (status === "approved" || status === "auto_approved") {
      // Re-read the stored raw event; that JSON is what gets published.
      const rows = await db
        .select({ rawEvent: relayEventQueue.rawEvent })
        .from(relayEventQueue)
        .where(eq(relayEventQueue.eventId, eventId))
        .limit(1);

      const rawEvent = rows[0]?.rawEvent;
      if (rawEvent) {
        const result = await injectEvent(rawEvent);
        if (!result.ok) {
          logger.error("moderation: strfry inject failed after approval", {
            eventId: eventId.slice(0, 8),
            error: result.error,
          });
        }
      }
    }
  }

  /**
   * Background poll: auto-review up to `limit` pending events.
   * Called every 30 seconds from the startup poll loop.
   *
   * Only events that have not been reviewed yet are selected. Flagged events
   * keep status "pending" while they wait for an admin; without the
   * `reviewedBy` filter they would be picked up and re-sent to the AI on every
   * poll, and a batch of `limit` flagged events could keep newer events from
   * ever being reviewed. Consequence: an event flagged because the AI call
   * failed is not retried automatically — it waits for an admin.
   *
   * NOTE(review): assumes `reviewedBy` is nullable and unset on freshly
   * enqueued rows (`enqueue` does not write it) — confirm against the schema.
   *
   * @param limit - Maximum number of events to review in this pass.
   */
  async processPending(limit = 10): Promise<void> {
    const rows = await db
      .select({ eventId: relayEventQueue.eventId })
      .from(relayEventQueue)
      .where(
        and(
          eq(relayEventQueue.status, "pending"),
          isNull(relayEventQueue.reviewedBy),
        ),
      )
      .limit(limit);

    if (rows.length === 0) return;

    logger.info("moderation: processing pending events", { count: rows.length });

    // Sequential on purpose: one failing event is logged and skipped without
    // aborting the rest of the batch.
    for (const { eventId } of rows) {
      try {
        await this.autoReview(eventId);
      } catch (err) {
        logger.error("moderation: poll failed for event", {
          eventId: eventId.slice(0, 8),
          err,
        });
      }
    }
  }
}

/** Shared ModerationService instance, created once when this module loads. */
export const moderationService = new ModerationService();