Files
hermes-agent/scripts/whatsapp-bridge/bridge.js
Teknium c6f4515f73 fix(whatsapp): download documents, audio, and video media from messages (#2978)
Add downloadMediaMessage() calls for documents, audio/voice notes, and
video in bridge.js — previously only images were downloaded, leaving all
other file types inaccessible to the agent.

Handle local file paths from the bridge for DOCUMENT, VOICE, and VIDEO
types in whatsapp.py with proper MIME detection. Inject text content
inline for readable files (.txt, .md, .csv, .json, etc.).

Follow-up fixes applied during salvage:
- Remove unused cache_document_from_bytes import
- Add 100KB size cap on text injection (matches Telegram/Discord/Slack)
- Align injection format with other platforms

Cherry-picked from PR #2818. Also fixes #2856 (bugs 1 & 2).
PR #2865 by ayberkesn fixed the same voice note issue.

Co-authored-by: noestelar <hola@noeali.com>
2026-03-25 08:37:28 -07:00

527 lines
19 KiB
JavaScript

#!/usr/bin/env node
/**
* Hermes Agent WhatsApp Bridge
*
* Standalone Node.js process that connects to WhatsApp via Baileys
* and exposes HTTP endpoints for the Python gateway adapter.
*
* Endpoints (matches gateway/platforms/whatsapp.py expectations):
* GET /messages - Long-poll for new incoming messages
* POST /send - Send a message { chatId, message, replyTo? }
* POST /edit - Edit a sent message { chatId, messageId, message }
* POST /send-media - Send media natively { chatId, filePath, mediaType?, caption?, fileName? }
* POST /typing - Send typing indicator { chatId }
* GET /chat/:id - Get chat info
* GET /health - Health check
*
* Usage:
* node bridge.js --port 3000 --session ~/.hermes/whatsapp/session
*/
import { makeWASocket, useMultiFileAuthState, DisconnectReason, fetchLatestBaileysVersion, downloadMediaMessage } from '@whiskeysockets/baileys';
import express from 'express';
import { Boom } from '@hapi/boom';
import pino from 'pino';
import path from 'path';
import { mkdirSync, readFileSync, writeFileSync, existsSync, readdirSync } from 'fs';
import { randomBytes } from 'crypto';
import qrcode from 'qrcode-terminal';
// Parse CLI args
const args = process.argv.slice(2);
function getArg(name, defaultVal) {
const idx = args.indexOf(`--${name}`);
return idx !== -1 && args[idx + 1] ? args[idx + 1] : defaultVal;
}
const WHATSAPP_DEBUG =
typeof process !== 'undefined' &&
process.env &&
typeof process.env.WHATSAPP_DEBUG === 'string' &&
['1', 'true', 'yes', 'on'].includes(process.env.WHATSAPP_DEBUG.toLowerCase());
const PORT = parseInt(getArg('port', '3000'), 10);
const SESSION_DIR = getArg('session', path.join(process.env.HOME || '~', '.hermes', 'whatsapp', 'session'));
const IMAGE_CACHE_DIR = path.join(process.env.HOME || '~', '.hermes', 'image_cache');
const DOCUMENT_CACHE_DIR = path.join(process.env.HOME || '~', '.hermes', 'document_cache');
const AUDIO_CACHE_DIR = path.join(process.env.HOME || '~', '.hermes', 'audio_cache');
const PAIR_ONLY = args.includes('--pair-only');
const WHATSAPP_MODE = getArg('mode', process.env.WHATSAPP_MODE || 'self-chat'); // "bot" or "self-chat"
const ALLOWED_USERS = (process.env.WHATSAPP_ALLOWED_USERS || '').split(',').map(s => s.trim()).filter(Boolean);
const DEFAULT_REPLY_PREFIX = '⚕ *Hermes Agent*\n────────────\n';
const REPLY_PREFIX = process.env.WHATSAPP_REPLY_PREFIX === undefined
? DEFAULT_REPLY_PREFIX
: process.env.WHATSAPP_REPLY_PREFIX.replace(/\\n/g, '\n');
function formatOutgoingMessage(message) {
return REPLY_PREFIX ? `${REPLY_PREFIX}${message}` : message;
}
mkdirSync(SESSION_DIR, { recursive: true });
// Build LID → phone reverse map from session files (lid-mapping-{phone}.json)
function buildLidMap() {
const map = {};
try {
for (const f of readdirSync(SESSION_DIR)) {
const m = f.match(/^lid-mapping-(\d+)\.json$/);
if (!m) continue;
const phone = m[1];
const lid = JSON.parse(readFileSync(path.join(SESSION_DIR, f), 'utf8'));
if (lid) map[String(lid)] = phone;
}
} catch {}
return map;
}
let lidToPhone = buildLidMap();
const logger = pino({ level: 'warn' });
// Message queue for polling
const messageQueue = [];
const MAX_QUEUE_SIZE = 100;
// Track recently sent message IDs to prevent echo-back loops with media
const recentlySentIds = new Set();
const MAX_RECENT_IDS = 50;
let sock = null;
let connectionState = 'disconnected';
async function startSocket() {
const { state, saveCreds } = await useMultiFileAuthState(SESSION_DIR);
const { version } = await fetchLatestBaileysVersion();
sock = makeWASocket({
version,
auth: state,
logger,
printQRInTerminal: false,
browser: ['Hermes Agent', 'Chrome', '120.0'],
syncFullHistory: false,
markOnlineOnConnect: false,
// Required for Baileys 7.x: without this, incoming messages that need
// E2EE session re-establishment are silently dropped (msg.message === null)
getMessage: async (key) => {
// We don't maintain a message store, so return a placeholder.
// This is enough for Baileys to complete the retry handshake.
return { conversation: '' };
},
});
sock.ev.on('creds.update', () => { saveCreds(); lidToPhone = buildLidMap(); });
sock.ev.on('connection.update', (update) => {
const { connection, lastDisconnect, qr } = update;
if (qr) {
console.log('\n📱 Scan this QR code with WhatsApp on your phone:\n');
qrcode.generate(qr, { small: true });
console.log('\nWaiting for scan...\n');
}
if (connection === 'close') {
const reason = new Boom(lastDisconnect?.error)?.output?.statusCode;
connectionState = 'disconnected';
if (reason === DisconnectReason.loggedOut) {
console.log('❌ Logged out. Delete session and restart to re-authenticate.');
process.exit(1);
} else {
// 515 = restart requested (common after pairing). Always reconnect.
if (reason === 515) {
console.log('↻ WhatsApp requested restart (code 515). Reconnecting...');
} else {
console.log(`⚠️ Connection closed (reason: ${reason}). Reconnecting in 3s...`);
}
setTimeout(startSocket, reason === 515 ? 1000 : 3000);
}
} else if (connection === 'open') {
connectionState = 'connected';
console.log('✅ WhatsApp connected!');
if (PAIR_ONLY) {
console.log('✅ Pairing complete. Credentials saved.');
// Give Baileys a moment to flush creds, then exit cleanly
setTimeout(() => process.exit(0), 2000);
}
}
});
sock.ev.on('messages.upsert', async ({ messages, type }) => {
// In self-chat mode, your own messages commonly arrive as 'append' rather
// than 'notify'. Accept both and filter agent echo-backs below.
if (type !== 'notify' && type !== 'append') return;
for (const msg of messages) {
if (!msg.message) continue;
const chatId = msg.key.remoteJid;
if (WHATSAPP_DEBUG) {
try {
console.log(JSON.stringify({
event: 'upsert', type,
fromMe: !!msg.key.fromMe, chatId,
senderId: msg.key.participant || chatId,
messageKeys: Object.keys(msg.message || {}),
}));
} catch {}
}
const senderId = msg.key.participant || chatId;
const isGroup = chatId.endsWith('@g.us');
const senderNumber = senderId.replace(/@.*/, '');
// Handle fromMe messages based on mode
if (msg.key.fromMe) {
if (isGroup || chatId.includes('status')) continue;
if (WHATSAPP_MODE === 'bot') {
// Bot mode: separate number. ALL fromMe are echo-backs of our own replies — skip.
continue;
}
// Self-chat mode: only allow messages in the user's own self-chat
// WhatsApp now uses LID (Linked Identity Device) format: 67427329167522@lid
// AND classic format: 34652029134@s.whatsapp.net
// sock.user has both: { id: "number:10@s.whatsapp.net", lid: "lid_number:10@lid" }
const myNumber = (sock.user?.id || '').replace(/:.*@/, '@').replace(/@.*/, '');
const myLid = (sock.user?.lid || '').replace(/:.*@/, '@').replace(/@.*/, '');
const chatNumber = chatId.replace(/@.*/, '');
const isSelfChat = (myNumber && chatNumber === myNumber) || (myLid && chatNumber === myLid);
if (!isSelfChat) continue;
}
// Check allowlist for messages from others (resolve LID → phone if needed)
if (!msg.key.fromMe && ALLOWED_USERS.length > 0) {
const resolvedNumber = lidToPhone[senderNumber] || senderNumber;
if (!ALLOWED_USERS.includes(resolvedNumber)) continue;
}
// Extract message body
let body = '';
let hasMedia = false;
let mediaType = '';
const mediaUrls = [];
if (msg.message.conversation) {
body = msg.message.conversation;
} else if (msg.message.extendedTextMessage?.text) {
body = msg.message.extendedTextMessage.text;
} else if (msg.message.imageMessage) {
body = msg.message.imageMessage.caption || '';
hasMedia = true;
mediaType = 'image';
try {
const buf = await downloadMediaMessage(msg, 'buffer', {}, { logger, reuploadRequest: sock.updateMediaMessage });
const mime = msg.message.imageMessage.mimetype || 'image/jpeg';
const extMap = { 'image/jpeg': '.jpg', 'image/png': '.png', 'image/webp': '.webp', 'image/gif': '.gif' };
const ext = extMap[mime] || '.jpg';
mkdirSync(IMAGE_CACHE_DIR, { recursive: true });
const filePath = path.join(IMAGE_CACHE_DIR, `img_${randomBytes(6).toString('hex')}${ext}`);
writeFileSync(filePath, buf);
mediaUrls.push(filePath);
} catch (err) {
console.error('[bridge] Failed to download image:', err.message);
}
} else if (msg.message.videoMessage) {
body = msg.message.videoMessage.caption || '';
hasMedia = true;
mediaType = 'video';
try {
const buf = await downloadMediaMessage(msg, 'buffer', {}, { logger, reuploadRequest: sock.updateMediaMessage });
const mime = msg.message.videoMessage.mimetype || 'video/mp4';
const ext = mime.includes('mp4') ? '.mp4' : '.mkv';
mkdirSync(DOCUMENT_CACHE_DIR, { recursive: true });
const filePath = path.join(DOCUMENT_CACHE_DIR, `vid_${randomBytes(6).toString('hex')}${ext}`);
writeFileSync(filePath, buf);
mediaUrls.push(filePath);
} catch (err) {
console.error('[bridge] Failed to download video:', err.message);
}
} else if (msg.message.audioMessage || msg.message.pttMessage) {
hasMedia = true;
mediaType = msg.message.pttMessage ? 'ptt' : 'audio';
try {
const audioMsg = msg.message.pttMessage || msg.message.audioMessage;
const buf = await downloadMediaMessage(msg, 'buffer', {}, { logger, reuploadRequest: sock.updateMediaMessage });
const mime = audioMsg.mimetype || 'audio/ogg';
const ext = mime.includes('ogg') ? '.ogg' : mime.includes('mp4') ? '.m4a' : '.ogg';
mkdirSync(AUDIO_CACHE_DIR, { recursive: true });
const filePath = path.join(AUDIO_CACHE_DIR, `aud_${randomBytes(6).toString('hex')}${ext}`);
writeFileSync(filePath, buf);
mediaUrls.push(filePath);
} catch (err) {
console.error('[bridge] Failed to download audio:', err.message);
}
} else if (msg.message.documentMessage) {
body = msg.message.documentMessage.caption || '';
hasMedia = true;
mediaType = 'document';
const fileName = msg.message.documentMessage.fileName || 'document';
try {
const buf = await downloadMediaMessage(msg, 'buffer', {}, { logger, reuploadRequest: sock.updateMediaMessage });
mkdirSync(DOCUMENT_CACHE_DIR, { recursive: true });
const safeFileName = path.basename(fileName).replace(/[^a-zA-Z0-9._-]/g, '_');
const filePath = path.join(DOCUMENT_CACHE_DIR, `doc_${randomBytes(6).toString('hex')}_${safeFileName}`);
writeFileSync(filePath, buf);
mediaUrls.push(filePath);
} catch (err) {
console.error('[bridge] Failed to download document:', err.message);
}
}
// For media without caption, use a placeholder so the API message is never empty
if (hasMedia && !body) {
body = `[${mediaType} received]`;
}
// Ignore Hermes' own reply messages in self-chat mode to avoid loops.
if (msg.key.fromMe && ((REPLY_PREFIX && body.startsWith(REPLY_PREFIX)) || recentlySentIds.has(msg.key.id))) {
if (WHATSAPP_DEBUG) {
try { console.log(JSON.stringify({ event: 'ignored', reason: 'agent_echo', chatId, messageId: msg.key.id })); } catch {}
}
continue;
}
// Skip empty messages
if (!body && !hasMedia) {
if (WHATSAPP_DEBUG) {
try {
console.log(JSON.stringify({ event: 'ignored', reason: 'empty', chatId, messageKeys: Object.keys(msg.message || {}) }));
} catch (err) {
console.error('Failed to log empty message event:', err);
}
}
continue;
}
const event = {
messageId: msg.key.id,
chatId,
senderId,
senderName: msg.pushName || senderNumber,
chatName: isGroup ? (chatId.split('@')[0]) : (msg.pushName || senderNumber),
isGroup,
body,
hasMedia,
mediaType,
mediaUrls,
timestamp: msg.messageTimestamp,
};
messageQueue.push(event);
if (messageQueue.length > MAX_QUEUE_SIZE) {
messageQueue.shift();
}
}
});
}
// HTTP server
const app = express();
app.use(express.json());
// Poll for new messages (long-poll style)
app.get('/messages', (req, res) => {
const msgs = messageQueue.splice(0, messageQueue.length);
res.json(msgs);
});
// Send a message
app.post('/send', async (req, res) => {
if (!sock || connectionState !== 'connected') {
return res.status(503).json({ error: 'Not connected to WhatsApp' });
}
const { chatId, message, replyTo } = req.body;
if (!chatId || !message) {
return res.status(400).json({ error: 'chatId and message are required' });
}
try {
const sent = await sock.sendMessage(chatId, { text: formatOutgoingMessage(message) });
// Track sent message ID to prevent echo-back loops
if (sent?.key?.id) {
recentlySentIds.add(sent.key.id);
if (recentlySentIds.size > MAX_RECENT_IDS) {
recentlySentIds.delete(recentlySentIds.values().next().value);
}
}
res.json({ success: true, messageId: sent?.key?.id });
} catch (err) {
res.status(500).json({ error: err.message });
}
});
// Edit a previously sent message
app.post('/edit', async (req, res) => {
if (!sock || connectionState !== 'connected') {
return res.status(503).json({ error: 'Not connected to WhatsApp' });
}
const { chatId, messageId, message } = req.body;
if (!chatId || !messageId || !message) {
return res.status(400).json({ error: 'chatId, messageId, and message are required' });
}
try {
const key = { id: messageId, fromMe: true, remoteJid: chatId };
await sock.sendMessage(chatId, { text: formatOutgoingMessage(message), edit: key });
res.json({ success: true });
} catch (err) {
res.status(500).json({ error: err.message });
}
});
// MIME type map and media type inference for /send-media
const MIME_MAP = {
jpg: 'image/jpeg', jpeg: 'image/jpeg', png: 'image/png',
webp: 'image/webp', gif: 'image/gif',
mp4: 'video/mp4', mov: 'video/quicktime', avi: 'video/x-msvideo',
mkv: 'video/x-matroska', '3gp': 'video/3gpp',
pdf: 'application/pdf',
doc: 'application/msword',
docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
xlsx: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
};
function inferMediaType(ext) {
if (['jpg', 'jpeg', 'png', 'webp', 'gif'].includes(ext)) return 'image';
if (['mp4', 'mov', 'avi', 'mkv', '3gp'].includes(ext)) return 'video';
if (['ogg', 'opus', 'mp3', 'wav', 'm4a'].includes(ext)) return 'audio';
return 'document';
}
// Send media (image, video, document) natively
app.post('/send-media', async (req, res) => {
if (!sock || connectionState !== 'connected') {
return res.status(503).json({ error: 'Not connected to WhatsApp' });
}
const { chatId, filePath, mediaType, caption, fileName } = req.body;
if (!chatId || !filePath) {
return res.status(400).json({ error: 'chatId and filePath are required' });
}
try {
if (!existsSync(filePath)) {
return res.status(404).json({ error: `File not found: ${filePath}` });
}
const buffer = readFileSync(filePath);
const ext = filePath.toLowerCase().split('.').pop();
const type = mediaType || inferMediaType(ext);
let msgPayload;
switch (type) {
case 'image':
msgPayload = { image: buffer, caption: caption || undefined, mimetype: MIME_MAP[ext] || 'image/jpeg' };
break;
case 'video':
msgPayload = { video: buffer, caption: caption || undefined, mimetype: MIME_MAP[ext] || 'video/mp4' };
break;
case 'audio': {
const audioMime = (ext === 'ogg' || ext === 'opus') ? 'audio/ogg; codecs=opus' : 'audio/mpeg';
msgPayload = { audio: buffer, mimetype: audioMime, ptt: ext === 'ogg' || ext === 'opus' };
break;
}
case 'document':
default:
msgPayload = {
document: buffer,
fileName: fileName || path.basename(filePath),
caption: caption || undefined,
mimetype: MIME_MAP[ext] || 'application/octet-stream',
};
break;
}
const sent = await sock.sendMessage(chatId, msgPayload);
// Track sent message ID to prevent echo-back loops
if (sent?.key?.id) {
recentlySentIds.add(sent.key.id);
if (recentlySentIds.size > MAX_RECENT_IDS) {
recentlySentIds.delete(recentlySentIds.values().next().value);
}
}
res.json({ success: true, messageId: sent?.key?.id });
} catch (err) {
res.status(500).json({ error: err.message });
}
});
// Typing indicator
app.post('/typing', async (req, res) => {
if (!sock || connectionState !== 'connected') {
return res.status(503).json({ error: 'Not connected' });
}
const { chatId } = req.body;
if (!chatId) return res.status(400).json({ error: 'chatId required' });
try {
await sock.sendPresenceUpdate('composing', chatId);
res.json({ success: true });
} catch (err) {
res.json({ success: false });
}
});
// Chat info
app.get('/chat/:id', async (req, res) => {
const chatId = req.params.id;
const isGroup = chatId.endsWith('@g.us');
if (isGroup && sock) {
try {
const metadata = await sock.groupMetadata(chatId);
return res.json({
name: metadata.subject,
isGroup: true,
participants: metadata.participants.map(p => p.id),
});
} catch {
// Fall through to default
}
}
res.json({
name: chatId.replace(/@.*/, ''),
isGroup,
participants: [],
});
});
// Health check
app.get('/health', (req, res) => {
res.json({
status: connectionState,
queueLength: messageQueue.length,
uptime: process.uptime(),
});
});
// Start
if (PAIR_ONLY) {
// Pair-only mode: just connect, show QR, save creds, exit. No HTTP server.
console.log('📱 WhatsApp pairing mode');
console.log(`📁 Session: ${SESSION_DIR}`);
console.log();
startSocket();
} else {
app.listen(PORT, '127.0.0.1', () => {
console.log(`🌉 WhatsApp bridge listening on port ${PORT} (mode: ${WHATSAPP_MODE})`);
console.log(`📁 Session stored in: ${SESSION_DIR}`);
if (ALLOWED_USERS.length > 0) {
console.log(`🔒 Allowed users: ${ALLOWED_USERS.join(', ')}`);
} else {
console.log(`⚠️ No WHATSAPP_ALLOWED_USERS set — all messages will be processed`);
}
console.log();
startSocket();
});
}