From 358839626370dd192973d01a2bd404336de4a4ec Mon Sep 17 00:00:00 2001
From: Daniel Sateler
Date: Mon, 2 Mar 2026 16:34:49 -0300
Subject: [PATCH] =?UTF-8?q?feat(whatsapp):=20native=20media=20sending=20?=
 =?UTF-8?q?=E2=80=94=20images,=20videos,=20documents?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a /send-media endpoint to the WhatsApp bridge and corresponding
adapter methods so the agent can send files as native WhatsApp
attachments instead of plain-text URLs/paths.

- bridge.js: new POST /send-media endpoint using Baileys' native
  image/video/document/audio message types with MIME detection
- base.py: add send_video(), send_document(), send_image_file() with
  text fallbacks; route MEDIA: tags by file extension instead of always
  treating them as voice messages
- whatsapp.py: implement all media methods via a shared
  _send_media_to_bridge() helper; override send_image() to download
  URLs to local cache and send as native photos
- prompt_builder.py: update WhatsApp and Telegram platform hints so the
  agent knows it can use MEDIA:/path tags to send native media
---
Review note: the base.py, whatsapp.py and bridge.js hunks were amended
during review, so the index blob ids for those files are stale.

 agent/prompt_builder.py           |  16 ++++-
 gateway/platforms/base.py         | 101 ++++++++++++++++++++++++++----
 gateway/platforms/whatsapp.py     | 116 ++++++++++++++++++++++++++++++++-
 scripts/whatsapp-bridge/bridge.js |  77 ++++++++++++++++++++-
 4 files changed, 295 insertions(+), 15 deletions(-)

diff --git a/agent/prompt_builder.py b/agent/prompt_builder.py
index 24c26ef86..448b1b83f 100644
--- a/agent/prompt_builder.py
+++ b/agent/prompt_builder.py
@@ -90,11 +90,23 @@ SKILLS_GUIDANCE = (
 PLATFORM_HINTS = {
     "whatsapp": (
         "You are on a text messaging communication platform, WhatsApp. "
-        "Please do not use markdown as it does not render."
+        "Please do not use markdown as it does not render. "
+        "You can send media files natively: to deliver a file to the user, "
+        "include MEDIA:/absolute/path/to/file in your response. The file "
+        "will be sent as a native WhatsApp attachment — images (.jpg, .png, "
+        ".webp) appear as photos, videos (.mp4, .mov) play inline, and other "
+        "files arrive as downloadable documents. You can also include image "
+        "URLs in markdown format ![alt](url) and they will be sent as photos."
     ),
     "telegram": (
         "You are on a text messaging communication platform, Telegram. "
-        "Please do not use markdown as it does not render."
+        "Please do not use markdown as it does not render. "
+        "You can send media files natively: to deliver a file to the user, "
+        "include MEDIA:/absolute/path/to/file in your response. Images "
+        "(.jpg, .png) appear as photos, videos (.mp4) play inline, audio "
+        "(.ogg) sends as voice bubbles, and other files as documents. You "
+        "can also include image URLs in markdown format ![alt](url) and they "
+        "will be sent as native photos."
     ),
     "discord": (
         "You are in a Discord server or group chat communicating with your user."
diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py
index b2fd79df8..68cc90128 100644
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@@ -509,7 +509,63 @@ class BasePlatformAdapter(ABC):
         if caption:
             text = f"{caption}\n{text}"
         return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
-    
+
+    async def send_video(
+        self,
+        chat_id: str,
+        video_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """
+        Send a video natively via the platform API.
+
+        Override in subclasses to send videos as inline playable media.
+        Default falls back to sending the file path as text.
+        """
+        text = f"🎬 Video: {video_path}"
+        if caption:
+            text = f"{caption}\n{text}"
+        return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
+
+    async def send_document(
+        self,
+        chat_id: str,
+        file_path: str,
+        caption: Optional[str] = None,
+        file_name: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """
+        Send a document/file natively via the platform API.
+
+        Override in subclasses to send files as downloadable attachments.
+        Default falls back to sending the file path as text.
+        """
+        text = f"📎 File: {file_path}"
+        if caption:
+            text = f"{caption}\n{text}"
+        return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
+
+    async def send_image_file(
+        self,
+        chat_id: str,
+        image_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """
+        Send a local image file natively via the platform API.
+
+        Unlike send_image() which takes a URL, this takes a local file path.
+        Override in subclasses for native photo attachments.
+        Default falls back to sending the file path as text.
+        """
+        text = f"🖼️ Image: {image_path}"
+        if caption:
+            text = f"{caption}\n{text}"
+        return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
+
     @staticmethod
     def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
         """
@@ -676,19 +732,42 @@ class BasePlatformAdapter(ABC):
                     except Exception as img_err:
                         print(f"[{self.name}] Error sending image: {img_err}")
 
-                # Send extracted audio/voice files as native attachments
-                for audio_path, is_voice in media_files:
+                # Send extracted media files — route by file type
+                from pathlib import Path as _Path
+                _AUDIO_EXTS = {'.ogg', '.opus', '.mp3', '.wav', '.m4a'}
+                _VIDEO_EXTS = {'.mp4', '.mov', '.avi', '.mkv', '.3gp'}
+                _IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
+
+                for media_path, is_voice in media_files:
                     if human_delay > 0:
                         await asyncio.sleep(human_delay)
                     try:
-                        voice_result = await self.send_voice(
-                            chat_id=event.source.chat_id,
-                            audio_path=audio_path,
-                        )
-                        if not voice_result.success:
-                            print(f"[{self.name}] Failed to send voice: {voice_result.error}")
-                    except Exception as voice_err:
-                        print(f"[{self.name}] Error sending voice: {voice_err}")
+                        ext = _Path(media_path).suffix.lower()
+                        if ext in _AUDIO_EXTS:
+                            media_result = await self.send_voice(
+                                chat_id=event.source.chat_id,
+                                audio_path=media_path,
+                            )
+                        elif ext in _VIDEO_EXTS:
+                            media_result = await self.send_video(
+                                chat_id=event.source.chat_id,
+                                video_path=media_path,
+                            )
+                        elif ext in _IMAGE_EXTS:
+                            media_result = await self.send_image_file(
+                                chat_id=event.source.chat_id,
+                                image_path=media_path,
+                            )
+                        else:
+                            media_result = await self.send_document(
+                                chat_id=event.source.chat_id,
+                                file_path=media_path,
+                            )
+
+                        if not media_result.success:
+                            print(f"[{self.name}] Failed to send media ({ext}): {media_result.error}")
+                    except Exception as media_err:
+                        print(f"[{self.name}] Error sending media: {media_err}")
 
                 # Check if there's a pending message that was queued during our processing
                 if session_key in self._pending_messages:
diff --git a/gateway/platforms/whatsapp.py b/gateway/platforms/whatsapp.py
index eb0d6f1b5..c233f5ff4 100644
--- a/gateway/platforms/whatsapp.py
+++ b/gateway/platforms/whatsapp.py
@@ -281,7 +281,121 @@ class WhatsAppAdapter(BasePlatformAdapter):
                         )
         except Exception as e:
             return SendResult(success=False, error=str(e))
-    
+
+    async def _send_media_to_bridge(
+        self,
+        chat_id: str,
+        file_path: str,
+        media_type: str,
+        caption: Optional[str] = None,
+        file_name: Optional[str] = None,
+    ) -> SendResult:
+        """
+        Send a local file via the bridge's /send-media endpoint.
+
+        media_type is one of "image", "video", "audio" or "document".
+        Returns a failed SendResult (never raises) when the adapter is not
+        running, the file is missing, or the bridge reports an error.
+        """
+        if not self._running:
+            return SendResult(success=False, error="Not connected")
+        try:
+            import aiohttp
+
+            # The bridge is a separate Node process whose working directory
+            # may differ from ours, so always hand it an absolute path.
+            abs_path = os.path.abspath(file_path)
+            if not os.path.isfile(abs_path):
+                return SendResult(success=False, error=f"File not found: {file_path}")
+
+            payload: Dict[str, Any] = {
+                "chatId": chat_id,
+                "filePath": abs_path,
+                "mediaType": media_type,
+            }
+            if caption:
+                payload["caption"] = caption
+            if file_name:
+                payload["fileName"] = file_name
+
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    f"http://localhost:{self._bridge_port}/send-media",
+                    json=payload,
+                    timeout=aiohttp.ClientTimeout(total=120),
+                ) as resp:
+                    if resp.status == 200:
+                        data = await resp.json()
+                        return SendResult(
+                            success=True,
+                            message_id=data.get("messageId"),
+                            raw_response=data,
+                        )
+                    else:
+                        error = await resp.text()
+                        return SendResult(success=False, error=error)
+
+        except Exception as e:
+            return SendResult(success=False, error=str(e))
+
+    async def send_image(
+        self,
+        chat_id: str,
+        image_url: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """
+        Download an image URL to the local cache and send it as a native photo.
+
+        Falls back to the base-class send_image() when the download raises
+        or the bridge reports a failure, so a bridge-side error still
+        leaves the user with the base adapter's rendering of the image.
+        """
+        try:
+            local_path = await cache_image_from_url(image_url)
+            result = await self._send_media_to_bridge(chat_id, local_path, "image", caption)
+            if result.success:
+                return result
+            print(f"[{self.name}] Native image send failed: {result.error}")
+        except Exception as e:
+            print(f"[{self.name}] Could not cache image for native send: {e}")
+        return await super().send_image(chat_id, image_url, caption, reply_to)
+
+    async def send_image_file(
+        self,
+        chat_id: str,
+        image_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """Send a local image file natively via bridge."""
+        return await self._send_media_to_bridge(chat_id, image_path, "image", caption)
+
+    async def send_video(
+        self,
+        chat_id: str,
+        video_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """Send a video natively via bridge — plays inline in WhatsApp."""
+        return await self._send_media_to_bridge(chat_id, video_path, "video", caption)
+
+    async def send_document(
+        self,
+        chat_id: str,
+        file_path: str,
+        caption: Optional[str] = None,
+        file_name: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """Send a document/file as a downloadable attachment via bridge."""
+        return await self._send_media_to_bridge(
+            chat_id, file_path, "document", caption,
+            file_name or os.path.basename(file_path),
+        )
+
     async def send_typing(self, chat_id: str) -> None:
         """Send typing indicator via bridge."""
         if not self._running:
diff --git a/scripts/whatsapp-bridge/bridge.js b/scripts/whatsapp-bridge/bridge.js
index 48e4d880b..d3375f154 100644
--- a/scripts/whatsapp-bridge/bridge.js
+++ b/scripts/whatsapp-bridge/bridge.js
@@ -8,6 +8,7 @@
  * Endpoints (matches gateway/platforms/whatsapp.py expectations):
  *   GET  /messages       - Long-poll for new incoming messages
  *   POST /send           - Send a message { chatId, message, replyTo? }
+ *   POST /send-media     - Send media natively { chatId, filePath, mediaType?, caption?, fileName? }
  *   POST /typing         - Send typing indicator { chatId }
  *   GET  /chat/:id       - Get chat info
  *   GET  /health         - Health check
@@ -21,7 +22,7 @@ import express from 'express';
 import { Boom } from '@hapi/boom';
 import pino from 'pino';
 import path from 'path';
-import { mkdirSync } from 'fs';
+import { mkdirSync, readFileSync, existsSync } from 'fs';
 import qrcode from 'qrcode-terminal';
 
 // Parse CLI args
@@ -210,6 +211,80 @@ app.post('/send', async (req, res) => {
   }
 });
 
+// MIME type map and media type inference for /send-media
+const MIME_MAP = {
+  jpg: 'image/jpeg', jpeg: 'image/jpeg', png: 'image/png',
+  webp: 'image/webp', gif: 'image/gif',
+  mp4: 'video/mp4', mov: 'video/quicktime', avi: 'video/x-msvideo',
+  mkv: 'video/x-matroska', '3gp': 'video/3gpp',
+  ogg: 'audio/ogg; codecs=opus', opus: 'audio/ogg; codecs=opus',
+  mp3: 'audio/mpeg', wav: 'audio/wav', m4a: 'audio/mp4',
+  pdf: 'application/pdf',
+  doc: 'application/msword',
+  docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+  xlsx: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+};
+
+function inferMediaType(ext) {
+  if (['jpg', 'jpeg', 'png', 'webp', 'gif'].includes(ext)) return 'image';
+  if (['mp4', 'mov', 'avi', 'mkv', '3gp'].includes(ext)) return 'video';
+  if (['ogg', 'opus', 'mp3', 'wav', 'm4a'].includes(ext)) return 'audio';
+  return 'document';
+}
+
+// Send media (image, video, audio, document) natively
+app.post('/send-media', async (req, res) => {
+  if (!sock || connectionState !== 'connected') {
+    return res.status(503).json({ error: 'Not connected to WhatsApp' });
+  }
+
+  const { chatId, filePath, mediaType, caption, fileName } = req.body;
+  if (!chatId || !filePath) {
+    return res.status(400).json({ error: 'chatId and filePath are required' });
+  }
+
+  try {
+    if (!existsSync(filePath)) {
+      return res.status(404).json({ error: `File not found: ${filePath}` });
+    }
+
+    const buffer = readFileSync(filePath);
+    // path.extname() yields '' for extension-less files instead of the whole path
+    const ext = path.extname(filePath).slice(1).toLowerCase();
+    const type = mediaType || inferMediaType(ext);
+    let msgPayload;
+
+    switch (type) {
+      case 'image':
+        msgPayload = { image: buffer, caption: caption || undefined, mimetype: MIME_MAP[ext] || 'image/jpeg' };
+        break;
+      case 'video':
+        msgPayload = { video: buffer, caption: caption || undefined, mimetype: MIME_MAP[ext] || 'video/mp4' };
+        break;
+      case 'audio': {
+        // Only .ogg/.opus files are flagged ptt (push-to-talk voice note)
+        const isVoiceNote = ext === 'ogg' || ext === 'opus';
+        msgPayload = { audio: buffer, mimetype: MIME_MAP[ext] || 'audio/mpeg', ptt: isVoiceNote };
+        break;
+      }
+      case 'document':
+      default:
+        msgPayload = {
+          document: buffer,
+          fileName: fileName || path.basename(filePath),
+          caption: caption || undefined,
+          mimetype: MIME_MAP[ext] || 'application/octet-stream',
+        };
+        break;
+    }
+
+    const sent = await sock.sendMessage(chatId, msgPayload);
+    res.json({ success: true, messageId: sent?.key?.id });
+  } catch (err) {
+    res.status(500).json({ error: err.message });
+  }
+});
+
 // Typing indicator
 app.post('/typing', async (req, res) => {
   if (!sock || connectionState !== 'connected') {