From 358839626370dd192973d01a2bd404336de4a4ec Mon Sep 17 00:00:00 2001
From: Daniel Sateler <satelerd@gmail.com>
Date: Mon, 2 Mar 2026 16:34:49 -0300
Subject: [PATCH] =?UTF-8?q?feat(whatsapp):=20native=20media=20sending=20?=
 =?UTF-8?q?=E2=80=94=20images,=20videos,=20documents?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a /send-media endpoint to the WhatsApp bridge and corresponding
adapter methods so the agent can send files as native WhatsApp
attachments instead of plain-text URLs/paths.

- bridge.js: new POST /send-media endpoint using Baileys' native
  image/video/document/audio message types; the MIME type is looked
  up from the file extension
- base.py: add send_video(), send_document(), send_image_file()
  with text fallbacks; route MEDIA: tags by file extension instead
  of always treating them as voice messages
- whatsapp.py: implement send_image_file(), send_video() and
  send_document() via a shared _send_media_to_bridge() helper;
  override send_image() to download URLs to local cache and send
  them as native photos
- prompt_builder.py: update WhatsApp and Telegram platform hints so
  the agent knows it can use MEDIA:/path tags to send native media
---
 agent/prompt_builder.py           |  16 ++++-
 gateway/platforms/base.py         | 101 ++++++++++++++++++++++++++----
 gateway/platforms/whatsapp.py     |  97 +++++++++++++++++++++++++++-
 scripts/whatsapp-bridge/bridge.js |  73 ++++++++++++++++++++-
 4 files changed, 272 insertions(+), 15 deletions(-)

diff --git a/agent/prompt_builder.py b/agent/prompt_builder.py
index 24c26ef86..448b1b83f 100644
--- a/agent/prompt_builder.py
+++ b/agent/prompt_builder.py
@@ -90,11 +90,23 @@ SKILLS_GUIDANCE = (
 PLATFORM_HINTS = {
     "whatsapp": (
         "You are on a text messaging communication platform, WhatsApp. "
-        "Please do not use markdown as it does not render."
+        "Please do not use markdown as it does not render. "
+        "You can send media files natively: to deliver a file to the user, "
+        "include MEDIA:/absolute/path/to/file in your response. The file "
+        "will be sent as a native WhatsApp attachment — images (.jpg, .png, "
+        ".webp) appear as photos, videos (.mp4, .mov) play inline, and other "
+        "files arrive as downloadable documents. You can also include image "
+        "URLs in markdown format ![alt](url) and they will be sent as photos."
     ),
     "telegram": (
         "You are on a text messaging communication platform, Telegram. "
-        "Please do not use markdown as it does not render."
+        "Please do not use markdown as it does not render. "
+        "You can send media files natively: to deliver a file to the user, "
+        "include MEDIA:/absolute/path/to/file in your response. Images "
+        "(.jpg, .png) appear as photos, videos (.mp4) play inline, audio "
+        "(.ogg) sends as voice bubbles, and other files as documents. You "
+        "can also include image URLs in markdown format ![alt](url) and they "
+        "will be sent as native photos."
     ),
     "discord": (
         "You are in a Discord server or group chat communicating with your user."
diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py
index b2fd79df8..68cc90128 100644
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@@ -509,7 +509,63 @@ class BasePlatformAdapter(ABC):
         if caption:
             text = f"{caption}\n{text}"
         return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
-    
+
+    async def send_video(
+        self,
+        chat_id: str,
+        video_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """
+        Send a video natively via the platform API.
+
+        Override in subclasses to send videos as inline playable media.
+        Default falls back to sending the file path as text.
+        """
+        text = f"🎬 Video: {video_path}"
+        if caption:
+            text = f"{caption}\n{text}"
+        return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
+
+    async def send_document(
+        self,
+        chat_id: str,
+        file_path: str,
+        caption: Optional[str] = None,
+        file_name: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """
+        Send a document/file natively via the platform API.
+
+        Override in subclasses to send files as downloadable attachments.
+        Default falls back to sending the file path as text.
+        """
+        text = f"📎 File: {file_path}"
+        if caption:
+            text = f"{caption}\n{text}"
+        return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
+
+    async def send_image_file(
+        self,
+        chat_id: str,
+        image_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """
+        Send a local image file natively via the platform API.
+
+        Unlike send_image() which takes a URL, this takes a local file path.
+        Override in subclasses for native photo attachments.
+        Default falls back to sending the file path as text.
+        """
+        text = f"🖼️ Image: {image_path}"
+        if caption:
+            text = f"{caption}\n{text}"
+        return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
+
     @staticmethod
     def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
         """
@@ -676,19 +732,42 @@ class BasePlatformAdapter(ABC):
                     except Exception as img_err:
                         print(f"[{self.name}] Error sending image: {img_err}")
                 
-                # Send extracted audio/voice files as native attachments
-                for audio_path, is_voice in media_files:
+                # Send extracted media files — route by file type
+                _AUDIO_EXTS = {'.ogg', '.opus', '.mp3', '.wav', '.m4a'}
+                _VIDEO_EXTS = {'.mp4', '.mov', '.avi', '.mkv', '.3gp'}
+                _IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
+
+                for media_path, is_voice in media_files:
                     if human_delay > 0:
                         await asyncio.sleep(human_delay)
                     try:
-                        voice_result = await self.send_voice(
-                            chat_id=event.source.chat_id,
-                            audio_path=audio_path,
-                        )
-                        if not voice_result.success:
-                            print(f"[{self.name}] Failed to send voice: {voice_result.error}")
-                    except Exception as voice_err:
-                        print(f"[{self.name}] Error sending voice: {voice_err}")
+                        from pathlib import Path as _Path
+                        ext = _Path(media_path).suffix.lower()
+                        if ext in _AUDIO_EXTS:
+                            media_result = await self.send_voice(
+                                chat_id=event.source.chat_id,
+                                audio_path=media_path,
+                            )
+                        elif ext in _VIDEO_EXTS:
+                            media_result = await self.send_video(
+                                chat_id=event.source.chat_id,
+                                video_path=media_path,
+                            )
+                        elif ext in _IMAGE_EXTS:
+                            media_result = await self.send_image_file(
+                                chat_id=event.source.chat_id,
+                                image_path=media_path,
+                            )
+                        else:
+                            media_result = await self.send_document(
+                                chat_id=event.source.chat_id,
+                                file_path=media_path,
+                            )
+
+                        if not media_result.success:
+                            print(f"[{self.name}] Failed to send media ({ext}): {media_result.error}")
+                    except Exception as media_err:
+                        print(f"[{self.name}] Error sending media: {media_err}")
             
             # Check if there's a pending message that was queued during our processing
             if session_key in self._pending_messages:
diff --git a/gateway/platforms/whatsapp.py b/gateway/platforms/whatsapp.py
index eb0d6f1b5..c233f5ff4 100644
--- a/gateway/platforms/whatsapp.py
+++ b/gateway/platforms/whatsapp.py
@@ -281,7 +281,102 @@ class WhatsAppAdapter(BasePlatformAdapter):
             )
         except Exception as e:
             return SendResult(success=False, error=str(e))
-    
+
+    async def _send_media_to_bridge(
+        self,
+        chat_id: str,
+        file_path: str,
+        media_type: str,
+        caption: Optional[str] = None,
+        file_name: Optional[str] = None,
+    ) -> SendResult:
+        """Send any media file via bridge /send-media endpoint."""
+        if not self._running:
+            return SendResult(success=False, error="Not connected")
+        try:
+            import aiohttp
+
+            if not os.path.exists(file_path):
+                return SendResult(success=False, error=f"File not found: {file_path}")
+
+            payload: Dict[str, Any] = {
+                "chatId": chat_id,
+                "filePath": file_path,
+                "mediaType": media_type,
+            }
+            if caption:
+                payload["caption"] = caption
+            if file_name:
+                payload["fileName"] = file_name
+
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    f"http://localhost:{self._bridge_port}/send-media",
+                    json=payload,
+                    timeout=aiohttp.ClientTimeout(total=120),
+                ) as resp:
+                    if resp.status == 200:
+                        data = await resp.json()
+                        return SendResult(
+                            success=True,
+                            message_id=data.get("messageId"),
+                            raw_response=data,
+                        )
+                    else:
+                        error = await resp.text()
+                        return SendResult(success=False, error=error)
+
+        except Exception as e:
+            return SendResult(success=False, error=str(e))
+
+    async def send_image(
+        self,
+        chat_id: str,
+        image_url: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """Download image URL to cache, send natively via bridge."""
+        try:
+            local_path = await cache_image_from_url(image_url)
+            return await self._send_media_to_bridge(chat_id, local_path, "image", caption)
+        except Exception:
+            return await super().send_image(chat_id, image_url, caption, reply_to)
+
+    async def send_image_file(
+        self,
+        chat_id: str,
+        image_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """Send a local image file natively via bridge."""
+        return await self._send_media_to_bridge(chat_id, image_path, "image", caption)
+
+    async def send_video(
+        self,
+        chat_id: str,
+        video_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """Send a video natively via bridge — plays inline in WhatsApp."""
+        return await self._send_media_to_bridge(chat_id, video_path, "video", caption)
+
+    async def send_document(
+        self,
+        chat_id: str,
+        file_path: str,
+        caption: Optional[str] = None,
+        file_name: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """Send a document/file as a downloadable attachment via bridge."""
+        return await self._send_media_to_bridge(
+            chat_id, file_path, "document", caption,
+            file_name or os.path.basename(file_path),
+        )
+
     async def send_typing(self, chat_id: str) -> None:
         """Send typing indicator via bridge."""
         if not self._running:
diff --git a/scripts/whatsapp-bridge/bridge.js b/scripts/whatsapp-bridge/bridge.js
index 48e4d880b..d3375f154 100644
--- a/scripts/whatsapp-bridge/bridge.js
+++ b/scripts/whatsapp-bridge/bridge.js
@@ -8,6 +8,7 @@
  * Endpoints (matches gateway/platforms/whatsapp.py expectations):
  *   GET  /messages       - Long-poll for new incoming messages
  *   POST /send           - Send a message { chatId, message, replyTo? }
+ *   POST /send-media     - Send media natively { chatId, filePath, mediaType?, caption?, fileName? }
  *   POST /typing         - Send typing indicator { chatId }
  *   GET  /chat/:id       - Get chat info
  *   GET  /health         - Health check
@@ -21,7 +22,7 @@ import express from 'express';
 import { Boom } from '@hapi/boom';
 import pino from 'pino';
 import path from 'path';
-import { mkdirSync } from 'fs';
+import { mkdirSync, readFileSync, existsSync } from 'fs';
 import qrcode from 'qrcode-terminal';
 
 // Parse CLI args
@@ -210,6 +211,76 @@ app.post('/send', async (req, res) => {
   }
 });
 
+// MIME type map and media type inference for /send-media
+const MIME_MAP = {
+  jpg: 'image/jpeg', jpeg: 'image/jpeg', png: 'image/png',
+  webp: 'image/webp', gif: 'image/gif',
+  mp4: 'video/mp4', mov: 'video/quicktime', avi: 'video/x-msvideo',
+  mkv: 'video/x-matroska', '3gp': 'video/3gpp',
+  pdf: 'application/pdf',
+  doc: 'application/msword',
+  docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+  xlsx: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+};
+
+function inferMediaType(ext) {
+  if (['jpg', 'jpeg', 'png', 'webp', 'gif'].includes(ext)) return 'image';
+  if (['mp4', 'mov', 'avi', 'mkv', '3gp'].includes(ext)) return 'video';
+  if (['ogg', 'opus', 'mp3', 'wav', 'm4a'].includes(ext)) return 'audio';
+  return 'document';
+}
+
+// Send media (image, video, audio, document) natively
+app.post('/send-media', async (req, res) => {
+  if (!sock || connectionState !== 'connected') {
+    return res.status(503).json({ error: 'Not connected to WhatsApp' });
+  }
+
+  const { chatId, filePath, mediaType, caption, fileName } = req.body;
+  if (!chatId || !filePath) {
+    return res.status(400).json({ error: 'chatId and filePath are required' });
+  }
+
+  try {
+    if (!existsSync(filePath)) {
+      return res.status(404).json({ error: `File not found: ${filePath}` });
+    }
+
+    const buffer = readFileSync(filePath);
+    const ext = filePath.toLowerCase().split('.').pop();
+    const type = mediaType || inferMediaType(ext);
+    let msgPayload;
+
+    switch (type) {
+      case 'image':
+        msgPayload = { image: buffer, caption: caption || undefined, mimetype: MIME_MAP[ext] || 'image/jpeg' };
+        break;
+      case 'video':
+        msgPayload = { video: buffer, caption: caption || undefined, mimetype: MIME_MAP[ext] || 'video/mp4' };
+        break;
+      case 'audio': {
+        const audioMime = (ext === 'ogg' || ext === 'opus') ? 'audio/ogg; codecs=opus' : 'audio/mpeg';
+        msgPayload = { audio: buffer, mimetype: audioMime, ptt: ext === 'ogg' || ext === 'opus' };
+        break;
+      }
+      case 'document':
+      default:
+        msgPayload = {
+          document: buffer,
+          fileName: fileName || path.basename(filePath),
+          caption: caption || undefined,
+          mimetype: MIME_MAP[ext] || 'application/octet-stream',
+        };
+        break;
+    }
+
+    const sent = await sock.sendMessage(chatId, msgPayload);
+    res.json({ success: true, messageId: sent?.key?.id });
+  } catch (err) {
+    res.status(500).json({ error: err.message });
+  }
+});
+
 // Typing indicator
 app.post('/typing', async (req, res) => {
   if (!sock || connectionState !== 'connected') {