diff --git a/agent/prompt_builder.py b/agent/prompt_builder.py
index 24c26ef8..5df99e1a 100644
--- a/agent/prompt_builder.py
+++ b/agent/prompt_builder.py
@@ -90,11 +90,21 @@ SKILLS_GUIDANCE = (
 PLATFORM_HINTS = {
     "whatsapp": (
         "You are on a text messaging communication platform, WhatsApp. "
-        "Please do not use markdown as it does not render."
+        "Please do not use markdown as it does not render. "
+        "You can send media files natively: to deliver a file to the user, "
+        "include MEDIA:/absolute/path/to/file in your response. The file "
+        "will be sent as a native WhatsApp attachment — images (.jpg, .png, "
+        ".webp) appear as photos, videos (.mp4, .mov) play inline, and other "
+        "files arrive as downloadable documents. You can also include image "
+        "URLs in markdown format ![alt](url) and they will be sent as photos."
     ),
     "telegram": (
         "You are on a text messaging communication platform, Telegram. "
-        "Please do not use markdown as it does not render."
+        "Please do not use markdown as it does not render. "
+        "You can send media files natively: to deliver a file to the user, "
+        "include MEDIA:/absolute/path/to/file in your response. Audio "
+        "(.ogg) sends as voice bubbles. You can also include image URLs "
+        "in markdown format ![alt](url) and they will be sent as native photos."
     ),
     "discord": (
         "You are in a Discord server or group chat communicating with your user."
diff --git a/gateway/platforms/base.py b/gateway/platforms/base.py
index 702e737d..c06bb6f9 100644
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@@ -526,7 +526,63 @@ class BasePlatformAdapter(ABC):
         if caption:
             text = f"{caption}\n{text}"
         return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
-    
+
+    async def send_video(
+        self,
+        chat_id: str,
+        video_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """
+        Send a video natively via the platform API.
+
+        Override in subclasses to send videos as inline playable media.
+        Default falls back to sending the file path as text.
+        """
+        text = f"🎬 Video: {video_path}"
+        if caption:
+            text = f"{caption}\n{text}"
+        return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
+
+    async def send_document(
+        self,
+        chat_id: str,
+        file_path: str,
+        caption: Optional[str] = None,
+        file_name: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """
+        Send a document/file natively via the platform API.
+
+        Override in subclasses to send files as downloadable attachments.
+        Default falls back to sending the file path as text.
+        """
+        text = f"📎 File: {file_path}"
+        if caption:
+            text = f"{caption}\n{text}"
+        return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
+
+    async def send_image_file(
+        self,
+        chat_id: str,
+        image_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """
+        Send a local image file natively via the platform API.
+
+        Unlike send_image() which takes a URL, this takes a local file path.
+        Override in subclasses for native photo attachments.
+        Default falls back to sending the file path as text.
+        """
+        text = f"🖼️ Image: {image_path}"
+        if caption:
+            text = f"{caption}\n{text}"
+        return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
+
     @staticmethod
     def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
         """
@@ -693,19 +749,41 @@ class BasePlatformAdapter(ABC):
                     except Exception as img_err:
                         print(f"[{self.name}] Error sending image: {img_err}")
 
-                # Send extracted audio/voice files as native attachments
-                for audio_path, is_voice in media_files:
+                # Send extracted media files — route by file type
+                _AUDIO_EXTS = {'.ogg', '.opus', '.mp3', '.wav', '.m4a'}
+                _VIDEO_EXTS = {'.mp4', '.mov', '.avi', '.mkv', '.3gp'}
+                _IMAGE_EXTS = {'.jpg', '.jpeg', '.png', '.webp', '.gif'}
+
+                for media_path, is_voice in media_files:
                     if human_delay > 0:
                         await asyncio.sleep(human_delay)
                     try:
-                        voice_result = await self.send_voice(
-                            chat_id=event.source.chat_id,
-                            audio_path=audio_path,
-                        )
-                        if not voice_result.success:
-                            print(f"[{self.name}] Failed to send voice: {voice_result.error}")
-                    except Exception as voice_err:
-                        print(f"[{self.name}] Error sending voice: {voice_err}")
+                        ext = Path(media_path).suffix.lower()
+                        if ext in _AUDIO_EXTS:
+                            media_result = await self.send_voice(
+                                chat_id=event.source.chat_id,
+                                audio_path=media_path,
+                            )
+                        elif ext in _VIDEO_EXTS:
+                            media_result = await self.send_video(
+                                chat_id=event.source.chat_id,
+                                video_path=media_path,
+                            )
+                        elif ext in _IMAGE_EXTS:
+                            media_result = await self.send_image_file(
+                                chat_id=event.source.chat_id,
+                                image_path=media_path,
+                            )
+                        else:
+                            media_result = await self.send_document(
+                                chat_id=event.source.chat_id,
+                                file_path=media_path,
+                            )
+
+                        if not media_result.success:
+                            print(f"[{self.name}] Failed to send media ({ext}): {media_result.error}")
+                    except Exception as media_err:
+                        print(f"[{self.name}] Error sending media: {media_err}")
 
             # Check if there's a pending message that was queued during our processing
             if session_key in self._pending_messages:
diff --git a/gateway/platforms/whatsapp.py b/gateway/platforms/whatsapp.py
index e3c96137..db1eea8d 100644
--- a/gateway/platforms/whatsapp.py
+++ b/gateway/platforms/whatsapp.py
@@ -381,6 +381,119 @@ class WhatsAppAdapter(BasePlatformAdapter):
         except Exception as e:
             return SendResult(success=False, error=str(e))
 
+    async def _send_media_to_bridge(
+        self,
+        chat_id: str,
+        file_path: str,
+        media_type: str,
+        caption: Optional[str] = None,
+        file_name: Optional[str] = None,
+    ) -> SendResult:
+        """Send a local media file via the bridge /send-media endpoint.
+
+        ``media_type`` is sent to the bridge as ``mediaType`` ("image", "video",
+        "audio" or "document"). Failures are reported through
+        ``SendResult(success=False, error=...)``; no exception propagates.
+        """
+        if not self._running:
+            return SendResult(success=False, error="Not connected")
+        try:
+            import aiohttp
+
+            # The bridge is a separate process and may not share our working
+            # directory, so resolve to an absolute path before handing it over.
+            abs_path = os.path.abspath(file_path)
+            # isfile() also rejects directories, which the bridge cannot read.
+            if not os.path.isfile(abs_path):
+                return SendResult(success=False, error=f"File not found: {file_path}")
+
+            payload: Dict[str, Any] = {
+                "chatId": chat_id,
+                "filePath": abs_path,
+                "mediaType": media_type,
+            }
+            if caption:
+                payload["caption"] = caption
+            if file_name:
+                payload["fileName"] = file_name
+
+            async with aiohttp.ClientSession() as session:
+                async with session.post(
+                    f"http://localhost:{self._bridge_port}/send-media",
+                    json=payload,
+                    timeout=aiohttp.ClientTimeout(total=120),
+                ) as resp:
+                    if resp.status == 200:
+                        data = await resp.json()
+                        return SendResult(
+                            success=True,
+                            message_id=data.get("messageId"),
+                            raw_response=data,
+                        )
+                    else:
+                        error = await resp.text()
+                        return SendResult(success=False, error=error)
+
+        except Exception as e:
+            return SendResult(success=False, error=str(e))
+
+    async def send_image(
+        self,
+        chat_id: str,
+        image_url: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """Download image URL to cache, send natively via bridge.
+
+        Falls back to the base-class ``send_image`` when the download raises
+        or the bridge reports a failed send, so a bridge-side error does not
+        silently drop the image.
+        """
+        try:
+            local_path = await cache_image_from_url(image_url)
+        except Exception:
+            return await super().send_image(chat_id, image_url, caption, reply_to)
+
+        result = await self._send_media_to_bridge(chat_id, local_path, "image", caption)
+        if result.success:
+            return result
+        return await super().send_image(chat_id, image_url, caption, reply_to)
+
+    async def send_image_file(
+        self,
+        chat_id: str,
+        image_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """Send a local image file natively via bridge."""
+        return await self._send_media_to_bridge(chat_id, image_path, "image", caption)
+
+    async def send_video(
+        self,
+        chat_id: str,
+        video_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """Send a video natively via bridge — plays inline in WhatsApp."""
+        return await self._send_media_to_bridge(chat_id, video_path, "video", caption)
+
+    async def send_document(
+        self,
+        chat_id: str,
+        file_path: str,
+        caption: Optional[str] = None,
+        file_name: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """Send a document/file as a downloadable attachment via bridge."""
+        return await self._send_media_to_bridge(
+            chat_id, file_path, "document", caption,
+            file_name or os.path.basename(file_path),
+        )
+
     async def send_typing(self, chat_id: str) -> None:
         """Send typing indicator via bridge."""
         if not self._running:
diff --git a/scripts/whatsapp-bridge/bridge.js b/scripts/whatsapp-bridge/bridge.js
index 7404f5ae..3710c990 100644
--- a/scripts/whatsapp-bridge/bridge.js
+++ b/scripts/whatsapp-bridge/bridge.js
@@ -9,6 +9,7 @@
  * GET /messages - Long-poll for new incoming messages
  * POST /send - Send a message { chatId, message, replyTo? }
  * POST /edit - Edit a sent message { chatId, messageId, message }
+ * POST /send-media - Send media natively { chatId, filePath, mediaType?, caption?, fileName? }
  * POST /typing - Send typing indicator { chatId }
  * GET /chat/:id - Get chat info
  * GET /health - Health check
@@ -22,7 +23,8 @@ import express from 'express';
 import { Boom } from '@hapi/boom';
 import pino from 'pino';
 import path from 'path';
 import { mkdirSync } from 'fs';
+import { readFile } from 'fs/promises';
 import qrcode from 'qrcode-terminal';
 
 // Parse CLI args
@@ -238,6 +240,94 @@ app.post('/edit', async (req, res) => {
   }
 });
 
+// MIME types by lowercase file extension, used by /send-media.
+// A Map (rather than an object literal) so an extension such as
+// "constructor" cannot resolve to an inherited Object.prototype member.
+const MIME_MAP = new Map([
+  ['jpg', 'image/jpeg'], ['jpeg', 'image/jpeg'], ['png', 'image/png'],
+  ['webp', 'image/webp'], ['gif', 'image/gif'],
+  ['mp4', 'video/mp4'], ['mov', 'video/quicktime'], ['avi', 'video/x-msvideo'],
+  ['mkv', 'video/x-matroska'], ['3gp', 'video/3gpp'],
+  ['ogg', 'audio/ogg; codecs=opus'], ['opus', 'audio/ogg; codecs=opus'],
+  ['mp3', 'audio/mpeg'], ['wav', 'audio/wav'], ['m4a', 'audio/mp4'],
+  ['pdf', 'application/pdf'],
+  ['doc', 'application/msword'],
+  ['docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'],
+  ['xlsx', 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'],
+]);
+
+// Return the MIME type for ext when it belongs to the expected family
+// (e.g. 'image/'); otherwise return the fallback.
+function mimeFor(ext, family, fallback) {
+  const mime = MIME_MAP.get(ext);
+  return mime && mime.startsWith(family) ? mime : fallback;
+}
+
+function inferMediaType(ext) {
+  if (['jpg', 'jpeg', 'png', 'webp', 'gif'].includes(ext)) return 'image';
+  if (['mp4', 'mov', 'avi', 'mkv', '3gp'].includes(ext)) return 'video';
+  if (['ogg', 'opus', 'mp3', 'wav', 'm4a'].includes(ext)) return 'audio';
+  return 'document';
+}
+
+// Send media (image, video, audio, document) natively
+app.post('/send-media', async (req, res) => {
+  if (!sock || connectionState !== 'connected') {
+    return res.status(503).json({ error: 'Not connected to WhatsApp' });
+  }
+
+  const { chatId, filePath, mediaType, caption, fileName } = req.body;
+  if (!chatId || !filePath) {
+    return res.status(400).json({ error: 'chatId and filePath are required' });
+  }
+
+  try {
+    let buffer;
+    try {
+      // Awaited read: does not block the event loop, and has no exists-then-read race.
+      buffer = await readFile(filePath);
+    } catch (readErr) {
+      if (readErr.code === 'ENOENT') {
+        return res.status(404).json({ error: `File not found: ${filePath}` });
+      }
+      throw readErr;
+    }
+
+    // path.extname ignores dots in directory names and yields '' when there is no extension.
+    const ext = path.extname(filePath).slice(1).toLowerCase();
+    const type = mediaType || inferMediaType(ext);
+    let msgPayload;
+
+    switch (type) {
+      case 'image':
+        msgPayload = { image: buffer, caption: caption || undefined, mimetype: mimeFor(ext, 'image/', 'image/jpeg') };
+        break;
+      case 'video':
+        msgPayload = { video: buffer, caption: caption || undefined, mimetype: mimeFor(ext, 'video/', 'video/mp4') };
+        break;
+      case 'audio': {
+        const isVoiceNote = ext === 'ogg' || ext === 'opus';
+        msgPayload = { audio: buffer, mimetype: mimeFor(ext, 'audio/', 'audio/mpeg'), ptt: isVoiceNote };
+        break;
+      }
+      case 'document':
+      default:
+        msgPayload = {
+          document: buffer,
+          fileName: fileName || path.basename(filePath),
+          caption: caption || undefined,
+          mimetype: MIME_MAP.get(ext) || 'application/octet-stream',
+        };
+        break;
+    }
+
+    const sent = await sock.sendMessage(chatId, msgPayload);
+    res.json({ success: true, messageId: sent?.key?.id });
+  } catch (err) {
+    res.status(500).json({ error: err.message });
+  }
+});
+
 // Typing indicator
 app.post('/typing', async (req, res) => {
   if (!sock || connectionState !== 'connected') {