Authored by aydnOktay. Adds file format and size validation before API calls, specific exception handling, and improved logging.
178 lines
5.5 KiB
Python
178 lines
5.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Transcription Tools Module
|
|
|
|
Provides speech-to-text transcription using OpenAI's Whisper API.
|
|
Used by the messaging gateway to automatically transcribe voice messages
|
|
sent by users on Telegram, Discord, WhatsApp, and Slack.
|
|
|
|
Supported models:
|
|
- whisper-1 (cheapest, good quality)
|
|
- gpt-4o-mini-transcribe (better quality, higher cost)
|
|
- gpt-4o-transcribe (best quality, highest cost)
|
|
|
|
Supported input formats: mp3, mp4, mpeg, mpga, m4a, wav, webm, ogg
|
|
|
|
Usage:
|
|
from tools.transcription_tools import transcribe_audio
|
|
|
|
result = transcribe_audio("/path/to/audio.ogg")
|
|
if result["success"]:
|
|
print(result["transcript"])
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, Any
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# Default STT model -- cheapest and widely available
|
|
DEFAULT_STT_MODEL = "whisper-1"
|
|
|
|
# Supported audio formats
|
|
SUPPORTED_FORMATS = {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm", ".ogg"}
|
|
|
|
# Maximum file size (25MB - OpenAI limit)
|
|
MAX_FILE_SIZE = 25 * 1024 * 1024
|
|
|
|
|
|
def transcribe_audio(file_path: str, model: Optional[str] = None) -> Dict[str, Any]:
|
|
"""
|
|
Transcribe an audio file using OpenAI's Whisper API.
|
|
|
|
This function calls the OpenAI Audio Transcriptions endpoint directly
|
|
(not via OpenRouter, since Whisper isn't available there).
|
|
|
|
Args:
|
|
file_path: Absolute path to the audio file to transcribe.
|
|
model: Whisper model to use. Defaults to config or "whisper-1".
|
|
|
|
Returns:
|
|
dict with keys:
|
|
- "success" (bool): Whether transcription succeeded
|
|
- "transcript" (str): The transcribed text (empty on failure)
|
|
- "error" (str, optional): Error message if success is False
|
|
"""
|
|
api_key = os.getenv("VOICE_TOOLS_OPENAI_KEY")
|
|
if not api_key:
|
|
return {
|
|
"success": False,
|
|
"transcript": "",
|
|
"error": "VOICE_TOOLS_OPENAI_KEY or OPENAI_API_KEY not set",
|
|
}
|
|
|
|
audio_path = Path(file_path)
|
|
|
|
# Validate file exists
|
|
if not audio_path.exists():
|
|
return {
|
|
"success": False,
|
|
"transcript": "",
|
|
"error": f"Audio file not found: {file_path}",
|
|
}
|
|
|
|
if not audio_path.is_file():
|
|
return {
|
|
"success": False,
|
|
"transcript": "",
|
|
"error": f"Path is not a file: {file_path}",
|
|
}
|
|
|
|
# Validate file extension
|
|
if audio_path.suffix.lower() not in SUPPORTED_FORMATS:
|
|
return {
|
|
"success": False,
|
|
"transcript": "",
|
|
"error": f"Unsupported file format: {audio_path.suffix}. Supported formats: {', '.join(SUPPORTED_FORMATS)}",
|
|
}
|
|
|
|
# Validate file size
|
|
try:
|
|
file_size = audio_path.stat().st_size
|
|
if file_size > MAX_FILE_SIZE:
|
|
return {
|
|
"success": False,
|
|
"transcript": "",
|
|
"error": f"File too large: {file_size / (1024*1024):.1f}MB (max {MAX_FILE_SIZE / (1024*1024)}MB)",
|
|
}
|
|
except OSError as e:
|
|
logger.error("Failed to get file size for %s: %s", file_path, e, exc_info=True)
|
|
return {
|
|
"success": False,
|
|
"transcript": "",
|
|
"error": f"Failed to access file: {e}",
|
|
}
|
|
|
|
# Use provided model, or fall back to default
|
|
if model is None:
|
|
model = DEFAULT_STT_MODEL
|
|
|
|
try:
|
|
from openai import OpenAI
|
|
from openai import APIError, APIConnectionError, APITimeoutError
|
|
|
|
client = OpenAI(api_key=api_key, base_url="https://api.openai.com/v1")
|
|
|
|
with open(file_path, "rb") as audio_file:
|
|
transcription = client.audio.transcriptions.create(
|
|
model=model,
|
|
file=audio_file,
|
|
response_format="text",
|
|
)
|
|
|
|
# The response is a plain string when response_format="text"
|
|
transcript_text = str(transcription).strip()
|
|
|
|
logger.info("Transcribed %s (%d chars)", audio_path.name, len(transcript_text))
|
|
|
|
return {
|
|
"success": True,
|
|
"transcript": transcript_text,
|
|
}
|
|
|
|
except FileNotFoundError:
|
|
logger.error("Audio file not found: %s", file_path, exc_info=True)
|
|
return {
|
|
"success": False,
|
|
"transcript": "",
|
|
"error": f"Audio file not found: {file_path}",
|
|
}
|
|
except PermissionError:
|
|
logger.error("Permission denied accessing file: %s", file_path, exc_info=True)
|
|
return {
|
|
"success": False,
|
|
"transcript": "",
|
|
"error": f"Permission denied: {file_path}",
|
|
}
|
|
except APIConnectionError as e:
|
|
logger.error("API connection error during transcription: %s", e, exc_info=True)
|
|
return {
|
|
"success": False,
|
|
"transcript": "",
|
|
"error": f"Connection error: {e}",
|
|
}
|
|
except APITimeoutError as e:
|
|
logger.error("API timeout during transcription: %s", e, exc_info=True)
|
|
return {
|
|
"success": False,
|
|
"transcript": "",
|
|
"error": f"Request timeout: {e}",
|
|
}
|
|
except APIError as e:
|
|
logger.error("OpenAI API error during transcription: %s", e, exc_info=True)
|
|
return {
|
|
"success": False,
|
|
"transcript": "",
|
|
"error": f"API error: {e}",
|
|
}
|
|
except Exception as e:
|
|
logger.error("Unexpected error during transcription: %s", e, exc_info=True)
|
|
return {
|
|
"success": False,
|
|
"transcript": "",
|
|
"error": f"Transcription failed: {e}",
|
|
}
|