@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio
286 lines (285 loc) • 10.3 kB
JavaScript
/**
* OpenAI Whisper Speech-to-Text Handler
*
* Implementation of STT using OpenAI's Whisper model.
*
* @module voice/providers/OpenAISTT
*/
import { logger } from "../../utils/logger.js";
import { STTError } from "../errors.js";
/**
* OpenAI Whisper Speech-to-Text Handler
*
* Supports transcription and translation using OpenAI's Whisper model.
*
* @see https://platform.openai.com/docs/api-reference/audio
*/
export class OpenAISTT {
apiKey;
baseUrl = "https://api.openai.com/v1";
/**
* Maximum audio duration in seconds (25 minutes)
*/
maxAudioDuration = 25 * 60;
/**
* Whisper does not support streaming
*/
supportsStreaming = false;
constructor(apiKey) {
const resolvedKey = (apiKey ?? process.env.OPENAI_API_KEY ?? "").trim();
this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
}
isConfigured() {
return this.apiKey !== null;
}
getSupportedFormats() {
// OpenAI Whisper transcription API accepts: flac, m4a, mp3, mp4, mpeg,
// mpga, oga, ogg, opus, wav, webm. Keep this in sync with TTSAudioFormat
// — formats not listed in TTSAudioFormat are filtered out by the type.
return [
"mp3",
"wav",
"ogg",
"opus",
"m4a",
"flac",
"webm",
"mp4",
"mpeg",
"mpga",
];
}
async getSupportedLanguages() {
// Whisper supports 100+ languages
// Return the most common ones
return [
{
code: "en",
name: "English",
supportsDiarization: false,
supportsPunctuation: true,
},
{
code: "es",
name: "Spanish",
supportsDiarization: false,
supportsPunctuation: true,
},
{
code: "fr",
name: "French",
supportsDiarization: false,
supportsPunctuation: true,
},
{
code: "de",
name: "German",
supportsDiarization: false,
supportsPunctuation: true,
},
{
code: "it",
name: "Italian",
supportsDiarization: false,
supportsPunctuation: true,
},
{
code: "pt",
name: "Portuguese",
supportsDiarization: false,
supportsPunctuation: true,
},
{
code: "ru",
name: "Russian",
supportsDiarization: false,
supportsPunctuation: true,
},
{
code: "ja",
name: "Japanese",
supportsDiarization: false,
supportsPunctuation: true,
},
{
code: "ko",
name: "Korean",
supportsDiarization: false,
supportsPunctuation: true,
},
{
code: "zh",
name: "Chinese",
supportsDiarization: false,
supportsPunctuation: true,
},
{
code: "ar",
name: "Arabic",
supportsDiarization: false,
supportsPunctuation: true,
},
{
code: "hi",
name: "Hindi",
supportsDiarization: false,
supportsPunctuation: true,
},
];
}
async transcribe(audio, options = {}) {
if (!this.apiKey) {
throw STTError.providerNotConfigured("whisper");
}
const audioBuffer = Buffer.isBuffer(audio) ? audio : Buffer.from(audio);
if (audioBuffer.length === 0) {
throw STTError.audioEmpty("whisper");
}
const whisperOptions = options;
const startTime = Date.now();
try {
// Prepare form data
const formData = new FormData();
// Add audio file - convert Buffer to Uint8Array for compatibility
const audioBlob = new Blob([new Uint8Array(audioBuffer)], {
type: this.getMimeType(options.format ?? "wav"),
});
formData.append("file", audioBlob, `audio.${options.format ?? "wav"}`);
// Add model
formData.append("model", whisperOptions.model ?? "whisper-1");
// Add optional parameters
if (options.language) {
formData.append("language", options.language);
}
if (whisperOptions.prompt) {
formData.append("prompt", whisperOptions.prompt);
}
if (whisperOptions.temperature !== undefined) {
formData.append("temperature", whisperOptions.temperature.toString());
}
// Request verbose_json for detailed response
const responseFormat = whisperOptions.responseFormat ?? "verbose_json";
formData.append("response_format", responseFormat);
// Add timestamp granularities for word-level timestamps
if (options.wordTimestamps && responseFormat === "verbose_json") {
formData.append("timestamp_granularities[]", "word");
formData.append("timestamp_granularities[]", "segment");
}
// Choose endpoint based on translation option
const endpoint = whisperOptions.translate
? `${this.baseUrl}/audio/translations`
: `${this.baseUrl}/audio/transcriptions`;
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), 30000);
let response;
try {
response = await fetch(endpoint, {
method: "POST",
headers: {
Authorization: `Bearer ${this.apiKey}`,
},
body: formData,
signal: controller.signal,
});
}
catch (fetchErr) {
if (fetchErr instanceof Error && fetchErr.name === "AbortError") {
throw STTError.transcriptionFailed("OpenAI STT request timed out after 30 seconds", "whisper", fetchErr);
}
throw fetchErr;
}
finally {
clearTimeout(timeoutId);
}
if (!response.ok) {
const errorData = await response
.json()
.catch(() => Object.create(null));
const errorMessage = errorData.error?.message ||
`HTTP ${response.status}`;
throw STTError.transcriptionFailed(errorMessage, "whisper");
}
const latency = Date.now() - startTime;
// Parse response based on format
if (responseFormat === "text") {
const text = await response.text();
return {
text,
confidence: 0.95, // Whisper doesn't return confidence
metadata: {
latency,
provider: "whisper",
model: whisperOptions.model ?? "whisper-1",
},
};
}
const data = (await response.json());
// Build result
const result = {
text: data.text,
confidence: 0.95, // Whisper doesn't return per-result confidence
language: data.language,
duration: data.duration,
metadata: {
latency,
provider: "whisper",
model: whisperOptions.model ?? "whisper-1",
task: data.task,
},
};
// Add word timings if available
if (data.words && data.words.length > 0) {
result.words = data.words.map((word) => ({
word: word.word,
startTime: word.start,
endTime: word.end,
}));
}
// Add segments
if (data.segments && data.segments.length > 0) {
result.segments = data.segments.map((segment, index) => ({
index,
text: segment.text,
isFinal: true,
confidence: Math.exp(segment.avg_logprob), // Convert log prob to confidence
startTime: segment.start,
endTime: segment.end,
}));
}
logger.info(`[WhisperSTTHandler] Transcribed ${data.duration?.toFixed(1) ?? "?"}s audio in ${latency}ms`);
return result;
}
catch (err) {
if (err instanceof STTError) {
throw err;
}
const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
logger.error(`[WhisperSTTHandler] Transcription failed: ${errorMessage}`);
throw STTError.transcriptionFailed(errorMessage, "whisper", err instanceof Error ? err : undefined);
}
}
/**
* Get MIME type for audio format. Whisper auto-detects from headers, but
* sending a correct MIME helps providers / proxies that sniff Content-Type.
* Must stay aligned with `getSupportedFormats()`.
*/
getMimeType(format) {
const mimeTypes = {
mp3: "audio/mpeg",
wav: "audio/wav",
ogg: "audio/ogg",
opus: "audio/opus",
m4a: "audio/mp4",
flac: "audio/flac",
webm: "audio/webm",
mp4: "audio/mp4",
mpeg: "audio/mpeg",
mpga: "audio/mpeg",
};
return mimeTypes[format] ?? "audio/wav";
}
}
// Export as named exports for compatibility
export { OpenAISTT as WhisperSTT };
export { OpenAISTT as WhisperSTTHandler };
export { OpenAISTT as OpenAISTTHandler };