@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications...
271 lines (270 loc) • 9.79 kB
JavaScript
/**
* OpenAI Text-to-Speech Handler
*
* Implementation of TTS using OpenAI's TTS API.
*
* @module voice/providers/OpenAITTS
*/
import { ErrorCategory, ErrorSeverity } from "../../constants/enums.js";
import { logger } from "../../utils/logger.js";
import { TTS_ERROR_CODES, TTSError } from "../../utils/ttsProcessor.js";
/**
* OpenAI Text-to-Speech Handler
*
* Supports high-quality neural TTS with multiple voices.
*
* @see https://platform.openai.com/docs/api-reference/audio/createSpeech
*/
export class OpenAITTS {
  /**
   * Resolved API key, or `null` when neither the constructor argument nor
   * `process.env.OPENAI_API_KEY` yields a non-blank string.
   */
  apiKey;
  /**
   * Base URL that `/audio/speech` is appended to when synthesizing.
   */
  baseUrl = "https://api.openai.com/v1";
  /**
   * Maximum text length (4096 characters).
   * Exposed for callers; this class does not enforce it itself.
   */
  maxTextLength = 4096;
  /**
   * Time budget, in milliseconds, for one synthesis request. The budget
   * covers the whole exchange: sending the request, receiving the headers
   * AND downloading the response body.
   */
  requestTimeoutMs = 30000;
  /**
   * Available voices
   */
  static VOICES = [
    {
      id: "alloy",
      name: "Alloy",
      languageCode: "en",
      languageCodes: ["en"],
      gender: "neutral",
      type: "neural",
    },
    {
      id: "echo",
      name: "Echo",
      languageCode: "en",
      languageCodes: ["en"],
      gender: "male",
      type: "neural",
    },
    {
      id: "fable",
      name: "Fable",
      languageCode: "en",
      languageCodes: ["en"],
      gender: "neutral",
      type: "neural",
    },
    {
      id: "onyx",
      name: "Onyx",
      languageCode: "en",
      languageCodes: ["en"],
      gender: "male",
      type: "neural",
    },
    {
      id: "nova",
      name: "Nova",
      languageCode: "en",
      languageCodes: ["en"],
      gender: "female",
      type: "neural",
    },
    {
      id: "shimmer",
      name: "Shimmer",
      languageCode: "en",
      languageCodes: ["en"],
      gender: "female",
      type: "neural",
    },
  ];

  /**
   * @param {string} [apiKey] - API key. When `undefined`/`null`, falls back
   *   to `process.env.OPENAI_API_KEY`. Blank or whitespace-only values leave
   *   the handler unconfigured.
   */
  constructor(apiKey) {
    const resolvedKey = (apiKey ?? process.env.OPENAI_API_KEY ?? "").trim();
    this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
  }

  /**
   * @returns {boolean} `true` when a non-blank API key was resolved.
   */
  isConfigured() {
    return this.apiKey !== null;
  }

  /**
   * List the available voices.
   *
   * The voice set is fixed, so the same list is returned for every language.
   *
   * @param {string} [languageCode] - Accepted for interface compatibility;
   *   it does not filter the result.
   * @returns {Promise<Array<object>>} The shared `OpenAITTS.VOICES` array
   *   (not a copy — callers should treat it as read-only).
   */
  async getVoices(languageCode) {
    return OpenAITTS.VOICES;
  }

  /**
   * Synthesize `text` to audio via `POST {baseUrl}/audio/speech`.
   *
   * @param {string} text - Text to speak.
   * @param {object} [options]
   * @param {string} [options.model] - Explicit model; overrides `quality`.
   * @param {string} [options.quality] - `"hd"` selects `tts-1-hd`, anything
   *   else selects `tts-1`.
   * @param {string} [options.voice] - Voice id (default `"alloy"`).
   * @param {string} [options.format] - Requested format (default `"mp3"`);
   *   unsupported values are coerced to mp3 by `mapFormat()`.
   * @param {number} [options.speed] - Playback speed (default `1.0`).
   * @returns {Promise<object>} Result with `buffer`, `format`, `size`,
   *   `voice`, `sampleRate` and `metadata`.
   * @throws {TTSError} `PROVIDER_NOT_CONFIGURED` when no API key is set;
   *   `SYNTHESIS_FAILED` on timeout, non-2xx responses, or any other failure.
   */
  async synthesize(text, options = {}) {
    if (!this.apiKey) {
      throw new TTSError({
        code: TTS_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
        message: "OpenAI TTS API key not configured",
        category: ErrorCategory.CONFIGURATION,
        severity: ErrorSeverity.HIGH,
        retriable: false,
      });
    }
    const startTime = Date.now();
    try {
      // Determine model based on quality
      const model =
        options.model ?? (options.quality === "hd" ? "tts-1-hd" : "tts-1");
      // Determine voice
      const voice = options.voice ?? "alloy";
      // Determine format
      const responseFormat = this.mapFormat(options.format ?? "mp3");
      // Build request
      const requestBody = {
        model,
        input: text,
        voice,
        response_format: responseFormat,
        speed: options.speed ?? 1.0,
      };
      const controller = new AbortController();
      const timeoutId = setTimeout(
        () => controller.abort(),
        this.requestTimeoutMs,
      );
      let audioBuffer;
      try {
        const response = await fetch(`${this.baseUrl}/audio/speech`, {
          method: "POST",
          headers: {
            Authorization: `Bearer ${this.apiKey}`,
            "Content-Type": "application/json",
          },
          body: JSON.stringify(requestBody),
          signal: controller.signal,
        });
        if (!response.ok) {
          // Best-effort parse of the error body; a missing or non-JSON body
          // (or a JSON `null`) simply falls back to the HTTP status text.
          const errorData = await response.json().catch(() => null);
          const errorMessage =
            errorData?.error?.message || `HTTP ${response.status}`;
          // Preserve HTTP status so the outer catch doesn't mark a permanent
          // 4xx (auth, bad input) as retriable and trigger pointless retry loops.
          const retriable =
            response.status === 408 ||
            response.status === 429 ||
            response.status >= 500;
          throw new TTSError({
            code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
            message: errorMessage,
            category: retriable
              ? ErrorCategory.NETWORK
              : ErrorCategory.EXECUTION,
            severity: ErrorSeverity.HIGH,
            retriable,
            context: { status: response.status, model, responseFormat },
          });
        }
        // The body is read while the abort timer is still armed: fetch()
        // resolves as soon as headers arrive, so a stalled download would
        // otherwise never time out.
        audioBuffer = Buffer.from(await response.arrayBuffer());
      } catch (requestErr) {
        if (requestErr instanceof Error && requestErr.name === "AbortError") {
          throw new TTSError({
            code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
            message: `OpenAI TTS request timed out after ${this.requestTimeoutMs / 1000} seconds`,
            category: ErrorCategory.NETWORK,
            severity: ErrorSeverity.HIGH,
            retriable: true,
            originalError: requestErr,
          });
        }
        throw requestErr;
      } finally {
        clearTimeout(timeoutId);
      }
      // Measured after the body is fully received so the figure matches the
      // "Synthesized N bytes in Xms" log line below.
      const latency = Date.now() - startTime;
      // Use the *effective* output format (post-mapFormat fallback), not the
      // requested format — otherwise mp3-coerced "m4a" requests would mislabel
      // the buffer and break consumer file-extension routing.
      const effectiveFormat = this.effectiveFormat(responseFormat);
      const result = {
        buffer: audioBuffer,
        format: effectiveFormat,
        size: audioBuffer.length,
        voice,
        sampleRate: this.getSampleRate(effectiveFormat),
        metadata: {
          latency,
          provider: "openai-tts",
          model,
          requestedFormat: options.format,
          responseFormat,
        },
      };
      logger.info(
        `[OpenAITTSHandler] Synthesized ${audioBuffer.length} bytes in ${latency}ms`,
      );
      return result;
    } catch (err) {
      // Already classified (config / HTTP status / timeout) — pass through.
      if (err instanceof TTSError) {
        throw err;
      }
      const errorMessage =
        err instanceof Error ? err.message : String(err || "Unknown error");
      logger.error(`[OpenAITTSHandler] Synthesis failed: ${errorMessage}`);
      throw new TTSError({
        code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
        message: `Synthesis failed: ${errorMessage}`,
        category: ErrorCategory.EXECUTION,
        severity: ErrorSeverity.HIGH,
        retriable: true,
        // Optional chaining: building the error must not itself throw when
        // a caller passed a nullish `text`.
        context: { textLength: text?.length },
        originalError: err instanceof Error ? err : undefined,
      });
    }
  }

  /**
   * Map TTSAudioFormat to OpenAI response_format.
   * Supported here: mp3, wav, opus (ogg maps to opus), pcm16 (maps to pcm).
   * Unsupported formats are coerced to mp3 with a warning.
   *
   * @param {string} format - Requested audio format.
   * @returns {string} Value to send as `response_format`.
   */
  mapFormat(format) {
    const formats = {
      mp3: "mp3",
      wav: "wav",
      ogg: "opus", // OpenAI uses opus for ogg
      opus: "opus",
      // OpenAI's "pcm" is raw 16-bit signed LE @ 24kHz (no header) — maps to
      // canonical pcm16 in TTSResult.format. See effectiveFormat() below.
      pcm16: "pcm",
    };
    // Own-property check: a bare `formats[format]` would resolve inherited
    // keys such as "constructor" to a function and skip the mp3 fallback.
    if (!Object.hasOwn(formats, format)) {
      logger.warn(
        `[OpenAITTSHandler] Unsupported format "${format}" — falling back to "mp3". Supported formats: mp3, wav, ogg, opus, pcm16.`,
      );
      return "mp3";
    }
    return formats[format];
  }

  /**
   * Get sample rate for format
   *
   * @param {string} format - Audio format name.
   * @returns {number} 48000 for "opus"/"ogg", 24000 for everything else.
   */
  getSampleRate(format) {
    switch (format) {
      case "opus":
      case "ogg":
        return 48000;
      default:
        return 24000;
    }
  }

  /**
   * Map the OpenAI `response_format` string back to the canonical
   * `TTSAudioFormat` so `TTSResult.format` reflects what the API actually
   * returned (mapFormat() coerces unsupported requests to "mp3"). Note:
   * OpenAI returns Ogg-Opus for both "ogg" and "opus" requests — both
   * surface as "opus" since the bytes are an .ogg/Opus container.
   *
   * @param {string} responseFormat - Value that was sent as `response_format`.
   * @returns {string} "mp3", "wav", "opus" or "pcm16"; unknown values yield "mp3".
   */
  effectiveFormat(responseFormat) {
    switch (responseFormat) {
      case "mp3":
        return "mp3";
      case "wav":
        return "wav";
      case "opus":
        return "opus";
      // Raw PCM (16-bit signed LE @ 24kHz, no header) — keep semantics in
      // TTSResult.format so consumers don't write raw bytes to a .wav file.
      case "pcm":
        return "pcm16";
      default:
        return "mp3";
    }
  }
}