@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.
344 lines • 14.4 kB
JavaScript
/**
* Google Cloud Text-to-Speech Handler
*
* Handler for Google Cloud Text-to-Speech API integration.
*
* @module adapters/tts/googleTTSHandler
* @see https://cloud.google.com/text-to-speech/docs
*/
import { TextToSpeechClient } from "@google-cloud/text-to-speech";
import { TTSError, TTS_ERROR_CODES } from "../../utils/ttsProcessor.js";
import { ErrorCategory, ErrorSeverity } from "../../constants/enums.js";
import { logger } from "../../utils/logger.js";
import { SpanSerializer, SpanType, SpanStatus, getMetricsAggregator, } from "../../observability/index.js";
/**
 * Handler for Google Cloud Text-to-Speech API integration.
 *
 * Wraps a `TextToSpeechClient` and exposes voice discovery (`getVoices`) and
 * speech synthesis (`synthesize`). The client is only created when a
 * credentials path is supplied to the constructor or found in the
 * GOOGLE_APPLICATION_CREDENTIALS environment variable; otherwise the handler
 * reports itself as not configured.
 */
export class GoogleTTSHandler {
  /** Google Cloud client, or null when no credentials path was available. */
  client = null;
  /** Unfiltered voice list cache: `{ voices, timestamp }` or null. */
  voicesCache = null;
  static CACHE_TTL_MS = 5 * 60 * 1000; // 5 minutes
  /**
   * Google Cloud TTS maximum input size.
   * ~5000 bytes INCLUDING SSML tags.
   */
  static DEFAULT_MAX_TEXT_LENGTH = 5000;
  /**
   * Default timeout for Google Cloud TTS API calls (milliseconds)
   *
   * Google typically responds within:
   * - 1–5 seconds for short or normal text
   * - 5–10 seconds for longer text or Neural2 voices
   */
  static DEFAULT_API_TIMEOUT_MS = 30 * 1000;
  /**
   * Matches an opening SSML root tag at the start of the (trimmed) input:
   * either a bare `<speak>` or `<speak` followed by attributes.
   */
  static SSML_OPENING_TAG = /^<speak[\s>]/;
  /**
   * Maximum text length supported by Google Cloud TTS (in bytes).
   *
   * NOTE:
   * Validation against this limit is performed by the shared TTS processor
   * before invoking provider handlers, not inside this class.
   */
  maxTextLength = GoogleTTSHandler.DEFAULT_MAX_TEXT_LENGTH;

  /**
   * @param credentialsPath - Optional path to a service-account key file.
   *   Falls back to GOOGLE_APPLICATION_CREDENTIALS when null/undefined.
   */
  constructor(credentialsPath) {
    const path = credentialsPath ?? process.env.GOOGLE_APPLICATION_CREDENTIALS;
    if (path) {
      this.client = new TextToSpeechClient({ keyFilename: path });
    }
  }

  /**
   * Validate that the provider is properly configured
   *
   * @returns True if provider can generate TTS
   */
  isConfigured() {
    return this.client !== null;
  }

  /**
   * Get available voices for the provider
   *
   * Unfiltered results are cached for CACHE_TTL_MS. Language-filtered requests
   * always hit the API and are never cached. API failures are logged and
   * reported as an empty list (graceful degradation) rather than thrown.
   *
   * @param languageCode - Optional language filter (e.g., "en-US")
   * @returns List of available voices (a fresh array on every call)
   * @throws TTSError PROVIDER_NOT_CONFIGURED when no client was created
   */
  async getVoices(languageCode) {
    if (!this.client) {
      throw new TTSError({
        code: TTS_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
        message: "Google Cloud TTS client not initialized. Set GOOGLE_APPLICATION_CREDENTIALS or pass credentials path.",
        category: ErrorCategory.CONFIGURATION,
        severity: ErrorSeverity.HIGH,
        retriable: false,
      });
    }
    const span = SpanSerializer.createSpan(SpanType.TTS, "tts.google.listVoices", {
      "tts.operation": "listVoices",
      "tts.provider": "google",
    });
    try {
      // Serve from cache only for unfiltered requests within the TTL.
      if (
        !languageCode &&
        this.voicesCache &&
        Date.now() - this.voicesCache.timestamp < GoogleTTSHandler.CACHE_TTL_MS
      ) {
        const endedSpan = SpanSerializer.endSpan(span, SpanStatus.OK);
        getMetricsAggregator().recordSpan(endedSpan);
        // Hand out a copy so callers that sort/push/splice the result
        // cannot corrupt the cached list.
        return [...this.voicesCache.voices];
      }
      // Call Google Cloud listVoices API
      const [response] = await this.client.listVoices(languageCode ? { languageCode } : {});
      if (!response.voices || response.voices.length === 0) {
        logger.warn("Google Cloud TTS returned no voices");
        const endedSpan = SpanSerializer.endSpan(span, SpanStatus.OK);
        getMetricsAggregator().recordSpan(endedSpan);
        return [];
      }
      const voices = [];
      for (const voice of response.voices) {
        // Validate required fields
        if (
          !voice.name ||
          !Array.isArray(voice.languageCodes) ||
          voice.languageCodes.length === 0
        ) {
          logger.warn("Skipping voice with missing required fields", {
            name: voice.name,
            languageCodesCount: voice.languageCodes?.length,
          });
          continue;
        }
        const voiceName = voice.name;
        const languageCodes = voice.languageCodes;
        // Map Google's ssmlGender → internal TTSGender
        let gender = "neutral";
        if (voice.ssmlGender === "MALE") {
          gender = "male";
        } else if (voice.ssmlGender === "FEMALE") {
          gender = "female";
        }
        voices.push({
          id: voiceName,
          name: voiceName,
          languageCode: languageCodes[0],
          languageCodes,
          gender,
          type: this.detectVoiceType(voiceName),
          naturalSampleRateHertz: voice.naturalSampleRateHertz ?? undefined,
        });
      }
      // Cache the result with timestamp if no language filter. The cache
      // keeps its own array so the one returned below stays caller-owned.
      if (!languageCode) {
        this.voicesCache = { voices: [...voices], timestamp: Date.now() };
      }
      const endedSpan = SpanSerializer.endSpan(span, SpanStatus.OK);
      getMetricsAggregator().recordSpan(endedSpan);
      return voices;
    } catch (err) {
      const message = err instanceof Error ? err.message : "Unknown error";
      // Record error span
      const endedSpan = SpanSerializer.endSpan(span, SpanStatus.ERROR, message);
      getMetricsAggregator().recordSpan(endedSpan);
      // Log error but return empty array for graceful degradation
      logger.error(`Failed to fetch Google TTS voices: ${message}`);
      return [];
    }
  }

  /**
   * Generate audio from text using provider-specific TTS API
   *
   * @param text - Text or SSML to convert to speech
   * @param options - TTS configuration options (voice, format, speed, pitch,
   *   volumeGainDb). May be omitted; every setting has a default.
   * @returns Audio buffer with metadata
   * @throws TTSError PROVIDER_NOT_CONFIGURED when no client was created,
   *   INVALID_INPUT for malformed SSML / voice id / format, and
   *   SYNTHESIS_FAILED for API failures or unusable responses
   */
  async synthesize(text, options) {
    if (!this.client) {
      throw new TTSError({
        code: TTS_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
        message: "Google Cloud TTS client not initialized. Set GOOGLE_APPLICATION_CREDENTIALS or pass credentials path.",
        category: ErrorCategory.CONFIGURATION,
        severity: ErrorSeverity.HIGH,
        retriable: false,
      });
    }
    // Tolerate a missing options argument instead of crashing on property access.
    const settings = options ?? {};
    const voiceId = settings.voice ?? "en-US-Neural2-C";
    const format = settings.format ?? "mp3";
    const span = SpanSerializer.createSpan(SpanType.TTS, "tts.google.synthesize", {
      "tts.operation": "synthesize",
      "tts.provider": "google",
      "tts.voice": voiceId,
      "tts.format": format,
    });
    const startTime = Date.now();
    try {
      const input = this.buildSynthesisInput(text);
      const languageCode = this.extractLanguageCode(voiceId);
      const audioEncoding = this.mapFormat(format);
      const request = {
        input,
        voice: {
          name: voiceId,
          languageCode,
        },
        audioConfig: {
          audioEncoding,
          speakingRate: settings.speed ?? 1.0,
          pitch: settings.pitch ?? 0.0,
          volumeGainDb: settings.volumeGainDb ?? 0.0,
        },
      };
      const [response] = await this.client.synthesizeSpeech(request, {
        timeout: GoogleTTSHandler.DEFAULT_API_TIMEOUT_MS,
      });
      const audioContent = response.audioContent;
      if (!audioContent) {
        throw new TTSError({
          code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
          message: "Google TTS returned empty audio content",
          category: ErrorCategory.EXECUTION,
          severity: ErrorSeverity.HIGH,
          retriable: true,
        });
      }
      // The client may hand back raw bytes or a base64 string.
      let buffer;
      if (audioContent instanceof Uint8Array) {
        buffer = Buffer.from(audioContent);
      } else if (typeof audioContent === "string") {
        buffer = Buffer.from(audioContent, "base64");
      } else {
        throw new TTSError({
          code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
          message: "Unsupported audioContent type returned by Google TTS",
          category: ErrorCategory.EXECUTION,
          severity: ErrorSeverity.HIGH,
          retriable: true,
          context: { type: typeof audioContent },
        });
      }
      const latency = Date.now() - startTime;
      const endedSpan = SpanSerializer.endSpan(span, SpanStatus.OK);
      getMetricsAggregator().recordSpan(endedSpan);
      return {
        buffer,
        format,
        size: buffer.length,
        voice: voiceId,
        metadata: {
          latency,
          provider: "google-ai",
        },
      };
    } catch (err) {
      const endedSpan = SpanSerializer.endSpan(span, SpanStatus.ERROR, err instanceof Error ? err.message : String(err));
      getMetricsAggregator().recordSpan(endedSpan);
      // Errors raised by our own validation already carry the right code.
      if (err instanceof TTSError) {
        throw err;
      }
      const latency = Date.now() - startTime;
      const message = err instanceof Error ? err.message : "Unknown error";
      throw new TTSError({
        code: TTS_ERROR_CODES.SYNTHESIS_FAILED,
        message: `Google TTS failed after ${latency}ms: ${message}`,
        category: ErrorCategory.EXECUTION,
        severity: ErrorSeverity.HIGH,
        retriable: true,
        context: { latency },
        originalError: err instanceof Error ? err : undefined,
      });
    }
  }

  /**
   * Build the `input` field of a synthesize request, choosing between SSML
   * and plain text.
   *
   * Surrounding whitespace is ignored when looking for the `<speak>` root
   * element, and the opening tag may carry attributes. Only the presence of
   * the opening and closing tags is checked here; any deeper SSML validation
   * (unclosed inner tags, invalid elements) is left to Google's API.
   *
   * @param text - Text or SSML supplied by the caller
   * @returns `{ ssml }` (trimmed) for SSML input, `{ text }` (unchanged) otherwise
   * @throws TTSError INVALID_INPUT when exactly one of the opening/closing
   *   `<speak>` tags is present
   */
  buildSynthesisInput(text) {
    const trimmed = text.trim();
    const hasOpeningTag = GoogleTTSHandler.SSML_OPENING_TAG.test(trimmed);
    const hasClosingTag = trimmed.endsWith("</speak>");
    if (hasOpeningTag !== hasClosingTag) {
      throw new TTSError({
        code: TTS_ERROR_CODES.INVALID_INPUT,
        message: "Malformed SSML: missing opening <speak> or closing </speak> tag.",
        category: ErrorCategory.VALIDATION,
        severity: ErrorSeverity.MEDIUM,
        retriable: false,
      });
    }
    return hasOpeningTag ? { ssml: trimmed } : { text };
  }

  /**
   * Extract language code from a Google Cloud voice name
   *
   * Example:
   * "en-US-Neural2-C" -> "en-US"
   *
   * @param voiceId - Google Cloud voice identifier
   * @returns Language code compatible with Google TTS
   * @throws TTSError INVALID_INPUT when the id is not a string or does not
   *   start with two non-empty dash-separated segments
   */
  extractLanguageCode(voiceId) {
    if (typeof voiceId === "string") {
      const [language, region] = voiceId.split("-");
      // Both segments must be non-empty; "en-" or "-US" are not usable codes.
      if (language && region) {
        return `${language}-${region}`;
      }
    }
    throw new TTSError({
      code: TTS_ERROR_CODES.INVALID_INPUT,
      message: `Invalid Google TTS voiceId format: "${voiceId}". Expected format like "en-US-Neural2-C".`,
      category: ErrorCategory.VALIDATION,
      severity: ErrorSeverity.MEDIUM,
      retriable: false,
      context: { voiceId },
    });
  }

  /**
   * Map application audio format to Google Cloud audio encoding
   *
   * Matching is case-insensitive.
   *
   * @param format - Audio format requested by the caller
   * @returns Google Cloud AudioEncoding enum value
   * @throws TTSError INVALID_INPUT if format is unsupported or not a string
   */
  mapFormat(format) {
    // Non-string values fall through to the default branch so callers get the
    // documented validation error rather than a TypeError.
    const normalized = typeof format === "string" ? format.toLowerCase() : format;
    switch (normalized) {
      case "mp3":
        return "MP3";
      case "wav":
        return "LINEAR16";
      case "ogg":
      case "opus":
        return "OGG_OPUS";
      default:
        throw new TTSError({
          code: TTS_ERROR_CODES.INVALID_INPUT,
          message: `Unsupported audio format: ${format}`,
          category: ErrorCategory.VALIDATION,
          severity: ErrorSeverity.MEDIUM,
          retriable: false,
          context: { format },
        });
    }
  }

  /**
   * Detect the voice type from a Google Cloud TTS voice name
   *
   * Splits the lower-cased name on "-" and looks for a known type token.
   * Checks run in a fixed order: chirp (prefix match), neural2, wavenet,
   * standard.
   *
   * @param name - The full Google Cloud voice name (e.g., "en-US-Neural2-C")
   * @returns The detected voice type
   *
   * @example
   * detectVoiceType("en-US-Neural2-C") // returns "neural"
   * detectVoiceType("en-US-Wavenet-A") // returns "wavenet"
   * detectVoiceType("en-US-Standard-B") // returns "standard"
   * detectVoiceType("en-US-Chirp-A") // returns "chirp"
   * detectVoiceType("en-US-Journey-D") // returns "unknown" (unrecognized type)
   */
  detectVoiceType(name) {
    const tokens = name.toLowerCase().split("-");
    if (tokens.some((t) => t.startsWith("chirp"))) {
      return "chirp";
    }
    if (tokens.includes("neural2")) {
      return "neural";
    }
    if (tokens.includes("wavenet")) {
      return "wavenet";
    }
    if (tokens.includes("standard")) {
      return "standard";
    }
    return "unknown";
  }
}
//# sourceMappingURL=googleTTSHandler.js.map