UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio

454 lines (453 loc) 18.3 kB
/** * Google Cloud Speech-to-Text Handler * * Implementation of STT using Google Cloud Speech-to-Text API. * * @module voice/providers/GoogleSTT */ import { logger } from "../../utils/logger.js"; import { STTError } from "../errors.js"; /** * Google Cloud Speech-to-Text Handler * * Supports transcription with speaker diarization, word timestamps, and punctuation. * * @see https://cloud.google.com/speech-to-text/docs */ export class GoogleSTT { apiKey; credentialsPath; baseUrl = "https://speech.googleapis.com/v1"; /** * Maximum audio duration in seconds for the synchronous recognize endpoint. * For longer audio, use the async longrunningrecognize endpoint (not yet implemented). */ maxAudioDuration = 60; /** * True streaming requires gRPC (not yet implemented). * transcribeStream() uses a chunk-and-batch workaround. */ supportsStreaming = false; constructor(apiKey, credentialsPath) { // Accept GOOGLE_AI_API_KEY / GEMINI_API_KEY as aliases since `.env.example` // documents those as the canonical Google credentials and forcing users to // also set GOOGLE_API_KEY just for STT was a footgun (Copilot review). const resolvedKey = (apiKey ?? process.env.GOOGLE_API_KEY ?? process.env.GOOGLE_AI_API_KEY ?? process.env.GEMINI_API_KEY ?? "").trim(); this.apiKey = resolvedKey.length > 0 ? resolvedKey : null; const resolvedCreds = (credentialsPath ?? process.env.GOOGLE_APPLICATION_CREDENTIALS ?? "").trim(); this.credentialsPath = resolvedCreds.length > 0 ? resolvedCreds : null; } isConfigured() { return this.apiKey !== null || this.credentialsPath !== null; } getSupportedFormats() { return ["mp3", "wav", "ogg", "opus"]; } async getSupportedLanguages() { // Return common languages supported by Google STT return [ { code: "en-US", name: "English (US)", supportsDiarization: true, supportsPunctuation: true, }, { code: "en-GB", name: "English (UK)", supportsDiarization: true, supportsPunctuation: true, }, { code: "es-ES", name: "Spanish (Spain)", supportsDiarization: true, supportsPunctuation: true, }, { code: "es-US", name: "Spanish (US)", supportsDiarization: true, supportsPunctuation: true, }, { code: "fr-FR", name: "French", supportsDiarization: true, supportsPunctuation: true, }, { code: "de-DE", name: "German", supportsDiarization: true, supportsPunctuation: true, }, { code: "it-IT", name: "Italian", supportsDiarization: true, supportsPunctuation: true, }, { code: "pt-BR", name: "Portuguese (Brazil)", supportsDiarization: true, supportsPunctuation: true, }, { code: "ja-JP", name: "Japanese", supportsDiarization: true, supportsPunctuation: true, }, { code: "ko-KR", name: "Korean", supportsDiarization: true, supportsPunctuation: true, }, { code: "zh-CN", name: "Chinese (Simplified)", supportsDiarization: true, supportsPunctuation: true, }, { code: "zh-TW", name: "Chinese (Traditional)", supportsDiarization: true, supportsPunctuation: true, }, { code: "ar-SA", name: "Arabic", supportsDiarization: true, supportsPunctuation: true, }, { code: "hi-IN", name: "Hindi", supportsDiarization: true, supportsPunctuation: true, }, { code: "ru-RU", name: "Russian", supportsDiarization: true, supportsPunctuation: true, }, ]; } async transcribe(audio, options = {}) { if (!this.isConfigured()) { throw STTError.providerNotConfigured("google-stt"); } const audioBuffer = Buffer.isBuffer(audio) ? audio : Buffer.from(audio); if (audioBuffer.length === 0) { throw STTError.audioEmpty("google-stt"); } const googleOptions = options; const startTime = Date.now(); try { // Build recognition config const detectedFormat = options.format ?? "wav"; const config = { encoding: this.getEncoding(detectedFormat), // Omit sampleRateHertz for WAV/FLAC — the API reads it from the header. // Hardcoding a wrong value causes "sample_rate_hertz must match WAV header" errors. ...(detectedFormat !== "wav" && detectedFormat !== "flac" ? { sampleRateHertz: options.sampleRate ?? 16000 } : options.sampleRate ? { sampleRateHertz: options.sampleRate } : {}), languageCode: options.language ?? "en-US", enableAutomaticPunctuation: options.punctuation ?? true, enableWordTimeOffsets: options.wordTimestamps ?? false, enableWordConfidence: true, profanityFilter: options.profanityFilter ?? false, }; // Add model if specified if (googleOptions.model) { config.model = googleOptions.model; } // Add enhanced model option if (googleOptions.useEnhanced) { config.useEnhanced = true; } // Add diarization if requested if (options.speakerDiarization) { config.enableSpeakerDiarization = true; if (options.speakerCount) { config.diarizationSpeakerCount = options.speakerCount; } } // Add max alternatives if (googleOptions.maxAlternatives) { config.maxAlternatives = googleOptions.maxAlternatives; } // Build request const requestBody = { config, audio: { content: audioBuffer.toString("base64"), }, }; // Build URL with API key const url = this.apiKey ? `${this.baseUrl}/speech:recognize?key=${this.apiKey}` : `${this.baseUrl}/speech:recognize`; const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), 30000); let response; try { response = await fetch(url, { method: "POST", headers: { "Content-Type": "application/json", ...(this.credentialsPath && !this.apiKey ? { Authorization: `Bearer ${await this.getAccessToken()}` } : {}), }, body: JSON.stringify(requestBody), signal: controller.signal, }); } catch (fetchErr) { if (fetchErr instanceof Error && fetchErr.name === "AbortError") { throw STTError.transcriptionFailed("Google STT request timed out after 30 seconds", "google-stt", fetchErr); } throw fetchErr; } finally { clearTimeout(timeoutId); } if (!response.ok) { const errorData = await response .json() .catch(() => Object.create(null)); const errorMessage = errorData.error?.message || `HTTP ${response.status}`; throw STTError.transcriptionFailed(errorMessage, "google-stt"); } const data = (await response.json()); const latency = Date.now() - startTime; // Handle empty results if (!data.results || data.results.length === 0) { return { text: "", confidence: 0, language: options.language, metadata: { latency, provider: "google-stt", }, }; } // Build result from all alternatives const result = { text: data.results .map((r) => r.alternatives[0]?.transcript ?? "") .join(" ") .trim(), confidence: this.calculateAverageConfidence(data.results), language: data.results[0]?.languageCode ?? options.language, metadata: { latency, provider: "google-stt", billedTime: data.totalBilledTime, }, }; // Add word timings const words = []; const speakers = new Set(); for (const resultItem of data.results) { const alternative = resultItem.alternatives[0]; if (alternative?.words) { for (const wordInfo of alternative.words) { const word = { word: wordInfo.word, startTime: this.parseDuration(wordInfo.startTime), endTime: this.parseDuration(wordInfo.endTime), confidence: wordInfo.confidence, }; if (wordInfo.speakerTag !== undefined) { word.speaker = `Speaker ${wordInfo.speakerTag}`; speakers.add(word.speaker); } words.push(word); } } } if (words.length > 0) { result.words = words; } if (speakers.size > 0) { result.speakers = Array.from(speakers); } // Add segments result.segments = data.results.map((resultItem, index) => { const alt = resultItem.alternatives[0]; return { index, text: alt?.transcript ?? "", isFinal: true, confidence: alt?.confidence ?? 0, language: resultItem.languageCode, }; }); logger.info(`[GoogleSTTHandler] Transcribed audio in ${latency}ms`); return result; } catch (err) { if (err instanceof STTError) { throw err; } const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error"); logger.error(`[GoogleSTTHandler] Transcription failed: ${errorMessage}`); throw STTError.transcriptionFailed(errorMessage, "google-stt", err instanceof Error ? err : undefined); } } /** * Streaming transcription (placeholder - requires WebSocket/gRPC) */ async *transcribeStream(audioStream, options) { // Google streaming STT requires gRPC or WebSocket connection // For now, buffer and transcribe in chunks const chunks = []; let chunkIndex = 0; for await (const chunk of audioStream) { chunks.push(chunk); // Process every ~5 seconds of audio (assuming 16kHz, 16-bit) const bytesPerSecond = 16000 * 2; // 16kHz * 2 bytes const totalBytes = chunks.reduce((sum, c) => sum + c.length, 0); if (totalBytes >= bytesPerSecond * 5) { const audio = Buffer.concat(chunks); chunks.length = 0; try { const result = await this.transcribe(audio, options); yield { index: chunkIndex++, text: result.text, isFinal: false, confidence: result.confidence, }; } catch (err) { // M5: distinguish permanent (auth, schema, 4xx) from transient // (5xx, 429, network) errors. Permanent errors retry indefinitely // and racks up failed API calls; rethrow to terminate the stream. // Transient errors get logged and skipped so a multi-minute audio // stream can recover from a transient hiccup. const msg = err instanceof Error ? err.message : String(err); const isPermanent = /\b(401|403|404|UNAUTHENTICATED|PERMISSION_DENIED|INVALID_ARGUMENT|UNAUTHORIZED|FORBIDDEN|invalid.*credential|invalid.*key)\b/i.test(msg); if (isPermanent) { logger.error(`[GoogleSTTHandler] Permanent chunk error — terminating stream: ${msg}`); throw err; } logger.warn(`[GoogleSTTHandler] Transient chunk failure (skipping): ${msg}`); } } } // Process remaining audio if (chunks.length > 0) { const audio = Buffer.concat(chunks); try { const result = await this.transcribe(audio, options); yield { index: chunkIndex, text: result.text, isFinal: true, confidence: result.confidence, }; } catch (err) { // Don't swallow the final chunk's terminal errors — auth/config/4xx // failures here would otherwise look like a successful empty // transcription, hiding the root cause from callers (CodeRabbit // review). Mirror the permanent-vs-transient split used in the // chunk loop above (Azure/Google share this taxonomy). const msg = err instanceof Error ? err.message : String(err); const isPermanent = /\b(401|403|404|Forbidden|Unauthorized|Invalid.*credential|Invalid.*key|Permission|PERMISSION_DENIED|UNAUTHENTICATED|INVALID_ARGUMENT)\b/i.test(msg); if (isPermanent) { logger.error(`[GoogleSTTHandler] Permanent final-chunk error — surfacing: ${msg}`); throw err; } logger.warn(`[GoogleSTTHandler] Final chunk transcription failed (transient): ${msg}`); } } } /** * Get encoding string for audio format */ getEncoding(format) { const encodings = { mp3: "MP3", wav: "LINEAR16", ogg: "OGG_OPUS", opus: "OGG_OPUS", }; return encodings[format] ?? "LINEAR16"; } /** * Parse duration string (e.g., "1.5s") to seconds */ parseDuration(duration) { if (!duration) { return 0; } const match = duration.match(/^([\d.]+)s$/); return match ? parseFloat(match[1]) : 0; } /** * Calculate average confidence from results */ calculateAverageConfidence(results) { const confidences = results .map((r) => r.alternatives[0]?.confidence) .filter((c) => typeof c === "number"); if (confidences.length === 0) { return 0; } return confidences.reduce((sum, c) => sum + c, 0) / confidences.length; } /** * Get access token from service account credentials. * * M3: previously caught all errors and returned `""`, which then caused * a silent 401 from the Google API and a confusing downstream HTTP error * with no trace of the original auth failure. Now rethrows as STTError so * the caller sees the auth root cause. */ async getAccessToken() { try { const { GoogleAuth } = await import("google-auth-library"); const auth = new GoogleAuth({ ...(this.credentialsPath ? { keyFilename: this.credentialsPath } : {}), scopes: ["https://www.googleapis.com/auth/cloud-platform"], }); const client = await auth.getClient(); const tokenResponse = await client.getAccessToken(); const token = tokenResponse.token; if (!token) { throw STTError.transcriptionFailed("Google access token returned empty — check GOOGLE_APPLICATION_CREDENTIALS path and service account permissions", "google-stt"); } return token; } catch (err) { logger.error(`[GoogleSTTHandler] Failed to acquire access token: ${err instanceof Error ? err.message : String(err)}`); // Use instanceof — refactor-resilient and matches the pattern in // transcribe(). The earlier `err.name === "STTError"` check would // double-wrap if the base class ever overwrote `name`. if (err instanceof STTError) { throw err; } throw STTError.transcriptionFailed(`Google access token acquisition failed: ${err instanceof Error ? err.message : String(err)}`, "google-stt"); } } }