UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.

345 lines 13.9 kB
/**
 * Azure Cognitive Services Speech-to-Text Handler
 *
 * Implementation of STT using Azure Speech Services.
 *
 * @module voice/providers/AzureSTT
 */
import { logger } from "../../utils/logger.js";
import { STTError } from "../errors.js";

/** Provider identifier passed to the STTError factories and echoed in result metadata. */
const PROVIDER = "azure-stt";

/** Abort the HTTP request if `fetch` has not resolved within this many milliseconds. */
const REQUEST_TIMEOUT_MS = 30000;

/** Content-Type used when the requested format has no dedicated entry. */
const DEFAULT_CONTENT_TYPE = "audio/wav";

/**
 * Content-Type header per supported audio format.
 *
 * Note: MP3 is intentionally not in this map even though Azure won't reject
 * the Content-Type — the short-audio REST endpoint silently returns empty
 * text for MP3 bodies. See getSupportedFormats() for the supported list.
 */
const CONTENT_TYPES = Object.freeze({
  wav: "audio/wav; codecs=audio/pcm; samplerate=16000",
  ogg: "audio/ogg; codecs=opus",
  opus: "audio/ogg; codecs=opus",
});

/**
 * Error messages matching this pattern are treated as permanent (auth,
 * schema, 4xx) rather than transient (5xx, 429, network) by the streaming
 * path. Without this split, an expired API key would silently retry every
 * chunk for the entire stream.
 */
const PERMANENT_ERROR_PATTERN =
  /\b(401|403|404|Forbidden|Unauthorized|Invalid.*subscription|Invalid.*key|Wrong.*key|InvalidAudioFormat)\b/i;

/** `[code, name]` pairs advertised by getSupportedLanguages(). */
const SUPPORTED_LANGUAGES = Object.freeze([
  ["en-US", "English (US)"],
  ["en-GB", "English (UK)"],
  ["es-ES", "Spanish (Spain)"],
  ["es-MX", "Spanish (Mexico)"],
  ["fr-FR", "French"],
  ["de-DE", "German"],
  ["it-IT", "Italian"],
  ["pt-BR", "Portuguese (Brazil)"],
  ["ja-JP", "Japanese"],
  ["ko-KR", "Korean"],
  ["zh-CN", "Chinese (Simplified)"],
  ["hi-IN", "Hindi"],
  ["ar-SA", "Arabic"],
  ["ru-RU", "Russian"],
]);

/**
 * Extract a printable message from an arbitrary thrown value.
 *
 * @param {unknown} err - Value caught from a failed chunk transcription.
 * @returns {string} `err.message` for Error instances, `String(err)` otherwise.
 */
function describeError(err) {
  return err instanceof Error ? err.message : String(err);
}

/**
 * Azure Cognitive Services Speech-to-Text Handler
 *
 * Supports speech recognition with custom models and detailed output.
 *
 * @see https://docs.microsoft.com/azure/cognitive-services/speech-service/
 */
export class AzureSTT {
  /** Subscription key, or `null` when none was supplied or found in the environment. */
  apiKey;

  /** Azure region used to build the endpoint host name. */
  region;

  /**
   * Maximum audio duration in seconds (60s — Azure's REST API for short audio
   * documented limit on `/speech/recognition/conversation/cognitiveservices/v1`).
   * For longer audio, use Azure Batch Transcription (not yet implemented) or
   * pre-segment the input. This value is advertised only; nothing in this
   * class enforces it.
   */
  maxAudioDuration = 60;

  /**
   * Azure STT implementation buffers chunks via REST — not true streaming
   */
  supportsStreaming = false;

  /**
   * @param {string} [apiKey] - Subscription key; falls back to the
   *   `AZURE_SPEECH_KEY` environment variable. Blank values count as missing.
   * @param {string} [region] - Azure region; falls back to the
   *   `AZURE_SPEECH_REGION` environment variable, then to `"eastus"`.
   */
  constructor(apiKey, region) {
    const resolvedKey = (apiKey ?? process.env.AZURE_SPEECH_KEY ?? "").trim();
    this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;

    const resolvedRegion = (region ?? process.env.AZURE_SPEECH_REGION ?? "").trim();
    this.region = resolvedRegion.length > 0 ? resolvedRegion : "eastus";
  }

  /**
   * @returns {boolean} `true` when an API key and a non-empty region are set.
   */
  isConfigured() {
    return this.apiKey !== null && this.region.length > 0;
  }

  /**
   * @returns {string[]} Audio format identifiers accepted by this handler.
   */
  getSupportedFormats() {
    // Azure's "Speech-to-text REST API for short audio" only accepts uncompressed
    // PCM WAV (16kHz/16-bit/mono recommended) and Ogg/Opus. MP3 is NOT decoded
    // by this endpoint (it returns Success with empty text). For MP3 input use
    // the Batch Transcription API (not yet implemented) or convert to WAV first.
    return ["wav", "ogg", "opus"];
  }

  /**
   * List the languages this handler advertises. Azure supports 100+ languages;
   * this is a fixed subset. A fresh array of fresh objects is returned on each
   * call, so callers may mutate the result.
   *
   * @returns {Promise<Array<{code: string, name: string, supportsDiarization: boolean, supportsPunctuation: boolean}>>}
   */
  async getSupportedLanguages() {
    return SUPPORTED_LANGUAGES.map(([code, name]) => ({
      code,
      name,
      supportsDiarization: true,
      supportsPunctuation: true,
    }));
  }

  /**
   * Transcribe a single audio payload through the short-audio REST endpoint.
   *
   * @param {Buffer|Uint8Array|ArrayBuffer|number[]} audio - Audio bytes; anything
   *   that is not already a Buffer is converted with `Buffer.from`.
   * @param {object} [options] - Recognised fields: `language` (default
   *   `"en-US"`), `format` (default `"wav"`), `detailed`, `wordTimestamps`,
   *   `profanityMode`, `profanityFilter`, `customEndpointId`.
   * @returns {Promise<object>} Result with `text`, `confidence`, `language`,
   *   `metadata`, plus `duration` and optionally `words` on success. A
   *   `NoMatch` status yields empty text with confidence 0 instead of throwing.
   * @throws The error built by `STTError.providerNotConfigured` when no API key
   *   is set, by `STTError.audioEmpty` for zero-length audio, and by
   *   `STTError.transcriptionFailed` for timeouts, non-2xx responses,
   *   non-success recognition statuses, and any other failure.
   */
  async transcribe(audio, options = {}) {
    if (!this.apiKey) {
      throw STTError.providerNotConfigured(PROVIDER);
    }

    const audioBuffer = Buffer.isBuffer(audio) ? audio : Buffer.from(audio);
    if (audioBuffer.length === 0) {
      throw STTError.audioEmpty(PROVIDER);
    }

    const startTime = Date.now();

    try {
      // Build the URL with query parameters
      const params = new URLSearchParams();
      params.set("language", options.language ?? "en-US");

      // Word timings are only present in the detailed output format
      if (options.detailed || options.wordTimestamps) {
        params.set("format", "detailed");
      }

      // An explicit profanity mode wins over the generic boolean filter flag
      if (options.profanityMode) {
        params.set("profanity", options.profanityMode);
      } else if (options.profanityFilter) {
        params.set("profanity", "masked");
      }

      // Custom model endpoint, if provided
      if (options.customEndpointId) {
        params.set("cid", options.customEndpointId);
      }

      const baseUrl = `https://${this.region}.stt.speech.microsoft.com`;
      const url = `${baseUrl}/speech/recognition/conversation/cognitiveservices/v1?${params.toString()}`;

      const controller = new AbortController();
      const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);

      let response;
      try {
        response = await fetch(url, {
          method: "POST",
          headers: {
            "Ocp-Apim-Subscription-Key": this.apiKey,
            "Content-Type": this.getContentType(options.format ?? "wav"),
            Accept: "application/json",
          },
          body: new Uint8Array(audioBuffer),
          signal: controller.signal,
        });
      } catch (fetchErr) {
        if (fetchErr instanceof Error && fetchErr.name === "AbortError") {
          throw STTError.transcriptionFailed(
            `Azure STT request timed out after ${REQUEST_TIMEOUT_MS / 1000} seconds`,
            PROVIDER,
            fetchErr,
          );
        }
        throw fetchErr;
      } finally {
        clearTimeout(timeoutId);
      }

      if (!response.ok) {
        const errorText = await response.text();
        throw STTError.transcriptionFailed(`HTTP ${response.status}: ${errorText}`, PROVIDER);
      }

      const data = await response.json();
      const latency = Date.now() - startTime;
      const metadata = {
        latency,
        provider: PROVIDER,
        status: data.RecognitionStatus,
      };

      // Check recognition status
      if (data.RecognitionStatus !== "Success") {
        if (data.RecognitionStatus === "NoMatch") {
          // Nothing recognisable in the audio is reported as an empty result, not an error
          return {
            text: "",
            confidence: 0,
            language: options.language,
            metadata,
          };
        }
        throw STTError.transcriptionFailed(
          `Recognition failed: ${data.RecognitionStatus}`,
          PROVIDER,
        );
      }

      // Build result from NBest or DisplayText
      const result = {
        text: data.DisplayText ?? "",
        confidence: 0.9, // Default confidence if not available
        language: options.language,
        duration: this.ticksToSeconds(data.Duration ?? 0),
        metadata,
      };

      // Prefer the first NBest entry when present, but keep the values above
      // if that entry lacks a field so text/confidence never become undefined.
      const best = data.NBest?.[0];
      if (best) {
        result.text = best.Display ?? result.text;
        result.confidence = best.Confidence ?? result.confidence;

        // Add word timings
        if (best.Words && best.Words.length > 0) {
          result.words = best.Words.map((word) => ({
            word: word.Word,
            startTime: this.ticksToSeconds(word.Offset),
            endTime: this.ticksToSeconds(word.Offset + word.Duration),
            confidence: word.Confidence,
          }));
        }
      }

      logger.info(`[AzureSTTHandler] Transcribed audio in ${latency}ms`);
      return result;
    } catch (err) {
      // Errors already produced by the STTError factories pass through untouched
      if (err instanceof STTError) {
        throw err;
      }
      const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
      logger.error(`[AzureSTTHandler] Transcription failed: ${errorMessage}`);
      throw STTError.transcriptionFailed(
        errorMessage,
        PROVIDER,
        err instanceof Error ? err : undefined,
      );
    }
  }

  /**
   * Streaming transcription (placeholder - requires SDK)
   *
   * Azure streaming requires the Microsoft Speech SDK. For now, incoming
   * chunks are buffered and each ~5 second batch is sent through
   * {@link AzureSTT#transcribe}. A batch that fails with a transient error is
   * logged and skipped; a permanent error terminates the generator.
   *
   * @param {AsyncIterable<Buffer|Uint8Array>} audioStream - Source of audio chunks.
   * @param {object} [options] - Forwarded to `transcribe`; `sampleRate`
   *   (default 16000) sizes the buffering window.
   * @yields {{index: number, text: string, isFinal: boolean, confidence: number}}
   *   One item per transcribed batch; only the trailing partial batch is
   *   marked `isFinal: true`.
   */
  async *transcribeStream(audioStream, options = {}) {
    const chunks = [];
    let chunkIndex = 0;

    // Track buffered byte count incrementally — `chunks.reduce()` per incoming
    // chunk is O(n²) over long streams. Reset to 0 every time we flush.
    let bufferedBytes = 0;

    // Flush every ~5 seconds of audio. The factor of 2 is bytes per sample
    // (presumably 16-bit mono PCM — confirm against callers).
    const bytesPerSecond = (options.sampleRate ?? 16000) * 2;
    const flushThreshold = bytesPerSecond * 5;

    for await (const chunk of audioStream) {
      chunks.push(chunk);
      bufferedBytes += chunk.length;

      if (bufferedBytes < flushThreshold) {
        continue;
      }

      const audio = Buffer.concat(chunks);
      chunks.length = 0;
      bufferedBytes = 0;

      try {
        const result = await this.transcribe(audio, options);
        yield {
          index: chunkIndex++,
          text: result.text,
          isFinal: false,
          confidence: result.confidence,
        };
      } catch (err) {
        const msg = describeError(err);
        if (PERMANENT_ERROR_PATTERN.test(msg)) {
          logger.error(`[AzureSTTHandler] Permanent chunk error — terminating stream: ${msg}`);
          throw err;
        }
        logger.warn(`[AzureSTTHandler] Transient chunk failure (skipping): ${msg}`);
      }
    }

    // Process remaining audio
    if (chunks.length > 0) {
      const audio = Buffer.concat(chunks);
      try {
        const result = await this.transcribe(audio, options);
        yield {
          index: chunkIndex,
          text: result.text,
          isFinal: true,
          confidence: result.confidence,
        };
      } catch (err) {
        // Same permanent-vs-transient split as the chunk loop so auth/format
        // failures don't masquerade as a successful empty transcription on
        // short streams that never reach the flush threshold.
        const msg = describeError(err);
        if (PERMANENT_ERROR_PATTERN.test(msg)) {
          logger.error(`[AzureSTTHandler] Permanent final-chunk error — surfacing: ${msg}`);
          throw err;
        }
        logger.warn(`[AzureSTTHandler] Final chunk transcription failed (transient): ${msg}`);
      }
    }
  }

  /**
   * Get Content-Type header for audio format
   *
   * @param {string} format - Format identifier such as `"wav"`, `"ogg"` or `"opus"`.
   * @returns {string} The matching Content-Type, or `"audio/wav"` for any
   *   format without an entry.
   */
  getContentType(format) {
    // Own-property check: a plain `CONTENT_TYPES[format]` lookup would resolve
    // keys like "constructor" or "toString" to Object.prototype members and
    // hand a function to the Content-Type header.
    return Object.hasOwn(CONTENT_TYPES, format) ? CONTENT_TYPES[format] : DEFAULT_CONTENT_TYPE;
  }

  /**
   * Convert Azure ticks (100ns units) to seconds
   *
   * @param {number} ticks - Duration or offset in 100-nanosecond ticks.
   * @returns {number} The same span expressed in seconds.
   */
  ticksToSeconds(ticks) {
    return ticks / 10000000;
  }
}
//# sourceMappingURL=AzureSTT.js.map