@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications…
550 lines (549 loc) • 21.2 kB
JavaScript
/**
* Deepgram Speech-to-Text Handler
*
* Implementation of STT using Deepgram's Speech Recognition API.
*
* @module voice/providers/DeepgramSTT
*/
import { logger } from "../../utils/logger.js";
import { STTError } from "../errors.js";
/**
 * Deepgram Speech-to-Text Handler
 *
 * Supports real-time streaming, speaker diarization, and smart formatting.
 *
 * `transcribe()` POSTs a complete audio buffer to the REST `/listen` endpoint;
 * `transcribeStream()` pushes audio chunks over a WebSocket and yields the
 * transcript segments Deepgram sends back.
 *
 * @see https://developers.deepgram.com/docs
 */
export class DeepgramSTT {
    /** Resolved Deepgram API key, or `null` when none is available. */
    apiKey;
    /** Base URL of the REST API used by `transcribe()`. */
    baseUrl = "https://api.deepgram.com/v1";
    /**
     * Maximum audio duration in seconds (2 hours).
     * Informational only: nothing in this class reads or enforces it.
     */
    maxAudioDuration = 7200;
    /**
     * Deepgram supports streaming
     */
    supportsStreaming = true;
    /**
     * @param {string} [apiKey] Explicit API key. When nullish, falls back to
     *   the `DEEPGRAM_API_KEY` environment variable.
     */
    constructor(apiKey) {
        // Trim surrounding whitespace and collapse an empty key to null so
        // isConfigured() and the `!this.apiKey` guards in transcribe() /
        // transcribeStream() always agree.
        const resolvedKey = (apiKey ?? process.env.DEEPGRAM_API_KEY ?? "").trim();
        this.apiKey = resolvedKey.length > 0 ? resolvedKey : null;
    }
    /**
     * @returns {boolean} `true` when a non-empty API key was resolved.
     */
    isConfigured() {
        return this.apiKey !== null;
    }
    /**
     * @returns {string[]} Audio format identifiers this handler advertises.
     */
    getSupportedFormats() {
        return ["mp3", "wav", "ogg", "opus"];
    }
    /**
     * List the languages this handler reports. Every entry is flagged as
     * supporting diarization and punctuation.
     *
     * @returns {Promise<Array<{code: string, name: string, supportsDiarization: boolean, supportsPunctuation: boolean}>>}
     */
    async getSupportedLanguages() {
        // Deepgram supports 40+ languages; only this subset is reported here.
        const languages = [
            ["en", "English"],
            ["en-US", "English (US)"],
            ["en-GB", "English (UK)"],
            ["es", "Spanish"],
            ["fr", "French"],
            ["de", "German"],
            ["it", "Italian"],
            ["pt", "Portuguese"],
            ["nl", "Dutch"],
            ["ja", "Japanese"],
            ["ko", "Korean"],
            ["zh", "Chinese"],
            ["hi", "Hindi"],
            ["ru", "Russian"],
        ];
        return languages.map(([code, name]) => ({
            code,
            name,
            supportsDiarization: true,
            supportsPunctuation: true,
        }));
    }
    /**
     * Transcribe a complete audio payload with a single POST to `/listen`.
     *
     * @param {Buffer|Uint8Array|ArrayBuffer|string} audio Audio bytes; anything
     *   that is not already a Buffer is passed through `Buffer.from()`.
     * @param {object} [options] Transcription options. Fields read here:
     *   `model`, `language`, `punctuation`, `speakerDiarization`,
     *   `speakerCount`, `smartFormat`, `utterances`, `uttSplit`, `paragraphs`,
     *   `fillerWords`, `keywords`, `keywordBoost`, `redact`,
     *   `profanityFilter`, `format`.
     * @returns {Promise<object>} Result with `text`, `confidence`, `language`,
     *   `duration`, `metadata`, and — when present in the response — `words`,
     *   `speakers` and `segments`.
     * @throws {STTError} When no API key is configured, the audio is missing or
     *   empty, the request times out, or the request/response handling fails.
     */
    async transcribe(audio, options = {}) {
        if (!this.apiKey) {
            throw STTError.providerNotConfigured("deepgram");
        }
        // Buffer.from(null | undefined) throws a bare TypeError; report missing
        // audio with the same error as zero-length audio instead.
        if (audio === null || audio === undefined) {
            throw STTError.audioEmpty("deepgram");
        }
        const audioBuffer = Buffer.isBuffer(audio) ? audio : Buffer.from(audio);
        if (audioBuffer.length === 0) {
            throw STTError.audioEmpty("deepgram");
        }
        const model = options.model ?? "nova-2";
        const startTime = Date.now();
        try {
            // Build query parameters (insertion order is the order in the URL).
            const params = new URLSearchParams();
            params.set("model", model);
            if (options.language) {
                params.set("language", options.language);
            }
            // Punctuation is on unless explicitly disabled.
            if (options.punctuation !== false) {
                params.set("punctuate", "true");
            }
            if (options.speakerDiarization) {
                params.set("diarize", "true");
                if (options.speakerCount) {
                    params.set("diarize_version", "latest");
                }
            }
            if (options.smartFormat) {
                params.set("smart_format", "true");
            }
            if (options.utterances) {
                params.set("utterances", "true");
                if (options.uttSplit !== undefined) {
                    params.set("utt_split", options.uttSplit.toString());
                }
            }
            if (options.paragraphs) {
                params.set("paragraphs", "true");
            }
            if (options.fillerWords) {
                params.set("filler_words", "true");
            }
            if (options.keywords && options.keywords.length > 0) {
                for (const keyword of options.keywords) {
                    params.append("keywords", keyword);
                }
                if (options.keywordBoost) {
                    params.set("keyword_boost", options.keywordBoost);
                }
            }
            if (options.redact && options.redact.length > 0) {
                for (const redactType of options.redact) {
                    params.append("redact", redactType);
                }
            }
            if (options.profanityFilter) {
                params.set("profanity_filter", "true");
            }
            const url = `${this.baseUrl}/listen?${params.toString()}`;
            // Abort the request if fetch() has not settled within 30 seconds.
            const controller = new AbortController();
            const timeoutId = setTimeout(() => controller.abort(), 30000);
            let response;
            try {
                response = await fetch(url, {
                    method: "POST",
                    headers: {
                        Authorization: `Token ${this.apiKey}`,
                        "Content-Type": this.getMimeType(options.format ?? "wav"),
                    },
                    body: new Uint8Array(audioBuffer),
                    signal: controller.signal,
                });
            }
            catch (fetchErr) {
                if (fetchErr instanceof Error && fetchErr.name === "AbortError") {
                    throw STTError.transcriptionFailed("Deepgram STT request timed out after 30 seconds", "deepgram", fetchErr);
                }
                throw fetchErr;
            }
            finally {
                clearTimeout(timeoutId);
            }
            if (!response.ok) {
                // The error body may be missing, unparsable, or literally
                // `null`; in all of those cases fall back to the HTTP status.
                const errorData = await response
                    .json()
                    .catch(() => Object.create(null));
                const errorMessage = errorData?.err_msg ||
                    `HTTP ${response.status}`;
                throw STTError.transcriptionFailed(errorMessage, "deepgram");
            }
            const data = (await response.json());
            const latency = Date.now() - startTime;
            const firstAlternative = data.results?.channels?.[0]?.alternatives?.[0];
            // No channel or no alternative: return an empty transcript.
            if (!firstAlternative) {
                return {
                    text: "",
                    confidence: 0,
                    language: options.language,
                    duration: data.metadata?.duration,
                    metadata: {
                        latency,
                        provider: "deepgram",
                        requestId: data.metadata?.request_id,
                    },
                };
            }
            const result = {
                text: firstAlternative.transcript,
                confidence: firstAlternative.confidence,
                language: options.language,
                duration: data.metadata?.duration,
                metadata: {
                    latency,
                    provider: "deepgram",
                    model,
                    requestId: data.metadata?.request_id,
                },
            };
            // Word timings, plus the distinct speaker labels seen across them.
            if (firstAlternative.words && firstAlternative.words.length > 0) {
                const speakers = new Set();
                result.words = firstAlternative.words.map((word) => {
                    const wordTiming = {
                        word: word.punctuated_word ?? word.word,
                        startTime: word.start,
                        endTime: word.end,
                        confidence: word.confidence,
                    };
                    if (word.speaker !== undefined) {
                        wordTiming.speaker = `Speaker ${word.speaker}`;
                        speakers.add(wordTiming.speaker);
                    }
                    return wordTiming;
                });
                if (speakers.size > 0) {
                    result.speakers = Array.from(speakers);
                }
            }
            // Utterances from the response are exposed as final segments.
            if (data.results.utterances && data.results.utterances.length > 0) {
                result.segments = data.results.utterances.map((utt, index) => ({
                    index,
                    text: utt.transcript,
                    isFinal: true,
                    confidence: utt.confidence,
                    startTime: utt.start,
                    endTime: utt.end,
                    speaker: utt.speaker !== undefined ? `Speaker ${utt.speaker}` : undefined,
                }));
            }
            logger.info(`[DeepgramSTTHandler] Transcribed ${data.metadata?.duration?.toFixed(1) ?? "?"}s audio in ${latency}ms`);
            return result;
        }
        catch (err) {
            // STTErrors raised above already carry the right shape.
            if (err instanceof STTError) {
                throw err;
            }
            const errorMessage = err instanceof Error ? err.message : String(err || "Unknown error");
            logger.error(`[DeepgramSTTHandler] Transcription failed: ${errorMessage}`);
            throw STTError.transcriptionFailed(errorMessage, "deepgram", err instanceof Error ? err : undefined);
        }
    }
    /**
     * Streaming transcription using WebSocket
     *
     * Audio chunks pulled from `audioStream` are sent over the socket in the
     * background while this generator yields `{ index, text, isFinal,
     * confidence }` segments built from incoming "Results" messages.
     *
     * @param {AsyncIterable<*>} audioStream Source of audio chunks; each chunk
     *   is passed to `ws.send()` unchanged.
     * @param {object} [options] Fields read here: `model`, `language`,
     *   `punctuation`, `speakerDiarization`, `smartFormat`.
     * @yields {{index: number, text: string, isFinal: boolean, confidence: number}}
     * @throws {STTError} When no API key is configured, the connection times
     *   out after 10 seconds, or the socket / audio source reports an error.
     *   A connection error raised before the socket opens is rethrown as-is.
     */
    async *transcribeStream(audioStream, options = {}) {
        if (!this.apiKey) {
            throw STTError.providerNotConfigured("deepgram");
        }
        // Build query parameters
        const params = new URLSearchParams();
        params.set("model", options.model ?? "nova-2");
        if (options.language) {
            params.set("language", options.language);
        }
        if (options.punctuation !== false) {
            params.set("punctuate", "true");
        }
        if (options.speakerDiarization) {
            params.set("diarize", "true");
        }
        if (options.smartFormat) {
            params.set("smart_format", "true");
        }
        // Ask for interim (non-final) results as well.
        params.set("interim_results", "true");
        const wsUrl = `wss://api.deepgram.com/v1/listen?${params.toString()}`;
        // `ws` is loaded lazily, only when streaming is actually used.
        const WebSocket = (await import("ws")).default;
        const ws = new WebSocket(wsUrl, {
            headers: {
                Authorization: `Token ${this.apiKey}`,
            },
        });
        let segmentIndex = 0;
        // Segments that arrived while the consumer was not waiting.
        const messageQueue = [];
        // Resolver of the promise the consumer loop is currently awaiting.
        let resolveNext = null;
        let done = false;
        let error = null;
        const endOfStream = { value: undefined, done: true };
        // The stored error may be a non-Error value thrown by the audio source.
        const describeError = (e) => (e instanceof Error ? e.message : String(e));
        // Hand `result` to a waiting consumer, if any; reports whether it did.
        const settlePending = (result) => {
            if (!resolveNext) {
                return false;
            }
            const resolve = resolveNext;
            resolveNext = null;
            resolve(result);
            return true;
        };
        // The three long-lived handlers are named so the connection-timeout
        // cleanup can detach exactly these references with ws.off().
        const onMessage = (data) => {
            try {
                const response = JSON.parse(data.toString());
                if (response.type === "Results" && response.channel?.alternatives) {
                    const alt = response.channel.alternatives[0];
                    // Messages with an empty transcript are skipped.
                    if (alt && alt.transcript) {
                        const segment = {
                            index: segmentIndex++,
                            text: alt.transcript,
                            isFinal: response.is_final ?? false,
                            confidence: alt.confidence ?? 0,
                        };
                        if (!settlePending({ value: segment, done: false })) {
                            messageQueue.push(segment);
                        }
                    }
                }
            }
            catch {
                logger.warn(`[DeepgramSTTHandler] Failed to parse WebSocket message`);
            }
        };
        const onError = (err) => {
            error = err;
            settlePending(endOfStream);
        };
        const onClose = () => {
            done = true;
            settlePending(endOfStream);
        };
        ws.on("message", onMessage);
        ws.on("error", onError);
        ws.on("close", onClose);
        // Wait for connection (10-second timeout to avoid hanging indefinitely)
        await new Promise((resolve, reject) => {
            const openHandler = () => {
                clearTimeout(connectionTimeout);
                ws.off("error", openErrorHandler);
                resolve();
            };
            const openErrorHandler = (err) => {
                clearTimeout(connectionTimeout);
                ws.off("open", openHandler);
                reject(err);
            };
            const connectionTimeout = setTimeout(() => {
                // Detach only the handlers attached here, by reference, so any
                // other listener on this socket is left alone.
                ws.off("message", onMessage);
                ws.off("error", onError);
                ws.off("close", onClose);
                ws.off("open", openHandler);
                ws.off("error", openErrorHandler);
                ws.terminate();
                reject(STTError.streamError("WebSocket connection to Deepgram timed out after 10 seconds", "deepgram"));
            }, 10000);
            ws.on("open", openHandler);
            ws.on("error", openErrorHandler);
        });
        // Pump audio chunks into the socket.
        const sendAudio = async () => {
            try {
                for await (const chunk of audioStream) {
                    if (ws.readyState === WebSocket.OPEN) {
                        ws.send(chunk);
                    }
                }
            }
            catch (sendError) {
                logger.error(`[DeepgramSTTHandler] Error sending audio: ${sendError instanceof Error ? sendError.message : String(sendError)}`);
                // Surface the error so the generator loop can exit instead of hanging.
                error = sendError;
                settlePending(endOfStream);
            }
            finally {
                // Always send CloseStream so Deepgram closes the WS even on send error;
                // otherwise `done` is never set and the generator hangs.
                if (ws.readyState === WebSocket.OPEN) {
                    try {
                        ws.send(JSON.stringify({ type: "CloseStream" }));
                    }
                    catch {
                        /* WS already broken */
                    }
                }
            }
        };
        // Fire-and-forget: the pump runs in the background; the .catch logs
        // any rejection so it does not become an unhandled rejection.
        void sendAudio().catch((err) => {
            logger.error(`[DeepgramSTTHandler] sendAudio rejected: ${err instanceof Error ? err.message : String(err)}`);
        });
        // Ask the audio source to stop producing once this generator is torn
        // down, so an endless/live producer is not left running.
        const stopProducerEarly = () => {
            const ret = audioStream
                .return;
            if (typeof ret === "function") {
                try {
                    void Promise.resolve(ret.call(audioStream)).catch(() => undefined);
                }
                catch {
                    // Best-effort — ignore if the iterator's return() throws.
                }
            }
        };
        // Yield segments. The try/finally guarantees socket teardown even when
        // the consumer leaves its for-await loop early.
        try {
            while (!done) {
                if (error) {
                    throw STTError.streamError(describeError(error), "deepgram");
                }
                if (messageQueue.length > 0) {
                    const next = messageQueue.shift();
                    if (next !== undefined) {
                        yield next;
                    }
                }
                else {
                    // Park until a handler delivers a segment or end-of-stream.
                    const result = await new Promise((resolve) => {
                        resolveNext = resolve;
                    });
                    if (!result.done && result.value) {
                        yield result.value;
                    }
                }
            }
            // "close" can be emitted right after "error", before this generator
            // resumes; `done` is then already true and the in-loop check never
            // runs. Re-check here so the failure is not silently dropped.
            if (error) {
                throw STTError.streamError(describeError(error), "deepgram");
            }
            // Yield remaining messages
            while (messageQueue.length > 0) {
                const next = messageQueue.shift();
                if (next !== undefined) {
                    yield next;
                }
            }
        }
        finally {
            // sendAudio() is the only consumer of audioStream, so once we are
            // tearing down it should not be pulling more chunks.
            stopProducerEarly();
            // An open socket gets Deepgram's CloseStream message followed by a
            // normal close(); a socket still connecting or closing is terminated.
            if (ws.readyState === WebSocket.OPEN) {
                try {
                    ws.send(JSON.stringify({ type: "CloseStream" }));
                }
                catch {
                    // Ignore — socket may have been closed by the server
                }
                ws.close();
            }
            else if (ws.readyState === WebSocket.CONNECTING ||
                ws.readyState === WebSocket.CLOSING) {
                ws.terminate();
            }
        }
    }
    /**
     * Get MIME type for audio format
     *
     * @param {string} format Format identifier such as "mp3"; matched
     *   case-insensitively and ignoring surrounding whitespace.
     * @returns {string} The matching MIME type, or "audio/wav" for anything
     *   unrecognised (including non-string input).
     */
    getMimeType(format) {
        // A Map has no inherited keys, so names like "constructor" or
        // "toString" fall through to the default instead of resolving to
        // Object.prototype members.
        const mimeTypes = new Map([
            ["mp3", "audio/mpeg"],
            ["wav", "audio/wav"],
            ["ogg", "audio/ogg"],
            ["opus", "audio/opus"],
        ]);
        const key = typeof format === "string" ? format.trim().toLowerCase() : "";
        return mimeTypes.get(key) ?? "audio/wav";
    }
}