@pompeii-labs/audio
'use strict';

var sdk = require('@deepgram/sdk');
var hume = require('hume');
var OpenAI = require('openai');

function _interopDefault(e) {
  return e && e.__esModule ? e : { default: e };
}

var OpenAI__default = /*#__PURE__*/ _interopDefault(OpenAI);

// src/helpers/bufferToInt16Array.ts
// Reinterprets a Buffer's bytes as 16-bit signed PCM samples (no copy).
function bufferToInt16Array(buffer) {
  return new Int16Array(buffer.buffer, buffer.byteOffset, buffer.byteLength / 2);
}

// src/decoders/mulaw.ts
// Expands 8-bit mu-law bytes to 16-bit linear PCM.
function mulawToPcm16(mulawData) {
  const pcmData = new Int16Array(mulawData.length);
  for (let i = 0; i < mulawData.length; i++) {
    pcmData[i] = mulawToLinear(mulawData[i]);
  }
  return pcmData;
}

// Standard G.711 mu-law expansion: invert the byte, rebuild the biased
// magnitude from the segment/step fields, then remove the bias (132). This
// is the exact inverse of encodeSample below.
function mulawToLinear(mulawByte) {
  const inverted = mulawByte ^ 0xff;
  const sign = inverted & 0x80;
  const segment = (inverted & 0x70) >> 4;
  const step = inverted & 0x0f;
  const linear = (((step << 3) + 132) << segment) - 132;
  return sign ? -linear : linear;
}

// src/encoders/mulaw.ts
var BIAS = 132;   // 0x84, added before the segment search
var CLIP = 32635; // max magnitude representable after biasing

// Maps the top byte of the biased sample to its mu-law segment (exponent).
var encodeTable = [
  0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
];

// Compresses one 16-bit linear sample to an 8-bit mu-law byte.
function encodeSample(sample) {
  const sign = (sample >> 8) & 0x80;
  if (sign !== 0) sample = -sample;
  sample = sample + BIAS;
  if (sample > CLIP) sample = CLIP;
  const exponent = encodeTable[(sample >> 7) & 0xff];
  const mantissa = (sample >> (exponent + 3)) & 0x0f;
  return ~(sign | (exponent << 4) | mantissa);
}

function pcm16ToMulaw(pcmData) {
  const mulawData = new Uint8Array(pcmData.length);
  for (let i = 0; i < pcmData.length; i++) {
    mulawData[i] = encodeSample(pcmData[i]);
  }
  return mulawData;
}

// src/helpers/int16ArrayToBuffer.ts
function int16ArrayToBuffer(int16Array) {
  return Buffer.from(int16Array.buffer, int16Array.byteOffset, int16Array.byteLength);
}

// src/helpers/convertAudioFormat.ts
function encodePcm(audio, encoding) {
  switch (encoding) {
    case "mulaw":
      return Buffer.from(pcm16ToMulaw(audio));
    case "pcm":
      return int16ArrayToBuffer(audio);
    default:
      throw new Error(`Could not encode audio: Unsupported encoding: ${encoding}`);
  }
}

function decodeToPcm(audio, encoding) {
  switch (encoding) {
    case "mulaw":
      return mulawToPcm16(audio);
    case "pcm":
      return bufferToInt16Array(audio);
    default:
      throw new Error(`Could not decode audio: Unsupported encoding: ${encoding}`);
  }
}

// src/helpers/generateFadeOutSamples.ts
// Builds a short linear ramp from the last sample value down to silence,
// used to avoid an audible click when playback stops.
function generateFadeOutSamples(lastSampleValue, fadeDurationMs, sampleRate) {
  const fadeNumSamples = Math.ceil((fadeDurationMs / 1000) * sampleRate);
  const fadeSamples = new Int16Array(fadeNumSamples);
  const lastIndex = Math.max(1, fadeNumSamples - 1); // avoid 0/0 for a one-sample fade
  for (let i = 0; i < fadeNumSamples; i++) {
    const progress = 1 - i / lastIndex;
    fadeSamples[i] = Math.round(lastSampleValue * progress);
  }
  return new Uint8Array(fadeSamples.buffer);
}
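// Illustrative round trip through the codec above (a sketch, not part of the
// published bundle; the sample values are made up). Expect small quantization
// error on decode, which is inherent to 8-bit mu-law:
//
//   const pcm = new Int16Array([0, 988, -988, 30000]);
//   const mulaw = pcm16ToMulaw(pcm);      // Uint8Array, 1 byte per sample
//   const decoded = mulawToPcm16(mulaw);  // Int16Array, approximately the input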
// src/helpers/resamplePcm.ts
// Linear-interpolation resampler. When downsampling, a low-pass filter is
// applied first to limit aliasing.
function resamplePcm(pcm, originalSampleRate, targetSampleRate) {
  if (originalSampleRate === targetSampleRate) {
    return pcm;
  }
  const ratio = originalSampleRate / targetSampleRate;
  const newLength = Math.floor(pcm.length / ratio);
  const newSamples = new Int16Array(newLength);
  if (ratio < 1) {
    // Upsampling: plain linear interpolation between neighboring samples.
    for (let i = 0; i < newSamples.length; i++) {
      const exactPos = i * ratio;
      const lowerIndex = Math.floor(exactPos);
      const upperIndex = Math.min(lowerIndex + 1, pcm.length - 1);
      const fraction = exactPos - lowerIndex;
      const lowerSample = pcm[lowerIndex];
      const upperSample = pcm[upperIndex];
      newSamples[i] = Math.round(lowerSample + (upperSample - lowerSample) * fraction);
    }
    return newSamples;
  }
  // Downsampling: filter below the target Nyquist frequency, then interpolate.
  const nyquistFreq = targetSampleRate / 2;
  const cutoffFreq = nyquistFreq * 0.9;
  const filteredPcm = applyLowPassFilter(pcm, originalSampleRate, cutoffFreq);
  for (let i = 0; i < newSamples.length; i++) {
    const exactPos = i * ratio;
    const lowerIndex = Math.floor(exactPos);
    const upperIndex = Math.min(lowerIndex + 1, filteredPcm.length - 1);
    const fraction = exactPos - lowerIndex;
    const lowerSample = filteredPcm[lowerIndex];
    const upperSample = filteredPcm[upperIndex];
    newSamples[i] = Math.round(lowerSample + (upperSample - lowerSample) * fraction);
  }
  return newSamples;
}

// Simple moving-average low-pass filter; the window grows as the cutoff
// frequency drops relative to the sample rate.
function applyLowPassFilter(pcm, sampleRate, cutoffFreq) {
  const filterOrder = Math.max(3, Math.floor(sampleRate / (cutoffFreq * 4)));
  const filtered = new Int16Array(pcm.length);
  for (let i = 0; i < pcm.length; i++) {
    let sum = 0;
    let count = 0;
    for (let j = Math.max(0, i - filterOrder); j <= Math.min(pcm.length - 1, i + filterOrder); j++) {
      sum += pcm[j];
      count++;
    }
    filtered[i] = Math.round(sum / count);
  }
  return filtered;
}

// src/voice/helpers.ts
// Splits text into sentence-aligned chunks of at least targetLength
// characters; any trailing partial sentence is left for the caller to buffer.
function splitTextIntoChunks(text, targetLength = 100) {
  const endOfSentencePunctuation = [".", "!", "?"];
  const sentences = [];
  for (let i = targetLength; i < text.length; i++) {
    if (
      endOfSentencePunctuation.includes(text[i]) &&
      (i === text.length - 1 || text[i + 1] === " " || text[i + 1] === "\n")
    ) {
      sentences.push(text.slice(0, i + 1));
      text = text.slice(i + 1);
      i = targetLength - 1; // resume the scan at targetLength after i++
    }
  }
  return sentences;
}
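// Illustrative sketch: bringing 8 kHz telephony PCM up to the pipeline rate
// and back (buffer sizes are made up):
//
//   const pcm8k = new Int16Array(160);               // 20 ms at 8 kHz
//   const pcm48k = resamplePcm(pcm8k, 8000, 48000);  // 960 samples, interpolated
//   const back = resamplePcm(pcm48k, 48000, 8000);   // low-pass filtered first
//
//   splitTextIntoChunks("First sentence here. And a second one!", 10);
//   // -> ["First sentence here.", " And a second one!"]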
// src/voice/client.ts
// All internal processing happens at 48 kHz; input and output are converted
// at the edges.
var uniformSampleRate = 48e3;

var MagmaFlow = class {
  stt;
  tts;
  inputFormat;
  outputFormat;
  onAudioOutput;
  textBuffer = "";
  textQueue = [];
  generatingAudio = false;
  currentRequestId = null;
  audioBuffer = [];
  lastChunk = null;
  config = {
    pauseDurationMs: 500,
    sentenceChunkLength: 50
  };

  constructor(args) {
    this.stt = args.stt;
    this.tts = args.tts;
    this.inputFormat = args.inputFormat;
    this.outputFormat = args.outputFormat;
    this.onAudioOutput = args.onAudioOutput;
    this.config = { ...this.config, ...args.config };
    this.tts.onOutput = (audio, requestId) => {
      if (this.currentRequestId !== requestId) {
        console.log("[MagmaFlow] Skipping output for cancelled request");
        return;
      }
      if (!audio) {
        // A null chunk marks the end of a TTS request: append a short
        // fade-out, flush, and move on to the next queued text chunk.
        if (this.lastChunk) {
          const lastChunkSamples = bufferToInt16Array(this.lastChunk);
          const lastSampleValue = lastChunkSamples[lastChunkSamples.length - 1];
          this.audioBuffer.push(
            Buffer.from(
              generateFadeOutSamples(
                lastSampleValue,
                this.config.pauseDurationMs ?? 500,
                uniformSampleRate
              )
            )
          );
        }
        this.sendAudio();
        this.generatingAudio = false;
        this.lastChunk = null;
        this.generateAudio();
        return;
      }
      this.audioBuffer.push(audio);
      this.lastChunk = audio;
      // Only flush when the buffered byte count lands on a whole-frame boundary.
      if (
        this.audioBuffer.reduce((acc, curr) => acc + curr.length, 0) %
          (2 * this.outputFormat.channels) === 0
      ) {
        this.sendAudio();
      }
    };
    this.stt.onOutput = args.onTranscription;
    this.stt.onSpeechDetected = args.onSpeechDetected;
  }

  inputAudio(audio) {
    const decodedAudio = decodeToPcm(audio, this.inputFormat.encoding);
    const resampledPCM = resamplePcm(decodedAudio, this.inputFormat.sampleRate, uniformSampleRate);
    this.stt.input(int16ArrayToBuffer(resampledPCM));
  }

  // Pass text tokens as they stream in; pass null/undefined to flush whatever
  // remains in the buffer.
  inputText(text) {
    if (text === void 0 || text === null) {
      if (this.textBuffer.length === 0) return;
      this.textQueue.push(this.textBuffer);
      this.textBuffer = "";
      this.generateAudio();
      return;
    }
    this.textBuffer += text;
    const chunks = splitTextIntoChunks(this.textBuffer, this.config.sentenceChunkLength ?? 50);
    for (const chunk of chunks) {
      if (chunk.length === 0) continue;
      this.textQueue.push(chunk);
      this.textBuffer = this.textBuffer.slice(chunk.length);
      this.generateAudio();
    }
  }

  generateAudio() {
    if (this.generatingAudio) return;
    const chunk = this.textQueue.shift();
    if (!chunk) return;
    this.generatingAudio = true;
    if (!this.currentRequestId) {
      this.currentRequestId = Math.random().toString(36).substring(2, 15);
    }
    this.tts.input(chunk, this.currentRequestId);
  }

  sendAudio() {
    if (this.audioBuffer.length === 0) return;
    const concatenatedBuffer = Buffer.concat(this.audioBuffer);
    this.audioBuffer = [];
    const resampledPCM = resamplePcm(
      bufferToInt16Array(concatenatedBuffer),
      uniformSampleRate,
      this.outputFormat.sampleRate
    );
    const encodedAudio = encodePcm(resampledPCM, this.outputFormat.encoding);
    try {
      this.onAudioOutput(encodedAudio);
    } catch (error) {
      console.error("Audio output callback error:", error);
    }
  }

  // Drops all queued text and audio, e.g. when the user barges in.
  interruptTTS() {
    this.textQueue = [];
    this.textBuffer = "";
    this.audioBuffer = [];
    this.generatingAudio = false;
    this.currentRequestId = null;
  }

  kill() {
    this.stt.kill();
    this.tts.kill();
    this.audioBuffer = [];
    this.textQueue = [];
    this.textBuffer = "";
    this.generatingAudio = false;
  }
};

// src/voice/speechToText/base.ts
var MagmaFlowSpeechToText = class {
  onSpeechDetected() {
    console.log(`[Default STT] Speech detected`);
  }
  onOutput(output) {
    console.log(`[Default STT] Output: ${JSON.stringify(output)}`);
  }
  constructor() {}
};

var kKeepAliveInterval = 5e3;

var DeepgramModel = /* @__PURE__ */ ((DeepgramModel2) => {
  DeepgramModel2["NOVA_3"] = "nova-3";
  return DeepgramModel2;
})(DeepgramModel || {});

var DeepgramLanguage = /* @__PURE__ */ ((DeepgramLanguage2) => {
  DeepgramLanguage2["EN_US"] = "en-US";
  return DeepgramLanguage2;
})(DeepgramLanguage || {});
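// Illustrative wiring sketch for the client above (assumes 8 kHz mu-law
// telephony on both ends, e.g. a phone media stream; `socket` and
// `handleCaller` are hypothetical):
//
//   const flow = new MagmaFlow({
//     stt: new DeepgramSTT({ model: DeepgramModel.NOVA_3 }),
//     tts: new DeepgramTTS({}),
//     inputFormat: { encoding: "mulaw", sampleRate: 8000 },
//     outputFormat: { encoding: "mulaw", sampleRate: 8000, channels: 1 },
//     onAudioOutput: (audio) => socket.send(audio),
//     onTranscription: (output) => handleCaller(output.text),
//     onSpeechDetected: () => flow.interruptTTS(), // barge-in
//   });
//
//   flow.inputAudio(callerAudioChunk); // caller audio -> STT
//   flow.inputText("Hello there!");    // streamed LLM text -> TTS
//   flow.inputText(null);              // flush the remaining buffer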
var DeepgramSTT = class extends MagmaFlowSpeechToText {
  client;
  connection = null;
  config;
  turnBuffer = [];
  utteranceEnded = false;

  constructor(args) {
    super();
    this.config = {
      model: args.model,
      vad_events: true,
      interim_results: true,
      encoding: "linear16",
      sample_rate: 48e3,
      channels: 1,
      utterance_end_ms: 1e3,
      ...args.config
    };
    this.client = args.client ?? new sdk.DeepgramClient({ key: process.env.DEEPGRAM_API_KEY });
  }

  setup() {
    this.connection = this.client.listen.live(this.config);
    this.connection.on(sdk.LiveTranscriptionEvents.Error, (event) => {
      console.log(`[Deepgram] Error: ${JSON.stringify(event)}`);
    });
    this.connection.on(sdk.LiveTranscriptionEvents.Close, (event) => {
      console.log(`[Deepgram] Close: ${JSON.stringify(event)}`);
    });
    this.connection.on(sdk.LiveTranscriptionEvents.Open, this.onOpen.bind(this));
    this.connection.on(sdk.LiveTranscriptionEvents.Unhandled, (event) => {
      console.log(`[Deepgram] Unhandled event: ${JSON.stringify(event)}`);
    });
    this.connection.on(
      sdk.LiveTranscriptionEvents.Transcript,
      this.handleTranscriptionEvent.bind(this)
    );
    this.connection.on(sdk.LiveTranscriptionEvents.UtteranceEnd, (event) => {
      console.log(`[Deepgram] Utterance end: ${JSON.stringify(event)}`);
      this.handleUtteranceEnd();
    });
  }

  // Lazily opens the live connection on first input.
  input(audio) {
    if (!this.connection) {
      this.setup();
      return this.input(audio);
    }
    this.connection?.send(audio.buffer);
  }

  flush() {
    this.connection?.finalize();
  }

  kill() {
    this.connection?.requestClose();
    this.connection = null;
  }

  handleTranscriptionEvent(transcriptionEvent) {
    const transcriptOption = transcriptionEvent.channel.alternatives[0];
    if (transcriptOption.transcript.trim() === "") {
      return;
    }
    this.onSpeechDetected();
    if (transcriptionEvent.speech_final) {
      this.utteranceEnded = false;
    }
    if (
      transcriptionEvent.is_final ||
      transcriptionEvent.speech_final ||
      transcriptionEvent.from_finalize
    ) {
      const turns = this.computeTurns(transcriptOption.words);
      this.turnBuffer = this.turnBuffer.concat(turns);
      if (transcriptionEvent.speech_final) {
        this.sendOutput();
      }
    }
  }

  handleUtteranceEnd() {
    this.utteranceEnded = true;
    this.sendOutput();
  }

  sendOutput() {
    if (!this.utteranceEnded) {
      return;
    }
    if (this.turnBuffer.length === 0) {
      return;
    }
    const text = this.turnBuffer.map((turn) => turn.text).join(" ");
    let turns = void 0;
    // Only report per-speaker turns when diarization labeled every turn.
    if (this.turnBuffer.every((turn) => turn.speaker !== void 0 && turn.speaker !== null)) {
      turns = this.turnBuffer.reduce((acc, turn) => {
        if (acc.at(-1)?.speaker === turn.speaker) {
          acc.at(-1).text += ` ${turn.text}`;
        } else {
          acc.push(turn);
        }
        return acc;
      }, []);
    }
    this.onOutput({ text, turns });
    this.turnBuffer = [];
    this.utteranceEnded = false;
  }

  onOpen() {
    console.log(`[Deepgram] Connected`);
    this.keepAlive();
  }

  keepAlive() {
    setTimeout(() => {
      if (this.connection?.isConnected()) {
        this.connection.keepAlive();
        this.keepAlive();
      }
    }, kKeepAliveInterval);
  }

  // Groups word-level results into speaker turns and annotates low-confidence
  // spans.
  computeTurns(words) {
    try {
      const turns = [];
      let currentTurn = null;
      let currentTurnConfidence = 0;
      let currentTurnWordCount = 0;
      for (const word of words) {
        const speaker = word.speaker;
        const utterance = word.punctuated_word || word.word;
        if (currentTurn && currentTurn.speaker === speaker) {
          currentTurn.text += ` ${utterance}`;
          currentTurnConfidence += word.confidence;
          currentTurnWordCount++;
        } else {
          if (currentTurn) {
            currentTurn.confidence = currentTurnConfidence / currentTurnWordCount;
            if (currentTurn.confidence < 0.5) {
              currentTurn.text = "[inaudible]";
            } else if (currentTurn.confidence < 0.75) {
              currentTurn.text = `[unclear, confidence=${currentTurn.confidence.toFixed(2)}] ${currentTurn.text}`;
            }
            turns.push(currentTurn);
          }
          currentTurn = { speaker, text: utterance, confidence: 0 };
          currentTurnConfidence = word.confidence;
          currentTurnWordCount = 1;
        }
      }
      if (currentTurn) {
        currentTurn.confidence = currentTurnConfidence / currentTurnWordCount;
        turns.push(currentTurn);
      }
      return turns;
    } catch (error) {
      console.error(error);
      return [];
    }
  }
};
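// Illustrative standalone use of the transcriber above (assumes
// DEEPGRAM_API_KEY is set; audio must match the default config: 48 kHz,
// 16-bit linear PCM, mono):
//
//   const stt = new DeepgramSTT({ model: DeepgramModel.NOVA_3 });
//   stt.onOutput = ({ text, turns }) => {
//     // `turns` is only defined when every turn carried a speaker label
//     console.log(text, turns?.map((t) => `${t.speaker}: ${t.text}`));
//   };
//   stt.input(pcmBuffer); // hypothetical 48 kHz PCM Buffer; opens the socket lazily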
// src/voice/textToSpeech/base.ts
var MagmaFlowTextToSpeech = class {
  onOutput(audio, requestId) {
    console.log("[Default TTS] Output:", audio);
  }
  constructor() {}
};

var DeepgramTTS = class extends MagmaFlowTextToSpeech {
  client;

  constructor(args) {
    super();
    this.client = args.client ?? new sdk.DeepgramClient({ key: process.env.DEEPGRAM_API_KEY });
  }

  async setup() {}

  input(text, requestId) {
    if (!text) {
      return;
    }
    this.client.speak
      .request(
        { text },
        {
          sample_rate: 48e3,
          encoding: "linear16",
          model: "aura-2-thalia-en",
          container: "none"
        }
      )
      .then(async (response) => {
        const stream = await response.getStream();
        if (!stream) {
          return;
        }
        for await (const chunk of stream) {
          this.onOutput(Buffer.from(chunk), requestId);
        }
        // A null chunk signals end-of-request to the consumer.
        this.onOutput(null, requestId);
        console.log("[Deepgram] Finished:", text);
      });
  }

  kill() {}
  reset() {}
};

// src/voice/textToSpeech/elevenlabs.ts
var ElevenLabsVoice = /* @__PURE__ */ ((ElevenLabsVoice2) => {
  ElevenLabsVoice2["chris"] = "iP95p4xoKVk53GoZ742B";
  ElevenLabsVoice2["josh"] = "TxGEqnHWrfWFTfGW9XjX";
  ElevenLabsVoice2["rachel"] = "21m00Tcm4TlvDq8ikWAM";
  ElevenLabsVoice2["laura"] = "FGY2WhTYpPnrIDTdsKH5";
  ElevenLabsVoice2["felicity"] = "aTbnroHRGIomiKpqAQR8";
  return ElevenLabsVoice2;
})(ElevenLabsVoice || {});

var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
  apiKey;
  model;
  voice;
  config;

  constructor(args) {
    super();
    this.apiKey = args.apiKey ?? process.env.ELEVENLABS_API_KEY;
    this.model = args.model;
    this.voice = args.voice;
    this.config = args.config ?? {};
  }

  async setup() {}

  input(text, requestId) {
    if (!text) {
      return;
    }
    // Add spacing around hyphens joining capitals or digits (e.g. "A-B" ->
    // "A - B") so the voice reads each character separately.
    const textToSend = text
      .replaceAll(/([A-Z])-([A-Z])/g, "$1 - $2")
      .replaceAll(/([0-9])-([0-9])/g, "$1 - $2")
      .replaceAll(/(-\s*[A-Z])\s+([A-Z]\s*-)/g, "$1 - $2")
      .replaceAll(/(-\s*[0-9])\s+([0-9]\s*-)/g, "$1 - $2");
    fetch(
      `https://api.elevenlabs.io/v1/text-to-speech/${this.voice}/stream?output_format=pcm_48000`,
      {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
          "xi-api-key": this.apiKey
        },
        body: JSON.stringify({ text: textToSend, model_id: this.model, ...this.config })
      }
    ).then(async (response) => {
      const reader = response.body?.getReader();
      if (!reader) return;
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        this.onOutput(Buffer.from(value), requestId);
      }
      this.onOutput(null, requestId);
      console.log("[ElevenLabs] Finished:", textToSend);
    });
  }

  kill() {}
  reset() {}
};

var HumeTTS = class extends MagmaFlowTextToSpeech {
  client;

  constructor(args) {
    super();
    this.client = args.client ?? new hume.HumeClient({ apiKey: process.env.HUME_API_KEY });
  }

  async setup() {}

  input(text, requestId) {
    if (!text) {
      return;
    }
    this.client.tts
      .synthesizeJsonStreaming({
        utterances: [{ text }],
        format: { type: "pcm" },
        instantMode: true
      })
      .then(async (stream) => {
        for await (const chunk of stream) {
          this.onOutput(Buffer.from(chunk.audio, "base64"), requestId);
        }
        this.onOutput(null, requestId);
        console.log("[Hume] Finished:", text);
      });
  }

  kill() {}
  reset() {}
};
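// Illustrative sketch for the streaming TTS providers above (assumes
// ELEVENLABS_API_KEY is set; the model id shown is an example, not pinned by
// this package):
//
//   const tts = new ElevenLabsTTS({ model: "eleven_turbo_v2_5", voice: ElevenLabsVoice.rachel });
//   tts.onOutput = (audio, requestId) => {
//     if (audio === null) return;  // end of request
//     playPcm48k(audio);           // hypothetical sink for 48 kHz 16-bit PCM
//   };
//   tts.input("Hello from ElevenLabs.", "req-1");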
var WhisperTTS = class extends MagmaFlowTextToSpeech {
  client;

  constructor(args) {
    super();
    this.client = args.client ?? new OpenAI__default.default({ apiKey: process.env.OPENAI_API_KEY });
  }

  async setup() {}

  input(text, requestId) {
    if (!text) {
      return;
    }
    this.client.audio.speech
      .create({
        model: "gpt-4o-mini-tts",
        voice: "alloy",
        input: text,
        response_format: "pcm"
      })
      .then(async (res) => {
        const result = await res.arrayBuffer();
        // OpenAI returns 24 kHz PCM; upsample to the pipeline's 48 kHz.
        const resampledPCM = resamplePcm(bufferToInt16Array(Buffer.from(result)), 24e3, 48e3);
        this.onOutput(int16ArrayToBuffer(resampledPCM), requestId);
        this.onOutput(null, requestId);
        console.log("[Whisper] Finished:", text);
      });
  }

  kill() {}
  reset() {}
};

exports.DeepgramLanguage = DeepgramLanguage;
exports.DeepgramModel = DeepgramModel;
exports.DeepgramSTT = DeepgramSTT;
exports.DeepgramTTS = DeepgramTTS;
exports.ElevenLabsTTS = ElevenLabsTTS;
exports.ElevenLabsVoice = ElevenLabsVoice;
exports.HumeTTS = HumeTTS;
exports.MagmaFlow = MagmaFlow;
exports.MagmaFlowSpeechToText = MagmaFlowSpeechToText;
exports.MagmaFlowTextToSpeech = MagmaFlowTextToSpeech;
exports.WhisperTTS = WhisperTTS;
exports.splitTextIntoChunks = splitTextIntoChunks;
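// Illustrative consumer sketch (CommonJS). Unlike the streaming providers,
// WhisperTTS above emits the whole utterance as a single buffer before the
// null end-of-request marker:
//
//   const { WhisperTTS } = require("@pompeii-labs/audio");
//   const tts = new WhisperTTS({}); // assumes OPENAI_API_KEY is set
//   const chunks = [];
//   tts.onOutput = (audio) => {
//     if (audio) chunks.push(audio);                // 48 kHz 16-bit PCM
//     else writeSomewhere(Buffer.concat(chunks));   // hypothetical sink
//   };
//   tts.input("Testing OpenAI speech.", "req-2");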