@pompeii-labs/audio
Version:
The Audio SDK from Pompeii Labs
910 lines (893 loc) • 20.5 kB
JavaScript
'use strict';
var sdk = require('@deepgram/sdk');
var hume = require('hume');
var OpenAI = require('openai');
function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
var OpenAI__default = /*#__PURE__*/_interopDefault(OpenAI);
// src/helpers/bufferToInt16Array.ts
function bufferToInt16Array(buffer) {
// Zero-copy reinterpretation of the Buffer's bytes as 16-bit samples.
// Assumes 2-byte alignment and an even byte length.
return new Int16Array(buffer.buffer, buffer.byteOffset, buffer.byteLength / 2);
}
// src/decoders/mulaw.ts
function mulawToPcm16(mulawData) {
const pcmData = new Int16Array(mulawData.length);
for (let i = 0; i < mulawData.length; i++) {
pcmData[i] = mulawToLinear(mulawData[i]);
}
return pcmData;
}
function mulawToLinear(mulawByte) {
const inverted = mulawByte ^ 255;
const sign = inverted & 128;
const segment = (inverted & 112) >> 4;
const step = inverted & 15;
// Standard G.711 expansion, scaled to a 16-bit range and mirroring the
// encoder's BIAS of 132 below; one formula covers every segment, including 0.
const linear = (((step << 1) + 33) << (segment + 2)) - 132;
return sign ? -linear : linear;
}
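// E.g. the µ-law "silence" byte 0xff inverts to 0x00 (sign 0, segment 0,
// step 0) and decodes to ((0 + 33) << 2) - 132 = 0.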
// src/encoders/mulaw.ts
var BIAS = 132;
var CLIP = 32635;
// Exponent (segment) lookup for µ-law encoding, indexed by the top 8 bits of
// the biased sample. Run-length layout: 2, 2, 4, 8, 16, 32, 64, and 128
// entries of segments 0..7; encodeTable[n] is the position of n's highest set bit.
var encodeTable = new Uint8Array(256);
for (let segment = 1, i = 2; segment <= 7; segment++) {
for (const runEnd = i * 2; i < runEnd; i++) {
encodeTable[i] = segment;
}
}
function encodeSample(sample) {
// Sign bit (bit 15 of the 16-bit sample), moved into µ-law's bit 7.
const sign = (sample >> 8) & 128;
if (sign !== 0) sample = -sample;
sample = sample + BIAS;
if (sample > CLIP) sample = CLIP;
const exponent = encodeTable[(sample >> 7) & 255];
const mantissa = (sample >> (exponent + 3)) & 15;
// µ-law bytes are transmitted inverted; the Uint8Array assignment in
// pcm16ToMulaw truncates the result to the low byte.
return ~(sign | (exponent << 4) | mantissa);
}
function pcm16ToMulaw(pcmData) {
const mulawData = new Uint8Array(pcmData.length);
for (let i = 0; i < pcmData.length; i++) {
mulawData[i] = encodeSample(pcmData[i]);
}
return mulawData;
}
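// Round-trip sketch (illustrative): µ-law is lossy, so samples come back
// close to, but not exactly, their original values.
//
//   const pcm = new Int16Array([0, 1000, -1000, 32000]);
//   const wire = pcm16ToMulaw(pcm);   // Uint8Array, 1 byte per sample
//   const back = mulawToPcm16(wire);  // e.g. 1000 comes back as roughly 988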
// src/helpers/int16ArrayToBuffer.ts
function int16ArrayToBuffer(int16Array) {
// Zero-copy Buffer view over the same underlying memory.
return Buffer.from(int16Array.buffer, int16Array.byteOffset, int16Array.byteLength);
}
// src/helpers/convertAudioFormat.ts
function encodePcm(audio, encoding) {
switch (encoding) {
case "mulaw":
return Buffer.from(pcm16ToMulaw(audio));
case "pcm":
return int16ArrayToBuffer(audio);
default:
throw new Error(`Could not encode audio: Unsupported encoding: ${encoding}`);
}
}
function decodeToPcm(audio, encoding) {
switch (encoding) {
case "mulaw":
return mulawToPcm16(audio);
case "pcm":
return bufferToInt16Array(audio);
default:
throw new Error(`Could not decode audio: Unsupported encoding: ${encoding}`);
}
}
// src/helpers/generateFadeOutSamples.ts
function generateFadeOutSamples(lastSampleValue, fadeDurationMs, sampleRate) {
const fadeNumSamples = Math.ceil(fadeDurationMs / 1e3 * sampleRate);
const fadeSamples = new Int16Array(fadeNumSamples);
// Linear ramp from the last played sample down to 0 to avoid an audible click.
// Guard the divisor so a single-sample fade doesn't divide by zero.
const lastIndex = Math.max(1, fadeNumSamples - 1);
for (let i = 0; i < fadeNumSamples; i++) {
const progress = 1 - i / lastIndex;
fadeSamples[i] = Math.round(lastSampleValue * progress);
}
return new Uint8Array(fadeSamples.buffer);
}
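// E.g. the default 500 ms pause at 48 kHz ramps down over 24,000 samples;
// MagmaFlow uses this fade between sentences to avoid clicks at chunk ends.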
// src/helpers/resamplePcm.ts
function resamplePcm(pcm, originalSampleRate, targetSampleRate) {
if (originalSampleRate === targetSampleRate) {
return pcm;
}
const ratio = originalSampleRate / targetSampleRate;
const newLength = Math.floor(pcm.length / ratio);
const newSamples = new Int16Array(newLength);
// Upsampling: plain linear interpolation between neighboring source samples.
if (ratio < 1) {
for (let i = 0; i < newSamples.length; i++) {
const exactPos = i * ratio;
const lowerIndex = Math.floor(exactPos);
const upperIndex = Math.min(lowerIndex + 1, pcm.length - 1);
const fraction = exactPos - lowerIndex;
const lowerSample = pcm[lowerIndex];
const upperSample = pcm[upperIndex];
newSamples[i] = Math.round(lowerSample + (upperSample - lowerSample) * fraction);
}
return newSamples;
}
// Downsampling: low-pass below the target Nyquist frequency first so
// frequencies above it don't alias into the result, then interpolate.
const nyquistFreq = targetSampleRate / 2;
const cutoffFreq = nyquistFreq * 0.9;
const filteredPcm = applyLowPassFilter(pcm, originalSampleRate, cutoffFreq);
for (let i = 0; i < newSamples.length; i++) {
const exactPos = i * ratio;
const lowerIndex = Math.floor(exactPos);
const upperIndex = Math.min(lowerIndex + 1, filteredPcm.length - 1);
const fraction = exactPos - lowerIndex;
const lowerSample = filteredPcm[lowerIndex];
const upperSample = filteredPcm[upperIndex];
newSamples[i] = Math.round(lowerSample + (upperSample - lowerSample) * fraction);
}
return newSamples;
}
function applyLowPassFilter(pcm, sampleRate, cutoffFreq) {
// Simple moving-average (box) filter sized from the cutoff frequency: a
// cheap approximation of a true low-pass filter that is adequate for speech.
const filterOrder = Math.max(3, Math.floor(sampleRate / (cutoffFreq * 4)));
const filtered = new Int16Array(pcm.length);
for (let i = 0; i < pcm.length; i++) {
let sum = 0;
let count = 0;
for (let j = Math.max(0, i - filterOrder); j <= Math.min(pcm.length - 1, i + filterOrder); j++) {
sum += pcm[j];
count++;
}
filtered[i] = Math.round(sum / count);
}
return filtered;
}
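// Usage sketch (illustrative): preparing 48 kHz PCM for an 8 kHz µ-law
// telephony stream. `pcm48k` is a placeholder Int16Array.
//
//   const pcm8k = resamplePcm(pcm48k, 48000, 8000); // low-passed below 3.6 kHz
//   const wire = pcm16ToMulaw(pcm8k);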
// src/voice/helpers.ts
function splitTextIntoChunks(text, targetLength = 100) {
const endOfSentencePunctuation = [".", "!", "?"];
const sentences = [];
for (let i = targetLength; i < text.length; i++) {
if (endOfSentencePunctuation.includes(text[i]) && (i === text.length - 1 || text[i + 1] === " " || text[i + 1] === "\n")) {
sentences.push(text.slice(0, i + 1));
text = text.slice(i + 1);
// Rescan the shortened text from index targetLength (the loop's i++ lands there).
i = targetLength - 1;
}
}
return sentences;
}
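// E.g. splitTextIntoChunks("Hello there. How are you? Bye.", 10) returns
// ["Hello there.", " How are you?"]; the short tail "Bye." stays behind for
// the caller to buffer (see MagmaFlow.inputText below).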
// src/voice/client.ts
// All internal processing runs at a uniform 48 kHz; inputs and outputs are
// resampled to and from this rate at the edges.
var uniformSampleRate = 48e3;
var MagmaFlow = class {
stt;
tts;
inputFormat;
outputFormat;
onAudioOutput;
textBuffer = "";
textQueue = [];
generatingAudio = false;
currentRequestId = null;
audioBuffer = [];
lastChunk = null;
config = {
pauseDurationMs: 500,
sentenceChunkLength: 50
};
constructor(args) {
this.stt = args.stt;
this.tts = args.tts;
this.inputFormat = args.inputFormat;
this.outputFormat = args.outputFormat;
this.onAudioOutput = args.onAudioOutput;
this.config = { ...this.config, ...args.config };
this.tts.onOutput = (audio, requestId) => {
if (this.currentRequestId !== requestId) {
console.log("[MagmaFlow] Skipping output for cancelled request");
return;
}
if (!audio) {
if (this.lastChunk) {
const lastChunkSamples = bufferToInt16Array(this.lastChunk);
const lastSampleValue = lastChunkSamples[lastChunkSamples.length - 1];
this.audioBuffer.push(
Buffer.from(
generateFadeOutSamples(
lastSampleValue,
this.config.pauseDurationMs ?? 500,
uniformSampleRate
)
)
);
}
this.sendAudio();
this.generatingAudio = false;
this.lastChunk = null;
this.generateAudio();
return;
}
this.audioBuffer.push(audio);
this.lastChunk = audio;
// Flush only on whole-frame boundaries (2 bytes per sample per channel).
if (this.audioBuffer.reduce((acc, curr) => acc + curr.length, 0) % (2 * this.outputFormat.channels) === 0) {
this.sendAudio();
}
};
this.stt.onOutput = args.onTranscription;
this.stt.onSpeechDetected = args.onSpeechDetected;
}
inputAudio(audio) {
const decodedAudio = decodeToPcm(audio, this.inputFormat.encoding);
const resampledPCM = resamplePcm(decodedAudio, this.inputFormat.sampleRate, uniformSampleRate);
this.stt.input(int16ArrayToBuffer(resampledPCM));
}
inputText(text) {
if (text === void 0 || text === null) {
if (this.textBuffer.length === 0) return;
this.textQueue.push(this.textBuffer);
this.textBuffer = "";
this.generateAudio();
return;
}
this.textBuffer += text;
const chunks = splitTextIntoChunks(this.textBuffer, this.config.sentenceChunkLength ?? 50);
for (const chunk of chunks) {
if (chunk.length === 0) continue;
this.textQueue.push(chunk);
this.textBuffer = this.textBuffer.slice(chunk.length);
this.generateAudio();
}
}
generateAudio() {
if (this.generatingAudio) return;
const chunk = this.textQueue.shift();
if (!chunk) return;
this.generatingAudio = true;
if (!this.currentRequestId) {
this.currentRequestId = Math.random().toString(36).substring(2, 15);
}
this.tts.input(chunk, this.currentRequestId);
}
sendAudio() {
if (this.audioBuffer.length === 0) return;
const concatenatedBuffer = Buffer.concat(this.audioBuffer);
this.audioBuffer = [];
const resampledPCM = resamplePcm(
bufferToInt16Array(concatenatedBuffer),
uniformSampleRate,
this.outputFormat.sampleRate
);
const encodedAudio = encodePcm(resampledPCM, this.outputFormat.encoding);
try {
this.onAudioOutput(encodedAudio);
} catch (error) {
console.error("Audio output callback error:", error);
}
}
interruptTTS() {
this.textQueue = [];
this.textBuffer = "";
this.audioBuffer = [];
this.generatingAudio = false;
this.currentRequestId = null;
}
kill() {
this.stt.kill();
this.tts.kill();
this.audioBuffer = [];
this.textQueue = [];
this.textBuffer = "";
this.generatingAudio = false;
}
};
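// Usage sketch (illustrative, not executed by the module): wiring MagmaFlow
// to a Twilio-style 8 kHz mulaw stream. `socket` and `payload` are
// placeholders, and interrupting TTS on speech is one possible policy.
//
//   const flow = new MagmaFlow({
//     stt: new DeepgramSTT({ model: DeepgramModel.NOVA_3 }),
//     tts: new DeepgramTTS({}),
//     inputFormat: { encoding: "mulaw", sampleRate: 8000, channels: 1 },
//     outputFormat: { encoding: "mulaw", sampleRate: 8000, channels: 1 },
//     onAudioOutput: (audio) => socket.send(audio),
//     onTranscription: ({ text }) => console.log("User said:", text),
//     onSpeechDetected: () => flow.interruptTTS()
//   });
//
//   socket.on("media", (payload) => flow.inputAudio(Buffer.from(payload, "base64")));
//   flow.inputText("Hello!");
//   flow.inputText(null); // flush the buffer and start speaking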
// src/voice/speechToText/base.ts
var MagmaFlowSpeechToText = class {
onSpeechDetected() {
console.log(`[Default STT] Speech detected`);
}
onOutput(output) {
console.log(`[Default STT] Output: ${JSON.stringify(output)}`);
}
constructor() {
}
};
var kKeepAliveInterval = 5e3;
var DeepgramModel = /* @__PURE__ */ ((DeepgramModel2) => {
DeepgramModel2["NOVA_3"] = "nova-3";
return DeepgramModel2;
})(DeepgramModel || {});
var DeepgramLanguage = /* @__PURE__ */ ((DeepgramLanguage2) => {
DeepgramLanguage2["EN_US"] = "en-US";
return DeepgramLanguage2;
})(DeepgramLanguage || {});
var DeepgramSTT = class extends MagmaFlowSpeechToText {
client;
connection = null;
config;
turnBuffer = [];
utteranceEnded = false;
constructor(args) {
super();
this.config = {
model: args.model,
vad_events: true,
interim_results: true,
encoding: "linear16",
sample_rate: 48e3,
channels: 1,
utterance_end_ms: 1e3,
...args.config
};
this.client = args.client ?? new sdk.DeepgramClient({
key: process.env.DEEPGRAM_API_KEY
});
}
setup() {
this.connection = this.client.listen.live(this.config);
this.connection.on(sdk.LiveTranscriptionEvents.Error, (event) => {
console.error(`[Deepgram] Error: ${JSON.stringify(event)}`);
});
this.connection.on(sdk.LiveTranscriptionEvents.Close, (event) => {
console.log(`[Deepgram] Close: ${JSON.stringify(event)}`);
});
this.connection.on(sdk.LiveTranscriptionEvents.Open, this.onOpen.bind(this));
this.connection.on(sdk.LiveTranscriptionEvents.Unhandled, (event) => {
console.log(`[Deepgram] Unhandled event: ${JSON.stringify(event)}`);
});
this.connection.on(
sdk.LiveTranscriptionEvents.Transcript,
this.handleTranscriptionEvent.bind(this)
);
this.connection.on(sdk.LiveTranscriptionEvents.UtteranceEnd, (event) => {
console.log(`[Deepgram] Utterance end: ${JSON.stringify(event)}`);
this.handleUtteranceEnd();
});
}
input(audio) {
if (!this.connection) {
this.setup();
return this.input(audio);
}
// Send exactly this Buffer's bytes: `audio.buffer` can be a larger shared
// pool, so slice the view's range instead of sending the whole pool.
this.connection.send(audio.buffer.slice(audio.byteOffset, audio.byteOffset + audio.byteLength));
}
flush() {
this.connection?.finalize();
}
kill() {
this.connection?.requestClose();
this.connection = null;
}
handleTranscriptionEvent(transcriptionEvent) {
const transcriptOption = transcriptionEvent.channel.alternatives[0];
if (transcriptOption.transcript.trim() === "") {
return;
}
this.onSpeechDetected();
if (transcriptionEvent.speech_final) {
this.utteranceEnded = false;
}
if (transcriptionEvent.is_final || transcriptionEvent.speech_final || transcriptionEvent.from_finalize) {
const turns = this.computeTurns(transcriptOption.words);
this.turnBuffer = this.turnBuffer.concat(turns);
if (transcriptionEvent.speech_final) {
this.sendOutput();
}
}
}
handleUtteranceEnd() {
this.utteranceEnded = true;
this.sendOutput();
}
sendOutput() {
if (!this.utteranceEnded) {
return;
}
if (this.turnBuffer.length === 0) {
return;
}
const text = this.turnBuffer.map((turn) => turn.text).join(" ");
let turns = void 0;
if (this.turnBuffer.every((turn) => turn.speaker !== void 0 && turn.speaker !== null)) {
turns = this.turnBuffer.reduce((acc, turn) => {
if (acc.at(-1)?.speaker === turn.speaker) {
// Keep a space between merged same-speaker turns.
acc.at(-1).text += ` ${turn.text}`;
} else {
acc.push(turn);
}
return acc;
}, []);
}
this.onOutput({
text,
turns
});
this.turnBuffer = [];
this.utteranceEnded = false;
}
onOpen() {
console.log(`[Deepgram] Connected`);
this.keepAlive();
}
keepAlive() {
// Ping every 5 s so Deepgram doesn't close the socket during silence.
setTimeout(() => {
if (this.connection?.isConnected()) {
this.connection.keepAlive();
this.keepAlive();
}
}, kKeepAliveInterval);
}
computeTurns(words) {
try {
const turns = [];
let currentTurn = null;
let currentTurnConfidence = 0;
let currentTurnWordCount = 0;
for (const word of words) {
const speaker = word.speaker;
const utterance = word.punctuated_word || word.word;
if (currentTurn && currentTurn.speaker === speaker) {
currentTurn.text += ` ${utterance}`;
currentTurnConfidence += word.confidence;
currentTurnWordCount++;
} else {
if (currentTurn) {
currentTurn.confidence = currentTurnConfidence / currentTurnWordCount;
if (currentTurn.confidence < 0.5) {
currentTurn.text = "[inaudible]";
} else if (currentTurn.confidence < 0.75) {
currentTurn.text = `[unclear, confidence=${currentTurn.confidence.toFixed(2)}] ${currentTurn.text}`;
}
turns.push(currentTurn);
}
currentTurn = { speaker, text: utterance, confidence: 0 };
currentTurnConfidence = word.confidence;
currentTurnWordCount = 1;
}
}
if (currentTurn) {
currentTurn.confidence = currentTurnConfidence / currentTurnWordCount;
// Apply the same low-confidence markers to the final turn as to earlier ones.
if (currentTurn.confidence < 0.5) {
currentTurn.text = "[inaudible]";
} else if (currentTurn.confidence < 0.75) {
currentTurn.text = `[unclear, confidence=${currentTurn.confidence.toFixed(2)}] ${currentTurn.text}`;
}
turns.push(currentTurn);
}
return turns;
} catch (error) {
console.error(error);
return [];
}
}
};
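// Usage sketch (illustrative): DeepgramSTT also works standalone. It expects
// 48 kHz linear16 mono audio by default; `pcmBuffer` is a placeholder.
//
//   const stt = new DeepgramSTT({ model: DeepgramModel.NOVA_3 });
//   stt.onOutput = ({ text, turns }) => console.log(text, turns);
//   stt.input(pcmBuffer); // lazily opens the live connection on first input
//   stt.flush();
//   stt.kill();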
// src/voice/textToSpeech/base.ts
var MagmaFlowTextToSpeech = class {
onOutput(audio, requestId) {
console.log("[Default TTS] Output:", audio);
}
constructor() {
}
};
var DeepgramTTS = class extends MagmaFlowTextToSpeech {
client;
constructor(args) {
super();
this.client = args.client ?? new sdk.DeepgramClient({ key: process.env.DEEPGRAM_API_KEY });
}
async setup() {
}
input(text, requestId) {
if (!text) {
return;
}
this.client.speak.request(
{
text
},
{
sample_rate: 48e3,
encoding: "linear16",
model: "aura-2-thalia-en",
container: "none"
}
).then(async (response) => {
const stream = await response.getStream();
if (!stream) {
this.onOutput(null, requestId);
return;
}
for await (const chunk of stream) {
this.onOutput(Buffer.from(chunk), requestId);
}
// A null chunk tells the consumer this utterance is complete.
this.onOutput(null, requestId);
console.log("[Deepgram] Finished:", text);
}).catch((error) => {
console.error("[Deepgram] TTS request failed:", error);
// Still signal completion so MagmaFlow doesn't stall waiting on this chunk.
this.onOutput(null, requestId);
});
}
kill() {
}
reset() {
}
};
// src/voice/textToSpeech/elevenlabs.ts
var ElevenLabsVoice = /* @__PURE__ */ ((ElevenLabsVoice2) => {
ElevenLabsVoice2["chris"] = "iP95p4xoKVk53GoZ742B";
ElevenLabsVoice2["josh"] = "TxGEqnHWrfWFTfGW9XjX";
ElevenLabsVoice2["rachel"] = "21m00Tcm4TlvDq8ikWAM";
ElevenLabsVoice2["laura"] = "FGY2WhTYpPnrIDTdsKH5";
ElevenLabsVoice2["felicity"] = "aTbnroHRGIomiKpqAQR8";
return ElevenLabsVoice2;
})(ElevenLabsVoice || {});
var ElevenLabsTTS = class extends MagmaFlowTextToSpeech {
apiKey;
model;
voice;
config;
constructor(args) {
super();
this.apiKey = args.apiKey ?? process.env.ELEVENLABS_API_KEY;
this.model = args.model;
this.voice = args.voice;
this.config = args.config ?? {};
}
async setup() {
}
input(text, requestId) {
if (!text) {
return;
}
// Put spaces around hyphens inside letter or digit sequences (e.g. spelled-out
// IDs like "A-B" or "1-2") so the voice reads them as separate tokens.
const textToSend = text
.replaceAll(/([A-Z])-([A-Z])/g, "$1 - $2")
.replaceAll(/([0-9])-([0-9])/g, "$1 - $2")
.replaceAll(/(-\s*[A-Z])\s+([A-Z]\s*-)/g, "$1 - $2")
.replaceAll(/(-\s*[0-9])\s+([0-9]\s*-)/g, "$1 - $2");
fetch(
`https://api.elevenlabs.io/v1/text-to-speech/${this.voice}/stream?output_format=pcm_48000`,
{
method: "POST",
headers: {
"Content-Type": "application/json",
"xi-api-key": this.apiKey
},
body: JSON.stringify({
text: textToSend,
model_id: this.model,
...this.config
})
}
).then(async (response) => {
const reader = response.body?.getReader();
if (!reader) {
this.onOutput(null, requestId);
return;
}
while (true) {
const { done, value } = await reader.read();
if (done) break;
this.onOutput(Buffer.from(value), requestId);
}
this.onOutput(null, requestId);
console.log("[ElevenLabs] Finished:", textToSend);
}).catch((error) => {
console.error("[ElevenLabs] TTS request failed:", error);
this.onOutput(null, requestId);
});
}
kill() {
}
reset() {
}
};
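// Usage sketch (illustrative): ElevenLabsTTS drops into MagmaFlow in place of
// DeepgramTTS. The model id shown is an example, not a default of this SDK.
//
//   const tts = new ElevenLabsTTS({
//     model: "eleven_turbo_v2_5",
//     voice: ElevenLabsVoice.rachel
//   });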
var HumeTTS = class extends MagmaFlowTextToSpeech {
client;
constructor(args) {
super();
this.client = args.client ?? new hume.HumeClient({ apiKey: process.env.HUME_API_KEY });
}
async setup() {
}
input(text, requestId) {
if (!text) {
return;
}
this.client.tts.synthesizeJsonStreaming({
utterances: [
{
text
}
],
format: {
type: "pcm"
},
instantMode: true
}).then(async (stream) => {
for await (const chunk of stream) {
this.onOutput(Buffer.from(chunk.audio, "base64"), requestId);
}
this.onOutput(null, requestId);
console.log("[Hume] Finished:", text);
}).catch((error) => {
console.error("[Hume] TTS request failed:", error);
this.onOutput(null, requestId);
});
}
kill() {
}
reset() {
}
};
var WhisperTTS = class extends MagmaFlowTextToSpeech {
client;
constructor(args) {
super();
this.client = args.client ?? new OpenAI__default.default({ apiKey: process.env.OPENAI_API_KEY });
}
async setup() {
}
input(text, requestId) {
if (!text) {
return;
}
this.client.audio.speech.create({
model: "gpt-4o-mini-tts",
voice: "alloy",
input: text,
response_format: "pcm"
}).then(async (res) => {
const result = await res.arrayBuffer();
// OpenAI returns 24 kHz PCM; upsample to the 48 kHz used internally.
const resampledPCM = resamplePcm(
bufferToInt16Array(Buffer.from(result)),
24e3,
48e3
);
this.onOutput(int16ArrayToBuffer(resampledPCM), requestId);
this.onOutput(null, requestId);
console.log("[Whisper] Finished:", text);
}).catch((error) => {
console.error("[Whisper] TTS request failed:", error);
this.onOutput(null, requestId);
});
}
kill() {
}
reset() {
}
};
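// Note: unlike the streaming providers above, WhisperTTS buffers the entire
// response before emitting one chunk, so time-to-first-audio is higher.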
exports.DeepgramLanguage = DeepgramLanguage;
exports.DeepgramModel = DeepgramModel;
exports.DeepgramSTT = DeepgramSTT;
exports.DeepgramTTS = DeepgramTTS;
exports.ElevenLabsTTS = ElevenLabsTTS;
exports.ElevenLabsVoice = ElevenLabsVoice;
exports.HumeTTS = HumeTTS;
exports.MagmaFlow = MagmaFlow;
exports.MagmaFlowSpeechToText = MagmaFlowSpeechToText;
exports.MagmaFlowTextToSpeech = MagmaFlowTextToSpeech;
exports.WhisperTTS = WhisperTTS;
exports.splitTextIntoChunks = splitTextIntoChunks;