UNPKG

speechflow

Version:

Speech Processing Flow Graph

374 lines 17.3 kB
"use strict"; /* ** SpeechFlow - Speech Processing Flow Graph ** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com> ** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only> */ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); /* standard dependencies */ const node_stream_1 = __importDefault(require("node:stream")); /* external dependencies */ const vad_node_realtime_1 = require("@ericedouard/vad-node-realtime"); /* internal dependencies */ const speechflow_node_1 = __importDefault(require("./speechflow-node")); const util = __importStar(require("./speechflow-util")); /* SpeechFlow node for VAD speech-to-speech processing */ class SpeechFlowNodeA2AVAD extends speechflow_node_1.default { /* declare official node name */ static name = "a2a-vad"; /* internal state */ vad = null; queue = new util.Queue(); queueRecv = this.queue.pointerUse("recv"); queueVAD = this.queue.pointerUse("vad"); queueSend = this.queue.pointerUse("send"); destroyed = false; tailTimer = null; activeEventListeners = new Set(); /* construct node */ constructor(id, cfg, opts, args) { super(id, cfg, opts, args); /* declare node configuration parameters */ this.configure({ mode: { type: "string", val: "silenced", match: /^(?:silenced|unplugged)$/ }, posSpeechThreshold: { type: "number", val: 0.50 }, negSpeechThreshold: { type: "number", val: 0.35 }, minSpeechFrames: { type: "number", val: 2 }, redemptionFrames: { type: "number", val: 12 }, preSpeechPadFrames: { type: "number", val: 1 }, postSpeechTail: { type: "number", val: 1500 } }); /* declare node input/output format */ this.input = "audio"; this.output = "audio"; } /* open node */ async open() { /* sanity check situation */ if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian) throw new Error("VAD node currently supports PCM-S16LE audio only"); /* clear destruction flag */ this.destroyed = false; /* internal processing constants */ const vadSampleRateTarget = 16000; /* internal target of VAD */ const vadSamplesPerFrame = 512; /* required for VAD v5 */ /* helper function for timer cleanup */ const clearTailTimer = () => { if (this.tailTimer !== null) { clearTimeout(this.tailTimer); this.tailTimer = null; } }; /* establish Voice Activity Detection (VAD) facility */ let tail = false; try { this.vad = await vad_node_realtime_1.RealTimeVAD.new({ model: "v5", sampleRate: this.config.audioSampleRate, /* before resampling to 16KHz */ frameSamples: vadSamplesPerFrame, /* after resampling to 16KHz */ positiveSpeechThreshold: this.params.posSpeechThreshold, negativeSpeechThreshold: this.params.negSpeechThreshold, minSpeechFrames: this.params.minSpeechFrames, redemptionFrames: this.params.redemptionFrames, preSpeechPadFrames: this.params.preSpeechPadFrames, onSpeechStart: () => { if (this.destroyed) return; this.log("info", "VAD: speech start"); if (this.params.mode === "unplugged") { tail = false; clearTailTimer(); } }, onSpeechEnd: (audio) => { if (this.destroyed) return; const duration = util.audioArrayDuration(audio, vadSampleRateTarget); this.log("info", `VAD: speech end (duration: ${duration.toFixed(2)}s)`); if (this.params.mode === "unplugged") { tail = true; clearTailTimer(); this.tailTimer = setTimeout(() => { if (this.destroyed || this.tailTimer === null) return; tail = false; this.tailTimer = null; }, this.params.postSpeechTail); } }, onVADMisfire: () => { if (this.destroyed) return; this.log("info", "VAD: speech end (segment too short)"); if (this.params.mode === "unplugged") { tail = true; clearTailTimer(); this.tailTimer = setTimeout(() => { if (this.destroyed || this.tailTimer === null) return; tail = false; this.tailTimer = null; }, this.params.postSpeechTail); } }, onFrameProcessed: (audio) => { if (this.destroyed) return; try { /* annotate the current audio segment */ const element = this.queueVAD.peek(); if (element === undefined || element.type !== "audio-frame") throw new Error("internal error which cannot happen: no more queued element"); if (element.segmentIdx >= element.segmentData.length) throw new Error("segment index out of bounds"); const segment = element.segmentData[element.segmentIdx++]; segment.isSpeech = (audio.isSpeech > audio.notSpeech) || tail; /* annotate the entire audio chunk */ if (element.segmentIdx >= element.segmentData.length) { element.isSpeech = element.segmentData.some(segment => segment.isSpeech); this.queueVAD.touch(); this.queueVAD.walk(+1); } } catch (error) { this.log("error", `VAD frame processing error: ${error}`, { cause: error }); } } }); this.vad.start(); } catch (error) { throw new Error(`failed to initialize VAD: ${error}`, { cause: error }); } /* provide Duplex stream and internally attach to VAD */ const self = this; this.stream = new node_stream_1.default.Duplex({ writableObjectMode: true, readableObjectMode: true, decodeStrings: false, highWaterMark: 1, /* receive audio chunk (writable side of stream) */ write(chunk, encoding, callback) { if (self.destroyed) { callback(new Error("stream already destroyed")); return; } if (!Buffer.isBuffer(chunk.payload)) callback(new Error("expected audio input as Buffer chunks")); else if (chunk.payload.byteLength === 0) callback(); else { try { /* convert audio samples from PCM/I16 to PCM/F32 */ const data = util.convertBufToF32(chunk.payload, self.config.audioLittleEndian); /* segment audio samples as individual VAD-sized frames */ const segmentData = []; const chunkSize = vadSamplesPerFrame * (self.config.audioSampleRate / vadSampleRateTarget); const chunks = Math.trunc(data.length / chunkSize); for (let i = 0; i < chunks; i++) { const frame = data.slice(i * chunkSize, (i + 1) * chunkSize); const segment = { data: frame }; segmentData.push(segment); } if ((chunks * chunkSize) < data.length) { const frame = new Float32Array(chunkSize); frame.fill(0); frame.set(data.slice(chunks * chunkSize)); const segment = { data: frame }; segmentData.push(segment); } /* queue the results */ self.queueRecv.append({ type: "audio-frame", chunk, segmentIdx: 0, segmentData }); /* push segments through Voice Activity Detection (VAD) */ if (self.vad && !self.destroyed) { try { for (const segment of segmentData) self.vad.processAudio(segment.data); } catch (error) { self.log("error", `VAD processAudio error: ${error}`); } } callback(); } catch (error) { callback(error instanceof Error ? error : new Error("VAD processing failed")); } } }, /* receive no more audio chunks (writable side of stream) */ final(callback) { if (self.destroyed) { callback(); return; } /* signal end of file */ self.queueRecv.append({ type: "audio-eof" }); callback(); }, /* send audio chunk(s) (readable side of stream) */ read(_size) { if (self.destroyed) { this.push(null); return; } /* try to perform read operation from scratch */ const tryToRead = () => { if (self.destroyed) { this.push(null); return; } /* flush pending audio chunks */ const flushPendingChunks = () => { let pushed = 0; while (true) { if (self.destroyed) { this.push(null); return; } const element = self.queueSend.peek(); if (element === undefined) break; else if (element.type === "audio-eof") { this.push(null); break; } else if (element.type === "audio-frame" && element.isSpeech === undefined) break; self.queueSend.walk(+1); self.queue.trim(); if (element.isSpeech) { this.push(element.chunk); pushed++; } else if (self.params.mode === "silenced") { const chunk = element.chunk.clone(); const buffer = chunk.payload; buffer.fill(0); this.push(chunk); pushed++; } else if (self.params.mode === "unplugged" && pushed === 0) { /* we have to await chunks now, as in unplugged mode we else would be never called again until we at least once push a new chunk as the result */ setTimeout(() => { if (self.destroyed) return; tryToRead(); }, 0); return; } } }; /* await forthcoming audio chunks */ const awaitForthcomingChunks = () => { if (self.destroyed) return; const element = self.queueSend.peek(); if (element !== undefined && element.type === "audio-frame" && element.isSpeech !== undefined) flushPendingChunks(); else if (!self.destroyed && !self.activeEventListeners.has(awaitForthcomingChunks)) { self.queue.once("write", awaitForthcomingChunks); self.activeEventListeners.add(awaitForthcomingChunks); } }; const element = self.queueSend.peek(); if (element !== undefined && element.type === "audio-eof") this.push(null); else if (element !== undefined && element.type === "audio-frame" && element.isSpeech !== undefined) flushPendingChunks(); else if (!self.destroyed && !self.activeEventListeners.has(awaitForthcomingChunks)) { self.queue.once("write", awaitForthcomingChunks); self.activeEventListeners.add(awaitForthcomingChunks); } }; tryToRead(); } }); } /* close node */ async close() { /* indicate destruction */ this.destroyed = true; /* cleanup tail timer */ if (this.tailTimer !== null) { clearTimeout(this.tailTimer); this.tailTimer = null; } /* remove all event listeners */ this.activeEventListeners.forEach((listener) => { this.queue.removeListener("write", listener); }); this.activeEventListeners.clear(); /* close stream */ if (this.stream !== null) { this.stream.destroy(); this.stream = null; } /* cleanup queue pointers before closing VAD to prevent callback access */ this.queue.pointerDelete("recv"); this.queue.pointerDelete("vad"); this.queue.pointerDelete("send"); /* close VAD */ if (this.vad !== null) { try { const flushPromise = this.vad.flush(); const timeoutPromise = new Promise((resolve) => setTimeout(resolve, 5000)); await Promise.race([flushPromise, timeoutPromise]); } catch (error) { this.log("warning", `VAD flush error during close: ${error}`); } this.vad.destroy(); this.vad = null; } } } exports.default = SpeechFlowNodeA2AVAD; //# sourceMappingURL=speechflow-node-a2a-vad.js.map