speechflow

Speech Processing Flow Graph

/*
**  SpeechFlow - Speech Processing Flow Graph
**  Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
**  Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
*/

/* standard dependencies */
import Stream from "node:stream"

/* external dependencies */
import { RealTimeVAD } from "@ericedouard/vad-node-realtime"

/* internal dependencies */
import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
import * as util from "./speechflow-util"

/* audio stream queue element */
type AudioQueueElementSegment = {
    data: Float32Array,
    isSpeech?: boolean
}
type AudioQueueElement = {
    type: "audio-frame",
    chunk: SpeechFlowChunk,
    segmentIdx: number,
    segmentData: AudioQueueElementSegment[],
    isSpeech?: boolean
} | {
    type: "audio-eof"
}

/* SpeechFlow node for VAD speech-to-speech processing */
export default class SpeechFlowNodeA2AVAD extends SpeechFlowNode {
    /* declare official node name */
    public static name = "a2a-vad"

    /* internal state */
    private vad: RealTimeVAD | null = null
    private queue = new util.Queue<AudioQueueElement>()
    private queueRecv = this.queue.pointerUse("recv")
    private queueVAD  = this.queue.pointerUse("vad")
    private queueSend = this.queue.pointerUse("send")
    private destroyed = false
    private tailTimer: ReturnType<typeof setTimeout> | null = null
    private activeEventListeners = new Set<() => void>()

    /* construct node */
    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
        super(id, cfg, opts, args)

        /* declare node configuration parameters */
        this.configure({
            mode:               { type: "string", val: "silenced", match: /^(?:silenced|unplugged)$/ },
            posSpeechThreshold: { type: "number", val: 0.50 },
            negSpeechThreshold: { type: "number", val: 0.35 },
            minSpeechFrames:    { type: "number", val: 2    },
            redemptionFrames:   { type: "number", val: 12   },
            preSpeechPadFrames: { type: "number", val: 1    },
            postSpeechTail:     { type: "number", val: 1500 }
        })

        /* declare node input/output format */
        this.input  = "audio"
        this.output = "audio"
    }

    /* open node */
    async open () {
        /* sanity check situation */
        if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
            throw new Error("VAD node currently supports PCM-S16LE audio only")

        /* clear destruction flag */
        this.destroyed = false

        /* internal processing constants */
        const vadSampleRateTarget = 16000 /* internal target of VAD */
        const vadSamplesPerFrame  = 512   /* required for VAD v5    */

        /* helper function for timer cleanup */
        const clearTailTimer = () => {
            if (this.tailTimer !== null) {
                clearTimeout(this.tailTimer)
                this.tailTimer = null
            }
        }

        /* establish Voice Activity Detection (VAD) facility */
        let tail = false
        try {
            this.vad = await RealTimeVAD.new({
                model:                   "v5",
                sampleRate:              this.config.audioSampleRate, /* before resampling to 16 kHz */
                frameSamples:            vadSamplesPerFrame,          /* after resampling to 16 kHz  */
                positiveSpeechThreshold: this.params.posSpeechThreshold,
                negativeSpeechThreshold: this.params.negSpeechThreshold,
                minSpeechFrames:         this.params.minSpeechFrames,
                redemptionFrames:        this.params.redemptionFrames,
                preSpeechPadFrames:      this.params.preSpeechPadFrames,
                onSpeechStart: () => {
                    if (this.destroyed)
                        return
                    this.log("info", "VAD: speech start")
                    if (this.params.mode === "unplugged") {
                        tail = false
                        clearTailTimer()
                    }
                },
                onSpeechEnd: (audio) => {
                    if (this.destroyed)
                        return
                    const duration = util.audioArrayDuration(audio, vadSampleRateTarget)
                    this.log("info", `VAD: speech end (duration: ${duration.toFixed(2)}s)`)
                    if (this.params.mode === "unplugged") {
                        tail = true
                        clearTailTimer()
                        this.tailTimer = setTimeout(() => {
                            if (this.destroyed || this.tailTimer === null)
                                return
                            tail = false
                            this.tailTimer = null
                        }, this.params.postSpeechTail)
                    }
                },
                onVADMisfire: () => {
                    if (this.destroyed)
                        return
                    this.log("info", "VAD: speech end (segment too short)")
                    if (this.params.mode === "unplugged") {
                        tail = true
                        clearTailTimer()
                        this.tailTimer = setTimeout(() => {
                            if (this.destroyed || this.tailTimer === null)
                                return
                            tail = false
                            this.tailTimer = null
                        }, this.params.postSpeechTail)
                    }
                },
                onFrameProcessed: (audio) => {
                    if (this.destroyed)
                        return
                    try {
                        /* annotate the current audio segment */
                        const element = this.queueVAD.peek()
                        if (element === undefined || element.type !== "audio-frame")
                            throw new Error("internal error which cannot happen: no more queued element")
                        if (element.segmentIdx >= element.segmentData.length)
                            throw new Error("segment index out of bounds")
                        const segment = element.segmentData[element.segmentIdx++]
                        segment.isSpeech = (audio.isSpeech > audio.notSpeech) || tail

                        /* annotate the entire audio chunk */
                        if (element.segmentIdx >= element.segmentData.length) {
                            element.isSpeech = element.segmentData.some((segment) => segment.isSpeech)
                            this.queueVAD.touch()
                            this.queueVAD.walk(+1)
                        }
                    }
                    catch (error) {
                        this.log("error", `VAD frame processing error: ${error}`, { cause: error })
                    }
                }
            })
            this.vad.start()
        }
        catch (error) {
            throw new Error(`failed to initialize VAD: ${error}`, { cause: error })
        }

        /* provide Duplex stream and internally attach to VAD */
        const self = this
        this.stream = new Stream.Duplex({
            writableObjectMode: true,
            readableObjectMode: true,
            decodeStrings:      false,
            highWaterMark:      1,

            /* receive audio chunk (writable side of stream) */
            write (chunk: SpeechFlowChunk, encoding, callback) {
                if (self.destroyed) {
                    callback(new Error("stream already destroyed"))
                    return
                }
                if (!Buffer.isBuffer(chunk.payload))
                    callback(new Error("expected audio input as Buffer chunks"))
                else if (chunk.payload.byteLength === 0)
                    callback()
                else {
                    try {
                        /* convert audio samples from PCM/I16 to PCM/F32 */
                        const data = util.convertBufToF32(chunk.payload, self.config.audioLittleEndian)

                        /* segment audio samples as individual VAD-sized frames */
                        const segmentData: AudioQueueElementSegment[] = []
                        const chunkSize = vadSamplesPerFrame *
                            (self.config.audioSampleRate / vadSampleRateTarget)
                        const chunks = Math.trunc(data.length / chunkSize)
                        for (let i = 0; i < chunks; i++) {
                            const frame = data.slice(i * chunkSize, (i + 1) * chunkSize)
                            const segment: AudioQueueElementSegment = { data: frame }
                            segmentData.push(segment)
                        }
                        if ((chunks * chunkSize) < data.length) {
                            const frame = new Float32Array(chunkSize)
                            frame.fill(0)
                            frame.set(data.slice(chunks * chunkSize))
                            const segment: AudioQueueElementSegment = { data: frame }
                            segmentData.push(segment)
                        }

                        /* queue the results */
                        self.queueRecv.append({ type: "audio-frame", chunk, segmentIdx: 0, segmentData })

                        /* push segments through Voice Activity Detection (VAD) */
                        if (self.vad && !self.destroyed) {
                            try {
                                for (const segment of segmentData)
                                    self.vad.processAudio(segment.data)
                            }
                            catch (error) {
                                self.log("error", `VAD processAudio error: ${error}`)
                            }
                        }
                        callback()
                    }
                    catch (error) {
                        callback(error instanceof Error ? error : new Error("VAD processing failed"))
                    }
                }
            },

            /* receive no more audio chunks (writable side of stream) */
            final (callback) {
                if (self.destroyed) {
                    callback()
                    return
                }

                /* signal end of file */
                self.queueRecv.append({ type: "audio-eof" })
                callback()
            },

            /* send audio chunk(s) (readable side of stream) */
            read (_size) {
                if (self.destroyed) {
                    this.push(null)
                    return
                }

                /* try to perform read operation from scratch */
                const tryToRead = () => {
                    if (self.destroyed) {
                        this.push(null)
                        return
                    }

                    /* flush pending audio chunks */
                    const flushPendingChunks = () => {
                        let pushed = 0
                        while (true) {
                            if (self.destroyed) {
                                this.push(null)
                                return
                            }
                            const element = self.queueSend.peek()
                            if (element === undefined)
                                break
                            else if (element.type === "audio-eof") {
                                this.push(null)
                                break
                            }
                            else if (element.type === "audio-frame" && element.isSpeech === undefined)
                                break
                            self.queueSend.walk(+1)
                            self.queue.trim()
                            if (element.isSpeech) {
                                this.push(element.chunk)
                                pushed++
                            }
                            else if (self.params.mode === "silenced") {
                                const chunk = element.chunk.clone()
                                const buffer = chunk.payload as Buffer
                                buffer.fill(0)
                                this.push(chunk)
                                pushed++
                            }
                            else if (self.params.mode === "unplugged" && pushed === 0) {
                                /* we have to await chunks now, as in unplugged mode we
                                   else would be never called again until we at least
                                   once push a new chunk as the result */
                                setTimeout(() => {
                                    if (self.destroyed)
                                        return
                                    tryToRead()
                                }, 0)
                                return
                            }
                        }
                    }

                    /* await forthcoming audio chunks */
                    const awaitForthcomingChunks = () => {
                        if (self.destroyed)
                            return
                        const element = self.queueSend.peek()
                        if (element !== undefined
                            && element.type === "audio-frame"
                            && element.isSpeech !== undefined)
                            flushPendingChunks()
                        else if (!self.destroyed && !self.activeEventListeners.has(awaitForthcomingChunks)) {
                            self.queue.once("write", awaitForthcomingChunks)
                            self.activeEventListeners.add(awaitForthcomingChunks)
                        }
                    }

                    const element = self.queueSend.peek()
                    if (element !== undefined && element.type === "audio-eof")
                        this.push(null)
                    else if (element !== undefined
                        && element.type === "audio-frame"
                        && element.isSpeech !== undefined)
                        flushPendingChunks()
                    else if (!self.destroyed && !self.activeEventListeners.has(awaitForthcomingChunks)) {
                        self.queue.once("write", awaitForthcomingChunks)
                        self.activeEventListeners.add(awaitForthcomingChunks)
                    }
                }
                tryToRead()
            }
        })
    }

    /* close node */
    async close () {
        /* indicate destruction */
        this.destroyed = true

        /* cleanup tail timer */
        if (this.tailTimer !== null) {
            clearTimeout(this.tailTimer)
            this.tailTimer = null
        }

        /* remove all event listeners */
        this.activeEventListeners.forEach((listener) => {
            this.queue.removeListener("write", listener)
        })
        this.activeEventListeners.clear()

        /* close stream */
        if (this.stream !== null) {
            this.stream.destroy()
            this.stream = null
        }

        /* cleanup queue pointers before closing VAD to prevent callback access */
        this.queue.pointerDelete("recv")
        this.queue.pointerDelete("vad")
        this.queue.pointerDelete("send")

        /* close VAD */
        if (this.vad !== null) {
            try {
                const flushPromise   = this.vad.flush()
                const timeoutPromise = new Promise((resolve) => setTimeout(resolve, 5000))
                await Promise.race([ flushPromise, timeoutPromise ])
            }
            catch (error) {
                this.log("warning", `VAD flush error during close: ${error}`)
            }
            this.vad.destroy()
            this.vad = null
        }
    }
}
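
For orientation, here is a minimal usage sketch of this node. It is an assumption-laden illustration, not part of the module above: the import path, the mapping of the constructor's cfg argument onto this.config, the mapping of opts onto this.params, and the demo() harness are all hypothetical, inferred only from the sanity check in open() (PCM-S16LE input required) and from the object-mode Duplex the node exposes as this.stream.

/* hypothetical harness: import path, config keys and chunk plumbing
   are assumptions, not shown by the module itself */
import SpeechFlowNodeA2AVAD from "./speechflow-node-a2a-vad"
import { SpeechFlowChunk } from "./speechflow-node"

async function demo (chunks: SpeechFlowChunk[]) {
    /* the node insists on PCM-S16LE audio (see the sanity check in open()) */
    const cfg  = { audioBitDepth: 16, audioLittleEndian: true, audioSampleRate: 48000 }
    const node = new SpeechFlowNodeA2AVAD("vad", cfg, { mode: "silenced" }, [])
    await node.open()

    /* object-mode Duplex: write SpeechFlowChunk objects with Buffer payloads in,
       read speech-bearing (or, in "silenced" mode, zeroed) chunks back out */
    node.stream!.on("data", (chunk: SpeechFlowChunk) => {
        console.log(`received chunk: ${(chunk.payload as Buffer).byteLength} bytes`)
    })
    for (const chunk of chunks)
        node.stream!.write(chunk)
    node.stream!.end() /* triggers final(), which enqueues the audio-eof marker */

    /* flush the VAD (bounded by a 5s timeout in close()) and release resources */
    await node.close()
}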