UNPKG

speechflow

Version:

Speech Processing Flow Graph

372 lines 16.4 kB
"use strict"; /* ** SpeechFlow - Speech Processing Flow Graph ** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com> ** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only> */ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); /* standard dependencies */ const node_path_1 = __importDefault(require("node:path")); const node_stream_1 = __importDefault(require("node:stream")); /* external dependencies */ const Transformers = __importStar(require("@huggingface/transformers")); const wavefile_1 = require("wavefile"); const audio_inspect_1 = require("audio-inspect"); /* internal dependencies */ const speechflow_node_1 = __importDefault(require("./speechflow-node")); const util = __importStar(require("./speechflow-util")); /* SpeechFlow node for Gender recognition */ class SpeechFlowNodeA2AGender extends speechflow_node_1.default { /* declare official node name */ static name = "a2a-gender"; /* internal state */ classifier = null; queue = new util.Queue(); queueRecv = this.queue.pointerUse("recv"); queueAC = this.queue.pointerUse("ac"); queueSend = this.queue.pointerUse("send"); shutdown = false; workingOffTimer = null; progressInterval = null; /* construct node */ constructor(id, cfg, opts, args) { super(id, cfg, opts, args); /* declare node configuration parameters */ this.configure({ window: { type: "number", pos: 0, val: 500 }, threshold: { type: "number", pos: 1, val: 0.50 }, hysteresis: { type: "number", pos: 2, val: 0.25 }, volumeThreshold: { type: "number", pos: 3, val: -45 } }); /* declare node input/output format */ this.input = "audio"; this.output = "audio"; } /* open node */ async open() { /* sanity check situation */ if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian) throw new Error("Gender node currently supports PCM-S16LE audio only"); /* clear shutdown flag */ this.shutdown = false; /* the used model */ const model = "Xenova/wav2vec2-large-xlsr-53-gender-recognition-librispeech"; /* track download progress when instantiating Transformers engine and model */ const progressState = new Map(); const progressCallback = (progress) => { if (this.shutdown) return; let artifact = model; if (typeof progress.file === "string") artifact += `:${progress.file}`; let percent = 0; if (typeof progress.loaded === "number" && typeof progress.total === "number") percent = (progress.loaded / progress.total) * 100; else if (typeof progress.progress === "number") percent = progress.progress; if (percent > 0) progressState.set(artifact, percent); }; this.progressInterval = setInterval(() => { if (this.shutdown) return; for (const [artifact, percent] of progressState) { this.log("info", `downloaded ${percent.toFixed(2)}% of artifact "${artifact}"`); if (percent >= 100.0) progressState.delete(artifact); } }, 1000); try { const pipelinePromise = Transformers.pipeline("audio-classification", model, { cache_dir: node_path_1.default.join(this.config.cacheDir, "gender"), dtype: "q4", device: "auto", progress_callback: progressCallback }); this.classifier = await Promise.race([ pipelinePromise, util.timeoutPromise(30 * 1000, "model initialization timeout") ]); } catch (error) { if (this.progressInterval) { clearInterval(this.progressInterval); this.progressInterval = null; } throw new Error(`failed to initialize classifier pipeline: ${error}`, { cause: error }); } if (this.progressInterval) { clearInterval(this.progressInterval); this.progressInterval = null; } if (this.classifier === null) throw new Error("failed to instantiate classifier pipeline"); /* define sample rate required by model */ const sampleRateTarget = 16000; /* classify a single large-enough concatenated audio frame */ const classify = async (data) => { if (this.shutdown || this.classifier === null) throw new Error("classifier shutdown during operation"); /* check volume level and return "unknown" if too low in order to avoid a wrong classificaton */ const audioData = { sampleRate: sampleRateTarget, numberOfChannels: 1, channelData: [data], duration: data.length / sampleRateTarget, length: data.length }; const rms = (0, audio_inspect_1.getRMS)(audioData, { asDB: true }); if (rms < this.params.volumeThreshold) return "unknown"; /* classify audio */ const result = await Promise.race([ this.classifier(data), util.timeoutPromise(30 * 1000, "classification timeout") ]); const classified = Array.isArray(result) ? result : [result]; const c1 = classified.find((c) => c.label === "male"); const c2 = classified.find((c) => c.label === "female"); const male = c1 ? c1.score : 0.0; const female = c2 ? c2.score : 0.0; const threshold = this.params.threshold; const hysteresis = this.params.hysteresis; if (male > threshold && male > female + hysteresis) return "male"; else if (female > threshold && female > male + hysteresis) return "female"; else return "unknown"; }; /* work off queued audio frames */ const frameWindowDuration = this.params.window / 1000; const frameWindowSamples = Math.floor(frameWindowDuration * sampleRateTarget); let lastGender = ""; let workingOff = false; const workOffQueue = async () => { /* control working off round */ if (workingOff || this.shutdown) return; workingOff = true; if (this.workingOffTimer !== null) { clearTimeout(this.workingOffTimer); this.workingOffTimer = null; } this.queue.off("write", workOffQueue); /* workoff the queue */ try { let pos0 = this.queueAC.position(); const posL = this.queueAC.maxPosition(); const data = new Float32Array(frameWindowSamples); data.fill(0); let samples = 0; let pos = pos0; while (pos < posL && samples < frameWindowSamples && !this.shutdown) { const element = this.queueAC.peek(pos); if (element === undefined || element.type !== "audio-frame") break; if ((samples + element.data.length) < frameWindowSamples) { data.set(element.data, samples); samples += element.data.length; } pos++; } if (pos0 < pos && samples > frameWindowSamples * 0.75 && !this.shutdown) { const gender = await classify(data); if (this.shutdown) return; const posM = pos0 + Math.trunc((pos - pos0) * 0.25); while (pos0 < posM && pos0 < posL && !this.shutdown) { const element = this.queueAC.peek(pos0); if (element === undefined || element.type !== "audio-frame") break; element.gender = gender; this.queueAC.touch(); this.queueAC.walk(+1); pos0++; } if (lastGender !== gender && !this.shutdown) { this.log("info", `gender now recognized as <${gender}>`); lastGender = gender; } } } catch (error) { this.log("error", `gender classification error: ${error}`); } /* re-initiate working off round */ workingOff = false; if (!this.shutdown) { this.workingOffTimer = setTimeout(workOffQueue, 100); this.queue.once("write", workOffQueue); } }; this.queue.once("write", workOffQueue); /* provide Duplex stream and internally attach to classifier */ const self = this; this.stream = new node_stream_1.default.Duplex({ writableObjectMode: true, readableObjectMode: true, decodeStrings: false, highWaterMark: 1, /* receive audio chunk (writable side of stream) */ write(chunk, encoding, callback) { if (self.shutdown) { callback(new Error("stream already destroyed")); return; } if (!Buffer.isBuffer(chunk.payload)) callback(new Error("expected audio input as Buffer chunks")); else if (chunk.payload.byteLength === 0) callback(); else { try { /* convert audio samples from PCM/I16/48KHz to PCM/F32/16KHz */ let data = util.convertBufToF32(chunk.payload, self.config.audioLittleEndian); const wav = new wavefile_1.WaveFile(); wav.fromScratch(self.config.audioChannels, self.config.audioSampleRate, "32f", data); wav.toSampleRate(sampleRateTarget, { method: "cubic" }); data = wav.getSamples(false, Float32Array); /* queue chunk and converted data */ self.queueRecv.append({ type: "audio-frame", chunk, data }); callback(); } catch (error) { callback(error instanceof Error ? error : new Error("audio processing failed")); } } }, /* receive no more audio chunks (writable side of stream) */ final(callback) { if (self.shutdown) { callback(); return; } /* signal end of file */ self.queueRecv.append({ type: "audio-eof" }); callback(); }, /* send audio chunk(s) (readable side of stream) */ read(_size) { /* flush pending audio chunks */ const flushPendingChunks = () => { if (self.shutdown) { this.push(null); return; } const element = self.queueSend.peek(); if (element !== undefined && element.type === "audio-eof") this.push(null); else if (element !== undefined && element.type === "audio-frame" && element.gender !== undefined) { while (true) { if (self.shutdown) { this.push(null); return; } const element = self.queueSend.peek(); if (element === undefined) break; else if (element.type === "audio-eof") { this.push(null); break; } else if (element.type === "audio-frame" && element.gender === undefined) break; const duration = util.audioArrayDuration(element.data); self.log("debug", `send chunk (${duration.toFixed(3)}s) with gender <${element.gender}>`); element.chunk.meta.set("gender", element.gender); this.push(element.chunk); self.queueSend.walk(+1); self.queue.trim(); } } else if (!self.shutdown) self.queue.once("write", flushPendingChunks); }; flushPendingChunks(); } }); } /* close node */ async close() { /* indicate shutdown */ this.shutdown = true; /* cleanup working-off timer */ if (this.workingOffTimer !== null) { clearTimeout(this.workingOffTimer); this.workingOffTimer = null; } /* cleanup progress interval */ if (this.progressInterval !== null) { clearInterval(this.progressInterval); this.progressInterval = null; } /* remove all event listeners */ this.queue.removeAllListeners("write"); /* close stream */ if (this.stream !== null) { this.stream.destroy(); this.stream = null; } /* cleanup classifier */ if (this.classifier !== null) { try { const disposePromise = this.classifier.dispose(); const timeoutPromise = new Promise((resolve) => setTimeout(resolve, 5000)); await Promise.race([disposePromise, timeoutPromise]); } catch (error) { this.log("warning", `error during classifier cleanup: ${error}`); } this.classifier = null; } /* cleanup queue pointers */ this.queue.pointerDelete("recv"); this.queue.pointerDelete("ac"); this.queue.pointerDelete("send"); } } exports.default = SpeechFlowNodeA2AGender; //# sourceMappingURL=speechflow-node-a2a-gender.js.map