"use strict";
/*
** SpeechFlow - Speech Processing Flow Graph
** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
*/
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
/* standard dependencies */
const node_path_1 = __importDefault(require("node:path"));
const node_stream_1 = __importDefault(require("node:stream"));
/* external dependencies */
const Transformers = __importStar(require("@huggingface/transformers"));
const wavefile_1 = require("wavefile");
const audio_inspect_1 = require("audio-inspect");
/* internal dependencies */
const speechflow_node_1 = __importDefault(require("./speechflow-node"));
const util = __importStar(require("./speechflow-util"));
/* SpeechFlow node for Gender recognition */
class SpeechFlowNodeA2AGender extends speechflow_node_1.default {
/* declare official node name */
static name = "a2a-gender";
/* internal state */
classifier = null;
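/* shared frame queue with separate pointers for the receiving, classifying and sending stages */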
queue = new util.Queue();
queueRecv = this.queue.pointerUse("recv");
queueAC = this.queue.pointerUse("ac");
queueSend = this.queue.pointerUse("send");
shutdown = false;
workingOffTimer = null;
progressInterval = null;
/* construct node */
constructor(id, cfg, opts, args) {
super(id, cfg, opts, args);
/* declare node configuration parameters */
this.configure({
window: { type: "number", pos: 0, val: 500 },
threshold: { type: "number", pos: 1, val: 0.50 },
hysteresis: { type: "number", pos: 2, val: 0.25 },
volumeThreshold: { type: "number", pos: 3, val: -45 }
});
/* declare node input/output format */
this.input = "audio";
this.output = "audio";
}
/* open node */
async open() {
/* sanity check situation */
if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
throw new Error("Gender node currently supports PCM-S16LE audio only");
/* clear shutdown flag */
this.shutdown = false;
/* the model to use */
const model = "Xenova/wav2vec2-large-xlsr-53-gender-recognition-librispeech";
/* track download progress while instantiating the Transformers engine and model */
const progressState = new Map();
const progressCallback = (progress) => {
if (this.shutdown)
return;
let artifact = model;
if (typeof progress.file === "string")
artifact += `:${progress.file}`;
let percent = 0;
if (typeof progress.loaded === "number" && typeof progress.total === "number")
percent = (progress.loaded / progress.total) * 100;
else if (typeof progress.progress === "number")
percent = progress.progress;
if (percent > 0)
progressState.set(artifact, percent);
};
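/* periodically report download progress and forget artifacts once they are complete */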
this.progressInterval = setInterval(() => {
if (this.shutdown)
return;
for (const [artifact, percent] of progressState) {
this.log("info", `downloaded ${percent.toFixed(2)}% of artifact "${artifact}"`);
if (percent >= 100.0)
progressState.delete(artifact);
}
}, 1000);
try {
const pipelinePromise = Transformers.pipeline("audio-classification", model, {
cache_dir: node_path_1.default.join(this.config.cacheDir, "gender"),
dtype: "q4",
device: "auto",
progress_callback: progressCallback
});
this.classifier = await Promise.race([
pipelinePromise,
util.timeoutPromise(30 * 1000, "model initialization timeout")
]);
}
catch (error) {
if (this.progressInterval) {
clearInterval(this.progressInterval);
this.progressInterval = null;
}
throw new Error(`failed to initialize classifier pipeline: ${error}`, { cause: error });
}
if (this.progressInterval) {
clearInterval(this.progressInterval);
this.progressInterval = null;
}
if (this.classifier === null)
throw new Error("failed to instantiate classifier pipeline");
/* define sample rate required by model */
const sampleRateTarget = 16000;
/* classify a single large-enough concatenated audio frame */
const classify = async (data) => {
if (this.shutdown || this.classifier === null)
throw new Error("classifier shutdown during operation");
/* check the volume level and return "unknown" if it is too low,
in order to avoid a wrong classification */
const audioData = {
sampleRate: sampleRateTarget,
numberOfChannels: 1,
channelData: [data],
duration: data.length / sampleRateTarget,
length: data.length
};
const rms = (0, audio_inspect_1.getRMS)(audioData, { asDB: true });
if (rms < this.params.volumeThreshold)
return "unknown";
/* classify audio */
const result = await Promise.race([
this.classifier(data),
util.timeoutPromise(30 * 1000, "classification timeout")
]);
const classified = Array.isArray(result) ?
result :
[result];
const c1 = classified.find((c) => c.label === "male");
const c2 = classified.find((c) => c.label === "female");
const male = c1 ? c1.score : 0.0;
const female = c2 ? c2.score : 0.0;
const threshold = this.params.threshold;
const hysteresis = this.params.hysteresis;
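/* a gender wins only if its score exceeds the threshold and leads the other score by at least the hysteresis margin */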
if (male > threshold && male > female + hysteresis)
return "male";
else if (female > threshold && female > male + hysteresis)
return "female";
else
return "unknown";
};
/* work off queued audio frames */
const frameWindowDuration = this.params.window / 1000;
const frameWindowSamples = Math.floor(frameWindowDuration * sampleRateTarget);
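/* remember the last reported gender (to log changes only) and guard against overlapping working-off rounds */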
let lastGender = "";
let workingOff = false;
const workOffQueue = async () => {
/* control working off round */
if (workingOff || this.shutdown)
return;
workingOff = true;
if (this.workingOffTimer !== null) {
clearTimeout(this.workingOffTimer);
this.workingOffTimer = null;
}
this.queue.off("write", workOffQueue);
/* work off the queue */
try {
let pos0 = this.queueAC.position();
const posL = this.queueAC.maxPosition();
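/* assemble a zero-padded window buffer from consecutive queued audio frames */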
const data = new Float32Array(frameWindowSamples);
data.fill(0);
let samples = 0;
let pos = pos0;
while (pos < posL && samples < frameWindowSamples && !this.shutdown) {
const element = this.queueAC.peek(pos);
if (element === undefined || element.type !== "audio-frame")
break;
if ((samples + element.data.length) < frameWindowSamples) {
data.set(element.data, samples);
samples += element.data.length;
}
pos++;
}
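/* classify only once at least 75% of the window is filled with samples */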
if (pos0 < pos && samples > frameWindowSamples * 0.75 && !this.shutdown) {
const gender = await classify(data);
if (this.shutdown)
return;
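/* attribute the result only to the first quarter of the examined frames, so the next classification window overlaps the remaining ones */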
const posM = pos0 + Math.trunc((pos - pos0) * 0.25);
while (pos0 < posM && pos0 < posL && !this.shutdown) {
const element = this.queueAC.peek(pos0);
if (element === undefined || element.type !== "audio-frame")
break;
element.gender = gender;
this.queueAC.touch();
this.queueAC.walk(+1);
pos0++;
}
if (lastGender !== gender && !this.shutdown) {
this.log("info", `gender now recognized as <${gender}>`);
lastGender = gender;
}
}
}
catch (error) {
this.log("error", `gender classification error: ${error}`);
}
/* re-initiate working off round */
workingOff = false;
if (!this.shutdown) {
this.workingOffTimer = setTimeout(workOffQueue, 100);
this.queue.once("write", workOffQueue);
}
};
this.queue.once("write", workOffQueue);
/* provide Duplex stream and internally attach to classifier */
const self = this;
this.stream = new node_stream_1.default.Duplex({
writableObjectMode: true,
readableObjectMode: true,
decodeStrings: false,
highWaterMark: 1,
/* receive audio chunk (writable side of stream) */
write(chunk, encoding, callback) {
if (self.shutdown) {
callback(new Error("stream already destroyed"));
return;
}
if (!Buffer.isBuffer(chunk.payload))
callback(new Error("expected audio input as Buffer chunks"));
else if (chunk.payload.byteLength === 0)
callback();
else {
try {
/* convert audio samples from PCM/I16/48KHz to PCM/F32/16KHz */
let data = util.convertBufToF32(chunk.payload, self.config.audioLittleEndian);
const wav = new wavefile_1.WaveFile();
wav.fromScratch(self.config.audioChannels, self.config.audioSampleRate, "32f", data);
wav.toSampleRate(sampleRateTarget, { method: "cubic" });
data = wav.getSamples(false, Float32Array);
/* queue chunk and converted data */
self.queueRecv.append({ type: "audio-frame", chunk, data });
callback();
}
catch (error) {
callback(error instanceof Error ? error : new Error("audio processing failed"));
}
}
},
/* receive no more audio chunks (writable side of stream) */
final(callback) {
if (self.shutdown) {
callback();
return;
}
/* signal end of file */
self.queueRecv.append({ type: "audio-eof" });
callback();
},
/* send audio chunk(s) (readable side of stream) */
read(_size) {
/* flush pending audio chunks */
const flushPendingChunks = () => {
if (self.shutdown) {
this.push(null);
return;
}
const element = self.queueSend.peek();
if (element !== undefined
&& element.type === "audio-eof")
this.push(null);
else if (element !== undefined
&& element.type === "audio-frame"
&& element.gender !== undefined) {
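/* drain all consecutively classified frames until EOF, an unclassified frame, or an empty queue is reached */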
while (true) {
if (self.shutdown) {
this.push(null);
return;
}
const element = self.queueSend.peek();
if (element === undefined)
break;
else if (element.type === "audio-eof") {
this.push(null);
break;
}
else if (element.type === "audio-frame"
&& element.gender === undefined)
break;
const duration = util.audioArrayDuration(element.data);
self.log("debug", `send chunk (${duration.toFixed(3)}s) with gender <${element.gender}>`);
element.chunk.meta.set("gender", element.gender);
this.push(element.chunk);
self.queueSend.walk(+1);
self.queue.trim();
}
}
else if (!self.shutdown)
self.queue.once("write", flushPendingChunks);
};
flushPendingChunks();
}
});
}
/* close node */
async close() {
/* indicate shutdown */
this.shutdown = true;
/* cleanup working-off timer */
if (this.workingOffTimer !== null) {
clearTimeout(this.workingOffTimer);
this.workingOffTimer = null;
}
/* cleanup progress interval */
if (this.progressInterval !== null) {
clearInterval(this.progressInterval);
this.progressInterval = null;
}
/* remove all event listeners */
this.queue.removeAllListeners("write");
/* close stream */
if (this.stream !== null) {
this.stream.destroy();
this.stream = null;
}
/* cleanup classifier */
if (this.classifier !== null) {
try {
const disposePromise = this.classifier.dispose();
const timeoutPromise = new Promise((resolve) => setTimeout(resolve, 5000));
await Promise.race([disposePromise, timeoutPromise]);
}
catch (error) {
this.log("warning", `error during classifier cleanup: ${error}`);
}
this.classifier = null;
}
/* cleanup queue pointers */
this.queue.pointerDelete("recv");
this.queue.pointerDelete("ac");
this.queue.pointerDelete("send");
}
}
exports.default = SpeechFlowNodeA2AGender;
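/*
** Usage sketch (illustrative only; "cfg", "opts", "args" and the surrounding
** source/sink streams are placeholders for the SpeechFlow pipeline context):
**
**   const SpeechFlowNodeA2AGender = require("./speechflow-node-a2a-gender").default
**   const node = new SpeechFlowNodeA2AGender("gender", cfg, opts, args)
**   await node.open()
**   audioSource.pipe(node.stream).pipe(audioSink)  // chunks leave with a "gender" meta entry
**   await node.close()
*/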
//# sourceMappingURL=speechflow-node-a2a-gender.js.map