UNPKG

speech-detector

Version:

A Node.js library for detecting speech in audio using Silero's VAD model

55 lines (54 loc) 1.87 kB
import ort from "onnxruntime-web";
import * as fs from "fs";

// This resolves a bug with WASM in nodejs.
ort.env.wasm.numThreads = 1;
// NOTE(review): `remoteModels` does not look like a documented onnxruntime
// env flag — confirm that this assignment has any effect.
ort.env.remoteModels = false;

// Dimensions of the `h` and `c` state tensors fed to the model.
const STATE_DIMS = [2, 1, 64];

/**
 * Builds a zero-filled float32 state tensor of shape STATE_DIMS.
 *
 * @returns {ort.Tensor} A fresh tensor that shares no storage with any other.
 */
function createZeroState() {
  const size = STATE_DIMS.reduce((total, dim) => total * dim, 1);
  return new ort.Tensor("float32", new Float32Array(size), [...STATE_DIMS]);
}

/**
 * Thin wrapper around an ONNX inference session for the Silero VAD model.
 *
 * The instance is stateful: every call to `process` stores the model's `hn`
 * and `cn` outputs and feeds them back as the `h` and `c` inputs of the next
 * call (presumably the LSTM hidden/cell state — verify against the model).
 * Use one instance per audio stream, or call `resetState` between streams.
 */
class Silero {
  session;
  sampleRate;
  // Internal variables
  c;
  h;

  /**
   * @param {ort.InferenceSession} session - Session with the model loaded.
   * @param {number|bigint} sampleRate - Sample rate of the audio, in Hz.
   *   Must be an integer; `BigInt` throws on fractional numbers.
   */
  constructor(session, sampleRate) {
    this.session = session;
    // The model takes the sample rate as an int64 tensor. int64 tensors are
    // backed by BigInt64Array, so convert explicitly instead of relying on
    // the runtime to coerce a plain JS number.
    this.sampleRate = new ort.Tensor(
      "int64",
      BigInt64Array.from([BigInt(sampleRate)]),
      [1],
    );
    this.h = createZeroState();
    this.c = createZeroState();
  }

  /**
   * Loads the model and returns a ready-to-use detector.
   *
   * @param {number|bigint} sampleRate - Sample rate of the audio, in Hz.
   * @param {string} [uri="./model/silero_vad.onnx"] - Location of the model.
   *   In NodeJS it is resolved relative to this module and read from disk.
   * @returns {Promise<Silero>}
   */
  static async create(sampleRate, uri = "./model/silero_vad.onnx") {
    const opt = {
      executionProviders: ["wasm"],
      logSeverityLevel: 3,
      logVerbosityLevel: 3,
    };

    // For compatibility convert the URI into a properly
    // formatted URL, resolved relative to this module.
    const path = new URL(uri, import.meta.url);

    let session;
    if (typeof window === "undefined") {
      // Only read in the model file in NodeJS. Read asynchronously so that
      // loading the model does not block the event loop.
      const model = await fs.promises.readFile(path);
      session = await ort.InferenceSession.create(model, opt);
    } else {
      // NOTE(review): the browser receives the raw `uri`, so a relative value
      // is resolved by the runtime rather than against this module. Kept
      // as-is to preserve behavior for existing callers — confirm intent.
      session = await ort.InferenceSession.create(uri, opt);
    }

    return new Silero(session, sampleRate);
  }

  /**
   * Clears the state carried over from previous `process` calls, so that the
   * next chunk is evaluated as the start of a new stream.
   */
  resetState() {
    this.h = createZeroState();
    this.c = createZeroState();
  }

  /**
   * Runs the model on one chunk of audio and updates the carried state.
   *
   * @param {Float32Array|number[]} audio - Samples of a single chunk; wrapped
   *   in a float32 tensor of shape [1, audio.length].
   * @returns {Promise<number>} First element of the model's `output` tensor
   *   (presumably the speech probability for the chunk — TODO confirm).
   */
  async process(audio) {
    const input = new ort.Tensor("float32", audio, [1, audio.length]);
    const feeds = {
      input,
      h: this.h,
      c: this.c,
      sr: this.sampleRate,
    };

    const { output, hn, cn } = await this.session.run(feeds);

    // Keep the new state for the next chunk.
    this.h = hn;
    this.c = cn;

    return output.data[0];
  }
}

export { Silero };