web-voice-detection
A WebAssembly-powered Voice Activity Detection library for the browser.
import {
  defaultFrameProcessorOptions,
  FrameProcessor,
  FrameProcessorInterface,
  FrameProcessorOptions,
  validateOptions,
} from "./frame-processor";
import { Message } from "./messages";
import { ModelFetcher, ONNXModel, ONNXRuntimeAPI, OrtOptions } from "./models";
import { Resampler } from "./resampler";

// A detected speech segment: the 16 kHz audio samples plus start/end
// timestamps in milliseconds relative to the beginning of the input.
type NonRealTimeDetectionSpeechData = {
  audio: Float32Array;
  start: number;
  end: number;
};

export type NonRealTimeDetectionOptions = FrameProcessorOptions & OrtOptions;

export const defaultNonRealTimeDetectionOptions: NonRealTimeDetectionOptions = {
  ...defaultFrameProcessorOptions,
  ortConfig: undefined,
};

export class PlatformAgnosticNonRealTimeDetection {
  frameProcessor: FrameProcessorInterface | undefined;

  // Async factory: constructs the instance and loads the ONNX model before
  // returning it, so callers never see a half-initialized detector.
  static async _new(
    modelFetcher: ModelFetcher,
    ort: ONNXRuntimeAPI,
    options: NonRealTimeDetectionOptions,
  ): Promise<PlatformAgnosticNonRealTimeDetection> {
    const detection = new this(modelFetcher, ort, options);
    await detection.init();
    return detection;
  }

  constructor(
    public modelFetcher: ModelFetcher,
    public ort: ONNXRuntimeAPI,
    public options: NonRealTimeDetectionOptions,
  ) {
    validateOptions(options);
  }

  // Fetch the model, wrap it in a frame processor configured from the
  // user-supplied options, and start the processor.
  init = async () => {
    const model = await ONNXModel.new(this.ort, this.modelFetcher);
    this.frameProcessor = new FrameProcessor(model.process, model.reset_state, {
      frameSamples: this.options.frameSamples,
      positiveSpeechThreshold: this.options.positiveSpeechThreshold,
      negativeSpeechThreshold: this.options.negativeSpeechThreshold,
      redemptionFrames: this.options.redemptionFrames,
      preSpeechPadFrames: this.options.preSpeechPadFrames,
      minSpeechFrames: this.options.minSpeechFrames,
      submitUserSpeechOnPause: this.options.submitUserSpeechOnPause,
    });
    this.frameProcessor.resume();
  };

  // Run detection over a complete audio buffer, yielding one item per
  // detected speech segment. The input is resampled to 16 kHz before being
  // fed to the frame processor, and timestamps are reported in milliseconds
  // (samples divided by 16 at the 16 kHz target rate).
  run = async function* (
    this: PlatformAgnosticNonRealTimeDetection,
    inputAudio: Float32Array,
    sampleRate: number,
  ): AsyncGenerator<NonRealTimeDetectionSpeechData> {
    const resamplerOptions = {
      nativeSampleRate: sampleRate,
      targetSampleRate: 16000,
      targetFrameSize: this.options.frameSamples,
    };
    const resampler = new Resampler(resamplerOptions);
    let start = 0;
    let end = 0;
    let frameIndex = 0;
    for await (const frame of resampler.stream(inputAudio)) {
      const { msg, audio } = await this.frameProcessor!.process(frame);
      switch (msg) {
        case Message.SpeechStart:
          start = (frameIndex * this.options.frameSamples) / 16;
          break;
        case Message.SpeechEnd:
          end = ((frameIndex + 1) * this.options.frameSamples) / 16;
          if (audio) {
            yield { audio, start, end };
          }
          break;
        default:
          break;
      }
      frameIndex++;
    }
    // Flush any in-progress segment once the input is exhausted.
    const { msg, audio } = this.frameProcessor!.endSegment();
    if (msg === Message.SpeechEnd) {
      if (audio) {
        yield {
          audio,
          start,
          end: (frameIndex * this.options.frameSamples) / 16,
        };
      }
    }
  };
}
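
For orientation, here is a minimal usage sketch, not part of this file: it assumes the module above is importable as "./non-real-time-detection", that ModelFetcher is a zero-argument function resolving to the model bytes as an ArrayBuffer, that the onnxruntime-web namespace can be passed as the ONNXRuntimeAPI argument, and that the model URL is a placeholder. The async _new factory is used because the constructor cannot await model loading.

// Hypothetical usage sketch; module path, model URL, and the ModelFetcher
// shape are assumptions about sibling modules, not defined in this file.
import * as ort from "onnxruntime-web";
import {
  defaultNonRealTimeDetectionOptions,
  PlatformAgnosticNonRealTimeDetection,
} from "./non-real-time-detection";

// Assumed ModelFetcher shape: () => Promise<ArrayBuffer>.
const modelFetcher = async () => {
  const response = await fetch("/models/vad.onnx"); // placeholder URL
  return await response.arrayBuffer();
};

async function detectSpeech(audio: Float32Array, sampleRate: number) {
  const detection = await PlatformAgnosticNonRealTimeDetection._new(
    modelFetcher,
    ort, // assumed to satisfy the ONNXRuntimeAPI type from "./models"
    defaultNonRealTimeDetectionOptions,
  );
  // run() yields one segment at a time; start/end are in milliseconds.
  for await (const { start, end } of detection.run(audio, sampleRate)) {
    console.log(`speech from ${start} ms to ${end} ms`);
  }
}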