/*
 * ten-vad-lib
 * Version: (not specified)
 * A JavaScript library for Ten VAD (Voice Activity Detection) based on WebAssembly
 * 376 lines (372 loc) • 14.7 kB
 * JavaScript
 */
/**
 * Loads and caches the Emscripten-generated Ten VAD WebAssembly module.
 *
 * A shared instance is available through {@link VADModuleLoader.getInstance}.
 * The loader expects a browser-like environment: it reads
 * `window.createVADModule`, injects a `<script>` tag through `document`, and
 * downloads the `.wasm` binary with `fetch`.
 */
class VADModuleLoader {
  constructor() {
    // The instantiated module once loading has succeeded, otherwise null.
    this.module = null;
    // Promise of the load currently in flight, so concurrent callers share it.
    this.loadingPromise = null;
  }

  /**
   * Returns the shared loader, creating it on first use.
   *
   * @returns {VADModuleLoader} The process-wide loader instance.
   */
  static getInstance() {
    if (!VADModuleLoader.instance) {
      VADModuleLoader.instance = new VADModuleLoader();
    }
    return VADModuleLoader.instance;
  }

  /**
   * Loads the module once and returns the cached instance afterwards.
   *
   * Concurrent calls share a single in-flight load; the options of the first
   * caller win. A failed load is never cached, so a later call starts over.
   *
   * @param {{wasmPath?: string, jsPath?: string}} [options] Asset locations.
   * @returns {Promise<object>} The instantiated module.
   * @throws {Error} Whatever the underlying load failed with.
   */
  async loadModule(options = {}) {
    if (this.module) {
      return this.module;
    }
    if (!this.loadingPromise) {
      this.loadingPromise = this.loadModuleInternal(options);
    }
    const pending = this.loadingPromise;
    try {
      return await pending;
    }
    catch (error) {
      // loadModuleInternal can fail before its first `await`; in that case its
      // own cleanup runs before the assignment above, so the rejected promise
      // must also be dropped here or every later call would replay the
      // failure. The identity check avoids clobbering a newer attempt.
      if (this.loadingPromise === pending) {
        this.loadingPromise = null;
      }
      throw error;
    }
  }

  /**
   * Performs the actual load: ensures the glue script is present, downloads
   * the WASM binary and instantiates the module.
   *
   * @param {{wasmPath?: string, jsPath?: string}} [options] Asset locations;
   *   missing entries fall back to the paths derived from the page's script tag.
   * @returns {Promise<object>} The instantiated module.
   * @throws {Error} If the factory never appears or the binary cannot be fetched.
   */
  async loadModuleInternal(options = {}) {
    const wasmPath = options.wasmPath || this.getDefaultWasmPath();
    const jsPath = options.jsPath || this.getDefaultJsPath();
    try {
      if (!window.createVADModule) {
        await this.loadScript(jsPath);
      }
      // The script's load event may fire before the factory is published, so
      // poll for it: 100 attempts, 100 ms apart.
      let attempts = 0;
      const maxAttempts = 100;
      while (!window.createVADModule && attempts < maxAttempts) {
        await new Promise((resolve) => setTimeout(resolve, 100));
        attempts++;
      }
      if (!window.createVADModule) {
        throw new Error('createVADModule not found. Please ensure ten_vad.js is loaded correctly.');
      }
      const wasmResponse = await fetch(wasmPath);
      if (!wasmResponse.ok) {
        throw new Error(`Failed to load WASM file from ${wasmPath}`);
      }
      const wasmBinary = await wasmResponse.arrayBuffer();
      const module = await window.createVADModule({
        wasmBinary,
        // Point any .wasm lookup at the configured path; leave other files alone.
        locateFile: (filePath) => (filePath.endsWith('.wasm') ? wasmPath : filePath),
        noInitialRun: false,
        noExitRuntime: true,
      });
      this.addHelperFunctions(module);
      this.module = module;
      return module;
    }
    catch (error) {
      this.loadingPromise = null;
      throw error;
    }
  }

  /**
   * Builds the default URL of an asset in the `wasm/` folder next to the
   * script that loaded this library.
   *
   * @param {string} fileName File name inside the `wasm/` folder.
   * @returns {string} `<script dir>/wasm/<fileName>` when a script tag with a
   *   `src` is found, otherwise `/wasm/<fileName>`.
   */
  resolveDefaultAssetPath(fileName) {
    if (typeof window !== 'undefined') {
      const script = document.currentScript || document.querySelector('script[src*="ten-vad"]');
      const scriptSrc = script ? script.getAttribute('src') : null;
      if (scriptSrc) {
        const baseUrl = scriptSrc.substring(0, scriptSrc.lastIndexOf('/'));
        return `${baseUrl}/wasm/${fileName}`;
      }
    }
    return `/wasm/${fileName}`;
  }

  /**
   * @returns {string} Default location of the WASM binary.
   */
  getDefaultWasmPath() {
    return this.resolveDefaultAssetPath('ten_vad.wasm');
  }

  /**
   * @returns {string} Default location of the JavaScript glue file.
   */
  getDefaultJsPath() {
    return this.resolveDefaultAssetPath('ten_vad.js');
  }

  /**
   * Injects a `<script>` tag into `document.head` and waits for it to load.
   *
   * @param {string} src URL of the script.
   * @returns {Promise<void>} Resolves on load, rejects on a load error.
   */
  loadScript(src) {
    return new Promise((resolve, reject) => {
      const script = document.createElement('script');
      script.src = src;
      script.onload = () => resolve();
      script.onerror = () => reject(new Error(`Failed to load script: ${src}`));
      document.head.appendChild(script);
    });
  }

  /**
   * Installs minimal `getValue` and `UTF8ToString` fallbacks on the module
   * when the build does not already provide them.
   *
   * @param {object} module Module exposing `HEAP32`, `HEAPF32` and `HEAPU8`.
   */
  addHelperFunctions(module) {
    if (!module.getValue) {
      // Only the two value types read by this library are supported.
      module.getValue = function (ptr, type) {
        switch (type) {
          case 'i32': return module.HEAP32[ptr >> 2];
          case 'float': return module.HEAPF32[ptr >> 2];
          default: throw new Error(`Unsupported type: ${type}`);
        }
      };
    }
    if (!module.UTF8ToString) {
      module.UTF8ToString = function (ptr) {
        if (!ptr) {
          return '';
        }
        // Read the heap view at call time rather than capturing it up front.
        const heap = module.HEAPU8;
        let end = ptr;
        while (heap[end]) {
          end++;
        }
        if (typeof TextDecoder === 'undefined') {
          // No decoder available: fall back to one character per byte, which
          // is only correct for ASCII content.
          let result = '';
          for (let i = ptr; i < end; i++) {
            result += String.fromCharCode(heap[i]);
          }
          return result;
        }
        // Decode a copy of the bytes so multi-byte UTF-8 sequences survive.
        return new TextDecoder('utf-8').decode(heap.slice(ptr, end));
      };
    }
  }

  /**
   * @returns {object|null} The loaded module, or null if none is loaded.
   */
  getModule() {
    return this.module;
  }

  /**
   * Forgets the loaded module and any in-flight load.
   */
  reset() {
    this.module = null;
    this.loadingPromise = null;
  }
}
/**
 * Thin wrapper around one native Ten VAD instance living inside the
 * WebAssembly module.
 *
 * The wrapper owns a 4-byte slot in WASM memory (`vadHandlePtr`) that the
 * native create call fills with the instance handle (`vadHandle`). Call
 * {@link VADInstance#destroy} when done to release both.
 */
class VADInstance {
  /**
   * @param {object} module Loaded module exposing `_malloc`, `_free`,
   *   `_ten_vad_create`, `_ten_vad_process`, `_ten_vad_destroy`,
   *   `_ten_vad_get_version`, `getValue`, `UTF8ToString` and `HEAP16`.
   * @param {number} hopSize Number of samples per processed frame.
   * @param {number} voiceThreshold Threshold forwarded to the native create call.
   * @throws {Error} If the native instance cannot be created.
   */
  constructor(module, hopSize, voiceThreshold) {
    this.isDestroyed = false;
    this.module = module;
    this.hopSize = hopSize;
    this.voiceThreshold = voiceThreshold;
    this.vadHandlePtr = 0;
    this.vadHandle = 0;
    this.createNativeHandle('creation');
  }

  /**
   * Allocates the handle slot and creates the native instance, shared by the
   * constructor and {@link VADInstance#reset}.
   *
   * On failure the slot is released again, so nothing leaks and the instance
   * keeps no dangling pointer.
   *
   * @param {string} action Word used in the error message ("creation" or
   *   "recreation").
   * @throws {Error} If allocation fails or the native call returns non-zero.
   */
  createNativeHandle(action) {
    const handlePtr = this.module._malloc(4);
    if (!handlePtr) {
      throw new Error(`VAD ${action} failed: unable to allocate WASM memory`);
    }
    const createResult = this.module._ten_vad_create(handlePtr, this.hopSize, this.voiceThreshold);
    if (createResult !== 0) {
      this.module._free(handlePtr);
      throw new Error(`VAD ${action} failed with code: ${createResult}`);
    }
    this.vadHandlePtr = handlePtr;
    this.vadHandle = this.module.getValue(handlePtr, 'i32');
  }

  /**
   * Runs the detector on exactly one frame.
   *
   * @param {Int16Array} audioData Frame of `hopSize` samples; it is copied
   *   into the module's 16-bit heap view.
   * @returns {Promise<{probability: number, isVoice: boolean}>} The native
   *   probability and whether the native flag equals 1.
   * @throws {Error} If the instance is destroyed, the frame length is wrong,
   *   scratch memory cannot be allocated, or the native call fails.
   */
  async processFrame(audioData) {
    if (this.isDestroyed) {
      throw new Error('VAD instance has been destroyed');
    }
    if (audioData.length !== this.hopSize) {
      throw new Error(`Expected ${this.hopSize} samples, got ${audioData.length}`);
    }
    const wasm = this.module;
    let audioPtr = 0;
    let probPtr = 0;
    let flagPtr = 0;
    try {
      // Scratch buffers: 2 bytes per sample, plus one 4-byte slot per output.
      audioPtr = wasm._malloc(this.hopSize * 2);
      probPtr = wasm._malloc(4);
      flagPtr = wasm._malloc(4);
      if (!audioPtr || !probPtr || !flagPtr) {
        throw new Error('Failed to allocate WASM memory for frame processing');
      }
      // HEAP16 is indexed in 16-bit elements, hence the byte offset / 2.
      wasm.HEAP16.set(audioData, audioPtr / 2);
      const result = wasm._ten_vad_process(this.vadHandle, audioPtr, this.hopSize, probPtr, flagPtr);
      if (result !== 0) {
        throw new Error(`Frame processing failed with code: ${result}`);
      }
      const probability = wasm.getValue(probPtr, 'float');
      const isVoice = wasm.getValue(flagPtr, 'i32') === 1;
      return { probability, isVoice };
    }
    finally {
      // Release whatever was actually allocated, on success and failure alike.
      if (audioPtr) {
        wasm._free(audioPtr);
      }
      if (probPtr) {
        wasm._free(probPtr);
      }
      if (flagPtr) {
        wasm._free(flagPtr);
      }
    }
  }

  /**
   * Discards the native instance and creates a fresh one with the same
   * settings.
   *
   * If recreation fails the instance stays destroyed, so later calls fail
   * with a clear error instead of using a stale handle.
   *
   * @throws {Error} If already destroyed or the native instance cannot be
   *   recreated.
   */
  reset() {
    if (this.isDestroyed) {
      throw new Error('VAD instance has been destroyed');
    }
    this.destroy();
    this.createNativeHandle('recreation');
    this.isDestroyed = false;
  }

  /**
   * Releases the native instance and its handle slot. Safe to call repeatedly.
   */
  destroy() {
    if (this.isDestroyed) {
      return;
    }
    if (this.vadHandlePtr) {
      // The native destroy call receives the slot address, not the handle value.
      this.module._ten_vad_destroy(this.vadHandlePtr);
      this.module._free(this.vadHandlePtr);
      this.vadHandlePtr = 0;
    }
    this.vadHandle = 0;
    this.isDestroyed = true;
  }

  /**
   * Reads the version string reported by the native library.
   *
   * @returns {string} The version, 'Unknown Version' when the native call
   *   returns a null pointer, or 'Error Getting Version' when it throws (the
   *   error is logged, not rethrown).
   */
  getVersion() {
    try {
      const versionPtr = this.module._ten_vad_get_version();
      if (versionPtr) {
        return this.module.UTF8ToString(versionPtr);
      }
      return 'Unknown Version';
    }
    catch (error) {
      console.error('Failed to get VAD version:', error);
      return 'Error Getting Version';
    }
  }
}
/**
 * Option values used when the caller does not override them.
 *
 * - hopSize: samples per frame handed to the detector.
 * - voiceThreshold: threshold forwarded to the native VAD on creation.
 * - wasmPath / jsPath: locations of the WASM binary and its JavaScript glue.
 * - minSpeechDuration: segments shorter than this many milliseconds are dropped.
 * - maxSilenceDuration: milliseconds; declared here but not read by any code
 *   in this file.
 */
const defaultTenVADOptions = {
  hopSize: 256,
  voiceThreshold: 0.5,
  wasmPath: '/wasm/ten_vad.wasm',
  jsPath: '/wasm/ten_vad.js',
  minSpeechDuration: 100,
  maxSilenceDuration: 500,
};
/**
 * Offline voice activity detector: takes a complete recording, converts it
 * to 16-bit PCM at 16 kHz, runs the native VAD frame by frame and reports
 * the stretches of consecutive voiced frames as speech segments.
 *
 * A segment ends at the first unvoiced frame. The `maxSilenceDuration`
 * option is accepted but not consulted by this class.
 */
class NonRealTimeTenVAD {
  /**
   * Sample rate, in Hz, of the audio handed to the detector after
   * preprocessing. All frame timestamps are derived from this rate.
   *
   * @returns {number} 16000.
   */
  static get TARGET_SAMPLE_RATE() {
    return 16000;
  }

  /**
   * @param {object} [options] Overrides merged over `defaultTenVADOptions`.
   */
  constructor(options = {}) {
    this.options = { ...defaultTenVADOptions, ...options };
    this.moduleLoader = VADModuleLoader.getInstance();
  }

  /**
   * Creates a detector and waits until the WASM module is loaded.
   *
   * @param {object} [options] Same options as the constructor.
   * @returns {Promise<NonRealTimeTenVAD>} A ready-to-use detector.
   */
  static async new(options = {}) {
    const vad = new NonRealTimeTenVAD(options);
    await vad.moduleLoader.loadModule({
      wasmPath: vad.options.wasmPath,
      jsPath: vad.options.jsPath,
    });
    return vad;
  }

  /**
   * Streams the speech segments found in a recording.
   *
   * Timestamps are in milliseconds of the original recording. `start` is the
   * beginning of the first voiced frame and `end` is the end of the last
   * voiced frame, so `end - start` matches the duration of `audio`. Segments
   * shorter than `minSpeechDuration` are skipped. Trailing samples that do
   * not fill a whole frame are ignored.
   *
   * @param {Float32Array} inputAudio Mono samples; values are clamped to [-1, 1].
   * @param {number} sampleRate Sample rate of `inputAudio` in Hz.
   * @yields {{audio: Float32Array, start: number, end: number, probability: number}}
   *   Segment audio at 16 kHz, its time span, and the mean frame probability.
   * @throws {RangeError} If `sampleRate` is not a positive finite number.
   */
  async *run(inputAudio, sampleRate) {
    const module = await this.moduleLoader.loadModule({
      wasmPath: this.options.wasmPath,
      jsPath: this.options.jsPath,
    });
    const frameSize = this.options.hopSize;
    const vadInstance = new VADInstance(module, frameSize, this.options.voiceThreshold);
    try {
      const processedAudio = this.preprocessAudio(inputAudio, sampleRate);
      const frameCount = Math.floor(processedAudio.length / frameSize);
      // Index of the first frame of the open segment, or -1 when none is open.
      let segmentStartFrame = -1;
      let probabilities = [];
      for (let i = 0; i < frameCount; i++) {
        const frameData = processedAudio.subarray(i * frameSize, (i + 1) * frameSize);
        const result = await vadInstance.processFrame(frameData);
        if (result.isVoice) {
          if (segmentStartFrame < 0) {
            segmentStartFrame = i;
            probabilities = [];
          }
          probabilities.push(result.probability);
          continue;
        }
        if (segmentStartFrame >= 0) {
          const segment = this.buildSegment(processedAudio, segmentStartFrame, i, probabilities);
          segmentStartFrame = -1;
          if (segment) {
            yield segment;
          }
        }
      }
      // A segment still open at the end of the recording runs to the last frame.
      if (segmentStartFrame >= 0) {
        const segment = this.buildSegment(processedAudio, segmentStartFrame, frameCount, probabilities);
        if (segment) {
          yield segment;
        }
      }
    }
    finally {
      // Also runs when the consumer stops iterating early.
      vadInstance.destroy();
    }
  }

  /**
   * Converts a frame index into milliseconds on the 16 kHz timeline.
   *
   * @param {number} frameIndex Zero-based frame index.
   * @returns {number} Time at which that frame begins, in milliseconds.
   */
  frameToMs(frameIndex) {
    // Multiply before dividing so whole-millisecond results stay exact.
    return (frameIndex * this.options.hopSize * 1000) / NonRealTimeTenVAD.TARGET_SAMPLE_RATE;
  }

  /**
   * Builds the segment covering frames [startFrame, endFrame).
   *
   * The voiced frames of a segment are contiguous, so its audio is copied out
   * of the PCM buffer once instead of being grown frame by frame.
   *
   * @param {Int16Array} samples Preprocessed 16 kHz PCM.
   * @param {number} startFrame First voiced frame (inclusive).
   * @param {number} endFrame Frame after the last voiced one (exclusive).
   * @param {number[]} probabilities Per-frame probabilities of the segment.
   * @returns {{audio: Float32Array, start: number, end: number, probability: number}|null}
   *   The segment, or null when it is shorter than `minSpeechDuration`.
   */
  buildSegment(samples, startFrame, endFrame, probabilities) {
    const start = this.frameToMs(startFrame);
    const end = this.frameToMs(endFrame);
    if (end - start < this.options.minSpeechDuration) {
      return null;
    }
    const frameSize = this.options.hopSize;
    const pcm = samples.subarray(startFrame * frameSize, endFrame * frameSize);
    const audio = new Float32Array(pcm.length);
    for (let i = 0; i < pcm.length; i++) {
      audio[i] = pcm[i] / 32768; // 16-bit PCM back to float
    }
    const probability = probabilities.reduce((sum, p) => sum + p, 0) / probabilities.length;
    return { audio, start, end, probability };
  }

  /**
   * Processes a whole recording and returns its segments with statistics.
   *
   * `totalFrames` counts the frames actually analysed (after resampling) and
   * `voiceFrames` the frames inside the returned segments. Ratios are 0 when
   * there is nothing to divide by.
   *
   * @param {Float32Array} inputAudio Mono samples; values are clamped to [-1, 1].
   * @param {number} sampleRate Sample rate of `inputAudio` in Hz.
   * @returns {Promise<{speechSegments: object[], statistics: {totalFrames: number,
   *   voiceFrames: number, voicePercentage: number, processingTime: number,
   *   realTimeFactor: number}}>} Segments plus timing and coverage figures;
   *   `processingTime` is in milliseconds.
   * @throws {RangeError} If `sampleRate` is not a positive finite number.
   */
  async process(inputAudio, sampleRate) {
    const startTime = Date.now();
    const speechSegments = [];
    for await (const segment of this.run(inputAudio, sampleRate)) {
      speechSegments.push(segment);
    }
    const processingTime = Date.now() - startTime;
    const totalAudioTime = (inputAudio.length / sampleRate) * 1000;
    const realTimeFactor = totalAudioTime > 0 ? processingTime / totalAudioTime : 0;
    const frameSize = this.options.hopSize;
    const analysedSamples = this.resampledLength(inputAudio.length, sampleRate, NonRealTimeTenVAD.TARGET_SAMPLE_RATE);
    const totalFrames = Math.floor(analysedSamples / frameSize);
    // Segment audio always spans whole frames, so this count is exact.
    const voiceFrames = speechSegments.reduce((sum, segment) => sum + Math.round(segment.audio.length / frameSize), 0);
    const voicePercentage = totalFrames > 0 ? (voiceFrames / totalFrames) * 100 : 0;
    return {
      speechSegments,
      statistics: {
        totalFrames,
        voiceFrames,
        voicePercentage,
        processingTime,
        realTimeFactor,
      },
    };
  }

  /**
   * Converts float samples to 16-bit PCM and brings them to 16 kHz.
   *
   * @param {Float32Array} audio Samples; values outside [-1, 1] are clamped.
   * @param {number} sampleRate Sample rate of `audio` in Hz.
   * @returns {Int16Array} PCM samples at 16 kHz.
   * @throws {RangeError} If `sampleRate` is not a positive finite number.
   */
  preprocessAudio(audio, sampleRate) {
    if (!Number.isFinite(sampleRate) || sampleRate <= 0) {
      throw new RangeError(`Invalid sample rate: ${sampleRate}`);
    }
    const int16Audio = new Int16Array(audio.length);
    for (let i = 0; i < audio.length; i++) {
      const clamped = Math.max(-1, Math.min(1, audio[i]));
      int16Audio[i] = Math.round(clamped * 32767);
    }
    if (sampleRate !== NonRealTimeTenVAD.TARGET_SAMPLE_RATE) {
      return this.resample(int16Audio, sampleRate, NonRealTimeTenVAD.TARGET_SAMPLE_RATE);
    }
    return int16Audio;
  }

  /**
   * Number of samples a buffer has after resampling.
   *
   * @param {number} length Sample count before resampling.
   * @param {number} fromSampleRate Source rate in Hz.
   * @param {number} toSampleRate Destination rate in Hz.
   * @returns {number} Sample count after resampling.
   */
  resampledLength(length, fromSampleRate, toSampleRate) {
    if (fromSampleRate === toSampleRate) {
      return length;
    }
    return Math.round(length / (fromSampleRate / toSampleRate));
  }

  /**
   * Resamples by picking, for each output sample, the input sample at the
   * floored source position. No filtering or interpolation is applied.
   *
   * @param {Int16Array} audio Source samples.
   * @param {number} fromSampleRate Source rate in Hz.
   * @param {number} toSampleRate Destination rate in Hz.
   * @returns {Int16Array} The input itself when the rates match, otherwise a
   *   new buffer at the destination rate.
   */
  resample(audio, fromSampleRate, toSampleRate) {
    if (fromSampleRate === toSampleRate) {
      return audio;
    }
    const ratio = fromSampleRate / toSampleRate;
    const resampled = new Int16Array(this.resampledLength(audio.length, fromSampleRate, toSampleRate));
    for (let i = 0; i < resampled.length; i++) {
      const srcIndex = Math.floor(i * ratio);
      // Rounding of the output length can point one past the end; leave 0 there.
      if (srcIndex < audio.length) {
        resampled[i] = audio[srcIndex];
      }
    }
    return resampled;
  }
}
export { NonRealTimeTenVAD, NonRealTimeTenVAD as TenVAD, VADInstance, VADModuleLoader, defaultTenVADOptions };
//# sourceMappingURL=index.esm.js.map