UNPKG

ten-vad-lib

Version:

A JavaScript library for Ten VAD (Voice Activity Detection) based on WebAssembly

376 lines (372 loc) 14.7 kB
/**
 * Sample rate (Hz) of the audio that is handed to the VAD.
 * `NonRealTimeTenVAD.preprocessAudio` resamples every input to this rate, so
 * all frame indices and timestamps are expressed relative to it.
 */
const VAD_SAMPLE_RATE = 16000;

/**
 * Loads the Ten VAD glue script and WASM binary once and caches the resulting
 * module. Use `VADModuleLoader.getInstance()` to share one loader.
 */
class VADModuleLoader {
  constructor() {
    this.module = null;
    this.loadingPromise = null;
  }

  /**
   * Returns the shared loader, creating it on first use.
   * @returns {VADModuleLoader}
   */
  static getInstance() {
    if (!VADModuleLoader.instance) {
      VADModuleLoader.instance = new VADModuleLoader();
    }
    return VADModuleLoader.instance;
  }

  /**
   * Returns the cached module, joins a load that is already in flight, or
   * starts a new load.
   * @param {{wasmPath?: string, jsPath?: string}} [options]
   * @returns {Promise<object>} The initialised module.
   */
  async loadModule(options = {}) {
    if (this.module) {
      return this.module;
    }
    if (this.loadingPromise) {
      return this.loadingPromise;
    }
    const pending = this.loadModuleInternal(options);
    this.loadingPromise = pending;
    // If the load fails before its first `await`, loadModuleInternal's own
    // cleanup runs before the assignment above and would be overwritten by it.
    // Clearing here as well guarantees a failed load can always be retried.
    pending.catch(() => {
      if (this.loadingPromise === pending) {
        this.loadingPromise = null;
      }
    });
    return pending;
  }

  /**
   * Performs the actual load: injects the glue script if `createVADModule` is
   * missing, fetches the WASM binary and instantiates the module.
   * @param {{wasmPath?: string, jsPath?: string}} [options]
   * @returns {Promise<object>} The initialised module.
   * @throws {Error} When not in a browser, when `createVADModule` never
   *   appears, or when the WASM file cannot be fetched.
   */
  async loadModuleInternal(options = {}) {
    try {
      if (typeof window === 'undefined') {
        throw new Error('VADModuleLoader requires a browser environment (window is not defined).');
      }
      const wasmPath = options.wasmPath || this.getDefaultWasmPath();
      const jsPath = options.jsPath || this.getDefaultJsPath();
      if (!window.createVADModule) {
        await this.loadScript(jsPath);
      }
      // Poll for the factory: up to 100 attempts, 100 ms apart.
      const maxAttempts = 100;
      let attempts = 0;
      while (!window.createVADModule && attempts < maxAttempts) {
        await new Promise((resolve) => setTimeout(resolve, 100));
        attempts++;
      }
      if (!window.createVADModule) {
        throw new Error('createVADModule not found. Please ensure ten_vad.js is loaded correctly.');
      }
      const wasmResponse = await fetch(wasmPath);
      if (!wasmResponse.ok) {
        throw new Error(`Failed to load WASM file from ${wasmPath}`);
      }
      const wasmBinary = await wasmResponse.arrayBuffer();
      const module = await window.createVADModule({
        wasmBinary,
        locateFile: (filePath) => (filePath.endsWith('.wasm') ? wasmPath : filePath),
        noInitialRun: false,
        noExitRuntime: true,
      });
      this.addHelperFunctions(module);
      this.module = module;
      return module;
    } catch (error) {
      this.loadingPromise = null;
      throw error;
    }
  }

  /**
   * Builds the default URL of an asset under `wasm/`, next to the script tag
   * that loaded this library; falls back to `/wasm/<fileName>` when no such
   * script (or no DOM) is available.
   * @param {string} fileName
   * @returns {string}
   */
  resolveDefaultAssetPath(fileName) {
    const fallback = `/wasm/${fileName}`;
    if (typeof window === 'undefined' || typeof document === 'undefined') {
      return fallback;
    }
    const script = document.currentScript || document.querySelector('script[src*="ten-vad"]');
    const scriptSrc = script ? script.getAttribute('src') : null;
    if (!scriptSrc) {
      return fallback;
    }
    const baseUrl = scriptSrc.substring(0, scriptSrc.lastIndexOf('/'));
    return `${baseUrl}/wasm/${fileName}`;
  }

  /** @returns {string} Default location of `ten_vad.wasm`. */
  getDefaultWasmPath() {
    return this.resolveDefaultAssetPath('ten_vad.wasm');
  }

  /** @returns {string} Default location of `ten_vad.js`. */
  getDefaultJsPath() {
    return this.resolveDefaultAssetPath('ten_vad.js');
  }

  /**
   * Appends a `<script>` element to `document.head` and settles when it loads
   * or errors.
   * @param {string} src
   * @returns {Promise<void>}
   */
  async loadScript(src) {
    return new Promise((resolve, reject) => {
      const script = document.createElement('script');
      script.src = src;
      script.onload = () => resolve();
      script.onerror = () => reject(new Error(`Failed to load script: ${src}`));
      document.head.appendChild(script);
    });
  }

  /**
   * Installs minimal `getValue` and `UTF8ToString` fallbacks on the module
   * when it does not already provide them.
   * @param {object} module
   */
  addHelperFunctions(module) {
    if (!module.getValue) {
      module.getValue = function (ptr, type) {
        switch (type) {
          case 'i32':
            return module.HEAP32[ptr >> 2];
          case 'float':
            return module.HEAPF32[ptr >> 2];
          default:
            throw new Error(`Unsupported type: ${type}`);
        }
      };
    }
    if (!module.UTF8ToString) {
      // Fallback decodes one byte per character, so only single-byte
      // characters round-trip correctly.
      module.UTF8ToString = function (ptr) {
        if (!ptr) {
          return '';
        }
        let result = '';
        let i = ptr;
        while (module.HEAPU8[i]) {
          result += String.fromCharCode(module.HEAPU8[i++]);
        }
        return result;
      };
    }
  }

  /** @returns {object|null} The loaded module, or null if not loaded yet. */
  getModule() {
    return this.module;
  }

  /** Forgets the cached module and any in-flight load. */
  reset() {
    this.module = null;
    this.loadingPromise = null;
  }
}

/**
 * Wraps one native VAD handle created through `_ten_vad_create`.
 */
class VADInstance {
  /**
   * @param {object} module Loaded VAD module (needs `_malloc`, `_free`,
   *   `_ten_vad_*`, `getValue` and the HEAP views).
   * @param {number} hopSize Samples per frame.
   * @param {number} voiceThreshold Threshold passed to `_ten_vad_create`.
   * @throws {Error} When `_ten_vad_create` returns a non-zero code.
   */
  constructor(module, hopSize, voiceThreshold) {
    this.isDestroyed = false;
    this.module = module;
    this.hopSize = hopSize;
    this.voiceThreshold = voiceThreshold;
    this.vadHandlePtr = 0;
    this.vadHandle = 0;
    this.createNativeHandle('VAD creation failed');
  }

  /**
   * Allocates the 4-byte handle slot and creates the native VAD. On failure
   * the slot is freed and the instance is marked destroyed, so nothing leaks
   * and no stale handle is left behind.
   * @param {string} failurePrefix Leading text of the error message.
   * @throws {Error} `<failurePrefix> with code: <n>` on a non-zero result.
   */
  createNativeHandle(failurePrefix) {
    const handlePtr = this.module._malloc(4);
    const createResult = this.module._ten_vad_create(handlePtr, this.hopSize, this.voiceThreshold);
    if (createResult !== 0) {
      this.module._free(handlePtr);
      this.vadHandlePtr = 0;
      this.vadHandle = 0;
      this.isDestroyed = true;
      throw new Error(`${failurePrefix} with code: ${createResult}`);
    }
    this.vadHandlePtr = handlePtr;
    this.vadHandle = this.module.getValue(handlePtr, 'i32');
    this.isDestroyed = false;
  }

  /**
   * Runs the VAD on one frame.
   * @param {Int16Array} audioData Exactly `hopSize` samples; copied into the
   *   module's HEAP16 view.
   * @returns {Promise<{probability: number, isVoice: boolean}>}
   * @throws {Error} When destroyed, when the frame length is wrong, or when
   *   `_ten_vad_process` returns a non-zero code.
   */
  async processFrame(audioData) {
    if (this.isDestroyed) {
      throw new Error('VAD instance has been destroyed');
    }
    if (audioData.length !== this.hopSize) {
      throw new Error(`Expected ${this.hopSize} samples, got ${audioData.length}`);
    }
    const module = this.module;
    // Scratch buffers: 2 bytes per sample, plus 4-byte probability and flag slots.
    const audioPtr = module._malloc(this.hopSize * 2);
    const probPtr = module._malloc(4);
    const flagPtr = module._malloc(4);
    try {
      module.HEAP16.set(audioData, audioPtr / 2);
      const result = module._ten_vad_process(this.vadHandle, audioPtr, this.hopSize, probPtr, flagPtr);
      if (result !== 0) {
        throw new Error(`Frame processing failed with code: ${result}`);
      }
      const probability = module.getValue(probPtr, 'float');
      const flag = module.getValue(flagPtr, 'i32');
      return { probability, isVoice: flag === 1 };
    } finally {
      module._free(audioPtr);
      module._free(probPtr);
      module._free(flagPtr);
    }
  }

  /**
   * Destroys and recreates the native VAD with the same settings. If the
   * recreation fails the instance stays destroyed.
   * @throws {Error} When already destroyed or when recreation fails.
   */
  reset() {
    if (this.isDestroyed) {
      throw new Error('VAD instance has been destroyed');
    }
    this.destroy();
    this.createNativeHandle('VAD recreation failed');
  }

  /** Releases the native VAD and its handle slot. Safe to call repeatedly. */
  destroy() {
    if (this.isDestroyed) {
      return;
    }
    if (this.vadHandlePtr) {
      this.module._ten_vad_destroy(this.vadHandlePtr);
      this.module._free(this.vadHandlePtr);
      this.vadHandlePtr = 0;
    }
    this.isDestroyed = true;
  }

  /**
   * @returns {string} The string returned by `_ten_vad_get_version`,
   *   `'Unknown Version'` for a null pointer, or `'Error Getting Version'`
   *   if the call throws (the error is logged).
   */
  getVersion() {
    try {
      const versionPtr = this.module._ten_vad_get_version();
      if (versionPtr) {
        return this.module.UTF8ToString(versionPtr);
      }
      return 'Unknown Version';
    } catch (error) {
      console.error('Failed to get VAD version:', error);
      return 'Error Getting Version';
    }
  }
}

/**
 * Defaults merged under user options by `NonRealTimeTenVAD`.
 * NOTE(review): `maxSilenceDuration` is not read anywhere in this file.
 */
const defaultTenVADOptions = {
  hopSize: 256,
  voiceThreshold: 0.5,
  wasmPath: '/wasm/ten_vad.wasm',
  jsPath: '/wasm/ten_vad.js',
  minSpeechDuration: 100, // ms
  maxSilenceDuration: 500, // ms
};

/**
 * Offline (whole-buffer) voice activity detection.
 */
class NonRealTimeTenVAD {
  /** @param {Partial<typeof defaultTenVADOptions>} [options] */
  constructor(options = {}) {
    this.options = { ...defaultTenVADOptions, ...options };
    this.moduleLoader = VADModuleLoader.getInstance();
  }

  /**
   * Creates a detector and waits for the module to load.
   * @param {Partial<typeof defaultTenVADOptions>} [options]
   * @returns {Promise<NonRealTimeTenVAD>}
   */
  static async new(options = {}) {
    const vad = new NonRealTimeTenVAD(options);
    await vad.moduleLoader.loadModule({
      wasmPath: vad.options.wasmPath,
      jsPath: vad.options.jsPath,
    });
    return vad;
  }

  /**
   * Yields one speech segment per run of consecutive voiced frames whose
   * duration reaches `minSpeechDuration`.
   *
   * Times are in milliseconds of the input audio. They are derived from the
   * 16 kHz frame position, because frames are cut after resampling. `end` is
   * the start time of the last voiced frame.
   *
   * @param {Float32Array} inputAudio Samples; clamped to [-1, 1].
   * @param {number} sampleRate Sample rate of `inputAudio` in Hz.
   * @yields {{audio: Float32Array, start: number, end: number, probability: number}}
   */
  async *run(inputAudio, sampleRate) {
    const module = await this.moduleLoader.loadModule({
      wasmPath: this.options.wasmPath,
      jsPath: this.options.jsPath,
    });
    const vadInstance = new VADInstance(module, this.options.hopSize, this.options.voiceThreshold);
    try {
      const processedAudio = this.preprocessAudio(inputAudio, sampleRate);
      const frameSize = this.options.hopSize;
      const frameCount = Math.floor(processedAudio.length / frameSize);
      let currentSegment = null;
      for (let i = 0; i < frameCount; i++) {
        const frameStart = i * frameSize;
        const frameData = processedAudio.subarray(frameStart, frameStart + frameSize);
        const result = await vadInstance.processFrame(frameData);
        // Multiply before dividing so common frame sizes give exact values.
        const frameTime = (frameStart * 1000) / VAD_SAMPLE_RATE;
        if (result.isVoice) {
          if (!currentSegment) {
            currentSegment = {
              start: frameTime,
              end: frameTime,
              sampleStart: frameStart,
              sampleEnd: frameStart,
              probabilities: [],
            };
          }
          currentSegment.end = frameTime;
          currentSegment.sampleEnd = frameStart + frameSize;
          currentSegment.probabilities.push(result.probability);
        } else if (currentSegment) {
          const finished = this.buildSpeechSegment(currentSegment, processedAudio);
          currentSegment = null;
          if (finished) {
            yield finished;
          }
        }
      }
      if (currentSegment) {
        const finished = this.buildSpeechSegment(currentSegment, processedAudio);
        if (finished) {
          yield finished;
        }
      }
    } finally {
      vadInstance.destroy();
    }
  }

  /**
   * Turns an accumulated run of voiced frames into a public segment, or
   * returns null when the run is shorter than `minSpeechDuration`.
   * Voiced frames of one run are contiguous, so the audio is copied once.
   * @param {{start: number, end: number, sampleStart: number, sampleEnd: number, probabilities: number[]}} segment
   * @param {Int16Array} pcm The preprocessed 16-bit audio.
   * @returns {{audio: Float32Array, start: number, end: number, probability: number}|null}
   */
  buildSpeechSegment(segment, pcm) {
    if (segment.end - segment.start < this.options.minSpeechDuration) {
      return null;
    }
    const audio = new Float32Array(segment.sampleEnd - segment.sampleStart);
    for (let k = 0; k < audio.length; k++) {
      audio[k] = pcm[segment.sampleStart + k] / 32768;
    }
    const probabilitySum = segment.probabilities.reduce((sum, p) => sum + p, 0);
    return {
      audio,
      start: segment.start,
      end: segment.end,
      probability: probabilitySum / segment.probabilities.length,
    };
  }

  /**
   * Collects all segments from `run` and computes statistics.
   * `totalFrames` counts frames of the resampled audio; `voiceFrames` counts
   * the frames contained in the returned segments. Ratios are 0 for empty input.
   * @param {Float32Array} inputAudio
   * @param {number} sampleRate
   * @returns {Promise<{speechSegments: object[], statistics: {totalFrames: number, voiceFrames: number, voicePercentage: number, processingTime: number, realTimeFactor: number}}>}
   */
  async process(inputAudio, sampleRate) {
    const startTime = Date.now();
    const speechSegments = [];
    for await (const segment of this.run(inputAudio, sampleRate)) {
      speechSegments.push(segment);
    }
    const processingTime = Date.now() - startTime;
    const totalAudioTime = (inputAudio.length / sampleRate) * 1000;
    const realTimeFactor = totalAudioTime > 0 ? processingTime / totalAudioTime : 0;

    const hopSize = this.options.hopSize;
    // Same length formula as resample(), so the count matches what run() saw.
    const processedLength = sampleRate === VAD_SAMPLE_RATE
      ? inputAudio.length
      : Math.round(inputAudio.length / (sampleRate / VAD_SAMPLE_RATE));
    const totalFrames = Math.floor(processedLength / hopSize);
    const voiceFrames = speechSegments.reduce(
      (sum, segment) => sum + Math.round(segment.audio.length / hopSize),
      0,
    );
    const voicePercentage = totalFrames > 0 ? (voiceFrames / totalFrames) * 100 : 0;
    return {
      speechSegments,
      statistics: {
        totalFrames,
        voiceFrames,
        voicePercentage,
        processingTime,
        realTimeFactor,
      },
    };
  }

  /**
   * Clamps samples to [-1, 1], scales them to 16-bit integers and resamples
   * to 16 kHz when `sampleRate` differs.
   * @param {Float32Array} audio
   * @param {number} sampleRate
   * @returns {Int16Array}
   */
  preprocessAudio(audio, sampleRate) {
    const int16Audio = new Int16Array(audio.length);
    for (let i = 0; i < audio.length; i++) {
      const clamped = Math.max(-1, Math.min(1, audio[i]));
      int16Audio[i] = Math.round(clamped * 32767);
    }
    if (sampleRate !== VAD_SAMPLE_RATE) {
      return this.resample(int16Audio, sampleRate, VAD_SAMPLE_RATE);
    }
    return int16Audio;
  }

  /**
   * Resamples by picking the source sample at `floor(i * ratio)`; no
   * filtering or interpolation is applied.
   * @param {Int16Array} audio
   * @param {number} fromSampleRate
   * @param {number} toSampleRate
   * @returns {Int16Array} `audio` itself when the rates are equal.
   */
  resample(audio, fromSampleRate, toSampleRate) {
    if (fromSampleRate === toSampleRate) {
      return audio;
    }
    const ratio = fromSampleRate / toSampleRate;
    const newLength = Math.round(audio.length / ratio);
    const resampled = new Int16Array(newLength);
    for (let i = 0; i < newLength; i++) {
      const srcIndex = Math.floor(i * ratio);
      if (srcIndex < audio.length) {
        resampled[i] = audio[srcIndex];
      }
    }
    return resampled;
  }
}

export { NonRealTimeTenVAD, NonRealTimeTenVAD as TenVAD, VADInstance, VADModuleLoader, defaultTenVADOptions };
//# sourceMappingURL=index.esm.js.map