UNPKG

echogarden

Version:

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

68 lines 3.38 kB
import { float32ToInt16Pcm } from '../audio/AudioBufferConversion.js';
import { concatFloat32Arrays } from '../utilities/Utilities.js';
import { wrapEmscriptenModuleHeap } from 'wasm-heap-manager';
import { Logger } from '../utilities/Logger.js';
import { cloneRawAudio } from '../audio/AudioUtilities.js';

// Promise for the shared RNNoise WASM module. Caching the promise (rather than
// the resolved module) lets concurrent callers share a single initialization.
let rnnoiseInstancePromise;

/**
 * Denoises mono, 48000 Hz audio using the RNNoise WASM module.
 *
 * The input samples are converted to 16-bit PCM values (kept as floats) before
 * being handed to RNNoise frame by frame, and the processed output is divided
 * by 32768 before being returned. The output is trimmed to the same number of
 * samples as the input.
 *
 * @param {{ audioChannels: Float32Array[], sampleRate: number }} rawAudio
 *     Audio to denoise. Must have exactly one channel and a sample rate of 48000.
 * @returns {Promise<{
 *     denoisedRawAudio: { audioChannels: Float32Array[], sampleRate: number },
 *     frameVadProbabilities: number[]
 * }>} The denoised audio, and the value returned by `_rnnoise_process_frame`
 *     for each emitted output frame. For empty input, a clone of the input and
 *     an empty probability list are returned.
 * @throws {Error} If the sample rate is not 48000 or the channel count is not 1.
 */
export async function denoiseAudio(rawAudio) {
    const logger = new Logger();

    if (rawAudio.sampleRate !== 48000) {
        throw new Error(`RNNoise requires a 48000 Hz sample rate (${rawAudio.sampleRate} Hz given)`);
    }

    if (rawAudio.audioChannels.length !== 1) {
        throw new Error('RNNoise requires a channel count of 1');
    }

    if (rawAudio.audioChannels[0].length === 0) {
        return { denoisedRawAudio: cloneRawAudio(rawAudio), frameVadProbabilities: [] };
    }

    logger.start('Get RNNoise WASM instance');
    const m = await getRnnoiseInstance();

    logger.start('Process with RNNoise');
    const wasmHeap = wrapEmscriptenModuleHeap(m);
    const frameSize = m._rnnoise_get_frame_size();

    const floatSamples = rawAudio.audioChannels[0];
    const int16Samples = float32ToInt16Pcm(floatSamples);
    const int16SamplesAsFloats = new Float32Array(int16Samples);

    const processedFrames = [];
    const frameVadProbabilities = [];

    function outputNewFrame(newFrame, vadProbability) {
        processedFrames.push(newFrame);
        frameVadProbabilities.push(vadProbability);
    }

    // NOTE(review): 0 is presumably a null model pointer selecting the built-in model — confirm.
    const denoiseState = m._rnnoise_create(0);

    try {
        const inputRef = wasmHeap.allocFloat32Array(frameSize);
        const outputRef = wasmHeap.allocFloat32Array(frameSize);

        for (let readOffset = 0; readOffset < int16Samples.length; readOffset += frameSize) {
            let frame = int16SamplesAsFloats.subarray(readOffset, readOffset + frameSize);

            // Zero-pad the final frame when the input is not a multiple of the frame size
            if (frame.length < frameSize) {
                frame = concatFloat32Arrays([frame, new Float32Array(frameSize - frame.length)]);
            }

            inputRef.view.set(frame);

            const vadProbability = m._rnnoise_process_frame(denoiseState, outputRef.address, inputRef.address);

            // Latency compensation: don't write an output frame for the first read frame
            if (readOffset > 0) {
                // Copy the output out of the WASM heap, since the buffer is reused for the next frame
                outputNewFrame(outputRef.view.slice(), vadProbability);
            }
        }

        // Latency compensation: process an empty input frame for the last output frame
        inputRef.view.set(new Float32Array(frameSize));

        const lastFrameVadProbability = m._rnnoise_process_frame(denoiseState, outputRef.address, inputRef.address);

        outputNewFrame(outputRef.view.slice(), lastFrameVadProbability);
    } finally {
        // Always release the native state and heap allocations, even if processing throws
        m._rnnoise_destroy(denoiseState);
        wasmHeap.freeAll();
    }

    const int16DenoisedSamplesAsFloats = concatFloat32Arrays(processedFrames);

    // Scale back from the 16-bit PCM range, then drop the padding added to the final frame
    const denoisedSamples = int16DenoisedSamplesAsFloats
        .map((sample) => sample / 32768)
        .subarray(0, floatSamples.length);

    const denoisedRawAudio = { audioChannels: [denoisedSamples], sampleRate: 48000 };

    logger.end();

    return { denoisedRawAudio, frameVadProbabilities };
}

/**
 * Returns the shared RNNoise WASM module, initializing it on first use.
 *
 * The module is loaded lazily via a dynamic import of `@echogarden/rnnoise-wasm`.
 * Concurrent calls made while the first load is still in progress share the
 * same initialization instead of each creating their own module.
 *
 * @returns {Promise<object>} The initialized RNNoise WASM module.
 */
export async function getRnnoiseInstance() {
    if (!rnnoiseInstancePromise) {
        const pendingInstance = (async () => {
            const { default: initializer } = await import('@echogarden/rnnoise-wasm');

            return initializer();
        })();

        // Forget a failed load so a later call can retry; callers awaiting the
        // returned promise still observe the original rejection.
        pendingInstance.catch(() => {
            if (rnnoiseInstancePromise === pendingInstance) {
                rnnoiseInstancePromise = undefined;
            }
        });

        rnnoiseInstancePromise = pendingInstance;
    }

    return rnnoiseInstancePromise;
}
//# sourceMappingURL=RNNoise.js.map