echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
68 lines • 3.38 kB
JavaScript
import { float32ToInt16Pcm } from '../audio/AudioBufferConversion.js';
import { concatFloat32Arrays } from '../utilities/Utilities.js';
import { wrapEmscriptenModuleHeap } from 'wasm-heap-manager';
import { Logger } from '../utilities/Logger.js';
import { cloneRawAudio } from '../audio/AudioUtilities.js';
// Module-level cache for the initialized RNNoise WASM module.
// Stays undefined until getRnnoiseInstance() assigns it on first use.
let rnnoiseInstance;
/**
 * Denoises single-channel 48000 Hz audio with the RNNoise WASM module.
 *
 * Samples are converted to the 16-bit PCM value range before being handed to
 * RNNoise, processed frame by frame, and divided by 32768 afterwards. The
 * result is trimmed to the length of the input channel.
 *
 * @param {{ audioChannels: Float32Array[], sampleRate: number }} rawAudio
 *   Audio to denoise. Must have exactly one channel and a 48000 Hz sample rate.
 * @returns {Promise<{
 *   denoisedRawAudio: { audioChannels: Float32Array[], sampleRate: number },
 *   frameVadProbabilities: number[]
 * }>} The denoised audio and, for every emitted frame, the value returned by
 *   `_rnnoise_process_frame`.
 * @throws {Error} If the sample rate is not 48000 Hz or the channel count is not 1.
 */
export async function denoiseAudio(rawAudio) {
    const logger = new Logger();

    // Loose comparison is kept on purpose: inputs that compare equal to 48000
    // (e.g. a numeric string) were accepted before and must not start failing.
    if (rawAudio.sampleRate != 48000) {
        throw new Error(`RNNoise requires a 48000 Hz sample rate (${rawAudio.sampleRate} Hz given)`);
    }

    if (rawAudio.audioChannels.length !== 1) {
        throw new Error('RNNoise requires a channel count of 1');
    }

    if (rawAudio.audioChannels[0].length === 0) {
        // Nothing to process: return a copy without touching the WASM module.
        return { denoisedRawAudio: cloneRawAudio(rawAudio), frameVadProbabilities: [] };
    }

    logger.start('Get RNNoise WASM instance');
    const m = await getRnnoiseInstance();

    logger.start('Process with RNNoise');
    const wasmHeap = wrapEmscriptenModuleHeap(m);
    const frameSize = m._rnnoise_get_frame_size();

    // Prepare the input before acquiring any native resources, so a failure
    // here cannot leak them.
    const floatSamples = rawAudio.audioChannels[0];
    const int16Samples = float32ToInt16Pcm(floatSamples);
    const int16SamplesAsFloats = new Float32Array(int16Samples);

    const processedFrames = [];
    const frameVadProbabilities = [];

    const denoiseState = m._rnnoise_create(0);

    try {
        const inputRef = wasmHeap.allocFloat32Array(frameSize);
        const outputRef = wasmHeap.allocFloat32Array(frameSize);

        for (let readOffset = 0; readOffset < int16Samples.length; readOffset += frameSize) {
            let frame = int16SamplesAsFloats.subarray(readOffset, readOffset + frameSize);

            if (frame.length < frameSize) {
                // Zero-pad the final partial frame up to a full frame.
                frame = concatFloat32Arrays([frame, new Float32Array(frameSize - frame.length)]);
            }

            // `.view` is read on every access rather than cached in a local.
            inputRef.view.set(frame);

            const vadProbability = m._rnnoise_process_frame(denoiseState, outputRef.address, inputRef.address);

            // Latency compensation: don't write an output frame for the first read frame
            if (readOffset > 0) {
                processedFrames.push(outputRef.view.slice());
                frameVadProbabilities.push(vadProbability);
            }
        }

        // Latency compensation: process an empty input frame for the last output frame
        inputRef.view.set(new Float32Array(frameSize));

        const lastFrameVadProbability = m._rnnoise_process_frame(denoiseState, outputRef.address, inputRef.address);

        processedFrames.push(outputRef.view.slice());
        frameVadProbabilities.push(lastFrameVadProbability);
    } finally {
        // Always release the denoiser state and the heap allocations, even when
        // processing throws part-way through.
        m._rnnoise_destroy(denoiseState);
        wasmHeap.freeAll();
    }

    const int16DenoisedSamplesAsFloats = concatFloat32Arrays(processedFrames);

    // Scale back from the 16-bit PCM range and drop the padding of the last frame.
    const denoisedSamples = int16DenoisedSamplesAsFloats
        .map(sample => sample / 32768)
        .subarray(0, floatSamples.length);

    const denoisedRawAudio = { audioChannels: [denoisedSamples], sampleRate: 48000 };

    logger.end();

    return { denoisedRawAudio, frameVadProbabilities };
}
/**
 * Returns the RNNoise WASM module instance held in the module-level
 * `rnnoiseInstance` variable, creating it on first use.
 *
 * When no instance is cached yet, the '@echogarden/rnnoise-wasm' package is
 * loaded with a dynamic import and its default export is called to build the
 * instance, which is then stored for later calls.
 *
 * @returns {Promise<object>} The RNNoise WASM module instance.
 */
export async function getRnnoiseInstance() {
    // Fast path: an instance has already been created.
    if (rnnoiseInstance) {
        return rnnoiseInstance;
    }

    const rnnoiseWasmPackage = await import('@echogarden/rnnoise-wasm');
    const createInstance = rnnoiseWasmPackage.default;

    rnnoiseInstance = await createInstance();

    return rnnoiseInstance;
}
//# sourceMappingURL=RNNoise.js.map