// echogarden: an easy-to-use speech toolset, including tools for synthesis, recognition,
// alignment, speech translation, language detection, source separation and more.
//
// VoiceActivityDetection.js
import { extendDeep } from '../utilities/ObjectUtilities.js';
import { logToStderr } from '../utilities/Utilities.js';
import { cropToTimeline, ensureRawAudio, } from '../audio/AudioUtilities.js';
import { Logger } from '../utilities/Logger.js';
import { loadPackage } from '../utilities/PackageManager.js';
import chalk from 'chalk';
import { joinPath } from '../utilities/PathUtilities.js';
const log = logToStderr;
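
/**
 * Detects voice activity in the given audio input.
 *
 * The input is converted to 16kHz mono raw audio and passed to the selected VAD engine
 * ('webrtc', 'silero', 'rnnoise', 'whisper' or 'adaptive-gate'). Returns the timeline of
 * active segments, the full active/inactive timeline, the original raw audio, and a copy
 * of it cropped to the active segments.
 */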
export async function detectVoiceActivity(input, options) {
    const logger = new Logger();
    const startTimestamp = logger.getTimestamp();

    const inputRawAudio = await ensureRawAudio(input);

    logger.start(`Resample audio to 16kHz mono`);
    let sourceRawAudio = await ensureRawAudio(inputRawAudio, 16000, 1);

    // Merge the given options over the defaults before any of them are read
    options = extendDeep(defaultVADOptions, options);

    logger.start(`Detect voice activity with ${options.engine}`);

    const activityThreshold = options.activityThreshold;

    let verboseTimeline;
    switch (options.engine) {
        case 'webrtc': {
            const WebRtcVAD = await import('../voice-activity-detection/WebRtcVAD.js');

            const webrtcOptions = options.webrtc;

            const frameProbabilities = await WebRtcVAD.detectVoiceActivity(sourceRawAudio, webrtcOptions.frameDuration);

            // frameDuration is given in milliseconds
            const frameDurationSeconds = webrtcOptions.frameDuration / 1000;

            verboseTimeline = frameProbabilitiesToTimeline(frameProbabilities, frameDurationSeconds, activityThreshold);

            break;
        }

        case 'silero': {
            const SileroVAD = await import('../voice-activity-detection/SileroVAD.js');

            const sileroOptions = options.silero;

            const modelDir = await loadPackage('silero-vad');
            const modelPath = joinPath(modelDir, 'silero-vad.onnx');

            const frameDuration = sileroOptions.frameDuration;
            const onnxExecutionProviders = sileroOptions.provider ? [sileroOptions.provider] : [];

            const frameProbabilities = await SileroVAD.detectVoiceActivity(sourceRawAudio, modelPath, frameDuration, onnxExecutionProviders);

            // frameDuration is given in milliseconds
            const frameDurationSeconds = sileroOptions.frameDuration / 1000;

            verboseTimeline = frameProbabilitiesToTimeline(frameProbabilities, frameDurationSeconds, activityThreshold);

            break;
        }

        case 'rnnoise': {
            const RNNoise = await import('../denoising/RNNoise.js');

            // RNNoise operates on 48kHz audio
            const audio48k = await ensureRawAudio(sourceRawAudio, 48000, 1);

            const rnnoiseOptions = options.rnnoise;

            // Only the per-frame speech probabilities are used here; the denoised audio itself is discarded.
            // RNNoise processes audio in 10ms frames.
            const { denoisedRawAudio, frameVadProbabilities } = await RNNoise.denoiseAudio(audio48k);

            const frameDurationSeconds = 0.01;
            const frameProbabilities = frameVadProbabilities;

            verboseTimeline = frameProbabilitiesToTimeline(frameProbabilities, frameDurationSeconds, activityThreshold);

            break;
        }
        case 'whisper': {
            const WhisperSTT = await import('../recognition/WhisperSTT.js');

            const whisperVADOptions = options.whisper;

            logger.end();

            const { modelName, modelDir } = await WhisperSTT.loadPackagesAndGetPaths(whisperVADOptions.model, 'de');

            logger.end();

            const { partProbabilities } = await WhisperSTT.detectVoiceActivity(sourceRawAudio, modelName, modelDir, whisperVADOptions);

            // Group consecutive parts with the same activity state into segments
            verboseTimeline = [];

            for (const entry of partProbabilities) {
                const hasSpeech = entry.confidence >= activityThreshold;
                const text = hasSpeech ? 'active' : 'inactive';

                if (verboseTimeline.length === 0 || verboseTimeline[verboseTimeline.length - 1].text != text) {
                    verboseTimeline.push({
                        type: 'segment',
                        text,
                        startTime: entry.startTime,
                        endTime: entry.endTime
                    });
                }
                else {
                    verboseTimeline[verboseTimeline.length - 1].endTime = entry.endTime;
                }
            }

            break;
        }

        case 'adaptive-gate': {
            const AdaptiveGateVAD = await import('../voice-activity-detection/AdaptiveGateVAD.js');

            const adaptiveGateOptions = options.adaptiveGate;

            verboseTimeline = await AdaptiveGateVAD.detectVoiceActivity(sourceRawAudio, adaptiveGateOptions);

            break;
        }

        default: {
            throw new Error(`Engine '${options.engine}' is not supported`);
        }
    }
    // Keep only the segments detected as active speech
    const timeline = verboseTimeline.filter(entry => entry.text === 'active');

    // Produce a copy of the input audio cropped to the active segments
    const croppedRawAudio = cropToTimeline(inputRawAudio, timeline);

    logger.end();
    logger.log('');
    logger.logDuration(`Total voice activity detection time`, startTimestamp, chalk.magentaBright);

    return {
        timeline,
        verboseTimeline,
        inputRawAudio,
        croppedRawAudio
    };
}
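
// Converts a sequence of per-frame speech probabilities to a timeline of alternating
// 'active' and 'inactive' segments, based on the given activity threshold.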
function frameProbabilitiesToTimeline(frameProbabilities, frameDurationSeconds, activityThreshold) {
    const timeline = [];

    for (let i = 0; i < frameProbabilities.length; i++) {
        const frameProbability = frameProbabilities[i];

        const startTime = i * frameDurationSeconds;
        const endTime = (i + 1) * frameDurationSeconds;

        if (frameProbability >= activityThreshold) {
            // Start a new 'active' segment if there is no previous segment, or it was 'inactive'
            if (timeline.length == 0 || timeline[timeline.length - 1].text == 'inactive') {
                timeline.push({ type: 'segment', text: 'active', startTime, endTime });

                continue;
            }
        }
        else {
            // Start a new 'inactive' segment if there is no previous segment, or it was 'active'
            if (timeline.length == 0 || timeline[timeline.length - 1].text == 'active') {
                timeline.push({ type: 'segment', text: 'inactive', startTime, endTime });

                continue;
            }
        }

        // Otherwise, extend the current segment to cover this frame
        timeline[timeline.length - 1].endTime = endTime;
    }

    return timeline;
}
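
// Maps the times of a timeline that refers to cropped audio back to the time coordinates
// of the original, uncropped audio, using the uncrop timeline (the segments that were kept
// during cropping). The mapping is applied in place, recursively, to nested sub-timelines.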
export function convertCroppedToUncroppedTimeline(timeline, uncropTimeline) {
    if (timeline.length === 0) {
        return;
    }

    for (const entry of timeline) {
        const { mappedStartTime, mappedEndTime } = mapUsingUncropTimeline(entry.startTime, entry.endTime, uncropTimeline);

        const mapSubTimeline = (subTimeline) => {
            if (!subTimeline) {
                return;
            }

            for (const subEntry of subTimeline) {
                // Offset sub-entries relative to the mapped start time, clamped to the mapped end time
                subEntry.startTime = Math.min(mappedStartTime + (subEntry.startTime - entry.startTime), mappedEndTime);
                subEntry.endTime = Math.min(mappedStartTime + (subEntry.endTime - entry.startTime), mappedEndTime);

                mapSubTimeline(subEntry.timeline);
            }
        };

        mapSubTimeline(entry.timeline);

        entry.startTime = mappedStartTime;
        entry.endTime = mappedEndTime;
    }
}
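
// Maps a start/end time range in the cropped audio to the corresponding range in the
// original audio, by locating the uncrop timeline entry that overlaps it the most.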
function mapUsingUncropTimeline(startTimeInCroppedAudio, endTimeInCroppedAudio, uncropTimeline) {
    if (uncropTimeline.length === 0) {
        return {
            mappedStartTime: 0,
            mappedEndTime: 0,
        };
    }

    let offsetInCroppedAudio = 0;

    if (endTimeInCroppedAudio < startTimeInCroppedAudio) {
        endTimeInCroppedAudio = startTimeInCroppedAudio;
    }

    let bestOverlapDuration = -1;
    let mappedStartTime = -1;
    let mappedEndTime = -1;

    for (const uncropEntry of uncropTimeline) {
        const uncropEntryDuration = uncropEntry.endTime - uncropEntry.startTime;

        const overlapStartTime = Math.max(startTimeInCroppedAudio, offsetInCroppedAudio);
        const overlapEndTime = Math.min(endTimeInCroppedAudio, offsetInCroppedAudio + uncropEntryDuration);

        const overlapDuration = overlapEndTime - overlapStartTime;

        if (overlapDuration >= 0 && overlapDuration > bestOverlapDuration) {
            bestOverlapDuration = overlapDuration;

            mappedStartTime = uncropEntry.startTime + (overlapStartTime - offsetInCroppedAudio);
            mappedEndTime = uncropEntry.startTime + (overlapEndTime - offsetInCroppedAudio);
        }

        offsetInCroppedAudio += uncropEntryDuration;
    }

    if (bestOverlapDuration === -1) {
        if (startTimeInCroppedAudio >= offsetInCroppedAudio) {
            // The given range starts past the end of the cropped audio: clamp to the last uncropped timestamp
            const maxTimestamp = uncropTimeline[uncropTimeline.length - 1].endTime;

            return {
                mappedStartTime: maxTimestamp,
                mappedEndTime: maxTimestamp
            };
        }
        else {
            throw new Error(`Given start time ${startTimeInCroppedAudio} was smaller than the cropped audio duration, but no match was found in the uncrop timeline (should not occur)`);
        }
    }

    return {
        mappedStartTime,
        mappedEndTime
    };
}
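
// Default options for voice activity detection.
// 'activityThreshold' is the minimum speech probability for a frame or part to count as active.
// 'frameDuration' values are given in milliseconds.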
export const defaultVADOptions = {
    engine: 'silero',
    activityThreshold: 0.5,

    webrtc: {
        frameDuration: 30,
        mode: 1
    },

    silero: {
        frameDuration: 90,
        provider: undefined,
    },

    rnnoise: {},

    whisper: {
        model: 'tiny',
        temperature: 1.0,
    },

    adaptiveGate: {}
};
export const vadEngines = [
    {
        id: 'webrtc',
        name: 'WebRTC VAD',
        description: 'A voice activity detector from the Chromium browser sources.',
        type: 'local'
    },
    {
        id: 'silero',
        name: 'Silero VAD',
        description: 'A voice activity detection model by Silero.',
        type: 'local'
    },
    {
        id: 'rnnoise',
        name: 'RNNoise',
        description: `Uses RNNoise's internal speech probabilities as VAD metrics.`,
        type: 'local'
    }
];
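
/*
    Minimal usage sketch (not part of the module). It assumes the input argument may be a
    file path or any audio form accepted by ensureRawAudio; 'speech.wav' is a hypothetical file:

        import { detectVoiceActivity } from './VoiceActivityDetection.js';

        const { timeline, croppedRawAudio } = await detectVoiceActivity('speech.wav', {
            engine: 'silero',
            activityThreshold: 0.5,
        });

    'timeline' holds the segments detected as active speech, and 'croppedRawAudio' is the
    input audio cropped to those segments.
*/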
//# sourceMappingURL=VoiceActivityDetection.js.map