UNPKG

echogarden

Version:

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

138 lines 6.07 kB
import { getRawAudioDuration } from '../audio/AudioUtilities.js';
import { createHighpassFilter, createLowpassFilter } from '../dsp/BiquadFilter.js';
import { DecayingPeakEstimator } from '../dsp/DecayingPeakEstimator.js';
import { LoudnessEstimator } from '../dsp/LoudnessEstimator.js';
import { extendDeep } from '../utilities/ObjectUtilities.js';
import { logToStderr } from '../utilities/Utilities.js';

const log = logToStderr;

// Lowest loudness value used by the gate: the estimators are initialized to it,
// the loudness estimator is configured with it as its minimum, and the maximum
// tracker ignores readings below it.
const loudnessFloor = -60;

// Lower bound applied to the tracked maximum loudness before the relative
// threshold is added to it (see `markActiveFrames`).
const referenceLoudnessFloor = -30;

/**
 * Classifies audio into alternating `'active'` / `'inactive'` segments using
 * an adaptive loudness gate.
 *
 * Every sample of every channel is fed through an `AdaptiveGateVAD`. Roughly
 * every 10 ms of audio a snapshot of the gate's loudness estimates is taken,
 * each snapshot is classified as active or inactive, and consecutive
 * snapshots with the same classification are merged into one segment.
 *
 * @param {object} rawAudio - Audio to analyze. Must provide `audioChannels`
 *   (an array with one indexable sample sequence per channel) and `sampleRate`.
 * @param {object} [options] - Overrides merged over `defaultAdaptiveGateOptions`
 *   via `extendDeep`.
 * @returns {Promise<Array<{type: 'segment', text: 'active' | 'inactive', startTime: number, endTime: number}>>}
 *   Segments in chronological order. Empty when the audio has no channels or
 *   no samples.
 */
export async function detectVoiceActivity(rawAudio, options) {
  const channelCount = rawAudio.audioChannels.length;

  // Without any channel there is nothing to read a sample count from; return
  // the same empty timeline that zero-length audio produces.
  if (channelCount === 0) {
    return [];
  }

  const sampleRate = rawAudio.sampleRate;
  const audioDuration = getRawAudioDuration(rawAudio);
  const resolvedOptions = extendDeep(defaultAdaptiveGateOptions, options);
  const gateVAD = new AdaptiveGateVAD(sampleRate, channelCount, resolvedOptions);

  // Spacing between loudness snapshots, in the time unit of
  // `sampleIndex / sampleRate`.
  const frameDuration = 0.01;

  const frameRecords = collectFrameRecords(rawAudio, gateVAD, frameDuration);
  const frameActive = markActiveFrames(frameRecords, resolvedOptions, frameDuration);

  return buildActivityTimeline(frameRecords, frameActive, frameDuration, audioDuration);
}

// Runs every sample through the gate and snapshots its loudness estimates
// about once per `frameDuration`. Expects at least one channel.
function collectFrameRecords(rawAudio, gateVAD, frameDuration) {
  const { audioChannels, sampleRate } = rawAudio;
  const channelCount = audioChannels.length;
  const sampleCount = audioChannels[0].length;

  const frameRecords = [];
  let previousRecord;

  for (let sampleIndex = 0; sampleIndex < sampleCount; sampleIndex++) {
    const timePosition = sampleIndex / sampleRate;

    for (let channelIndex = 0; channelIndex < channelCount; channelIndex++) {
      gateVAD.process(audioChannels[channelIndex][sampleIndex], channelIndex);
    }

    // The comparison is strict: a new snapshot is taken at the first sample
    // whose time exceeds the previous snapshot's time by more than
    // `frameDuration`.
    if (previousRecord === undefined || timePosition > previousRecord.timePosition + frameDuration) {
      previousRecord = {
        timePosition,
        loudness: gateVAD.loudnessEstimator.currentLoudness,
        minimumLoudness: gateVAD.minimumLoudnessEstimator.currentPeak,
        maximumLoudness: gateVAD.maximumLoudnessEstimator.currentPeak,
      };

      frameRecords.push(previousRecord);
    }
  }

  return frameRecords;
}

// Decides, per snapshot, whether it counts as active. Walks backwards so that
// a snapshot passing the threshold can also activate the
// `backwardExtensionDuration` worth of snapshots that precede it.
function markActiveFrames(frameRecords, { backwardExtensionDuration, relativeThreshold }, frameDuration) {
  const backwardExtensionFrameCount = Math.floor(backwardExtensionDuration / frameDuration);
  const frameActive = new Array(frameRecords.length).fill(false);

  // Lowest index still covered by a later snapshot's backward extension.
  // Starts past the end so nothing is covered initially.
  let extendedActivityStartIndex = frameRecords.length;

  for (let i = frameRecords.length - 1; i >= 0; i--) {
    const record = frameRecords[i];
    const referenceLoudness = Math.max(record.maximumLoudness, referenceLoudnessFloor);
    const passesThreshold = record.loudness >= referenceLoudness + relativeThreshold;

    frameActive[i] = passesThreshold || i >= extendedActivityStartIndex;

    if (passesThreshold) {
      extendedActivityStartIndex = Math.max(i - backwardExtensionFrameCount, 0);
    }
  }

  return frameActive;
}

// Merges runs of equally classified snapshots into timeline segments.
function buildActivityTimeline(frameRecords, frameActive, frameDuration, audioDuration) {
  const timeline = [];

  for (let i = 0; i < frameRecords.length; i++) {
    const text = frameActive[i] ? 'active' : 'inactive';
    const startTime = frameRecords[i].timePosition;
    // Never let a segment extend past the end of the audio.
    const endTime = Math.min(startTime + frameDuration, audioDuration);
    const lastSegment = timeline[timeline.length - 1];

    if (lastSegment !== undefined && lastSegment.text === text) {
      lastSegment.endTime = endTime;
    } else {
      timeline.push({
        type: 'segment',
        text,
        startTime,
        endTime,
      });
    }
  }

  return timeline;
}

/**
 * Sample-by-sample loudness gate state.
 *
 * Each channel gets its own highpass and lowpass filter. The filtered samples
 * of all channels feed one shared loudness estimator, whose current loudness
 * is in turn tracked by a decaying minimum and a decaying maximum estimator.
 */
export class AdaptiveGateVAD {
  sampleRate;
  channelCount;
  options;
  channelHighpassFilters;
  channelLowpassFilters;
  loudnessEstimator;
  minimumLoudnessEstimator;
  maximumLoudnessEstimator;

  /**
   * @param {number} sampleRate - Sample rate of the audio that will be processed.
   * @param {number} channelCount - Number of channels that will be processed.
   * @param {object} options - Fully populated options; see
   *   `defaultAdaptiveGateOptions` for the fields that are read.
   */
  constructor(sampleRate, channelCount, options) {
    this.sampleRate = sampleRate;
    this.channelCount = channelCount;
    this.options = options;

    this.channelHighpassFilters = [];
    this.channelLowpassFilters = [];

    for (let i = 0; i < this.channelCount; i++) {
      this.channelHighpassFilters.push(createHighpassFilter(this.sampleRate, options.lowCutoff));
      this.channelLowpassFilters.push(createLowpassFilter(this.sampleRate, options.highCutoff));
    }

    this.loudnessEstimator = new LoudnessEstimator({
      sampleRate: this.sampleRate,
      channelCount: this.channelCount,
      positiveAdaptationRate: options.positiveAdaptationRate,
      negativeAdaptationRate: options.negativeAdaptationRate,
      initialEstimate: loudnessFloor,
      minimumLoudness: loudnessFloor,
      applyKWeighting: false,
    });

    // `process` is called once per sample per channel, so the peak estimators
    // are ticked `sampleRate * channelCount` times per second of audio.
    const ticksPerSecond = this.sampleRate * this.channelCount;

    this.minimumLoudnessEstimator = new DecayingPeakEstimator({
      kind: 'minimum',
      decayPerSecond: options.peakLoudnessDecay,
      initialPeak: loudnessFloor,
    }, ticksPerSecond);

    this.maximumLoudnessEstimator = new DecayingPeakEstimator({
      kind: 'maximum',
      decayPerSecond: options.peakLoudnessDecay,
      initialPeak: loudnessFloor,
    }, ticksPerSecond);
  }

  /**
   * Feeds one sample of one channel through the gate and updates all estimators.
   *
   * @param {number} sample - Sample value for the given channel.
   * @param {number} channelIndex - Zero-based index of the channel the sample
   *   belongs to; selects the filters to apply.
   */
  process(sample, channelIndex) {
    const highpassed = this.channelHighpassFilters[channelIndex].filter(sample);
    const bandLimited = this.channelLowpassFilters[channelIndex].filter(highpassed);

    this.loudnessEstimator.process(bandLimited, channelIndex);

    const currentLoudness = this.loudnessEstimator.currentLoudness;

    this.minimumLoudnessEstimator.process(currentLoudness);

    // Readings below the floor do not update the maximum tracker.
    if (currentLoudness >= loudnessFloor) {
      this.maximumLoudnessEstimator.process(currentLoudness);
    }
  }
}

/**
 * Default options for `detectVoiceActivity` and `AdaptiveGateVAD`.
 *
 * - `lowCutoff`: cutoff passed to the per-channel highpass filter.
 * - `highCutoff`: cutoff passed to the per-channel lowpass filter.
 * - `positiveAdaptationRate` / `negativeAdaptationRate`: forwarded to the
 *   loudness estimator.
 * - `peakLoudnessDecay`: per-second decay of the minimum and maximum trackers.
 * - `backwardExtensionDuration`: how far before a frame that passes the
 *   threshold the active state is extended, in the same time unit as the
 *   frame duration.
 * - `relativeThreshold`: offset added to the reference loudness; a frame is
 *   active when its loudness is at or above the result.
 */
export const defaultAdaptiveGateOptions = {
  lowCutoff: 100,
  highCutoff: 1000,
  positiveAdaptationRate: 400.0,
  negativeAdaptationRate: 10.0,
  peakLoudnessDecay: 4.0,
  backwardExtensionDuration: 0.2,
  relativeThreshold: -15,
};
//# sourceMappingURL=AdaptiveGateVAD.js.map