UNPKG

audio-sentence-detector

Version:

Advanced audio sentence detection using signal processing and voice activity detection

github.com/qwertyczee/audio-sentence-detector

qwertyczee/audio-sentence-detector

584 lines (486 loc) • 22.7 kB

JavaScript

/**
 * Splits audio into sentence-like segments by locating pauses.
 *
 * Each analysis window is classified as voice / non-voice from its
 * zero-crossing rate, spectral centroid, formant-band energy and voice-band
 * energy. Runs of non-voice (or low-RMS) windows become silent regions, and
 * the spans between silent regions become sentences.
 */
class AudioSentenceDetector {
  /**
   * @param {Object} [options]
   * @param {number} [options.minSilenceDuration=0.5] Minimum pause length (s) that counts as a silent region.
   * @param {number} [options.silenceThreshold=0.01] RMS below which a window is treated as silent.
   * @param {number} [options.minSentenceLength=1] Shortest sentence (s) that is emitted.
   * @param {number} [options.maxSentenceLength=15] Longer spans are split into equal parts.
   * @param {number} [options.windowSize=2048] Analysis window length in samples.
   * @param {boolean} [options.allowGaps=true] Whether sentences may leave the pauses uncovered.
   * @param {number} [options.minSegmentLength=0] When > 0, short sentences are merged together.
   * @param {boolean} [options.alignToAudioBoundaries=false] Stretch first/last sentence to the audio edges.
   * @param {number} [options.fundamentalFreqMin=85]
   * @param {number} [options.fundamentalFreqMax=255]
   * @param {number} [options.voiceActivityThreshold=0.4]
   * @param {number} [options.zeroCrossingRateThreshold=0.3]
   * @param {boolean} [options.debug=false] Log intermediate results to the console.
   */
  constructor(options = {}) {
    // Only null/undefined fall back to the default, so an explicit 0
    // (e.g. silenceThreshold: 0) is honoured instead of being discarded.
    const withDefault = (value, fallback) =>
      (value !== undefined && value !== null ? value : fallback);

    this.options = {
      minSilenceDuration: withDefault(options.minSilenceDuration, 0.5),
      silenceThreshold: withDefault(options.silenceThreshold, 0.01),
      minSentenceLength: withDefault(options.minSentenceLength, 1),
      // 0 is not usable for the next two (it would produce an endless loop
      // when splitting / windowing), so any falsy value keeps the default.
      maxSentenceLength: options.maxSentenceLength || 15,
      windowSize: options.windowSize || 2048,
      allowGaps: options.allowGaps !== undefined ? options.allowGaps : true,
      minSegmentLength: options.minSegmentLength || 0,
      alignToAudioBoundaries: options.alignToAudioBoundaries || false,
      fundamentalFreqMin: withDefault(options.fundamentalFreqMin, 85),
      fundamentalFreqMax: withDefault(options.fundamentalFreqMax, 255),
      formantFreqRanges: [
        [270, 730],
        [840, 2290],
        [1690, 3010]
      ],
      voiceActivityThreshold: withDefault(options.voiceActivityThreshold, 0.4),
      // The following three are stored but not read by any method of this class.
      minVoiceActivityDuration: withDefault(options.minVoiceActivityDuration, 0.1),
      energySmoothing: withDefault(options.energySmoothing, 0.95),
      formantEmphasis: withDefault(options.formantEmphasis, 0.7),
      zeroCrossingRateThreshold: withDefault(options.zeroCrossingRateThreshold, 0.3)
    };

    this.debug = options.debug || false;

    // Scratch buffers reused between performFFT calls.
    this.fftBuffers = {
      real: null,
      imag: null,
      magnitudes: null,
      window: null
    };

    // Not read by any method of this class; kept so the instance shape is unchanged.
    this.voiceActivityBuffer = new Float32Array(1024);
    this.memoizedFFT = new Map();
  }

  /**
   * Fraction of adjacent sample pairs whose sign differs.
   *
   * @param {ArrayLike<number>} buffer
   * @returns {number} Value in [0, 1]; 0 when fewer than two samples are given.
   */
  calculateZeroCrossingRate(buffer) {
    if (buffer.length < 2) {
      return 0; // no sample pairs: avoid 0/0
    }

    let crossings = 0;
    for (let i = 1; i < buffer.length; i++) {
      const currentIsNegative = buffer[i] < 0;
      const previousIsNegative = buffer[i - 1] < 0;
      if (currentIsNegative !== previousIsNegative) {
        crossings++;
      }
    }
    return crossings / (buffer.length - 1);
  }

  /**
   * Magnitude-weighted mean frequency of a half spectrum.
   *
   * @param {ArrayLike<number>} magnitudes Half spectrum as returned by performFFT.
   * @param {number} sampleRate
   * @returns {number} Centroid in Hz, or 0 for an all-zero spectrum.
   */
  calculateSpectralCentroid(magnitudes, sampleRate) {
    const freqResolution = sampleRate / (magnitudes.length * 2);
    let weightedSum = 0;
    let sum = 0;

    for (let i = 0; i < magnitudes.length; i++) {
      weightedSum += i * freqResolution * magnitudes[i];
      sum += magnitudes[i];
    }

    return sum === 0 ? 0 : weightedSum / sum;
  }

  /**
   * Share of total spectral magnitude that falls into each formant range.
   *
   * @param {ArrayLike<number>} magnitudes Half spectrum as returned by performFFT.
   * @param {number} sampleRate
   * @returns {number[]} One ratio per entry of options.formantFreqRanges;
   *   all zeros when the spectrum carries no energy.
   */
  detectFormants(magnitudes, sampleRate) {
    const ranges = this.options.formantFreqRanges;

    let totalEnergy = 0;
    for (let i = 0; i < magnitudes.length; i++) {
      totalEnergy += magnitudes[i];
    }
    if (totalEnergy === 0) {
      return ranges.map(() => 0); // silent window: avoid 0/0
    }

    const binWidth = sampleRate / magnitudes.length / 2;
    return ranges.map(([min, max]) => {
      const minBin = Math.floor(min / binWidth);
      const maxBin = Math.ceil(max / binWidth);
      let energy = 0;
      for (let i = minBin; i <= maxBin && i < magnitudes.length; i++) {
        energy += magnitudes[i];
      }
      return energy / totalEnergy;
    });
  }

  /**
   * Decides whether a window of samples looks like voice.
   *
   * @param {ArrayLike<number>} buffer Samples of one analysis window.
   * @param {number} sampleRate
   * @returns {boolean} true when the weighted score exceeds 0.6.
   */
  isVoiceSegment(buffer, sampleRate) {
    // 1. Zero-crossing rate
    const zcr = this.calculateZeroCrossingRate(buffer);

    // 2. Spectral analysis
    const magnitudes = this.performFFT(buffer);
    const spectralCentroid = this.calculateSpectralCentroid(magnitudes, sampleRate);

    // 3. Formant detection
    const formantScores = this.detectFormants(magnitudes, sampleRate);

    // 4. Energy in the human-voice bands
    const voiceBandEnergy = this.calculateVoiceBandEnergy(magnitudes, sampleRate);

    // 5. Turn each feature into a score in [0, 1]
    const zcrScore = zcr > 0.1 && zcr < this.options.zeroCrossingRateThreshold ? 1 : 0;
    const centroidScore = spectralCentroid > 100 && spectralCentroid < 3000 ? 1 : 0;
    const formantScore =
      formantScores.reduce((acc, score) => acc + (score > 0.1 ? 1 : 0), 0) / formantScores.length;
    const energyScore = voiceBandEnergy > this.options.voiceActivityThreshold ? 1 : 0;

    // Weighted average of all scores
    const weights = [0.3, 0.2, 0.3, 0.2]; // zcr, centroid, formants, energy
    const finalScore =
      zcrScore * weights[0] +
      centroidScore * weights[1] +
      formantScore * weights[2] +
      energyScore * weights[3];

    if (this.debug) {
      console.log('Voice detection scores:', {
        zcr: zcrScore,
        centroid: centroidScore,
        formant: formantScore,
        energy: energyScore,
        final: finalScore
      });
    }

    return finalScore > 0.6; // Stricter threshold for voice detection
  }

  /**
   * Weighted magnitude inside the voice band divided by the total magnitude.
   *
   * Because in-band bins are weighted (up to 2.0) the result can exceed 1.
   *
   * @param {ArrayLike<number>} magnitudes Half spectrum as returned by performFFT.
   * @param {number} sampleRate
   * @returns {number} Ratio, or 0 for an all-zero spectrum.
   */
  calculateVoiceBandEnergy(magnitudes, sampleRate) {
    const { fundamentalFreqMin, fundamentalFreqMax, formantFreqRanges } = this.options;
    const voiceBandMax = formantFreqRanges[2][1]; // Up to the third formant
    const freqResolution = sampleRate / (magnitudes.length * 2);

    let voiceBandEnergy = 0;
    let totalEnergy = 0;

    for (let i = 0; i < magnitudes.length; i++) {
      const frequency = i * freqResolution;
      const magnitude = magnitudes[i];
      totalEnergy += magnitude;

      if (frequency < fundamentalFreqMin || frequency > voiceBandMax) {
        continue; // outside the human voice range
      }

      // Emphasize the fundamental frequency ...
      let weight = frequency <= fundamentalFreqMax ? 2.0 : 1.0;

      // ... but a formant range takes precedence when a bin is in both.
      for (const [min, max] of formantFreqRanges) {
        if (frequency >= min && frequency <= max) {
          weight = 1.5;
          break;
        }
      }

      voiceBandEnergy += magnitude * weight;
    }

    return totalEnergy === 0 ? 0 : voiceBandEnergy / totalEnergy;
  }

  /**
   * Hamming-windowed magnitude spectrum of `buffer`.
   *
   * The samples are zero-padded to the next power of two (minimum 2) because
   * fftIterative is a radix-2 transform; without padding a shorter trailing
   * window would yield NaN magnitudes.
   *
   * @param {ArrayLike<number>} buffer
   * @returns {Float32Array} First half of the spectrum (fftSize / 2 bins).
   *   The array is a scratch buffer that is overwritten by the next call.
   */
  performFFT(buffer) {
    const sampleCount = buffer.length;
    if (sampleCount === 0) {
      return new Float32Array(0);
    }

    let fftSize = 2;
    while (fftSize < sampleCount) {
      fftSize *= 2;
    }

    const buffers = this.fftBuffers;
    if (!buffers.real || buffers.real.length !== fftSize) {
      buffers.real = new Float32Array(fftSize);
      buffers.imag = new Float32Array(fftSize);
      buffers.magnitudes = new Float32Array(fftSize / 2);
    }

    if (!buffers.window || buffers.window.length !== sampleCount) {
      // Precompute the window function over the real samples only
      buffers.window = new Float32Array(sampleCount);
      const lastIndex = sampleCount - 1;
      for (let i = 0; i < sampleCount; i++) {
        buffers.window[i] = lastIndex > 0
          ? 0.54 - 0.46 * Math.cos((2 * Math.PI * i) / lastIndex)
          : 1;
      }
    }

    const { real, imag, magnitudes, window } = buffers;
    for (let i = 0; i < sampleCount; i++) {
      real[i] = buffer[i] * window[i];
    }
    real.fill(0, sampleCount); // clear padding left over from a previous call
    imag.fill(0);

    // In-place FFT
    this.fftIterative(real, imag);

    for (let i = 0; i < magnitudes.length; i++) {
      magnitudes[i] = Math.sqrt(real[i] * real[i] + imag[i] * imag[i]);
    }

    return magnitudes;
  }

  /**
   * In-place iterative radix-2 FFT.
   *
   * The length must be a power of two; other lengths are not validated and
   * produce meaningless output.
   *
   * @param {Float32Array|number[]} real Real parts, overwritten with the result.
   * @param {Float32Array|number[]} imag Imaginary parts, overwritten with the result.
   */
  fftIterative(real, imag) {
    const n = real.length;

    // Number of address bits, computed once instead of per index.
    let bits = 0;
    while ((1 << bits) < n) {
      bits++;
    }

    // Bit reversal
    for (let i = 0; i < n; i++) {
      const j = this.reverseBits(i, bits);
      if (j > i) {
        [real[i], real[j]] = [real[j], real[i]];
        [imag[i], imag[j]] = [imag[j], imag[i]];
      }
    }

    // Butterfly operations
    for (let size = 2; size <= n; size *= 2) {
      const halfSize = size / 2;
      const angle = -2 * Math.PI / size;

      for (let j = 0; j < halfSize; j++) {
        // One twiddle factor serves every block of this stage.
        const cos = Math.cos(angle * j);
        const sin = Math.sin(angle * j);

        for (let i = 0; i < n; i += size) {
          const upper = i + j;
          const lower = upper + halfSize;

          const tReal = real[lower] * cos - imag[lower] * sin;
          const tImag = real[lower] * sin + imag[lower] * cos;

          real[lower] = real[upper] - tReal;
          imag[lower] = imag[upper] - tImag;
          real[upper] += tReal;
          imag[upper] += tImag;
        }
      }
    }
  }

  /**
   * Reverses the lowest `bits` bits of `x`.
   *
   * @param {number} x
   * @param {number} bits
   * @returns {number}
   */
  reverseBits(x, bits) {
    let result = 0;
    for (let i = 0; i < bits; i++) {
      result = (result << 1) | (x & 1);
      x >>= 1;
    }
    return result;
  }

  /**
   * In-place recursive radix-2 FFT. Not called by the other methods of this
   * class (performFFT uses fftIterative).
   *
   * @param {Float32Array|number[]} real Real parts, overwritten with the result.
   * @param {Float32Array|number[]} imag Imaginary parts, overwritten with the result.
   */
  fft(real, imag) {
    const n = real.length;
    if (n <= 1) return;

    const halfN = n / 2;
    const evenReal = new Float32Array(halfN);
    const evenImag = new Float32Array(halfN);
    const oddReal = new Float32Array(halfN);
    const oddImag = new Float32Array(halfN);

    for (let i = 0; i < halfN; i++) {
      evenReal[i] = real[i * 2];
      evenImag[i] = imag[i * 2];
      oddReal[i] = real[i * 2 + 1];
      oddImag[i] = imag[i * 2 + 1];
    }

    this.fft(evenReal, evenImag);
    this.fft(oddReal, oddImag);

    for (let k = 0; k < halfN; k++) {
      const theta = -2 * Math.PI * k / n;
      const cosTheta = Math.cos(theta);
      const sinTheta = Math.sin(theta);

      const tReal = oddReal[k] * cosTheta - oddImag[k] * sinTheta;
      const tImag = oddReal[k] * sinTheta + oddImag[k] * cosTheta;

      real[k] = evenReal[k] + tReal;
      imag[k] = evenImag[k] + tImag;
      real[k + halfN] = evenReal[k] - tReal;
      imag[k + halfN] = evenImag[k] - tImag;
    }
  }

  /**
   * Decodes an encoded audio buffer and detects sentences in its first channel.
   *
   * @param {*} buffer Encoded audio, passed unchanged to getAudioData.
   * @returns {Promise<Array<{index: number, start: number, end: number, duration: number}>>}
   * @throws {Error} Wraps any decoding/detection failure; the original error
   *   is available as `cause`.
   */
  async detect(buffer) {
    try {
      const audioData = await this.getAudioData(buffer);
      return await this.detectSentences(audioData.channelData[0], audioData.sampleRate);
    } catch (error) {
      const wrapped = new Error(`Error processing audio buffer: ${error.message}`);
      wrapped.cause = error; // keep the original error and its stack reachable
      throw wrapped;
    }
  }

  /**
   * Decodes encoded audio with web-audio-api's AudioContext.
   *
   * @param {*} buffer Encoded audio handed to decodeAudioData.
   * @returns {Promise<{channels: number, sampleRate: number, channelData: Array}>}
   */
  async getAudioData(buffer) {
    // Loaded here rather than at module load so the signal-processing methods
    // can be used on raw samples without the decoder package being resolved.
    const { AudioContext } = require('web-audio-api');

    return new Promise((resolve, reject) => {
      const audioContext = new AudioContext();
      audioContext.decodeAudioData(
        buffer,
        (audioBuffer) => {
          const channels = audioBuffer.numberOfChannels;
          const sampleRate = audioBuffer.sampleRate;
          const channelData = [];

          for (let i = 0; i < channels; i++) {
            channelData.push(audioBuffer.getChannelData(i));
          }

          resolve({ channels, sampleRate, channelData });
        },
        (err) => {
          reject(err);
        }
      );
    });
  }

  /**
   * Detects sentences in one channel of decoded samples.
   *
   * @param {Float32Array} audioData Samples of a single channel.
   * @param {number} sampleRate
   * @returns {Promise<Array<{index: number, start: number, end: number, duration: number}>>}
   *   Times are in seconds.
   */
  async detectSentences(audioData, sampleRate) {
    const totalDuration = audioData.length / sampleRate;

    if (this.debug) {
      console.log(`Audio loaded: ${audioData.length} samples, ${sampleRate}Hz`);
      console.log(`Audio duration: ${audioData.length / sampleRate} seconds`);
    }

    const silenceMarkers = this.detectSilentRegions(audioData, sampleRate);
    if (this.debug) {
      console.log(`Found ${silenceMarkers.length} silent regions`);
    }

    const sentences = this.findSentenceBoundaries(silenceMarkers, audioData, sampleRate);
    if (this.debug) {
      console.log(`Detected ${sentences.length} sentences`);
    }

    if (this.options.alignToAudioBoundaries) {
      if (sentences.length === 0) {
        sentences.push({
          index: 0,
          start: 0,
          end: totalDuration,
          duration: totalDuration
        });
      } else {
        const firstSentence = sentences[0];
        if (firstSentence.start !== 0) {
          firstSentence.start = 0;
          // Keep duration consistent with the moved start.
          firstSentence.duration = firstSentence.end - firstSentence.start;
        }

        const lastSentence = sentences[sentences.length - 1];
        lastSentence.end = totalDuration;
        lastSentence.duration = lastSentence.end - lastSentence.start;
      }
    }

    return sentences;
  }

  /**
   * Finds regions where the smoothed voice detector reports no voice or the
   * RMS is below options.silenceThreshold, lasting at least
   * options.minSilenceDuration.
   *
   * @param {Float32Array} audioData Samples of a single channel.
   * @param {number} sampleRate
   * @returns {Array<{start: number, end: number, duration: number, avgRMS: number}>}
   *   Times in seconds. Despite its name, `avgRMS` holds the largest window RMS
   *   seen inside the region (or a mean of such values after merging).
   */
  detectSilentRegions(audioData, sampleRate) {
    const { windowSize, silenceThreshold, minSilenceDuration } = this.options;
    const silentRegions = [];
    let currentSilenceStart = null;
    let maxRMSInSilence = 0;

    // Smoothing buffer of roughly 100 ms. At least one entry is required:
    // with size 0 (e.g. 16 kHz and 2048-sample windows) the buffer would stay
    // empty and every window would be classified as non-voice.
    const recentVoiceFlags = [];
    const smoothingBufferSize = Math.max(1, Math.floor(0.1 * sampleRate / windowSize));

    // Records the open silent region if it is long enough.
    const closeRegion = (endSample) => {
      const silenceDuration = (endSample - currentSilenceStart) / sampleRate;
      if (silenceDuration >= minSilenceDuration) {
        silentRegions.push({
          start: currentSilenceStart / sampleRate,
          end: endSample / sampleRate,
          duration: silenceDuration,
          avgRMS: maxRMSInSilence
        });
      }
    };

    for (let i = 0; i < audioData.length; i += windowSize) {
      const windowEnd = Math.min(i + windowSize, audioData.length);
      const window = audioData.slice(i, windowEnd);

      // Calculation of RMS energy
      let sumOfSquares = 0;
      for (let k = 0; k < window.length; k++) {
        sumOfSquares += window[k] * window[k];
      }
      const rms = Math.sqrt(sumOfSquares / window.length);

      // Voice detection in the current window, smoothed over recent windows
      recentVoiceFlags.push(this.isVoiceSegment(window, sampleRate));
      if (recentVoiceFlags.length > smoothingBufferSize) {
        recentVoiceFlags.shift();
      }
      const voiceActivityRatio =
        recentVoiceFlags.filter((flag) => flag).length / recentVoiceFlags.length;
      const isSmoothedVoice = voiceActivityRatio > 0.6;

      if (this.debug && i % (windowSize * 100) === 0) {
        console.log(`Position ${(i / sampleRate).toFixed(4)}s, RMS: ${rms.toFixed(4)}, Voice: ${isSmoothedVoice}`);
      }

      // Detection of silence or absence of voice
      if (!isSmoothedVoice || rms < silenceThreshold) {
        if (currentSilenceStart === null) {
          currentSilenceStart = i;
          maxRMSInSilence = rms;
        } else {
          maxRMSInSilence = Math.max(maxRMSInSilence, rms);
        }
      } else if (currentSilenceStart !== null) {
        closeRegion(i);
        currentSilenceStart = null;
        maxRMSInSilence = 0;
      }
    }

    // Processing the last region
    if (currentSilenceStart !== null) {
      closeRegion(audioData.length);
    }

    return this.mergeCloseRegions(silentRegions);
  }

  /**
   * Merges consecutive regions that are separated by less than `maxGap`.
   *
   * @param {Array<{start: number, end: number, duration: number, avgRMS: number}>} regions
   *   Regions ordered by start time.
   * @param {number} [maxGap=0.3] Gap (s) below which two regions are merged.
   * @returns {Array<{start: number, end: number, duration: number, avgRMS: number}>}
   */
  mergeCloseRegions(regions, maxGap = 0.3) {
    if (regions.length < 2) return regions;

    const mergedRegions = [];
    let currentRegion = regions[0];

    for (let i = 1; i < regions.length; i++) {
      const nextRegion = regions[i];
      const gap = nextRegion.start - currentRegion.end;

      if (gap < maxGap) {
        currentRegion = {
          start: currentRegion.start,
          end: nextRegion.end,
          duration: nextRegion.end - currentRegion.start,
          avgRMS: (currentRegion.avgRMS + nextRegion.avgRMS) / 2
        };
      } else {
        mergedRegions.push(currentRegion);
        currentRegion = nextRegion;
      }
    }

    mergedRegions.push(currentRegion);
    return mergedRegions;
  }

  /**
   * Turns silent regions into sentences.
   *
   * Spans shorter than options.minSentenceLength are dropped; spans longer
   * than options.maxSentenceLength are split into equal parts.
   *
   * @param {Array<{start: number, end: number}>} silentRegions Ordered by start time.
   * @param {ArrayLike<number>} audioData Only its length is used.
   * @param {number} sampleRate
   * @returns {Array<{index: number, start: number, end: number, duration: number}>}
   */
  findSentenceBoundaries(silentRegions, audioData, sampleRate) {
    const { allowGaps, minSentenceLength, maxSentenceLength, minSegmentLength } = this.options;
    const totalDuration = audioData.length / sampleRate;
    let sentences = [];
    let lastEnd = 0;

    for (let i = 0; i < silentRegions.length; i++) {
      const region = silentRegions[i];
      const hasNextRegion = i < silentRegions.length - 1;

      // NOTE(review): without gaps the boundary is placed halfway between the
      // end of this silence and the start of the NEXT silence, i.e. inside the
      // following speech span. The midpoint of this silence may have been
      // intended - confirm before changing.
      const useMidpoint = !allowGaps && hasNextRegion;
      const midpoint = useMidpoint
        ? (region.end + silentRegions[i + 1].start) / 2
        : null;

      const sentenceDuration = region.start - lastEnd;

      if (sentenceDuration >= minSentenceLength && sentenceDuration <= maxSentenceLength) {
        const segmentEnd = useMidpoint ? midpoint : region.start;
        sentences.push({
          index: sentences.length,
          start: lastEnd,
          end: segmentEnd,
          duration: segmentEnd - lastEnd
        });
      } else if (sentenceDuration > maxSentenceLength) {
        const numParts = Math.ceil(sentenceDuration / maxSentenceLength);
        const partDuration = sentenceDuration / numParts;

        for (let j = 0; j < numParts; j++) {
          const partStart = lastEnd + (j * partDuration);
          let partEnd = lastEnd + ((j + 1) * partDuration);
          if (j === numParts - 1) {
            partEnd = useMidpoint ? midpoint : region.start;
          }

          sentences.push({
            index: sentences.length,
            start: partStart,
            end: partEnd,
            duration: partEnd - partStart
          });
        }
      }

      lastEnd = useMidpoint ? midpoint : region.end;
    }

    // Trailing audio after the last silent region
    if (lastEnd < totalDuration) {
      const remainingDuration = totalDuration - lastEnd;
      if (remainingDuration >= minSentenceLength) {
        sentences.push({
          index: sentences.length,
          start: lastEnd,
          end: totalDuration,
          duration: remainingDuration
        });
      }
    }

    if (minSegmentLength > 0) {
      sentences = this.mergeShortSegments(sentences);
    }

    return sentences;
  }

  /**
   * Groups consecutive sentences so that groups reach
   * options.minSegmentLength where possible.
   *
   * Merged entries keep the `index` of the first sentence in their group, so
   * indices in the result are not renumbered.
   *
   * @param {Array<{index: number, start: number, end: number, duration: number}>} sentences
   * @returns {Array<{index: number, start: number, end: number, duration: number}>}
   */
  mergeShortSegments(sentences) {
    if (sentences.length <= 1) return sentences;

    const minLength = this.options.minSegmentLength;
    const mergedSegments = [];
    let segmentsToMerge = [];

    for (const segment of sentences) {
      if (segmentsToMerge.length === 0) {
        segmentsToMerge.push(segment);
        continue;
      }

      const currentDuration = segmentsToMerge.reduce((sum, seg) => sum + seg.duration, 0);

      if (currentDuration + segment.duration <= minLength) {
        // Still within the minimum: keep collecting
        segmentsToMerge.push(segment);
      } else if (currentDuration >= minLength) {
        // The current group is long enough: emit it and start a new one
        mergedSegments.push(this.mergeSegmentGroup(segmentsToMerge));
        segmentsToMerge = [segment];
      } else {
        // The current group is too short: absorb this segment, then emit
        segmentsToMerge.push(segment);
        mergedSegments.push(this.mergeSegmentGroup(segmentsToMerge));
        segmentsToMerge = [];
      }
    }

    // Emit whatever is left over
    if (segmentsToMerge.length > 0) {
      mergedSegments.push(this.mergeSegmentGroup(segmentsToMerge));
    }

    return mergedSegments;
  }

  /**
   * Collapses a group of consecutive segments into one.
   *
   * @param {Array<{index: number, start: number, end: number, duration: number}>} segments
   * @returns {{index: number, start: number, end: number, duration: number}|null}
   *   null for an empty group; the sole element itself for a group of one.
   */
  mergeSegmentGroup(segments) {
    if (segments.length === 0) return null;
    if (segments.length === 1) return segments[0];

    const start = segments[0].start;
    const end = segments[segments.length - 1].end;

    return {
      index: segments[0].index,
      start,
      end,
      duration: end - start
    };
  }
}

// Always true under CommonJS; the guard only keeps the file loadable where
// `module` is not defined.
if (typeof module !== 'undefined' && module.exports) {
  module.exports = AudioSentenceDetector;
}