UNPKG

echogarden

Version:

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

140 lines 6.21 kB
import { extendDeep } from '../utilities/ObjectUtilities.js';
import { Logger } from '../utilities/Logger.js';
import { resampleAudioSpeex } from './SpeexResampler.js';
import { computeMelSpectrogram } from './MelSpectrogram.js';
import { powerToDecibels } from '../audio/AudioUtilities.js';
import { normalizeVectors } from '../math/VectorMath.js';

/**
 * Computes MFCC feature vectors for a mono audio object.
 *
 * Pipeline: resample to the analysis sample rate, optionally apply emphasis,
 * compute a Mel spectrogram, convert every Mel frame to MFCCs, then optionally
 * normalize, lifter and zero the first coefficient.
 *
 * @param {object} monoAudio - Audio object exposing an `audioChannels` array
 *   that must contain exactly one channel.
 * @param {object} [options={}] - Overrides merged over `defaultMfccOptions`.
 * @returns {Promise<Array>} One MFCC vector per Mel spectrogram frame.
 * @throws {Error} If `monoAudio` does not have exactly one channel.
 */
export async function computeMFCCs(monoAudio, options = {}) {
    const logger = new Logger();

    logger.start('Initialize options');

    if (monoAudio.audioChannels.length !== 1) {
        throw new Error('Audio must be mono');
    }

    const {
        analysisSampleRate,
        featureCount,
        fftOrder,
        windowDuration,
        hopDuration,
        filterbankCount,
        lowerFreq: lowerFrequencyHz,
        upperFreq: upperFrequencyHz,
        emphasisFactor,
        lifteringFactor,
        zeroFirstCoefficient,
        normalize,
    } = extendDefaultMfccOptions(options);

    // NOTE(review): these products can be fractional for some duration / sample
    // rate combinations; confirm computeMelSpectrogram tolerates that.
    const windowSize = windowDuration * analysisSampleRate;
    const hopLength = hopDuration * analysisSampleRate;

    logger.start(`Resample audio to analysis sample rate (${analysisSampleRate}Hz)`);
    const resampledAudio = await resampleAudioSpeex(monoAudio, analysisSampleRate);

    if (emphasisFactor > 0) {
        logger.start('Apply emphasis');
        // applyEmphasis returns a new buffer, which replaces the channel on the resampled audio object.
        resampledAudio.audioChannels[0] = applyEmphasis(resampledAudio.audioChannels[0], emphasisFactor);
    }

    logger.start('Compute Mel spectrogram');
    const { melSpectrogram } = await computeMelSpectrogram(
        resampledAudio,
        fftOrder,
        windowSize,
        hopLength,
        filterbankCount,
        lowerFrequencyHz,
        upperFrequencyHz,
    );

    logger.start('Extract MFCCs from Mel spectrogram');
    let mfccs = melSpectrogramToMFCCs(melSpectrogram, featureCount);

    if (normalize) {
        logger.start('Normalize MFCCs');
        const { normalizedVectors } = normalizeVectors(mfccs);
        mfccs = normalizedVectors;
    }

    if (lifteringFactor > 0) {
        logger.start('Apply liftering to MFCCs');
        mfccs = applyLiftering(mfccs, lifteringFactor);
    }

    if (zeroFirstCoefficient) {
        for (const mfcc of mfccs) {
            mfcc[0] = 0;
        }
    }

    logger.end();

    return mfccs;
}

/**
 * Converts every frame of a Mel spectrogram to an MFCC vector.
 *
 * @param {Array} melSpectrogram - Array of Mel spectrum frames; all frames are
 *   assumed to have the length of the first one.
 * @param {number} mfccFeatureCount - Number of coefficients to produce per frame.
 * @returns {Float32Array[]} One MFCC vector per input frame; an empty array
 *   when there are no frames.
 */
export function melSpectrogramToMFCCs(melSpectrogram, mfccFeatureCount) {
    // Without frames there is no band count to read from the first frame.
    if (melSpectrogram.length === 0) {
        return [];
    }

    const melBandCount = melSpectrogram[0].length;
    const dctMatrix = createDCTType2CoefficientMatrix(mfccFeatureCount, melBandCount);

    return melSpectrogram.map((frame) => melSpectrumToMFCC(frame, mfccFeatureCount, dctMatrix));
}

/**
 * Converts a single Mel spectrum to an MFCC vector by applying the given
 * DCT matrix to the decibel-scaled Mel bands.
 *
 * @param {ArrayLike<number>} melSpectrum - Mel band values for one frame.
 * @param {number} mfccFeatureCount - Number of coefficients to produce.
 * @param {ArrayLike<ArrayLike<number>>} dctMatrix - One row per coefficient,
 *   one column per Mel band (see `createDCTType2CoefficientMatrix`).
 * @param {string} [normalization='orthonormal'] - `'orthonormal'` applies the
 *   scaling factors below; any other value applies no scaling.
 * @returns {Float32Array} Vector of `mfccFeatureCount` coefficients.
 */
export function melSpectrumToMFCC(melSpectrum, mfccFeatureCount, dctMatrix, normalization = 'orthonormal') {
    const melBandCount = melSpectrum.length;

    let firstFeatureNormalizationFactor = 1;
    let nonfirstFeatureNormalizationFactor = 1;

    if (normalization === 'orthonormal') {
        // NOTE(review): these factors are derived from mfccFeatureCount, whereas
        // SciPy's orthonormal DCT-II scales by the input length (melBandCount
        // here). Left unchanged to preserve existing output; confirm intent.
        firstFeatureNormalizationFactor = Math.sqrt(1 / (4 * mfccFeatureCount));
        nonfirstFeatureNormalizationFactor = Math.sqrt(1 / (2 * mfccFeatureCount));
    }

    // The decibel value of a band does not depend on the coefficient index,
    // so convert each band once instead of once per coefficient.
    const logMelSpectrum = new Float64Array(melBandCount);

    for (let melBandIndex = 0; melBandIndex < melBandCount; melBandIndex++) {
        logMelSpectrum[melBandIndex] = powerToDecibels(melSpectrum[melBandIndex]);
    }

    const mfcc = new Float32Array(mfccFeatureCount);

    for (let mfccFeatureIndex = 0; mfccFeatureIndex < mfccFeatureCount; mfccFeatureIndex++) {
        const dctMatrixRow = dctMatrix[mfccFeatureIndex];

        let sum = 0;

        for (let melBandIndex = 0; melBandIndex < melBandCount; melBandIndex++) {
            sum += dctMatrixRow[melBandIndex] * logMelSpectrum[melBandIndex];
        }

        const normalizationFactor = mfccFeatureIndex === 0
            ? firstFeatureNormalizationFactor
            : nonfirstFeatureNormalizationFactor;

        // Sum multiplied by 2 to match with librosa
        mfcc[mfccFeatureIndex] = normalizationFactor * 2 * sum;
    }

    return mfcc;
}

/**
 * Builds the cosine coefficient matrix of a type 2 DCT.
 *
 * Entry `[k][n]` is `cos(PI * k / melBandCount * (n + 0.5))`.
 *
 * @param {number} mfccFeatureCount - Number of rows (output coefficients).
 * @param {number} melBandCount - Number of columns (input Mel bands).
 * @returns {Float32Array[]} `mfccFeatureCount` rows of `melBandCount` values.
 */
export function createDCTType2CoefficientMatrix(mfccFeatureCount, melBandCount) {
    const dctMatrix = [];

    for (let mfccFeatureIndex = 0; mfccFeatureIndex < mfccFeatureCount; mfccFeatureIndex++) {
        const row = new Float32Array(melBandCount);
        const innerMultiplier = Math.PI * mfccFeatureIndex / melBandCount;

        for (let melBandIndex = 0; melBandIndex < melBandCount; melBandIndex++) {
            row[melBandIndex] = Math.cos(innerMultiplier * (melBandIndex + 0.5));
        }

        dctMatrix.push(row);
    }

    return dctMatrix;
}

/**
 * Applies a first-order emphasis filter: each output sample is the input
 * sample minus `emphasisFactor` times the previous input sample.
 *
 * @param {ArrayLike<number>} samples - Input samples; not modified.
 * @param {number} [emphasisFactor=0.97] - Weight of the previous sample.
 * @param {number} [initialState=0] - Value used as the sample preceding the first one.
 * @returns {Float32Array} New buffer with the same length as `samples`.
 */
export function applyEmphasis(samples, emphasisFactor = 0.97, initialState = 0) {
    const processedSamples = new Float32Array(samples.length);

    let previousSample = initialState;

    for (let i = 0; i < processedSamples.length; i++) {
        processedSamples[i] = samples[i] - (emphasisFactor * previousSample);
        previousSample = samples[i];
    }

    return processedSamples;
}

/**
 * Applies sinusoidal liftering to a list of MFCC vectors.
 *
 * Coefficient `i` is multiplied by
 * `1 + (lifteringFactor / 2) * sin(PI * (i + 1) / lifteringFactor)`.
 *
 * @param {Array} mfccs - MFCC vectors; all are assumed to have the length of
 *   the first one. The input vectors are not modified.
 * @param {number} lifteringFactor - Liftering parameter.
 * @returns {Float32Array[]} New liftered vectors; an empty array when `mfccs` is empty.
 */
export function applyLiftering(mfccs, lifteringFactor) {
    // Without vectors there is no feature count to read from the first one.
    if (mfccs.length === 0) {
        return [];
    }

    const featureCount = mfccs[0].length;

    const lifterMultipliers = new Float32Array(featureCount);

    for (let i = 0; i < featureCount; i++) {
        lifterMultipliers[i] = 1 + (lifteringFactor / 2) * Math.sin(Math.PI * (i + 1) / lifteringFactor);
    }

    const lifteredMfccs = [];

    for (const mfcc of mfccs) {
        const lifteredMfcc = new Float32Array(featureCount);

        for (let i = 0; i < featureCount; i++) {
            lifteredMfcc[i] = mfcc[i] * lifterMultipliers[i];
        }

        lifteredMfccs.push(lifteredMfcc);
    }

    return lifteredMfccs;
}

/**
 * Default MFCC options, used as the base that caller-supplied options extend.
 */
export const defaultMfccOptions = {
    // Passed to computeMelSpectrogram as the filterbank count.
    filterbankCount: 40,
    // Number of MFCC coefficients produced per frame.
    featureCount: 13,
    // Passed to computeMelSpectrogram as the FFT order.
    fftOrder: 512,
    // Lower and upper frequency bounds (Hz) passed to computeMelSpectrogram.
    lowerFreq: 133.3333,
    upperFreq: 6855.4976,
    // Multiplied by the analysis sample rate to get the window size and hop length.
    windowDuration: 0.025,
    hopDuration: 0.010,
    // Emphasis is applied only when this is greater than 0.
    emphasisFactor: 0.97,
    // Sample rate (Hz) the audio is resampled to before analysis.
    analysisSampleRate: 16000,
    // Liftering is applied only when this is greater than 0.
    lifteringFactor: 0,
    // When true, the MFCCs are passed through normalizeVectors.
    normalize: false,
    // When true, the first coefficient of every vector is set to 0.
    zeroFirstCoefficient: false,
};

/**
 * Combines the given options with `defaultMfccOptions` via `extendDeep`.
 *
 * @param {object} options - Caller-supplied option overrides.
 * @returns {object} The result of `extendDeep(defaultMfccOptions, options)`.
 */
export function extendDefaultMfccOptions(options) {
    return extendDeep(defaultMfccOptions, options);
}
//# sourceMappingURL=MFCC.js.map