echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
140 lines • 6.21 kB
JavaScript
import { extendDeep } from '../utilities/ObjectUtilities.js';
import { Logger } from '../utilities/Logger.js';
import { resampleAudioSpeex } from './SpeexResampler.js';
import { computeMelSpectrogram } from './MelSpectrogram.js';
import { powerToDecibels } from '../audio/AudioUtilities.js';
import { normalizeVectors } from '../math/VectorMath.js';
/**
 * Computes MFCC feature vectors for a mono audio object.
 *
 * The audio is resampled to the analysis sample rate, optionally run through
 * the emphasis filter, converted to a Mel spectrogram and then to MFCCs.
 * Normalization, liftering and zeroing of the first coefficient are applied
 * afterwards, in that order, when the corresponding options enable them.
 *
 * @param monoAudio - Audio object exposing `audioChannels`; must hold exactly one channel.
 * @param {object} [options={}] - Overrides merged over the defaults by `extendDefaultMfccOptions`.
 * @returns {Promise<Array>} One MFCC vector per Mel spectrogram frame.
 * @throws {Error} When the audio does not have exactly one channel.
 */
export async function computeMFCCs(monoAudio, options = {}) {
    const logger = new Logger();

    logger.start('Initialize options');

    if (monoAudio.audioChannels.length != 1) {
        throw new Error('Audio must be mono');
    }

    const {
        analysisSampleRate,
        featureCount,
        fftOrder,
        windowDuration,
        hopDuration,
        filterbankCount,
        lowerFreq: lowerFrequencyHz,
        upperFreq: upperFrequencyHz,
        emphasisFactor,
        lifteringFactor,
        zeroFirstCoefficient,
        normalize: shouldNormalize,
    } = extendDefaultMfccOptions(options);

    // Durations are converted to sample counts at the analysis sample rate.
    const windowSize = windowDuration * analysisSampleRate;
    const hopLength = hopDuration * analysisSampleRate;

    logger.start(`Resample audio to analysis sample rate (${analysisSampleRate}Hz)`);
    const resampledAudio = await resampleAudioSpeex(monoAudio, analysisSampleRate);

    if (emphasisFactor > 0) {
        logger.start('Apply emphasis');

        // The filtered samples replace the only channel of the resampled audio.
        const channels = resampledAudio.audioChannels;
        channels[0] = applyEmphasis(channels[0], emphasisFactor);
    }

    logger.start('Compute Mel spectrogram');
    const spectrogramResult = await computeMelSpectrogram(
        resampledAudio,
        fftOrder,
        windowSize,
        hopLength,
        filterbankCount,
        lowerFrequencyHz,
        upperFrequencyHz,
    );
    const melSpectrogram = spectrogramResult.melSpectrogram;

    logger.start('Extract MFCCs from Mel spectrogram');
    let mfccs = melSpectrogramToMFCCs(melSpectrogram, featureCount);

    if (shouldNormalize) {
        logger.start('Normalize MFCCs');
        mfccs = normalizeVectors(mfccs).normalizedVectors;
    }

    if (lifteringFactor > 0) {
        logger.start('Apply liftering to MFCCs');
        mfccs = applyLiftering(mfccs, lifteringFactor);
    }

    if (zeroFirstCoefficient) {
        // Overwrites the first coefficient of every frame in place.
        for (const frameCoefficients of mfccs) {
            frameCoefficients[0] = 0;
        }
    }

    logger.end();

    return mfccs;
}
/**
 * Converts every frame of a Mel spectrogram into an MFCC vector.
 *
 * The band count is taken from the first frame and a single DCT coefficient
 * matrix is built once and shared by all frames.
 *
 * @param {Array} melSpectrogram - Mel spectrum frames.
 * @param {number} mfccFeatureCount - Number of MFCC features to produce per frame.
 * @returns {Array} One MFCC vector per input frame; empty when there are no frames.
 */
export function melSpectrogramToMFCCs(melSpectrogram, mfccFeatureCount) {
    // With no frames there is no first frame to read the band count from.
    if (melSpectrogram.length === 0) {
        return [];
    }

    const melBandCount = melSpectrogram[0].length;
    const dctMatrix = createDCTType2CoefficientMatrix(mfccFeatureCount, melBandCount);

    return melSpectrogram.map(frame => melSpectrumToMFCC(frame, mfccFeatureCount, dctMatrix));
}
/**
 * Converts a single Mel spectrum frame into an MFCC vector.
 *
 * Every band value is passed through `powerToDecibels`, the converted values
 * are multiplied with each row of `dctMatrix` and summed, and each sum is
 * doubled and scaled by a normalization factor.
 *
 * @param melSpectrum - Mel band values of one frame; its length is the band count.
 * @param {number} mfccFeatureCount - Number of MFCC features to produce.
 * @param {Array} dctMatrix - One row per feature, each row holding one coefficient per band.
 * @param {string} [normalization='orthonormal'] - 'orthonormal' applies the scaling
 *   factors below; any other value leaves the doubled sums unscaled.
 * @returns {Float32Array} Vector of `mfccFeatureCount` coefficients.
 */
export function melSpectrumToMFCC(melSpectrum, mfccFeatureCount, dctMatrix, normalization = 'orthonormal') {
    const melBandCount = melSpectrum.length;

    // The converted value of a band does not depend on the feature being
    // computed, so it is evaluated once per band instead of once per
    // (feature, band) pair. Float64Array keeps the full precision of the
    // converted values.
    const logMelSpectrum = new Float64Array(melBandCount);

    for (let melBandIndex = 0; melBandIndex < melBandCount; melBandIndex++) {
        logMelSpectrum[melBandIndex] = powerToDecibels(melSpectrum[melBandIndex]);
    }

    let firstFeatureNormalizationFactor = 1;
    let nonfirstFeatureNormalizationFactor = 1;

    if (normalization === 'orthonormal') {
        // NOTE(review): these factors are derived from mfccFeatureCount. SciPy's
        // orthonormal DCT-II scales by the transform's input length, which here
        // would be the band count - confirm the current scaling is intended.
        firstFeatureNormalizationFactor = Math.sqrt(1 / (4 * mfccFeatureCount));
        nonfirstFeatureNormalizationFactor = Math.sqrt(1 / (2 * mfccFeatureCount));
    }

    const mfcc = new Float32Array(mfccFeatureCount);

    for (let mfccFeatureIndex = 0; mfccFeatureIndex < mfccFeatureCount; mfccFeatureIndex++) {
        const dctMatrixRow = dctMatrix[mfccFeatureIndex];

        let sum = 0;

        for (let melBandIndex = 0; melBandIndex < melBandCount; melBandIndex++) {
            sum += dctMatrixRow[melBandIndex] * logMelSpectrum[melBandIndex];
        }

        const normalizationFactor = mfccFeatureIndex === 0
            ? firstFeatureNormalizationFactor
            : nonfirstFeatureNormalizationFactor;

        // Sum multiplied by 2 to match with librosa
        mfcc[mfccFeatureIndex] = normalizationFactor * 2 * sum;
    }

    return mfcc;
}
/**
 * Builds the cosine coefficient matrix used to turn Mel spectra into MFCCs.
 *
 * Entry (k, n) is `cos(PI * k / melBandCount * (n + 0.5))`, stored as 32-bit floats.
 *
 * @param {number} mfccFeatureCount - Number of rows (one per MFCC feature).
 * @param {number} melBandCount - Number of columns (one per Mel band).
 * @returns {Float32Array[]} Array of `mfccFeatureCount` rows, each of length `melBandCount`.
 */
export function createDCTType2CoefficientMatrix(mfccFeatureCount, melBandCount) {
    const rows = [];

    for (let featureIndex = 0; featureIndex < mfccFeatureCount; featureIndex += 1) {
        // Angular step shared by every column of this row.
        const angularStep = Math.PI * featureIndex / melBandCount;

        const coefficients = new Float32Array(melBandCount)
            .map((_, bandIndex) => Math.cos(angularStep * (bandIndex + 0.5)));

        rows.push(coefficients);
    }

    return rows;
}
/**
 * Applies a first-order emphasis filter: each output sample is the input
 * sample minus `emphasisFactor` times the preceding input sample.
 *
 * The input is left untouched; a new buffer is returned.
 *
 * @param samples - Input samples, indexable and exposing `length`.
 * @param {number} [emphasisFactor=0.97] - Weight of the preceding sample.
 * @param {number} [initialState=0] - Value used as the sample preceding the first one.
 * @returns {Float32Array} Filtered samples, same length as the input.
 */
export function applyEmphasis(samples, emphasisFactor = 0.97, initialState = 0) {
    const emphasized = new Float32Array(samples.length);

    // Carries the previous input sample; seeded with the initial state.
    let previousSample = initialState;

    for (let index = 0; index < emphasized.length; index += 1) {
        const currentSample = samples[index];

        emphasized[index] = currentSample - (emphasisFactor * previousSample);
        previousSample = currentSample;
    }

    return emphasized;
}
/**
 * Applies sinusoidal liftering to a list of MFCC vectors.
 *
 * Coefficient `i` of every vector is multiplied by
 * `1 + (lifteringFactor / 2) * sin(PI * (i + 1) / lifteringFactor)`.
 * The multipliers are stored as 32-bit floats and the feature count is taken
 * from the first vector. The input vectors are not modified.
 *
 * @param {Array} mfccs - MFCC vectors, one per frame.
 * @param {number} lifteringFactor - Liftering parameter.
 * @returns {Float32Array[]} New liftered vectors; empty when `mfccs` is empty.
 */
export function applyLiftering(mfccs, lifteringFactor) {
    // With no vectors there is no first vector to read the feature count from.
    if (mfccs.length === 0) {
        return [];
    }

    const featureCount = mfccs[0].length;

    const lifterMultipliers = new Float32Array(featureCount)
        .map((_, i) => 1 + (lifteringFactor / 2) * Math.sin(Math.PI * (i + 1) / lifteringFactor));

    // Mapping over the multipliers yields a fresh Float32Array of featureCount entries per frame.
    return Array.from(mfccs, mfcc => lifterMultipliers.map((multiplier, i) => mfcc[i] * multiplier));
}
// Default option values for `computeMFCCs`; caller-supplied options are merged
// over these by `extendDefaultMfccOptions`.
export const defaultMfccOptions = {
// Filterbank count passed to `computeMelSpectrogram`.
filterbankCount: 40,
// Number of MFCC features produced per frame.
featureCount: 13,
// FFT order passed to `computeMelSpectrogram`.
fftOrder: 512,
// Lower and upper frequency bounds in Hz, passed to `computeMelSpectrogram`.
lowerFreq: 133.3333,
upperFreq: 6855.4976,
// Multiplied by `analysisSampleRate` (Hz) to obtain the window size in samples.
windowDuration: 0.025,
// Multiplied by `analysisSampleRate` (Hz) to obtain the hop length in samples.
hopDuration: 0.010,
// Weight of the preceding sample in `applyEmphasis`; emphasis is applied only when > 0.
emphasisFactor: 0.97,
// Sample rate, in Hz, the audio is resampled to before analysis.
analysisSampleRate: 16000,
// Liftering is applied only when this is > 0.
lifteringFactor: 0,
// When truthy, the MFCCs are passed through `normalizeVectors`.
normalize: false,
// When truthy, the first coefficient of every MFCC vector is set to 0.
zeroFirstCoefficient: false,
};
/**
 * Combines caller-supplied MFCC options with `defaultMfccOptions` via `extendDeep`.
 *
 * @param {object} options - Option overrides.
 * @returns {object} The result of `extendDeep(defaultMfccOptions, options)`.
 */
export function extendDefaultMfccOptions(options) {
    const mergedOptions = extendDeep(defaultMfccOptions, options);

    return mergedOptions;
}
//# sourceMappingURL=MFCC.js.map