echogarden
Version:
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
156 lines (110 loc) • 5.43 kB
text/typescript
import { RawAudio } from '../audio/AudioUtilities.js'
import { Logger } from '../utilities/Logger.js'
import * as FFT from './FFT.js'
/**
 * Builds a bank of Mel filters for the requested frequency range and uses it to
 * compute a Mel spectrogram of the given audio.
 *
 * The filterbank is derived from the frequencies returned by `FFT.getBinFrequencies`
 * (up to half of `rawAudio.sampleRate`) and from `filterbankCount` center frequencies
 * placed between `lowerFrequencyHz` and `upperFrequencyHz` on the Mel scale. The actual
 * spectrogram computation is delegated to `computeMelSpectrogramUsingFilterbanks`.
 *
 * @param rawAudio - Audio to analyze; its `sampleRate` determines the Nyquist frequency.
 * @param fftOrder - Forwarded to the short-time FFT; also determines the spectrum bin count used for the filterbank.
 * @param windowSize - Forwarded to the short-time FFT.
 * @param hopLength - Forwarded to the short-time FFT.
 * @param filterbankCount - Number of Mel filters to build.
 * @param lowerFrequencyHz - Lower edge of the Mel filter range, in Hertz.
 * @param upperFrequencyHz - Upper edge of the Mel filter range, in Hertz.
 * @param windowType - Forwarded to the short-time FFT. Defaults to `'hann'`.
 * @returns The result of `computeMelSpectrogramUsingFilterbanks` for the generated filterbank.
 */
export async function computeMelSpectrogram(rawAudio: RawAudio, fftOrder: number, windowSize: number, hopLength: number, filterbankCount: number, lowerFrequencyHz: number, upperFrequencyHz: number, windowType: FFT.WindowType = 'hann') {
	const logger = new Logger()
	logger.start('Compute Mel filterbank')

	// NOTE(review): a real FFT of size `fftOrder` normally yields `fftOrder / 2 + 1` bins;
	// this uses `+ 2`. Confirm against `FFT.getBinFrequencies` and the frame length
	// produced by `FFT.stftr` before changing it.
	const spectrumBinCount = (fftOrder / 2) + 2
	const spectrumBinFrequenciesHz = FFT.getBinFrequencies(spectrumBinCount, rawAudio.sampleRate / 2)

	// The filter range is expressed on the Mel scale.
	const melRangeStart = hertzToMel(lowerFrequencyHz)
	const melRangeEnd = hertzToMel(upperFrequencyHz)

	const generatedFilterbanks = getMelFilterbanks(
		spectrumBinFrequenciesHz,
		getMelFilterbanksCenterFrequencies(filterbankCount, melRangeStart, melRangeEnd),
		melRangeStart,
		melRangeEnd
	)

	logger.end()

	return computeMelSpectrogramUsingFilterbanks(rawAudio, fftOrder, windowSize, hopLength, generatedFilterbanks, windowType)
}
/**
 * Computes short-time FFT frames for the audio and converts each frame to a Mel
 * spectrum using the supplied filterbanks.
 *
 * Only the first channel (`rawAudio.audioChannels[0]`) is read.
 *
 * @param rawAudio - Audio to analyze.
 * @param fftOrder - Forwarded to `FFT.stftr`.
 * @param windowSize - Forwarded to `FFT.stftr`.
 * @param hopLength - Forwarded to `FFT.stftr`.
 * @param filterbanks - Mel filters applied to the power spectrum of every frame.
 * @param windowType - Forwarded to `FFT.stftr`. Defaults to `'hann'`.
 * @returns An object holding `melSpectrogram` (one Mel spectrum per FFT frame) and
 * the `fftFrames` it was derived from.
 */
export async function computeMelSpectrogramUsingFilterbanks(rawAudio: RawAudio, fftOrder: number, windowSize: number, hopLength: number, filterbanks: Filterbank[], windowType: FFT.WindowType = 'hann') {
	const logger = new Logger()

	logger.start('Compute short-time FFTs')
	const firstChannelSamples = rawAudio.audioChannels[0]
	const shortTimeFrames = await FFT.stftr(firstChannelSamples, fftOrder, windowSize, hopLength, windowType)

	// NOTE(review): a second section is started without an explicit `end()` for the
	// first one; presumably `Logger.start` closes the section in progress. Confirm.
	logger.start('Convert FFT frames to Mel spectrogram')
	const melFrames = fftFramesToMelSpectrogram(shortTimeFrames, filterbanks)
	logger.end()

	return { melSpectrogram: melFrames, fftFrames: shortTimeFrames }
}
/**
 * Converts a sequence of FFT frames to a Mel spectrogram.
 *
 * Each frame is first turned into a power spectrum with `FFT.fftFrameToPowerSpectrum`
 * and then reduced to Mel bands with `powerSpectrumToMelSpectrum`.
 *
 * @param fftFrames - FFT frames, in time order.
 * @param melFilterbanks - Mel filters applied to every frame.
 * @returns One Mel spectrum per input frame, in the same order.
 */
export function fftFramesToMelSpectrogram(fftFrames: Float32Array[], melFilterbanks: Filterbank[]) {
	return fftFrames.map(frame =>
		powerSpectrumToMelSpectrum(FFT.fftFrameToPowerSpectrum(frame), melFilterbanks)
	)
}
/**
 * Reduces a power spectrum to Mel bands by applying each filterbank as a weighted
 * sum over a run of consecutive power spectrum bins.
 *
 * @param powerSpectrum - Power value of every spectrum bin.
 * @param filterbanks - Filters to apply; entry `i` produces output band `i`.
 * @returns A `Float32Array` with one value per filterbank. Bands whose filter has
 * `startIndex === -1` are left at zero.
 */
export function powerSpectrumToMelSpectrum(powerSpectrum: Float32Array, filterbanks: Filterbank[]) {
	const bandCount = filterbanks.length
	const binCount = powerSpectrum.length

	// Typed arrays are zero-initialized, so skipped bands keep the value 0.
	const melSpectrum = new Float32Array(bandCount)

	for (let bandIndex = 0; bandIndex < bandCount; bandIndex++) {
		const { startIndex, weights } = filterbanks[bandIndex]

		// A start index of -1 marks a filter that covers no bins.
		if (startIndex === -1) {
			continue
		}

		let weightedSum = 0

		for (let offset = 0; offset < weights.length; offset++) {
			const binIndex = startIndex + offset

			// The filter may extend past the end of the spectrum; ignore the excess weights.
			if (binIndex >= binCount) {
				break
			}

			weightedSum += weights[offset] * powerSpectrum[binIndex]
		}

		melSpectrum[bandIndex] = weightedSum
	}

	return melSpectrum
}
/**
 * Builds one triangular filter per Mel center frequency over the given power
 * spectrum bins.
 *
 * Each filter rises from its left neighbor's center (or `lowerFrequencyMel` for the
 * first filter) to its own center, then falls to its right neighbor's center (or
 * `upperFrequencyMel` for the last filter). Only the contiguous run of bins with a
 * positive weight is stored, and those weights are scaled so they sum to 1.
 *
 * @param powerSpectrumFrequenciesHz - Frequency of every power spectrum bin, in Hertz.
 * @param centerFrequenciesMel - Center frequency of every filter, on the Mel scale.
 * @param lowerFrequencyMel - Left edge of the first filter, on the Mel scale.
 * @param upperFrequencyMel - Right edge of the last filter, on the Mel scale.
 * @returns One `Filterbank` per center frequency. A filter that covers no bin has
 * `startIndex` of -1 and an empty `weights` array.
 */
export function getMelFilterbanks(powerSpectrumFrequenciesHz: Float32Array, centerFrequenciesMel: Float32Array, lowerFrequencyMel: number, upperFrequencyMel: number) {
	const bandCount = centerFrequenciesMel.length
	const lastBandIndex = bandCount - 1

	const binFrequenciesMel = powerSpectrumFrequenciesHz.map(frequencyHz => hertzToMel(frequencyHz))
	const binCount = binFrequenciesMel.length

	const result: Filterbank[] = []

	for (let bandIndex = 0; bandIndex < bandCount; bandIndex++) {
		const peakMel = centerFrequenciesMel[bandIndex]
		const leftEdgeMel = bandIndex === 0 ? lowerFrequencyMel : centerFrequenciesMel[bandIndex - 1]
		const rightEdgeMel = bandIndex === lastBandIndex ? upperFrequencyMel : centerFrequenciesMel[bandIndex + 1]

		const slopeSpanMel = (rightEdgeMel - leftEdgeMel) / 2

		const rawWeights: number[] = []
		let rawWeightTotal = 0
		let startIndex = -1

		for (let binIndex = 0; binIndex < binCount; binIndex++) {
			const binMel = binFrequenciesMel[binIndex]

			const onRisingSlope = binMel >= leftEdgeMel && binMel <= peakMel
			const onFallingSlope = !onRisingSlope && binMel > peakMel && binMel <= rightEdgeMel

			const weight = onRisingSlope
				? (binMel - leftEdgeMel) / slopeSpanMel
				: onFallingSlope
					? (rightEdgeMel - binMel) / slopeSpanMel
					: 0

			// Written as a negated `>` so that a NaN weight is treated like a zero weight.
			if (!(weight > 0)) {
				// Once the run of positive weights has started, the first
				// non-positive weight ends it.
				if (startIndex !== -1) {
					break
				}

				continue
			}

			if (startIndex === -1) {
				startIndex = binIndex
			}

			rawWeights.push(weight)
			rawWeightTotal += weight
		}

		// Scale the weights so that they sum to 1.
		const weights = rawWeights.map(rawWeight => rawWeight / rawWeightTotal)

		result.push({ startIndex, weights })
	}

	return result
}
/**
 * Places `melBandCount` center frequencies at equal steps strictly between
 * `lowerFrequencyMel` and `upperFrequencyMel`.
 *
 * The range is divided into `melBandCount + 1` equal steps, so neither edge of the
 * range is itself a center frequency.
 *
 * @param melBandCount - Number of center frequencies to produce.
 * @param lowerFrequencyMel - Lower edge of the range, on the Mel scale.
 * @param upperFrequencyMel - Upper edge of the range, on the Mel scale.
 * @returns A `Float32Array` of `melBandCount` center frequencies, in ascending band order.
 */
export function getMelFilterbanksCenterFrequencies(melBandCount: number, lowerFrequencyMel: number, upperFrequencyMel: number) {
	const stepSizeMel = (upperFrequencyMel - lowerFrequencyMel) / (melBandCount + 1)

	return new Float32Array(melBandCount).map(
		(_, bandIndex) => lowerFrequencyMel + ((bandIndex + 1) * stepSizeMel)
	)
}
/**
 * Converts a frequency in Hertz to the Mel scale using
 * `mel = 2595 * log10(1 + hz / 700)`.
 *
 * @param frequency - Frequency in Hertz.
 * @returns The corresponding Mel value.
 */
export function hertzToMel(frequency: number) {
	const melScaleFactor = 2595.0
	const cornerFrequencyHz = 700.0

	const frequencyRatio = frequency / cornerFrequencyHz

	return melScaleFactor * Math.log10(1.0 + frequencyRatio)
}
/**
 * Converts a Mel scale value to a frequency in Hertz using
 * `hz = 700 * (10^(mel / 2595) - 1)`.
 *
 * @param mel - Value on the Mel scale.
 * @returns The corresponding frequency in Hertz.
 */
export function melToHertz(mel: number) {
	const melScaleFactor = 2595.0
	const cornerFrequencyHz = 700.0

	const frequencyRatioPlusOne = Math.pow(10.0, mel / melScaleFactor)

	return cornerFrequencyHz * (frequencyRatioPlusOne - 1.0)
}
/**
 * A single Mel filter, stored as a run of weights over consecutive power
 * spectrum bins.
 */
export type Filterbank = {
/**
 * Index of the power spectrum bin that `weights[0]` applies to.
 * `getMelFilterbanks` sets this to -1 when the filter covers no bin, and
 * `powerSpectrumToMelSpectrum` skips such filters, leaving the band at zero.
 */
startIndex: number
/**
 * Weights for the bins `startIndex`, `startIndex + 1`, and so on.
 * As produced by `getMelFilterbanks`, these are all positive and sum to 1.
 */
weights: number[]
}