import { clip, splitFloat32Array } from '../utilities/Utilities.js'
import * as API from '../api/API.js'
import { computeMFCCs, extendDefaultMfccOptions, MfccOptions } from '../dsp/MFCC.js'
import { alignMFCC_DTW, getCostMatrixMemorySizeMB } from './DTWMfccSequenceAlignment.js'
import { Logger } from '../utilities/Logger.js'
import { addTimeOffsetToTimeline, Timeline, TimelineEntry } from '../utilities/Timeline.js'
import { concatAudioSegments, downmixToMonoAndNormalize, getEmptyRawAudio, getEndingSilentSampleCount, getRawAudioDuration, getStartingSilentSampleCount, RawAudio } from '../audio/AudioUtilities.js'
import chalk from 'chalk'
import { synthesize } from '../api/API.js'
import { resampleAudioSpeex } from '../dsp/SpeexResampler.js'
import { deepClone } from '../utilities/ObjectUtilities.js'
import { cosineDistance, euclideanDistance, zeroIfNaN } from '../math/VectorMath.js'
import { EspeakEvent, EspeakOptions } from '../synthesis/EspeakTTS.js'
import { alignDTWWindowed } from './DTWSequenceAlignmentWindowed.js'
import { loadPackage } from '../utilities/PackageManager.js'
import { joinPath } from '../utilities/PathUtilities.js'
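// Aligns a source audio recording to a reference audio recording and its timeline using
// multi-pass MFCC-based DTW. Each pass runs at the given granularity and maximum window
// duration, and its resulting path is used to center the DTW window of the following pass.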
export async function alignUsingDtw(
sourceRawAudio: RawAudio,
referenceRawAudio: RawAudio,
referenceTimeline: Timeline,
granularities: DtwGranularity[],
windowDurations: number[]) {
const logger = new Logger()
if (windowDurations.length == 0) {
throw new Error(`Window durations array has length 0.`)
}
if (windowDurations.length != granularities.length) {
throw new Error(`Window durations and granularities are not the same length.`)
}
const rawAudioDuration = getRawAudioDuration(sourceRawAudio)
let framesPerSecond: number
let compactedPath: CompactedPath
let relativeCenters: number[] | undefined
for (let passIndex = 0; passIndex < windowDurations.length; passIndex++) {
const granularity = granularities[passIndex]
const windowDuration = windowDurations[passIndex]
logger.logTitledMessage(`\nStarting alignment pass ${passIndex + 1}/${windowDurations.length}`, `granularity: ${granularity}, max window duration: ${windowDuration}s (${(windowDuration / rawAudioDuration * 100).toFixed(1)}%)`, chalk.magentaBright)
const mfccOptions = extendDefaultMfccOptions({ ...getMfccOptionsForGranularity(granularity), zeroFirstCoefficient: true }) as MfccOptions
framesPerSecond = 1 / mfccOptions.hopDuration!
// Compute reference MFCCs
logger.start('Compute reference MFCC features')
const referenceMfccs = await computeMFCCs(referenceRawAudio, mfccOptions)
// Compute source MFCCs
logger.start('Compute source MFCC features')
const sourceMfccs = await computeMFCCs(sourceRawAudio, mfccOptions)
logger.end()
// Compute path
logger.logTitledMessage(`DTW cost matrix memory size`, `${getCostMatrixMemorySizeMB(referenceMfccs.length, sourceMfccs.length, windowDuration * framesPerSecond).toFixed(1)}MB`)
if (passIndex == 0) {
const minRecommendedWindowDuration = 0.2 * rawAudioDuration
if (windowDuration < minRecommendedWindowDuration) {
logger.logTitledMessage('Warning', `Maximum DTW window duration is set to ${windowDuration.toFixed(1)}s (${(windowDuration / rawAudioDuration * 100).toFixed(1)}%), which is less than 20% of the source audio duration (audio duration is ${rawAudioDuration.toFixed(1)}s and a 20% window would be ${(rawAudioDuration * 0.2).toFixed(1)}s). This may lead to suboptimal results in some cases. Consider increasing the window duration if needed.`, chalk.yellowBright, 'warning')
}
}
logger.start('Align reference and source MFCC features using DTW')
const dtwWindowLength = Math.floor(windowDuration * framesPerSecond)
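// If a previous pass produced a path, derive per-reference-frame window centers from it
// to constrain the search region of this pass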
let centerIndexes: number[] | undefined
if (relativeCenters) {
centerIndexes = []
for (let i = 0; i < referenceMfccs.length; i++) {
const relativeReferencePosition = i / referenceMfccs.length
const relativeCenterIndex = Math.floor(relativeReferencePosition * relativeCenters!.length)
const relativeCenter = relativeCenters[relativeCenterIndex]
const centerIndex = Math.floor(relativeCenter * sourceMfccs.length)
centerIndexes.push(centerIndex)
}
}
const rawPath = await alignMFCC_DTW(referenceMfccs, sourceMfccs, dtwWindowLength, undefined, centerIndexes)
compactedPath = compactPath(rawPath)
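// Compute relative window centers from this pass's path, to guide the next, finer-grained pass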
relativeCenters = compactedPath.map(entry => (entry.first + entry.last) / 2 / sourceMfccs.length)
logger.end()
}
logger.start('\nConvert path to timeline')
const mappedTimeline = referenceTimeline.map(entry => getMappedTimelineEntry(entry, sourceRawAudio, framesPerSecond, compactedPath))
logger.end()
return mappedTimeline
}
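// Aligns a reference timeline to the source audio by synthesizing the recognized transcript
// with eSpeak, aligning the reference audio and timeline to the synthesized audio using DTW,
// and then mapping the resulting timestamps to source audio time via the recognition timeline.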
export async function alignUsingDtwWithRecognition(
sourceRawAudio: RawAudio,
referenceRawAudio: RawAudio,
referenceTimeline: Timeline,
recognitionTimeline: Timeline,
granularities: DtwGranularity[],
windowDurations: number[],
espeakOptions: EspeakOptions,
phoneAlignmentMethod: API.PhoneAlignmentMethod = 'interpolation') {
const logger = new Logger()
if (recognitionTimeline.length == 0) {
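// No recognition results: fall back to linearly scaling the reference timeline
// to the source audio duration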
const sourceDuration = getRawAudioDuration(sourceRawAudio)
const referenceDuration = getRawAudioDuration(referenceRawAudio)
const ratio = sourceDuration / referenceDuration
const interpolatedTimeline: Timeline = []
for (const entry of referenceTimeline) {
interpolatedTimeline.push({
type: entry.type,
text: entry.text,
startTime: entry.startTime * ratio,
endTime: entry.endTime * ratio
})
}
return interpolatedTimeline
}
// Synthesize the recognized transcript and get its timeline
logger.start("Synthesize recognized transcript with eSpeak")
const recognizedWords = recognitionTimeline.map(entry => entry.text)
const {
rawAudio: synthesizedRecognizedTranscriptRawAudio,
timeline: synthesizedRecognitionTimeline
} = await createAlignmentReferenceUsingEspeakForFragments(recognizedWords, espeakOptions)
let recognitionTimelineWithPhones: Timeline
if (phoneAlignmentMethod == 'interpolation') {
// Add phone timelines by interpolating from reference words
logger.start('Interpolate phone timing')
recognitionTimelineWithPhones = await interpolatePhoneTimelines(
recognitionTimeline,
synthesizedRecognitionTimeline
)
} else if (phoneAlignmentMethod == 'dtw') {
logger.start('Align phone timing')
// Add phone timelines by aligning each individual recognized word with the corresponding word
// in the reference timeline
recognitionTimelineWithPhones = await alignPhoneTimelines(
sourceRawAudio,
recognitionTimeline,
synthesizedRecognizedTranscriptRawAudio,
synthesizedRecognitionTimeline,
60)
} else {
throw new Error(`Unknown phone alignment method: ${phoneAlignmentMethod}`)
}
// Create a mapping from the synthesized recognized timeline to the recognized timeline
logger.start("Map from the synthesized recognized timeline to the recognized timeline")
type SynthesizedToRecognizedTimeMappingEntry = {
synthesized: number
recognized: number
}
type SynthesizedToRecognizedTimeMapping = SynthesizedToRecognizedTimeMappingEntry[]
const synthesizedToRecognizedTimeMapping: SynthesizedToRecognizedTimeMapping = []
for (let wordEntryIndex = 0; wordEntryIndex < synthesizedRecognitionTimeline.length; wordEntryIndex++) {
const synthesizedTimelineEntry = synthesizedRecognitionTimeline[wordEntryIndex]
const recognitionTimelineEntry = recognitionTimelineWithPhones[wordEntryIndex]
synthesizedToRecognizedTimeMapping.push({
synthesized: synthesizedTimelineEntry.startTime,
recognized: recognitionTimelineEntry.startTime
})
if (synthesizedTimelineEntry.timeline) {
for (let tokenEntryIndex = 0; tokenEntryIndex < synthesizedTimelineEntry.timeline.length; tokenEntryIndex++) {
const synthesizedPhoneTimelineEntry = synthesizedTimelineEntry.timeline[tokenEntryIndex]
const recognitionPhoneTimelineEntry = recognitionTimelineEntry.timeline![tokenEntryIndex]
synthesizedToRecognizedTimeMapping.push({
synthesized: synthesizedPhoneTimelineEntry.startTime,
recognized: recognitionPhoneTimelineEntry.startTime
})
synthesizedToRecognizedTimeMapping.push({
synthesized: synthesizedPhoneTimelineEntry.endTime,
recognized: recognitionPhoneTimelineEntry.endTime
})
}
}
synthesizedToRecognizedTimeMapping.push({
synthesized: synthesizedTimelineEntry.endTime,
recognized: recognitionTimelineEntry.endTime
})
}
// Align the synthesized recognized transcript to the synthesized reference transcript
logger.start("Align the synthesized recognized transcript with the synthesized ground-truth transcript")
const alignedSynthesizedRecognitionTimeline = await alignUsingDtw(
synthesizedRecognizedTranscriptRawAudio,
referenceRawAudio,
referenceTimeline,
granularities,
windowDurations)
let currentSynthesizedToRecognizedMappingIndex = 0
// Map from synthesized reference timestamps to the recognition timestamps
function mapSynthesizedToRecognizedTimeAndAdvance(synthesizedTime: number) {
for (; ; currentSynthesizedToRecognizedMappingIndex += 1) {
const left = synthesizedToRecognizedTimeMapping[currentSynthesizedToRecognizedMappingIndex].synthesized
let right: number
if (currentSynthesizedToRecognizedMappingIndex < synthesizedToRecognizedTimeMapping.length - 1) {
right = synthesizedToRecognizedTimeMapping[currentSynthesizedToRecognizedMappingIndex + 1].synthesized
} else {
right = Infinity
}
if (left > right) {
throw new Error("Left is larger than right!")
}
if (Math.abs(synthesizedTime - left) < Math.abs(synthesizedTime - right)) {
return synthesizedToRecognizedTimeMapping[currentSynthesizedToRecognizedMappingIndex].recognized
}
}
}
function mapTimeline(timeline: Timeline) {
const mappedTimeline: Timeline = []
for (const entry of timeline) {
const mappedEntry = { ...entry }
mappedEntry.startTime = mapSynthesizedToRecognizedTimeAndAdvance(entry.startTime)
if (entry.timeline) {
mappedEntry.timeline = mapTimeline(entry.timeline)
}
mappedEntry.endTime = mapSynthesizedToRecognizedTimeAndAdvance(entry.endTime)
mappedTimeline.push(mappedEntry)
}
return mappedTimeline
}
const result = mapTimeline(alignedSynthesizedRecognitionTimeline)
logger.end()
return result
}
// This is experimental code. It doesn't work well enough to be usable for anything.
// Just testing some alternative approaches.
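// Aligns source and reference audio using learned speech embeddings (w2v-BERT 2.0 features
// or Whisper encoder outputs) instead of MFCC features.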
export async function alignUsingDtwWithEmbeddings(
sourceRawAudio: RawAudio,
referenceRawAudio: RawAudio,
referenceTimeline: Timeline,
language: string,
granularities: DtwGranularity[],
windowDurations: number[]) {
const logger = new Logger()
if (sourceRawAudio.sampleRate != 16000) {
throw new Error('Source audio must have a sample rate of 16000 Hz')
}
if (referenceRawAudio.sampleRate != 16000) {
throw new Error('Reference audio must have a sample rate of 16000 Hz')
}
const embeddingType: 'w2v-bert-2.0' | 'whisper' = 'w2v-bert-2.0'
let sourceEmbeddings: Float32Array[]
let referenceEmbeddings: Float32Array[]
let framesPerSecond: number
if (embeddingType === 'w2v-bert-2.0') {
const packageName = 'w2v-bert-2.0-uint8'
const modelDir = await loadPackage(packageName)
const modelFilePath = joinPath(modelDir, `${packageName}.onnx`)
const { Wav2Vec2BertFeatureEmbeddings } = await import('../speech-embeddings/WavToVec2BertFeatureEmbeddings.js')
const wav2vecBert = new Wav2Vec2BertFeatureEmbeddings(
modelFilePath,
['cpu'],
)
logger.start(`Extract source audio embeddings using the W2V-BERT-2.0 model`)
sourceEmbeddings = await wav2vecBert.computeEmbeddings(sourceRawAudio)
logger.start(`Extract reference audio embeddings using the W2V-BERT-2.0 model`)
referenceEmbeddings = await wav2vecBert.computeEmbeddings(referenceRawAudio)
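// 10ms feature hop, downsampled by a factor of 2 by the model: 50 embedding frames per second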
framesPerSecond = 1000 / 10 / 2
} else if (embeddingType === 'whisper') {
const sourceSamples = sourceRawAudio.audioChannels[0]
const referenceSamples = referenceRawAudio.audioChannels[0]
const WhisperSTT = await import('../recognition/WhisperSTT.js')
const { modelName, modelDir } = await WhisperSTT.loadPackagesAndGetPaths('base.en', language)
const whisper = new WhisperSTT.Whisper(modelName, modelDir, ['dml', 'cpu'], ['cpu'])
async function encodeToAudioFeatures(samples: Float32Array) {
const featureVectors: Float32Array[] = []
for (let i = 0; i < samples.length; i += 16000 * 30) {
const startSampleIndex = i
const endSampleIndex = Math.min(samples.length, i + 16000 * 30)
const partSampleCount = endSampleIndex - startSampleIndex
const audioPart = samples.subarray(startSampleIndex, endSampleIndex)
const rawAudioForPart = { audioChannels: [audioPart], sampleRate: 16000 } as RawAudio
const resultTensor = await whisper.encodeAudio(rawAudioForPart)
const vectorLength = resultTensor.dims[2]
let featureVectorsForPart = splitFloat32Array(resultTensor.data as Float32Array, vectorLength)
featureVectorsForPart = featureVectorsForPart.slice(0, Math.floor((partSampleCount / (16000 * 30)) * 1500))
featureVectors.push(...featureVectorsForPart)
}
return featureVectors
}
logger.start(`Extract source audio embeddings using the Whisper encoder model`)
sourceEmbeddings = await encodeToAudioFeatures(sourceSamples)
logger.start(`Extract reference audio embeddings using the Whisper encoder model`)
referenceEmbeddings = await encodeToAudioFeatures(referenceSamples)
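// The Whisper encoder outputs 1500 frames per 30-second window: 50 frames per second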
framesPerSecond = 1500 / 30
} else {
throw new Error(`Unknown embedding type: ${embeddingType}`)
}
logger.start(`Align source and reference audio embeddings using DTW`)
const { path: alignmentPath } = alignDTWWindowed(
referenceEmbeddings,
sourceEmbeddings,
cosineDistance,
1000 * 1000
)
const compactedPath = compactPath(alignmentPath)
logger.start('\nConvert path to timeline')
const mappedTimeline = referenceTimeline.map(entry => getMappedTimelineEntry(entry, sourceRawAudio, framesPerSecond, compactedPath))
logger.end()
return mappedTimeline
}
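// Maps a single reference timeline entry to source audio time using the compacted DTW path,
// recursing into nested timelines and trimming leading and trailing silence from the mapped range.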
function getMappedTimelineEntry(
timelineEntry: TimelineEntry,
sourceRawAudio: RawAudio,
framesPerSecond: number,
compactedPath: CompactedPath,
recurse = true): TimelineEntry {
const referenceStartFrameIndex = Math.floor(timelineEntry.startTime * framesPerSecond)
const referenceEndFrameIndex = Math.floor(timelineEntry.endTime * framesPerSecond)
if (referenceStartFrameIndex < 0 || referenceEndFrameIndex < 0) {
throw new Error('Unexpected: encountered a negative timestamp in timeline')
}
const mappedStartFrameIndex = getMappedFrameIndexForPath(referenceStartFrameIndex, compactedPath, 'first')
const mappedEndFrameIndex = getMappedFrameIndexForPath(referenceEndFrameIndex, compactedPath, 'first')
let innerTimeline: Timeline | undefined
if (recurse && timelineEntry.timeline != null) {
innerTimeline = timelineEntry.timeline.map((entry) => getMappedTimelineEntry(entry, sourceRawAudio, framesPerSecond, compactedPath, recurse))
}
// Trim silent samples from start and end of mapped entry range
const sourceSamplesPerFrame = Math.floor(sourceRawAudio.sampleRate / framesPerSecond)
let startSampleIndex = mappedStartFrameIndex * sourceSamplesPerFrame
let endSampleIndex = mappedEndFrameIndex * sourceSamplesPerFrame
const frameSamples = sourceRawAudio.audioChannels[0].subarray(startSampleIndex, endSampleIndex)
const silenceThresholdDecibels = -40
startSampleIndex += getStartingSilentSampleCount(frameSamples, silenceThresholdDecibels)
endSampleIndex -= getEndingSilentSampleCount(frameSamples, silenceThresholdDecibels)
endSampleIndex = Math.max(endSampleIndex, startSampleIndex)
// Build mapped timeline entry
const startTime = startSampleIndex / sourceRawAudio.sampleRate
const endTime = endSampleIndex / sourceRawAudio.sampleRate
return {
type: timelineEntry.type,
text: timelineEntry.text,
startTime,
endTime,
timeline: innerTimeline
}
}
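// Adds phone timelines to the source word timeline by linearly scaling each reference word's
// phone timings to the corresponding source word's duration.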
export async function interpolatePhoneTimelines(sourceTimeline: Timeline, referenceTimeline: Timeline) {
const interpolatedTimeline: Timeline = []
for (let wordEntryIndex = 0; wordEntryIndex < sourceTimeline.length; wordEntryIndex++) {
const referenceEntry = referenceTimeline[wordEntryIndex]
const interpolatedEntry = deepClone(sourceTimeline[wordEntryIndex])
interpolatedTimeline.push(interpolatedEntry)
if (interpolatedEntry.type != 'word') {
continue
}
const interpolatedEntryDuration = interpolatedEntry.endTime - interpolatedEntry.startTime
const synthesisEntryDuration = referenceEntry.endTime - referenceEntry.startTime
function mapEntry(targetEntry: TimelineEntry): TimelineEntry {
const targetStartTimePercentageRelativeToWord =
(targetEntry.startTime - referenceEntry.startTime) / synthesisEntryDuration
const targetEndTimePercentageRelativeToWord =
(targetEntry.endTime - referenceEntry.startTime) / synthesisEntryDuration
const interpolatedStartTime =
interpolatedEntry.startTime + (zeroIfNaN(targetStartTimePercentageRelativeToWord) * interpolatedEntryDuration)
const interpolatedEndTime =
interpolatedEntry.startTime + (zeroIfNaN(targetEndTimePercentageRelativeToWord) * interpolatedEntryDuration)
return {
...targetEntry,
startTime: interpolatedStartTime,
endTime: interpolatedEndTime
}
}
const interpolatedPhoneEntries: Timeline = []
for (const phoneEntry of (referenceEntry.timeline || [])) {
interpolatedPhoneEntries.push(mapEntry(phoneEntry))
}
interpolatedEntry.timeline = interpolatedPhoneEntries
}
return interpolatedTimeline
}
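// Adds phone timelines to the source word timeline by aligning each source word's MFCC features
// to the corresponding reference word's MFCC features using DTW.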
export async function alignPhoneTimelines(
sourceRawAudio: RawAudio,
sourceWordTimeline: Timeline,
referenceRawAudio: RawAudio,
referenceTimeline: Timeline,
windowDuration: number) {
const mfccOptions: MfccOptions = extendDefaultMfccOptions({ zeroFirstCoefficient: true })
const framesPerSecond = 1 / mfccOptions.hopDuration!
const referenceMfccs = await computeMFCCs(referenceRawAudio, mfccOptions)
const sourceMfccs = await computeMFCCs(sourceRawAudio, mfccOptions)
const alignedWordTimeline: Timeline = []
for (let i = 0; i < referenceTimeline.length; i++) {
const referenceWordEntry = referenceTimeline[i]
const alignedWordEntry = deepClone(sourceWordTimeline[i])
if (alignedWordEntry.type != 'word') {
continue
}
const referenceWordStartFrameIndex = Math.floor(referenceWordEntry.startTime * framesPerSecond)
let referenceWordEndFrameIndex = Math.floor(referenceWordEntry.endTime * framesPerSecond)
// Ensure there is at least one frame in range
if (referenceWordEndFrameIndex <= referenceWordStartFrameIndex) {
referenceWordEndFrameIndex = referenceWordStartFrameIndex + 1
}
const referenceWordMfccs = referenceMfccs.slice(referenceWordStartFrameIndex, referenceWordEndFrameIndex)
const alignedWordStartFrameIndex = Math.floor(alignedWordEntry.startTime * framesPerSecond)
let alignedWordEndFrameIndex = Math.floor(alignedWordEntry.endTime * framesPerSecond)
// Ensure there is at least one frame in range
if (alignedWordEndFrameIndex <= alignedWordStartFrameIndex) {
alignedWordEndFrameIndex = alignedWordStartFrameIndex + 1
}
const sourceWordMfccs = sourceMfccs.slice(alignedWordStartFrameIndex, alignedWordEndFrameIndex)
// Compute DTW path
const rawPath = await alignMFCC_DTW(referenceWordMfccs, sourceWordMfccs, windowDuration * framesPerSecond)
const compactedPath = compactPath(rawPath)
function mapEntry(referenceEntry: TimelineEntry): TimelineEntry {
const referenceStartFrameOffset = Math.floor((referenceEntry.startTime - referenceWordEntry.startTime) * framesPerSecond)
const alignedStartFrameOffset = getMappedFrameIndexForPath(referenceStartFrameOffset, compactedPath)
const alignedStartTime = alignedWordEntry.startTime + (alignedStartFrameOffset / framesPerSecond)
const referenceEndFrameOffset = Math.floor((referenceEntry.endTime - referenceWordEntry.startTime) * framesPerSecond)
const alignedEndFrameOffset = getMappedFrameIndexForPath(referenceEndFrameOffset, compactedPath)
const alignedEndTime = alignedWordEntry.startTime + (alignedEndFrameOffset / framesPerSecond)
return {
...referenceEntry,
startTime: alignedStartTime,
endTime: alignedEndTime
}
}
// Add phone timeline using the mapped time information
const alignedPhoneTimeline: Timeline = []
for (const referencePhoneEntry of (referenceWordEntry.timeline || [])) {
alignedPhoneTimeline.push(mapEntry(referencePhoneEntry))
}
alignedWordEntry.timeline = alignedPhoneTimeline
alignedWordTimeline.push(alignedWordEntry)
}
return alignedWordTimeline
}
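// Synthesizes the given text fragments with eSpeak, in chunks, and returns the concatenated
// audio, a flattened word-level timeline (with phone sub-timelines), and the eSpeak events,
// to be used as an alignment reference.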
export async function createAlignmentReferenceUsingEspeakForFragments(fragments: string[], espeakOptions: EspeakOptions) {
const progressLogger = new Logger()
progressLogger.start("Load espeak module")
const Espeak = await import("../synthesis/EspeakTTS.js")
progressLogger.start("Synthesize alignment reference using eSpeak")
const result = {
rawAudio: getEmptyRawAudio(1, await Espeak.getSampleRate()) as RawAudio,
timeline: [] as Timeline,
events: [] as EspeakEvent[],
}
{
// Split fragments to chunks, process each chunk individually,
// and incrementally merge the chunks to the final result.
const maxCharactersInChunk = 1000
let timeOffset = 0
let currentChunk: string[] = []
let currentChunkCharacterCount = 0
for (let fragmentIndex = 0; fragmentIndex < fragments.length; fragmentIndex++) {
const fragment = fragments[fragmentIndex]
currentChunk.push(fragment)
currentChunkCharacterCount += fragment.length
if (currentChunkCharacterCount >= maxCharactersInChunk || fragmentIndex === fragments.length - 1) {
// Process current chunk
const chunkResult = await Espeak.synthesizeFragments(currentChunk, espeakOptions)
result.rawAudio = {
sampleRate: result.rawAudio.sampleRate,
audioChannels: concatAudioSegments([result.rawAudio.audioChannels, chunkResult.rawAudio.audioChannels])
}
const chunkTimeline = addTimeOffsetToTimeline(chunkResult.timeline, timeOffset)
result.timeline = [...result.timeline, ...chunkTimeline]
result.events = [...result.events, ...chunkResult.events]
timeOffset += getRawAudioDuration(chunkResult.rawAudio)
currentChunk = []
currentChunkCharacterCount = 0
}
}
}
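// Flatten the timeline: clauses to words, and each word's tokens to phones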
result.timeline = result.timeline.flatMap(clause => clause.timeline!)
for (const wordEntry of result.timeline) {
wordEntry.timeline = wordEntry.timeline!.flatMap(tokenEntry => tokenEntry.timeline!)
}
progressLogger.end()
return result
}
export async function createAlignmentReferenceUsingEspeak(
transcript: string,
language: string,
plaintextOptions?: API.PlainTextOptions,
customLexiconPaths?: string[],
insertSeparators?: boolean,
useKlatt?: boolean) {
const logger = new Logger()
logger.start('Synthesize alignment reference using eSpeak')
const synthesisOptions: API.SynthesisOptions = {
engine: 'espeak',
language,
plainText: plaintextOptions,
customLexiconPaths: customLexiconPaths,
espeak: {
useKlatt,
insertSeparators,
}
}
let {
audio: referenceRawAudio,
timeline: segmentTimeline,
voice: espeakVoice
} = await synthesize(transcript, synthesisOptions)
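// Flatten the segment timeline to sentences, then to words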
const sentenceTimeline = segmentTimeline.flatMap(entry => entry.timeline!)
const wordTimeline = sentenceTimeline.flatMap(entry => entry.timeline!)
referenceRawAudio = await resampleAudioSpeex(referenceRawAudio as RawAudio, 16000)
referenceRawAudio = downmixToMonoAndNormalize(referenceRawAudio)
logger.end()
return { referenceRawAudio, referenceTimeline: wordTimeline, espeakVoice }
}
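// Converts a raw DTW path to a compacted form: for each reference frame index,
// the first and last source frame indexes mapped to it.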
function compactPath(path: AlignmentPath) {
const compactedPath: CompactedPath = []
for (let i = 0; i < path.length; i++) {
const pathEntry = path[i]
if (compactedPath.length <= pathEntry.source) {
compactedPath.push({ first: pathEntry.dest, last: pathEntry.dest })
} else {
compactedPath[compactedPath.length - 1].last = pathEntry.dest
}
}
return compactedPath
}
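// Looks up the source frame index that a given reference frame index maps to in the compacted path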
function getMappedFrameIndexForPath(referenceFrameIndex: number, compactedPath: CompactedPath, mappingKind: 'first' | 'last' = 'first') {
if (compactedPath.length == 0) {
return 0
}
referenceFrameIndex = clip(referenceFrameIndex, 0, compactedPath.length - 1)
const compactedPathEntry = compactedPath[referenceFrameIndex]
let mappedFrameIndex: number
if (mappingKind == 'first') {
mappedFrameIndex = compactedPathEntry.first
} else {
mappedFrameIndex = compactedPathEntry.last
}
return mappedFrameIndex
}
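// Returns the MFCC window, hop and FFT settings to use for a given DTW granularity level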
export function getMfccOptionsForGranularity(granularity: DtwGranularity) {
let mfccOptions: MfccOptions
if (granularity == 'xx-low') {
mfccOptions = { windowDuration: 0.400, hopDuration: 0.160, fftOrder: 8192 }
} else if (granularity == 'x-low') {
mfccOptions = { windowDuration: 0.200, hopDuration: 0.080, fftOrder: 4096 }
} else if (granularity == 'low') {
mfccOptions = { windowDuration: 0.100, hopDuration: 0.040, fftOrder: 2048 }
} else if (granularity == 'medium') {
mfccOptions = { windowDuration: 0.050, hopDuration: 0.020, fftOrder: 1024 }
} else if (granularity == 'high') {
mfccOptions = { windowDuration: 0.025, hopDuration: 0.010, fftOrder: 512 }
} else if (granularity == 'x-high') {
mfccOptions = { windowDuration: 0.020, hopDuration: 0.005, fftOrder: 512 }
} else {
throw new Error(`Invalid granularity setting: '${granularity}'`)
}
return mfccOptions
}
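// A raw DTW alignment path: a sequence of matched frame index pairs between the two sequences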
export type AlignmentPath = AlignmentPathEntry[]
export type AlignmentPathEntry = {
source: number,
dest: number
}
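// A compacted path: for each frame index of the first sequence, the range of matched frame indexes in the second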
export type CompactedPath = CompactedPathEntry[]
export type CompactedPathEntry = {
first: number, last: number
}
export type DtwGranularity = 'xx-low' | 'x-low' | 'low' | 'medium' | 'high' | 'x-high'