// echogarden: SpeechAlignment.js
//
// An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment,
// speech translation, language detection, source separation and more.

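// Usage sketch (illustrative only, assuming the package's RawAudio convention of
// { audioChannels, sampleRate }): a reference is synthesized with eSpeak from a transcript,
// then the recording is aligned to it with multi-pass DTW. The transcript, language,
// granularities and window durations below are placeholder values, not recommended defaults.
//
//   const { referenceRawAudio, referenceTimeline } =
//       await createAlignmentReferenceUsingEspeak('Hello world', 'en', undefined, undefined, true, false);
//
//   // sourceRawAudio: a RawAudio object holding the recording to align
//   const wordTimeline = await alignUsingDtw(
//       sourceRawAudio, referenceRawAudio, referenceTimeline, ['low', 'high'], [60, 10]);
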
import { clip, splitFloat32Array } from '../utilities/Utilities.js';
import { computeMFCCs, extendDefaultMfccOptions } from '../dsp/MFCC.js';
import { alignMFCC_DTW, getCostMatrixMemorySizeMB } from './DTWMfccSequenceAlignment.js';
import { Logger } from '../utilities/Logger.js';
import { addTimeOffsetToTimeline } from '../utilities/Timeline.js';
import { concatAudioSegments, downmixToMonoAndNormalize, getEmptyRawAudio, getEndingSilentSampleCount, getRawAudioDuration, getStartingSilentSampleCount } from '../audio/AudioUtilities.js';
import chalk from 'chalk';
import { synthesize } from '../api/API.js';
import { resampleAudioSpeex } from '../dsp/SpeexResampler.js';
import { deepClone } from '../utilities/ObjectUtilities.js';
import { cosineDistance, zeroIfNaN } from '../math/VectorMath.js';
import { alignDTWWindowed } from './DTWSequenceAlignmentWindowed.js';
import { loadPackage } from '../utilities/PackageManager.js';
import { joinPath } from '../utilities/PathUtilities.js';

export async function alignUsingDtw(sourceRawAudio, referenceRawAudio, referenceTimeline, granularities, windowDurations) {
	const logger = new Logger();

	if (windowDurations.length == 0) {
		throw new Error(`Window durations array has length 0.`);
	}

	if (windowDurations.length != granularities.length) {
		throw new Error(`Window durations and granularities are not the same length.`);
	}

	const rawAudioDuration = getRawAudioDuration(sourceRawAudio);

	let framesPerSecond;
	let compactedPath;
	let relativeCenters;

	for (let passIndex = 0; passIndex < windowDurations.length; passIndex++) {
		const granularity = granularities[passIndex];
		const windowDuration = windowDurations[passIndex];

		logger.logTitledMessage(`\nStarting alignment pass ${passIndex + 1}/${windowDurations.length}`, `granularity: ${granularity}, max window duration: ${windowDuration}s (${(windowDuration / rawAudioDuration * 100).toFixed(1)}%)`, chalk.magentaBright);

		const mfccOptions = extendDefaultMfccOptions({ ...getMfccOptionsForGranularity(granularity), zeroFirstCoefficient: true });
		framesPerSecond = 1 / mfccOptions.hopDuration;

		// Compute reference MFCCs
		logger.start('Compute reference MFCC features');
		const referenceMfccs = await computeMFCCs(referenceRawAudio, mfccOptions);

		// Compute source MFCCs
		logger.start('Compute source MFCC features');
		const sourceMfccs = await computeMFCCs(sourceRawAudio, mfccOptions);
		logger.end();

		// Compute path
		logger.logTitledMessage(`DTW cost matrix memory size`, `${getCostMatrixMemorySizeMB(referenceMfccs.length, sourceMfccs.length, windowDuration * framesPerSecond).toFixed(1)}MB`);

		if (passIndex == 0) {
			const minRecommendedWindowDuration = 0.2 * rawAudioDuration;

			if (windowDuration < minRecommendedWindowDuration) {
				logger.logTitledMessage('Warning', `Maximum DTW window duration is set to ${windowDuration.toFixed(1)}s (${(windowDuration / rawAudioDuration * 100).toFixed(1)}%), which is less than 20% of the source audio duration (audio duration is ${rawAudioDuration.toFixed(1)}s and a 20% window would be ${(rawAudioDuration * 0.2).toFixed(1)}s). This may lead to suboptimal results in some cases.
Consider increasing window duration if needed.`, chalk.yellowBright, 'warning');
			}
		}

		logger.start('Align reference and source MFCC features using DTW');

		const dtwWindowLength = Math.floor(windowDuration * framesPerSecond);

		let centerIndexes;

		if (relativeCenters) {
			centerIndexes = [];

			for (let i = 0; i < referenceMfccs.length; i++) {
				const relativeReferencePosition = i / referenceMfccs.length;
				const relativeCenterIndex = Math.floor(relativeReferencePosition * relativeCenters.length);
				const relativeCenter = relativeCenters[relativeCenterIndex];
				const centerIndex = Math.floor(relativeCenter * sourceMfccs.length);

				centerIndexes.push(centerIndex);
			}
		}

		const rawPath = await alignMFCC_DTW(referenceMfccs, sourceMfccs, dtwWindowLength, undefined, centerIndexes);

		compactedPath = compactPath(rawPath);
		relativeCenters = compactedPath.map(entry => (entry.first + entry.last) / 2 / sourceMfccs.length);

		logger.end();
	}

	logger.start('\nConvert path to timeline');
	const mappedTimeline = referenceTimeline.map(entry => getMappedTimelineEntry(entry, sourceRawAudio, framesPerSecond, compactedPath));
	logger.end();

	return mappedTimeline;
}

export async function alignUsingDtwWithRecognition(sourceRawAudio, referenceRawAudio, referenceTimeline, recognitionTimeline, granularities, windowDurations, espeakOptions, phoneAlignmentMethod = 'interpolation') {
	const logger = new Logger();

	if (recognitionTimeline.length == 0) {
		const sourceDuration = getRawAudioDuration(sourceRawAudio);
		const referenceDuration = getRawAudioDuration(referenceRawAudio);

		const ratio = sourceDuration / referenceDuration;

		const interpolatedTimeline = [];

		for (const entry of referenceTimeline) {
			interpolatedTimeline.push({ type: entry.type, text: entry.text, startTime: entry.startTime * ratio, endTime: entry.endTime * ratio });
		}

		return interpolatedTimeline;
	}

	// Synthesize the recognized transcript and get its timeline
	logger.start("Synthesize recognized transcript with eSpeak");

	const recognizedWords = recognitionTimeline.map(entry => entry.text);

	const { rawAudio: synthesizedRecognizedTranscriptRawAudio, timeline: synthesizedRecognitionTimeline } = await createAlignmentReferenceUsingEspeakForFragments(recognizedWords, espeakOptions);

	let recognitionTimelineWithPhones;

	if (phoneAlignmentMethod == 'interpolation') {
		// Add phone timelines by interpolating from reference words
		logger.start('Interpolate phone timing');

		recognitionTimelineWithPhones = await interpolatePhoneTimelines(recognitionTimeline, synthesizedRecognitionTimeline);
	}
	else if (phoneAlignmentMethod == 'dtw') {
		logger.start('Align phone timing');

		// Add phone timelines by aligning each individual recognized word with the corresponding word
		// in the reference timeline
		recognitionTimelineWithPhones = await alignPhoneTimelines(sourceRawAudio, recognitionTimeline, synthesizedRecognizedTranscriptRawAudio, synthesizedRecognitionTimeline, 60);
	}
	else {
		throw new Error(`Unknown phone alignment method: ${phoneAlignmentMethod}`);
	}

	// Create a mapping from the synthesized recognized timeline to the recognized timeline
	logger.start("Map from the synthesized recognized timeline to the recognized timeline");

	const synthesizedToRecognizedTimeMapping = [];

	for (let wordEntryIndex = 0; wordEntryIndex < synthesizedRecognitionTimeline.length; wordEntryIndex++) {
		const synthesizedTimelineEntry = synthesizedRecognitionTimeline[wordEntryIndex];
		const recognitionTimelineEntry = recognitionTimelineWithPhones[wordEntryIndex];

		synthesizedToRecognizedTimeMapping.push({ synthesized: synthesizedTimelineEntry.startTime, recognized: recognitionTimelineEntry.startTime });

		if (synthesizedTimelineEntry.timeline) {
			for (let tokenEntryIndex = 0; tokenEntryIndex < synthesizedTimelineEntry.timeline.length; tokenEntryIndex++) {
				const synthesizedPhoneTimelineEntry = synthesizedTimelineEntry.timeline[tokenEntryIndex];
				const recognitionPhoneTimelineEntry = recognitionTimelineEntry.timeline[tokenEntryIndex];

				synthesizedToRecognizedTimeMapping.push({ synthesized: synthesizedPhoneTimelineEntry.startTime, recognized: recognitionPhoneTimelineEntry.startTime });
				synthesizedToRecognizedTimeMapping.push({ synthesized: synthesizedPhoneTimelineEntry.endTime, recognized: recognitionPhoneTimelineEntry.endTime });
			}
		}

		synthesizedToRecognizedTimeMapping.push({ synthesized: synthesizedTimelineEntry.endTime, recognized: recognitionTimelineEntry.endTime });
	}

	// Align the synthesized recognized transcript to the synthesized reference transcript
	logger.start("Align the synthesized recognized transcript with the synthesized ground-truth transcript");

	const alignedSynthesizedRecognitionTimeline = await alignUsingDtw(synthesizedRecognizedTranscriptRawAudio, referenceRawAudio, referenceTimeline, granularities, windowDurations);

	let currentSynthesizedToRecognizedMappingIndex = 0;

	// Map from synthesized reference timestamps to the recognition timestamps
	function mapSynthesizedToRecognizedTimeAndAdvance(synthesizedTime) {
		for (;; currentSynthesizedToRecognizedMappingIndex += 1) {
			const left = synthesizedToRecognizedTimeMapping[currentSynthesizedToRecognizedMappingIndex].synthesized;

			let right;

			if (currentSynthesizedToRecognizedMappingIndex < synthesizedToRecognizedTimeMapping.length - 1) {
				right = synthesizedToRecognizedTimeMapping[currentSynthesizedToRecognizedMappingIndex + 1].synthesized;
			}
			else {
				right = Infinity;
			}

			if (left > right) {
				throw new Error("Left is larger than right!");
			}

			if (Math.abs(synthesizedTime - left) < Math.abs(synthesizedTime - right)) {
				return synthesizedToRecognizedTimeMapping[currentSynthesizedToRecognizedMappingIndex].recognized;
			}
		}
	}

	function mapTimeline(timeline) {
		const mappedTimeline = [];

		for (const entry of timeline) {
			const mappedEntry = { ...entry };

			mappedEntry.startTime = mapSynthesizedToRecognizedTimeAndAdvance(entry.startTime);

			if (entry.timeline) {
				mappedEntry.timeline = mapTimeline(entry.timeline);
			}

			mappedEntry.endTime = mapSynthesizedToRecognizedTimeAndAdvance(entry.endTime);

			mappedTimeline.push(mappedEntry);
		}

		return mappedTimeline;
	}

	const result = mapTimeline(alignedSynthesizedRecognitionTimeline);

	logger.end();

	return result;
}

// This is experimental code. It doesn't work well enough to be usable for anything.
// Just testing some alternative approaches.
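//
// Note on the frame rates used below: both embedding front-ends land on 50 frames per second.
// For w2v-bert-2.0, framesPerSecond = 1000 / 10 / 2 = 50 (presumably 10 ms hops halved by the
// model's internal downsampling), and for the Whisper encoder, framesPerSecond = 1500 / 30 = 50,
// since it emits 1500 feature frames per 30-second window. A mapped frame index i therefore
// corresponds to roughly i / 50 seconds when the DTW path is converted back to a timeline.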
export async function alignUsingDtwWithEmbeddings(sourceRawAudio, referenceRawAudio, referenceTimeline, language, granularities, windowDurations) {
	const logger = new Logger();

	if (sourceRawAudio.sampleRate != 16000) {
		throw new Error('Source audio must have a sample rate of 16000 Hz');
	}

	if (referenceRawAudio.sampleRate != 16000) {
		throw new Error('Reference audio must have a sample rate of 16000 Hz');
	}

	const embeddingType = 'w2v-bert-2.0';

	let sourceEmbeddings;
	let referenceEmbeddings;
	let framesPerSecond;

	if (embeddingType === 'w2v-bert-2.0') {
		const packageName = 'w2v-bert-2.0-uint8';
		const modelDir = await loadPackage(packageName);
		const modelFilePath = joinPath(modelDir, `${packageName}.onnx`);

		const { Wav2Vec2BertFeatureEmbeddings } = await import('../speech-embeddings/WavToVec2BertFeatureEmbeddings.js');

		const wav2vecBert = new Wav2Vec2BertFeatureEmbeddings(modelFilePath, ['cpu']);

		logger.start(`Extract source audio embeddings using the W2V-BERT-2.0 model`);
		sourceEmbeddings = await wav2vecBert.computeEmbeddings(sourceRawAudio);

		logger.start(`Extract reference audio embeddings using the W2V-BERT-2.0 model`);
		referenceEmbeddings = await wav2vecBert.computeEmbeddings(referenceRawAudio);

		framesPerSecond = 1000 / 10 / 2;
	}
	else if (embeddingType === 'whisper') {
		const sourceSamples = sourceRawAudio.audioChannels[0];
		const referenceSamples = referenceRawAudio.audioChannels[0];

		const WhisperSTT = await import(`../recognition/WhisperSTT.js`);

		const { modelName, modelDir } = await WhisperSTT.loadPackagesAndGetPaths('base.en', language);

		const whisper = new WhisperSTT.Whisper(modelName, modelDir, ['dml', 'cpu'], ['cpu']);

		async function encodeToAudioFeatures(samples) {
			const featureVectors = [];

			for (let i = 0; i < samples.length; i += 16000 * 30) {
				const startSampleIndex = i;
				const endSampleIndex = Math.min(samples.length, i + 16000 * 30);
				const partSampleCount = endSampleIndex - startSampleIndex;

				const audioPart = samples.subarray(startSampleIndex, endSampleIndex);

				const rawAudioForPart = { audioChannels: [audioPart], sampleRate: 16000 };

				const resultTensor = await whisper.encodeAudio(rawAudioForPart);
				const vectorLength = resultTensor.dims[2];

				let featureVectorsForPart = splitFloat32Array(resultTensor.data, vectorLength);
				featureVectorsForPart = featureVectorsForPart.slice(0, Math.floor((partSampleCount / (16000 * 30)) * 1500));

				featureVectors.push(...featureVectorsForPart);
			}

			return featureVectors;
		}

		logger.start(`Extract source audio embeddings using the Whisper encoder model`);
		sourceEmbeddings = await encodeToAudioFeatures(sourceSamples);

		logger.start(`Extract reference audio embeddings using the Whisper encoder model`);
		referenceEmbeddings = await encodeToAudioFeatures(referenceSamples);

		framesPerSecond = 1500 / 30;
	}
	else {
		throw new Error(`Unknown embedding type: ${embeddingType}`);
	}

	logger.start(`Align source and reference audio embeddings using DTW`);

	const { path: alignmentPath } = alignDTWWindowed(referenceEmbeddings, sourceEmbeddings, cosineDistance, 1000 * 1000);

	const compactedPath = compactPath(alignmentPath);

	logger.start('\nConvert path to timeline');
	const mappedTimeline = referenceTimeline.map(entry => getMappedTimelineEntry(entry, sourceRawAudio, framesPerSecond, compactedPath));
	logger.end();

	return mappedTimeline;
}

function getMappedTimelineEntry(timelineEntry, sourceRawAudio, framesPerSecond, compactedPath, recurse = true) {
	const referenceStartFrameIndex = Math.floor(timelineEntry.startTime * framesPerSecond);
	const referenceEndFrameIndex = Math.floor(timelineEntry.endTime * framesPerSecond);

	if (referenceStartFrameIndex < 0 || referenceEndFrameIndex < 0) {
		throw new Error('Unexpected: encountered a negative timestamp in timeline');
	}

	const mappedStartFrameIndex = getMappedFrameIndexForPath(referenceStartFrameIndex, compactedPath, 'first');
	const mappedEndFrameIndex = getMappedFrameIndexForPath(referenceEndFrameIndex, compactedPath, 'first');

	let innerTimeline;

	if (recurse && timelineEntry.timeline != null) {
		innerTimeline = timelineEntry.timeline.map((entry) => getMappedTimelineEntry(entry, sourceRawAudio, framesPerSecond, compactedPath, recurse));
	}

	// Trim silent samples from start and end of mapped entry range
	const sourceSamplesPerFrame = Math.floor(sourceRawAudio.sampleRate / framesPerSecond);

	let startSampleIndex = mappedStartFrameIndex * sourceSamplesPerFrame;
	let endSampleIndex = mappedEndFrameIndex * sourceSamplesPerFrame;

	const frameSamples = sourceRawAudio.audioChannels[0].subarray(startSampleIndex, endSampleIndex);

	const silenceThresholdDecibels = -40;

	startSampleIndex += getStartingSilentSampleCount(frameSamples, silenceThresholdDecibels);
	endSampleIndex -= getEndingSilentSampleCount(frameSamples, silenceThresholdDecibels);
	endSampleIndex = Math.max(endSampleIndex, startSampleIndex);

	// Build mapped timeline entry
	const startTime = startSampleIndex / sourceRawAudio.sampleRate;
	const endTime = endSampleIndex / sourceRawAudio.sampleRate;

	return { type: timelineEntry.type, text: timelineEntry.text, startTime, endTime, timeline: innerTimeline };
}

export async function interpolatePhoneTimelines(sourceTimeline, referenceTimeline) {
	const interpolatedTimeline = [];

	for (let wordEntryIndex = 0; wordEntryIndex < sourceTimeline.length; wordEntryIndex++) {
		const referenceEntry = referenceTimeline[wordEntryIndex];

		const interpolatedEntry = deepClone(sourceTimeline[wordEntryIndex]);
		interpolatedTimeline.push(interpolatedEntry);

		if (interpolatedEntry.type != 'word') {
			continue;
		}

		const interpolatedEntryDuration = interpolatedEntry.endTime - interpolatedEntry.startTime;
		const synthesisEntryDuration = referenceEntry.endTime - referenceEntry.startTime;

		function mapEntry(targetEntry) {
			const targetStartTimePercentageRelativeToWord = (targetEntry.startTime - referenceEntry.startTime) / synthesisEntryDuration;
			const targetEndTimePercentageRelativeToWord = (targetEntry.endTime - referenceEntry.startTime) / synthesisEntryDuration;

			const interpolatedStartTime = interpolatedEntry.startTime + (zeroIfNaN(targetStartTimePercentageRelativeToWord) * interpolatedEntryDuration);
			const interpolatedEndTime = interpolatedEntry.startTime + (zeroIfNaN(targetEndTimePercentageRelativeToWord) * interpolatedEntryDuration);

			return { ...targetEntry, startTime: interpolatedStartTime, endTime: interpolatedEndTime };
		}

		const interpolatedPhoneEntries = [];

		for (const phoneEntry of (referenceEntry.timeline || [])) {
			interpolatedPhoneEntries.push(mapEntry(phoneEntry));
		}

		interpolatedEntry.timeline = interpolatedPhoneEntries;
	}

	return interpolatedTimeline;
}

export async function alignPhoneTimelines(sourceRawAudio, sourceWordTimeline, referenceRawAudio, referenceTimeline, windowDuration) {
	const mfccOptions = extendDefaultMfccOptions({ zeroFirstCoefficient: true });
	const framesPerSecond = 1 / mfccOptions.hopDuration;

	const referenceMfccs = await computeMFCCs(referenceRawAudio, mfccOptions);
	const sourceMfccs = await computeMFCCs(sourceRawAudio, mfccOptions);

	const alignedWordTimeline = [];

	for (let i = 0; i < referenceTimeline.length; i++) {
		const referenceWordEntry = referenceTimeline[i];
		const alignedWordEntry = deepClone(sourceWordTimeline[i]);

		if (alignedWordEntry.type != 'word') {
			continue;
		}

		const referenceWordStartFrameIndex = Math.floor(referenceWordEntry.startTime * framesPerSecond);
		let referenceWordEndFrameIndex = Math.floor(referenceWordEntry.endTime * framesPerSecond);

		// Ensure there is at least one frame in range
		if (referenceWordEndFrameIndex <= referenceWordStartFrameIndex) {
			referenceWordEndFrameIndex = referenceWordEndFrameIndex + 1;
		}

		const referenceWordMfccs = referenceMfccs.slice(referenceWordStartFrameIndex, referenceWordEndFrameIndex);

		const alignedWordStartFrameIndex = Math.floor(alignedWordEntry.startTime * framesPerSecond);
		let alignedWordEndFrameIndex = Math.floor(alignedWordEntry.endTime * framesPerSecond);

		// Ensure there is at least one frame in range
		if (alignedWordEndFrameIndex <= alignedWordStartFrameIndex) {
			alignedWordEndFrameIndex = alignedWordStartFrameIndex + 1;
		}

		const sourceWordMfccs = sourceMfccs.slice(alignedWordStartFrameIndex, alignedWordEndFrameIndex);

		// Compute DTW path
		const rawPath = await alignMFCC_DTW(referenceWordMfccs, sourceWordMfccs, windowDuration * framesPerSecond);
		const compactedPath = compactPath(rawPath);

		function mapEntry(referenceEntry) {
			const referenceStartFrameOffset = Math.floor((referenceEntry.startTime - referenceWordEntry.startTime) * framesPerSecond);
			const alignedStartFrameOffset = getMappedFrameIndexForPath(referenceStartFrameOffset, compactedPath);
			const alignedStartTime = alignedWordEntry.startTime + (alignedStartFrameOffset / framesPerSecond);

			const referenceEndFrameOffset = Math.floor((referenceEntry.endTime - referenceWordEntry.startTime) * framesPerSecond);
			const alignedEndFrameOffset = getMappedFrameIndexForPath(referenceEndFrameOffset, compactedPath);
			const alignedEndTime = alignedWordEntry.startTime + (alignedEndFrameOffset / framesPerSecond);

			return { ...referenceEntry, startTime: alignedStartTime, endTime: alignedEndTime };
		}

		// Add phone timeline using the mapped time information
		const alignedPhoneTimeline = [];

		for (const referencePhoneEntry of (referenceWordEntry.timeline || [])) {
			alignedPhoneTimeline.push(mapEntry(referencePhoneEntry));
		}

		alignedWordEntry.timeline = alignedPhoneTimeline;

		alignedWordTimeline.push(alignedWordEntry);
	}

	return alignedWordTimeline;
}

export async function createAlignmentReferenceUsingEspeakForFragments(fragments, espeakOptions) {
	const progressLogger = new Logger();

	progressLogger.start("Load espeak module");
	const Espeak = await import("../synthesis/EspeakTTS.js");

	progressLogger.start("Synthesize alignment reference using eSpeak");

	const result = {
		rawAudio: getEmptyRawAudio(1, await Espeak.getSampleRate()),
		timeline: [],
		events: [],
	};

	{
		// Split fragments to chunks, process each chunk individually,
		// and incrementally merge the chunks to the final result.
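		// A chunk is flushed once its accumulated character count reaches the limit below, or
		// when the last fragment is reached. Each flushed chunk's audio is appended to the
		// combined audio, and its timeline entries are shifted by the total duration synthesized
		// so far (timeOffset) before being merged into the overall result.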
		const maxCharactersInChunk = 1000;

		let timeOffset = 0;

		let currentChunk = [];
		let currentChunkCharacterCount = 0;

		for (let fragmentIndex = 0; fragmentIndex < fragments.length; fragmentIndex++) {
			const fragment = fragments[fragmentIndex];

			currentChunk.push(fragment);
			currentChunkCharacterCount += fragment.length;

			if (currentChunkCharacterCount >= maxCharactersInChunk || fragmentIndex === fragments.length - 1) {
				// Process current chunk
				const chunkResult = await Espeak.synthesizeFragments(currentChunk, espeakOptions);

				result.rawAudio = { sampleRate: result.rawAudio.sampleRate, audioChannels: concatAudioSegments([result.rawAudio.audioChannels, chunkResult.rawAudio.audioChannels]) };

				const chunkTimeline = addTimeOffsetToTimeline(chunkResult.timeline, timeOffset);
				result.timeline = [...result.timeline, ...chunkTimeline];

				result.events = [...result.events, ...chunkResult.events];

				timeOffset += getRawAudioDuration(chunkResult.rawAudio);

				currentChunk = [];
				currentChunkCharacterCount = 0;
			}
		}
	}

	result.timeline = result.timeline.flatMap(clause => clause.timeline);

	for (const wordEntry of result.timeline) {
		wordEntry.timeline = wordEntry.timeline.flatMap(tokenEntry => tokenEntry.timeline);
	}

	progressLogger.end();

	return result;
}

export async function createAlignmentReferenceUsingEspeak(transcript, language, plaintextOptions, customLexiconPaths, insertSeparators, useKlatt) {
	const logger = new Logger();
	logger.start('Synthesize alignment reference using eSpeak');

	const synthesisOptions = {
		engine: 'espeak',
		language,

		plainText: plaintextOptions,
		customLexiconPaths: customLexiconPaths,

		espeak: {
			useKlatt,
			insertSeparators,
		}
	};

	let { audio: referenceRawAudio, timeline: segmentTimeline, voice: espeakVoice } = await synthesize(transcript, synthesisOptions);

	const sentenceTimeline = segmentTimeline.flatMap(entry => entry.timeline);
	const wordTimeline = sentenceTimeline.flatMap(entry => entry.timeline);

	referenceRawAudio = await resampleAudioSpeex(referenceRawAudio, 16000);
	referenceRawAudio = downmixToMonoAndNormalize(referenceRawAudio);

	logger.end();

	return { referenceRawAudio, referenceTimeline: wordTimeline, espeakVoice };
}

function compactPath(path) {
	const compactedPath = [];

	for (let i = 0; i < path.length; i++) {
		const pathEntry = path[i];

		if (compactedPath.length <= pathEntry.source) {
			compactedPath.push({ first: pathEntry.dest, last: pathEntry.dest });
		}
		else {
			compactedPath[compactedPath.length - 1].last = pathEntry.dest;
		}
	}

	return compactedPath;
}

function getMappedFrameIndexForPath(referenceFrameIndex, compactedPath, mappingKind = 'first') {
	if (compactedPath.length == 0) {
		return 0;
	}

	referenceFrameIndex = clip(referenceFrameIndex, 0, compactedPath.length - 1);

	const compactedPathEntry = compactedPath[referenceFrameIndex];

	let mappedFrameIndex;

	if (mappingKind == 'first') {
		mappedFrameIndex = compactedPathEntry.first;
	}
	else {
		mappedFrameIndex = compactedPathEntry.last;
	}

	return mappedFrameIndex;
}

export function getMfccOptionsForGranularity(granularity) {
	let mfccOptions;

	if (granularity == 'xx-low') {
		mfccOptions = { windowDuration: 0.400, hopDuration: 0.160, fftOrder: 8192 };
	}
	else if (granularity == 'x-low') {
		mfccOptions = { windowDuration: 0.200, hopDuration: 0.080, fftOrder: 4096 };
	}
	else if (granularity == 'low') {
		mfccOptions = { windowDuration: 0.100, hopDuration: 0.040, fftOrder: 2048 };
	}
	else if (granularity == 'medium') {
		mfccOptions = { windowDuration: 0.050, hopDuration: 0.020, fftOrder: 1024 };
	}
	else if (granularity == 'high') {
		mfccOptions = { windowDuration: 0.025, hopDuration: 0.010, fftOrder: 512 };
	}
	else if (granularity == 'x-high') {
		mfccOptions = { windowDuration: 0.020, hopDuration: 0.005, fftOrder: 512 };
	}
	else {
		throw new Error(`Invalid granularity setting: '${granularity}'`);
	}

	return mfccOptions;
}

//# sourceMappingURL=SpeechAlignment.js.map