echogarden

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
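
The module shown below implements the package's speech alignment API. As a rough usage sketch (assuming the top-level 'echogarden' package re-exports the align function defined in this file; the file name, transcript, and option values are illustrative):

import * as Echogarden from 'echogarden';

// Align an audio file against a known transcript using the default DTW engine.
// The input may be anything ensureRawAudio accepts (for example, a file path or raw audio).
const transcript = 'Hello world, this is a test.';

const { timeline, wordTimeline, language } = await Echogarden.align('speech.wav', transcript, {
	engine: 'dtw',   // 'dtw', 'dtw-ra', 'dtw-ea' (experimental), or 'whisper'
	language: 'en',  // omit to detect the language from the transcript text
	crop: true,      // crop using voice activity detection before aligning
});

// 'timeline' nests segments, sentences, and words; 'wordTimeline' is the flat word-level timeline.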

import { extendDeep } from '../utilities/ObjectUtilities.js';
import { logToStderr } from '../utilities/Utilities.js';
import { ensureRawAudio, getRawAudioDuration, normalizeAudioLevelInPlace, trimAudioEnd } from '../audio/AudioUtilities.js';
import { Logger } from '../utilities/Logger.js';
import * as API from './API.js';
import { addTimeOffsetToTimeline, addWordTextOffsetsToTimelineInPlace, wordTimelineToSegmentSentenceTimeline } from '../utilities/Timeline.js';
import { formatLanguageCodeWithName, getDefaultDialectForLanguageCodeIfPossible, getShortLanguageCode, parseLangIdentifier } from '../utilities/Locale.js';
import chalk from 'chalk';
import { alignUsingDtwWithEmbeddings, createAlignmentReferenceUsingEspeak } from '../alignment/SpeechAlignment.js';
import { defaultEspeakOptions } from '../synthesis/EspeakTTS.js';
import { isWord } from '../nlp/Segmentation.js';

const log = logToStderr;

export async function align(input, transcript, options) {
	const logger = new Logger();
	const startTimestamp = logger.getTimestamp();

	options = extendDeep(defaultAlignmentOptions, options);

	const inputRawAudio = await ensureRawAudio(input);

	let sourceRawAudio;
	let isolatedRawAudio;
	let backgroundRawAudio;

	if (options.isolate) {
		logger.log(``);
		logger.end();

		({ isolatedRawAudio, backgroundRawAudio } = await API.isolate(inputRawAudio, options.sourceSeparation));

		logger.end();
		logger.log(``);
		logger.start(`Resample audio to 16kHz mono`);

		sourceRawAudio = await ensureRawAudio(isolatedRawAudio, 16000, 1);
	}
	else {
		logger.start(`Resample audio to 16kHz mono`);

		sourceRawAudio = await ensureRawAudio(inputRawAudio, 16000, 1);
	}

	let sourceUncropTimeline;

	if (options.crop) {
		logger.start('Crop using voice activity detection');

		({ timeline: sourceUncropTimeline, croppedRawAudio: sourceRawAudio } = await API.detectVoiceActivity(sourceRawAudio, options.vad));

		logger.end();
	}

	logger.start('Normalize and trim audio');

	normalizeAudioLevelInPlace(sourceRawAudio);
	sourceRawAudio.audioChannels[0] = trimAudioEnd(sourceRawAudio.audioChannels[0]);

	logger.end();

	let language;

	if (options.language) {
		const languageData = await parseLangIdentifier(options.language);

		language = languageData.Name;

		logger.logTitledMessage('Language specified', formatLanguageCodeWithName(language));
	}
	else {
		logger.start('No language specified. Detect language using reference text');

		const { detectedLanguage } = await API.detectTextLanguage(transcript, options.languageDetection || {});

		language = detectedLanguage;

		logger.end();
		logger.logTitledMessage('Language detected', formatLanguageCodeWithName(language));
	}

	language = getDefaultDialectForLanguageCodeIfPossible(language);

	logger.start('Load alignment module');

	const { alignUsingDtwWithRecognition, alignUsingDtw } = await import('../alignment/SpeechAlignment.js');

	// Derives the DTW granularity levels and window durations (in seconds) for each alignment pass,
	// based on the given options and the source audio duration
	function getDtwWindowGranularitiesAndDurations() {
		const sourceAudioDuration = getRawAudioDuration(sourceRawAudio);

		let granularities;
		let windowDurations;

		const dtwOptions = options.dtw;

		if (typeof dtwOptions.granularity == 'string') {
			granularities = [dtwOptions.granularity];
		}
		else if (Array.isArray(dtwOptions.granularity)) {
			granularities = dtwOptions.granularity;
		}
		else {
			if (sourceAudioDuration < 1 * 60) {
				// If up to 1 minute, set granularity to high, single pass
				granularities = ['high'];
			}
			else if (sourceAudioDuration < 5 * 60) {
				// If up to 5 minutes, set granularity to medium, single pass
				granularities = ['medium'];
			}
			else if (sourceAudioDuration < 30 * 60) {
				// If up to 30 minutes, set granularity to low, single pass
				granularities = ['low'];
			}
			else {
				// Otherwise, use multipass processing, first with xx-low granularity, then low
				granularities = ['xx-low', 'low'];
			}
		}

		if (dtwOptions.windowDuration) {
			function tryParsePercentageWindowDuration(durationString) {
				durationString = durationString.trim();

				const parseResult = durationString.match(/^([0-9]+)%$/);

				if (parseResult == null) {
					throw new Error(`A DTW window duration, when provided as a string, must be formatted as an integer percentage value like '15%'.`);
				}

				const percentageValue = parseInt(parseResult[1]);

				if (percentageValue == null || isNaN(percentageValue) || percentageValue <= 0 || percentageValue > 100) {
					throw new Error(`A DTW window duration, when provided as a percentage value, must be between 0 (non-inclusive) and 100 (inclusive).`);
				}

				let durationSeconds = percentageValue / 100 * sourceAudioDuration;
				durationSeconds = Math.ceil(durationSeconds);
				durationSeconds = Math.min(durationSeconds, sourceAudioDuration);

				return durationSeconds;
			}

			if (typeof dtwOptions.windowDuration === 'number') {
				const duration = Math.min(dtwOptions.windowDuration, sourceAudioDuration);

				windowDurations = [duration];
			}
			else if (typeof dtwOptions.windowDuration === 'string') {
				const durationString = dtwOptions.windowDuration.trim();
				const durationSeconds = tryParsePercentageWindowDuration(durationString);

				windowDurations = [durationSeconds];
			}
			else if (Array.isArray(dtwOptions.windowDuration)) {
				const durationsValues = dtwOptions.windowDuration;

				if (durationsValues.length < 1) {
					throw new Error(`DTW window durations, when given as an array, must have at least one element.`);
				}

				const durations = [];

				for (const durationValue of durationsValues) {
					let durationSeconds;

					if (typeof durationValue === 'number') {
						durationSeconds = durationValue;
						durationSeconds = Math.min(durationSeconds, sourceAudioDuration);
					}
					else {
						durationSeconds = tryParsePercentageWindowDuration(durationValue);
					}

					durations.push(durationSeconds);
				}

				windowDurations = durations;
			}
			else {
				throw new Error(`'dtw.windowDuration' must be a number, a percentage string, or an array of numbers / percentage strings.`);
			}
		}
		else {
			if (granularities.length > 2) {
				throw new Error(`More than two passes were requested. This requires window durations to be explicitly specified for each pass, for example 'dtw.windowDuration=['20%',60,10]'.`);
			}

			if (sourceAudioDuration < 5 * 60) {
				// If up to 5 minutes, set window duration to one minute
				windowDurations = [60];
			}
			else if (sourceAudioDuration < 2.5 * 60 * 60) {
				// If less than 2.5 hours, set window duration to 20% of total duration
				windowDurations = [Math.ceil(sourceAudioDuration * 0.2)];
			}
			else {
				// Otherwise, set window duration to 30 minutes
				windowDurations = [30 * 60];
			}
		}

		// When two passes are used but only one window duration was derived,
		// default the second pass to a 15 second window
		if (granularities.length === 2 && windowDurations.length === 1) {
			windowDurations = [windowDurations[0], 15];
		}

		if (granularities.length != windowDurations.length) {
			throw new Error(`The option 'dtw.granularity' has ${granularities.length} values, but 'dtw.windowDuration' has ${windowDurations.length} values. The lengths should be equal.`);
		}

		return { windowDurations, granularities };
	}

	let mappedTimeline;

	switch (options.engine) {
		case 'dtw': {
			const { windowDurations, granularities } = getDtwWindowGranularitiesAndDurations();

			logger.end();

			const { referenceRawAudio, referenceTimeline } = await createAlignmentReferenceUsingEspeak(transcript, language, options.plainText, options.customLexiconPaths, false, false);

			logger.end();

			mappedTimeline = await alignUsingDtw(sourceRawAudio, referenceRawAudio, referenceTimeline, granularities, windowDurations);

			break;
		}

		case 'dtw-ra': {
			const { windowDurations, granularities } = getDtwWindowGranularitiesAndDurations();

			logger.end();

			const recognitionOptions = extendDeep({ crop: options.crop, language }, options.recognition);

			// Recognize source audio
			let { wordTimeline: recognitionTimeline } = await API.recognize(sourceRawAudio, recognitionOptions);

			logger.log('');

			// Remove non-word entries from recognition timeline
			recognitionTimeline = recognitionTimeline.filter(entry => isWord(entry.text));

			// Synthesize the ground-truth transcript and get its timeline
			logger.start('Synthesize ground-truth transcript with eSpeak');

			const { referenceRawAudio, referenceTimeline, espeakVoice } = await createAlignmentReferenceUsingEspeak(transcript, language, options.plainText, options.customLexiconPaths, false, false);

			logger.end();

			const phoneAlignmentMethod = options.dtw.phoneAlignmentMethod;

			const espeakOptions = { ...defaultEspeakOptions, voice: espeakVoice, useKlatt: false, insertSeparators: true };

			// Align the ground-truth transcript and the recognized transcript
			mappedTimeline = await alignUsingDtwWithRecognition(sourceRawAudio, referenceRawAudio, referenceTimeline, recognitionTimeline, granularities, windowDurations, espeakOptions, phoneAlignmentMethod);

			break;
		}

		case 'dtw-ea': {
			const { windowDurations, granularities } = getDtwWindowGranularitiesAndDurations();

			logger.end();

			logger.logTitledMessage(`Warning`, `The dtw-ea alignment engine is an early experiment and doesn't currently perform as well as, or as efficiently as, the other alignment engines.`, chalk.yellow, 'warning');

			const { referenceRawAudio, referenceTimeline } = await createAlignmentReferenceUsingEspeak(transcript, language, options.plainText, options.customLexiconPaths, false, true);

			logger.end();

			const shortLanguageCode = getShortLanguageCode(language);

			mappedTimeline = await alignUsingDtwWithEmbeddings(sourceRawAudio, referenceRawAudio, referenceTimeline, shortLanguageCode, granularities, windowDurations);

			break;
		}

		case 'whisper': {
			const WhisperSTT = await import('../recognition/WhisperSTT.js');

			const whisperAlignmentOptions = options.whisper;

			const shortLanguageCode = getShortLanguageCode(language);

			const { modelName, modelDir } = await WhisperSTT.loadPackagesAndGetPaths(whisperAlignmentOptions.model, shortLanguageCode);

			logger.end();

			mappedTimeline = await WhisperSTT.align(sourceRawAudio, transcript, modelName, modelDir, shortLanguageCode, whisperAlignmentOptions);

			break;
		}

		default: {
			throw new Error(`Engine '${options.engine}' is not supported`);
		}
	}

	logger.start(`Postprocess timeline`);

	// If the audio was cropped before recognition, map the timestamps back to the original audio
	if (sourceUncropTimeline && sourceUncropTimeline.length > 0) {
		API.convertCroppedToUncroppedTimeline(mappedTimeline, sourceUncropTimeline);
	}

	// Add text offsets
	addWordTextOffsetsToTimelineInPlace(mappedTimeline, transcript);

	// Make segment timeline
	const { segmentTimeline } = await wordTimelineToSegmentSentenceTimeline(mappedTimeline, transcript, language, options.plainText?.paragraphBreaks, options.plainText?.whitespace);

	logger.end();
	logger.logDuration(`Total alignment time`, startTimestamp, chalk.magentaBright);

	return {
		timeline: segmentTimeline,
		wordTimeline: mappedTimeline,
		transcript,
		language,
		inputRawAudio,
		isolatedRawAudio,
		backgroundRawAudio,
	};
}

export async function alignSegments(sourceRawAudio, segmentTimeline, alignmentOptions) {
	const timeline = [];

	for (const segmentEntry of segmentTimeline) {
		const segmentText = segmentEntry.text;

		const segmentStartTime = segmentEntry.startTime;
		const segmentEndTime = segmentEntry.endTime;

		const segmentStartSampleIndex = Math.floor(segmentStartTime * sourceRawAudio.sampleRate);
		const segmentEndSampleIndex = Math.floor(segmentEndTime * sourceRawAudio.sampleRate);

		const segmentAudioSamples = sourceRawAudio.audioChannels[0].slice(segmentStartSampleIndex, segmentEndSampleIndex);

		const segmentRawAudio = {
			audioChannels: [segmentAudioSamples],
			sampleRate: sourceRawAudio.sampleRate
		};

		const { wordTimeline: mappedTimeline } = await align(segmentRawAudio, segmentText, alignmentOptions);

		const segmentTimelineWithOffset = addTimeOffsetToTimeline(mappedTimeline, segmentStartTime);

		timeline.push(...segmentTimelineWithOffset);
	}

	return timeline;
}

export const defaultAlignmentOptions = {
	engine: 'dtw',
	language: undefined,
	isolate: false,
	crop: true,
	customLexiconPaths: undefined,
	languageDetection: {},
	plainText: {
		paragraphBreaks: 'double',
		whitespace: 'collapse'
	},
	subtitles: {},
	dtw: {
		granularity: undefined,
		windowDuration: undefined,
		phoneAlignmentMethod: 'dtw'
	},
	recognition: {
		whisper: {
			temperature: 0.15,
			topCandidateCount: 5,
			punctuationThreshold: 0.2,
			maxTokensPerPart: 250,
			autoPromptParts: false,
			suppressRepetition: true,
			decodeTimestampTokens: true,
		}
	},
	vad: {
		engine: 'adaptive-gate'
	},
	sourceSeparation: {},
	whisper: {}
};

export const alignmentEngines = [
	{
		id: 'dtw',
		name: 'Dynamic Time Warping',
		description: 'Makes use of a synthesized reference to find the best mapping between the spoken audio and its transcript.',
		type: 'local'
	},
	{
		id: 'dtw-ra',
		name: 'Dynamic Time Warping with Recognition Assist',
		description: 'Makes use of both a synthesized reference and a synthesized recognized transcript to find the best mapping between the spoken audio and its transcript.',
		type: 'local'
	},
	{
		id: 'whisper',
		name: 'OpenAI Whisper',
		description: 'Extracts timestamps by guiding the Whisper recognition model to recognize the transcript tokens.',
		type: 'local'
	}
];
//# sourceMappingURL=Alignment.js.map
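
The window-duration logic in align above also supports multi-pass DTW alignment, where each pass's window can be given in seconds or as a percentage of the total audio duration. A hedged configuration sketch (values are illustrative, and Echogarden.align is again assumed to be the package-level export of the align function defined above):

// Two-pass DTW: a coarse 'xx-low' pass over a window spanning 20% of the total duration,
// followed by a finer 'low' pass over a 60-second window.
const transcript = 'The full transcript text of the recording.';

const result = await Echogarden.align('lecture.mp3', transcript, {
	engine: 'dtw',
	dtw: {
		granularity: ['xx-low', 'low'],
		windowDuration: ['20%', 60],
	},
});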