echogarden

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

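A minimal usage sketch (not part of the module source below), assuming the package's top-level entry point re-exports align() from this module; the audio path and transcript are hypothetical:

import { align } from 'echogarden'

const result = await align('speech.wav', 'Hello world, this is a test.', {
	engine: 'dtw',
	language: 'en',
})

for (const entry of result.wordTimeline) {
	console.log(entry.text, entry.startTime, entry.endTime)
}
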
import { extendDeep } from '../utilities/ObjectUtilities.js'
import { logToStderr } from '../utilities/Utilities.js'
import { AudioSourceParam, RawAudio, ensureRawAudio, getRawAudioDuration, normalizeAudioLevelInPlace, trimAudioEnd } from '../audio/AudioUtilities.js'
import { Logger } from '../utilities/Logger.js'
import * as API from './API.js'
import { Timeline, addTimeOffsetToTimeline, addWordTextOffsetsToTimelineInPlace, wordTimelineToSegmentSentenceTimeline } from '../utilities/Timeline.js'
import { formatLanguageCodeWithName, getDefaultDialectForLanguageCodeIfPossible, getShortLanguageCode, parseLangIdentifier } from '../utilities/Locale.js'
import { type WhisperAlignmentOptions } from '../recognition/WhisperSTT.js'
import chalk from 'chalk'
import { DtwGranularity, alignUsingDtwWithEmbeddings, createAlignmentReferenceUsingEspeak } from '../alignment/SpeechAlignment.js'
import { type SubtitlesConfig } from '../subtitles/Subtitles.js'
import { type EspeakOptions, defaultEspeakOptions } from '../synthesis/EspeakTTS.js'
import { isWord } from '../nlp/Segmentation.js'

const log = logToStderr

export async function align(input: AudioSourceParam, transcript: string, options: AlignmentOptions): Promise<AlignmentResult> {
	const logger = new Logger()
	const startTimestamp = logger.getTimestamp()

	options = extendDeep(defaultAlignmentOptions, options)

	const inputRawAudio = await ensureRawAudio(input)

	let sourceRawAudio: RawAudio
	let isolatedRawAudio: RawAudio | undefined
	let backgroundRawAudio: RawAudio | undefined

	if (options.isolate) {
		logger.log(``)
		logger.end();

		({ isolatedRawAudio, backgroundRawAudio } = await API.isolate(inputRawAudio, options.sourceSeparation!))

		logger.end()
		logger.log(``)

		logger.start(`Resample audio to 16kHz mono`)
		sourceRawAudio = await ensureRawAudio(isolatedRawAudio, 16000, 1)
	} else {
		logger.start(`Resample audio to 16kHz mono`)
		sourceRawAudio = await ensureRawAudio(inputRawAudio, 16000, 1)
	}

	let sourceUncropTimeline: Timeline | undefined

	if (options.crop) {
		logger.start('Crop using voice activity detection');

		({ timeline: sourceUncropTimeline, croppedRawAudio: sourceRawAudio } = await API.detectVoiceActivity(sourceRawAudio, options.vad!))

		logger.end()
	}

	logger.start('Normalize and trim audio')

	normalizeAudioLevelInPlace(sourceRawAudio)
	sourceRawAudio.audioChannels[0] = trimAudioEnd(sourceRawAudio.audioChannels[0])

	logger.end()

	let language: string

	if (options.language) {
		const languageData = await parseLangIdentifier(options.language)

		language = languageData.Name

		logger.logTitledMessage('Language specified', formatLanguageCodeWithName(language))
	} else {
		logger.start('No language specified. Detect language using reference text')

		const { detectedLanguage } = await API.detectTextLanguage(transcript, options.languageDetection || {})

		language = detectedLanguage

		logger.end()
		logger.logTitledMessage('Language detected', formatLanguageCodeWithName(language))
	}

	language = getDefaultDialectForLanguageCodeIfPossible(language)

	logger.start('Load alignment module')

	const { alignUsingDtwWithRecognition, alignUsingDtw } = await import('../alignment/SpeechAlignment.js')

	function getDtwWindowGranularitiesAndDurations() {
		const sourceAudioDuration = getRawAudioDuration(sourceRawAudio)

		let granularities: DtwGranularity[]
		let windowDurations: number[]

		const dtwOptions = options.dtw!

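		// When granularities or window durations aren't given explicitly in the 'dtw'
		// options, the branches below derive them from the source audio duration.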
		if (typeof dtwOptions.granularity == 'string') {
			granularities = [dtwOptions.granularity]
		} else if (Array.isArray(dtwOptions.granularity)) {
			granularities = dtwOptions.granularity
		} else {
			if (sourceAudioDuration < 1 * 60) {
				// If up to 1 minute, set granularity to high, single pass
				granularities = ['high']
			} else if (sourceAudioDuration < 5 * 60) {
				// If up to 5 minutes, set granularity to medium, single pass
				granularities = ['medium']
			} else if (sourceAudioDuration < 30 * 60) {
				// If up to 30 minutes, set granularity to low, single pass
				granularities = ['low']
			} else {
				// Otherwise, use multipass processing, first with xx-low granularity, then low
				granularities = ['xx-low', 'low']
			}
		}

		if (dtwOptions.windowDuration) {
			function tryParsePercentageWindowDuration(durationString: string) {
				durationString = durationString.trim()

				const parseResult = durationString.match(/^([0-9]+)%$/)

				if (parseResult == null) {
					throw new Error(`A DTW window duration, when provided as a string, must be formatted as an integer percentage value like '15%'.`)
				}

				const percentageValue = parseInt(parseResult[1])

				if (percentageValue == null || isNaN(percentageValue) || percentageValue <= 0 || percentageValue > 100) {
					throw new Error(`A DTW window duration, when provided as a percentage value, must be between 0 (non-inclusive) and 100 (inclusive).`)
				}

				let durationSeconds = percentageValue / 100 * sourceAudioDuration

				durationSeconds = Math.ceil(durationSeconds)
				durationSeconds = Math.min(durationSeconds, sourceAudioDuration)

				return durationSeconds
			}

			if (typeof dtwOptions.windowDuration === 'number') {
				const duration = Math.min(dtwOptions.windowDuration, sourceAudioDuration)

				windowDurations = [duration]
			} else if (typeof dtwOptions.windowDuration === 'string') {
				const durationString = dtwOptions.windowDuration.trim()

				const durationSeconds = tryParsePercentageWindowDuration(durationString)

				windowDurations = [durationSeconds]
			} else if (Array.isArray(dtwOptions.windowDuration)) {
				const durationsValues = dtwOptions.windowDuration

				if (durationsValues.length < 1) {
					throw new Error(`DTW window durations, when given as an array, must have at least one element.`)
				}

				const durations: number[] = []

				for (const durationValue of durationsValues) {
					let durationSeconds: number

					if (typeof durationValue === 'number') {
						durationSeconds = durationValue
						durationSeconds = Math.min(durationSeconds, sourceAudioDuration)
					} else {
						durationSeconds = tryParsePercentageWindowDuration(durationValue)
					}

					durations.push(durationSeconds)
				}

				windowDurations = durations
			} else {
				throw new Error(`'dtw.windowDuration' must be a number or a percentage string, or array of numbers / percentage strings.`)
			}
		} else {
			if (granularities.length > 2) {
				throw new Error(`More than two passes were requested; this requires window durations to be explicitly specified for each pass.
For example 'dtw.windowDuration=['20%',60,10]'.`)
			}

			if (sourceAudioDuration < 5 * 60) {
				// If up to 5 minutes, set window duration to one minute
				windowDurations = [60]
			} else if (sourceAudioDuration < 2.5 * 60 * 60) {
				// If less than 2.5 hours, set window duration to 20% of total duration
				windowDurations = [Math.ceil(sourceAudioDuration * 0.2)]
			} else {
				// Otherwise, set window duration to 30 minutes
				windowDurations = [30 * 60]
			}
		}

		if (granularities.length === 2 && windowDurations.length === 1) {
			windowDurations = [windowDurations[0], 15]
		}

		if (granularities.length != windowDurations.length) {
			throw new Error(`The option 'dtw.granularity' has ${granularities.length} values, but 'dtw.windowDuration' has ${windowDurations.length} values. The lengths should be equal.`)
		}

		return { windowDurations, granularities }
	}

	let mappedTimeline: Timeline

	switch (options.engine) {
		case 'dtw': {
			const { windowDurations, granularities } = getDtwWindowGranularitiesAndDurations()

			logger.end()

			const { referenceRawAudio, referenceTimeline } = await createAlignmentReferenceUsingEspeak(transcript, language, options.plainText, options.customLexiconPaths, false, false)

			logger.end()

			mappedTimeline = await alignUsingDtw(sourceRawAudio, referenceRawAudio, referenceTimeline, granularities, windowDurations)

			break
		}

		case 'dtw-ra': {
			const { windowDurations, granularities } = getDtwWindowGranularitiesAndDurations()

			logger.end()

			const recognitionOptions: API.RecognitionOptions = extendDeep({ crop: options.crop, language }, options.recognition)

			// Recognize source audio
			let { wordTimeline: recognitionTimeline } = await API.recognize(sourceRawAudio, recognitionOptions)

			logger.log('')

			// Remove non-word entries from recognition timeline
			recognitionTimeline = recognitionTimeline.filter(entry => isWord(entry.text))

			// Synthesize the ground-truth transcript and get its timeline
			logger.start('Synthesize ground-truth transcript with eSpeak')

			const { referenceRawAudio, referenceTimeline, espeakVoice } = await createAlignmentReferenceUsingEspeak(transcript, language, options.plainText, options.customLexiconPaths, false, false)

			logger.end()

			const phoneAlignmentMethod = options.dtw!.phoneAlignmentMethod!

			const espeakOptions: EspeakOptions = { ...defaultEspeakOptions, voice: espeakVoice, useKlatt: false, insertSeparators: true }

			// Align the ground-truth transcript and the recognized transcript
			mappedTimeline = await alignUsingDtwWithRecognition(
				sourceRawAudio,
				referenceRawAudio,
				referenceTimeline,
				recognitionTimeline,
				granularities,
				windowDurations,
				espeakOptions,
				phoneAlignmentMethod)

			break
		}

		case 'dtw-ea': {
			const { windowDurations, granularities } = getDtwWindowGranularitiesAndDurations()

			logger.end()

			logger.logTitledMessage(`Warning`, `The dtw-ea alignment engine is just an early experiment and doesn't currently perform as well as, or as efficiently as, other alignment engines.`, chalk.yellow, 'warning')

			const { referenceRawAudio, referenceTimeline } = await createAlignmentReferenceUsingEspeak(transcript, language, options.plainText, options.customLexiconPaths, false, true)

			logger.end()

			const shortLanguageCode = getShortLanguageCode(language)

			mappedTimeline = await alignUsingDtwWithEmbeddings(
				sourceRawAudio,
				referenceRawAudio,
				referenceTimeline,
				shortLanguageCode,
				granularities,
				windowDurations)

			break
		}

		case 'whisper': {
			const WhisperSTT = await import('../recognition/WhisperSTT.js')

			const whisperAlignmentOptions = options.whisper!
			const shortLanguageCode = getShortLanguageCode(language)

			const { modelName, modelDir } = await WhisperSTT.loadPackagesAndGetPaths(whisperAlignmentOptions.model, shortLanguageCode)

			logger.end()

			mappedTimeline = await WhisperSTT.align(sourceRawAudio, transcript, modelName, modelDir, shortLanguageCode, whisperAlignmentOptions)

			break
		}

		default: {
			throw new Error(`Engine '${options.engine}' is not supported`)
		}
	}

	logger.start(`Postprocess timeline`)

	// If the audio was cropped before recognition, map the timestamps back to the original audio
	if (sourceUncropTimeline && sourceUncropTimeline.length > 0) {
		API.convertCroppedToUncroppedTimeline(mappedTimeline, sourceUncropTimeline)
	}

	// Add text offsets
	addWordTextOffsetsToTimelineInPlace(mappedTimeline, transcript)

	// Make segment timeline
	const { segmentTimeline } = await wordTimelineToSegmentSentenceTimeline(mappedTimeline, transcript, language, options.plainText?.paragraphBreaks, options.plainText?.whitespace)

	logger.end()

	logger.logDuration(`Total alignment time`, startTimestamp, chalk.magentaBright)

	return {
		timeline: segmentTimeline,
		wordTimeline: mappedTimeline,

		transcript,
		language,

		inputRawAudio,
		isolatedRawAudio,
		backgroundRawAudio,
	}
}

export async function alignSegments(sourceRawAudio: RawAudio, segmentTimeline: Timeline, alignmentOptions: AlignmentOptions) {
	const timeline: Timeline = []

	for (const segmentEntry of segmentTimeline) {
		const segmentText = segmentEntry.text

		const segmentStartTime = segmentEntry.startTime
		const segmentEndTime = segmentEntry.endTime

		const segmentStartSampleIndex = Math.floor(segmentStartTime * sourceRawAudio.sampleRate)
		const segmentEndSampleIndex = Math.floor(segmentEndTime * sourceRawAudio.sampleRate)

		const segmentAudioSamples = sourceRawAudio.audioChannels[0].slice(segmentStartSampleIndex, segmentEndSampleIndex)

		const segmentRawAudio: RawAudio = {
			audioChannels: [segmentAudioSamples],
			sampleRate: sourceRawAudio.sampleRate
		}

		const { wordTimeline: mappedTimeline } = await align(segmentRawAudio, segmentText, alignmentOptions)

		const segmentTimelineWithOffset = addTimeOffsetToTimeline(mappedTimeline, segmentStartTime)

		timeline.push(...segmentTimelineWithOffset)
	}

	return timeline
}

export interface AlignmentResult {
	timeline: Timeline
	wordTimeline: Timeline

	transcript: string
	language: string

	inputRawAudio: RawAudio
	isolatedRawAudio?: RawAudio
	backgroundRawAudio?: RawAudio
}

export type AlignmentEngine = 'dtw' | 'dtw-ra' | 'dtw-ea' | 'whisper'

export type PhoneAlignmentMethod = 'interpolation' | 'dtw'

export interface AlignmentOptions {
	engine?: AlignmentEngine
	language?: string
	isolate?: boolean
	crop?: boolean
	customLexiconPaths?: string[]

	languageDetection?: API.TextLanguageDetectionOptions
	vad?: API.VADOptions
	plainText?: API.PlainTextOptions
	subtitles?: SubtitlesConfig

	dtw?: {
		granularity?: DtwGranularity | DtwGranularity[]
		windowDuration?: number | string | (string | number)[]
		phoneAlignmentMethod?: PhoneAlignmentMethod
	}

	recognition?: API.RecognitionOptions
	sourceSeparation?: API.SourceSeparationOptions
	whisper?: WhisperAlignmentOptions
}

export const defaultAlignmentOptions: AlignmentOptions = {
	engine: 'dtw',
	language: undefined,
	isolate: false,
	crop: true,
	customLexiconPaths: undefined,

	languageDetection: {
	},

	plainText: {
		paragraphBreaks: 'double',
		whitespace: 'collapse'
	},

	subtitles: {
	},

	dtw: {
		granularity: undefined,
		windowDuration: undefined,
		phoneAlignmentMethod: 'dtw'
	},

	recognition: {
		whisper: {
			temperature: 0.15,
			topCandidateCount: 5,
			punctuationThreshold: 0.2,
			maxTokensPerPart: 250,
			autoPromptParts: false,
			suppressRepetition: true,
			decodeTimestampTokens: true,
		}
	},

	vad: {
		engine: 'adaptive-gate'
	},

	sourceSeparation: {
	},

	whisper: {
	}
}

export const alignmentEngines: API.EngineMetadata[] = [
	{
		id: 'dtw',
		name: 'Dynamic Time Warping',
		description: 'Makes use of a synthesized reference to find the best mapping between the spoken audio and its transcript.',
		type: 'local'
	},
	{
		id: 'dtw-ra',
		name: 'Dynamic Time Warping with Recognition Assist',
		description: 'Makes use of both a synthesized reference and a synthesized recognized transcript to find the best mapping between the spoken audio and its transcript.',
		type: 'local'
	},
	{
		id: 'whisper',
		name: 'OpenAI Whisper',
		description: 'Extracts timestamps by guiding the Whisper recognition model to recognize the transcript tokens.',
		type: 'local'
	}
]
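
For reference, a sketch (not part of the original file) of how the multipass DTW options handled by getDtwWindowGranularitiesAndDurations() might be supplied when calling align() from this module directly; the import path, audio file, and transcript are assumptions:

import { align } from './Alignment.js' // hypothetical path to this module

// Two DTW passes: a coarse 'xx-low' pass whose window spans 20% of the audio
// duration, followed by a 'low' refinement pass with a 60-second window.
const result = await align('lecture.wav', 'A hypothetical transcript of the recording.', {
	engine: 'dtw',
	language: 'en',
	dtw: {
		granularity: ['xx-low', 'low'],
		windowDuration: ['20%', 60],
	},
})

console.log(result.timeline) // segment-level timeline
console.log(result.wordTimeline) // word-level timeline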