echogarden

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

import chalk from 'chalk';
import * as API from './API.js';
import { extendDeep } from '../utilities/ObjectUtilities.js';
import { logToStderr } from '../utilities/Utilities.js';
import { ensureRawAudio, normalizeAudioLevelInPlace, trimAudioEnd } from '../audio/AudioUtilities.js';
import { Logger } from '../utilities/Logger.js';
import { addWordTextOffsetsToTimelineInPlace, wordTimelineToSegmentSentenceTimeline } from '../utilities/Timeline.js';
import { formatLanguageCodeWithName, parseLangIdentifier } from '../utilities/Locale.js';
import { loadPackage } from '../utilities/PackageManager.js';

const log = logToStderr;

export async function recognize(input, options, onPart) {
	const logger = new Logger();
	const startTimestamp = logger.getTimestamp();

	options = extendDeep(defaultRecognitionOptions, options);

	const inputRawAudio = await ensureRawAudio(input);

	let sourceRawAudio;
	let isolatedRawAudio;
	let backgroundRawAudio;

	// Optionally isolate speech from background via source separation, then resample to 16kHz mono
	if (options.isolate) {
		logger.log(``);
		logger.end();

		({ isolatedRawAudio, backgroundRawAudio } = await API.isolate(inputRawAudio, options.sourceSeparation));

		logger.end();
		logger.log(``);
		logger.start(`Resample audio to 16kHz mono`);

		sourceRawAudio = await ensureRawAudio(isolatedRawAudio, 16000, 1);
	} else {
		logger.start(`Resample audio to 16kHz mono`);

		sourceRawAudio = await ensureRawAudio(inputRawAudio, 16000, 1);
	}

	let sourceUncropTimeline;

	if (options.crop) {
		logger.start('Crop using voice activity detection');

		({ timeline: sourceUncropTimeline, croppedRawAudio: sourceRawAudio } = await API.detectVoiceActivity(sourceRawAudio, options.vad));

		logger.end();
	}

	logger.start('Normalize and trim audio');

	normalizeAudioLevelInPlace(sourceRawAudio);
	sourceRawAudio.audioChannels[0] = trimAudioEnd(sourceRawAudio.audioChannels[0]);

	const engine = options.engine;

	// Use the specified language, or detect the speech language if none was given
	if (options.language) {
		const languageData = await parseLangIdentifier(options.language);

		options.language = languageData.Name;

		logger.end();
		logger.logTitledMessage('Language specified', formatLanguageCodeWithName(options.language));
	} else {
		logger.start('No language specified. Detect speech language');

		const { detectedLanguage } = await API.detectSpeechLanguage(sourceRawAudio, options.languageDetection);

		options.language = detectedLanguage;

		logger.end();
		logger.logTitledMessage('Language detected', formatLanguageCodeWithName(detectedLanguage));
	}

	const languageData = await parseLangIdentifier(options.language);
	const languageCode = languageData.Name;
	const shortLanguageCode = languageData.TwoLetterISOLanguageName;

	let transcript;
	let timeline;

	logger.start(`Load ${engine} module`);

	// Run recognition using the selected engine
	switch (engine) {
		case 'whisper': {
			const WhisperSTT = await import('../recognition/WhisperSTT.js');

			const whisperOptions = options.whisper;

			logger.end();

			const { modelName, modelDir } = await WhisperSTT.loadPackagesAndGetPaths(whisperOptions.model, shortLanguageCode);

			logger.end();

			({ transcript, timeline } = await WhisperSTT.recognize(sourceRawAudio, modelName, modelDir, 'transcribe', shortLanguageCode, whisperOptions, onPart));

			break;
		}

		case 'whisper.cpp': {
			const WhisperCppSTT = await import('../recognition/WhisperCppSTT.js');

			const whisperCppOptions = options.whisperCpp;

			logger.end();

			const { modelName, modelPath } = await WhisperCppSTT.loadModelPackage(whisperCppOptions.model, shortLanguageCode);

			logger.end();

			({ transcript, timeline } = await WhisperCppSTT.recognize(sourceRawAudio, 'transcribe', shortLanguageCode, modelName, modelPath, whisperCppOptions));

			break;
		}

		case 'vosk': {
			const VoskSTT = await import('../recognition/VoskSTT.js');

			try {
				await import('@echogarden/vosk');
			} catch (e) {
				log(e);

				throw new Error(`The vosk npm package, which is required for Vosk support, was not found, or had an error loading. If missing, you can install it by running 'npm install @echogarden/vosk -g'.`);
			}

			const voskOptions = options.vosk;

			const modelPath = voskOptions.modelPath;

			if (!modelPath) {
				throw new Error(`Vosk models are not currently auto-downloaded. You'll need to download a model manually and set a model path in 'vosk.modelPath'.`);
			}

			logger.end();

			({ transcript, timeline } = await VoskSTT.recognize(sourceRawAudio, modelPath, true));

			break;
		}

		case 'silero': {
			const SileroSTT = await import('../recognition/SileroSTT.js');

			const sileroOptions = options.silero;

			let modelPath = sileroOptions.modelPath;

			if (!modelPath) {
				const packageName = SileroSTT.languageCodeToPackageName[shortLanguageCode];

				if (!packageName) {
					throw new Error(`Language '${shortLanguageCode}' is not supported by Silero`);
				}

				modelPath = await loadPackage(packageName);
			}

			const onnxExecutionProviders = sileroOptions.provider ? [sileroOptions.provider] : [];

			logger.end();

			({ transcript, timeline } = await SileroSTT.recognize(sourceRawAudio, modelPath, onnxExecutionProviders));

			break;
		}

		case 'google-cloud': {
			const GoogleCloudSTT = await import('../recognition/GoogleCloudSTT.js');

			const apiKey = options.googleCloud.apiKey;

			if (!apiKey) {
				throw new Error(`No API key given`);
			}

			logger.end();

			({ transcript, timeline } = await GoogleCloudSTT.recognize(sourceRawAudio, apiKey, shortLanguageCode));

			break;
		}

		case 'microsoft-azure': {
			const AzureCognitiveServicesSTT = await import('../recognition/AzureCognitiveServicesSTT.js');

			const subscriptionKey = options.microsoftAzure.subscriptionKey;

			if (!subscriptionKey) {
				throw new Error(`No Microsoft Azure subscription key provided`);
			}

			const serviceRegion = options.microsoftAzure.serviceRegion;

			if (!serviceRegion) {
				throw new Error(`No Microsoft Azure service region provided`);
			}

			logger.end();

			({ transcript, timeline } = await AzureCognitiveServicesSTT.recognize(sourceRawAudio, subscriptionKey, serviceRegion, shortLanguageCode));

			break;
		}

		case 'amazon-transcribe': {
			const AmazonTranscribeSTT = await import('../recognition/AmazonTranscribeSTT.js');

			const region = options.amazonTranscribe.region;

			if (!region) {
				throw new Error(`No Amazon Transcribe region provided`);
			}

			const accessKeyId = options.amazonTranscribe.accessKeyId;

			if (!accessKeyId) {
				throw new Error(`No Amazon Transcribe access key id provided`);
			}

			const secretAccessKey = options.amazonTranscribe.secretAccessKey;

			if (!secretAccessKey) {
				throw new Error(`No Amazon Transcribe secret access key provided`);
			}

			logger.end();

			({ transcript, timeline } = await AmazonTranscribeSTT.recgonize(sourceRawAudio, languageCode, region, accessKeyId, secretAccessKey));

			break;
		}

		case 'openai-cloud': {
			const OpenAICloudSTT = await import('../recognition/OpenAICloudSTT.js');

			const openAICloudSTTOptions = options.openAICloud;

			if (!openAICloudSTTOptions.apiKey) {
				throw new Error(`No OpenAI Cloud API key provided`);
			}

			logger.end();

			({ transcript, timeline } = await OpenAICloudSTT.recognize(sourceRawAudio, shortLanguageCode, openAICloudSTTOptions));

			break;
		}

		case 'deepgram': {
			const DeepgramSTT = await import('../recognition/DeepgramSTT.js');

			const deepgramOptions = options.deepgram;

			if (!deepgramOptions.apiKey) {
				throw new Error(`No Deepgram API key provided`);
			}

			logger.end();

			({ transcript, timeline } = await DeepgramSTT.recognize(sourceRawAudio, options.language ? shortLanguageCode : undefined, deepgramOptions));

			break;
		}

		default: {
			throw new Error(`Engine '${options.engine}' is not supported`);
		}
	}

	// If the engine didn't return a timeline, align to get it
	if (!timeline) {
		logger.start(`Align audio to transcript`);

		const alignmentOptions = extendDeep(options.alignment, { language: languageCode });

		const { wordTimeline } = await API.align(sourceRawAudio, transcript, alignmentOptions);

		timeline = wordTimeline;
	}

	// If the audio was cropped before recognition, map the timestamps back to the original audio
	if (sourceUncropTimeline && sourceUncropTimeline.length > 0) {
		API.convertCroppedToUncroppedTimeline(timeline, sourceUncropTimeline);
	}

	// Add text offsets
	addWordTextOffsetsToTimelineInPlace(timeline, transcript);

	// Make segment timeline
	const { segmentTimeline } = await wordTimelineToSegmentSentenceTimeline(timeline, transcript, languageCode, 'single', 'preserve');

	logger.end();
	logger.logDuration('\nTotal recognition time', startTimestamp, chalk.magentaBright);

	return {
		transcript,
		timeline: segmentTimeline,
		wordTimeline: timeline,
		language: languageCode,

		inputRawAudio,
		isolatedRawAudio,
		backgroundRawAudio,
	};
}

export const defaultRecognitionOptions = {
	engine: 'whisper',
	language: undefined,

	maxAlternatives: 1,

	isolate: false,
	crop: true,

	alignment: {},
	languageDetection: {},
	subtitles: {},

	vad: {
		engine: 'adaptive-gate'
	},

	whisper: {},
	whisperCpp: {},

	vosk: {
		modelPath: undefined
	},

	silero: {},

	googleCloud: {
		apiKey: undefined,
		alternativeLanguageCodes: [],
		profanityFilter: false,
		autoPunctuation: true,
		useEnhancedModel: true,
	},

	microsoftAzure: {
		subscriptionKey: undefined,
		serviceRegion: undefined
	},

	amazonTranscribe: {
		region: undefined,
		accessKeyId: undefined,
		secretAccessKey: undefined,
	},

	openAICloud: {},

	deepgram: {}
};

export const recognitionEngines = [
	{ id: 'whisper', name: 'OpenAI Whisper', description: 'A high accuracy transformer-based speech recognition architecture by OpenAI.', type: 'local' },
	{ id: 'whisper.cpp', name: 'OpenAI Whisper (C++ port)', description: 'A C++ port of the Whisper speech recognition architecture.', type: 'local' },
	{ id: 'vosk', name: 'Vosk', description: 'A speech recognition toolkit.', type: 'local' },
	{ id: 'silero', name: 'Silero', description: 'Speech recognition models.', type: 'local' },
	{ id: 'google-cloud', name: 'Google Cloud', description: 'Google Cloud speech-to-text service.', type: 'cloud' },
	{ id: 'microsoft-azure', name: 'Azure Cognitive Services', description: 'Microsoft Azure speech-to-text service.', type: 'cloud' },
	{ id: 'amazon-transcribe', name: 'Amazon Transcribe', description: 'Amazon cloud speech-to-text service.', type: 'cloud' },
	{ id: 'openai-cloud', name: 'OpenAI Cloud', description: 'OpenAI cloud speech-to-text service.', type: 'cloud' },
	{ id: 'deepgram', name: 'Deepgram', description: 'Deepgram cloud speech-to-text service.', type: 'cloud' },
];

//# sourceMappingURL=Recognition.js.map
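
For context, here is a minimal usage sketch of the `recognize` function defined above, written against the options and return shape visible in this file. It assumes the package's main 'echogarden' entry point re-exports `recognize` (this page only shows the internal module); the file name 'speech.wav' and the option values are illustrative, not taken from the source.

// Minimal usage sketch (assumptions: the 'echogarden' main entry re-exports `recognize`;
// 'speech.wav' and the option values are illustrative examples).
import { recognize } from 'echogarden'

const result = await recognize('speech.wav', {
	engine: 'whisper',   // any id from `recognitionEngines`, e.g. 'whisper.cpp', 'vosk', 'deepgram'
	language: 'en',      // omit to auto-detect the speech language
	crop: true,          // crop with voice activity detection before recognition (the default)
	isolate: false,      // set to true to run source separation before recognition
})

// Fields of the object returned by `recognize`:
console.log(result.transcript)     // recognized text
console.log(result.wordTimeline)   // word-level timeline
console.log(result.timeline)       // segment/sentence-level timeline
console.log(result.language)       // resolved language code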