echogarden
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
import chalk from 'chalk';
import * as API from './API.js';
import { extendDeep } from '../utilities/ObjectUtilities.js';
import { logToStderr } from '../utilities/Utilities.js';
import { ensureRawAudio, normalizeAudioLevelInPlace, trimAudioEnd } from '../audio/AudioUtilities.js';
import { Logger } from '../utilities/Logger.js';
import { addWordTextOffsetsToTimelineInPlace, wordTimelineToSegmentSentenceTimeline } from '../utilities/Timeline.js';
import { formatLanguageCodeWithName, parseLangIdentifier } from '../utilities/Locale.js';
import { loadPackage } from '../utilities/PackageManager.js';
const log = logToStderr;
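/**
 * Recognizes speech in the given audio input and returns a transcript,
 * together with segment and word timelines.
 *
 * A minimal usage sketch. The file name and option values are hypothetical,
 * and the import assumes the package's top-level export re-exports this
 * function:
 *
 *   import { recognize } from 'echogarden';
 *
 *   const { transcript, wordTimeline } = await recognize('speech.wav', {
 *     engine: 'whisper',
 *     language: 'en',
 *   });
 *
 * `input` is anything accepted by `ensureRawAudio`, such as a file path or
 * a raw audio object. `options` is deep-merged over
 * `defaultRecognitionOptions` below. `onPart` is an optional callback that
 * some engines (e.g. 'whisper') invoke with partial results.
 */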
export async function recognize(input, options, onPart) {
const logger = new Logger();
const startTimestamp = logger.getTimestamp();
options = extendDeep(defaultRecognitionOptions, options);
const inputRawAudio = await ensureRawAudio(input);
let sourceRawAudio;
let isolatedRawAudio;
let backgroundRawAudio;
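// When isolation is requested, separate the voice from the background audio
// first, and pass only the isolated voice on to recognition.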
if (options.isolate) {
logger.log(``);
logger.end();
({ isolatedRawAudio, backgroundRawAudio } = await API.isolate(inputRawAudio, options.sourceSeparation));
logger.end();
logger.log(``);
logger.start(`Resample audio to 16kHz mono`);
sourceRawAudio = await ensureRawAudio(isolatedRawAudio, 16000, 1);
}
else {
logger.start(`Resample audio to 16kHz mono`);
sourceRawAudio = await ensureRawAudio(inputRawAudio, 16000, 1);
}
let sourceUncropTimeline;
if (options.crop) {
logger.start('Crop using voice activity detection');
({ timeline: sourceUncropTimeline, croppedRawAudio: sourceRawAudio } = await API.detectVoiceActivity(sourceRawAudio, options.vad));
logger.end();
}
logger.start('Normalize and trim audio');
normalizeAudioLevelInPlace(sourceRawAudio);
sourceRawAudio.audioChannels[0] = trimAudioEnd(sourceRawAudio.audioChannels[0]);
const engine = options.engine;
if (options.language) {
const languageData = await parseLangIdentifier(options.language);
options.language = languageData.Name;
logger.end();
logger.logTitledMessage('Language specified', formatLanguageCodeWithName(options.language));
}
else {
logger.start('No language specified. Detect speech language');
const { detectedLanguage } = await API.detectSpeechLanguage(sourceRawAudio, options.languageDetection);
options.language = detectedLanguage;
logger.end();
logger.logTitledMessage('Language detected', formatLanguageCodeWithName(detectedLanguage));
}
const languageData = await parseLangIdentifier(options.language);
const languageCode = languageData.Name;
const shortLanguageCode = languageData.TwoLetterISOLanguageName;
let transcript;
let timeline;
logger.start(`Load ${engine} module`);
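// Each engine module is imported dynamically, so only the dependencies of
// the selected engine are actually loaded.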
switch (engine) {
case 'whisper': {
const WhisperSTT = await import('../recognition/WhisperSTT.js');
const whisperOptions = options.whisper;
logger.end();
const { modelName, modelDir } = await WhisperSTT.loadPackagesAndGetPaths(whisperOptions.model, shortLanguageCode);
logger.end();
({ transcript, timeline } = await WhisperSTT.recognize(sourceRawAudio, modelName, modelDir, 'transcribe', shortLanguageCode, whisperOptions, onPart));
break;
}
case 'whisper.cpp': {
const WhisperCppSTT = await import('../recognition/WhisperCppSTT.js');
const whisperCppOptions = options.whisperCpp;
logger.end();
const { modelName, modelPath } = await WhisperCppSTT.loadModelPackage(whisperCppOptions.model, shortLanguageCode);
logger.end();
({ transcript, timeline } = await WhisperCppSTT.recognize(sourceRawAudio, 'transcribe', shortLanguageCode, modelName, modelPath, whisperCppOptions));
break;
}
case 'vosk': {
const VoskSTT = await import('../recognition/VoskSTT.js');
try {
await import('@echogarden/vosk');
}
catch (e) {
log(e);
throw new Error(`The vosk npm package, which is required for Vosk support, was not found or failed to load. If it is missing, you can install it globally by running 'npm install @echogarden/vosk -g'.`);
}
const voskOptions = options.vosk;
const modelPath = voskOptions.modelPath;
if (!modelPath) {
throw new Error(`Vosk models are not currently auto-downloaded. You'll need to download a model manually and set a model path in 'vosk.modelPath'.`);
}
logger.end();
({ transcript, timeline } = await VoskSTT.recognize(sourceRawAudio, modelPath, true));
break;
}
case 'silero': {
const SileroSTT = await import('../recognition/SileroSTT.js');
const sileroOptions = options.silero;
let modelPath = sileroOptions.modelPath;
if (!modelPath) {
const packageName = SileroSTT.languageCodeToPackageName[shortLanguageCode];
if (!packageName) {
throw new Error(`Language '${shortLanguageCode}' is not supported by Silero`);
}
modelPath = await loadPackage(packageName);
}
const onnxExecutionProviders = sileroOptions.provider ? [sileroOptions.provider] : [];
logger.end();
({ transcript, timeline } = await SileroSTT.recognize(sourceRawAudio, modelPath, onnxExecutionProviders));
break;
}
case 'google-cloud': {
const GoogleCloudSTT = await import('../recognition/GoogleCloudSTT.js');
const apiKey = options.googleCloud.apiKey;
if (!apiKey) {
throw new Error(`No Google Cloud API key provided`);
}
logger.end();
({ transcript, timeline } = await GoogleCloudSTT.recognize(sourceRawAudio, apiKey, shortLanguageCode));
break;
}
case 'microsoft-azure': {
const AzureCognitiveServicesSTT = await import('../recognition/AzureCognitiveServicesSTT.js');
const subscriptionKey = options.microsoftAzure.subscriptionKey;
if (!subscriptionKey) {
throw new Error(`No Microsoft Azure subscription key provided`);
}
const serviceRegion = options.microsoftAzure.serviceRegion;
if (!serviceRegion) {
throw new Error(`No Microsoft Azure service region provided`);
}
logger.end();
({ transcript, timeline } = await AzureCognitiveServicesSTT.recognize(sourceRawAudio, subscriptionKey, serviceRegion, shortLanguageCode));
break;
}
case 'amazon-transcribe': {
const AmazonTranscribeSTT = await import('../recognition/AmazonTranscribeSTT.js');
const region = options.amazonTranscribe.region;
if (!region) {
throw new Error(`No Amazon Transcribe region provided`);
}
const accessKeyId = options.amazonTranscribe.accessKeyId;
if (!accessKeyId) {
throw new Error(`No Amazon Transcribe access key id provided`);
}
const secretAccessKey = options.amazonTranscribe.secretAccessKey;
if (!secretAccessKey) {
throw new Error(`No Amazon Transcribe secret access key provided`);
}
logger.end();
({ transcript, timeline } = await AmazonTranscribeSTT.recognize(sourceRawAudio, languageCode, region, accessKeyId, secretAccessKey));
break;
}
case 'openai-cloud': {
const OpenAICloudSTT = await import('../recognition/OpenAICloudSTT.js');
const openAICloudSTTOptions = options.openAICloud;
if (!openAICloudSTTOptions.apiKey) {
throw new Error(`No OpenAI Cloud API key provided`);
}
logger.end();
({ transcript, timeline } = await OpenAICloudSTT.recognize(sourceRawAudio, shortLanguageCode, openAICloudSTTOptions));
break;
}
case 'deepgram': {
const DeepgramSTT = await import('../recognition/DeepgramSTT.js');
const deepgramOptions = options.deepgram;
if (!deepgramOptions.apiKey) {
throw new Error(`No Deepgram API key provided`);
}
logger.end();
({ transcript, timeline } = await DeepgramSTT.recognize(sourceRawAudio, options.language ? shortLanguageCode : undefined, deepgramOptions));
break;
}
default: {
throw new Error(`Engine '${options.engine}' is not supported`);
}
}
// If the engine didn't return a timeline, align to get it
if (!timeline) {
logger.start(`Align audio to transcript`);
const alignmentOptions = extendDeep(options.alignment, { language: languageCode });
const { wordTimeline } = await API.align(sourceRawAudio, transcript, alignmentOptions);
timeline = wordTimeline;
}
// If the audio was cropped before recognition, map the timestamps back to the original audio
if (sourceUncropTimeline && sourceUncropTimeline.length > 0) {
API.convertCroppedToUncroppedTimeline(timeline, sourceUncropTimeline);
}
// Add text offsets
addWordTextOffsetsToTimelineInPlace(timeline, transcript);
// Make segment timeline
const { segmentTimeline } = await wordTimelineToSegmentSentenceTimeline(timeline, transcript, languageCode, 'single', 'preserve');
logger.end();
logger.logDuration('\nTotal recognition time', startTimestamp, chalk.magentaBright);
return {
transcript,
timeline: segmentTimeline,
wordTimeline: timeline,
language: languageCode,
inputRawAudio,
isolatedRawAudio,
backgroundRawAudio,
};
}
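// Default recognition options. User options are deep-merged over these via
// `extendDeep` at the top of `recognize`, so a single nested field (for
// example `{ googleCloud: { profanityFilter: true } }`) can be overridden
// without restating the rest of the object.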
export const defaultRecognitionOptions = {
engine: 'whisper',
language: undefined,
maxAlternatives: 1,
isolate: false,
crop: true,
alignment: {},
languageDetection: {},
subtitles: {},
vad: {
engine: 'adaptive-gate'
},
whisper: {},
whisperCpp: {},
vosk: {
modelPath: undefined
},
silero: {},
googleCloud: {
apiKey: undefined,
alternativeLanguageCodes: [],
profanityFilter: false,
autoPunctuation: true,
useEnhancedModel: true,
},
microsoftAzure: {
subscriptionKey: undefined,
serviceRegion: undefined
},
amazonTranscribe: {
region: undefined,
accessKeyId: undefined,
secretAccessKey: undefined,
},
openAICloud: {},
deepgram: {}
};
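// Descriptive metadata for the supported engines, usable for listings or
// input validation. A small sketch of a possible lookup (not part of this
// module's exports):
//
//   const engineInfo = recognitionEngines.find((engine) => engine.id === 'whisper');
//   const localEngines = recognitionEngines.filter((engine) => engine.type === 'local');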
export const recognitionEngines = [
{
id: 'whisper',
name: 'OpenAI Whisper',
description: 'A high-accuracy, transformer-based speech recognition architecture by OpenAI.',
type: 'local'
},
{
id: 'whisper.cpp',
name: 'OpenAI Whisper (C++ port)',
description: 'A C++ port of the Whisper speech recognition architecture.',
type: 'local'
},
{
id: 'vosk',
name: 'Vosk',
description: 'A speech recognition toolkit.',
type: 'local'
},
{
id: 'silero',
name: 'Silero',
description: 'Pre-trained speech recognition models by Silero.',
type: 'local'
},
{
id: 'google-cloud',
name: 'Google Cloud',
description: 'Google Cloud speech-to-text service.',
type: 'cloud'
},
{
id: 'microsoft-azure',
name: 'Azure Cognitive Services',
description: 'Microsoft Azure speech-to-text service.',
type: 'cloud'
},
{
id: 'amazon-transcribe',
name: 'Amazon Transcribe',
description: 'Amazon cloud speech-to-text service.',
type: 'cloud'
},
{
id: 'openai-cloud',
name: 'OpenAI Cloud',
description: 'OpenAI cloud speech-to-text service.',
type: 'cloud'
},
{
id: 'deepgram',
name: 'Deepgram',
description: 'Deepgram cloud speech-to-text service.',
type: 'cloud'
},
];
//# sourceMappingURL=Recognition.js.map