// echogarden: an easy-to-use speech toolset with tools for synthesis, recognition,
// alignment, speech translation, language detection, source separation, and more.
import chalk from 'chalk'
import * as API from './API.js'
import { extendDeep } from '../utilities/ObjectUtilities.js'
import { logToStderr } from '../utilities/Utilities.js'
import { AudioSourceParam, RawAudio, ensureRawAudio, normalizeAudioLevelInPlace, trimAudioEnd } from '../audio/AudioUtilities.js'
import { Logger } from '../utilities/Logger.js'
import { Timeline, addWordTextOffsetsToTimelineInPlace, wordTimelineToSegmentSentenceTimeline } from '../utilities/Timeline.js'
import { formatLanguageCodeWithName, parseLangIdentifier } from '../utilities/Locale.js'
import { loadPackage } from '../utilities/PackageManager.js'
import { type WhisperPartCallback, type WhisperOptions } from '../recognition/WhisperSTT.js'
import { type SubtitlesConfig } from '../subtitles/Subtitles.js'
import { type OpenAICloudSTTOptions } from '../recognition/OpenAICloudSTT.js'
import { type WhisperCppOptions } from '../recognition/WhisperCppSTT.js'
import { type SileroRecognitionOptions } from '../recognition/SileroSTT.js'
import { type DeepgramSTTOptions } from '../recognition/DeepgramSTT.js'
import { type OnnxExecutionProvider } from '../utilities/OnnxUtilities.js'
const log = logToStderr
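/**
 * Recognizes speech (speech-to-text) in the given audio source.
 *
 * @param input Audio source to transcribe (any form accepted by `ensureRawAudio`, for example a file path or a `RawAudio` object)
 * @param options Recognition options; unspecified fields fall back to `defaultRecognitionOptions`
 * @param onPart Optional callback for partial results (currently only forwarded to the 'whisper' engine)
 * @returns The transcript, segment and word timelines, the resolved language, and the input (and, when isolation is enabled, isolated/background) audio
 */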
export async function recognize(input: AudioSourceParam, options: RecognitionOptions, onPart?: WhisperPartCallback): Promise<RecognitionResult> {
const logger = new Logger()
const startTimestamp = logger.getTimestamp()
options = extendDeep(defaultRecognitionOptions, options)
const inputRawAudio = await ensureRawAudio(input)
let sourceRawAudio: RawAudio
let isolatedRawAudio: RawAudio | undefined
let backgroundRawAudio: RawAudio | undefined
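// When voice isolation is requested, run source separation first and recognize only the isolated vocal track; the background track is kept and returned with the result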
if (options.isolate) {
logger.log(``)
logger.end();
({ isolatedRawAudio, backgroundRawAudio } = await API.isolate(inputRawAudio, options.sourceSeparation!))
logger.end()
logger.log(``)
logger.start(`Resample audio to 16kHz mono`)
sourceRawAudio = await ensureRawAudio(isolatedRawAudio, 16000, 1)
} else {
logger.start(`Resample audio to 16kHz mono`)
sourceRawAudio = await ensureRawAudio(inputRawAudio, 16000, 1)
}
let sourceUncropTimeline: Timeline | undefined
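// When cropping is enabled, remove non-speech regions using voice activity detection; the uncrop timeline is kept so recognized timestamps can later be mapped back to the original audio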
if (options.crop) {
logger.start('Crop using voice activity detection');
({ timeline: sourceUncropTimeline, croppedRawAudio: sourceRawAudio } = await API.detectVoiceActivity(sourceRawAudio, options.vad!))
logger.end()
}
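// Normalize the audio level and trim the end of the audio before recognition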
logger.start('Normalize and trim audio')
normalizeAudioLevelInPlace(sourceRawAudio)
sourceRawAudio.audioChannels[0] = trimAudioEnd(sourceRawAudio.audioChannels[0])
const engine = options.engine!
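// Resolve the recognition language: use the language given in the options if present, otherwise detect it from the audio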
if (options.language) {
const languageData = await parseLangIdentifier(options.language)
options.language = languageData.Name
logger.end()
logger.logTitledMessage('Language specified', formatLanguageCodeWithName(options.language))
} else {
logger.start('No language specified. Detect speech language')
const { detectedLanguage } = await API.detectSpeechLanguage(sourceRawAudio, options.languageDetection!)
options.language = detectedLanguage
logger.end()
logger.logTitledMessage('Language detected', formatLanguageCodeWithName(detectedLanguage))
}
const languageData = await parseLangIdentifier(options.language)
const languageCode = languageData.Name
const shortLanguageCode = languageData.TwoLetterISOLanguageName
let transcript: string
let timeline: Timeline | undefined
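// Dispatch to the selected engine; each engine module is imported lazily, so only the selected backend and its dependencies are loaded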
logger.start(`Load ${engine} module`)
switch (engine) {
case 'whisper': {
const WhisperSTT = await import('../recognition/WhisperSTT.js')
const whisperOptions = options.whisper!
logger.end()
const { modelName, modelDir } = await WhisperSTT.loadPackagesAndGetPaths(whisperOptions.model, shortLanguageCode)
logger.end();
({ transcript, timeline } = await WhisperSTT.recognize(
sourceRawAudio,
modelName,
modelDir,
'transcribe',
shortLanguageCode,
whisperOptions,
onPart,
))
break
}
case 'whisper.cpp': {
const WhisperCppSTT = await import('../recognition/WhisperCppSTT.js')
const whisperCppOptions = options.whisperCpp!
logger.end()
const { modelName, modelPath } = await WhisperCppSTT.loadModelPackage(whisperCppOptions.model, shortLanguageCode)
logger.end();
({ transcript, timeline } = await WhisperCppSTT.recognize(
sourceRawAudio,
'transcribe',
shortLanguageCode,
modelName,
modelPath,
whisperCppOptions,
))
break
}
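// Vosk requires the '@echogarden/vosk' addon to be installed separately, as well as a manually downloaded model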
case 'vosk': {
const VoskSTT = await import('../recognition/VoskSTT.js')
try {
await import('@echogarden/vosk')
} catch (e) {
log(e)
throw new Error(`The 'vosk' npm package, which is required for Vosk support, was not found or failed to load. If it is missing, you can install it by running 'npm install @echogarden/vosk -g'.`)
}
const voskOptions = options.vosk!
const modelPath = voskOptions.modelPath
if (!modelPath) {
throw new Error(`Vosk models are not currently auto-downloaded. You'll need to download a model manually and set a model path in 'vosk.modelPath'.`)
}
logger.end();
({ transcript, timeline } = await VoskSTT.recognize(sourceRawAudio, modelPath, true))
break
}
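// Silero runs via the ONNX runtime; when no explicit model path is given, a language-specific model package is loaded via the package manager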
case 'silero': {
const SileroSTT = await import('../recognition/SileroSTT.js')
const sileroOptions = options.silero!
let modelPath = sileroOptions.modelPath
if (!modelPath) {
const packageName = SileroSTT.languageCodeToPackageName[shortLanguageCode]
if (!packageName) {
throw new Error(`Language '${shortLanguageCode}' is not supported by Silero`)
}
modelPath = await loadPackage(packageName)
}
const onnxExecutionProviders: OnnxExecutionProvider[] = sileroOptions.provider ? [sileroOptions.provider] : []
logger.end();
({ transcript, timeline } = await SileroSTT.recognize(
sourceRawAudio,
modelPath,
onnxExecutionProviders))
break
}
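// The remaining engines call cloud speech-to-text services and require the corresponding credentials in their options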
case 'google-cloud': {
const GoogleCloudSTT = await import('../recognition/GoogleCloudSTT.js')
const apiKey = options.googleCloud!.apiKey
if (!apiKey) {
throw new Error(`No Google Cloud API key provided`)
}
logger.end();
({ transcript, timeline } = await GoogleCloudSTT.recognize(sourceRawAudio, apiKey, shortLanguageCode))
break
}
case 'microsoft-azure': {
const AzureCognitiveServicesSTT = await import('../recognition/AzureCognitiveServicesSTT.js')
const subscriptionKey = options.microsoftAzure!.subscriptionKey
if (!subscriptionKey) {
throw new Error(`No Microsoft Azure subscription key provided`)
}
const serviceRegion = options.microsoftAzure!.serviceRegion
if (!serviceRegion) {
throw new Error(`No Microsoft Azure service region provided`)
}
logger.end();
({ transcript, timeline } = await AzureCognitiveServicesSTT.recognize(sourceRawAudio, subscriptionKey, serviceRegion, shortLanguageCode))
break
}
case 'amazon-transcribe': {
const AmazonTranscribeSTT = await import('../recognition/AmazonTranscribeSTT.js')
const region = options.amazonTranscribe!.region
if (!region) {
throw new Error(`No Amazon Transcribe region provided`)
}
const accessKeyId = options.amazonTranscribe!.accessKeyId
if (!accessKeyId) {
throw new Error(`No Amazon Transcribe access key id provided`)
}
const secretAccessKey = options.amazonTranscribe!.secretAccessKey
if (!secretAccessKey) {
throw new Error(`No Amazon Transcribe secret access key provided`)
}
logger.end();
({ transcript, timeline } = await AmazonTranscribeSTT.recognize(sourceRawAudio, languageCode, region, accessKeyId, secretAccessKey))
break
}
case 'openai-cloud': {
const OpenAICloudSTT = await import('../recognition/OpenAICloudSTT.js')
const openAICloudSTTOptions = options.openAICloud!
if (!openAICloudSTTOptions.apiKey) {
throw new Error(`No OpenAI Cloud API key provided`)
}
logger.end();
({ transcript, timeline } = await OpenAICloudSTT.recognize(sourceRawAudio, shortLanguageCode, openAICloudSTTOptions))
break
}
case 'deepgram': {
const DeepgramSTT = await import('../recognition/DeepgramSTT.js')
const deepgramOptions = options.deepgram!
if (!deepgramOptions.apiKey) {
throw new Error(`No Deepgram API key provided`)
}
logger.end();
({ transcript, timeline } = await DeepgramSTT.recognize(sourceRawAudio, options.language ? shortLanguageCode : undefined, deepgramOptions))
break
}
default: {
throw new Error(`Engine '${options.engine}' is not supported`)
}
}
// If the engine didn't return a timeline, align to get it
if (!timeline) {
logger.start(`Align audio to transcript`)
const alignmentOptions: API.AlignmentOptions = extendDeep(options.alignment, { language: languageCode })
const { wordTimeline } = await API.align(sourceRawAudio, transcript, alignmentOptions)
timeline = wordTimeline
}
// If the audio was cropped before recognition, map the timestamps back to the original audio
if (sourceUncropTimeline && sourceUncropTimeline.length > 0) {
API.convertCroppedToUncroppedTimeline(timeline, sourceUncropTimeline)
}
// Add text offsets
addWordTextOffsetsToTimelineInPlace(timeline, transcript)
// Make segment timeline
const { segmentTimeline } = await wordTimelineToSegmentSentenceTimeline(timeline, transcript, languageCode, 'single', 'preserve')
logger.end()
logger.logDuration('\nTotal recognition time', startTimestamp, chalk.magentaBright)
return {
transcript,
timeline: segmentTimeline,
wordTimeline: timeline,
language: languageCode,
inputRawAudio,
isolatedRawAudio,
backgroundRawAudio,
}
}
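// Example usage (a minimal sketch; the file path and model name below are illustrative):
//
//   const result = await recognize('speech.wav', {
//     engine: 'whisper',
//     whisper: { model: 'tiny.en' },
//   })
//
//   console.log(result.transcript)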
export interface RecognitionResult {
transcript: string
timeline: Timeline // segment-level timeline of the transcript
wordTimeline: Timeline // word-level timeline of the transcript
language: string // resolved or detected language code
inputRawAudio: RawAudio
isolatedRawAudio?: RawAudio // only set when 'isolate' was enabled
backgroundRawAudio?: RawAudio // only set when 'isolate' was enabled
}
export type RecognitionEngine = 'whisper' | 'whisper.cpp' | 'vosk' | 'silero' | 'google-cloud' | 'microsoft-azure' | 'amazon-transcribe' | 'openai-cloud' | 'deepgram'
export interface RecognitionOptions {
engine?: RecognitionEngine
language?: string
maxAlternatives?: number
isolate?: boolean
crop?: boolean
alignment?: API.AlignmentOptions
languageDetection?: API.SpeechLanguageDetectionOptions
subtitles?: SubtitlesConfig
vad?: API.VADOptions
sourceSeparation?: API.SourceSeparationOptions
whisper?: WhisperOptions
whisperCpp?: WhisperCppOptions
vosk?: {
modelPath?: string
}
silero?: SileroRecognitionOptions
googleCloud?: {
apiKey?: string
alternativeLanguageCodes?: string[]
profanityFilter?: boolean
autoPunctuation?: boolean
useEnhancedModel?: boolean
}
microsoftAzure?: {
subscriptionKey?: string
serviceRegion?: string
}
amazonTranscribe?: {
region?: string
accessKeyId?: string
secretAccessKey?: string
}
openAICloud?: OpenAICloudSTTOptions
deepgram?: DeepgramSTTOptions
}
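// Default options; user-supplied options are deep-merged over these (see extendDeep above), so unset fields fall back to the values here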
export const defaultRecognitionOptions: RecognitionOptions = {
engine: 'whisper',
language: undefined,
maxAlternatives: 1,
isolate: false,
crop: true,
alignment: {
},
languageDetection: {
},
subtitles: {
},
vad: {
engine: 'adaptive-gate'
},
whisper: {
},
whisperCpp: {
},
vosk: {
modelPath: undefined
},
silero: {
},
googleCloud: {
apiKey: undefined,
alternativeLanguageCodes: [],
profanityFilter: false,
autoPunctuation: true,
useEnhancedModel: true,
},
microsoftAzure: {
subscriptionKey: undefined,
serviceRegion: undefined
},
amazonTranscribe: {
region: undefined,
accessKeyId: undefined,
secretAccessKey: undefined,
},
openAICloud: {
},
deepgram: {
}
}
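// Metadata for the available recognition engines: identifier, display name, description, and whether the engine runs locally or in the cloud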
export const recognitionEngines: API.EngineMetadata[] = [
{
id: 'whisper',
name: 'OpenAI Whisper',
description: 'A high accuracy transformer-based speech recognition architecture by OpenAI.',
type: 'local'
},
{
id: 'whisper.cpp',
name: 'OpenAI Whisper (C++ port)',
description: 'A C++ port of the Whisper speech recognition architecture.',
type: 'local'
},
{
id: 'vosk',
name: 'Vosk',
description: 'An offline speech recognition toolkit based on Kaldi.',
type: 'local'
},
{
id: 'silero',
name: 'Silero',
description: 'Pre-trained speech recognition models by Silero, run via the ONNX runtime.',
type: 'local'
},
{
id: 'google-cloud',
name: 'Google Cloud',
description: 'Google Cloud speech-to-text service.',
type: 'cloud'
},
{
id: 'microsoft-azure',
name: 'Azure Cognitive Services',
description: 'Microsoft Azure speech-to-text service.',
type: 'cloud'
},
{
id: 'amazon-transcribe',
name: 'Amazon Transcribe',
description: 'Amazon cloud speech-to-text service.',
type: 'cloud'
},
{
id: 'openai-cloud',
name: 'OpenAI Cloud',
description: 'OpenAI cloud speech-to-text service.',
type: 'cloud'
},
{
id: 'deepgram',
name: 'Deepgram',
description: 'Deepgram cloud speech-to-text service.',
type: 'cloud'
},
]