echogarden

Version:

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

github.com/echogarden-project/echogarden

echogarden-project/echogarden

613 lines (474 loc) • 14.9 kB

text/typescript

import { spawn } from 'node:child_process'

import { RawAudio, encodeRawAudioToWave, getRawAudioDuration } from '../audio/AudioUtilities.js'
import { Logger } from '../utilities/Logger.js'
import { type WhisperTask, type WhisperModelName } from './WhisperSTT.js'
import { getRandomHexString } from '../utilities/Utilities.js'
import { Timeline, TimelineEntryType } from '../utilities/Timeline.js'
import { tryParseTimeRangePatternWithHours } from '../subtitles/Subtitles.js'
import { getAppTempDir, joinPath } from '../utilities/PathUtilities.js'
import { appName } from '../api/Common.js'
import { readAndParseJsonFile, remove } from '../utilities/FileSystem.js'
import { splitToLines } from '../nlp/Segmentation.js'
import { extendDeep } from '../utilities/ObjectUtilities.js'
import { formatLanguageCodeWithName, getShortLanguageCode } from '../utilities/Locale.js'
import { loadPackage } from '../utilities/PackageManager.js'
import { detectSpeechLanguageByParts } from '../api/SpeechLanguageDetection.js'

/**
 * Runs the command-line whisper.cpp executable on the given audio and returns
 * the parsed result.
 *
 * The audio is encoded as a wave file and piped to the executable's standard input.
 * The executable is asked to write a full JSON report to a temporary file, which is
 * then read, parsed and removed.
 *
 * @param sourceRawAudio Audio to process. Must have a sample rate of 16000 Hz.
 * @param task Task to perform. `'translate'` adds `--translate` and `'detect-language'`
 * adds `--detect-language`; any other value adds no task flag.
 * @param sourceLanguage Value passed to `--language`. `'auto'` is passed when not set.
 * @param modelName Whisper model name. Used for token decoding and for the `--dtw` preset.
 * @param modelPath Path of the model file passed to `--model`.
 * @param options Options, merged over `defaultWhisperCppOptions`.
 * @returns For `'detect-language'`, an empty transcript and timeline together with the
 * language reported by whisper.cpp. Otherwise the transcript, timeline and language.
 * @throws Error when the sample rate isn't 16000 Hz, when GPU support is requested for a
 * non-CUDA build, when a `large-v3-turbo` model is used for translation, when the
 * executable can't be started or exits with a non-zero code, or when its JSON output
 * can't be read.
 */
export async function recognize(
	sourceRawAudio: RawAudio,
	task: WhisperTask,
	sourceLanguage: string | undefined,
	modelName: WhisperModelName,
	modelPath: string,
	options: WhisperCppOptions): Promise<RecognitionResult> {

	// All validation and setup happens here, in the async function body, so that any
	// failure rejects the returned promise. (Throwing inside an async Promise executor
	// would leave the promise pending forever.)
	const logger = new Logger()

	if (sourceRawAudio.sampleRate != 16000) {
		throw new Error('Source audio must have a sample rate of 16000 Hz')
	}

	options = extendDeep(defaultWhisperCppOptions, options)

	let buildKind: WhisperCppBuild
	let executablePath: string

	if (options.executablePath) {
		buildKind = 'custom'
		executablePath = options.executablePath

		// GPU use is left enabled for custom executables unless explicitly disabled
		if (options.enableGPU == null) {
			options.enableGPU = true
		}
	} else {
		if (options.build) {
			buildKind = options.build

			if (options.enableGPU == null) {
				options.enableGPU = buildKind.startsWith('cublas-')
			} else if (options.enableGPU === true && !buildKind.startsWith('cublas-')) {
				throw new Error('GPU support is only available for CUDA builds')
			}
		} else {
			if (options.enableGPU) {
				buildKind = 'cublas-12.4.0'
			} else {
				buildKind = 'cpu'
			}
		}

		logger.end()

		executablePath = await loadExecutablePackage(buildKind)
	}

	// DTW is turned off whenever flash attention is requested
	if (options.enableFlashAttention && options.enableDTW) {
		options.enableDTW = false
	}

	// `options.model` is optional, so fall back to the model name when it isn't set
	const effectiveModelId: string = options.model || modelName

	if (task === 'translate' && effectiveModelId.startsWith('large-v3-turbo')) {
		throw new Error(`The 'large-v3-turbo' model doesn't support translation tasks.`)
	}

	logger.start(`Recognize with command-line whisper.cpp (model: ${effectiveModelId}, build: ${buildKind})`)
	logger.log('')
	logger.log('')

	const sourceAsWave = encodeRawAudioToWave(sourceRawAudio)

	const tempDirPath = getAppTempDir(appName)
	const outJsonFilePathWithoutExtension = joinPath(tempDirPath, `${getRandomHexString(16)}`)
	const outJsonFilePath = `${outJsonFilePathWithoutExtension}.json`

	const args = buildCommandLineArguments(
		task,
		sourceLanguage,
		modelName,
		modelPath,
		options,
		outJsonFilePathWithoutExtension,
	)

	// Captured as constants so the callbacks below see definitely-assigned values
	const whisperExecutablePath = executablePath
	const totalDuration = getRawAudioDuration(sourceRawAudio)
	const enableDTW = options.enableDTW!
	const verbose = options.verbose

	return new Promise<RecognitionResult>((resolve, reject) => {
		// The trailing '-' argument is passed since the audio is written to standard input
		const childProcess = spawn(whisperExecutablePath, [...args, '-'])

		let stderrOutput = ''

		childProcess.stdout.setEncoding('utf8')

		childProcess.stdout.on('data', (str: string) => {
			if (task === 'detect-language') {
				return
			}

			const parts = splitToLines(str)
				.map(line => line.trim())
				.filter(line => line.length > 0)

			logger.log(parts.join('\n'))
		})

		childProcess.stderr.setEncoding('utf8')

		childProcess.stderr.on('data', (str: string) => {
			if (verbose) {
				logger.log(str)
			}

			stderrOutput += str
		})

		childProcess.on('error', (e) => {
			reject(e)
		})

		childProcess.on('close', async (exitCode) => {
			logger.end()

			// Everything is wrapped in try / catch since this is an async callback:
			// an uncaught error here would never settle the outer promise.
			try {
				if (exitCode !== 0) {
					logger.log(stderrOutput)

					throw new Error(`whisper.cpp exited with code ${exitCode}`)
				}

				const resultObject: WhisperCppVerboseResult = await readAndParseJsonFile(outJsonFilePath)

				if (task === 'detect-language') {
					resolve({ timeline: [], transcript: '', language: resultObject.result.language })
				} else {
					const parsedResultObject = await parseResultObject(resultObject, modelName, totalDuration, enableDTW)

					resolve(parsedResultObject)
				}
			} catch (e) {
				reject(e)
			} finally {
				// Best-effort cleanup. The file may not exist if whisper.cpp failed
				// before writing it, so a removal failure must not mask the real outcome.
				try {
					await remove(outJsonFilePath)
				} catch (e) {
					if (verbose) {
						logger.log(`Couldn't remove temporary file '${outJsonFilePath}': ${e}`)
					}
				}
			}
		})

		//writeToStdinInChunks(childProcess, sourceAsWave, 2 ** 10)
		childProcess.stdin.end(sourceAsWave)
	})
}

/**
 * Builds the whisper.cpp command-line arguments, not including the trailing
 * input file argument.
 */
function buildCommandLineArguments(
	task: WhisperTask,
	sourceLanguage: string | undefined,
	modelName: WhisperModelName,
	modelPath: string,
	options: WhisperCppOptions,
	outputFilePathWithoutExtension: string): string[] {

	const args: string[] = [
		'--output-json-full',
		'--output-file', outputFilePathWithoutExtension,
		'--model', modelPath,
		'--language', sourceLanguage || 'auto',
		'--threads', `${options.threadCount!}`,
		'--processors', `${options.splitCount!}`,
		'--best-of', `${options.topCandidateCount!}`,
		'--beam-size', `${options.beamCount!}`,
		'--entropy-thold', `${options.repetitionThreshold!}`,
		'--temperature', `${options.temperature!}`,
		'--temperature-inc', `${options.temperatureIncrement!}`,
	]

	if (options.prompt) {
		args.push('--prompt', options.prompt)
	}

	if (!options.enableGPU) {
		args.push('--no-gpu')
	}

	args.push('--max-len', '0')

	if (options.enableDTW) {
		// The DTW preset is the model name with dashes replaced by dots
		args.push('--dtw', modelName.replaceAll('-', '.'))
	}

	if (options.enableFlashAttention) {
		args.push('--flash-attn')
	}

	if (task === 'translate') {
		args.push('--translate')
	} else if (task === 'detect-language') {
		args.push('--detect-language')
	}

	return args
}

/**
 * Detects the spoken language of the given audio using whisper.cpp.
 *
 * The audio is passed to `detectSpeechLanguageByParts`, with each part processed by
 * running `recognize` with the `'detect-language'` task. Each part yields a single
 * candidate language with a probability of 1.0.
 *
 * @param sourceRawAudio Audio to analyze. Must have a sample rate of 16000 Hz.
 * @param modelName Whisper model name.
 * @param modelPath Path of the model file.
 * @returns Detection results, sorted by descending probability.
 * @throws Error when the sample rate isn't 16000 Hz.
 */
export async function detectLanguage(sourceRawAudio: RawAudio, modelName: WhisperModelName, modelPath: string) {
	if (sourceRawAudio.sampleRate != 16000) {
		throw new Error('Source audio must have a sample rate of 16000')
	}

	async function detectLanguageForPart(partAudio: RawAudio) {
		const { language } = await recognize(
			partAudio,
			'detect-language',
			undefined,
			modelName,
			modelPath,
			{},
		)

		const partResults = [{
			language: language!,
			languageName: formatLanguageCodeWithName(language!),
			probability: 1.0,
		}]

		return partResults
	}

	const results = await detectSpeechLanguageByParts(sourceRawAudio, detectLanguageForPart)

	results.sort((entry1, entry2) => entry2.probability - entry1.probability)

	return results
}

/**
 * Converts the JSON report written by whisper.cpp to a transcript and a timeline.
 *
 * A token timeline is built from all tokens of all segments. The transcript is decoded
 * from the token IDs, and the returned timeline is produced by
 * `tokenTimelineToWordTimeline` of the `Whisper` class.
 *
 * @param resultObject Parsed JSON report.
 * @param modelName Whisper model name, used to construct the `Whisper` instance that decodes tokens.
 * @param totalDuration Total audio duration. Used as the end time of the last token of
 * a segment when DTW timing is enabled.
 * @param enableDTW When true, token times are taken from `t_dtw`. Otherwise, from `offsets`.
 */
async function parseResultObject(
	resultObject: WhisperCppVerboseResult,
	modelName: WhisperModelName,
	totalDuration: number,
	enableDTW: boolean): Promise<RecognitionResult> {

	const { Whisper } = await import('../recognition/WhisperSTT.js')

	const whisper = new Whisper(modelName, '', [], [])

	await whisper.initializeTokenizerIfNeeded()

	const tokenTimeline: Timeline = []

	let currentCorrectionTimeOffset = 0
	let lastTokenEndOffset = 0

	for (const segmentObject of resultObject.transcription) {
		const tokens = segmentObject.tokens

		for (let tokenIndex = 0; tokenIndex < tokens.length; tokenIndex++) {
			const tokenObject = tokens[tokenIndex]

			// Workaround whisper.cpp issue with missing offsets by falling back to last known end offset
			// when they are not included
			if (!tokenObject.offsets) {
				tokenObject.offsets = {
					from: lastTokenEndOffset,
					to: lastTokenEndOffset,
				}
			} else {
				lastTokenEndOffset = tokenObject.offsets.to
			}

			// When a segment starts with a '[_BEG_]' token at offset 0, the segment's own start
			// offset is added to all following token times.
			// NOTE(review): presumably token offsets restart from zero at that point — confirm.
			if (tokenIndex === 0 && tokenObject.text === '[_BEG_]' && tokenObject.offsets.from === 0) {
				currentCorrectionTimeOffset = segmentObject.offsets.from / 1000
			}

			const tokenId = tokenObject.id
			const tokenText = whisper.tokenToText(tokenId, true)
			const tokenConfidence = tokenObject.p

			let startTime: number
			let endTime: number

			if (enableDTW) {
				// A token ends where the next token of the same segment starts.
				// NOTE(review): `t_dtw` looks like it's given in 1/100 second units — confirm.
				const nextTokenEntry = tokens[tokenIndex + 1]

				const tokenEntryDtwStartTime = tokenObject.t_dtw / 100
				const nextTokenEntryDtwStartTime = nextTokenEntry ? nextTokenEntry.t_dtw / 100 : totalDuration

				startTime = Math.max(tokenEntryDtwStartTime, 0)
				endTime = nextTokenEntryDtwStartTime
			} else {
				// NOTE(review): offsets look like they're given in milliseconds — confirm.
				startTime = tokenObject.offsets.from / 1000
				endTime = tokenObject.offsets.to / 1000
			}

			startTime += currentCorrectionTimeOffset
			endTime += currentCorrectionTimeOffset

			tokenTimeline.push({
				type: 'token',
				text: tokenText,
				id: tokenId,
				startTime,
				endTime,
				confidence: tokenConfidence
			})
		}
	}

	const allTokenIds = tokenTimeline.map(entry => entry.id!)
	const transcript = whisper.tokensToText(allTokenIds).trim()
	const language = resultObject.result.language
	const timeline = whisper.tokenTimelineToWordTimeline(tokenTimeline, language)

	return {
		transcript,
		timeline,
		language
	}
}

/**
 * Parses standard output lines of the form `[<time range>]  <text>` to a timeline.
 *
 * Lines whose bracketed part isn't accepted by `tryParseTimeRangePatternWithHours`, or
 * that have no text, are skipped. A text starting with a space (or the first accepted
 * text) opens a new entry. Any other text is appended to the previous entry, extending
 * its end time.
 *
 * Not exported, and not currently called within this module.
 *
 * @param lines Output lines.
 * @param entryType Type assigned to every produced timeline entry.
 */
function parseStdOutLinesToTimeline(lines: string[], entryType: TimelineEntryType): RecognitionResult {
	let transcript = ''
	const timeline: Timeline = []

	for (const line of lines) {
		const openingSquareBracketIndex = line.indexOf('[')
		const closingSquareBracketIndex = line.indexOf(']', openingSquareBracketIndex + 1)

		const timeRangeString = line.substring(openingSquareBracketIndex + 1, closingSquareBracketIndex)

		const { startTime, endTime, succeeded } = tryParseTimeRangePatternWithHours(timeRangeString)

		if (!succeeded) {
			continue
		}

		// Skip the closing bracket and the two characters that follow it
		const text = line.substring(closingSquareBracketIndex + 1 + 2)

		if (text.length === 0) {
			continue
		}

		transcript += text

		if (timeline.length === 0 || text.startsWith(' ')) {
			timeline.push({
				type: entryType,
				text: text.trim(),
				startTime: startTime,
				endTime: endTime,
			})
		} else {
			const previousEntry = timeline[timeline.length - 1]

			previousEntry.text += text
			previousEntry.endTime = endTime
		}
	}

	return { transcript, timeline }
}

/**
 * Resolves a whisper.cpp model and loads its package.
 *
 * The `'large'` model ID is treated as `'large-v2'`. When no model ID is given,
 * `'base.en'` is selected for English and `'base'` otherwise (including when no
 * language is given).
 *
 * The package is named `whisper.cpp-<modelId>` and the model file within it
 * `ggml-<modelId>.bin`.
 *
 * @param modelId Model ID, or `undefined` to select a default one.
 * @param languageCode Language code. Dialect codes are reduced with `getShortLanguageCode`.
 * @returns The model name and the path of the model file.
 * @throws Error when an English-only model is given, and the language isn't English
 * (or isn't given).
 */
export async function loadModelPackage(modelId: WhisperCppModelId | undefined, languageCode: string | undefined) {
	if (modelId === 'large') {
		modelId = 'large-v2'
	}

	// Compare using the short language code, so dialect codes are matched as well
	const shortLanguageCode = languageCode ? getShortLanguageCode(languageCode) : undefined

	if (modelId) {
		const modelName = getModelNameFromModelId(modelId)

		if (shortLanguageCode !== 'en' && modelName.endsWith('.en')) {
			throw new Error(`The English-only model '${modelName}' cannot be used with a non-English language '${languageCode}'.`)
		}
	} else {
		if (shortLanguageCode === 'en') {
			modelId = 'base.en'
		} else {
			modelId = 'base'
		}
	}

	const packageName = `whisper.cpp-${modelId}`

	const modelDir = await loadPackage(packageName)
	const modelPath = joinPath(modelDir, `ggml-${modelId}.bin`)
	const modelName = getModelNameFromModelId(modelId)

	return {
		modelName,
		modelPath
	}
}

/**
 * Kind of whisper.cpp build. `'custom'` stands for an executable given by the user
 * in the `executablePath` option.
 */
export type WhisperCppBuild = 'cpu' | 'cublas-12.4.0' | 'custom'

/**
 * Loads the whisper.cpp binaries package matching the given build kind and the current
 * platform and architecture.
 *
 * @param buildKind Build kind. `'custom'` isn't accepted, since it has no package.
 * @returns Path of the `whisper-cli` executable within the package (`whisper-cli.exe` on Windows).
 * @throws Error for the `'custom'` build kind, for an unsupported build kind, or when
 * there's no package for the current platform and architecture. Packages are selected
 * for Windows x64 and Linux x64 only.
 */
export async function loadExecutablePackage(buildKind: WhisperCppBuild) {
	if (buildKind === 'custom') {
		throw new Error(`A 'custom' build kind requires providing a custom path to the 'whisper-cli' executable in the 'executablePath' option.`)
	}

	const platform = process.platform
	const arch = process.arch

	let packageName: string

	if (buildKind.startsWith('cublas-')) {
		if (platform === 'win32' && arch === 'x64') {
			packageName = `whisper.cpp-binaries-windows-x64-${buildKind}-latest`
		} else if (platform === 'linux' && arch === 'x64') {
			packageName = `whisper.cpp-binaries-linux-x64-${buildKind}-latest`
		} else {
			throw new Error(`whisper.cpp GPU builds (NVIDIA CUDA only) are currently only available as packages for Windows x64 and Linux x64. Please specify a custom path to a whisper.cpp 'main' binary in the 'executablePath' option.`)
		}
	} else if (buildKind === 'cpu') {
		if (platform === 'win32' && arch === 'x64') {
			packageName = `whisper.cpp-binaries-windows-x64-cpu-latest`
		} else if (platform === 'linux' && arch === 'x64') {
			packageName = `whisper.cpp-binaries-linux-x64-cpu-latest`
		} else {
			throw new Error(`Couldn't find a matching whisper.cpp binary package. Please specify a custom path to a whisper.cpp 'main' binary in the 'executablePath' option.`)
		}
	} else {
		throw new Error(`Unsupported build kind '${buildKind}'`)
	}

	const packagePath = await loadPackage(packageName)

	let filename = 'whisper-cli' // used to be called 'main' but 'main' is now deprecated

	if (platform === 'win32') {
		filename += '.exe'
	}

	return joinPath(packagePath, filename)
}

/**
 * Derives the Whisper model name from a whisper.cpp model ID.
 *
 * For the `large-*` versions, the matching version name is returned. For any other
 * ID, everything from the last dash onwards is removed (for example,
 * `'tiny.en-q5_1'` gives `'tiny.en'`). An ID without a dash is returned as is.
 */
function getModelNameFromModelId(modelId: WhisperCppModelId): WhisperModelName {
	if (modelId.startsWith('large-v1')) {
		return 'large-v1'
	}

	if (modelId.startsWith('large-v2')) {
		return 'large-v2'
	}

	// Must be tested before 'large-v3', which is a prefix of it
	if (modelId.startsWith('large-v3-turbo')) {
		return 'large-v3-turbo'
	}

	if (modelId.startsWith('large-v3')) {
		return 'large-v3'
	}

	const lastDashIndex = modelId.lastIndexOf('-')

	let modelName: string

	if (lastDashIndex >= 0) {
		modelName = modelId.substring(0, lastDashIndex) as WhisperModelName
	} else {
		modelName = modelId
	}

	return modelName as WhisperModelName
}

/**
 * Shape of the JSON report written by whisper.cpp when run with `--output-json-full`,
 * as read by this module.
 */
export interface WhisperCppVerboseResult {
	model: {
		type: string
		multilingual: boolean
		ftype: number
		mels: number
		vocab: number

		text: {
			ctx: number
			state: number
			head: number
			layer: number
		}

		audio: {
			ctx: number
			state: number
			head: number
			layer: number
		}
	}

	params: {
		language: string
		model: string
		translate: boolean
	}

	result: {
		language: string
	}

	systeminfo: string

	transcription: {
		text: string

		timestamps: {
			from: string
			to: string
		}

		offsets: {
			from: number
			to: number
		}

		tokens: {
			text: string

			timestamps: {
				from: string
				to: string
			}

			offsets: {
				from: number
				to: number
			}

			t_dtw: number
			p: number
			id: number
		}[]
	}[]
}

/** Result of a whisper.cpp run: the transcript, its timeline, and the reported language, if any. */
interface RecognitionResult {
	transcript: string
	timeline: Timeline
	language?: string
}

/** Options for running whisper.cpp. Unset options take their values from `defaultWhisperCppOptions`. */
export interface WhisperCppOptions {
	/** Packaged build to use. Ignored when `executablePath` is set. */
	build?: WhisperCppBuild

	/** Path to a custom whisper.cpp executable. When set, no binaries package is loaded. */
	executablePath?: boolean extends never ? never : string

	/**
	 * Whether to use the GPU. When unset, it's enabled for a custom executable and for
	 * `cublas-*` builds. `--no-gpu` is passed when disabled.
	 */
	enableGPU?: boolean

	/** Model ID. Used, when set, for the translation support check and for the log message. */
	model?: WhisperCppModelId

	/** Passed as `--threads`. */
	threadCount?: number

	/** Passed as `--processors`. */
	splitCount?: number

	/** Passed as `--best-of`. */
	topCandidateCount?: number

	/** Passed as `--beam-size`. */
	beamCount?: number

	/** Passed as `--entropy-thold`. */
	repetitionThreshold?: number

	/** Passed as `--temperature`. */
	temperature?: number

	/** Passed as `--temperature-inc`. */
	temperatureIncrement?: number

	/** Passed as `--prompt`, when not empty. */
	prompt?: string

	/** Take token timing from DTW (`--dtw`). Turned off when `enableFlashAttention` is set. */
	enableDTW?: boolean

	/** Passes `--flash-attn`. */
	enableFlashAttention?: boolean

	/** Log the executable's standard error output as it arrives. */
	verbose?: boolean
}

/** Default values for `WhisperCppOptions`. */
export const defaultWhisperCppOptions: WhisperCppOptions = {
	build: undefined,
	executablePath: undefined,
	model: undefined,
	threadCount: 4,
	splitCount: 1,
	enableGPU: undefined,
	topCandidateCount: 5,
	beamCount: 5,
	repetitionThreshold: 2.4,
	temperature: 0,
	temperatureIncrement: 0.2,
	prompt: undefined,
	enableDTW: false,
	enableFlashAttention: false,
	verbose: false,
}

/** IDs of the whisper.cpp models. A suffix after the last dash (like `-q5_1`) isn't part of the model name. */
export type WhisperCppModelId =
	'tiny' |
	'tiny-q5_1' |
	'tiny.en' |
	'tiny.en-q5_1' |
	'tiny.en-q8_0' |
	'base' |
	'base-q5_1' |
	'base.en' |
	'base.en-q5_1' |
	'small' |
	'small-q5_1' |
	'small.en' |
	'small.en-q5_1' |
	'medium' |
	'medium-q5_0' |
	'medium.en' |
	'medium.en-q5_0' |
	'large' |
	'large-v1' |
	'large-v2' |
	'large-v2-q5_0' |
	'large-v3' |
	'large-v3-q5_0' |
	'large-v3-turbo' |
	'large-v3-turbo-q5_0'