// echogarden
// An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment,
// speech translation, language detection, source separation and more.
import { spawn } from 'node:child_process';
import { encodeRawAudioToWave, getRawAudioDuration } from '../audio/AudioUtilities.js';
import { Logger } from '../utilities/Logger.js';
import { getRandomHexString } from '../utilities/Utilities.js';
import { tryParseTimeRangePatternWithHours } from '../subtitles/Subtitles.js';
import { getAppTempDir, joinPath } from '../utilities/PathUtilities.js';
import { appName } from '../api/Common.js';
import { readAndParseJsonFile, remove } from '../utilities/FileSystem.js';
import { splitToLines } from '../nlp/Segmentation.js';
import { extendDeep } from '../utilities/ObjectUtilities.js';
import { formatLanguageCodeWithName, getShortLanguageCode } from '../utilities/Locale.js';
import { loadPackage } from '../utilities/PackageManager.js';
import { detectSpeechLanguageByParts } from '../api/SpeechLanguageDetection.js';
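/*
	Recognizes speech by spawning the command-line whisper.cpp executable ('whisper-cli').

	The source audio is encoded to WAVE and piped to the child process over stdin (the trailing '-'
	argument tells whisper.cpp to read from standard input). Progress lines printed to stdout are
	logged, and the full result is read back from a temporary JSON file produced via
	'--output-json-full', then converted to a word timeline.

	A minimal usage sketch (hypothetical caller code, assuming `rawAudio` is a 16 kHz raw audio
	object in the shape used elsewhere in this module; any `task` value other than 'translate' or
	'detect-language' results in plain transcription):

		const { modelName, modelPath } = await loadModelPackage('base.en', 'en');
		const result = await recognize(rawAudio, 'transcribe', 'en', modelName, modelPath, {});
		console.log(result.transcript);
*/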
export async function recognize(sourceRawAudio, task, sourceLanguage, modelName, modelPath, options) {
return new Promise(async (resolve, reject) => {
const logger = new Logger();
if (sourceRawAudio.sampleRate !== 16000) {
// Reject rather than throw: a throw inside an async Promise executor would be silently swallowed
reject(new Error('Source audio must have a sample rate of 16000 Hz'));
return;
}
options = extendDeep(defaultWhisperCppOptions, options);
let buildKind;
let executablePath;
if (options.executablePath) {
buildKind = 'custom';
executablePath = options.executablePath;
if (options.enableGPU == null) {
options.enableGPU = true;
}
}
else {
if (options.build) {
buildKind = options.build;
if (options.enableGPU == null) {
options.enableGPU = buildKind.startsWith('cublas-');
}
else if (options.enableGPU === true && !buildKind.startsWith('cublas-')) {
reject(new Error('GPU support is only available for CUDA builds'));
return;
}
}
else {
if (options.enableGPU) {
buildKind = 'cublas-12.4.0';
}
else {
buildKind = 'cpu';
}
}
logger.end();
executablePath = await loadExecutablePackage(buildKind);
}
if (options.enableFlashAttention && options.enableDTW) {
// DTW token-level timestamps rely on cross-attention weights, which whisper.cpp's flash attention
// path doesn't expose, so DTW is disabled when flash attention is enabled
options.enableDTW = false;
}
// Check against the resolved model name, since 'options.model' may be undefined
if (task === 'translate' && modelName.startsWith('large-v3-turbo')) {
reject(new Error(`The 'large-v3-turbo' model doesn't support translation tasks.`));
return;
}
logger.start(`Recognize with command-line whisper.cpp (model: ${options.model || modelName}, build: ${buildKind})`);
logger.log('');
logger.log('');
const sourceAsWave = encodeRawAudioToWave(sourceRawAudio);
const tempDirPath = getAppTempDir(appName);
const outJsonFilePathWithoutExtension = joinPath(tempDirPath, `${getRandomHexString(16)}`);
const outJsonFilePath = `${outJsonFilePathWithoutExtension}.json`;
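// Map the options to whisper.cpp command-line arguments. Note that '--output-file' takes the
// output path without an extension; whisper.cpp appends '.json' when '--output-json-full' is used.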
const args = [
'--output-json-full',
'--output-file',
outJsonFilePathWithoutExtension,
'--model',
modelPath,
'--language',
sourceLanguage || 'auto',
'--threads',
`${options.threadCount}`,
'--processors',
`${options.splitCount}`,
'--best-of',
`${options.topCandidateCount}`,
'--beam-size',
`${options.beamCount}`,
'--entropy-thold',
`${options.repetitionThreshold}`,
'--temperature',
`${options.temperature}`,
'--temperature-inc',
`${options.temperatureIncrement}`,
];
if (options.prompt) {
args.push('--prompt', options.prompt);
}
if (!options.enableGPU) {
args.push('--no-gpu');
}
args.push('--max-len', '0');
if (options.enableDTW) {
// The '--dtw' preset names use dots instead of dashes, e.g. 'large-v3' becomes 'large.v3'
args.push('--dtw', modelName.replaceAll('-', '.'));
}
if (options.enableFlashAttention) {
args.push('--flash-attn');
}
if (task === 'translate') {
args.push('--translate');
}
else if (task === 'detect-language') {
args.push('--detect-language');
}
const argsString = args.join(' ');
const process = spawn(executablePath, [...args, '-']);
const stdoutLines = [];
let stderrOutput = '';
process.stdout.setEncoding('utf8');
process.stdout.on('data', (str) => {
if (task === 'detect-language') {
return;
}
const parts = splitToLines(str)
.map(line => line.trim())
.filter(line => line.length > 0);
logger.log(parts.join('\n'));
stdoutLines.push(...parts);
});
process.stderr.setEncoding('utf8');
process.stderr.on('data', (str) => {
if (options.verbose) {
logger.log(str);
}
stderrOutput += str;
});
process.on('error', (e) => {
reject(e);
});
process.on('close', async (exitCode) => {
logger.end();
if (exitCode === 0) {
// Guard the async work: an error thrown in this handler (for example while reading the output
// JSON file) would otherwise become an unhandled rejection and the promise would never settle
try {
const parsedStdOut = parseStdOutLinesToTimeline(stdoutLines, 'word');
const resultObject = await readAndParseJsonFile(outJsonFilePath);
await remove(outJsonFilePath);
if (task === 'detect-language') {
resolve({ timeline: [], transcript: '', language: resultObject.result.language });
}
else {
const parsedResultObject = await parseResultObject(resultObject, modelName, getRawAudioDuration(sourceRawAudio), options.enableDTW);
resolve(parsedResultObject);
}
}
catch (e) {
reject(e);
}
}
else {
logger.log(stderrOutput);
reject(new Error(`whisper.cpp exited with code ${exitCode}`));
}
});
//writeToStdinInChunks(process, sourceAsWave, 2 ** 10)
process.stdin.end(sourceAsWave);
});
}
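/*
	Detects the spoken language by splitting the audio into parts and running whisper.cpp in
	'detect-language' mode on each part. Each part yields a single candidate with probability 1.0;
	the results aggregated by detectSpeechLanguageByParts are then sorted by descending probability.
*/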
export async function detectLanguage(sourceRawAudio, modelName, modelPath) {
if (sourceRawAudio.sampleRate !== 16000) {
throw new Error('Source audio must have a sample rate of 16000 Hz');
}
async function detectLanguageForPart(partAudio) {
const { language } = await recognize(partAudio, 'detect-language', undefined, modelName, modelPath, {});
const partResults = [{
language: language,
languageName: formatLanguageCodeWithName(language),
probability: 1.0,
}];
return partResults;
}
const results = await detectSpeechLanguageByParts(sourceRawAudio, detectLanguageForPart);
results.sort((entry1, entry2) => entry2.probability - entry1.probability);
return results;
}
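/*
	Converts the whisper.cpp full JSON output to a token timeline and then to a word timeline.

	Token start and end times come either from DTW token timestamps ('t_dtw', in centiseconds,
	with the next token's DTW time used as the end time) or from the token offsets reported in
	milliseconds (with a per-segment correction applied when the offsets appear to be relative to
	the segment). Token text is decoded using the Whisper tokenizer for the given model.
*/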
async function parseResultObject(resultObject, modelName, totalDuration, enableDTW) {
const { Whisper } = await import('../recognition/WhisperSTT.js');
const whisper = new Whisper(modelName, '', [], []);
await whisper.initializeTokenizerIfNeeded();
const tokenTimeline = [];
let currentCorrectionTimeOffset = 0;
let lastTokenEndOffset = 0;
for (let segmentIndex = 0; segmentIndex < resultObject.transcription.length; segmentIndex++) {
const segmentObject = resultObject.transcription[segmentIndex];
const tokens = segmentObject.tokens;
for (let tokenIndex = 0; tokenIndex < tokens.length; tokenIndex++) {
const tokenObject = tokens[tokenIndex];
// Workaround whisper.cpp issue with missing offsets by falling back to last known end offset
// when they are not included
if (!tokenObject.offsets) {
tokenObject.offsets = {
from: lastTokenEndOffset,
to: lastTokenEndOffset,
};
}
else {
lastTokenEndOffset = tokenObject.offsets.to;
}
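// When a segment's first token is the '[_BEG_]' marker with a zero offset, the token offsets
// appear to be relative to the segment rather than to the whole audio, so the segment's start
// time (converted from milliseconds to seconds) is applied as a correction offset below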
if (tokenIndex === 0 && tokenObject.text === '[_BEG_]' && tokenObject.offsets.from === 0) {
currentCorrectionTimeOffset = segmentObject.offsets.from / 1000;
}
const tokenId = tokenObject.id;
const tokenText = whisper.tokenToText(tokenId, true);
const tokenConfidence = tokenObject.p;
let startTime;
let endTime;
if (enableDTW) {
const nextTokenEntry = tokens[tokenIndex + 1];
const tokenEntryDtwStartTime = tokenObject.t_dtw / 100;
const nextTokenEntryDtwStartTime = nextTokenEntry ? nextTokenEntry.t_dtw / 100 : totalDuration;
startTime = Math.max(tokenEntryDtwStartTime, 0);
endTime = nextTokenEntryDtwStartTime;
}
else {
startTime = tokenObject.offsets.from / 1000;
endTime = tokenObject.offsets.to / 1000;
}
startTime += currentCorrectionTimeOffset;
endTime += currentCorrectionTimeOffset;
tokenTimeline.push({
type: 'token',
text: tokenText,
id: tokenId,
startTime,
endTime,
confidence: tokenConfidence
});
}
}
const allTokenIds = tokenTimeline.map(entry => entry.id);
const transcript = whisper.tokensToText(allTokenIds).trim();
const language = resultObject.result.language;
const timeline = whisper.tokenTimelineToWordTimeline(tokenTimeline, language);
return {
transcript,
timeline,
language
};
}
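/*
	Parses the recognition lines whisper.cpp prints to stdout, which look roughly like
	'[00:00:00.000 --> 00:00:01.500]   text', into a transcript string and a timeline.
	A chunk that doesn't start with a space continues the previous word, so it is merged
	into the preceding timeline entry.
*/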
function parseStdOutLinesToTimeline(lines, entryType) {
let transcript = '';
const timeline = [];
for (const line of lines) {
const openingSquareBracketIndex = line.indexOf('[');
const closingSquareBracketIndex = line.indexOf(']', openingSquareBracketIndex + 1);
const timeRangeString = line.substring(openingSquareBracketIndex + 1, closingSquareBracketIndex);
const { startTime, endTime, succeeded } = tryParseTimeRangePatternWithHours(timeRangeString);
if (!succeeded) {
continue;
}
const text = line.substring(closingSquareBracketIndex + 1 + 2);
if (text.length === 0) {
continue;
}
transcript += text;
if (timeline.length === 0 || text.startsWith(' ')) {
timeline.push({
type: entryType,
text: text.trim(),
startTime: startTime,
endTime: endTime,
});
}
else {
const previousEntry = timeline[timeline.length - 1];
previousEntry.text += text;
previousEntry.endTime = endTime;
}
}
return { transcript, timeline };
}
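/*
	Resolves and loads the whisper.cpp model package for the given model identifier.
	When no model identifier is given, 'base.en' is chosen for English and 'base' otherwise.
	Returns the resolved model name and the local path to the 'ggml-<modelId>.bin' model file.
*/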
export async function loadModelPackage(modelId, languageCode) {
if (modelId === 'large') {
modelId = 'large-v2';
}
if (modelId) {
const modelName = getModelNameFromModelId(modelId);
// Compare using the short language code so that variants like 'en-US' are accepted for English-only models
if ((!languageCode || getShortLanguageCode(languageCode) !== 'en') && modelName.endsWith('.en')) {
throw new Error(`The English-only model '${modelName}' cannot be used with a non-English language '${languageCode}'.`);
}
}
else {
if (languageCode) {
const shortLanguageCode = getShortLanguageCode(languageCode);
modelId = shortLanguageCode == 'en' ? 'base.en' : 'base';
}
else {
modelId = 'base';
}
}
const packageName = `whisper.cpp-${modelId}`;
const modelDir = await loadPackage(packageName);
const modelPath = joinPath(modelDir, `ggml-${modelId}.bin`);
const modelName = getModelNameFromModelId(modelId);
return { modelName, modelPath };
}
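/*
	Resolves and loads a prebuilt whisper.cpp executable package for the requested build kind.
	CPU and CUDA ('cublas-*') builds are only packaged for Windows x64 and Linux x64; on other
	platforms, a custom 'executablePath' must be provided instead.
*/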
export async function loadExecutablePackage(buildKind) {
if (buildKind === 'custom') {
throw new Error(`A 'custom' build kind requires providing a custom path to the 'whisper-cli' executable in the 'executablePath' option.`);
}
const platform = process.platform;
const arch = process.arch;
let packageName;
if (buildKind.startsWith('cublas-')) {
if (platform === 'win32' && arch === 'x64') {
packageName = `whisper.cpp-binaries-windows-x64-${buildKind}-latest`;
}
else if (platform === 'linux' && arch === 'x64') {
packageName = `whisper.cpp-binaries-linux-x64-${buildKind}-latest`;
}
else {
throw new Error(`whisper.cpp GPU builds (NVIDIA CUDA only) are currently only available as packages for Windows x64 and Linux x64. Please specify a custom path to a whisper.cpp 'whisper-cli' binary in the 'executablePath' option.`);
}
}
else if (buildKind === 'cpu') {
if (platform === 'win32' && arch === 'x64') {
packageName = `whisper.cpp-binaries-windows-x64-cpu-latest`;
}
else if (platform === 'linux' && arch === 'x64') {
packageName = `whisper.cpp-binaries-linux-x64-cpu-latest`;
}
else {
throw new Error(`Couldn't find a matching whisper.cpp binary package. Please specify a custom path to a whisper.cpp 'whisper-cli' binary in the 'executablePath' option.`);
}
}
else {
throw new Error(`Unsupported build kind '${buildKind}'`);
}
const packagePath = await loadPackage(packageName);
let filename = 'whisper-cli'; // used to be called 'main' but 'main' is now deprecated
if (platform === 'win32') {
filename += '.exe';
}
return joinPath(packagePath, filename);
}
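// Derives the base model name from a model identifier, stripping a quantization suffix when
// present (e.g. 'tiny.en-q5_1' -> 'tiny.en') while preserving the version of 'large-v*' models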
function getModelNameFromModelId(modelId) {
if (modelId.startsWith('large-v1')) {
return 'large-v1';
}
if (modelId.startsWith('large-v2')) {
return 'large-v2';
}
if (modelId.startsWith('large-v3-turbo')) {
return 'large-v3-turbo';
}
if (modelId.startsWith('large-v3')) {
return 'large-v3';
}
const lastDashIndex = modelId.lastIndexOf('-');
let modelName;
if (lastDashIndex >= 0) {
modelName = modelId.substring(0, lastDashIndex);
}
else {
modelName = modelId;
}
return modelName;
}
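// Default options, merged into user-provided options via extendDeep. 'threadCount', 'splitCount',
// 'topCandidateCount' and 'beamCount' map to whisper.cpp's '--threads', '--processors',
// '--best-of' and '--beam-size' arguments; 'repetitionThreshold' maps to '--entropy-thold'.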
export const defaultWhisperCppOptions = {
build: undefined,
executablePath: undefined,
model: undefined,
threadCount: 4,
splitCount: 1,
enableGPU: undefined,
topCandidateCount: 5,
beamCount: 5,
repetitionThreshold: 2.4,
temperature: 0,
temperatureIncrement: 0.2,
prompt: undefined,
enableDTW: false,
enableFlashAttention: false,
verbose: false,
};
//# sourceMappingURL=WhisperCppSTT.js.map