echogarden

An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.

// Speech recognition via the OpenAI cloud speech-to-text API, or an OpenAI-compatible endpoint.
import * as FFMpegTranscoder from '../codecs/FFMpegTranscoder.js';
import { createVirtualFileReadStreamForBuffer } from '../utilities/VirtualFileReadStream.js';
import { Logger } from '../utilities/Logger.js';
import { extendDeep } from '../utilities/ObjectUtilities.js';
import { alignSegments } from '../api/Alignment.js';

export async function recognize(rawAudio, languageCode, options, task = 'transcribe') {
	const logger = new Logger();

	logger.start('Load OpenAI module');

	options = extendDeep(defaultOpenAICloudSTTOptions, options);

	// Word timestamps are only requested by default when targeting the official OpenAI endpoint.
	if (options.requestWordTimestamps === undefined) {
		options.requestWordTimestamps = options.baseURL === undefined;
	}

	// Default to 'whisper-1' on the official endpoint; a custom endpoint must name its model explicitly.
	if (options.model === undefined) {
		if (options.baseURL === undefined) {
			options.model = 'whisper-1';
		} else {
			throw new Error(`A custom provider for the OpenAI Cloud API requires specifying a model name`);
		}
	}

	const { default: OpenAI } = await import('openai');

	const openai = new OpenAI(options);

	logger.start('Encode audio to send');

	// Encode the raw audio to MP3 and wrap it in a stream that presents it as an uploaded file.
	const ffmpegOptions = FFMpegTranscoder.getDefaultFFMpegOptionsForSpeech('mp3');
	const encodedAudio = await FFMpegTranscoder.encodeFromChannels(rawAudio, ffmpegOptions);
	const virtualFileStream = createVirtualFileReadStreamForBuffer(encodedAudio, 'audio.mp3');

	logger.start(options.baseURL ? `Send request to ${options.baseURL}` : 'Send request to OpenAI Cloud API');

	// The gpt-4o transcription models only support plain 'json'; Whisper supports
	// 'verbose_json', which includes timestamps.
	let responseFormat;

	if (options.model === 'gpt-4o-mini-transcribe' || options.model === 'gpt-4o-transcribe') {
		responseFormat = 'json';
	} else {
		responseFormat = 'verbose_json';
	}

	let response;

	if (task === 'transcribe') {
		const timestamp_granularities = options.requestWordTimestamps ? ['word', 'segment'] : undefined;

		response = await openai.audio.transcriptions.create({
			file: virtualFileStream,
			model: options.model,
			language: languageCode,
			prompt: options.prompt,
			response_format: responseFormat,
			temperature: options.temperature,
			timestamp_granularities,
		});
	} else if (task === 'translate') {
		response = await openai.audio.translations.create({
			file: virtualFileStream,
			model: options.model,
			prompt: options.prompt,
			response_format: responseFormat,
			temperature: options.temperature,
		});
	} else {
		throw new Error(`Invalid task`);
	}

	const transcript = response.text.trim();

	// Prefer word-level timestamps when the response includes them;
	// otherwise fall back to segment-level timestamps.
	let timeline;

	if (response.words) {
		timeline = response.words.map(entry => ({
			type: 'word',
			text: entry.word,
			startTime: entry.start,
			endTime: entry.end,
		}));
	} else if (response.segments) {
		const segmentTimeline = response.segments.map(entry => ({
			type: 'segment',
			text: entry.text,
			startTime: entry.start,
			endTime: entry.end,
		}));

		if (task === 'transcribe') {
			logger.start('Align segments');

			// Derive finer-grained timing by aligning each segment's text to the audio.
			timeline = await alignSegments(rawAudio, segmentTimeline, { language: languageCode });
		} else {
			timeline = segmentTimeline;
		}
	}

	logger.end();

	return { transcript, timeline };
}

export const defaultOpenAICloudSTTOptions = {
	apiKey: undefined,
	organization: undefined,
	baseURL: undefined,
	model: undefined,
	temperature: 0,
	prompt: undefined,
	timeout: undefined,
	maxRetries: 10,
	requestWordTimestamps: undefined,
};
//# sourceMappingURL=OpenAICloudSTT.js.map
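
For reference, a minimal sketch of calling recognize directly. It assumes the rawAudio shape used elsewhere in echogarden ({ audioChannels: Float32Array[], sampleRate: number }), uses one second of mono silence as placeholder input, and reads the API key from the OPENAI_API_KEY environment variable; the relative import path is also an assumption. In normal use, rawAudio would come from the package's own audio decoding utilities rather than being constructed by hand.

// Sketch only: the rawAudio layout and import path are assumptions, not part of this file.
import { recognize } from './OpenAICloudSTT.js';

const sampleRate = 16000;

// Placeholder input: one second of mono silence in the assumed rawAudio layout.
const rawAudio = {
	audioChannels: [new Float32Array(sampleRate)],
	sampleRate,
};

const { transcript, timeline } = await recognize(rawAudio, 'en', {
	apiKey: process.env.OPENAI_API_KEY,
	model: 'whisper-1',
});

console.log(transcript);
console.log(timeline);

Since whisper-1 uses the 'verbose_json' response format and requestWordTimestamps defaults to true against the official endpoint, the returned timeline here would consist of word-level entries.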