echogarden
An easy-to-use speech toolset. Includes tools for synthesis, recognition, alignment, speech translation, language detection, source separation and more.
import * as FFMpegTranscoder from '../codecs/FFMpegTranscoder.js'
import { RawAudio } from '../audio/AudioUtilities.js'
import { createVirtualFileReadStreamForBuffer } from '../utilities/VirtualFileReadStream.js'
import { Logger } from '../utilities/Logger.js'
import { extendDeep } from '../utilities/ObjectUtilities.js'
import { Timeline, TimelineEntry } from '../utilities/Timeline.js'
import { alignSegments } from '../api/Alignment.js'
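/**
 * Transcribes or translates the given raw audio using the OpenAI cloud
 * speech-to-text API, or an OpenAI-compatible endpoint when a custom
 * `baseURL` is provided in the options.
 */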
export async function recognize(rawAudio: RawAudio, languageCode: string, options: OpenAICloudSTTOptions, task: Task = 'transcribe') {
const logger = new Logger()
logger.start('Load OpenAI module')
options = extendDeep(defaultOpenAICloudSTTOptions, options)
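// By default, only request word-level timestamps when targeting the official
// OpenAI endpoint (that is, when no custom baseURL is set)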
if (options.requestWordTimestamps === undefined) {
options.requestWordTimestamps = options.baseURL === undefined
}
if (options.model === undefined) {
if (options.baseURL === undefined) {
options.model = 'whisper-1'
} else {
throw new Error(`A custom provider for the OpenAI Cloud API requires specifying a model name`)
}
}
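// Dynamically import the OpenAI SDK and create a client. The options object is
// passed through as-is, so SDK settings like apiKey, organization, baseURL,
// timeout and maxRetries take effect here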
const { default: OpenAI } = await import('openai')
const openai = new OpenAI(options)
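// Encode the raw audio to MP3 and wrap the encoded buffer in a virtual file
// read stream, so it can be uploaded like a regular file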
logger.start('Encode audio to send')
const ffmpegOptions = FFMpegTranscoder.getDefaultFFMpegOptionsForSpeech('mp3')
const encodedAudio = await FFMpegTranscoder.encodeFromChannels(rawAudio, ffmpegOptions)
const virtualFileStream = createVirtualFileReadStreamForBuffer(encodedAudio, 'audio.mp3')
logger.start(options.baseURL ? `Send request to ${options.baseURL}` : 'Send request to OpenAI Cloud API')
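// The gpt-4o transcription models only support the plain 'json' response format,
// while 'whisper-1' supports 'verbose_json', which includes timing information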
let responseFormat: 'verbose_json' | 'json'
if (options.model === 'gpt-4o-mini-transcribe' || options.model === 'gpt-4o-transcribe') {
responseFormat = 'json'
} else {
responseFormat = 'verbose_json'
}
let response: VerboseResponse
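// Submit the request. Transcription keeps the source language and can return
// timing information; translation always produces English output and accepts
// neither a language code nor timestamp granularity settings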
if (task === 'transcribe') {
const timestamp_granularities: ('word' | 'segment')[] | undefined =
options.requestWordTimestamps ? ['word', 'segment'] : undefined
response = await openai.audio.transcriptions.create({
file: virtualFileStream,
model: options.model,
language: languageCode,
prompt: options.prompt,
response_format: responseFormat,
temperature: options.temperature,
timestamp_granularities,
}) as any as VerboseResponse
} else if (task === 'translate') {
response = await openai.audio.translations.create({
file: virtualFileStream,
model: options.model,
prompt: options.prompt,
response_format: responseFormat,
temperature: options.temperature,
}) as any as VerboseResponse
} else {
throw new Error(`Invalid task: '${task}'`)
}
const transcript = response.text.trim()
let timeline: Timeline | undefined
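// Build a timeline from the response: prefer word-level entries when available,
// otherwise fall back to segment-level entries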
if (response.words) {
timeline = response.words.map<TimelineEntry>(entry => ({
type: 'word',
text: entry.word,
startTime: entry.start,
endTime: entry.end
}))
} else if (response.segments) {
const segmentTimeline = response.segments.map<TimelineEntry>(entry => ({
type: 'segment',
text: entry.text,
startTime: entry.start,
endTime: entry.end
}))
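// For transcription, derive word-level timing by aligning the recognized
// segments against the source audio. Translated output is in English, so it is
// kept at segment granularity rather than aligned against the original-language audio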
if (task === 'transcribe') {
logger.start('Align segments')
timeline = await alignSegments(rawAudio, segmentTimeline, { language: languageCode })
} else {
timeline = segmentTimeline
}
}
logger.end()
return { transcript, timeline }
}
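// Shape of the response as used here. With the 'verbose_json' format, 'segments'
// is included and 'words' is included when word timestamps were requested; with
// the plain 'json' format, only the transcript text is returned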
interface VerboseResponse {
task: string
language: string
duration: number
text: string
segments?: {
text: string
start: number
end: number
id: number
no_speech_prob: number
compression_ratio: number
avg_logprob: number
seek: number
temperature: number
tokens: number[]
}[]
words?: {
word: string
start: number
end: number
}[]
}
type Task = 'transcribe' | 'translate'
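// Options for the OpenAI cloud speech-to-text engine. The SDK client settings
// (apiKey, organization, baseURL, timeout, maxRetries) are passed directly to the
// OpenAI client constructor; the remaining fields configure the individual request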
export interface OpenAICloudSTTOptions {
model?: 'whisper-1' | 'gpt-4o-mini-transcribe' | 'gpt-4o-transcribe' | string
apiKey?: string
organization?: string
baseURL?: string
temperature?: number
prompt?: string
timeout?: number
maxRetries?: number
requestWordTimestamps?: boolean
}
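// Defaults. model and requestWordTimestamps are resolved at runtime in recognize();
// SDK fields left undefined fall back to the OpenAI client's own defaults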
export const defaultOpenAICloudSTTOptions: OpenAICloudSTTOptions = {
apiKey: undefined,
organization: undefined,
baseURL: undefined,
model: undefined,
temperature: 0,
prompt: undefined,
timeout: undefined,
maxRetries: 10,
requestWordTimestamps: undefined,
}
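// Example usage (a minimal sketch, not part of the module): assumes `rawAudio`
// already holds decoded audio as a `RawAudio`, that the `OPENAI_API_KEY`
// environment variable is set, and that this module is imported from
// './OpenAICloudSTT.js' (path assumed).
//
//   import { recognize } from './OpenAICloudSTT.js'
//
//   const { transcript, timeline } = await recognize(rawAudio, 'en', {
//       model: 'whisper-1',
//       requestWordTimestamps: true,
//   })
//
//   console.log(transcript)
//
//   for (const entry of timeline ?? []) {
//       console.log(`${entry.text}: ${entry.startTime.toFixed(2)}s - ${entry.endTime.toFixed(2)}s`)
//   }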