@tanstack/ai
Version:
Type-safe TypeScript AI SDK for streaming chat, tool calling, agents, structured outputs, and multimodal generation.
311 lines (284 loc) • 8.89 kB
text/typescript
/**
* Transcription Activity
*
* Transcribes audio to text using speech-to-text models.
* This is a self-contained module with implementation, types, and JSDoc.
*/
import { aiEventClient } from '@tanstack/ai-event-client'
import { streamGenerationResult } from '../stream-generation-result.js'
import { resolveDebugOption } from '../../logger/resolve'
import {
createGenerationContext,
runGenerationError,
runGenerationFinish,
runGenerationStart,
runGenerationUsage,
} from '../middleware'
import type { InternalLogger } from '../../logger/internal-logger'
import type { DebugOption } from '../../logger/types'
import type { GenerationMiddleware } from '../middleware'
import type { TranscriptionAdapter } from './adapter'
import type { StreamChunk, TranscriptionResult } from '../../types'
// ===========================
// Activity Kind
// ===========================
/** The adapter kind this activity handles */
export const kind = 'transcription' as const
// ===========================
// Type Extraction Helpers
// ===========================
/**
* Extract provider options from a TranscriptionAdapter via ~types.
*/
export type TranscriptionProviderOptions<TAdapter> =
TAdapter extends TranscriptionAdapter<any, any>
? TAdapter['~types']['providerOptions']
: object
// ===========================
// Activity Options Type
// ===========================
/**
* Options for the transcription activity.
* The model is extracted from the adapter's model property.
*
* @template TAdapter - The transcription adapter type
* @template TStream - Whether to stream the output
*/
export interface TranscriptionActivityOptions<
TAdapter extends TranscriptionAdapter<
string,
TranscriptionProviderOptions<TAdapter>
>,
TStream extends boolean = false,
> {
/** The transcription adapter to use (must be created with a model) */
adapter: TAdapter & { kind: typeof kind }
/** The audio data to transcribe - can be base64 string, File, Blob, or Buffer */
audio: string | File | Blob | ArrayBuffer
/** The language of the audio in ISO-639-1 format (e.g., 'en') */
language?: string
/** An optional prompt to guide the transcription */
prompt?: string
/** The format of the transcription output */
responseFormat?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt'
/** Provider-specific options for transcription */
modelOptions?: TranscriptionProviderOptions<TAdapter>
/**
* Whether to stream the transcription result.
* When true, returns an AsyncIterable<StreamChunk> for streaming transport.
* When false or not provided, returns a Promise<TranscriptionResult>.
*
* @default false
*/
stream?: TStream
/**
* Enable debug logging. Pass `true` to enable all categories, `false` to
* silence everything including errors, or a `DebugConfig` object for granular
* control and/or a custom `Logger`.
*/
debug?: DebugOption
/**
* Observe-only middleware notified on start, usage, success, and error. Pass
* `otelMiddleware()` to emit OpenTelemetry spans, or implement the
* `GenerationMiddleware` contract for a custom backend.
*/
middleware?: Array<GenerationMiddleware>
}
// ===========================
// Activity Result Type
// ===========================
/**
* Result type for the transcription activity.
* - If stream is true: AsyncIterable<StreamChunk>
* - Otherwise: Promise<TranscriptionResult>
*/
export type TranscriptionActivityResult<TStream extends boolean = false> =
TStream extends true
? AsyncIterable<StreamChunk>
: Promise<TranscriptionResult>
function createId(prefix: string): string {
return `${prefix}-${Date.now()}-${Math.random().toString(36).slice(2, 9)}`
}
// ===========================
// Activity Implementation
// ===========================
/**
* Transcription activity - converts audio to text.
*
* Uses AI speech-to-text models to transcribe audio content.
*
* @example Transcribe an audio file
* ```ts
* import { generateTranscription } from '@tanstack/ai'
* import { openaiTranscription } from '@tanstack/ai-openai'
*
* const result = await generateTranscription({
* adapter: openaiTranscription('whisper-1'),
* audio: audioFile, // File, Blob, or base64 string
* language: 'en'
* })
*
* console.log(result.text)
* ```
*
* @example With verbose output for timestamps
* ```ts
* const result = await generateTranscription({
* adapter: openaiTranscription('whisper-1'),
* audio: audioFile,
* responseFormat: 'verbose_json'
* })
*
* result.segments?.forEach(segment => {
* console.log(`[${segment.start}s - ${segment.end}s]: ${segment.text}`)
* })
* ```
*
* @example Streaming transcription result
* ```ts
* for await (const chunk of generateTranscription({
* adapter: openaiTranscription('whisper-1'),
* audio: audioFile,
* stream: true
* })) {
* console.log(chunk)
* }
* ```
*/
export function generateTranscription<
TAdapter extends TranscriptionAdapter<
string,
TranscriptionProviderOptions<TAdapter>
>,
TStream extends boolean = false,
>(
options: TranscriptionActivityOptions<TAdapter, TStream>,
): TranscriptionActivityResult<TStream> {
if (options.stream) {
return streamGenerationResult(() =>
runGenerateTranscription(options),
) as TranscriptionActivityResult<TStream>
}
return runGenerateTranscription(
options,
) as TranscriptionActivityResult<TStream>
}
/**
* Run non-streaming transcription
*/
async function runGenerateTranscription<
TAdapter extends TranscriptionAdapter<
string,
TranscriptionProviderOptions<TAdapter>
>,
>(
options: TranscriptionActivityOptions<TAdapter, boolean>,
): Promise<TranscriptionResult> {
const {
adapter,
stream: _stream,
debug: _debug,
middleware,
...rest
} = options
const model = adapter.model
const requestId = createId('transcription')
const startTime = Date.now()
const logger: InternalLogger = resolveDebugOption(options.debug)
const providerName =
(adapter as { name?: string; provider?: string }).provider ??
(adapter as { name?: string }).name ??
'unknown'
const mwCtx = createGenerationContext({
requestId,
activity: 'transcription',
provider: adapter.name,
model,
modelOptions: rest.modelOptions,
createId,
})
await runGenerationStart(middleware, mwCtx)
aiEventClient.emit('transcription:request:started', {
requestId,
provider: adapter.name,
model,
language: rest.language,
prompt: rest.prompt,
responseFormat: rest.responseFormat,
modelOptions: rest.modelOptions as Record<string, unknown> | undefined,
timestamp: startTime,
})
logger.request(`activity=generateTranscription provider=${providerName}`, {
provider: providerName,
model,
})
try {
const result = await adapter.transcribe({ ...rest, model, logger })
const duration = Date.now() - startTime
aiEventClient.emit('transcription:request:completed', {
requestId,
provider: adapter.name,
model,
text: result.text,
language: result.language,
duration,
modelOptions: rest.modelOptions as Record<string, unknown> | undefined,
timestamp: Date.now(),
})
logger.output(
`activity=generateTranscription length=${result.text.length}`,
{ hasText: !!result.text },
)
if (result.usage) await runGenerationUsage(middleware, mwCtx, result.usage)
await runGenerationFinish(middleware, mwCtx, {
duration,
usage: result.usage,
})
return result
} catch (error) {
const duration = Date.now() - startTime
const err = error as Error
aiEventClient.emit('transcription:request:error', {
requestId,
provider: adapter.name,
model,
error: { message: err.message, name: err.name },
duration,
modelOptions: rest.modelOptions as Record<string, unknown> | undefined,
timestamp: Date.now(),
})
await runGenerationError(middleware, mwCtx, {
error,
duration,
})
logger.errors('generateTranscription activity failed', {
error,
source: 'generateTranscription',
})
throw error
}
}
// ===========================
// Options Factory
// ===========================
/**
* Create typed options for the generateTranscription() function without executing.
*/
export function createTranscriptionOptions<
TAdapter extends TranscriptionAdapter<
string,
TranscriptionProviderOptions<TAdapter>
>,
TStream extends boolean = false,
>(
options: TranscriptionActivityOptions<TAdapter, TStream>,
): TranscriptionActivityOptions<TAdapter, TStream> {
return options
}
// Re-export adapter types
export type {
TranscriptionAdapter,
TranscriptionAdapterConfig,
AnyTranscriptionAdapter,
} from './adapter'
export { BaseTranscriptionAdapter } from './adapter'