@tanstack/ai
Version:
Core TanStack AI library - Open source AI SDK
193 lines (172 loc) • 5.45 kB
text/typescript
/**
* TTS Activity
*
* Generates speech audio from text using text-to-speech models.
* This is a self-contained module with implementation, types, and JSDoc.
*/
import { aiEventClient } from '@tanstack/ai-event-client'
import { streamGenerationResult } from '../stream-generation-result.js'
import type { TTSAdapter } from './adapter'
import type { StreamChunk, TTSResult } from '../../types'
// ===========================
// Activity Kind
// ===========================
/**
 * Discriminator for this activity. Options passed to this module must carry
 * an adapter whose `kind` property has exactly this literal type.
 */
export const kind: 'tts' = 'tts'
// ===========================
// Type Extraction Helpers
// ===========================
/**
 * Extract provider options from a TTSAdapter via ~types.
 *
 * Resolves to `TAdapter['~types']['providerOptions']` when `TAdapter` is
 * assignable to `TTSAdapter<any, any>`; any other type falls back to `object`.
 *
 * @template TAdapter - The adapter type to read the provider options from
 */
export type TTSProviderOptions<TAdapter> =
TAdapter extends TTSAdapter<any, any>
? TAdapter['~types']['providerOptions']
: object
// ===========================
// Activity Options Type
// ===========================
/**
 * Options for the TTS activity.
 * The model is extracted from the adapter's model property.
 *
 * Every field except `adapter` and `stream` is forwarded to the adapter's
 * `generateSpeech` call unchanged; this module performs no validation of
 * the values.
 *
 * @template TAdapter - The TTS adapter type
 * @template TStream - Whether to stream the output
 */
export interface TTSActivityOptions<
TAdapter extends TTSAdapter<string, object>,
TStream extends boolean = false,
> {
/**
 * The TTS adapter to use (must be created with a model).
 * Its `kind` must match this module's `kind` constant (`'tts'`).
 */
adapter: TAdapter & { kind: typeof kind }
/** The text to convert to speech */
text: string
/**
 * The voice to use for generation.
 * NOTE(review): accepted voice names are presumably provider-specific;
 * confirm against the adapter in use.
 */
voice?: string
/** The output audio format */
format?: 'mp3' | 'opus' | 'aac' | 'flac' | 'wav' | 'pcm'
/**
 * The speed of the generated audio (0.25 to 4.0).
 * The range is not enforced by this module.
 */
speed?: number
/** Provider-specific options for TTS generation, typed from the adapter */
modelOptions?: TTSProviderOptions<TAdapter>
/**
 * Whether to stream the generation result.
 * When true, returns an AsyncIterable<StreamChunk> for streaming transport.
 * When false or not provided, returns a Promise<TTSResult>.
 *
 * @default false
 */
stream?: TStream
}
// ===========================
// Activity Result Type
// ===========================
/**
 * Result type for the TTS activity.
 * - If stream is true: AsyncIterable<StreamChunk>
 * - Otherwise: Promise<TTSResult>
 *
 * @template TStream - The `stream` flag type taken from the activity options;
 * defaults to `false`, i.e. the promise form
 */
export type TTSActivityResult<TStream extends boolean = false> =
TStream extends true ? AsyncIterable<StreamChunk> : Promise<TTSResult>
/**
 * Builds an identifier of the form `<prefix>-<epoch millis>-<random suffix>`.
 *
 * The suffix is up to seven base-36 characters derived from `Math.random()`,
 * so the result is suitable for correlating events, not for security purposes.
 *
 * @param prefix - Text placed at the start of the identifier
 * @returns The generated identifier
 */
function createId(prefix: string): string {
  const createdAt = Date.now()
  // Drop the leading "0." of the base-36 fraction and keep at most 7 characters.
  const randomSuffix = Math.random().toString(36).slice(2, 9)
  return [prefix, createdAt, randomSuffix].join('-')
}
// ===========================
// Activity Implementation
// ===========================
/**
 * TTS activity entry point: turns text into speech audio using the given
 * adapter.
 *
 * The `stream` option selects the shape of the return value:
 * - falsy or omitted: a `Promise<TTSResult>` from a single generation call
 * - truthy: the same generation, deferred behind a thunk and handed to
 *   `streamGenerationResult`, returned as an `AsyncIterable<StreamChunk>`
 *
 * @example Generate speech from text
 * ```ts
 * import { generateSpeech } from '@tanstack/ai'
 * import { openaiTTS } from '@tanstack/ai-openai'
 *
 * const result = await generateSpeech({
 *   adapter: openaiTTS('tts-1-hd'),
 *   text: 'Hello, welcome to TanStack AI!',
 *   voice: 'nova'
 * })
 *
 * console.log(result.audio) // base64-encoded audio
 * ```
 *
 * @example With format and speed options
 * ```ts
 * const result = await generateSpeech({
 *   adapter: openaiTTS('tts-1'),
 *   text: 'This is slower speech.',
 *   voice: 'alloy',
 *   format: 'wav',
 *   speed: 0.8
 * })
 * ```
 *
 * @param options - Adapter, text and optional voice/format/speed/model options
 * @returns A promise of the result, or an async iterable of stream chunks
 */
export function generateSpeech<
  TAdapter extends TTSAdapter<string, object>,
  TStream extends boolean = false,
>(options: TTSActivityOptions<TAdapter, TStream>): TTSActivityResult<TStream> {
  // Default path: one-shot generation, returned as a promise.
  if (!options.stream) {
    return runGenerateSpeech(options) as TTSActivityResult<TStream>
  }

  // Streaming path: pass a thunk so the generation is started by the stream helper.
  const produceResult = () => runGenerateSpeech(options)
  return streamGenerationResult(produceResult) as TTSActivityResult<TStream>
}
/**
 * Performs one non-streaming speech generation and reports it on the event
 * client.
 *
 * Sequence:
 * 1. Emit `speech:request:started` with the request parameters.
 * 2. Call `adapter.generateSpeech` with every option except `adapter` and
 *    `stream`, plus the adapter's `model`.
 * 3. After the adapter resolves, emit `speech:request:completed` with the
 *    result fields and the elapsed time in milliseconds, then return the
 *    result as received.
 *
 * No event is emitted here when the adapter fails; the rejection propagates
 * to the caller.
 *
 * @param options - The activity options; `stream` is ignored at this level
 * @returns The result produced by the adapter
 */
async function runGenerateSpeech<TAdapter extends TTSAdapter<string, object>>(
  options: TTSActivityOptions<TAdapter, boolean>,
): Promise<TTSResult> {
  // `stream` is stripped here so it is not forwarded to the adapter.
  const { adapter, stream: _stream, ...speechParams } = options
  const { text, voice, format, speed } = speechParams
  const modelOptions = speechParams.modelOptions as
    | Record<string, unknown>
    | undefined

  const model = adapter.model
  const requestId = createId('speech')
  const startTime = Date.now()

  aiEventClient.emit('speech:request:started', {
    requestId,
    provider: adapter.name,
    model,
    text,
    voice,
    format,
    speed,
    modelOptions,
    timestamp: startTime,
  })

  const result = await adapter.generateSpeech({ ...speechParams, model })

  // Time spent waiting on the adapter, measured from just before the start event.
  const duration = Date.now() - startTime
  aiEventClient.emit('speech:request:completed', {
    requestId,
    provider: adapter.name,
    model,
    audio: result.audio,
    format: result.format,
    audioDuration: result.duration,
    contentType: result.contentType,
    duration,
    modelOptions,
    timestamp: Date.now(),
  })

  return result
}
// ===========================
// Options Factory
// ===========================
/**
 * Identity helper for building `generateSpeech()` options ahead of time.
 *
 * Nothing is executed: the same object reference is handed back. The function
 * exists so the options literal is checked against the adapter's types where
 * it is defined.
 *
 * @param options - The options to type-check
 * @returns The `options` argument, unchanged
 */
export function createSpeechOptions<
  TAdapter extends TTSAdapter<string, object>,
  TStream extends boolean = false,
>(
  options: TTSActivityOptions<TAdapter, TStream>,
): TTSActivityOptions<TAdapter, TStream> {
  // Pure pass-through; no copy is made.
  const typedOptions: TTSActivityOptions<TAdapter, TStream> = options
  return typedOptions
}
// Re-export the adapter contract from './adapter' so it can be imported from
// this module: TTSAdapter, TTSAdapterConfig and AnyTTSAdapter are type-only
// exports, while BaseTTSAdapter is exported as a value.
export type { TTSAdapter, TTSAdapterConfig, AnyTTSAdapter } from './adapter'
export { BaseTTSAdapter } from './adapter'