@tanstack/ai
Version:
Type-safe TypeScript AI SDK for streaming chat, tool calling, agents, structured outputs, and multimodal generation.
87 lines (81 loc) • 2.51 kB
text/typescript
import type {
AudioPart,
ImagePart,
MediaInputMetadata,
MediaPrompt,
MediaPromptPart,
TextPart,
VideoPart,
} from '../types'
/**
* A {@link MediaPrompt} decomposed into the views adapters consume.
*
* Adapters with native multimodal prompts (Gemini `contents`, OpenRouter
* chat content parts) consume `parts` to preserve interleaving; named-field
* providers (fal, OpenAI) consume `text` plus the typed media buckets.
*
* Prompt text is **never rewritten**: text parts are concatenated verbatim.
* Providers that support referencing inputs from the prompt (e.g. fal's
* `@Image1`, OpenAI's "image 1" prose) expect the user to write that syntax
* themselves — the SDK does not inject or substitute markers.
*
* The notes on individual fields describe the values produced by
* `resolveMediaPrompt` in this module.
*/
export interface ResolvedMediaPrompt {
/**
* Flattened prompt text. For a string prompt this is the string itself.
* For a parts prompt it is the content of every text part, in prompt
* order, joined with a blank line (`\n\n`); text parts whose content is
* empty are skipped, so they never add a stray separator. Empty string
* for media-only prompts.
*/
text: string
/**
* The prompt as ordered parts; a string prompt becomes one text part.
* For a parts prompt this is the caller's own array (not a copy), so
* mutating it also mutates the original prompt.
*/
parts: Array<MediaPromptPart>
/** Image parts in prompt order (the same objects that appear in `parts`). */
images: Array<ImagePart<MediaInputMetadata>>
/** Video parts in prompt order (the same objects that appear in `parts`). */
videos: Array<VideoPart<MediaInputMetadata>>
/** Audio parts in prompt order (the same objects that appear in `parts`). */
audios: Array<AudioPart<MediaInputMetadata>>
}
/**
 * Split a {@link MediaPrompt} into flattened text plus one bucket per media
 * modality, keeping prompt order in every view. This is the single place
 * where the canonical interleaved prompt shape is converted into the
 * named-field request shapes most providers expose.
 *
 * - A string prompt yields that string as `text`, a single text part as
 *   `parts`, and empty media buckets.
 * - A parts prompt is returned as `parts` unchanged (the same array
 *   reference, not a copy). Non-empty text contents are joined with a blank
 *   line, and image / video / audio parts are collected into their buckets.
 *   Parts of any other type appear only in `parts`.
 *
 * @param prompt - The prompt to decompose, either a plain string or an
 *   ordered list of parts.
 * @returns The text, ordered parts, and per-modality buckets for `prompt`.
 */
export function resolveMediaPrompt(prompt: MediaPrompt): ResolvedMediaPrompt {
  if (typeof prompt !== 'string') {
    const resolved: ResolvedMediaPrompt = {
      text: '',
      parts: prompt,
      images: [],
      videos: [],
      audios: [],
    }
    const paragraphs: Array<string> = []

    for (const part of prompt) {
      if (part.type === 'text') {
        // Empty text contributes nothing, so it never adds a stray separator.
        if (part.content) paragraphs.push(part.content)
      } else if (part.type === 'image') {
        resolved.images.push(part)
      } else if (part.type === 'video') {
        resolved.videos.push(part)
      } else if (part.type === 'audio') {
        resolved.audios.push(part)
      }
    }

    resolved.text = paragraphs.join('\n\n')
    return resolved
  }

  // A bare string is treated as a prompt made of exactly one text part.
  const soleTextPart: TextPart = { type: 'text', content: prompt }
  return {
    text: prompt,
    parts: [soleTextPart],
    images: [],
    videos: [],
    audios: [],
  }
}