UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications…

559 lines (558 loc) 17 kB
/**
 * Multimodal Content Types for NeuroLink
 *
 * One place for every multimodal input/output type. The definitions here
 * consolidate what previously lived in content.ts and conversation.ts so
 * that multimodal functionality has a single source of truth.
 *
 * @module types/multimodal
 *
 * @example Basic Multimodal Input
 * ```typescript
 * import type { MultimodalInput } from './types/multimodal.js';
 *
 * const input: MultimodalInput = {
 *   text: "What's in this image?",
 *   images: [imageBuffer, "https://example.com/image.jpg"],
 *   pdfFiles: [pdfBuffer]
 * };
 * ```
 *
 * @example Audio/Video Input (Future)
 * ```typescript
 * const avInput: MultimodalInput = {
 *   text: "Transcribe this audio and analyze this video",
 *   audioFiles: [audioBuffer],
 *   videoFiles: ["path/to/video.mp4"]
 * };
 * ```
 *
 * @example Advanced Content Array
 * ```typescript
 * const advanced: MultimodalInput = {
 *   text: "irrelevant", // ignored when content[] is provided
 *   content: [
 *     { type: "text", text: "Analyze these items:" },
 *     { type: "image", data: imageBuffer, mediaType: "image/jpeg" },
 *     { type: "pdf", data: pdfBuffer, metadata: { filename: "report.pdf" } }
 *   ]
 * };
 * ```
 */

/**
 * Plain-text part of a multimodal message.
 */
export type TextContent = {
  type: "text";
  text: string;
};

/**
 * Image part of a multimodal message.
 */
export type ImageContent = {
  type: "image";
  data: Buffer | string;
  /** Alternative text for accessibility (screen readers, SEO). */
  altText?: string;
  mediaType?:
    | "image/jpeg"
    | "image/png"
    | "image/gif"
    | "image/webp"
    | "image/bmp"
    | "image/tiff";
  metadata?: {
    description?: string;
    quality?: "low" | "high" | "auto";
    dimensions?: {
      width: number;
      height: number;
    };
    filename?: string;
  };
};

/**
 * CSV part of a multimodal message.
 */
export type CSVContent = {
  type: "csv";
  data: Buffer | string;
  metadata?: {
    filename?: string;
    maxRows?: number;
    formatStyle?: "raw" | "markdown" | "json";
    description?: string;
  };
};

/**
 * PDF document part of a multimodal message.
 */
export type PDFContent = {
  type: "pdf";
  data: Buffer | string;
  metadata?: {
    filename?: string;
    pages?: number;
    version?: string;
    description?: string;
  };
};

/**
 * Audio part of a multimodal message.
 *
 * NOTE: This describes FILE-BASED audio input, not streaming. For streaming
 * audio (live transcription) use AudioInputSpec from streamTypes.ts.
 *
 * @example
 * ```typescript
 * const audioContent: AudioContent = {
 *   type: "audio",
 *   data: audioBuffer,
 *   mediaType: "audio/mpeg",
 *   metadata: {
 *     filename: "recording.mp3",
 *     duration: 120.5,
 *     transcription: "Hello world"
 *   }
 * };
 * ```
 */
export type AudioContent = {
  type: "audio";
  data: Buffer | string;
  mediaType?:
    | "audio/mpeg"
    | "audio/wav"
    | "audio/ogg"
    | "audio/webm"
    | "audio/aac"
    | "audio/flac"
    | "audio/mp4";
  metadata?: {
    filename?: string;
    duration?: number;
    sampleRate?: number;
    channels?: number;
    transcription?: string;
    language?: string;
  };
};

/**
 * Options that shape a generated video.
 *
 * Supplied as `output.video` in GenerateOptions when `output.mode` is
 * "video". Covers resolution, duration, aspect ratio and audio settings.
 *
 * @example
 * ```typescript
 * const videoOptions: VideoOutputOptions = {
 *   resolution: "1080p",
 *   length: 8,
 *   aspectRatio: "16:9",
 *   audio: true
 * };
 * ```
 */
export type VideoOutputOptions = {
  /**
   * Per-call cancellation signal forwarded to provider requests and polling
   * loops. Aborting it interrupts long-running video generation, and the
   * handler throws a non-retriable abort error.
   */
  abortSignal?: AbortSignal;
  /**
   * Override for the video-generation provider. Defaults to "vertex", or to
   * the LLM provider name if that is also a registered video handler.
   *
   * Registered providers are managed via `VideoProcessor.registerHandler`
   * (see src/lib/utils/videoProcessor.ts). Examples: "vertex", "kling",
   * "runway", "replicate".
   */
  provider?: string;
  /**
   * Model to use within the chosen provider. The shape is provider-specific
   * (e.g. "veo-3.1-generate-001" for vertex; "atonamy/wan-alpha:..." for
   * replicate).
   */
  model?: string;
  /** Output resolution - "720p" (1280x720) or "1080p" (1920x1080). */
  resolution?: "720p" | "1080p";
  /** Video duration in seconds (4, 6, or 8 seconds supported). */
  length?: 4 | 6 | 8;
  /** Aspect ratio - "9:16" for portrait, "16:9" for landscape, "1:1" for square. */
  aspectRatio?: "9:16" | "16:9" | "1:1";
  /** Enable audio generation (default: true). */
  audio?: boolean;
  /**
   * Publicly accessible URL of the input image.
   * Needed by providers that do not accept inline base64 data (e.g. PiAPI
   * Kling). When set and the provider requires a URL, it takes precedence
   * over the `image` Buffer argument passed to `generate()`.
   */
  imageUrl?: string;
  /**
   * Per-call provider credentials. These win over instance-level credentials
   * set at construction time, which in turn override env vars.
   */
  credentials?: import("./providers.js").NeurolinkCredentials;
};

/**
 * One Director Mode segment, i.e. a single video clip.
 */
export type DirectorSegment = {
  /** Prompt describing the video content for this segment. */
  prompt: string;
  /** Input image for this segment (Buffer, URL string, file path, or ImageWithAltText). */
  image: Buffer | string | ImageWithAltText;
};

/**
 * Director Mode configuration.
 * Applies when `input.segments` is provided and controls how transitions
 * between segments are generated.
 */
export type DirectorModeOptions = {
  /**
   * Prompts for the transition clips (N-1 entries for N segments).
   * transitionPrompts[i] drives the transition between segment i and
   * segment i+1. If omitted, defaults to
   * "Smooth cinematic transition between scenes".
   */
  transitionPrompts?: Array<string>;
  /**
   * Duration of each transition clip in seconds (N-1 entries for N segments).
   * Each value must be 4, 6, or 8 (4 recommended for seamless feel).
   * If omitted, every transition defaults to 4 seconds.
   * @default [4, 4, ...]
   */
  transitionDurations?: (4 | 6 | 8)[];
};

/**
 * Generated video payload.
 *
 * Available as `GenerateResult.video` after a successful video generation.
 * Holds the raw video buffer together with its metadata.
 *
 * @example
 * ```typescript
 * const result = await neurolink.generate({
 *   input: { text: "Product showcase", images: [imageBuffer] },
 *   provider: "vertex",
 *   model: "veo-3.1",
 *   output: { mode: "video" }
 * });
 *
 * if (result.video) {
 *   writeFileSync("output.mp4", result.video.data);
 *   console.log(`Duration: ${result.video.metadata?.duration}s`);
 * }
 * ```
 */
export type VideoGenerationResult = {
  /** Raw video data as Buffer. */
  data: Buffer;
  /** Video media type. */
  mediaType: "video/mp4" | "video/webm";
  /** Video metadata. */
  metadata?: {
    /** Original filename if applicable. */
    filename?: string;
    /** Video duration in seconds. */
    duration?: number;
    /** Video dimensions. */
    dimensions?: {
      width: number;
      height: number;
    };
    /** Frame rate in fps. */
    frameRate?: number;
    /** Video codec used. */
    codec?: string;
    /** Model used for generation. */
    model?: string;
    /** Provider used for generation. */
    provider?: string;
    /** Aspect ratio of the video. */
    aspectRatio?: string;
    /** Whether audio was enabled during generation. */
    audioEnabled?: boolean;
    /** Processing time in milliseconds. */
    processingTime?: number;
    /** Number of main segments in the video. */
    segmentCount?: number;
    /** Number of transition clips generated. */
    transitionCount?: number;
    /** Duration of each main clip in seconds. */
    clipDuration?: number;
    /** Durations of each transition in seconds (one per transition). */
    transitionDurations?: Array<number>;
    /** Per-segment metadata. */
    segments?: {
      index: number;
      duration: number;
      processingTime: number;
    }[];
    /** Per-transition metadata. */
    transitions?: {
      fromSegment: number;
      toSegment: number;
      duration: number;
      processingTime: number;
    }[];
  };
};

/**
 * Video part of a multimodal message.
 *
 * NOTE: This describes FILE-BASED video input. The type may be extended for
 * streaming video in future.
 *
 * @example
 * ```typescript
 * const videoContent: VideoContent = {
 *   type: "video",
 *   data: videoBuffer,
 *   mediaType: "video/mp4",
 *   metadata: {
 *     filename: "demo.mp4",
 *     duration: 300,
 *     dimensions: { width: 1920, height: 1080 }
 *   }
 * };
 * ```
 */
export type VideoContent = {
  type: "video";
  data: Buffer | string;
  mediaType?:
    | "video/mp4"
    | "video/webm"
    | "video/ogg"
    | "video/quicktime"
    | "video/x-msvideo"
    | "video/x-matroska";
  metadata?: {
    filename?: string;
    duration?: number;
    dimensions?: {
      width: number;
      height: number;
    };
    frameRate?: number;
    codec?: string;
    extractedFrames?: Array<string>;
    transcription?: string;
  };
};

/**
 * Union of every content part: text, images, documents, and multimedia.
 * Each member carries a distinct literal `type` tag.
 */
export type Content =
  | TextContent
  | ImageContent
  | CSVContent
  | PDFContent
  | AudioContent
  | VideoContent;

/**
 * Image data paired with optional alt text.
 * Reach for this when alt text is needed for screen readers and SEO.
 *
 * @example
 * ```typescript
 * const imageWithAlt: ImageWithAltText = {
 *   data: imageBuffer,
 *   altText: "A dashboard showing quarterly sales trends"
 * };
 * ```
 */
export type ImageWithAltText = {
  /** Image data as Buffer, base64 string, URL, or data URI. */
  data: Buffer | string;
  /** Alternative text for accessibility (screen readers, SEO). */
  altText?: string;
};

/**
 * Input shape for options that may carry images or content arrays.
 * This is the primary way for users to hand over multimodal content.
 */
export type MultimodalInput = {
  text: string;
  /**
   * Images to include in the request.
   * Each entry is either bare image data (Buffer, string) or an object that
   * adds alt text for accessibility.
   *
   * @example Simple usage
   * ```typescript
   * images: [imageBuffer, "https://example.com/image.jpg"]
   * ```
   *
   * @example With alt text for accessibility
   * ```typescript
   * images: [
   *   { data: imageBuffer, altText: "Product screenshot showing main dashboard" },
   *   { data: "https://example.com/chart.png", altText: "Sales chart for Q3 2024" }
   * ]
   * ```
   */
  images?: (Buffer | string | ImageWithAltText)[];
  content?: Array<Content>;
  csvFiles?: (Buffer | string)[];
  pdfFiles?: (Buffer | string)[];
  files?: (Buffer | string)[];
  /** Audio files for file-based audio processing (future). */
  audioFiles?: (Buffer | string)[];
  /** Video files for file-based video processing (future). */
  videoFiles?: (Buffer | string)[];
  /**
   * Director Mode segments for multi-clip video generation.
   * Every segment pairs a prompt with an image and yields one video clip.
   * Providing this automatically enables Director Mode.
   *
   * @example
   * ```typescript
   * segments: [
   *   { prompt: "Product reveal", image: imageBuffer1 },
   *   { prompt: "Feature showcase", image: "./image2.jpg" },
   *   { prompt: "Call to action", image: { data: imageBuffer3, altText: "CTA" } }
   * ]
   * ```
   */
  segments?: Array<DirectorSegment>;
};

/**
 * Content part format used internally for multimodal messages.
 * Compatible with the Vercel AI SDK message format.
 */
export type MessageContent = {
  type: string;
  text?: string;
  image?: string;
  mimeType?: string;
  [key: string]: unknown;
};

/**
 * Chat message extended for multimodal support (internal use).
 * Used while messages are processed and transformed.
 */
export type MultimodalChatMessage = {
  /** Role of the message sender. */
  role: "user" | "assistant" | "system";
  /** Message content - either plain text or an array of multimodal parts. */
  content: string | Array<MessageContent>;
  /** Provider-specific options (e.g. Anthropic cache_control). */
  providerOptions?: Record<string, unknown>;
};

/**
 * Multimodal message structure consumed by provider adapters.
 */
export type MultimodalMessage = {
  role: "user" | "assistant" | "system";
  content: Array<Content>;
};

/**
 * Vision capability information for a provider.
 */
export type VisionCapability = {
  provider: string;
  supportedModels: Array<string>;
  maxImageSize?: number;
  supportedFormats: Array<string>;
  maxImagesPerRequest?: number;
};

/**
 * Image format requirements of a specific provider.
 */
export type ProviderImageFormat = {
  provider: string;
  format: "data_uri" | "base64" | "inline_data" | "source";
  requiresPrefix?: boolean;
  mimeTypeField?: string;
  dataField?: string;
};

/**
 * Outcome of image processing.
 */
export type ProcessedImage = {
  data: string;
  mediaType: string;
  size: number;
  format: "data_uri" | "base64" | "inline_data" | "source";
};

/**
 * Multimodal payload in a provider-specific shape.
 */
export type ProviderMultimodalPayload = {
  provider: string;
  model: string;
  messages?: Array<MultimodalMessage>;
  contents?: Array<unknown>;
  [key: string]: unknown;
};

/**
 * Type guard: narrows a Content value to TextContent.
 */
export declare function isTextContent(content: Content): content is TextContent;

/**
 * Type guard: narrows a Content value to ImageContent.
 */
export declare function isImageContent(content: Content): content is ImageContent;

/**
 * Type guard: narrows a Content value to CSVContent.
 */
export declare function isCSVContent(content: Content): content is CSVContent;

/**
 * Type guard: narrows a Content value to PDFContent.
 */
export declare function isPDFContent(content: Content): content is PDFContent;

/**
 * Type guard: narrows a Content value to AudioContent.
 */
export declare function isAudioContent(content: Content): content is AudioContent;

/**
 * Type guard: narrows a Content value to VideoContent.
 */
export declare function isVideoContent(content: Content): content is VideoContent;

/**
 * Type guard: narrows an unknown value to MultimodalInput.
 */
export declare function isMultimodalInput(input: unknown): input is MultimodalInput;

/**
 * Type guard: narrows message content to its multimodal (array) form.
 */
export declare function isMultimodalMessageContent(
  content: string | MessageContent[],
): content is MessageContent[];

/** Result of a single director-mode clip generation. */
export type ClipResult = {
  buffer: Buffer;
  processingTime: number;
};

/** Completion status for ordered circuit-breaker tracking. */
export type ClipCompletion =
  | {
      status: "pending";
    }
  | {
      status: "success";
      result: ClipResult;
    }
  | {
      status: "failure";
      error: Error;
    };

/** State shared across clip-generation tasks for circuit-breaker logic. */
export type ClipGenState = {
  consecutiveFailures: number;
  circuitOpen: boolean;
  results: (ClipResult | null)[];
  completions: Array<ClipCompletion>;
  nextExpectedIndex: number;
};

/** Result of a single director-mode transition generation. */
export type TransitionResult = {
  buffer: Buffer | null;
  fromSegment: number;
  toSegment: number;
  duration: number;
  processingTime: number;
};

/** Polling result envelope returned by Vertex Veo long-running operations. */
export type VertexOperationResult = {
  done?: boolean;
  response?: {
    videos?: {
      bytesBase64Encoded?: string;
      gcsUri?: string;
    }[];
  };
  error?: {
    message?: string;
  };
};

/** Output format accepted by the image compressor. */
export type SupportedFormat = "jpeg" | "png" | "webp";

/** Options consumed by compressImage(). */
export type CompressionOptions = {
  provider: import("./providers.js").ProviderName;
  quality?: number;
  maxDimension?: number;
  format?: SupportedFormat;
};

/** Result of compressImage() with metadata. */
export type CompressionResult = {
  buffer: Buffer;
  originalSize: number;
  compressedSize: number;
  compressionRatio: number;
  metadata: {
    width: number;
    height: number;
    format: string;
  };
};