@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.
667 lines • 23.6 kB
JavaScript
/**
* Provider Image Adapter - Smart routing for multimodal content
* Handles provider-specific image formatting and vision capability validation
*/
import { ImageProcessor } from "../utils/imageProcessor.js";
import { logger } from "../utils/logger.js";
/**
 * Simplified logger for essential error reporting only.
 *
 * The logger must never throw while reporting a failure, so it tolerates
 * non-Error values (strings, null, undefined) and unserializable contexts.
 */
export class MultimodalLogger {
  /**
   * Log a failed multimodal processing step.
   *
   * @param {string} step - Name of the step that failed (used in the message).
   * @param {unknown} error - The thrown value; usually an Error, but any value is accepted.
   * @param {unknown} context - Extra diagnostic data; only logged when
   *   NODE_ENV is "development".
   */
  static logError(step, error, context) {
    // Fall back to String(error) so thrown strings / null do not log "undefined"
    // or crash on property access.
    const message = error?.message ?? String(error);
    logger.error(`Multimodal ${step} failed: ${message}`);
    if (process.env.NODE_ENV !== "development") {
      return;
    }
    let serializedContext;
    try {
      serializedContext = JSON.stringify(context, null, 2);
    } catch (serializationError) {
      // JSON.stringify throws on circular structures and BigInt values.
      serializedContext = `[unserializable context: ${serializationError.message}]`;
    }
    logger.error("Context:", serializedContext);
    logger.error("Stack:", error?.stack);
  }
}
/**
 * Image count limits per provider.
 * These limits prevent API rejections when too many images are sent.
 *
 * Every value is a number except `vertex`, which is a nested table keyed by
 * model family (`claude`, `gemini`) with a `default` fallback.
 * The table is frozen because it is shared, read-only lookup data.
 */
const IMAGE_LIMITS = Object.freeze({
  openai: 10,
  azure: 10, // Same as OpenAI
  "google-ai": 16,
  google: 16,
  anthropic: 20,
  vertex: Object.freeze({
    // Vertex has model-specific limits
    claude: 20, // Claude models on Vertex
    gemini: 16, // Gemini models on Vertex
    default: 16,
  }),
  ollama: 10, // Conservative limit for Ollama
  litellm: 10, // Conservative limit, as it proxies to various providers
  mistral: 10, // Conservative limit for Mistral
  bedrock: 20, // Same as Anthropic for Claude models on Bedrock
  openrouter: 10, // Conservative limit, routes to various underlying providers
});
/**
 * Providers that sit in front of arbitrary underlying models.
 * Their vision capability cannot be decided statically, so requests are
 * passed through and any error is left to the underlying provider.
 */
const PROXY_PROVIDERS = new Set()
  .add("litellm")
  .add("openrouter");
/**
 * Maps the letters-only form of a provider name to its canonical key.
 * Every hyphenated canonical key needs an entry here; otherwise its
 * underscore / space / concatenated spellings would not resolve to it.
 */
const VISION_PROVIDER_ALIASES = new Map([
  ["lmstudio", "lm-studio"],
  ["llamacpp", "llamacpp"],
  ["nvidianim", "nvidia-nim"],
  ["googleaistudio", "google-ai"],
  ["googleai", "google-ai"],
  ["togetherai", "together-ai"],
  ["or", "openrouter"],
]);
/**
 * Normalize provider name/alias to its canonical form for vision checks.
 *
 * @param {string} provider - Provider name or alias, in any letter case.
 * @returns {string} The canonical provider key when the alias is known,
 *   otherwise the trimmed, lower-cased input.
 * @throws {TypeError} If `provider` is not a string.
 */
function normalizeVisionProvider(provider) {
  const lower = provider.trim().toLowerCase();
  // Strip non-alpha characters so alias forms (e.g. "lm-studio", "lm_studio",
  // "llama.cpp", "nvidia_nim", "google_ai") all collapse onto one lookup key.
  const stripped = lower.replace(/[^a-z]/g, "");
  return VISION_PROVIDER_ALIASES.get(stripped) ?? lower;
}
/**
* Vision capability definitions for each provider
*/
const VISION_CAPABILITIES = {
openai: [
// GPT-5.4 family (released Mar 2026) - Latest flagship models
"gpt-5.4",
"gpt-5.4-mini",
"gpt-5.4-nano",
"gpt-5.4-pro",
// GPT-5.2 family (released Dec 11, 2025)
"gpt-5.2",
"gpt-5.2-chat-latest",
"gpt-5.2-pro",
// GPT-5 family (released Aug 2025)
"gpt-5",
"gpt-5-2025-08-07",
"gpt-5-pro",
"gpt-5-mini",
"gpt-5-nano",
// GPT-4.1 family (released Apr 2025)
"gpt-4.1",
"gpt-4.1-mini",
"gpt-4.1-nano",
// o-series reasoning models (released Apr 2025)
"o3",
"o3-mini",
"o3-pro",
"o4",
"o4-mini",
"o4-mini-deep-research",
// Existing GPT-4 models
"gpt-4o",
"gpt-4o-mini",
"gpt-4-turbo",
"gpt-4-vision-preview",
],
"google-ai": [
// Gemini 3.1 Series (all require -preview suffix)
"gemini-3.1-pro-preview",
"gemini-3.1-flash-lite-preview",
"gemini-3.1-flash-image-preview",
"gemini-3.1-pro-preview-customtools",
// Gemini 3 Series
"gemini-3-flash-preview",
"gemini-3-pro-image-preview",
"gemini-3-pro-preview",
// Gemini 2.5 Series
"gemini-2.5-pro",
"gemini-2.5-flash",
"gemini-2.5-flash-lite",
"gemini-2.5-flash-image",
// Gemini 2.0 Series
"gemini-2.0-flash",
"gemini-2.0-flash-001",
"gemini-2.0-flash-lite",
"gemini-2.0-flash-preview-image-generation",
// Gemini 1.5 Series (Legacy)
"gemini-1.5-pro",
"gemini-1.5-flash",
"gemini-pro-vision",
],
anthropic: [
// Claude 4.6 Series (February 2026)
"claude-opus-4-6",
"claude-sonnet-4-6",
// Claude 4.5 Series (September-November 2025)
"claude-sonnet-4-5",
"claude-sonnet-4-5-20250929",
"claude-opus-4-5",
"claude-opus-4-5-20251101",
"claude-haiku-4-5",
"claude-haiku-4-5-20251001",
// Claude 4.1 and 4.0 Series
"claude-opus-4-1",
"claude-opus-4-1-20250805",
"claude-opus-4",
"claude-opus-4-20250514",
"claude-sonnet-4",
"claude-sonnet-4-20250514",
// Claude 3.7 Series
"claude-3-7-sonnet",
"claude-3-7-sonnet-20250219",
// Claude 3.5 Series
"claude-3-5-sonnet",
"claude-3-5-sonnet-20241022",
// Claude 3 Series
"claude-3-opus",
"claude-3-sonnet",
"claude-3-haiku",
],
azure: [
// GPT-5.1 family (December 2025)
"gpt-5.1",
"gpt-5.1-chat",
"gpt-5.1-codex",
// GPT-5 family
"gpt-5",
"gpt-5-pro",
"gpt-5-turbo",
"gpt-5-chat",
"gpt-5-mini",
// GPT-4.1 family
"gpt-4.1",
"gpt-4.1-mini",
"gpt-4.1-nano",
// O-series
"o3",
"o3-mini",
"o3-pro",
"o4-mini",
// Existing GPT-4
"gpt-4o",
"gpt-4o-mini",
"gpt-4-turbo",
"gpt-4-vision-preview",
"gpt-4",
],
vertex: [
// Gemini 3.1 models on Vertex AI (all require -preview suffix)
"gemini-3.1-pro-preview",
"gemini-3.1-flash-lite-preview",
"gemini-3.1-flash-image-preview",
"gemini-3.1-pro-preview-customtools",
// Gemini 3 Series on Vertex AI
"gemini-3-flash-preview",
"gemini-3-pro-image-preview",
"gemini-3-pro-preview",
// Gemini 2.5 models on Vertex AI
"gemini-2.5-pro",
"gemini-2.5-flash",
"gemini-2.5-flash-lite",
"gemini-2.5-flash-image",
// Gemini 2.0 models on Vertex AI
"gemini-2.0-flash-001",
"gemini-2.0-flash-lite",
"gemini-2.0-flash",
// Gemini 1.5 models on Vertex AI
"gemini-1.5-pro",
"gemini-1.5-flash",
// Claude 4.5 models (versioned format - September-November 2025)
"claude-sonnet-4-5@",
"claude-opus-4-5@",
"claude-haiku-4-5@",
"claude-haiku-4-5",
// Claude 4 models (versioned format - May 2025)
"claude-sonnet-4@",
"claude-opus-4@",
"claude-opus-4-1@",
// Claude 3.x models (versioned format)
"claude-3-7-sonnet@",
"claude-3-5-sonnet@",
"claude-opus-3@",
"claude-haiku-3@",
// Claude models (non-versioned format)
"claude-3-7-sonnet",
"claude-3-5-sonnet",
"claude-3-opus",
"claude-3-sonnet",
"claude-3-haiku",
"claude-sonnet-4",
"claude-sonnet-3",
"claude-opus-3",
"claude-haiku-3",
"claude-haiku-4",
// Additional patterns for compatibility
"claude-3.5-sonnet",
"claude-3.5-haiku",
"claude-4.5-sonnet",
"claude-4.5-opus",
"claude-4.5-haiku",
"claude-haiku-4-5",
"claude-3.0-sonnet",
"claude-3.0-opus",
],
litellm: [
// LiteLLM proxies to underlying providers
// List models that support vision when going through the proxy
// OpenAI models via LiteLLM
"openai/gpt-5",
"openai/gpt-4o",
"openai/gpt-4o-mini",
"openai/gpt-4-turbo",
"gpt-5",
"gpt-4o",
"gpt-4.1",
// Anthropic models via LiteLLM
"anthropic/claude-sonnet-4-5-20250929",
"anthropic/claude-opus-4-1-20250805",
"anthropic/claude-3-5-sonnet-20240620",
"claude-sonnet-4-5",
"claude-sonnet-4-5-20250929",
"claude-opus-4-5",
"claude-opus-4-5-20251101",
"claude-haiku-4-5-20251001",
"claude-sonnet-4",
"claude-opus-4-1",
// Gemini models via LiteLLM
"vertex_ai/gemini-2.5-pro",
"gemini/gemini-2.5-pro",
"gemini/gemini-2.0-flash",
"gemini-3.1-pro-preview",
"gemini-3-flash-preview",
"gemini-2.5-pro",
"gemini-2.5-flash",
"gemini-2.0-flash-lite",
// Groq models via LiteLLM (vision)
"groq/llama-3.2-11b-vision-preview",
],
openrouter: [
// OpenRouter provides access to vision-capable models from multiple providers
// Anthropic Claude models (via OpenRouter)
"anthropic/claude-3.7-sonnet",
"anthropic/claude-3-5-haiku",
"anthropic/claude-3-opus",
"anthropic/claude-3-sonnet",
"anthropic/claude-3-haiku",
// OpenAI models (via OpenRouter)
"openai/gpt-4o",
"openai/gpt-4o-mini",
"openai/gpt-4-turbo",
"openai/gpt-4-vision-preview",
// Google models (via OpenRouter)
"google/gemini-2.5-pro",
"google/gemini-2.5-flash",
"google/gemini-2.0-flash",
"google/gemini-2.0-flash-001",
"google/gemini-1.5-pro",
"google/gemini-1.5-flash",
"google/gemini-pro-vision",
// Meta Llama models (vision-capable via OpenRouter)
"meta-llama/llama-3.2-90b-vision-instruct",
"meta-llama/llama-3.2-11b-vision-instruct",
// Pixtral/Mistral models (via OpenRouter)
"mistralai/pixtral-12b",
"mistralai/pixtral-large",
// Qwen models (via OpenRouter)
"qwen/qwen-2-vl-72b-instruct",
"qwen/qwen-2-vl-7b-instruct",
],
mistral: [
// Mistral Large (latest has vision via Pixtral integration)
"mistral-large-latest",
"mistral-large-2512",
// Mistral Small 3.2 (vision support for images: PNG, JPEG, WEBP, GIF)
"mistral-small",
"mistral-small-latest",
"mistral-small-3.2",
"mistral-small-2506",
// Mistral Medium 3.1 (vision support)
"mistral-medium",
"mistral-medium-latest",
"mistral-medium-3.1",
"mistral-medium-2508",
// Magistral models (vision support)
"magistral-small",
"magistral-small-latest",
"magistral-medium",
"magistral-medium-latest",
// Pixtral models (specialized vision models)
"pixtral-12b",
"pixtral-12b-latest",
"pixtral-large",
"pixtral-large-latest",
"pixtral-large-2502",
],
ollama: [
// Llama 4 family (May 2025 - Best vision + tool calling)
"llama4:scout",
"llama4:maverick",
"llama4:latest",
"llama4",
// Llama 3.2 vision variants
"llama3.2-vision",
"llama3.2-vision:11b",
"llama3.2-vision:90b",
// Gemma 3 family (SigLIP vision encoder - supports tool calling + vision)
"gemma3",
"gemma3:4b",
"gemma3:12b",
"gemma3:27b",
"gemma3:latest",
// Qwen 2.5 VL (Vision-Language)
"qwen2.5-vl",
"qwen2.5-vl:72b",
"qwen2.5-vl:32b",
// Mistral Small family (vision + tool calling)
"mistral-small3.1",
"mistral-small3.1:large",
"mistral-small3.1:medium",
"mistral-small3.1:small",
// LLaVA (vision-focused)
"llava",
"llava:7b",
"llava:13b",
"llava:34b",
"llava-llama3",
"llava-llama3:8b",
],
bedrock: [
// Amazon Nova models (December 2024+) - multimodal vision support
"amazon.nova-premier",
"amazon.nova-premier-v1:0",
"amazon.nova-pro",
"amazon.nova-pro-v1:0",
"amazon.nova-lite",
"amazon.nova-lite-v1:0",
"amazon.nova-2-lite-v1:0",
"nova-premier",
"nova-pro",
"nova-lite",
// Claude 4.5 family (supports vision, PDFs, images - September-November 2025)
"claude-sonnet-4-5",
"claude-sonnet-4.5",
"anthropic.claude-sonnet-4-5",
"anthropic.claude-sonnet-4-5-20250929-v1:0",
"claude-opus-4-5",
"claude-opus-4.5",
"anthropic.claude-opus-4-5",
"anthropic.claude-opus-4-5-20251124-v1:0",
"claude-haiku-4-5",
"claude-haiku-4.5",
"anthropic.claude-haiku-4-5",
"anthropic.claude-haiku-4-5-20251001-v1:0",
// Claude 4 family (May 2025)
"claude-sonnet-4",
"claude-sonnet-4@",
"anthropic.claude-sonnet-4",
"anthropic.claude-sonnet-4-20250514-v1:0",
"claude-opus-4",
"claude-opus-4-1",
"claude-opus-4@",
"anthropic.claude-opus-4",
"anthropic.claude-opus-4-1-20250805-v1:0",
// Claude 3.7 Sonnet
"claude-3-7-sonnet",
"claude-3.7-sonnet",
"anthropic.claude-3-7-sonnet",
"anthropic.claude-3-7-sonnet-20250219-v1:0",
// Claude 3.5 Sonnet
"claude-3-5-sonnet",
"claude-3.5-sonnet",
"anthropic.claude-3-5-sonnet",
"anthropic.claude-3-5-sonnet-20241022-v1:0",
// Claude 3 Opus
"claude-3-opus",
"anthropic.claude-3-opus",
// Claude 3 Sonnet
"claude-3-sonnet",
"anthropic.claude-3-sonnet",
// Claude 3 Haiku
"claude-3-haiku",
"anthropic.claude-3-haiku",
// Meta Llama 4 models (multimodal vision)
"meta.llama4-maverick-17b-instruct-v1:0",
"meta.llama4-scout-17b-instruct-v1:0",
// Meta Llama 3.2 vision models
"meta.llama3-2-90b-instruct-v1:0",
"meta.llama3-2-11b-instruct-v1:0",
// Mistral Pixtral (multimodal vision)
"mistral.pixtral-large-2502-v1:0",
// Generic anthropic.claude prefix (catches all Claude models)
"anthropic.claude",
],
huggingface: [
// Qwen 2.5 VL (Vision-Language)
"Qwen/Qwen2.5-VL-32B-Instruct",
"Qwen/Qwen2.5-VL-7B-Instruct",
// Microsoft Phi-3 Vision
"microsoft/Phi-3-vision-128k-instruct",
// LLaVA variants
"llava-hf/llava-1.5-7b-hf",
"llava-hf/llava-v1.6-mistral-7b-hf",
],
sagemaker: [
// Meta Llama 4 vision models
"meta-llama-4-maverick-17b-128e-instruct",
"meta-llama-4-scout-17b-16e-instruct",
],
// DeepSeek has no vision support — empty list
deepseek: [],
"nvidia-nim": [
"meta/llama-3.2-90b-vision-instruct",
"meta/llama-3.2-11b-vision-instruct",
],
// LM Studio + llama.cpp: vision depends on the loaded model.
// Substrings must point at known multimodal variants only — bare
// "llama-3.2" matches the text-only Llama-3.2-1B/3B chat models.
"lm-studio": [
"llava",
"llama-3.2-11b-vision",
"llama-3.2-90b-vision",
"vision-instruct",
"qwen2-vl",
"qwen2.5-vl",
"phi-3-vision",
],
llamacpp: [
"llava",
"llama-3.2-11b-vision",
"llama-3.2-90b-vision",
"vision-instruct",
"qwen2-vl",
"phi-3-vision",
],
// xAI: only grok-2-vision is multimodal today
xai: ["grok-2-vision-latest"],
// Groq: vision models are explicit "*-vision-preview" variants
groq: ["llama-3.2-90b-vision-preview", "llama-3.2-11b-vision-preview"],
// Cohere: command-r* are text-only (no vision); empty list
cohere: [],
// Together AI: text-only by default; add vision variants if/when used.
"together-ai": [],
// Fireworks: vision via Phi-3-Vision and Llama 3.2 vision variants.
fireworks: [
"accounts/fireworks/models/phi-3-vision-128k-instruct",
"accounts/fireworks/models/llama-v3p2-90b-vision-instruct",
"accounts/fireworks/models/llama-v3p2-11b-vision-instruct",
],
// Perplexity Sonar — text-only with web grounding.
perplexity: [],
// Cloudflare: explicit vision variants only.
cloudflare: ["@cf/meta/llama-3.2-11b-vision-instruct"],
// Replicate: vision capability depends on the specific model id.
replicate: ["llava", "llama-3.2-vision", "moondream", "qwen2-vl"],
// Voyage / Jina — embedding-only, not multimodal in this sense.
voyage: [],
jina: [],
// Stability / Ideogram / Recraft — image-OUTPUT, not image-INPUT.
// VISION_CAPABILITIES tracks reference-image input support.
stability: [],
ideogram: [],
recraft: [],
};
/**
 * Read `key` from a lookup table, ignoring inherited properties.
 * Plain indexing or the `in` operator would also hit prototype members such
 * as "constructor" or "toString" when the key comes from caller input.
 */
function getOwnEntry(table, key) {
  return Object.prototype.hasOwnProperty.call(table, key)
    ? table[key]
    : undefined;
}
/**
 * Provider Image Adapter - Smart routing and formatting
 */
export class ProviderImageAdapter {
  // NOTE: The legacy `adaptForProvider` method and its private helpers
  // (formatForOpenAI, formatForGoogleAI, formatForAnthropic, formatForVertex,
  // validateVisionSupport) were removed as dead code. The production image
  // pipeline uses `convertSimpleImagesToProviderFormat` in messageBuilder.ts
  // with Vercel AI SDK's native ImagePart format. Image count limits are
  // enforced via the public `validateImageCount` method below.
  /**
   * Validate image count against provider limits.
   * Warns when the count is exactly at the 80% threshold, throws if the
   * limit is exceeded.
   *
   * @param {number} imageCount - Number of images (plus PDF pages) in the request.
   * @param {string} provider - Provider name or alias (case-insensitive).
   * @param {string} [model] - Model id; only used to pick the Vertex model-family limit.
   * @throws {Error} If `imageCount` is greater than the resolved limit.
   */
  static validateImageCount(imageCount, provider, model) {
    // Same alias normalization as supportsVision, so an alias resolves to the
    // limit of its canonical provider instead of the unknown-provider default.
    const normalizedProvider = normalizeVisionProvider(provider);
    let limit;
    if (normalizedProvider === "vertex") {
      // Vertex has model-specific limits. This branch is taken even when no
      // model is given: otherwise the nested limits object itself would be
      // used as the limit and every numeric comparison would be NaN.
      const vertexLimits = IMAGE_LIMITS.vertex;
      const lowerModel = typeof model === "string" ? model.toLowerCase() : "";
      if (lowerModel.includes("claude")) {
        limit = vertexLimits.claude;
      }
      else if (lowerModel.includes("gemini")) {
        limit = vertexLimits.gemini;
      }
      else {
        limit = vertexLimits.default;
      }
    }
    else {
      const providerLimit = getOwnEntry(IMAGE_LIMITS, normalizedProvider);
      if (typeof providerLimit === "number") {
        limit = providerLimit;
      }
      else {
        // Conservative default for unknown providers
        limit = 10;
        logger.warn(`Image count limit not defined for provider ${provider}. Using conservative default of 10 images.`);
      }
    }
    // Warn only once at 80% threshold to avoid noise in batch processing
    const warningThreshold = Math.floor(limit * 0.8);
    if (imageCount === warningThreshold) {
      logger.warn(`Image count (${imageCount}) is approaching the limit for ${provider}. ` +
        `Maximum allowed: ${limit}. Please reduce the number of images.`);
    }
    // Throw error if limit exceeded
    if (imageCount > limit) {
      throw new Error(`Image count (${imageCount}) exceeds the maximum limit for ${provider}. ` +
        `Maximum allowed: ${limit}. Please reduce the number of images.`);
    }
  }
  /**
   * Convert simple images array to advanced content format
   * @param text - Text content to include
   * @param images - Array of images (Buffer, string, or ImageWithAltText)
   * @returns Content parts: one text part followed by one image part per image.
   */
  static convertToContent(text, images) {
    const content = [{ type: "text", text }];
    if (images && images.length > 0) {
      for (const image of images) {
        // A non-Buffer object carrying a `data` property is treated as an
        // image-with-alt-text wrapper; anything else is the raw image itself.
        const hasAltTextWrapper = typeof image === "object" &&
          "data" in image &&
          !Buffer.isBuffer(image);
        const imageData = hasAltTextWrapper ? image.data : image;
        const altText = hasAltTextWrapper ? image.altText : undefined;
        content.push({
          type: "image",
          data: imageData,
          altText,
          mediaType: ImageProcessor.detectImageType(imageData),
        });
      }
    }
    return content;
  }
  /**
   * Check if provider supports multimodal content
   *
   * @param {string} provider - Provider name or alias.
   * @param {string} [model] - Model id; when omitted, only the provider is checked.
   * @returns {boolean} `false` for unknown providers, providers with an empty
   *   allowlist, and any input that causes an internal error.
   */
  static supportsVision(provider, model) {
    try {
      const normalizedProvider = normalizeVisionProvider(provider);
      const supportedModels = getOwnEntry(VISION_CAPABILITIES, normalizedProvider);
      // An empty list means the provider has NO vision support (e.g. deepseek).
      // Without this guard, the no-model branch below would return `true` for
      // every provider that has an entry in VISION_CAPABILITIES — even an empty
      // one — letting vision requests through to a text-only API.
      if (!Array.isArray(supportedModels) || supportedModels.length === 0) {
        return false;
      }
      if (!model) {
        return true; // Provider supports vision, but need to check specific model
      }
      const lowerModel = model.toLowerCase();
      const modelMatched = supportedModels.some((supportedModel) => lowerModel.includes(supportedModel.toLowerCase()));
      // Proxy providers route to arbitrary underlying models — pass through if
      // the model isn't in the known allowlist.
      return modelMatched || PROXY_PROVIDERS.has(normalizedProvider);
    }
    catch {
      return false;
    }
  }
  /**
   * Get supported models for a provider
   *
   * @param {string} provider - Provider name or alias.
   * @returns {string[]} A copy of the provider's allowlist, or `[]` when unknown.
   */
  static getSupportedModels(provider) {
    const normalizedProvider = normalizeVisionProvider(provider);
    const models = getOwnEntry(VISION_CAPABILITIES, normalizedProvider);
    return Array.isArray(models) ? [...models] : [];
  }
  /**
   * Get all vision-capable providers
   *
   * @returns {string[]} Provider keys whose allowlist is non-empty.
   */
  static getVisionProviders() {
    // Filter out providers whose allowlist is empty (e.g. deepseek). They're
    // listed in VISION_CAPABILITIES so supportsVision can return false for
    // them, but they should not be advertised as vision-capable.
    return Object.entries(VISION_CAPABILITIES)
      .filter(([, models]) => models.length > 0)
      .map(([provider]) => provider);
  }
  /**
   * Count total "images" in a message (actual images + PDF pages)
   * PDF pages count toward image limits for providers
   *
   * @param {Array} [images] - Images attached to the message.
   * @param {number} [pdfPages] - Number of PDF pages in the message.
   * @returns {number} Sum of both counts; a missing argument counts as 0.
   */
  static countImagesInMessage(images, pdfPages) {
    const imageCount = images?.length || 0;
    const pageCount = pdfPages ?? 0;
    return imageCount + pageCount;
  }
  /**
   * Extract page count from PDF metadata array
   * Returns total pages across all PDFs
   *
   * @param {Array<{pageCount?: number}>} [pdfMetadataArray] - One entry per PDF.
   * @returns {number} Total page count; entries without `pageCount` add 0.
   */
  static countImagesInPages(pdfMetadataArray) {
    if (!pdfMetadataArray || pdfMetadataArray.length === 0) {
      return 0;
    }
    return pdfMetadataArray.reduce((total, pdf) => total + (pdf?.pageCount ?? 0), 0);
  }
}
//# sourceMappingURL=providerImageAdapter.js.map