@aj-archipelago/cortex
Version:
Cortex is a GraphQL API for AI. It provides a simple, extensible interface for using AI services from OpenAI, Azure and others.
572 lines (501 loc) • 25 kB
JavaScript
import Gemini15ChatPlugin from './gemini15ChatPlugin.js';
import CortexResponse from '../../lib/cortexResponse.js';
import { requestState } from '../requestState.js';
import { addCitationsToResolver } from '../../lib/pathwayTools.js';
import mime from 'mime-types';
/**
 * Multimodal Gemini 1.5 plugin that layers vision input and tool calling on
 * top of Gemini15ChatPlugin.
 *
 * It converts OpenAI-style messages and tool definitions into the request
 * shapes built in this file for Gemini, and converts Gemini function calls back
 * into OpenAI-style tool calls for both buffered and streamed responses.
 */
class Gemini15VisionPlugin extends Gemini15ChatPlugin {
    /**
     * @param {object} pathway - Pathway definition; its `toolCallback` (if any) is
     *   invoked when a streamed response finishes with tool calls.
     * @param {object} model - Model definition, forwarded to the parent class.
     */
    constructor(pathway, model) {
        super(pathway, model);
        this.isMultiModal = true;
        this.pathwayToolCallback = pathway.toolCallback;
        // Streaming state: tool calls and text accumulated across stream events.
        this.toolCallsBuffer = [];
        this.contentBuffer = '';
        this.hadToolCalls = false;
        // Last numeric suffix handed out by buildToolCallFromFunctionCall; used to
        // keep tool call ids unique even when several are built in one millisecond.
        this.lastToolCallTimestamp = 0;
    }

    /**
     * Convert a message list to the Gemini format, handling multimodal content.
     *
     * Accepts messages that already carry Gemini `parts` (returned as-is apart
     * from the parity trim below) or OpenAI-style messages with `role`/`author`
     * and string or array `content`.
     *
     * @param {Array<object>} messages - Messages to convert; a missing list is
     *   treated as empty.
     * @returns {{modifiedMessages: Array<object>, system: (object|null)}} The
     *   converted messages and the collected system parts (or null if none).
     */
    convertMessagesToGemini(messages) {
        let modifiedMessages = [];
        let lastAuthor = '';
        const systemParts = [];
        const inputMessages = Array.isArray(messages) ? messages : [];

        // Convert one content part (string, JSON string, or object) to a Gemini part.
        // Returns null for parts that cannot be represented (e.g. image without URL).
        const convertPartToGemini = (inputPart) => {
            try {
                // Strings are first tried as JSON so serialized parts are understood.
                const part = typeof inputPart === 'string' ? JSON.parse(inputPart) : inputPart;
                const { type, text, image_url, gcs, url } = part;
                // The file URL may live in gcs, image_url.url, or a direct url property.
                const fileUrl = gcs || image_url?.url || url;

                if (typeof part === 'string') {
                    return { text: inputPart };
                }
                if (type === 'text') {
                    return { text };
                }
                if (type === 'image_url') {
                    if (!fileUrl) {
                        return null;
                    }
                    if (fileUrl.startsWith('gs://')) {
                        // Require something after the gs:// scheme.
                        if (!fileUrl.slice(5)) {
                            return null;
                        }
                        return {
                            fileData: {
                                mimeType: mime.lookup(fileUrl) || 'image/jpeg',
                                fileUri: fileUrl
                            }
                        };
                    }
                    if (fileUrl.includes('base64,')) {
                        const base64Data = fileUrl.split('base64,')[1];
                        if (!base64Data) {
                            return null;
                        }
                        // Take the MIME type from the data URL when it has one.
                        const mimeMatch = fileUrl.match(/data:([^;]+);base64,/);
                        return {
                            inlineData: {
                                mimeType: mimeMatch ? mimeMatch[1] : 'image/jpeg',
                                data: base64Data
                            }
                        };
                    }
                    if (fileUrl.includes('youtube.com/') || fileUrl.includes('youtu.be/')) {
                        return {
                            fileData: {
                                mimeType: 'video/youtube',
                                fileUri: fileUrl
                            }
                        };
                    }
                    if (fileUrl.startsWith('http://') || fileUrl.startsWith('https://')) {
                        // Passed by reference as fileData/fileUri; nothing is fetched here.
                        return {
                            fileData: {
                                mimeType: mime.lookup(fileUrl) || 'image/jpeg',
                                fileUri: fileUrl
                            }
                        };
                    }
                    return null;
                }
            } catch (e) {
                // Not JSON (or otherwise unusable as a structured part): treat as plain text.
                return inputPart ? { text: inputPart } : null;
            }
            return inputPart ? { text: inputPart } : null;
        };

        // Convert an OpenAI tool result message
        //   { role: 'tool', tool_call_id: '...', content: '...' }
        // into the function-role message shape used by this plugin
        //   { role: 'function', parts: [{ functionResponse: { name, response: { content } } }] }
        const convertToolMessage = (message) => {
            const toolCallId = message.tool_call_id || message.toolCallId;
            let toolName = 'unknown_tool';
            if (typeof toolCallId === 'string' && toolCallId) {
                // Ids built by buildToolCallFromFunctionCall look like `<name>_<digits>`.
                // Strip only that numeric suffix so names containing underscores
                // survive; other id shapes keep the text before the first '_'.
                const ownIdMatch = toolCallId.match(/^(.+)_\d+$/);
                toolName = ownIdMatch ? ownIdMatch[1] : toolCallId.split('_')[0];
            }

            // Flatten array content into a single newline-joined string.
            let toolContent = message.content;
            if (Array.isArray(toolContent)) {
                toolContent = toolContent
                    .map(item => typeof item === 'string' ? item :
                        (typeof item === 'object' && item?.text) ? item.text :
                            JSON.stringify(item))
                    .join('\n');
            }

            return {
                role: 'function',
                parts: [{
                    functionResponse: {
                        name: toolName,
                        response: {
                            content: toolContent
                        }
                    }
                }]
            };
        };

        // Messages that already have `parts` are taken to be in Gemini format.
        if (inputMessages[0] && Object.prototype.hasOwnProperty.call(inputMessages[0], 'parts')) {
            modifiedMessages = inputMessages;
        } else {
            for (const message of inputMessages) {
                if (!message) {
                    continue;
                }
                const { role, author, content } = message;

                if (role === 'system') {
                    const systemItems = Array.isArray(content) ? content : [content];
                    for (const item of systemItems) {
                        // Unwrap { type: 'text', text } style items so `text` is a string.
                        const isTextObject = item && typeof item === 'object' && typeof item.text === 'string';
                        systemParts.push({ text: isTextObject ? item.text : item });
                    }
                    continue;
                }

                if (role === 'tool') {
                    // One function message per tool result, regardless of how many
                    // content parts it has and even when its content is empty.
                    modifiedMessages.push(convertToolMessage(message));
                    lastAuthor = 'function';
                    continue;
                }

                // 'assistant' is sent to Gemini as 'model'; an explicit author wins.
                const geminiRole = author || (role === 'assistant' ? 'model' : role);

                const addPartToMessages = (geminiPart) => {
                    if (!geminiPart) {
                        return;
                    }
                    // Gemini requires alternating user/model turns, so a part from the
                    // same speaker is appended to the previous message. lastAuthor holds
                    // the Gemini role, hence the comparison against geminiRole as well.
                    const continuesLastTurn = modifiedMessages.length > 0 &&
                        (role === lastAuthor || author === lastAuthor || geminiRole === lastAuthor);
                    if (continuesLastTurn) {
                        modifiedMessages[modifiedMessages.length - 1].parts.push(geminiPart);
                    } else if (role === 'user' || role === 'assistant' || author) {
                        modifiedMessages.push({
                            role: geminiRole,
                            parts: [geminiPart],
                        });
                        lastAuthor = geminiRole;
                    }
                };

                // Content is either the "vision" format (array) or the "chat" format (string).
                if (Array.isArray(content)) {
                    for (const part of content) {
                        addPartToMessages(convertPartToGemini(part));
                    }
                } else {
                    addPartToMessages(convertPartToGemini(content));
                }
            }
        }

        // Gemini requires an odd number of messages: drop the oldest when even.
        if (modifiedMessages.length % 2 === 0) {
            modifiedMessages = modifiedMessages.slice(1);
        }

        const system = systemParts.length > 0 ? { role: 'user', parts: systemParts } : null;

        return {
            modifiedMessages,
            system,
        };
    }

    /**
     * Return a copy of a JSON schema in which numeric enum values are strings.
     *
     * Recurses through `properties`, `items`, `anyOf`/`oneOf`/`allOf`, and through
     * arrays of schemas (e.g. tuple-style `items`). The input is not mutated;
     * untouched nested values are shared with the original.
     *
     * @param {*} schema - Schema, array of schemas, or any other value.
     * @returns {*} The converted copy, or the value itself if it is not an object.
     */
    convertEnumToStrings(schema) {
        if (!schema || typeof schema !== 'object') {
            return schema;
        }
        if (Array.isArray(schema)) {
            return schema.map(item => this.convertEnumToStrings(item));
        }

        const converted = { ...schema };

        if (Array.isArray(converted.enum)) {
            // Numbers become strings; every other value is kept as-is.
            converted.enum = converted.enum.map(value => (typeof value === 'number' ? String(value) : value));
        }

        if (converted.properties && typeof converted.properties === 'object') {
            converted.properties = Object.fromEntries(
                Object.entries(converted.properties).map(([key, value]) => [
                    key,
                    this.convertEnumToStrings(value)
                ])
            );
        }

        if (converted.items && typeof converted.items === 'object') {
            converted.items = this.convertEnumToStrings(converted.items);
        }

        for (const key of ['anyOf', 'oneOf', 'allOf']) {
            if (Array.isArray(converted[key])) {
                converted[key] = converted[key].map(item => this.convertEnumToStrings(item));
            }
        }

        return converted;
    }

    /**
     * Convert OpenAI tool definitions to the Gemini `functionDeclarations` format.
     *
     * @param {Array<object>} openAITools - OpenAI tools; entries that are not
     *   `{ type: 'function', function: {...} }` are skipped.
     * @returns {Array<object>} `[]` when the input is not an array, otherwise a
     *   single-element array `[{ functionDeclarations }]`.
     */
    convertOpenAIToolsToGemini(openAITools) {
        if (!openAITools || !Array.isArray(openAITools)) {
            return [];
        }

        const functionDeclarations = openAITools
            .map(tool => {
                if (tool?.type !== 'function' || !tool.function) {
                    return null;
                }
                const parameters = tool.function.parameters || {
                    type: 'object',
                    properties: {},
                    required: []
                };
                return {
                    name: tool.function.name,
                    description: tool.function.description || `Tool for ${tool.function.name}`,
                    // Numeric enums are converted to string enums for Gemini compatibility.
                    parameters: this.convertEnumToStrings(parameters)
                };
            })
            .filter(Boolean);

        return [{
            functionDeclarations: functionDeclarations
        }];
    }

    /**
     * Build request parameters, adding converted tools and a `toolConfig`
     * derived from the OpenAI-style `tool_choice`.
     *
     * @param {string} text
     * @param {object} parameters - May contain `tools` (array or JSON string) and
     *   `tool_choice` (string, JSON string, or object).
     * @param {object} prompt
     * @param {object} cortexRequest
     * @returns {object} The parent's request parameters, extended in place.
     */
    getRequestParameters(text, parameters, prompt, cortexRequest) {
        const TOOL_CHOICE_MODES = new Map([
            ['auto', 'AUTO'],
            ['required', 'ANY'],
            ['any', 'ANY'],
            ['none', 'NONE'],
        ]);

        // tools may arrive as a JSON string (from REST) or as an array.
        let toolsArray = parameters?.tools;
        if (typeof toolsArray === 'string') {
            try {
                toolsArray = JSON.parse(toolsArray);
            } catch (e) {
                // Best effort: unparseable tools are treated as "no tools".
                toolsArray = [];
            }
        }

        let convertedTools = [];
        if (Array.isArray(toolsArray) && toolsArray.length > 0) {
            convertedTools = this.convertOpenAIToolsToGemini(toolsArray);
        }

        const baseParameters = super.getRequestParameters(text, parameters, prompt, cortexRequest);

        // tool_choice may be a keyword, a JSON-encoded object, or an object.
        let toolChoice = parameters?.tool_choice;
        if (typeof toolChoice === 'string' && !TOOL_CHOICE_MODES.has(toolChoice)) {
            try {
                toolChoice = JSON.parse(toolChoice);
            } catch (e) {
                // Best effort: an unrecognized value falls back to 'auto'.
                toolChoice = 'auto';
            }
        }

        const hasTools = convertedTools[0]?.functionDeclarations?.length > 0;
        if (hasTools) {
            baseParameters.tools = convertedTools;
            if (typeof toolChoice === 'string') {
                const mode = TOOL_CHOICE_MODES.get(toolChoice);
                if (mode) {
                    baseParameters.toolConfig = { functionCallingConfig: { mode } };
                }
            } else if (toolChoice?.type === 'function') {
                // Force a specific function: ANY mode restricted to that name. The name
                // may be given as function.name, as a bare function string, or as name.
                const requested = toolChoice.function;
                const forcedName = (typeof requested === 'string' ? requested : requested?.name) || toolChoice.name;
                if (typeof forcedName === 'string' && forcedName) {
                    baseParameters.toolConfig = {
                        functionCallingConfig: {
                            mode: 'ANY',
                            allowedFunctionNames: [forcedName]
                        }
                    };
                }
            }
        } else if (toolChoice === 'none') {
            // Even without tools, 'none' explicitly disables function calling. This
            // prevents MALFORMED_FUNCTION_CALL errors when chat history contains
            // function messages.
            baseParameters.toolConfig = { functionCallingConfig: { mode: 'NONE' } };
        }

        return baseParameters;
    }

    /**
     * Execute the request via the parent class, translating the API's
     * "Precondition check failed." 400 error into a clearer message.
     *
     * @returns {Promise<*>} Whatever the parent's execute resolves to.
     * @throws {Error} A "file is too large" error (with the original error as
     *   `cause`) for the precondition failure; any other error is rethrown as-is.
     */
    async execute(text, parameters, prompt, cortexRequest) {
        try {
            return await super.execute(text, parameters, prompt, cortexRequest);
        } catch (e) {
            const apiError = e?.data?.error;
            if (apiError?.code === 400 && apiError.message === 'Precondition check failed.') {
                throw new Error(
                    'One or more of the included files is too large to process. Please try again with a smaller file.',
                    { cause: e }
                );
            }
            throw e;
        }
    }

    /**
     * Build an OpenAI-style tool call from a Gemini `functionCall` part.
     * Override in subclasses to capture model-specific fields (e.g.,
     * thoughtSignature for Gemini 3+).
     *
     * The id has the form `<name>_<digits>`. The digits start from the current
     * time but are forced to increase on every call, so parallel calls to the
     * same function built within one millisecond still get distinct ids.
     *
     * @param {object} part - Response part containing `functionCall`.
     * @returns {{id: string, type: string, function: {name: string, arguments: string}}}
     */
    buildToolCallFromFunctionCall(part) {
        const { name, args } = part.functionCall;
        const uniqueTimestamp = Math.max(Date.now(), (this.lastToolCallTimestamp ?? 0) + 1);
        this.lastToolCallTimestamp = uniqueTimestamp;
        return {
            id: `${name}_${uniqueTimestamp}`,
            type: "function",
            function: {
                name,
                arguments: JSON.stringify(args || {})
            }
        };
    }

    /**
     * Parse a non-streaming response into a CortexResponse, extracting text and
     * tool calls. Arrays (streaming data) and responses without usable
     * candidates are delegated to the parent implementation.
     *
     * @param {*} data - Raw response data.
     * @returns {*} A CortexResponse, the parent's result, or `data` when falsy.
     */
    parseResponse(data) {
        if (!data) {
            return data;
        }

        // Streaming data (array of chunks) is handled by processStreamEvent.
        if (Array.isArray(data)) {
            return super.parseResponse(data);
        }

        const candidate = data.candidates?.[0];
        if (candidate) {
            const { content, finishReason, safetyRatings } = candidate;

            if (safetyRatings?.some(rating => rating.blocked)) {
                return new CortexResponse({
                    output_text: "\n\n*** Response blocked due to safety ratings ***",
                    finishReason: "content_filter",
                    usage: data.usageMetadata || null,
                    metadata: { model: this.modelName }
                });
            }

            if (content?.parts) {
                const toolCalls = [];
                let textContent = '';
                for (const part of content.parts) {
                    if (part.functionCall) {
                        // Hook method, so subclasses can extend the tool call.
                        toolCalls.push(this.buildToolCallFromFunctionCall(part));
                    } else if (part.text) {
                        textContent += part.text;
                    }
                }

                // Tool calls take precedence; otherwise anything other than STOP is
                // reported as "length".
                let mappedFinishReason = "length";
                if (toolCalls.length > 0) {
                    mappedFinishReason = "tool_calls";
                } else if (finishReason === "STOP") {
                    mappedFinishReason = "stop";
                }

                const cortexResponse = new CortexResponse({
                    output_text: textContent,
                    finishReason: mappedFinishReason,
                    usage: data.usageMetadata || null,
                    metadata: { model: this.modelName }
                });
                if (toolCalls.length > 0) {
                    cortexResponse.toolCalls = toolCalls;
                }

                // Record citations found in the text on the request's resolver.
                const pathwayResolver = requestState[this.requestId]?.pathwayResolver;
                if (pathwayResolver && textContent) {
                    addCitationsToResolver(pathwayResolver, textContent);
                }

                return cortexResponse;
            }
        }

        return super.parseResponse(data);
    }

    /**
     * Process one stream event: emit an OpenAI-style chunk in
     * `requestProgress.data`, buffer text and tool calls, and on STOP either hand
     * the buffered tool calls to the pathway tool callback or mark the stream
     * complete (`progress = 1`).
     *
     * Only one payload is returned per event, so when an event holds several
     * parts the chunk for the last handled part is the one left in `data`.
     *
     * @param {{data: string}} event - Event whose `data` is a JSON string.
     * @param {object} [requestProgress] - Progress object; created when missing.
     * @returns {object} The updated progress object.
     */
    processStreamEvent(event, requestProgress) {
        const eventData = JSON.parse(event.data);

        const progress = requestProgress || {};
        progress.data = progress.data || null;

        // A progress object that has not started marks a new stream: reset tool
        // call state. contentBuffer is deliberately not cleared here so that text
        // keeps accumulating across chunks.
        if (!progress.started) {
            this.hadToolCalls = false;
            this.toolCallsBuffer = [];
        }

        const createChunk = (delta, finishReason = null) => ({
            id: eventData.responseId || `chatcmpl-${Date.now()}`,
            object: "chat.completion.chunk",
            created: Math.floor(Date.now() / 1000),
            model: this.modelName,
            choices: [{
                index: 0,
                delta,
                finish_reason: finishReason
            }]
        });

        const candidate = eventData.candidates?.[0];
        const parts = candidate?.content?.parts;

        if (parts) {
            for (const part of parts) {
                if (part.functionCall) {
                    this.hadToolCalls = true;
                    // Mark the stream as started so that a later event carrying only
                    // the finish reason does not reset the buffered tool calls.
                    // NOTE(review): relies on the caller passing the returned progress
                    // object back in for subsequent events - confirm against caller.
                    progress.started = true;

                    // Hook method, so subclasses can extend the tool call.
                    const toolCall = this.buildToolCallFromFunctionCall(part);
                    this.toolCallsBuffer.push(toolCall);

                    progress.data = JSON.stringify(createChunk({
                        tool_calls: [{
                            index: this.toolCallsBuffer.length - 1,
                            id: toolCall.id,
                            type: "function",
                            function: {
                                name: toolCall.function.name,
                                arguments: toolCall.function.arguments
                            }
                        }]
                    }));
                } else if (part.text) {
                    this.contentBuffer += part.text;
                    progress.started = true;
                    progress.data = JSON.stringify(createChunk({
                        content: part.text
                    }));
                }
            }
        }

        if (candidate?.finishReason === "STOP") {
            const finishReason = this.hadToolCalls ? "tool_calls" : "stop";

            if (parts) {
                // Re-send the first non-blank text part, now tagged with the finish reason.
                for (const part of parts) {
                    if (part.text && part.text.trim()) {
                        progress.data = JSON.stringify(createChunk({
                            content: part.text
                        }, finishReason));
                        break;
                    }
                }
            } else {
                // No content: send a bare finish chunk.
                progress.data = JSON.stringify(createChunk({}, finishReason));
            }

            const pathwayResolver = requestState[this.requestId]?.pathwayResolver;

            if (finishReason === 'tool_calls' && this.toolCallsBuffer.length > 0 && this.pathwayToolCallback && pathwayResolver) {
                const validToolCalls = this.toolCallsBuffer.filter(tc => tc && tc.function && tc.function.name);
                const toolMessage = {
                    role: 'assistant',
                    content: this.contentBuffer || '',
                    tool_calls: validToolCalls,
                };
                // Progress is not marked complete here, so the stream stays open
                // while the callback handles the tool calls.
                this.pathwayToolCallback(pathwayResolver?.args, toolMessage, pathwayResolver);
                // Clear the tool buffer; keep content for citations/continuations.
                this.toolCallsBuffer = [];
            } else {
                // Regular stop, or tool calls without a callback: close the stream.
                progress.progress = 1;
                addCitationsToResolver(pathwayResolver, this.contentBuffer);
                this.toolCallsBuffer = [];
                this.contentBuffer = '';
            }
        }

        if (candidate?.safetyRatings?.some(rating => rating.blocked)) {
            progress.data = JSON.stringify(createChunk({
                content: "\n\n*** Response blocked due to safety ratings ***"
            }, "content_filter"));
            progress.progress = 1;
            this.toolCallsBuffer = [];
            this.contentBuffer = '';
            return progress;
        }

        if (eventData.promptFeedback?.blockReason) {
            progress.data = JSON.stringify(createChunk({
                content: `\n\n*** Response blocked: ${eventData.promptFeedback.blockReason} ***`
            }, "content_filter"));
            progress.progress = 1;
            this.toolCallsBuffer = [];
            this.contentBuffer = '';
            return progress;
        }

        return progress;
    }
}
export default Gemini15VisionPlugin;