UNPKG

@mixio-pro/kalaasetu-mcp

Version:

A powerful Model Context Protocol server providing AI tools for content generation and analysis

614 lines (550 loc) 18.6 kB
import { z } from "zod";
import {
  GoogleGenAI,
  MediaResolution,
  createPartFromUri,
  createUserContent,
} from "@google/genai";
import type { Part } from "@google/genai";
import * as fs from "fs";
import * as path from "path";
import * as os from "os";
import * as wav from "wav";
import { PassThrough } from "stream";
import { getStorage } from "../storage";
import { generateTimestampedFilename } from "../utils/filename";

const ai = new GoogleGenAI({
  apiKey: process.env.GEMINI_API_KEY || "",
});

/** Delay between two polls of an uploaded file's processing state. */
const FILE_POLL_INTERVAL_MS = 3000;

/** Upper bound on how long an uploaded file may stay in PROCESSING before we give up. */
const FILE_PROCESSING_TIMEOUT_MS = 10 * 60 * 1000;

/** Maximum number of videos accepted by a single analyzeVideos call. */
const MAX_VIDEOS_PER_REQUEST = 10;

/** MIME types keyed by lower-case file extension; unknown extensions fall back to JPEG. */
const IMAGE_MIME_TYPES: Record<string, string> = {
  ".png": "image/png",
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".webp": "image/webp",
  ".heic": "image/heic",
  ".heif": "image/heif",
};

/**
 * Matches YouTube watch links and youtu.be short links, with or without a
 * scheme. Anchored at the start so that a local path which merely contains
 * "youtu.be" is not mistaken for a URL.
 */
const YOUTUBE_URL_PATTERN =
  /^(?:https?:\/\/)?(?:[\w-]+\.)*(?:youtube\.com\/watch|youtu\.be\/)/i;

/** Whatever the configured storage provider resolves `writeFile` to (used as the public URL). */
type StorageWriteResult = Awaited<
  ReturnType<ReturnType<typeof getStorage>["writeFile"]>
>;

/** File metadata as returned by the Gemini Files API. */
type UploadedGeminiFile = Awaited<ReturnType<typeof ai.files.get>>;

/** Clipping / sampling options attached to a video part. */
interface VideoClipConfig {
  fps?: number;
  startOffset?: string;
  endOffset?: string;
}

/** Structural subset of a `generateContent` response that the helpers below read. */
interface GeneratedContentLike {
  candidates?: Array<{
    content?: {
      parts?: Array<{
        text?: string;
        thought?: boolean;
        inlineData?: { data?: string; mimeType?: string };
      }>;
    };
  }>;
}

/** One image persisted to storage. */
interface SavedImage {
  url: StorageWriteResult;
  filename: string;
  mimeType: string;
}

/** Extracts a printable message from anything that was thrown. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Builds the "not found" diagnostic shared by image and video inputs. */
function describeMissingFile(label: string, filePath: string): string {
  const isAbsolute = path.isAbsolute(filePath);
  const resolvedPath = isAbsolute
    ? filePath
    : path.resolve(process.cwd(), filePath);
  return (
    `${label}: ${filePath}\n` +
    `Resolved path: ${resolvedPath}\n` +
    `Is absolute: ${isAbsolute}\n` +
    `CWD: ${process.cwd()}`
  );
}

/**
 * Reads an image from storage and wraps it as an inline-data part.
 *
 * The MIME type is derived from the file extension; extensions that are not
 * in IMAGE_MIME_TYPES keep the previous behaviour of being sent as JPEG.
 *
 * @throws Error when the file does not exist in storage.
 */
async function fileToGenerativePart(filePath: string) {
  const storage = getStorage();

  if (!(await storage.exists(filePath))) {
    throw new Error(describeMissingFile("File not found", filePath));
  }

  const imageBytes = await storage.readFile(filePath);
  const extension = path.extname(filePath).toLowerCase();
  return {
    inlineData: {
      data: Buffer.from(imageBytes).toString("base64"),
      mimeType: IMAGE_MIME_TYPES[extension] ?? "image/jpeg",
    },
  };
}

/**
 * Wraps raw PCM samples in a WAV container and writes the result to storage.
 *
 * @param filename Destination path handed to the storage provider.
 * @param pcmData Raw PCM samples.
 * @param channels Channel count written to the header.
 * @param rate Sample rate in Hz written to the header.
 * @param sampleWidth Bytes per sample (bit depth is `sampleWidth * 8`).
 * @returns Whatever the storage provider's `writeFile` resolves to.
 */
function saveWaveFile(
  filename: string,
  pcmData: Buffer,
  channels = 1,
  rate = 24000,
  sampleWidth = 2
): Promise<StorageWriteResult> {
  return new Promise<StorageWriteResult>((resolve, reject) => {
    const writer = new wav.Writer({
      channels,
      sampleRate: rate,
      bitDepth: sampleWidth * 8,
    });
    const stream = new PassThrough();
    const chunks: Buffer[] = [];

    writer.on("error", reject);
    stream.on("error", reject);
    stream.on("data", (chunk: Buffer) => chunks.push(chunk));
    stream.on("end", () => {
      // Start from a resolved promise so a synchronous throw inside the
      // storage layer becomes a rejection instead of an uncaught exception.
      Promise.resolve()
        .then(() => getStorage().writeFile(filename, Buffer.concat(chunks)))
        .then(resolve, reject);
    });

    writer.pipe(stream);
    writer.write(pcmData);
    writer.end();
  });
}

/** Returns true when `url` is a YouTube watch link or a youtu.be short link. */
function isYouTubeUrl(url: string): boolean {
  return YOUTUBE_URL_PATTERN.test(url.trim());
}

/** Returns the size in bytes of a stored file (reads the whole file to measure it). */
async function getFileSize(filePath: string): Promise<number> {
  const storage = getStorage();
  const buffer = await storage.readFile(filePath);
  return buffer.length;
}

/**
 * Uploads a file through the Gemini Files API and waits until it is processed.
 *
 * The Files API needs a path on the local file system. When `filePath` does
 * not exist locally, the bytes are fetched from storage into a private temp
 * directory, which is removed again whether or not the upload succeeds.
 *
 * @returns The file metadata once it has left the PROCESSING state.
 * @throws Error (prefixed with "File upload failed") on upload failure,
 *   processing failure, or when processing exceeds FILE_PROCESSING_TIMEOUT_MS.
 */
async function uploadFileToGemini(
  filePath: string
): Promise<UploadedGeminiFile> {
  try {
    let localPath = filePath;
    let tempDir: string | undefined;

    if (!fs.existsSync(filePath)) {
      const buffer = await getStorage().readFile(filePath);
      // A fresh directory per upload prevents two concurrent uploads of
      // same-named files from clobbering each other.
      tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "gemini-upload-"));
      localPath = path.join(tempDir, path.basename(filePath));
      fs.writeFileSync(localPath, buffer);
    }

    let uploadedName: string | undefined;
    try {
      const uploadedFile = await ai.files.upload({ file: localPath });
      uploadedName = uploadedFile.name;
    } finally {
      if (tempDir) {
        fs.rmSync(tempDir, { recursive: true, force: true });
      }
    }

    if (!uploadedName) {
      throw new Error("Upload did not return a file name");
    }

    // Poll until processing finishes, but never forever.
    const deadline = Date.now() + FILE_PROCESSING_TIMEOUT_MS;
    let fileInfo = await ai.files.get({ name: uploadedName });
    while (fileInfo.state === "PROCESSING") {
      if (Date.now() >= deadline) {
        throw new Error(
          `File processing timed out after ${FILE_PROCESSING_TIMEOUT_MS / 1000}s`
        );
      }
      await new Promise((resolve) =>
        setTimeout(resolve, FILE_POLL_INTERVAL_MS)
      );
      fileInfo = await ai.files.get({ name: uploadedName });
    }

    if (fileInfo.state === "FAILED") {
      throw new Error("File processing failed");
    }

    return fileInfo;
  } catch (error: unknown) {
    throw new Error(`File upload failed: ${describeError(error)}`);
  }
}

/** Drops unset fields; returns undefined when no clip option is set at all. */
function buildVideoMetadata(
  config?: VideoClipConfig
): VideoClipConfig | undefined {
  if (!config) {
    return undefined;
  }
  const metadata: VideoClipConfig = {};
  if (config.fps !== undefined) metadata.fps = config.fps;
  if (config.startOffset) metadata.startOffset = config.startOffset;
  if (config.endOffset) metadata.endOffset = config.endOffset;
  return Object.keys(metadata).length > 0 ? metadata : undefined;
}

/**
 * Turns a video input into a content part ready to send to the model.
 *
 * YouTube URLs are referenced directly by URI; anything else is treated as a
 * stored file and uploaded through the Files API. In both cases the clip
 * options are attached as `videoMetadata` next to `fileData` on the part.
 *
 * @throws Error when a non-YouTube input does not exist in storage, or when
 *   the uploaded file comes back without a URI or MIME type.
 */
async function processVideoInput(
  input: string,
  config?: VideoClipConfig
): Promise<Part> {
  const videoMetadata = buildVideoMetadata(config);
  const metadataFields = videoMetadata ? { videoMetadata } : {};

  if (isYouTubeUrl(input)) {
    return { fileData: { fileUri: input }, ...metadataFields };
  }

  const storage = getStorage();
  if (!(await storage.exists(input))) {
    throw new Error(describeMissingFile("Video file not found", input));
  }

  const uploadedFile = await uploadFileToGemini(input);
  if (!uploadedFile.uri || !uploadedFile.mimeType) {
    throw new Error(`Uploaded video is missing a URI or MIME type: ${input}`);
  }
  return {
    ...createPartFromUri(uploadedFile.uri, uploadedFile.mimeType),
    ...metadataFields,
  };
}

/**
 * Normalises a tool argument that should be a string array but may arrive as
 * a single string or as a JSON-encoded array.
 *
 * @param value Raw argument value.
 * @param fieldName Argument name, used verbatim in error messages.
 * @throws Error when the value cannot be interpreted as a list of strings.
 */
function coerceStringList(value: unknown, fieldName: string): string[] {
  if (typeof value === "string") {
    if (value.startsWith("[") && value.endsWith("]")) {
      let parsed: unknown;
      try {
        parsed = JSON.parse(value);
      } catch {
        throw new Error(`Invalid ${fieldName} format`);
      }
      if (
        !Array.isArray(parsed) ||
        !parsed.every((item): item is string => typeof item === "string")
      ) {
        throw new Error(`Invalid ${fieldName} format`);
      }
      return parsed;
    }
    return [value];
  }
  if (Array.isArray(value)) {
    return value;
  }
  throw new Error(`Invalid ${fieldName}: must be array or string`);
}

/** Concatenates every text part of the first candidate. */
function extractText(response: GeneratedContentLike): string {
  const parts = response.candidates?.[0]?.content?.parts ?? [];
  let text = "";
  for (const part of parts) {
    if (part.text) {
      text += part.text;
    }
  }
  return text;
}

/** Inserts `_<index>` before the extension without otherwise rewriting the path. */
function withIndexSuffix(filePath: string, index: number): string {
  const extension = path.extname(filePath);
  const stem = filePath.slice(0, filePath.length - extension.length);
  return `${stem}_${index}${extension}`;
}

/**
 * Writes every image of the first candidate to storage and collects the text.
 *
 * The first image goes to `requestedPath` (or a timestamped default when none
 * was given); further images get `_2`, `_3`, ... suffixes so they do not
 * overwrite each other. Parts flagged as model "thoughts" are skipped so that
 * only final output is persisted.
 */
async function persistResponseImages(
  response: GeneratedContentLike,
  requestedPath: string | undefined,
  fallbackName: string
): Promise<{ images: SavedImage[]; text: string }> {
  const parts = response.candidates?.[0]?.content?.parts ?? [];
  const images: SavedImage[] = [];
  let text = "";
  let basePath: string | undefined;

  for (const part of parts) {
    if (part.thought) {
      continue;
    }
    if (part.text) {
      text += part.text;
      continue;
    }
    const encoded = part.inlineData?.data;
    if (!encoded) {
      continue;
    }

    // Resolve the default lazily so no filename is generated for text-only replies.
    basePath ??= requestedPath || generateTimestampedFilename(fallbackName);
    const filename =
      images.length === 0
        ? basePath
        : withIndexSuffix(basePath, images.length + 1);
    const url = await getStorage().writeFile(
      filename,
      Buffer.from(encoded, "base64")
    );
    images.push({
      url,
      filename,
      mimeType: part.inlineData?.mimeType ?? "image/png",
    });
  }

  return { images, text };
}

/**
 * Tool: generate images from a text prompt, optionally guided by reference images.
 *
 * Returns a JSON string listing the saved images, or the model's text reply
 * when no image came back.
 */
export const geminiTextToImage = {
  name: "generateImage",
  description:
    "Generate images from text prompts using Gemini image models with optional reference images",
  parameters: z.object({
    prompt: z.string().describe("Text description of the image to generate"),
    aspect_ratio: z
      .string()
      .optional()
      .describe("Aspect ratio: 1:1, 3:4, 4:3, 9:16, or 16:9 (default 9:16)"),
    output_path: z
      .string()
      .optional()
      .describe("File path to save the generated image"),
    reference_images: z
      .array(z.string())
      .optional()
      .describe("Optional reference image file paths to guide generation"),
  }),
  execute: async (args: {
    prompt: string;
    aspect_ratio?: string;
    output_path?: string;
    reference_images?: string[];
  }) => {
    try {
      const referencePaths = Array.isArray(args.reference_images)
        ? args.reference_images
        : [];
      const referenceParts = await Promise.all(
        referencePaths.map((refPath) => fileToGenerativePart(refPath))
      );

      const response = await ai.models.generateContent({
        model: "gemini-3-pro-image-preview",
        contents: [args.prompt, ...referenceParts],
        config: {
          responseModalities: ["TEXT", "IMAGE"],
          imageConfig: {
            aspectRatio: args.aspect_ratio || "9:16",
          },
        },
      });

      const { images, text } = await persistResponseImages(
        response,
        args.output_path,
        "generated_image.png"
      );

      if (images.length > 0) {
        return JSON.stringify({
          images,
          message: text || "Image generated successfully",
        });
      }
      return text || "Image generation completed but no response received";
    } catch (error: unknown) {
      throw new Error(`Image generation failed: ${describeError(error)}`);
    }
  },
};

/**
 * Tool: edit an existing image according to text instructions.
 *
 * Returns a JSON string listing the saved images, or the model's text reply
 * when no image came back.
 */
export const geminiEditImage = {
  name: "editImage",
  description:
    "Edit existing images with text instructions using Gemini 3 Pro Image Preview",
  parameters: z.object({
    image_path: z.string().describe("Path to the source image file"),
    prompt: z.string().describe("Text instructions for editing the image"),
    output_path: z
      .string()
      .optional()
      .describe("File path to save the edited image"),
    reference_images: z
      .array(z.string())
      .optional()
      .describe("Additional image paths for reference"),
  }),
  execute: async (args: {
    image_path: string;
    prompt: string;
    output_path?: string;
    reference_images?: string[];
  }) => {
    try {
      const sourcePart = await fileToGenerativePart(args.image_path);
      const referenceParts = await Promise.all(
        (args.reference_images ?? []).map((refPath) =>
          fileToGenerativePart(refPath)
        )
      );

      const response = await ai.models.generateContent({
        model: "gemini-3-pro-image-preview",
        contents: [args.prompt, sourcePart, ...referenceParts],
      });

      const { images, text } = await persistResponseImages(
        response,
        args.output_path,
        "edited_image.png"
      );

      if (images.length > 0) {
        return JSON.stringify({
          images,
          message: text || "Image edited successfully",
        });
      }
      return text || "Image editing completed but no response received";
    } catch (error: unknown) {
      throw new Error(`Image editing failed: ${describeError(error)}`);
    }
  },
};

/** Tool: answer a prompt about one or more images. Returns the model's text reply. */
export const geminiAnalyzeImages = {
  name: "analyzeImages",
  description:
    "Analyze and describe images using Gemini 2.5 Pro with advanced multimodal understanding",
  parameters: z.object({
    image_paths: z
      .array(z.string())
      .describe("Array of image file paths to analyze"),
    prompt: z.string().describe("Text prompt or question about the images"),
  }),
  execute: async (args: { image_paths: string[]; prompt: string }) => {
    try {
      if (!args.image_paths) {
        throw new Error("Image paths not provided");
      }

      // Some clients send the array as a single string or as JSON text.
      const imagePaths = coerceStringList(args.image_paths, "image_paths");
      if (imagePaths.length === 0) {
        throw new Error("At least one image path must be provided");
      }

      const imageParts = await Promise.all(
        imagePaths.map((imagePath) => fileToGenerativePart(imagePath))
      );

      const response = await ai.models.generateContent({
        model: "gemini-2.5-pro",
        contents: [args.prompt, ...imageParts],
      });

      return (
        extractText(response) ||
        "Analysis completed but no text response received"
      );
    } catch (error: unknown) {
      throw new Error(`Image analysis failed: ${describeError(error)}`);
    }
  },
};

/**
 * Tool: synthesise single-speaker speech.
 *
 * The model returns raw PCM samples, so they are wrapped in a WAV container
 * (using saveWaveFile's defaults: mono, 24 kHz, 16-bit) before being stored;
 * otherwise the ".wav" output would have no header.
 */
export const geminiSingleSpeakerTts = {
  name: "generateSpeech",
  description:
    "Generate single speaker voice audio from text using Gemini 2.5 Pro Preview TTS model",
  parameters: z.object({
    text: z.string().describe("Text to convert to speech"),
    voice_name: z
      .string()
      .describe(
        "Voice name from supported options. Use Kore, Erinome or Despina for the female voices and Enceladus for male."
      ),
    output_path: z
      .string()
      .optional()
      .describe(
        "Output WAV file path (optional, defaults to timestamp-based filename)"
      ),
  }),
  execute: async (args: {
    text: string;
    voice_name: string;
    output_path?: string;
  }) => {
    try {
      const response = await ai.models.generateContent({
        model: "gemini-2.5-pro-preview-tts",
        contents: [{ parts: [{ text: args.text }] }],
        config: {
          responseModalities: ["AUDIO"],
          speechConfig: {
            voiceConfig: {
              prebuiltVoiceConfig: {
                voiceName: args.voice_name || "Despina",
              },
            },
          },
        },
      });

      const data =
        response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;
      if (!data) {
        throw new Error("No audio data received from Gemini API");
      }

      const outputPath =
        args.output_path || generateTimestampedFilename("voice_output.wav");
      const url = await saveWaveFile(outputPath, Buffer.from(data, "base64"));

      return JSON.stringify({
        audio: {
          url,
          filename: outputPath,
          mimeType: "audio/wav",
        },
        message: "Audio generated successfully",
      });
    } catch (error: unknown) {
      throw new Error(`Voice generation failed: ${describeError(error)}`);
    }
  },
};

/**
 * Tool: answer a prompt about one or more videos (YouTube URLs or stored files).
 *
 * NOTE(review): the description strings below mention "Gemini 2.5 Flash" and
 * inline processing of files under 20MB, while the code calls
 * "gemini-2.5-pro" and uploads every non-YouTube input through the Files
 * API. The strings are left unchanged here; confirm which side is intended.
 */
export const geminiAnalyzeVideos = {
  name: "analyzeVideos",
  description:
    "Analyze and understand video content using Gemini 2.5 Flash model. Intelligently handles YouTube URLs and local videos (files <20MB processed inline, ≥20MB uploaded via File API). Supports timestamp queries, clipping, and custom frame rates with default 5 FPS for local videos to optimize processing.",
  parameters: z.object({
    video_inputs: z
      .array(z.string())
      .describe(
        "Array of video inputs - mix of local file paths and YouTube URLs (max 10 videos). Local files <20MB processed inline, larger files uploaded via File API automatically."
      ),
    prompt: z
      .string()
      .describe(
        "Text prompt or question about the videos. Use MM:SS format for timestamp references (e.g., 'What happens at 01:30?')."
      ),
    fps: z
      .number()
      .optional()
      .describe(
        "Frame rate for video processing (default: 5 FPS for local videos to reduce file size, 1 FPS for YouTube URLs)"
      ),
    start_offset: z
      .string()
      .optional()
      .describe("Clip start time in seconds with 's' suffix (e.g., '40s')"),
    end_offset: z
      .string()
      .optional()
      .describe("Clip end time in seconds with 's' suffix (e.g., '80s')"),
    media_resolution: z
      .string()
      .optional()
      .describe(
        "Media resolution: 'default' or 'low' (low resolution uses ~100 tokens/sec vs 300 tokens/sec)"
      ),
  }),
  execute: async (args: {
    video_inputs: string[];
    prompt: string;
    fps?: number;
    start_offset?: string;
    end_offset?: string;
    media_resolution?: string;
  }) => {
    try {
      if (!args.video_inputs) {
        throw new Error("Video inputs not provided");
      }

      // Some clients send the array as a single string or as JSON text.
      const videoInputs = coerceStringList(args.video_inputs, "video_inputs");
      if (videoInputs.length === 0) {
        throw new Error("At least one video input must be provided");
      }
      if (videoInputs.length > MAX_VIDEOS_PER_REQUEST) {
        throw new Error(
          "Maximum 10 videos per request allowed for Gemini 2.5+ models"
        );
      }

      // Uploads stay sequential so large files are not pushed concurrently.
      const videoParts: Part[] = [];
      for (const videoInput of videoInputs) {
        const defaultFps = isYouTubeUrl(videoInput) ? 1 : 5;
        const fps =
          args.fps !== undefined && args.fps > 0 ? args.fps : defaultFps;
        videoParts.push(
          await processVideoInput(videoInput, {
            fps,
            startOffset: args.start_offset,
            endOffset: args.end_offset,
          })
        );
      }

      // Only "low" changes the request; anything else keeps the API default.
      const useLowResolution =
        args.media_resolution?.trim().toLowerCase() === "low";

      const response = await ai.models.generateContent({
        model: "gemini-2.5-pro",
        contents: createUserContent([args.prompt, ...videoParts]),
        ...(useLowResolution
          ? {
              config: {
                mediaResolution: MediaResolution.MEDIA_RESOLUTION_LOW,
              },
            }
          : {}),
      });

      return (
        extractText(response) ||
        "Video analysis completed but no text response received"
      );
    } catch (error: unknown) {
      throw new Error(`Video analysis failed: ${describeError(error)}`);
    }
  },
};