/**
 * @mixio-pro/kalaasetu-mcp
 * A powerful Model Context Protocol server providing AI tools for content
 * generation and analysis.
 */
import { z } from "zod";
import {
GoogleGenAI,
createPartFromUri,
createUserContent,
} from "@google/genai";
import * as fs from "fs";
import * as path from "path";
import * as os from "os";
import * as wav from "wav";
import { PassThrough } from "stream";
import { getStorage } from "../storage";
import { generateTimestampedFilename } from "../utils/filename";
// Shared Gemini client used by every tool in this module.
// NOTE(review): falls back to an empty API key when GEMINI_API_KEY is unset,
// so misconfiguration surfaces as a request-time auth error, not at startup.
const ai = new GoogleGenAI({
  apiKey: process.env.GEMINI_API_KEY || "",
});
/**
 * Read an image from storage and wrap it as an inline-data part for the
 * Gemini API.
 *
 * @param filePath - Path (absolute or CWD-relative) of the image in storage.
 * @returns An `inlineData` part with base64-encoded content and a MIME type
 *          inferred from the file extension (falls back to "image/jpeg").
 * @throws Error if the file does not exist in storage.
 */
async function fileToGenerativePart(filePath: string) {
  const storage = getStorage();
  const exists = await storage.exists(filePath);
  if (!exists) {
    // Include path-resolution details so callers can debug relative paths.
    const isAbsolute = path.isAbsolute(filePath);
    const resolvedPath = isAbsolute
      ? filePath
      : path.resolve(process.cwd(), filePath);
    throw new Error(
      `File not found: ${filePath}\n` +
        `Resolved path: ${resolvedPath}\n` +
        `Is absolute: ${isAbsolute}\n` +
        `CWD: ${process.cwd()}`
    );
  }
  const imageBytes = await storage.readFile(filePath);
  // Infer the MIME type from the extension; the original hard-coded
  // "image/jpeg", which mislabeled PNG/WebP/GIF reference images.
  const extensionToMime: Record<string, string> = {
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".webp": "image/webp",
    ".gif": "image/gif",
  };
  const mimeType =
    extensionToMime[path.extname(filePath).toLowerCase()] ?? "image/jpeg";
  return {
    inlineData: {
      data: Buffer.from(imageBytes).toString("base64"),
      mimeType,
    },
  };
}
// Helper function to save WAV file
/**
 * Encode raw PCM samples as a WAV file and persist it via the storage layer.
 *
 * @param filename - Destination path in storage.
 * @param pcmData - Raw little-endian PCM samples.
 * @param channels - Channel count (default mono).
 * @param rate - Sample rate in Hz (default 24000, matching Gemini TTS output).
 * @param sampleWidth - Bytes per sample (default 2 → 16-bit).
 */
async function saveWaveFile(
  filename: string,
  pcmData: Buffer,
  channels = 1,
  rate = 24000,
  sampleWidth = 2
): Promise<void> {
  return new Promise((resolve, reject) => {
    const writer = new wav.Writer({
      channels,
      sampleRate: rate,
      bitDepth: sampleWidth * 8,
    });
    // Collect the encoder's output in memory, then hand the complete WAV
    // buffer to the storage provider in one write.
    const stream = new PassThrough();
    const chunks: Buffer[] = [];
    writer.pipe(stream);
    stream.on("data", (chunk) => chunks.push(chunk));
    stream.on("end", async () => {
      try {
        const wavBuffer = Buffer.concat(chunks);
        const storage = getStorage();
        await storage.writeFile(filename, wavBuffer);
        resolve();
      } catch (err) {
        reject(err);
      }
    });
    // Reject on failures from either end of the pipe; the original only
    // listened on the writer, so a PassThrough error left the promise
    // pending forever.
    stream.on("error", reject);
    writer.on("error", reject);
    writer.write(pcmData);
    writer.end();
  });
}
/**
 * Heuristic check for YouTube video URLs, so they can be passed to Gemini by
 * URI instead of being uploaded via the Files API.
 * Recognizes watch, shorts, live, and embed links plus youtu.be short links
 * (the original only matched watch and youtu.be forms).
 */
function isYouTubeUrl(url: string): boolean {
  return (
    url.includes("youtube.com/watch") ||
    url.includes("youtube.com/shorts") ||
    url.includes("youtube.com/live") ||
    url.includes("youtube.com/embed") ||
    url.includes("youtu.be")
  );
}
// Helper function to get file size in bytes
async function getFileSize(filePath: string): Promise<number> {
const storage = getStorage();
const buffer = await storage.readFile(filePath);
return buffer.length;
}
// Helper function to upload file to Gemini API
async function uploadFileToGemini(filePath: string): Promise<any> {
try {
const storage = getStorage();
// For Gemini API, we need a local file path.
// If storage is local, we can use the path directly (if we can resolve it).
// If storage is remote, we must download to a temp file.
let localPath = filePath;
let isTemp = false;
// Check if we can get a local path from storage (hacky check for LocalStorageProvider)
// A better way is to always download to temp if not sure, or ask storage for a local path.
// For now, let's assume we need to download if it's not a local file system path that exists.
if (!fs.existsSync(filePath)) {
// Try to read from storage and write to temp
const buffer = await storage.readFile(filePath);
const tempDir = os.tmpdir();
const tempFilePath = path.join(tempDir, path.basename(filePath));
fs.writeFileSync(tempFilePath, buffer);
localPath = tempFilePath;
isTemp = true;
}
const uploadedFile = await ai.files.upload({
file: localPath,
});
if (isTemp) {
fs.unlinkSync(localPath);
}
// Wait for file processing to complete
let getFile = await ai.files.get({ name: uploadedFile.name! });
while (getFile.state === "PROCESSING") {
await new Promise((resolve) => setTimeout(resolve, 3000));
getFile = await ai.files.get({ name: uploadedFile.name! });
}
if (getFile.state === "FAILED") {
throw new Error("File processing failed");
}
return getFile;
} catch (error: any) {
throw new Error(`File upload failed: ${error.message}`);
}
}
/**
 * Turn a video input (YouTube URL or stored file path) into a Gemini content
 * part.
 *
 * @param input - YouTube URL or a file path resolvable by the storage layer.
 * @param config - Optional sampling window: fps, startOffset, endOffset.
 * @returns For YouTube: a Part with `fileData` (+ optional `videoMetadata`).
 *          For files: the uploaded Gemini file descriptor from the Files API.
 * @throws Error if a local file cannot be found or the upload fails.
 */
async function processVideoInput(
  input: string,
  config?: { fps?: number; startOffset?: string; endOffset?: string }
): Promise<any> {
  if (isYouTubeUrl(input)) {
    // YouTube URLs are passed by URI. Per the Gemini API Part schema,
    // `videoMetadata` is a sibling of `fileData`, not a field inside it —
    // the original nested it inside fileData, where it was ignored.
    return {
      fileData: {
        fileUri: input,
        mimeType: "video/*",
      },
      videoMetadata: config
        ? {
            fps: config.fps,
            startOffset: config.startOffset,
            endOffset: config.endOffset,
          }
        : undefined,
    };
  } else {
    // Local file: verify existence, then hand off to the File Upload API.
    const storage = getStorage();
    const exists = await storage.exists(input);
    if (!exists) {
      // Include path-resolution details so callers can debug relative paths.
      const isAbsolute = path.isAbsolute(input);
      const resolvedPath = isAbsolute
        ? input
        : path.resolve(process.cwd(), input);
      throw new Error(
        `Video file not found: ${input}\n` +
          `Resolved path: ${resolvedPath}\n` +
          `Is absolute: ${isAbsolute}\n` +
          `CWD: ${process.cwd()}`
      );
    }
    return uploadFileToGemini(input);
  }
}
export const geminiTextToImage = {
name: "generateImage",
description:
"Generate images from text prompts using Gemini image models with optional reference images",
parameters: z.object({
prompt: z.string().describe("Text description of the image to generate"),
aspect_ratio: z
.string()
.optional()
.describe("Aspect ratio: 1:1, 3:4, 4:3, 9:16, or 16:9 (default 9:16)"),
output_path: z
.string()
.optional()
.describe("File path to save the generated image"),
reference_images: z
.array(z.string())
.optional()
.describe("Optional reference image file paths to guide generation"),
}),
execute: async (args: {
prompt: string;
aspect_ratio?: string;
output_path?: string;
reference_images?: string[];
}) => {
try {
const contents: any[] = [args.prompt];
if (args.reference_images && Array.isArray(args.reference_images)) {
for (const refPath of args.reference_images) {
contents.push(await fileToGenerativePart(refPath));
}
}
const response = await ai.models.generateContent({
model: "gemini-3-pro-image-preview",
contents: contents,
config: {
responseModalities: ["TEXT", "IMAGE"],
imageConfig: {
aspectRatio: args.aspect_ratio || "9:16",
},
},
});
const images = [];
let textResponse = "";
if (response.candidates && response.candidates[0]?.content?.parts) {
for (const part of response.candidates[0].content.parts) {
if (part.text) {
textResponse += part.text;
} else if (part.inlineData?.data) {
const imageData = part.inlineData.data;
if (args.output_path) {
const storage = getStorage();
const url = await storage.writeFile(
args.output_path,
Buffer.from(imageData, "base64")
);
images.push({
url,
filename: args.output_path,
mimeType: "image/png",
});
}
}
}
}
if (images.length > 0) {
return JSON.stringify({
images,
message: textResponse || "Image generated successfully",
});
}
return (
textResponse || "Image generation completed but no response received"
);
} catch (error: any) {
throw new Error(`Image generation failed: ${error.message}`);
}
},
};
export const geminiEditImage = {
name: "editImage",
description:
"Edit existing images with text instructions using Gemini 3 Pro Image Preview",
parameters: z.object({
image_path: z.string().describe("Path to the source image file"),
prompt: z.string().describe("Text instructions for editing the image"),
output_path: z
.string()
.optional()
.describe("File path to save the edited image"),
reference_images: z
.array(z.string())
.optional()
.describe("Additional image paths for reference"),
}),
execute: async (args: {
image_path: string;
prompt: string;
output_path?: string;
reference_images?: string[];
}) => {
try {
const imagePart = await fileToGenerativePart(args.image_path);
const contents: any[] = [args.prompt, imagePart];
if (args.reference_images) {
for (const refPath of args.reference_images) {
contents.push(await fileToGenerativePart(refPath));
}
}
const response = await ai.models.generateContent({
model: "gemini-3-pro-image-preview",
contents: contents,
});
const images = [];
let textResponse = "";
if (response.candidates && response.candidates[0]?.content?.parts) {
for (const part of response.candidates[0].content.parts) {
if (part.text) {
textResponse += part.text;
} else if (part.inlineData?.data) {
const imageData = part.inlineData.data;
if (args.output_path) {
const storage = getStorage();
const url = await storage.writeFile(
args.output_path,
Buffer.from(imageData, "base64")
);
images.push({
url,
filename: args.output_path,
mimeType: "image/png",
});
}
}
}
}
if (images.length > 0) {
return JSON.stringify({
images,
message: textResponse || "Image edited successfully",
});
}
return textResponse || "Image editing completed but no response received";
} catch (error: any) {
throw new Error(`Image editing failed: ${error.message}`);
}
},
};
/**
 * MCP tool: answer questions about one or more stored images using
 * Gemini 2.5 Pro multimodal understanding.
 */
export const geminiAnalyzeImages = {
  name: "analyzeImages",
  description:
    "Analyze and describe images using Gemini 2.5 Pro with advanced multimodal understanding",
  parameters: z.object({
    image_paths: z
      .array(z.string())
      .describe("Array of image file paths to analyze"),
    prompt: z.string().describe("Text prompt or question about the images"),
  }),
  execute: async (args: { image_paths: string[]; prompt: string }) => {
    try {
      if (!args.image_paths) {
        throw new Error("Image paths not provided");
      }
      // Some MCP clients send arrays as strings (either a JSON-encoded array
      // or a bare single path); normalize to string[] before use.
      let imagePaths: string[];
      if (typeof args.image_paths === "string") {
        const strValue = args.image_paths as string;
        if (strValue.startsWith("[") && strValue.endsWith("]")) {
          try {
            const parsed = JSON.parse(strValue);
            // Validate the parse result: the original used it unchecked, so
            // e.g. "[1]" would have slipped through as a number array.
            if (
              !Array.isArray(parsed) ||
              !parsed.every((p) => typeof p === "string")
            ) {
              throw new Error("not a string array");
            }
            imagePaths = parsed;
          } catch {
            throw new Error("Invalid image_paths format");
          }
        } else {
          imagePaths = [strValue];
        }
      } else if (Array.isArray(args.image_paths)) {
        imagePaths = args.image_paths;
      } else {
        throw new Error("Invalid image_paths: must be array or string");
      }
      if (imagePaths.length === 0) {
        throw new Error("At least one image path must be provided");
      }
      const contents: any[] = [args.prompt];
      for (const imagePath of imagePaths) {
        contents.push(await fileToGenerativePart(imagePath));
      }
      const response = await ai.models.generateContent({
        model: "gemini-2.5-pro",
        contents: contents,
      });
      let result = "";
      if (response.candidates && response.candidates[0]?.content?.parts) {
        for (const part of response.candidates[0].content.parts) {
          if (part.text) {
            result += part.text;
          }
        }
      }
      return result || "Analysis completed but no text response received";
    } catch (error: any) {
      throw new Error(`Image analysis failed: ${error.message}`);
    }
  },
};
// Gemini TTS returns raw 16-bit little-endian PCM (24 kHz mono) with no
// container; prepend a standard 44-byte RIFF/WAVE header so the saved file is
// a playable .wav. The original wrote headerless PCM labeled "audio/wav".
function pcmToWav(
  pcm: Buffer,
  channels = 1,
  sampleRate = 24000,
  bitDepth = 16
): Buffer {
  const byteRate = (sampleRate * channels * bitDepth) / 8;
  const blockAlign = (channels * bitDepth) / 8;
  const header = Buffer.alloc(44);
  header.write("RIFF", 0);
  header.writeUInt32LE(36 + pcm.length, 4); // RIFF chunk size
  header.write("WAVE", 8);
  header.write("fmt ", 12);
  header.writeUInt32LE(16, 16); // fmt chunk size
  header.writeUInt16LE(1, 20); // audio format 1 = uncompressed PCM
  header.writeUInt16LE(channels, 22);
  header.writeUInt32LE(sampleRate, 24);
  header.writeUInt32LE(byteRate, 28);
  header.writeUInt16LE(blockAlign, 32);
  header.writeUInt16LE(bitDepth, 34);
  header.write("data", 36);
  header.writeUInt32LE(pcm.length, 40);
  return Buffer.concat([header, pcm]);
}
/**
 * MCP tool: single-speaker text-to-speech via the Gemini TTS preview model,
 * saving the result as a WAV file through the storage layer.
 */
export const geminiSingleSpeakerTts = {
  name: "generateSpeech",
  description:
    "Generate single speaker voice audio from text using Gemini 2.5 Pro Preview TTS model",
  parameters: z.object({
    text: z.string().describe("Text to convert to speech"),
    voice_name: z
      .string()
      .describe(
        "Voice name from supported options. Use Kore, Erinome or Despina for the female voices and Enceladus for male."
      ),
    output_path: z
      .string()
      .optional()
      .describe(
        "Output WAV file path (optional, defaults to timestamp-based filename)"
      ),
  }),
  execute: async (args: {
    text: string;
    voice_name: string;
    output_path?: string;
  }) => {
    try {
      const response = await ai.models.generateContent({
        model: "gemini-2.5-pro-preview-tts",
        contents: [{ parts: [{ text: args.text }] }],
        config: {
          responseModalities: ["AUDIO"],
          speechConfig: {
            voiceConfig: {
              prebuiltVoiceConfig: {
                voiceName: args.voice_name || "Despina",
              },
            },
          },
        },
      });
      const data =
        response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;
      if (!data) {
        throw new Error("No audio data received from Gemini API");
      }
      const pcmBuffer = Buffer.from(data, "base64");
      // Use provided output path or generate default with timestamp.
      const outputPath =
        args.output_path || generateTimestampedFilename("voice_output.wav");
      const storage = getStorage();
      // Wrap the raw PCM in a WAV container before persisting.
      const url = await storage.writeFile(outputPath, pcmToWav(pcmBuffer));
      return JSON.stringify({
        audio: {
          url,
          filename: outputPath,
          mimeType: "audio/wav",
        },
        message: "Audio generated successfully",
      });
    } catch (error: any) {
      throw new Error(`Voice generation failed: ${error.message}`);
    }
  },
};
/**
 * MCP tool: analyze video content (local files or YouTube URLs) with
 * Gemini 2.5 Pro. Local files are uploaded via the Files API; YouTube URLs
 * are passed by URI.
 */
export const geminiAnalyzeVideos = {
  name: "analyzeVideos",
  description:
    "Analyze and understand video content using Gemini 2.5 Pro model. Intelligently handles YouTube URLs and local videos (files <20MB processed inline, ≥20MB uploaded via File API). Supports timestamp queries, clipping, and custom frame rates with default 5 FPS for local videos to optimize processing.",
  parameters: z.object({
    video_inputs: z
      .array(z.string())
      .describe(
        "Array of video inputs - mix of local file paths and YouTube URLs (max 10 videos). Local files <20MB processed inline, larger files uploaded via File API automatically."
      ),
    prompt: z
      .string()
      .describe(
        "Text prompt or question about the videos. Use MM:SS format for timestamp references (e.g., 'What happens at 01:30?')."
      ),
    fps: z
      .number()
      .optional()
      .describe(
        "Frame rate for video processing (default: 5 FPS for local videos to reduce file size, 1 FPS for YouTube URLs)"
      ),
    start_offset: z
      .string()
      .optional()
      .describe("Clip start time in seconds with 's' suffix (e.g., '40s')"),
    end_offset: z
      .string()
      .optional()
      .describe("Clip end time in seconds with 's' suffix (e.g., '80s')"),
    media_resolution: z
      .string()
      .optional()
      .describe(
        "Media resolution: 'default' or 'low' (low resolution uses ~100 tokens/sec vs 300 tokens/sec)"
      ),
  }),
  execute: async (args: {
    video_inputs: string[];
    prompt: string;
    fps?: number;
    start_offset?: string;
    end_offset?: string;
    media_resolution?: string;
  }) => {
    try {
      if (!args.video_inputs) {
        throw new Error("Video inputs not provided");
      }
      // Some MCP clients send arrays as strings (either a JSON-encoded array
      // or a bare single input); normalize to string[] before use.
      let videoInputs: string[];
      if (typeof args.video_inputs === "string") {
        const strValue = args.video_inputs as string;
        if (strValue.startsWith("[") && strValue.endsWith("]")) {
          try {
            videoInputs = JSON.parse(strValue);
          } catch {
            throw new Error("Invalid video_inputs format");
          }
        } else {
          videoInputs = [strValue];
        }
      } else if (Array.isArray(args.video_inputs)) {
        videoInputs = args.video_inputs;
      } else {
        throw new Error("Invalid video_inputs: must be array or string");
      }
      if (videoInputs.length === 0) {
        throw new Error("At least one video input must be provided");
      }
      if (videoInputs.length > 10) {
        throw new Error(
          "Maximum 10 videos per request allowed for Gemini 2.5+ models"
        );
      }
      // Convert every input into a content part.
      const videoParts: any[] = [];
      for (const videoInput of videoInputs) {
        const videoConfig = {
          // Default 5 FPS for local files, 1 FPS for YouTube URLs.
          fps: args.fps || (isYouTubeUrl(videoInput) ? 1 : 5),
          startOffset: args.start_offset,
          endOffset: args.end_offset,
        };
        videoParts.push(await processVideoInput(videoInput, videoConfig));
      }
      // Build the request content. Uploaded Files-API descriptors expose
      // top-level uri/mimeType and are wrapped with createPartFromUri;
      // YouTube inputs come back as ready-made Parts (fileData) and are
      // pushed as-is — the original's uri/mimeType filter silently dropped
      // every YouTube video from the request.
      const contentParts: any[] = [args.prompt];
      for (const videoPart of videoParts) {
        if (videoPart.uri && videoPart.mimeType) {
          contentParts.push(
            createPartFromUri(videoPart.uri, videoPart.mimeType)
          );
        } else {
          contentParts.push(videoPart);
        }
      }
      // NOTE(review): args.media_resolution is accepted but never forwarded
      // to the API here — confirm whether it should populate the request
      // config or be removed from the schema.
      const finalContents = createUserContent(contentParts);
      const response = await ai.models.generateContent({
        model: "gemini-2.5-pro",
        contents: finalContents,
      });
      let result = "";
      if (response.candidates && response.candidates[0]?.content?.parts) {
        for (const part of response.candidates[0].content.parts) {
          if (part.text) {
            result += part.text;
          }
        }
      }
      return result || "Video analysis completed but no text response received";
    } catch (error: any) {
      throw new Error(`Video analysis failed: ${error.message}`);
    }
  },
};