vram-calculator-mcp-server
Model Context Protocol (MCP) server that estimates VRAM requirements for AI model inference, training, and fine-tuning, and recommends suitable GPUs
"use strict";
/**
 * VRAM Calculator MCP Server Implementation
 *
 * Exposes model and GPU databases as MCP resources and provides tools for
 * VRAM estimation, GPU recommendation, and model-to-model comparison.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.VRAMCalculatorServer = void 0;
class VRAMCalculatorServer {
constructor() {
        // Model catalog; params are in billions of parameters
        this.models = [
// NLP Models
{ id: 'llama-2-7b', name: 'Llama 2 7B', params: 7, architecture: 'transformer', type: 'nlp' },
{ id: 'llama-2-13b', name: 'Llama 2 13B', params: 13, architecture: 'transformer', type: 'nlp' },
{ id: 'llama-2-70b', name: 'Llama 2 70B', params: 70, architecture: 'transformer', type: 'nlp' },
{ id: 'qwen2.5-7b', name: 'Qwen2.5 7B', params: 7, architecture: 'transformer', type: 'nlp' },
{ id: 'qwen2.5-14b', name: 'Qwen2.5 14B', params: 14, architecture: 'transformer', type: 'nlp' },
{ id: 'qwen2.5-32b', name: 'Qwen2.5 32B', params: 32, architecture: 'transformer', type: 'nlp' },
{ id: 'qwen2.5-72b', name: 'Qwen2.5 72B', params: 72, architecture: 'transformer', type: 'nlp' },
{ id: 'deepseek-v2', name: 'DeepSeek V2', params: 236, architecture: 'moe', type: 'nlp' },
{ id: 'yi-34b', name: 'Yi 34B', params: 34, architecture: 'transformer', type: 'nlp' },
{ id: 'baichuan2-13b', name: 'Baichuan2 13B', params: 13, architecture: 'transformer', type: 'nlp' },
// Multimodal Models
{ id: 'qwen2-vl-7b', name: 'Qwen2-VL 7B', params: 7, architecture: 'multimodal', type: 'multimodal' },
{ id: 'llava-1.5-7b', name: 'LLaVA 1.5 7B', params: 7, architecture: 'multimodal', type: 'multimodal' },
{ id: 'llava-1.5-13b', name: 'LLaVA 1.5 13B', params: 13, architecture: 'multimodal', type: 'multimodal' },
{ id: 'cogvlm-17b', name: 'CogVLM 17B', params: 17, architecture: 'multimodal', type: 'multimodal' },
{ id: 'internvl-chat-v1.5', name: 'InternVL Chat V1.5', params: 26, architecture: 'multimodal', type: 'multimodal' },
// Embedding Models
{ id: 'bge-large-zh', name: 'BGE Large ZH', params: 0.3, architecture: 'bert', type: 'embedding' },
{ id: 'text2vec-large', name: 'Text2Vec Large', params: 0.3, architecture: 'bert', type: 'embedding' },
{ id: 'gte-large', name: 'GTE Large', params: 0.3, architecture: 'bert', type: 'embedding' }
];
        // GPU catalog; vram in GB, price in USD (approximate), powerConsumption in watts, memoryBandwidth in GB/s
        this.gpus = [
{ name: 'RTX 4060', vram: 8, price: 299, architecture: 'Ada Lovelace', powerConsumption: 115, memoryBandwidth: 272 },
{ name: 'RTX 4060 Ti', vram: 16, price: 499, architecture: 'Ada Lovelace', powerConsumption: 165, memoryBandwidth: 288 },
{ name: 'RTX 4070', vram: 12, price: 599, architecture: 'Ada Lovelace', powerConsumption: 200, memoryBandwidth: 504 },
{ name: 'RTX 4070 Ti', vram: 12, price: 799, architecture: 'Ada Lovelace', powerConsumption: 285, memoryBandwidth: 504 },
{ name: 'RTX 4080', vram: 16, price: 1199, architecture: 'Ada Lovelace', powerConsumption: 320, memoryBandwidth: 717 },
{ name: 'RTX 4090', vram: 24, price: 1599, architecture: 'Ada Lovelace', powerConsumption: 450, memoryBandwidth: 1008 },
{ name: 'RTX 5090', vram: 32, price: 1999, architecture: 'Blackwell', powerConsumption: 575, memoryBandwidth: 1792 },
{ name: 'A100 40GB', vram: 40, price: 10000, architecture: 'Ampere', powerConsumption: 400, memoryBandwidth: 1555 },
{ name: 'A100 80GB', vram: 80, price: 15000, architecture: 'Ampere', powerConsumption: 400, memoryBandwidth: 1935 },
{ name: 'H100 80GB', vram: 80, price: 25000, architecture: 'Hopper', powerConsumption: 700, memoryBandwidth: 3350 },
{ name: 'L40S', vram: 48, price: 8000, architecture: 'Ada Lovelace', powerConsumption: 350, memoryBandwidth: 864 },
{ name: 'A6000', vram: 48, price: 4500, architecture: 'Ampere', powerConsumption: 300, memoryBandwidth: 768 }
];
}
// Resource handlers
async listResources() {
return {
resources: [
{
uri: 'models://nlp',
name: 'NLP Models Database',
description: 'Comprehensive database of NLP/Language models with detailed specifications'
},
{
uri: 'models://multimodal',
name: 'Multimodal Models Database',
description: 'Database of multimodal models supporting text, image, audio, and video'
},
{
uri: 'models://embedding',
name: 'Embedding Models Database',
description: 'Database of text embedding and reranking models'
},
{
uri: 'gpu://specs',
name: 'GPU Specifications Database',
description: 'Detailed specifications and pricing for consumer and datacenter GPUs'
},
{
uri: 'gpu://recommendations',
name: 'GPU Recommendations',
description: 'Intelligent GPU recommendations based on VRAM requirements'
},
{
uri: 'formulas://vram',
name: 'VRAM Calculation Formulas',
description: 'Comprehensive documentation of VRAM calculation methodologies'
}
]
};
}
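    /**
     * Resolve a resource URI to its JSON or markdown payload.
     * Throws for URIs not in the listResources catalog.
     */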
async readResource(uri) {
switch (uri) {
case 'models://nlp':
return {
contents: [{
uri,
text: JSON.stringify({
total: this.models.filter(m => m.type === 'nlp').length,
models: this.models.filter(m => m.type === 'nlp')
}, null, 2)
}]
};
case 'models://multimodal':
return {
contents: [{
uri,
text: JSON.stringify({
total: this.models.filter(m => m.type === 'multimodal').length,
models: this.models.filter(m => m.type === 'multimodal')
}, null, 2)
}]
};
case 'models://embedding':
return {
contents: [{
uri,
text: JSON.stringify({
total: this.models.filter(m => m.type === 'embedding').length,
models: this.models.filter(m => m.type === 'embedding')
}, null, 2)
}]
};
case 'gpu://specs':
return {
contents: [{
uri,
text: JSON.stringify({
total: this.gpus.length,
gpus: this.gpus
}, null, 2)
}]
};
            case 'gpu://recommendations':
                // Static fallback for the advertised gpu://recommendations resource;
                // the recommend_gpu tool gives budget- and use-case-aware results.
                return {
                    contents: [{
                            uri,
                            text: JSON.stringify({
                                note: 'Use the recommend_gpu tool for budget- and use-case-aware recommendations',
                                gpusByVRAM: [...this.gpus].sort((a, b) => b.vram - a.vram)
                            }, null, 2)
                        }]
                };
            case 'formulas://vram':
return {
contents: [{
uri,
text: `# VRAM Calculation Formulas
## Universal LLM VRAM Framework
Total VRAM = Model Weights + Optimizer States + Gradients + Activations + Overhead
### 1. Model Weights
- FP32: params × 4 bytes
- FP16/BF16: params × 2 bytes
- INT8: params × 1 byte
- INT4: params × 0.5 bytes
### 2. Optimizer States (Training)
- Adam: model_weights × 2 (momentum + variance)
- AdamW: model_weights × 2
- SGD: model_weights × 1
### 3. Gradients (Training)
- Same precision as model weights
- Size = model_weights
### 4. Activations
- Depends on: batch_size, sequence_length, hidden_size, num_layers
- Approximate: batch_size × seq_len × hidden_size × num_layers × precision_bytes
### 5. Overhead
- Framework overhead: ~10-20% of total
- CUDA context: ~1-2GB
- Other buffers: ~5-10% of model size
## Calculation Modes
### Inference Mode
Total = Model Weights + Activations + Overhead
### Training Mode
Total = Model Weights + Optimizer States + Gradients + Activations + Overhead
### Fine-tuning Mode
- LoRA: Reduced optimizer states and gradients
- Full fine-tuning: Same as training mode
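## Worked Example
A rough end-to-end application of the formulas above (Llama 2 7B, FP16 inference, batch_size=1, seq_len=2048; numbers rounded):
- Model Weights: 7e9 params × 2 bytes ≈ 13 GB
- Activations: ≈ 1 GB
- Overhead: ~15% of the above ≈ 2 GB
- Total: ≈ 16 GB, which fits on a single 24 GB GPU such as the RTX 4090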
`
}]
};
default:
throw new Error(`Unknown resource: ${uri}`);
}
}
// Tool handlers
async listTools() {
return {
tools: [
{
name: 'calculate_vram',
description: 'Calculate VRAM requirements for model training or inference',
inputSchema: {
type: 'object',
properties: {
modelId: { type: 'string', description: 'Model identifier' },
mode: { type: 'string', enum: ['inference', 'training', 'finetuning'], description: 'Calculation mode' },
batchSize: { type: 'number', description: 'Batch size', default: 1 },
sequenceLength: { type: 'number', description: 'Sequence length', default: 2048 },
precision: { type: 'string', enum: ['fp32', 'fp16', 'bf16', 'int8', 'int4'], default: 'fp16' }
},
required: ['modelId', 'mode']
}
},
{
name: 'recommend_gpu',
description: 'Recommend suitable GPUs based on VRAM requirements',
inputSchema: {
type: 'object',
properties: {
vramRequired: { type: 'number', description: 'Required VRAM in GB' },
budget: { type: 'number', description: 'Budget limit in USD', default: 10000 },
useCase: { type: 'string', enum: ['inference', 'training', 'development'], default: 'training' },
multiGPU: { type: 'boolean', description: 'Allow multi-GPU recommendations', default: false }
},
required: ['vramRequired']
}
},
{
name: 'compare_models',
description: 'Compare VRAM requirements across multiple models',
inputSchema: {
type: 'object',
properties: {
modelIds: { type: 'array', items: { type: 'string' }, description: 'List of model IDs to compare' },
mode: { type: 'string', enum: ['inference', 'training', 'finetuning'], default: 'training' },
batchSize: { type: 'number', default: 1 },
sequenceLength: { type: 'number', default: 2048 },
precision: { type: 'string', enum: ['fp32', 'fp16', 'bf16', 'int8', 'int4'], default: 'fp16' }
},
required: ['modelIds']
}
}
]
};
}
async callTool(name, args) {
switch (name) {
case 'calculate_vram':
return this.calculateVRAM(args);
case 'recommend_gpu':
return this.recommendGPU(args);
case 'compare_models':
return this.compareModels(args);
default:
throw new Error(`Unknown tool: ${name}`);
}
}
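    /**
     * Estimate VRAM in GB for a model under the given mode, batch size,
     * sequence length, and precision. Hidden size and layer count come from
     * coarse heuristics, so treat results as ballpark figures.
     */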
async calculateVRAM(args) {
const { modelId, mode, batchSize = 1, sequenceLength = 2048, precision = 'fp16' } = args;
const model = this.models.find(m => m.id === modelId);
if (!model) {
throw new Error(`Model not found: ${modelId}`);
}
// Precision multipliers
const precisionBytes = {
fp32: 4, fp16: 2, bf16: 2, int8: 1, int4: 0.5
};
        const bytesPerParam = precisionBytes[precision] ?? 2; // Fall back to FP16 width if an unknown precision slips past schema validation
        const modelWeights = model.params * 1e9 * bytesPerParam / (1024 ** 3); // GB (binary: bytes / 1024^3)
let optimizer = 0;
let gradients = 0;
        if (mode === 'training') {
            optimizer = modelWeights * 2; // Adam/AdamW: momentum + variance states
            gradients = modelWeights; // Gradients stored at model precision
        }
        else if (mode === 'finetuning') {
            optimizer = modelWeights * 0.1; // LoRA approximation: optimizer states and gradients for adapter params only
            gradients = modelWeights * 0.1;
        }
        // Activations (simplified heuristic; real values are architecture-specific)
        const hiddenSize = Math.sqrt(model.params * 1e9 / 12); // Rough estimate
        const numLayers = Math.max(2, Math.log2(model.params) * 4); // Rough estimate; floor avoids negative layer counts for sub-1B models
const activations = (batchSize * sequenceLength * hiddenSize * numLayers * bytesPerParam) / (1024 ** 3);
const overhead = (modelWeights + optimizer + gradients + activations) * 0.15; // 15% overhead
        const totalVRAM = modelWeights + optimizer + gradients + activations + overhead;
        const result = {
            totalVRAM,
            breakdown: {
                modelWeights,
                optimizer,
                gradients,
                activations,
                overhead
            },
            recommendations: this.gpus.filter(gpu => gpu.vram >= totalVRAM).slice(0, 3),
            optimizations: [
                precision === 'fp32' ? 'Consider FP16/BF16 precision to halve weight memory' : '',
                batchSize > 1 ? 'Reduce batch size to lower activation memory' : '',
                mode === 'training' ? 'Consider LoRA fine-tuning instead of full training' : ''
            ].filter(Boolean)
};
return {
content: [{
type: 'text',
text: JSON.stringify(result, null, 2)
}]
};
}
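    /**
     * Recommend GPUs meeting a VRAM requirement within budget, sorted by
     * price per GB of VRAM. Example (hypothetical call):
     *   recommendGPU({ vramRequired: 20, budget: 2000 })
     *   // -> RTX 5090 and RTX 4090, cheapest VRAM first
     */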
async recommendGPU(args) {
const { vramRequired, budget = 10000, useCase = 'training', multiGPU = false } = args;
let suitableGPUs = this.gpus.filter(gpu => gpu.vram >= vramRequired && gpu.price <= budget);
if (suitableGPUs.length === 0 && multiGPU) {
            // Fall back to dual-GPU pairings (simplified: only 2x configurations are considered)
suitableGPUs = this.gpus.filter(gpu => gpu.vram * 2 >= vramRequired && gpu.price * 2 <= budget).map(gpu => ({
...gpu,
name: `2x ${gpu.name}`,
vram: gpu.vram * 2,
price: gpu.price * 2
}));
}
        // Sort by price per GB of VRAM, cheapest first
suitableGPUs.sort((a, b) => (a.price / a.vram) - (b.price / b.vram));
const result = {
vramRequired,
budget,
useCase,
recommendations: suitableGPUs.slice(0, 5).map(gpu => ({
...gpu,
utilization: Math.min(100, (vramRequired / gpu.vram) * 100),
pricePerGB: gpu.price / gpu.vram
})),
summary: `Found ${suitableGPUs.length} GPU options within budget for ${vramRequired}GB VRAM requirement`
};
return {
content: [{
type: 'text',
text: JSON.stringify(result, null, 2)
}]
};
}
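    /**
     * Run calculateVRAM for each requested model and return the results
     * sorted by total VRAM, ascending; models that fail to resolve are
     * reported with their error message instead.
     */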
async compareModels(args) {
const { modelIds, mode = 'training', batchSize = 1, sequenceLength = 2048, precision = 'fp16' } = args;
const comparisons = [];
for (const modelId of modelIds) {
try {
const result = await this.calculateVRAM({ modelId, mode, batchSize, sequenceLength, precision });
const calculation = JSON.parse(result.content[0].text);
comparisons.push({
modelId,
modelName: this.models.find(m => m.id === modelId)?.name || modelId,
totalVRAM: calculation.totalVRAM,
breakdown: calculation.breakdown
});
}
catch (error) {
comparisons.push({
modelId,
error: error.message
});
}
}
        // Sort by total VRAM, ascending (entries that errored sort first via the 0 fallback)
comparisons.sort((a, b) => (a.totalVRAM || 0) - (b.totalVRAM || 0));
const result = {
mode,
configuration: { batchSize, sequenceLength, precision },
comparisons,
summary: `Compared ${modelIds.length} models in ${mode} mode`
};
return {
content: [{
type: 'text',
text: JSON.stringify(result, null, 2)
}]
};
}
}
exports.VRAMCalculatorServer = VRAMCalculatorServer;
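// Usage sketch (illustrative assumption, not part of the MCP wiring): the
// guard lets this compiled module be run directly with Node for a quick
// check, while staying inert when imported by an MCP host.
if (require.main === module) {
    const server = new VRAMCalculatorServer();
    server.callTool('calculate_vram', { modelId: 'llama-2-7b', mode: 'inference' })
        .then(result => console.log(result.content[0].text))
        .catch(err => console.error(err.message));
}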
//# sourceMappingURL=server.js.map