UNPKG

greed.js

Version:

Lightweight, private alternative to Colab. Run PyTorch & NumPy in browser with GPU acceleration (8.8x speedup). Fast, secure, runs locally.

881 lines (733 loc) 27.4 kB
/**
 * WebGPU Compute Engine - High-performance tensor operations with GPU acceleration
 * Refactored from monolithic implementation for better modularity and performance
 */
import EventEmitter from '../../core/event-emitter.js';
import BufferManager from './buffer-manager.js';
import PipelineCache from './pipeline-cache.js';
import logger from '../../utils/logger.js';

class WebGPUComputeEngine extends EventEmitter {
  /**
   * @param {object} [options] - configuration overrides; spread into this.config,
   *   so any extra keys are preserved alongside the documented defaults below.
   */
  constructor(options = {}) {
    super();

    this.config = {
      powerPreference: options.powerPreference || 'high-performance',
      enableProfiling: options.enableProfiling !== false,
      maxBufferSize: options.maxBufferSize || 256 * 1024 * 1024, // 256MB
      workgroupSize: options.workgroupSize || [64, 1, 1],
      enableValidation: options.enableValidation !== false,
      ...options
    };

    // Core WebGPU resources
    this.adapter = null;
    this.device = null;
    this.isInitialized = false;

    // Modular components
    this.bufferManager = null;
    this.pipelineCache = null;

    // Feature support
    this.supportedFeatures = new Set();
    this.limits = null;

    // Performance tracking
    this.stats = {
      computeOperations: 0,
      totalExecutionTime: 0,
      averageExecutionTime: 0,
      memoryUsage: 0,
      lastOperationTime: 0
    };
  }

  /**
   * Initialize WebGPU device and components.
   * Idempotent: returns true immediately if already initialized.
   * @returns {Promise<boolean>} true on success, false on failure
   *   (failure reason is recorded in this.initFailureReason).
   */
  async initialize() {
    if (this.isInitialized) {
      return true;
    }

    try {
      this.emit('init:start');

      // Check WebGPU support
      if (!navigator.gpu) {
        throw new Error('WebGPU not supported in this browser');
      }

      // Request adapter
      this.adapter = await navigator.gpu.requestAdapter({
        powerPreference: this.config.powerPreference
      });

      if (!this.adapter) {
        throw new Error('Failed to get WebGPU adapter');
      }

      // Get device features and limits
      this.supportedFeatures = this.adapter.features;
      this.limits = this.adapter.limits;

      this.emit('init:adapter', {
        features: Array.from(this.supportedFeatures),
        limits: this.limits
      });

      // Request device with required features
      const deviceDescriptor = {
        requiredFeatures: [],
        requiredLimits: {}
      };

      // Add optional features if supported
      if (this.supportedFeatures.has('timestamp-query')) {
        deviceDescriptor.requiredFeatures.push('timestamp-query');
      }

      this.device = await this.adapter.requestDevice(deviceDescriptor);

      // Set up comprehensive error handling
      this.device.addEventListener('uncapturederror', (event) => {
        const error = event.error;
        this.emit('device:error', { error, type: 'uncaptured', timestamp: Date.now() });
        logger.error('WebGPU uncaptured error:', {
          type: error.constructor.name,
          message: error.message,
          stack: error.stack
        });
        // Attempt recovery for recoverable errors
        this._handleDeviceError(error);
      });

      // Initialize modular components
      this.bufferManager = new BufferManager(this.device, {
        maxBufferSize: this.config.maxBufferSize,
        enablePooling: true,
        maxPoolSize: 100
      });

      this.pipelineCache = new PipelineCache(this.device, {
        maxCacheSize: 50,
        enableWarmup: true,
        shaderOptimization: 'balanced'
      });

      // Set up event forwarding
      this._setupEventForwarding();

      // Warmup common operations
      await this.pipelineCache.warmup();

      this.isInitialized = true;
      this.emit('init:complete', {
        device: this.device,
        features: Array.from(this.supportedFeatures)
      });

      return true;
    } catch (error) {
      this.emit('init:error', { error, timestamp: Date.now() });
      logger.error('WebGPU initialization failed:', {
        type: error.constructor.name,
        message: error.message,
        stack: error.stack,
        config: this.config
      });

      // Set failure state for debugging
      this.isInitialized = false;
      this.initFailureReason = error.message;
      return false;
    }
  }

  /**
   * Execute tensor operation on GPU.
   * @param {string} operation - operation name (e.g. 'matmul', 'add')
   * @param {TypedArray|ArrayBuffer|Array} tensors - single tensor or array of tensors
   * @param {object} [options] - workgroupSize, dataType, outputCount, etc.
   * @returns {Promise<TypedArray>} operation result read back from the GPU
   * @throws if the engine is not initialized or the GPU operation fails
   */
  async execute(operation, tensors, options = {}) {
    if (!this.isInitialized) {
      throw new Error('WebGPU compute engine not initialized');
    }

    const startTime = performance.now();
    this.emit('compute:start', { operation, options });

    try {
      // Validate inputs
      this._validateOperation(operation, tensors, options);

      // Get optimal workgroup size for this operation
      const tensorArray = Array.isArray(tensors) ? tensors : [tensors];
      const optimalWorkgroupSize = this.pipelineCache.getOptimalWorkgroupSize(
        operation,
        tensorArray[0].shape || [tensorArray[0].length],
        this.limits
      );

      // Get compute pipeline
      const pipeline = await this.pipelineCache.get(operation, {
        workgroupSize: options.workgroupSize || optimalWorkgroupSize,
        dataType: options.dataType || 'f32',
        inputCount: tensorArray.length,
        outputCount: options.outputCount || 1,
        ...options
      });

      // Prepare buffers
      const buffers = await this._prepareBuffers(tensors, operation, options);

      // Create bind group
      const bindGroup = this._createBindGroup(pipeline, buffers, options);

      // Pass operation to compute pass for optimizations
      const result = await this._executeComputePass(pipeline, bindGroup, buffers, { ...options, operation });

      // Update statistics
      const executionTime = performance.now() - startTime;
      this._updateStats(operation, executionTime, buffers);

      // Clean up input buffers asynchronously if they can be reused
      this._cleanupBuffersAsync(buffers, options);

      this.emit('compute:complete', { operation, executionTime, resultSize: result.length });
      return result;
    } catch (error) {
      const executionTime = performance.now() - startTime;

      // Enhanced error handling with context
      const errorContext = {
        operation,
        error: {
          type: error.constructor.name,
          message: error.message,
          stack: error.stack
        },
        executionTime,
        tensors: Array.isArray(tensors) ? tensors.length : 1,
        options,
        deviceStable: this.deviceStable ?? true,
        timestamp: Date.now()
      };

      this.emit('compute:error', errorContext);
      logger.error('WebGPU compute operation failed:', errorContext);

      // Attempt recovery for specific error types
      if (error.message.includes('out of memory') || error.constructor.name === 'GPUOutOfMemoryError') {
        logger.warn('GPU memory exhausted, attempting emergency cleanup');
        await this.bufferManager.emergencyCleanup();
        this.emit('recovery:memory', { operation, timestamp: Date.now() });
      }

      throw error;
    }
  }

  /**
   * Execute batch of operations efficiently.
   * @param {Array<{operation: string, tensors: *, options: object}>} operations
   * @param {object} [options] - parallel (default false), maxConcurrency (default 4)
   * @returns {Promise<Array>} results in input order
   */
  async executeBatch(operations, options = {}) {
    const { parallel = false, maxConcurrency = 4 } = options;

    if (parallel) {
      // Execute operations in parallel with concurrency limit
      const semaphore = new Semaphore(maxConcurrency);
      const promises = operations.map(async (op) => {
        await semaphore.acquire();
        try {
          return await this.execute(op.operation, op.tensors, op.options);
        } finally {
          semaphore.release();
        }
      });
      return Promise.all(promises);
    } else {
      // Execute operations sequentially
      const results = [];
      for (const op of operations) {
        const result = await this.execute(op.operation, op.tensors, op.options);
        results.push(result);
      }
      return results;
    }
  }

  /**
   * Copy tensor data to GPU buffer (optimized).
   * Attempts to reuse a pooled buffer of matching size before allocating.
   * @returns {Promise<GPUBuffer>}
   */
  async uploadTensor(data, options = {}) {
    const {
      usage = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
      forceNew = false
    } = options;

    // Check if we can reuse a buffer of the same size
    if (!forceNew) {
      const reusedBuffer = this.bufferManager.findReusableBuffer(data.byteLength || data.length * 4, usage);
      if (reusedBuffer) {
        // Reuse existing buffer - just write new data
        this.device.queue.writeBuffer(reusedBuffer, 0, data);
        return reusedBuffer;
      }
    }

    // Create new buffer only if needed
    const result = await this.bufferManager.createMappedBuffer(data, usage);
    return result;
  }

  /**
   * Download tensor data from GPU buffer.
   * @param {GPUBuffer} buffer - source buffer
   * @param {number} size - element count (assumed 32-bit elements)
   * @param {object} [options] - format: TypedArray constructor (default Float32Array)
   * @returns {Promise<TypedArray>}
   */
  async downloadTensor(buffer, size, options = {}) {
    const { format = Float32Array } = options;

    // Create staging buffer for readback
    const stagingBuffer = this.bufferManager.allocate(
      size * 4, // Assuming 32-bit floats
      GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ
    );

    try {
      // Copy GPU buffer to staging buffer
      const encoder = this.device.createCommandEncoder();
      encoder.copyBufferToBuffer(buffer, 0, stagingBuffer, 0, size * 4);
      this.device.queue.submit([encoder.finish()]);

      // Map and read data
      await stagingBuffer.mapAsync(GPUMapMode.READ);
      const mappedRange = stagingBuffer.getMappedRange();
      const result = new format(mappedRange.slice());
      stagingBuffer.unmap();

      return result;
    } finally {
      this.bufferManager.release(stagingBuffer, { forceDestroy: true });
    }
  }

  /**
   * Get engine statistics (compute, buffer, pipeline, device capability info).
   */
  getStats() {
    return {
      ...this.stats,
      bufferStats: this.bufferManager?.getStats() || {},
      pipelineStats: this.pipelineCache?.getStats() || {},
      deviceLimits: this.limits,
      supportedFeatures: Array.from(this.supportedFeatures || [])
    };
  }

  /**
   * Cleanup all resources (buffers, pipelines, device).
   * Engine must be re-initialized before further use.
   */
  async cleanup() {
    this.emit('cleanup:start');

    try {
      if (this.bufferManager) {
        await this.bufferManager.cleanup();
        this.bufferManager = null;
      }

      if (this.pipelineCache) {
        this.pipelineCache.cleanup();
        this.pipelineCache = null;
      }

      if (this.device) {
        this.device.destroy();
        this.device = null;
      }

      this.adapter = null;
      this.isInitialized = false;

      this.emit('cleanup:complete');
    } catch (error) {
      this.emit('cleanup:error', { error });
      throw error;
    }
  }

  // Private methods

  // Validate operation name and tensor inputs; throws with diagnostics on failure.
  _validateOperation(operation, tensors, options) {
    if (!operation || typeof operation !== 'string') {
      throw new Error('Operation must be a non-empty string');
    }

    if (!tensors) {
      throw new Error('Tensors parameter is required');
    }

    const tensorArray = Array.isArray(tensors) ? tensors : [tensors];

    // Enhanced debugging for tensor validation
    for (const tensor of tensorArray) {
      if (!tensor || (!ArrayBuffer.isView(tensor) && !(tensor instanceof ArrayBuffer))) {
        // Enhanced error message with debugging info
        const debugInfo = {
          tensorType: typeof tensor,
          constructor: tensor?.constructor?.name,
          isArrayBufferView: ArrayBuffer.isView(tensor),
          isArrayBuffer: tensor instanceof ArrayBuffer,
          hasData: tensor && typeof tensor.data !== 'undefined',
          dataType: tensor?.data ? typeof tensor.data : 'undefined'
        };
        throw new Error(`Invalid tensor data type. Expected typed array or ArrayBuffer, got: ${JSON.stringify(debugInfo)}`);
      }
    }
  }

  // True when the greedDebugWebGPU flag is set on window/global.
  _isDebugEnabled() {
    // Check for WebGPU debug flag in global scope
    try {
      return (typeof window !== 'undefined' && window.greedDebugWebGPU) ||
        (typeof global !== 'undefined' && global.greedDebugWebGPU);
    } catch {
      return false;
    }
  }

  // Upload inputs, allocate the output buffer, and build the uniform parameter buffer.
  async _prepareBuffers(tensors, operation, options) {
    const tensorArray = Array.isArray(tensors) ? tensors : [tensors];
    const buffers = { inputs: [], output: null, params: null };

    // Upload input tensors
    for (let i = 0; i < tensorArray.length; i++) {
      const buffer = await this.uploadTensor(tensorArray[i]);
      buffers.inputs.push(buffer);
    }

    // Create output buffer
    const outputSize = this._calculateOutputSize(operation, tensorArray, options);
    buffers.output = this.bufferManager.allocate(
      outputSize * 4,
      GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC
    );

    // Create parameter buffer using WebGPU shaders helper
    const params = this.pipelineCache.generateOperationParams(operation, tensorArray, options);
    buffers.params = await this.uploadTensor(params, {
      usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST
    });

    return buffers;
  }

  // Bind layout convention: inputs at 0..n-1, output at n, params at n+1.
  _createBindGroup(pipeline, buffers, options) {
    const entries = [];

    // Add input buffers
    for (let i = 0; i < buffers.inputs.length; i++) {
      entries.push({ binding: i, resource: { buffer: buffers.inputs[i] } });
    }

    // Add output buffer
    entries.push({ binding: buffers.inputs.length, resource: { buffer: buffers.output } });

    // Add parameter buffer
    entries.push({ binding: buffers.inputs.length + 1, resource: { buffer: buffers.params } });

    return this.device.createBindGroup({
      layout: pipeline.getBindGroupLayout(0),
      entries
    });
  }

  // Encode, dispatch, submit, wait, and read back the result for one compute pass.
  async _executeComputePass(pipeline, bindGroup, buffers, options) {
    const encoder = this.device.createCommandEncoder();
    const computePass = encoder.beginComputePass();

    computePass.setPipeline(pipeline);
    computePass.setBindGroup(0, bindGroup);

    // Calculate optimal dispatch size
    const workgroupSize = options.workgroupSize || this.config.workgroupSize;
    const outputSize = buffers.output.size / 4; // Assuming 32-bit floats

    // Optimize dispatch size based on operation type
    let dispatchX, dispatchY = 1, dispatchZ = 1;

    if (options.operation === 'matmul' || options.operation === 'bmm') {
      // 2D dispatch for matrix operations
      const dims = this._getMatrixDimensions(options);
      dispatchX = Math.ceil(dims.M / workgroupSize[0]);
      dispatchY = Math.ceil(dims.N / workgroupSize[1]);
      if (options.operation === 'bmm') {
        dispatchZ = dims.B;
      }
    } else {
      // 1D dispatch for element-wise operations
      dispatchX = Math.ceil(outputSize / workgroupSize[0]);
    }

    computePass.dispatchWorkgroups(dispatchX, dispatchY, dispatchZ);
    computePass.end();

    // Submit and wait with timeout protection
    const commandBuffer = encoder.finish();
    this.device.queue.submit([commandBuffer]);

    // Use timeout wrapper to prevent hanging
    await this._waitForGPUCompletion(5000); // 5 second timeout

    // Download result with optimized staging buffer
    return this._downloadTensorOptimized(buffers.output, outputSize, options);
  }

  // Compute output element count per operation; shape metadata is used when
  // present, otherwise falls back to square-matrix / MNIST-like assumptions.
  _calculateOutputSize(operation, tensors, options) {
    if (options.outputSize) {
      return options.outputSize;
    }

    const firstTensor = tensors[0];
    const getElementCount = (tensor) => {
      return ArrayBuffer.isView(tensor) ? tensor.length : tensor.byteLength / 4;
    };

    // Operation-specific output size calculation
    switch (operation) {
      case 'matmul': {
        // A(M,K) × B(K,N) = C(M,N)
        const M = tensors[0].shape?.[0] || Math.sqrt(getElementCount(tensors[0]));
        const N = tensors[1].shape?.[1] || Math.sqrt(getElementCount(tensors[1]));
        return M * N;
      }
      case 'bmm': {
        // Batch matrix multiply: A(B,M,K) × B(B,K,N) = C(B,M,N)
        const B = tensors[0].shape?.[0] || 1;
        const bM = tensors[0].shape?.[1] || Math.sqrt(getElementCount(tensors[0]) / B);
        const bN = tensors[1].shape?.[2] || Math.sqrt(getElementCount(tensors[1]) / B);
        return B * bM * bN;
      }
      case 'conv2d': {
        // Simplified - assumes same padding and stride=1
        const inHeight = tensors[0].shape?.[2] || 28;
        const inWidth = tensors[0].shape?.[3] || 28;
        const outChannels = tensors[1].shape?.[0] || 32;
        const batchSize = tensors[0].shape?.[0] || 1;
        return batchSize * outChannels * inHeight * inWidth;
      }
      case 'transpose':
        return getElementCount(firstTensor);
      case 'sum':
      case 'mean':
        // Reduction operations output a single value per batch/dimension
        return options.keepDim ? getElementCount(firstTensor) : 1;
      case 'softmax':
        return getElementCount(firstTensor);
      case 'maxpool2d':
      case 'avgpool2d': {
        // Simplified pooling calculation
        const poolKernel = options.kernelSize || 2;
        const poolStride = options.stride || poolKernel;
        const poolInH = tensors[0].shape?.[2] || 28;
        const poolInW = tensors[0].shape?.[3] || 28;
        const poolOutH = Math.floor((poolInH - poolKernel) / poolStride) + 1;
        const poolOutW = Math.floor((poolInW - poolKernel) / poolStride) + 1;
        const poolChannels = tensors[0].shape?.[1] || 1;
        const poolBatch = tensors[0].shape?.[0] || 1;
        return poolBatch * poolChannels * poolOutH * poolOutW;
      }
      default:
        // Element-wise operations preserve input size
        return getElementCount(firstTensor);
    }
  }

  // Roll a completed operation into running statistics; memoryUsage is a high-water mark.
  _updateStats(operation, executionTime, buffers) {
    this.stats.computeOperations++;
    this.stats.totalExecutionTime += executionTime;
    this.stats.averageExecutionTime = this.stats.totalExecutionTime / this.stats.computeOperations;
    this.stats.lastOperationTime = executionTime;

    const bufferMemory = buffers.inputs.reduce((sum, buf) => sum + buf.size, 0) + buffers.output.size;
    this.stats.memoryUsage = Math.max(this.stats.memoryUsage, bufferMemory);
  }

  _getMatrixDimensions(options) {
    // Extract matrix dimensions from options or tensors
    return {
      M: options.M || Math.sqrt(options.inputSize || 256),
      N: options.N || Math.sqrt(options.inputSize || 256),
      K: options.K || Math.sqrt(options.inputSize || 256),
      B: options.B || 1
    };
  }

  // Like downloadTensor but reuses pooled staging buffers and guards mapAsync with a timeout.
  async _downloadTensorOptimized(buffer, size, options = {}) {
    const { format = Float32Array, usePooledBuffer = true } = options;

    let stagingBuffer;
    if (usePooledBuffer) {
      // Try to reuse staging buffer from pool
      stagingBuffer = this.bufferManager.findReusableBuffer(
        size * 4,
        GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ
      );
    }

    if (!stagingBuffer) {
      // Create new staging buffer
      stagingBuffer = this.bufferManager.allocate(
        size * 4,
        GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ
      );
    }

    try {
      // Copy GPU buffer to staging buffer
      const encoder = this.device.createCommandEncoder();
      encoder.copyBufferToBuffer(buffer, 0, stagingBuffer, 0, size * 4);
      this.device.queue.submit([encoder.finish()]);

      // Wait for copy to complete with timeout
      await this._waitForGPUCompletion(3000); // 3 second timeout for data copy

      // Map and read data with timeout protection
      const mapPromise = stagingBuffer.mapAsync(GPUMapMode.READ);
      const timeoutPromise = new Promise((_, reject) => {
        setTimeout(() => reject(new Error('Buffer mapping timeout')), 2000);
      });

      await Promise.race([mapPromise, timeoutPromise]);

      const mappedRange = stagingBuffer.getMappedRange();
      const result = new format(mappedRange.slice());
      stagingBuffer.unmap();

      return result;
    } finally {
      // Return staging buffer to pool instead of destroying
      if (usePooledBuffer) {
        this.bufferManager.returnToPool(stagingBuffer);
      } else {
        this.bufferManager.release(stagingBuffer, { forceDestroy: true });
      }
    }
  }

  /**
   * Wait for GPU completion with timeout protection.
   * WORKAROUND: onSubmittedWorkDone() is known to hang in some browsers, so
   * we race it against a timer that RESOLVES (never rejects) after timeoutMs,
   * keeping the original best-effort "assume completion" semantics while
   * actually waiting for real completion when the browser supports it.
   * @param {number} [timeoutMs=5000] - maximum time to wait before assuming completion
   */
  async _waitForGPUCompletion(timeoutMs = 5000) {
    let timer;
    const timeout = new Promise((resolve) => {
      timer = setTimeout(resolve, timeoutMs);
    });
    try {
      await Promise.race([
        // Swallow rejection (e.g. device loss) to preserve best-effort behavior
        this.device.queue.onSubmittedWorkDone().catch(() => {}),
        timeout
      ]);
    } finally {
      clearTimeout(timer);
    }
  }

  // Async buffer cleanup to avoid blocking the main execution path.
  // Output buffer is owned by the caller and intentionally not touched here.
  _cleanupBuffersAsync(buffers, options) {
    // Use setImmediate or setTimeout to cleanup asynchronously
    const cleanupFn = () => {
      try {
        // Return input buffers to pool if they can be reused
        for (const inputBuffer of buffers.inputs) {
          if (options.reuseInputBuffers !== false) {
            this.bufferManager.returnToPool(inputBuffer);
          } else {
            this.bufferManager.release(inputBuffer, { forceDestroy: false });
          }
        }

        // Return parameter buffer to pool
        if (buffers.params && options.reuseParamBuffers !== false) {
          this.bufferManager.returnToPool(buffers.params);
        } else if (buffers.params) {
          this.bufferManager.release(buffers.params, { forceDestroy: false });
        }

        // Output buffer is returned by caller, so don't cleanup here
      } catch (error) {
        this.emit('cleanup:error', { error, buffers });
      }
    };

    // Use setTimeout for compatibility across all environments
    setTimeout(cleanupFn, 0);
  }

  /**
   * Batch operations with pipeline and buffer optimization.
   * Groups operations by type so compiled pipelines and pooled buffers are reused.
   * @param {Array<{operation: string, tensors: *, options: object}>} operations
   * @param {object} [options] - parallel, maxConcurrency, reuseBuffers, shareComputePass
   */
  async executeBatchOptimized(operations, options = {}) {
    const {
      parallel = false,
      maxConcurrency = 4,
      reuseBuffers = true,
      shareComputePass = false
    } = options;

    // Group operations by type for pipeline reuse
    const operationGroups = this._groupOperationsByType(operations);
    const results = [];

    for (const [operationType, ops] of operationGroups.entries()) {
      if (shareComputePass && this._canShareComputePass(operationType)) {
        // Execute multiple operations in a single compute pass
        const batchResult = await this._executeBatchedComputePass(ops, options);
        results.push(...batchResult);
      } else if (parallel) {
        // Execute operations in parallel with concurrency limit
        const semaphore = new Semaphore(maxConcurrency);
        const promises = ops.map(async (op) => {
          await semaphore.acquire();
          try {
            return await this.execute(op.operation, op.tensors, {
              ...op.options,
              reuseInputBuffers: reuseBuffers,
              reuseParamBuffers: reuseBuffers
            });
          } finally {
            semaphore.release();
          }
        });
        const parallelResults = await Promise.all(promises);
        results.push(...parallelResults);
      } else {
        // Execute operations sequentially with buffer reuse
        for (const op of ops) {
          const result = await this.execute(op.operation, op.tensors, {
            ...op.options,
            reuseInputBuffers: reuseBuffers,
            reuseParamBuffers: reuseBuffers
          });
          results.push(result);
        }
      }
    }

    return results;
  }

  // Map operation name -> list of ops, preserving input order within each group.
  _groupOperationsByType(operations) {
    const groups = new Map();
    for (const op of operations) {
      const type = op.operation;
      if (!groups.has(type)) {
        groups.set(type, []);
      }
      groups.get(type).push(op);
    }
    return groups;
  }

  _canShareComputePass(operationType) {
    // Element-wise operations can be batched together
    const batchableOps = ['add', 'sub', 'mul', 'div', 'relu', 'sigmoid', 'tanh'];
    return batchableOps.includes(operationType);
  }

  async _executeBatchedComputePass(operations, options) {
    // This is a more advanced optimization that would require
    // significant shader modifications to support multiple operations
    // in a single compute pass. For now, fall back to sequential execution
    // but with optimized buffer management.
    const results = [];
    for (const op of operations) {
      const result = await this.execute(op.operation, op.tensors, {
        ...op.options,
        ...options
      });
      results.push(result);
    }
    return results;
  }

  // Re-emit component events under engine-level names so consumers need only one listener.
  _setupEventForwarding() {
    // Forward buffer manager events
    this.bufferManager.on('buffer:created', (data) => this.emit('buffer:created', data));
    this.bufferManager.on('buffer:destroyed', (data) => this.emit('buffer:destroyed', data));
    this.bufferManager.on('gc:complete', (data) => this.emit('buffer:gc', data));

    // Forward pipeline cache events
    this.pipelineCache.on('cache:miss', (data) => this.emit('pipeline:miss', data));
    this.pipelineCache.on('pipeline:compiled', (data) => this.emit('pipeline:compiled', data));
    this.pipelineCache.on('warmup:complete', (data) => this.emit('pipeline:warmup', data));
  }

  /**
   * Handle device errors with recovery attempts.
   * Dispatches on the error's constructor name (GPUOutOfMemoryError, etc.).
   */
  _handleDeviceError(error) {
    const errorType = error.constructor.name;

    switch (errorType) {
      case 'GPUOutOfMemoryError':
        logger.warn('GPU out of memory, attempting buffer cleanup');
        this.bufferManager.emergencyCleanup();
        this.emit('recovery:attempt', { type: 'memory-cleanup', timestamp: Date.now() });
        break;
      case 'GPUInternalError':
        logger.warn('GPU internal error, marking device as potentially unstable');
        this.deviceStable = false;
        this.emit('device:unstable', { reason: 'internal-error', timestamp: Date.now() });
        break;
      case 'GPUValidationError':
        logger.warn('GPU validation error, this may indicate shader or pipeline issues');
        this.emit('validation:error', { error, timestamp: Date.now() });
        break;
      default:
        logger.warn('Unknown GPU error type:', errorType);
        this.emit('error:unknown', { error, timestamp: Date.now() });
    }
  }

  /**
   * Get comprehensive error diagnostics.
   */
  getErrorDiagnostics() {
    return {
      isInitialized: this.isInitialized,
      deviceStable: this.deviceStable ?? true,
      initFailureReason: this.initFailureReason || null,
      bufferStats: this.bufferManager?.getStats() || null,
      pipelineStats: this.pipelineCache?.getStats() || null,
      supportedFeatures: Array.from(this.supportedFeatures || []),
      timestamp: Date.now()
    };
  }
}

// Simple semaphore for concurrency control
class Semaphore {
  constructor(max) {
    this.max = max;
    this.current = 0;
    this.queue = [];
  }

  // Resolves immediately if a slot is free, otherwise queues the waiter (FIFO).
  async acquire() {
    return new Promise((resolve) => {
      if (this.current < this.max) {
        this.current++;
        resolve();
      } else {
        this.queue.push(resolve);
      }
    });
  }

  // Frees a slot; hands it directly to the next queued waiter if any.
  release() {
    this.current--;
    if (this.queue.length > 0) {
      const resolve = this.queue.shift();
      this.current++;
      resolve();
    }
  }
}

export default WebGPUComputeEngine;