greed.js
Version:
Run Python libraries in the browser with WebGPU acceleration - PyTorch, NumPy, and more. Modular architecture with full backward compatibility.
1 lines • 89.5 kB
JavaScript
"use strict";(this.webpackChunkGreed=this.webpackChunkGreed||[]).push([[694],{228:(e,t,r)=>{r.d(t,{nn:()=>s});var i=r(847);class a{constructor(e){this.computeEngine=e,this.tensorRegistry=new Map,this.nextTensorId=0}createWebGPUTensor(e,t,r="float32",a="webgpu"){const n=new i.O(e,{shape:t,dtype:r,device:a,computeEngine:this.computeEngine}),s="webgpu_tensor_"+this.nextTensorId++;return this.tensorRegistry.set(s,n),{id:s,tensor:n,shape:n.shape,dtype:n.dtype,device:n.device}}getTensor(e){return this.tensorRegistry.get(e)}async executeOperation(e,t,r=null,i={}){const a=this.tensorRegistry.get(e);if(!a)throw new Error(`Tensor ${e} not found`);let n=null;if(r&&(n=this.tensorRegistry.get(r),!n))throw new Error(`Tensor ${r} not found`);try{let e;switch(t){case"add":e=await a.add(n);break;case"sub":e=await a.sub(n);break;case"mul":e=await a.mul(n);break;case"div":e=await a.div(n);break;case"matmul":e=await a.matmul(n);break;case"relu":e=await a.relu();break;case"sigmoid":e=await a.sigmoid();break;case"tanh":e=await a.tanh();break;case"softmax":e=await a.softmax(i.dim);break;case"sum":e=await a.sum(i.dim,i.keepdim);break;case"mean":e=await a.mean(i.dim,i.keepdim);break;case"transpose":e=await a.transpose(i.dim0,i.dim1);break;default:throw new Error(`Unsupported operation: ${t}`)}return{success:!0,result:this.createWebGPUTensor(e.data,e.shape,e.dtype,e.device),data:Array.from(e.data),shape:e.shape,dtype:e.dtype}}catch(e){return{success:!1,error:e.message}}}tensorToArray(e){const t=this.tensorRegistry.get(e);if(!t)throw new Error(`Tensor ${e} not found`);return{data:Array.from(t.data),shape:t.shape,dtype:t.dtype}}releaseTensor(e){return this.tensorRegistry.delete(e)}getStats(){return{tensorCount:this.tensorRegistry.size,totalMemory:Array.from(this.tensorRegistry.values()).reduce((e,t)=>e+4*t.size,0),deviceDistribution:this._getDeviceDistribution()}}cleanup(){this.tensorRegistry.clear(),this.nextTensorId=0}_getDeviceDistribution(){const e={};for(const t of this.tensorRegistry.values())e[t.device]=(e[t.device]||0)+1;return e}}let n=null;function s(e){return n=new a(e),"undefined"!=typeof window?window.greedTensorBridge=n:void 0!==r.g&&(r.g.greedTensorBridge=n),n}},493:(e,t,r)=>{r.d(t,{A:()=>x});var i=r(123);class a extends i.A{constructor(e,t={}){super(),this.device=e,this.config={maxPoolSize:t.maxPoolSize||100,maxBufferSize:t.maxBufferSize||268435456,gcThreshold:t.gcThreshold||.8,enablePooling:!1!==t.enablePooling,...t},this.pools=new Map,this.activeBuffers=new Map,this.totalMemoryUsage=0,this.peakMemoryUsage=0,this.stats={allocations:0,poolHits:0,poolMisses:0,releases:0,destroyed:0,currentActive:0,totalPooled:0}}allocate(e,t=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC|GPUBufferUsage.COPY_DST){this._validateAllocation(e,t);const r=this._getPoolKey(e,t);let i=null;this.config.enablePooling&&(i=this._getFromPool(r),i&&(this.stats.poolHits++,this.emit("buffer:reused",{size:e,usage:t,poolKey:r}))),i||(i=this.device.createBuffer({size:e,usage:t}),this.stats.poolMisses++,this.emit("buffer:created",{size:e,usage:t,poolKey:r}));const a={size:e,usage:t,poolKey:r,allocatedAt:performance.now(),lastAccessed:performance.now()};return this.activeBuffers.set(i,a),this.totalMemoryUsage+=e,this.peakMemoryUsage=Math.max(this.peakMemoryUsage,this.totalMemoryUsage),this.stats.allocations++,this.stats.currentActive=this.activeBuffers.size,this.emit("buffer:allocated",{buffer:i,metadata:a}),this._checkMemoryPressure(),i}release(e,t={}){const{forceDestroy:r=!1}=t,i=this.activeBuffers.get(e);return i?(this.activeBuffers.delete(e),this.totalMemoryUsage-=i.size,this.stats.releases++,this.stats.currentActive=this.activeBuffers.size,r||!this.config.enablePooling||this._shouldDestroyBuffer(e,i)?(this._destroyBuffer(e,i),!0):(this._addToPool(e,i)?this.emit("buffer:pooled",{buffer:e,poolKey:i.poolKey}):this._destroyBuffer(e,i),!0)):(this.emit("buffer:release-error",{error:"Buffer not found in active buffers"}),!1)}releaseAll(e,t={}){const r=[];for(const i of e)r.push(this.release(i,t));return r}async createMappedBuffer(e,t=GPUBufferUsage.COPY_SRC){const r=this._calculateBufferSize(e),i=this.allocate(r,t|GPUBufferUsage.MAP_WRITE);try{await i.mapAsync(GPUMapMode.WRITE);const t=i.getMappedRange();if(e instanceof ArrayBuffer)new Uint8Array(t).set(new Uint8Array(e));else{if(!ArrayBuffer.isView(e))throw new Error("Unsupported data type for mapped buffer");new Uint8Array(t).set(new Uint8Array(e.buffer,e.byteOffset,e.byteLength))}return i.unmap(),this.emit("buffer:mapped",{buffer:i,size:r,dataType:e.constructor.name}),i}catch(e){throw this.release(i,{forceDestroy:!0}),e}}copyBuffer(e,t,r,i={}){const{sourceOffset:a=0,destinationOffset:n=0,commandEncoder:s=null}=i;if(!this.activeBuffers.has(e)||!this.activeBuffers.has(t))throw new Error("Source or destination buffer not managed by this BufferManager");const o=s||this.device.createCommandEncoder();if(o.copyBufferToBuffer(e,a,t,n,r),!s){const e=o.finish();this.device.queue.submit([e])}this.emit("buffer:copied",{source:e,destination:t,size:r})}getStats(){return{...this.stats,totalMemoryUsageMB:Math.round(this.totalMemoryUsage/1048576*100)/100,peakMemoryUsageMB:Math.round(this.peakMemoryUsage/1048576*100)/100,poolCount:this.pools.size,totalPooled:Array.from(this.pools.values()).reduce((e,t)=>e+t.length,0),poolEfficiency:this.stats.allocations>0?this.stats.poolHits/this.stats.allocations:0}}async gc(e={}){const{aggressive:t=!1,maxAge:r=6e4,targetReduction:i=.5}=e;this.emit("gc:start",{aggressive:t,maxAge:r,targetReduction:i});let a=0;const n=performance.now(),s=this._getTotalPooledBuffers();for(const[e,o]of this.pools.entries()){const u=o.slice();for(let e=u.length-1;e>=0;e--){const p=u[e];if((t||p._pooledAt&&n-p._pooledAt>r)&&(o.splice(e,1),p.destroy(),a++,this.stats.destroyed++),a/s>=i)break}0===o.length&&this.pools.delete(e)}return this.emit("gc:complete",{destroyed:a,remaining:this._getTotalPooledBuffers()}),a}async emergencyCleanup(){this.emit("emergency:start");try{let e=0;for(const[t,r]of this.pools.entries())for(;r.length>0;){const t=r.pop();try{t.destroy(),e++,this.stats.destroyed++}catch(e){this.emit("buffer:destroy-error",{buffer:t,error:e})}}return this.pools.clear(),window.gc&&window.gc(),this.emit("emergency:complete",{destroyed:e}),e}catch(e){throw this.emit("emergency:error",{error:e}),e}}async cleanup(){this.emit("cleanup:start");try{for(const[e,t]of this.activeBuffers.entries())this._destroyBuffer(e,t);this.activeBuffers.clear();for(const e of this.pools.values())for(const t of e)t.destroy();this.pools.clear(),this.totalMemoryUsage=0,this.stats.currentActive=0,this.stats.totalPooled=0,this.emit("cleanup:complete")}catch(e){throw this.emit("cleanup:error",{error:e}),e}}_validateAllocation(e,t){if(e<=0||e>this.config.maxBufferSize)throw new Error(`Invalid buffer size: ${e}. Must be between 1 and ${this.config.maxBufferSize}`);if("number"!=typeof t)throw new Error("Buffer usage must be a number")}_getPoolKey(e,t){return`${e}-${t}`}_getFromPool(e){const t=this.pools.get(e);return t&&t.length>0?t.pop():null}_addToPool(e,t){const r=t.poolKey;this.pools.has(r)||this.pools.set(r,[]);const i=this.pools.get(r);return!(i.length>=this.config.maxPoolSize||(e._pooledAt=performance.now(),i.push(e),this.stats.totalPooled++,0))}_destroyBuffer(e,t){try{e.destroy(),this.stats.destroyed++,this.emit("buffer:destroyed",{buffer:e,metadata:t})}catch(t){this.emit("buffer:destroy-error",{buffer:e,error:t})}}_shouldDestroyBuffer(e,t){return t.size>this.config.maxBufferSize/4}_shouldRunGC(){return this.totalMemoryUsage/this.config.maxBufferSize>this.config.gcThreshold}async _runGC(){try{await this.gc({aggressive:!1})}catch(e){this.emit("gc:error",{error:e})}}_calculateBufferSize(e){if(e instanceof ArrayBuffer)return e.byteLength;if(ArrayBuffer.isView(e))return e.byteLength;if(Array.isArray(e))return 4*e.length;throw new Error("Cannot calculate buffer size for data type")}_getTotalPooledBuffers(){return Array.from(this.pools.values()).reduce((e,t)=>e+t.length,0)}_checkMemoryPressure(){const e=this.totalMemoryUsage/this.config.maxBufferSize;e>=.95?(this.emit("memory:critical",{memoryRatio:e,totalUsage:this.totalMemoryUsage,maxSize:this.config.maxBufferSize}),setTimeout(()=>this.emergencyCleanup(),0)):e>=this.config.gcThreshold?(this.emit("memory:pressure",{memoryRatio:e,totalUsage:this.totalMemoryUsage,maxSize:this.config.maxBufferSize}),setTimeout(()=>this.forceGC(),0)):e>=.6&&(this.emit("memory:warning",{memoryRatio:e,totalUsage:this.totalMemoryUsage,maxSize:this.config.maxBufferSize}),setTimeout(()=>this._runGC(),0))}_runGC(){const e=this._getTotalPooledBuffers();if(e>0){const t=Math.ceil(.2*e);let r=0;for(const[e,i]of this.pools.entries()){for(;i.length>0&&r<t;){const e=i.shift();try{e.destroy(),r++,this.stats.destroyed++}catch(t){this.emit("buffer:destroy-error",{buffer:e,error:t})}}if(0===i.length&&this.pools.delete(e),r>=t)break}this.emit("gc:automatic",{destroyed:r,remaining:this._getTotalPooledBuffers()})}}}const n=a;class s{static getShaderTemplates(){return new Map([["add",e=>`\n @group(0) @binding(0) var<storage, read> input1: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read> input2: array<${e.dataType}>;\n @group(0) @binding(2) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(3) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n output[index] = input1[index] + input2[index];\n }\n `],["sub",e=>`\n @group(0) @binding(0) var<storage, read> input1: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read> input2: array<${e.dataType}>;\n @group(0) @binding(2) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(3) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n output[index] = input1[index] - input2[index];\n }\n `],["mul",e=>`\n @group(0) @binding(0) var<storage, read> input1: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read> input2: array<${e.dataType}>;\n @group(0) @binding(2) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(3) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n output[index] = input1[index] * input2[index];\n }\n `],["div",e=>`\n @group(0) @binding(0) var<storage, read> input1: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read> input2: array<${e.dataType}>;\n @group(0) @binding(2) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(3) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n output[index] = input1[index] / input2[index];\n }\n `],["pow",e=>`\n @group(0) @binding(0) var<storage, read> input1: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read> input2: array<${e.dataType}>;\n @group(0) @binding(2) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(3) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n output[index] = pow(input1[index], input2[index]);\n }\n `],["matmul",e=>`\n @group(0) @binding(0) var<storage, read> input1: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read> input2: array<${e.dataType}>;\n @group(0) @binding(2) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(3) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let row = global_id.x;\n let col = global_id.y;\n let M = params[0]; // rows of A\n let N = params[1]; // cols of B\n let K = params[2]; // cols of A, rows of B\n \n if (row >= M || col >= N) { return; }\n \n var sum = 0.0;\n for (var k = 0u; k < K; k = k + 1u) {\n sum = sum + input1[row * K + k] * input2[k * N + col];\n }\n output[row * N + col] = sum;\n }\n `],["bmm",e=>`\n @group(0) @binding(0) var<storage, read> input1: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read> input2: array<${e.dataType}>;\n @group(0) @binding(2) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(3) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let batch = global_id.z;\n let row = global_id.x;\n let col = global_id.y;\n \n let B = params[0]; // batch size\n let M = params[1]; // rows\n let N = params[2]; // cols of second matrix\n let K = params[3]; // cols of first matrix\n \n if (batch >= B || row >= M || col >= N) { return; }\n \n let batch_offset1 = batch * M * K;\n let batch_offset2 = batch * K * N;\n let batch_offset_out = batch * M * N;\n \n var sum = 0.0;\n for (var k = 0u; k < K; k = k + 1u) {\n sum = sum + input1[batch_offset1 + row * K + k] * input2[batch_offset2 + k * N + col];\n }\n output[batch_offset_out + row * N + col] = sum;\n }\n `],["transpose",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(2) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let rows = params[0];\n let cols = params[1];\n let size = rows * cols;\n \n if (index >= size) { return; }\n \n let row = index / cols;\n let col = index % cols;\n let transposed_index = col * rows + row;\n \n output[transposed_index] = input[index];\n }\n `],["relu",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(2) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n output[index] = max(input[index], 0.0);\n }\n `],["leaky_relu",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(2) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n let negative_slope = bitcast<f32>(params[1]);\n if (index >= size) { return; }\n let val = input[index];\n output[index] = select(negative_slope * val, val, val > 0.0);\n }\n `],["sigmoid",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(2) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n output[index] = 1.0 / (1.0 + exp(-input[index]));\n }\n `],["tanh",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(2) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n output[index] = tanh(input[index]);\n }\n `],["gelu",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(2) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n let x = input[index];\n // GELU approximation: 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x^3)))\n let sqrt_2_over_pi = 0.7978845608;\n let inner = sqrt_2_over_pi * (x + 0.044715 * x * x * x);\n output[index] = 0.5 * x * (1.0 + tanh(inner));\n }\n `],["softmax",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(2) var<uniform> params: array<u32, 4>;\n\n var<workgroup> shared_max: f32;\n var<workgroup> shared_sum: f32;\n\n @compute @workgroup_size(${Math.min(e.workgroupSize[0],256)})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>,\n @builtin(local_invocation_id) local_id: vec3<u32>,\n @builtin(workgroup_id) workgroup_id: vec3<u32>) {\n let batch_size = params[0];\n let dim_size = params[1];\n let batch_idx = workgroup_id.x;\n let local_idx = local_id.x;\n \n if (batch_idx >= batch_size) { return; }\n \n let batch_offset = batch_idx * dim_size;\n \n // Find maximum for numerical stability\n var max_val = -3.4028235e+38; // -FLT_MAX\n for (var i = local_idx; i < dim_size; i = i + ${Math.min(e.workgroupSize[0],256)}u) {\n max_val = max(max_val, input[batch_offset + i]);\n }\n \n // Reduce maximum across workgroup\n workgroupBarrier();\n if (local_idx == 0u) {\n shared_max = max_val;\n }\n for (var stride = 1u; stride < ${Math.min(e.workgroupSize[0],256)}u; stride = stride * 2u) {\n workgroupBarrier();\n if (local_idx >= stride) {\n shared_max = max(shared_max, max_val);\n }\n }\n workgroupBarrier();\n \n // Compute exponentials and sum\n var sum = 0.0;\n for (var i = local_idx; i < dim_size; i = i + ${Math.min(e.workgroupSize[0],256)}u) {\n let exp_val = exp(input[batch_offset + i] - shared_max);\n sum = sum + exp_val;\n output[batch_offset + i] = exp_val;\n }\n \n // Reduce sum across workgroup\n workgroupBarrier();\n if (local_idx == 0u) {\n shared_sum = sum;\n }\n for (var stride = 1u; stride < ${Math.min(e.workgroupSize[0],256)}u; stride = stride * 2u) {\n workgroupBarrier();\n if (local_idx >= stride) {\n shared_sum = shared_sum + sum;\n }\n }\n workgroupBarrier();\n \n // Normalize\n for (var i = local_idx; i < dim_size; i = i + ${Math.min(e.workgroupSize[0],256)}u) {\n output[batch_offset + i] = output[batch_offset + i] / shared_sum;\n }\n }\n `],["sum",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(2) var<uniform> params: array<u32, 4>;\n\n var<workgroup> shared_data: array<f32, ${e.workgroupSize[0]}>;\n\n @compute @workgroup_size(${e.workgroupSize[0]})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>,\n @builtin(local_invocation_id) local_id: vec3<u32>,\n @builtin(workgroup_id) workgroup_id: vec3<u32>) {\n let size = params[0];\n let local_idx = local_id.x;\n let global_idx = global_id.x;\n \n // Load data into shared memory\n var sum = 0.0;\n for (var i = global_idx; i < size; i = i + ${e.workgroupSize[0]}u) {\n sum = sum + input[i];\n }\n shared_data[local_idx] = sum;\n \n workgroupBarrier();\n \n // Parallel reduction\n for (var stride = ${e.workgroupSize[0]/2}u; stride > 0u; stride = stride >> 1u) {\n if (local_idx < stride) {\n shared_data[local_idx] = shared_data[local_idx] + shared_data[local_idx + stride];\n }\n workgroupBarrier();\n }\n \n if (local_idx == 0u) {\n output[workgroup_id.x] = shared_data[0];\n }\n }\n `],["mean",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(2) var<uniform> params: array<u32, 4>;\n\n var<workgroup> shared_data: array<f32, ${e.workgroupSize[0]}>;\n\n @compute @workgroup_size(${e.workgroupSize[0]})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>,\n @builtin(local_invocation_id) local_id: vec3<u32>,\n @builtin(workgroup_id) workgroup_id: vec3<u32>) {\n let size = params[0];\n let local_idx = local_id.x;\n let global_idx = global_id.x;\n \n var sum = 0.0;\n for (var i = global_idx; i < size; i = i + ${e.workgroupSize[0]}u) {\n sum = sum + input[i];\n }\n shared_data[local_idx] = sum;\n \n workgroupBarrier();\n \n for (var stride = ${e.workgroupSize[0]/2}u; stride > 0u; stride = stride >> 1u) {\n if (local_idx < stride) {\n shared_data[local_idx] = shared_data[local_idx] + shared_data[local_idx + stride];\n }\n workgroupBarrier();\n }\n \n if (local_idx == 0u) {\n output[workgroup_id.x] = shared_data[0] / f32(size);\n }\n }\n `],["conv2d",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read> weight: array<${e.dataType}>;\n @group(0) @binding(2) var<storage, read> bias: array<${e.dataType}>;\n @group(0) @binding(3) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(4) var<uniform> params: array<u32, 8>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let out_y = global_id.x;\n let out_x = global_id.y;\n let out_c = global_id.z;\n \n let batch_size = params[0];\n let in_channels = params[1];\n let in_height = params[2];\n let in_width = params[3];\n let out_channels = params[4];\n let out_height = params[5];\n let out_width = params[6];\n let kernel_size = params[7];\n \n if (out_y >= out_height || out_x >= out_width || out_c >= out_channels) { return; }\n \n var sum = 0.0;\n \n for (var in_c = 0u; in_c < in_channels; in_c = in_c + 1u) {\n for (var ky = 0u; ky < kernel_size; ky = ky + 1u) {\n for (var kx = 0u; kx < kernel_size; kx = kx + 1u) {\n let in_y = out_y + ky;\n let in_x = out_x + kx;\n \n if (in_y < in_height && in_x < in_width) {\n let input_idx = in_c * in_height * in_width + in_y * in_width + in_x;\n let weight_idx = out_c * in_channels * kernel_size * kernel_size + \n in_c * kernel_size * kernel_size + ky * kernel_size + kx;\n sum = sum + input[input_idx] * weight[weight_idx];\n }\n }\n }\n }\n \n sum = sum + bias[out_c];\n let output_idx = out_c * out_height * out_width + out_y * out_width + out_x;\n output[output_idx] = sum;\n }\n `],["maxpool2d",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(2) var<uniform> params: array<u32, 8>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let out_y = global_id.x;\n let out_x = global_id.y;\n let c = global_id.z;\n \n let channels = params[0];\n let in_height = params[1];\n let in_width = params[2];\n let out_height = params[3];\n let out_width = params[4];\n let kernel_size = params[5];\n let stride = params[6];\n \n if (out_y >= out_height || out_x >= out_width || c >= channels) { return; }\n \n var max_val = -3.4028235e+38; // -FLT_MAX\n \n for (var ky = 0u; ky < kernel_size; ky = ky + 1u) {\n for (var kx = 0u; kx < kernel_size; kx = kx + 1u) {\n let in_y = out_y * stride + ky;\n let in_x = out_x * stride + kx;\n \n if (in_y < in_height && in_x < in_width) {\n let input_idx = c * in_height * in_width + in_y * in_width + in_x;\n max_val = max(max_val, input[input_idx]);\n }\n }\n }\n \n let output_idx = c * out_height * out_width + out_y * out_width + out_x;\n output[output_idx] = max_val;\n }\n `],["exp",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(2) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n output[index] = exp(input[index]);\n }\n `],["log",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(2) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n output[index] = log(input[index]);\n }\n `],["sqrt",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(2) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n output[index] = sqrt(input[index]);\n }\n `],["abs",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(2) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n output[index] = abs(input[index]);\n }\n `],["max",e=>`\n @group(0) @binding(0) var<storage, read> input1: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read> input2: array<${e.dataType}>;\n @group(0) @binding(2) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(3) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n output[index] = max(input1[index], input2[index]);\n }\n `],["min",e=>`\n @group(0) @binding(0) var<storage, read> input1: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read> input2: array<${e.dataType}>;\n @group(0) @binding(2) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(3) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n output[index] = min(input1[index], input2[index]);\n }\n `],["concat",e=>`\n @group(0) @binding(0) var<storage, read> input1: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read> input2: array<${e.dataType}>;\n @group(0) @binding(2) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(3) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size1 = params[0];\n let size2 = params[1];\n let total_size = size1 + size2;\n \n if (index >= total_size) { return; }\n \n if (index < size1) {\n output[index] = input1[index];\n } else {\n output[index] = input2[index - size1];\n }\n }\n `],["slice",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(2) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let start = params[0];\n let end = params[1];\n let step = params[2];\n let output_size = (end - start + step - 1u) / step;\n \n if (index >= output_size) { return; }\n \n let input_index = start + index * step;\n output[index] = input[input_index];\n }\n `],["batch_norm",e=>`\n @group(0) @binding(0) var<storage, read> input: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read> running_mean: array<${e.dataType}>;\n @group(0) @binding(2) var<storage, read> running_var: array<${e.dataType}>;\n @group(0) @binding(3) var<storage, read> weight: array<${e.dataType}>;\n @group(0) @binding(4) var<storage, read> bias: array<${e.dataType}>;\n @group(0) @binding(5) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(6) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let batch_size = params[0];\n let channels = params[1];\n let spatial_size = params[2];\n let eps = bitcast<f32>(params[3]);\n \n if (index >= batch_size * channels * spatial_size) { return; }\n \n let c = (index / spatial_size) % channels;\n let normalized = (input[index] - running_mean[c]) / sqrt(running_var[c] + eps);\n output[index] = normalized * weight[c] + bias[c];\n }\n `],["cross_entropy",e=>`\n @group(0) @binding(0) var<storage, read> logits: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read> targets: array<u32>;\n @group(0) @binding(2) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(3) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize[0]})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let batch_idx = global_id.x;\n let batch_size = params[0];\n let num_classes = params[1];\n \n if (batch_idx >= batch_size) { return; }\n \n let batch_offset = batch_idx * num_classes;\n let target_class = targets[batch_idx];\n \n // Find max for numerical stability\n var max_logit = -3.4028235e+38;\n for (var i = 0u; i < num_classes; i = i + 1u) {\n max_logit = max(max_logit, logits[batch_offset + i]);\n }\n \n // Compute log-sum-exp\n var sum_exp = 0.0;\n for (var i = 0u; i < num_classes; i = i + 1u) {\n sum_exp = sum_exp + exp(logits[batch_offset + i] - max_logit);\n }\n let log_sum_exp = log(sum_exp) + max_logit;\n \n // Cross entropy loss = -log(softmax[target])\n let target_logit = logits[batch_offset + target_class];\n output[batch_idx] = log_sum_exp - target_logit;\n }\n `],["mse_loss",e=>`\n @group(0) @binding(0) var<storage, read> predictions: array<${e.dataType}>;\n @group(0) @binding(1) var<storage, read> targets: array<${e.dataType}>;\n @group(0) @binding(2) var<storage, read_write> output: array<${e.dataType}>;\n @group(0) @binding(3) var<uniform> params: array<u32, 4>;\n\n @compute @workgroup_size(${e.workgroupSize.join(", ")})\n fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {\n let index = global_id.x;\n let size = params[0];\n if (index >= size) { return; }\n let diff = predictions[index] - targets[index];\n output[index] = diff * diff;\n }\n `]])}static getOptimalWorkgroupSize(e,t,r){const i=r?.maxComputeWorkgroupSizeX||256;switch(e){case"matmul":case"bmm":return[Math.min(16,i),Math.min(16,i),1];case"conv2d":return[Math.min(8,i),Math.min(8,i),Math.min(8,i)];case"softmax":case"sum":case"mean":return[Math.min(256,i),1,1];default:return[Math.min(64,i),1,1]}}static getBufferLayout(e,t=2,r=1){return{add:{inputs:2,outputs:1,uniforms:1},sub:{inputs:2,outputs:1,uniforms:1},mul:{inputs:2,outputs:1,uniforms:1},div:{inputs:2,outputs:1,uniforms:1},pow:{inputs:2,outputs:1,uniforms:1},matmul:{inputs:2,outputs:1,uniforms:1},bmm:{inputs:2,outputs:1,uniforms:1},relu:{inputs:1,outputs:1,uniforms:1},sigmoid:{inputs:1,outputs:1,uniforms:1},tanh:{inputs:1,outputs:1,uniforms:1},exp:{inputs:1,outputs:1,uniforms:1},log:{inputs:1,outputs:1,uniforms:1},sqrt:{inputs:1,outputs:1,uniforms:1},abs:{inputs:1,outputs:1,uniforms:1},conv2d:{inputs:3,outputs:1,uniforms:1},batch_norm:{inputs:5,outputs:1,uniforms:1},cross_entropy:{inputs:2,outputs:1,uniforms:1},sum:{inputs:1,outputs:1,uniforms:1},mean:{inputs:1,outputs:1,uniforms:1}}[e]||{inputs:t,outputs:r,uniforms:1}}static generateParams(e,t,r={}){const i=new Uint32Array(4);switch(e){case"matmul":i[0]=t[0].shape?.[0]||Math.sqrt(t[0].length),i[1]=t[1].shape?.[1]||Math.sqrt(t[1].length),i[2]=t[0].shape?.[1]||Math.sqrt(t[0].length);break;case"bmm":i[0]=t[0].shape?.[0]||1,i[1]=t[0].shape?.[1]||Math.sqrt(t[0].length),i[2]=t[1].shape?.[2]||Math.sqrt(t[1].length),i[3]=t[0].shape?.[2]||Math.sqrt(t[0].length);break;case"conv2d":const e=t[0].shape||[1,1,28,28];t[1].shape,i[0]=e[0],i[1]=e[1],i[2]=e[2],i[3]=e[3];break;case"softmax":const a=t[0].shape||[1,t[0].length];i[0]=a.length>1?a[0]:1,i[1]=a.length>1?a[1]:a[0];break;case"leaky_relu":i[0]=t[0].length,i[1]=new Uint32Array(new Float32Array([r.negativeSlope||.01]).buffer)[0];break;default:i[0]=Array.isArray(t)?t[0].length:t.length}return i}}class o extends i.A{constructor(e,t={}){super(),this.device=e,this.config={maxCacheSize:t.maxCacheSize||100,enableWarmup:!1!==t.enableWarmup,enableMetrics:!1!==t.enableMetrics,shaderOptimization:t.shaderOptimization||"balanced",...t},this.pipelines=new Map,this.shaderModules=new Map,this.bindGroupLayouts=new Map,this.accessOrder=new Map,this.compilationQueue=new Map,this.stats={hits:0,misses:0,compilations:0,evictions:0,averageCompileTime:0,totalCompileTime:0},this.shaderTemplates=s.getShaderTemplates()}async get(e,t={}){const r=this._generateKey(e,t);if(this.pipelines.has(r))return this._updateAccess(r),this.stats.hits++,this.emit("cache:hit",{operation:e,key:r}),this.pipelines.get(r);if(this.compilationQueue.has(r))return this.emit("cache:wait",{operation:e,key:r}),await this.compilationQueue.get(r);this.stats.misses++,this.emit("cache:miss",{operation:e,key:r});const i=this._compilePipeline(e,t,r);this.compilationQueue.set(r,i);try{const e=await i;return this.compilationQueue.delete(r),e}catch(e){throw this.compilationQueue.delete(r),e}}async warmup(e=null){if(!this.config.enableWarmup)return;const t=e||["add","multiply","matmul","relu","sigmoid","softmax","conv2d","maxpool","transpose"];this.emit("warmup:start",{operations:t});const r=performance.now(),i=t.map(async e=>{try{await this.get(e,{warmup:!0}),this.emit("warmup:operation",{operation:e})}catch(t){this.emit("warmup:error",{operation:e,error:t})}});await Promise.allSettled(i);const a=performance.now()-r;this.emit("warmup:complete",{operations:t,duration:a})}getBindGroupLayout(e,t={}){const r=this._generateLayoutKey(e,t);if(this.bindGroupLayouts.has(r))return this.bindGroupLayouts.get(r);const i=this._createBindGroupLayout(e,t);return this.bindGroupLayouts.set(r,i),i}getOptimalWorkgroupSize(e,t,r){return s.getOptimalWorkgroupSize(e,t,r)}generateOperationParams(e,t,r={}){return s.generateParams(e,t,r)}async createShaderModule(e,t={}){const r=this._hashString(e);if(this.shaderModules.has(r))return this.shaderModules.get(r);try{const i=this.device.createShaderModule({code:e,...t});return this.shaderModules.set(r,i),this.emit("shader:compiled",{hash:r,size:e.length}),i}catch(t){throw this.emit("shader:error",{hash:r,error:t,source:e.substring(0,100)}),t}}generateShader(e,t={}){const r=this.shaderTemplates[e];if(!r)throw new Error(`No shader template found for operation: ${e}`);return r({workgroupSize:t.workgroupSize||[8,8,1],dataType:t.dataType||"f32",optimization:this.config.shaderOptimization,...t})}getStats(){const e=this.stats.hits+this.stats.misses>0?this.stats.hits/(this.stats.hits+this.stats.misses):0;return{...this.stats,hitRate:e,cacheSize:this.pipelines.size,shaderCacheSize:this.shaderModules.size,layoutCacheSize:this.bindGroupLayouts.size,averageCompileTimeMs:Math.round(100*this.stats.averageCompileTime)/100}}clear(){this.pipelines.clear(),this.shaderModules.clear(),this.bindGroupLayouts.clear(),this.accessOrder.clear(),this.compilationQueue.clear(),this.stats.hits=0,this.stats.misses=0,this.emit("cache:cleared")}cleanup(){this.clear(),this.shaderTemplates.clear(),this.emit("cleanup:complete")}async _compilePipeline(e,t,r){const i=performance.now();try{const a=this.generateShader(e,t),n=await this.createShaderModule(a),s=this.getBindGroupLayout(e,t),o=this.device.createPipelineLayout({bindGroupLayouts:[s]}),u=await this.device.createComputePipelineAsync({layout:o,compute:{module:n,entryPoint:"main"}});this.pipelines.set(r,u),this._updateAccess(r),this._enforceMaxCacheSize();const p=performance.now()-i;return this.stats.compilations++,this.stats.totalCompileTime+=p,this.stats.averageCompileTime=this.stats.totalCompileTime/this.stats.compilations,this.emit("pipeline:compiled",{operation:e,key:r,compileTime:p,cacheSize:this.pipelines.size}),u}catch(t){throw this.emit("pipeline:error",{operation:e,key:r,error:t}),t}}_generateKey(e,t){return[e,t.workgroupSize?.join(",")||"8,8,1",t.dataType||"f32",t.inputCount||2,t.outputCount||1,JSON.stringify(t.constants||{})].join("|")}_generateLayoutKey(e,t){return`${e}|${t.inputCount||2}|${t.outputCount||1}`}_createBindGroupLayout(e,t){const r=s.getBufferLayout(e,t.inputCount,t.outputCount),i=[];for(let e=0;e<r.inputs;e++)i.push({binding:e,visibility:GPUShaderStage.COMPUTE,buffer:{type:"read-only-storage"}});for(let e=0;e<r.outputs;e++)i.push({binding:r.inputs+e,visibility:GPUShaderStage.COMPUTE,buffer:{type:"storage"}});return r.uniforms>0&&i.push({binding:r.inputs+r.outputs,visibility:GPUShaderStage.COMPUTE,buffer:{type:"uniform"}}),this.device.createBindGroupLayout({entries:i})}_updateAccess(e){this.accessOrder.set(e,performance.now())}_enforceMaxCacheSize(){if(this.pipelines.size<=this.config.maxCacheSize)return;let e=null,t=1/0;for(const[r,i]of this.accessOrder.entries())i<t&&(t=i,e=r);e&&(this.pipelines.delete(e),this.accessOrder.delete(e),this.stats.evictions++,this.emit("cache:eviction",{key:e,cacheSize:this.pipelines.size}))}_hashString(e){let t=0;for(let r=0;r<e.length;r++)t=(t<<5)-t+e.charCodeAt(r),t&=t;return t.toString(36)}}const u=o;var p=r(626);class l extends i.A{constructor(e={}){super(),this.config={powerPreference:e.powerPreference||"high-performance",enableProfiling:!1!==e.enableProfiling,maxBufferSize:e.maxBufferSize||268435456,workgroupSize:e.workgroupSize||[64,1,1],enableValidation:!1!==e.enableValidation,...e},this.adapter=null,this.device=null,this.isInitialized=!1,this.bufferManager=null,this.pipelineCache=null,this.supportedFeatures=new Set,this.limits=null,this.stats={computeOperations:0,totalExecutionTime:0,averageExecutionTime:0,memoryUsage:0,lastOperationTime:0}}async initialize(){if(this.isInitialized)return!0;try{if(this.emit("init:start"),!navigator.gpu)throw new Error("WebGPU not supported in this browser");if(this.adapter=await navigator.gpu.requestAdapter({powerPreference:this.config.powerPreference}),!this.adapter)throw new Error("Failed to get WebGPU adapter");this.supportedFeatures=this.adapter.features,this.limits=this.adapter.limits,this.emit("init:adapter",{features:Array.from(this.supportedFeatures),limits:this.limits});const e={requiredFeatures:[],requiredLimits:{}};return this.supportedFeatures.has("timestamp-query")&&e.requiredFeatures.push("timestamp-query"),this.device=await this.adapter.requestDevice(e),this.device.addEventListener("uncapturederror",e=>{const t=e.error;this.emit("device:error",{error:t,type:"uncaptured",timestamp:Date.now()}),p.A.error("WebGPU uncaptured error:",{type:t.constructor.name,message:t.message,stack:t.stack}),this._handleDeviceError(t)}),this.bufferManager=new n(this.device,{maxBufferSize:this.config.maxBufferSize,enablePooling:!0,maxPoolSize:100}),this.pipelineCache=new u(this.device,{maxCacheSize:50,enableWarmup:!0,shaderOptimization:"balanced"}),this._setupEventForwarding(),await this.pipelineCache.warmup(),this.isInitialized=!0,this.emit("init:complete",{device:this.device,features:Array.from(this.supportedFeatures)}),!0}catch(e){return this.emit("init:error",{error:e,timestamp:Date.now()}),p.A.error("WebGPU initialization failed:",{type:e.constructor.name,message:e.message,stack:e.stack,config:this.config}),this.isInitialized=!1,this.initFailureReason=e.message,!1}}async execute(e,t,r={}){if(!this.isInitialized)throw new Error("WebGPU compute engine not initialized");const i=performance.now();this.emit("compute:start",{operation:e,options:r});try{this._validateOperation(e,t,r);const a=Array.isArray(t)?t:[t],n=this.pipelineCache.getOptimalWorkgroupSize(e,a[0].shape||[a[0].length],this.limits),s=await this.pipelineCache.get(e,{workgroupSize:r.workgroupSize||n,dataType:r.dataType||"f32",inputCount:a.length,outputCount:r.outputCount||1,...r}),o=await this._prepareBuffers(t,e,r),u=this._createBindGroup(s,o,r),p=await this._executeComputePass(s,u,o,r),l=performance.now()-i;return this._updateStats(e,l,o),this.emit("compute:complete",{operation:e,executionTime:l,resultSize:p.length}),p}catch(a){const n=performance.now()-i,s={operation:e,error:{type:a.constructor.name,message:a.message,stack:a.stack},executionTime:n,tensors:Array.isArray(t)?t.length:1,options:r,deviceStable:this.deviceStable??!0,timestamp:Date.now()};throw this.emit("compute:error",s),p.A.error("WebGPU compute operation failed:",s),(a.message.includes("out of memory")||"GPUOutOfMemoryError"===a.constructor.name)&&(p.A.warn("GPU memory exhausted, attempting emergency cleanup"),await this.bufferManager.emergencyCleanup(),this.emit("recovery:memory",{operation:e,timestamp:Date.now()})),a}}async executeBatch(e,t={}){const{parallel:r=!1,maxConcurrency:i=4}=t;if(r){const t=new c(i),r=e.map(async e=>{await t.acquire();try{return await this.execute(e.operation,e.tensors,e.options)}finally{t.release()}});return Promise.all(r)}{const t=[];for(const r of e){const e=await this.execute(r.operation,r.tensors,r.options);t.push(e)}return t}}async uploadTensor(e,t={}){const{usage:r=GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_DST}=t;return this.bufferManager.createMappedBuffer(e,r)}async downloadTensor(e,t,r={}){const{format:i=Float32Array}=r,a=this.bufferManager.allocate(4*t,GPUBufferUsage.COPY_DST|GPUBufferUsage.MAP_READ);try{const r=this.device.createCommandEncoder();r.copyBufferToBuffer(e,0,a,0,4*t),this.device.queue.submit([r.finish()]),await a.mapAsync(GPUMapMode.READ);const n=new i(a.getMappedRange().slice());return a.unmap(),n}finally{this.bufferManager.release(a,{forceDestroy:!0})}}getStats(){return{...this.stats,bufferStats:this.bufferManager?.getStats()||{},pipelineStats:this.pipelineCache?.getStats()||{},deviceLimits:this.limits,supportedFeatures:Array.from(this.supportedFeatures||[])}}async cleanup(){this.emit("cleanup:start");try{this.bufferManager&&(await this.bufferManager.cleanup(),this.bufferManager=null),this.pipelineCache&&(this.pipelineCache.cleanup(),this.pipelineCache=null),this.device&&(this.device.destroy(),this.device=null),this.adapter=null,this.isInitialized=!1,this.emit("cleanup:complete")}catch(e){throw this.emit("cleanup:error",{error:e}),e}}_validateOperation(e,t,r){if(!e||"string"!=typeof e)throw new Error("Operation must be a non-empty string");if(!t)throw new Error("Tensors parameter is required");const i=Array.isArray(t)?t:[t];for(const e of i)if(!e||!ArrayBuffer.isView(e)&&!(e instanceof ArrayBuffer))throw new Error("All tensors must be typed arrays or ArrayBuffers")}async _prepareBuffers(e,t,r){const i=Array.isArray(e)?e:[e],a={inputs:[],output:null,params:null};for(const e of i){const t=await this.uploadTensor(e);a.inputs.push(t)}const n=this._calculateOutputSize(t,i,r);a.output=this.bufferManager.allocate(4*n,GPUBufferUsage.STORAGE|GPUBufferUsage.COPY_SRC);const s=this.pipelineCache.generateOperationParams(t,i,r);return a.params=await this.uploadTensor(s,{usage:GPUBufferUsage.UNIFORM|GPUBufferUsage.COPY_DST}),a}_createBindGroup(e,t,r){const i=[];for(let e=0;e<t.inputs.length;e++)i.push({binding:e,resource:{buffer:t.inputs[e]}});return i.push({binding:t.inputs.length,resource:{buffer:t.output}}),i.push({binding:t.inputs.length+1,resource:{buffer:t.params}}),this.device.createBindGroup({layout:e.getBindGroupLayout(0),entries:i})}async _executeComputePass(e,t,r,i){const a=this.device.createCommandEncoder(),n=a.beginComputePass();n.setPipeline(e),n.setBindGroup(0,t);const s=i.workgroupSize||this.config.workgroupSize,o=r.output.size/4,u=Math.ceil(o/s[0]);return n.dispatchWorkgroups(u,1,1),n.end(),this.device.queue.submit([a.finish()]),await this.device.queue.onSubmittedWorkDone(),this.downloadTensor(r.output,o)}_calculateOutputSize(e,t,r){if(r.outputSize)return r.outputSize;const i=t[0],a=e=>ArrayBuffer.isView(e)?e.length:e.byteLength/4;switch(e){case"matmul":return(t[0].shape?.[0]||Math.sqrt(a(t[0])))*(t[1].sha