UNPKG

cuda.js

Version:

CUDA bindings for Node.js

113 lines (101 loc) 3.59 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.reduction = void 0; const gpu_array_1 = require("../gpu-array"); const kernel_1 = require("../kernel"); /** * Reduction operations on GPU arrays */ exports.reduction = { sum: (array) => { const kernel = new kernel_1.Kernel(` extern "C" __global__ void sum_reduction(float* input, float* output, int n) { extern __shared__ float sdata[]; unsigned int tid = threadIdx.x; unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; sdata[tid] = (i < n) ? input[i] : 0; __syncthreads(); for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { sdata[tid] += sdata[tid + s]; } __syncthreads(); } if (tid == 0) { atomicAdd(output, sdata[0]); } }`, 'sum_reduction'); const output = new gpu_array_1.GpuArray(1); output.zero(); const blockSize = 256; const gridSize = Math.ceil(array.size / blockSize); const sharedMem = blockSize * 4; // sizeof(float) * blockSize kernel.run([array, output, array.size], gridSize, blockSize, sharedMem); const result = output.download()[0]; output.free(); kernel.free(); return result; }, max: (array) => { const kernel = new kernel_1.Kernel(` extern "C" __global__ void max_reduction(float* input, float* output, int n) { extern __shared__ float sdata[]; unsigned int tid = threadIdx.x; unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; sdata[tid] = (i < n) ? input[i] : -INFINITY; __syncthreads(); for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { sdata[tid] = fmaxf(sdata[tid], sdata[tid + s]); } __syncthreads(); } if (tid == 0) { atomicMax((int*)output, __float_as_int(sdata[0])); } }`, 'max_reduction'); const output = new gpu_array_1.GpuArray(1); output.fill(-Infinity); const blockSize = 256; const gridSize = Math.ceil(array.size / blockSize); const sharedMem = blockSize * 4; kernel.run([array, output, array.size], gridSize, blockSize, sharedMem); const result = output.download()[0]; output.free(); kernel.free(); return result; }, min: (array) => { const kernel = new kernel_1.Kernel(` extern "C" __global__ void min_reduction(float* input, float* output, int n) { extern __shared__ float sdata[]; unsigned int tid = threadIdx.x; unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; sdata[tid] = (i < n) ? input[i] : INFINITY; __syncthreads(); for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) { if (tid < s) { sdata[tid] = fminf(sdata[tid], sdata[tid + s]); } __syncthreads(); } if (tid == 0) { atomicMin((int*)output, __float_as_int(sdata[0])); } }`, 'min_reduction'); const output = new gpu_array_1.GpuArray(1); output.fill(Infinity); const blockSize = 256; const gridSize = Math.ceil(array.size / blockSize); const sharedMem = blockSize * 4; kernel.run([array, output, array.size], gridSize, blockSize, sharedMem); const result = output.download()[0]; output.free(); kernel.free(); return result; }, mean: (array) => { return exports.reduction.sum(array) / array.size; } }; //# sourceMappingURL=reduction.js.map