// cuda.js — CUDA bindings for Node.js (compiled output; GPU reduction operations).
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.reduction = void 0;
const gpu_array_1 = require("../gpu-array");
const kernel_1 = require("../kernel");
/**
* Reduction operations on GPU arrays
*/
exports.reduction = {
sum: (array) => {
const kernel = new kernel_1.Kernel(`
extern "C" __global__ void sum_reduction(float* input, float* output, int n) {
extern __shared__ float sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? input[i] : 0;
__syncthreads();
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid == 0) {
atomicAdd(output, sdata[0]);
}
}`, 'sum_reduction');
const output = new gpu_array_1.GpuArray(1);
output.zero();
const blockSize = 256;
const gridSize = Math.ceil(array.size / blockSize);
const sharedMem = blockSize * 4; // sizeof(float) * blockSize
kernel.run([array, output, array.size], gridSize, blockSize, sharedMem);
const result = output.download()[0];
output.free();
kernel.free();
return result;
},
max: (array) => {
const kernel = new kernel_1.Kernel(`
extern "C" __global__ void max_reduction(float* input, float* output, int n) {
extern __shared__ float sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? input[i] : -INFINITY;
__syncthreads();
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] = fmaxf(sdata[tid], sdata[tid + s]);
}
__syncthreads();
}
if (tid == 0) {
atomicMax((int*)output, __float_as_int(sdata[0]));
}
}`, 'max_reduction');
const output = new gpu_array_1.GpuArray(1);
output.fill(-Infinity);
const blockSize = 256;
const gridSize = Math.ceil(array.size / blockSize);
const sharedMem = blockSize * 4;
kernel.run([array, output, array.size], gridSize, blockSize, sharedMem);
const result = output.download()[0];
output.free();
kernel.free();
return result;
},
min: (array) => {
const kernel = new kernel_1.Kernel(`
extern "C" __global__ void min_reduction(float* input, float* output, int n) {
extern __shared__ float sdata[];
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? input[i] : INFINITY;
__syncthreads();
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] = fminf(sdata[tid], sdata[tid + s]);
}
__syncthreads();
}
if (tid == 0) {
atomicMin((int*)output, __float_as_int(sdata[0]));
}
}`, 'min_reduction');
const output = new gpu_array_1.GpuArray(1);
output.fill(Infinity);
const blockSize = 256;
const gridSize = Math.ceil(array.size / blockSize);
const sharedMem = blockSize * 4;
kernel.run([array, output, array.size], gridSize, blockSize, sharedMem);
const result = output.download()[0];
output.free();
kernel.free();
return result;
},
mean: (array) => {
return exports.reduction.sum(array) / array.size;
}
};
//# sourceMappingURL=reduction.js.map