webcl-nodep
Version:
A fork of node-webcl without dependencies other than OpenCL
440 lines (369 loc) • 15.2 kB
JavaScript
// Copyright (c) 2011-2012, Motorola Mobility, Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of the Motorola Mobility, Inc. nor the names of its
// contributors may be used to endorse or promote products derived from this
// software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
// DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Multiple bandwidth tests
// Runtime detection: `window` is undefined when this file is not running in a
// browser page.
var nodejs = (typeof window === 'undefined');
if(nodejs) {
// Load the WebCL binding and the clUtils helper module. These assignments use
// no var/let/const, so (in sloppy mode) they create globals rather than
// file-local variables.
WebCL = require('../webcl');
clu = require('../lib/clUtils');
// log() is the output function used by the rest of this file.
log=console.log;
}
// ---- Tunables ---------------------------------------------------------------
// Number of copies issued inside each timed region.
var MEMCOPY_ITERATIONS = 100;
// Default transfer size in bytes (10 M).
var DEFAULT_SIZE = 10 * 1024 * 1024;
// Default step between transfer sizes in bytes (4 M).
var DEFAULT_INCREMENT = 4 * 1024 * 1024;
// 16 M
var CACHE_CLEAR_SIZE = 16 * 1024 * 1024;

// ---- Enumerations -----------------------------------------------------------
// Test modes
var QUICK_MODE = 0;
var RANGE_MODE = 1;
// Memory copy kinds
var DEVICE_TO_HOST = 0;
var HOST_TO_DEVICE = 1;
var DEVICE_TO_DEVICE = 2;
// Memory modes
var PAGEABLE = 0;
var PINNED = 1;
// Access modes
var MAPPED = 0;
var DIRECT = 1;
//First check if the WebCL extension is installed at all
if (WebCL == undefined) {
alert("Unfortunately your system does not support WebCL. " +
"Make sure that you have the WebCL extension installed.");
return;
}
// Create the OpenCL context, asking for every device type.
var ctx=null;
try {
  ctx=WebCL.createContext({
    deviceType: WebCL.DEVICE_TYPE_ALL,
  });
}
catch(ex) {
  // Raise a standard Error and keep the underlying exception on `cause` so the
  // real reason for the failure is not lost.
  var ctxError = new Error("Can't create CL context");
  ctxError.cause = ex;
  throw ctxError;
}
// List the devices attached to the context and log a one-line summary of each.
// The queried values are also stored on the device object itself
// (units, clock, type, endian, name).
var devices=ctx.getInfo(WebCL.CONTEXT_DEVICES);
log("Found "+devices.length+" devices");
devices.forEach(function(d) {
  d.units=d.getInfo(WebCL.DEVICE_MAX_COMPUTE_UNITS);
  d.clock=d.getInfo(WebCL.DEVICE_MAX_CLOCK_FREQUENCY);
  var typeBits=d.getInfo(WebCL.DEVICE_TYPE);
  d.endian=(d.getInfo(WebCL.DEVICE_ENDIAN_LITTLE) ? "LITTLE" : "BIG");
  d.name=d.getInfo(WebCL.DEVICE_NAME);

  // The OpenCL device type is a bitfield and may combine several flags, so
  // test individual bits rather than comparing for equality.
  if(typeBits & WebCL.DEVICE_TYPE_CPU) d.type="CPU";
  else if(typeBits & WebCL.DEVICE_TYPE_GPU) d.type="GPU";
  else if(typeBits & WebCL.DEVICE_TYPE_ACCELERATOR) d.type="ACCELERATOR";
  else d.type="DEFAULT";

  log(" Device "+d.type+" "+d.name+": "+d.units+" CU @ "+d.clock+" MHz, " +
    d.endian+" endian");
});
// Test configuration: transfer sizes go from `start` up to `end` in steps of
// `increment`, on every device of the context.
var start = DEFAULT_SIZE;        // DEFAULT_SIZE bytes (10 M)
var end = DEFAULT_SIZE * 2;      // twice DEFAULT_SIZE (20 M)
var increment = DEFAULT_INCREMENT;
var mode = RANGE_MODE;
var accMode = DIRECT;
var memMode = PAGEABLE;
var startDevice = 0;
var endDevice = devices.length - 1;
// Command queue shared by the transfer tests; assigned by createQueue().
var cqCommandQueue = null;

// Run one test per copy direction, in this order.
var copyKinds = [HOST_TO_DEVICE, DEVICE_TO_HOST, DEVICE_TO_DEVICE];
for (var kindIndex = 0; kindIndex < copyKinds.length; kindIndex++) {
  testBandwidth(start, end, increment, mode, copyKinds[kindIndex], accMode, memMode, startDevice, endDevice);
}
/**
 * Creates a command queue (with WebCL.QUEUE_PROFILING_ENABLE) for one device of
 * the context and stores it in the shared `cqCommandQueue` variable.
 *
 * Any queue stored previously is simply overwritten; no release call is made.
 *
 * @param {number} device index into the `devices` array
 */
function createQueue(device) {
  var target = devices[device];
  var queue = ctx.createCommandQueue(target, WebCL.QUEUE_PROFILING_ENABLE);
  cqCommandQueue = queue;
}
///////////////////////////////////////////////////////////////////////////////
// Run a bandwidth test
///////////////////////////////////////////////////////////////////////////////
/**
 * Runs one bandwidth test, dispatching on the requested test mode.
 *
 * The parameter list matches the nine positional arguments this function is
 * called with, so each name refers to the value it actually receives.
 *
 * @param {number} start       first transfer size in bytes (RANGE_MODE only)
 * @param {number} end         last transfer size in bytes (RANGE_MODE only)
 * @param {number} increment   step between transfer sizes (RANGE_MODE only)
 * @param {number} mode        QUICK_MODE or RANGE_MODE; other values do nothing
 * @param {number} kind        DEVICE_TO_HOST, HOST_TO_DEVICE or DEVICE_TO_DEVICE
 * @param {number} accMode     MAPPED or DIRECT
 * @param {number} memMode     PAGEABLE or PINNED
 * @param {number} startDevice index of the first device to test
 * @param {number} endDevice   index of the last device to test (inclusive)
 */
function testBandwidth(start, end, increment,
                       mode, kind, accMode,
                       memMode, startDevice, endDevice)
{
  switch(mode)
  {
  case QUICK_MODE:
    // Quick mode ignores start/end/increment and uses a single default size.
    testBandwidthQuick(DEFAULT_SIZE, kind, accMode, memMode, startDevice, endDevice);
    break;
  case RANGE_MODE:
    testBandwidthRange(start, end, increment, kind, accMode, memMode, startDevice, endDevice);
    break;
  //case SHMOO_MODE:
  //  testBandwidthShmoo(kind, accMode, memMode, startDevice, endDevice);
  //  break;
  default:
    break;
  }
}
//////////////////////////////////////////////////////////////////////
//Run a quick mode bandwidth test
//////////////////////////////////////////////////////////////////////
/**
 * Runs a quick-mode bandwidth test: a range test whose start and end are the
 * same size, so exactly one transfer size is measured.
 *
 * The parameter list matches the positional arguments that are forwarded to
 * testBandwidthRange, so each name refers to the value it actually receives.
 *
 * @param {number} size        transfer size in bytes
 * @param {number} kind        DEVICE_TO_HOST, HOST_TO_DEVICE or DEVICE_TO_DEVICE
 * @param {number} accMode     MAPPED or DIRECT
 * @param {number} memMode     PAGEABLE or PINNED
 * @param {number} startDevice index of the first device to test
 * @param {number} endDevice   index of the last device to test (inclusive)
 */
function testBandwidthQuick(size, kind, accMode,
                            memMode, startDevice, endDevice)
{
  testBandwidthRange(size, size, DEFAULT_INCREMENT, kind, accMode, memMode, startDevice, endDevice);
}
///////////////////////////////////////////////////////////////////////
//Run a range mode bandwidth test
//////////////////////////////////////////////////////////////////////
/**
 * Runs a range-mode bandwidth test: measures every transfer size
 * start, start + increment, ... that does not exceed `end`, on each device
 * from startDevice to endDevice, then prints the results.
 *
 * The bandwidth reported for a size is the SUM of the values measured on all
 * tested devices.
 *
 * @param {number} start       first transfer size in bytes
 * @param {number} end         upper bound (inclusive) for the transfer size
 * @param {number} increment   step between transfer sizes in bytes
 * @param {number} memCpyKind  DEVICE_TO_HOST, HOST_TO_DEVICE or DEVICE_TO_DEVICE
 * @param {number} accMode     MAPPED or DIRECT
 * @param {number} memMode     PAGEABLE or PINNED
 * @param {number} startDevice index of the first device to test
 * @param {number} endDevice   index of the last device to test (inclusive)
 */
function testBandwidthRange(start, end, increment,
                            memCpyKind, accMode, memMode,
                            startDevice, endDevice)
{
  // Number of transfer sizes to run. The quotient is truncated so that a range
  // that is not a whole multiple of `increment` never steps past `end`.
  var count = 1 + Math.floor((end - start) / increment);
  var memSizes = [];
  var bandwidths = [];
  var i;

  // Lay out the sizes and zero the accumulators before any device is measured.
  for (i = 0; i < count; i++) {
    memSizes[i] = start + i * increment;
    bandwidths[i] = 0.0;
  }

  // Use the devices asked for by the caller
  for (var d = startDevice; d <= endDevice; d++)
  {
    // Allocate the command queue used by the transfer tests for this device
    createQueue(d);

    // Run each of the copies and accumulate its bandwidth
    for (i = 0; i < count; i++)
    {
      switch(memCpyKind)
      {
      case DEVICE_TO_HOST:
        bandwidths[i] += testDeviceToHostTransfer(memSizes[i], accMode, memMode);
        break;
      case HOST_TO_DEVICE:
        bandwidths[i] += testHostToDeviceTransfer(memSizes[i], accMode, memMode);
        break;
      case DEVICE_TO_DEVICE:
        bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]);
        break;
      }
    }
  } // Bandwidth accumulated over all the devices

  printResults(memSizes, bandwidths, count, memCpyKind, accMode, memMode, (1 + endDevice - startDevice));
}
/////////////////////////////////////////////////////////
//print results in an easily read format
////////////////////////////////////////////////////////
/**
 * Logs a description of the test configuration followed by one line per
 * transfer size with its bandwidth, then an empty log() call.
 *
 * @param {Array}  memSizes   transfer sizes, one per row
 * @param {Array}  bandwidths bandwidth value for each row
 * @param {number} count      number of rows to print
 * @param {number} kind       DEVICE_TO_HOST, HOST_TO_DEVICE or DEVICE_TO_DEVICE
 * @param {number} accMode    MAPPED or DIRECT (not shown for device-to-device)
 * @param {number} memMode    PAGEABLE or PINNED (not shown for device-to-device)
 * @param {number} iNumDevs   number of devices shown in the header
 */
function printResults(memSizes, bandwidths, count, kind, accMode, memMode, iNumDevs)
{
  // Header line describing the configuration
  var header;
  if (kind == DEVICE_TO_DEVICE) {
    header = "Device -> Device Bandwidth, " + iNumDevs + " Device(s)";
  }
  else {
    var direction = "Host -> Device";
    if (kind == DEVICE_TO_HOST) {
      direction = "Device -> Host";
    }
    var memory = "Pinned memory";
    if (memMode == PAGEABLE) {
      memory = "Paged memory";
    }
    var access = ", mapped access";
    if (accMode == DIRECT) {
      access = ", direct access\n";
    }
    header = direction + " Bandwidth, " + iNumDevs + " Device(s), " + memory + access;
  }
  log(header);

  // Table: column titles, then one row per transfer size
  log(" Transfer Size (Bytes)\tBandwidth(MB/s)");
  var row = 0;
  while (row < count) {
    var size = memSizes[row];
    // sizes below 10000 get one more tab
    var padding = "\t\t\t";
    if (size < 10000) {
      padding += "\t";
    }
    log(" " + size + padding + bandwidths[row]);
    row++;
  }
  log();
}
///////////////////////////////////////////////////////////////////////////////
//test the bandwidth of a device to host memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
/**
 * Measures the bandwidth of copying `memSize` bytes from a device buffer to
 * host memory, repeated MEMCOPY_ITERATIONS times on the shared
 * `cqCommandQueue`.
 *
 * @param {number} memSize  transfer size in bytes
 * @param {number} accMode  DIRECT: enqueueReadBuffer into host memory;
 *                          otherwise the device buffer is mapped and copied
 *                          element by element
 * @param {number} memMode  PINNED: host memory is a mapped
 *                          MEM_ALLOC_HOST_PTR buffer; otherwise a Uint8Array
 * @returns {number} bandwidth in MB/s (1 MB = 2^20 bytes); not finite when the
 *                   measured time is 0
 */
function testDeviceToHostTransfer(memSize, accMode, memMode)
{
  var pinned = (memMode == PINNED);
  var cmPinnedData = null;
  var h_data = null;
  var i, j;

  // Allocate and initialise host memory, pinned or conventional
  if (pinned) {
    // Create a host buffer and get a mapped pointer to it
    cmPinnedData = ctx.createBuffer(WebCL.MEM_READ_WRITE | WebCL.MEM_ALLOC_HOST_PTR, memSize);
    h_data = cqCommandQueue.enqueueMapBuffer(cmPinnedData, WebCL.TRUE, WebCL.MAP_WRITE, 0, memSize);
    for (i = 0; i < memSize; i++) {
      h_data[i] = (i & 0xff);
    }
    // Unmap and make data in the host buffer valid
    cqCommandQueue.enqueueUnmapMemObject(cmPinnedData, h_data);
  }
  else {
    // Standard host allocation
    h_data = new Uint8Array(memSize);
    for (i = 0; i < memSize; i++) {
      h_data[i] = (i & 0xff);
    }
  }

  // Allocate device memory and initialise it from the host data
  var cmDevData = ctx.createBuffer(WebCL.MEM_READ_WRITE, memSize);
  if (pinned) {
    // Map the pinned buffer again; it stays mapped until the clean-up below
    h_data = cqCommandQueue.enqueueMapBuffer(cmPinnedData, WebCL.TRUE, WebCL.MAP_WRITE, 0, memSize);
  }
  cqCommandQueue.enqueueWriteBuffer(cmDevData, WebCL.FALSE, 0, memSize, h_data);

  // Sync queue to host, start the timer, and copy data from device to host
  cqCommandQueue.finish();
  var start = new Date();
  if (accMode == DIRECT) {
    // DIRECT: API access to device buffer
    for (i = 0; i < MEMCOPY_ITERATIONS; i++) {
      cqCommandQueue.enqueueReadBuffer(cmDevData, WebCL.FALSE, 0, memSize, h_data);
    }
    cqCommandQueue.finish();
  }
  else {
    // MAPPED: mapped pointer to device buffer for conventional element access
    // NOTE(review): the buffer is mapped with MAP_WRITE although it is only
    // read here - confirm whether MAP_READ was intended.
    var dm_idata = cqCommandQueue.enqueueMapBuffer(cmDevData, WebCL.TRUE, WebCL.MAP_WRITE, 0, memSize);
    for (i = 0; i < MEMCOPY_ITERATIONS; i++) {
      for (j = 0; j < memSize; ++j) {
        h_data[j] = dm_idata[j];
      }
    }
    cqCommandQueue.enqueueUnmapMemObject(cmDevData, dm_idata);
  }

  // Subtracting two Dates yields milliseconds; convert to seconds
  var elapsedTimeInSec = (new Date() - start) / 1000;

  // Calculate bandwidth in MB/s
  var bandwidthInMBs = (memSize * MEMCOPY_ITERATIONS) / (elapsedTimeInSec * (1 << 20));

  // Clean up: unmap the pinned host buffer (buffers themselves are not released)
  if (cmPinnedData) {
    cqCommandQueue.enqueueUnmapMemObject(cmPinnedData, h_data);
  }
  return bandwidthInMBs;
}
///////////////////////////////////////////////////////////////////////////////
//test the bandwidth of a host to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
/**
 * Measures the bandwidth of copying `memSize` bytes from host memory to a
 * device buffer, repeated MEMCOPY_ITERATIONS times on the shared
 * `cqCommandQueue`.
 *
 * @param {number} memSize  transfer size in bytes
 * @param {number} accMode  DIRECT: enqueueWriteBuffer from host memory;
 *                          otherwise the device buffer is mapped and filled
 *                          element by element
 * @param {number} memMode  PINNED: host memory is a mapped
 *                          MEM_ALLOC_HOST_PTR buffer; otherwise a Uint8Array
 * @returns {number} bandwidth in MB/s (1 MB = 2^20 bytes); not finite when the
 *                   measured time is 0
 */
function testHostToDeviceTransfer(memSize, accMode, memMode) {
  var pinned = (memMode == PINNED);
  var cmPinnedData = null;
  var h_data = null;
  var i, j;

  // Allocate and initialise host memory, pinned or conventional
  if (pinned) {
    // Create a host buffer and get a mapped pointer to it
    cmPinnedData = ctx.createBuffer(WebCL.MEM_READ_WRITE | WebCL.MEM_ALLOC_HOST_PTR, memSize);
    h_data = cqCommandQueue.enqueueMapBuffer(cmPinnedData, WebCL.TRUE, WebCL.MAP_WRITE, 0, memSize);
    for (i = 0; i < memSize; i++) {
      h_data[i] = (i & 0xff);
    }
    // Unmap and make data in the host buffer valid
    cqCommandQueue.enqueueUnmapMemObject(cmPinnedData, h_data);
    h_data = null; // buffer is unmapped
  }
  else {
    // Standard host allocation
    h_data = new Uint8Array(memSize);
    for (i = 0; i < memSize; i++) {
      h_data[i] = (i & 0xff);
    }
  }

  // Allocate device memory
  var cmDevData = ctx.createBuffer(WebCL.MEM_READ_WRITE, memSize);

  // Sync queue to host, start the timer, and copy data from host to device.
  // In pinned mode the re-mapping of the host buffer is inside the timed region.
  cqCommandQueue.finish();
  var start = new Date();
  if (accMode == DIRECT) {
    if (pinned) {
      // Get a mapped pointer to the pinned host data
      h_data = cqCommandQueue.enqueueMapBuffer(cmPinnedData, WebCL.TRUE, WebCL.MAP_READ, 0, memSize);
    }
    // DIRECT: API access to device buffer
    for (i = 0; i < MEMCOPY_ITERATIONS; i++) {
      cqCommandQueue.enqueueWriteBuffer(cmDevData, WebCL.FALSE, 0, memSize, h_data);
    }
    cqCommandQueue.finish();
  }
  else {
    // MAPPED: mapped pointer to device buffer and conventional element access
    var dm_idata = cqCommandQueue.enqueueMapBuffer(cmDevData, WebCL.TRUE, WebCL.MAP_WRITE, 0, memSize);
    if (pinned) {
      h_data = cqCommandQueue.enqueueMapBuffer(cmPinnedData, WebCL.TRUE, WebCL.MAP_READ, 0, memSize);
    }
    for (i = 0; i < MEMCOPY_ITERATIONS; i++) {
      for (j = 0; j < memSize; j++) {
        dm_idata[j] = h_data[j];
      }
    }
    cqCommandQueue.enqueueUnmapMemObject(cmDevData, dm_idata);
  }

  // Subtracting two Dates yields milliseconds; convert to seconds
  var elapsedTimeInSec = (new Date() - start) / 1000;

  // Calculate bandwidth in MB/s
  var bandwidthInMBs = (memSize * MEMCOPY_ITERATIONS) / (elapsedTimeInSec * (1 << 20));

  // Clean up: unmap the pinned host buffer (buffers themselves are not released)
  if (cmPinnedData) {
    cqCommandQueue.enqueueUnmapMemObject(cmPinnedData, h_data);
  }
  return bandwidthInMBs;
}
///////////////////////////////////////////////////////////////////////////////
//test the bandwidth of a device to device memcopy of a specific size
///////////////////////////////////////////////////////////////////////////////
/**
 * Measures the bandwidth of copying `memSize` bytes from one device buffer to
 * another, repeated MEMCOPY_ITERATIONS times on the shared `cqCommandQueue`.
 *
 * @param {number} memSize transfer size in bytes
 * @returns {number} bandwidth in MB/s (1 MB = 2^20 bytes), counted as twice the
 *                   bytes copied; not finite when the measured time is 0
 */
function testDeviceToDeviceTransfer(memSize)
{
  var i;

  // Allocate host memory and initialise it
  var h_idata = new Uint8Array(memSize);
  for (i = 0; i < memSize; i++) {
    h_idata[i] = (i & 0xff);
  }

  // Allocate device input and output memory and initialise the device input memory
  var d_idata = ctx.createBuffer(WebCL.MEM_READ_ONLY, memSize);
  var d_odata = ctx.createBuffer(WebCL.MEM_WRITE_ONLY, memSize);
  cqCommandQueue.enqueueWriteBuffer(d_idata, WebCL.TRUE, 0, memSize, h_idata);

  // Sync queue to host, start the timer, and copy data from one device buffer
  // to the other
  cqCommandQueue.finish();
  var start = new Date();
  for (i = 0; i < MEMCOPY_ITERATIONS; i++) {
    cqCommandQueue.enqueueCopyBuffer(d_idata, d_odata, 0, 0, memSize);
  }

  // Sync with the device
  cqCommandQueue.finish();

  // Subtracting two Dates yields milliseconds; convert to seconds
  var elapsedTimeInSec = (new Date() - start) / 1000;

  // Calculate bandwidth in MB/s
  // This is for kernels that read and write GMEM simultaneously
  // Obtained Throughput for unidirectional block copies will be 1/2 of this #
  var bandwidthInMBs = 2.0 * (memSize * MEMCOPY_ITERATIONS) / (elapsedTimeInSec * (1 << 20));

  // The device buffers are not released explicitly here
  return bandwidthInMBs;
}