/*-----------------------------------------------------------------------------+
| Extended Memory Semantics (EMS) Version 1.4.0 |
| Synthetic Semantics http://www.synsem.com/ mogill@synsem.com |
+-----------------------------------------------------------------------------+
| Copyright (c) 2011-2014, Synthetic Semantics LLC. All rights reserved. |
| Copyright (c) 2015-2016, Jace A Mogill. All rights reserved. |
| |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions are met: |
| * Redistributions of source code must retain the above copyright |
| notice, this list of conditions and the following disclaimer. |
| * Redistributions in binary form must reproduce the above copyright |
| notice, this list of conditions and the following disclaimer in the |
| documentation and/or other materials provided with the distribution. |
| * Neither the name of the Synthetic Semantics nor the names of its |
| contributors may be used to endorse or promote products derived |
| from this software without specific prior written permission. |
| |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SYNTHETIC |
| SEMANTICS LLC BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
+-----------------------------------------------------------------------------*/
'use strict';
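// Run as:  node wordCount.js <# processes> /path/to/your/document/collection/
// (this matches the argument check below)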
var ems = require('ems')(parseInt(process.argv[2], 10));
var fs = require('fs');
//-------------------------------------------------------------------
// Allocate the word count dictionary
var maxNKeys = 100000000;
var wordCounts = ems.new({
    dimensions: [maxNKeys],   // Maximum # of different keys the array can store
    heapSize: maxNKeys * 10,  // 10 bytes of storage per key, used for the key (dictionary word) itself
    useMap: true,             // Use a key-index mapping, not integer indexes
    setFEtags: 'full',        // Initial full/empty state of array elements
    doDataFill: true,         // Initialize data values
    dataFill: 0               // Initial value of new keys
});
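// With useMap:true the array is indexed by string keys rather than integers,
// e.g. wordCounts.faa('the', 1) or wordCounts.read('the').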
//-------------------------------------------------------------------
// Use a key-value mapped array to store execution statistics
var nStats = 200;
var stats = ems.new({
    dimensions: [nStats],    // Maximum number of stats that can be stored
    heapSize: nStats * 200,  // Space for keys, data values, and sorting
    useMap: true,            // Use a key-index mapping, not integer indexes
    setFEtags: 'full',       // Initial full/empty state of array elements
    doDataFill: true,        // Initialize data values
    dataFill: 0              // Initial value of new keys
});
// ===============================================================================
// Perform Word Count
//
// Each word is used as a key into the EMS array; the value is the number
// of times the word has been encountered, and it is incremented
// atomically.
//
// Bookkeeping for statistics is performed by keeping track of the total number
// of words and bytes processed in the "stats" EMS array.
//
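// Note: EMS's faa() is an atomic fetch-and-add: it adds its operand to the
// element and returns the value the element held before the addition.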
var doc_path = '/path/to/your/document/collection/';
if (!process.argv[3]) {
    console.log("usage: wordCount <# processes> /path/to/your/document/collection/");
    process.exit(1);  // A bare "return -1" would not set the process exit status
} else {
    doc_path = process.argv[3];
}
var file_list = fs.readdirSync(doc_path);
var splitPattern = /[ \n,.\\\/_\-<>:;!@#$%&*()=[\]|"'{}?—]/;
// Loop over the files in parallel, counting words
var startTime = Date.now();
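// ems.parForEach() splits the iteration space across all the EMS processes;
// each process executes the loop body only for its share of the files.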
ems.parForEach(0, file_list.length, function (fileNum) {
    try {
        var text = fs.readFileSync(doc_path + file_list[fileNum], 'utf8');
        var words = text.replace(/[\n\r]/g, ' ').toLowerCase().split(splitPattern);
        // Iterate over all the words in the document
        words.forEach(function (word) {
            // Ignore words over 15 characters long as non-English
            if (word.length < 15 && word.length > 0) {
                // Atomically increment the count of times this word was seen
                wordCounts.faa(word, 1);
            }
        });
        // Accumulate some statistics: word and byte counters, print progress
        var nWords_read = stats.faa('nWords', words.length);
        var nBytes_read = stats.faa('nBytesRead', text.length);
        var now = Date.now();
        if (ems.myID === 0 && fileNum % 10 === 0) {
            console.log("Average Words/sec=" + Math.floor((1000 * nWords_read) / (now - startTime)) +
                " MBytes/sec=" + Math.floor((100000 * nBytes_read) / ((now - startTime) * (1 << 20))) / 100);
        }
    } catch (err) {
        ems.diag("This is not a text file: " + file_list[fileNum]);
    }
});
// ===============================================================================
// Output the Most Frequently Occurring Words
//
// First perform many sequential insertion sorts in parallel,
// then serially merge and sort all the partial lists.
//
// Start by printing the total amount of data processed during word counting
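// ems.master() runs its function on only one process; the others skip it.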
ems.master(function () {
    console.log("Totals: ", stats.read('nWords'), " words parsed, ",
        stats.read('nBytesRead'), "bytes read.");
    console.log("Starting sort....");
});
// Divide the array across all the processes, each process keeps track
// of the "local_sort_len" most frequent words it encounters.
var local_sort_len = Math.max(10, parseInt(process.argv[2], 10));
var biggest_counts = new Array(local_sort_len).fill({"key": 0, "count": 0});
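// Note: fill() places the same object reference in every slot, which is safe
// here because list entries are replaced wholesale, never mutated in place.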
ems.parForEach(0, maxNKeys, function (keyN) {
    if (keyN % (maxNKeys / 10) === 0) { ems.diag("Sorting key ID " + keyN); }
    var key = wordCounts.index2key(keyN);
    if (key) {
        // Perform an insertion sort of the new key into the biggest_counts
        // list, deleting the last (smallest) element to preserve length.
        var keyCount = wordCounts.read(key);
        var idx = local_sort_len - 1;
        while (idx >= 0 && biggest_counts[idx].count < keyCount) {
            idx -= 1;
        }
        var newBiggest = {"key": key, "count": keyCount};
        if (idx < 0) {
            // Larger than every entry: insert at the head, drop the smallest
            biggest_counts = [newBiggest].concat(biggest_counts.slice(0, biggest_counts.length - 1));
        } else if (idx < local_sort_len - 1) {
            // Insert into the middle of the list; counts that do not beat
            // the smallest retained entry are skipped entirely
            var left = biggest_counts.slice(0, idx + 1);
            var right = biggest_counts.slice(idx + 1);
            biggest_counts = left.concat([newBiggest].concat(right)).slice(0, -1);
        }
    }
});
// Concatenate all the partial (one per process) lists into one list
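// readFE() waits until the element is full, marks it empty, and returns the
// value; writeEF() waits until it is empty, writes, and marks it full. The
// pair forms a critical section, so only one process appends at a time.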
stats.writeXF('most_frequent', []);
ems.barrier();
stats.writeEF('most_frequent', stats.readFE('most_frequent').concat(biggest_counts));
ems.barrier();
// Sort & print the list of words; only one process is needed
ems.master(function () {
    biggest_counts = stats.readFF('most_frequent');
    // Sort the merged list by descending word frequency
    biggest_counts.sort(function (a, b) {
        return b.count - a.count;
    });
    // Print only the first "local_sort_len" items: in the worst case all of
    // the largest counts were found by a single process, so anything beyond
    // "local_sort_len" may be missing from the merged list.
    console.log("Most frequently appearing terms:");
    for (var index = 0; index < local_sort_len; index += 1) {
        console.log(index + ': ' + biggest_counts[index].key + " " + biggest_counts[index].count);
    }
});