UNPKG

doublearray

Version:

JavaScript implementation of Double-Array trie

792 lines (658 loc) 24.6 kB
// Copyright (c) 2014 Takuya Asano All Rights Reserved.

(function () {

    "use strict";

    var TERM_CHAR = "\u0000", // terminal character appended to every key
        TERM_CODE = 0,        // byte code of the terminal character
        ROOT_ID = 0,          // index of root node
        NOT_FOUND = -1,       // returned by traverse()/lookup() when no node matches
        BASE_SIGNED = true,
        CHECK_SIGNED = true,
        BASE_BYTES = 4,
        CHECK_BYTES = 4,
        MEMORY_EXPAND_RATIO = 2;


    /**
     * Create the BASE/CHECK storage used by the builder and by lookups.
     *
     * Unused nodes form a doubly linked list threaded through the arrays:
     * BASE of an unused node holds the negated index of the previous unused
     * node and CHECK holds the negated index of the next unused node.
     *
     * @param {Number} [initial_size=1024] Initial number of elements of both arrays
     * @return {Object} Accessor object over the BASE and CHECK arrays
     */
    var newBC = function (initial_size) {

        if (initial_size == null) {
            initial_size = 1024;
        }

        // Fill BASE in [start, end) with negated "previous unused node" links.
        var initBase = function (_base, start, end) {  // 'end' index is exclusive
            for (var i = start; i < end; i++) {
                _base[i] = - i + 1;  // inversed previous empty node index
            }
            if (0 < check.array[check.array.length - 1]) {
                // The last node of the current CHECK array is in use, so the
                // first new node must link back past the trailing used nodes.
                var last_used_id = check.array.length - 2;
                while (0 < check.array[last_used_id]) {
                    last_used_id--;
                }
                _base[start] = - last_used_id;
            }
        };

        // Fill CHECK in [start, end) with negated "next unused node" links.
        var initCheck = function (_check, start, end) {
            for (var i = start; i < end; i++) {
                _check[i] = - i - 1;  // inversed next empty node index
            }
        };

        // Grow both arrays to min_size * MEMORY_EXPAND_RATIO elements,
        // keeping existing content and initializing the new range as unused.
        var realloc = function (min_size) {
            var new_size = min_size * MEMORY_EXPAND_RATIO;

            var base_new_array = newArrayBuffer(base.signed, base.bytes, new_size);
            initBase(base_new_array, base.array.length, new_size);  // init BASE in new range
            base_new_array.set(base.array);
            base.array = null;  // explicit GC
            base.array = base_new_array;

            var check_new_array = newArrayBuffer(check.signed, check.bytes, new_size);
            initCheck(check_new_array, check.array.length, new_size);  // init CHECK in new range
            check_new_array.set(check.array);
            check.array = null;  // explicit GC
            check.array = check_new_array;
        };

        var first_unused_node = ROOT_ID + 1;

        var base = {
            signed: BASE_SIGNED,
            bytes: BASE_BYTES,
            array: newArrayBuffer(BASE_SIGNED, BASE_BYTES, initial_size)
        };

        var check = {
            signed: CHECK_SIGNED,
            bytes: CHECK_BYTES,
            array: newArrayBuffer(CHECK_SIGNED, CHECK_BYTES, initial_size)
        };

        // init root node
        base.array[ROOT_ID] = 1;
        check.array[ROOT_ID] = ROOT_ID;

        // init BASE
        initBase(base.array, ROOT_ID + 1, base.array.length);

        // init CHECK
        initCheck(check.array, ROOT_ID + 1, check.array.length);

        return {
            getBaseBuffer: function () {
                return base.array;
            },
            getCheckBuffer: function () {
                return check.array;
            },
            loadBaseBuffer: function (base_buffer) {
                base.array = base_buffer;
                return this;
            },
            loadCheckBuffer: function (check_buffer) {
                check.array = check_buffer;
                return this;
            },
            size: function () {
                return Math.max(base.array.length, check.array.length);
            },
            getBase: function (index) {
                if (base.array.length - 1 < index) {
                    // Out of range: answer as an untouched unused node would,
                    // without allocating.
                    return - index + 1;
                }
                return base.array[index];
            },
            getCheck: function (index) {
                if (check.array.length - 1 < index) {
                    // Out of range: answer as an untouched unused node would,
                    // without allocating.
                    return - index - 1;
                }
                return check.array[index];
            },
            setBase: function (index, base_value) {
                if (base.array.length - 1 < index) {
                    realloc(index);
                }
                base.array[index] = base_value;
            },
            setCheck: function (index, check_value) {
                if (check.array.length - 1 < index) {
                    realloc(index);
                }
                check.array[index] = check_value;
            },
            setFirstUnusedNode: function (index) {
                first_unused_node = index;
            },
            getFirstUnusedNode: function () {
                return first_unused_node;
            },
            /**
             * Drop the unused tail of both arrays, keeping one unused node
             * after the last used one.
             */
            shrink: function () {
                var last_index = this.size() - 1;
                // Bounded by 0 so that arrays without any non-negative CHECK
                // entry (e.g. empty loaded buffers) cannot loop forever.
                while (0 <= last_index && !(0 <= check.array[last_index])) {
                    last_index--;
                }
                base.array = base.array.subarray(0, last_index + 2);   // keep last unused node
                check.array = check.array.subarray(0, last_index + 2); // keep last unused node
            },
            /**
             * Count used/unused nodes (a node is unused when its CHECK is negative).
             *
             * @return {Object} { all, unused, efficiency }
             */
            calc: function () {
                var unused_count = 0;
                var size = check.array.length;
                for (var i = 0; i < size; i++) {
                    if (check.array[i] < 0) {
                        unused_count++;
                    }
                }
                return {
                    all: size,
                    unused: unused_count,
                    efficiency: (size - unused_count) / size
                };
            },
            /**
             * Log and return a textual dump of both arrays (for debugging).
             */
            dump: function () {
                var dump_base = "";
                var dump_check = "";

                var i;
                for (i = 0; i < base.array.length; i++) {
                    dump_base = dump_base + " " + this.getBase(i);
                }
                for (i = 0; i < check.array.length; i++) {
                    dump_check = dump_check + " " + this.getCheck(i);
                }

                console.log("base:" + dump_base);
                console.log("chck:" + dump_check);

                return "base:" + dump_base + " chck:" + dump_check;
            }
        };
    };


    /**
     * Factory method of double array
     *
     * @param {Number} [initial_size] Initial size of the BASE and CHECK arrays
     */
    function DoubleArrayBuilder(initial_size) {
        this.bc = newBC(initial_size);  // BASE and CHECK
        this.keys = [];
    }


    /**
     * Append a key to the set that build() compiles when called without keys.
     *
     * @param {String} key
     * @param {Number} record Integer value from 0 to max signed integer number - 1
     * @return {DoubleArrayBuilder} this, for chaining
     */
    DoubleArrayBuilder.prototype.append = function (key, record) {
        this.keys.push({ k: key, v: record });
        return this;
    };

    /**
     * Build double array for given keys
     *
     * @param {Array} [keys] Array of keys. A key is a Object which has properties 'k', 'v'.
     * 'k' is a key string, 'v' is a record assigned to that key.
     * Defaults to the keys collected by append().
     * @param {Boolean} [sorted=false] True if keys are already sorted by UTF-8 byte order
     * @return {DoubleArray} Compiled double array
     * @throws {TypeError} If a key contains a malformed surrogate pair
     */
    DoubleArrayBuilder.prototype.build = function (keys, sorted) {

        if (keys == null) {
            keys = this.keys;
        }

        // Nothing to insert: return an empty trie instead of crashing while
        // reading the (non-existent) first key.
        if (keys == null || keys.length === 0) {
            return new DoubleArray(this.bc);
        }

        if (sorted == null) {
            sorted = false;
        }

        // Convert key string to ArrayBuffer
        var buff_keys = keys.map(function (k) {
            var bytes = stringToUtf8Bytes(k.k + TERM_CHAR);
            if (bytes === null) {
                throw new TypeError("Invalid key (malformed surrogate pair): " + k.k);
            }
            return {
                k: bytes,
                v: k.v
            };
        });

        // Sort keys by byte order
        if (sorted) {
            this.keys = buff_keys;
        } else {
            this.keys = buff_keys.sort(function (k1, k2) {
                var b1 = k1.k;
                var b2 = k2.k;
                var min_length = Math.min(b1.length, b2.length);
                for (var pos = 0; pos < min_length; pos++) {
                    if (b1[pos] === b2[pos]) {
                        continue;
                    }
                    return b1[pos] - b2[pos];
                }
                return b1.length - b2.length;
            });
        }

        buff_keys = null;  // explicit GC

        this._build(ROOT_ID, 0, 0, this.keys.length);
        return new DoubleArray(this.bc);
    };

    /**
     * Append nodes to BASE and CHECK array recursively
     *
     * @param {Number} parent_index Node index whose children are being placed
     * @param {Number} position Byte position inside the keys handled at this level
     * @param {Number} start Index of the first key (in this.keys) under the parent
     * @param {Number} length Number of consecutive keys under the parent
     */
    DoubleArrayBuilder.prototype._build = function (parent_index, position, start, length) {

        var children_info = this.getChildrenInfo(position, start, length);
        var _base = this.findAllocatableBase(children_info);

        this.setBC(parent_index, children_info, _base);

        for (var i = 0; i < children_info.length; i = i + 3) {
            var child_code = children_info[i];
            if (child_code === TERM_CODE) {
                // Terminal child is a leaf; it has no children to place.
                continue;
            }
            var child_start = children_info[i + 1];
            var child_len = children_info[i + 2];
            var child_index = _base + child_code;
            this._build(child_index, position + 1, child_start, child_len);
        }
    };

    /**
     * Group keys [start, start + length) by their byte at `position`.
     *
     * @return {Int32Array} Flat triples (byte code, start index, key count),
     * one triple per distinct byte, in key order
     */
    DoubleArrayBuilder.prototype.getChildrenInfo = function (position, start, length) {
        var current_char = this.keys[start].k[position];
        var i = 0;
        var children_info = new Int32Array(length * 3);

        children_info[i++] = current_char;  // char (current)
        children_info[i++] = start;         // start index (current)

        var next_pos = start;
        var start_pos = start;
        for (; next_pos < start + length; next_pos++) {
            var next_char = this.keys[next_pos].k[position];
            if (current_char !== next_char) {
                children_info[i++] = next_pos - start_pos;  // length (current)

                children_info[i++] = next_char;  // char (next)
                children_info[i++] = next_pos;   // start index (next)
                current_char = next_char;
                start_pos = next_pos;
            }
        }
        children_info[i++] = next_pos - start_pos;
        children_info = children_info.subarray(0, i);

        return children_info;
    };

    /**
     * Write BASE of the parent and CHECK (and leaf BASE) of every child,
     * unlinking each child node from the list of unused nodes.
     *
     * @param {Number} parent_id Parent node index
     * @param {Int32Array} children_info Triples produced by getChildrenInfo()
     * @param {Number} _base BASE value chosen for the parent
     */
    DoubleArrayBuilder.prototype.setBC = function (parent_id, children_info, _base) {

        var bc = this.bc;

        bc.setBase(parent_id, _base);  // Update BASE of parent node

        var i;
        for (i = 0; i < children_info.length; i = i + 3) {
            var code = children_info[i];
            var child_id = _base + code;

            // Update linked list of unused nodes
            var prev_unused_id = - bc.getBase(child_id);
            var next_unused_id = - bc.getCheck(child_id);
            if (child_id !== bc.getFirstUnusedNode()) {
                bc.setCheck(prev_unused_id, - next_unused_id);
            } else {
                // Update first_unused_node
                bc.setFirstUnusedNode(next_unused_id);
            }
            bc.setBase(next_unused_id, - prev_unused_id);

            var check = parent_id;         // CHECK is parent node index
            bc.setCheck(child_id, check);  // Update CHECK of child node

            // Update record
            if (code === TERM_CODE) {
                var start_pos = children_info[i + 1];
                var value = this.keys[start_pos].v;

                if (value == null) {
                    value = 0;
                }

                var base = - value - 1;       // BASE is inverted record value
                bc.setBase(child_id, base);   // Update BASE of child(leaf) node
            }
        }
    };


    /**
     * Find BASE value that all children are allocatable in double array's region
     *
     * @param {Int32Array} children_info Triples produced by getChildrenInfo()
     * @return {Number} Non-negative BASE value
     */
    DoubleArrayBuilder.prototype.findAllocatableBase = function (children_info) {

        var bc = this.bc;

        // iterate linked list of unused nodes
        var _base;
        var curr = bc.getFirstUnusedNode();  // current index

        while (true) {

            _base = curr - children_info[0];

            if (_base < 0) {
                curr = - bc.getCheck(curr);  // next
                continue;
            }

            var empty_area_found = true;
            for (var i = 0; i < children_info.length; i = i + 3) {
                var code = children_info[i];
                var candidate_id = _base + code;

                if (!this.isUnusedNode(candidate_id)) {
                    // candidate_id is used node
                    // next
                    curr = - bc.getCheck(curr);
                    empty_area_found = false;
                    break;
                }
            }
            if (empty_area_found) {
                // Area is free
                return _base;
            }
        }
    };

    /**
     * Check this double array index is unused or not
     *
     * @param {Number} index Node index
     * @return {Boolean} True if the node is unused (negative CHECK and not root)
     */
    DoubleArrayBuilder.prototype.isUnusedNode = function (index) {
        var bc = this.bc;
        var check = bc.getCheck(index);

        if (index === ROOT_ID) {
            // root node
            return false;
        }
        if (check < 0) {
            // unused
            return true;
        }

        // used node (incl. leaf)
        return false;
    };


    /**
     * Factory method of double array
     *
     * @param {Object} bc BASE/CHECK accessor created by newBC(); it is shrunk in place
     */
    function DoubleArray(bc) {
        this.bc = bc;       // BASE and CHECK
        this.bc.shrink();
    }


    /**
     * Look up a given key in this trie
     *
     * @param {String} key
     * @return {Boolean} True if this trie contains a given key
     */
    DoubleArray.prototype.contain = function (key) {

        var bc = this.bc;

        key += TERM_CHAR;
        var buffer = stringToUtf8Bytes(key);
        if (buffer === null) {
            // A key that cannot be encoded cannot have been stored.
            return false;
        }

        var parent = ROOT_ID;
        var child = NOT_FOUND;

        for (var i = 0; i < buffer.length; i++) {
            var code = buffer[i];

            child = this.traverse(parent, code);
            if (child === NOT_FOUND) {
                return false;
            }

            // A leaf stores (- record - 1) in BASE, which is always negative.
            // BASE === 0 is a valid value for an internal node, so it must
            // not be mistaken for a leaf.
            if (bc.getBase(child) < 0) {
                // leaf node
                return true;
            } else {
                // not leaf
                parent = child;
                continue;
            }
        }
        return false;
    };


    /**
     * Look up a given key in this trie
     *
     * @param {String} key
     * @return {Number} Record value assigned to this key, -1 if this key does not contain
     */
    DoubleArray.prototype.lookup = function (key) {

        key += TERM_CHAR;
        var buffer = stringToUtf8Bytes(key);
        if (buffer === null) {
            // A key that cannot be encoded cannot have been stored.
            return NOT_FOUND;
        }

        var parent = ROOT_ID;
        var child = NOT_FOUND;

        for (var i = 0; i < buffer.length; i++) {
            var code = buffer[i];
            child = this.traverse(parent, code);
            if (child === NOT_FOUND) {
                return NOT_FOUND;
            }
            parent = child;
        }

        var base = this.bc.getBase(child);
        if (base < 0) {
            // leaf node
            return - base - 1;
        } else {
            // not leaf
            return NOT_FOUND;
        }
    };


    /**
     * Common prefix search
     *
     * @param {String} key
     * @return {Array} Each result object has 'k' and 'v' (key and record,
     * respectively) properties assigned to matched string
     */
    DoubleArray.prototype.commonPrefixSearch = function (key) {

        var result = [];

        var buffer = stringToUtf8Bytes(key);
        if (buffer === null) {
            // A key that cannot be encoded has no stored prefixes to report.
            return result;
        }

        var parent = ROOT_ID;
        var child = NOT_FOUND;

        for (var i = 0; i < buffer.length; i++) {
            var code = buffer[i];

            child = this.traverse(parent, code);

            if (child !== NOT_FOUND) {
                parent = child;

                // look forward by terminal character code to check this node is a leaf or not
                var grand_child = this.traverse(child, TERM_CODE);

                if (grand_child !== NOT_FOUND) {
                    var base = this.bc.getBase(grand_child);

                    var r = {};

                    if (base <= 0) {
                        // If child is a leaf node, add record to result
                        r.v = - base - 1;
                    }

                    // If child is a leaf node, add word to result
                    r.k = utf8BytesToString(arrayCopy(buffer, 0, i + 1));

                    result.push(r);
                }
                continue;
            } else {
                break;
            }
        }

        return result;
    };

    /**
     * Follow the transition labelled `code` from node `parent`.
     *
     * @param {Number} parent Parent node index
     * @param {Number} code Byte code of the transition
     * @return {Number} Child node index, or NOT_FOUND (-1)
     */
    DoubleArray.prototype.traverse = function (parent, code) {
        var child = this.bc.getBase(parent) + code;
        if (this.bc.getCheck(child) === parent) {
            return child;
        } else {
            return NOT_FOUND;
        }
    };

    DoubleArray.prototype.size = function () {
        return this.bc.size();
    };

    DoubleArray.prototype.calc = function () {
        return this.bc.calc();
    };

    DoubleArray.prototype.dump = function () {
        return this.bc.dump();
    };


    // Array utility functions

    /**
     * Allocate a zero-filled typed array.
     *
     * @param {Boolean} signed True for a signed integer array
     * @param {Number} bytes Element size in bytes (1, 2 or 4)
     * @param {Number} size Number of elements
     * @throws {RangeError} If bytes is not 1, 2 or 4
     */
    var newArrayBuffer = function (signed, bytes, size) {
        if (signed) {
            switch (bytes) {
                case 1:
                    return new Int8Array(size);
                case 2:
                    return new Int16Array(size);
                case 4:
                    return new Int32Array(size);
                default:
                    throw new RangeError("Invalid newArray parameter element_bytes:" + bytes);
            }
        } else {
            switch (bytes) {
                case 1:
                    return new Uint8Array(size);
                case 2:
                    return new Uint16Array(size);
                case 4:
                    return new Uint32Array(size);
                default:
                    throw new RangeError("Invalid newArray parameter element_bytes:" + bytes);
            }
        }
    };

    /**
     * Copy `length` bytes of `src`, starting at `src_offset`, into a new Uint8Array.
     *
     * @param {Uint8Array} src Source bytes
     * @param {Number} src_offset Index of the first byte to copy
     * @param {Number} length Number of bytes to copy
     * @return {Uint8Array} Independent copy of the requested range
     */
    var arrayCopy = function (src, src_offset, length) {
        var buffer = new ArrayBuffer(length);
        var dstU8 = new Uint8Array(buffer, 0, length);
        // subarray() takes an exclusive END index, not a length.
        var srcU8 = src.subarray(src_offset, src_offset + length);
        dstU8.set(srcU8);
        return dstU8;
    };


    /**
     * Convert String (UTF-16) to UTF-8 ArrayBuffer
     *
     * @param {String} str UTF-16 string to convert
     * @return {Uint8Array} Byte sequence encoded by UTF-8, or null if a high
     * surrogate is not followed by a low surrogate
     */
    var stringToUtf8Bytes = function (str) {

        // Max size of 1 character is 4 bytes
        var bytes = new Uint8Array(new ArrayBuffer(str.length * 4));

        var i = 0, j = 0;

        while (i < str.length) {
            var unicode_code;

            var utf16_code = str.charCodeAt(i++);
            if (utf16_code >= 0xD800 && utf16_code <= 0xDBFF) {
                // surrogate pair
                var upper = utf16_code;           // high surrogate
                var lower = str.charCodeAt(i++);  // low surrogate

                if (lower >= 0xDC00 && lower <= 0xDFFF) {
                    unicode_code =
                        (upper - 0xD800) * (1 << 10) + (1 << 16) +
                        (lower - 0xDC00);
                } else {
                    // malformed surrogate pair
                    return null;
                }
            } else {
                // not surrogate code
                unicode_code = utf16_code;
            }

            if (unicode_code < 0x80) {
                // 1-byte
                bytes[j++] = unicode_code;

            } else if (unicode_code < (1 << 11)) {
                // 2-byte
                bytes[j++] = (unicode_code >>> 6) | 0xC0;
                bytes[j++] = (unicode_code & 0x3F) | 0x80;

            } else if (unicode_code < (1 << 16)) {
                // 3-byte
                bytes[j++] = (unicode_code >>> 12) | 0xE0;
                bytes[j++] = ((unicode_code >> 6) & 0x3f) | 0x80;
                bytes[j++] = (unicode_code & 0x3F) | 0x80;

            } else if (unicode_code < (1 << 21)) {
                // 4-byte
                bytes[j++] = (unicode_code >>> 18) | 0xF0;
                bytes[j++] = ((unicode_code >> 12) & 0x3F) | 0x80;
                bytes[j++] = ((unicode_code >> 6) & 0x3F) | 0x80;
                bytes[j++] = (unicode_code & 0x3F) | 0x80;

            } else {
                // malformed UCS4 code
            }
        }

        return bytes.subarray(0, j);
    };


    /**
     * Convert UTF-8 ArrayBuffer to String (UTF-16)
     *
     * @param {Uint8Array} bytes UTF-8 byte sequence to convert
     * @return {String} String encoded by UTF-16
     */
    var utf8BytesToString = function (bytes) {

        var str = "";
        var code, b1, b2, b3, b4, upper, lower;
        var i = 0;

        while (i < bytes.length) {

            b1 = bytes[i++];

            if (b1 < 0x80) {
                // 1 byte
                code = b1;
            } else if ((b1 >> 5) === 0x06) {
                // 2 bytes
                b2 = bytes[i++];
                code = ((b1 & 0x1f) << 6) | (b2 & 0x3f);
            } else if ((b1 >> 4) === 0x0e) {
                // 3 bytes
                b2 = bytes[i++];
                b3 = bytes[i++];
                code = ((b1 & 0x0f) << 12) | ((b2 & 0x3f) << 6) | (b3 & 0x3f);
            } else {
                // 4 bytes
                b2 = bytes[i++];
                b3 = bytes[i++];
                b4 = bytes[i++];
                code = ((b1 & 0x07) << 18) | ((b2 & 0x3f) << 12) | ((b3 & 0x3f) << 6) | (b4 & 0x3f);
            }

            if (code < 0x10000) {
                str += String.fromCharCode(code);
            } else {
                // surrogate pair
                code -= 0x10000;
                upper = (0xD800 | (code >> 10));
                lower = (0xDC00 | (code & 0x3FF));
                str += String.fromCharCode(upper, lower);
            }
        }

        return str;
    };


    // public methods
    var doublearray = {
        /**
         * @param {Number} [initial_size] Initial size of the BASE and CHECK arrays
         * @return {DoubleArrayBuilder}
         */
        builder: function (initial_size) {
            return new DoubleArrayBuilder(initial_size);
        },
        /**
         * Wrap previously built BASE and CHECK buffers in a DoubleArray.
         *
         * @param {Int32Array} base_buffer BASE array
         * @param {Int32Array} check_buffer CHECK array
         * @return {DoubleArray}
         */
        load: function (base_buffer, check_buffer) {
            var bc = newBC(0);
            bc.loadBaseBuffer(base_buffer);
            bc.loadCheckBuffer(check_buffer);
            return new DoubleArray(bc);
        }
    };

    if ("undefined" === typeof module) {
        // No CommonJS module object: export on the global object.
        // `window` does not exist in Web Workers or ES-module runtimes, so
        // fall back to `self` / `globalThis` instead of throwing.
        var root = null;
        if ("undefined" !== typeof window) {
            root = window;
        } else if ("undefined" !== typeof self) {
            root = self;
        } else if ("undefined" !== typeof globalThis) {
            root = globalThis;
        }
        if (root === null) {
            throw new ReferenceError("doublearray: no global object available to export to");
        }
        root.doublearray = doublearray;
    } else {
        // In node
        module.exports = doublearray;
    }

})();