UNPKG

kuromoji

Version:

JavaScript implementation of Japanese morphological analyzer

290 lines (254 loc) 7.86 kB
/*
 * Copyright 2014 Takuya Asano
 * Copyright 2010-2014 Atilika Inc. and contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

"use strict";

// U+FFFD, emitted in place of an unpaired surrogate so the encoder always
// produces well-formed UTF-8.
var REPLACEMENT_CHARACTER = 0xFFFD;

/**
 * Convert a String (UTF-16) to a UTF-8 byte sequence.
 *
 * Unpaired surrogates (a high surrogate not followed by a low surrogate, or a
 * low surrogate on its own) are encoded as U+FFFD. The code unit following an
 * unpaired high surrogate is not consumed; it is encoded on its own.
 *
 * @param {String} str UTF-16 string to convert
 * @return {Uint8Array} Byte sequence encoded by UTF-8
 */
var stringToUtf8Bytes = function (str) {
    // A single UTF-16 code unit never needs more than 3 UTF-8 bytes: a BMP
    // character is at most 3 bytes, and a surrogate pair is 2 units -> 4 bytes.
    var bytes = new Uint8Array(str.length * 3);
    var i = 0, j = 0;

    while (i < str.length) {
        var unicode_code = str.charCodeAt(i++);

        if (unicode_code >= 0xD800 && unicode_code <= 0xDBFF) {
            // High surrogate: peek at the next unit without consuming it yet.
            var lower = i < str.length ? str.charCodeAt(i) : 0;
            if (lower >= 0xDC00 && lower <= 0xDFFF) {
                unicode_code = ((unicode_code - 0xD800) << 10) + 0x10000 + (lower - 0xDC00);
                i++;
            } else {
                unicode_code = REPLACEMENT_CHARACTER;
            }
        } else if (unicode_code >= 0xDC00 && unicode_code <= 0xDFFF) {
            // Low surrogate without a preceding high surrogate.
            unicode_code = REPLACEMENT_CHARACTER;
        }

        if (unicode_code < 0x80) {
            // 1-byte
            bytes[j++] = unicode_code;
        } else if (unicode_code < 0x800) {
            // 2-byte
            bytes[j++] = (unicode_code >>> 6) | 0xC0;
            bytes[j++] = (unicode_code & 0x3F) | 0x80;
        } else if (unicode_code < 0x10000) {
            // 3-byte
            bytes[j++] = (unicode_code >>> 12) | 0xE0;
            bytes[j++] = ((unicode_code >> 6) & 0x3F) | 0x80;
            bytes[j++] = (unicode_code & 0x3F) | 0x80;
        } else {
            // 4-byte (a surrogate pair decodes to at most U+10FFFF)
            bytes[j++] = (unicode_code >>> 18) | 0xF0;
            bytes[j++] = ((unicode_code >> 12) & 0x3F) | 0x80;
            bytes[j++] = ((unicode_code >> 6) & 0x3F) | 0x80;
            bytes[j++] = (unicode_code & 0x3F) | 0x80;
        }
    }

    return bytes.subarray(0, j);
};

/**
 * Convert a UTF-8 byte sequence to a String (UTF-16).
 *
 * The input is not validated: the lead byte alone selects the sequence length
 * (any lead byte that is not a 1-, 2- or 3-byte lead is treated as a 4-byte
 * lead), and bytes missing from a truncated sequence contribute zero bits.
 *
 * @param {(Array|Uint8Array)} bytes UTF-8 byte sequence to convert
 * @return {String} String encoded by UTF-16
 */
var utf8BytesToString = function (bytes) {
    var str = "";
    var code, b1, b2, b3, b4, upper, lower;
    var i = 0;

    while (i < bytes.length) {
        b1 = bytes[i++];

        if (b1 < 0x80) {
            // 1 byte
            code = b1;
        } else if ((b1 >> 5) === 0x06) {
            // 2 bytes
            b2 = bytes[i++];
            code = ((b1 & 0x1F) << 6) | (b2 & 0x3F);
        } else if ((b1 >> 4) === 0x0E) {
            // 3 bytes
            b2 = bytes[i++];
            b3 = bytes[i++];
            code = ((b1 & 0x0F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
        } else {
            // 4 bytes
            b2 = bytes[i++];
            b3 = bytes[i++];
            b4 = bytes[i++];
            code = ((b1 & 0x07) << 18) | ((b2 & 0x3F) << 12) | ((b3 & 0x3F) << 6) | (b4 & 0x3F);
        }

        if (code < 0x10000) {
            str += String.fromCharCode(code);
        } else {
            // Outside the BMP: split into a surrogate pair.
            code -= 0x10000;
            upper = (0xD800 | (code >> 10));
            lower = (0xDC00 | (code & 0x3FF));
            str += String.fromCharCode(upper, lower);
        }
    }

    return str;
};

/**
 * Utilities to manipulate byte sequence
 *
 * Holds a Uint8Array (`buffer`) and a cursor (`position`) that the
 * index-less read methods and all write methods advance.
 *
 * @param {(number|Uint8Array)} arg Initial size of this buffer (number), or
 *     buffer to set (Uint8Array). When omitted, 1 MiB is allocated.
 * @throws {TypeError} If arg is neither null/undefined, a number nor a Uint8Array
 * @constructor
 */
function ByteBuffer(arg) {
    var initial_size;
    if (arg == null) {
        initial_size = 1024 * 1024;
    } else if (typeof arg === "number") {
        initial_size = arg;
    } else if (arg instanceof Uint8Array) {
        // Wrap the given array directly (no copy).
        this.buffer = arg;
        this.position = 0;
        return;
    } else {
        throw new TypeError(typeof arg + " is invalid parameter type for ByteBuffer constructor");
    }
    // arg is null or number
    this.buffer = new Uint8Array(initial_size);
    this.position = 0;
}

/**
 * @return {number} Capacity of the underlying buffer in bytes
 */
ByteBuffer.prototype.size = function () {
    return this.buffer.length;
};

/**
 * Replace the underlying buffer with one twice as large, keeping its contents.
 * A zero-length buffer grows to 1 byte, since doubling zero would never grow.
 */
ByteBuffer.prototype.reallocate = function () {
    var new_array = new Uint8Array(Math.max(this.buffer.length * 2, 1));
    new_array.set(this.buffer);
    this.buffer = new_array;
};

/**
 * Truncate the underlying buffer to the bytes before the current position.
 *
 * @return {Uint8Array} The truncated buffer (a view, not a copy)
 */
ByteBuffer.prototype.shrink = function () {
    this.buffer = this.buffer.subarray(0, this.position);
    return this.buffer;
};

/**
 * Write one byte at the current position and advance it, growing the buffer
 * as needed.
 *
 * @param {number} b Byte to write
 */
ByteBuffer.prototype.put = function (b) {
    // Loop rather than grow once: the position may lie more than one doubling
    // past the end (reads advance it even when out of range).
    while (this.buffer.length < this.position + 1) {
        this.reallocate();
    }
    this.buffer[this.position++] = b;
};

/**
 * Read one byte.
 *
 * @param {number} [index] Offset to read from. When omitted, the current
 *     position is used and advanced by 1.
 * @return {number} The byte, or 0 if the offset is past the end of the buffer
 */
ByteBuffer.prototype.get = function (index) {
    if (index == null) {
        index = this.position;
        this.position += 1;
    }
    if (this.buffer.length < index + 1) {
        return 0;
    }
    return this.buffer[index];
};

/**
 * Write short to buffer by little endian
 *
 * Only the upper bound is checked; the low 16 bits of num are written.
 *
 * @param {number} num Value to write
 * @throws {RangeError} If num is greater than 0xFFFF
 */
ByteBuffer.prototype.putShort = function (num) {
    if (0xFFFF < num) {
        throw new RangeError(num + " is over short value");
    }
    this.put(num & 0xFF);
    this.put((num >> 8) & 0xFF);
};

/**
 * Read short from buffer by little endian, interpreted as signed 16-bit.
 *
 * @param {number} [index] Offset to read from. When omitted, the current
 *     position is used and advanced by 2.
 * @return {number} The value, or 0 if fewer than 2 bytes are available at the offset
 */
ByteBuffer.prototype.getShort = function (index) {
    if (index == null) {
        index = this.position;
        this.position += 2;
    }
    if (this.buffer.length < index + 2) {
        return 0;
    }
    var value = (this.buffer[index + 1] << 8) | this.buffer[index];
    // Sign-extend from 16 bits.
    return (value & 0x8000) ? value - 0x10000 : value;
};

/**
 * Write integer to buffer by little endian
 *
 * Only the upper bound is checked; the low 32 bits of num are written.
 *
 * @param {number} num Value to write
 * @throws {RangeError} If num is greater than 0xFFFFFFFF
 */
ByteBuffer.prototype.putInt = function (num) {
    if (0xFFFFFFFF < num) {
        throw new RangeError(num + " is over integer value");
    }
    this.put(num & 0xFF);
    this.put((num >>> 8) & 0xFF);
    this.put((num >>> 16) & 0xFF);
    this.put((num >>> 24) & 0xFF);
};

/**
 * Read integer from buffer by little endian, interpreted as signed 32-bit.
 *
 * @param {number} [index] Offset to read from. When omitted, the current
 *     position is used and advanced by 4.
 * @return {number} The value, or 0 if fewer than 4 bytes are available at the offset
 */
ByteBuffer.prototype.getInt = function (index) {
    if (index == null) {
        index = this.position;
        this.position += 4;
    }
    if (this.buffer.length < index + 4) {
        return 0;
    }
    return (this.buffer[index + 3] << 24) |
        (this.buffer[index + 2] << 16) |
        (this.buffer[index + 1] << 8) |
        this.buffer[index];
};

/**
 * Read a little-endian signed 32-bit integer at the current position and
 * advance the position by 4.
 *
 * @return {number} The value, or 0 if fewer than 4 bytes are available
 */
ByteBuffer.prototype.readInt = function () {
    var pos = this.position;
    this.position += 4;
    return this.getInt(pos);
};

/**
 * Write a string as UTF-8 followed by a null terminator byte.
 *
 * @param {String} str String to write
 */
ByteBuffer.prototype.putString = function (str) {
    var bytes = stringToUtf8Bytes(str);
    for (var i = 0; i < bytes.length; i++) {
        this.put(bytes[i]);
    }
    // put null character as terminal character
    this.put(0);
};

/**
 * Read a null-terminated UTF-8 string.
 *
 * Reading stops at the first zero byte or at the end of the buffer. The
 * position is then set just past the terminator (or to the end of the buffer
 * when no terminator was found), whether or not index was given.
 *
 * @param {number} [index] Offset to start from. When omitted, the current
 *     position is used.
 * @return {String} The decoded string
 */
ByteBuffer.prototype.getString = function (index) {
    var start = (index == null) ? this.position : index;
    var end = start;
    while (end < this.buffer.length && this.buffer[end] !== 0) {
        end++;
    }
    // Decode straight from a view of the buffer instead of copying each byte.
    var str = utf8BytesToString(this.buffer.subarray(start, end));
    this.position = (end < this.buffer.length) ? end + 1 : end;
    return str;
};

// Guarded so the file can also be evaluated where no CommonJS `module` object
// exists; under CommonJS the export is unchanged.
if (typeof module !== "undefined") {
    module.exports = ByteBuffer;
}