UNPKG

kusamoji

Version:

Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy

162 lines (143 loc) 4.48 kB
"use strict";

// UTF-8 encode/decode using native TextEncoder/TextDecoder (Node 18+).
// Replaces the 100-line hand-rolled codec from the original codebase.
// Both codec objects are created lazily on first use and then reused.
let _encoder = null;
let _decoder = null;

/**
 * Encode a string as UTF-8.
 * @param {string} str String to encode
 * @returns {Uint8Array} UTF-8 bytes of str
 */
const stringToUtf8Bytes = function (str) {
    if (!_encoder) {
        _encoder = new TextEncoder();
    }
    return _encoder.encode(str);
};

/**
 * Decode UTF-8 bytes into a string.
 * @param {(Uint8Array|Array<number>)} bytes Bytes to decode
 * @returns {string} Decoded string
 */
const utf8BytesToString = function (bytes) {
    if (!_decoder) {
        _decoder = new TextDecoder();
    }
    // The input is copied into a fresh Uint8Array before it is decoded.
    return _decoder.decode(new Uint8Array(bytes));
};

/**
 * Utilities to manipulate byte sequence
 *
 * Holds a Uint8Array (`buffer`) and a cursor (`position`) that sequential
 * reads and writes advance.
 *
 * @param {(number|Uint8Array)} arg Initial size of this buffer (number), or buffer to set (Uint8Array).
 *     When omitted (null/undefined) a 1 MiB buffer is allocated.
 * @throws {string} If arg is neither null/undefined, a number, nor a Uint8Array
 *     (a plain string is thrown, not an Error; kept for compatibility).
 * @constructor
 */
function ByteBuffer(arg) {
    let initial_size;
    if (arg == null) {
        initial_size = 1024 * 1024;
    } else if (typeof arg === "number") {
        initial_size = arg;
    } else if (arg instanceof Uint8Array) {
        // Wrap the caller's array directly (no copy)
        this.buffer = arg;
        this.position = 0;
        return;
    } else {
        // typeof arg -> String
        throw typeof arg + " is invalid parameter type for ByteBuffer constructor";
    }
    // arg is null or number
    this.buffer = new Uint8Array(initial_size);
    this.position = 0;
}

/**
 * @returns {number} Current capacity of the underlying array in bytes
 */
ByteBuffer.prototype.size = function () {
    return this.buffer.length;
};

/**
 * Replace the underlying array with a larger one, keeping the existing bytes.
 * The capacity is doubled, and is never less than 1 byte or than min_size.
 * Doubling alone is not enough for a zero-length buffer (0 * 2 === 0), in
 * which case every later write would be silently discarded.
 *
 * @param {number} [min_size] Minimum capacity required after growing
 */
ByteBuffer.prototype.reallocate = function (min_size) {
    const required = (typeof min_size === "number" && min_size > 0) ? min_size : 1;
    const new_array = new Uint8Array(Math.max(this.buffer.length * 2, required));
    new_array.set(this.buffer);
    this.buffer = new_array;
};

/**
 * Trim the buffer to the bytes before the current position.
 * The result is a subarray view (no copy) and also becomes this.buffer.
 *
 * @returns {Uint8Array} The trimmed buffer
 */
ByteBuffer.prototype.shrink = function () {
    this.buffer = this.buffer.subarray(0, this.position);
    return this.buffer;
};

/**
 * Write one byte at the current position and advance the position by 1,
 * growing the buffer first when the position lies at or beyond its end.
 *
 * @param {number} b Byte value to store
 */
ByteBuffer.prototype.put = function (b) {
    const needed = this.position + 1;
    if (this.buffer.length < needed) {
        // Ask for enough room to cover the position: reads may have moved it
        // further past the end than a single doubling would reach.
        this.reallocate(needed);
    }
    this.buffer[this.position++] = b;
};

/**
 * Read one byte.
 *
 * @param {number} [index] Offset to read from. When omitted, reads at the
 *     current position and advances it by 1.
 * @returns {number} The byte, or 0 when the offset is out of bounds
 */
ByteBuffer.prototype.get = function (index) {
    if (index == null) {
        index = this.position;
        this.position += 1;
    }
    if (this.buffer.length < index + 1) {
        return 0;
    }
    return this.buffer[index];
};

/**
 * Write short to buffer by little endian
 *
 * @param {number} num Value to write; only its low 16 bits are stored
 * @throws {string} If num is greater than 0xFFFF
 */
ByteBuffer.prototype.putShort = function (num) {
    if (0xFFFF < num) {
        throw num + " is over short value";
    }
    const lower = (0x00FF & num);
    const upper = (0xFF00 & num) >> 8;
    this.put(lower);
    this.put(upper);
};

/**
 * Read signed short from buffer by little endian
 *
 * @param {number} [index] Offset to read from. When omitted, reads at the
 *     current position and advances it by 2.
 * @returns {number} Signed 16-bit value, or 0 when the read would run past the end
 */
ByteBuffer.prototype.getShort = function (index) {
    if (index == null) {
        index = this.position;
        this.position += 2;
    }
    if (index + 2 > this.buffer.length) {
        return 0; // out-of-bounds: return 0 (legacy behavior, safe for dict reads)
    }
    const value = this.buffer[index] | (this.buffer[index + 1] << 8);
    return (value << 16) >> 16; // sign-extend to 32-bit signed
};

/**
 * Write integer to buffer by little endian
 *
 * @param {number} num Value to write; only its low 32 bits are stored
 * @throws {string} If num is greater than 0xFFFFFFFF
 */
ByteBuffer.prototype.putInt = function (num) {
    if (0xFFFFFFFF < num) {
        throw num + " is over integer value";
    }
    const b0 = (0x000000FF & num);
    const b1 = (0x0000FF00 & num) >> 8;
    const b2 = (0x00FF0000 & num) >> 16;
    // May be negative when the top bit is set; the Uint8Array store keeps
    // only the low 8 bits, so the right byte is written either way.
    const b3 = (0xFF000000 & num) >> 24;
    this.put(b0);
    this.put(b1);
    this.put(b2);
    this.put(b3);
};

/**
 * Read integer from buffer by little endian
 *
 * @param {number} [index] Offset to read from. When omitted, reads at the
 *     current position and advances it by 4.
 * @returns {number} Signed 32-bit value, or 0 when the read would run past the end
 */
ByteBuffer.prototype.getInt = function (index) {
    if (index == null) {
        index = this.position;
        this.position += 4;
    }
    if (this.buffer.length < index + 4) {
        return 0;
    }
    const b0 = this.buffer[index];
    const b1 = this.buffer[index + 1];
    const b2 = this.buffer[index + 2];
    const b3 = this.buffer[index + 3];
    // b3 << 24 is negative when the top bit is set, so the sum is signed
    return (b3 << 24) + (b2 << 16) + (b1 << 8) + b0;
};

/**
 * Read a little-endian integer at the current position and advance it by 4.
 *
 * @returns {number} Signed 32-bit value, or 0 when the read would run past the end
 */
ByteBuffer.prototype.readInt = function () {
    const pos = this.position;
    this.position += 4;
    return this.getInt(pos);
};

/**
 * Write a string as UTF-8 bytes followed by a single 0 byte.
 *
 * @param {string} str String to write
 */
ByteBuffer.prototype.putString = function (str) {
    const bytes = stringToUtf8Bytes(str);
    for (let i = 0; i < bytes.length; i++) {
        this.put(bytes[i]);
    }
    // put null character as terminal character
    this.put(0);
};

/**
 * Read a null-terminated UTF-8 string. The position is always updated, also
 * when an explicit index is given: it ends just after the terminator, or at
 * the end of the buffer when no terminator was found.
 *
 * @param {number} [index] Offset to start at; defaults to the current position
 * @returns {string} Decoded string, without the terminator
 */
ByteBuffer.prototype.getString = function (index) {
    if (index == null) {
        index = this.position;
    }
    const start = index;
    const bufLen = this.buffer.length;
    // Scan for null terminator directly on the buffer (no method call per byte)
    while (index < bufLen && this.buffer[index] !== 0) {
        index++;
    }
    // If no null found, position stays at EOF (not past it)
    this.position = (index < bufLen) ? index + 1 : bufLen;
    return utf8BytesToString(this.buffer.subarray(start, index));
};

// CommonJS export. The typeof guard is always true under CommonJS; it only
// keeps the file from throwing a ReferenceError when it is evaluated in a
// context that has no `module` binding (e.g. as an ES module).
if (typeof module !== "undefined") {
    module.exports = ByteBuffer;
}