kusamoji
Version:
Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy
165 lines (146 loc) • 4.71 kB
JavaScript
"use strict";
// UTF-8 codec helpers built on the platform TextEncoder/TextDecoder (Node 18+).
// They stand in for the ~100-line hand-written codec of the original codebase.
// Each codec object is created on first use and reused by every later call.
let _encoder = null;
let _decoder = null;

/**
 * Encode a string as UTF-8.
 * @param {string} str Text to encode
 * @returns {Uint8Array} The UTF-8 bytes of str
 */
const stringToUtf8Bytes = (str) => {
    if (_encoder === null) {
        _encoder = new TextEncoder();
    }
    return _encoder.encode(str);
};

/**
 * Decode UTF-8 bytes into a string.
 * @param {(Uint8Array|number[])} bytes Byte values to decode
 * @returns {string} The decoded text
 */
const utf8BytesToString = (bytes) => {
    if (_decoder === null) {
        _decoder = new TextDecoder();
    }
    // The input is copied into a fresh Uint8Array before decoding, so plain
    // arrays of byte values are accepted as well as typed arrays.
    const copy = new Uint8Array(bytes);
    return _decoder.decode(copy);
};
/**
 * Utilities to manipulate byte sequence.
 *
 * The instance exposes two fields: `buffer` (the backing Uint8Array) and
 * `position` (the cursor used by the sequential read/write methods, starting
 * at 0).
 *
 * @param {(number|Uint8Array|null|undefined)} arg Initial size of this buffer
 *   in bytes (number), or an existing buffer to wrap (Uint8Array). A wrapped
 *   Uint8Array is used as-is, without copying. When omitted or null, a fresh
 *   1 MiB buffer is allocated.
 * @throws {TypeError} If arg is neither null/undefined, a number, nor a Uint8Array
 * @constructor
 */
function ByteBuffer(arg) {
    if (arg instanceof Uint8Array) {
        // Wrap the caller's array directly; reads and writes go to its memory.
        this.buffer = arg;
        this.position = 0;
        return;
    }

    let initial_size;
    if (arg == null) {
        initial_size = 1024 * 1024;
    } else if (typeof arg === "number") {
        initial_size = arg;
    } else {
        // Throw a real Error (not a bare string) so callers get a stack trace;
        // the message text is unchanged.
        throw new TypeError(typeof arg + " is invalid parameter type for ByteBuffer constructor");
    }

    // arg is null/undefined or a number
    this.buffer = new Uint8Array(initial_size);
    this.position = 0;
}
/**
 * Report the capacity of the backing array. This is the length of
 * `this.buffer`, not the current cursor position.
 * @returns {number} Length of the backing Uint8Array in bytes
 */
ByteBuffer.prototype.size = function () {
    const { length } = this.buffer;
    return length;
};
/**
 * Grow the backing array to twice its current length, preserving the bytes
 * already stored. `this.buffer` is replaced by the new, larger array;
 * `this.position` is not touched.
 *
 * A zero-length buffer grows to 1 byte: plain doubling would leave it at
 * length 0, and a subsequent write would land out of bounds and be dropped
 * silently by the typed array.
 */
ByteBuffer.prototype.reallocate = function () {
    const grown = new Uint8Array(Math.max(1, this.buffer.length * 2));
    grown.set(this.buffer);
    this.buffer = grown;
};
/**
 * Trim the buffer down to the bytes in front of the cursor.
 *
 * `this.buffer` is replaced by a subarray covering [0, position); a subarray
 * is a view, so no bytes are copied.
 * @returns {Uint8Array} The trimmed buffer (the same object now stored in this.buffer)
 */
ByteBuffer.prototype.shrink = function () {
    const trimmed = this.buffer.subarray(0, this.position);
    this.buffer = trimmed;
    return trimmed;
};
/**
 * Append one byte at the cursor and advance the cursor by one.
 *
 * When the write would not fit, reallocate() is called once beforehand.
 * The value is stored into a Uint8Array, so only its low 8 bits are kept.
 * @param {number} b Byte value to write
 */
ByteBuffer.prototype.put = function (b) {
    const writeAt = this.position;
    if (writeAt + 1 > this.buffer.length) {
        this.reallocate();
    }
    // Read this.buffer only after the possible reallocation above.
    this.buffer[writeAt] = b;
    this.position = writeAt + 1;
};
/**
 * Read one byte.
 *
 * With no index, the byte at the cursor is read and the cursor advances by
 * one (it advances even when the read is past the end). With an explicit
 * index the cursor is left alone.
 * @param {number} [index] Absolute offset to read from; defaults to the cursor
 * @returns {number} The byte value, or 0 when the offset is at or past the end of the buffer
 */
ByteBuffer.prototype.get = function (index) {
    let offset = index;
    if (offset == null) {
        offset = this.position;
        this.position = offset + 1;
    }
    return (offset + 1 > this.buffer.length) ? 0 : this.buffer[offset];
};
/**
 * Write a 16-bit value at the cursor in little-endian byte order (low byte
 * first), advancing the cursor by two via put().
 *
 * Accepted range is -0x8000..0xFFFF: negative values are stored in two's
 * complement, so the same two bytes can later be read back as a signed short.
 * Values outside that range cannot be represented in 16 bits and are rejected
 * instead of being truncated silently.
 * @param {number} num Value to write
 * @throws {RangeError} If num is greater than 0xFFFF or less than -0x8000
 */
ByteBuffer.prototype.putShort = function (num) {
    if (0xFFFF < num) {
        throw new RangeError(num + " is over short value");
    }
    if (num < -0x8000) {
        throw new RangeError(num + " is under short value");
    }
    this.put(num & 0xFF);        // low byte
    this.put((num >> 8) & 0xFF); // high byte
};
/**
 * Read a signed 16-bit little-endian value.
 *
 * With no index, the value at the cursor is read and the cursor advances by
 * two (it advances even when the read is out of bounds). With an explicit
 * index the cursor is left alone.
 * @param {number} [index] Absolute offset to read from; defaults to the cursor
 * @returns {number} Value in -32768..32767, or 0 when fewer than two bytes
 *   remain at the offset (legacy behavior, safe for dict reads)
 */
ByteBuffer.prototype.getShort = function (index) {
    let offset = index;
    if (offset == null) {
        offset = this.position;
        this.position = offset + 2;
    }
    const bytes = this.buffer;
    if (bytes.length < offset + 2) {
        return 0;
    }
    const unsigned = (bytes[offset + 1] << 8) | bytes[offset];
    // Reinterpret the 16-bit pattern as two's complement.
    return unsigned >= 0x8000 ? unsigned - 0x10000 : unsigned;
};
/**
 * Write a 32-bit value at the cursor in little-endian byte order (lowest byte
 * first), advancing the cursor by four via put().
 *
 * Accepted range is -0x80000000..0xFFFFFFFF: negative values are stored in
 * two's complement. Values outside that range cannot be represented in 32
 * bits and are rejected instead of being truncated silently.
 * @param {number} num Value to write
 * @throws {RangeError} If num is greater than 0xFFFFFFFF or less than -0x80000000
 */
ByteBuffer.prototype.putInt = function (num) {
    if (0xFFFFFFFF < num) {
        throw new RangeError(num + " is over integer value");
    }
    if (num < -0x80000000) {
        throw new RangeError(num + " is under integer value");
    }
    // Unsigned shifts plus a mask keep every byte in 0..255, including the top
    // byte of values with bit 31 set (a signed shift would yield a negative
    // number there).
    this.put(num & 0xFF);
    this.put((num >>> 8) & 0xFF);
    this.put((num >>> 16) & 0xFF);
    this.put((num >>> 24) & 0xFF);
};
/**
 * Read an unsigned 32-bit little-endian value.
 *
 * With no index, the value at the cursor is read and the cursor advances by
 * four (it advances even when the read is out of bounds). With an explicit
 * index the cursor is left alone.
 * @param {number} [index] Absolute offset to read from; defaults to the cursor
 * @returns {number} Value in 0..4294967295, or 0 when fewer than four bytes
 *   remain at the offset
 */
ByteBuffer.prototype.getInt = function (index) {
    let offset = index;
    if (offset == null) {
        offset = this.position;
        this.position = offset + 4;
    }
    const bytes = this.buffer;
    if (offset + 4 > bytes.length) {
        return 0;
    }
    const assembled = (bytes[offset + 3] << 24)
        + (bytes[offset + 2] << 16)
        + (bytes[offset + 1] << 8)
        + bytes[offset];
    // The top byte shifted by 24 is a signed 32-bit quantity, so the sum goes
    // negative once bit 31 is set. Dict offsets (pos_id into tid_pos.dat) are
    // non-negative; `>>> 0` reinterprets the sum as unsigned so offsets at or
    // beyond 2^31 (2 GiB) do not come back negative and corrupt feature reads.
    return assembled >>> 0;
};
/**
 * Sequential 32-bit read: advance the cursor by four, then return what
 * getInt() yields for the cursor's previous offset.
 * @returns {number} The result of getInt() at the pre-advance cursor offset
 */
ByteBuffer.prototype.readInt = function () {
    const startedAt = this.position;
    this.position = startedAt + 4;
    return this.getInt(startedAt);
};
/**
 * Append a string at the cursor as UTF-8 bytes followed by a single 0 byte.
 *
 * Every byte goes through put(), so the cursor advances by the encoded
 * length plus one.
 * @param {string} str Text to write
 */
ByteBuffer.prototype.putString = function (str) {
    for (const byte of stringToUtf8Bytes(str)) {
        this.put(byte);
    }
    // The trailing null byte marks the end of the string.
    this.put(0);
};
/**
 * Read a null-terminated UTF-8 string.
 *
 * Reading starts at the given offset, or at the cursor when none is given,
 * and stops at the first 0 byte or at the end of the buffer. In both call
 * forms the cursor is then moved: to just past the terminator when one was
 * found, otherwise to the end of the buffer (never beyond it).
 * @param {number} [index] Absolute offset to start from; defaults to the cursor
 * @returns {string} The decoded text, without the terminator
 */
ByteBuffer.prototype.getString = function (index) {
    const bytes = this.buffer;
    const limit = bytes.length;
    const begin = (index == null) ? this.position : index;

    // Look for the terminator with direct element access, avoiding a method
    // call per byte.
    let end = begin;
    for (; end < limit; end++) {
        if (bytes[end] === 0) {
            break;
        }
    }

    this.position = (end < limit) ? end + 1 : limit;
    return utf8BytesToString(bytes.subarray(begin, end));
};
// CommonJS export: the ByteBuffer constructor is this module's only export.
module.exports = ByteBuffer;