kusamoji
Version:
Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy
174 lines (149 loc) • 5.77 kB
JavaScript
"use strict";
/**
* LazyDiskPosSource — production implementation that disk-backs the
* ~1.3 GB tid_pos blob via a persistent fd + LRU cache.
*
* Reads from uncompressed tid_pos.dat using pread-style readSync.
* Peak memory = ~2 MB (LRU cache) instead of 1.3 GB in heap.
*/
let crypto = require("node:crypto");
let fs = require("node:fs");
let path = require("node:path");
/** Number of bytes requested per positioned read while looking for a string's NUL terminator. */
const CHUNK_BYTES = 256;

/** LRU entry limit used when KUSAMOJI_LAZY_POS_CACHE is unset or does not parse to a positive number. */
const DEFAULT_CACHE_CAPACITY = 20000;

/**
 * Resolve the LRU capacity from the KUSAMOJI_LAZY_POS_CACHE environment variable.
 *
 * The value is parsed as a base-10 integer prefix (so "12abc" yields 12).
 * Anything empty, non-numeric, non-finite, zero or negative falls back to
 * DEFAULT_CACHE_CAPACITY.
 *
 * @returns {number} positive cache capacity (number of entries)
 */
function getCacheCapacity() {
  const raw = process.env.KUSAMOJI_LAZY_POS_CACHE;
  const parsed = raw ? Number.parseInt(raw, 10) : NaN;
  const usable = Number.isFinite(parsed) && parsed > 0;
  return usable ? parsed : DEFAULT_CACHE_CAPACITY;
}
/**
 * POS-feature source that reads NUL-terminated strings from tid_pos.dat on
 * demand through one long-lived file descriptor, keeping recently used
 * strings in a bounded LRU (a Map in insertion order, keyed by byte offset).
 */
class LazyDiskPosSource {
  /**
   * Open tid_pos.dat and, if a `.sha256` sidecar sits next to it, verify the
   * file's digest before accepting it.
   *
   * @param {object} options
   * @param {object} options.tokenInfoDictionary - object exposing `dictionary.getInt(offset)`
   * @param {string} options.dictPath - directory that contains tid_pos.dat
   * @throws {Error} when tid_pos.dat is missing, the sidecar is malformed,
   *   or the computed SHA-256 differs from the sidecar's digest
   */
  constructor({ tokenInfoDictionary, dictPath }) {
    this._dict = tokenInfoDictionary;
    this._path = path.join(dictPath, "tid_pos.dat");
    this._fd = -1; // sentinel: no descriptor is currently open
    if (!fs.existsSync(this._path)) {
      throw new Error(
        "LazyDiskPosSource: " + this._path + " not found.\n" +
        "KUSAMOJI_LAZY_POS=1 was set, but the uncompressed companion file is missing."
      );
    }
    this._fd = fs.openSync(this._path, "r");
    // From here on the descriptor is open: any failure must release it
    // before the error propagates, otherwise the fd leaks.
    try {
      // Take the size from the descriptor we actually opened rather than
      // from the path, so a file swapped between stat and open cannot
      // leave _size describing a different file.
      this._size = fs.fstatSync(this._fd).size;
      this._verifiedMs = this._verifySidecar();
    } catch (e) {
      this.close();
      throw e;
    }
    this._cache = new Map();
    this._cacheCap = getCacheCapacity();
    this._buf = Buffer.allocUnsafe(CHUNK_BYTES);
    this._calls = 0;
    this._hits = 0;
    this._misses = 0;
    this._extends = 0;
  }

  /**
   * Optional integrity check against `<file>.sha256`.
   *
   * Only the first whitespace-delimited token of the sidecar is used, and
   * the comparison is case-insensitive on the sidecar side.
   *
   * @returns {number} milliseconds spent hashing, or -1 when no sidecar exists
   * @throws {Error} on a malformed sidecar or a digest mismatch
   */
  _verifySidecar() {
    const sidecarPath = this._path + ".sha256";
    if (!fs.existsSync(sidecarPath)) return -1;
    const expected = fs.readFileSync(sidecarPath, "utf8").trim().split(/\s+/)[0];
    if (!/^[0-9a-f]{64}$/i.test(expected)) {
      throw new Error("LazyDiskPosSource: " + sidecarPath + " does not contain a valid SHA-256 hex digest");
    }
    const t0 = Date.now();
    const actual = this._streamHash();
    if (actual !== expected.toLowerCase()) {
      throw new Error("LazyDiskPosSource: SHA-256 mismatch for " + this._path);
    }
    return Date.now() - t0;
  }

  /**
   * Return the feature string for a token.
   *
   * The integer stored at `token_info_id + 6` in the token-info dictionary is
   * used as the byte offset of the string inside tid_pos.dat.
   *
   * @param {number} token_info_id - offset of the token record in the dictionary
   * @returns {string} the decoded UTF-8 string ("" when the offset is outside the file)
   * @throws {Error} on a cache miss after close() has been called
   */
  getFeaturesById(token_info_id) {
    this._calls++;
    const pos_id = this._dict.dictionary.getInt(token_info_id + 6);
    const cached = this._cache.get(pos_id);
    if (cached !== undefined) {
      this._hits++;
      // Re-insert so the entry becomes the most recently used one.
      this._cache.delete(pos_id);
      this._cache.set(pos_id, cached);
      return cached;
    }
    this._misses++;
    const result = this._readStringAt(pos_id);
    this._cache.set(pos_id, result);
    if (this._cache.size > this._cacheCap) {
      // Map iterates in insertion order, so the first key is the oldest.
      this._cache.delete(this._cache.keys().next().value);
    }
    return result;
  }

  /**
   * Read the NUL-terminated UTF-8 string that starts at `offset`.
   *
   * The first CHUNK_BYTES are read into the shared scratch buffer. If no
   * terminator is found there, further chunks are read until one is found
   * or the end of the file is reached.
   *
   * @param {number} offset - byte offset into tid_pos.dat
   * @returns {string} decoded string, or "" for offsets outside the file
   * @throws {Error} if the source has been closed
   */
  _readStringAt(offset) {
    if (this._fd < 0) {
      throw new Error("LazyDiskPosSource: " + this._path + " is closed");
    }
    // A position of -1 tells fs.readSync to read from the current file
    // position, which would return unrelated bytes; treat negative offsets
    // like any other out-of-range offset.
    if (offset < 0) return "";
    const want = Math.min(CHUNK_BYTES, this._size - offset);
    if (want <= 0) return "";
    const bytesRead = fs.readSync(this._fd, this._buf, 0, want, offset);
    if (bytesRead <= 0) return "";
    // Search only the bytes just read; the rest of the scratch buffer holds
    // leftovers from earlier calls.
    const nullAt = this._buf.subarray(0, bytesRead).indexOf(0);
    if (nullAt >= 0) return this._buf.toString("utf8", 0, nullAt);

    this._extends++;
    // Copy the first chunk out of the shared scratch buffer, then gather the
    // remaining chunks and concatenate once at the end.
    const pieces = [Buffer.from(this._buf.subarray(0, bytesRead))];
    let cursor = offset + bytesRead;
    while (cursor < this._size) {
      const wantMore = Math.min(CHUNK_BYTES, this._size - cursor);
      const more = Buffer.allocUnsafe(wantMore);
      const got = fs.readSync(this._fd, more, 0, wantMore, cursor);
      if (got <= 0) break;
      const end = more.subarray(0, got).indexOf(0);
      if (end >= 0) {
        pieces.push(more.subarray(0, end));
        break;
      }
      pieces.push(more.subarray(0, got));
      cursor += got;
    }
    // Decode after joining so multi-byte characters split across chunk
    // boundaries are decoded intact.
    return Buffer.concat(pieces).toString("utf8");
  }

  /**
   * Compute the SHA-256 of the open file in 1 MiB positioned reads.
   *
   * @returns {string} lowercase hex digest of the bytes read
   */
  _streamHash() {
    const HASH_CHUNK = 1 << 20;
    const buf = Buffer.allocUnsafe(HASH_CHUNK);
    const hash = crypto.createHash("sha256");
    let pos = 0;
    while (pos < this._size) {
      const want = Math.min(HASH_CHUNK, this._size - pos);
      const got = fs.readSync(this._fd, buf, 0, want, pos);
      if (got <= 0) break;
      hash.update(buf.subarray(0, got));
      pos += got;
    }
    return hash.digest("hex");
  }

  /**
   * Snapshot of the lookup counters and file details.
   *
   * @returns {object} counters (`calls`, `hits`, `misses`, `extends`,
   *   `hitRate`), cache occupancy, file path/size and `verifiedMs`
   *   (-1 when no sidecar was checked)
   */
  stats() {
    const total = this._calls;
    return {
      kind: "lazy-disk",
      calls: total,
      hits: this._hits,
      misses: this._misses,
      extends: this._extends,
      hitRate: total > 0 ? this._hits / total : 0,
      cacheSize: this._cache.size,
      cacheCap: this._cacheCap,
      filePath: this._path,
      fileSize: this._size,
      verifiedMs: this._verifiedMs,
    };
  }

  /**
   * Release the file descriptor. Safe to call more than once.
   *
   * The descriptor is forgotten before it is closed so that a repeated
   * close() can never close a descriptor number the OS has since handed to
   * some other open file.
   */
  close() {
    if (this._fd < 0) return;
    const fd = this._fd;
    this._fd = -1;
    try { fs.closeSync(fd); } catch (e) { /* best-effort: nothing useful to do if close fails */ }
  }
}
module.exports = { LazyDiskPosSource };