UNPKG

kusamoji

Version:

Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy

174 lines (149 loc) 5.77 kB
"use strict"; /** * LazyDiskPosSource — production implementation that disk-backs the * ~1.3 GB tid_pos blob via a persistent fd + LRU cache. * * Reads from uncompressed tid_pos.dat using pread-style readSync. * Peak memory = ~2 MB (LRU cache) instead of 1.3 GB in heap. */ let crypto = require("node:crypto"); let fs = require("node:fs"); let path = require("node:path"); const CHUNK_BYTES = 256; const DEFAULT_CACHE_CAPACITY = 20000; function getCacheCapacity() { let env = process.env.KUSAMOJI_LAZY_POS_CACHE; if (!env) return DEFAULT_CACHE_CAPACITY; let n = parseInt(env, 10); if (!Number.isFinite(n) || n <= 0) return DEFAULT_CACHE_CAPACITY; return n; } class LazyDiskPosSource { constructor({ tokenInfoDictionary, dictPath }) { this._dict = tokenInfoDictionary; this._path = path.join(dictPath, "tid_pos.dat"); this._fd = -1; // sentinel — ensures close() is safe even if constructor throws if (!fs.existsSync(this._path)) { throw new Error( "LazyDiskPosSource: " + this._path + " not found.\n" + "KUSAMOJI_LAZY_POS=1 was set, but the uncompressed companion file is missing." ); } let stat = fs.statSync(this._path); this._size = stat.size; this._fd = fs.openSync(this._path, "r"); // Optional SHA-256 verification — wrapped in try-finally to close // the fd if verification fails (prevents fd leak on throw) try { let sidecarPath = this._path + ".sha256"; if (fs.existsSync(sidecarPath)) { let expected = fs.readFileSync(sidecarPath, "utf8").trim().split(/\s+/)[0]; if (!/^[0-9a-f]{64}$/i.test(expected)) { throw new Error("LazyDiskPosSource: " + sidecarPath + " does not contain a valid SHA-256 hex digest"); } let t0 = Date.now(); let actual = this._streamHash(); if (actual !== expected.toLowerCase()) { throw new Error("LazyDiskPosSource: SHA-256 mismatch for " + this._path); } this._verifiedMs = Date.now() - t0; } else { this._verifiedMs = -1; } } catch (e) { // Close the fd before re-throwing to prevent leak try { fs.closeSync(this._fd); } catch (_) { /* ignore */ } this._fd = -1; throw e; } this._cache = new Map(); this._cacheCap = getCacheCapacity(); this._buf = Buffer.allocUnsafe(CHUNK_BYTES); this._calls = 0; this._hits = 0; this._misses = 0; this._extends = 0; } getFeaturesById(token_info_id) { this._calls++; let pos_id = this._dict.dictionary.getInt(token_info_id + 6); let cached = this._cache.get(pos_id); if (cached !== undefined) { this._hits++; this._cache.delete(pos_id); this._cache.set(pos_id, cached); return cached; } this._misses++; let result = this._readStringAt(pos_id); this._cache.set(pos_id, result); if (this._cache.size > this._cacheCap) { this._cache.delete(this._cache.keys().next().value); } return result; } _readStringAt(offset) { let want = Math.min(CHUNK_BYTES, this._size - offset); if (want <= 0) return ""; let bytesRead = fs.readSync(this._fd, this._buf, 0, want, offset); if (bytesRead <= 0) return ""; let nullAt = -1; for (let i = 0; i < bytesRead; i++) { if (this._buf[i] === 0) { nullAt = i; break; } } if (nullAt >= 0) return this._buf.toString("utf8", 0, nullAt); this._extends++; let collected = Buffer.from(this._buf.subarray(0, bytesRead)); let cursor = offset + bytesRead; while (cursor < this._size) { let want2 = Math.min(CHUNK_BYTES, this._size - cursor); let more = Buffer.allocUnsafe(want2); let got = fs.readSync(this._fd, more, 0, want2, cursor); if (got <= 0) break; let nullAt2 = -1; for (let i = 0; i < got; i++) { if (more[i] === 0) { nullAt2 = i; break; } } if (nullAt2 >= 0) { return Buffer.concat([collected, more.subarray(0, nullAt2)]).toString("utf8"); } collected = Buffer.concat([collected, more.subarray(0, got)]); cursor += got; } return collected.toString("utf8"); } _streamHash() { let HASH_CHUNK = 1 << 20; let buf = Buffer.allocUnsafe(HASH_CHUNK); let hash = crypto.createHash("sha256"); let pos = 0; while (pos < this._size) { let want = Math.min(HASH_CHUNK, this._size - pos); let got = fs.readSync(this._fd, buf, 0, want, pos); if (got <= 0) break; hash.update(buf.subarray(0, got)); pos += got; } return hash.digest("hex"); } stats() { let total = this._calls; return { kind: "lazy-disk", calls: total, hits: this._hits, misses: this._misses, extends: this._extends, hitRate: total > 0 ? this._hits / total : 0, cacheSize: this._cache.size, cacheCap: this._cacheCap, filePath: this._path, fileSize: this._size, verifiedMs: this._verifiedMs, }; } close() { try { fs.closeSync(this._fd); } catch (e) { /* ignore */ } } } module.exports = { LazyDiskPosSource };