// kusamoji
// Version:
// Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy
// 80 lines (70 loc) • 2.81 kB
// JavaScript
;
let Tokenizer = require("./Tokenizer");
let DictionaryLoader = require("./loader/NodeDictionaryLoader");
let { selectPosSource } = require("./pos-source/index");
/**
 * TokenizerBuilder creates Tokenizer instances.
 * @param {Object} option Settings object holding key-value pairs.
 * @param {string} option.dicPath Directory containing the uncompressed .dat files (required).
 * @param {boolean} [option.lazyPos] If true, disk-back the POS feature blob via LRU cache.
 *   Reduces heap from ~1.3 GB to ~2 MB. Requires tid_pos.dat on disk.
 * @throws {Error} If `option` is missing or `option.dicPath` is null/undefined.
 * @constructor
 */
function TokenizerBuilder(option) {
  // Check the option object itself first, so a missing argument yields the
  // descriptive error below instead of a TypeError from the property access.
  if (option == null || option.dicPath == null) {
    throw new Error("dicPath is required — pass the path to the directory containing the .dat files");
  }
  this.dic_path = option.dicPath;
  // Normalise any truthy/falsy value to a strict boolean.
  this.lazy_pos = !!option.lazyPos;
}
/**
 * Load the dictionary and hand a Tokenizer to the caller
 * (callback style, kept for backward compatibility).
 * @param {TokenizerBuilder~onLoad} callback Called with `(err)` when the loader
 *   reports an error, otherwise with `(null, tokenizer)`.
 */
TokenizerBuilder.prototype.build = function (callback) {
  // Lazy POS is opted into via the constructor flag or KUSAMOJI_LAZY_POS=1.
  const lazyPosRequested = () =>
    this.lazy_pos || (process.env.KUSAMOJI_LAZY_POS || "").trim() === "1";

  // Reports whether the native mmap addon loaded; any failure counts as "no mmap".
  const mmapAvailable = () => {
    try {
      const { loadMmapAddon } = require("./native/loader");
      return Boolean(loadMmapAddon());
    } catch (e) {
      return false;
    }
  };

  // Best effort: a failure here is logged and the tokenizer is still delivered.
  const attachPosSource = (tokenizer) => {
    try {
      const source = selectPosSource({
        tokenInfoDictionary: tokenizer.token_info_dictionary,
        dictPath: this.dic_path,
      });
      if (!source) {
        return;
      }
      const dictionary = tokenizer.token_info_dictionary;
      dictionary._posSource = source;
      dictionary.pos_buffer = null; // free ~1.3 GB heap
    } catch (e) {
      console.warn("[kusamoji] POS source setup failed:", e.message);
    }
  };

  const dictionaryLoader = new DictionaryLoader(this.dic_path);
  dictionaryLoader.load((err, dic) => {
    if (err) {
      return callback(err);
    }
    const tokenizer = new Tokenizer(dic);
    // Skip the POS source entirely when mmap is active — the mmap'd pos_buffer
    // is already demand-paged by the OS with zero heap cost.
    if (lazyPosRequested() && !mmapAvailable()) {
      attachPosSource(tokenizer);
    }
    callback(null, tokenizer);
  });
};
/**
 * Promise-returning wrapper around `build`.
 * @returns {Promise<Tokenizer>} Resolves with the tokenizer passed to the
 *   `build` callback; rejects with the error passed to it.
 */
TokenizerBuilder.prototype.buildAsync = function () {
  const builder = this;
  return new Promise(function (resolve, reject) {
    builder.build(function (err, tokenizer) {
      if (!err) {
        resolve(tokenizer);
        return;
      }
      reject(err);
    });
  });
};
module.exports = TokenizerBuilder;