UNPKG

kusamoji

Version:

Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy

80 lines (70 loc) 2.81 kB
"use strict";

const Tokenizer = require("./Tokenizer");
const DictionaryLoader = require("./loader/NodeDictionaryLoader");
const { selectPosSource } = require("./pos-source/index");

/**
 * TokenizerBuilder creates Tokenizer instances.
 *
 * @param {Object} option Settings object (required).
 * @param {string} option.dicPath Directory containing the uncompressed .dat files (required).
 * @param {boolean} [option.lazyPos] If true, disk-back the POS feature blob via LRU cache.
 *                                   Reduces heap from ~1.3 GB to ~2 MB. Requires tid_pos.dat on disk.
 * @throws {Error} If `option` is missing or `option.dicPath` is null/undefined.
 * @constructor
 */
function TokenizerBuilder(option) {
  // Guard `option` itself so a missing argument yields the descriptive error
  // below rather than a TypeError from reading a property of undefined.
  if (option == null || option.dicPath == null) {
    throw new Error("dicPath is required — pass the path to the directory containing the .dat files");
  }
  this.dic_path = option.dicPath;
  this.lazy_pos = !!option.lazyPos;
}

/**
 * Decide whether lazy POS loading was requested, either through the builder
 * option or through the KUSAMOJI_LAZY_POS environment variable (value "1").
 *
 * @param {TokenizerBuilder} builder Builder whose `lazy_pos` flag is consulted.
 * @returns {boolean} True when lazy POS loading should be attempted.
 */
function isLazyPosRequested(builder) {
  if (builder.lazy_pos) {
    return true;
  }
  return (process.env.KUSAMOJI_LAZY_POS || "").trim() === "1";
}

/**
 * Probe for the native mmap addon.
 *
 * @returns {boolean} True when `loadMmapAddon()` returns a truthy value;
 *                    false when it returns a falsy value or the probe throws.
 */
function isMmapActive() {
  try {
    // Required lazily so a missing native loader only affects this probe.
    const { loadMmapAddon } = require("./native/loader");
    return !!loadMmapAddon();
  } catch (e) {
    // Best effort: any failure here simply means mmap is not in use.
    return false;
  }
}

/**
 * Attach a POS source to the tokenizer's token info dictionary and drop the
 * in-memory POS buffer. Failures are logged as warnings and otherwise ignored,
 * leaving the tokenizer as it was loaded.
 *
 * @param {Tokenizer} tokenizer Tokenizer whose dictionary should be rewired.
 * @param {string} dicPath Dictionary directory handed to the POS source selector.
 */
function attachLazyPosSource(tokenizer, dicPath) {
  try {
    const tid = tokenizer.token_info_dictionary;
    const posSource = selectPosSource({
      tokenInfoDictionary: tid,
      dictPath: dicPath,
    });
    if (posSource) {
      tid._posSource = posSource;
      tid.pos_buffer = null; // free ~1.3 GB heap
    }
  } catch (e) {
    console.warn("[kusamoji] POS source setup failed:", e.message);
  }
}

/**
 * Build Tokenizer instance (callback style, for backward compatibility).
 *
 * @param {TokenizerBuilder~onLoad} callback Called as `callback(err)` when the
 *        dictionary fails to load, otherwise as `callback(null, tokenizer)`.
 */
TokenizerBuilder.prototype.build = function (callback) {
  const loader = new DictionaryLoader(this.dic_path);

  loader.load((err, dic) => {
    if (err) {
      return callback(err);
    }

    const tokenizer = new Tokenizer(dic);

    // Wire POS source if lazyPos was requested OR env var is set.
    // Skip entirely if mmap is active — mmap'd pos_buffer is already
    // demand-paged by the OS with zero heap cost.
    if (isLazyPosRequested(this) && !isMmapActive()) {
      attachLazyPosSource(tokenizer, this.dic_path);
    }

    callback(null, tokenizer);
  });
};

/**
 * Build Tokenizer instance (Promise style).
 *
 * @returns {Promise<Tokenizer>} Resolves with the tokenizer, or rejects with
 *          the error reported by `build`.
 */
TokenizerBuilder.prototype.buildAsync = function () {
  return new Promise((resolve, reject) => {
    this.build((err, tokenizer) => {
      if (err) {
        reject(err);
      } else {
        resolve(tokenizer);
      }
    });
  });
};

module.exports = TokenizerBuilder;