UNPKG

kusamoji

Version:

Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy

143 lines (120 loc) 4.36 kB
"use strict";

const ByteBuffer = require("../util/ByteBuffer");

/**
 * TokenInfoDictionary
 *
 * Holds three pieces of state:
 *   - dictionary: ByteBuffer of fixed-layout token records written by put()
 *   - pos_buffer: ByteBuffer of "surface_form,feature" strings written by put()
 *   - target_map: trie_id (of surface form) -> array of token_info_id (of token)
 *
 * Both buffers are created with the argument 10 * 1024 * 1024
 * (presumably an initial capacity in bytes -- confirm against ByteBuffer).
 * @constructor
 */
function TokenInfoDictionary() {
    this.dictionary = new ByteBuffer(10 * 1024 * 1024);
    this.target_map = {};
    this.pos_buffer = new ByteBuffer(10 * 1024 * 1024);
}

/**
 * Build the token dictionary from parsed dictionary rows.
 *
 * Record layout written per row:
 *   left_id right_id word_cost ...
 *   ^ this position is token_info_id
 *
 * @param {Array<Array>} entries Rows shaped as
 *     [surface_form, left_id, right_id, word_cost, ...feature fields].
 *     Rows with fewer than 4 fields are skipped.
 * @returns {Object} Hashmap of token_info_id -> surface_form
 */
TokenInfoDictionary.prototype.buildDictionary = function (entries) {
    const surface_by_token_id = {};

    for (let row = 0; row < entries.length; row++) {
        const fields = entries[row];

        if (fields.length < 4) {
            continue;
        }

        const [surface_form, left_id, right_id, word_cost, ...feature_fields] = fields;
        const feature = feature_fields.join(","); // TODO Optimize

        // Assertion: a row with a non-numeric id or cost is only logged;
        // it is still written to the dictionary below.
        // The coercing global isFinite is intentional here.
        const ids_are_finite = isFinite(left_id) && isFinite(right_id) && isFinite(word_cost);
        if (!ids_are_finite) {
            console.log(fields);
        }

        const token_info_id = this.put(left_id, right_id, word_cost, surface_form, feature);
        surface_by_token_id[token_info_id] = surface_form;
    }

    // Remove last unused area
    this.dictionary.shrink();
    this.pos_buffer.shrink();

    return surface_by_token_id;
};

/**
 * Append one token record to the dictionary and its feature string to the
 * POS buffer.
 *
 * @param {number} left_id Left context id
 * @param {number} right_id Right context id
 * @param {number} word_cost Word cost
 * @param {string} surface_form Surface form of the token
 * @param {string} feature Remaining features joined by ","
 * @returns {number} token_info_id: the dictionary write position before
 *     this record was appended
 */
TokenInfoDictionary.prototype.put = function (left_id, right_id, word_cost, surface_form, feature) {
    const record_offset = this.dictionary.position;
    const string_offset = this.pos_buffer.position;

    // Record: three shorts, then the position of the feature string in pos_buffer.
    this.dictionary.putShort(left_id);
    this.dictionary.putShort(right_id);
    this.dictionary.putShort(word_cost);
    this.dictionary.putInt(string_offset);

    const pos_record = surface_form + "," + feature;
    this.pos_buffer.putString(pos_record);

    return record_offset;
};

/**
 * Register a target under a source key in target_map, creating the list
 * for that key on first use.
 *
 * @param {number|string} source Key (trie_id of the surface form)
 * @param {number} target Value to append (token_info_id)
 */
TokenInfoDictionary.prototype.addMapping = function (source, target) {
    const existing = this.target_map[source];
    const targets = existing == null ? [] : existing;

    targets.push(target);
    this.target_map[source] = targets;
};

/**
 * Serialize target_map into a new ByteBuffer.
 *
 * Layout: key count, then for every key: key, value count, values...
 * (each written with putInt).
 *
 * @returns {*} Whatever ByteBuffer#shrink returns
 *     (per the original comment, a shrunk typed array)
 */
TokenInfoDictionary.prototype.targetMapToBuffer = function () {
    const out = new ByteBuffer();

    out.putInt(Object.keys(this.target_map).length);

    for (const trie_id in this.target_map) {
        const token_ids = this.target_map[trie_id]; // Array

        out.putInt(parseInt(trie_id, 10));
        out.putInt(token_ids.length);
        for (const token_id of token_ids) {
            out.putInt(token_id);
        }
    }

    return out.shrink(); // Shrink-ed Typed Array
};

/**
 * Replace the token dictionary with pre-built data (from tid.dat).
 *
 * @param {*} array_buffer Data handed to the ByteBuffer constructor
 * @returns {TokenInfoDictionary} this
 */
TokenInfoDictionary.prototype.loadDictionary = function (array_buffer) {
    this.dictionary = new ByteBuffer(array_buffer);
    return this;
};

/**
 * Replace the POS/feature string buffer with pre-built data (from tid_pos.dat).
 *
 * @param {*} array_buffer Data handed to the ByteBuffer constructor
 * @returns {TokenInfoDictionary} this
 */
TokenInfoDictionary.prototype.loadPosVector = function (array_buffer) {
    this.pos_buffer = new ByteBuffer(array_buffer);
    return this;
};

/**
 * Rebuild target_map from its serialized form (from tid_map.dat), i.e. the
 * layout produced by targetMapToBuffer.
 *
 * @param {*} array_buffer Data handed to the ByteBuffer constructor
 * @returns {TokenInfoDictionary} this
 */
TokenInfoDictionary.prototype.loadTargetMap = function (array_buffer) {
    const reader = new ByteBuffer(array_buffer);
    reader.position = 0;
    this.target_map = {};

    // The leading key count is read only to advance past it; the loop below
    // is bounded by the buffer length instead.
    reader.readInt(); // map_keys_size

    // Continue for as long as at least one unread byte remains.
    while (!(reader.buffer.length < reader.position + 1)) {
        const trie_id = reader.readInt();
        const value_count = reader.readInt();

        for (let n = 0; n < value_count; n++) {
            const token_id = reader.readInt();
            this.addMapping(trie_id, token_id);
        }
    }

    return this;
};

// Pluggable POS-source strategy seam.
// See the design docs for the design rationale.
// Contract: { getFeaturesById(token_info_id_int): string, close?(): void }
// When left as null, getFeatures reads from dictionary / pos_buffer directly.
TokenInfoDictionary.prototype._posSource = null;

/**
 * Look up features in the dictionary
 * @param {string} token_info_id_str Word ID to look up
 * @returns {string} Features string concatenated by ",";
 *     "" when the id does not parse as a base-10 integer
 */
TokenInfoDictionary.prototype.getFeatures = function (token_info_id_str) {
    const token_info_id = parseInt(token_info_id_str, 10);

    if (isNaN(token_info_id)) {
        return "";
    }

    // An installed POS source takes precedence over the in-memory buffers.
    if (this._posSource) {
        return this._posSource.getFeaturesById(token_info_id);
    }

    // +6 skips the three shorts written by put() (left_id, right_id,
    // word_cost -- presumably 2 bytes each) to reach the stored pos_id.
    const string_offset = this.dictionary.getInt(token_info_id + 6);
    return this.pos_buffer.getString(string_offset);
};

module.exports = TokenInfoDictionary;