UNPKG

kusamoji

Version:

Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy

142 lines (113 loc) 4.26 kB
"use strict";

const doublearray = require("doublearray");
const DynamicDictionaries = require("../DynamicDictionaries");
const TokenInfoDictionary = require("../TokenInfoDictionary");
const ConnectionCostsBuilder = require("./ConnectionCostsBuilder");
const CharacterDefinitionBuilder = require("./CharacterDefinitionBuilder");
const UnknownDictionary = require("../UnknownDictionary");

/**
 * Build dictionaries (token info, connection costs)
 *
 * Generates from matrix.def
 * cc.dat: Connection costs
 *
 * Generates from *.csv
 * dat.dat: Double array
 * tid.dat: Token info dictionary
 * tid_map.dat: targetMap
 * tid_pos.dat: posList (part of speech)
 *
 * Lines are accumulated through the add/put methods below (each of which
 * returns the builder so calls can be chained) and turned into dictionary
 * objects by build().
 */
function DictionaryBuilder() {
    // Rows in MeCab form:
    // (0: surface form, 1: left id, 2: right id, 3: word cost,
    //  4: part of speech id, 5-: other features)
    this.tid_entries = [];
    this.unk_entries = [];

    this.cc_builder = new ConnectionCostsBuilder();
    this.cd_builder = new CharacterDefinitionBuilder();
}

/**
 * Add one comma-separated token info line; it is split on "," and the
 * resulting fields are stored as a single entry.
 * @param {string} line Comma-separated dictionary line
 * @returns {DictionaryBuilder} this builder, for chaining
 */
DictionaryBuilder.prototype.addTokenInfoDictionary = function (line) {
    this.tid_entries.push(line.split(","));
    return this;
};

/**
 * Put one line of "matrix.def" file for building ConnectionCosts object
 * @param {string} line is a line of "matrix.def"
 * @returns {DictionaryBuilder} this builder, for chaining
 */
DictionaryBuilder.prototype.putCostMatrixLine = function (line) {
    this.cc_builder.putLine(line);
    return this;
};

/**
 * Forward one line to the character definition builder.
 * @param {string} line Line handed unchanged to CharacterDefinitionBuilder#putLine
 * @returns {DictionaryBuilder} this builder, for chaining
 */
DictionaryBuilder.prototype.putCharDefLine = function (line) {
    this.cd_builder.putLine(line);
    return this;
};

/**
 * Put one line of "unk.def" file for building UnknownDictionary object
 * @param {string} line is a line of "unk.def"
 * @returns {DictionaryBuilder} this builder, for chaining
 */
DictionaryBuilder.prototype.putUnkDefLine = function (line) {
    const fields = line.split(",");
    this.unk_entries.push(fields);
    return this;
};

/**
 * Build every dictionary from the lines collected so far.
 *
 * Work happens in this order: token info dictionary (with its trie),
 * unknown dictionary, then connection costs.
 *
 * @returns {DynamicDictionaries} Container built from the trie, the token
 * info dictionary, the connection costs and the unknown dictionary
 */
DictionaryBuilder.prototype.build = function () {
    const token_info = this.buildTokenInfoDictionary();
    const unknown = this.buildUnknownDictionary();
    const connection_costs = this.cc_builder.build();

    return new DynamicDictionaries(
        token_info.trie,
        token_info.token_info_dictionary,
        connection_costs,
        unknown
    );
};

/**
 * Build TokenInfoDictionary
 *
 * @returns {{trie: *, token_info_dictionary: *}}
 */
DictionaryBuilder.prototype.buildTokenInfoDictionary = function () {
    const tid = new TokenInfoDictionary();

    // Used as a hashmap, string -> string (word_id -> surface_form)
    const surface_by_word_id = tid.buildDictionary(this.tid_entries);

    const trie = this.buildDoubleArray();

    for (const word_id in surface_by_word_id) {
        // NOTE: the lookup result is not validated here; whatever the trie
        // returns for this surface form is handed straight to addMapping.
        const trie_id = trie.lookup(surface_by_word_id[word_id]);
        tid.addMapping(trie_id, word_id);
    }

    return {
        trie: trie,
        token_info_dictionary: tid
    };
};

/**
 * Build UnknownDictionary from the collected "unk.def" entries and the
 * character definition produced by the character definition builder.
 *
 * @returns {UnknownDictionary} Unknown-word dictionary with its character
 * definition attached and its mappings registered
 */
DictionaryBuilder.prototype.buildUnknownDictionary = function () {
    const unk = new UnknownDictionary();

    // Used as a hashmap, string -> string; each value is treated as a
    // class name and looked up in the character definition below.
    const class_name_by_word_id = unk.buildDictionary(this.unk_entries);

    // Create CharacterDefinition
    const char_def = this.cd_builder.build();
    unk.characterDefinition(char_def);

    for (const word_id in class_name_by_word_id) {
        // NOTE: the lookup result is not validated here; it is passed to
        // addMapping as-is.
        const class_id = char_def.invoke_definition_map.lookup(class_name_by_word_id[word_id]);
        unk.addMapping(class_id, word_id);
    }

    return unk;
};

/**
 * Build double array trie
 *
 * Every token info entry contributes one key (its surface form, field 0)
 * whose value is a sequential id starting at 0, in insertion order.
 *
 * @returns {DoubleArray} Double-Array trie
 */
DictionaryBuilder.prototype.buildDoubleArray = function () {
    let next_id = 0;
    const keys = this.tid_entries.map((fields) => {
        const key = { k: fields[0], v: next_id };
        next_id += 1;
        return key;
    });

    return doublearray.builder(1024 * 1024).build(keys);
};

module.exports = DictionaryBuilder;