kusamoji
Version:
Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy
142 lines (113 loc) • 4.26 kB
JavaScript
;
let doublearray = require("doublearray");
let DynamicDictionaries = require("../DynamicDictionaries");
let TokenInfoDictionary = require("../TokenInfoDictionary");
let ConnectionCostsBuilder = require("./ConnectionCostsBuilder");
let CharacterDefinitionBuilder = require("./CharacterDefinitionBuilder");
let UnknownDictionary = require("../UnknownDictionary");
/**
 * Collects raw dictionary source lines and builds the in-memory dictionaries
 * (token info, connection costs, unknown-word dictionary).
 *
 * According to the original notes, the data built here corresponds to:
 *   from matrix.def
 *     cc.dat:      connection costs
 *   from *.csv
 *     dat.dat:     double array
 *     tid.dat:     token info dictionary
 *     tid_map.dat: targetMap
 *     tid_pos.dat: posList (part of speech)
 *
 * @constructor
 */
function DictionaryBuilder() {
    Object.assign(this, {
        // Token entries, one array per line, each in MeCab form:
        // (0: surface form, 1: left id, 2: right id, 3: word cost,
        //  4: part of speech id, 5-: other features)
        tid_entries: [],
        // Unknown-word entries, one array of comma-separated fields per line
        unk_entries: [],
        // Accumulates cost-matrix lines (see putCostMatrixLine)
        cc_builder: new ConnectionCostsBuilder(),
        // Accumulates character-definition lines (see putCharDefLine)
        cd_builder: new CharacterDefinitionBuilder(),
    });
}
/**
 * Put one comma-separated token entry line and store its fields
 * as a new row of tid_entries.
 * @param {string} line is one entry line; it is split on every ","
 * @returns {DictionaryBuilder} this builder, to allow chaining
 */
DictionaryBuilder.prototype.addTokenInfoDictionary = function (line) {
    this.tid_entries.push(line.split(","));
    return this;
};
/**
 * Forward one line of the "matrix.def" file to the ConnectionCostsBuilder,
 * which accumulates it for the ConnectionCosts object built later.
 * @param {string} line is a line of "matrix.def"
 * @returns {DictionaryBuilder} this builder, to allow chaining
 */
DictionaryBuilder.prototype.putCostMatrixLine = function (line) {
    const { cc_builder } = this;
    cc_builder.putLine(line);
    return this;
};
/**
 * Forward one character-definition line to the CharacterDefinitionBuilder,
 * whose result is used when the unknown dictionary is built.
 * @param {string} line is a single line of character-definition text
 * @returns {DictionaryBuilder} this builder, to allow chaining
 */
DictionaryBuilder.prototype.putCharDefLine = function (line) {
    const { cd_builder } = this;
    cd_builder.putLine(line);
    return this;
};
/**
 * Put one line of the "unk.def" file; its comma-separated fields are kept
 * as a new row of unk_entries for building the UnknownDictionary object.
 * @param {string} line is a line of "unk.def"
 * @returns {DictionaryBuilder} this builder, to allow chaining
 */
DictionaryBuilder.prototype.putUnkDefLine = function (line) {
    const fields = line.split(",");
    this.unk_entries.push(fields);
    return this;
};
/**
 * Build every dictionary from the lines collected so far and bundle them.
 *
 * Order of work: token info dictionary (with its trie), then the unknown
 * dictionary, then the connection costs.
 *
 * @returns {DynamicDictionaries} container created from the trie, the token
 *     info dictionary, the connection costs and the unknown dictionary
 */
DictionaryBuilder.prototype.build = function () {
    const { trie, token_info_dictionary } = this.buildTokenInfoDictionary();
    const unk_dictionary = this.buildUnknownDictionary();
    const connection_costs = this.cc_builder.build();
    return new DynamicDictionaries(trie, token_info_dictionary, connection_costs, unk_dictionary);
};
/**
 * Build the TokenInfoDictionary together with the trie over surface forms,
 * and register a trie id -> token info id mapping for every entry.
 *
 * @returns {{trie: *, token_info_dictionary: *}}
 */
DictionaryBuilder.prototype.buildTokenInfoDictionary = function () {
    const token_info_dictionary = new TokenInfoDictionary();

    // Object used as a hashmap: token info id (word id) -> surface form
    const surface_by_id = token_info_dictionary.buildDictionary(this.tid_entries);

    const trie = this.buildDoubleArray();

    for (const word_id in surface_by_id) {
        // NOTE(review): the lookup result is passed on unchecked; presumably
        // it is negative when the surface form is missing from the trie -
        // confirm against the doublearray API.
        const id_in_trie = trie.lookup(surface_by_id[word_id]);
        token_info_dictionary.addMapping(id_in_trie, word_id);
    }

    return { trie, token_info_dictionary };
};
/**
 * Build the UnknownDictionary from the collected unknown-word entries,
 * attach the character definition to it, and register a
 * character class id -> token info id mapping for every entry.
 *
 * @returns {UnknownDictionary} the populated unknown dictionary
 */
DictionaryBuilder.prototype.buildUnknownDictionary = function () {
    const unknown_dictionary = new UnknownDictionary();

    // Object used as a hashmap: token info id (word id) -> value that is
    // looked up below as a character class name
    const class_name_by_id = unknown_dictionary.buildDictionary(this.unk_entries);

    // Create the CharacterDefinition and hand it to the dictionary
    const character_definition = this.cd_builder.build();
    unknown_dictionary.characterDefinition(character_definition);

    for (const word_id in class_name_by_id) {
        // NOTE(review): the lookup result is passed on unchecked; what it
        // yields for an unknown class name is not visible here - confirm.
        const id_of_class = character_definition.invoke_definition_map.lookup(class_name_by_id[word_id]);
        unknown_dictionary.addMapping(id_of_class, word_id);
    }

    return unknown_dictionary;
};
/**
 * Build the double-array trie over the surface forms of all token entries.
 *
 * Each entry contributes one key: its surface form (field 0), with a value
 * equal to the running count of entries processed before it (0, 1, 2, ...).
 *
 * @returns {DoubleArray} Double-Array trie
 */
DictionaryBuilder.prototype.buildDoubleArray = function () {
    const keys = [];
    this.tid_entries.forEach(function (entry) {
        // keys.length is the number of keys emitted so far, i.e. the next id
        keys.push({ k: entry[0], v: keys.length });
    });

    // Argument passed to doublearray.builder; presumably an initial
    // buffer size - confirm against the doublearray documentation.
    const builder_size = 1024 * 1024;
    return doublearray.builder(builder_size).build(keys);
};
module.exports = DictionaryBuilder;