UNPKG

kuromoji

Version:

JavaScript implementation of Japanese morphological analyzer

102 lines (87 loc) 4.29 kB
/*
 * Copyright 2014 Takuya Asano
 * Copyright 2010-2014 Atilika Inc. and contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

"use strict";

var ViterbiNode = require("./ViterbiNode");
var ViterbiLattice = require("./ViterbiLattice");
var SurrogateAwareString = require("../util/SurrogateAwareString");

/**
 * ViterbiBuilder builds word lattice (ViterbiLattice)
 * @param {DynamicDictionaries} dic dictionary; its `trie`,
 *     `token_info_dictionary` and `unknown_dictionary` members are kept on the
 *     builder and consulted by build()
 * @constructor
 */
function ViterbiBuilder(dic) {
    this.trie = dic.trie;
    this.token_info_dictionary = dic.token_info_dictionary;
    this.unknown_dictionary = dic.unknown_dictionary;
}

/**
 * Append one ViterbiNode to the lattice for every entry ID in `entry_ids`.
 *
 * For each entry, left_id, right_id and word cost are read with getShort()
 * at offsets 0, 2 and 4 from the entry ID in `dic.dictionary`.
 *
 * @param {ViterbiLattice} lattice Lattice that receives the nodes
 * @param {Object} dic Dictionary exposing `dictionary.getShort(offset)`
 * @param {Array} entry_ids Entry IDs (numbers or numeric strings)
 * @param {number} start_pos Start index passed to the node
 * @param {number} length Word length passed to the node
 * @param {string} type Node type ("KNOWN" or "UNKNOWN")
 * @param {string} surface_form Surface form passed to the node
 */
function appendDictionaryNodes(lattice, dic, entry_ids, start_pos, length, type, surface_form) {
    for (var i = 0; i < entry_ids.length; i++) {
        // Always parse as decimal; never let parseInt guess the radix
        var entry_id = parseInt(entry_ids[i], 10);

        var left_id = dic.dictionary.getShort(entry_id);
        var right_id = dic.dictionary.getShort(entry_id + 2);
        var word_cost = dic.dictionary.getShort(entry_id + 4);

        // node_name, cost, start_index, length, type, left_id, right_id, surface_form
        lattice.append(new ViterbiNode(entry_id, word_cost, start_pos, length, type, left_id, right_id, surface_form));
    }
}

/**
 * Build word lattice
 *
 * For every position of the surrogate-aware sentence, a "KNOWN" node is
 * appended for each dictionary entry of each trie match on the remaining
 * text. "UNKNOWN" nodes are appended when there is no trie match at that
 * position, or when the class of the head character has
 * `is_always_invoke === 1`. Finally the end-of-sentence node is appended.
 *
 * @param {string} sentence_str Input text
 * @returns {ViterbiLattice} Word lattice
 */
ViterbiBuilder.prototype.build = function (sentence_str) {
    var lattice = new ViterbiLattice();
    var sentence = new SurrogateAwareString(sentence_str);

    for (var pos = 0; pos < sentence.length; pos++) {
        var tail = sentence.slice(pos);

        // Normalise a null/undefined search result once, so that both the loop
        // below and the unknown-word condition can rely on an array.
        var vocabulary = this.trie.commonPrefixSearch(tail) || [];

        for (var n = 0; n < vocabulary.length; n++) {
            // Words in dictionary do not have surrogate pair (only UCS2 set)
            var trie_id = vocabulary[n].v;
            var known_surface = vocabulary[n].k;
            var token_info_ids = this.token_info_dictionary.target_map[trie_id];

            appendDictionaryNodes(lattice, this.token_info_dictionary, token_info_ids,
                pos + 1, known_surface.length, "KNOWN", known_surface);
        }

        // Unknown word processing
        var surrogate_aware_tail = new SurrogateAwareString(tail);
        var head_char = new SurrogateAwareString(surrogate_aware_tail.charAt(0));
        var head_char_str = head_char.toString();
        var head_char_class = this.unknown_dictionary.lookup(head_char_str);

        if (vocabulary.length === 0 || head_char_class.is_always_invoke === 1) {
            // Process unknown word
            var unknown_surface = head_char_str;

            // The node length must use the same unit as `pos` (surrogate-aware
            // characters). The `.length` of the concatenated surface string
            // counts UTF-16 code units and would over-count grouped surrogate
            // pairs, so the length is tracked separately.
            var unknown_length = head_char.length;

            if (head_char_class.is_grouping === 1 && 1 < surrogate_aware_tail.length) {
                // Extend the unknown word while following characters share the class
                for (var k = 1; k < surrogate_aware_tail.length; k++) {
                    var next_char = surrogate_aware_tail.charAt(k);
                    var next_char_class = this.unknown_dictionary.lookup(next_char);
                    if (head_char_class.class_name !== next_char_class.class_name) {
                        break;
                    }
                    unknown_surface += next_char;
                    unknown_length += 1;
                }
            }

            var unk_ids = this.unknown_dictionary.target_map[head_char_class.class_id];

            appendDictionaryNodes(lattice, this.unknown_dictionary, unk_ids,
                pos + 1, unknown_length, "UNKNOWN", unknown_surface);
        }
    }
    lattice.appendEos();

    return lattice;
};

module.exports = ViterbiBuilder;