UNPKG

kusamoji

Version:

Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy

github.com/KimuraRisei/kusamoji

KimuraRisei/kusamoji

147 lines (130 loc) • 6.72 kB

JavaScript

"use strict";

const ViterbiNode = require("./ViterbiNode");
const ViterbiLattice = require("./ViterbiLattice");
const SurrogateAwareString = require("../util/SurrogateAwareString");

/**
 * ViterbiBuilder builds word lattice (ViterbiLattice)
 * @param {DynamicDictionaries} dic dictionary
 * @constructor
 */
function ViterbiBuilder(dic) {
    this.trie = dic.trie;
    this.token_info_dictionary = dic.token_info_dictionary;
    this.unknown_dictionary = dic.unknown_dictionary;
    // Hot-patch entries: surface → [{ left_id, right_id, word_cost, ... }]
    // Checked at each position alongside the trie. Used during dictionary
    // iteration to test new entries without recompiling .dat files.
    // Null-prototype object: surfaces are arbitrary text, so keys such as
    // "constructor" or "__proto__" must not resolve to inherited members.
    this.patch_entries = Object.create(null);
}

/**
 * Add a hot-patch entry from an IPADIC CSV line.
 *
 * The line is silently ignored when it has fewer than 13 comma-separated
 * fields, when the surface is empty, or when left_id / right_id / cost do
 * not parse as integers.
 *
 * @param {string} csvLine surface,left_id,right_id,cost,pos1,...,reading,pronunciation
 */
ViterbiBuilder.prototype.addPatchEntry = function (csvLine) {
    const fields = csvLine.split(",");
    if (fields.length < 13) return;

    const surface = fields[0];
    // An empty surface is a prefix of every tail and would yield a node of
    // length 0 (one that consumes no input) at every position.
    if (surface === "") return;

    const left_id = parseInt(fields[1], 10);
    const right_id = parseInt(fields[2], 10);
    const word_cost = parseInt(fields[3], 10);
    // Reject malformed numeric columns instead of storing NaN ids/costs.
    if (Number.isNaN(left_id) || Number.isNaN(right_id) || Number.isNaN(word_cost)) return;

    const features = fields.slice(4).join(",");

    // Own-property check so that a surface naming an inherited member is still
    // treated as "not present yet", even if patch_entries was replaced by a
    // plain object from outside.
    if (!Object.prototype.hasOwnProperty.call(this.patch_entries, surface)) {
        this.patch_entries[surface] = [];
    }
    this.patch_entries[surface].push({
        left_id: left_id,
        right_id: right_id,
        word_cost: word_cost,
        surface_form: surface,
        features_line: surface + "," + features
    });
};

/**
 * Clear all hot-patch entries.
 */
ViterbiBuilder.prototype.clearPatchEntries = function () {
    this.patch_entries = Object.create(null);
};

/**
 * Build word lattice
 *
 * For every code-point position of the input, appends to the lattice:
 * one KNOWN node per token-info id of every trie prefix match, one KNOWN
 * node per hot-patch entry whose surface is a prefix of the remaining text,
 * and UNKNOWN nodes when the trie matched nothing or the head character's
 * class has is_always_invoke === 1.
 *
 * @param {string} sentence_str Input text
 * @returns {ViterbiLattice} Word lattice
 */
ViterbiBuilder.prototype.build = function (sentence_str) {
    const lattice = new ViterbiLattice();
    const sentence = new SurrogateAwareString(sentence_str);

    // Snapshot the patch surfaces once per build. The index of a surface in
    // this list is part of its patch id, and its code-point length is the node
    // length; neither changes while the lattice is being built.
    const patch_surfaces = Object.keys(this.patch_entries);
    const patch_lengths = patch_surfaces.map(function (s) {
        return new SurrogateAwareString(s).length;
    });

    for (let pos = 0; pos < sentence.length; pos++) {
        const tail = sentence.slice(pos);

        let vocabulary = this.trie.commonPrefixSearch(tail);
        if (vocabulary == null) vocabulary = [];

        for (let n = 0; n < vocabulary.length; n++) {
            // Words in dictionary do not have surrogate pair (only UCS2 set)
            const trie_id = vocabulary[n].v;
            const key = vocabulary[n].k;

            const token_info_ids = this.token_info_dictionary.target_map[trie_id];
            if (token_info_ids == null) continue;

            // Lattice geometry is in code points (the outer loop advances per code
            // point via SurrogateAwareString); node length must match. key.length is
            // UTF-16 code units, which overshoots for astral chars — use the
            // surrogate-aware length. (BMP-only dict keys are unchanged; this guards
            // against astral dict surfaces, e.g. NEologd CJK-Ext-B entries.)
            const key_length = new SurrogateAwareString(key.toString()).length;

            for (let i = 0; i < token_info_ids.length; i++) {
                const token_info_id = parseInt(token_info_ids[i], 10);
                const left_id = this.token_info_dictionary.dictionary.getShort(token_info_id);
                const right_id = this.token_info_dictionary.dictionary.getShort(token_info_id + 2);
                const word_cost = this.token_info_dictionary.dictionary.getShort(token_info_id + 4);

                // node_name, cost, start_index, length, type, left_id, right_id, surface_form
                lattice.append(new ViterbiNode(token_info_id, word_cost, pos + 1, key_length, "KNOWN", left_id, right_id, key));
            }
        }

        // Hot-patch: check patch entries at this position (prefix match)
        for (let s = 0; s < patch_surfaces.length; s++) {
            const surface = patch_surfaces[s];
            // startsWith only inspects the head of the tail; indexOf(...) === 0
            // would scan the whole remaining sentence on every miss.
            if (!tail.startsWith(surface)) continue;

            const entries = this.patch_entries[surface];
            for (let p = 0; p < entries.length; p++) {
                const pe = entries[p];
                // Use a negative ID so getFeatures (defined elsewhere) can look the
                // features up in _patch_features. NOTE(review): with this formula
                // ids collide once a single surface has more than 100 entries.
                const patch_id = -(s * 100 + p + 1);
                if (!this._patch_features) this._patch_features = {};
                this._patch_features[patch_id] = pe.features_line;
                lattice.append(new ViterbiNode(patch_id, pe.word_cost, pos + 1, patch_lengths[s], "KNOWN", pe.left_id, pe.right_id, surface));
            }
        }

        // Unknown word processing
        const surrogate_aware_tail = new SurrogateAwareString(tail);
        const head_char = new SurrogateAwareString(surrogate_aware_tail.charAt(0));
        const head_char_class = this.unknown_dictionary.lookup(head_char.toString());

        if (vocabulary.length === 0 || head_char_class.is_always_invoke === 1) {
            // Process unknown word
            let unk_surface = head_char.toString();
            if (head_char_class.is_grouping === 1 && 1 < surrogate_aware_tail.length) {
                // Extend the surface while following characters share the head's class.
                for (let k = 1; k < surrogate_aware_tail.length; k++) {
                    const next_char = surrogate_aware_tail.charAt(k);
                    const next_char_class = this.unknown_dictionary.lookup(next_char);
                    if (head_char_class.class_name !== next_char_class.class_name) {
                        break;
                    }
                    unk_surface += next_char;
                }
            }

            // unk_surface is a plain string, so its .length counts UTF-16 code
            // units. The lattice loop advances per code point, so a grouped astral
            // run (≥2 astral chars, e.g. emoji runs) would overshoot and silently
            // drop following tokens. Count code points instead.
            const unk_length = new SurrogateAwareString(unk_surface).length;

            const unk_ids = this.unknown_dictionary.target_map[head_char_class.class_id];
            for (let j = 0; j < unk_ids.length; j++) {
                const unk_id = parseInt(unk_ids[j], 10);
                const left_id = this.unknown_dictionary.dictionary.getShort(unk_id);
                const right_id = this.unknown_dictionary.dictionary.getShort(unk_id + 2);
                const word_cost = this.unknown_dictionary.dictionary.getShort(unk_id + 4);

                // node_name, cost, start_index, length, type, left_id, right_id, surface_form
                lattice.append(new ViterbiNode(unk_id, word_cost, pos + 1, unk_length, "UNKNOWN", left_id, right_id, unk_surface));
            }
        }
    }

    lattice.appendEos();

    return lattice;
};

module.exports = ViterbiBuilder;