kusamoji
Version:
Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy
147 lines (130 loc) • 6.72 kB
JavaScript
"use strict";
let ViterbiNode = require("./ViterbiNode");
let ViterbiLattice = require("./ViterbiLattice");
let SurrogateAwareString = require("../util/SurrogateAwareString");
/**
 * ViterbiBuilder builds the word lattice (ViterbiLattice) for an input sentence.
 *
 * The three lookup structures are taken from `dic` by reference; a fresh,
 * empty hot-patch table is created for each builder.
 *
 * @param {DynamicDictionaries} dic dictionary; must expose `trie`,
 *     `token_info_dictionary` and `unknown_dictionary`
 * @constructor
 */
function ViterbiBuilder(dic) {
    const { trie, token_info_dictionary, unknown_dictionary } = dic;

    this.trie = trie;
    this.token_info_dictionary = token_info_dictionary;
    this.unknown_dictionary = unknown_dictionary;

    // Hot-patch table: surface → array of
    //   { left_id, right_id, word_cost, surface_form, features_line }
    // (the shape pushed by addPatchEntry). build() consults it at every
    // position in addition to the trie, so new entries can be tried out
    // during dictionary iteration without recompiling the .dat files.
    this.patch_entries = {};
}
/**
 * Add a hot-patch entry from an IPADIC CSV line.
 *
 * The line is split on every comma (quoted fields are not interpreted), so a
 * surface that itself contains a comma cannot be expressed here.
 *
 * A line is ignored, leaving the patch table unchanged, when
 *   - it has fewer than 13 comma-separated fields,
 *   - its surface is empty (build() matches patches by prefix, so an empty
 *     surface would match at every position with a node length of 0), or
 *   - left_id, right_id or cost does not parse as an integer (a NaN id/cost
 *     would otherwise be stored and handed to the lattice).
 *
 * @param {string} csvLine surface,left_id,right_id,cost,pos1,...,reading,pronunciation
 */
ViterbiBuilder.prototype.addPatchEntry = function (csvLine) {
    const fields = csvLine.split(",");
    if (fields.length < 13) return;

    const surface = fields[0];
    if (surface === "") return;

    const left_id = parseInt(fields[1], 10);
    const right_id = parseInt(fields[2], 10);
    const word_cost = parseInt(fields[3], 10);
    if (Number.isNaN(left_id) || Number.isNaN(right_id) || Number.isNaN(word_cost)) return;

    const table = this.patch_entries;
    // Test for an OWN key: a plain truthiness check also sees members inherited
    // from Object.prototype, so surfaces such as "constructor" or "toString"
    // would find a function there and fail on .push().
    if (!Object.prototype.hasOwnProperty.call(table, surface)) {
        // defineProperty instead of assignment so that even "__proto__" becomes
        // an ordinary enumerable own key rather than replacing the prototype.
        Object.defineProperty(table, surface, {
            value: [],
            writable: true,
            enumerable: true,
            configurable: true
        });
    }

    table[surface].push({
        left_id: left_id,
        right_id: right_id,
        word_cost: word_cost,
        surface_form: surface,
        // Same layout as the CSV line minus the three ids and the cost.
        features_line: surface + "," + fields.slice(4).join(",")
    });
};
/**
 * Discard every hot-patch entry.
 *
 * The table is replaced by a brand-new empty object; the previous table object
 * is not modified, so a reference taken earlier still sees its old contents.
 * Only `patch_entries` is assigned here.
 */
ViterbiBuilder.prototype.clearPatchEntries = function () {
    const emptyTable = {};
    this.patch_entries = emptyTable;
};
/**
 * Build word lattice
 *
 * For every code-point position of the input, nodes are appended in this order:
 *   1. "KNOWN" nodes for each dictionary word the trie reports as a prefix of
 *      the remaining text,
 *   2. "KNOWN" nodes with a negative id for each hot-patch entry whose surface
 *      is a prefix of the remaining text,
 *   3. "UNKNOWN" nodes derived from the character class of the first character,
 *      but only when the trie found nothing or the class is marked
 *      `is_always_invoke`.
 * Node start positions are 1-based and node lengths are counted in code points.
 *
 * Side effect: for every matched patch entry, `this._patch_features[patch_id]`
 * is set to the entry's features line (the object is created on first use).
 *
 * @param {string} sentence_str Input text
 * @returns {ViterbiLattice} Word lattice
 */
ViterbiBuilder.prototype.build = function (sentence_str) {
    const lattice = new ViterbiLattice();
    const sentence = new SurrogateAwareString(sentence_str);
    const token_dic = this.token_info_dictionary;
    const unknown_dic = this.unknown_dictionary;

    // Snapshot the patch surfaces once per call. The position of a surface in
    // this list determines its id block, so looking it up again for every
    // matched entry at every position (Object.keys(...).indexOf(...)) was
    // needlessly quadratic.
    const patch_entries = this.patch_entries;
    const patch_surfaces = Object.keys(patch_entries);

    // Patch ids are -(surface_index * stride + entry_index + 1). The stride is
    // 100 unless some surface has more entries than that, in which case it is
    // widened so that ids of neighbouring surfaces cannot collide.
    let patch_id_stride = 100;
    for (let s = 0; s < patch_surfaces.length; s++) {
        patch_id_stride = Math.max(patch_id_stride, patch_entries[patch_surfaces[s]].length);
    }

    for (let pos = 0; pos < sentence.length; pos++) {
        const tail = sentence.slice(pos);

        let vocabulary = this.trie.commonPrefixSearch(tail);
        if (vocabulary == null) vocabulary = [];

        for (let n = 0; n < vocabulary.length; n++) { // Words in dictionary do not have surrogate pair (only UCS2 set)
            const trie_id = vocabulary[n].v;
            const key = vocabulary[n].k;
            const token_info_ids = token_dic.target_map[trie_id];
            if (token_info_ids == null) continue;

            for (let i = 0; i < token_info_ids.length; i++) {
                const token_info_id = parseInt(token_info_ids[i], 10);
                const left_id = token_dic.dictionary.getShort(token_info_id);
                const right_id = token_dic.dictionary.getShort(token_info_id + 2);
                const word_cost = token_dic.dictionary.getShort(token_info_id + 4);
                // node_name, cost, start_index, length, type, left_id, right_id, surface_form
                // Lattice geometry is in code points (the outer loop advances per code
                // point via SurrogateAwareString); node length must match. key.length is
                // UTF-16 code units, which overshoots for astral chars — use the
                // surrogate-aware length. (BMP-only dict keys are unchanged; this guards
                // against astral dict surfaces, e.g. NEologd CJK-Ext-B entries.)
                lattice.append(new ViterbiNode(token_info_id, word_cost, pos + 1, new SurrogateAwareString(key.toString()).length, "KNOWN", left_id, right_id, key));
            }
        }

        // Hot-patch: check patch entries at this position (prefix match).
        // startsWith stops at the first mismatch; indexOf(...) === 0 would scan
        // the whole remaining text before reporting a miss.
        for (let s = 0; s < patch_surfaces.length; s++) {
            const surface = patch_surfaces[s];
            if (!tail.startsWith(surface)) continue;

            const entries = patch_entries[surface];
            const surface_length = new SurrogateAwareString(surface).length;
            for (let p = 0; p < entries.length; p++) {
                const pe = entries[p];
                // Use a unique negative ID so getFeatures can look it up
                const patch_id = -(s * patch_id_stride + p + 1);
                if (!this._patch_features) this._patch_features = {};
                this._patch_features[patch_id] = pe.features_line;
                lattice.append(new ViterbiNode(patch_id, pe.word_cost, pos + 1, surface_length, "KNOWN", pe.left_id, pe.right_id, surface));
            }
        }

        // Unknown word processing
        const surrogate_aware_tail = new SurrogateAwareString(tail);
        const head_char = new SurrogateAwareString(surrogate_aware_tail.charAt(0));
        const head_char_class = unknown_dic.lookup(head_char.toString());
        // vocabulary was normalised to an array above, so only its length matters.
        if (vocabulary.length !== 0 && head_char_class.is_always_invoke !== 1) continue;

        // Start from the first character and, for grouping classes, extend over
        // the following characters for as long as they share its class name.
        let unknown_surface = head_char.toString();
        if (head_char_class.is_grouping === 1 && 1 < surrogate_aware_tail.length) {
            for (let k = 1; k < surrogate_aware_tail.length; k++) {
                const next_char = surrogate_aware_tail.charAt(k);
                const next_char_class = unknown_dic.lookup(next_char);
                if (head_char_class.class_name !== next_char_class.class_name) {
                    break;
                }
                unknown_surface += next_char;
            }
        }

        // The surface is a plain string here, so .length would count UTF-16 code
        // units. The lattice loop advances per code point, so a grouped astral run
        // (≥2 astral chars, e.g. emoji runs) would overshoot and silently drop
        // following tokens. Count code points instead.
        const unknown_length = new SurrogateAwareString(unknown_surface).length;

        const unk_ids = unknown_dic.target_map[head_char_class.class_id];
        for (let j = 0; j < unk_ids.length; j++) {
            const unk_id = parseInt(unk_ids[j], 10);
            const left_id = unknown_dic.dictionary.getShort(unk_id);
            const right_id = unknown_dic.dictionary.getShort(unk_id + 2);
            const word_cost = unknown_dic.dictionary.getShort(unk_id + 4);
            // node_name, cost, start_index, length, type, left_id, right_id, surface_form
            lattice.append(new ViterbiNode(unk_id, word_cost, pos + 1, unknown_length, "UNKNOWN", left_id, right_id, unknown_surface));
        }
    }

    lattice.appendEos();
    return lattice;
};
module.exports = ViterbiBuilder;