kusamoji
Version:
Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy
148 lines (132 loc) • 4.73 kB
JavaScript
;
let ViterbiBuilder = require("./viterbi/ViterbiBuilder");
let ViterbiSearcher = require("./viterbi/ViterbiSearcher");
let IpadicFormatter = require("./util/IpadicFormatter");
// Matches one Japanese comma (、) or full stop (。). Tokenizer.splitByPunctuation
// searches for it to find where each sentence ends.
let PUNCTUATION = /、|。/;
/**
 * Tokenizer
 *
 * Keeps references to the dictionaries consulted for KNOWN and UNKNOWN
 * nodes, and creates the Viterbi lattice builder, the Viterbi path searcher
 * and the token formatter that the tokenize methods rely on.
 *
 * @param {DynamicDictionaries} dic Dictionaries used by this tokenizer
 * @constructor
 */
function Tokenizer(dic) {
    const { token_info_dictionary, unknown_dictionary } = dic;
    this.token_info_dictionary = token_info_dictionary;
    this.unknown_dictionary = unknown_dictionary;

    // The builder is handed the whole dictionary set, the searcher only the
    // connection costs.
    this.viterbi_builder = new ViterbiBuilder(dic);
    const { connection_costs } = dic;
    this.viterbi_searcher = new ViterbiSearcher(connection_costs);

    // TODO Other dictionaries
    this.formatter = new IpadicFormatter();
}
/**
 * Hot-patch: register one dictionary entry at runtime, without recompiling
 * the .dat files.
 *
 * The line is passed unchanged to the ViterbiBuilder as a patch entry. Such
 * entries bypass the DoubleArray trie; the builder checks its patch map at
 * each position via prefix matching. Useful while iterating on a dictionary,
 * to try new entries before committing to a full rebuild.
 *
 * @param {string} csvLine IPADIC CSV: surface,left_id,right_id,cost,pos1,pos2,pos3,pos4,conj_type,conj_form,basic_form,reading,pronunciation
 */
Tokenizer.prototype.addEntry = function (csvLine) {
    const builder = this.viterbi_builder;
    builder.addPatchEntry(csvLine);
};
/**
 * Load several hot-patch entries from one CSV string, one entry per line.
 *
 * Each line is trimmed first; blank lines and lines whose first character is
 * '#' (comments) are skipped. Every remaining line is passed to addEntry in
 * the order it appears.
 *
 * @param {string} csv
 */
Tokenizer.prototype.loadPatchCsv = function (csv) {
    for (const raw_line of csv.split('\n')) {
        const entry = raw_line.trim();
        if (entry !== '' && !entry.startsWith('#')) {
            this.addEntry(entry);
        }
    }
};
/**
 * Split text into sentences at punctuation.
 *
 * Every piece except possibly the last ends with the character matched by
 * PUNCTUATION; text left over after the final punctuation mark becomes the
 * last piece. No characters are dropped, so joining the pieces gives back the
 * input. An empty input yields an empty array.
 *
 * @param {string} input Input text
 * @returns {Array.<string>} Sentences end with punctuation
 */
Tokenizer.splitByPunctuation = function (input) {
    const sentences = [];
    let rest = input;
    while (rest !== "") {
        const hit = rest.search(PUNCTUATION);
        // Without a further punctuation mark the whole remainder is the
        // final sentence; otherwise cut just after the mark.
        const cut = hit < 0 ? rest.length : hit + 1;
        sentences.push(rest.substring(0, cut));
        rest = rest.substring(cut);
    }
    return sentences;
};
/**
 * Tokenize text
 *
 * The text is cut into sentences with Tokenizer.splitByPunctuation and the
 * sentences are tokenized one after another, all of them appending to the
 * same result array.
 *
 * @param {string} text Input text to analyze
 * @returns {Array} Tokens; an empty array when text is null, undefined or ''
 */
Tokenizer.prototype.tokenize = function (text) {
    const tokens = [];
    if (text == null || text === '') {
        return tokens;
    }
    for (const sentence of Tokenizer.splitByPunctuation(text)) {
        // The shared array is filled in place; the call's return value is
        // not needed here.
        this.tokenizeForSentence(sentence, tokens);
    }
    return tokens;
};
/**
 * Tokenize one sentence and append the resulting tokens to `tokens`.
 *
 * Builds the word lattice for `sentence`, asks the Viterbi searcher for the
 * best path through it and converts every node on that path into a token with
 * the formatter. Features come from the hot-patch map or the token info
 * dictionary for KNOWN nodes and from the unknown dictionary for UNKNOWN
 * nodes; any other node type is formatted with an empty feature list.
 *
 * Token positions continue after the last token already present in `tokens`,
 * so consecutive sentences of one text get increasing `word_position` values.
 *
 * @param {string} sentence Sentence to analyze
 * @param {Array} [tokens] Array to append to; a new array is used when this
 *     is omitted or null
 * @returns {Array} `tokens` (or the newly created array) with this
 *     sentence's tokens appended
 */
Tokenizer.prototype.tokenizeForSentence = function (sentence, tokens) {
    if (tokens == null) {
        tokens = [];
    }
    const lattice = this.getLattice(sentence);
    const best_path = this.viterbi_searcher.search(lattice);

    // Offset added to every node position of this sentence: the position of
    // the LAST character covered by the tokens collected so far.
    let last_pos = 0;
    if (tokens.length > 0) {
        const last_token = tokens[tokens.length - 1];
        last_pos = last_token.word_position;
        // word_position is where the last token STARTS. Using it alone is
        // only right for a token that is one position wide; a longer last
        // token (e.g. a dictionary or hot-patch entry that swallows the
        // sentence-final punctuation) would make the new tokens overlap it.
        // NOTE(review): assumes formatter tokens expose their text as
        // `surface_form` and that positions count code points - confirm
        // against IpadicFormatter / ViterbiBuilder. Tokens without a string
        // surface_form keep the previous behaviour (treated as one wide).
        const last_surface = last_token.surface_form;
        if (typeof last_surface === "string" && last_surface !== "") {
            last_pos += Array.from(last_surface).length - 1;
        }
    }

    // A missing feature line yields an empty feature list.
    const splitFeatures = function (features_line) {
        return features_line == null ? [] : features_line.split(",");
    };

    for (let j = 0; j < best_path.length; j++) {
        const node = best_path[j];
        const position = last_pos + node.start_pos;
        let token;
        if (node.type === "KNOWN") {
            // Check if this is a hot-patch entry (negative ID)
            const patch_features = this.viterbi_builder._patch_features;
            let features_line;
            if (node.name < 0 && patch_features && patch_features[node.name]) {
                features_line = patch_features[node.name];
            } else {
                features_line = this.token_info_dictionary.getFeatures(node.name);
            }
            token = this.formatter.formatEntry(node.name, position, node.type, splitFeatures(features_line));
        } else if (node.type === "UNKNOWN") {
            // Unknown word
            const features_line = this.unknown_dictionary.getFeatures(node.name);
            token = this.formatter.formatUnknownEntry(node.name, position, node.type, splitFeatures(features_line), node.surface_form);
        } else {
            // TODO User dictionary
            token = this.formatter.formatEntry(node.name, position, node.type, []);
        }
        tokens.push(token);
    }
    return tokens;
};
/**
 * Build word lattice
 *
 * Delegates to this tokenizer's ViterbiBuilder and returns whatever its
 * build method produces for the given text.
 *
 * @param {string} text Input text to analyze
 * @returns {ViterbiLattice} Word lattice
 */
Tokenizer.prototype.getLattice = function (text) {
    const builder = this.viterbi_builder;
    return builder.build(text);
};
// Expose the Tokenizer constructor as this module's only export.
module.exports = Tokenizer;