UNPKG

kusamoji

Version:

Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy

github.com/KimuraRisei/kusamoji

KimuraRisei/kusamoji

148 lines (132 loc) • 4.73 kB

JavaScript

"use strict"; let ViterbiBuilder = require("./viterbi/ViterbiBuilder"); let ViterbiSearcher = require("./viterbi/ViterbiSearcher"); let IpadicFormatter = require("./util/IpadicFormatter"); let PUNCTUATION = /、|。/; /** * Tokenizer * @param {DynamicDictionaries} dic Dictionaries used by this tokenizer * @constructor */ function Tokenizer(dic) { this.token_info_dictionary = dic.token_info_dictionary; this.unknown_dictionary = dic.unknown_dictionary; this.viterbi_builder = new ViterbiBuilder(dic); this.viterbi_searcher = new ViterbiSearcher(dic.connection_costs); this.formatter = new IpadicFormatter(); // TODO Other dictionaries } /** * Hot-patch: add a dictionary entry at runtime without recompiling .dat files. * * The entry bypasses the DoubleArray trie — ViterbiBuilder checks the patch * map at each position via prefix matching. Use this during dictionary * iteration to test new entries before committing to a full rebuild. * * @param {string} csvLine IPADIC CSV: surface,left_id,right_id,cost,pos1,pos2,pos3,pos4,conj_type,conj_form,basic_form,reading,pronunciation */ Tokenizer.prototype.addEntry = function (csvLine) { this.viterbi_builder.addPatchEntry(csvLine); }; /** * Load multiple hot-patch entries from a CSV string (one entry per line). * Lines starting with # are comments. * @param {string} csv */ Tokenizer.prototype.loadPatchCsv = function (csv) { let lines = csv.split('\n'); for (let i = 0; i < lines.length; i++) { let line = lines[i].trim(); if (!line || line[0] === '#') continue; this.addEntry(line); } }; /** * Split into sentence by punctuation * @param {string} input Input text * @returns {Array.<string>} Sentences end with punctuation */ Tokenizer.splitByPunctuation = function (input) { let sentences = []; let tail = input; while (true) { if (tail === "") { break; } let index = tail.search(PUNCTUATION); if (index < 0) { sentences.push(tail); break; } sentences.push(tail.substring(0, index + 1)); tail = tail.substring(index + 1); } return sentences; }; /** * Tokenize text * @param {string} text Input text to analyze * @returns {Array} Tokens */ Tokenizer.prototype.tokenize = function (text) { if (text == null || text === '') return []; let sentences = Tokenizer.splitByPunctuation(text); let tokens = []; for (let i = 0; i < sentences.length; i++) { let sentence = sentences[i]; this.tokenizeForSentence(sentence, tokens); } return tokens; }; Tokenizer.prototype.tokenizeForSentence = function (sentence, tokens) { if (tokens == null) { tokens = []; } let lattice = this.getLattice(sentence); let best_path = this.viterbi_searcher.search(lattice); let last_pos = 0; if (tokens.length > 0) { last_pos = tokens[tokens.length - 1].word_position; } for (let j = 0; j < best_path.length; j++) { let node = best_path[j]; let token, features, features_line; if (node.type === "KNOWN") { // Check if this is a hot-patch entry (negative ID) let patch_features = this.viterbi_builder._patch_features; if (node.name < 0 && patch_features && patch_features[node.name]) { features_line = patch_features[node.name]; } else { features_line = this.token_info_dictionary.getFeatures(node.name); } if (features_line == null) { features = []; } else { features = features_line.split(","); } token = this.formatter.formatEntry(node.name, last_pos + node.start_pos, node.type, features); } else if (node.type === "UNKNOWN") { // Unknown word features_line = this.unknown_dictionary.getFeatures(node.name); if (features_line == null) { features = []; } else { features = features_line.split(","); } token = this.formatter.formatUnknownEntry(node.name, last_pos + node.start_pos, node.type, features, node.surface_form); } else { // TODO User dictionary token = this.formatter.formatEntry(node.name, last_pos + node.start_pos, node.type, []); } tokens.push(token); } return tokens; }; /** * Build word lattice * @param {string} text Input text to analyze * @returns {ViterbiLattice} Word lattice */ Tokenizer.prototype.getLattice = function (text) { return this.viterbi_builder.build(text); }; module.exports = Tokenizer;