kuromoji
JavaScript implementation of Japanese morphological analyzer
/*
* Copyright 2014 Takuya Asano
* Copyright 2010-2014 Atilika Inc. and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
"use strict";
var ViterbiBuilder = require("./viterbi/ViterbiBuilder");
var ViterbiSearcher = require("./viterbi/ViterbiSearcher");
var IpadicFormatter = require("./util/IpadicFormatter");
var PUNCTUATION = /、|。/;
/**
* Tokenizer
* @param {DynamicDictionaries} dic Dictionaries used by this tokenizer
* @constructor
*/
function Tokenizer(dic) {
this.token_info_dictionary = dic.token_info_dictionary;
this.unknown_dictionary = dic.unknown_dictionary;
this.viterbi_builder = new ViterbiBuilder(dic);
this.viterbi_searcher = new ViterbiSearcher(dic.connection_costs);
this.formatter = new IpadicFormatter(); // TODO Other dictionaries
}
/**
* Split input text into sentences by punctuation
* @param {string} input Input text
* @returns {Array.<string>} Sentences, each ending with its punctuation
*/
Tokenizer.splitByPunctuation = function (input) {
var sentences = [];
var tail = input;
while (true) {
if (tail === "") {
break;
}
var index = tail.search(PUNCTUATION);
if (index < 0) {
sentences.push(tail);
break;
}
sentences.push(tail.substring(0, index + 1));
tail = tail.substring(index + 1);
}
return sentences;
};
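/*
 * Example (illustrative): splitting on the Japanese comma and full stop keeps
 * the punctuation attached to the end of each segment.
 *
 *   Tokenizer.splitByPunctuation("今日は晴れ。明日は雨、かもしれない。")
 *   // => ["今日は晴れ。", "明日は雨、", "かもしれない。"]
 */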
/**
* Tokenize text
* @param {string} text Input text to analyze
* @returns {Array} Tokens
*/
Tokenizer.prototype.tokenize = function (text) {
var sentences = Tokenizer.splitByPunctuation(text);
var tokens = [];
for (var i = 0; i < sentences.length; i++) {
var sentence = sentences[i];
this.tokenizeForSentence(sentence, tokens);
}
return tokens;
};
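/*
 * Example (illustrative): each token is a plain object built by IpadicFormatter,
 * with fields such as surface_form, word_position, pos, basic_form and reading
 * (see IpadicFormatter for the full list).
 *
 *   tokenizer.tokenize("すもももももももものうち")[0].surface_form
 *   // => "すもも" (expected segmentation with the bundled IPADIC dictionary)
 */
/**
 * Tokenize a single sentence and append the resulting tokens
 * @param {string} sentence Sentence to analyze
 * @param {Array} [tokens] Token array to append to; created when not given
 * @returns {Array} Tokens
 */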
Tokenizer.prototype.tokenizeForSentence = function (sentence, tokens) {
if (tokens == null) {
tokens = [];
}
var lattice = this.getLattice(sentence);
var best_path = this.viterbi_searcher.search(lattice);
var last_pos = 0;
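// Offset positions by the previous sentence's last token so that
// word_position stays relative to the whole input text, not just this sentence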
if (tokens.length > 0) {
last_pos = tokens[tokens.length - 1].word_position;
}
for (var j = 0; j < best_path.length; j++) {
var node = best_path[j];
var token, features, features_line;
if (node.type === "KNOWN") {
features_line = this.token_info_dictionary.getFeatures(node.name);
if (features_line == null) {
features = [];
} else {
features = features_line.split(",");
}
token = this.formatter.formatEntry(node.name, last_pos + node.start_pos, node.type, features);
} else if (node.type === "UNKNOWN") {
// Unknown word
features_line = this.unknown_dictionary.getFeatures(node.name);
if (features_line == null) {
features = [];
} else {
features = features_line.split(",");
}
token = this.formatter.formatUnknownEntry(node.name, last_pos + node.start_pos, node.type, features, node.surface_form);
} else {
// TODO User dictionary
token = this.formatter.formatEntry(node.name, last_pos + node.start_pos, node.type, []);
}
tokens.push(token);
}
return tokens;
};
/**
* Build word lattice
* @param {string} text Input text to analyze
* @returns {ViterbiLattice} Word lattice
*/
Tokenizer.prototype.getLattice = function (text) {
return this.viterbi_builder.build(text);
};
module.exports = Tokenizer;
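/*
 * Usage sketch (illustrative): a Tokenizer is normally obtained through the
 * kuromoji builder rather than constructed directly. The dictionary path below
 * is an assumption and depends on where the dictionary files are installed.
 *
 *   var kuromoji = require("kuromoji");
 *   kuromoji.builder({ dicPath: "path/to/dictionary/dir/" }).build(function (err, tokenizer) {
 *       if (err) { throw err; }
 *       var tokens = tokenizer.tokenize("すもももももももものうち");
 *       console.log(tokens);
 *   });
 */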