// kuromoji — JavaScript implementation of a Japanese morphological analyzer
/*
* Copyright 2014 Takuya Asano
* Copyright 2010-2014 Atilika Inc. and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
var expect = require("chai").expect;
var kuromoji = require("../src/kuromoji.js"); // Not to be browserify-ed
var Tokenizer = require("../src/Tokenizer");
var DIC_DIR = "dict/";
// Unit tests for Tokenizer.splitByPunctuation (static, no dictionary needed).
// Each case has a distinct description so a failure is identifiable in the
// Mocha report (previously all five were named "splitByPunctuation").
describe("Tokenizer static method test", function () {
    it("splitByPunctuation does not split text without punctuation", function () {
        expect(Tokenizer.splitByPunctuation("すもももももももものうち"))
            .to.deep.eql(["すもももももももものうち"]);
    });
    it("splitByPunctuation keeps a lone comma as one segment", function () {
        expect(Tokenizer.splitByPunctuation("、"))
            .to.deep.eql(["、"]);
    });
    it("splitByPunctuation keeps a lone full stop as one segment", function () {
        expect(Tokenizer.splitByPunctuation("。"))
            .to.deep.eql(["。"]);
    });
    it("splitByPunctuation splits after each comma and full stop", function () {
        expect(Tokenizer.splitByPunctuation("すもも、も、もも。もも、も、もも。"))
            .to.deep.eql(["すもも、", "も、", "もも。", "もも、", "も、", "もも。"]);
    });
    it("splitByPunctuation handles UTF-16 surrogate pairs", function () {
        expect(Tokenizer.splitByPunctuation("、𠮷野屋。漢字。"))
            .to.deep.eql(["、", "𠮷野屋。", "漢字。"]);
    });
});
// Integration tests that build a real tokenizer from the IPADic binary
// dictionaries under DIC_DIR and check tokenization output.
describe("Tokenizer for IPADic", function () {
    var tokenizer = null; // target object, assigned in before()

    before(function (done) {
        this.timeout(5 * 60 * 1000); // dictionary loading can be slow: allow 5 min
        kuromoji.builder({ dicPath: DIC_DIR }).build(function (error, _tokenizer) {
            // Fail fast with the builder's own error instead of ignoring it;
            // otherwise every test below would crash on a null tokenizer with
            // a misleading message.
            if (error) {
                return done(error);
            }
            tokenizer = _tokenizer;
            expect(tokenizer).to.be.a("object");
            done();
        });
    });

    it("Sentence すもももももももものうち is tokenized properly", function () {
        var path = tokenizer.tokenize("すもももももももものうち");
        // Full expected token stream; word_position is 1-based and counted in
        // characters of the input sentence.
        var expected_tokens = [
            {
                word_type: "KNOWN",
                word_position: 1,
                surface_form: "すもも",
                pos: "名詞",
                pos_detail_1: "一般",
                pos_detail_2: "*",
                pos_detail_3: "*",
                conjugated_type: "*",
                conjugated_form: "*",
                basic_form: "すもも",
                reading: "スモモ",
                pronunciation: "スモモ"
            },
            {
                word_type: "KNOWN",
                word_position: 4,
                surface_form: "も",
                pos: "助詞",
                pos_detail_1: "係助詞",
                pos_detail_2: "*",
                pos_detail_3: "*",
                conjugated_type: "*",
                conjugated_form: "*",
                basic_form: "も",
                reading: "モ",
                pronunciation: "モ"
            },
            {
                word_type: "KNOWN",
                word_position: 5,
                surface_form: "もも",
                pos: "名詞",
                pos_detail_1: "一般",
                pos_detail_2: "*",
                pos_detail_3: "*",
                conjugated_type: "*",
                conjugated_form: "*",
                basic_form: "もも",
                reading: "モモ",
                pronunciation: "モモ"
            },
            {
                word_type: "KNOWN",
                word_position: 7,
                surface_form: "も",
                pos: "助詞",
                pos_detail_1: "係助詞",
                pos_detail_2: "*",
                pos_detail_3: "*",
                conjugated_type: "*",
                conjugated_form: "*",
                basic_form: "も",
                reading: "モ",
                pronunciation: "モ"
            },
            {
                word_type: "KNOWN",
                word_position: 8,
                surface_form: "もも",
                pos: "名詞",
                pos_detail_1: "一般",
                pos_detail_2: "*",
                pos_detail_3: "*",
                conjugated_type: "*",
                conjugated_form: "*",
                basic_form: "もも",
                reading: "モモ",
                pronunciation: "モモ"
            },
            {
                word_type: "KNOWN",
                word_position: 10,
                surface_form: "の",
                pos: "助詞",
                pos_detail_1: "連体化",
                pos_detail_2: "*",
                pos_detail_3: "*",
                conjugated_type: "*",
                conjugated_form: "*",
                basic_form: "の",
                reading: "ノ",
                pronunciation: "ノ"
            },
            {
                word_type: "KNOWN",
                word_position: 11,
                surface_form: "うち",
                pos: "名詞",
                pos_detail_1: "非自立",
                pos_detail_2: "副詞可能",
                pos_detail_3: "*",
                conjugated_type: "*",
                conjugated_form: "*",
                basic_form: "うち",
                reading: "ウチ",
                pronunciation: "ウチ"
            }
        ];
        expect(path).to.have.length(7);
        // Compare property-by-property so a mismatch reports the offending key.
        for (var i = 0; i < expected_tokens.length; i++) {
            var expected_token = expected_tokens[i];
            var target_token = path[i];
            for (var key in expected_token) {
                expect(target_token).to.have.property(key, expected_token[key]);
            }
        }
    });

    it("Sentence include unknown words となりのトトロ are tokenized properly", function () {
        var path = tokenizer.tokenize("となりのトトロ");
        expect(path).to.have.length(3);
    });

    it("研究 is not split", function () {
        var path = tokenizer.tokenize("研究");
        expect(path).to.have.length(1);
    });

    it("Blank input", function () {
        var path = tokenizer.tokenize("");
        expect(path).to.have.length(0);
    });

    it("Sentence include UTF-16 surrogate pair", function () {
        var path = tokenizer.tokenize("𠮷野屋");
        expect(path).to.have.length(3);
        // Positions must advance by one per token even though 𠮷 occupies two
        // UTF-16 code units.
        path.forEach(function (token, i) {
            expect(token.word_position).to.eql(i + 1);
        });
    });

    it("Sentence include punctuation あ、あ。あ、あ。 returns correct positions", function () {
        var path = tokenizer.tokenize("あ、あ。あ、あ。");
        expect(path).to.have.length(8);
        // Every character is its own token, so positions are simply 1..8.
        path.forEach(function (token, i) {
            expect(token.word_position).to.eql(i + 1);
        });
    });
});