kuromoji
Version:
JavaScript implementation of Japanese morphological analyzer
206 lines (182 loc) • 8.01 kB
JavaScript
/*
* Copyright 2014 Takuya Asano
* Copyright 2010-2014 Atilika Inc. and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
;
var InvokeDefinitionMap = require("./InvokeDefinitionMap");
var CharacterClass = require("./CharacterClass");
var SurrogateAwareString = require("../util/SurrogateAwareString");
// Name of the fallback category: its class ID is looked up whenever a
// character has no usable class of its own.
var DEFAULT_CATEGORY = "DEFAULT";
/**
 * In-memory form of the char.def file: the per-code-point category tables
 * that define how unknown words are processed.
 *
 * A new instance starts with zero-filled tables covering every UCS2 code
 * point and without an invoke definition map.
 * @constructor
 */
function CharacterDefinition() {
    // One table slot per UCS2 code point (0x0000-0xFFFF).
    var UCS2_CODE_POINT_COUNT = 0x10000;

    // Primary class ID of each code point.
    this.character_category_map = new Uint8Array(UCS2_CODE_POINT_COUNT);
    // Bitset of compatible class IDs of each code point (bit n <=> class ID n).
    this.compatible_category_map = new Uint32Array(UCS2_CODE_POINT_COUNT);
    // Not available until assigned from outside (CharacterDefinition.load does so).
    this.invoke_definition_map = null;
}
/**
 * Build a CharacterDefinition from already prepared tables.
 *
 * The two category tables are adopted as they are (no copy is made); the
 * invoke definitions are produced by InvokeDefinitionMap.load.
 * @param {Uint8Array} cat_map_buffer Primary class ID per code point
 * @param {Uint32Array} compat_cat_map_buffer Compatible class bitset per code point
 * @param {*} invoke_def_buffer Passed unchanged to InvokeDefinitionMap.load;
 *     the expected type is whatever that loader accepts
 * @returns {CharacterDefinition}
 */
CharacterDefinition.load = function (cat_map_buffer, compat_cat_map_buffer, invoke_def_buffer) {
    var loaded = new CharacterDefinition();

    // Replace the constructor's empty tables with the supplied ones.
    loaded.character_category_map = cat_map_buffer;
    loaded.compatible_category_map = compat_cat_map_buffer;
    loaded.invoke_definition_map = InvokeDefinitionMap.load(invoke_def_buffer);

    return loaded;
};
/**
 * Turn one parsed category definition (name, INVOKE, GROUP, LENGTH) into a
 * CharacterClass.
 * @param {number} class_id ID handed to the new CharacterClass
 * @param {Array} parsed_category_def Indexes 1-4 hold the category name and
 *     the INVOKE, GROUP and LENGTH fields as text
 * @returns {CharacterClass} the new class, or null when a field is invalid
 *     (the problem is reported through console.log)
 */
CharacterDefinition.parseCharCategory = function (class_id, parsed_category_def) {
    // A flag field is valid only when it parses to exactly 0 or 1; NaN from
    // non-numeric text fails both comparisons.
    function is_binary_flag(value) {
        return value === 0 || value === 1;
    }

    var name = parsed_category_def[1];
    var invoke_flag = parseInt(parsed_category_def[2]);
    var group_flag = parseInt(parsed_category_def[3]);
    var length_limit = parseInt(parsed_category_def[4]);

    if (!is_binary_flag(invoke_flag)) {
        console.log("char.def parse error. INVOKE is 0 or 1 in:" + invoke_flag);
        return null;
    }
    if (!is_binary_flag(group_flag)) {
        console.log("char.def parse error. GROUP is 0 or 1 in:" + group_flag);
        return null;
    }
    // Any finite, non-negative length is accepted (0 included).
    if (!(isFinite(length_limit) && length_limit >= 0)) {
        console.log("char.def parse error. LENGTH is 1 to n:" + length_limit);
        return null;
    }

    return new CharacterClass(class_id, name, invoke_flag === 1, group_flag === 1, length_limit);
};
/**
 * Turn a parsed single-code-point mapping (CODE, default category, optional
 * compatible categories) into a mapping object.
 *
 * A CODE that is not a UCS2 code point is reported through console.log, but
 * the mapping is returned regardless.
 * @param {Array} parsed_category_mapping Index 1 holds the code point text,
 *     index 2 the default category, indexes 3 and up the compatible categories
 * @returns {{start: number, default: string, compatible: Array}}
 */
CharacterDefinition.parseCategoryMapping = function (parsed_category_mapping) {
    // parseInt is called without a radix, so a "0x" prefix selects hexadecimal.
    var code_point = parseInt(parsed_category_mapping[1]);
    var primary_category = parsed_category_mapping[2];

    var extra_categories = [];
    if (parsed_category_mapping.length > 3) {
        extra_categories = parsed_category_mapping.slice(3);
    }

    var in_ucs2_range = isFinite(code_point) && code_point >= 0 && code_point <= 0xFFFF;
    if (!in_ucs2_range) {
        console.log("char.def parse error. CODE is invalid:" + code_point);
    }

    return {
        start: code_point,
        default: primary_category,
        compatible: extra_categories
    };
};
/**
 * Turn a parsed code-point-range mapping (start CODE, end CODE, default
 * category, optional compatible categories) into a mapping object.
 *
 * A bound that is not a UCS2 code point is reported through console.log, but
 * the mapping is returned regardless.
 * @param {Array} parsed_category_mapping Indexes 1 and 2 hold the start and
 *     end code point text, index 3 the default category, indexes 4 and up the
 *     compatible categories
 * @returns {{start: number, end: number, default: string, compatible: Array}}
 */
CharacterDefinition.parseRangeCategoryMapping = function (parsed_category_mapping) {
    // Logs the parse error for a bound lying outside 0x0000-0xFFFF (or NaN).
    function report_if_invalid(code) {
        var in_ucs2_range = isFinite(code) && code >= 0 && code <= 0xFFFF;
        if (!in_ucs2_range) {
            console.log("char.def parse error. CODE is invalid:" + code);
        }
    }

    // parseInt is called without a radix, so a "0x" prefix selects hexadecimal.
    var range_start = parseInt(parsed_category_mapping[1]);
    var range_end = parseInt(parsed_category_mapping[2]);
    var primary_category = parsed_category_mapping[3];

    var extra_categories = [];
    if (parsed_category_mapping.length > 4) {
        extra_categories = parsed_category_mapping.slice(4);
    }

    report_if_invalid(range_start);
    report_if_invalid(range_end);

    return {
        start: range_start,
        end: range_end,
        default: primary_category,
        compatible: extra_categories
    };
};
/**
 * Fill the per-code-point category tables from parsed char.def mappings.
 *
 * For every mapping, each code point from start to end (only start when the
 * mapping has no end) receives the class ID of the mapping's default category
 * as its primary class, and one bit per resolvable compatible category is
 * OR-ed into its compatible bitset (bit n stands for class ID n, so only
 * class IDs below 32 fit into the 32-bit entries). Later mappings override
 * the primary class set by earlier ones.
 *
 * Afterwards every code point that still has no primary class gets the class
 * ID of the DEFAULT category, provided that category is defined.
 *
 * @param {Array} category_mapping Mapping objects with start, optional end,
 *     default and compatible properties (the shape returned by
 *     parseCategoryMapping / parseRangeCategoryMapping); may be null
 */
CharacterDefinition.prototype.initCategoryMappings = function (category_mapping) {
    var invoke_map = this.invoke_definition_map;
    var category_map = this.character_category_map;
    var compatible_map = this.compatible_category_map;
    // Writes past the end of the tables have no effect, so ranges are clamped
    // to the last slot that exists in either table.
    var last_index = Math.max(category_map.length, compatible_map.length) - 1;
    // Marks code points given a primary class by a mapping, so that an
    // explicitly assigned class ID 0 is not mistaken for "never set".
    var assigned = new Uint8Array(category_map.length);
    var code_point;

    if (category_mapping != null) {
        for (var i = 0; i < category_mapping.length; i++) {
            var mapping = category_mapping[i];

            // The lookups depend on the mapping only, not on the code point,
            // so they are resolved once per mapping.
            var primary_id = invoke_map.lookup(mapping.default);
            var has_primary = (primary_id != null);

            var compatible_bits = 0;
            var compatible = mapping.compatible || [];
            for (var j = 0; j < compatible.length; j++) {
                var compatible_category = compatible[j];
                if (compatible_category == null) {
                    continue;
                }
                var class_id = invoke_map.lookup(compatible_category);
                if (class_id == null) {
                    continue;
                }
                // Set the bit of this class ID, e.g. class ID 3 sets bit 3.
                compatible_bits = compatible_bits | (1 << class_id);
            }

            var end = mapping.end || mapping.start;
            var first = Math.max(mapping.start, 0);
            var last = Math.min(end, last_index);
            for (code_point = first; code_point <= last; code_point++) {
                // An unresolvable default category leaves the slot unset (0)
                // so that it is filled with DEFAULT below.
                category_map[code_point] = has_primary ? primary_id : 0;
                assigned[code_point] = has_primary ? 1 : 0;
                if (compatible_bits !== 0) {
                    compatible_map[code_point] = compatible_map[code_point] | compatible_bits;
                }
            }
        }
    }

    var default_id = invoke_map.lookup(DEFAULT_CATEGORY);
    if (default_id == null) {
        return;
    }
    for (code_point = 0; code_point < category_map.length; code_point++) {
        // Only code points for which no other class was defined become DEFAULT.
        // This table holds class IDs (lookup() reads it that way), so the ID
        // itself is stored, not a bit mask derived from it.
        if (assigned[code_point] === 0 && category_map[code_point] === 0) {
            category_map[code_point] = default_id;
        }
    }
};
/**
 * Collect the compatible character classes of a character. The primary
 * (first) category is not part of the result.
 *
 * No surrogate pair check is made here: whatever the first UTF-16 code unit
 * is, its table entry is used.
 * @param {string} ch UCS2 character (only the first code unit is examined)
 * @returns {Array.<CharacterClass>} compatible classes in ascending class ID
 *     order; empty when the character has none
 */
CharacterDefinition.prototype.lookupCompatibleCategory = function (ch) {
    var found = [];
    var code_unit = ch.charCodeAt(0);

    // NaN (empty string) and code units beyond the table fail this comparison.
    if (!(code_unit < this.compatible_category_map.length)) {
        return found;
    }

    var bits = this.compatible_category_map[code_unit];
    if (bits == null || bits === 0) {
        return found;
    }

    // Bit n of the entry stands for class ID n.
    for (var class_id = 0; class_id < 32; class_id++) {
        if (((bits >>> class_id) & 1) !== 1) {
            continue;
        }
        var character_class = this.invoke_definition_map.getCharacterClass(class_id);
        if (character_class != null) {
            found.push(character_class);
        }
    }
    return found;
};
/**
 * Find the primary character class of a character.
 *
 * Surrogate pairs, code units outside the category table, and entries that
 * yield no class ID all resolve to the DEFAULT category.
 * @param {string} ch UCS2 character (only the first code unit is examined)
 * @returns {CharacterClass} character class
 */
CharacterDefinition.prototype.lookup = function (ch) {
    var definitions = this.invoke_definition_map;
    var code_unit = ch.charCodeAt(0);
    var resolved_id;

    if (SurrogateAwareString.isSurrogatePair(ch)) {
        // char.def cannot define surrogate pair code points, so they are DEFAULT.
        resolved_id = definitions.lookup(DEFAULT_CATEGORY);
    } else {
        var category_table = this.character_category_map;
        if (code_unit < category_table.length) {
            // The table entry is the class ID itself.
            resolved_id = category_table[code_unit];
        }
    }

    // Nothing found (or DEFAULT itself was not resolved above): ask for DEFAULT.
    if (resolved_id == null) {
        resolved_id = definitions.lookup(DEFAULT_CATEGORY);
    }
    return definitions.getCharacterClass(resolved_id);
};
// The constructor (with its static load/parse helpers) is the module's sole export.
module.exports = CharacterDefinition;