UNPKG

kusamoji

Version:

Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy

195 lines (172 loc) 7.91 kB
"use strict"; let InvokeDefinitionMap = require("./InvokeDefinitionMap"); let CharacterClass = require("./CharacterClass"); let SurrogateAwareString = require("../util/SurrogateAwareString"); let DEFAULT_CATEGORY = "DEFAULT"; /** * CharacterDefinition represents char.def file and * defines behavior of unknown word processing * @constructor */ function CharacterDefinition() { this.character_category_map = new Uint8Array(65536); // for all UCS2 code points this.compatible_category_map = new Uint32Array(65536); // for all UCS2 code points this.invoke_definition_map = null; } /** * Load CharacterDefinition * @param {Uint8Array} cat_map_buffer * @param {Uint32Array} compat_cat_map_buffer * @param {InvokeDefinitionMap} invoke_def_buffer * @returns {CharacterDefinition} */ CharacterDefinition.load = function (cat_map_buffer, compat_cat_map_buffer, invoke_def_buffer) { let char_def = new CharacterDefinition(); char_def.character_category_map = cat_map_buffer; char_def.compatible_category_map = compat_cat_map_buffer; char_def.invoke_definition_map = InvokeDefinitionMap.load(invoke_def_buffer); return char_def; }; CharacterDefinition.parseCharCategory = function (class_id, parsed_category_def) { let category = parsed_category_def[1]; let invoke = parseInt(parsed_category_def[2], 10); let grouping = parseInt(parsed_category_def[3], 10); let max_length = parseInt(parsed_category_def[4], 10); if (!isFinite(invoke) || (invoke !== 0 && invoke !== 1)) { console.log("char.def parse error. INVOKE is 0 or 1 in:" + invoke); return null; } if (!isFinite(grouping) || (grouping !== 0 && grouping !== 1)) { console.log("char.def parse error. GROUP is 0 or 1 in:" + grouping); return null; } if (!isFinite(max_length) || max_length < 0) { console.log("char.def parse error. LENGTH is 1 to n:" + max_length); return null; } let is_invoke = (invoke === 1); let is_grouping = (grouping === 1); return new CharacterClass(class_id, category, is_invoke, is_grouping, max_length); }; CharacterDefinition.parseCategoryMapping = function (parsed_category_mapping) { // char.def code points are hex (e.g. "0x3042"); parse as base 16, not base 10 // (parseInt("0x3042", 10) === 0, which collapses every mapping onto code point 0). let start = parseInt(parsed_category_mapping[1], 16); let default_category = parsed_category_mapping[2]; let compatible_category = (3 < parsed_category_mapping.length) ? parsed_category_mapping.slice(3) : []; if (!isFinite(start) || start < 0 || start > 0xFFFF) { console.log("char.def parse error. CODE is invalid:" + start); } return { start: start, default: default_category, compatible: compatible_category}; }; CharacterDefinition.parseRangeCategoryMapping = function (parsed_category_mapping) { // char.def range code points are hex (e.g. "0x3041" "0x3096"); parse as base 16. let start = parseInt(parsed_category_mapping[1], 16); let end = parseInt(parsed_category_mapping[2], 16); let default_category = parsed_category_mapping[3]; let compatible_category = (4 < parsed_category_mapping.length) ? parsed_category_mapping.slice(4) : []; if (!isFinite(start) || start < 0 || start > 0xFFFF) { console.log("char.def parse error. CODE is invalid:" + start); } if (!isFinite(end) || end < 0 || end > 0xFFFF) { console.log("char.def parse error. CODE is invalid:" + end); } return { start: start, end: end, default: default_category, compatible: compatible_category}; }; /** * Initializing method * @param {Array} category_mapping Array of category mapping */ CharacterDefinition.prototype.initCategoryMappings = function (category_mapping) { // Initialize map by DEFAULT class let code_point; if (category_mapping != null) { for (let i = 0; i < category_mapping.length; i++) { let mapping = category_mapping[i]; let end = mapping.end || mapping.start; for (code_point = mapping.start; code_point <= end; code_point++) { // Default Category class ID this.character_category_map[code_point] = this.invoke_definition_map.lookup(mapping.default); for (let j = 0; j < mapping.compatible.length; j++) { let bitset = this.compatible_category_map[code_point]; let compatible_category = mapping.compatible[j]; if (compatible_category == null) { continue; } let class_id = this.invoke_definition_map.lookup(compatible_category); // Default Category if (class_id == null) { continue; } let class_id_bit = 1 << class_id; bitset = bitset | class_id_bit; // Set a bit of class ID 例えば、class_idが3のとき、3ビット目に1を立てる this.compatible_category_map[code_point] = bitset; } } } } let default_id = this.invoke_definition_map.lookup(DEFAULT_CATEGORY); if (default_id == null) { return; } for (code_point = 0; code_point < this.character_category_map.length; code_point++) { // 他に何のクラスも定義されていなかったときだけ DEFAULT if (this.character_category_map[code_point] === 0) { // Store the RAW class id (matching the mapped-cell write above and the // lookup() reader, which reads this map as a class id — NOT a bitset like // compatible_category_map). The previous `1 << default_id` wrote a bitset // here, so with default_id=0 every DEFAULT cell read back as class 1 (SPACE). this.character_category_map[code_point] = default_id; } } }; /** * Lookup compatible categories for a character (not included 1st category) * @param {string} ch UCS2 character (just 1st character is effective) * @returns {Array.<CharacterClass>} character classes */ CharacterDefinition.prototype.lookupCompatibleCategory = function (ch) { let classes = []; /* if (SurrogateAwareString.isSurrogatePair(ch)) { // Surrogate pair character codes can not be defined by char.def return classes; }*/ let code = ch.charCodeAt(0); let integer; if (code < this.compatible_category_map.length) { integer = this.compatible_category_map[code]; // Bitset } if (integer == null || integer === 0) { return classes; } for (let bit = 0; bit < 32; bit++) { // Treat "bit" as a class ID if (((integer << (31 - bit)) >>> 31) === 1) { let character_class = this.invoke_definition_map.getCharacterClass(bit); if (character_class == null) { continue; } classes.push(character_class); } } return classes; }; /** * Lookup category for a character * @param {string} ch UCS2 character (just 1st character is effective) * @returns {CharacterClass} character class */ CharacterDefinition.prototype.lookup = function (ch) { let class_id; let code = ch.charCodeAt(0); if (SurrogateAwareString.isSurrogatePair(ch)) { // Surrogate pair character codes can not be defined by char.def, so set DEFAULT category class_id = this.invoke_definition_map.lookup(DEFAULT_CATEGORY); } else if (code < this.character_category_map.length) { class_id = this.character_category_map[code]; // Read as integer value } if (class_id == null) { class_id = this.invoke_definition_map.lookup(DEFAULT_CATEGORY); } return this.invoke_definition_map.getCharacterClass(class_id); }; module.exports = CharacterDefinition;