UNPKG

@patdx/kuromoji

Version:

JavaScript implementation of Japanese morphological analyzer

1,683 lines (1,664 loc) 56.2 kB
// src/viterbi/ViterbiNode.ts
/**
 * One node of the Viterbi lattice.
 *
 * `shortest_cost` starts at 0 for the "BOS" node and at Number.MAX_VALUE for
 * every other type; `prev` starts as null. Both are filled in later by
 * ViterbiSearcher.forward().
 */
var ViterbiNode = class {
  start_pos;
  length;
  name;
  cost;
  left_id;
  right_id;
  prev;
  surface_form;
  shortest_cost;
  type;
  /**
   * @param {number} node_name - Stored as `name` (the builder passes a dictionary offset).
   * @param {number} node_cost - Stored as `cost`.
   * @param {number} start_pos - Start position of the node.
   * @param {number} length - Length of the node.
   * @param {string} type - "BOS", "EOS", "KNOWN" or "UNKNOWN" in the visible callers.
   * @param {number} left_id
   * @param {number} right_id
   * @param {string} surface_form
   */
  constructor(node_name, node_cost, start_pos, length, type, left_id, right_id, surface_form) {
    this.name = node_name;
    this.cost = node_cost;
    this.start_pos = start_pos;
    this.length = length;
    this.left_id = left_id;
    this.right_id = right_id;
    this.prev = null;
    this.surface_form = surface_form;
    this.shortest_cost = type === "BOS" ? 0 : Number.MAX_VALUE;
    this.type = type;
  }
};
var ViterbiNode_default = ViterbiNode;

// src/viterbi/ViterbiLattice.ts
/**
 * Lattice of ViterbiNodes, bucketed by the position at which each node ends.
 * Bucket 0 always holds the single BOS node.
 */
var ViterbiLattice = class {
  nodes_end_at;
  eos_pos;
  constructor() {
    this.nodes_end_at = [];
    this.nodes_end_at[0] = [new ViterbiNode_default(-1, 0, 0, 0, "BOS", 0, 0, "")];
    this.eos_pos = 1;
  }
  /**
   * Add a node to the bucket for its end position
   * (`start_pos + length - 1`), growing `eos_pos` if necessary.
   * @param {ViterbiNode} node
   */
  append(node) {
    const last_pos = node.start_pos + node.length - 1;
    if (this.eos_pos < last_pos) {
      this.eos_pos = last_pos;
    }
    let prev_nodes = this.nodes_end_at[last_pos];
    if (prev_nodes == null) {
      prev_nodes = [];
    }
    prev_nodes.push(node);
    this.nodes_end_at[last_pos] = prev_nodes;
  }
  /** Append the EOS node in a new bucket after the last existing one. */
  appendEos() {
    const last_index = this.nodes_end_at.length;
    this.eos_pos++;
    this.nodes_end_at[last_index] = [
      new ViterbiNode_default(-1, 0, this.eos_pos, 0, "EOS", 0, 0, "")
    ];
  }
};
var ViterbiLattice_default = ViterbiLattice;

// src/util/SurrogateAwareString.ts
/**
 * String wrapper whose indices count a UTF-16 surrogate pair as one character.
 * `index_mapping[i]` is the UTF-16 offset in `str` of the i-th character.
 */
var SurrogateAwareString = class _SurrogateAwareString {
  length;
  str;
  index_mapping;
  /** @param {string} str */
  constructor(str) {
    this.str = str;
    this.index_mapping = [];
    for (let pos = 0; pos < str.length; pos++) {
      const ch = str.charAt(pos);
      this.index_mapping.push(pos);
      if (_SurrogateAwareString.isSurrogatePair(ch)) {
        // Skip the unit that follows a high surrogate.
        pos++;
      }
    }
    this.length = this.index_mapping.length;
  }
  /**
   * @param {number} index - Surrogate-aware start index.
   * @returns {string} The rest of the string from `index`, or "" when out of range.
   */
  slice(index) {
    if (this.index_mapping.length <= index) {
      return "";
    }
    const surrogate_aware_index = this.index_mapping[index];
    return this.str.slice(surrogate_aware_index);
  }
  /**
   * @param {number} index - Surrogate-aware index.
   * @returns {string} The character at `index` (two UTF-16 units for a
   *   surrogate pair), or "" when `index` is past the last character.
   */
  charAt(index) {
    // Bound by the number of characters, not UTF-16 units: with surrogate
    // pairs `str.length` is larger, and an index in between would map to
    // `undefined` and make slice() return the whole string.
    if (this.index_mapping.length <= index) {
      return "";
    }
    const surrogate_aware_start_index = this.index_mapping[index];
    const surrogate_aware_end_index = this.index_mapping[index + 1];
    if (surrogate_aware_end_index == null) {
      return this.str.slice(surrogate_aware_start_index);
    }
    return this.str.slice(
      surrogate_aware_start_index,
      surrogate_aware_end_index
    );
  }
  /**
   * @param {number} index - Surrogate-aware index.
   * @returns {number} The code point at `index` (pairs are combined), the raw
   *   unit for an unpaired surrogate, or NaN when out of range.
   */
  charCodeAt(index) {
    if (this.index_mapping.length <= index) {
      return NaN;
    }
    const surrogate_aware_index = this.index_mapping[index];
    const upper = this.str.charCodeAt(surrogate_aware_index);
    let lower;
    if (upper >= 55296 && upper <= 56319 && surrogate_aware_index < this.str.length) {
      lower = this.str.charCodeAt(surrogate_aware_index + 1);
      if (lower >= 56320 && lower <= 57343) {
        return (upper - 55296) * 1024 + lower - 56320 + 65536;
      }
    }
    return upper;
  }
  toString() {
    return this.str;
  }
  /**
   * @param {string} ch
   * @returns {boolean} True when the first UTF-16 unit of `ch` is a high
   *   surrogate (0xD800-0xDBFF).
   */
  static isSurrogatePair(ch) {
    const utf16_code = ch.charCodeAt(0);
    return utf16_code >= 55296 && utf16_code <= 56319;
  }
};
var SurrogateAwareString_default = SurrogateAwareString;

// src/viterbi/ViterbiBuilder.ts
/**
 * Builds a ViterbiLattice for a sentence from a trie, a token-info
 * dictionary and an unknown-word dictionary.
 */
var ViterbiBuilder = class {
  trie;
  token_info_dictionary;
  unknown_dictionary;
  /** @param {object} dic - Provides `trie`, `token_info_dictionary`, `unknown_dictionary`. */
  constructor(dic) {
    this.trie = dic.trie;
    this.token_info_dictionary = dic.token_info_dictionary;
    this.unknown_dictionary = dic.unknown_dictionary;
  }
  /**
   * @param {string} sentence_str
   * @returns {ViterbiLattice} Lattice with KNOWN/UNKNOWN nodes and a trailing EOS.
   * @throws {Error} When the unknown dictionary has no class for a character.
   */
  build(sentence_str) {
    const lattice = new ViterbiLattice_default();
    const sentence = new SurrogateAwareString_default(sentence_str);
    let key, trie_id, left_id, right_id, word_cost;
    for (let pos = 0; pos < sentence.length; pos++) {
      const tail = sentence.slice(pos);
      // Dictionary words that are a prefix of the remaining text.
      const vocabulary = this.trie.commonPrefixSearch(tail);
      for (let n = 0; n < vocabulary.length; n++) {
        trie_id = vocabulary[n].v;
        key = vocabulary[n].k;
        const token_info_ids = this.token_info_dictionary.target_map[trie_id];
        for (let i = 0; i < token_info_ids.length; i++) {
          const token_info_id = parseInt(token_info_ids[i], 10);
          // left id, right id and word cost are read at offsets +0, +2, +4.
          left_id = this.token_info_dictionary.dictionary.getShort(token_info_id);
          right_id = this.token_info_dictionary.dictionary.getShort(
            token_info_id + 2
          );
          word_cost = this.token_info_dictionary.dictionary.getShort(
            token_info_id + 4
          );
          lattice.append(
            new ViterbiNode_default(
              token_info_id,
              word_cost,
              pos + 1,
              key.length,
              "KNOWN",
              left_id,
              right_id,
              key
            )
          );
        }
      }
      const surrogate_aware_tail = new SurrogateAwareString_default(tail);
      const head_char = new SurrogateAwareString_default(surrogate_aware_tail.charAt(0));
      const head_char_class = this.unknown_dictionary.lookup(
        head_char.toString()
      );
      if (!head_char_class) {
        throw new Error("Unknown character: " + head_char);
      }
      if (vocabulary == null || vocabulary.length === 0 || head_char_class.is_always_invoke === 1) {
        // `key` starts as a SurrogateAwareString (length 1 even for a pair);
        // `+=` below turns it into a plain string once a character is appended.
        key = head_char;
        if (head_char_class.is_grouping === 1 && 1 < surrogate_aware_tail.length) {
          for (let k = 1; k < surrogate_aware_tail.length; k++) {
            const next_char = surrogate_aware_tail.charAt(k);
            const next_char_class = this.unknown_dictionary.lookup(next_char);
            // Stop grouping on a missing class instead of crashing on `.class_name`;
            // the missing class is reported when that character becomes the head.
            if (next_char_class == null || head_char_class.class_name !== next_char_class.class_name) {
              break;
            }
            key += next_char;
          }
        }
        const unk_ids = this.unknown_dictionary.target_map[head_char_class.class_id];
        for (let j = 0; j < unk_ids.length; j++) {
          const unk_id = parseInt(unk_ids[j], 10);
          left_id = this.unknown_dictionary.dictionary.getShort(unk_id);
          right_id = this.unknown_dictionary.dictionary.getShort(unk_id + 2);
          word_cost = this.unknown_dictionary.dictionary.getShort(unk_id + 4);
          lattice.append(
            new ViterbiNode_default(
              unk_id,
              word_cost,
              pos + 1,
              key.length,
              "UNKNOWN",
              left_id,
              right_id,
              key.toString()
            )
          );
        }
      }
    }
    lattice.appendEos();
    return lattice;
  }
};
var ViterbiBuilder_default = ViterbiBuilder;

// src/viterbi/ViterbiSearcher.ts
/** Finds the lowest-cost path through a ViterbiLattice. */
var ViterbiSearcher = class {
  connection_costs;
  /** @param {object} connection_costs - Must provide `get(right_id, left_id)`. */
  constructor(connection_costs) {
    this.connection_costs = connection_costs;
  }
  /**
   * @param {ViterbiLattice} lattice
   * @returns {ViterbiNode[]} Best path from first to last node, without BOS/EOS.
   */
  search(lattice) {
    lattice = this.forward(lattice);
    return this.backward(lattice);
  }
  /**
   * Forward pass: set `prev` and `shortest_cost` on every node that has at
   * least one predecessor bucket. Mutates and returns `lattice`.
   */
  forward(lattice) {
    let i, j, k;
    for (i = 1; i <= lattice.eos_pos; i++) {
      const nodes = lattice.nodes_end_at[i];
      if (nodes == null) {
        continue;
      }
      for (j = 0; j < nodes.length; j++) {
        const node = nodes[j];
        let cost = Number.MAX_VALUE;
        let shortest_prev_node = null;
        const prev_nodes = lattice.nodes_end_at[node.start_pos - 1];
        if (prev_nodes == null) {
          // Nothing ends right before this node; leave it unreachable.
          continue;
        }
        for (k = 0; k < prev_nodes.length; k++) {
          const prev_node = prev_nodes[k];
          let edge_cost;
          if (node.left_id == null || prev_node.right_id == null) {
            console.log("Left or right is null");
            edge_cost = 0;
          } else {
            edge_cost = this.connection_costs.get(
              prev_node.right_id,
              node.left_id
            );
          }
          const _cost = prev_node.shortest_cost + edge_cost + node.cost;
          if (_cost < cost) {
            shortest_prev_node = prev_node;
            cost = _cost;
          }
        }
        node.prev = shortest_prev_node;
        node.shortest_cost = cost;
      }
    }
    return lattice;
  }
  /**
   * Backward pass: follow `prev` links from EOS to BOS.
   * @returns {ViterbiNode[]} The path in forward order, or [] when a link is missing.
   */
  backward(lattice) {
    const shortest_path = [];
    const eos = lattice.nodes_end_at[lattice.nodes_end_at.length - 1][0];
    let node_back = eos.prev;
    if (node_back == null) {
      return [];
    }
    while (node_back.type !== "BOS") {
      shortest_path.push(node_back);
      if (node_back.prev == null) {
        return [];
      }
      node_back = node_back.prev;
    }
    return shortest_path.reverse();
  }
};
var ViterbiSearcher_default = ViterbiSearcher;

// src/util/IpadicFormatter.ts
/** Turns a feature array into a token object with named fields. */
var IpadicFormatter = class {
  /**
   * @param {number} word_id
   * @param {number} position
   * @param {string} type
   * @param {string[]} features - features[0..9] are mapped to the named fields.
   * @returns {object} Token object.
   */
  formatEntry(word_id, position, type, features) {
    return {
      word_id: word_id,
      word_type: type,
      word_position: position,
      surface_form: features[0],
      pos: features[1],
      pos_detail_1: features[2],
      pos_detail_2: features[3],
      pos_detail_3: features[4],
      conjugated_type: features[5],
      conjugated_form: features[6],
      basic_form: features[7],
      reading: features[8],
      pronunciation: features[9]
    };
  }
  /**
   * Like formatEntry, but the surface form comes from `surface_form` and no
   * `reading` / `pronunciation` fields are set.
   */
  formatUnknownEntry(word_id, position, type, features, surface_form) {
    return {
      word_id: word_id,
      word_type: type,
      word_position: position,
      surface_form: surface_form,
      pos: features[1],
      pos_detail_1: features[2],
      pos_detail_2: features[3],
      pos_detail_3: features[4],
      conjugated_type: features[5],
      conjugated_form: features[6],
      basic_form: features[7]
    };
  }
};
var IpadicFormatter_default = IpadicFormatter;

// src/Tokenizer.ts
var PUNCTUATION = /、|。/;
/** Splits text into sentences and tokenizes each one with Viterbi search. */
var Tokenizer = class _Tokenizer {
  token_info_dictionary;
  unknown_dictionary;
  viterbi_builder;
  viterbi_searcher;
  formatter;
  /**
   * @param {object} dic - Provides `token_info_dictionary`, `unknown_dictionary`,
   *   `connection_costs` and whatever ViterbiBuilder reads (`trie`).
   */
  constructor(dic) {
    this.token_info_dictionary = dic.token_info_dictionary;
    this.unknown_dictionary = dic.unknown_dictionary;
    this.viterbi_builder = new ViterbiBuilder_default(dic);
    this.viterbi_searcher = new ViterbiSearcher_default(dic.connection_costs);
    this.formatter = new IpadicFormatter_default();
  }
  /**
   * @param {string} text
   * @returns {object[]} Tokens of all sentences, in order.
   */
  tokenize(text) {
    const sentences = _Tokenizer.splitByPunctuation(text);
    const tokens = [];
    for (let i = 0; i < sentences.length; i++) {
      this.tokenizeForSentence(sentences[i], tokens);
    }
    return tokens;
  }
  /**
   * Tokenize one sentence and append the tokens to `tokens`.
   * @param {string} sentence
   * @param {object[]} [tokens] - Accumulator; a new array is used when omitted.
   * @returns {object[]} The accumulator.
   */
  tokenizeForSentence(sentence, tokens) {
    if (tokens == null) {
      tokens = [];
    }
    const lattice = this.getLattice(sentence);
    const best_path = this.viterbi_searcher.search(lattice);
    // Positions in this sentence are offset by the position of the last
    // token already in the accumulator.
    let last_pos = 0;
    if (tokens.length > 0) {
      last_pos = tokens[tokens.length - 1].word_position;
    }
    for (let j = 0; j < best_path.length; j++) {
      const node = best_path[j];
      const position = last_pos + node.start_pos;
      let token;
      if (node.type === "KNOWN") {
        const features_line = this.token_info_dictionary.getFeatures(node.name);
        const features = features_line == null ? [] : features_line.split(",");
        token = this.formatter.formatEntry(node.name, position, node.type, features);
      } else if (node.type === "UNKNOWN") {
        const features_line = this.unknown_dictionary.getFeatures(node.name);
        const features = features_line == null ? [] : features_line.split(",");
        token = this.formatter.formatUnknownEntry(
          node.name,
          position,
          node.type,
          features,
          node.surface_form
        );
      } else {
        token = this.formatter.formatEntry(node.name, position, node.type, []);
      }
      tokens.push(token);
    }
    return tokens;
  }
  /** @returns {ViterbiLattice} The lattice built for `text`. */
  getLattice(text) {
    return this.viterbi_builder.build(text);
  }
  /**
   * Split after each "、" or "。"; the punctuation stays at the end of its sentence.
   * @param {string} input
   * @returns {string[]} Sentences; [] for an empty input.
   */
  static splitByPunctuation(input) {
    const sentences = [];
    let tail = input;
    while (true) {
      if (tail === "") {
        break;
      }
      const index = tail.search(PUNCTUATION);
      if (index < 0) {
        sentences.push(tail);
        break;
      }
      sentences.push(tail.substring(0, index + 1));
      tail = tail.substring(index + 1);
    }
    return sentences;
  }
};
var Tokenizer_default = Tokenizer;

// src/vendor/doublearray/doublearray.js
var TERM_CHAR = "\0";
var TERM_CODE = 0;
var ROOT_ID = 0;
var NOT_FOUND = -1;
var BASE_SIGNED = true;
var CHECK_SIGNED = true;
var BASE_BYTES = 4;
var CHECK_BYTES = 4;
var MEMORY_EXPAND_RATIO = 2;
/**
 * Create the BASE/CHECK array pair used by the double-array trie.
 * Unused slots hold negative values: CHECK[i] = -(next unused index) and
 * BASE[i] = -(previous unused index), as written by initBase/initCheck.
 * @param {number} [initial_size=1024]
 * @returns {object} Accessor object over the two arrays.
 */
var newBC = function(initial_size) {
  if (initial_size == null) {
    initial_size = 1024;
  }
  const initBase = function(_base, start, end) {
    for (let i = start; i < end; i++) {
      _base[i] = -i + 1;
    }
    // If the current last slot is in use, link `start` back to the last unused slot.
    if (0 < check.array[check.array.length - 1]) {
      let last_used_id = check.array.length - 2;
      while (0 < check.array[last_used_id]) {
        last_used_id--;
      }
      _base[start] = -last_used_id;
    }
  };
  const initCheck = function(_check, start, end) {
    for (let i = start; i < end; i++) {
      _check[i] = -i - 1;
    }
  };
  // Grow both arrays to `min_size * MEMORY_EXPAND_RATIO`, keeping old contents.
  const realloc = function(min_size) {
    const new_size = min_size * MEMORY_EXPAND_RATIO;
    const base_new_array = newArrayBuffer(base.signed, base.bytes, new_size);
    // initBase reads the old check array, so it must run before check is replaced.
    initBase(base_new_array, base.array.length, new_size);
    base_new_array.set(base.array);
    base.array = base_new_array;
    const check_new_array = newArrayBuffer(check.signed, check.bytes, new_size);
    initCheck(check_new_array, check.array.length, new_size);
    check_new_array.set(check.array);
    check.array = check_new_array;
  };
  let first_unused_node = ROOT_ID + 1;
  const base = {
    signed: BASE_SIGNED,
    bytes: BASE_BYTES,
    array: newArrayBuffer(BASE_SIGNED, BASE_BYTES, initial_size)
  };
  const check = {
    signed: CHECK_SIGNED,
    bytes: CHECK_BYTES,
    array: newArrayBuffer(CHECK_SIGNED, CHECK_BYTES, initial_size)
  };
  base.array[ROOT_ID] = 1;
  check.array[ROOT_ID] = ROOT_ID;
  initBase(base.array, ROOT_ID + 1, base.array.length);
  initCheck(check.array, ROOT_ID + 1, check.array.length);
  return {
    getBaseBuffer: function() {
      return base.array;
    },
    getCheckBuffer: function() {
      return check.array;
    },
    loadBaseBuffer: function(base_buffer) {
      base.array = base_buffer;
      return this;
    },
    loadCheckBuffer: function(check_buffer) {
      check.array = check_buffer;
      return this;
    },
    size: function() {
      return Math.max(base.array.length, check.array.length);
    },
    // Reads past the end return the value an untouched slot would hold.
    getBase: function(index) {
      if (base.array.length - 1 < index) {
        return -index + 1;
      }
      return base.array[index];
    },
    getCheck: function(index) {
      if (check.array.length - 1 < index) {
        return -index - 1;
      }
      return check.array[index];
    },
    // Writes past the end grow the arrays first.
    setBase: function(index, base_value) {
      if (base.array.length - 1 < index) {
        realloc(index);
      }
      base.array[index] = base_value;
    },
    setCheck: function(index, check_value) {
      if (check.array.length - 1 < index) {
        realloc(index);
      }
      check.array[index] = check_value;
    },
    setFirstUnusedNode: function(index) {
      first_unused_node = index;
    },
    getFirstUnusedNode: function() {
      return first_unused_node;
    },
    // Trim both arrays to one slot past the last used (check >= 0) index.
    shrink: function() {
      let last_index = this.size() - 1;
      while (true) {
        if (0 <= check.array[last_index]) {
          break;
        }
        last_index--;
      }
      base.array = base.array.subarray(0, last_index + 2);
      check.array = check.array.subarray(0, last_index + 2);
    },
    calc: function() {
      let unused_count = 0;
      const size = check.array.length;
      for (let i = 0; i < size; i++) {
        if (check.array[i] < 0) {
          unused_count++;
        }
      }
      return {
        all: size,
        unused: unused_count,
        efficiency: (size - unused_count) / size
      };
    },
    dump: function() {
      let dump_base = "";
      let dump_check = "";
      let i;
      for (i = 0; i < base.array.length; i++) {
        dump_base = dump_base + " " + this.getBase(i);
      }
      for (i = 0; i < check.array.length; i++) {
        dump_check = dump_check + " " + this.getCheck(i);
      }
      console.log("base:" + dump_base);
      console.log("chck:" + dump_check);
      return "base:" + dump_base + " chck:" + dump_check;
    }
  };
};
/**
 * Collects key/value pairs and builds a DoubleArray from them.
 * @param {number} [initial_size] - Passed to newBC.
 */
function DoubleArrayBuilder(initial_size) {
  this.bc = newBC(initial_size);
  this.keys = [];
}
/**
 * @param {string} key
 * @param {number} record
 * @returns {DoubleArrayBuilder} this
 */
DoubleArrayBuilder.prototype.append = function(key, record) {
  this.keys.push({ k: key, v: record });
  return this;
};
/**
 * @param {{k: string, v: number}[]} [keys] - Defaults to the appended keys.
 * @param {boolean} [sorted=false] - Skip sorting when the keys are already sorted.
 * @returns {DoubleArray} An empty trie when there are no keys.
 */
DoubleArrayBuilder.prototype.build = function(keys, sorted) {
  if (keys == null) {
    keys = this.keys;
  }
  // No keys: _build would read this.keys[0] and throw, so return an empty trie.
  if (keys == null || keys.length === 0) {
    return new DoubleArray(this.bc);
  }
  if (sorted == null) {
    sorted = false;
  }
  let buff_keys = keys.map(function(k) {
    return {
      k: stringToUtf8Bytes(k.k + TERM_CHAR),
      v: k.v
    };
  });
  if (sorted) {
    this.keys = buff_keys;
  } else {
    // Byte-wise lexicographic order; a shorter key sorts before its extensions.
    this.keys = buff_keys.sort(function(k1, k2) {
      const b1 = k1.k;
      const b2 = k2.k;
      const min_length = Math.min(b1.length, b2.length);
      for (let pos = 0; pos < min_length; pos++) {
        if (b1[pos] === b2[pos]) {
          continue;
        }
        return b1[pos] - b2[pos];
      }
      return b1.length - b2.length;
    });
  }
  buff_keys = null;
  this._build(ROOT_ID, 0, 0, this.keys.length);
  return new DoubleArray(this.bc);
};
// Recursively place the children of `parent_index` for keys[start .. start+length).
DoubleArrayBuilder.prototype._build = function(parent_index, position, start, length) {
  const children_info = this.getChildrenInfo(position, start, length);
  const _base = this.findAllocatableBase(children_info);
  this.setBC(parent_index, children_info, _base);
  for (let i = 0; i < children_info.length; i = i + 3) {
    const child_code = children_info[i];
    if (child_code === TERM_CODE) {
      continue;
    }
    const child_start = children_info[i + 1];
    const child_len = children_info[i + 2];
    const child_index = _base + child_code;
    this._build(child_index, position + 1, child_start, child_len);
  }
};
// Returns a flat Int32Array of (byte code, first key index, key count) triples,
// one per distinct byte at `position` within the key range.
DoubleArrayBuilder.prototype.getChildrenInfo = function(position, start, length) {
  let current_char = this.keys[start].k[position];
  let i = 0;
  let children_info = new Int32Array(length * 3);
  children_info[i++] = current_char;
  children_info[i++] = start;
  let next_pos = start;
  let start_pos = start;
  for (; next_pos < start + length; next_pos++) {
    const next_char = this.keys[next_pos].k[position];
    if (current_char !== next_char) {
      children_info[i++] = next_pos - start_pos;
      children_info[i++] = next_char;
      children_info[i++] = next_pos;
      current_char = next_char;
      start_pos = next_pos;
    }
  }
  children_info[i++] = next_pos - start_pos;
  children_info = children_info.subarray(0, i);
  return children_info;
};
// Write BASE for the parent, unlink each child slot from the unused list and
// set its CHECK to the parent; terminator children store -value - 1 in BASE.
DoubleArrayBuilder.prototype.setBC = function(parent_id, children_info, _base) {
  const bc = this.bc;
  bc.setBase(parent_id, _base);
  let i;
  for (i = 0; i < children_info.length; i = i + 3) {
    const code = children_info[i];
    const child_id = _base + code;
    const prev_unused_id = -bc.getBase(child_id);
    const next_unused_id = -bc.getCheck(child_id);
    if (child_id !== bc.getFirstUnusedNode()) {
      bc.setCheck(prev_unused_id, -next_unused_id);
    } else {
      bc.setFirstUnusedNode(next_unused_id);
    }
    bc.setBase(next_unused_id, -prev_unused_id);
    const check = parent_id;
    bc.setCheck(child_id, check);
    if (code === TERM_CODE) {
      const start_pos = children_info[i + 1];
      let value = this.keys[start_pos].v;
      if (value == null) {
        value = 0;
      }
      const base = -value - 1;
      bc.setBase(child_id, base);
    }
  }
};
// Walk the unused list until a base is found where every child slot is unused.
DoubleArrayBuilder.prototype.findAllocatableBase = function(children_info) {
  const bc = this.bc;
  let _base;
  let curr = bc.getFirstUnusedNode();
  while (true) {
    _base = curr - children_info[0];
    if (_base < 0) {
      curr = -bc.getCheck(curr);
      continue;
    }
    let empty_area_found = true;
    for (let i = 0; i < children_info.length; i = i + 3) {
      const code = children_info[i];
      const candidate_id = _base + code;
      if (!this.isUnusedNode(candidate_id)) {
        curr = -bc.getCheck(curr);
        empty_area_found = false;
        break;
      }
    }
    if (empty_area_found) {
      return _base;
    }
  }
};
DoubleArrayBuilder.prototype.isUnusedNode = function(index) {
  const bc = this.bc;
  const check = bc.getCheck(index);
  if (index === ROOT_ID) {
    return false;
  }
  if (check < 0) {
    return true;
  }
  return false;
};
/**
 * Read-only double-array trie. The constructor shrinks `bc`.
 * @param {object} bc - Object returned by newBC.
 */
function DoubleArray(bc) {
  this.bc = bc;
  this.bc.shrink();
}
/**
 * @param {string} key
 * @returns {boolean} False when the key is absent or cannot be UTF-8 encoded.
 */
DoubleArray.prototype.contain = function(key) {
  const bc = this.bc;
  key += TERM_CHAR;
  const buffer = stringToUtf8Bytes(key);
  if (buffer == null) {
    // Encoder rejects unpaired surrogates; such a key cannot be stored.
    return false;
  }
  let parent = ROOT_ID;
  let child = NOT_FOUND;
  for (let i = 0; i < buffer.length; i++) {
    const code = buffer[i];
    child = this.traverse(parent, code);
    if (child === NOT_FOUND) {
      return false;
    }
    if (bc.getBase(child) <= 0) {
      return true;
    } else {
      parent = child;
      continue;
    }
  }
  return false;
};
/**
 * @param {string} key
 * @returns {number} The stored value, or NOT_FOUND (-1).
 */
DoubleArray.prototype.lookup = function(key) {
  key += TERM_CHAR;
  const buffer = stringToUtf8Bytes(key);
  if (buffer == null) {
    return NOT_FOUND;
  }
  let parent = ROOT_ID;
  let child = NOT_FOUND;
  for (let i = 0; i < buffer.length; i++) {
    const code = buffer[i];
    child = this.traverse(parent, code);
    if (child === NOT_FOUND) {
      return NOT_FOUND;
    }
    parent = child;
  }
  const base = this.bc.getBase(child);
  if (base <= 0) {
    return -base - 1;
  } else {
    return NOT_FOUND;
  }
};
/**
 * @param {string} key
 * @returns {{k: string, v: number}[]} Every stored key that is a prefix of
 *   `key`, shortest first; [] when `key` cannot be UTF-8 encoded.
 */
DoubleArray.prototype.commonPrefixSearch = function(key) {
  const buffer = stringToUtf8Bytes(key);
  const result = [];
  if (buffer == null) {
    return result;
  }
  let parent = ROOT_ID;
  let child = NOT_FOUND;
  for (let i = 0; i < buffer.length; i++) {
    const code = buffer[i];
    child = this.traverse(parent, code);
    if (child !== NOT_FOUND) {
      parent = child;
      // A terminator child means the bytes so far form a complete key.
      const grand_child = this.traverse(child, TERM_CODE);
      if (grand_child !== NOT_FOUND) {
        const base = this.bc.getBase(grand_child);
        const r = {};
        if (base <= 0) {
          r.v = -base - 1;
        }
        r.k = utf8BytesToString(arrayCopy(buffer, 0, i + 1));
        result.push(r);
      }
      continue;
    } else {
      break;
    }
  }
  return result;
};
// Follow the edge `code` from `parent`; NOT_FOUND when CHECK does not match.
DoubleArray.prototype.traverse = function(parent, code) {
  const child = this.bc.getBase(parent) + code;
  if (this.bc.getCheck(child) === parent) {
    return child;
  } else {
    return NOT_FOUND;
  }
};
DoubleArray.prototype.size = function() {
  return this.bc.size();
};
DoubleArray.prototype.calc = function() {
  return this.bc.calc();
};
DoubleArray.prototype.dump = function() {
  return this.bc.dump();
};
var newArrayBuffer;
newArrayBuffer = function(signed, bytes, size) { if (signed) { switch (bytes) { case 1: return new Int8Array(size); case 2: return new Int16Array(size); case 4: return new Int32Array(size); default: throw new RangeError( "Invalid newArray parameter element_bytes:" + bytes ); } } else { switch (bytes) { case 1: return new Uint8Array(size); case 2: return new Uint16Array(size); case 4: return new Uint32Array(size); default: throw new RangeError( "Invalid newArray parameter element_bytes:" + bytes ); } } }; var arrayCopy = function(src, src_offset, length) { const buffer = new ArrayBuffer(length); const dstU8 = new Uint8Array(buffer, 0, length); const srcU8 = src.subarray(src_offset, length); dstU8.set(srcU8); return dstU8; }; var stringToUtf8Bytes = function(str) { const bytes = new Uint8Array(new ArrayBuffer(str.length * 4)); let i = 0, j = 0; while (i < str.length) { var unicode_code; const utf16_code = str.charCodeAt(i++); if (utf16_code >= 55296 && utf16_code <= 56319) { const upper = utf16_code; const lower = str.charCodeAt(i++); if (lower >= 56320 && lower <= 57343) { unicode_code = (upper - 55296) * (1 << 10) + (1 << 16) + (lower - 56320); } else { return null; } } else { unicode_code = utf16_code; } if (unicode_code < 128) { bytes[j++] = unicode_code; } else if (unicode_code < 1 << 11) { bytes[j++] = unicode_code >>> 6 | 192; bytes[j++] = unicode_code & 63 | 128; } else if (unicode_code < 1 << 16) { bytes[j++] = unicode_code >>> 12 | 224; bytes[j++] = unicode_code >> 6 & 63 | 128; bytes[j++] = unicode_code & 63 | 128; } else if (unicode_code < 1 << 21) { bytes[j++] = unicode_code >>> 18 | 240; bytes[j++] = unicode_code >> 12 & 63 | 128; bytes[j++] = unicode_code >> 6 & 63 | 128; bytes[j++] = unicode_code & 63 | 128; } else { } } return bytes.subarray(0, j); }; var utf8BytesToString = function(bytes) { let str = ""; let code, b1, b2, b3, b4, upper, lower; let i = 0; while (i < bytes.length) { b1 = bytes[i++]; if (b1 < 128) { code = b1; } else if (b1 >> 5 === 
6) { b2 = bytes[i++]; code = (b1 & 31) << 6 | b2 & 63; } else if (b1 >> 4 === 14) { b2 = bytes[i++]; b3 = bytes[i++]; code = (b1 & 15) << 12 | (b2 & 63) << 6 | b3 & 63; } else { b2 = bytes[i++]; b3 = bytes[i++]; b4 = bytes[i++]; code = (b1 & 7) << 18 | (b2 & 63) << 12 | (b3 & 63) << 6 | b4 & 63; } if (code < 65536) { str += String.fromCharCode(code); } else { code -= 65536; upper = 55296 | code >> 10; lower = 56320 | code & 1023; str += String.fromCharCode(upper, lower); } } return str; }; function builder(initial_size) { return new DoubleArrayBuilder(initial_size); } function load(base_buffer, check_buffer) { let bc = newBC(0); bc.loadBaseBuffer(base_buffer); bc.loadCheckBuffer(check_buffer); return new DoubleArray(bc); } // src/util/ByteBuffer.ts var stringToUtf8Bytes2 = function(str) { const bytes = new Uint8Array(str.length * 4); let i = 0, j = 0; while (i < str.length) { let unicode_code; const utf16_code = str.charCodeAt(i++); if (utf16_code >= 55296 && utf16_code <= 56319) { const upper = utf16_code; const lower = str.charCodeAt(i++); if (lower >= 56320 && lower <= 57343) { unicode_code = (upper - 55296) * (1 << 10) + (1 << 16) + (lower - 56320); } else { return null; } } else { unicode_code = utf16_code; } if (unicode_code < 128) { bytes[j++] = unicode_code; } else if (unicode_code < 1 << 11) { bytes[j++] = unicode_code >>> 6 | 192; bytes[j++] = unicode_code & 63 | 128; } else if (unicode_code < 1 << 16) { bytes[j++] = unicode_code >>> 12 | 224; bytes[j++] = unicode_code >> 6 & 63 | 128; bytes[j++] = unicode_code & 63 | 128; } else if (unicode_code < 1 << 21) { bytes[j++] = unicode_code >>> 18 | 240; bytes[j++] = unicode_code >> 12 & 63 | 128; bytes[j++] = unicode_code >> 6 & 63 | 128; bytes[j++] = unicode_code & 63 | 128; } else { } } return bytes.subarray(0, j); }; var utf8BytesToString2 = function(bytes) { let str = ""; let code, b1, b2, b3, b4, upper, lower; let i = 0; while (i < bytes.length) { b1 = bytes[i++]; if (b1 < 128) { code = b1; } else if (b1 
>> 5 === 6) { b2 = bytes[i++]; code = (b1 & 31) << 6 | b2 & 63; } else if (b1 >> 4 === 14) { b2 = bytes[i++]; b3 = bytes[i++]; code = (b1 & 15) << 12 | (b2 & 63) << 6 | b3 & 63; } else { b2 = bytes[i++]; b3 = bytes[i++]; b4 = bytes[i++]; code = (b1 & 7) << 18 | (b2 & 63) << 12 | (b3 & 63) << 6 | b4 & 63; } if (code < 65536) { str += String.fromCharCode(code); } else { code -= 65536; upper = 55296 | code >> 10; lower = 56320 | code & 1023; str += String.fromCharCode(upper, lower); } } return str; }; var ByteBuffer = class { buffer; position; constructor(arg) { let initial_size; if (arg == null) { initial_size = 1024 * 1024; } else if (typeof arg === "number") { initial_size = arg; } else if (arg instanceof Uint8Array) { this.buffer = arg; this.position = 0; return; } else { throw typeof arg + " is invalid parameter type for ByteBuffer constructor"; } this.buffer = new Uint8Array(initial_size); this.position = 0; } size() { return this.buffer.length; } reallocate() { const new_array = new Uint8Array(this.buffer.length * 2); new_array.set(this.buffer); this.buffer = new_array; } shrink() { this.buffer = this.buffer.subarray(0, this.position); return this.buffer; } put(b) { if (this.buffer.length < this.position + 1) { this.reallocate(); } this.buffer[this.position++] = b; } get(index) { if (index == null) { index = this.position; this.position += 1; } if (this.buffer.length < index + 1) { return 0; } return this.buffer[index]; } putShort(num) { if (65535 < num) { throw num + " is over short value"; } const lower = 255 & num; const upper = (65280 & num) >> 8; this.put(lower); this.put(upper); } getShort(index) { if (index == null) { index = this.position; this.position += 2; } if (this.buffer.length < index + 2) { return 0; } const lower = this.buffer[index]; const upper = this.buffer[index + 1]; let value = (upper << 8) + lower; if (value & 32768) { value = -(value - 1 ^ 65535); } return value; } putInt(num) { if (4294967295 < num) { throw num + " is over integer 
value"; } const b0 = 255 & num; const b1 = (65280 & num) >> 8; const b2 = (16711680 & num) >> 16; const b3 = (4278190080 & num) >> 24; this.put(b0); this.put(b1); this.put(b2); this.put(b3); } getInt(index) { if (index == null) { index = this.position; this.position += 4; } if (this.buffer.length < index + 4) { return 0; } const b0 = this.buffer[index]; const b1 = this.buffer[index + 1]; const b2 = this.buffer[index + 2]; const b3 = this.buffer[index + 3]; return (b3 << 24) + (b2 << 16) + (b1 << 8) + b0; } readInt() { const pos = this.position; this.position += 4; return this.getInt(pos); } putString(str) { const bytes = stringToUtf8Bytes2(str); for (let i = 0; i < bytes.length; i++) { this.put(bytes[i]); } this.put(0); } getString(index) { const buf = []; let ch; if (index == null) { index = this.position; } while (true) { if (this.buffer.length < index + 1) { break; } ch = this.get(index++); if (ch === 0) { break; } else { buf.push(ch); } } this.position = index; return utf8BytesToString2(buf); } }; var ByteBuffer_default = ByteBuffer; // src/dict/TokenInfoDictionary.ts var TokenInfoDictionary = class { dictionary; target_map; pos_buffer; constructor() { this.dictionary = new ByteBuffer_default(10 * 1024 * 1024); this.target_map = {}; this.pos_buffer = new ByteBuffer_default(10 * 1024 * 1024); } buildDictionary(entries) { const dictionary_entries = {}; for (let i = 0; i < entries.length; i++) { const entry = entries[i]; if (entry.length < 4) { continue; } const surface_form = entry[0]; const left_id = entry[1]; const right_id = entry[2]; const word_cost = entry[3]; const feature = entry.slice(4).join(","); if (!isFinite(left_id) || !isFinite(right_id) || !isFinite(word_cost)) { console.log(entry); } const token_info_id = this.put( left_id, right_id, word_cost, surface_form, feature ); dictionary_entries[token_info_id] = surface_form; } this.dictionary.shrink(); this.pos_buffer.shrink(); return dictionary_entries; } put(left_id, right_id, word_cost, surface_form, 
feature) { const token_info_id = this.dictionary.position; const pos_id = this.pos_buffer.position; this.dictionary.putShort(left_id); this.dictionary.putShort(right_id); this.dictionary.putShort(word_cost); this.dictionary.putInt(pos_id); this.pos_buffer.putString(surface_form + "," + feature); return token_info_id; } addMapping(source, target) { let mapping = this.target_map[source]; if (mapping == null) { mapping = []; } mapping.push(target); this.target_map[source] = mapping; } targetMapToBuffer() { const buffer = new ByteBuffer_default(); const map_keys_size = Object.keys(this.target_map).length; buffer.putInt(map_keys_size); for (const key in this.target_map) { const values = this.target_map[key]; const map_values_size = values.length; buffer.putInt(parseInt(key)); buffer.putInt(map_values_size); for (let i = 0; i < values.length; i++) { buffer.putInt(values[i]); } } return buffer.shrink(); } loadDictionary(array_buffer) { this.dictionary = new ByteBuffer_default(array_buffer); return this; } loadPosVector(array_buffer) { this.pos_buffer = new ByteBuffer_default(array_buffer); return this; } loadTargetMap(array_buffer) { const buffer = new ByteBuffer_default(array_buffer); buffer.position = 0; this.target_map = {}; buffer.readInt(); while (true) { if (buffer.buffer.length < buffer.position + 1) { break; } const key = buffer.readInt(); const map_values_size = buffer.readInt(); for (let i = 0; i < map_values_size; i++) { const value = buffer.readInt(); this.addMapping(key, value); } } return this; } getFeatures(token_info_id_str) { const token_info_id = parseInt(token_info_id_str); if (isNaN(token_info_id)) { return ""; } const pos_id = this.dictionary.getInt(token_info_id + 6); return this.pos_buffer.getString(pos_id); } }; var TokenInfoDictionary_default = TokenInfoDictionary; // src/dict/ConnectionCosts.ts var ConnectionCosts = class { forward_dimension; backward_dimension; buffer; constructor(forward_dimension, backward_dimension) { this.forward_dimension = 
forward_dimension; this.backward_dimension = backward_dimension; this.buffer = new Int16Array(forward_dimension * backward_dimension + 2); this.buffer[0] = forward_dimension; this.buffer[1] = backward_dimension; } put(forward_id, backward_id, cost) { const index = forward_id * this.backward_dimension + backward_id + 2; if (this.buffer.length < index + 1) { throw "ConnectionCosts buffer overflow"; } this.buffer[index] = cost; } get(forward_id, backward_id) { const index = forward_id * this.backward_dimension + backward_id + 2; if (this.buffer.length < index + 1) { throw "ConnectionCosts buffer overflow"; } return this.buffer[index]; } loadConnectionCosts(connection_costs_buffer) { this.forward_dimension = connection_costs_buffer[0]; this.backward_dimension = connection_costs_buffer[1]; this.buffer = connection_costs_buffer; } }; var ConnectionCosts_default = ConnectionCosts; // src/dict/CharacterClass.ts var CharacterClass = class { class_id; class_name; is_always_invoke; is_grouping; max_length; constructor(class_id, class_name, is_always_invoke, is_grouping, max_length) { this.class_id = class_id; this.class_name = class_name; this.is_always_invoke = is_always_invoke; this.is_grouping = is_grouping; this.max_length = max_length; } }; var CharacterClass_default = CharacterClass; // src/dict/InvokeDefinitionMap.ts var InvokeDefinitionMap = class _InvokeDefinitionMap { map; lookup_table; constructor() { this.map = []; this.lookup_table = {}; } init(character_category_definition) { if (character_category_definition == null) { return; } for (let i = 0; i < character_category_definition.length; i++) { const character_class = character_category_definition[i]; this.map[i] = character_class; this.lookup_table[character_class.class_name] = i; } } getCharacterClass(class_id) { return this.map[class_id]; } lookup(class_name) { const class_id = this.lookup_table[class_name]; if (class_id == null) { return null; } return class_id; } toBuffer() { const buffer = new 
ByteBuffer_default(); for (let i = 0; i < this.map.length; i++) { const char_class = this.map[i]; buffer.put(char_class.is_always_invoke); buffer.put(char_class.is_grouping); buffer.putInt(char_class.max_length); buffer.putString(char_class.class_name); } buffer.shrink(); return buffer.buffer; } static load(invoke_def_buffer) { const invoke_def = new _InvokeDefinitionMap(); const character_category_definition = []; const buffer = new ByteBuffer_default(invoke_def_buffer); while (buffer.position + 1 < buffer.size()) { const class_id = character_category_definition.length; const is_always_invoke = buffer.get(); const is_grouping = buffer.get(); const max_length = buffer.getInt(); const class_name = buffer.getString(); character_category_definition.push( new CharacterClass_default( class_id, class_name, is_always_invoke, is_grouping, max_length ) ); } invoke_def.init(character_category_definition); return invoke_def; } }; var InvokeDefinitionMap_default = InvokeDefinitionMap; // src/dict/CharacterDefinition.ts var DEFAULT_CATEGORY = "DEFAULT"; var CharacterDefinition = class _CharacterDefinition { character_category_map; compatible_category_map; invoke_definition_map; constructor() { this.character_category_map = new Uint8Array(65536); this.compatible_category_map = new Uint32Array(65536); this.invoke_definition_map = null; } initCategoryMappings(category_mapping) { let code_point; if (category_mapping != null) { for (let i = 0; i < category_mapping.length; i++) { const mapping = category_mapping[i]; const end = mapping.end || mapping.start; for (code_point = mapping.start; code_point <= end; code_point++) { this.character_category_map[code_point] = this.invoke_definition_map.lookup(mapping.default); for (let j = 0; j < mapping.compatible.length; j++) { let bitset = this.compatible_category_map[code_point]; const compatible_category = mapping.compatible[j]; if (compatible_category == null) { continue; } const class_id = 
this.invoke_definition_map.lookup(compatible_category); if (class_id == null) { continue; } const class_id_bit = 1 << class_id; bitset = bitset | class_id_bit; this.compatible_category_map[code_point] = bitset; } } } } const default_id = this.invoke_definition_map.lookup(DEFAULT_CATEGORY); if (default_id == null) { return; } for (code_point = 0; code_point < this.character_category_map.length; code_point++) { if (this.character_category_map[code_point] === 0) { this.character_category_map[code_point] = 1 << default_id; } } } lookupCompatibleCategory(ch) { const classes = []; const code = ch.charCodeAt(0); let integer; if (code < this.compatible_category_map.length) { integer = this.compatible_category_map[code]; } if (integer == null || integer === 0) { return classes; } for (let bit = 0; bit < 32; bit++) { if (integer << 31 - bit >>> 31 === 1) { const character_class = this.invoke_definition_map.getCharacterClass(bit); if (character_class == null) { continue; } classes.push(character_class); } } return classes; } lookup(ch) { let class_id; const code = ch.charCodeAt(0); if (SurrogateAwareString_default.isSurrogatePair(ch)) { class_id = this.invoke_definition_map.lookup(DEFAULT_CATEGORY); } else if (code < this.character_category_map.length) { class_id = this.character_category_map[code]; } if (class_id == null) { class_id = this.invoke_definition_map.lookup(DEFAULT_CATEGORY); } return this.invoke_definition_map.getCharacterClass(class_id); } static load(cat_map_buffer, compat_cat_map_buffer, invoke_def_buffer) { const char_def = new _CharacterDefinition(); char_def.character_category_map = cat_map_buffer; char_def.compatible_category_map = compat_cat_map_buffer; char_def.invoke_definition_map = InvokeDefinitionMap_default.load(invoke_def_buffer); return char_def; } static parseCharCategory(class_id, parsed_category_def) { const category = parsed_category_def[1]; const invoke = parseInt(parsed_category_def[2]); const grouping = parseInt(parsed_category_def[3]); 
const max_length = parseInt(parsed_category_def[4]); if (!isFinite(invoke) || invoke !== 0 && invoke !== 1) { console.log("char.def parse error. INVOKE is 0 or 1 in:" + invoke); return null; } if (!isFinite(grouping) || grouping !== 0 && grouping !== 1) { console.log("char.def parse error. GROUP is 0 or 1 in:" + grouping); return null; } if (!isFinite(max_length) || max_length < 0) { console.log("char.def parse error. LENGTH is 1 to n:" + max_length); return null; } const is_invoke = invoke === 1; const is_grouping = grouping === 1; return new CharacterClass_default( class_id, category, is_invoke, is_grouping, max_length ); } static parseCategoryMapping(parsed_category_mapping) { const start = parseInt(parsed_category_mapping[1]); const default_category = parsed_category_mapping[2]; const compatible_category = 3 < parsed_category_mapping.length ? parsed_category_mapping.slice(3) : []; if (!isFinite(start) || start < 0 || start > 65535) { console.log("char.def parse error. CODE is invalid:" + start); } return { start, default: default_category, compatible: compatible_category }; } static parseRangeCategoryMapping(parsed_category_mapping) { const start = parseInt(parsed_category_mapping[1]); const end = parseInt(parsed_category_mapping[2]); const default_category = parsed_category_mapping[3]; const compatible_category = 4 < parsed_category_mapping.length ? parsed_category_mapping.slice(4) : []; if (!isFinite(start) || start < 0 || start > 65535) { console.log("char.def parse error. CODE is invalid:" + start); } if (!isFinite(end) || end < 0 || end > 65535) { console.log("char.def parse error. 
CODE is invalid:" + end); } return { start, end, default: default_category, compatible: compatible_category }; } }; var CharacterDefinition_default = CharacterDefinition; // src/dict/UnknownDictionary.ts var UnknownDictionary = class extends TokenInfoDictionary_default { character_definition; constructor() { super(); this.dictionary = new ByteBuffer_default(10 * 1024 * 1024); this.target_map = {}; this.pos_buffer = new ByteBuffer_default(10 * 1024 * 1024); this.character_definition = null; } characterDefinition(character_definition) { this.character_definition = character_definition; return this; } lookup(ch) { return this.character_definition?.lookup(ch); } lookupCompatibleCategory(ch) { return this.character_definition?.lookupCompatibleCategory(ch); } loadUnknownDictionaries(unk_buffer, unk_pos_buffer, unk_map_buffer, cat_map_buffer, compat_cat_map_buffer, invoke_def_buffer) { this.loadDictionary(unk_buffer); this.loadPosVector(unk_pos_buffer); this.loadTargetMap(unk_map_buffer); this.character_definition = CharacterDefinition_default.load( cat_map_buffer, compat_cat_map_buffer, invoke_def_buffer ); } }; var UnknownDictionary_default = UnknownDictionary; // src/dict/DynamicDictionaries.ts var DynamicDictionaries = class { trie; token_info_dictionary; connection_costs; unknown_dictionary; constructor(trie, token_info_dictionary, connection_costs, unknown_dictionary) { if (trie != null) { this.trie = trie; } else { this.trie = builder(0).build([{ k: "", v: 1 }]); } if (token_info_dictionary != null) { this.token_info_dictionary = token_info_dictionary; } else { this.token_info_dictionary = new TokenInfoDictionary_default(); } if (connection_costs != null) { this.connection_costs = connection_costs; } else { this.connection_costs = new ConnectionCosts_default(0, 0); } if (unknown_dictionary != null) { this.unknown_dictionary = unknown_dictionary; } else { this.unknown_dictionary = new UnknownDictionary_default(); } } loadTrie(base_buffer, check_buffer) { this.trie = 
load(base_buffer, check_buffer); return this; } loadTokenInfoDictionaries(token_info_buffer, pos_buffer, target_map_buffer) { this.token_info_dictionary.loadDictionary(token_info_buffer); this.token_info_dictionary.loadPosVector(pos_buffer); this.token_info_dictionary.loadTargetMap(target_map_buffer); return this; } loadConnectionCosts(cc_buffer) { this.connection_costs.loadConnectionCosts(cc_buffer); return this; } loadUnknownDictionaries(unk_buffer, unk_pos_buffer, unk_map_buffer, cat_map_buffer, compat_cat_map_buffer, invoke_def_buffer) { this.unknown_dictionary.loadUnknownDictionaries( unk_buffer, unk_pos_buffer, unk_map_buffer, cat_map_buffer, compat_cat_map_buffer, invoke_def_buffer ); return this; } }; var DynamicDictionaries_default = DynamicDictionaries; // src/loader/DictionaryLoader.ts async function loadDictionary(config) { const dic = new DynamicDictionaries_default(); async function loadTrie() { const filenames = ["base.dat.gz", "check.dat.gz"]; const buffers = await Promise.all( filenames.map((filename) => config.loadArrayBuffer(filename)) ); const base_buffer = new Int32Array(buffers[0]); const check_buffer = new Int32Array(buffers[1]); dic.loadTrie(base_buffer, check_buffer); } async function loadInfo() { const filenames = ["tid.dat.gz", "tid_pos.dat.gz", "tid_map.dat.gz"]; const buffers = await Promise.all( filenames.map((filename) => config.loadArrayBuffer(filename)) ); const token_info_buffer = new Uint8Array(buffers[0]); const pos_buffer = new Uint8Array(buffers[1]); const target_map_buffer = new Uint8Array(buffers[2]); dic.loadTokenInfoDictionaries( token_info_buffer, pos_buffer, target_map_buffer ); } async function loadCost() { const buffer = await config.loadArrayBuffer("cc.dat.gz"); const cc_buffer = new Int16Array(buffer); dic.loadConnectionCosts(cc_buffer); } async function loadUnknown() { const filenames = [ "unk.dat.gz", "unk_pos.dat.gz", "unk_map.dat.gz", "unk_char.dat.gz", "unk_compat.dat.gz", "unk_invoke.dat.gz" ]; const buffers = 
await Promise.all( filenames.map((filename) => config.loadArrayBuffer(filename)) ); const unk_buffer = new Uint8Array(buffers[0]); const unk_pos_buffer = new Uint8Array(buffers[1]); const unk_map_buffer = new Uint8Array(buffers[2]); const cat_map_buffer = new Uint8Array(buffers[3]); const compat_cat_map_buffer = new Uint32Array(buffers[4]); const invoke_def_buffer = new Uint8Array(buffers[5]); dic.loadUnknownDictionaries( unk_buffer, unk_pos_buffer, unk_map_buffer, cat_map_buffer, compat_cat_map_buffer, invoke_def_buffer ); } await Promise.all([loadTrie(), loadInfo(), loadCost(), loadUnknown()]); return dic; } // src/TokenizerBuilder.ts var TokenizerBuilder = class { constructor(options) { this.options = options; } async build() { const dic = await loadDictionary(this.options.loader); return new Tokenizer_default(dic); } }; var TokenizerBuilder_default = TokenizerBuilder; // src/dict/builder/ConnectionCostsBuilder.ts var ConnectionCostsBuilder = class { lines; connection_cost; constructor() { this.lines = 0; this.connection_cost = null; } putLine(line) {