kuromoji
Version:
JavaScript implementation of Japanese morphological analyzer
153 lines (130 loc) • 4.72 kB
JavaScript
/*
* Copyright 2014 Takuya Asano
* Copyright 2010-2014 Atilika Inc. and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
"use strict";
var ByteBuffer = require("../util/ByteBuffer");
/**
* TokenInfoDictionary
* @constructor
*/
function TokenInfoDictionary() {
this.dictionary = new ByteBuffer(10 * 1024 * 1024);
this.target_map = {}; // trie_id (of surface form) -> token_info_id (of token)
this.pos_buffer = new ByteBuffer(10 * 1024 * 1024);
}
// left_id right_id word_cost ...
// ^ this position is token_info_id
TokenInfoDictionary.prototype.buildDictionary = function (entries) {
var dictionary_entries = {}; // using as hashmap, string -> string (word_id -> surface_form) to build dictionary
for (var i = 0; i < entries.length; i++) {
var entry = entries[i];
if (entry.length < 4) {
continue;
}
var surface_form = entry[0];
var left_id = entry[1];
var right_id = entry[2];
var word_cost = entry[3];
var feature = entry.slice(4).join(","); // TODO Optimize
// Assertion
if (!isFinite(left_id) || !isFinite(right_id) || !isFinite(word_cost)) {
console.log(entry);
}
var token_info_id = this.put(left_id, right_id, word_cost, surface_form, feature);
dictionary_entries[token_info_id] = surface_form;
}
// Remove last unused area
this.dictionary.shrink();
this.pos_buffer.shrink();
return dictionary_entries;
};
TokenInfoDictionary.prototype.put = function (left_id, right_id, word_cost, surface_form, feature) {
var token_info_id = this.dictionary.position;
var pos_id = this.pos_buffer.position;
this.dictionary.putShort(left_id);
this.dictionary.putShort(right_id);
this.dictionary.putShort(word_cost);
this.dictionary.putInt(pos_id);
this.pos_buffer.putString(surface_form + "," + feature);
return token_info_id;
};
TokenInfoDictionary.prototype.addMapping = function (source, target) {
var mapping = this.target_map[source];
if (mapping == null) {
mapping = [];
}
mapping.push(target);
this.target_map[source] = mapping;
};
TokenInfoDictionary.prototype.targetMapToBuffer = function () {
var buffer = new ByteBuffer();
var map_keys_size = Object.keys(this.target_map).length;
buffer.putInt(map_keys_size);
for (var key in this.target_map) {
var values = this.target_map[key]; // Array
var map_values_size = values.length;
buffer.putInt(parseInt(key));
buffer.putInt(map_values_size);
for (var i = 0; i < values.length; i++) {
buffer.putInt(values[i]);
}
}
return buffer.shrink(); // Shrink-ed Typed Array
};
// from tid.dat
TokenInfoDictionary.prototype.loadDictionary = function (array_buffer) {
this.dictionary = new ByteBuffer(array_buffer);
return this;
};
// from tid_pos.dat
TokenInfoDictionary.prototype.loadPosVector = function (array_buffer) {
this.pos_buffer = new ByteBuffer(array_buffer);
return this;
};
// from tid_map.dat
TokenInfoDictionary.prototype.loadTargetMap = function (array_buffer) {
var buffer = new ByteBuffer(array_buffer);
buffer.position = 0;
this.target_map = {};
buffer.readInt(); // map_keys_size
while (true) {
if (buffer.buffer.length < buffer.position + 1) {
break;
}
var key = buffer.readInt();
var map_values_size = buffer.readInt();
for (var i = 0; i < map_values_size; i++) {
var value = buffer.readInt();
this.addMapping(key, value);
}
}
return this;
};
/**
* Look up features in the dictionary
* @param {string} token_info_id_str Word ID to look up
* @returns {string} Features string concatenated by ","
*/
TokenInfoDictionary.prototype.getFeatures = function (token_info_id_str) {
var token_info_id = parseInt(token_info_id_str);
if (isNaN(token_info_id)) {
// TODO throw error
return "";
}
var pos_id = this.dictionary.getInt(token_info_id + 6);
return this.pos_buffer.getString(pos_id);
};
module.exports = TokenInfoDictionary;