kusamoji
Version:
Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy
112 lines (96 loc) • 4.28 kB
JavaScript
;
/**
 * Finds the lowest-cost path through a Viterbi lattice.
 *
 * The searcher itself is stateless apart from the connection-cost matrix it
 * is given; that matrix is consulted (via its `get` method) whenever the
 * forward pass scores the transition between two adjacent nodes.
 *
 * @param {ConnectionCosts} connection_costs Connection costs matrix
 * @constructor
 */
function ViterbiSearcher(connection_costs) {
    // Kept on the instance under this exact name; forward() reads it.
    this.connection_costs = connection_costs;
}
/**
 * Find the best (lowest-cost) path through a lattice.
 *
 * Runs the forward pass, which records a best cost and a back-pointer on
 * every node, and then the backward pass, which follows those back-pointers
 * from the end of the lattice to recover the path.
 *
 * @param {ViterbiLattice} lattice Viterbi lattice to search
 * @returns {Array} Shortest path
 */
ViterbiSearcher.prototype.search = function (lattice) {
    const scoredLattice = this.forward(lattice);
    return this.backward(scoredLattice);
};
/** Weight of the quadratic length bonus for KNOWN words (bonus = -weight * len * len). */
const KNOWN_LENGTH_BONUS_WEIGHT = 500;
/** Cost added per character of an UNKNOWN node's surface form. */
const UNKNOWN_CHAR_PENALTY = 3000;
/** Surface forms longer than this are scored as if they had exactly this length. */
const MAX_SCORED_LENGTH = 200;

/**
 * Length of a surface form in characters (Unicode code points), capped at `limit`.
 *
 * `String.prototype.length` counts UTF-16 code units, so a single
 * supplementary-plane character (e.g. "𠮷") would otherwise be scored as two
 * characters. Counting stops as soon as `limit` is reached.
 *
 * @param {string} surface Surface form to measure
 * @param {number} limit Maximum value to return
 * @returns {number} Code-point count, at most `limit`
 */
function scoredLength(surface, limit) {
    if (typeof surface !== "string") {
        // Not a plain string: trust whatever length it reports.
        return Math.min(surface.length, limit);
    }
    let count = 0;
    for (let i = 0; i < surface.length && count < limit; count++) {
        // A valid surrogate pair decodes to a code point above U+FFFF and
        // occupies two code units; everything else occupies one.
        i += surface.codePointAt(i) > 0xFFFF ? 2 : 1;
    }
    return count;
}

/**
 * Length-based cost adjustment added to a node's own cost.
 *
 * - KNOWN words of two or more characters get a quadratic bonus
 *   (2 chars = -2000, 3 chars = -4500, 4 chars = -8000, 5 chars = -12500),
 *   so that a long dictionary match (テスト) beats a short fragment with
 *   favorable connection costs (テス) that would otherwise steal its prefix.
 * - UNKNOWN nodes get a linear per-character penalty so that paths avoiding
 *   unknown tokens are strongly preferred.
 * - Everything else (single-character KNOWN words, nodes with an empty or
 *   missing surface form, other node types) is left unadjusted.
 *
 * @param {Object} node Lattice node; reads `type` and `surface_form`
 * @returns {number} Value to add to the path cost (negative = bonus)
 */
function lengthAdjustment(node) {
    const surface = node.surface_form;
    if (!surface) {
        return 0;
    }
    if (node.type === "KNOWN") {
        const len = scoredLength(surface, MAX_SCORED_LENGTH);
        return len >= 2 ? -KNOWN_LENGTH_BONUS_WEIGHT * len * len : 0;
    }
    if (node.type === "UNKNOWN") {
        return UNKNOWN_CHAR_PENALTY * scoredLength(surface, MAX_SCORED_LENGTH);
    }
    return 0;
}

/**
 * Forward pass of the Viterbi search.
 *
 * Visits positions 1..`lattice.eos_pos` in order. For every node ending at a
 * position, it looks at all nodes ending immediately before the node's
 * `start_pos`, and stores on the node the cheapest way to reach it:
 * `node.shortest_cost` (predecessor's best cost + connection cost + the
 * node's own cost + length adjustment) and `node.prev` (that predecessor).
 *
 * A node whose predecessor slot is missing is left untouched. A node for
 * which no predecessor yields a cost below `Number.MAX_VALUE` gets
 * `prev = null` and `shortest_cost = Number.MAX_VALUE`.
 *
 * @param {ViterbiLattice} lattice Lattice to score; its nodes are mutated in place
 * @returns {ViterbiLattice} The same lattice instance
 */
ViterbiSearcher.prototype.forward = function (lattice) {
    const nodesEndAt = lattice.nodes_end_at;
    const connectionCosts = this.connection_costs;

    for (let pos = 1; pos <= lattice.eos_pos; pos++) {
        const nodes = nodesEndAt[pos];
        if (nodes == null) {
            continue;
        }
        for (let j = 0; j < nodes.length; j++) {
            const node = nodes[j];
            const prevNodes = nodesEndAt[node.start_pos - 1];
            if (prevNodes == null) {
                continue;
            }

            const nodeLeftId = node.left_id;
            const nodeCost = node.cost;
            // Depends only on the node, not on which predecessor is chosen.
            const lengthBonus = lengthAdjustment(node);

            let bestCost = Number.MAX_VALUE;
            let bestPrev = null;
            for (let k = 0; k < prevNodes.length; k++) {
                const prevNode = prevNodes[k];
                const prevRightId = prevNode.right_id;
                // Nodes lacking an id on either side connect at zero cost.
                const edgeCost = (nodeLeftId != null && prevRightId != null)
                    ? connectionCosts.get(prevRightId, nodeLeftId)
                    : 0;
                const totalCost = prevNode.shortest_cost + edgeCost + nodeCost + lengthBonus;
                // Strict comparison: on ties the earliest predecessor wins.
                if (totalCost < bestCost) {
                    bestPrev = prevNode;
                    bestCost = totalCost;
                }
            }
            node.prev = bestPrev;
            node.shortest_cost = bestCost;
        }
    }
    return lattice;
};
/**
 * Backward pass: recover the best path from the back-pointers that the
 * forward pass stored on each node.
 *
 * Starts at the first node of the last slot of `lattice.nodes_end_at`
 * (the EOS node) and follows `prev` links until a node of type "BOS" is
 * reached. Neither the EOS nor the BOS node is included in the result.
 *
 * @param {ViterbiLattice} lattice Lattice whose nodes carry `prev` links
 * @returns {Array} Nodes on the best path in sentence order, or an empty
 *     array when the chain of `prev` links ends before reaching BOS
 */
ViterbiSearcher.prototype.backward = function (lattice) {
    const slots = lattice.nodes_end_at;
    const eosNode = slots[slots.length - 1][0];
    const collected = [];

    for (let cursor = eosNode.prev; cursor != null; cursor = cursor.prev) {
        if (cursor.type === "BOS") {
            // Nodes were gathered end-to-start; flip into sentence order.
            return collected.reverse();
        }
        collected.push(cursor);
    }

    // The chain broke before BOS was reached.
    // TODO Failed to back. Process unknown words?
    return [];
};
// Expose the ViterbiSearcher constructor as this module's CommonJS export.
module.exports = ViterbiSearcher;