UNPKG

kusamoji

Version:

Japanese morphological analyzer for Node.js — Viterbi tokenizer with mmap dict loading and pluggable POS-source strategy

112 lines (96 loc) 4.28 kB
"use strict"; /** * ViterbiSearcher is for searching best Viterbi path * @param {ConnectionCosts} connection_costs Connection costs matrix * @constructor */ function ViterbiSearcher(connection_costs) { this.connection_costs = connection_costs; } /** * Search best path by forward-backward algorithm * @param {ViterbiLattice} lattice Viterbi lattice to search * @returns {Array} Shortest path */ ViterbiSearcher.prototype.search = function (lattice) { lattice = this.forward(lattice); return this.backward(lattice); }; ViterbiSearcher.prototype.forward = function (lattice) { let nodesEndAt = lattice.nodes_end_at; let ccGet = this.connection_costs.get.bind(this.connection_costs); for (let i = 1; i <= lattice.eos_pos; i++) { let nodes = nodesEndAt[i]; if (nodes == null) continue; for (let j = 0; j < nodes.length; j++) { let node = nodes[j]; let nodeLeftId = node.left_id; let nodeCost = node.cost; let cost = Number.MAX_VALUE; let shortest_prev_node; // Length bonus: reward longer KNOWN words to prevent the Viterbi // from preferring short fragments (e.g., テス) over longer // correct matches (テスト). Short katakana fragments from NEologd // have favorable connection costs that cause them to steal // prefixes. A per-character bonus of -500 tips the balance back // toward longer matches without disrupting normal segmentation. // // Only applied to KNOWN words with length >= 2 — BOS/EOS nodes // (type === "BOS"/"EOS") have no surface_form, and single-char // words (particles, etc.) should not be penalized. let lengthBonus = 0; if (node.type === "KNOWN" && node.surface_form && node.surface_form.length >= 2) { // Quadratic bonus: longer KNOWN matches get disproportionately favored. // 2 chars = -2000, 3 chars = -4500, 4 chars = -8000, 5 chars = -12500 // This prevents short fragments (テス, アメ, イン) from stealing // prefixes of longer words while keeping normal 2-char segmentation // (particles etc.) stable. let len = Math.min(node.surface_form.length, 200); // clamp to prevent overflow lengthBonus = -500 * len * len; } else if (node.type === "UNKNOWN" && node.surface_form) { // Heavy penalty for UNKNOWN nodes: the Viterbi should strongly // prefer any path that avoids UNKNOWN tokens. // Per-char penalty of +3000 makes even short UNKNOWN spans // very expensive. let len = Math.min(node.surface_form.length, 200); lengthBonus = 3000 * len; } let prev_nodes = nodesEndAt[node.start_pos - 1]; if (prev_nodes == null) continue; for (let k = 0; k < prev_nodes.length; k++) { let prev_node = prev_nodes[k]; let prevRightId = prev_node.right_id; let edge_cost = 0; if (nodeLeftId != null && prevRightId != null) { edge_cost = ccGet(prevRightId, nodeLeftId); } let totalCost = prev_node.shortest_cost + edge_cost + nodeCost + lengthBonus; if (totalCost < cost) { shortest_prev_node = prev_node; cost = totalCost; } } node.prev = shortest_prev_node; node.shortest_cost = cost; } } return lattice; }; ViterbiSearcher.prototype.backward = function (lattice) { let shortest_path = []; let eos = lattice.nodes_end_at[lattice.nodes_end_at.length - 1][0]; let node_back = eos.prev; if (node_back == null) { return []; } while (node_back.type !== "BOS") { shortest_path.push(node_back); if (node_back.prev == null) { // TODO Failed to back. Process unknown words? return []; } node_back = node_back.prev; } return shortest_path.reverse(); }; module.exports = ViterbiSearcher;