UNPKG

novel-segment

Version:

Chinese word segmentation 簡繁中文分词模块 以網路小說為樣本

434 lines (433 loc) 17.9 kB
"use strict"; /** * Created by user on 2018/4/16/016. */ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; Object.defineProperty(o, k2, { enumerable: true, get: function() { return m[k]; } }); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k); __setModuleDefault(result, mod); return result; }; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.type = exports.init = exports.ZhtSynonymOptimizer = void 0; const mod_1 = require("../mod"); const index_1 = require("../util/index"); const COLORS_1 = require("../mod/COLORS"); const uni_string_1 = __importDefault(require("uni-string")); const isUnset_1 = __importStar(require("../util/isUnset")); /** * 以詞意來自動轉換 而不需要手動加入字典於 synonym.txt * 適用於比較容易需要人工處理的轉換 * * 自動處理 `里|后` * * 建議在字典內追加人名地名等等名字 來增加準確性 * 防止轉換錯誤 * * @todo 發于余干松冲准呆只范舍涂 */ class ZhtSynonymOptimizer extends mod_1.SubSModuleOptimizer { constructor() { super(...arguments); this.name = 'ZhtSynonymOptimizer'; } _cache() { super._cache(); this._TABLE = this.segment.getDict('TABLE'); this._POSTAG = this.segment.POSTAG; this._SYNONYM = this.segment.getDict('SYNONYM') || {}; this._BLACKLIST = this.segment.getDict("BLACKLIST_FOR_SYNONYM" /* BLACKLIST_FOR_SYNONYM */) || {}; } isSynonymBlacklist(w) { if (this._BLACKLIST[w]) { return true; } return null; } _getSynonym(w, nw) { const SYNONYM = this._SYNONYM; if (w in SYNONYM) { nw = SYNONYM[w]; } if (nw in SYNONYM) { //let w = nw; nw = SYNONYM[nw]; } return nw; } doOptimize(words) { var _a; const self = this; const POSTAG = this._POSTAG; const TABLE = this._TABLE; const SYNONYM = this._SYNONYM; let i = 0; let CLOSE_P = ['】', '」', '》', '』', ']', '’', '”', '〉']; let SEP_P = ['、', ',', '…']; while (i < words.length) { let w0 = words[i - 1] || null; let w1 = words[i]; let w2 = words[i + 1] || null; if (this.isSynonymBlacklist(w1.w)) { i++; continue; } let bool; let w1_len = uni_string_1.default.size(w1.w); let new_p; if (w1_len === 1) { //console.log(w1); if (w1.w === '里') { if (w0 && (w0.w.slice(-1) === '的' || w0.w === '和')) { } else if (w0 && CLOSE_P.includes(w0.w)) { w1.ow = w1.w; w1.w = '裡'; bool = true; } else if (w0 && index_1.hexAndAny(w0.p, // 名詞 POSTAG.D_N, // 處所 POSTAG.D_S, // 方位 POSTAG.D_F, // 时间词 POSTAG.D_T, // 动词 训练 POSTAG.D_V)) { w1.ow = w1.w; w1.w = '裡'; bool = true; } } else if (w1.w === '后') { if (w0 && (w0.w === '和')) { } else if (w0 && CLOSE_P.includes(w0.w)) { w1.ow = w1.w; w1.w = '後'; bool = true; } else if (w0 && ['腰'].includes(w0.w)) { w1.ow = w1.w; w1.w = '後'; bool = true; } // 如果前一個項目為 else if (((w0 === null || w0 === void 0 ? void 0 : w0.p) && index_1.hexAndAny(w0.p, // 动词 離開 POSTAG.D_V, // 处所词 POSTAG.D_S, // 时间词 POSTAG.D_T, // 名词 名语素 POSTAG.D_N, // 数量词 - 几次后 POSTAG.D_MQ, POSTAG.A_M, // 方位词 方位语素 POSTAG.D_F, // 副词 POSTAG.D_D, POSTAG.D_R))) { w1.ow = w1.w; w1.w = '後'; bool = true; } else if (((w2 === null || w2 === void 0 ? void 0 : w2.p) && index_1.hexAndAny(w2.p, POSTAG.D_V))) { w1.ow = w1.w; w1.w = '後'; bool = true; } else if (w2 && ((isUnset_1.isSet(w0) && !w0.p) && (w2.p && index_1.hexAndAny(w2.p, // 副词 POSTAG.D_D)))) { w1.ow = w1.w; w1.w = '後'; bool = true; } else if (w2 && ((!(w0 === null || w0 === void 0 ? void 0 : w0.p)) && SEP_P.includes(w2.w))) { w1.ow = w1.w; w1.w = '後'; bool = true; } } else if (w1.w === '发' || w1.w === '發') { let c; if (w0) { c = w0.w; } if (c && COLORS_1.COLOR_HAIR[c]) { let nw = '髮'; nw = this._getSynonym(w1.w, nw); if (nw !== w1.w) { w1.ow = w1.w; w1.w = nw; new_p = POSTAG.D_N; bool = true; } } if (!bool && w1.w === '发' && (w2 === null || w2 === void 0 ? void 0 : w2.w) === '的') { w1.ow = w1.w; w1.w = '發'; bool = true; } } else if (w1.w === '于') { if ((isUnset_1.default(w0) || w0.p & POSTAG.D_W) && ((w2 === null || w2 === void 0 ? void 0 : w2.p) && (w2.p & POSTAG.D_N || w2.p & POSTAG.D_V || w2.p & POSTAG.D_R || w2.p & POSTAG.D_D || w2.p & POSTAG.D_T || w2.p & POSTAG.A_NR || w2.p & POSTAG.D_S || w2.p & POSTAG.D_F))) { /** * 當 於 在句子開頭並且後面是名詞或動詞時 */ w1.ow = w1.w; w1.w = '於'; new_p = POSTAG.D_P; w1.p = new_p; bool = true; } else if (w0 && w2) { let w3; if ((index_1.hexAndAny(w0.p, POSTAG.D_V, POSTAG.D_R, POSTAG.D_A, POSTAG.D_T, POSTAG.D_F) && index_1.hexAndAny(w2.p, POSTAG.D_N, POSTAG.D_V, POSTAG.D_R, POSTAG.D_S, POSTAG.A_NX, POSTAG.D_F, POSTAG.D_W)) || (index_1.hexAndAny(w0.p, POSTAG.D_N) && index_1.hexAndAny(w2.p, POSTAG.D_N)) || (index_1.hexAndAny(w0.p, POSTAG.D_V, POSTAG.D_N) && index_1.hexAndAny(w2.p, POSTAG.D_F, POSTAG.D_T, POSTAG.A_NR, POSTAG.D_R, POSTAG.D_S, POSTAG.D_W)) || (index_1.hexAndAny(w0.p, POSTAG.A_NS, POSTAG.D_T, POSTAG.D_C) && index_1.hexAndAny(w2.p, POSTAG.A_NS, POSTAG.D_T)) || (index_1.hexAndAny(w0.p, POSTAG.D_D) && index_1.hexAndAny(w2.p, POSTAG.D_N)) /* || (hexAndAny(w0.p, POSTAG.D_V, ) && hexAndAny(w2.p, POSTAG.D_D, )) */ || (index_1.hexAndAny(w0.p, POSTAG.A_NR) && index_1.hexAndAny(w2.p, POSTAG.A_NS, POSTAG.A_NT, POSTAG.D_S, POSTAG.D_N, POSTAG.D_V)) || (index_1.hexAndAny(w0.p, POSTAG.D_V) && index_1.hexAndAny(w2.p, POSTAG.D_W)) || (index_1.hexAndAny(w0.p, POSTAG.D_D) && index_1.hexAndAny(w2.p, POSTAG.D_V)) || (index_1.hexAndAny(w0.p, POSTAG.D_V) && index_1.hexAndAny(w2.p, POSTAG.D_D)) || (index_1.hexAndAny(w0.p, POSTAG.D_N) && index_1.hexAndAny(w2.p, POSTAG.D_V))) { w1.ow = w1.w; w1.w = '於'; new_p = POSTAG.D_P; w1.p = new_p; bool = true; } else if (!isUnset_1.default(w3 = words[i + 2])) { if (w0.p & POSTAG.D_V && w2.p & POSTAG.D_D && w3.p & POSTAG.D_V) { w1.ow = w1.w; w1.w = '於'; new_p = POSTAG.D_P; w1.p = new_p; bool = true; } } } if (!bool && ((w2 === null || w2 === void 0 ? void 0 : w2.p) & POSTAG.D_T)) { /** * 迫使法妮雅得于日后和杰弥尼成婚…… */ w1.ow = w1.w; w1.w = '於'; new_p = POSTAG.D_P; w1.p = new_p; bool = true; } } else if (w1.w === '么') { if (isUnset_1.default(w2) || w2.p & POSTAG.D_W) { w1.ow = w1.w; w1.w = '麼'; bool = true; } } } else if (w1_len > 1) { if (w1.w.match(/^(.+)[发發]$/)) { let c = RegExp.$1; if (COLORS_1.COLOR_HAIR[c]) { let nw = c + '髮'; nw = this._getSynonym(w1.w, nw); if (nw !== w1.w) { w1.ow = w1.w; w1.w = nw; bool = true; } } else if (w1.w === (c + '发') && (w1.p & POSTAG.D_MQ)) { //  一发、兩发、三发、四发、五发、六发—— let nw = c + '發'; w1.ow = w1.w; w1.w = nw; bool = true; } else if ( // 不修正繁體的 發 w1.w === (c + '发') && (isUnset_1.default(w0) || (w0.p === POSTAG.D_W //|| COLOR_HAIR[w0.w] ))) { let nw = c + '髮'; let ow = TABLE[nw]; if (ow === null || ow === void 0 ? void 0 : ow.s) { w1.ow = w1.w; w1.w = nw; new_p = ow.p; bool = true; } } } else if (index_1.hexAndAny(w1.p, POSTAG.D_MQ) && /^(.+)余$/.test(w1.w)) { let nw = RegExp.$1 + '餘'; w1.ow = w1.w; w1.w = nw; bool = true; } // 如果項目為 量词 else if (index_1.hexAndAny(w1.p, //POSTAG.A_Q, POSTAG.D_MQ)) { if (/^几/.test(w1.w) && ((_a = w1.m) === null || _a === void 0 ? void 0 : _a.length) > 1) { /* let m = w1.m as IWord[]; if (m[0].p & POSTAG.D_MQ) { } */ let nw = w1.w.replace(/^几/, '幾'); w1.ow = w1.w; w1.w = nw; bool = true; } } else if (w1.p & POSTAG.D_V && /^干(.)$/.test(w1.w)) { /** * @todo 需要更嚴謹的判斷方式 */ let c = RegExp.$1; let nw = '幹' + c; let ow = TABLE[nw]; if (ow && index_1.hexAndAny(ow.p, POSTAG.D_V)) { if (w2 && index_1.hexAndAny(w2.p, POSTAG.D_R)) { w1.ow = w1.w; w1.w = nw; bool = true; } } } // 如果項目為 錯字 else if (w1.p & POSTAG.BAD) { let nw; nw = w1.w .replace(/(.)里|里(.)/, '$1裡$2') .replace(/(.)后|后(.)/, '$1後$2') .replace(/蔘(.)/, '參$1'); nw = this._getSynonym(w1.w, nw); //console.log(w1, nw); if (nw !== w1.w) { w1.ow = w1.w; w1.w = nw; bool = true; } } // 如果項目為 方位 else if (w1.p & POSTAG.D_F) { let nw = w1.w .replace(/(.)里|里(.)/, '$1裡$2') .replace(/(.)后|后(.)/, '$1後$2'); nw = this._getSynonym(w1.w, nw); if (nw !== w1.w) { w1.ow = w1.w; w1.w = nw; bool = true; } } // 如果項目為 處所 else if (w1.p & POSTAG.D_S) { let nw = w1.w .replace(/(.)里$/, '$1裡'); nw = this._getSynonym(w1.w, nw); if (nw !== w1.w) { w1.ow = w1.w; w1.w = nw; bool = true; } } // 如果項目為 时间 else if (w1.p & POSTAG.D_T || w1.p & POSTAG.D_V) { let nw = w1.w .replace(/(.)后|后(.)/, '$1後$2'); nw = this._getSynonym(w1.w, nw); if (nw !== w1.w) { w1.op = w1.op || w1.p; w1.ow = w1.w; w1.w = nw; bool = true; } } } if (bool && w1.ow && w1.ow !== w1.w) { if (w1.w in TABLE) { let ow = TABLE[w1.w]; if (typeof new_p !== 'undefined') { w1.op = w1.op || ow.p; w1.p = new_p; } else if (ow.p !== w1.p) { w1.op = w1.op || w1.p; w1.p = ow.p; //console.log(TABLE[w1.w]); } if (ow.s !== w1.s) { w1.os = ('os' in w1) ? w1.os : (w1.s || false); w1.s = ow.s; } } this.debugToken(w1, { [this.name]: true, }); } i++; } return words; } } exports.ZhtSynonymOptimizer = ZhtSynonymOptimizer; exports.init = ZhtSynonymOptimizer.init.bind(ZhtSynonymOptimizer); exports.type = ZhtSynonymOptimizer.type; exports.default = ZhtSynonymOptimizer; //# sourceMappingURL=ZhtSynonymOptimizer.js.map