UNPKG

novel-segment

Version:

Chinese word segmentation 簡繁中文分词模块 以網路小說為樣本

108 lines 3.43 kB
'use strict';
Object.defineProperty(exports, "__esModule", { value: true });
exports.type = exports.init = exports.WildcardTokenizer = void 0;
/**
 * Wildcard word recognition module (通配符识别模块).
 *
 * @author 老雷<leizongmin@gmail.com>
 */
const mod_1 = require("../mod");
/**
 * Tokenizer that recognizes "wildcard" words from the segmenter's
 * WILDCARD / WILDCARD2 dictionaries inside otherwise-unrecognized text.
 */
class WildcardTokenizer extends mod_1.SubSModuleTokenizer {
    constructor() {
        super(...arguments);
        this.name = 'WildcardTokenizer';
    }
    /**
     * Cache the dictionary tables from the segmenter.
     * _TABLE  : word -> info, used for POS (`p`) lookup.
     * _TABLE2 : word length -> { lower-cased word -> info }, used for matching.
     */
    _cache() {
        super._cache();
        this._TABLE = this.segment.getDict('WILDCARD');
        this._TABLE2 = this.segment.getDict('WILDCARD2');
    }
    /**
     * Segment words that have not yet been recognized.
     *
     * @param {array} words - token array from the previous pipeline stage
     * @return {array} token array with wildcard words split out
     */
    split(words) {
        return this._splitUnknow(words, this.splitWildcard);
    }
    /**
     * Build a token object for a matched wildcard word.
     *
     * @param {Object} word - `{ w, p }` word descriptor
     * @param lasttype - unused; kept for signature compatibility with callers
     * @param attr - extra attributes forwarded to `createToken`
     * @return {Object} the created token
     */
    createWildcardToken(word, lasttype, attr) {
        return this.createToken(word, true, attr);
    }
    /**
     * Split one chunk of unrecognized text into wildcard tokens plus
     * the unmatched gaps between them.
     *
     * @param {string} text - text to scan
     * @param {int} cur - unused; kept for signature compatibility
     * @return {array|undefined} tokens, or `undefined` when nothing matched
     */
    splitWildcard(text, cur) {
        const TABLE = this._TABLE;
        const ret = [];
        // Locate every recognized wildcard word with its start offset.
        const wordinfo = this.matchWord(text);
        if (wordinfo.length) {
            let lastc = 0;
            for (const bw of wordinfo) {
                if (bw.c > lastc) {
                    // Unmatched gap before this word stays an untagged token.
                    ret.push({ w: text.slice(lastc, bw.c) });
                }
                ret.push(this.createWildcardToken({
                    w: bw.w,
                    p: TABLE[bw.w.toLowerCase()]?.p,
                }));
                lastc = bw.c + bw.w.length;
            }
            // Trailing unmatched text after the last word.
            const lastword = wordinfo[wordinfo.length - 1];
            if (lastword.c + lastword.w.length < text.length) {
                ret.push({ w: text.slice(lastword.c + lastword.w.length) });
            }
        }
        return ret.length ? ret : undefined;
    }
    /**
     * Find all wildcard-dictionary words in `text`, longest match first
     * at each position.
     *
     * @param {string} text - text to scan
     * @param {int} cur - start position (defaults to 0)
     * @return {array} entries of the form `{ w: word, c: startIndex }`
     */
    matchWord(text, cur) {
        const TABLE = this._TABLE2;
        // Global isNaN on purpose: an omitted `cur` (undefined) must fall back to 0.
        if (isNaN(cur)) cur = 0;
        const ret = [];
        // Dictionary keys are lower-cased; compare case-insensitively.
        const lowertext = text.toLowerCase();
        while (cur < text.length) {
            let stopword = null;
            // TABLE keys are word lengths; integer-like keys enumerate in
            // ascending order, so the last hit is the longest match.
            for (const len in TABLE) {
                const n = Number(len);
                const candidate = lowertext.slice(cur, cur + n);
                // Own-property check: the `in` operator would falsely match
                // inherited keys such as "constructor" on a plain object.
                if (Object.prototype.hasOwnProperty.call(TABLE[len], candidate)) {
                    stopword = {
                        w: text.slice(cur, cur + n),
                        c: cur,
                    };
                }
            }
            if (stopword !== null) {
                ret.push(stopword);
                cur += stopword.w.length;
            }
            else {
                cur++;
            }
        }
        return ret;
    }
}
exports.WildcardTokenizer = WildcardTokenizer;
exports.init = WildcardTokenizer.init.bind(WildcardTokenizer);
exports.type = WildcardTokenizer.type;
exports.default = WildcardTokenizer;
//# sourceMappingURL=WildcardTokenizer.js.map