novel-segment
Version:
Chinese word segmentation 簡繁中文分词模块 以網路小說為樣本
99 lines • 3.14 kB
JavaScript
;
// Define `__esModule: true` on the CommonJS exports object (the usual marker
// emitted by ES-module-to-CommonJS compilers so importers can recognize the
// module as a transpiled ES module).
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare the named exports as undefined; the real values are assigned
// at the bottom of the file once the class has been defined.
exports.type = exports.init = exports.PunctuationTokenizer = void 0;
/**
* 标点符号识别模块
*
* @author 老雷<leizongmin@gmail.com>
*/
const mod_1 = require("../mod");
const STOPWORD_1 = require("../mod/data/STOPWORD");
/**
 * Punctuation recognition module.
 *
 * Tokenizer pass that looks inside words which are still unrecognized and
 * splits out any stop words (punctuation) they contain, using the stop-word
 * tables loaded from `../mod/data/STOPWORD`.
 */
class PunctuationTokenizer extends mod_1.SubSModuleTokenizer {
    constructor(...args) {
        super(...args);
        this.name = 'PunctuationTokenizer';
        // Stop-word tables, exposed on the instance.
        this._STOPWORD = STOPWORD_1._STOPWORD;
        this.STOPWORD = STOPWORD_1.STOPWORD;
        // Keyed by stop-word length; each bucket's keys are the stop words of
        // that length (this is how matchStopword() reads it).
        this.STOPWORD2 = STOPWORD_1.STOPWORD2;
    }

    /**
     * Split unrecognized words around the stop words they contain.
     *
     * Words with `p > 0` and words containing no stop word are passed through
     * unchanged. Otherwise the word is replaced by a sequence of tokens: each
     * matched stop word (tagged with `POSTAG.D_W` and passed through
     * `debugToken`) plus the untagged text fragments between them.
     *
     * @param {Array<{w: string, p?: number}>} words word array
     * @return {Array<{w: string, p?: number}>} new word array
     */
    split(words) {
        // NOTE(review): `_POSTAG` and `debugToken` are not defined in this class;
        // presumably provided by the base class — confirm.
        const POSTAG = this._POSTAG;
        const ret = [];
        for (let i = 0; i < words.length; i++) {
            const word = words[i];
            if (!word) {
                // Historical behavior: iteration ends at the first falsy entry.
                break;
            }
            // Only words that have not been recognized yet are examined.
            if (word.p > 0) {
                ret.push(word);
                continue;
            }
            const stopinfo = this.matchStopword(word.w);
            if (stopinfo.length < 1) {
                ret.push(word);
                continue;
            }
            // Separate the stop words from the surrounding text.
            let lastc = 0;
            for (const sw of stopinfo) {
                if (sw.c > lastc) {
                    // Untagged text between the previous match and this one.
                    ret.push({
                        w: word.w.slice(lastc, sw.c),
                    });
                }
                ret.push(this.debugToken({
                    w: sw.w,
                    p: POSTAG.D_W,
                }, {
                    [this.name]: true,
                }, true));
                lastc = sw.c + sw.w.length;
            }
            // Remaining text after the last match, if any.
            if (lastc < word.w.length) {
                ret.push({
                    w: word.w.slice(lastc),
                });
            }
        }
        return ret;
    }

    /**
     * Find every stop word contained in `text`, scanning left to right.
     *
     * At each position the length buckets of `STOPWORD2` are tried in their
     * enumeration order and the first bucket that contains the candidate wins;
     * scanning then resumes right after the match.
     *
     * @param {string} text text to scan
     * @param {number} [cur] start position; missing, non-numeric or negative
     *   values are treated as 0
     * @return {Array<{w: string, c: number}>} matches, where `w` is the stop
     *   word and `c` its start offset in `text`
     */
    matchStopword(text, cur) {
        const STOPWORD2 = this.STOPWORD2;
        const hasOwn = Object.prototype.hasOwnProperty;
        // Coerce once so a numeric string cannot turn `pos += n` into string
        // concatenation, and clamp so offsets never count from the end.
        let pos = Number(cur);
        if (!(pos > 0)) {
            pos = 0;
        }
        const ret = [];
        while (pos < text.length) {
            let matched = null;
            for (const lenKey in STOPWORD2) {
                const candidate = text.slice(pos, pos + Number(lenKey));
                // Own-property check: inherited keys such as "toString" must not
                // count as stop words. Empty candidates are skipped so the scan
                // always advances.
                if (candidate !== '' && hasOwn.call(STOPWORD2[lenKey], candidate)) {
                    matched = candidate;
                    break;
                }
            }
            if (matched === null) {
                pos += 1;
            }
            else {
                ret.push({ w: matched, c: pos });
                pos += matched.length;
            }
        }
        return ret;
    }
}
// Named export of the tokenizer class.
exports.PunctuationTokenizer = PunctuationTokenizer;
// debug(STOPWORD2);
// Export the class's static `init`, bound to the class so it can be called
// as a free function. `init` is not defined in this class body; presumably it
// is inherited from the base class — confirm.
exports.init = PunctuationTokenizer.init.bind(PunctuationTokenizer);
// Export the class's static `type` value (also not defined in this class body).
exports.type = PunctuationTokenizer.type;
// Default export is the class itself.
exports.default = PunctuationTokenizer;
//# sourceMappingURL=PunctuationTokenizer.js.map