novel-segment
Version:
Chinese word segmentation 簡繁中文分词模块 以網路小說為樣本
244 lines • 9.71 kB
JavaScript
/**
* 人名优化模块
*
* @author 老雷<leizongmin@gmail.com>
* @version 0.1
*/
'use strict';
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.type = exports.init = exports.ChsNameOptimizer = void 0;
const mod_1 = require("../mod");
const CHS_NAMES_1 = __importDefault(require("../mod/CHS_NAMES"));
/**
* @todo 支援 XX氏
*/
class ChsNameOptimizer extends mod_1.SubSModuleOptimizer {
constructor() {
super(...arguments);
this.name = 'ChsNameOptimizer';
}
_cache() {
super._cache();
this._TABLE = this.segment.getDict('TABLE');
this._BLACKLIST = this.segment.getDict("BLACKLIST_FOR_OPTIMIZER" /* BLACKLIST_FOR_OPTIMIZER */) || {};
}
isMergeable2(...words) {
let nw = words.join('');
if (!this._BLACKLIST[nw]) {
return true;
}
return null;
}
isMergeable(word, nextword) {
if (word && nextword) {
let nw = word.w + nextword.w;
/**
* 不合併存在於 BLACKLIST 內的字詞
*/
if (!this._BLACKLIST[nw]) {
return true;
/*
return {
word,
nextword,
nw,
bool: true,
}
*/
}
}
return null;
}
/**
* 只有新詞屬於人名或未知詞時才會合併
*/
validUnknownNewWord(ws, cb) {
var _a;
let nw = typeof ws === 'string' ? ws : ws.join('');
let ew = this._TABLE[nw];
if (!(ew === null || ew === void 0 ? void 0 : ew.p) || ew.p & this._POSTAG.A_NR) {
let ret = (_a = cb === null || cb === void 0 ? void 0 : cb(nw, ew, ws)) !== null && _a !== void 0 ? _a : true;
if (ret) {
return typeof ret === 'object' ? ret : (ew !== null && ew !== void 0 ? ew : true);
}
}
}
/**
* 对可能是人名的单词进行优化
*
* @param {array} words 单词数组
* @return {array}
*/
doOptimize(words) {
var _a, _b;
//debug(words);
const POSTAG = this._POSTAG;
let i = 0;
/* 第一遍扫描 */
while (i < words.length) {
let word = words[i];
let nextword = words[i + 1];
if (this.isMergeable(word, nextword) && this.validUnknownNewWord(word.w + nextword.w)) {
let nw = word.w + nextword.w;
//debug(nextword);
// 如果为 "小|老" + 姓
if (((_a = nextword === null || nextword === void 0 ? void 0 : nextword.w) === null || _a === void 0 ? void 0 : _a.length) && (word.w === '小' || word.w === '老') &&
(nextword.w in CHS_NAMES_1.default.FAMILY_NAME_1 || nextword.w in CHS_NAMES_1.default.FAMILY_NAME_2)) {
/*
words.splice(i, 2, {
w: word.w + nextword.w,
p: POSTAG.A_NR,
m: [word, nextword],
});
*/
this.sliceToken(words, i, 2, {
w: nw,
p: POSTAG.A_NR,
m: [word, nextword],
}, undefined, {
[this.name]: 1,
});
i++;
continue;
}
// 如果是 姓 + 名(2字以内)
if ((word.w in CHS_NAMES_1.default.FAMILY_NAME_1 || word.w in CHS_NAMES_1.default.FAMILY_NAME_2) &&
((nextword.p & POSTAG.A_NR) > 0 && nextword.w.length <= 2)) {
/*
words.splice(i, 2, {
w: word.w + nextword.w,
p: POSTAG.A_NR,
m: [word, nextword],
});
*/
this.sliceToken(words, i, 2, {
w: nw,
p: POSTAG.A_NR,
m: [word, nextword],
}, undefined, {
[this.name]: 2,
});
i++;
continue;
}
// 如果相邻两个均为单字且至少有一个字是未识别的,则尝试判断其是否为人名
if (!word.p || !nextword.p) {
if ((word.w in CHS_NAMES_1.default.SINGLE_NAME && word.w === nextword.w) ||
(word.w in CHS_NAMES_1.default.DOUBLE_NAME_1 && nextword.w in CHS_NAMES_1.default.DOUBLE_NAME_2)) {
/*
words.splice(i, 2, {
w: word.w + nextword.w,
p: POSTAG.A_NR,
m: [word, nextword],
});
*/
this.sliceToken(words, i, 2, {
w: nw,
p: POSTAG.A_NR,
m: [word, nextword],
}, undefined, {
[this.name]: 3,
});
// 如果上一个单词可能是一个姓,则合并
let preword = words[i - 1];
if (((_b = preword === null || preword === void 0 ? void 0 : preword.w) === null || _b === void 0 ? void 0 : _b.length) && (preword.w in CHS_NAMES_1.default.FAMILY_NAME_1 || preword.w in CHS_NAMES_1.default.FAMILY_NAME_2)
&& this.isMergeable2(preword.w, word.w, nextword.w)) {
let nw = preword.w + word.w + nextword.w;
/*
words.splice(i - 1, 2, {
w: preword.w + word.w + nextword.w,
p: POSTAG.A_NR,
m: [preword, word, nextword],
});
*/
this.sliceToken(words, i - 1, 2, {
w: nw,
p: POSTAG.A_NR,
m: [preword, word, nextword],
}, undefined, {
[this.name]: 4,
});
}
else {
i++;
}
continue;
}
}
// 如果为 无歧义的姓 + 名(2字以内) 且其中一个未未识别词
if ((word.w in CHS_NAMES_1.default.FAMILY_NAME_1 || word.w in CHS_NAMES_1.default.FAMILY_NAME_2)
&& (!word.p || !nextword.p)
/**
* 防止將標點符號當作名字的BUG
*/
&& !(word.p & POSTAG.D_W || nextword.p & POSTAG.D_W)) {
//debug(word, nextword);
/*
words.splice(i, 2, {
w: word.w + nextword.w,
p: POSTAG.A_NR,
m: [word, nextword],
});
*/
this.sliceToken(words, i, 2, {
w: nw,
p: POSTAG.A_NR,
m: [word, nextword],
}, undefined, {
[this.name]: 5,
});
}
}
// 移到下一个单词
i++;
}
/* 第二遍扫描 */
i = 0;
while (i < words.length) {
let word = words[i];
let nextword = words[i + 1];
if (this.isMergeable(word, nextword)) {
// 如果为 姓 + 单字名
if ((word.w in CHS_NAMES_1.default.FAMILY_NAME_1 || word.w in CHS_NAMES_1.default.FAMILY_NAME_2)
&&
nextword.w in CHS_NAMES_1.default.SINGLE_NAME) {
/*
words.splice(i, 2, {
w: word.w + nextword.w,
p: POSTAG.A_NR,
m: [word, nextword],
});
*/
let nw = word.w + nextword.w;
let ew = this._TABLE[nw];
/**
* 更改為只有新詞屬於人名或未知詞時才會合併
*/
if (!(ew === null || ew === void 0 ? void 0 : ew.p) || ew.p & POSTAG.A_NR) {
this.sliceToken(words, i, 2, {
w: nw,
p: POSTAG.A_NR,
m: [word, nextword],
}, undefined, {
[this.name]: 6,
exists_word: ew,
});
i++;
continue;
}
}
}
// 移到下一个单词
i++;
}
return words;
}
}
exports.ChsNameOptimizer = ChsNameOptimizer;
exports.init = ChsNameOptimizer.init.bind(ChsNameOptimizer);
exports.type = ChsNameOptimizer.type;
exports.default = ChsNameOptimizer;
//# sourceMappingURL=ChsNameOptimizer.js.map