novel-segment
Version:
Chinese word segmentation 簡繁中文分词模块 以網路小說為樣本
224 lines • 7.1 kB
JavaScript
/**
* 分词器接口
*
* @author 老雷<leizongmin@gmail.com>
*/
'use strict';
Object.defineProperty(exports, "__esModule", { value: true });
exports.SegmentCore = void 0;
const index_1 = require("../mod/index");
const stringify_1 = require("./methods/stringify");
const split_1 = require("./methods/split");
const indexOf_1 = require("./methods/indexOf");
const convertSynonym_1 = require("./methods/convertSynonym");
const listModules_1 = require("./methods/listModules");
const _get_text_1 = require("./methods/_get_text");
const getOptionsDoSegment_1 = require("./methods/getOptionsDoSegment");
const useModules_1 = require("./methods/useModules");
const doSegment_1 = require("./methods/doSegment");
const ids_1 = require("@novel-segment/postag/lib/postag/ids");
/**
 * Core segmenter.
 *
 * Holds the dictionaries, the registered tokenizer/optimizer modules and the
 * options, and runs the segmentation pipeline over a piece of text.
 */
class SegmentCore {
    /**
     * @param {Object} [options] segmenter options, shallow-copied onto
     *   `this.options`. When `options.db` is present it is walked with
     *   `forEach` and every entry is registered in `this.db` under its
     *   `type`; the `db` key is then removed from the stored copy.
     */
    constructor(options = {}) {
        /**
         * Paragraph splitter.
         *
         * Segmentation analyses the surrounding context of the content, so
         * the way the text is cut into sections influences the result.
         *
         * A `RegExp`, or any object exposing
         * `[Symbol.split](input: string, limit?: number) => string[]`.
         *
         * @type {Segment.ISPLIT}
         */
        this.SPLIT = /([\r\n]+|^[ \s]+|[ \s]+$|[ \s]{2,})/gm;
        /**
         * Sections matching this filter are not analysed after splitting.
         * A `RegExp`, or any object exposing `.test(input: string) => boolean`.
         *
         * @type {Segment.ISPLIT_FILTER}
         */
        this.SPLIT_FILTER = /^([\r\n]+)$/g;
        /**
         * Part-of-speech tags.
         * @type {POSTAG}
         */
        this.POSTAG = ids_1.POSTAG;
        /**
         * Dictionary tables, keyed by dictionary type.
         * @type {{}}
         */
        this.DICT = {};
        this.modules = {
            /**
             * Tokenizer modules.
             */
            tokenizer: [],
            /**
             * Optimizer modules.
             */
            optimizer: [],
        };
        this.db = {};
        this.options = {};
        // Work on a copy so that deleting `db` below never touches the caller's object.
        this.options = Object.assign({}, this.options, options);
        this.tokenizer = new index_1.Tokenizer(this);
        this.optimizer = new index_1.Optimizer(this);
        if (this.options.db) {
            this.options.db.forEach((data) => {
                this.db[data.type] = data;
            });
        }
        delete this.options.db;
    }

    /**
     * Look up a dictionary database registered under `type`.
     *
     * @param {String} type database key
     * @param {*} [autocreate] accepted but not used by this implementation
     * @param {*} [libTableDict] accepted but not used by this implementation
     * @return {*} the entry stored in `this.db`, or `undefined`
     */
    getDictDatabase(type, autocreate, libTableDict) {
        return this.db[type];
    }

    /**
     * Register modules by delegating to the `useModules` helper.
     *
     * @param {*} mod module(s) to register
     * @param {...*} argv extra arguments forwarded to the helper
     * @return {SegmentCore} this instance, for chaining
     */
    use(mod, ...argv) {
        useModules_1.useModules(this, mod, ...argv);
        return this;
    }

    /**
     * @param {String} type dictionary key
     * @return {*} the entry of `this.DICT` stored under `type`
     */
    getDict(type) {
        return this.DICT[type];
    }

    /**
     * Resolve `doSegment` options through the `getOptionsDoSegment` helper,
     * handing it `this.options.optionsDoSegment` as second argument.
     *
     * @param {Object} options per-call options
     * @return {Object} the resolved options
     */
    getOptionsDoSegment(options) {
        return getOptionsDoSegment_1.getOptionsDoSegment(options, this.options.optionsDoSegment);
    }

    /**
     * Normalise the input through the `_get_text` helper.
     *
     * @param {*} text
     * @return {*} whatever the helper returns; `doSegment` calls `.split()` on it
     */
    _get_text(text) {
        return _get_text_1._get_text(text);
    }

    /**
     * Add a word to, or remove it from, the BLACKLIST database.
     *
     * When adding, the word is also removed from the TABLE database.
     *
     * @param {String} word
     * @param {Boolean} [remove] truthy to take the word off the blacklist
     * @return {SegmentCore} this instance, for chaining
     */
    addBlacklist(word, remove) {
        const BLACKLIST = this.getDictDatabase("BLACKLIST" /* BLACKLIST */);
        const TABLE = this.getDictDatabase("TABLE" /* TABLE */);
        if (remove) {
            BLACKLIST.remove(word);
        }
        else {
            BLACKLIST.add(word);
            TABLE.remove(word);
        }
        return this;
    }

    /**
     * Remove from the TABLE database every key whose value in the BLACKLIST
     * dictionary is truthy.
     *
     * @return {SegmentCore} this instance, for chaining
     */
    doBlacklist() {
        const BLACKLIST = this.getDict("BLACKLIST" /* BLACKLIST */);
        const TABLE = this.getDictDatabase("TABLE" /* TABLE */);
        Object.entries(BLACKLIST)
            .forEach(([key, bool]) => {
                bool && TABLE.remove(key);
            });
        return this;
    }

    /**
     * List the registered modules for the given (resolved) options.
     *
     * @param {Object} [options]
     * @return {*} result of the `listModules` helper; `doSegment` reads its `enable` property
     */
    listModules(options = {}) {
        options = this.getOptionsDoSegment(options);
        return listModules_1.listModules(this.modules, options);
    }

    /**
     * Segment a text.
     *
     * The text is cut into sections with `this.SPLIT`; each section is run
     * through the tokenizer and then the optimizer, and the per-section
     * results are concatenated. Sections matching `this.SPLIT_FILTER` are
     * emitted verbatim as `{ w: section }` without being analysed.
     *
     * @param {*} text input text
     * @param {Object} [options] per-call options; the flags read here are
     *   `stripPunctuation`, `convertSynonym`, `stripStopword`, `stripSpace`
     *   and `simple`
     * @return {Array} list of words (or the simplified form when `options.simple` is set)
     */
    doSegment(text, options = {}) {
        options = this.getOptionsDoSegment(options);
        const sections = this._get_text(text).split(this.SPLIT);
        // Drop the local reference to the (possibly large) input.
        text = undefined;
        const mods = this.listModules(options).enable;

        const isFiltered = (section) => {
            const filter = this.SPLIT_FILTER;
            // A global/sticky RegExp remembers `lastIndex` between test()
            // calls; rewind it so every section is matched from its start.
            if (filter instanceof RegExp) {
                filter.lastIndex = 0;
            }
            return filter.test(section);
        };

        // Segment the sections one by one and join the results.
        let ret = sections.reduce((acc, section) => {
            if (isFiltered(section)) {
                return acc.concat({ w: section });
            }
            if (section.length > 0) {
                // Tokenize
                let sret = this.tokenizer.split(section, mods.tokenizer);
                // Optimize
                sret = this.optimizer.doOptimize(sret, mods.optimizer);
                // Append this section's words
                if (sret.length > 0) {
                    return acc.concat(sret);
                }
            }
            return acc;
        }, []);

        // Remove punctuation
        if (options.stripPunctuation) {
            ret = doSegment_1._doSegmentStripPOSTAG(ret, ids_1.POSTAG.D_W);
        }
        if (options.convertSynonym) {
            ret = this.convertSynonym(ret);
        }
        // Remove stopwords
        if (options.stripStopword) {
            ret = doSegment_1._doSegmentStripStopword(ret, this.getDict('STOPWORD'));
        }
        if (options.stripSpace) {
            ret = doSegment_1._doSegmentStripSpace(ret);
        }
        // Return only the word content
        if (options.simple) {
            ret = doSegment_1._doSegmentSimple(ret);
        }
        return ret;
    }

    /**
     * Run the `convertSynonym` helper with this instance's SYNONYM and TABLE
     * dictionaries and its POSTAG table.
     *
     * @param {Array} ret word list
     * @param {*} [showcount] forwarded to the helper unchanged
     * @return {*} result of the helper
     */
    convertSynonym(ret, showcount) {
        return convertSynonym_1.convertSynonym(ret, {
            showcount,
            DICT_SYNONYM: this.getDict('SYNONYM'),
            DICT_TABLE: this.getDict('TABLE'),
            POSTAG: this.POSTAG,
        });
    }

    /**
     * Join a word array into a string.
     *
     * @param {Array} words word array
     * @param {...*} argv extra arguments forwarded to the helper
     * @return {String}
     */
    stringify(words, ...argv) {
        return stringify_1.stringify(words, ...argv);
    }

    /**
     * Join a word array into a string (static variant of the instance method).
     *
     * @param {Array} words word array
     * @param {...*} argv extra arguments forwarded to the helper
     * @return {String}
     */
    static stringify(words, ...argv) {
        return stringify_1.stringify(words, ...argv);
    }

    /**
     * Split a word array on a given word or part of speech.
     *
     * @param {Array} words word array
     * @param {Number|String} s word or part of speech to split on
     * @param {...*} argv extra arguments forwarded to the helper
     * @return {Array}
     */
    split(words, s, ...argv) {
        return split_1.split(words, s, ...argv);
    }

    /**
     * Find the position of a word or part of speech in a word array.
     *
     * @param {Array} words word array
     * @param {Number|String} s word or part of speech to look for
     * @param {Number} cur start position
     * @param {...*} argv extra arguments forwarded to the helper
     * @return {Number} the position, or -1 when not found
     */
    indexOf(words, s, cur, ...argv) {
        // `s` must be forwarded: without it the helper would receive the
        // start position in place of the search key.
        return indexOf_1.indexOf(words, s, cur, ...argv);
    }
}
// Publish the class both as the named export `SegmentCore` and as the `default` export.
exports.SegmentCore = SegmentCore;
exports.default = SegmentCore;
//# sourceMappingURL=core.js.map