novel-segment
Version:
Chinese word segmentation 簡繁中文分词模块 以網路小說為樣本
157 lines (152 loc) • 4.26 kB
JavaScript
;
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.TableDict = void 0;
const segment_1 = require("segment-dict/lib/loader/segment");
const cjk_1 = require("../util/cjk");
const core_1 = __importDefault(require("./core"));
const lodash_1 = require("lodash");
const isNum_1 = require("../util/isNum");
/**
 * Word table keyed by the word itself, with a secondary index grouped by
 * word length.
 *
 * Each stored entry is an object `{ p, f, s }`. The same entry object is
 * referenced from both `TABLE[word]` and `TABLE2[word.length][word]`.
 *
 * @todo hook up other dicts
 */
class TableDict extends core_1.default {
    constructor() {
        super(...arguments);
        // word -> entry
        this.TABLE = {};
        // word length -> (word -> entry); shares entry objects with TABLE
        this.TABLE2 = {};
    }
    /**
     * Looks up a word in the table.
     *
     * @param data a word string, an array whose first element is the word,
     *   or an object with a `w` property holding the word
     * @returns the stored entry, or `null` when the word is not in the table
     *   (also `null` when `data` is `null` or `undefined`)
     */
    exists(data) {
        if (data === null || data === undefined) {
            return null;
        }
        let word;
        if (typeof data === 'string') {
            word = data;
        }
        else if (Array.isArray(data)) {
            word = data[0];
        }
        else {
            word = data.w;
        }
        // TABLE is a plain object, so a bare `TABLE[word]` would also resolve
        // names inherited from Object.prototype ("constructor", "toString", ...).
        // Only own properties count as stored words.
        if (!Object.prototype.hasOwnProperty.call(this.TABLE, word)) {
            return null;
        }
        return this.TABLE[word] || null;
    }
    /**
     * Normalizes the accepted input shapes into `{ data: { w, p, f }, plus }`.
     *
     * `p` and `f` are replaced by `0` when `notNum` reports them as not a
     * number. `plus` holds the array elements after the third one and is
     * `undefined` for string and object input.
     *
     * @param data a word string, an array `[w, p, f, ...rest]`, or an object
     *   with `w`, `p` and `f` properties
     * @throws {TypeError} when the word is not a non-empty string; the message
     *   is the JSON form of `data`
     */
    __handleInput(data) {
        let w, p, f;
        let plus;
        if (typeof data === 'string') {
            w = data;
        }
        else if (Array.isArray(data)) {
            [w, p, f, ...plus] = data;
        }
        else {
            ({ w, p, f } = data);
        }
        if (typeof w !== 'string' || w === '') {
            throw new TypeError(JSON.stringify(data));
        }
        p = isNum_1.notNum(p) ? 0 : p;
        f = isNum_1.notNum(f) ? 0 : f;
        return {
            data: {
                w, p, f,
            },
            plus,
        };
    }
    /**
     * Adds a word to the table. The entry stored for the given word is marked
     * with `s: true`.
     *
     * When `this.options.autoCjk` is truthy, every string returned by
     * `text_list(word)` that differs from the word and is not yet in the table
     * is added as well, with the same `p` and `f` but without the `s` flag.
     *
     * @param data see `__handleInput` for the accepted shapes
     * @param skipExists when truthy, a word already in the table is left as is
     * @returns this instance, for chaining
     * @throws {TypeError} when the word is not a non-empty string
     */
    add(data, skipExists) {
        // @todo do something with the extra array elements (`plus`)
        const { w, p, f } = this.__handleInput(data).data;
        if (skipExists && this.exists(w)) {
            return this;
        }
        this._add({ w, p, f, s: true });
        /**
         * @todo needs a smarter approach; the current one is too naive
         * @BUG for unknown reasons not every variant seems to get added
         *   correctly; if that happens, add the simplified/traditional
         *   entries manually
         */
        // NOTE(review): `options` is not set in this class; presumably it is
        // provided by the base class - confirm.
        if (this.options.autoCjk) {
            cjk_1.text_list(w).forEach((variant) => {
                if (variant !== w && !this.exists(variant)) {
                    this._add({ w: variant, p, f });
                }
            });
        }
        return this;
    }
    /**
     * Stores an entry in both indexes, overwriting any existing entry for the
     * same word.
     */
    _add({ w, p, f, s }) {
        const entry = { p, f, s };
        const len = w.length;
        this.TABLE[w] = entry;
        if (!this.TABLE2[len]) {
            this.TABLE2[len] = {};
        }
        this.TABLE2[len][w] = entry;
    }
    /**
     * Removes a word from the table.
     *
     * @param target see `__handleInput` for the accepted shapes
     * @returns this instance, for chaining
     * @throws {TypeError} when the word is not a non-empty string
     */
    remove(target) {
        this._remove(this.__handleInput(target).data);
        return this;
    }
    /**
     * Deletes the word from both indexes. Only `w` is used.
     *
     * @returns this instance, for chaining
     */
    _remove({ w }) {
        const bucket = this.TABLE2[w.length];
        delete this.TABLE[w];
        if (bucket) {
            delete bucket[w];
        }
        return this;
    }
    /**
     * Returns a deep copy of the word table.
     */
    json() {
        return lodash_1.cloneDeep(this.TABLE);
    }
    /**
     * Exports the current table as text, one `stringifyLine([w, p, f])` result
     * per word.
     *
     * @param LF line separator; anything that is not a string falls back to "\n"
     */
    stringify(LF = "\n") {
        const separator = typeof LF === 'string' ? LF : "\n";
        return Object.entries(this.TABLE)
            .map(([w, { p, f }]) => segment_1.stringifyLine([w, p, f]))
            .join(separator);
    }
}
exports.TableDict = TableDict;
exports.default = TableDict;
//# sourceMappingURL=dict.js.map