UNPKG

novel-segment

Version:

Chinese word segmentation 簡繁中文分词模块 以網路小說為樣本

66 lines 2.56 kB
"use strict";
/**
 * Created by user on 2018/4/19/019.
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.type = exports.init = exports.JpSimpleTokenizer = exports.EnumJpSimpleTokenizerType = void 0;
const mod_1 = require("../mod");

/**
 * Script tags attached to the tokens produced by JpSimpleTokenizer.
 * The object maps both ways: name -> number and number -> name.
 */
const EnumJpSimpleTokenizerType = {};
exports.EnumJpSimpleTokenizerType = EnumJpSimpleTokenizerType;
/**
 * Hiragana
 * https://en.wikipedia.org/wiki/Hiragana
 */
EnumJpSimpleTokenizerType.HIRAGANA = 1;
EnumJpSimpleTokenizerType[1] = "HIRAGANA";
/**
 * Katakana
 * https://en.wikipedia.org/wiki/Katakana
 */
EnumJpSimpleTokenizerType.KATAKANA = 2;
EnumJpSimpleTokenizerType[2] = "KATAKANA";

/** Matches one character in the hiragana range used by this tokenizer. */
const HIRAGANA_CHAR = /[ぁ-ん]/;
/**
 * Matches one character in the katakana class used by this tokenizer.
 * NOTE(review): the class repeats a katakana range and the long-vowel mark;
 * presumably the second half was half-width katakana upstream - confirm.
 */
const KATAKANA_CHAR = /[ァ-ヴーア-ン゙ー]/;
/** Matches text made up exclusively of hiragana characters. */
const HIRAGANA_ONLY = /^[ぁ-ん]+$/;
/** Matches text made up exclusively of characters from the katakana class. */
const KATAKANA_ONLY = /^[ァ-ヴーア-ン゙ー]+$/;
/**
 * Splitter for text that contains both scripts. It captures a run of one
 * script (with an optional prefix of characters that are not of the other
 * script) when that run is immediately followed by the other script, so
 * `String.prototype.split` keeps the captured runs in its result.
 */
const SCRIPT_BOUNDARY = /((?:[^ァ-ヴーア-ン゙ー]+)?[ぁ-ん]+(?=[ァ-ヴーア-ン゙ー])|(?:[^ぁ-ん]+)?[ァ-ヴーア-ン゙ー]+(?=[ぁ-ん]))/;

/**
 * Tokenizer that cuts kana text at hiragana/katakana boundaries and tags
 * every resulting token with the script it was classified as.
 */
class JpSimpleTokenizer extends mod_1.SubSModuleTokenizer {
    constructor(...ctorArgs) {
        super(...ctorArgs);
        this.name = 'JpSimpleTokenizer';
    }

    /**
     * Entry point: hands `words` and the `_splitText` splitter to the
     * inherited `_splitUnset` and returns whatever it returns.
     * NOTE(review): `_splitUnset` is defined in ../mod; presumably it invokes
     * the splitter with this instance as `this` - confirm there.
     *
     * @param words word list forwarded unchanged to `_splitUnset`
     * @param argv  accepted for interface compatibility; not used here
     */
    split(words, ...argv) {
        const splitter = this._splitText;
        return this._splitUnset(words, splitter);
    }

    /**
     * Builds a token via the parent class's `debugToken`, recording the
     * script type under a key equal to this tokenizer's `name`.
     *
     * @param data token data (this class passes objects of the form `{ w }`)
     * @param type an EnumJpSimpleTokenizerType value
     * @returns the value returned by `debugToken`
     */
    createJpSimpleToken(data, type) {
        const debugInfo = {
            [this.name]: type,
        };
        // The meaning of the trailing `true` is defined by debugToken in ../mod.
        return super.debugToken(data, debugInfo, true);
    }

    /**
     * Splits one piece of text by kana script.
     *
     * - Text containing both scripts is cut with SCRIPT_BOUNDARY; each
     *   non-empty piece is tagged HIRAGANA if it contains any hiragana
     *   character, otherwise KATAKANA.
     * - Text consisting purely of one script yields a single token.
     * - Anything else yields `null`.
     *
     * @param text the text to split
     * @returns an array of tokens, or `null` when the text is not handled
     */
    _splitText(text) {
        const hasHiragana = HIRAGANA_CHAR.test(text);
        const hasKatakana = KATAKANA_CHAR.test(text);

        if (hasHiragana && hasKatakana) {
            return text
                .split(SCRIPT_BOUNDARY)
                .filter((piece) => piece !== '')
                .map((piece) => this.createJpSimpleToken({
                    w: piece,
                }, HIRAGANA_CHAR.test(piece)
                    ? EnumJpSimpleTokenizerType.HIRAGANA
                    : EnumJpSimpleTokenizerType.KATAKANA));
        }

        if (hasHiragana && HIRAGANA_ONLY.test(text)) {
            return [this.createJpSimpleToken({
                w: text,
            }, EnumJpSimpleTokenizerType.HIRAGANA)];
        }

        if (hasKatakana && KATAKANA_ONLY.test(text)) {
            return [this.createJpSimpleToken({
                w: text,
            }, EnumJpSimpleTokenizerType.KATAKANA)];
        }

        return null;
    }
}
exports.JpSimpleTokenizer = JpSimpleTokenizer;
JpSimpleTokenizer.NAME = 'JpSimpleTokenizer';
exports.init = JpSimpleTokenizer.init.bind(JpSimpleTokenizer);
exports.type = JpSimpleTokenizer.type;
exports.default = JpSimpleTokenizer;
//# sourceMappingURL=JpSimpleTokenizer.js.map