UNPKG

novel-segment

Version:

Chinese word segmentation 簡繁中文分词模块 以網路小說為樣本

184 lines (183 loc) 6.87 kB
/**
 * Segmenter interface.
 *
 * @author 老雷<leizongmin@gmail.com>
 */
import { TableDictBlacklist } from '@novel-segment/table-blacklist';
import { AbstractTableDictCore } from '@novel-segment/table-core-abstract';
import { TableDict } from '@novel-segment/table-dict';
import { TableDictStopword } from '@novel-segment/table-stopword';
import { TableDictSynonym } from '@novel-segment/table-synonym';
import { ISubOptimizer, ISubTokenizer, Optimizer, Tokenizer } from '../mod/index';
import { IWordDebug } from '../util/index';
import { IDICT, IDICT2, IDICT_BLACKLIST, IDICT_STOPWORD, IDICT_SYNONYM, IOptionsDoSegment, IOptionsSegment, ISPLIT, ISPLIT_FILTER } from './types';
import { ITSTypeAndStringLiteral } from 'ts-type/lib/helper/string';
import { ITSOverwrite, ITSPartialRecord } from 'ts-type/lib/type/record';
import { POSTAG } from '@novel-segment/postag/lib/postag/ids';
import { EnumDictDatabase, IWord } from '@novel-segment/types';
/**
 * Create the segmenter interface.
 *
 * Ambient declaration only: the signatures below describe the shape of the
 * class; the behaviour itself lives in the implementation file.
 */
export declare class SegmentCore {
    /**
     * Paragraph splitting.
     *
     * Because segment performs word segmentation by analysing the context
     * before and after the content, how the text is cut into paragraphs
     * affects the result.
     *
     * A `RegExp`, or an object that has
     * `[Symbol.split](input: string, limit?: number) => string[]`.
     */
    SPLIT: ISPLIT;
    /**
     * After splitting, a paragraph that matches this condition is skipped
     * without being analysed.
     *
     * A `RegExp`, or an object that has `.test(input: string) => boolean`.
     */
    SPLIT_FILTER: ISPLIT_FILTER;
    /**
     * Part-of-speech tags.
     */
    POSTAG: typeof POSTAG;
    /**
     * Dictionary tables.
     *
     * The `STOPWORD` and `SYNONYM` entries are optional and have dedicated
     * types; any other string key maps to an `IDICT`.
     */
    DICT: {
        STOPWORD?: IDICT_STOPWORD;
        SYNONYM?: IDICT_SYNONYM;
        [key: string]: IDICT;
    } & ITSPartialRecord<ITSTypeAndStringLiteral<EnumDictDatabase.SYNONYM>, IDICT_SYNONYM> & ITSPartialRecord<ITSTypeAndStringLiteral<EnumDictDatabase.STOPWORD>, IDICT_STOPWORD>;
    /**
     * Sub-module lists: `tokenizer` holds `ISubTokenizer` entries and
     * `optimizer` holds `ISubOptimizer` entries.
     *
     * NOTE(review): presumably populated by `use()` — confirm against the
     * implementation.
     */
    modules: {
        tokenizer: ISubTokenizer[];
        optimizer: ISubOptimizer[];
    };
    /**
     * The `Tokenizer` instance held by this segmenter.
     */
    tokenizer: Tokenizer;
    /**
     * The `Optimizer` instance held by this segmenter.
     */
    optimizer: Optimizer;
    /**
     * `TableDict` instances keyed by string.
     *
     * NOTE(review): presumably the store behind `getDictDatabase()` — confirm.
     */
    db: {
        [key: string]: TableDict;
    };
    /**
     * Segmenter options.
     *
     * NOTE(review): presumably derived from the constructor argument — confirm.
     */
    options: IOptionsSegment;
    /**
     * Optional flag.
     *
     * NOTE(review): presumably set once initialisation has run — confirm.
     */
    inited?: boolean;
    /**
     * @param options optional segmenter options
     */
    constructor(options?: IOptionsSegment);
    /**
     * Get a dictionary database object for the given type.
     *
     * The overloads tie each `EnumDictDatabase` member to the table class that
     * bounds the return type (`SYNONYM` → `TableDictSynonym`, `TABLE` →
     * `TableDict`, `STOPWORD` → `TableDictStopword`, `BLACKLIST` and
     * `BLACKLIST_FOR_OPTIMIZER` → `TableDictBlacklist`); the last overload
     * accepts any string or enum member and is bounded by
     * `AbstractTableDictCore<any>`.
     *
     * NOTE(review): the semantics of `autocreate` and `libTableDict` are not
     * visible in this declaration — presumably "create when missing, using the
     * given constructor"; verify against the implementation.
     *
     * @param type dictionary database type
     * @param autocreate optional flag (see note above)
     * @param libTableDict optional constructor producing an `R`
     */
    getDictDatabase<R extends TableDictSynonym>(type: EnumDictDatabase.SYNONYM, autocreate?: boolean, libTableDict?: { new (...argv: any[]): R; }): R;
    getDictDatabase<R extends TableDict>(type: EnumDictDatabase.TABLE, autocreate?: boolean, libTableDict?: { new (...argv: any[]): R; }): R;
    getDictDatabase<R extends TableDictStopword>(type: EnumDictDatabase.STOPWORD, autocreate?: boolean, libTableDict?: { new (...argv: any[]): R; }): R;
    getDictDatabase<R extends TableDictBlacklist>(type: EnumDictDatabase.BLACKLIST, autocreate?: boolean, libTableDict?: { new (...argv: any[]): R; }): R;
    getDictDatabase<R extends TableDictBlacklist>(type: EnumDictDatabase.BLACKLIST_FOR_OPTIMIZER, autocreate?: boolean, libTableDict?: { new (...argv: any[]): R; }): R;
    getDictDatabase<R extends AbstractTableDictCore<any>>(type: string | EnumDictDatabase, autocreate?: boolean, libTableDict?: { new (...argv: any[]): R; }): R;
    /**
     * Load a segmentation module.
     *
     * @param mod module name (or array of names) or module object
     * @param argv additional arguments; their use is not visible in this
     *        declaration
     * @returns this instance, allowing calls to be chained
     */
    use(mod: ISubOptimizer, ...argv: any[]): this;
    use(mod: ISubTokenizer, ...argv: any[]): this;
    use(mod: any, ...argv: any[]): this;
    /**
     * Get a dictionary table.
     *
     * The overloads narrow the return type by dictionary type; the literal
     * `'TABLE2'` yields an `IDICT2<IWord>`, and any other value falls back to
     * a plain `IDICT`.
     *
     * @param type dictionary type
     * @returns the dictionary table
     */
    getDict(type: ITSTypeAndStringLiteral<EnumDictDatabase.STOPWORD>): IDICT_STOPWORD;
    getDict(type: ITSTypeAndStringLiteral<EnumDictDatabase.SYNONYM>): IDICT_SYNONYM;
    getDict(type: ITSTypeAndStringLiteral<EnumDictDatabase.TABLE>): IDICT<IWord>;
    getDict(type: ITSTypeAndStringLiteral<EnumDictDatabase.BLACKLIST>): IDICT_BLACKLIST;
    getDict(type: ITSTypeAndStringLiteral<EnumDictDatabase.BLACKLIST_FOR_OPTIMIZER>): IDICT_BLACKLIST;
    getDict(type: 'TABLE2'): IDICT2<IWord>;
    getDict(type: ITSTypeAndStringLiteral<EnumDictDatabase>): IDICT;
    getDict(type: unknown): IDICT;
    /**
     * Takes optional `doSegment` options and returns a value of the same type.
     *
     * NOTE(review): presumably fills in defaults — confirm against the
     * implementation.
     */
    getOptionsDoSegment<T extends IOptionsDoSegment>(options?: T): T;
    /**
     * Accepts a string or a `Buffer` and returns a string.
     *
     * NOTE(review): how a `Buffer` is decoded is not visible here — confirm.
     */
    protected _get_text(text: string | Buffer): string;
    /**
     * NOTE(review): presumably adds `word` to the blacklist, or removes it
     * when `remove` is true — confirm against the implementation.
     *
     * @returns this instance, allowing calls to be chained
     */
    addBlacklist(word: string, remove?: boolean): this;
    /**
     * remove key in TABLE by BLACKLIST
     */
    doBlacklist(): this;
    /**
     * Returns the tokenizer and optimizer sub-modules split into an `enable`
     * group and a `disable` group.
     *
     * NOTE(review): presumably the split is decided by `options` — confirm.
     */
    listModules(options?: IOptionsDoSegment): {
        enable: {
            tokenizer: ISubTokenizer[];
            optimizer: ISubOptimizer[];
        };
        disable: {
            tokenizer: ISubTokenizer[];
            optimizer: ISubOptimizer[];
        };
    };
    /**
     * Start word segmentation.
     *
     * When `options.simple` is typed as `true` the result is `string[]`;
     * otherwise it is `IWord[]`.
     *
     * @param text the text
     * @param options options
     *   - simple: whether to return only the word content
     *   - stripPunctuation: remove punctuation
     *   - convertSynonym: convert synonyms
     *   - stripStopword: remove stopwords
     * @returns the array of segmented words
     */
    doSegment(text: string | Buffer, options: ITSOverwrite<IOptionsDoSegment, { simple: true; }>): string[];
    doSegment(text: string | Buffer, options?: IOptionsDoSegment): IWord[];
    /**
     * Convert synonyms.
     *
     * With `showcount` typed as `true`, returns an object carrying a `count`
     * and the word `list`.
     */
    convertSynonym(ret: IWordDebug[], showcount: true): {
        count: number;
        list: IWordDebug[];
    };
    /**
     * Convert synonyms.
     */
    convertSynonym(ret: IWordDebug[], showcount?: boolean): IWordDebug[];
    /**
     * Join an array of words into a string.
     *
     * @param words array of words
     * @returns the joined string
     */
    stringify(words: Array<IWord | string>, ...argv: any[]): string;
    /**
     * Join an array of words into a string.
     *
     * @param words array of words
     * @returns the joined string
     */
    static stringify(words: Array<IWord | string>, ...argv: any[]): string;
    /**
     * Split an array of words by a given word or part of speech.
     *
     * @param words array of words
     * @param s the word or part of speech to split on
     * @returns the resulting array
     */
    split(words: IWord[], s: string | number, ...argv: any[]): IWord[];
    /**
     * Find the position of a given word or part of speech in an array of words.
     *
     * @param words array of words
     * @param s the word or part of speech to look for
     * @param cur start position
     * @returns the position, or -1 when not found
     */
    indexOf(words: IWord[], s: string | number, cur?: number, ...argv: any[]): number;
}
export { IDICT, IDICT2, IDICT_BLACKLIST, IDICT_STOPWORD, IDICT_SYNONYM, IOptionsDoSegment, IOptionsSegment, ISPLIT, ISPLIT_FILTER, IWord };
export default SegmentCore;