UNPKG

@orama/tokenizers

Version: (not captured in this page snapshot)

Additional tokenizers for Orama — this file is the compiled Mandarin tokenizer (mandarin.js).

66 lines · 2.06 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.createTokenizer = createTokenizer; const internals_1 = require("@orama/orama/internals"); const tokenizerLanguage = "mandarin"; const defaultConfig = { language: tokenizerLanguage, }; const segmenter = new Intl.Segmenter("zh-CN", { granularity: "word" }); /* c8 ignore next 10 */ function trim(text) { while (text[text.length - 1] === "") { text.pop(); } while (text[0] === "") { text.shift(); } return text; } function tokenize(text) { const segments = segmenter.segment(text); const tokens = []; for (const segment of segments) { if (segment.isWordLike) { tokens.push(segment.segment); } } return tokens; } function tokenizeInternal(input, language, prop) { /* c8 ignore next 3 */ if (typeof input !== "string") { return [input]; } let tokens; if (prop && this?.tokenizeSkipProperties?.has(prop)) { // @ts-ignore tokens = [this?.normalizeToken?.bind(this, prop ?? "")(input)]; } else { tokens = tokenize(input); } const trimTokens = trim(tokens); if (!this.allowDuplicates) { return Array.from(new Set(trimTokens)); } return trimTokens; } function createTokenizer(config = defaultConfig) { const tokenizerConfig = { tokenize: tokenizeInternal, language: config.language, stemmerSkipProperties: new Set(config.stemmerSkipProperties ? [config.stemmerSkipProperties].flat() : []), tokenizeSkipProperties: new Set(config.tokenizeSkipProperties ? [config.tokenizeSkipProperties].flat() : []), stopWords: config.stopWords, allowDuplicates: Boolean(config.allowDuplicates), normalizeToken: internals_1.normalizeToken, normalizationCache: new Map(), }; // @ts-ignore tokenizerConfig.tokenize = tokenizeInternal.bind(tokenizeInternal); return tokenizerConfig; } //# sourceMappingURL=mandarin.js.map