@orama/tokenizers
Version:
Additional tokenizers for Orama
66 lines • 2.06 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.createTokenizer = createTokenizer;
const internals_1 = require("@orama/orama/internals");
// This tokenizer is fixed to Mandarin; the language tag is stored on the
// returned tokenizer so Orama can report/validate it.
const tokenizerLanguage = "mandarin";
const defaultConfig = {
    language: tokenizerLanguage,
};
// Word-granularity ICU segmenter for Simplified Chinese (zh-CN); shared by
// every tokenizer instance created from this module.
const segmenter = new Intl.Segmenter("zh-CN", { granularity: "word" });
/* c8 ignore next 10 */
/**
 * Removes empty-string tokens from both ends of the array, mutating it
 * in place and returning the same array.
 *
 * @param {string[]} tokens - Token list to trim (mutated).
 * @returns {string[]} The trimmed array.
 */
function trim(tokens) {
    while (tokens.length > 0 && tokens[tokens.length - 1] === "") {
        tokens.pop();
    }
    while (tokens.length > 0 && tokens[0] === "") {
        tokens.shift();
    }
    return tokens;
}
/**
 * Segments a string with the module-level ICU word segmenter and keeps only
 * word-like segments (punctuation and whitespace are dropped).
 *
 * @param {string} text - Input text to segment.
 * @returns {string[]} Word-like segments in order of appearance.
 */
function tokenize(text) {
    return Array.from(segmenter.segment(text))
        .filter((part) => part.isWordLike)
        .map((part) => part.segment);
}
/**
 * Tokenize entry point installed on the tokenizer config (called with
 * `this` bound to that config).
 *
 * Non-string input is passed through untouched as a single-element array.
 * Properties listed in `this.tokenizeSkipProperties` skip segmentation and
 * are normalized as one token; everything else goes through the ICU
 * word segmenter. Duplicate tokens are removed unless
 * `this.allowDuplicates` is set.
 *
 * @param {*} input - Value to tokenize; only strings are segmented.
 * @param {string} [language] - Unused here; part of Orama's tokenize contract.
 * @param {string} [prop] - Schema property the value came from, if any.
 * @returns {*[]} The resulting token list.
 */
function tokenizeInternal(input, language, prop) {
    /* c8 ignore next 3 */
    if (typeof input !== "string") {
        return [input];
    }
    const skipSegmentation = prop ? this?.tokenizeSkipProperties?.has(prop) : false;
    const tokens = skipSegmentation
        ? [this?.normalizeToken?.bind(this, prop ?? "")(input)]
        : tokenize(input);
    const trimmed = trim(tokens);
    return this.allowDuplicates ? trimmed : Array.from(new Set(trimmed));
}
/**
 * Builds a Mandarin tokenizer configuration for Orama.
 *
 * @param {object} [config=defaultConfig] - Tokenizer options.
 * @param {string} [config.language] - Language tag stored on the tokenizer.
 * @param {string|string[]} [config.stemmerSkipProperties] - Property name(s) exempt from stemming.
 * @param {string|string[]} [config.tokenizeSkipProperties] - Property name(s) that skip segmentation.
 * @param {string[]} [config.stopWords] - Stop words (consumed downstream by Orama).
 * @param {boolean} [config.allowDuplicates] - Keep duplicate tokens when true.
 * @returns {object} A tokenizer object with a bound `tokenize` function.
 */
function createTokenizer(config = defaultConfig) {
    const tokenizerConfig = {
        tokenize: tokenizeInternal,
        language: config.language,
        // Accept either a single property name or an array of names.
        stemmerSkipProperties: new Set(config.stemmerSkipProperties ? [config.stemmerSkipProperties].flat() : []),
        tokenizeSkipProperties: new Set(config.tokenizeSkipProperties
            ? [config.tokenizeSkipProperties].flat()
            : []),
        stopWords: config.stopWords,
        allowDuplicates: Boolean(config.allowDuplicates),
        normalizeToken: internals_1.normalizeToken,
        normalizationCache: new Map(),
    };
    // BUG FIX: bind `this` to the config object — the original bound the
    // function to itself (`tokenizeInternal.bind(tokenizeInternal)`), so
    // `this.tokenizeSkipProperties`, `this.normalizeToken`, and
    // `this.allowDuplicates` inside tokenizeInternal were always undefined:
    // skip-properties were never honored and duplicates were always removed.
    // @ts-ignore
    tokenizerConfig.tokenize = tokenizeInternal.bind(tokenizerConfig);
    return tokenizerConfig;
}
//# sourceMappingURL=mandarin.js.map