/*
 * @orama/tokenizers
 * Version:
 * Additional tokenizers for Orama
 * 63 lines • 1.92 kB
 * JavaScript
 */
import { normalizeToken } from "@orama/orama/internals";
// Shared word-granularity segmenter for the "zh-CN" locale; built once at
// module load and reused by every tokenize() call.
const segmenter = new Intl.Segmenter("zh-CN", { granularity: "word" });
// Language identifier carried by the default configuration.
const tokenizerLanguage = "mandarin";
// Configuration createTokenizer() falls back to when called without arguments.
const defaultConfig = { language: tokenizerLanguage };
/**
 * Removes empty-string entries from both ends of a token list, in place.
 * Empty strings in the middle of the list are kept.
 *
 * @param {string[]} text - Token list to trim; it is mutated.
 * @returns {string[]} The same array instance that was passed in.
 */
/* c8 ignore next 17 */
function trim(text) {
    // Drop the trailing run of "" entries with a single splice.
    let end = text.length;
    while (end > 0 && text[end - 1] === "") {
        end -= 1;
    }
    if (end < text.length) {
        text.splice(end);
    }
    // Then drop the leading run of "" entries from what is left.
    let start = 0;
    while (start < text.length && text[start] === "") {
        start += 1;
    }
    if (start > 0) {
        text.splice(0, start);
    }
    return text;
}
/**
 * Splits text into word tokens using the module-level `segmenter`.
 * Segments that the segmenter does not flag as word-like (`isWordLike`)
 * are discarded.
 *
 * @param {string} text - Text to segment.
 * @returns {string[]} Word-like segments in their original order.
 */
function tokenize(text) {
    return Array.from(segmenter.segment(text))
        .filter((part) => part.isWordLike)
        .map((part) => part.segment);
}
/**
 * Tokenizer entry point; meant to be invoked with `this` set to the tokenizer
 * configuration object, but tolerates being called unbound.
 *
 * - Non-string input is returned untouched, wrapped in a one-element array.
 * - When `prop` is listed in `this.tokenizeSkipProperties`, the whole input is
 *   kept as a single token and passed through `this.normalizeToken(prop, input)`;
 *   if no normalizer is available the raw input is used.
 * - Otherwise the input is segmented with `tokenize()`.
 *
 * Empty strings at either end of the token list are removed, and duplicates
 * are dropped unless `this.allowDuplicates` is truthy.
 *
 * @param {*} input - Value to tokenize.
 * @param {string} [language] - Accepted for interface compatibility; not used here.
 * @param {string} [prop] - Name of the property the input belongs to.
 * @returns {Array} The resulting tokens.
 */
function tokenizeInternal(input, language, prop) {
    /* c8 ignore next 3 */
    if (typeof input !== "string") {
        return [input];
    }
    let tokens;
    if (prop && this?.tokenizeSkipProperties?.has(prop)) {
        // `this` is known to be an object here, since the lookup above succeeded.
        const normalize = this.normalizeToken;
        // Fall back to the raw input rather than emitting an `undefined` token
        // when the context carries no normalizer.
        tokens = [typeof normalize === "function" ? normalize.call(this, prop, input) : input];
    }
    else {
        tokens = tokenize(input);
    }
    const trimTokens = trim(tokens);
    // Guard `this` like the checks above do, so an unbound call deduplicates
    // instead of throwing a TypeError.
    if (!this?.allowDuplicates) {
        return Array.from(new Set(trimTokens));
    }
    return trimTokens;
}
/**
 * Builds a tokenizer object whose `tokenize` method is `tokenizeInternal`
 * bound to that same object, so the tokenizer can read its own settings
 * (`allowDuplicates`, `tokenizeSkipProperties`, `normalizeToken`, ...).
 *
 * @param {object} [config=defaultConfig] - Tokenizer options.
 * @param {string} [config.language] - Language identifier copied onto the result.
 * @param {string|string[]} [config.stemmerSkipProperties] - Property name(s); stored as a Set.
 * @param {string|string[]} [config.tokenizeSkipProperties] - Property name(s) whose
 *   values are kept as a single token instead of being segmented; stored as a Set.
 * @param {*} [config.stopWords] - Copied onto the result as-is.
 * @param {boolean} [config.allowDuplicates] - Coerced to a boolean; when true,
 *   repeated tokens are kept.
 * @returns {object} The tokenizer configuration object.
 */
export function createTokenizer(config = defaultConfig) {
    // Accept either a single property name or an array of names.
    const toSet = (value) => new Set(value ? [value].flat() : []);
    const tokenizerConfig = {
        tokenize: tokenizeInternal,
        language: config.language,
        stemmerSkipProperties: toSet(config.stemmerSkipProperties),
        tokenizeSkipProperties: toSet(config.tokenizeSkipProperties),
        stopWords: config.stopWords,
        allowDuplicates: Boolean(config.allowDuplicates),
        normalizeToken,
        normalizationCache: new Map(),
    };
    // Bind to the config object (not to the function itself) so that `this`
    // inside tokenizeInternal exposes the settings assembled above.
    // @ts-ignore
    tokenizerConfig.tokenize = tokenizeInternal.bind(tokenizerConfig);
    return tokenizerConfig;
}
//# sourceMappingURL=mandarin.js.map