@orama/tokenizers
Version: 
Additional tokenizers for Orama
66 lines • 2.06 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.createTokenizer = createTokenizer;
const internals_1 = require("@orama/orama/internals");
// Language identifier placed in the default configuration below.
const tokenizerLanguage = "mandarin";
// Configuration used by createTokenizer when the caller passes no argument.
const defaultConfig = {
    language: tokenizerLanguage,
};
// Word-granularity segmenter for the "zh-CN" locale. It is constructed once
// at module load and shared by every tokenize() call.
const segmenter = new Intl.Segmenter("zh-CN", { granularity: "word" });
/**
 * Removes empty-string entries from both ends of a token array.
 * Empty strings between non-empty entries are kept. The array is modified
 * in place and the same instance is returned.
 *
 * @param {string[]} text - Token array to strip; mutated by this call.
 * @returns {string[]} The array that was passed in.
 */
/* c8 ignore next 10 */
function trim(text) {
    let trailing = 0;
    while (text[text.length - 1 - trailing] === "") trailing += 1;
    if (trailing > 0) text.splice(text.length - trailing, trailing);
    let leading = 0;
    while (text[leading] === "") leading += 1;
    if (leading > 0) text.splice(0, leading);
    return text;
}
/**
 * Splits text into tokens using the module-level `segmenter`.
 * Only segments the segmenter flags as word-like (`isWordLike`) are kept;
 * each kept token is the segment's text exactly as it appears in the input.
 *
 * @param {string} text - Text to segment.
 * @returns {string[]} Word-like segments in input order.
 */
function tokenize(text) {
    return Array.from(segmenter.segment(text))
        .filter((part) => part.isWordLike)
        .map((part) => part.segment);
}
/**
 * Tokenizer entry point. It reads its settings from `this`
 * (`tokenizeSkipProperties`, `normalizeToken`, `allowDuplicates`), so it is
 * meant to be called with the tokenizer configuration object as receiver.
 * Every access to `this` is guarded, so a call without a receiver falls back
 * to plain tokenization with duplicates removed instead of throwing.
 *
 * @param {*} input - Value to tokenize. Non-string values are returned
 *   unchanged, wrapped in a one-element array.
 * @param {string} [language] - Accepted for signature compatibility; not
 *   read by this function.
 * @param {string} [prop] - Name of the property being tokenized. When it is
 *   listed in `this.tokenizeSkipProperties`, the input is not segmented and
 *   is passed whole to `this.normalizeToken(prop, input)`.
 * @returns {Array} Tokens with leading/trailing empty strings removed, and
 *   de-duplicated unless `this.allowDuplicates` is truthy.
 */
function tokenizeInternal(input, language, prop) {
    /* c8 ignore next 3 */
    if (typeof input !== "string") {
        return [input];
    }
    const skipSegmentation = Boolean(prop) && Boolean(this?.tokenizeSkipProperties?.has(prop));
    // In the skip branch `this` is known to be defined and `prop` is truthy,
    // so the normalizer can be called directly with both.
    const tokens = skipSegmentation
        ? [this.normalizeToken?.call(this, prop, input)]
        : tokenize(input);
    const trimTokens = trim(tokens);
    // Guarded like the other reads of `this`: an unguarded access here would
    // throw when the function is invoked without a receiver.
    if (this?.allowDuplicates) {
        return trimTokens;
    }
    return Array.from(new Set(trimTokens));
}
/**
 * Builds a tokenizer object whose `tokenize` method is `tokenizeInternal`
 * bound to the returned object, so the options stored on that object are
 * visible to the tokenizer through `this`.
 *
 * @param {object} [config=defaultConfig] - Tokenizer options.
 * @param {string} [config.language] - Copied to the result as `language`.
 * @param {string|string[]} [config.stemmerSkipProperties] - One property
 *   name or a list of names; stored as a Set.
 * @param {string|string[]} [config.tokenizeSkipProperties] - One property
 *   name or a list of names; stored as a Set.
 * @param {*} [config.stopWords] - Copied to the result unchanged.
 * @param {boolean} [config.allowDuplicates] - Coerced to a boolean.
 * @returns {object} Tokenizer object with `tokenize`, `language`, both skip
 *   sets, `stopWords`, `allowDuplicates`, `normalizeToken` and a fresh
 *   `normalizationCache` Map.
 */
function createTokenizer(config = defaultConfig) {
    // Accepts a single name, a list of names, or nothing.
    const toPropertySet = (value) => new Set(value ? [value].flat() : []);
    const tokenizerConfig = {
        tokenize: tokenizeInternal,
        language: config.language,
        stemmerSkipProperties: toPropertySet(config.stemmerSkipProperties),
        tokenizeSkipProperties: toPropertySet(config.tokenizeSkipProperties),
        stopWords: config.stopWords,
        allowDuplicates: Boolean(config.allowDuplicates),
        normalizeToken: internals_1.normalizeToken,
        normalizationCache: new Map(),
    };
    // Bind to the configuration object, not to the function itself:
    // tokenizeInternal reads allowDuplicates, tokenizeSkipProperties and
    // normalizeToken from `this`, and those live on tokenizerConfig.
    // @ts-ignore
    tokenizerConfig.tokenize = tokenizeInternal.bind(tokenizerConfig);
    return tokenizerConfig;
}
//# sourceMappingURL=mandarin.js.map