UNPKG

@botonic/plugin-contentful

Version:

Botonic Plugin Contentful is one of the **[available](https://github.com/hubtype/botonic/tree/master/packages)** plugins for Botonic. **[Contentful](http://www.contentful.com)** is a CMS (Content Management System) which manages contents of a great variety…

215 lines 9.21 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.stopWordsFor = exports.DEFAULT_STOP_WORDS = exports.DEFAULT_NOT_SEPARATORS_REGEX = exports.DEFAULT_SEPARATORS_REGEX = exports.DEFAULT_SEPARATORS = exports.tokenizerPerLocale = exports.TokenizerCa = exports.countOccurrences = void 0; const tslib_1 = require("tslib"); const util_1 = require("../util"); const locales_1 = require("./locales"); const locales = tslib_1.__importStar(require("./locales")); const stopwords_bg_1 = require("./stopwords/stopwords-bg"); const stopwords_ca_1 = require("./stopwords/stopwords-ca"); const stopwords_cs_1 = require("./stopwords/stopwords-cs"); const stopwords_de_1 = require("./stopwords/stopwords-de"); const stopwords_el_1 = require("./stopwords/stopwords-el"); const stopwords_en_1 = require("./stopwords/stopwords-en"); const stopwords_es_1 = require("./stopwords/stopwords-es"); const stopwords_fr_1 = require("./stopwords/stopwords-fr"); const stopwords_hr_1 = require("./stopwords/stopwords-hr"); const stopwords_hu_1 = require("./stopwords/stopwords-hu"); const stopwords_it_1 = require("./stopwords/stopwords-it"); const stopwords_nl_1 = require("./stopwords/stopwords-nl"); const stopwords_pl_1 = require("./stopwords/stopwords-pl"); const stopwords_pt_1 = require("./stopwords/stopwords-pt"); const stopwords_ro_1 = require("./stopwords/stopwords-ro"); const stopwords_ru_1 = require("./stopwords/stopwords-ru"); const stopwords_sk_1 = require("./stopwords/stopwords-sk"); const stopwords_sl_1 = require("./stopwords/stopwords-sl"); const stopwords_tr_1 = require("./stopwords/stopwords-tr"); const stopwords_uk_1 = require("./stopwords/stopwords-uk"); function countOccurrences(haystack, needle) { let n = 0; let pos = 0; // eslint-disable-next-line no-constant-condition while (true) { pos = haystack.indexOf(needle, pos); if (pos >= 0) { ++n; pos += needle.length; } else break; } return n; } exports.countOccurrences = countOccurrences; /** * Not using 
TokenizerCa from node-nlp because it does not stem correctly some * "pronoms febles" (eg. adonar-se'n) * It maintains ç & Ç, but maybe we should only do it when normalize=true? */ class TokenizerCa { static splitRegex() { const aLetter = 'a-zA-Zá-úÁ-ÚñÑüÜ'; const pronomFebleEnding = `[-'](?=[${aLetter}])`; const separator = `\\s,.!?;:([\\]'"¡¿)`; const slashNotNumber = `/(?=[^0-9])`; return new RegExp(`${pronomFebleEnding}|[${separator}]+|${slashNotNumber}+`); } static restoreAfterTokenizer(text) { return text.replace(TokenizerCa.RESTORE_CEDIL, 'ç'); } tokenize(text, normalize = true) { let normalized = text; if (normalize) { normalized = text.normalize('NFD'); normalized = TokenizerCa.restoreAfterTokenizer(normalized); normalized = normalized.replace(/[\u0300-\u036f]/g, ''); } return this.trim(normalized.split(TokenizerCa.SPLIT_REGEX)); } trim(arr) { while (arr[arr.length - 1] === '') { arr.pop(); } while (arr[0] === '') { arr.shift(); } return arr; } } exports.TokenizerCa = TokenizerCa; TokenizerCa.RESTORE_CEDIL = new RegExp('c' + String.fromCharCode(807), 'gi'); TokenizerCa.SPLIT_REGEX = TokenizerCa.splitRegex(); const lazyTokenizers = new util_1.SingletonMap({ [locales.SPANISH]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerEs = require('@nlpjs/lang-es/src/tokenizer-es'); return new TokenizerEs(); }, [locales.ENGLISH]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerEn = require('@nlpjs/lang-en-min/src/tokenizer-en'); return new TokenizerEn(); }, [locales.CATALAN]: () => { return new TokenizerCa(); }, [locales.POLISH]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerPl = require('@nlpjs/lang-pl/src/tokenizer-pl'); return new TokenizerPl(); }, [locales.PORTUGUESE]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerPt = require('@nlpjs/lang-pt/src/tokenizer-pt'); return new TokenizerPt(); }, [locales.RUSSIAN]: () 
=> { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerRu = require('@nlpjs/lang-ru/src/tokenizer-ru'); return new TokenizerRu(); }, [locales.TURKISH]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerTr = require('@nlpjs/lang-tr/src/tokenizer-tr'); return new TokenizerTr(); }, [locales.ITALIAN]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerIt = require('@nlpjs/lang-it/src/tokenizer-it'); return new TokenizerIt(); }, [locales.FRENCH]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerFr = require('@nlpjs/lang-fr/src/tokenizer-fr'); return new TokenizerFr(); }, [locales.GERMAN]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerDe = require('@nlpjs/lang-de/src/tokenizer-de'); return new TokenizerDe(); }, [locales.ROMANIAN]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerRo = require('@nlpjs/lang-ro/src/tokenizer-ro'); return new TokenizerRo(); }, [locales.GREEK]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerEl = require('@nlpjs/lang-el/src/tokenizer-el'); return new TokenizerEl(); }, [locales.CZECH]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerCs = require('@nlpjs/lang-cs/src/tokenizer-cs'); return new TokenizerCs(); }, [locales.UKRAINIAN]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerUk = require('@nlpjs/lang-uk/src/tokenizer-uk'); return new TokenizerUk(); }, [locales.CROATIAN]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires,node/no-missing-require const { TokenizerHr } = require('./tokenizers/tokenizer-hr'); return new TokenizerHr(); }, [locales.SLOVAK]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires,node/no-missing-require const { TokenizerSk } = 
require('./tokenizers/tokenizer-sk'); return new TokenizerSk(); }, [locales.SLOVENIAN]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerSl = require('@nlpjs/lang-sl/src/tokenizer-sl'); return new TokenizerSl(); }, [locales.HUNGARIAN]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerHu = require('@nlpjs/lang-hu/src/tokenizer-hu'); return new TokenizerHu(); }, [locales.DUTCH]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires const TokenizerNl = require('@nlpjs/lang-nl/src/tokenizer-nl'); return new TokenizerNl(); }, [locales.BULGARIAN]: () => { // eslint-disable-next-line @typescript-eslint/no-var-requires,node/no-missing-require const { TokenizerBg } = require('./tokenizers/tokenizer-bg'); return new TokenizerBg(); }, }); function tokenizerPerLocale(locale) { return lazyTokenizers.value((0, locales_1.languageFromLocale)(locale)); } exports.tokenizerPerLocale = tokenizerPerLocale; exports.DEFAULT_SEPARATORS = ';,./()!?" 
'; exports.DEFAULT_SEPARATORS_REGEX = new RegExp('[' + exports.DEFAULT_SEPARATORS + ']', 'g'); exports.DEFAULT_NOT_SEPARATORS_REGEX = new RegExp('[^' + exports.DEFAULT_SEPARATORS + ']', 'g'); exports.DEFAULT_STOP_WORDS = { es: stopwords_es_1.esDefaultStopWords, ca: stopwords_ca_1.caDefaultStopWords, en: stopwords_en_1.enDefaultStopWords, pl: stopwords_pl_1.plDefaultStopWords, pt: stopwords_pt_1.ptDefaultStopWords, ru: stopwords_ru_1.ruDefaultStopWords, tr: stopwords_tr_1.trDefaultStopWords, it: stopwords_it_1.itDefaultStopWords, fr: stopwords_fr_1.frDefaultStopWords, de: stopwords_de_1.deDefaultStopWords, ro: stopwords_ro_1.roDefaultStopWords, el: stopwords_el_1.elDefaultStopWords, cs: stopwords_cs_1.csDefaultStopWords, uk: stopwords_uk_1.ukDefaultStopWords, hr: stopwords_hr_1.hrDefaultStopWords, sk: stopwords_sk_1.skDefaultStopWords, sl: stopwords_sl_1.slDefaultStopWords, hu: stopwords_hu_1.huDefaultStopWords, nl: stopwords_nl_1.nlDefaultStopWords, bg: stopwords_bg_1.bgDefaultStopWords, }; function stopWordsFor(locale) { return exports.DEFAULT_STOP_WORDS[(0, locales_1.languageFromLocale)(locale)]; } exports.stopWordsFor = stopWordsFor; //# sourceMappingURL=tokens.js.map