@botonic/plugin-contentful
Version:
Botonic Plugin Contentful is one of the **[available](https://github.com/hubtype/botonic/tree/master/packages)** plugins for Botonic. **[Contentful](http://www.contentful.com)** is a CMS (Content Management System) which manages contents of a great variety…
215 lines • 9.21 kB
JavaScript
;
// Flags the exports object with a non-writable `__esModule: true` property
// (presumably read by ES-module interop helpers such as tslib's — confirm).
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declares every public name as undefined (`void 0`); the real values are
// assigned further down, right after each definition.
exports.stopWordsFor = exports.DEFAULT_STOP_WORDS = exports.DEFAULT_NOT_SEPARATORS_REGEX = exports.DEFAULT_SEPARATORS_REGEX = exports.DEFAULT_SEPARATORS = exports.tokenizerPerLocale = exports.TokenizerCa = exports.countOccurrences = void 0;
const tslib_1 = require("tslib");
const util_1 = require("../util");
const locales_1 = require("./locales");
const locales = tslib_1.__importStar(require("./locales"));
const stopwords_bg_1 = require("./stopwords/stopwords-bg");
const stopwords_ca_1 = require("./stopwords/stopwords-ca");
const stopwords_cs_1 = require("./stopwords/stopwords-cs");
const stopwords_de_1 = require("./stopwords/stopwords-de");
const stopwords_el_1 = require("./stopwords/stopwords-el");
const stopwords_en_1 = require("./stopwords/stopwords-en");
const stopwords_es_1 = require("./stopwords/stopwords-es");
const stopwords_fr_1 = require("./stopwords/stopwords-fr");
const stopwords_hr_1 = require("./stopwords/stopwords-hr");
const stopwords_hu_1 = require("./stopwords/stopwords-hu");
const stopwords_it_1 = require("./stopwords/stopwords-it");
const stopwords_nl_1 = require("./stopwords/stopwords-nl");
const stopwords_pl_1 = require("./stopwords/stopwords-pl");
const stopwords_pt_1 = require("./stopwords/stopwords-pt");
const stopwords_ro_1 = require("./stopwords/stopwords-ro");
const stopwords_ru_1 = require("./stopwords/stopwords-ru");
const stopwords_sk_1 = require("./stopwords/stopwords-sk");
const stopwords_sl_1 = require("./stopwords/stopwords-sl");
const stopwords_tr_1 = require("./stopwords/stopwords-tr");
const stopwords_uk_1 = require("./stopwords/stopwords-uk");
/**
 * Counts the non-overlapping occurrences of `needle` inside `haystack`,
 * scanning left to right.
 *
 * @param {string} haystack - text to search in
 * @param {string} needle - substring to look for
 * @returns {number} number of occurrences; 0 when `needle` is empty
 */
function countOccurrences(haystack, needle) {
    // An empty needle "matches" at every position without advancing the
    // cursor, which previously made the loop spin forever.
    if (needle.length === 0) {
        return 0;
    }
    let n = 0;
    let pos = haystack.indexOf(needle);
    while (pos >= 0) {
        ++n;
        // Resume right after the match so occurrences never overlap.
        pos = haystack.indexOf(needle, pos + needle.length);
    }
    return n;
}
exports.countOccurrences = countOccurrences;
/**
 * Tokenizer for Catalan.
 * The TokenizerCa shipped with node-nlp is not used because it does not
 * handle some "pronoms febles" correctly (eg. adonar-se'n).
 * When normalizing, the cedilla is kept while the other diacritics are removed.
 */
class TokenizerCa {
    /**
     * Builds the regular expression on which text is split. It has three
     * alternatives: a hyphen/apostrophe directly followed by a letter, a run
     * of whitespace/punctuation, and a slash followed by a non-digit character.
     */
    static splitRegex() {
        const letters = 'a-zA-Zá-úÁ-ÚñÑüÜ';
        const punctuation = `\\s,.!?;:([\\]'"¡¿)`;
        const weakPronounJoin = `[-'](?=[${letters}])`;
        const nonNumericSlash = `/(?=[^0-9])`;
        const alternatives = [
            weakPronounJoin,
            `[${punctuation}]+`,
            `${nonNumericSlash}+`,
        ];
        return new RegExp(alternatives.join('|'));
    }
    /**
     * Replaces every match of RESTORE_CEDIL with a (lowercase) 'ç'.
     */
    static restoreAfterTokenizer(text) {
        const cedillaPattern = TokenizerCa.RESTORE_CEDIL;
        return text.replace(cedillaPattern, 'ç');
    }
    /**
     * Splits `text` into tokens.
     * With `normalize` (the default) the text is first decomposed (NFD), the
     * cedilla is recomposed, and the remaining combining marks
     * (U+0300–U+036F) are dropped. Leading/trailing empty tokens are removed.
     */
    tokenize(text, normalize = true) {
        if (!normalize) {
            return this.trim(text.split(TokenizerCa.SPLIT_REGEX));
        }
        const decomposed = text.normalize('NFD');
        // Must run before stripping marks, otherwise the cedilla would be lost.
        const withCedilla = TokenizerCa.restoreAfterTokenizer(decomposed);
        const unaccented = withCedilla.replace(/[\u0300-\u036f]/g, '');
        return this.trim(unaccented.split(TokenizerCa.SPLIT_REGEX));
    }
    /**
     * Removes empty strings from both ends of `arr` (in place) and returns
     * the same array. Empty strings in the middle are left alone.
     */
    trim(arr) {
        const isEmptyToken = (token) => token === '';
        while (isEmptyToken(arr[arr.length - 1])) {
            arr.pop();
        }
        while (isEmptyToken(arr[0])) {
            arr.shift();
        }
        return arr;
    }
}
exports.TokenizerCa = TokenizerCa;
// Matches "c" or "C" followed by char code 807 (U+0327, combining cedilla),
// i.e. the form "ç" takes after NFD decomposition. Global and case-insensitive.
TokenizerCa.RESTORE_CEDIL = new RegExp('c' + String.fromCharCode(807), 'gi');
// Built once at module load; tokenize() splits on this same instance every time.
TokenizerCa.SPLIT_REGEX = TokenizerCa.splitRegex();
/**
 * Tokenizer factories keyed by language.
 * Each entry is a function, so the tokenizer module for a language is only
 * required when that factory is invoked (SingletonMap presumably invokes it
 * on first access and caches the instance — confirm in ../util).
 */
/* eslint-disable @typescript-eslint/no-var-requires, node/no-missing-require */
const lazyTokenizers = new util_1.SingletonMap({
    [locales.SPANISH]: () => new (require('@nlpjs/lang-es/src/tokenizer-es'))(),
    [locales.ENGLISH]: () => new (require('@nlpjs/lang-en-min/src/tokenizer-en'))(),
    // Catalan uses the tokenizer defined in this file.
    [locales.CATALAN]: () => new TokenizerCa(),
    [locales.POLISH]: () => new (require('@nlpjs/lang-pl/src/tokenizer-pl'))(),
    [locales.PORTUGUESE]: () => new (require('@nlpjs/lang-pt/src/tokenizer-pt'))(),
    [locales.RUSSIAN]: () => new (require('@nlpjs/lang-ru/src/tokenizer-ru'))(),
    [locales.TURKISH]: () => new (require('@nlpjs/lang-tr/src/tokenizer-tr'))(),
    [locales.ITALIAN]: () => new (require('@nlpjs/lang-it/src/tokenizer-it'))(),
    [locales.FRENCH]: () => new (require('@nlpjs/lang-fr/src/tokenizer-fr'))(),
    [locales.GERMAN]: () => new (require('@nlpjs/lang-de/src/tokenizer-de'))(),
    [locales.ROMANIAN]: () => new (require('@nlpjs/lang-ro/src/tokenizer-ro'))(),
    [locales.GREEK]: () => new (require('@nlpjs/lang-el/src/tokenizer-el'))(),
    [locales.CZECH]: () => new (require('@nlpjs/lang-cs/src/tokenizer-cs'))(),
    [locales.UKRAINIAN]: () => new (require('@nlpjs/lang-uk/src/tokenizer-uk'))(),
    // Local tokenizer modules expose the class as a named export.
    [locales.CROATIAN]: () => new (require('./tokenizers/tokenizer-hr').TokenizerHr)(),
    [locales.SLOVAK]: () => new (require('./tokenizers/tokenizer-sk').TokenizerSk)(),
    [locales.SLOVENIAN]: () => new (require('@nlpjs/lang-sl/src/tokenizer-sl'))(),
    [locales.HUNGARIAN]: () => new (require('@nlpjs/lang-hu/src/tokenizer-hu'))(),
    [locales.DUTCH]: () => new (require('@nlpjs/lang-nl/src/tokenizer-nl'))(),
    [locales.BULGARIAN]: () => new (require('./tokenizers/tokenizer-bg').TokenizerBg)(),
});
/* eslint-enable @typescript-eslint/no-var-requires, node/no-missing-require */
/**
 * Returns the tokenizer registered for the language of `locale`.
 * The locale is first reduced to its language via `languageFromLocale`.
 */
function tokenizerPerLocale(locale) {
    const language = (0, locales_1.languageFromLocale)(locale);
    return lazyTokenizers.value(language);
}
exports.tokenizerPerLocale = tokenizerPerLocale;
// Characters treated as separators by default (the last one is a space).
exports.DEFAULT_SEPARATORS = ';,./()!?" ';
// Matches any single default separator. Global flag: `lastIndex` is stateful
// when used with test()/exec().
exports.DEFAULT_SEPARATORS_REGEX = new RegExp(`[${exports.DEFAULT_SEPARATORS}]`, 'g');
// Matches any single character that is NOT a default separator (also global).
exports.DEFAULT_NOT_SEPARATORS_REGEX = new RegExp(`[^${exports.DEFAULT_SEPARATORS}]`, 'g');
// Default stop-word lists, one per supported language. Keys are the language
// codes that stopWordsFor() obtains from languageFromLocale(); each value is
// the list exported by the matching ./stopwords/stopwords-XX module.
exports.DEFAULT_STOP_WORDS = {
es: stopwords_es_1.esDefaultStopWords,
ca: stopwords_ca_1.caDefaultStopWords,
en: stopwords_en_1.enDefaultStopWords,
pl: stopwords_pl_1.plDefaultStopWords,
pt: stopwords_pt_1.ptDefaultStopWords,
ru: stopwords_ru_1.ruDefaultStopWords,
tr: stopwords_tr_1.trDefaultStopWords,
it: stopwords_it_1.itDefaultStopWords,
fr: stopwords_fr_1.frDefaultStopWords,
de: stopwords_de_1.deDefaultStopWords,
ro: stopwords_ro_1.roDefaultStopWords,
el: stopwords_el_1.elDefaultStopWords,
cs: stopwords_cs_1.csDefaultStopWords,
uk: stopwords_uk_1.ukDefaultStopWords,
hr: stopwords_hr_1.hrDefaultStopWords,
sk: stopwords_sk_1.skDefaultStopWords,
sl: stopwords_sl_1.slDefaultStopWords,
hu: stopwords_hu_1.huDefaultStopWords,
nl: stopwords_nl_1.nlDefaultStopWords,
bg: stopwords_bg_1.bgDefaultStopWords,
};
/**
 * Returns the default stop words for the language of `locale`, or
 * `undefined` when DEFAULT_STOP_WORDS has no entry for that language.
 */
function stopWordsFor(locale) {
    const language = (0, locales_1.languageFromLocale)(locale);
    const stopWords = exports.DEFAULT_STOP_WORDS[language];
    return stopWords;
}
exports.stopWordsFor = stopWordsFor;
//# sourceMappingURL=tokens.js.map