@botonic/plugin-contentful

Botonic Plugin Contentful is one of the **[available](https://github.com/hubtype/botonic/tree/master/packages)** plugins for Botonic. **[Contentful](http://www.contentful.com)** is a CMS (Content Management System) which manages contents of a great variety of types.

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.Normalizer = exports.AcronymPreprocessor = exports.NopPreprocessor = exports.Preprocessor = exports.NormalizedUtterance = exports.EmptyTextException = exports.Word = exports.StemmingBlackList = void 0; const util_1 = require("../util"); const arrays_1 = require("../util/arrays"); const stemmer_1 = require("./stemmer"); const tokens_1 = require("./tokens"); const strings_1 = require("./util/strings"); /** * Both tokens and stem will be converted to the <code>stem</code> * Tokens will be searched case-insensitively. */ class StemmingBlackList { constructor(stem, tokens) { this.stem = stem.toLowerCase(); this.tokens = tokens.map(t => t.toLowerCase()); } normalize(normalizer) { return new StemmingBlackList(normalizer(this.stem), this.tokens.map(normalizer)); } isBlackListed(token) { return token == this.stem || this.tokens.includes(token); } } exports.StemmingBlackList = StemmingBlackList; class Word { /** * @param token lowercase, with i18n characters converted to ascii and after executing Preprocessor * @param stem lowercase, stemmed. Same as token for stopwords */ constructor(token, stem, isStopWord = false) { this.token = token; this.stem = stem; this.isStopWord = isStopWord; } static joinedTokens(words, withStopwords) { if (!withStopwords) { words = words.filter(w => !w.isStopWord); } return words.map(w => w.token).join(' '); } static StopWord(token) { return new Word(token, token, true); } } exports.Word = Word; class EmptyTextException extends Error { constructor(txt) { super(`'${txt}' not accepted because it only contains separators`); } } exports.EmptyTextException = EmptyTextException; class NormalizedUtterance { /** * @param onlyStopWords: true iff all tokens are stop words */ constructor( /** raw is actually lowercased and trimmed*/ raw, words, onlyStopWords = false) { this.raw = raw; this.words = words; this.onlyStopWords = onlyStopWords; this.stems = words.filter(w => !w.isStopWord).map(w => w.stem); } hasOnlyStopWords() { return this.onlyStopWords; } hasSameStems(other) { return (0, arrays_1.equalArrays)(this.stems, other.stems); } joinedTokens(withStopWords) { return Word.joinedTokens(this.words, withStopWords); } } exports.NormalizedUtterance = NormalizedUtterance; class Preprocessor { } exports.Preprocessor = Preprocessor; class NopPreprocessor { preprocess(txt) { return txt; } } exports.NopPreprocessor = NopPreprocessor; /** * Removes dots within acronyms, even if missing last dot, * or immediately followed by a different separator */ class AcronymPreprocessor { constructor(separators) { this.SEPS_NO_DOT = separators.replace(AcronymPreprocessor.DOT, ''); } preprocess(txt) { if (!txt.includes(AcronymPreprocessor.DOT)) return txt; const wordsAndSeparators = this.splitWordsAndSeparators(txt); txt = ''; for (const wOrSep of wordsAndSeparators) { const isSeparator = wOrSep.includes(this.SEPS_NO_DOT); if (!isSeparator) { txt = txt + this.preprocessWord(wOrSep); } else { txt = txt + wOrSep; } } return txt; } splitWordsAndSeparators(txt) { let word = ''; const ret = []; const pushWord = () => { if (word) { ret.push(word); word = ''; } }; for (const l of txt) { if (this.SEPS_NO_DOT.includes(l)) { pushWord(); ret.push(l); } else { word += l; } } pushWord(); return ret; } preprocessWord(w) { if (w.length <= 2) { return w; } let mustBeDot = false; for (const l of w) { const isDot = l == AcronymPreprocessor.DOT; if (isDot !== mustBeDot) { return w; } mustBeDot = !mustBeDot; } return (0, 
strings_1.replaceAll)(w, AcronymPreprocessor.DOT, ''); } } exports.AcronymPreprocessor = AcronymPreprocessor; AcronymPreprocessor.DOT = '.'; class Normalizer { /** * preprocessor: Applied before tokenizing. Applied also to separators and stem words */ constructor(stemmingBlackListPerLocale = {}, stopWordsForLocale = tokens_1.stopWordsFor, tokenizer = tokens_1.tokenizerPerLocale, separatorsRegex = tokens_1.DEFAULT_SEPARATORS_REGEX, preprocessor = new AcronymPreprocessor(tokens_1.DEFAULT_SEPARATORS)) { this.tokenizer = tokenizer; this.separatorsRegex = separatorsRegex; this.preprocessor = preprocessor; this.stopWordsPerLocale = new util_1.DynamicSingletonMap(locale => stopWordsForLocale(locale).map(w => this.normalizeWord(locale, w))); this.stemmingBlackListPerLocale = new util_1.DynamicSingletonMap(l => (stemmingBlackListPerLocale[l] || []).map(bl => bl.normalize(w => this.normalizeWord(l, w)))); } /** * @throws EmptyTextException if the text is empty or only contains separators */ normalize(locale, raw) { raw = raw.trim().toLowerCase(); // TODO use preprocess without normalization? move to NormalizedUtterance constructor? let txt = this.preprocessor.preprocess(raw); txt = txt.replace(this.separatorsRegex, ' '); if (!txt.trim()) { throw new EmptyTextException(raw); } const stemmer = (0, stemmer_1.stemmerFor)(locale); // tokenizer will replace i18n characters const tokens = this.tokenizer(locale).tokenize(txt, true); let words = []; const stopWords = this.stopWordsPerLocale.value(locale); let numStopWords = 0; for (const token of tokens) { const blacklistedStem = this.getBlackListStem(locale, token); if (blacklistedStem) { words.push(new Word(token, blacklistedStem)); continue; } if (stopWords.includes(token)) { words.push(Word.StopWord(token)); numStopWords++; continue; } // a token could generate 2 stems (eg can't => can not) const tokenStems = stemmer.stem([token]); words = words.concat(tokenStems.map(stem => new Word(token, stem))); } return new NormalizedUtterance(raw, words, numStopWords == tokens.length); } normalizeWord(locale, word) { word = this.preprocessor.preprocess(word); return this.tokenizer(locale).tokenize(word.toLowerCase(), true).join(' '); } getBlackListStem(locale, word) { const blacks = this.stemmingBlackListPerLocale.value(locale); for (const black of blacks) { if (black.isBlackListed(word)) { return black.stem; } } return undefined; } } exports.Normalizer = Normalizer; //# sourceMappingURL=normalizer.js.map
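
For context, here is a minimal usage sketch of the `Normalizer` defined above. It is not part of the file: the top-level import path, the availability of `StemmingBlackList` from the package entry point, support for the `en` locale, and `,` being among the default separators are all assumptions to adjust against your install.

```js
// Hypothetical import path; Normalizer and StemmingBlackList are exported by
// this module, but whether they are re-exported at the package root may differ.
const { Normalizer, StemmingBlackList } = require('@botonic/plugin-contentful');

// Optional stemming blacklist: pin "password"/"passwords" to the stem
// "password" instead of whatever the locale stemmer would produce.
const blacklists = { en: [new StemmingBlackList('password', ['passwords'])] };
const normalizer = new Normalizer(blacklists);

// normalize() lowercases and trims the text, replaces separators, tokenizes,
// flags stopwords and stems the remaining tokens for the given locale.
const utterance = normalizer.normalize('en', 'I want to change my password');
console.log(utterance.stems);               // stems of the non-stopword tokens
console.log(utterance.joinedTokens(false)); // tokens with stopwords removed
console.log(utterance.hasOnlyStopWords());  // false for this sentence

// Text made only of separators raises EmptyTextException
// (assuming ',' is one of the default separators).
try {
  normalizer.normalize('en', ' , , ');
} catch (e) {
  console.log(e.message); // explains that the text only contains separators
}
```

The blacklist is the hook for domain vocabulary: any listed token is mapped straight to its fixed stem and bypasses the locale stemmer, so product names or acronyms keep a stable form across utterances.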