@botonic/plugin-contentful

Botonic Plugin Contentful is one of the **[available](https://github.com/hubtype/botonic/tree/master/packages)** plugins for Botonic. **[Contentful](http://www.contentful.com)** is a CMS (Content Management System) that manages content in a great variety of formats.

```js
import { DynamicSingletonMap } from '../util';
import { equalArrays } from '../util/arrays';
import { stemmerFor } from './stemmer';
import {
  DEFAULT_SEPARATORS,
  DEFAULT_SEPARATORS_REGEX,
  stopWordsFor,
  tokenizerPerLocale,
} from './tokens';
import { replaceAll } from './util/strings';

/**
 * Both tokens and stem will be converted to the <code>stem</code>.
 * Tokens will be searched case-insensitively.
 */
export class StemmingBlackList {
  constructor(stem, tokens) {
    this.stem = stem.toLowerCase();
    this.tokens = tokens.map(t => t.toLowerCase());
  }
  normalize(normalizer) {
    return new StemmingBlackList(
      normalizer(this.stem),
      this.tokens.map(normalizer)
    );
  }
  isBlackListed(token) {
    return token === this.stem || this.tokens.includes(token);
  }
}

export class Word {
  /**
   * @param token lowercase, with i18n characters converted to ascii and after executing Preprocessor
   * @param stem lowercase, stemmed. Same as token for stopwords
   */
  constructor(token, stem, isStopWord = false) {
    this.token = token;
    this.stem = stem;
    this.isStopWord = isStopWord;
  }
  static joinedTokens(words, withStopwords) {
    if (!withStopwords) {
      words = words.filter(w => !w.isStopWord);
    }
    return words.map(w => w.token).join(' ');
  }
  static StopWord(token) {
    return new Word(token, token, true);
  }
}

export class EmptyTextException extends Error {
  constructor(txt) {
    super(`'${txt}' not accepted because it only contains separators`);
  }
}

export class NormalizedUtterance {
  /**
   * @param onlyStopWords true iff all tokens are stop words
   */
  constructor(/** raw is actually lowercased and trimmed */ raw, words, onlyStopWords = false) {
    this.raw = raw;
    this.words = words;
    this.onlyStopWords = onlyStopWords;
    this.stems = words.filter(w => !w.isStopWord).map(w => w.stem);
  }
  hasOnlyStopWords() {
    return this.onlyStopWords;
  }
  hasSameStems(other) {
    return equalArrays(this.stems, other.stems);
  }
  joinedTokens(withStopWords) {
    return Word.joinedTokens(this.words, withStopWords);
  }
}

export class Preprocessor {}

export class NopPreprocessor {
  preprocess(txt) {
    return txt;
  }
}

/**
 * Removes dots within acronyms, even if the last dot is missing
 * or is immediately followed by a different separator.
 */
export class AcronymPreprocessor {
  constructor(separators) {
    this.SEPS_NO_DOT = separators.replace(AcronymPreprocessor.DOT, '');
  }
  preprocess(txt) {
    if (!txt.includes(AcronymPreprocessor.DOT)) return txt;
    const wordsAndSeparators = this.splitWordsAndSeparators(txt);
    txt = '';
    for (const wOrSep of wordsAndSeparators) {
      // splitWordsAndSeparators pushes separators as single characters,
      // so a fragment is a separator iff it appears in SEPS_NO_DOT
      const isSeparator = this.SEPS_NO_DOT.includes(wOrSep);
      if (!isSeparator) {
        txt = txt + this.preprocessWord(wOrSep);
      } else {
        txt = txt + wOrSep;
      }
    }
    return txt;
  }
  splitWordsAndSeparators(txt) {
    let word = '';
    const ret = [];
    const pushWord = () => {
      if (word) {
        ret.push(word);
        word = '';
      }
    };
    for (const l of txt) {
      if (this.SEPS_NO_DOT.includes(l)) {
        pushWord();
        ret.push(l);
      } else {
        word += l;
      }
    }
    pushWord();
    return ret;
  }
  preprocessWord(w) {
    if (w.length <= 2) {
      return w;
    }
    // strip the dots only if letters and dots strictly alternate (eg "a.b.c" or "a.b.c.")
    let mustBeDot = false;
    for (const l of w) {
      const isDot = l === AcronymPreprocessor.DOT;
      if (isDot !== mustBeDot) {
        return w;
      }
      mustBeDot = !mustBeDot;
    }
    return replaceAll(w, AcronymPreprocessor.DOT, '');
  }
}
AcronymPreprocessor.DOT = '.';
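// Illustrative only (not part of the original file): how AcronymPreprocessor
// behaves, assuming a separators string such as '. ,;'. Dots are stripped only
// from words where letters and dots strictly alternate, so acronyms collapse
// while ordinary sentence punctuation survives:
//
//   new AcronymPreprocessor('. ,;').preprocess('i.b.m. and s.l.')  // => 'ibm and sl'
//   new AcronymPreprocessor('. ,;').preprocess('wait. next')       // => 'wait. next'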
export class Normalizer {
  /**
   * @param preprocessor Applied before tokenizing.
   * Applied also to separators and stem words.
   */
  constructor(
    stemmingBlackListPerLocale = {},
    stopWordsForLocale = stopWordsFor,
    tokenizer = tokenizerPerLocale,
    separatorsRegex = DEFAULT_SEPARATORS_REGEX,
    preprocessor = new AcronymPreprocessor(DEFAULT_SEPARATORS)
  ) {
    this.tokenizer = tokenizer;
    this.separatorsRegex = separatorsRegex;
    this.preprocessor = preprocessor;
    this.stopWordsPerLocale = new DynamicSingletonMap(locale =>
      stopWordsForLocale(locale).map(w => this.normalizeWord(locale, w))
    );
    this.stemmingBlackListPerLocale = new DynamicSingletonMap(l =>
      (stemmingBlackListPerLocale[l] || []).map(bl =>
        bl.normalize(w => this.normalizeWord(l, w))
      )
    );
  }
  /**
   * @throws EmptyTextException if the text is empty or only contains separators
   */
  normalize(locale, raw) {
    raw = raw.trim().toLowerCase();
    // TODO use preprocess without normalization? move to NormalizedUtterance constructor?
    let txt = this.preprocessor.preprocess(raw);
    txt = txt.replace(this.separatorsRegex, ' ');
    if (!txt.trim()) {
      throw new EmptyTextException(raw);
    }
    const stemmer = stemmerFor(locale);
    // tokenizer will replace i18n characters
    const tokens = this.tokenizer(locale).tokenize(txt, true);
    let words = [];
    const stopWords = this.stopWordsPerLocale.value(locale);
    let numStopWords = 0;
    for (const token of tokens) {
      const blacklistedStem = this.getBlackListStem(locale, token);
      if (blacklistedStem) {
        words.push(new Word(token, blacklistedStem));
        continue;
      }
      if (stopWords.includes(token)) {
        words.push(Word.StopWord(token));
        numStopWords++;
        continue;
      }
      // a token could generate 2 stems (eg can't => can not)
      const tokenStems = stemmer.stem([token]);
      words = words.concat(tokenStems.map(stem => new Word(token, stem)));
    }
    return new NormalizedUtterance(raw, words, numStopWords === tokens.length);
  }
  normalizeWord(locale, word) {
    word = this.preprocessor.preprocess(word);
    return this.tokenizer(locale).tokenize(word.toLowerCase(), true).join(' ');
  }
  getBlackListStem(locale, word) {
    const blacks = this.stemmingBlackListPerLocale.value(locale);
    for (const black of blacks) {
      if (black.isBlackListed(word)) {
        return black.stem;
      }
    }
    return undefined;
  }
}
//# sourceMappingURL=normalizer.js.map
```
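
A rough sketch of how these classes fit together; this is not from the Botonic docs. It assumes the module can be imported as `./normalizer`, that the default tokenizer, stemmer, and stop-word data cover the `en` locale, and that `DEFAULT_SEPARATORS` includes `.` and `!`; the blacklist contents are invented for illustration:

```js
import { Normalizer, StemmingBlackList } from './normalizer';

// Hypothetical blacklist: pin "cancellation" and "unsubscribe" (and the stem
// word "cancel" itself) to the single stem "cancel", bypassing the stemmer.
const normalizer = new Normalizer({
  en: [new StemmingBlackList('cancel', ['cancellation', 'unsubscribe'])],
});

const utterance = normalizer.normalize('en', 'I want to CANCEL my subscription.');
console.log(utterance.stems);              // stems of the non-stop-word tokens
console.log(utterance.joinedTokens(true)); // all tokens, stop words included
console.log(utterance.hasOnlyStopWords()); // false: "cancel" and "subscription" are content words

// normalize() throws for input that is empty after separator removal:
try {
  normalizer.normalize('en', '...!!');
} catch (e) {
  // EmptyTextException
}
```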