@botonic/plugin-contentful
Botonic Plugin Contentful is one of the **[available](https://github.com/hubtype/botonic/tree/master/packages)** plugins for Botonic. **[Contentful](http://www.contentful.com)** is a CMS (Content Management System) which manages content of a great variety of types.
import { DynamicSingletonMap } from '../util'
import { equalArrays } from '../util/arrays'
import { Locale } from './locales'
import { stemmerFor } from './stemmer'
import {
DEFAULT_SEPARATORS,
DEFAULT_SEPARATORS_REGEX,
stopWordsFor,
tokenizerPerLocale,
} from './tokens'
import { replaceAll } from '../util/strings'
/**
 * Blacklists a set of tokens from regular stemming: any of the tokens, and the
 * stem itself, will be stemmed to <code>stem</code>.
 * Tokens are matched case-insensitively.
 */
export class StemmingBlackList {
readonly stem: string
readonly tokens: string[]
constructor(stem: string, tokens: string[]) {
this.stem = stem.toLowerCase()
this.tokens = tokens.map(t => t.toLowerCase())
}
normalize(normalizer: (str: string) => string): StemmingBlackList {
return new StemmingBlackList(
normalizer(this.stem),
this.tokens.map(normalizer)
)
}
isBlackListed(token: string): boolean {
return token == this.stem || this.tokens.includes(token)
}
}
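
// Usage sketch (illustrative, not part of this module): an entry that maps the
// listed tokens to a fixed stem instead of running them through the stemmer.
// Incoming tokens are expected to be lowercase, as Normalizer produces them:
//   const black = new StemmingBlackList('barca', ['barça', 'barcelona'])
//   black.isBlackListed('barca')     // true: matches the stem itself
//   black.isBlackListed('barcelona') // true: matches a blacklisted token
//   black.isBlackListed('madrid')    // false: left to the regular stemmer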
export class Word {
/**
* @param token lowercase, with i18n characters converted to ascii and after executing Preprocessor
* @param stem lowercase, stemmed. Same as token for stopwords
*/
constructor(
readonly token: string,
readonly stem: string,
readonly isStopWord = false
) {}
  static joinedTokens(words: Word[], withStopWords: boolean): string {
    if (!withStopWords) {
words = words.filter(w => !w.isStopWord)
}
return words.map(w => w.token).join(' ')
}
static StopWord(token: string): Word {
return new Word(token, token, true)
}
}
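
// Sketch (illustrative): stopwords are kept as Words to preserve the original
// token sequence, but can be dropped when joining:
//   const words = [Word.StopWord('the'), new Word('trains', 'train')]
//   Word.joinedTokens(words, true)  // 'the trains'
//   Word.joinedTokens(words, false) // 'trains'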
export class EmptyTextException extends Error {
constructor(txt: string) {
super(`'${txt}' not accepted because it only contains separators`)
}
}
export class NormalizedUtterance {
/** Without stopwords */
readonly stems: string[]
/**
   * @param onlyStopWords true iff all tokens are stop words
*/
constructor(
    /** raw is actually lowercased and trimmed */
readonly raw: string,
readonly words: Word[],
private readonly onlyStopWords = false
) {
this.stems = words.filter(w => !w.isStopWord).map(w => w.stem)
}
hasOnlyStopWords(): boolean {
return this.onlyStopWords
}
hasSameStems(other: NormalizedUtterance): boolean {
return equalArrays(this.stems, other.stems)
}
joinedTokens(withStopWords: boolean): string {
return Word.joinedTokens(this.words, withStopWords)
}
}
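
// Sketch (illustrative): stems exclude stopwords, so two utterances that differ
// only in stopwords compare as equal:
//   const a = new NormalizedUtterance('the train', [
//     Word.StopWord('the'),
//     new Word('train', 'train'),
//   ])
//   const b = new NormalizedUtterance('train', [new Word('train', 'train')])
//   a.hasSameStems(b) // true: both reduce to ['train']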
export abstract class Preprocessor {
abstract preprocess(txt: string): string
}
export class NopPreprocessor implements Preprocessor {
preprocess(txt: string): string {
return txt
}
}
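
// Sketch (illustrative): NopPreprocessor can be injected into the Normalizer
// defined below to disable acronym handling, since the preprocessor is the last
// constructor argument (defaulting to AcronymPreprocessor):
//   const plain = new Normalizer(
//     {},
//     stopWordsFor,
//     tokenizerPerLocale,
//     DEFAULT_SEPARATORS_REGEX,
//     new NopPreprocessor()
//   )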
/**
* Removes dots within acronyms, even if missing last dot,
* or immediately followed by a different separator
*/
export class AcronymPreprocessor implements Preprocessor {
private static readonly DOT = '.'
  private readonly SEPS_NO_DOT: string
constructor(separators: string) {
this.SEPS_NO_DOT = separators.replace(AcronymPreprocessor.DOT, '')
}
preprocess(txt: string): string {
if (!txt.includes(AcronymPreprocessor.DOT)) return txt
const wordsAndSeparators = this.splitWordsAndSeparators(txt)
txt = ''
for (const wOrSep of wordsAndSeparators) {
      const isSeparator = this.SEPS_NO_DOT.includes(wOrSep)
if (!isSeparator) {
txt = txt + this.preprocessWord(wOrSep)
} else {
txt = txt + wOrSep
}
}
return txt
}
private splitWordsAndSeparators(txt: string): string[] {
let word = ''
const ret: string[] = []
const pushWord = () => {
if (word) {
ret.push(word)
word = ''
}
}
for (const l of txt) {
if (this.SEPS_NO_DOT.includes(l)) {
pushWord()
ret.push(l)
} else {
word += l
}
}
pushWord()
return ret
}
private preprocessWord(w: string): string {
if (w.length <= 2) {
return w
}
let mustBeDot = false
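    // letters and dots must strictly alternate, starting with a letter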
for (const l of w) {
const isDot = l == AcronymPreprocessor.DOT
if (isDot !== mustBeDot) {
return w
}
mustBeDot = !mustBeDot
}
return replaceAll(w, AcronymPreprocessor.DOT, '')
}
}
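
// Sketch (illustrative): dots inside acronyms are removed, while ordinary
// sentence punctuation is left untouched:
//   const pre = new AcronymPreprocessor(DEFAULT_SEPARATORS)
//   pre.preprocess('works at i.b.m. now') // 'works at ibm now'
//   pre.preprocess('i.b.m')               // 'ibm' (missing last dot is fine)
//   pre.preprocess('end of sentence.')    // unchanged: not an acronym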
export class Normalizer {
private stopWordsPerLocale: DynamicSingletonMap<string[]>
private stemmingBlackListPerLocale: DynamicSingletonMap<StemmingBlackList[]>
  /**
   * @param preprocessor applied before tokenizing; also applied to separators
   * and stem words
   */
constructor(
stemmingBlackListPerLocale: {
[locale: string]: StemmingBlackList[]
} = {},
stopWordsForLocale = stopWordsFor,
private readonly tokenizer = tokenizerPerLocale,
private readonly separatorsRegex = DEFAULT_SEPARATORS_REGEX,
private readonly preprocessor = new AcronymPreprocessor(DEFAULT_SEPARATORS)
) {
this.stopWordsPerLocale = new DynamicSingletonMap<string[]>(locale =>
stopWordsForLocale(locale).map(w => this.normalizeWord(locale, w))
)
this.stemmingBlackListPerLocale = new DynamicSingletonMap<
StemmingBlackList[]
>(l =>
(stemmingBlackListPerLocale[l] || []).map(bl =>
bl.normalize(w => this.normalizeWord(l, w))
)
)
}
/**
* @throws EmptyTextException if the text is empty or only contains separators
*/
normalize(locale: Locale, raw: string): NormalizedUtterance {
raw = raw.trim().toLowerCase() // TODO use preprocess without normalization? move to NormalizedUtterance constructor?
let txt = this.preprocessor.preprocess(raw)
txt = txt.replace(this.separatorsRegex, ' ')
if (!txt.trim()) {
throw new EmptyTextException(raw)
}
const stemmer = stemmerFor(locale)
// tokenizer will replace i18n characters
const tokens = this.tokenizer(locale).tokenize(txt, true)
let words: Word[] = []
const stopWords = this.stopWordsPerLocale.value(locale)
let numStopWords = 0
for (const token of tokens) {
const blacklistedStem = this.getBlackListStem(locale, token)
if (blacklistedStem) {
words.push(new Word(token, blacklistedStem))
continue
}
if (stopWords.includes(token)) {
words.push(Word.StopWord(token))
numStopWords++
continue
}
      // a token could generate 2 stems (e.g. can't => can not)
const tokenStems = stemmer.stem([token])
words = words.concat(tokenStems.map(stem => new Word(token, stem)))
}
return new NormalizedUtterance(raw, words, numStopWords == tokens.length)
}
private normalizeWord(locale: Locale, word: string): string {
word = this.preprocessor.preprocess(word)
return this.tokenizer(locale).tokenize(word.toLowerCase(), true).join(' ')
}
private getBlackListStem(locale: Locale, word: string): string | undefined {
const blacks = this.stemmingBlackListPerLocale.value(locale)
for (const black of blacks) {
if (black.isBlackListed(word)) {
return black.stem
}
}
return undefined
}
}
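
// End-to-end sketch (illustrative): exact tokens and stems depend on the
// tokenizer, stemmer and stopword list registered for the locale:
//   const normalizer = new Normalizer()
//   const utt = normalizer.normalize('en', ' The I.B.M. trains ')
//   utt.raw   // 'the i.b.m. trains' (trimmed and lowercased)
//   utt.stems // stopwords dropped, acronym dots removed, e.g. ['ibm', 'train']
//   normalizer.normalize('en', ' ., ') // throws EmptyTextException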