@botonic/plugin-contentful
Botonic Plugin Contentful is one of the **[available](https://github.com/hubtype/botonic/tree/master/packages)** plugins for Botonic. **[Contentful](http://www.contentful.com)** is a CMS (Content Management System) that manages a great variety of content.
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Normalizer = exports.AcronymPreprocessor = exports.NopPreprocessor = exports.Preprocessor = exports.NormalizedUtterance = exports.EmptyTextException = exports.Word = exports.StemmingBlackList = void 0;
const util_1 = require("../util");
const arrays_1 = require("../util/arrays");
const stemmer_1 = require("./stemmer");
const tokens_1 = require("./tokens");
const strings_1 = require("./util/strings");
/**
 * Tokens equal to <code>stem</code> or to any entry in <code>tokens</code> will be
 * stemmed to <code>stem</code>, instead of going through the regular stemmer.
 * Tokens are matched case-insensitively.
 */
class StemmingBlackList {
constructor(stem, tokens) {
this.stem = stem.toLowerCase();
this.tokens = tokens.map(t => t.toLowerCase());
}
normalize(normalizer) {
return new StemmingBlackList(normalizer(this.stem), this.tokens.map(normalizer));
}
isBlackListed(token) {
return token == this.stem || this.tokens.includes(token);
}
}
exports.StemmingBlackList = StemmingBlackList;
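// Illustrative sketch (not part of the module): a blacklist entry that forces the
// listed tokens (and the stem itself) to resolve to the stem 'iphone'.
// The values below are hypothetical.
//   const bl = new StemmingBlackList('iphone', ['i-phone', 'iphones']);
//   bl.isBlackListed('iphones'); // => true
//   bl.stem;                     // => 'iphone'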
class Word {
/**
 * @param token lowercase, with i18n characters converted to ASCII and with the Preprocessor already applied
 * @param stem lowercase and stemmed. Same as token for stop words
 */
constructor(token, stem, isStopWord = false) {
this.token = token;
this.stem = stem;
this.isStopWord = isStopWord;
}
static joinedTokens(words, withStopwords) {
if (!withStopwords) {
words = words.filter(w => !w.isStopWord);
}
return words.map(w => w.token).join(' ');
}
static StopWord(token) {
return new Word(token, token, true);
}
}
exports.Word = Word;
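// Illustrative sketch (not part of the module): joining tokens with and without
// stop words. The Word instances below are hypothetical.
//   const words = [Word.StopWord('the'), new Word('trains', 'train')];
//   Word.joinedTokens(words, true);  // => 'the trains'
//   Word.joinedTokens(words, false); // => 'trains'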
class EmptyTextException extends Error {
constructor(txt) {
super(`'${txt}' not accepted because it only contains separators`);
}
}
exports.EmptyTextException = EmptyTextException;
class NormalizedUtterance {
/**
* @param onlyStopWords true iff all tokens are stop words
*/
constructor(
/** raw is actually lowercased and trimmed */
raw, words, onlyStopWords = false) {
this.raw = raw;
this.words = words;
this.onlyStopWords = onlyStopWords;
this.stems = words.filter(w => !w.isStopWord).map(w => w.stem);
}
hasOnlyStopWords() {
return this.onlyStopWords;
}
hasSameStems(other) {
return (0, arrays_1.equalArrays)(this.stems, other.stems);
}
joinedTokens(withStopWords) {
return Word.joinedTokens(this.words, withStopWords);
}
}
exports.NormalizedUtterance = NormalizedUtterance;
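// Illustrative sketch (not part of the module): utterances are compared by the stems
// of their non-stop-word tokens, so differing stop words do not matter.
// The Word instances below are hypothetical.
//   const u1 = new NormalizedUtterance('the trains', [Word.StopWord('the'), new Word('trains', 'train')]);
//   const u2 = new NormalizedUtterance('a train', [Word.StopWord('a'), new Word('train', 'train')]);
//   u1.hasSameStems(u2);    // => true
//   u1.joinedTokens(false); // => 'trains'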
class Preprocessor {
}
exports.Preprocessor = Preprocessor;
class NopPreprocessor {
preprocess(txt) {
return txt;
}
}
exports.NopPreprocessor = NopPreprocessor;
/**
 * Removes the dots inside acronyms (e.g. "s.e.o." becomes "seo"), even when the final
 * dot is missing or the acronym is immediately followed by a different separator
 */
class AcronymPreprocessor {
constructor(separators) {
this.SEPS_NO_DOT = separators.replace(AcronymPreprocessor.DOT, '');
}
preprocess(txt) {
if (!txt.includes(AcronymPreprocessor.DOT))
return txt;
const wordsAndSeparators = this.splitWordsAndSeparators(txt);
txt = '';
for (const wOrSep of wordsAndSeparators) {
// wOrSep is either a whole word or a single separator character
const isSeparator = this.SEPS_NO_DOT.includes(wOrSep);
if (!isSeparator) {
txt = txt + this.preprocessWord(wOrSep);
}
else {
txt = txt + wOrSep;
}
}
return txt;
}
splitWordsAndSeparators(txt) {
let word = '';
const ret = [];
const pushWord = () => {
if (word) {
ret.push(word);
word = '';
}
};
for (const l of txt) {
if (this.SEPS_NO_DOT.includes(l)) {
pushWord();
ret.push(l);
}
else {
word += l;
}
}
pushWord();
return ret;
}
preprocessWord(w) {
if (w.length <= 2) {
return w;
}
let mustBeDot = false;
for (const l of w) {
const isDot = l == AcronymPreprocessor.DOT;
if (isDot !== mustBeDot) {
return w;
}
mustBeDot = !mustBeDot;
}
return (0, strings_1.replaceAll)(w, AcronymPreprocessor.DOT, '');
}
}
exports.AcronymPreprocessor = AcronymPreprocessor;
AcronymPreprocessor.DOT = '.';
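// Illustrative sketch (not part of the module): stripping the dots of an acronym.
// The separators string below is hypothetical; the Normalizer passes DEFAULT_SEPARATORS.
//   const pre = new AcronymPreprocessor(' .,;');
//   pre.preprocess('the s.e.o. report'); // => 'the seo report'
//   pre.preprocess('u.s.a today');       // => 'usa today'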
class Normalizer {
/**
 * @param preprocessor applied before tokenizing; also applied to separators and stem words
 */
constructor(stemmingBlackListPerLocale = {}, stopWordsForLocale = tokens_1.stopWordsFor, tokenizer = tokens_1.tokenizerPerLocale, separatorsRegex = tokens_1.DEFAULT_SEPARATORS_REGEX, preprocessor = new AcronymPreprocessor(tokens_1.DEFAULT_SEPARATORS)) {
this.tokenizer = tokenizer;
this.separatorsRegex = separatorsRegex;
this.preprocessor = preprocessor;
this.stopWordsPerLocale = new util_1.DynamicSingletonMap(locale => stopWordsForLocale(locale).map(w => this.normalizeWord(locale, w)));
this.stemmingBlackListPerLocale = new util_1.DynamicSingletonMap(l => (stemmingBlackListPerLocale[l] || []).map(bl => bl.normalize(w => this.normalizeWord(l, w))));
}
/**
* @throws EmptyTextException if the text is empty or only contains separators
*/
normalize(locale, raw) {
raw = raw.trim().toLowerCase(); // TODO use preprocess without normalization? move to NormalizedUtterance constructor?
let txt = this.preprocessor.preprocess(raw);
txt = txt.replace(this.separatorsRegex, ' ');
if (!txt.trim()) {
throw new EmptyTextException(raw);
}
const stemmer = (0, stemmer_1.stemmerFor)(locale);
// tokenizer will replace i18n characters
const tokens = this.tokenizer(locale).tokenize(txt, true);
let words = [];
const stopWords = this.stopWordsPerLocale.value(locale);
let numStopWords = 0;
for (const token of tokens) {
const blacklistedStem = this.getBlackListStem(locale, token);
if (blacklistedStem) {
words.push(new Word(token, blacklistedStem));
continue;
}
if (stopWords.includes(token)) {
words.push(Word.StopWord(token));
numStopWords++;
continue;
}
// a token could generate 2 stems (e.g. can't => can not)
const tokenStems = stemmer.stem([token]);
words = words.concat(tokenStems.map(stem => new Word(token, stem)));
}
return new NormalizedUtterance(raw, words, numStopWords == tokens.length);
}
normalizeWord(locale, word) {
word = this.preprocessor.preprocess(word);
return this.tokenizer(locale).tokenize(word.toLowerCase(), true).join(' ');
}
getBlackListStem(locale, word) {
const blacks = this.stemmingBlackListPerLocale.value(locale);
for (const black of blacks) {
if (black.isBlackListed(word)) {
return black.stem;
}
}
return undefined;
}
}
exports.Normalizer = Normalizer;
//# sourceMappingURL=normalizer.js.map
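The sketch below shows one way this module's Normalizer could be wired up. It is a minimal example, not taken from the plugin's documentation: the require path, the 'en' locale and the blacklist values are assumptions, and the exact tokens and stems you get back depend on the tokenizer, stop words and stemmer registered for that locale.
JavaScript
"use strict";
const { Normalizer, StemmingBlackList } = require('./normalizer');

// Hypothetical blacklist: map 'computer'/'computers' to the stem 'pc'
// instead of letting the locale's stemmer decide.
const normalizer = new Normalizer({
  en: [new StemmingBlackList('pc', ['computer', 'computers'])],
});

// normalize() throws EmptyTextException when the text only contains separators.
const utterance = normalizer.normalize('en', 'I want to buy two computers.');
console.log(utterance.joinedTokens(true));  // all tokens, stop words included
console.log(utterance.stems);               // stems of non-stop-word tokens, e.g. including 'pc'
console.log(utterance.hasOnlyStopWords());  // => false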