@jnv/node-nlp
Version:
Library for NLU (Natural Language Understanding) done in Node.js
355 lines (340 loc) • 12.3 kB
JavaScript
/*
* Copyright (c) AXA Shared Services Spain S.A.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
const importLazy = require('import-lazy')(require);
const ArabicStemmer = importLazy('./stemmers/arabic-stemmer');
const ArmenianStemmer = importLazy('./stemmers/armenian-stemmer');
const BasqueStemmer = importLazy('./stemmers/basque-stemmer');
const CatalanStemmer = importLazy('./stemmers/catalan-stemmer');
const CzechStemmer = importLazy('./stemmers/czech-stemmer');
const ChineseStemmer = importLazy('./stemmers/chinese-stemmer');
const ChineseTokenizer = importLazy('./tokenizers/chinese-tokenizer');
const DanishStemmer = importLazy('./stemmers/danish-stemmer');
const DutchStemmer = importLazy('./stemmers/dutch-stemmer');
const EnglishStemmer = importLazy('./stemmers/english-stemmer');
const FinnishStemmer = importLazy('./stemmers/finnish-stemmer');
const FrenchStemmer = importLazy('./stemmers/french-stemmer');
const GermanStemmer = importLazy('./stemmers/german-stemmer');
const HungarianStemmer = importLazy('./stemmers/hungarian-stemmer');
const IrishStemmer = importLazy('./stemmers/irish-stemmer');
const ItalianStemmer = importLazy('./stemmers/italian-stemmer');
const NorwegianStemmer = importLazy('./stemmers/norwegian-stemmer');
const PortugueseStemmer = importLazy('./stemmers/portuguese-stemmer');
const PunctTokenizer = importLazy('./tokenizers/punct-tokenizer');
const RomanianStemmer = importLazy('./stemmers/romanian-stemmer');
const RussianStemmer = importLazy('./stemmers/russian-stemmer');
const SloveneStemmer = importLazy('./stemmers/slovene-stemmer');
const SpanishStemmer = importLazy('./stemmers/spanish-stemmer');
const SwedishStemmer = importLazy('./stemmers/swedish-stemmer');
const TamilStemmer = importLazy('./stemmers/tamil-stemmer');
const TurkishStemmer = importLazy('./stemmers/turkish-stemmer');
const PorterStemmer = importLazy('./stemmers/natural/porter-stemmer');
const PorterStemmerEs = importLazy('./stemmers/natural/porter-stemmer-es');
const PorterStemmerFa = importLazy('./stemmers/natural/porter-stemmer-fa');
const PorterStemmerFr = importLazy('./stemmers/natural/porter-stemmer-fr');
const PorterStemmerRu = importLazy('./stemmers/natural/porter-stemmer-ru');
const PorterStemmerIt = importLazy('./stemmers/natural/porter-stemmer-it');
const PorterStemmerNo = importLazy('./stemmers/natural/porter-stemmer-no');
const PorterStemmerPt = importLazy('./stemmers/natural/porter-stemmer-pt');
const PorterStemmerSv = importLazy('./stemmers/natural/porter-stemmer-sv');
const PorterStemmerNl = importLazy('./stemmers/natural/porter-stemmer-nl');
const StemmerJa = importLazy('./stemmers/natural/stemmer-ja');
const StemmerId = importLazy('./stemmers/natural/indonesian/stemmer_id');
const AggressiveTokenizer = importLazy('./tokenizers/aggressive-tokenizer');
const AggressiveTokenizerFa = importLazy(
'./tokenizers/aggressive-tokenizer-fa'
);
const AggressiveTokenizerFr = importLazy(
'./tokenizers/aggressive-tokenizer-fr'
);
const AggressiveTokenizerRu = importLazy(
'./tokenizers/aggressive-tokenizer-ru'
);
const AggressiveTokenizerEs = importLazy(
'./tokenizers/aggressive-tokenizer-es'
);
const AggressiveTokenizerId = importLazy(
'./tokenizers/aggressive-tokenizer-id'
);
const AggressiveTokenizerIt = importLazy(
'./tokenizers/aggressive-tokenizer-it'
);
const AggressiveTokenizerNl = importLazy(
'./tokenizers/aggressive-tokenizer-nl'
);
const AggressiveTokenizerNo = importLazy(
'./tokenizers/aggressive-tokenizer-no'
);
const AggressiveTokenizerPt = importLazy(
'./tokenizers/aggressive-tokenizer-pt'
);
const AggressiveTokenizerPl = importLazy(
'./tokenizers/aggressive-tokenizer-pl'
);
const AggressiveTokenizerSv = importLazy(
'./tokenizers/aggressive-tokenizer-sv'
);
const TokenizerJa = importLazy('./tokenizers/tokenizer-ja');
/**
 * Culture code returned by NlpUtil.getCulture for each known two-letter
 * locale. A Map is used so that arbitrary input strings (for example
 * 'constructor') can never resolve to an inherited Object property.
 *
 * NOTE(review): 'sl-sl' does not look like a standard language-region tag
 * (Slovenia's region code is 'SI'); the value is kept unchanged for backward
 * compatibility - confirm with the consumers of getCulture before changing.
 */
const CULTURE_BY_LOCALE = new Map([
  ['en', 'en-us'], // English
  ['fa', 'fa-ir'], // Farsi
  ['fr', 'fr-fr'], // French
  ['ru', 'ru-ru'], // Russian
  ['es', 'es-es'], // Spanish
  ['it', 'it-it'], // Italian
  ['nl', 'nl-nl'], // Dutch
  ['no', 'no-no'], // Norwegian
  ['pt', 'pt-br'], // Portuguese
  ['pl', 'pl-pl'], // Polish
  ['sv', 'sv-se'], // Swedish
  ['id', 'id-id'], // Indonesian
  ['ja', 'ja-jp'], // Japanese
  ['ar', 'ar-ae'], // Arabic
  ['hy', 'hy-am'], // Armenian
  ['eu', 'eu-es'], // Basque
  ['ca', 'ca-es'], // Catalan
  ['cs', 'cs-cz'], // Czech
  ['da', 'da-dk'], // Danish
  ['fi', 'fi-fi'], // Finnish
  ['de', 'de-de'], // German
  ['hu', 'hu-hu'], // Hungarian
  ['ga', 'ga-ie'], // Irish
  ['ro', 'ro-ro'], // Romanian
  ['sl', 'sl-sl'], // Slovene
  ['ta', 'ta-in'], // Tamil
  ['tr', 'tr-tr'], // Turkish
  ['zh', 'zh-cn'], // Chinese
]);

/**
 * Static helpers that map a two-letter locale to the stemmer, tokenizer and
 * culture code used for that language.
 */
class NlpUtil {
  /**
   * Given a locale, get the 2 character one.
   * @param {String} locale Locale of the language.
   * @returns {String|undefined} The first two characters of the locale in
   * lower case, or undefined when the locale is falsy (undefined, null, '').
   */
  static getTruncatedLocale(locale) {
    // slice(0, 2) yields the same result as the deprecated substr(0, 2).
    return locale ? locale.slice(0, 2).toLowerCase() : undefined;
  }

  /**
   * Given a 2 character locale, get the stemmer for that language.
   *
   * For en, fr, ru, es, it, no, pt, sv and nl the result depends on
   * NlpUtil.useAlternative[locale]: when truthy a new language specific
   * stemmer is built around NlpUtil.getTokenizer(locale), otherwise the
   * imported Porter stemmer for that language is returned as-is (it is not
   * instantiated here). Unknown locales fall back to PorterStemmer.
   * @param {String} locale Locale in 2 character length.
   * @returns {Object} Stemmer for the locale.
   */
  static getStemmer(locale) {
    const useAlternative = NlpUtil.useAlternative[locale];
    switch (locale) {
      case 'en': // English
        return useAlternative
          ? new EnglishStemmer(NlpUtil.getTokenizer(locale))
          : PorterStemmer;
      case 'fa': // Farsi
        return PorterStemmerFa;
      case 'fr': // French
        return useAlternative
          ? new FrenchStemmer(NlpUtil.getTokenizer(locale))
          : PorterStemmerFr;
      case 'ru': // Russian
        return useAlternative
          ? new RussianStemmer(NlpUtil.getTokenizer(locale))
          : PorterStemmerRu;
      case 'es': // Spanish
        return useAlternative
          ? new SpanishStemmer(NlpUtil.getTokenizer(locale))
          : PorterStemmerEs;
      case 'it': // Italian
        return useAlternative
          ? new ItalianStemmer(NlpUtil.getTokenizer(locale))
          : PorterStemmerIt;
      case 'no': // Norwegian
        return useAlternative
          ? new NorwegianStemmer(NlpUtil.getTokenizer(locale))
          : PorterStemmerNo;
      case 'pt': // Portuguese
        return useAlternative
          ? new PortugueseStemmer(NlpUtil.getTokenizer(locale))
          : PorterStemmerPt;
      case 'sv': // Swedish
        return useAlternative
          ? new SwedishStemmer(NlpUtil.getTokenizer(locale))
          : PorterStemmerSv;
      case 'nl': // Dutch
        return useAlternative
          ? new DutchStemmer(NlpUtil.getTokenizer(locale))
          : PorterStemmerNl;
      case 'id': // Indonesian
        return StemmerId;
      case 'ja': // Japanese
        return new StemmerJa();
      case 'ar': // Arabic
        return new ArabicStemmer(NlpUtil.getTokenizer(locale));
      case 'hy': // Armenian
        return new ArmenianStemmer(NlpUtil.getTokenizer(locale));
      case 'eu': // Basque
        return new BasqueStemmer(NlpUtil.getTokenizer(locale));
      case 'ca': // Catalan
        return new CatalanStemmer(NlpUtil.getTokenizer(locale));
      case 'cs': // Czech
        return new CzechStemmer(NlpUtil.getTokenizer(locale));
      case 'da': // Danish
        return new DanishStemmer(NlpUtil.getTokenizer(locale));
      case 'fi': // Finnish
        return new FinnishStemmer(NlpUtil.getTokenizer(locale));
      case 'de': // German
        return new GermanStemmer(NlpUtil.getTokenizer(locale));
      case 'hu': // Hungarian
        return new HungarianStemmer(NlpUtil.getTokenizer(locale));
      case 'ga': // Irish
        return new IrishStemmer(NlpUtil.getTokenizer(locale));
      case 'ro': // Romanian
        return new RomanianStemmer(NlpUtil.getTokenizer(locale));
      case 'sl': // Slovene
        return new SloveneStemmer(NlpUtil.getTokenizer(locale));
      case 'ta': // Tamil
        return new TamilStemmer(NlpUtil.getTokenizer(locale));
      case 'tr': // Turkish
        return new TurkishStemmer(NlpUtil.getTokenizer(locale));
      case 'zh': // Chinese
        return new ChineseStemmer();
      default:
        return PorterStemmer;
    }
  }

  /**
   * Given a 2 character locale, build a new tokenizer for that language.
   * Locales without a dedicated tokenizer, as well as unknown locales, get a
   * PunctTokenizer.
   * @param {String} locale Locale in 2 character length.
   * @returns {Object} A newly created tokenizer instance.
   */
  static getTokenizer(locale) {
    switch (locale) {
      case 'en': // English
        return new AggressiveTokenizer();
      case 'fa': // Farsi
        return new AggressiveTokenizerFa();
      case 'fr': // French
        return new AggressiveTokenizerFr();
      case 'ru': // Russian
        return new AggressiveTokenizerRu();
      case 'es': // Spanish
        return new AggressiveTokenizerEs();
      case 'it': // Italian
        return new AggressiveTokenizerIt();
      case 'nl': // Dutch
        return new AggressiveTokenizerNl();
      case 'no': // Norwegian
        return new AggressiveTokenizerNo();
      case 'pt': // Portuguese
        return new AggressiveTokenizerPt();
      case 'pl': // Polish
        return new AggressiveTokenizerPl();
      case 'sv': // Swedish
        return new AggressiveTokenizerSv();
      case 'id': // Indonesian
        return new AggressiveTokenizerId();
      case 'ja': // Japanese
        return new TokenizerJa();
      case 'zh': // Chinese
        return new ChineseTokenizer();
      // The locales below are listed explicitly to document that they are
      // known; they share the punctuation tokenizer with the default branch.
      case 'ar': // Arabic
      case 'hy': // Armenian
      case 'eu': // Basque
      case 'ca': // Catalan
      case 'cs': // Czech
      case 'da': // Danish
      case 'fi': // Finnish
      case 'de': // German
      case 'hu': // Hungarian
      case 'ga': // Irish
      case 'ro': // Romanian
      case 'sl': // Slovene
      case 'ta': // Tamil
      case 'tr': // Turkish
      default:
        return new PunctTokenizer();
    }
  }

  /**
   * Given a 2 character locale, get the culture code associated with it.
   * @param {String} locale Locale in 2 character length.
   * @returns {String} Culture code such as 'en-us'; 'en-us' is also the
   * fallback for unknown locales.
   */
  static getCulture(locale) {
    return CULTURE_BY_LOCALE.get(locale) || 'en-us';
  }
}
/**
 * Per-locale flags, all initially false. NlpUtil.getStemmer reads the entry
 * for en, fr, ru, es, it, no, pt, sv and nl: a truthy value makes it build the
 * language specific stemmer instead of returning the Porter one. The remaining
 * entries are not read by the code in this file.
 */
NlpUtil.useAlternative = [
  'en',
  'fa',
  'fr',
  'ru',
  'es',
  'it',
  'nl',
  'no',
  'pt',
  'pl',
  'sv',
  'id',
  'ja',
  'ca',
  'da',
  'fi',
  'de',
  'hu',
  'ro',
  'tr',
].reduce((flags, localeCode) => {
  // Keys are inserted in list order, so enumeration order is preserved.
  flags[localeCode] = false;
  return flags;
}, {});

module.exports = NlpUtil;