UNPKG

phpmorphy

Version:

Original package is located at http://phpmorphy.sourceforge.net/

809 lines (685 loc) 20.2 kB
import path from 'path';
import _ from 'lodash';
import { php, toBuffer } from '~/utils';
import { Fsa } from './fsa/fsa';
import { SourceFsa } from './source';
import { GramTabProxy } from './gramtab';
import { StorageFactory } from './storage';
import { GrammemsProviderFactory, GrammemsProviderInterface } from './langs-stuff/common';
import { GramInfoRuntimeCaching, GramInfoProxyWithHeader, GramInfoAncodeCache } from './graminfo/graminfo';
import {
  AncodesResolverProxy,
  MorphierHelper,
  MorphierCommon,
  MorphierBulk,
  MorphierPredictDatabase,
  MorphierEmpty,
  MorphierPredictSuffix,
} from './morphiers';
import {
  STORAGE_FILE,
  STORAGE_MEM,
  SOURCE_FSA,
  RESOLVE_ANCODES_AS_TEXT,
  RESOLVE_ANCODES_AS_DIALING,
  RESOLVE_ANCODES_AS_INT,
  NORMAL,
  IGNORE_PREDICT,
  ONLY_PREDICT,
  PREDICT_BY_NONE,
  PREDICT_BY_SUFFIX,
  PREDICT_BY_DB,
} from './constants';

/**
 * Builds the paths of the dictionary files for one language inside one
 * directory. Every path has the shape `<dir>/<token>.<lang>[.<extraExt>].bin`.
 */
class FilesBundle {
  /**
   * @param {string} dirName - directory that contains the dictionary files
   * @param {string} lang - language identifier; stored lower-cased
   * @throws {TypeError} when `lang` is not a string
   */
  constructor(dirName, lang) {
    this.dir = dirName;
    this.setLang(lang);
  }

  /**
   * @returns {string} the lower-cased language identifier
   */
  getLang() {
    return this.lang;
  }

  /**
   * @param {string} lang - language identifier; stored lower-cased
   * @throws {TypeError} when `lang` is not a string
   */
  setLang(lang) {
    // Fail with a readable message instead of the opaque
    // "Cannot read properties of null (reading 'toLowerCase')".
    if (!_.isString(lang)) {
      throw new TypeError(`Language must be a string, got ${lang === null ? 'null' : typeof lang}`);
    }

    this.lang = lang.toLowerCase();
  }

  /** @returns {string} path built from the `common_aut` token */
  getCommonAutomatFile() {
    return this.genFileName('common_aut');
  }

  /** @returns {string} path built from the `predict_aut` token */
  getPredictAutomatFile() {
    return this.genFileName('predict_aut');
  }

  /** @returns {string} path built from the `morph_data` token */
  getGramInfoFile() {
    return this.genFileName('morph_data');
  }

  /** @returns {string} path built from the `morph_data_ancodes_cache` token */
  getGramInfoAncodesCacheFile() {
    return this.genFileName('morph_data_ancodes_cache');
  }

  /** @returns {string} path built from the `morph_data_ancodes_map` token */
  getAncodesMapFile() {
    return this.genFileName('morph_data_ancodes_map');
  }

  /** @returns {string} path built from the `gramtab` token */
  getGramTabFile() {
    return this.genFileName('gramtab');
  }

  /** @returns {string} path built from the `gramtab_txt` token */
  getGramTabFileWithTextIds() {
    return this.genFileName('gramtab_txt');
  }

  /** @returns {string} path built from the `morph_data_header_cache` token */
  getGramInfoHeaderCacheFile() {
    return this.genFileName('morph_data_header_cache');
  }

  /**
   * @param {string} token - base name of the file
   * @param {string} [extraExt] - optional extra extension inserted before `.bin`
   * @returns {string} `<dir>/<token>.<lang>[.<extraExt>].bin`
   */
  genFileName(token, extraExt) {
    const ext = !_.isUndefined(extraExt) ? extraExt : null;
    const extPart = php.var.isset(ext) ? `.${ext}` : '';

    return path.join(this.dir, [token, '.', this.lang, extPart, '.bin'].join(''));
  }
}

/**
 * Converts a collection of word descriptors into plain arrays/objects.
 */
class WordDescriptorCollectionSerializer {
  /**
   * @param {WordDescriptorCollection} collection
   * @param {boolean} [asText=false]
   * @returns {*} array with one serialized entry per descriptor
   */
  serialize(collection, asText = false) {
    const result = [];

    _.forEach(collection, descriptor => {
      result.push(this.processWordDescriptor(descriptor, asText));
    });

    return result;
  }

  /**
   * @param {WordDescriptor} descriptor
   * @param {boolean} [asText=false]
   * @returns {{forms: *, all: *, common: string}} `forms` holds the word of every
   *   word form, `all` the serialized grammar info of every word form; `common`
   *   is always an empty string
   */
  processWordDescriptor(descriptor, asText = false) {
    const all = [];
    const forms = [];

    _.forEach(descriptor, wordForm => {
      forms.push(wordForm.getWord());
      all.push(this.serializeGramInfo(wordForm, asText));
    });

    return {
      all,
      forms,
      common: '',
    };
  }

  /**
   * @param {WordForm} wordForm
   * @param {boolean} [asText=false] - when true returns `"<pos> <grammem>,<grammem>..."`,
   *   otherwise an object `{ pos, grammems }`
   * @returns {*}
   */
  serializeGramInfo(wordForm, asText = false) {
    if (asText) {
      return `${wordForm.getPartOfSpeech()} ${php.strings.implode(',', wordForm.getGrammems())}`;
    }

    return {
      pos: wordForm.getPartOfSpeech(),
      grammems: wordForm.getGrammems(),
    };
  }
}

/**
 * Entry point of the library. Construction opens the storages for the common
 * and predict automata, the gram info and the gramtab files; the morphier
 * objects themselves are created lazily on first use.
 */
class PhpMorphy {
  /**
   * @param {string} dir - directory with the dictionary files
   * @param {string} [lang=null] - dictionary language; must be a string,
   *   otherwise building the files bundle throws a TypeError
   * @param {{}} [options={}] - see `repairOptions` for the recognised keys
   */
  constructor(dir, lang = null, options = {}) {
    this.options = this.repairOptions(options);
    this.init(this.createFilesBundle(dir, lang), this.options);
    this.last_prediction_type = PREDICT_BY_NONE;
  }

  /**
   * Opens the storages and creates the automata and the morphier helper.
   *
   * @param {FilesBundle} bundle
   * @param options
   */
  init(bundle, options) {
    this.options = this.repairOptions(options);
    this.storage_factory = this.createStorageFactory();

    this.common_fsa = this.createFsa(
      this.storage_factory.open(this.options.storage, bundle.getCommonAutomatFile(), false),
      false,
    );

    this.predict_fsa = this.createFsa(
      this.storage_factory.open(this.options.storage, bundle.getPredictAutomatFile(), true),
      true,
    );

    const graminfo = this.createGramInfo(
      this.storage_factory.open(this.options.storage, bundle.getGramInfoFile(), true),
      bundle,
    );

    const gramtab = this.createGramTab(
      this.storage_factory.open(
        this.options.storage,
        this.options.graminfo_as_text ? bundle.getGramTabFileWithTextIds() : bundle.getGramTabFile(),
        true,
      ),
    );

    this.helper = this.createMorphierHelper(graminfo, gramtab, this.options.graminfo_as_text, bundle);
  }

  /**
   * @return {MorphierInterface}
   */
  getCommonMorphier() {
    return this.__common_morphier;
  }

  /**
   * @return {MorphierInterface}
   */
  getPredictBySuffixMorphier() {
    return this.__predict_by_suf_morphier;
  }

  /**
   * @return {MorphierInterface}
   */
  getPredictByDatabaseMorphier() {
    return this.__predict_by_db_morphier;
  }

  /**
   * @return {MorphierBulk}
   */
  getBulkMorphier() {
    return this.__bulk_morphier;
  }

  /**
   * @return {string}
   */
  getEncoding() {
    return this.helper.getGramInfo().getEncoding();
  }

  /**
   * @return {string}
   */
  getLocale() {
    return this.helper.getGramInfo().getLocale();
  }

  /**
   * Returns a deep clone of the default grammems provider.
   *
   * @return {GrammemsProviderBase}
   */
  getGrammemsProvider() {
    return _.cloneDeep(this.__grammems_provider);
  }

  /**
   * Returns the shared (not cloned) grammems provider instance.
   *
   * @return {GrammemsProviderBase}
   */
  getDefaultGrammemsProvider() {
    return this.__grammems_provider;
  }

  /**
   * @return {boolean} true when the last lookup was answered by prediction
   */
  isLastPredicted() {
    return this.last_prediction_type !== PREDICT_BY_NONE;
  }

  /**
   * @returns {string} PREDICT_BY_NONE, PREDICT_BY_SUFFIX or PREDICT_BY_DB
   */
  getLastPredictionType() {
    return this.last_prediction_type;
  }

  /**
   * @param {*} word - string or array of strings
   * @param {*} [type=NORMAL] - prediction management
   * @return {WordDescriptorCollection|{}} for an array: an object keyed by word
   */
  findWord(word, type = NORMAL) {
    const result = {};

    if (php.var.is_array(word)) {
      word.forEach(w => {
        result[w] = this.invoke('getWordDescriptor', toBuffer(w), type);
      });

      return result;
    }

    return this.invoke('getWordDescriptor', word, type);
  }

  /**
   * Alias for getBaseForm
   *
   * @param {*} word - string or array of strings
   * @param {*} [type=NORMAL] - prediction management
   * @return {[]}
   */
  lemmatize(word, type = NORMAL) {
    return this.getBaseForm(toBuffer(word), type);
  }

  /**
   * @param {*} word - string or array of strings
   * @param {*} [type=NORMAL] - prediction management
   * @return {[]}
   */
  getBaseForm(word, type = NORMAL) {
    return this.invoke('getBaseForm', toBuffer(word), type);
  }

  /**
   * @param {*} word - string or array of strings
   * @param {*} [type=NORMAL] - prediction management
   * @return {[]}
   */
  getAllForms(word, type = NORMAL) {
    return this.invoke('getAllForms', toBuffer(word), type);
  }

  /**
   * @param {*} word - string or array of strings
   * @param {*} [type=NORMAL] - prediction management
   * @return {[]}
   */
  getPseudoRoot(word, type = NORMAL) {
    return this.invoke('getPseudoRoot', toBuffer(word), type);
  }

  /**
   * @param {*} word - string or array of strings
   * @param {*} [type=NORMAL] - prediction management
   * @return {[]}
   */
  getPartOfSpeech(word, type = NORMAL) {
    return this.invoke('getPartOfSpeech', toBuffer(word), type);
  }

  /**
   * @param {*} word - string or array of strings
   * @param {*} [type=NORMAL] - prediction management
   * @return {[]}
   */
  getAllFormsWithAncodes(word, type = NORMAL) {
    return this.invoke('getAllFormsWithAncodes', toBuffer(word), type);
  }

  /**
   * @param {*} word - string or array of strings
   * @param {boolean} [asText=true] - represent graminfo as text or ancodes
   * @param {*} [type=NORMAL] - prediction management
   * @return {*} false when nothing was found; for an array input an object
   *   keyed by word whose values are serialized collections or false
   */
  getAllFormsWithGramInfo(word, asText = true, type = NORMAL) {
    const result = this.findWord(toBuffer(word), type);

    if (!result) {
      return false;
    }

    if (php.var.is_array(word)) {
      const out = {};

      _.forEach(result, (r, w) => {
        if (r !== false) {
          out[w] = this.processWordsCollection(r, asText);
        } else {
          out[w] = false;
        }
      });

      return out;
    }

    return this.processWordsCollection(result, asText);
  }

  /**
   * @param {*} word - string or array of strings
   * @param {*} [type=NORMAL] - prediction management
   * @return {[]}
   */
  getAncode(word, type = NORMAL) {
    return this.invoke('getAncode', toBuffer(word), type);
  }

  /**
   * @param {*} word - string or array of strings
   * @param {*} [type=NORMAL] - prediction management
   * @return {[]}
   */
  getGramInfo(word, type = NORMAL) {
    return this.invoke('getGrammarInfo', toBuffer(word), type);
  }

  /**
   * @param {*} word - string or array of strings
   * @param {*} [type=NORMAL] - prediction management
   * @return {[]}
   */
  getGramInfoMergeForms(word, type = NORMAL) {
    return this.invoke('getGrammarInfoMergeForms', toBuffer(word), type);
  }

  /**
   * @param {*} word - string or array of strings
   * @param [type=NORMAL] - prediction management
   * @returns {[]}
   */
  getAnnotForWord(word, type = NORMAL) {
    return this.invoke('getAnnot', toBuffer(word), type);
  }

  /**
   * Resolves the ancodes to a part of speech and grammems, then delegates to
   * `castFormByGramInfo`.
   *
   * @param {string} word
   * @param {*} ancode
   * @param {*} [commonAncode=null]
   * @param {boolean} [returnOnlyWord=false]
   * @param {*} [callback=null]
   * @param {*} [type=NORMAL]
   * @return {[]}
   */
  castFormByAncode(word, ancode, commonAncode = null, returnOnlyWord = false, callback = null, type = NORMAL) {
    const resolver = this.helper.getAncodesResolver();
    const commonAncodeId = resolver.unresolve(commonAncode);
    const ancodeId = resolver.unresolve(ancode);
    // data[0] is passed on as the part of speech, data[1] as the grammems.
    const data = this.helper.getGrammemsAndPartOfSpeech(ancodeId);

    if (php.var.isset(commonAncodeId)) {
      data[1] = php.array.array_merge(data[1], this.helper.getGrammems(commonAncodeId));
    }

    return this.castFormByGramInfo(toBuffer(word), data[0], data[1], returnOnlyWord, callback, type);
  }

  /**
   * @param {string} word
   * @param {*} partOfSpeech
   * @param {[]} grammems
   * @param {boolean} [returnOnlyWord=false]
   * @param {*} [callback=null]
   * @param {*} [type=NORMAL]
   * @return {[]|boolean} false when no annotation is found for `word`
   */
  castFormByGramInfo(word, partOfSpeech, grammems, returnOnlyWord = false, callback = null, type = NORMAL) {
    const annot = this.getAnnotForWord(toBuffer(word), type);

    if (!annot) {
      return false;
    }

    return this.helper.castFormByGramInfo(toBuffer(word), annot, partOfSpeech, grammems, returnOnlyWord, callback);
  }

  /**
   * Casts `word` using the grammar info of every form of `patternWord`.
   *
   * @param {string} word
   * @param {string} patternWord
   * @param {GrammemsProviderInterface} [grammemsProvider=null] - anything that is
   *   not a GrammemsProviderInterface instance falls back to the default provider
   * @param {boolean} [returnOnlyWord=false] - when true the result is de-duplicated
   * @param {*} [callback=null]
   * @param {*} [type=NORMAL]
   * @return {[]|boolean} false when no annotation is found for `word`
   */
  castFormByPattern(
    word,
    patternWord,
    grammemsProvider = null,
    returnOnlyWord = false,
    callback = null,
    type = NORMAL,
  ) {
    const wordBuf = toBuffer(word);
    const patternBuf = toBuffer(patternWord);
    const wordAnnot = this.getAnnotForWord(wordBuf, type);

    if (!wordAnnot) {
      return false;
    }

    const provider =
      grammemsProvider instanceof GrammemsProviderInterface ? grammemsProvider : this.__grammems_provider;

    let result = [];

    _.forEach(this.getGramInfo(patternBuf, type), paradigm => {
      _.forEach(paradigm, grammar => {
        const pos = grammar.pos;
        const essentialGrammems = provider.getGrammems(pos);
        // When the provider has no entry for this part of speech (false),
        // all grammems of the pattern form are used.
        const grammems =
          essentialGrammems !== false
            ? php.array.array_intersect(grammar.grammems, essentialGrammems)
            : grammar.grammems;

        const res = this.helper.castFormByGramInfo(wordBuf, wordAnnot, pos, grammems, returnOnlyWord, callback, type);

        if (res.length > 0) {
          result = php.array.array_merge(result, res);
        }
      });
    });

    return returnOnlyWord ? _.uniq(result) : result;
  }

  /**
   * @param {WordDescriptorCollection} collection
   * @param {boolean} asText
   * @returns {*}
   */
  processWordsCollection(collection, asText) {
    return this.__word_descriptor_serializer.serialize(collection, asText);
  }

  /**
   * Dispatches `method` to the appropriate morphier and applies the
   * prediction policy selected by `type`. Resets `last_prediction_type`
   * before every lookup.
   *
   * @param {string} method - name of the morphier method to call
   * @param {*} word - single word or array of words
   * @param {*} type - NORMAL, IGNORE_PREDICT or ONLY_PREDICT
   * @returns {*} the morphier result; for an array of words an object keyed by word
   */
  invoke(method, word, type) {
    this.last_prediction_type = PREDICT_BY_NONE;

    const input = toBuffer(word);
    const isBulk = php.var.is_array(input);

    if (type === ONLY_PREDICT) {
      if (!isBulk) {
        return this.predictWord(method, input);
      }

      const predicted = {};

      // The callback must not return the assigned value: lodash stops
      // iterating as soon as an iteratee returns `false`, which is exactly
      // what predictWord yields for a word it cannot predict.
      _.forEach(input, w => {
        predicted[w] = this.predictWord(method, w);
      });

      return predicted;
    }

    if (isBulk) {
      const result = this.__bulk_morphier[method](input);
      const notFound = this.__bulk_morphier.getNotFoundWords();

      _.forEach(notFound, missingWord => {
        result[missingWord] = type !== IGNORE_PREDICT ? this.predictWord(method, missingWord) : false;
      });

      return result;
    }

    let result = this.__common_morphier[method](input);

    if (!result && type !== IGNORE_PREDICT) {
      result = this.predictWord(method, input);
    }

    return result;
  }

  /**
   * Tries the suffix predictor first, then the database predictor, and
   * records which one answered in `last_prediction_type`.
   *
   * @param {string} method - name of the morphier method to call
   * @param {*} word
   * @returns {*} the predictor result, or false when both predictors return false
   */
  predictWord(method, word) {
    const input = toBuffer(word);

    let result = this.__predict_by_suf_morphier[method](input);

    if (result !== false) {
      this.last_prediction_type = PREDICT_BY_SUFFIX;
      return result;
    }

    result = this.__predict_by_db_morphier[method](input);

    if (result !== false) {
      this.last_prediction_type = PREDICT_BY_DB;
      return result;
    }

    return false;
  }

  /**
   * @param {FilesBundle} bundle
   * @param {{}} opts
   * @returns {*}
   * @throws {Error} when `opts.type` is not a known source type
   */
  createCommonSource(bundle, opts) {
    const type = opts.type;

    // eslint-disable-next-line sonarjs/no-small-switch
    switch (type) {
      case SOURCE_FSA:
        return new SourceFsa(this.common_fsa);
      default:
        throw new Error(`Unknown source type given '${type}'`);
    }
  }

  /**
   * @param {{}} [options={}]
   * @returns {{}} options with both prediction flags defaulting to false
   */
  repairOldOptions(options = {}) {
    const defaults = {
      predict_by_suffix: false,
      predict_by_db: false,
    };

    return Object.assign(defaults, options);
  }

  /**
   * @param {{}} [options={}]
   * @returns {{type: *, opts: *}} a new object: defaults overlaid with `options`
   */
  repairSourceOptions(options = {}) {
    const defaults = {
      type: SOURCE_FSA,
      opts: null,
    };

    return Object.assign(defaults, options);
  }

  /**
   * Fills in the default value of every option the caller did not supply.
   *
   * @param {{}} [options={}]
   * @returns {{}} a new options object; the argument is not modified
   */
  repairOptions(options = {}) {
    const commonSource = this.repairSourceOptions(options.common_source || null);

    const defaults = {
      graminfo_as_text: true,
      storage: STORAGE_MEM,
      common_source: commonSource,
      predict_by_suffix: true,
      predict_by_db: true,
      use_ancodes_cache: false,
      resolve_ancodes: RESOLVE_ANCODES_AS_TEXT,
    };

    // The repaired common_source is applied last; otherwise the raw
    // `options.common_source` would overwrite it and drop its defaults.
    return Object.assign(defaults, options, { common_source: commonSource });
  }

  // Lazily created collaborators: each getter builds its object on first
  // access and caches it in the matching `___`-prefixed field.

  get __predict_by_db_morphier() {
    if (!this.___predict_by_db_morphier) {
      this.___predict_by_db_morphier = this.createPredictByDbMorphier(this.predict_fsa, this.helper);
    }

    return this.___predict_by_db_morphier;
  }

  set __predict_by_db_morphier(value) {
    this.___predict_by_db_morphier = !_.isUndefined(value) ? value : null;
  }

  get __predict_by_suf_morphier() {
    if (!this.___predict_by_suf_morphier) {
      this.___predict_by_suf_morphier = this.createPredictBySuffixMorphier(this.common_fsa, this.helper);
    }

    return this.___predict_by_suf_morphier;
  }

  set __predict_by_suf_morphier(value) {
    this.___predict_by_suf_morphier = !_.isUndefined(value) ? value : null;
  }

  get __bulk_morphier() {
    if (!this.___bulk_morphier) {
      this.___bulk_morphier = this.createBulkMorphier(this.common_fsa, this.helper);
    }

    return this.___bulk_morphier;
  }

  set __bulk_morphier(value) {
    this.___bulk_morphier = !_.isUndefined(value) ? value : null;
  }

  get __common_morphier() {
    if (!this.___common_morphier) {
      this.___common_morphier = this.createCommonMorphier(this.common_fsa, this.helper);
    }

    return this.___common_morphier;
  }

  set __common_morphier(value) {
    this.___common_morphier = !_.isUndefined(value) ? value : null;
  }

  get __word_descriptor_serializer() {
    if (!this.___word_descriptor_serializer) {
      this.___word_descriptor_serializer = this.createWordDescriptorSerializer();
    }

    return this.___word_descriptor_serializer;
  }

  set __word_descriptor_serializer(value) {
    this.___word_descriptor_serializer = !_.isUndefined(value) ? value : null;
  }

  get __grammems_provider() {
    if (!this.___grammems_provider) {
      this.___grammems_provider = this.createGrammemsProvider();
    }

    return this.___grammems_provider;
  }

  set __grammems_provider(value) {
    this.___grammems_provider = !_.isUndefined(value) ? value : null;
  }

  // //////////////////
  // factory methods
  // //////////////////

  createGrammemsProvider() {
    return GrammemsProviderFactory.create(this);
  }

  createWordDescriptorSerializer() {
    return new WordDescriptorCollectionSerializer();
  }

  createFilesBundle(dir, lang) {
    return new FilesBundle(dir, lang);
  }

  createStorageFactory() {
    return new StorageFactory();
  }

  /**
   * @param {Storage} storage
   * @param {boolean} lazy
   * @returns {*}
   */
  createFsa(storage, lazy) {
    return Fsa.create(storage, lazy);
  }

  /**
   * Wraps the gram info storage in the caching layers; adds the ancodes
   * cache only when the `use_ancodes_cache` option is set.
   *
   * @param {Storage} graminfoFile
   * @param {FilesBundle} bundle
   */
  createGramInfo(graminfoFile, bundle) {
    const result = new GramInfoRuntimeCaching(
      new GramInfoProxyWithHeader(graminfoFile, bundle.getGramInfoHeaderCacheFile()),
    );

    if (this.options.use_ancodes_cache) {
      return new GramInfoAncodeCache(
        result,
        this.storage_factory.open(this.options.storage, bundle.getGramInfoAncodesCacheFile(), true),
      );
    }

    return result;
  }

  /**
   * @param {Storage} storage
   * @returns {GramTabProxy}
   */
  createGramTab(storage) {
    return new GramTabProxy(storage);
  }

  /**
   * Maps the `resolve_ancodes` option to a resolver class name and its
   * constructor arguments.
   *
   * @param {GramTabInterface} gramtab
   * @param {FilesBundle} bundle
   * @returns {[string, []]} `[className, args]`
   * @throws {Error} when the `resolve_ancodes` option has an unknown value
   */
  createAncodesResolverInternal(gramtab, bundle) {
    switch (this.options.resolve_ancodes) {
      case RESOLVE_ANCODES_AS_TEXT:
        return ['AncodesResolverToText', [gramtab]];
      case RESOLVE_ANCODES_AS_INT:
        return ['AncodesResolverAsIs', []];
      case RESOLVE_ANCODES_AS_DIALING:
        return [
          'AncodesResolverToDialingAncodes',
          [this.storage_factory.open(this.options.storage, bundle.getAncodesMapFile(), true)],
        ];
      default:
        throw new Error(
          'Invalid resolve_ancodes option, valid values are RESOLVE_ANCODES_AS_DIALING, RESOLVE_ANCODES_AS_INT, RESOLVE_ANCODES_AS_TEXT',
        );
    }
  }

  /**
   * @param {GramTabInterface} gramtab
   * @param {FilesBundle} bundle
   * @param {boolean} lazy - when true returns a proxy, otherwise instantiates
   *   the resolver through `AncodesResolverProxy.instantinate`
   */
  createAncodesResolver(gramtab, bundle, lazy) {
    const [className, args] = this.createAncodesResolverInternal(gramtab, bundle);

    if (lazy) {
      return new AncodesResolverProxy(className, args);
    }

    return AncodesResolverProxy.instantinate(className, args);
  }

  /**
   * @param {GramInfoInterface} graminfo
   * @param {GramTabInterface} gramtab
   * @param {boolean} graminfoAsText
   * @param {FilesBundle} bundle
   */
  createMorphierHelper(graminfo, gramtab, graminfoAsText, bundle) {
    return new MorphierHelper(graminfo, gramtab, this.createAncodesResolver(gramtab, bundle, true), graminfoAsText);
  }

  /**
   * @param {FsaInterface} fsa
   * @param {MorphierHelper} helper
   */
  createCommonMorphier(fsa, helper) {
    return new MorphierCommon(fsa, helper);
  }

  /**
   * @param {FsaInterface} fsa
   * @param {MorphierHelper} helper
   */
  createBulkMorphier(fsa, helper) {
    return new MorphierBulk(fsa, helper);
  }

  /**
   * Returns a MorphierEmpty when the `predict_by_db` option is off.
   *
   * @param {FsaInterface} fsa
   * @param {MorphierHelper} helper
   */
  createPredictByDbMorphier(fsa, helper) {
    if (this.options.predict_by_db) {
      return new MorphierPredictDatabase(fsa, helper);
    }

    return new MorphierEmpty();
  }

  /**
   * Returns a MorphierEmpty when the `predict_by_suffix` option is off.
   *
   * @param {FsaInterface} fsa
   * @param {MorphierHelper} helper
   */
  createPredictBySuffixMorphier(fsa, helper) {
    if (this.options.predict_by_suffix) {
      return new MorphierPredictSuffix(fsa, helper);
    }

    return new MorphierEmpty();
  }
}

export default PhpMorphy;
export {
  STORAGE_FILE,
  STORAGE_MEM,
  SOURCE_FSA,
  RESOLVE_ANCODES_AS_TEXT,
  RESOLVE_ANCODES_AS_DIALING,
  RESOLVE_ANCODES_AS_INT,
  NORMAL,
  IGNORE_PREDICT,
  ONLY_PREDICT,
  PREDICT_BY_NONE,
  PREDICT_BY_SUFFIX,
  PREDICT_BY_DB,
};