UNPKG

phpmorphy

Version:

Original package is located at http://phpmorphy.sourceforge.net/

2,064 lines (1,647 loc) 49.3 kB
/** * This file is part of phpMorphy library * * Copyright c 2007-2008 Kamaev Vladimir <heromantor@users.sourceforge.net> * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. */ import _ from 'lodash'; import { php, toBuffer, castArray, isStringifyedNumber } from '~/utils'; import { UnicodeHelper } from './unicode'; import { FsaWordsCollector } from './fsa/fsa'; // ---------------------------- // Morphier interface // ---------------------------- class MorphierInterface { getAnnot(word) {} getBaseForm(word) {} getAllForms(word) {} getPseudoRoot(word) {} getPartOfSpeech(word) {} getWordDescriptor(word) {} getAllFormsWithAncodes(word) {} getAncode(word) {} getGrammarInfoMergeForms(word) {} getGrammarInfo(word) {} } class MorphierEmpty extends MorphierInterface { getAnnot(word) { return false; } getBaseForm(word) { return false; } getAllForms(word) { return false; } getAllFormsWithGramInfo(word) { return false; } getPseudoRoot(word) { return false; } getPartOfSpeech(word) { return false; } getWordDescriptor(word) { return false; } getAllFormsWithAncodes(word) { return false; } getAncode(word) { return false; } getGrammarInfoMergeForms(word) { return false; } getGrammarInfo(word) { return false; } /** * @param word * @param partOfSpeech * @param grammems * @param {boolean} [returnWords=false] * @param {*} [callback=null] * @returns {boolean} */ castFormByGramInfo(word, partOfSpeech, grammems, returnWords, callback) { return false; } } // ---------------------------- // Annot decoder // ---------------------------- class AnnotDecoderInterface { decode(annotsRaw, withBase) {} } class AnnotDecoderBase extends AnnotDecoderInterface { static get INVALID_ANCODE_ID() { return 0xffff; } constructor(ends) { super(); this.ends = ends; this.unpack_str = this.getUnpackString(); this.block_size = this.getUnpackBlockSize(); } getUnpackString() {} getUnpackBlockSize() {} decode(annotRawBuf, withBase) { if (php.var.empty(annotRawBuf)) { throw new Error('Empty annot given'); } const annotRaw = annotRawBuf.toString(); const blockSize = this.block_size; const unpackStr = this.unpack_str; let result = php.unpack(`Vcount/${unpackStr}`, annotRawBuf); let res; let items; let start; if (result === false) { throw new Error(`Invalid annot string '${annotRaw}'`); } if (result.common_ancode === AnnotDecoderBase.INVALID_ANCODE_ID) { result.common_ancode = null; } const { count } = result; result = [result]; if (count > 1) { for (let i = 0; i < count - 1; i++) { start = 4 + (i + 1) * blockSize; res = php.unpack(unpackStr, annotRawBuf.slice(start, start + blockSize)); if (res.common_ancode === AnnotDecoderBase.INVALID_ANCODE_ID) { res.common_ancode = null; } result.push(res); } } if (withBase) { start = 4 + count * blockSize; items = annotRawBuf .slice(start) .toString() .split(this.ends.toString()); for (let i = 0; i < count; i++) { result[i].base_prefix = items[i * 2]; result[i].base_suffix = items[i * 2 + 1]; } } return result; } } class AnnotDecoderCommon extends AnnotDecoderBase { getUnpackString() { return [ 'Voffset', 'vcplen', 'vplen', 'vflen', 'vcommon_ancode', 'vforms_count', 'vpacked_forms_count', 'vaffixes_size', 'vform_no', 'vpos_id', ].join('/'); } getUnpackBlockSize() { return 22; } } class AnnotDecoderPredict extends AnnotDecoderCommon { getUnpackString() { return [super.getUnpackString(), 'vfreq'].join('/'); } getUnpackBlockSize() { return super.getUnpackBlockSize() + 2; } } const AnnotDecoderFactoryInstances = {}; class AnnotDecoderFactory { static get instances() { return AnnotDecoderFactoryInstances; } static get AnnotDecoders() { return { AnnotDecoderCommon, AnnotDecoderPredict, }; } static create(eos) { const { instances } = AnnotDecoderFactory; if (!php.var.isset(instances[eos])) { instances[eos] = new AnnotDecoderFactory(eos); } return instances[eos]; } constructor(eos) { this.eos = eos; } getCommonDecoder() { if (!php.var.isset(this.cache_common)) { this.cache_common = this.instantinate('common'); } return this.cache_common; } getPredictDecoder() { if (!php.var.isset(this.cache_predict)) { this.cache_predict = this.instantinate('predict'); } return this.cache_predict; } instantinate(type) { const className = `AnnotDecoder${_.upperFirst(type.toLowerCase())}`; return new AnnotDecoderFactory.AnnotDecoders[className](this.eos); } } class AncodesResolverInterface { resolve(ancodeId) {} unresolve(ancode) {} } class AncodesResolverToText extends AncodesResolverInterface { /** * @param {GramTabInterface} gramtab * @private */ constructor(gramtab) { super(); this.gramtab = gramtab; } resolve(ancodeId) { if (!php.var.isset(ancodeId)) { return null; } return this.gramtab.ancodeToString(ancodeId); } unresolve(ancode) { return this.gramtab.stringToAncode(ancode); } } class AncodesResolverToDialingAncodes extends AncodesResolverInterface { /** * @param {Storage} ancodesMap */ constructor(ancodesMap) { super(); this.ancodes_map = php.var.unserialize(ancodesMap.read(0, ancodesMap.getFileSize()).toString()); if (this.ancodes_map === false) { throw new Error('Can`t open phpMorphy => Dialing ancodes map'); } this.reverse_map = php.array.array_flip(this.ancodes_map); } unresolve(ancode) { if (!php.var.isset(ancode)) { return null; } if (!php.var.isset(this.reverse_map[ancode])) { throw new Error(`Unknown ancode found '${ancode}'`); } return this.reverse_map[ancode]; } resolve(ancodeId) { if (!php.var.isset(ancodeId)) { return null; } if (!php.var.isset(this.ancodes_map[ancodeId])) { throw new Error(`Unknown ancode id found '${ancodeId}'`); } return this.ancodes_map[ancodeId]; } } class AncodesResolverAsIs extends AncodesResolverInterface { resolve(ancodeId) { return ancodeId; } unresolve(ancode) { return ancode; } } class AncodesResolverProxy extends AncodesResolverInterface { static get AncodesResolvers() { return { AncodesResolverToText, AncodesResolverToDialingAncodes, AncodesResolverAsIs, }; } static instantinate(className, args) { const { AncodesResolvers } = AncodesResolverProxy; return new AncodesResolvers[className](...args); // return new (Function.prototype.bind.apply(AncodesResolvers[className], args)); } constructor(className, ctorArgs) { super(); this.className = className; this.args = ctorArgs; this.___obj = null; } unresolve(ancode) { return this.__obj.unresolve(ancode); } resolve(ancodeId) { return this.__obj.resolve(ancodeId); } get __obj() { if (!this.___obj) { this.___obj = AncodesResolverProxy.instantinate(this.className, this.args); delete this.args; delete this.className; } return this.___obj; } set __obj(value) { this.___obj = !_.isUndefined(value) ? value : null; } } // ---------------------------- // WordDescriptor // ---------------------------- /** * @class * @augments Array */ class WordDescriptorCollection extends Array { // https://developer.mozilla.org/ru/docs/Web/JavaScript/Reference/Classes#Species static get [Symbol.species]() { return Array; } /** * @param {*} word * @param {*} annots * @param {MorphierHelper} helper */ constructor(word, annots, helper) { super(); this.word = `${word || ''}`; this.annots = annots === false ? false : helper.decodeAnnot(annots, true); this.helper = helper; if (this.annots !== false) { _.forEach(this.annots, annot => this.push(this.createDescriptor(word, annot, helper))); } } /** * @param {*} word * @param {*} annot * @param {MorphierHelper} helper */ createDescriptor(word, annot, helper) { return new WordDescriptor(word, annot, helper); } getByPartOfSpeech(poses) { poses = castArray(poses); const result = []; _.forEach(this, desc => { if (desc.hasPartOfSpeech(poses)) { result.push(desc); } }); return result; } } // ---------------------------- // Helper // ---------------------------- class MorphierHelper { /** * @param {GramInfoInterface} graminfo * @param {GramTabInterface} gramtab * @param {AncodesResolverInterface} ancodesResolver * @param {*} resolvePartOfSpeech */ constructor(graminfo, gramtab, ancodesResolver, resolvePartOfSpeech) { this.graminfo = graminfo; this.gramtab = gramtab; this.resolve_pos = !!resolvePartOfSpeech; this.ancodes_resolver = ancodesResolver; this.char_size = graminfo.getCharSize(); this.ends = graminfo.getEnds(); this.annot_decoder = null; this.gramtab_consts_included = false; } /** * @param {AnnotDecoderInterface} annotDecoder */ setAnnotDecoder(annotDecoder) { this.annot_decoder = annotDecoder; } // getters getEndOfString() { return this.ends; } getCharSize() { return this.char_size; } hasAnnotDecoder() { return php.var.isset(this.annot_decoder); } getAnnotDecoder() { return this.annot_decoder; } getAncodesResolver() { return this.ancodes_resolver; } getGramInfo() { return this.graminfo; } getGramTab() { return this.gramtab; } isResolvePartOfSpeech() { return this.resolve_pos; } // other resolvePartOfSpeech(posId) { return this.gramtab.resolvePartOfSpeechId(posId); } getGrammems(ancodeId) { return this.gramtab.getGrammems(ancodeId); } getGrammemsAndPartOfSpeech(ancodeId) { return [this.gramtab.getPartOfSpeech(ancodeId), this.gramtab.getGrammems(ancodeId)]; } extractPartOfSpeech(annot) { if (this.resolve_pos) { return this.resolvePartOfSpeech(annot.pos_id); } return annot.pos_id; } includeGramTabConsts() { if (this.isResolvePartOfSpeech()) { this.gramtab.includeConsts(); } this.gramtab_consts_included = true; } // getters getWordDescriptor(word, annots) { if (!this.gramtab_consts_included) { this.includeGramTabConsts(); } return new WordDescriptorCollection(word, annots, this); } getBaseAndPrefix(word, cplen, plen, flen) { const wordBuf = Buffer.from(word); let base; let prefix; if (flen) { base = php.strings.substr(wordBuf, cplen + plen, -flen); } else if (cplen || plen) { base = php.strings.substr(wordBuf, cplen + plen); } else { base = wordBuf; } prefix = cplen ? php.strings.substr(wordBuf, 0, cplen) : ''; base = base.toString(); prefix = prefix.toString(); return [base, prefix]; } getPartOfSpeech(word, annots) { if (annots === false) return false; let result = {}; _.forEach(this.decodeAnnot(annots, false), annot => (result[this.extractPartOfSpeech(annot)] = 1)); result = _.keys(result); result = this.resolve_pos ? result : result.map(_.toInteger); return result; } getBaseForm(word, annots) { if (annots === false) return false; annots = this.decodeAnnot(annots, true); return this.composeBaseForms(word, annots); } getPseudoRoot(word, annots) { if (annots === false) return false; const result = {}; annots = this.decodeAnnot(annots, false); _.forEach(annots, annot => { const base = this.getBaseAndPrefix(word, annot.cplen, annot.plen, annot.flen)[0]; result[base] = 1; }); return php.array.array_keys(result); } getAllForms(word, annots) { if (annots === false) return false; annots = this.decodeAnnot(annots, false); return this.composeForms(word, annots); } /** * @param word * @param annots * @param partOfSpeech * @param grammems * @param {boolean} [returnWords=false] * @param {*} [callback=null] * @returns {*} */ castFormByGramInfo(word, annots, partOfSpeech, grammems, returnWords = false, callback = null) { if (annots === false) return false; /** * @todo: вот сюда данные приходят правильные (как в php), * а выходят не правильные (не как в php) */ grammems = _.toArray(grammems); partOfSpeech = php.var.isset(partOfSpeech) ? `${partOfSpeech}` : null; /** * Проверено: * grammems * partOfSpeech * this.decodeAnnot(annots, false) */ const result = returnWords ? {} : []; _.forEach(this.decodeAnnot(annots, false), annot => { const allAncodes = this.graminfo.readAncodes(annot); const flexias = this.graminfo.readFlexiaData(annot); const commonAncode = annot.common_ancode; const commonGrammems = php.var.isset(commonAncode) ? this.gramtab.getGrammems(commonAncode) : []; const [base, prefix] = this.getBaseAndPrefix(word, annot.cplen, annot.plen, annot.flen); let formNo = 0; let i = 0; /** * Проверено: * allAncodes * flexias * commonAncode * commonGrammems * base * prefix */ _.forEach(allAncodes, formAncodes => { _.forEach(formAncodes, ancode => { const formPos = this.gramtab.getPartOfSpeech(ancode); const formGrammems = php.array.array_merge(this.gramtab.getGrammems(ancode), commonGrammems); const form = [prefix, flexias[i], base, flexias[i + 1]].join(''); if (_.isFunction(callback)) { if (!callback(form, formPos, formGrammems, formNo)) { formNo += 1; return; } } else { if (php.var.isset(partOfSpeech) && formPos !== partOfSpeech) { formNo += 1; return; } if (_.size(php.array.array_diff(grammems, formGrammems)) > 0) { formNo += 1; return; } } if (returnWords) { result[form] = 1; } else { result.push({ form, form_no: formNo, pos: formPos, grammems: formGrammems, }); } formNo += 1; }); i += 2; }); }); return returnWords ? php.array.array_keys(result) : result; } getAncode(annots) { if (annots === false) return false; const result = []; _.forEach(this.decodeAnnot(annots, false), annot => { const allAncodes = this.graminfo.readAncodes(annot); result.push({ common: this.ancodes_resolver.resolve(annot.common_ancode), all: php.array.array_map([this.ancodes_resolver, 'resolve'], allAncodes[annot.form_no]), }); }); return _.uniqWith(result, _.isEqual); } getGrammarInfoMergeForms(annots) { if (annots === false) return false; const result = []; _.forEach(this.decodeAnnot(annots, false), annot => { const allAncodes = this.graminfo.readAncodes(annot); const commonAncode = annot.common_ancode; const formNo = annot.form_no; let grammems = php.var.isset(commonAncode) ? this.gramtab.getGrammems(commonAncode) : []; let formsCount = 0; let ancodeId; _.forEach(allAncodes[formNo], ancode => { ancodeId = ancode; grammems = php.array.array_merge(grammems, this.gramtab.getGrammems(ancode)); formsCount += 1; }); grammems = _.sortedUniq(_.sortBy(grammems, this.resolve_pos ? JSON.stringify : _.toInteger)); result.push({ // todo: незарезолвенный ancodeId // part of speech identical across all joined forms pos: this.gramtab.getPartOfSpeech(ancodeId), grammems, forms_count: formsCount, form_no_low: formNo, form_no_high: formNo + formsCount, }); }); return _.uniqWith(result, _.isEqual); } getGrammarInfo(annots) { if (annots === false) return false; const result = []; _.forEach(this.decodeAnnot(annots, false), annot => { const allAncodes = this.graminfo.readAncodes(annot); const commonAncode = annot.common_ancode; const commonGrammems = php.var.isset(commonAncode) ? this.gramtab.getGrammems(commonAncode) : []; const info = []; const formNo = annot.form_no; _.forEach(allAncodes[formNo], ancode => { let grammems = php.array.array_merge(commonGrammems, this.gramtab.getGrammems(ancode)); grammems = _.sortBy(grammems, this.resolve_pos ? JSON.stringify : _.toInteger); const infoItem = { pos: this.gramtab.getPartOfSpeech(ancode), grammems, form_no: formNo, }; info.push(infoItem); }); const uniqueInfo = _.sortedUniq(_.sortBy(info, JSON.stringify)); result.push(uniqueInfo); }); return _.uniqWith(result, _.isEqual); } /** * @param word * @param annots * @param {string} [resolveType='no_resolve'] * @returns {boolean} */ getAllFormsWithResolvedAncodes(word, annots, resolveType = 'no_resolve') { if (annots === false) return false; annots = this.decodeAnnot(annots, false); return this.composeFormsWithResolvedAncodes(word, annots); } getAllFormsWithAncodes(word, annots, foundFormNo) { if (annots === false) return false; annots = this.decodeAnnot(annots, false); return this.composeFormsWithAncodes(word, annots); } getAllAncodes(word, annots) { if (annots === false) return false; const result = []; _.forEach(annots, annot => result.push(this.graminfo.readAncodes(annot))); return result; } composeBaseForms(word, annots) { const result = {}; _.forEach(annots, annot => { if (annot.form_no > 0) { const baseAndPrefix = this.getBaseAndPrefix(word, annot.cplen, annot.plen, annot.flen); const base = baseAndPrefix[0]; const prefix = baseAndPrefix[1]; const form = [prefix, annot.base_prefix, base, annot.base_suffix].join(''); result[form] = 1; } else { result[word] = 1; } }); return php.array.array_keys(result); } composeForms(word, annots) { const result = {}; _.forEach(annots, annot => { const baseAndPrefix = this.getBaseAndPrefix(word, annot.cplen, annot.plen, annot.flen); const base = baseAndPrefix[0]; const prefix = baseAndPrefix[1]; // read flexia const flexias = this.graminfo.readFlexiaData(annot); let form; for (let i = 0, c = _.size(flexias); i < c; i += 2) { form = [prefix, flexias[i], base, flexias[i + 1]].join(''); result[form] = 1; } }); return php.array.array_keys(result); } composeFormsWithResolvedAncodes(word, annots) { const result = []; _.forEach(annots, (annot, annotIdx) => { const words = []; const ancodes = []; const commonAncode = annot.common_ancode; // read flexia const flexias = this.graminfo.readFlexiaData(annot); const allAncodes = this.graminfo.readAncodes(annot); const baseAndPrefix = this.getBaseAndPrefix(word, annot.cplen, annot.plen, annot.flen); const base = baseAndPrefix[0]; const prefix = baseAndPrefix[1]; let form; let currentAncodes; for (let i = 0, c = _.size(flexias); i < c; i += 2) { form = [prefix, flexias[i], base, flexias[i + 1]].join(''); currentAncodes = allAncodes[i / 2]; _.forEach(currentAncodes, ancode => { words.push(form); ancodes.push(this.ancodes_resolver.resolve(ancode)); }); } result.push({ all: ancodes, forms: words, common: this.ancodes_resolver.resolve(commonAncode), }); }); return result; } composeFormsWithAncodes(word, annots, foundFormNo) { const result = []; _.forEach(annots, (annot, annotIdx) => { const baseAndPrefix = this.getBaseAndPrefix(word, annot.cplen, annot.plen, annot.flen); const base = baseAndPrefix[0]; const prefix = baseAndPrefix[1]; // read flexia const flexias = this.graminfo.readFlexiaData(annot); const ancodes = this.graminfo.readAncodes(annot); const annotFormNo = annot.form_no; let count; let formNo; foundFormNo = !_.isArray(foundFormNo) ? [] : [...foundFormNo]; for (let i = 0, c = _.size(flexias); i < c; i += 2) { formNo = i / 2; word = [prefix, flexias[i], base, flexias[i + 1]].join(''); if (annotFormNo === formNo) { foundFormNo[annotIdx] = _.isPlainObject(foundFormNo[annotIdx]) ? foundFormNo[annotIdx] : {}; count = _.size(result); foundFormNo[annotIdx].low = count; foundFormNo[annotIdx].high = count + _.size(ancodes[formNo]) - 1; } _.forEach(ancodes[formNo], ancode => result.push([word, ancode])); } }); return { foundFormNo, forms: result, }; } decodeAnnot(annotsRaw, withBase) { if (php.var.is_array(annotsRaw)) { return annotsRaw; } return this.annot_decoder.decode(annotsRaw, withBase); } } class WordForm { static compareGrammems(a, b) { return _.size(a) === _.size(b) && _.size(php.array.array_diff(a, b)) === 0; } constructor(word, formNo, posId, grammems) { grammems = _.values(grammems); this.word = `${word}`; this.form_no = parseInt(formNo, 10); this.pos_id = posId; this.grammems = grammems.length > 0 ? _.sortBy(grammems, _.isNumber(grammems[0]) ? _.toInteger : JSON.stringify) : grammems; } getPartOfSpeech() { return this.pos_id; } getGrammems() { return this.grammems; } hasGrammems(grammems) { grammems = !_.isArray(grammems) ? [grammems] : grammems; const grammesCount = _.size(grammems); return grammesCount && _.size(php.array.array_intersect(grammems, this.grammems)) === grammesCount; } getWord() { return this.word; } getFormNo() { return this.form_no; } } class WordDescriptor extends Array { // https://developer.mozilla.org/ru/docs/Web/JavaScript/Reference/Classes#Species static get [Symbol.species]() { return Array; } /** * @param word * @param annot * @param {MorphierHelper} helper */ constructor(word, annot, helper) { super(); this.word = word; this.annot = [annot]; this.helper = helper; this.cached_base = null; this.cached_forms = null; this.found_form_no = null; this.all_forms_readed = false; this.cached_pseudo_root = null; this.common_ancode_grammems = null; this.readAllForms(); } getPseudoRoot() { if (!php.var.isset(this.cached_pseudo_root)) { this.cached_pseudo_root = this.helper.getPseudoRoot(this.word, this.annot)[0]; } return this.cached_pseudo_root; } getBaseForm() { if (!php.var.isset(this.cached_base)) { this.cached_base = this.helper.getBaseForm(this.word, this.annot)[0]; } return this.cached_base; } getAllForms() { if (!php.var.isset(this.cached_forms)) { this.cached_forms = this.helper.getAllForms(this.word, this.annot); } return this.cached_forms; } getWordForm(index) { this.readAllForms(); return this.slice(index, index + 1)[0]; } createWordForm(word, formNo, ancode) { if (!php.var.isset(this.common_ancode_grammems)) { const { common_ancode } = this.annot[0]; this.common_ancode_grammems = php.var.isset(common_ancode) ? this.helper.getGrammems(common_ancode) : {}; } const grammemsAndPartOfSpeech = this.helper.getGrammemsAndPartOfSpeech(ancode); const posId = grammemsAndPartOfSpeech[0]; const allGrammems = grammemsAndPartOfSpeech[1]; return new WordForm(word, formNo, posId, php.array.array_merge(this.common_ancode_grammems, allGrammems)); } readAllForms() { const forms = []; let formNo = 0; let formsWithAncodes; if (!this.all_forms_readed) { formsWithAncodes = this.helper.getAllFormsWithAncodes(this.word, this.annot); _.forEach(formsWithAncodes.forms, form => { forms.push(this.createWordForm(form[0], formNo, form[1])); formNo += 1; }); this.found_form_no = formsWithAncodes.foundFormNo[0]; this.splice(0, this.length); _.forEach(forms, form => this.push(form)); this.all_forms_readed = true; } return this; } getFoundFormNoLow() { this.readAllForms(); return this.found_form_no.low; } getFoundFormNoHigh() { this.readAllForms(); return this.found_form_no.high; } getFoundWordForm() { const result = []; for (let i = this.getFoundFormNoLow(), c = this.getFoundFormNoHigh() + 1; i < c; i++) { result.push(this.getWordForm(i)); } return result; } hasGrammems(grammems) { grammems = castArray(grammems); return _.some(this, wf => wf.hasGrammems(grammems)); } getWordFormsByGrammems(grammems) { grammems = castArray(grammems); const result = []; _.forEach(this, wf => { if (wf.hasGrammems(grammems)) { result.push(wf); } }); return result; // return count(result) ? result : false; } hasPartOfSpeech(poses) { poses = castArray(poses); return _.some(this, wf => { return poses.includes(wf.getPartOfSpeech()); // return poses.includes(wf.getPartOfSpeech()); }); } getWordFormsByPartOfSpeech(poses) { poses = castArray(poses); const result = []; _.forEach(this, wf => { if (poses.includes(wf.getPartOfSpeech())) { // if (poses.includes(wf.getPartOfSpeech())) { result.push(wf); } }); return result; // return count(result) ? result : false; } } // ---------------------------- // Finders // ---------------------------- class MorphierFinderInterface { findWord(word) {} decodeAnnot(raw, withBase) {} getAnnotDecoder() {} } class MorphierFinderBase extends MorphierFinderInterface { /** * @param {AnnotDecoderInterface} annotDecoder */ constructor(annotDecoder) { super(); this.annot_decoder = annotDecoder; this.prev_word = null; this.prev_result = false; } findWord(word) { if (this.prev_word === word) { return this.prev_result; } const result = this.doFindWord(word); this.prev_word = word; this.prev_result = result; return result; } getAnnotDecoder() { return this.annot_decoder; } decodeAnnot(raw, withBase) { return this.annot_decoder.decode(raw, withBase); } doFindWord(word) {} } class MorphierFinderCommon extends MorphierFinderBase { /** * @param {FsaInterface} fsa * @param {AnnotDecoderInterface} annotDecoder */ constructor(fsa, annotDecoder) { super(annotDecoder); this.fsa = fsa; this.root = this.fsa.getRootTrans(); } getFsa() { return this.fsa; } doFindWord(word) { const result = this.fsa.walk(this.root, word); if (!result.result || result.annot === null) { return false; } return result.annot; } } class MorphierFinderPredictSuffix extends MorphierFinderCommon { /** * @param {FsaInterface} fsa * @param {AnnotDecoderInterface} annotDecoder * @param {string} encoding * @param {number} [minimalSuffixLength=4] */ constructor(fsa, annotDecoder, encoding, minimalSuffixLength = 4) { super(fsa, annotDecoder); this.min_suf_len = minimalSuffixLength; this.unicode = UnicodeHelper.create(encoding); } doFindWord(word) { const wordLen = this.unicode.strlen(word); let result; if (!wordLen) { return false; } let i = 1; const c = wordLen - this.min_suf_len; for (; i < c; i++) { word = php.strings.substr(word, this.unicode.firstCharSize(word)); result = super.doFindWord(word); if (result !== false) { break; } } if (i < c) { return result; } return false; } fixAnnots(annots, len) { _.forEach(annots, annot => (annot.cplen = len)); return annots; } } class MorphierPredictCollector extends FsaWordsCollector { /** * @param {*} limit * @param {AnnotDecoderInterface} annotDecoder */ constructor(limit, annotDecoder) { super(limit); this.collected = 0; this.used_poses = {}; this.annot_decoder = annotDecoder; } collect(path, annotRaw) { if (this.collected > this.limit) { return false; } const annots = this.decodeAnnot(annotRaw); let posId; let resultIdx; let nextItemsIndex; let itemsSize; _.forEach(annots, annot => { annot.cplen = annot.plen = 0; posId = annot.pos_id; if (php.var.isset(this.used_poses[posId])) { resultIdx = this.used_poses[posId]; if (annot.freq > this.items[resultIdx].freq) { this.items[resultIdx] = annot; } } else { itemsSize = _.size(this.items); this.used_poses[posId] = itemsSize; // оригинал: // $this->items[] = annot; nextItemsIndex = itemsSize ? _.max(_.keys(this.items)) : -1; this.items[parseInt(nextItemsIndex, 10) + 1] = annot; } }); this.collected += 1; return true; } clear() { super.clear(); this.collected = 0; this.used_poses = {}; } decodeAnnot(annotRaw) { return this.annot_decoder.decode(annotRaw, true); } } class MorphierFinderPredictDatabase extends MorphierFinderCommon { /** * @param {FsaInterface} fsa * @param {AnnotDecoderInterface} annotDecoder * @param {string} encoding * @param {GramInfoInterface} graminfo * @param {number} [minPostfixMatch=2] * @param {number} [collectLimit=32] */ constructor(fsa, annotDecoder, encoding, graminfo, minPostfixMatch = 2, collectLimit = 32) { super(fsa, annotDecoder); this.graminfo = graminfo; this.min_postfix_match = minPostfixMatch; this.collector = this.createCollector(collectLimit, this.getAnnotDecoder()); this.unicode = UnicodeHelper.create(encoding); } createAnnotDecoder() { // todo: какая-то херня // return phpmorphy_annot_decoder_new('predict'); return AnnotDecoderFactory.create('predict'); } doFindWord(word) { word = toBuffer(word); const revWord = this.unicode.strrev(word); const result = this.fsa.walk(this.root, revWord); let annots; let matchLen; if (result.result && result.annot !== null) { annots = result.annot; } else { matchLen = this.unicode.strlen(this.unicode.fixTrailing(revWord.slice(0, result.walked))); annots = this.determineAnnots(result.last_trans, matchLen); if (annots === null) { return false; } } if (!php.var.is_array(annots)) { annots = this.collector.decodeAnnot(annots); } return this.fixAnnots(word, annots); } determineAnnots(trans, matchLen) { let annots = this.fsa.getAnnot(trans); if (annots == null && matchLen >= this.min_postfix_match) { this.collector.clear(); this.fsa.collect(trans, this.collector.getCallback()); annots = this.collector.getItems(); } return annots; } fixAnnots(word, annots) { word = toBuffer(word); const result = []; let flexias; let prefix; let suffix; let plen; let slen; // remove all prefixes? _.forEach(annots, annot => { annot.cplen = annot.plen = 0; flexias = this.graminfo.readFlexiaData(annot, false); prefix = Buffer.from(flexias[annot.form_no * 2]); suffix = Buffer.from(flexias[annot.form_no * 2 + 1]); plen = prefix.length; slen = suffix.length; const partOfWordInPlaceOfPrefix = php.strings.substr(word, 0, plen); const partOfWordInPlaceOfSuffix = php.strings.substr(word, -slen); if ( (!plen || (partOfWordInPlaceOfPrefix && partOfWordInPlaceOfPrefix.equals(prefix))) && (!slen || (partOfWordInPlaceOfSuffix && partOfWordInPlaceOfSuffix.equals(suffix))) ) { result.push(annot); } }); return _.size(result) ? result : false; } createCollector(limit) { return new MorphierPredictCollector(limit, this.getAnnotDecoder()); } } // ---------------------------- // Morphiers // ---------------------------- class MorphierBase extends MorphierInterface { /** * @param {MorphierFinderInterface} finder * @param {MorphierHelper} helper */ constructor(finder, helper) { super(); this.finder = finder; this.helper = _.cloneDeep(helper); this.helper.setAnnotDecoder(finder.getAnnotDecoder()); } /** * @return MorphierFinderInterface */ getFinder() { return this.finder; } /** * @return MorphierHelper */ getHelper() { return this.helper; } getAnnot(word) { const annots = this.finder.findWord(word); if (!annots) return false; return this.helper.decodeAnnot(annots, true); } getWordDescriptor(word) { const annots = this.finder.findWord(word); if (!annots) return false; return this.helper.getWordDescriptor(word, annots); } getAllFormsWithAncodes(word) { const annots = this.finder.findWord(word); if (!annots) return false; return this.helper.getAllFormsWithResolvedAncodes(word, annots); } getPartOfSpeech(word) { const annots = this.finder.findWord(word); if (!annots) return false; return this.helper.getPartOfSpeech(word, annots); } getBaseForm(word) { const annots = this.finder.findWord(word); if (!annots) return false; return this.helper.getBaseForm(word, annots); } getPseudoRoot(word) { const annots = this.finder.findWord(word); if (annots === false) { return false; } return this.helper.getPseudoRoot(word, annots); } getAllForms(word) { const annots = this.finder.findWord(word); if (!annots) return false; return this.helper.getAllForms(word, annots); } getAncode(word) { const annots = this.finder.findWord(word); if (!annots) return false; return this.helper.getAncode(annots); } getGrammarInfo(word) { const annots = this.finder.findWord(word); if (!annots) return false; return this.helper.getGrammarInfo(annots); } getGrammarInfoMergeForms(word) { const annots = this.finder.findWord(word); if (!annots) return false; return this.helper.getGrammarInfoMergeForms(annots); } /** * @param word * @param partOfSpeech * @param grammems * @param {boolean} [returnOnlyWord=false] * @param {*} [callback=null] * @returns {array|boolean} */ castFormByGramInfo(word, partOfSpeech, grammems, returnOnlyWord = false, callback = null) { const annots = this.finder.findWord(word); if (!annots) return false; return this.helper.castFormByGramInfo(word, annots); } /** * @param word * @param patternWord * @param {boolean} [returnOnlyWord=false] * @param {*} [callback=null] * @returns {array|boolean} */ castFormByPattern(word, patternWord, returnOnlyWord = false, callback = null) { const origAnnots = this.finder.findWord(word); if (!origAnnots) return false; const patternAnnots = this.finder.findWord(patternWord); if (!patternAnnots) return false; return this.helper.castFormByPattern(word, origAnnots, patternWord, patternAnnots, returnOnlyWord, callback); } } class MorphierCommon extends MorphierBase { /** * @param {MorphierHelper} helper * @returns {AnnotDecoderInterface} */ static createAnnotDecoder(helper) { return AnnotDecoderFactory.create(helper.getGramInfo().getEnds()).getCommonDecoder(); } /** * @param {FsaInterface} fsa * @param {MorphierHelper} helper */ constructor(fsa, helper) { super(new MorphierFinderCommon(fsa, MorphierCommon.createAnnotDecoder(helper)), helper); } } class MorphierPredictSuffix extends MorphierBase { /** * @param {MorphierHelper} helper * @returns {AnnotDecoderInterface} */ static createAnnotDecoder(helper) { return AnnotDecoderFactory.create(helper.getGramInfo().getEnds()).getCommonDecoder(); } /** * @param {FsaInterface} fsa * @param {MorphierHelper} helper */ constructor(fsa, helper) { super( new MorphierFinderPredictSuffix( fsa, MorphierPredictSuffix.createAnnotDecoder(helper), helper.getGramInfo().getEncoding(), 4, ), helper, ); } } class MorphierPredictDatabase extends MorphierBase { /** * @param {MorphierHelper} helper * @returns {AnnotDecoderInterface} */ static createAnnotDecoder(helper) { return AnnotDecoderFactory.create(helper.getGramInfo().getEnds()).getPredictDecoder(); } /** * @param {FsaInterface} fsa * @param {MorphierHelper} helper */ constructor(fsa, helper) { super( new MorphierFinderPredictDatabase( fsa, MorphierPredictDatabase.createAnnotDecoder(helper), helper.getGramInfo().getEncoding(), helper.getGramInfo(), 2, 32, ), helper, ); } } class MorphierBulk extends MorphierInterface { /** * @param {FsaInterface} fsa * @param {MorphierHelper} helper */ constructor(fsa, helper) { super(); this.fsa = fsa; this.root_trans = fsa.getRootTrans(); this.helper = _.cloneDeep(helper); this.helper.setAnnotDecoder(this.createAnnotDecoder(helper)); this.graminfo = helper.getGramInfo(); this.notfound = []; } getFsa() { return this.fsa; } getHelper() { return this.helper; } getGraminfo() { return this.graminfo; } getNotFoundWords() { return this.notfound; } /** * @param {MorphierHelper} helper * @returns {*} */ createAnnotDecoder(helper) { return new AnnotDecoderCommon(helper.getGramInfo().getEnds()); } getAnnot(word) { const result = {}; _.forEach(this.findWord(word), item => { const words = item.data; let annot = item.annots; annot = this.helper.decodeAnnot(annot, true); _.forEach(words, word => { result[word] = result[word] || []; result[word].push(annot); }); }); return result; } getBaseForm(words) { const annots = this.findWord(words); return this.composeForms(annots, true, false, false); } getAllForms(words) { const annots = this.findWord(words); return this.composeForms(annots, false, false, false); } getPseudoRoot(words) { const annots = this.findWord(words); return this.composeForms(annots, false, true, false); } getPartOfSpeech(words) { const annots = this.findWord(words); return this.composeForms(annots, false, false, true); } /** * @param words * @param method * @param {boolean} [callWithWord=false] * @returns {*} */ processAnnotsWithHelper(words, method, callWithWord = false) { const result = {}; _.forEach(this.findWord(words), item => { words = item.data; const annotRaw = item.annots; if (annotRaw.length === 0) return; if (callWithWord) { _.forEach(words, word => (result[word] = this.helper[method](word, annotRaw))); } else { const resultForAnnot = this.helper[method](annotRaw); _.forEach(words, word => (result[word] = resultForAnnot)); } }); return result; } getAncode(words) { return this.processAnnotsWithHelper(words, 'getAncode'); } getGrammarInfoMergeForms(words) { return this.processAnnotsWithHelper(words, 'getGrammarInfoMergeForms'); } getGrammarInfo(words) { return this.processAnnotsWithHelper(words, 'getGrammarInfo'); } getAllFormsWithAncodes(words) { return this.processAnnotsWithHelper(words, 'getAllFormsWithResolvedAncodes', true); } getWordDescriptor(words) { return this.processAnnotsWithHelper(words, 'getWordDescriptor', true); } findWord(words) { this.notfound = []; const patriciaTrie = this.buildPatriciaTrie(words); const labels = patriciaTrie[0]; const finals = patriciaTrie[1]; const dests = patriciaTrie[2]; const annots = {}; const stack = [0, Buffer.from(''), this.root_trans]; const fsa = this.fsa; let n; let path; let trans; let label; let result; let isFinal; let stackIdx = 0; // TODO: Improve this while (stackIdx >= 0) { n = stack[stackIdx]; path = Buffer.concat([Buffer.from(stack[stackIdx + 1]), labels[n]]); trans = stack[stackIdx + 2]; stackIdx -= 3; // TODO: Remove items from stack? (performance!!!) isFinal = finals[n] > 0; // isFinal = dests[n] === false; result = false; if (trans !== false && n > 0) { label = labels[n]; result = fsa.walk(trans, label, isFinal); if (label.length === result.walked) { trans = result.word_trans; } else { trans = false; } } if (isFinal) { if (trans !== false && php.var.isset(result.annot)) { annots[result.annot] = annots[result.annot] || { annots: result.annot, data: [], }; annots[result.annot].data.push(path); } else { this.notfound.push(path); } } if (dests[n] !== false) { _.forEach(dests[n], dest => { stackIdx += 3; stack[stackIdx] = dest; stack[stackIdx + 1] = path; stack[stackIdx + 2] = trans; }); } } return annots; } composeForms(annotsRaw, onlyBase, pseudoRoot, partOfSpeech) { const result = {}; let annotRaw; // process found annotations _.forEach(annotsRaw, item => { const words = item.data; annotRaw = item.annots; if (annotRaw.length === 0) return; _.forEach(this.helper.decodeAnnot(annotRaw, onlyBase), annot => { let flexias; let posId; if (!(onlyBase || pseudoRoot)) { flexias = this.graminfo.readFlexiaData(annot); } const cplen = annot.cplen; const plen = annot.plen; const flen = annot.flen; if (partOfSpeech) { posId = this.helper.extractPartOfSpeech(annot); } _.forEach(words, word => { let base; let form; if (flen) { base = php.strings.substr(word, cplen + plen, -flen); } else if (cplen || plen) { base = php.strings.substr(word, cplen + plen); } else { base = word; } const prefix = cplen ? php.strings.substr(word, 0, cplen) : ''; result[word] = result[word] || {}; if (pseudoRoot) { result[word][base] = 1; } else if (onlyBase) { form = [prefix, annot.base_prefix, base, annot.base_suffix].join(''); result[word][form] = 1; } else if (partOfSpeech) { result[word][posId] = 1; } else { for (let i = 0, c = _.size(flexias); i < c; i += 2) { form = [prefix, flexias[i], base, flexias[i + 1]].join(''); result[word][form] = 1; } } }); }); }); _.keys(result).forEach(key => { result[key] = _.keys(result[key]); if (result[key].length > 0 && isStringifyedNumber(result[key][0])) { result[key] = result[key].map(_.toInteger); } }); return result; } buildPatriciaTrie(words) { if (!php.var.is_array(words)) { throw new Error('Words must be array'); } // words = php.array.sort(words); words = words.length > 0 && Buffer.isBuffer(words[0]) ? words.sort(Buffer.compare) : words.sort(); let stack = []; let prevWordBuf = Buffer.alloc(0); let prevWordLen = 0; let prevLcp = 0; let node = 0; const stateLabels = []; const stateDests = []; const stateFinals = [0]; stateLabels.push(Buffer.from('')); stateDests.push([]); _.forEach(words, word => { const wordBuf = Buffer.from(word, 'utf8'); if (wordBuf.equals(prevWordBuf)) { return; } const wordLen = wordBuf.length; let newStateId; let needSplit; let trimSize; let nodeKey; let newNodeId1; let newNodeId2; let newNodeId; // find longest common prefix let lcp = 0; const c = Math.min(prevWordLen, wordLen); // for (; lcp < c && wordBuf[lcp] == prevWordBuf[lcp]; lcp++) {} while (lcp < c && wordBuf[lcp] === prevWordBuf[lcp]) { lcp += 1; } if (lcp === 0) { stack = []; newStateId = _.size(stateLabels); stateLabels.push(wordBuf); stateFinals.push(1); stateDests.push(false); stateDests[0].push(newStateId); node = newStateId; } else { needSplit = true; trimSize = 0; // for split if (lcp === prevLcp) { needSplit = false; node = stack[_.size(stack) - 1]; } else if (lcp > prevLcp) { if (lcp === prevWordLen) { needSplit = false; } else { needSplit = true; trimSize = lcp - prevLcp; } stack.push(node); } else { trimSize = prevWordBuf.length - lcp; let stackSize = _.size(stack) - 1; for (; ; --stackSize) { trimSize -= stateLabels[node].length; if (trimSize <= 0) { break; } if (_.size(stack) < 1) { throw new Error('Infinite loop possible'); } node = stack.pop(); } needSplit = trimSize < 0; trimSize = Math.abs(trimSize); if (needSplit) { stack.push(node); } else { node = stack[stackSize]; } } let nodeKeyBuf; if (needSplit) { nodeKey = stateLabels[node]; nodeKeyBuf = Buffer.from(nodeKey); // split newNodeId1 = _.size(stateLabels); newNodeId2 = newNodeId1 + 1; // new_node_1 stateLabels.push(php.strings.substr(nodeKeyBuf, trimSize)); stateFinals.push(stateFinals[node]); stateDests.push(stateDests[node]); // adjust old node stateLabels[node] = php.strings.substr(nodeKeyBuf, 0, trimSize); stateFinals[node] = 0; stateDests[node] = [newNodeId1]; // append new node, new_node_2 stateLabels.push(php.strings.substr(wordBuf, lcp)); stateFinals.push(1); stateDests.push(false); stateDests[node].push(newNodeId2); node = newNodeId2; } else { newNodeId = _.size(stateLabels); stateLabels.push(php.strings.substr(wordBuf, lcp)); stateFinals.push(1); stateDests.push(false); if (stateDests[node] !== false) { stateDests[node].push(newNodeId); } else { stateDests[node] = [newNodeId]; } node = newNodeId; } } prevWordLen = wordLen; prevWordBuf = wordBuf; prevLcp = lcp; }); return [stateLabels, stateFinals.join(''), stateDests]; } } export { MorphierInterface, MorphierEmpty, AnnotDecoderInterface, AnnotDecoderBase, AnnotDecoderCommon, AnnotDecoderPredict, AnnotDecoderFactory, AncodesResolverInterface, AncodesResolverToText, AncodesResolverToDialingAncodes, AncodesResolverAsIs, AncodesResolverProxy, WordDescriptorCollection, MorphierHelper, WordForm, WordDescriptor, MorphierFinderInterface, MorphierFinderBase, MorphierFinderCommon, MorphierFinderPredictSuffix, MorphierPredictCollector, MorphierFinderPredictDatabase, MorphierBase, MorphierCommon, MorphierPredictSuffix, MorphierPredictDatabase, MorphierBulk, };