UNPKG

hunspell-reader

Version:
272 lines 9.89 kB
import * as util from 'node:util';
import * as GS from 'gensequence';
import { affFlag, flagToLongStringMap, flagToStringMap } from './affConstants.js';
import { Converter } from './converter.js';
import { filterOrderedList, isDefined } from './util.js';

/** Flip to `true` to dump every intermediate AffWord to the console (see `logAffWord`). */
const log = false;
/** Default recursion limit used by `Aff` when applying affixes to a word. */
const DefaultMaxDepth = 5;
/** Matches a rule string made only of decimal digits. */
const regExpIsNumber = /^\d+$/;

/**
 * Applies the rules described by a parsed `.aff` file (`affInfo`) to
 * dictionary entries, producing the derived word forms.
 */
export class Aff {
    affInfo;
    filename;
    rules;
    _oConv;
    _iConv;
    _maxSuffixDepth = DefaultMaxDepth;
    /** Cache of rule string -> list of separated rule keys (see `separateRules`). */
    _mapRules = new Map();

    /**
     * @param affInfo - the parsed aff data; `ICONV` / `OCONV` default to empty lists when absent.
     * @param filename - stored as-is on the instance.
     */
    constructor(affInfo, filename) {
        this.affInfo = affInfo;
        this.filename = filename;
        this.rules = processRules(affInfo);
        this._iConv = new Converter(affInfo.ICONV || []);
        this._oConv = new Converter(affInfo.OCONV || []);
    }

    get maxSuffixDepth() {
        return this._maxSuffixDepth;
    }

    set maxSuffixDepth(value) {
        this._maxSuffixDepth = value;
    }

    /**
     * Takes a line from a hunspell.dic file and applies the rules found in the aff file.
     * For performance reasons, only the `word` field is mapped with OCONV.
     * @param {string} line - the line from the .dic file.
     * @param {number} [maxDepth] - recursion limit; falls back to `this.maxSuffixDepth` when nullish.
     * @returns the generated AffWords, sorted with `compareAff` and with adjacent duplicates removed.
     */
    applyRulesToDicEntry(line, maxDepth) {
        const maxSuffixDepth = maxDepth ?? this.maxSuffixDepth;
        // Only the first whitespace-delimited token is used; it has the form `word[/rules]`.
        const [lineLeft] = line.split(/\s+/, 1);
        const [word, rules = ''] = lineLeft.split('/', 2);
        const results = this.applyRulesToWord(asAffWord(word, rules), maxSuffixDepth);
        for (const affWord of results) {
            // Call through the instance so `convert` keeps its receiver even if it is a
            // regular prototype method rather than a bound property.
            affWord.word = this._oConv.convert(affWord.word);
        }
        results.sort(compareAff);
        return results.filter(filterAff());
    }

    /**
     * Expands `affWord` by applying the rules it references, recursing through
     * `applyAffixesToWord` until `remainingDepth` is used up.
     * Results still flagged `isNeedAffix` are dropped.
     * @internal
     */
    applyRulesToWord(affWord, remainingDepth) {
        const compoundMin = this.affInfo.COMPOUNDMIN ?? 3;
        const { word, base, suffix, prefix, dic } = affWord;
        const allRules = this.getMatchingRules(affWord.rules);
        // Rules carrying `flags` are folded into the word's flags ...
        const { rulesApplied, flags } = reduceAffixRules(affWord, allRules);
        // ... the remaining rule ids are re-joined and passed on to the affix step.
        const rules = this.joinRules(allRules.filter((rule) => !rule.flags).map((rule) => rule.id));
        const affixRules = allRules.map((rule) => rule.sfx || rule.pfx).filter(isDefined);
        const wordWithFlags = { word, flags, rulesApplied, rules: '', base, suffix, prefix, dic };
        return [wordWithFlags, ...this.applyAffixesToWord(affixRules, { ...wordWithFlags, rules }, remainingDepth)]
            .filter(({ flags }) => !flags.isNeedAffix)
            .map((affWord) => adjustCompounding(affWord, compoundMin))
            .map((affWord) => logAffWord(affWord, 'applyRulesToWord'));
    }

    /**
     * Applies every affix in `affixRules` to `affWord`, then recursively applies the
     * rules of each generated word with one less level of depth.
     * @returns an empty array once `remainingDepth` is zero or negative.
     */
    applyAffixesToWord(affixRules, affWord, remainingDepth) {
        if (remainingDepth <= 0) {
            return [];
        }
        const combinableRules = affixRules
            .filter((rule) => rule.type === 'SFX')
            .filter((rule) => rule.combinable === true)
            .map(({ id }) => id);
        const combinableSfx = this.joinRules(combinableRules);
        const r = affixRules
            .flatMap((affix) => this.applyAffixToWord(affix, affWord, combinableSfx))
            .flatMap((affWord) => this.applyRulesToWord(affWord, remainingDepth - 1));
        return r;
    }

    /**
     * Applies a single affix to `affWord`.
     * @param affix - the affix rule to apply.
     * @param affWord - the word being extended.
     * @param combinableSfx - joined ids of the combinable SFX rules; only handed to the
     *   generated words when `affix` is a combinable PFX.
     */
    applyAffixToWord(affix, affWord, combinableSfx) {
        const { word } = affWord;
        const combineRules = affix.type === 'PFX' && affix.combinable && !!combinableSfx ? combinableSfx : '';
        // Once an affix is applied, the need-affix requirement no longer holds.
        const flags = affWord.flags.isNeedAffix ? removeNeedAffix(affWord.flags) : affWord.flags;
        const matchingSubstitutions = affix.substitutionsForRegExps.filter((sub) => sub.match.test(word));
        const partialAffWord = { ...affWord, flags, rules: combineRules };
        return matchingSubstitutions
            .flatMap((sub) => this.#applySubstitution(affix, partialAffWord, sub))
            .map((affWord) => logAffWord(affWord, 'applyAffixToWord'));
    }

    /**
     * Builds the AffWord that results from attaching `sub.attach` to `stripped`.
     * `p` is the length of the prefix part and `s` the index where the suffix part
     * starts, so that `word === prefix + base + suffix`.
     */
    #substituteAttach(affix, affWord, sub, stripped) {
        const { word: origWord, rulesApplied, flags, dic } = affWord;
        const rules = affWord.rules + (sub.attachRules || '');
        let word;
        let p = affWord.prefix.length;
        let s = origWord.length - affWord.suffix.length;
        if (sub.type === 'S') {
            // Attach at the end (presumably 'S' marks a suffix substitution).
            word = stripped + sub.attach;
            s = Math.min(stripped.length, s);
            p = Math.min(p, s);
        } else {
            // Attach at the front; shift the suffix boundary by the change in length.
            word = sub.attach + stripped;
            const d = word.length - origWord.length;
            p = Math.max(p, word.length - stripped.length);
            s = Math.max(s + d, p);
        }
        const base = word.slice(p, s);
        const prefix = word.slice(0, p);
        const suffix = word.slice(s);
        return {
            word,
            rulesApplied: rulesApplied + ' ' + affix.id,
            rules,
            flags,
            base,
            suffix,
            prefix,
            dic,
        };
    }

    /**
     * For each "remove" pattern in `subs` that matches the word, strips the match and
     * generates one AffWord per substitution grouped under that pattern.
     */
    #applySubstitution(affix, affWord, subs) {
        const results = [];
        for (const [replace, substitutions] of subs.substitutionsGroupedByRemove) {
            if (!replace.test(affWord.word))
                continue;
            const stripped = affWord.word.replace(replace, '');
            for (const sub of substitutions) {
                results.push(this.#substituteAttach(affix, affWord, sub, stripped));
            }
        }
        return results;
    }

    /**
     * Resolves a rule string to the known rule objects.
     * A rule string made only of digits is first tried as an index into `affInfo.AF`;
     * when that entry is missing or empty, the string itself is used.
     * Unknown rule keys are silently dropped.
     */
    getMatchingRules(rules) {
        const { AF = [] } = this.affInfo;
        const idx = regExpIsNumber.test(rules) ? Number.parseInt(rules, 10) : -1;
        const rulesToSplit = AF[idx] || rules;
        return this.separateRules(rulesToSplit)
            .map((key) => this.rules.get(key))
            .filter(isDefined);
    }

    /**
     * Joins rule ids back into a rule string: comma separated when `affInfo.FLAG`
     * is `'num'`, otherwise concatenated.
     */
    joinRules(rules) {
        return this.affInfo.FLAG === 'num' ? rules.join(',') : rules.join('');
    }

    /**
     * Splits a rule string into its unique rule keys, caching the result per string.
     */
    separateRules(rules) {
        const found = this._mapRules.get(rules);
        if (found)
            return found;
        const split = this.#separateRules(rules);
        this._mapRules.set(rules, split);
        return split;
    }

    /**
     * Uncached split of a rule string according to `affInfo.FLAG`:
     * `'long'` -> two characters per key, `'num'` -> comma separated,
     * anything else -> one key per element yielded by iterating the string.
     */
    #separateRules(rules) {
        switch (this.affInfo.FLAG) {
            case 'long': {
                // Mark the end of every 2-character pair, split on the marker and drop the
                // last piece (either empty or an unpaired trailing character).
                return [...new Set(rules.replaceAll(/(..)/g, '$1//').split('//').slice(0, -1))];
            }
            case 'num': {
                return [...new Set(rules.split(','))];
            }
        }
        return [...new Set(rules)];
    }

    get iConv() {
        return this._iConv;
    }

    get oConv() {
        return this._oConv;
    }
}

/**
 * Builds a key of the form `word|flags`, where the flags part is the sorted
 * concatenation of `flagToStringMap` entries for every truthy flag.
 */
function signature(aff) {
    const { word, flags } = aff;
    const sig = Object.entries(flags)
        .filter((e) => !!e[1])
        .map((f) => flagToStringMap[f[0]])
        .sort()
        .join('');
    return word + '|' + sig;
}

/**
 * Collects the SFX rules, PFX rules and flag rules of `affInfo` into one Map keyed
 * by rule id. On duplicate ids the later rule wins (order: SFX, PFX, flags).
 */
export function processRules(affInfo) {
    const sfxRules = [...(affInfo.SFX || [])]
        .map(([, sfx]) => sfx)
        .map((sfx) => ({ id: sfx.id, type: 'sfx', sfx }));
    const pfxRules = [...(affInfo.PFX || [])]
        .map(([, pfx]) => pfx)
        .map((pfx) => ({ id: pfx.id, type: 'pfx', pfx }));
    // Every truthy affInfo entry whose key is listed in `affFlag` becomes a flag rule
    // identified by the entry's value.
    const flagRules = GS.sequenceFromObject(affInfo)
        .filter(([key, value]) => !!affFlag[key] && !!value)
        .map(([key, value]) => ({ id: value, type: 'flag', flags: affFlag[key] }))
        .toArray();
    return new Map([...sfxRules, ...pfxRules, ...flagRules].map((rule) => [rule.id, rule]));
}

/**
 * Debug helper: prints `affWord` prefixed by `message` when the module-level `log`
 * switch is on. Always returns `affWord` unchanged so it can sit in a `.map()` chain.
 */
export function logAffWord(affWord, message) {
    /* istanbul ignore if */
    if (log) {
        const dump = util.inspect(affWord, { showHidden: false, depth: 5, colors: true });
        console.log(`${message}: ${dump}`);
    }
    return affWord;
}

/**
 * Renders `affWord` on a single line using `util.inspect` with colors, with the
 * flags shown in their `flagsToString` form.
 */
/* istanbul ignore next */
export function affWordToColoredString(affWord) {
    return util
        .inspect({ ...affWord, flags: flagsToString(affWord.flags) }, { showHidden: false, depth: 5, colors: true })
        .replaceAll(/(\s|\n|\r)+/g, ' ');
}

/**
 * Returns the `flagToLongStringMap` names of all truthy flags, sorted and joined with `:`.
 */
/* istanbul ignore next */
export function flagsToString(flags) {
    return [...Object.entries(flags)]
        .filter(([, v]) => !!v)
        .map(([k]) => flagToLongStringMap[k])
        .sort()
        .join(':');
}

/**
 * Creates the initial AffWord for a dictionary word: the whole word is the base,
 * prefix and suffix are empty, and `dic` is `word/rules` (or just `word` without rules).
 */
export function asAffWord(word, rules = '', flags = {}) {
    return {
        word,
        base: word,
        prefix: '',
        suffix: '',
        rulesApplied: '',
        rules,
        flags,
        dic: rules ? word + '/' + rules : word,
    };
}

/**
 * Sort comparator for AffWords: orders by `word`, then by `signature`.
 */
export function compareAff(a, b) {
    if (a.word !== b.word) {
        return a.word < b.word ? -1 : 1;
    }
    const sigA = signature(a);
    const sigB = signature(b);
    return sigA < sigB ? -1 : sigA > sigB ? 1 : 0;
}

/**
 * Folds every rule that carries `flags` into the word's accumulated state:
 * the rule id is appended to `rulesApplied` and its flags are merged over the existing ones.
 */
function reduceAffixRules(affWord, allRules) {
    return allRules
        .filter((rule) => !!rule.flags)
        .reduce((acc, rule) => ({
            rulesApplied: [acc.rulesApplied, rule.id].join(' '),
            flags: { ...acc.flags, ...rule.flags },
        }), { rulesApplied: affWord.rulesApplied, flags: affWord.flags });
}

/**
 * Returns a filter function that will filter adjacent AffWords
 * It compares the word and the flags.
 */
export function filterAff() {
    return filterOrderedList((a, b) => a.word !== b.word || signature(a) !== signature(b));
}

export const debug = {
    signature,
};

/** Returns a copy of `flags` without the `isNeedAffix` entry. */
function removeNeedAffix(flags) {
    const newFlags = { ...flags };
    delete newFlags.isNeedAffix;
    return newFlags;
}

/**
 * Drops the `isCompoundPermitted` flag from words shorter than `minLength`.
 * Note: replaces `affWord.flags` on the object passed in and returns that same object.
 */
function adjustCompounding(affWord, minLength) {
    if (!affWord.flags.isCompoundPermitted || affWord.word.length >= minLength) {
        return affWord;
    }
    const { isCompoundPermitted: _, ...flags } = affWord.flags;
    affWord.flags = flags;
    return affWord;
}
//# sourceMappingURL=affLegacy.js.map