UNPKG

hunspell-reader

Version:

A library for reading Hunspell Dictionary Files

github.com/streetsidesoftware/cspell/tree/main/packages/hunspell-reader

streetsidesoftware/cspell

442 lines • 16.9 kB

JavaScript

import assert from 'node:assert';
import { affFlag } from './affConstants.js';
import { Converter } from './converter.js';
import { filterOrderedList, groupByField, isDefined } from './util.js';

/** Flip to `true` to have `logError` write diagnostics to stderr. */
const debug = false;

/**
 * Write a diagnostic message to stderr, but only when `debug` is enabled.
 * @param {string} msg - a `console.error` format string.
 * @param {...unknown} args - values substituted into the format string.
 */
function logError(msg, ...args) {
    if (debug) {
        console.error(msg, ...args);
    }
}

/** Default value of `Aff#maxSuffixDepth`. */
const DefaultMaxDepth = 5;

/**
 * Applies the affix rules described by a parsed `.aff` file to dictionary entries.
 */
export class Aff {
    affInfo;
    affData;
    _oConv;
    _iConv;
    _maxSuffixDepth = DefaultMaxDepth;

    /**
     * @param affInfo - the parsed content of the aff file. `ICONV` / `OCONV` are optional.
     * @param filename - passed through to `AffData`, which uses it in diagnostics only.
     */
    constructor(affInfo, filename) {
        this.affInfo = affInfo;
        this.affData = new AffData(affInfo, filename);
        this._iConv = new Converter(affInfo.ICONV || []);
        this._oConv = new Converter(affInfo.OCONV || []);
    }

    /** The recursion depth used by `applyRulesToDicEntry` when no explicit depth is given. */
    get maxSuffixDepth() {
        return this._maxSuffixDepth;
    }

    set maxSuffixDepth(value) {
        this._maxSuffixDepth = value;
    }

    /**
     * Takes a line from a hunspell.dic file and applies the rules found in the aff file.
     * For performance reasons, only the `word` field is mapped with OCONV.
     * @param {string} line - the line from the .dic file.
     * @param {number} [maxDepth] - overrides `maxSuffixDepth` when not null / undefined.
     * @returns the generated words, sorted with `compareAff`, with adjacent entries that
     *   share both `word` and `flags` collapsed. `originalWord` holds the word before OCONV.
     */
    applyRulesToDicEntry(line, maxDepth) {
        const afWord = this.affData.dictLineToAffixWord(line);
        const maxSuffixDepth = maxDepth ?? this.maxSuffixDepth;
        // Call `convert` through the converter so this does not depend on it being pre-bound.
        const oConv = this._oConv;
        const results = this.applyRulesToWord(afWord, maxSuffixDepth).map((affWord) => ({
            ...affWord,
            word: oConv.convert(affWord.word),
            originalWord: affWord.word,
        }));
        results.sort(compareAff);
        return results.filter(filterAff());
    }

    /**
     * Expand a word into itself plus every affixed form reachable within `remainingDepth`,
     * drop the forms still flagged `isNeedAffix`, and apply the COMPOUNDMIN adjustment.
     * @internal
     */
    applyRulesToWord(affWord, remainingDepth) {
        // COMPOUNDMIN falls back to 3 when the aff file does not define it.
        const compoundMin = this.affInfo.COMPOUNDMIN ?? 3;
        const { word, flags, dict, appliedRules } = affWord;
        // The word itself is emitted without rules; they are consumed by the expansion below.
        const wordWithFlags = { word, rules: undefined, flags, dict, appliedRules };
        return [wordWithFlags, ...this.applyAffixesToWord(affWord, remainingDepth)]
            .filter(({ flags }) => !(flags & AffixFlags.isNeedAffix))
            .map((affWord) => adjustCompounding(affWord, compoundMin));
    }

    /**
     * Apply every rule attached to `affWord`, then recursively expand each result.
     * @returns an empty array when the depth is exhausted or the word has no rules.
     */
    applyAffixesToWord(affWord, remainingDepth) {
        const rules = affWord.rules;
        if (remainingDepth <= 0 || !rules) {
            return [];
        }
        // Combinable suffixes are the rules carried over to words built by a combinable prefix.
        const combinableSfx = rules.filter((r) => r.type === 'S' && r.fx.combinable);
        return rules
            .flatMap((affix) => this.applyAffixToWord(affix, affWord, combinableSfx))
            .flatMap((affWord) => this.applyRulesToWord(affWord, remainingDepth - 1));
    }

    /**
     * Apply a single prefix / suffix rule to a word.
     * @param rule - the rule to apply; must carry `fx` (i.e. be of type 'P' or 'S').
     * @param affWord - the word to apply the rule to.
     * @param combinableSfx - suffix rules passed on when `rule` is a combinable prefix.
     */
    applyAffixToWord(rule, affWord, combinableSfx) {
        const { word } = affWord;
        const combineRules = rule.type === 'P' && rule.fx.combinable ? combinableSfx : [];
        // An affix is being applied, so the need-affix marker is cleared for the results.
        const flags = affWord.flags & ~AffixFlags.isNeedAffix;
        const matchingSubstitutions = rule.fx.substitutionsForRegExps.filter((sub) => sub.match.test(word));
        const source = {
            dict: affWord.dict,
            // `appliedRules` is only tracked when it is already present on the incoming word.
            appliedRules: affWord.appliedRules ? [...affWord.appliedRules, rule.idx] : undefined,
        };
        const partialAffWord = this.affData.toAffixWord(source, word, flags, combineRules);
        return matchingSubstitutions.flatMap((sub) => this.#applySubstitution(partialAffWord, sub));
    }

    /**
     * Build the new word for one substitution: suffixes ('S') are appended to the stripped
     * word, everything else is prepended. The substitution's own rules are joined onto
     * the rules already carried by `affWord`.
     */
    #substituteAttach(affWord, sub, stripped) {
        const { flags } = affWord;
        const subRules = this.affData.getRulesForAffSubstitution(sub);
        const rules = joinRules(affWord.rules, subRules);
        const word = sub.type === 'S' ? stripped + sub.attach : sub.attach + stripped;
        return this.affData.toAffixWord(affWord, word, flags, rules);
    }

    /**
     * Apply every substitution group whose `replace` pattern matches the word.
     */
    #applySubstitution(affWord, subs) {
        const results = [];
        for (const [replace, substitutions] of subs.substitutionsGroupedByRemove) {
            if (!replace.test(affWord.word)) continue;
            const stripped = affWord.word.replace(replace, '');
            for (const sub of substitutions) {
                results.push(this.#substituteAttach(affWord, sub, stripped));
            }
        }
        return results;
    }

    /**
     * Resolve a flags string (as found after the `/` in a .dic line) to its rules.
     */
    getMatchingRules(flags) {
        return this.affData.getRules(flags);
    }

    /**
     * Convert the applied rule indexes to AFF Letters.
     * Requires that the affixWord was generated with trace mode turned on.
     * @param affixWord - the generated AffixWord.
     * @returns the rule ids, or `undefined` when `affixWord.appliedRules` is not set.
     */
    getFlagsValuesForAffixWord(affixWord) {
        const rules = this.affData.getRulesForIndexes(affixWord.appliedRules);
        return rules?.map((r) => r.id);
    }

    get iConv() {
        return this._iConv;
    }

    get oConv() {
        return this._oConv;
    }

    /** Turn the recording of `appliedRules` on or off for subsequently read entries. */
    setTraceMode(value) {
        this.affData.trace = value;
    }
}

/**
 * Comparator for generated words: orders by `word` (using `<` / `>`), then by numeric `flags`.
 */
export function compareAff(a, b) {
    if (a.word < b.word) return -1;
    if (a.word > b.word) return 1;
    return a.flags - b.flags;
}

/**
 * Returns a filter function that will filter adjacent AffWords
 * It compares the word and the flags.
 */
function filterAff() {
    return filterOrderedList((a, b) => a.word !== b.word || a.flags !== b.flags);
}

/**
 * Clear `isCompoundPermitted` on words shorter than `minLength`.
 * Note: `affWord` is modified in place and returned.
 */
function adjustCompounding(affWord, minLength) {
    if (!(affWord.flags & AffixFlags.isCompoundPermitted) || affWord.word.length >= minLength) {
        return affWord;
    }
    affWord.flags &= ~AffixFlags.isCompoundPermitted;
    return affWord;
}

// Bit flags with a reverse (value -> name) mapping. The `var` + IIFE form is kept so the
// exported object stays exactly as consumers see it today.
export var AffixFlags;
(function (AffixFlags) {
    AffixFlags[AffixFlags["none"] = 0] = "none";
    /**
     * COMPOUNDFLAG flag
     *
     * Words signed with COMPOUNDFLAG may be in compound words (except when word shorter than COMPOUNDMIN).
     * Affixes with COMPOUNDFLAG also permits compounding of affixed words.
     *
     */
    AffixFlags[AffixFlags["isCompoundPermitted"] = 1] = "isCompoundPermitted";
    /**
     * COMPOUNDBEGIN flag
     *
     * Words signed with COMPOUNDBEGIN (or with a signed affix) may be first elements in compound words.
     *
     */
    AffixFlags[AffixFlags["canBeCompoundBegin"] = 2] = "canBeCompoundBegin";
    /**
     * COMPOUNDMIDDLE flag
     *
     * Words signed with COMPOUNDMIDDLE (or with a signed affix) may be middle elements in compound words.
     *
     */
    AffixFlags[AffixFlags["canBeCompoundMiddle"] = 4] = "canBeCompoundMiddle";
    /**
     * COMPOUNDLAST flag
     *
     * Words signed with COMPOUNDLAST (or with a signed affix) may be last elements in compound words.
     *
     */
    AffixFlags[AffixFlags["canBeCompoundEnd"] = 8] = "canBeCompoundEnd";
    /**
     * COMPOUNDPERMITFLAG flag
     *
     * Prefixes are allowed at the beginning of compounds, suffixes are allowed at the end of compounds by default.
     * Affixes with COMPOUNDPERMITFLAG may be inside of compounds.
     *
     * NOTE(review): the member name reads like ONLYINCOMPOUND while this text describes
     * COMPOUNDPERMITFLAG - confirm against the mapping in affConstants.
     *
     */
    AffixFlags[AffixFlags["isOnlyAllowedInCompound"] = 16] = "isOnlyAllowedInCompound";
    /**
     * COMPOUNDFORBIDFLAG flag
     *
     * Suffixes with this flag forbid compounding of the affixed word.
     *
     */
    AffixFlags[AffixFlags["isCompoundForbidden"] = 32] = "isCompoundForbidden";
    /**
     * WARN flag
     *
     * This flag is for rare words, which are also often spelling mistakes, see option -r of command line Hunspell and FORBIDWARN.
     */
    AffixFlags[AffixFlags["isWarning"] = 64] = "isWarning";
    /**
     * KEEPCASE flag
     *
     * Forbid uppercased and capitalized forms of words signed with KEEPCASE flags. Useful for special orthographies (measurements and
     * currency often keep their case in uppercased texts) and writing systems (e.g. keeping lower case of IPA characters). Also valuable
     * for words erroneously written in the wrong case.
     */
    AffixFlags[AffixFlags["isKeepCase"] = 128] = "isKeepCase";
    /**
     * FORCEUCASE flag
     *
     * Last word part of a compound with flag FORCEUCASE forces capitalization of the whole compound word.
     * Eg. Dutch word "straat" (street) with FORCEUCASE flags will be allowed only in capitalized compound forms,
     * according to the Dutch spelling rules for proper names.
     */
    AffixFlags[AffixFlags["isForceUCase"] = 256] = "isForceUCase";
    /**
     * FORBIDDENWORD flag
     *
     * This flag signs forbidden word form. Because affixed forms are also forbidden, we can subtract a subset from set of the
     * accepted affixed and compound words. Note: useful to forbid erroneous words, generated by the compounding mechanism.
     */
    AffixFlags[AffixFlags["isForbiddenWord"] = 512] = "isForbiddenWord";
    /**
     * NOSUGGEST flag
     *
     * Words signed with NOSUGGEST flag are not suggested (but still accepted when typed correctly). Proposed flag for vulgar
     * and obscene words (see also SUBSTANDARD).
     */
    AffixFlags[AffixFlags["isNoSuggest"] = 1024] = "isNoSuggest";
    // cspell:ignore pseudoroot
    /**
     * NEEDAFFIX flag
     *
     * This flag signs virtual stems in the dictionary, words only valid when affixed. Except, if the dictionary word has a homonym
     * or a zero affix. NEEDAFFIX works also with prefixes and prefix + suffix combinations (see tests/pseudoroot5.*).
     */
    AffixFlags[AffixFlags["isNeedAffix"] = 2048] = "isNeedAffix";
})(AffixFlags || (AffixFlags = {}));

/**
 * OR together the `AffixFlags` bits named by the truthy members of `flags`.
 * @param flags - an object whose keys are `AffixFlags` member names.
 * @returns the combined bit mask; keys that are not `AffixFlags` members contribute nothing.
 */
function toAffixFlags(flags) {
    let result = 0;
    for (const [key, value] of Object.entries(flags)) {
        if (value) {
            const flag = AffixFlags[key];
            result |= flag;
        }
    }
    return result;
}

/**
 * Indexed view of an aff file: turns flag strings into rule indexes / rules and caches the lookups.
 */
class AffData {
    affInfo;
    filename;
    /** All rules, addressed by their index. */
    rules = [];
    /** rule id -> index, or array of indexes when several rules share the id. */
    mapToRuleIdx = new Map();
    /** flags string (or AF alias number as a string) -> rule indexes. */
    mapWordRulesToRuleIndexes = new Map();
    /** flags string -> rules. */
    mapWordRulesToRules = new Map();
    affFlagType;
    /** Flags already reported as unresolvable, so each one is logged only once. */
    missingFlags = new Set();
    /** Cache keyed by the identity of the index array. */
    _mapRuleIdxToRules = new WeakMap();
    /** When true, generated words record the indexes of the rules applied to them. */
    trace = false;

    /**
     * @param affInfo - the parsed content of the aff file.
     * @param filename - used in diagnostics only.
     * @throws when `affInfo.FLAG` is not a supported value (see `toAffFlagType`).
     */
    constructor(affInfo, filename) {
        this.affInfo = affInfo;
        this.filename = filename;
        this.affFlagType = toAffFlagType(affInfo.FLAG);
        this.#processAffInfo(affInfo);
    }

    /**
     * Split a .dic line into the word and its flags.
     * Only the text before the first whitespace is used; flags are what follows the first `/`.
     */
    dictLineToEntry(line) {
        const [lineLeft] = line.split(/\s+/, 1);
        const [word, rules = ''] = lineLeft.split('/', 2);
        return { word, flags: rules, line };
    }

    /**
     * Convert a .dic line into an affix word carrying the rules named by its flags.
     */
    dictLineToAffixWord(line) {
        const entry = this.dictLineToEntry(line);
        const source = { dict: entry, appliedRules: this.trace ? [] : undefined };
        return this.toAffixWord(source, entry.word, AffixFlags.none, this.getRules(entry.flags));
    }

    /**
     * Create an affix word.
     * Flag-only rules (type 'F') are folded into `flags` (and into `appliedRules` when tracing);
     * the remaining rules are kept on the word for later application.
     * @param source - provides `dict` and `appliedRules` for the new word.
     * @param word - the text of the new word.
     * @param flags - flags to start from.
     * @param rules - rules to attach; may be undefined.
     */
    toAffixWord(source, word, flags, rules) {
        const dict = source.dict;
        let appliedRules = source.appliedRules;
        if (!rules) return { word, rules: undefined, flags, dict, appliedRules };
        const fxRules = rules.filter((rule) => rule.type !== 'F');
        if (appliedRules) {
            appliedRules = [...appliedRules, ...rules.filter((r) => r.type === 'F').map((r) => r.idx)];
        }
        return {
            word,
            rules: fxRules.length ? fxRules : undefined,
            flags: flags | this.rulesToFlags(rules),
            appliedRules,
            dict,
        };
    }

    /**
     * Resolve a flags string to rules. Results are cached per flags string.
     */
    getRules(rules) {
        const foundRules = this.mapWordRulesToRules.get(rules);
        if (foundRules) return foundRules;
        const ruleIndexes = this.getRuleIndexes(rules);
        const affRules = ruleIndexes.map((idx) => this.rules[idx]);
        this.mapWordRulesToRules.set(rules, affRules);
        return affRules;
    }

    /**
     * Resolve a flags string to rule indexes. Results are cached per flags string.
     */
    getRuleIndexes(rules) {
        const found = this.mapWordRulesToRuleIndexes.get(rules);
        if (found) return found;
        const indexes = this.#getRuleIndexes(rules);
        this.mapWordRulesToRuleIndexes.set(rules, indexes);
        return indexes;
    }

    /** OR together the `flags` of all the given rules. */
    rulesToFlags(rules) {
        return rules.reduce((acc, rule) => acc | rule.flags, AffixFlags.none);
    }

    /**
     * Map an array of rule indexes to rules. Cached by the identity of `indexes`.
     * @returns `undefined` when `indexes` is falsy.
     */
    getRulesForIndexes(indexes) {
        if (!indexes) return undefined;
        let rules = this._mapRuleIdxToRules.get(indexes);
        if (rules) return rules;
        rules = indexes.map((idx) => this.rules[idx]);
        this._mapRuleIdxToRules.set(indexes, rules);
        return rules;
    }

    /** The rules attached to a substitution, or `undefined` when it has none. */
    getRulesForAffSubstitution(sub) {
        return this.getRulesForIndexes(sub.attachRules);
    }

    /**
     * Uncached flags-string -> indexes lookup. Unknown flags are logged once and skipped.
     */
    #getRuleIndexes(rules) {
        const flags = this.#splitRules(rules);
        const indexes = flags
            // flatMap, because an id shared by several rules maps to an array of indexes.
            .flatMap((flag) => {
                const found = this.mapToRuleIdx.get(flag);
                if (found === undefined && !this.missingFlags.has(flag)) {
                    this.missingFlags.add(flag);
                    const filename = this.filename;
                    // Deliberately best effort: an unknown flag is reported, not thrown.
                    logError('Unable to resolve flag: %o, for file: %o', flag, filename);
                }
                return found;
            })
            .filter(isDefined);
        return indexes;
    }

    /**
     * Split a flags string into unique flags according to the FLAG type:
     * 'long' - pairs of characters, 'num' - comma separated, otherwise one flag per character.
     */
    #splitRules(rules) {
        switch (this.affFlagType) {
            case 'long': {
                return [...new Set(rules.replaceAll(/(..)/g, '$1//').split('//').slice(0, -1))];
            }
            case 'num': {
                return [...new Set(rules.split(','))];
            }
        }
        return [...new Set(rules)];
    }

    /**
     * Build `rules`, `mapToRuleIdx` and the AF alias entries from the aff info.
     * Rule order is: flag rules, then suffix rules, then prefix rules.
     */
    #processAffInfo(affInfo) {
        const { AF = [], SFX = [], PFX = [] } = affInfo;
        const flags = objectToKvP(affInfo)
            .filter(isValidFlagMember)
            .map(([key, value]) => ({ id: value, flags: toAffixFlags(affFlag[key]) }));
        const sfxRules = [...SFX].map(([, sfx]) => sfx).map((sfx) => ({ id: sfx.id, sfx }));
        const pfxRules = [...PFX].map(([, pfx]) => pfx).map((pfx) => ({ id: pfx.id, pfx }));
        const rules = [...flags, ...sfxRules, ...pfxRules];
        rules.forEach((rule, idx) => {
            const found = this.mapToRuleIdx.get(rule.id);
            // Compare against `undefined`: index 0 is a valid hit. A truthiness test misses a
            // duplicate of the first rule, overwrites its entry and later trips the assert in
            // `#mapPartialRule`.
            if (found !== undefined) {
                const filename = this.filename;
                logError('Duplicate affix rule: %o, filename: %o', rule.id, filename);
                const toAdd = Array.isArray(found) ? found : [found];
                toAdd.push(idx);
                this.mapToRuleIdx.set(rule.id, toAdd);
                return;
            }
            this.mapToRuleIdx.set(rule.id, idx);
        });
        // AF aliases: the alias position (as a string) stands for the listed flags.
        AF.forEach((af, idx) => {
            if (!af) return;
            const indexes = this.#getRuleIndexes(af);
            this.mapWordRulesToRuleIndexes.set(idx.toString(), indexes);
        });
        this.rules = rules.map((rule, idx) => this.#mapPartialRule(rule, idx));
    }

    /**
     * Turn a partial rule into its final form: 'P' (prefix), 'S' (suffix) or 'F' (flag only).
     * @throws AssertionError when `index` is not registered for the rule's id.
     */
    #mapPartialRule(rule, index) {
        const { id, flags, sfx, pfx } = rule;
        const idx = this.mapToRuleIdx.get(id);
        assert(idx !== undefined && (idx === index || (Array.isArray(idx) && idx.includes(index))));
        const fx = sfx || pfx;
        if (fx) {
            const affFx = this.#mapFx(fx);
            return affFx.type === 'P'
                ? { id, idx: index, type: 'P', flags: 0, fx: affFx }
                : { id, idx: index, type: 'S', flags: 0, fx: affFx };
        }
        return { id, idx: index, type: 'F', flags: flags || 0 };
    }

    /** Convert a parsed PFX / SFX definition; any type other than 'PFX' is treated as a suffix. */
    #mapFx(fx) {
        const { id, combinable } = fx;
        const substitutionsForRegExps = this.#mapSubstitutionsForRegExps(fx.substitutionsForRegExps);
        return { type: fx.type === 'PFX' ? 'P' : 'S', id, combinable, substitutionsForRegExps };
    }

    #mapSubstitutionsForRegExps(substitutions) {
        return substitutions.map((sub) => this.#mapSubstitutionsForRegExp(sub));
    }

    /** Keep the `match` pattern and group the substitutions by their `replace` field. */
    #mapSubstitutionsForRegExp(subForRegExp) {
        const { match, substitutions: subs } = subForRegExp;
        const substitutions = subs.map((sub) => this.#mapSubstitution(sub));
        const substitutionsGroupedByRemove = groupByField(substitutions, 'replace');
        return { match, substitutionsGroupedByRemove };
    }

    /** Copy a substitution, resolving its `attachRules` flags string to rule indexes. */
    #mapSubstitution(sub) {
        const { type, remove, attach, attachRules, replace } = sub;
        const rules = attachRules ? this.getRuleIndexes(attachRules) : undefined;
        return { type, remove, attach, attachRules: rules, replace };
    }
}

/**
 * Concatenate two optional rule lists. When either one is missing the other is returned as is.
 */
function joinRules(a, b) {
    if (!a) return b;
    if (!b) return a;
    return [...a, ...b];
}

/** `Object.entries` under a name describing the result: key / value pairs. */
function objectToKvP(t) {
    return Object.entries(t);
}

/**
 * True for a `[key, value]` pair whose key is a member of `affFlag` and whose value is truthy.
 */
function isValidFlagMember(t) {
    const [key, value] = t;
    return key in affFlag && !!value;
}

/**
 *
 * @param FLAG - the FLAG value from the aff file
 * @returns the AffFlagType ('char' when FLAG is not set) or throws
 * @throws Error when FLAG is set to anything other than 'long' or 'num'.
 */
export function toAffFlagType(FLAG) {
    if (!FLAG) return 'char';
    switch (FLAG) {
        case 'long':
        case 'num': {
            return FLAG;
        }
        default: {
            throw new Error(`Unexpected FLAG value: ${FLAG}`);
        }
    }
}
//# sourceMappingURL=aff.js.map