// hunspell-reader
// Version: (unspecified)
// A library for reading Hunspell Dictionary Files
// 442 lines • 16.9 kB
// JavaScript
import assert from 'node:assert';
import { affFlag } from './affConstants.js';
import { Converter } from './converter.js';
import { filterOrderedList, groupByField, isDefined } from './util.js';
// Toggle for internal debug logging.
const debug = false;

/** Write a debug message to stderr, but only when debug logging is enabled. */
function logError(msg, ...args) {
    if (debug) {
        console.error(msg, ...args);
    }
}

/** Default limit on how many times affix rules are recursively applied. */
const DefaultMaxDepth = 5;
/**
 * Aff applies the affix rules found in a Hunspell `.aff` file to entries from
 * the matching `.dic` file, expanding each entry into its generated word forms.
 */
export class Aff {
    affInfo;
    affData;
    _oConv;
    _iConv;
    _maxSuffixDepth = DefaultMaxDepth;
    /**
     * @param affInfo - parsed contents of the `.aff` file.
     * @param filename - the `.aff` filename, used in error messages.
     */
    constructor(affInfo, filename) {
        this.affInfo = affInfo;
        this.affData = new AffData(affInfo, filename);
        this._iConv = new Converter(affInfo.ICONV || []);
        this._oConv = new Converter(affInfo.OCONV || []);
    }
    /** Maximum number of recursive affix-application passes per word. */
    get maxSuffixDepth() {
        return this._maxSuffixDepth;
    }
    set maxSuffixDepth(value) {
        this._maxSuffixDepth = value;
    }
    /**
     * Takes a line from a hunspell.dic file and applies the rules found in the aff file.
     * For performance reasons, only the `word` field is mapped with OCONV.
     * @param {string} line - the line from the .dic file.
     * @param {number} [maxDepth] - overrides `this.maxSuffixDepth` when given.
     * @returns sorted, de-duplicated list of generated word forms.
     */
    applyRulesToDicEntry(line, maxDepth) {
        const afWord = this.affData.dictLineToAffixWord(line);
        const maxSuffixDepth = maxDepth ?? this.maxSuffixDepth;
        // Wrap the call in an arrow so `convert` keeps its `this` binding.
        // Grabbing the bare method reference (`this._oConv.convert`) would
        // lose `this` if Converter.convert is a prototype method.
        const convert = (word) => this._oConv.convert(word);
        const results = this.applyRulesToWord(afWord, maxSuffixDepth).map((affWord) => ({
            ...affWord,
            word: convert(affWord.word),
            originalWord: affWord.word,
        }));
        results.sort(compareAff);
        // Drop adjacent duplicates (same word and flags) from the sorted list.
        const filtered = results.filter(filterAff());
        return filtered;
    }
    /**
     * Recursively apply affix rules to a word, collecting all generated forms.
     * Forms still flagged NEEDAFFIX (virtual stems) are filtered out.
     * @internal
     */
    applyRulesToWord(affWord, remainingDepth) {
        // COMPOUNDMIN defaults to 3 per the Hunspell spec.
        const compoundMin = this.affInfo.COMPOUNDMIN ?? 3;
        const { word, flags, dict, appliedRules } = affWord;
        const wordWithFlags = { word, rules: undefined, flags, dict, appliedRules };
        return [wordWithFlags, ...this.applyAffixesToWord(affWord, remainingDepth)]
            .filter(({ flags }) => !(flags & AffixFlags.isNeedAffix))
            .map((affWord) => adjustCompounding(affWord, compoundMin));
    }
    /** Apply every pending rule on the word, then recurse on the results. */
    applyAffixesToWord(affWord, remainingDepth) {
        if (remainingDepth <= 0 || !affWord.rules) {
            return [];
        }
        const rules = affWord.rules;
        // Combinable suffixes may be applied after a combinable prefix.
        const combinableSfx = rules.filter((r) => r.type === 'S' && r.fx.combinable);
        const r = affWord.rules
            .flatMap((affix) => this.applyAffixToWord(affix, affWord, combinableSfx))
            .flatMap((affWord) => this.applyRulesToWord(affWord, remainingDepth - 1));
        return r;
    }
    /** Apply a single prefix/suffix rule, producing the substituted forms. */
    applyAffixToWord(rule, affWord, combinableSfx) {
        const { word } = affWord;
        // A combinable prefix allows the combinable suffixes to follow.
        const combineRules = rule.type === 'P' && rule.fx.combinable ? combinableSfx : [];
        // Applying any affix satisfies the NEEDAFFIX requirement.
        const flags = affWord.flags & ~AffixFlags.isNeedAffix;
        const matchingSubstitutions = rule.fx.substitutionsForRegExps.filter((sub) => sub.match.test(word));
        const source = {
            dict: affWord.dict,
            appliedRules: affWord.appliedRules ? [...affWord.appliedRules, rule.idx] : undefined,
        };
        const partialAffWord = this.affData.toAffixWord(source, word, flags, combineRules);
        return matchingSubstitutions.flatMap((sub) => this.#applySubstitution(partialAffWord, sub));
    }
    /** Attach one substitution (prefix or suffix) to the stripped word. */
    #substituteAttach(affWord, sub, stripped) {
        const { flags } = affWord;
        const subRules = this.affData.getRulesForAffSubstitution(sub);
        const rules = joinRules(affWord.rules, subRules);
        const word = sub.type === 'S' ? stripped + sub.attach : sub.attach + stripped;
        return this.affData.toAffixWord(affWord, word, flags, rules);
    }
    /** Apply every substitution whose strip pattern matches the word. */
    #applySubstitution(affWord, subs) {
        const results = [];
        for (const [replace, substitutions] of subs.substitutionsGroupedByRemove) {
            if (!replace.test(affWord.word))
                continue;
            const stripped = affWord.word.replace(replace, '');
            for (const sub of substitutions) {
                results.push(this.#substituteAttach(affWord, sub, stripped));
            }
        }
        return results;
    }
    /** Resolve a flag string from a .dic entry into its affix rules. */
    getMatchingRules(flags) {
        const rules = this.affData.getRules(flags);
        return rules;
    }
    /**
     * Convert the applied rule indexes to AFF Letters.
     * Requires that the affixWord was generated with trace mode turned on.
     * @param affixWord - the generated AffixWord.
     */
    getFlagsValuesForAffixWord(affixWord) {
        const rules = this.affData.getRulesForIndexes(affixWord.appliedRules);
        return rules?.map((r) => r.id);
    }
    get iConv() {
        return this._iConv;
    }
    get oConv() {
        return this._oConv;
    }
    /** Enable/disable recording of applied rule indexes on generated words. */
    setTraceMode(value) {
        this.affData.trace = value;
    }
}
/**
 * Ordering for generated AffWords: by word (code-unit order), then by flags.
 * Returns a negative, zero, or positive number for use with Array.sort.
 */
export function compareAff(a, b) {
    if (a.word !== b.word) {
        return a.word < b.word ? -1 : 1;
    }
    return a.flags - b.flags;
}
/**
 * Returns a filter predicate that drops an element when it has the same word
 * and the same flags as the element immediately before it. The input list is
 * expected to be ordered (see compareAff) for this to fully de-duplicate.
 */
function filterAff() {
    return filterOrderedList((a, b) => !(a.word === b.word && a.flags === b.flags));
}
/**
 * Clear the compound-permitted flag on words shorter than COMPOUNDMIN.
 * Mutates and returns the same affWord object.
 */
function adjustCompounding(affWord, minLength) {
    const permitted = (affWord.flags & AffixFlags.isCompoundPermitted) !== 0;
    if (permitted && affWord.word.length < minLength) {
        affWord.flags &= ~AffixFlags.isCompoundPermitted;
    }
    return affWord;
}
/**
 * Bit flags attached to a word while affix rules are applied.
 * Each member documents the Hunspell flag it represents.
 * The enum also carries a reverse mapping: AffixFlags[1] === 'isCompoundPermitted'.
 */
export var AffixFlags;
(function (AffixFlags) {
AffixFlags[AffixFlags["none"] = 0] = "none";
/**
 * COMPOUNDFLAG flag
 *
 * Words signed with COMPOUNDFLAG may be in compound words (except when word shorter than COMPOUNDMIN).
 * Affixes with COMPOUNDFLAG also permits compounding of affixed words.
 *
 */
AffixFlags[AffixFlags["isCompoundPermitted"] = 1] = "isCompoundPermitted";
/**
 * COMPOUNDBEGIN flag
 *
 * Words signed with COMPOUNDBEGIN (or with a signed affix) may be first elements in compound words.
 *
 */
AffixFlags[AffixFlags["canBeCompoundBegin"] = 2] = "canBeCompoundBegin";
/**
 * COMPOUNDMIDDLE flag
 *
 * Words signed with COMPOUNDMIDDLE (or with a signed affix) may be middle elements in compound words.
 *
 */
AffixFlags[AffixFlags["canBeCompoundMiddle"] = 4] = "canBeCompoundMiddle";
/**
 * COMPOUNDLAST flag
 *
 * Words signed with COMPOUNDLAST (or with a signed affix) may be last elements in compound words.
 *
 */
AffixFlags[AffixFlags["canBeCompoundEnd"] = 8] = "canBeCompoundEnd";
/**
 * COMPOUNDPERMITFLAG flag
 *
 * Prefixes are allowed at the beginning of compounds, suffixes are allowed at the end of compounds by default.
 * Affixes with COMPOUNDPERMITFLAG may be inside of compounds.
 *
 * NOTE(review): the member name `isOnlyAllowedInCompound` reads like Hunspell's
 * ONLYINCOMPOUND flag, while this comment describes COMPOUNDPERMITFLAG --
 * confirm which flag this bit actually represents before relying on the name.
 */
AffixFlags[AffixFlags["isOnlyAllowedInCompound"] = 16] = "isOnlyAllowedInCompound";
/**
 * COMPOUNDFORBIDFLAG flag
 *
 * Suffixes with this flag forbid compounding of the affixed word.
 *
 */
AffixFlags[AffixFlags["isCompoundForbidden"] = 32] = "isCompoundForbidden";
/**
 * WARN flag
 *
 * This flag is for rare words, which are also often spelling mistakes, see option -r of command line Hunspell and FORBIDWARN.
 */
AffixFlags[AffixFlags["isWarning"] = 64] = "isWarning";
/**
 * KEEPCASE flag
 *
 * Forbid uppercased and capitalized forms of words signed with KEEPCASE flags. Useful for special orthographies (measurements and
 * currency often keep their case in uppercased texts) and writing systems (e.g. keeping lower case of IPA characters). Also valuable
 * for words erroneously written in the wrong case.
 */
AffixFlags[AffixFlags["isKeepCase"] = 128] = "isKeepCase";
/**
 * FORCEUCASE flag
 *
 * Last word part of a compound with flag FORCEUCASE forces capitalization of the whole compound word.
 * Eg. Dutch word "straat" (street) with FORCEUCASE flags will allowed only in capitalized compound forms,
 * according to the Dutch spelling rules for proper names.
 */
AffixFlags[AffixFlags["isForceUCase"] = 256] = "isForceUCase";
/**
 * FORBIDDENWORD flag
 *
 * This flag signs forbidden word form. Because affixed forms are also forbidden, we can subtract a subset from set of the
 * accepted affixed and compound words. Note: useful to forbid erroneous words, generated by the compounding mechanism.
 */
AffixFlags[AffixFlags["isForbiddenWord"] = 512] = "isForbiddenWord";
/**
 * NOSUGGEST flag
 *
 * Words signed with NOSUGGEST flag are not suggested (but still accepted when typed correctly). Proposed flag for vulgar
 * and obscene words (see also SUBSTANDARD).
 */
AffixFlags[AffixFlags["isNoSuggest"] = 1024] = "isNoSuggest";
// cspell:ignore pseudoroot
/**
 * NEEDAFFIX flag
 *
 * This flag signs virtual stems in the dictionary, words only valid when affixed. Except, if the dictionary word has a homonym
 * or a zero affix. NEEDAFFIX works also with prefixes and prefix + suffix combinations (see tests/pseudoroot5.*).
 */
AffixFlags[AffixFlags["isNeedAffix"] = 2048] = "isNeedAffix";
})(AffixFlags || (AffixFlags = {}));
/**
 * OR together the AffixFlags named by the truthy keys of `flags`.
 * Keys that are not AffixFlags members contribute nothing (undefined | x === x).
 */
function toAffixFlags(flags) {
    return Object.entries(flags)
        .filter(([, enabled]) => enabled)
        .reduce((acc, [key]) => acc | AffixFlags[key], 0);
}
/**
 * AffData indexes the rules declared in an `.aff` file (flag rules, SFX, PFX,
 * and AF aliases) so that the flag strings attached to `.dic` entries can be
 * resolved into affix rules quickly.
 */
class AffData {
    affInfo;
    filename;
    /** All rules (flag, suffix, and prefix) in declaration order. */
    rules = [];
    /**
     * Maps a rule id (aff flag) to its index in `rules`. When the same id is
     * declared more than once, the value is an array of indexes.
     */
    mapToRuleIdx = new Map();
    /** Cache: flag string from a `.dic` entry -> resolved rule indexes. */
    mapWordRulesToRuleIndexes = new Map();
    /** Cache: flag string from a `.dic` entry -> resolved rules. */
    mapWordRulesToRules = new Map();
    /** How flags are encoded in this file: 'char', 'long', or 'num'. */
    affFlagType;
    /** Flags that failed to resolve -- used to log each missing flag only once. */
    missingFlags = new Set();
    /** Cache keyed by the identity of an index array -> resolved rules. */
    _mapRuleIdxToRules = new WeakMap();
    /** When true, applied rule indexes are recorded on generated words. */
    trace = false;
    /**
     * @param affInfo - parsed contents of the `.aff` file.
     * @param filename - the `.aff` filename, used only in error messages.
     */
    constructor(affInfo, filename) {
        this.affInfo = affInfo;
        this.filename = filename;
        this.affFlagType = toAffFlagType(affInfo.FLAG);
        this.#processAffInfo(affInfo);
    }
    /**
     * Split a raw `.dic` line into its word and flag string.
     * Everything after the first whitespace (e.g. morphological fields) is ignored.
     */
    dictLineToEntry(line) {
        const [lineLeft] = line.split(/\s+/, 1);
        const [word, rules = ''] = lineLeft.split('/', 2);
        return { word, flags: rules, line };
    }
    /** Turn a raw `.dic` line into the root AffixWord with its rules resolved. */
    dictLineToAffixWord(line) {
        const entry = this.dictLineToEntry(line);
        return this.toAffixWord({ dict: entry, appliedRules: this.trace ? [] : undefined }, entry.word, AffixFlags.none, this.getRules(entry.flags));
    }
    /**
     * Build an AffixWord. Flag-only rules (type 'F') contribute their flags
     * (and their indexes in trace mode); prefix/suffix rules are kept as the
     * pending rules to apply.
     */
    toAffixWord(source, word, flags, rules) {
        const dict = source.dict;
        let appliedRules = source.appliedRules;
        if (!rules)
            return { word, rules: undefined, flags, dict, appliedRules };
        const fxRules = rules.filter((rule) => rule.type !== 'F');
        if (appliedRules) {
            appliedRules = [...appliedRules, ...rules.filter((r) => r.type === 'F').map((r) => r.idx)];
        }
        return {
            word,
            rules: fxRules.length ? fxRules : undefined,
            flags: flags | this.rulesToFlags(rules),
            appliedRules,
            dict,
        };
    }
    /** Resolve a flag string into rules (cached per flag string). */
    getRules(rules) {
        const foundRules = this.mapWordRulesToRules.get(rules);
        if (foundRules)
            return foundRules;
        const ruleIndexes = this.getRuleIndexes(rules);
        const affRules = ruleIndexes.map((idx) => this.rules[idx]);
        this.mapWordRulesToRules.set(rules, affRules);
        return affRules;
    }
    /** Resolve a flag string into rule indexes (cached per flag string). */
    getRuleIndexes(rules) {
        const found = this.mapWordRulesToRuleIndexes.get(rules);
        if (found)
            return found;
        const indexes = this.#getRuleIndexes(rules);
        this.mapWordRulesToRuleIndexes.set(rules, indexes);
        return indexes;
    }
    /** OR together the AffixFlags contributed by a list of rules. */
    rulesToFlags(rules) {
        return rules.reduce((acc, rule) => acc | rule.flags, AffixFlags.none);
    }
    /** Look up rules by index, caching on the identity of the index array. */
    getRulesForIndexes(indexes) {
        if (!indexes)
            return undefined;
        let rules = this._mapRuleIdxToRules.get(indexes);
        if (rules)
            return rules;
        rules = indexes.map((idx) => this.rules[idx]);
        this._mapRuleIdxToRules.set(indexes, rules);
        return rules;
    }
    getRulesForAffSubstitution(sub) {
        return this.getRulesForIndexes(sub.attachRules);
    }
    /** Resolve a flag string to rule indexes, logging unresolvable flags once. */
    #getRuleIndexes(rules) {
        const flags = this.#splitRules(rules);
        const indexes = flags
            .flatMap((flag) => {
            // The map value may be a single index or an array of indexes
            // (duplicate rule ids); flatMap flattens either shape.
            const found = this.mapToRuleIdx.get(flag);
            if (found === undefined && !this.missingFlags.has(flag)) {
                this.missingFlags.add(flag);
                const filename = this.filename;
                logError('Unable to resolve flag: %o, for file: %o', flag, filename);
            }
            return found;
        })
            .filter(isDefined);
        return indexes;
    }
    /**
     * Split a flag string into individual flags based on the FLAG setting:
     * 'long' - two characters per flag; 'num' - comma-separated numbers;
     * otherwise one character per flag. Duplicates are removed.
     */
    #splitRules(rules) {
        switch (this.affFlagType) {
            case 'long': {
                // Mark each character pair, then split on the marker. A dangling
                // odd character (malformed flag string) is dropped by the slice.
                return [...new Set(rules.replaceAll(/(..)/g, '$1//').split('//').slice(0, -1))];
            }
            case 'num': {
                return [...new Set(rules.split(','))];
            }
        }
        return [...new Set(rules)];
    }
    /** Build the rule table and the id -> index map from the parsed aff info. */
    #processAffInfo(affInfo) {
        const { AF = [], SFX = [], PFX = [] } = affInfo;
        const flags = objectToKvP(affInfo)
            .filter(isValidFlagMember)
            .map(([key, value]) => ({ id: value, flags: toAffixFlags(affFlag[key]) }));
        const sfxRules = [...SFX].map(([, sfx]) => sfx).map((sfx) => ({ id: sfx.id, sfx }));
        const pfxRules = [...PFX].map(([, pfx]) => pfx).map((pfx) => ({ id: pfx.id, pfx }));
        const rules = [...flags, ...sfxRules, ...pfxRules];
        rules.forEach((rule, idx) => {
            const found = this.mapToRuleIdx.get(rule.id);
            // Compare against undefined: index 0 is falsy, so a plain
            // truthiness check would miss a duplicate of the rule at index 0,
            // overwrite its mapping, and trip the assert in #mapPartialRule.
            if (found !== undefined) {
                const filename = this.filename;
                logError('Duplicate affix rule: %o, filename: %o', rule.id, filename);
                const toAdd = Array.isArray(found) ? found : [found];
                toAdd.push(idx);
                this.mapToRuleIdx.set(rule.id, toAdd);
                return;
            }
            this.mapToRuleIdx.set(rule.id, idx);
        });
        // AF lines define numbered aliases; a .dic entry may reference the
        // alias number (as a string) instead of the flags themselves.
        AF.forEach((af, idx) => {
            if (!af)
                return;
            const indexes = this.#getRuleIndexes(af);
            this.mapWordRulesToRuleIndexes.set(idx.toString(), indexes);
        });
        this.rules = rules.map((rule, idx) => this.#mapPartialRule(rule, idx));
    }
    /** Convert a partial rule entry into its final rule form. */
    #mapPartialRule(rule, index) {
        const { id, flags, sfx, pfx } = rule;
        const idx = this.mapToRuleIdx.get(id);
        // Sanity check: the id -> index map must point back at this rule.
        assert(idx !== undefined && (idx === index || (Array.isArray(idx) && idx.includes(index))));
        const fx = sfx || pfx;
        if (fx) {
            const affFx = this.#mapFx(fx);
            return affFx.type === 'P'
                ? { id, idx: index, type: 'P', flags: 0, fx: affFx }
                : { id, idx: index, type: 'S', flags: 0, fx: affFx };
        }
        return { id, idx: index, type: 'F', flags: flags || 0 };
    }
    /** Normalize a parsed PFX/SFX entry into the internal fx form. */
    #mapFx(fx) {
        const { id, combinable } = fx;
        const substitutionsForRegExps = this.#mapSubstitutionsForRegExps(fx.substitutionsForRegExps);
        return { type: fx.type === 'PFX' ? 'P' : 'S', id, combinable, substitutionsForRegExps };
    }
    #mapSubstitutionsForRegExps(substitutions) {
        return substitutions.map((sub) => this.#mapSubstitutionsForRegExp(sub));
    }
    /** Group the substitutions sharing a match regexp by their strip pattern. */
    #mapSubstitutionsForRegExp(subForRegExp) {
        const { match, substitutions: subs } = subForRegExp;
        const substitutions = subs.map((sub) => this.#mapSubstitution(sub));
        const substitutionsGroupedByRemove = groupByField(substitutions, 'replace');
        return { match, substitutionsGroupedByRemove };
    }
    /** Resolve the continuation flags attached to a substitution into indexes. */
    #mapSubstitution(sub) {
        const { type, remove, attach, attachRules, replace } = sub;
        const rules = attachRules ? this.getRuleIndexes(attachRules) : undefined;
        return { type, remove, attach, attachRules: rules, replace };
    }
}
/**
 * Concatenate two optional rule lists.
 * When either list is missing, the other is returned as-is (possibly undefined).
 */
function joinRules(a, b) {
    if (a && b) {
        return [...a, ...b];
    }
    return a || b;
}
// Thin wrapper around Object.entries -- presumably kept so the TypeScript
// source could give it a narrower key/value pair type (see comment below).
const objectToKvP = (t) => Object.entries(t);
// type Defined<T> = Exclude<T, undefined>;
/**
 * True when the [key, value] pair names a known aff flag (a key of `affFlag`)
 * and carries a truthy value.
 */
function isValidFlagMember([key, value]) {
    return Boolean(value) && key in affFlag;
}
/**
 * Determine how flags are encoded in the aff file.
 * @param FLAG - the FLAG value from the aff file
 * @returns 'char' when FLAG is unset; otherwise FLAG itself ('long' or 'num')
 * @throws Error for any other FLAG value
 */
export function toAffFlagType(FLAG) {
    if (!FLAG) {
        return 'char';
    }
    if (FLAG === 'long' || FLAG === 'num') {
        return FLAG;
    }
    throw new Error(`Unexpected FLAG value: ${FLAG}`);
}
//# sourceMappingURL=aff.js.map