// hunspell-reader
// A library for reading Hunspell Dictionary Files
import * as util from 'node:util';
import * as GS from 'gensequence';
import { affFlag, flagToLongStringMap, flagToStringMap } from './affConstants.js';
import { Converter } from './converter.js';
import { filterOrderedList, isDefined } from './util.js';
const log = false;
const DefaultMaxDepth = 5;
const regExpIsNumber = /^\d+$/;
export class Aff {
affInfo;
filename;
rules;
_oConv;
_iConv;
_maxSuffixDepth = DefaultMaxDepth;
_mapRules = new Map();
constructor(affInfo, filename) {
this.affInfo = affInfo;
this.filename = filename;
this.rules = processRules(affInfo);
this._iConv = new Converter(affInfo.ICONV || []);
this._oConv = new Converter(affInfo.OCONV || []);
}
get maxSuffixDepth() {
return this._maxSuffixDepth;
}
set maxSuffixDepth(value) {
this._maxSuffixDepth = value;
}
/**
* Takes a line from a hunspell.dic file and applies the rules found in the aff file.
* For performance reasons, only the `word` field is mapped with OCONV.
* @param {string} line - the line from the .dic file.
* @param {number} [maxDepth] - optional limit on how deeply affix rules are applied;
*   defaults to `this.maxSuffixDepth`.
* @returns the resulting AffWords, sorted and de-duplicated.
*/
applyRulesToDicEntry(line, maxDepth) {
const maxSuffixDepth = maxDepth ?? this.maxSuffixDepth;
const [lineLeft] = line.split(/\s+/, 1);
const [word, rules = ''] = lineLeft.split('/', 2);
const convert = this._oConv.convert;
const results = this.applyRulesToWord(asAffWord(word, rules), maxSuffixDepth).map((affWord) => ((affWord.word = convert(affWord.word)), affWord));
results.sort(compareAff);
const filtered = results.filter(filterAff());
return filtered;
}
/**
* Applies the matching flag and affix rules to a single AffWord, recursing into the
* generated forms until `remainingDepth` is exhausted.
* @internal
*/
applyRulesToWord(affWord, remainingDepth) {
const compoundMin = this.affInfo.COMPOUNDMIN ?? 3;
const { word, base, suffix, prefix, dic } = affWord;
const allRules = this.getMatchingRules(affWord.rules);
const { rulesApplied, flags } = reduceAffixRules(affWord, allRules);
const rules = this.joinRules(allRules.filter((rule) => !rule.flags).map((rule) => rule.id));
const affixRules = allRules.map((rule) => rule.sfx || rule.pfx).filter(isDefined);
const wordWithFlags = { word, flags, rulesApplied, rules: '', base, suffix, prefix, dic };
return [wordWithFlags, ...this.applyAffixesToWord(affixRules, { ...wordWithFlags, rules }, remainingDepth)]
.filter(({ flags }) => !flags.isNeedAffix)
.map((affWord) => adjustCompounding(affWord, compoundMin))
.map((affWord) => logAffWord(affWord, 'applyRulesToWord'));
}
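// Applies each affix rule to the word and then recursively expands the results,
// spending one unit of `remainingDepth` per pass. Combinable suffix rule ids are
// threaded through so that a prefix application can still pick up those suffixes.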
applyAffixesToWord(affixRules, affWord, remainingDepth) {
if (remainingDepth <= 0) {
return [];
}
const combinableRules = affixRules
.filter((rule) => rule.type === 'SFX')
.filter((rule) => rule.combinable === true)
.map(({ id }) => id);
const combinableSfx = this.joinRules(combinableRules);
const r = affixRules
.flatMap((affix) => this.applyAffixToWord(affix, affWord, combinableSfx))
.flatMap((affWord) => this.applyRulesToWord(affWord, remainingDepth - 1));
return r;
}
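// Applies a single affix rule to the word. When a combinable PFX rule fires, the
// combinable SFX ids are carried forward as pending rules; the NEEDAFFIX flag is
// cleared because an affix is being attached.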
applyAffixToWord(affix, affWord, combinableSfx) {
const { word } = affWord;
const combineRules = affix.type === 'PFX' && affix.combinable && !!combinableSfx ? combinableSfx : '';
const flags = affWord.flags.isNeedAffix ? removeNeedAffix(affWord.flags) : affWord.flags;
const matchingSubstitutions = affix.substitutionsForRegExps.filter((sub) => sub.match.test(word));
const partialAffWord = { ...affWord, flags, rules: combineRules };
return matchingSubstitutions
.flatMap((sub) => this.#applySubstitution(affix, partialAffWord, sub))
.map((affWord) => logAffWord(affWord, 'applyAffixToWord'));
}
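// Attaches a single substitution to the stripped word and recomputes the
// prefix / base / suffix boundaries of the resulting AffWord.
//
// Illustrative walk-through with made-up values (not taken from a real .aff file):
// for a suffix ('S') substitution where origWord = 'walked', prefix = '', suffix = 'ed',
// stripped = 'walk' and sub.attach = 'ing', the result is word = 'walking' with
// p = 0 and s = 4, i.e. prefix '', base 'walk', suffix 'ing'.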
#substituteAttach(affix, affWord, sub, stripped) {
const { word: origWord, rulesApplied, flags, dic } = affWord;
const rules = affWord.rules + (sub.attachRules || '');
let word;
let p = affWord.prefix.length;
let s = origWord.length - affWord.suffix.length;
if (sub.type === 'S') {
word = stripped + sub.attach;
s = Math.min(stripped.length, s);
p = Math.min(p, s);
}
else {
word = sub.attach + stripped;
const d = word.length - origWord.length;
p = Math.max(p, word.length - stripped.length);
s = Math.max(s + d, p);
}
const base = word.slice(p, s);
const prefix = word.slice(0, p);
const suffix = word.slice(s);
return {
word,
rulesApplied: rulesApplied + ' ' + affix.id,
rules,
flags,
base,
suffix,
prefix,
dic,
};
}
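// Applies every substitution in `subs` whose "remove" pattern matches the word:
// the matched portion is stripped once per pattern and each attach variant is
// expanded via #substituteAttach.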
#applySubstitution(affix, affWord, subs) {
const results = [];
for (const [replace, substitutions] of subs.substitutionsGroupedByRemove) {
if (!replace.test(affWord.word))
continue;
const stripped = affWord.word.replace(replace, '');
for (const sub of substitutions) {
results.push(this.#substituteAttach(affix, affWord, sub, stripped));
}
}
return results;
}
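// Resolves a rule string to rule definitions. A purely numeric string is treated as
// an index into the AF alias table; otherwise the string itself is split into flags.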
getMatchingRules(rules) {
const { AF = [] } = this.affInfo;
const idx = regExpIsNumber.test(rules) ? Number.parseInt(rules, 10) : -1;
const rulesToSplit = AF[idx] || rules;
return this.separateRules(rulesToSplit)
.map((key) => this.rules.get(key))
.filter(isDefined);
}
joinRules(rules) {
switch (this.affInfo.FLAG) {
case 'long': {
return rules.join('');
}
case 'num': {
return rules.join(',');
}
}
return rules.join('');
}
separateRules(rules) {
const found = this._mapRules.get(rules);
if (found)
return found;
const split = this.#separateRules(rules);
this._mapRules.set(rules, split);
return split;
}
#separateRules(rules) {
switch (this.affInfo.FLAG) {
case 'long': {
return [...new Set(rules.replaceAll(/(..)/g, '$1//').split('//').slice(0, -1))];
}
case 'num': {
return [...new Set(rules.split(','))];
}
}
return [...new Set(rules)];
}
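// How the FLAG setting drives the joining/splitting above (values are illustrative):
//   FLAG long : 'A1B2'    -> ['A1', 'B2']   (two-character flags, joined with '')
//   FLAG num  : '101,102' -> ['101', '102'] (numeric flags, joined with ',')
//   (default) : 'AB'      -> ['A', 'B']     (single-character flags, joined with '')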
get iConv() {
return this._iConv;
}
get oConv() {
return this._oConv;
}
}
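// Minimal usage sketch (illustrative only; it assumes `affInfo` was produced by this
// package's .aff parser and that the 'S' flag exists in that file):
//
//   const aff = new Aff(affInfo, 'en_US.aff');
//   aff.maxSuffixDepth = 5;
//   const affWords = aff.applyRulesToDicEntry('hello/S');
//   // -> sorted, de-duplicated AffWord entries: the base word plus whatever forms
//   //    the 'S' suffix rule generates.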
function signature(aff) {
const { word, flags } = aff;
const sig = Object.entries(flags)
.filter((e) => !!e[1])
.map((f) => flagToStringMap[f[0]])
.sort()
.join('');
return word + '|' + sig;
}
export function processRules(affInfo) {
const sfxRules = [...(affInfo.SFX || [])]
.map(([, sfx]) => sfx)
.map((sfx) => ({ id: sfx.id, type: 'sfx', sfx }));
const pfxRules = [...(affInfo.PFX || [])]
.map(([, pfx]) => pfx)
.map((pfx) => ({ id: pfx.id, type: 'pfx', pfx }));
const flagRules = GS.sequenceFromObject(affInfo)
.filter(([key, value]) => !!affFlag[key] && !!value)
.map(([key, value]) => ({ id: value, type: 'flag', flags: affFlag[key] }))
.toArray();
const rules = [...sfxRules, ...pfxRules, ...flagRules].reduce((acc, rule) => {
acc.set(rule.id, rule);
return acc;
}, new Map());
return rules;
}
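// The returned map is keyed by flag id. Each entry takes one of three shapes (illustrative):
//   { id: 'S', type: 'sfx', sfx: {...} }     - a suffix rule from the SFX table
//   { id: 'P', type: 'pfx', pfx: {...} }     - a prefix rule from the PFX table
//   { id: 'X', type: 'flag', flags: {...} }  - a word-level flag (e.g. NEEDAFFIX)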
export function logAffWord(affWord, message) {
/* istanbul ignore if */
if (log) {
const dump = util.inspect(affWord, { showHidden: false, depth: 5, colors: true });
console.log(`${message}: ${dump}`);
}
return affWord;
}
/* istanbul ignore next */
export function affWordToColoredString(affWord) {
return util
.inspect({ ...affWord, flags: flagsToString(affWord.flags) }, { showHidden: false, depth: 5, colors: true })
.replaceAll(/(\s|\n|\r)+/g, ' ');
}
/* istanbul ignore next */
export function flagsToString(flags) {
return [...Object.entries(flags)]
.filter(([, v]) => !!v)
.map(([k]) => flagToLongStringMap[k])
.sort()
.join(':');
}
export function asAffWord(word, rules = '', flags = {}) {
return {
word,
base: word,
prefix: '',
suffix: '',
rulesApplied: '',
rules,
flags,
dic: rules ? word + '/' + rules : word,
};
}
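// Example (illustrative): asAffWord('cat', 'S') ->
//   { word: 'cat', base: 'cat', prefix: '', suffix: '', rulesApplied: '',
//     rules: 'S', flags: {}, dic: 'cat/S' }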
export function compareAff(a, b) {
if (a.word !== b.word) {
return a.word < b.word ? -1 : 1;
}
const sigA = signature(a);
const sigB = signature(b);
return sigA < sigB ? -1 : sigA > sigB ? 1 : 0;
}
function reduceAffixRules(affWord, allRules) {
return allRules
.filter((rule) => !!rule.flags)
.reduce((acc, rule) => ({
rulesApplied: [acc.rulesApplied, rule.id].join(' '),
flags: { ...acc.flags, ...rule.flags },
}), { rulesApplied: affWord.rulesApplied, flags: affWord.flags });
}
/**
* Returns a filter function that removes adjacent duplicate AffWords.
* Two entries count as duplicates when both the word and the flag signature match.
*/
export function filterAff() {
return filterOrderedList((a, b) => a.word !== b.word || signature(a) !== signature(b));
}
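// Sketch of how compareAff and filterAff are meant to be used together (this mirrors
// applyRulesToDicEntry): sort first so equivalent entries become adjacent, then drop
// adjacent entries whose word and flag signature match.
//
//   const unique = affWords.sort(compareAff).filter(filterAff());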
export const debug = {
signature,
};
function removeNeedAffix(flags) {
const newFlags = { ...flags };
delete newFlags.isNeedAffix;
return newFlags;
}
function adjustCompounding(affWord, minLength) {
if (!affWord.flags.isCompoundPermitted || affWord.word.length >= minLength) {
return affWord;
}
const { isCompoundPermitted: _, ...flags } = affWord.flags;
affWord.flags = flags;
return affWord;
}
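// For example (illustrative), with a COMPOUNDMIN of 3 an AffWord for 'ab' that carries
// isCompoundPermitted loses that flag, while 'abc' keeps it unchanged.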
//# sourceMappingURL=affLegacy.js.map