hunspell-reader
Version:
A library for reading Hunspell Dictionary Files
379 lines • 13.7 kB
JavaScript
/* eslint-disable unicorn/text-encoding-identifier-case */
import assert from 'node:assert';
import { readFile } from 'node:fs/promises';
import { decode as decodeHtmlEntities } from 'html-entities';
import pkgIconvLite from 'iconv-lite';
import { Aff } from './aff.js';
import { Aff as AffLegacy } from './affLegacy.js';
import { cleanObject, insertItemIntoGroupByField, isDefined } from './util.js';
const { decode } = pkgIconvLite;
// Anchors added to a converted affix pattern: `m` is the position to match and
// `r` is the text inserted there. Suffix (SFX) patterns get a trailing `$`,
// prefix (PFX) patterns get a leading `^`.
const fixRegex = {
SFX: { m: /$/, r: '$' },
PFX: { m: /^/, r: '^' },
};
// Matches a `y` or `Y` anywhere in the tested text (cross-product field of an affix header).
const yesRegex = /[yY]/;
// Separator used to split the value part of a line into fields.
const spaceRegex = /\s+/;
// A comment: `#` at the start of the line (after optional whitespace), or a
// `#` preceded by whitespace, through to the end of the line.
const commentRegex = /(?:^\s*#.*)|(?:\s+#.*)/;
// Splits a line into the option name (group 1) and the rest of the line (group 2).
// At least one whitespace character must follow the option name for the line to match.
const affixLine = /^\s*([^\s]+)\s+(.*)?$/;
// Encoding used when the caller does not supply one.
const UTF8 = 'UTF-8';
/**
 * Collector for conversion tables (used for ICONV and OCONV).
 *
 * The first line received only initialises the table (its value is not read).
 * Every following line is split on whitespace and its first two fields are
 * stored as a `{ from, to }` pair.
 *
 * @returns {{ addLine: (line: { value?: string }) => void, getValue: () => ({ from: string, to: string }[] | undefined) }}
 */
function convEntry() {
    let pairs;
    const addLine = (line) => {
        if (pairs === undefined) {
            // Header line: start an empty table and ignore the rest of the line.
            pairs = [];
            return;
        }
        const [from, to] = (line.value || '').split(spaceRegex);
        pairs.push({ from, to });
    };
    const getValue = () => pairs;
    return { addLine, getValue };
}
/**
 * Collector for the AF (flag alias) table.
 *
 * The first line received initialises the list with a single empty string so
 * that the aliases that follow start at index 1. Later lines append their
 * value; lines without a value are skipped.
 *
 * @returns {{ addLine: (line: { value?: string }) => void, getValue: () => (string[] | undefined) }}
 */
function afEntry() {
    let aliases;
    const addLine = ({ value }) => {
        if (aliases !== undefined) {
            if (value) {
                aliases.push(value);
            }
            return;
        }
        // Placeholder at index 0 because rules start at 1.
        aliases = [''];
    };
    const getValue = () => aliases;
    return { addLine, getValue };
}
/**
 * Collector for simple "count header + rows" tables.
 *
 * The first line is the header: its first field is kept as `count` and any
 * further fields as `extra`. Each later line is split on whitespace and
 * stored as one row.
 *
 * @param {(rows: string[][]) => unknown} map - converts the collected rows into the final value.
 * @returns {{ addLine: (line: { value?: string }) => void, getValue: () => unknown }}
 *   `getValue` yields `undefined` until a header line has been seen.
 */
function simpleTable(map) {
    let table;
    const addLine = (line) => {
        const fields = (line.value || '').split(spaceRegex);
        if (table !== undefined) {
            table.values.push(fields);
            return;
        }
        const [count, ...rest] = fields;
        table = {
            count,
            extra: rest.length > 0 ? rest : undefined,
            values: [],
        };
    };
    const getValue = () => (table?.values ? map(table.values) : undefined);
    return { addLine, getValue };
}
/**
 * Adds one PFX/SFX line to the table of affix definitions, keyed by flag.
 *
 * A line whose flag is not yet in the table is treated as the header line
 * that creates the entry. Any later line for that flag is treated as a rule.
 *
 * Fields of an affix rule:
 *   (0) Option name
 *   (1) Flag
 *   (2) Characters stripped from the beginning (prefix rules) or end (suffix rules) of the word
 *   (3) Affix (optionally with flags of continuation classes, separated by a slash)
 *   (4) Condition
 *   (5) Optional morphological fields separated by spaces or tabs
 * A zero stands for "no stripping" / "no affix"; a dot stands for "no condition".
 * The condition is a simplified, regular-expression-like pattern that must match
 * before the affix can be applied: a dot is any character, brackets give a set
 * of characters, and a circumflex right after the opening bracket negates the
 * set. A dash has no special meaning.
 *
 * @param {Map<string, object> | undefined} fieldValue - table built so far; a new Map is created when undefined.
 * @param {{ option: string, value?: string }} line - the parsed line.
 * @returns {Map<string, object>} the table (the same Map instance when one was passed in).
 */
function tablePfxOrSfx(fieldValue, line) {
    const table = fieldValue === undefined ? new Map() : fieldValue;
    const flag = (line.value || '').split(spaceRegex)[0];

    if (!table.has(flag)) {
        // First line seen for this flag: it is the header that creates the entry.
        const created = parseAffixCreation(line);
        table.set(created.id, created);
        return table;
    }

    const parsed = parseAffixRule(line);
    if (!parsed) {
        console.log(`Affix rule missing values: ${line.option} ${line.value}`);
        return table;
    }

    const affixEntry = table.get(flag);
    assert(affixEntry);
    const { substitutionSets } = affixEntry;

    // Rules sharing the same condition are collected in one substitution set.
    const conditionKey = parsed.condition.source;
    if (!substitutionSets.has(conditionKey)) {
        substitutionSets.set(conditionKey, {
            match: parsed.condition,
            substitutions: [],
            substitutionsGroupedByRemove: new Map(),
        });
    }
    const targetSet = substitutionSets.get(conditionKey);
    assert(targetSet);

    const affixParts = parsed.affix.split('/', 2);
    const entry = {
        type: parsed.type === 'SFX' ? 'S' : 'P',
        remove: parsed.stripping,
        replace: parsed.replace,
        attach: affixParts[0],
        attachRules: affixParts[1],
        extra: parsed.extra,
    };
    targetSet.substitutions.push(entry);
    insertItemIntoGroupByField(targetSet.substitutionsGroupedByRemove, 'replace', entry);

    // Keep the array view in step with the map after every added rule.
    affixEntry.substitutionsForRegExps = [...substitutionSets.values()];
    return table;
}
/**
 * Parses the header line that introduces an affix class:
 * `PFX|SFX flag cross_product number`
 *
 * @param {{ option: string, value?: string }} line - the parsed line.
 * @returns {object} a new affix entry with empty substitution collections.
 *   `type` is `'SFX'` only when the option is exactly `'SFX'`, otherwise `'PFX'`.
 *   `combinable` is true when the cross-product field contains `y` or `Y`.
 */
function parseAffixCreation(line) {
    const fields = (line.value || '').split(spaceRegex);
    const [id, crossProduct, count] = fields;
    const extra = fields.slice(3);
    const type = line.option === 'SFX' ? 'SFX' : 'PFX';
    const combinable = yesRegex.test(crossProduct);
    return {
        id,
        type,
        combinable,
        count,
        extra,
        substitutionSets: new Map(),
        substitutionsForRegExps: [],
    };
}
// Splits a rule value into flag, stripping, affix and the optional remainder.
const affixRuleRegEx = /^(\S+)\s+(\S+)\s+(\S+)\s*(.*)/;
// Splits the optional remainder into the condition (which may contain bracket
// groups) and whatever follows it.
const affixRuleConditionRegEx = /^((?:\[.*\]|\S+)+)\s*(.*)/;
/**
 * Parses a single affix rule line:
 * `PFX|SFX flag stripping prefix [condition [morphological_fields...]]`
 *
 * @param {{ option: string, value?: string }} line - the parsed line.
 * @returns {object | undefined} the parsed rule, or `undefined` when the flag,
 *   stripping or affix field is missing. The condition defaults to `.` when the
 *   line does not provide one.
 */
function parseAffixRule(line) {
    const fields = (line.value || '').match(affixRuleRegEx) || [];
    const flag = fields[1];
    const strip = fields[2];
    const affix = fields[3];
    if (!(flag && strip && affix)) {
        return undefined;
    }
    const optional = fields[4] === undefined ? '' : fields[4];
    const conditionParts = optional.match(affixRuleConditionRegEx) || [];
    const rawCondition = conditionParts[1] === undefined ? '.' : conditionParts[1];
    const extra = conditionParts[2];
    const type = line.option === 'SFX' ? 'SFX' : 'PFX';
    const condition = fixMatch(type, rawCondition);
    const replace = fixMatch(type, strip);
    return {
        type,
        flag,
        stripping: strip,
        replace,
        affix: cleanAffixAttach(affix),
        condition,
        extra,
    };
}
/**
 * Normalises the affix field of a rule: a text part of `0` becomes the empty
 * string, and the continuation flags (the part after the first `/`) are
 * appended again only when they are non-empty.
 *
 * @param {string} affix - affix field, optionally followed by `/flags`.
 * @returns {string} the normalised affix field.
 */
function cleanAffixAttach(affix) {
    const parts = affix.split('/', 2);
    const text = parts[0] === '0' ? '' : parts[0];
    const flags = parts[1];
    if (!flags) {
        return text;
    }
    return `${text}/${flags}`;
}
// Compiled expressions keyed by `type:match`, so identical patterns share one RegExp instance.
const regexpCache = new Map();
/**
 * Turns an affix pattern into a RegExp anchored for the given affix type:
 * suffix patterns are anchored at the end, prefix patterns at the start.
 *
 * @param {'SFX' | 'PFX'} type - affix type, selects the anchor.
 * @param {string} match - the affix pattern from the file.
 * @returns {RegExp} the anchored expression; repeated calls with the same
 *   arguments return the same instance.
 */
function fixMatch(type, match) {
    const cacheKey = `${type}:${match}`;
    if (!regexpCache.has(cacheKey)) {
        const source = affixMatchToRegExpString(match);
        const anchor = fixRegex[type];
        regexpCache.set(cacheKey, new RegExp(source.replace(anchor.m, anchor.r)));
    }
    return regexpCache.get(cacheKey);
}
/**
 * Converts an affix pattern (stripping or condition field) into the source of
 * a regular expression.
 *
 * In an affix file only `.`, `[...]` and `[^...]` have a special meaning, so
 * those are passed through unchanged. Characters that would otherwise act as
 * regular-expression operators (`\ - ? * + ( ) { } | $`) are escaped so they
 * match literally and cannot make the pattern invalid.
 *
 * @param {string} match - the pattern as written in the affix file; `0` means "nothing".
 * @returns {string} regular-expression source; the empty string for `0`.
 */
function affixMatchToRegExpString(match) {
    if (match === '0') {
        return '';
    }
    return match.replaceAll(/([\\\-?*+(){}|$])/g, '\\$1');
}
/**
 * Collector for PFX / SFX lines. Every line is handed to `tablePfxOrSfx`
 * together with the table built so far, and the returned table is kept.
 *
 * @returns {{ addLine: (line: object) => void, getValue: () => unknown }}
 *   `getValue` yields `undefined` until a line has been added.
 */
function collectFx() {
    let table;
    return {
        addLine(line) {
            table = tablePfxOrSfx(table, line);
        },
        getValue() {
            return table;
        },
    };
}
// Prefix and suffix tables are collected the same way; both names alias `collectFx`.
const asPfx = collectFx;
const asSfx = collectFx;
/** Collector that stores the value as-is; a line without a value yields `''`. */
const asString = () => collectPrimitive((v) => v, '');
/**
 * Collector for on/off options. A line without a value counts as `1` (on);
 * otherwise the value is read as a decimal integer and any non-zero number is on.
 */
const asBoolean = () => collectPrimitive((v) => !!Number.parseInt(v, 10), '1');
/**
 * Collector for numeric options; a line without a value yields `0`.
 * The radix is fixed at 10 so a value such as `0x10` is not read as hexadecimal.
 */
const asNumber = () => collectPrimitive((v) => Number.parseInt(v, 10), '0');
/**
 * Collector for single-valued options: each added line replaces the stored
 * value with `map(line.value)`, so the last line wins.
 *
 * @param {(text: string) => unknown} map - converts the raw text into the stored value.
 * @param {string} [defaultValue=''] - text used when the line has no value.
 * @returns {{ addLine: (line: { value?: string }) => void, getValue: () => unknown }}
 *   `getValue` yields `undefined` until a line has been added.
 */
function collectPrimitive(map, defaultValue = '') {
    let current;
    return {
        addLine: (line) => {
            const raw = line.value === undefined ? defaultValue : line.value;
            current = map(raw);
        },
        getValue: () => current,
    };
}
/**
 * Converts table rows into replacement pairs: the first column becomes
 * `match`, the second becomes `replaceWith`.
 *
 * @param {string[][]} values - table rows.
 * @returns {{ match: string, replaceWith: string }[]}
 */
function toRep(values) {
    return values.map((row) => {
        const match = row[0];
        const replaceWith = row[1];
        return { match, replaceWith };
    });
}
/**
 * Takes the first column of every row and drops the entries that
 * `isDefined` rejects.
 *
 * @param {string[][]} values - table rows.
 * @returns {string[]} the first-column values that passed the filter.
 */
function toSingleStrings(values) {
    const firstColumn = values.map((row) => row[0]);
    return firstColumn.filter(isDefined);
}
/** Converts MAP table rows into a list holding the first column of each row. */
function toAffMap(values) {
return toSingleStrings(values);
}
/** Converts COMPOUNDRULE table rows into a list holding the first column of each row. */
function toCompoundRule(values) {
return toSingleStrings(values);
}
/** CHECKCOMPOUNDPATTERN rows are kept exactly as collected (one array of fields per row). */
function toCheckCompoundPattern(values) {
return values;
}
/*
cspell:ignore COMPOUNDBEGIN COMPOUNDEND COMPOUNDMIDDLE COMPOUNDMIN COMPOUNDPERMITFLAG COMPOUNDRULE COMPOUNDFORBIDFLAG COMPOUNDFLAG
cspell:ignore FORBIDDENWORD KEEPCASE
cspell:ignore MAXDIFF NEEDAFFIX WORDCHARS
*/
/**
 * Creates a fresh table that maps every supported option name to its
 * collector (an object with `addLine` and `getValue`). Each call creates new
 * collectors, so separate calls do not share collected state.
 * Lines whose option name has no entry here are not collected.
 */
// prettier-ignore
const createAffFieldTable = () => ({
AF: afEntry(),
BREAK: simpleTable(toSingleStrings),
CHECKCOMPOUNDCASE: asBoolean(),
CHECKCOMPOUNDDUP: asBoolean(),
CHECKCOMPOUNDPATTERN: simpleTable(toCheckCompoundPattern),
CHECKCOMPOUNDREP: asBoolean(),
COMPOUNDBEGIN: asString(),
COMPOUNDEND: asString(),
COMPOUNDMIDDLE: asString(),
COMPOUNDMIN: asNumber(),
COMPOUNDFLAG: asString(),
COMPOUNDPERMITFLAG: asString(),
COMPOUNDFORBIDFLAG: asString(),
COMPOUNDRULE: simpleTable(toCompoundRule),
FLAG: asString(), // 'long' | 'num'
FORBIDDENWORD: asString(),
FORCEUCASE: asString(),
ICONV: convEntry(),
KEEPCASE: asString(),
KEY: asString(),
MAP: simpleTable(toAffMap),
MAXCPDSUGS: asNumber(),
MAXDIFF: asNumber(),
NEEDAFFIX: asString(),
NOSPLITSUGS: asBoolean(),
NOSUGGEST: asString(),
OCONV: convEntry(),
ONLYINCOMPOUND: asString(),
ONLYMAXDIFF: asBoolean(),
PFX: asPfx(),
REP: simpleTable(toRep),
SET: asString(),
SFX: asSfx(),
TRY: asString(),
WARN: asString(),
WORDCHARS: asString(),
});
/**
 * Reads the collected value of every known option out of the collector table
 * and returns the result after passing it through `cleanObject`.
 *
 * @param {Record<string, { getValue: () => unknown }>} affFieldCollectionTable - table of collectors.
 * @param {string} encoding - used for `SET` when the collector holds no (or an empty) value.
 * @returns {object} whatever `cleanObject` returns for the assembled values.
 */
function collectionToAffInfo(affFieldCollectionTable, encoding) {
    // The order below is the key order of the resulting object.
    // prettier-ignore
    const fieldNames = [
        'AF', 'BREAK', 'CHECKCOMPOUNDCASE', 'CHECKCOMPOUNDDUP', 'CHECKCOMPOUNDPATTERN', 'CHECKCOMPOUNDREP',
        'COMPOUNDBEGIN', 'COMPOUNDEND', 'COMPOUNDMIDDLE', 'COMPOUNDMIN', 'COMPOUNDFLAG', 'COMPOUNDPERMITFLAG',
        'COMPOUNDFORBIDFLAG', 'COMPOUNDRULE', 'FLAG', 'FORBIDDENWORD', 'FORCEUCASE', 'ICONV',
        'KEEPCASE', 'KEY', 'MAP', 'MAXCPDSUGS', 'MAXDIFF', 'NEEDAFFIX',
        'NOSPLITSUGS', 'NOSUGGEST', 'OCONV', 'ONLYINCOMPOUND', 'ONLYMAXDIFF', 'PFX',
        'REP', 'SET', 'SFX', 'TRY', 'WARN', 'WORDCHARS',
    ];
    const result = {};
    for (const name of fieldNames) {
        const collected = affFieldCollectionTable[name].getValue();
        // Only SET falls back to the encoding passed by the caller.
        result[name] = name === 'SET' ? collected || encoding : collected;
    }
    return cleanObject(result);
}
// Module-level state used only for the HTML-entity diagnostics:
// the number of lines in which entities were replaced (reset by `parseAff`) ...
let htmlEntitiesFound = 0;
// ... and the name of the file most recently read by `parseAffFile`, shown in the message.
let currentAffFilename = '';
/**
 * Reads an affix file from disk and parses it.
 *
 * The content is first decoded with `encoding`. When the parsed result names
 * a different encoding in `SET` (compared case-insensitively), the same bytes
 * are decoded again with that encoding and parsed a second time.
 *
 * @param {string} filename - path of the affix file.
 * @param {string} [encoding] - encoding for the first pass; defaults to UTF-8.
 * @returns {Promise<object>} the parsed affix information.
 */
export async function parseAffFile(filename, encoding = UTF8) {
    const raw = await readFile(filename);
    // Remember the file name for diagnostics emitted while parsing.
    currentAffFilename = filename;
    const firstPass = parseAff(decode(raw, encoding), encoding);
    const declared = firstPass.SET;
    if (!declared || declared.toLowerCase() === encoding.toLowerCase()) {
        return firstPass;
    }
    const redecoded = decode(raw, declared.toLowerCase());
    return parseAff(redecoded, declared);
}
/**
 * Replaces HTML entities in a line with the characters they stand for.
 *
 * Lines without `&` are returned untouched. When decoding changes the line,
 * the first ten occurrences are reported in full on stderr, the eleventh
 * prints a short notice, and later ones are silent.
 *
 * @param {string} line - the line to convert.
 * @param {number} index - zero-based line index, reported as `index + 1`.
 * @returns {string} the decoded line.
 */
function convertHtmlEntities(line, index) {
    if (!line.includes('&')) {
        return line;
    }
    const fixed = decodeHtmlEntities(line);
    if (fixed === line) {
        return fixed;
    }
    if (htmlEntitiesFound < 10) {
        console.error('HTML Entities found in aff file at line %s:%i\n\t%o replaced with:\n\t%o', currentAffFilename, index + 1, line, fixed);
    } else if (htmlEntitiesFound === 10) {
        console.error('HTML Entities found in aff...');
    }
    htmlEntitiesFound += 1;
    return fixed;
}
/**
 * Parses the text of an affix file.
 *
 * Each line has its leading whitespace and comments removed and its HTML
 * entities decoded; blank lines are dropped. The remaining lines are split
 * into option and value and handed to the collector registered for that option.
 * Lines with an option that has no collector are ignored.
 *
 * @param {string} affFileContent - the decoded file content.
 * @param {string} [encoding] - encoding recorded as `SET` unless the file overrides it; defaults to UTF-8.
 * @returns {object} the collected affix information.
 */
export function parseAff(affFileContent, encoding = UTF8) {
    htmlEntitiesFound = 0;
    const affFieldCollectionTable = createAffFieldTable();
    // Seed SET with the caller's encoding; a SET line in the file replaces it.
    affFieldCollectionTable.SET.addLine({ option: 'SET', value: encoding });
    const parsedLines = affFileContent
        .split(/\r?\n/g)
        .map((line) => line.trimStart())
        .map((line) => line.replace(commentRegex, ''))
        .map(convertHtmlEntities)
        .filter((line) => line.trim() !== '')
        .map(parseLine);
    for (const line of parsedLines) {
        // Only dispatch to collectors that are own entries of the table. An option
        // named like an inherited member (`toString`, `constructor`, ...) would
        // otherwise resolve to a value without `addLine` and throw.
        if (!Object.hasOwn(affFieldCollectionTable, line.option)) {
            continue;
        }
        affFieldCollectionTable[line.option].addLine(line);
    }
    return collectionToAffInfo(affFieldCollectionTable, encoding);
}
/**
 * Reads and parses an affix file and wraps the result in an `Aff` instance.
 *
 * @param {string} filename - path of the affix file.
 * @param {string} [encoding] - passed through to `parseAffFile`.
 * @returns {Promise<Aff>} resolves with the `Aff` built from the parsed information and the file name.
 */
export async function parseAffFileToAff(filename, encoding) {
    const affInfo = await parseAffFile(filename, encoding);
    return new Aff(affInfo, filename);
}
/**
 * Reads and parses an affix file and wraps the result in an `AffLegacy` instance.
 *
 * @param {string} filename - path of the affix file.
 * @param {string} [encoding] - passed through to `parseAffFile`.
 * @returns {Promise<AffLegacy>} resolves with the `AffLegacy` built from the parsed information and the file name.
 */
export async function parseAffFileToAffLegacy(filename, encoding) {
    const affInfo = await parseAffFile(filename, encoding);
    return new AffLegacy(affInfo, filename);
}
/**
 * Splits a cleaned-up line into its option name and value.
 *
 * `affixLine` only matches when whitespace follows the option name. A line
 * that consists of a single token (for example a bare flag such as
 * `CHECKCOMPOUNDDUP`) is therefore handled separately and returned as an
 * option without a value, instead of being lost.
 *
 * @param {string} line - one line, without comments.
 * @returns {{ option: string, value: string | undefined }} `option` is the
 *   empty string when the line cannot be interpreted; `value` is `undefined`
 *   when the line has no (or an empty) value.
 */
function parseLine(line) {
    const matched = line.match(affixLine);
    if (matched) {
        const [, option, value] = matched;
        return { option, value: value || undefined };
    }
    // No match: accept the line as a bare option only when it is a single token.
    const bare = line.trim();
    const option = /\s/.test(bare) ? '' : bare;
    return { option, value: undefined };
}
// Internal helpers exported under a single `testing` object.
export const testing = {
parseAffixRule,
tablePfxOrSfx,
parseLine,
};
//# sourceMappingURL=affReader.js.map