hunspell-reader
Version:
A library for reading Hunspell Dictionary Files
172 lines • 6.98 kB
JavaScript
// cSpell:ignore findup
import { closeSync, createWriteStream, openSync, writeSync } from 'node:fs';
import { Command } from 'commander';
import { genSequence } from 'gensequence';
import { asAffWord } from './affLegacy.js';
import { IterableHunspellReaderLegacy } from './IterableHunspellReaderLegacy.js';
import { iterableToStream } from './iterableToStream.js';
import { batch, uniqueFilter } from './util.js';
// Argument handed to `uniqueFilter` when the --unique option is set.
// NOTE(review): presumably the number of recent words the filter remembers — confirm against util.js.
const uniqueHistorySize = 500_000;
// Stream that `notify` writes status messages to. Starts as stderr and is
// reassigned by `actionPrime` depending on whether an output file was given.
let logStream = process.stderr;
/**
 * Build the `words` sub-command.
 *
 * The command takes one positional argument (`<hunspell_dic_file>`), registers
 * the options listed below in order, and runs `action` when invoked.
 *
 * @returns {Command} the configured commander command.
 */
export function getCommand() {
    // [flags, description] pairs, registered in exactly this order.
    const optionSpecs = [
        ['-o, --output <file>', 'output file - defaults to stdout'],
        ['-s, --sort', 'sort the list of words'],
        ['-u, --unique', 'make sure the words are unique.'],
        ['-l, --lower_case', 'output in lower case'],
        ['-T, --no-transform', 'Do not apply the prefix and suffix transforms. Root words only.'],
        ['-x, --infix', 'Return words with prefix / suffix breaks. ex: "un<do>ing"'],
        ['-r, --rules', 'Append rules used to generate word.'],
        ['-p, --progress', 'Show progress.'],
        ['-m, --max_depth <limit>', 'Maximum depth to apply suffix rules.'],
        ['-n, --number <limit>', 'Limit the number of words to output.'],
        ['--forbidden', 'include forbidden words'],
        ['--partial_compounds', 'include words that must be part of a compound word'],
        ['--only_forbidden', 'includes only words that are forbidden'],
    ];
    const commander = new Command('words');
    commander.arguments('<hunspell_dic_file>');
    for (const [flags, description] of optionSpecs) {
        commander.option(flags, description);
    }
    commander.description('Output all the words in the <hunspell.dic> file.');
    commander.action(action);
    return commander;
}
/**
 * Write a status message to the current log stream.
 *
 * @param {string} message - text to write.
 * @param {boolean} [newLine=true] - when true, a trailing `\n` is appended.
 */
function notify(message, newLine = true) {
    const terminator = newLine ? '\n' : '';
    logStream.write(message + terminator, 'utf8');
}
/**
 * Render a truthy/falsy value as a human readable answer.
 *
 * @param {unknown} value - value to test for truthiness.
 * @returns {'Yes' | 'No'} `'Yes'` when `value` is truthy, otherwise `'No'`.
 */
function yesNo(value) {
    if (value) {
        return 'Yes';
    }
    return 'No';
}
/**
 * Return a copy of `aff` whose `word` shows the prefix / suffix breaks,
 * in the form `prefix<base>suffix` (for example `un<do>ing`).
 *
 * @param {object} aff - entry with `prefix`, `base` and `suffix` properties.
 * @returns {object} shallow copy of `aff` with `word` replaced.
 */
function affWordToInfix(aff) {
    const { prefix, base, suffix } = aff;
    const word = prefix + '<' + base + '>' + suffix;
    return { ...aff, word };
}
/**
 * Lift a string mapper into a transformer over entries.
 *
 * @param {(word: string) => string} map - function applied to the entry's `word`.
 * @returns {(aff: object) => object} function returning a shallow copy of its
 *   argument with `word` replaced by `map(aff.word)`.
 */
function mapWord(map) {
    const transformEntry = (aff) => {
        const copy = { ...aff };
        copy.word = map(aff.word);
        return copy;
    };
    return transformEntry;
}
/**
 * Return a copy of `aff` whose `word` has the applied rules and the
 * dictionary entry appended, tab separated:
 * `word<TAB>[rulesApplied ]<TAB>(dic)`.
 *
 * @param {object} aff - entry with `word`, `rulesApplied` and `dic` properties.
 * @returns {object} shallow copy of `aff` with the annotated `word`.
 */
function appendRules(aff) {
    const annotated = `${aff.word}\t[${aff.rulesApplied} ]\t(${aff.dic})`;
    return { ...aff, word: annotated };
}
/**
 * Stream the strings produced by `seq` to `outFile`, or to `process.stdout`
 * when `outFile` is falsy.
 *
 * @param seq - iterable of strings to write.
 * @param outFile - path of the file to create; falsy means stdout.
 * @returns a Promise that resolves shortly after the first completion event
 *   ('finish', 'close' or 'end' on the destination, or 'end' on the data
 *   stream) and rejects when the destination stream emits 'error'.
 */
function writeSeqToFile(seq, outFile) {
return new Promise((resolve, reject) => {
// Guards against resolving more than once; several completion events may fire.
let resolved = false;
const out = outFile ? createWriteStream(outFile) : process.stdout;
// Group the sequence with `batch(seq, 500)` and join each group into a single string.
// NOTE(review): presumably batches of 500 items to reduce the number of stream chunks — confirm against util.js.
const bufferedSeq = genSequence(batch(seq, 500)).map((batch) => batch.join(''));
const dataStream = iterableToStream(bufferedSeq);
// `pipe` returns its destination, so `fileStream` refers to `out`.
const fileStream = dataStream.pipe(out);
const endEvents = ['finish', 'close', 'end'];
function resolvePromise() {
if (!resolved) {
resolved = true;
resolve();
}
}
const endHandler = () => {
cleanupStreams();
// Resolve after a 10 ms delay rather than immediately.
// NOTE(review): presumably to give pending output a moment to flush — confirm.
setTimeout(resolvePromise, 10);
};
const errorHandler = (e) => {
cleanupStreams();
reject(e);
};
// `listenToStreams` is a function declaration, so it is hoisted and callable here.
listenToStreams();
function listenToStreams() {
endEvents.forEach((event) => fileStream.addListener(event, endHandler));
fileStream.addListener('error', errorHandler);
// Only 'end' is observed on the data stream; no 'error' listener is attached to it here.
dataStream.addListener('end', endHandler);
}
// Detach every listener added by `listenToStreams`.
function cleanupStreams() {
endEvents.forEach((event) => fileStream.removeListener(event, endHandler));
fileStream.removeListener('error', errorHandler);
dataStream.removeListener('end', endHandler);
}
});
}
/**
 * Command entry point: run `actionPrime` and tolerate a closed output pipe.
 *
 * An error whose `code` is `'EPIPE'` is printed with `console.log` and
 * swallowed; every other error is rethrown unchanged.
 *
 * @param {string} hunspellDicFilename - path given on the command line.
 * @param {object} options - parsed command options.
 * @returns {Promise<void>}
 */
async function action(hunspellDicFilename, options) {
    try {
        await actionPrime(hunspellDicFilename, options);
    }
    catch (err) {
        const reason = asError(err);
        const isBrokenPipe = reason?.code === 'EPIPE';
        if (!isBrokenPipe) {
            throw err;
        }
        console.log(reason);
    }
}
/**
 * Narrow a caught value to an object so its properties can be inspected.
 *
 * @param {unknown} err - the caught value.
 * @returns {object | undefined} `err` itself when it is a non-null object
 *   (`typeof err === 'object'`), otherwise `undefined`.
 */
function asError(err) {
    if (err === null || typeof err !== 'object') {
        return undefined;
    }
    return err;
}
/**
 * Generate the words of a Hunspell dictionary and write them out.
 *
 * Steps:
 *  1. Derive the `.dic` / `.aff` file names from `hunspellDicFilename`
 *     (a trailing `.dic` or `.aff` extension is stripped first).
 *  2. Create the reader and, when `max_depth` parses to a non-negative
 *     integer, set `reader.maxDepth`.
 *  3. Build the filter and transformer pipelines from the options.
 *  4. Write the resulting words (one per line) either sorted in one
 *     synchronous write, or streamed via `writeSeqToFile`.
 *
 * @param {string} hunspellDicFilename - path to the `.dic` or `.aff` file (or their common base name).
 * @param {object} options - parsed command options (`sort`, `unique`, `output`,
 *   `lower_case`, `transform`, `infix`, `rules`, `progress`, `max_depth`,
 *   `number`, `forbidden`, `only_forbidden`, `partial_compounds`).
 * @returns {Promise<void>} resolves once all words have been written.
 */
async function actionPrime(hunspellDicFilename, options) {
    const {
        sort = false,
        unique = false,
        output: outputFile,
        lower_case: lowerCase = false,
        transform = true,
        infix = false,
        rules = false,
        progress: showProgress = false,
        max_depth,
        forbidden = false,
        only_forbidden: onlyForbidden = false,
        partial_compounds: partialCompoundsAllowed = false,
    } = options;
    // When the words go to a file, status messages use stdout; otherwise stdout
    // carries the words and status messages go to stderr.
    logStream = outputFile ? process.stdout : process.stderr;
    const log = notify;
    log('Write words');
    log(`Sort: ${yesNo(sort)}`);
    log(`Unique: ${yesNo(unique)}`);
    const baseFile = hunspellDicFilename.replace(/\.(dic|aff)$/, '');
    const dicFile = baseFile + '.dic';
    const affFile = baseFile + '.aff';
    log(`Dic file: ${dicFile}`);
    log(`Aff file: ${affFile}`);
    log(`Generating Words...`);
    const reader = await IterableHunspellReaderLegacy.createFromFiles(affFile, dicFile);
    // Parse once, always in base 10; NaN fails the `>= 0` check and leaves the reader default.
    const maxDepth = max_depth ? Number.parseInt(max_depth, 10) : Number.NaN;
    if (maxDepth >= 0) {
        reader.maxDepth = maxDepth;
    }
    const transformers = [];
    const filters = [];
    // Forbidden words are excluded unless --forbidden or --only_forbidden is given.
    if (!forbidden && !onlyForbidden) {
        filters.push((aff) => !aff.flags.isForbiddenWord);
    }
    if (onlyForbidden) {
        filters.push((aff) => !!aff.flags.isForbiddenWord);
    }
    if (!partialCompoundsAllowed) {
        filters.push((aff) => !aff.flags.isOnlyAllowedInCompound);
    }
    // Transformer order matters: infix markup, then lower-casing, then rule annotation, then trim.
    if (infix) {
        transformers.push(affWordToInfix);
    }
    if (lowerCase) {
        transformers.push(mapWord((a) => a.toLowerCase()));
    }
    if (rules) {
        transformers.push(appendRules);
    }
    transformers.push(mapWord((a) => a.trim()));
    const dicSize = reader.dic.length;
    let current = 0;
    // The leading '\r' rewrites the same terminal line on each report.
    const calcProgress = () => '\r' + current + ' / ' + dicSize;
    const reportProgressRate = 253;
    const callback = showProgress
        ? () => {
            current++;
            // Only report every `reportProgressRate` calls.
            if (current % reportProgressRate === 0) {
                process.stderr.write(calcProgress(), 'utf8');
            }
        }
        : () => {
            /* void */
        };
    const seqWords = transform ? reader.seqAffWords(callback) : reader.seqRootWords().map(asAffWord);
    const filterUnique = unique ? uniqueFilter(uniqueHistorySize) : (_) => true;
    const applyTransformers = (aff) => transformers.reduce((acc, fn) => fn(acc), aff);
    const applyFilters = (aff) => filters.every((fn) => fn(aff));
    const allWords = seqWords
        .filter(applyFilters)
        .map(applyTransformers)
        .map((a) => a.word)
        .filter((a) => !!a)
        .filter(filterUnique)
        .map((a) => a + '\n');
    const words = options.number ? allWords.take(Number.parseInt(options.number, 10)) : allWords;
    if (sort) {
        log('Sorting...');
        const data = words.toArray().sort().join('');
        if (outputFile) {
            const fd = openSync(outputFile, 'w');
            try {
                writeSync(fd, data);
            }
            finally {
                // Release the descriptor even when the write throws.
                closeSync(fd);
            }
        }
        else {
            // File descriptor 1 is stdout.
            writeSync(1, data);
        }
    }
    else {
        await writeSeqToFile(words, outputFile);
    }
    if (showProgress) {
        console.error(calcProgress());
    }
    log('Done.');
}
//# sourceMappingURL=commandWords.js.map