@cspell/cspell-tools
Version:
Tools to assist with the development of cSpell
184 lines • 7.3 kB
JavaScript
import { opCombine, opCombine as opPipe, opFilter, opMap } from '@cspell/cspell-pipe/sync';
import { createDictionaryLineParser } from 'cspell-trie-lib';
import { uniqueFilter } from 'hunspell-reader';
import { defaultCompileSourceOptions } from '../config/configDefaults.js';
import { legacyLineToWords } from './legacyLineToWords.js';
import { splitCamelCaseIfAllowed } from './splitCamelCaseIfAllowed.js';
/**
 * Builds the operator that post-processes words before they are written to a target.
 *
 * Stages, in order:
 *  1. drop falsy entries,
 *  2. run each entry through the dictionary line parser,
 *  3. (only when `options.sort` is truthy) sort in chunks of 10,000 entries,
 *  4. apply `uniqueFilter(10_000)` — presumably a bounded de-duplication window; confirm in `hunspell-reader`,
 *  5. (only when `options.filter` is truthy) keep the entries accepted by `options.filter`.
 *
 * @param options - reads `generateNonStrict`, `sort` and `filter`.
 * @returns the single operator produced by `opCombine` from the stages above.
 */
export function normalizeTargetWords(options) {
    const { generateNonStrict, sort, filter } = options;
    const parseLine = createDictionaryLineParser({
        stripCaseAndAccents: generateNonStrict,
        stripCaseAndAccentsOnForbidden: true,
        keepOptionalCompoundCharacter: true,
    });
    const stages = [opFilter((entry) => Boolean(entry)), parseLine];
    if (sort) {
        stages.push(createInlineBufferedSort(10_000));
    }
    stages.push(opFilter(uniqueFilter(10_000)));
    if (filter) {
        stages.push(opFilter(filter));
    }
    // Only defined stages are handed to opCombine.
    return opCombine(...stages.filter(isDefined));
}
/**
 * Predicate used to strip `undefined` placeholders from a list.
 * @param v - any value.
 * @returns `false` only for `undefined`; `null`, `0`, `''` and `false` all count as defined.
 */
function isDefined(v) {
    return typeof v !== 'undefined';
}
/**
 * Creates a generator function that sorts a stream chunk by chunk.
 *
 * Entries are collected until `bufferSize` of them are pending; that chunk is then sorted with
 * the default `Array.prototype.sort` ordering and emitted. Whatever is left when the input ends
 * is sorted and emitted as the last chunk. Ordering is therefore only guaranteed inside a
 * chunk, not across the whole stream.
 *
 * @param bufferSize - number of entries per sorted chunk (default 1000).
 * @returns a generator function taking an iterable and yielding its entries chunk-sorted.
 */
function createInlineBufferedSort(bufferSize = 1000) {
    function* inlineBufferedSort(lines) {
        let pending = [];
        for (const line of lines) {
            pending.push(line);
            if (pending.length < bufferSize) {
                continue;
            }
            // Chunk is full: emit it sorted and start a new one.
            yield* pending.sort();
            pending = [];
        }
        // Flush the final, possibly partial, chunk.
        yield* pending.sort();
    }
    return inlineBufferedSort;
}
// Marks the start of a comment: `removeComments` drops everything from this character to the end of the line.
const commentCharacter = '#';
// Fallback values used by `createParseFileLineMapper` when no options object is given,
// or when an individual option is `undefined`.
const _defaultOptions = {
keepCase: true,
legacy: false,
split: false,
splitKeepBoth: false,
// splitSeparator: regExpSplit,
// Placeholder collection: `has` accepts every word, and `size` is 0, which `splitWords`
// reads as "do not camel-case split the words of a split line".
allowedSplitWords: { has: () => true, size: 0 },
storeSplitWordsAsCompounds: defaultCompileSourceOptions.storeSplitWordsAsCompounds,
minCompoundLength: defaultCompileSourceOptions.minCompoundLength,
};
// `Object.freeze` freezes in place and returns the same object, so `_defaultOptions` itself is frozen too.
export const defaultParseDictionaryOptions = Object.freeze(_defaultOptions);
// Prefix searched for inside a `#` comment; the flags after it change the parsing mode.
export const cSpellToolDirective = 'cspell-tools:';
// The directive flags handled by `createParseFileLineMapper`. Despite the name this is an array, not a `Set`.
export const setOfCSpellDirectiveFlags = ['no-split', 'split', 'keep-case', 'no-keep-case', 'legacy'];
/**
 * Creates a mapper that turns raw word-list text into cleaned, de-duplicated entries.
 *
 * The mapper is a pipeline with these stages, in order:
 *  1. drop inputs that are not strings,
 *  2. split every input on `\n`,
 *  3. strip `#` comments, applying any `cspell-tools:` directive flags found inside them,
 *  4. expand each line into words according to the current mode (legacy / split / as-is),
 *  5. trim the results,
 *  6. drop empty results,
 *  7. suppress entries that were already emitted.
 *
 * The `split`, `keepCase` and `legacy` modes are mutable state owned by the returned mapper and
 * are changed by directive flags (`split`, `no-split`, `keep-case`, `no-keep-case`, `legacy`).
 * NOTE(review): a directive taking effect on its own line and the lines after it relies on the
 * pipe operators evaluating lazily — confirm against `@cspell/cspell-pipe`.
 *
 * @param options - parsing options; the module defaults are used when this is falsy, and for
 *   `legacy`, `split`, `keepCase`, `splitKeepBoth`, `allowedSplitWords` and `minCompoundLength`
 *   when the individual value is `undefined`. `keepCase` defaults to `false` in legacy mode.
 * @returns the operator built by `opPipe`; `parseFileLines` calls it with an iterable of strings.
 */
export function createParseFileLineMapper(options) {
    const settings = options || _defaultOptions;
    const {
        splitKeepBoth = _defaultOptions.splitKeepBoth,
        allowedSplitWords = _defaultOptions.allowedSplitWords,
        storeSplitWordsAsCompounds,
        minCompoundLength = _defaultOptions.minCompoundLength,
    } = settings;
    // Mutable parsing modes; directive flags update them while lines are being processed.
    let { legacy = _defaultOptions.legacy } = settings;
    let { split = _defaultOptions.split, keepCase = legacy ? false : _defaultOptions.keepCase } = settings;
    const compoundFix = storeSplitWordsAsCompounds ? '+' : '';

    // One handler per directive flag. Tokens without a handler are ignored.
    const directiveHandlers = new Map([
        [
            'split',
            () => {
                split = true;
            },
        ],
        [
            'no-split',
            () => {
                split = false;
            },
        ],
        [
            'keep-case',
            () => {
                keepCase = true;
                legacy = false;
            },
        ],
        [
            'no-keep-case',
            () => {
                keepCase = false;
            },
        ],
        [
            'legacy',
            () => {
                keepCase = false;
                legacy = true;
            },
        ],
    ]);

    /** Removes a trailing `#` comment, first applying any directive flags the comment carries. */
    function removeComments(line) {
        const commentStart = line.indexOf(commentCharacter);
        if (commentStart < 0) {
            return line;
        }
        const directiveStart = line.indexOf(cSpellToolDirective, commentStart);
        if (directiveStart >= 0) {
            const tokens = line.slice(directiveStart).split(/[\s,;]/g);
            for (const token of tokens) {
                // Empty tokens and unknown words (including the directive prefix) have no handler.
                directiveHandlers.get(token.trim())?.();
            }
        }
        return line.slice(0, commentStart).trim();
    }

    const regNonWordOrDigit = /[^\p{L}\p{M}'\w-]+/giu;

    /** Breaks a line into word candidates, discarding separators, pure numbers and hex/octal literals. */
    function splitLine(line) {
        const separated = line
            .replace(/#.*/, '') // remove comment
            .trim()
            .replaceAll(/\bU\+[0-9A-F]{4}\b/gi, '|') // Remove Unicode Definitions
            .replaceAll(/\\U[0-9A-F]{4}/gi, '|') // Remove Unicode Definitions
            .replaceAll(regNonWordOrDigit, '|')
            .replaceAll(/'(?=\|)/g, '') // remove trailing '
            .replace(/'$/, '') // remove trailing '
            .replaceAll(/(?<=\|)'/g, '') // remove leading '
            .replace(/^'/, '') // remove leading '
            .replaceAll(/\s*\|\s*/g, '|') // remove spaces around |
            .replaceAll(/[|]+/g, '|') // reduce repeated |
            .replace(/^\|/, '') // remove leading |
            .replace(/\|$/, ''); // remove trailing |
        return separated
            .split('|')
            .map((part) => part.trim())
            .filter(
                (part) =>
                    part !== '' &&
                    !/^[0-9_-]+$/.test(part) && // pure numbers and symbols
                    !/^0[xo][0-9A-F]+$/i.test(part), // c-style hex/octal digits
            );
    }

    /** Expands each line into words according to the mode in effect when the line is reached. */
    function* splitWords(lines) {
        for (const line of lines) {
            if (legacy) {
                yield* legacyLineToWords(line, keepCase, allowedSplitWords);
            } else if (!split) {
                yield line.replaceAll(/["]/g, '');
            } else {
                const parts = splitLine(line);
                // Camel-case splitting only happens when the allowed-word collection reports a size.
                const expanded = allowedSplitWords.size
                    ? parts.flatMap((word) =>
                          splitCamelCaseIfAllowed(word, allowedSplitWords, keepCase, compoundFix, minCompoundLength),
                      )
                    : parts;
                yield* expanded;
                if (splitKeepBoth) {
                    yield line.replaceAll(/["]/g, '');
                }
            }
        }
    }

    /** Emits every distinct entry once, in first-seen order. */
    function* unique(lines) {
        const seen = new Set();
        for (const line of lines) {
            if (!seen.has(line)) {
                seen.add(line);
                yield line;
            }
        }
    }

    /** Flattens multi-line strings into individual lines. */
    function* splitLines(paragraphs) {
        for (const paragraph of paragraphs) {
            yield* paragraph.split('\n');
        }
    }

    return opPipe(
        opFilter((line) => typeof line === 'string'),
        splitLines,
        opMap(removeComments),
        splitWords,
        opMap((line) => line.trim()),
        opFilter((line) => !!line),
        unique,
    );
}
/**
 * Parses word-list text with a freshly created line mapper.
 *
 * @param lines - either a single string (wrapped into a one-element array) or a collection of
 *   strings passed through unchanged; strings may contain several `\n`-separated lines.
 * @param options - forwarded as-is to `createParseFileLineMapper`.
 * @returns whatever the mapper returns for the given input.
 */
export function parseFileLines(lines, options) {
    const mapLines = createParseFileLineMapper(options);
    const input = typeof lines === 'string' ? [lines] : lines;
    return mapLines(input);
}
//# sourceMappingURL=wordListParser.js.map