UNPKG

@cspell/cspell-tools

Version:
184 lines 7.3 kB
import { opCombine, opCombine as opPipe, opFilter, opMap } from '@cspell/cspell-pipe/sync';
import { createDictionaryLineParser } from 'cspell-trie-lib';
import { uniqueFilter } from 'hunspell-reader';
import { defaultCompileSourceOptions } from '../config/configDefaults.js';
import { legacyLineToWords } from './legacyLineToWords.js';
import { splitCamelCaseIfAllowed } from './splitCamelCaseIfAllowed.js';

/**
 * Builds the operator applied to words just before they are written to a target.
 *
 * The operator is the composition, in order, of:
 *  1. a filter dropping falsy entries,
 *  2. the dictionary line parser from `cspell-trie-lib`,
 *  3. an optional chunked sort (only when `options.sort` is truthy),
 *  4. a `uniqueFilter(10_000)` de-duplication filter from `hunspell-reader`,
 *  5. an optional caller supplied filter (only when `options.filter` is truthy).
 *
 * @param options - reads `generateNonStrict` (forwarded as `stripCaseAndAccents`),
 *   `sort` and `filter`.
 * @returns the combined operator.
 */
export function normalizeTargetWords(options) {
    const lineParser = createDictionaryLineParser({
        stripCaseAndAccents: options.generateNonStrict,
        stripCaseAndAccentsOnForbidden: true,
        keepOptionalCompoundCharacter: true,
    });
    const operations = [
        opFilter((a) => !!a),
        lineParser,
        options.sort ? createInlineBufferedSort(10_000) : undefined,
        // NOTE(review): presumably 10_000 bounds the de-duplication history - confirm against hunspell-reader.
        opFilter(uniqueFilter(10_000)),
        options.filter ? opFilter(options.filter) : undefined,
    ].filter(isDefined);
    return opCombine(...operations);
}

/**
 * Predicate used to strip the `undefined` placeholders left by disabled pipeline stages.
 * @param v - any value.
 * @returns `true` unless `v` is exactly `undefined`.
 */
function isDefined(v) {
    return v !== undefined;
}

/**
 * Creates a generator function that sorts its input in chunks.
 *
 * Lines are collected until `bufferSize` of them are buffered; the buffer is then sorted
 * with the default `Array.prototype.sort` ordering, emitted and emptied. The remaining
 * lines are sorted and emitted at the end. The output is therefore sorted within each
 * chunk only, not across the whole input, unless the input fits into a single chunk.
 *
 * @param bufferSize - number of lines per sorted chunk (default 1000).
 * @returns a generator function taking an iterable of lines.
 */
function createInlineBufferedSort(bufferSize = 1000) {
    function* inlineBufferedSort(lines) {
        const buffer = [];
        for (const line of lines) {
            buffer.push(line);
            if (buffer.length >= bufferSize) {
                buffer.sort();
                yield* buffer;
                buffer.length = 0;
            }
        }
        buffer.sort();
        yield* buffer;
    }
    return inlineBufferedSort;
}

/** Everything from the first occurrence of this character to the end of a line is a comment. */
const commentCharacter = '#';

const _defaultOptions = {
    keepCase: true,
    legacy: false,
    split: false,
    splitKeepBoth: false,
    // splitSeparator: regExpSplit,
    allowedSplitWords: { has: () => true, size: 0 },
    storeSplitWordsAsCompounds: defaultCompileSourceOptions.storeSplitWordsAsCompounds,
    minCompoundLength: defaultCompileSourceOptions.minCompoundLength,
};

/** Default parsing options. `Object.freeze` freezes `_defaultOptions` itself, so both names refer to the same frozen object. */
export const defaultParseDictionaryOptions = Object.freeze(_defaultOptions);

/** Marker that introduces parser directives inside a `#` comment. */
export const cSpellToolDirective = 'cspell-tools:';

/** The directive flags understood by the comment handling in `createParseFileLineMapper`. */
export const setOfCSpellDirectiveFlags = ['no-split', 'split', 'keep-case', 'no-keep-case', 'legacy'];

/**
 * Creates the operator that turns the raw text of a word list into individual words.
 *
 * The returned operator takes an iterable of text chunks and, in order:
 *  1. drops entries that are not strings,
 *  2. splits every chunk on `\n`,
 *  3. removes `#` comments and applies any `cspell-tools:` directive found in them,
 *  4. expands each line into words (legacy mode, split mode, or the line itself),
 *  5. trims the results and drops empty ones,
 *  6. removes duplicates, keeping the first occurrence.
 *
 * Directives change `split`, `keepCase` and `legacy` for the lifetime of the returned
 * operator: they stay in effect for all following lines, including lines passed in a later
 * invocation of the same operator.
 *
 * @param options - parsing options; `_defaultOptions` is used when falsy. Reads
 *   `splitKeepBoth`, `allowedSplitWords`, `storeSplitWordsAsCompounds`, `minCompoundLength`,
 *   `legacy`, `split` and `keepCase`. When `keepCase` is not given it defaults to `false`
 *   in legacy mode and to the default option value otherwise.
 * @returns the line processing operator.
 */
export function createParseFileLineMapper(options) {
    const _options = options || _defaultOptions;
    const {
        splitKeepBoth = _defaultOptions.splitKeepBoth,
        allowedSplitWords = _defaultOptions.allowedSplitWords,
        storeSplitWordsAsCompounds,
        minCompoundLength = _defaultOptions.minCompoundLength,
    } = _options;
    // `legacy`, `split` and `keepCase` are mutable because directives in the input can change them.
    let { legacy = _defaultOptions.legacy } = _options;
    let { split = _defaultOptions.split, keepCase = legacy ? false : _defaultOptions.keepCase } = _options;
    const compoundFix = storeSplitWordsAsCompounds ? '+' : '';

    function isString(line) {
        return typeof line === 'string';
    }

    function trim(line) {
        return line.trim();
    }

    /**
     * Strips the `#` comment from a line and applies directives found inside the comment.
     *
     * A line without `#` is returned untouched (it is trimmed later in the pipeline).
     * Otherwise the trimmed text before the first `#` is returned.
     *
     * The flag list is tokenized starting at the `cspell-tools:` marker itself, so a flag is
     * only recognized when whitespace, `,` or `;` separates it from the marker
     * (`# cspell-tools: split` works, `# cspell-tools:split` is ignored). Unknown tokens
     * are ignored.
     */
    function removeComments(line) {
        const idx = line.indexOf(commentCharacter);
        if (idx < 0) return line;
        // Only look for the directive inside the comment part of the line.
        const idxDirective = line.indexOf(cSpellToolDirective, idx);
        if (idxDirective >= 0) {
            const flags = line
                .slice(idxDirective)
                .split(/[\s,;]/g)
                .map((s) => s.trim())
                .filter((a) => !!a);
            for (const flag of flags) {
                switch (flag) {
                    case 'split': {
                        split = true;
                        break;
                    }
                    case 'no-split': {
                        split = false;
                        break;
                    }
                    case 'keep-case': {
                        keepCase = true;
                        legacy = false;
                        break;
                    }
                    case 'no-keep-case': {
                        keepCase = false;
                        break;
                    }
                    case 'legacy': {
                        keepCase = false;
                        legacy = true;
                        break;
                    }
                }
            }
        }
        return line.slice(0, idx).trim();
    }

    function filterEmptyLines(line) {
        return !!line;
    }

    // Matches runs of characters that are not letters, marks, `'`, `\w` characters or `-`.
    const regNonWordOrDigit = /[^\p{L}\p{M}'\w-]+/giu;

    /**
     * Breaks a line into word candidates.
     *
     * Separators are first normalized to `|`, the line is split on `|`, and entries that are
     * only digits / `_` / `-`, or that look like `0x` / `0o` literals, are discarded.
     */
    function splitLine(line) {
        line = line.replace(/#.*/, ''); // remove comment
        line = line.trim();
        line = line.replaceAll(/\bU\+[0-9A-F]{4}\b/gi, '|'); // Remove Unicode Definitions
        line = line.replaceAll(/\\U[0-9A-F]{4}/gi, '|'); // Remove Unicode Definitions
        line = line.replaceAll(regNonWordOrDigit, '|');
        line = line.replaceAll(/'(?=\|)/g, ''); // remove trailing '
        line = line.replace(/'$/, ''); // remove trailing '
        line = line.replaceAll(/(?<=\|)'/g, ''); // remove leading '
        line = line.replace(/^'/, ''); // remove leading '
        line = line.replaceAll(/\s*\|\s*/g, '|'); // remove spaces around |
        line = line.replaceAll(/[|]+/g, '|'); // reduce repeated |
        line = line.replace(/^\|/, ''); // remove leading |
        line = line.replace(/\|$/, ''); // remove trailing |
        const lines = line
            .split('|')
            .map((a) => a.trim())
            .filter((a) => !!a)
            .filter((a) => !/^[0-9_-]+$/.test(a)) // pure numbers and symbols
            .filter((a) => !/^0[xo][0-9A-F]+$/i.test(a)); // c-style hex/octal digits
        return lines;
    }

    function splitWordIntoWords(word) {
        return splitCamelCaseIfAllowed(word, allowedSplitWords, keepCase, compoundFix, minCompoundLength);
    }

    /**
     * Expands lines into words according to the current `legacy` / `split` state.
     *
     * - legacy: the line is handed to `legacyLineToWords` and nothing else is emitted.
     * - split: the words from `splitLine` are emitted; they are additionally passed through
     *   `splitCamelCaseIfAllowed` only when `allowedSplitWords.size` is truthy. The original
     *   line is emitted as well only when `splitKeepBoth` is set.
     * - otherwise: the line is emitted with `"` characters removed.
     */
    function* splitWords(lines) {
        for (const line of lines) {
            if (legacy) {
                yield* legacyLineToWords(line, keepCase, allowedSplitWords);
                continue;
            }
            if (split) {
                const words = splitLine(line);
                yield* !allowedSplitWords.size ? words : words.flatMap((word) => splitWordIntoWords(word));
                if (!splitKeepBoth) continue;
            }
            yield line.replaceAll(/["]/g, '');
        }
    }

    /** Emits each distinct line once, in order of first appearance. */
    function* unique(lines) {
        const known = new Set();
        for (const line of lines) {
            if (known.has(line)) continue;
            known.add(line);
            yield line;
        }
    }

    /** Splits every paragraph on `\n` and emits the resulting lines. */
    function* splitLines(paragraphs) {
        for (const paragraph of paragraphs) {
            yield* paragraph.split('\n');
        }
    }

    const processLines = opPipe(
        opFilter(isString),
        splitLines,
        opMap(removeComments),
        splitWords,
        opMap(trim),
        opFilter(filterEmptyLines),
        unique,
    );
    return processLines;
}

/**
 * Parses the content of a word list into words using `createParseFileLineMapper`.
 * @param lines - either a single string (treated as one chunk of text) or an iterable of
 *   text chunks; each chunk may contain several `\n` separated lines.
 * @param options - parsing options, see `createParseFileLineMapper`.
 * @returns the words produced by the line mapper.
 */
export function parseFileLines(lines, options) {
    return createParseFileLineMapper(options)(typeof lines === 'string' ? [lines] : lines);
}
//# sourceMappingURL=wordListParser.js.map