// cspell-lib
// Version:
// A library of useful functions used across various cspell tools.
// 368 lines • 16 kB
// JavaScript
import assert from 'node:assert';
import { opConcatMap, opFilter, pipe } from '@cspell/cspell-pipe/sync';
import { defaultCSpellSettings } from '@cspell/cspell-types';
import { createCachingDictionary } from 'cspell-dictionary';
import * as RxPat from '../Settings/RegExpPatterns.js';
import { extractPossibleWordsFromTextOffset, extractText, extractWordsFromTextOffset, splitWordWithOffset, } from '../util/text.js';
import { regExpCamelCaseWordBreaksWithEnglishSuffix } from '../util/textRegex.js';
import { split } from '../util/wordSplitter.js';
import { defaultMinWordLength } from './defaultConstants.js';
import { extractHexSequences, isRandomString } from './isRandomString.js';
import { isWordValidWithEscapeRetry } from './isWordValid.js';
import { mapRangeBackToOriginalPos } from './parsedText.js';
// Length argument passed to `extractHexSequences` when looking for hex sequences inside a possible word.
const MIN_HEX_SEQUENCE_LENGTH = 8;
/**
 * Create a line validator bound to a dictionary and a set of validation options.
 *
 * The returned `fn` takes a line segment (`{ line, segment }`) and lazily produces the issues
 * found in `segment`. State is memoized for the lifetime of the returned validator:
 * - `knownWords`: per-word lookup results (found / flagged / ignored).
 * - `setOfKnownIssues`: issues already computed for the text of a "possible word".
 * - `setOfKnownSuccessfulWords`: text that has already been accepted and can be skipped.
 *
 * @param sDict - dictionary handed to `createCachingDictionary`.
 * @param options - validation options. The fields read here are `minWordLength`, `flagWords`,
 *   `allowCompoundWords`, `ignoreCase`, `ignoreRandomStrings` and `minRandomLength`.
 * @returns `{ fn, dict }` where `dict` is the caching dictionary created from `sDict`.
 */
export function lineValidatorFactory(sDict, options) {
    const {
        minWordLength = defaultMinWordLength,
        flagWords = [],
        allowCompoundWords = false,
        ignoreCase = true,
        ignoreRandomStrings = defaultCSpellSettings.ignoreRandomStrings,
        minRandomLength = defaultCSpellSettings.minRandomLength,
    } = options;
    const hasWordOptions = {
        ignoreCase,
        useCompounds: allowCompoundWords || undefined, // let the dictionaries decide on useCompounds if allow is false
    };
    const dictCol = createCachingDictionary(sDict, hasWordOptions);
    const knownWords = new Map();
    const setOfFlagWords = new Set(flagWords);
    const setOfKnownIssues = new Map();
    const setOfKnownSuccessfulWords = new Set();
    /**
     * Wrap a predicate so that every value it rejects (returns falsy for) is recorded
     * in `setOfKnownSuccessfulWords` by its text.
     */
    const rememberFilter = (fn) => (v) => {
        const keep = fn(v);
        if (!keep) {
            setOfKnownSuccessfulWords.add(v.text);
        }
        return keep;
    };
    /** Keep only text that has not already been recorded as successful. */
    const filterAlreadyChecked = (wo) => {
        return !setOfKnownSuccessfulWords.has(wo.text);
    };
    /** Word lookup object passed to `isWordValidWithEscapeRetry`; caches `isFound` per word. */
    const hasDict = {
        has(word) {
            const info = getWordInfo(word);
            if (info.isFound !== undefined)
                return info.isFound;
            // NOTE(review): a word whose cached `isFlagged` is truthy is reported as present.
            // The code used to have a second, unreachable `if (info.isFlagged) return false;`
            // right after this one; it has been removed. Confirm which result was intended.
            if (info.isFlagged)
                return true;
            info.isFound = dictCol.has(word);
            return info.isFound;
        },
    };
    /** Compute (once) and return whether the dictionary treats the word as a no-suggest word. */
    function calcIgnored(info) {
        info.isIgnored ??= dictCol.isNoSuggestWord(info.word);
        return info.isIgnored;
    }
    /**
     * Compute (once) and return whether the word is flagged: it is in `flagWords`
     * (as-is or lower-cased) or forbidden by the dictionary, and it is not ignored.
     */
    function calcFlagged(info) {
        if (info.isFlagged !== undefined)
            return info.isFlagged;
        const word = info.word;
        info.isFlagged =
            (setOfFlagWords.has(word) || setOfFlagWords.has(word.toLowerCase()) || dictCol.isForbidden(word)) &&
                !calcIgnored(info);
        return info.isFlagged;
    }
    function isWordIgnored(word) {
        return calcIgnored(getWordInfo(word));
    }
    function getSuggestions(word) {
        return dictCol.getPreferredSuggestions(word);
    }
    function isWordFlagged(wo) {
        return calcFlagged(getWordInfo(wo.text));
    }
    /** Set `word.isFlagged` in place and return the same object. */
    function annotateIsFlagged(word) {
        word.isFlagged = isWordFlagged(word);
        return word;
    }
    /** Attach preferred suggestions (when there are any) to the issue in place. */
    function annotateIssue(issue) {
        const sugs = getSuggestions(issue.text);
        if (sugs?.length) {
            issue.suggestionsEx = sugs;
        }
        return issue;
    }
    const isFlaggedOrMinLength = (wo) => wo.text.length >= minWordLength || !!wo.isFlagged;
    const isFlaggedOrNotFound = rememberFilter((wo) => wo.isFlagged || !wo.isFound);
    const isNotRepeatingChar = rememberFilter((wo) => !RxPat.regExRepeatedChar.test(wo.text));
    /**
     * Fill in `issue.isFlagged` / `issue.isFound` using the per-word cache.
     * An `isFlagged` value already present on the issue takes precedence over the computed one.
     * `isFound` is left `undefined` on flagged issues.
     */
    function checkWord(issue) {
        const info = getWordInfo(issue.text);
        if (info.fin) {
            // The word has been fully evaluated before; reuse the cached answers.
            const { isFlagged: isForbidden, isFound, isIgnored } = info;
            const isFlagged = issue.isFlagged ?? (!isIgnored && isForbidden);
            issue.isFlagged = isFlagged;
            issue.isFound = isFlagged ? undefined : isFound;
            return issue;
        }
        const isIgnored = calcIgnored(info);
        const isFlagged = issue.isFlagged ?? calcFlagged(info);
        info.isFound ??= isFlagged ? false : isIgnored || isWordValidWithEscapeRetry(hasDict, issue, issue.line);
        info.isFlagged = !!isFlagged;
        info.fin = true;
        issue.isFlagged = isFlagged;
        issue.isFound = isFlagged ? undefined : info.isFound;
        return issue;
    }
    const regExUpperCaseWithTrailingCommonEnglishSuffix = /^([\p{Lu}\p{M}]{2,})['’]?(?:s|ing|ies|es|ings|ize|ed|ning)$/u; // cspell:disable-line
    const regExpIsLetter = /\p{L}/u;
    /**
     * Validate one line segment.
     * @param lineSegment - `{ line, segment }`; words are extracted from `segment`, while `line`
     *   supplies the surrounding text and is attached to the produced issues.
     * @returns the result of piping the possible words through the filters and `checkPossibleWords`.
     */
    const fn = (lineSegment) => {
        const line = lineSegment.line;
        /**
         * Decide whether `word` is shorter than `minWordLength` and stands on its own,
         * i.e. it is not directly preceded (or, unless `ignoreSuffix`, followed) by a letter.
         */
        function isWordTooShort(word, ignoreSuffix = false) {
            // The first test is a cheap shortcut on UTF-16 length; the second counts code points.
            if (word.text.length >= minWordLength * 2 || [...word.text].length >= minWordLength)
                return false;
            const offset = word.offset - line.offset;
            assert.equal(line.text.slice(offset, offset + word.text.length), word.text);
            const prefix = [...line.text.slice(Math.max(0, offset - 2), offset)];
            const hasLetterPrefix = !!prefix.length && regExpIsLetter.test(prefix[prefix.length - 1]);
            if (hasLetterPrefix)
                return false;
            if (ignoreSuffix)
                return true;
            const suffix = [...line.text.slice(offset + word.text.length, offset + word.text.length + 2)];
            const hasLetterSuffix = !!suffix.length && regExpIsLetter.test(suffix[0]);
            return !hasLetterSuffix;
        }
        /** Predicate handed to `split` to decide whether a candidate word is acceptable. */
        function splitterIsValid(word) {
            if (setOfKnownSuccessfulWords.has(word.text))
                return true;
            if (isWordFlagged(word))
                return false;
            if (isWordValidWithEscapeRetry(hasDict, word, lineSegment.line))
                return true;
            if (isWordTooShort(word))
                return true;
            return isAllCapsWithTrailingCommonEnglishSuffixOk(word);
        }
        /**
         * Accept an ALL CAPS word followed by a common English suffix when the upper-case
         * part is not flagged and is either found or too short to check.
         */
        function isAllCapsWithTrailingCommonEnglishSuffixOk(tWord) {
            const m = tWord.text.match(regExUpperCaseWithTrailingCommonEnglishSuffix);
            if (!m)
                return false;
            const offset = tWord.offset;
            const v = { offset, text: m[1], line };
            const check = checkWord(v);
            if (check.isFlagged)
                return false;
            if (check.isFound)
                return true;
            return isWordTooShort(v, true);
        }
        /**
         * Check a single extracted word.
         * @returns the issues for the word; an empty array means the word was accepted.
         */
        function checkFullWord(vr) {
            if (vr.isFlagged) {
                return [vr];
            }
            // English exceptions :-(
            if (isAllCapsWithTrailingCommonEnglishSuffixOk(vr))
                return [];
            if (isWordIgnored(vr.text) || checkWord(vr).isFound) {
                setOfKnownSuccessfulWords.add(vr.text);
                return [];
            }
            // `checkWord` writes `isFlagged` onto `vr`, so it is tested again here.
            if (vr.isFlagged)
                return [vr];
            const codeWordResults = checkCamelCaseWord(vr);
            if (!codeWordResults.length) {
                setOfKnownSuccessfulWords.add(vr.text);
                return [];
            }
            return codeWordResults;
        }
        /**
         * Break a camel case word into its parts and check each part.
         *
         * There are two word break patterns:
         * - `regExpCamelCaseWordBreaks`
         * - `regExpCamelCaseWordBreaksWithEnglishSuffix` is the default pattern with English suffixes on ALL CAPS words.
         *
         * Note: See [#6066](https://github.com/streetsidesoftware/cspell/pull/6066)
         * Using just `regExpCamelCaseWordBreaks` misses unknown 4-letter words.
         *
         * The code below was tried, but it missed words.
         * - `LSTM` was caught. // cspell:disable-line
         * - `LSTMs` was missed because it becomes `LST` and `Ms`. // cspell:disable-line
         *
         * ```ts
         * const results = _checkCamelCaseWord(vr, regExpCamelCaseWordBreaks);
         * if (!results.length) return results;
         * const resultsEnglishBreaks = _checkCamelCaseWord(vr, regExpCamelCaseWordBreaksWithEnglishSuffix);
         * return results.length < resultsEnglishBreaks.length ? results : resultsEnglishBreaks;
         * ```
         */
        function checkCamelCaseWord(vr) {
            return _checkCamelCaseWord(vr, regExpCamelCaseWordBreaksWithEnglishSuffix);
        }
        /** Split `vr` with `regExpWordBreaks` and collect the parts that are flagged or not found. */
        function _checkCamelCaseWord(vr, regExpWordBreaks) {
            const codeWordResults = [];
            for (const wo of splitWordWithOffset(vr, regExpWordBreaks)) {
                if (setOfKnownSuccessfulWords.has(wo.text))
                    continue;
                const issue = wo;
                issue.line = vr.line;
                issue.isFlagged = undefined;
                issue.isFound = undefined;
                annotateIsFlagged(issue);
                if (!isFlaggedOrMinLength(issue))
                    continue;
                checkWord(issue);
                if (!isFlaggedOrNotFound(issue) || !isNotRepeatingChar(issue))
                    continue;
                // Report the text as it appears in the segment at the issue's offsets.
                issue.text = extractText(lineSegment.segment, issue.offset, issue.offset + issue.text.length);
                codeWordResults.push(issue);
            }
            return codeWordResults;
        }
        /** Copy cached issues, shifting their offsets to `possibleWord` and attaching the current line. */
        function rebaseKnownIssues(possibleWord, known) {
            const { issues } = known;
            const adjOffset = possibleWord.offset - known.possibleWord.offset;
            return issues.map((issue) => {
                issue = { ...issue };
                issue.offset += adjOffset;
                issue.line = lineSegment.line;
                return issue;
            });
        }
        /**
         * Return a flagged issue when the possible word, or the possible word without a
         * single trailing `.`, is flagged; otherwise `undefined`.
         */
        function checkForFlaggedWord(possibleWord) {
            if (isWordFlagged(possibleWord)) {
                const vr = {
                    ...possibleWord,
                    line: lineSegment.line,
                    isFlagged: true,
                };
                return vr;
            }
            if (possibleWord.text.endsWith('.') && possibleWord.text.length > 1) {
                const pw = { ...possibleWord, text: possibleWord.text.slice(0, -1) };
                if (isWordFlagged(pw)) {
                    const vr = {
                        ...pw,
                        line: lineSegment.line,
                        isFlagged: true,
                    };
                    return vr;
                }
            }
            return undefined;
        }
        /** Cached front end for `_checkPossibleWords`, keyed by the possible word's text. */
        function checkPossibleWords(possibleWord) {
            const known = setOfKnownIssues.get(possibleWord.text);
            if (known) {
                if (!known.issues.length)
                    return known.issues;
                const adjusted = rebaseKnownIssues(possibleWord, known);
                return adjusted;
            }
            const issues = _checkPossibleWords(possibleWord).map(annotateIssue);
            setOfKnownIssues.set(possibleWord.text, { possibleWord, issues });
            return issues;
        }
        /**
         * Check a possible word: first as a flagged word, then word by word, and finally,
         * if issues remain, with the word splitter. The splitter result is used only when
         * it yields fewer issues.
         */
        function _checkPossibleWords(possibleWord) {
            const flagged = checkForFlaggedWord(possibleWord);
            if (flagged)
                return [flagged];
            let mismatches = [];
            for (const wo of extractWordsFromTextOffset(possibleWord)) {
                if (setOfKnownSuccessfulWords.has(wo.text))
                    continue;
                const issue = wo;
                issue.line = lineSegment.line;
                annotateIsFlagged(issue);
                if (!isFlaggedOrMinLength(issue))
                    continue;
                for (const w of checkFullWord(issue)) {
                    mismatches.push(w);
                }
            }
            if (!mismatches.length)
                return mismatches;
            const hexSequences = !ignoreRandomStrings
                ? []
                : extractHexSequences(possibleWord.text, MIN_HEX_SEQUENCE_LENGTH)
                    .filter(
                    // Only consider hex sequences that are all upper case or all lower case and contain a `-` or a digit.
                    (w) => (w.text === w.text.toLowerCase() || w.text === w.text.toUpperCase()) &&
                        /[\d-]/.test(w.text))
                    .map((w) => {
                        // Offsets were computed against `possibleWord.text`; shift them to match the issues.
                        w.offset += possibleWord.offset;
                        return w;
                    });
            if (hexSequences.length) {
                mismatches = filterExcludedTextOffsets(mismatches, hexSequences);
            }
            if (mismatches.length) {
                // Try the more expensive word splitter
                const splitResult = split(lineSegment.segment, possibleWord.offset, splitterIsValid);
                const nonMatching = splitResult.words
                    .filter((w) => !w.isFound)
                    .filter((w) => {
                        const m = w.text.match(regExUpperCaseWithTrailingCommonEnglishSuffix);
                        if (!m)
                            return true;
                        const v = checkWord({ ...w, text: m[1], line: lineSegment.line });
                        return v.isFlagged || !v.isFound;
                    });
                const filtered = filterExcludedTextOffsets(nonMatching.map((w) => ({ ...w, line: lineSegment.line })).map(annotateIsFlagged), hexSequences);
                if (filtered.length < mismatches.length) {
                    return filtered;
                }
            }
            return mismatches;
        }
        /** Keep text that is shorter than `minRandomLength`, or that `isRandomString` does not report as random. */
        function isNotRandom(textOff) {
            if (textOff.text.length < minRandomLength || !ignoreRandomStrings)
                return true;
            return !isRandomString(textOff.text);
        }
        const checkedPossibleWords = pipe(extractPossibleWordsFromTextOffset(lineSegment.segment), opFilter(isNotRandom), opFilter(filterAlreadyChecked), opConcatMap(checkPossibleWords));
        return checkedPossibleWords;
    };
    /** Fetch, creating on first use, the cache record for `word`. */
    function getWordInfo(word) {
        const info = knownWords.get(word);
        if (info)
            return info;
        const result = { word, isFound: undefined, isFlagged: undefined, isIgnored: undefined, fin: false };
        knownWords.set(word, result);
        return result;
    }
    return { fn, dict: dictCol };
}
/**
 * Create a validator for parsed text on top of a line validator.
 *
 * @param dict - dictionary forwarded to `lineValidatorFactory`.
 * @param options - options forwarded to `lineValidatorFactory`.
 * @returns `{ validate, lineValidator }`. `validate(pText)` checks `pText.text` as a single
 *   segment at offset 0 and reports each issue with a `range` mapped through `pText.map`
 *   and shifted by `pText.range[0]`.
 */
export function textValidatorFactory(dict, options) {
    const lineValidator = lineValidatorFactory(dict, options);
    const checkLine = lineValidator.fn;
    const validate = (pText) => {
        const { text: content, range: sourceRange, map: sourceMap } = pText;
        const shift = sourceRange[0];
        // The whole text serves as both the line and the segment being checked.
        const whole = { text: content, offset: 0 };
        const toOriginal = (issue) => {
            const { text, offset, isFlagged, isFound, suggestionsEx } = issue;
            const mapped = mapRangeBackToOriginalPos([offset, offset + text.length], sourceMap);
            return {
                text,
                range: [mapped[0] + shift, mapped[1] + shift],
                isFlagged,
                isFound,
                suggestionsEx,
            };
        };
        // Collect every issue first, then translate the positions.
        const found = Array.from(checkLine({ line: whole, segment: whole }));
        return found.map((issue) => toOriginal(issue));
    };
    return { validate, lineValidator };
}
/**
 * Drop the issues whose start offset falls inside one of the excluded ranges.
 *
 * Both lists are walked once, front to back, so they are expected to be ordered by offset.
 * Flagged issues are always kept. When `excluded` is empty the `issues` array itself is returned.
 *
 * @param issues - issues with an `offset` (and optional `isFlagged`).
 * @param excluded - ranges described by `offset` and `text` (the range length is `text.length`).
 * @returns the issues to keep.
 */
function filterExcludedTextOffsets(issues, excluded) {
    const rangeCount = excluded.length;
    if (rangeCount === 0) {
        return issues;
    }
    const kept = [];
    let rangeIdx = 0;
    let issueIdx = 0;
    while (issueIdx < issues.length) {
        const current = issues[issueIdx];
        // Skip the ranges that end at or before the start of the current issue.
        while (rangeIdx < rangeCount &&
            excluded[rangeIdx].offset + excluded[rangeIdx].text.length <= current.offset) {
            rangeIdx += 1;
        }
        if (rangeIdx >= rangeCount) {
            // No ranges left: this issue and everything after it is kept below.
            break;
        }
        const startsBeforeRange = current.offset < excluded[rangeIdx].offset;
        if (current.isFlagged || startsBeforeRange) {
            kept.push(current);
        }
        issueIdx += 1;
    }
    for (; issueIdx < issues.length; issueIdx += 1) {
        kept.push(issues[issueIdx]);
    }
    return kept;
}
//# sourceMappingURL=lineValidatorFactory.js.map