UNPKG

cspell-lib

Version:

A library of useful functions used across various cspell tools.

331 lines 11.1 kB
import { PairingHeap } from './PairingHeap.js';
import { escapeRegEx } from './regexHelper.js';
import {
    regExDanglingQuote,
    regExEscapeCharacters,
    regExNumericLiteral,
    regExPossibleWordBreaks,
    regExSplitWords,
    regExSplitWords2,
    regExTrailingEndings,
    regExWordsAndDigits,
} from './textRegex.js';

/**
 * Shared, frozen "no break here" entry. Every break set ends with it so the
 * search in `splitIntoWords` can also choose to pass over a break position
 * (an empty break is handled there as a pass through).
 */
const ignoreBreak = Object.freeze([]);

/**
 * Find the next word-like run of text in `line` at or after `offset` and split
 * it into words.
 *
 * @param line - `{ text, offset }`; `line.offset` is added to every relative
 *   offset before it is reported or passed to `isValidWord`.
 * @param offset - position to start searching from, in the same coordinate
 *   space as `line.offset`.
 * @param isValidWord - callback receiving `{ text, offset }`; its result is
 *   stored as `isFound` on the word.
 * @param options - `optionalWordBreakCharacters` (string) adds extra characters
 *   that may be removed as optional breaks.
 * @returns `{ line, offset, text, words, endOffset }` where `text` is the run
 *   that was examined, `words` are the pieces chosen by the search and
 *   `endOffset` is the position just past `text`.
 */
export function split(line, offset, isValidWord, options = {}) {
    const relWordToSplit = findNextWordText({ text: line.text, offset: offset - line.offset });
    const lineOffset = line.offset;
    // Cache of isValidWord results, keyed by a number packed from (offset, length).
    const requested = new Map();
    const regExpIgnoreSegment = /^[-.+\d_eE'`\\\s]+$/;
    if (!relWordToSplit.text) {
        // Nothing left to split.
        const text = rebaseTextOffset(relWordToSplit);
        return {
            line,
            offset,
            text: text,
            words: [],
            endOffset: text.offset + text.text.length,
        };
    }
    const lineSegment = {
        line,
        relStart: relWordToSplit.offset,
        relEnd: relWordToSplit.offset + relWordToSplit.text.length,
    };
    const possibleBreaks = generateWordBreaks(lineSegment, options);
    if (!possibleBreaks.length) {
        // No candidate break positions: the run is checked as a single word.
        const text = rebaseTextOffset(relWordToSplit);
        return {
            line,
            offset,
            text: text,
            words: [{ ...text, isFound: isValidWord(text) }],
            endOffset: text.offset + text.text.length,
        };
    }
    /** Convert an offset relative to `line.text` into one that includes `line.offset`. */
    function rebaseTextOffset(relText) {
        return {
            ...relText,
            offset: relText.offset + lineOffset,
        };
    }
    /** Memoized validity check for a relative `{ text, offset }` segment. */
    function has(word) {
        // Segments made up only of the characters in regExpIgnoreSegment count as found.
        if (regExpIgnoreSegment.test(word.text)) {
            return true;
        }
        const i = word.offset;
        const j = word.text.length;
        // Pack (offset, length) into one number; only used as a key when both are in range.
        let v = i + (j << 20);
        if (i < 1 << 20 && j < 1 << 11) {
            const b = requested.get(v);
            if (b !== undefined)
                return b;
        }
        else {
            // Out of range for the packed key: do not cache.
            v = -1;
        }
        const r = isValidWord(rebaseTextOffset(word));
        if (v >= 0) {
            requested.set(v, r);
        }
        return r;
    }
    // Add a dummy break at the end to avoid needing to check for last break.
    possibleBreaks.push({
        offset: lineSegment.relEnd,
        breaks: [ignoreBreak],
    });
    const result = {
        line,
        offset,
        text: rebaseTextOffset(relWordToSplit),
        words: splitIntoWords(lineSegment, possibleBreaks, has).map(rebaseTextOffset),
        endOffset: lineOffset + lineSegment.relEnd,
    };
    return result;
}

/**
 * Find the first match of `regExWordsAndDigits` in `text` at or after `offset`
 * that is not a numeric literal (per `regExNumericLiteral`).
 *
 * @returns `{ text, offset }` with the matched text and its index in `text`.
 *   When nothing is found, `text` is `''`.
 */
function findNextWordText({ text, offset }) {
    const reg = new RegExp(regExWordsAndDigits);
    let searchFrom = offset;
    for (;;) {
        reg.lastIndex = searchFrom;
        const m = reg.exec(text);
        if (!m) {
            // NOTE(review): this adds text.length to a position that is already an
            // index into `text`, so the reported offset can lie past the end of the
            // text. Kept as is because callers outside this file may depend on it.
            return {
                text: '',
                offset: searchFrom + text.length,
            };
        }
        if (!regExNumericLiteral.test(m[0])) {
            return {
                text: m[0],
                offset: m.index,
            };
        }
        // Skip numeric literals. Resume right after the literal itself: resuming at
        // `offset + m[0].length` could land before or inside the literal whenever
        // the match started after `offset`, so the literal was scanned again and
        // its tail could be picked up as a separate word.
        searchFrom = m.index + m[0].length;
    }
}

/**
 * Collect every possible break (camel case, symbol and optional) for the
 * segment, sorted by offset.
 *
 * @param line - segment `{ line, relStart, relEnd }` with offsets relative to `line.line.text`.
 * @param options - see `split`.
 */
function generateWordBreaks(line, options) {
    const camelBreaks = genWordBreakCamel(line);
    const symbolBreaks = genSymbolBreaks(line);
    const optionalBreaks = genOptionalWordBreaks(line, options.optionalWordBreakCharacters);
    return mergeSortedBreaks(...camelBreaks, ...symbolBreaks, ...optionalBreaks);
}

/** Copy `reg` and set the copy's `lastIndex` to `offset`, leaving `reg` untouched. */
function offsetRegEx(reg, offset) {
    const r = new RegExp(reg);
    r.lastIndex = offset;
    return r;
}

/**
 * Generate breaks at case transitions.
 *
 * @returns two lists of `{ offset, breaks }`, one per regular expression used.
 */
function genWordBreakCamel(line) {
    const breaksCamel1 = [];
    const text = line.line.text.slice(0, line.relEnd);
    // lower,Upper: camelCase -> camel|Case
    for (const m of text.matchAll(offsetRegEx(regExSplitWords, line.relStart))) {
        if (m.index === undefined)
            break;
        const i = m.index + m[1].length;
        breaksCamel1.push({
            offset: m.index,
            breaks: [[i, i], ignoreBreak],
        });
    }
    const breaksCamel2 = [];
    // cspell:ignore ERRORC
    // Upper,Upper,lower: ERRORCodes -> ERROR|Codes, ERRORC|odes
    for (const m of text.matchAll(offsetRegEx(regExSplitWords2, line.relStart))) {
        if (m.index === undefined)
            break;
        const i = m.index + m[1].length;
        const j = i + m[3].length;
        breaksCamel2.push({
            offset: m.index,
            breaks: [[i, i], [j, j], ignoreBreak],
        });
    }
    return [breaksCamel1, breaksCamel2];
}

/**
 * Run `reg` over the segment (starting at `line.relStart`, stopping at
 * `line.relEnd`) and collect the truthy results of `calcBreak` for each match.
 */
function calcBreaksForRegEx(line, reg, calcBreak) {
    const sb = [];
    const text = line.line.text.slice(0, line.relEnd);
    for (const m of text.matchAll(offsetRegEx(reg, line.relStart))) {
        const b = calcBreak(m);
        if (b) {
            sb.push(b);
        }
    }
    return sb;
}

/**
 * Generate breaks whose matched characters may either be removed or ignored:
 * dangling quotes, trailing endings and, when given, any of the characters in
 * `optionalBreakCharacters`.
 */
function genOptionalWordBreaks(line, optionalBreakCharacters) {
    function calcBreaks(m) {
        const i = m.index;
        if (i === undefined)
            return;
        const j = i + m[0].length;
        return {
            offset: i,
            breaks: [
                [i, j], // Remove the characters
                ignoreBreak,
            ],
        };
    }
    const breaks = [
        calcBreaksForRegEx(line, regExDanglingQuote, calcBreaks),
        calcBreaksForRegEx(line, regExTrailingEndings, calcBreaks),
    ];
    if (optionalBreakCharacters) {
        const regex = new RegExp(`[${escapeRegEx(optionalBreakCharacters)}]`, 'gu');
        breaks.push(calcBreaksForRegEx(line, regex, calcBreaks));
    }
    return breaks;
}

/**
 * Generate breaks around possible word-break symbols, runs of digits and escape
 * characters. Each match can be removed, kept with the word on either side, or
 * ignored.
 */
function genSymbolBreaks(line) {
    function calcBreaks(m) {
        const i = m.index;
        if (i === undefined)
            return;
        const j = i + m[0].length;
        return {
            offset: i,
            breaks: [
                [i, j], // Remove the characters
                [i, i], // keep characters with word to right
                [j, j], // keep characters with word to left
                ignoreBreak,
            ],
        };
    }
    return [
        calcBreaksForRegEx(line, regExPossibleWordBreaks, calcBreaks),
        calcBreaksForRegEx(line, /\d+/g, calcBreaks),
        calcBreaksForRegEx(line, regExEscapeCharacters, calcBreaks),
    ];
}

/**
 * Search for the way of cutting the segment at the given breaks that leaves
 * the fewest characters in words that `has` does not report as found.
 *
 * @param lineSeg - segment `{ line, relStart, relEnd }`.
 * @param breaks - possible breaks sorted by offset; each is `{ offset, breaks }`
 *   where a break `[i, j]` ends the current word at `i` and resumes at `j`, and
 *   an empty break means "do not break here".
 * @param has - validity check for a relative `{ text, offset }` segment.
 * @returns the words (`{ text, offset, isFound }`) along the best path found,
 *   in order; empty when no path was completed.
 */
function splitIntoWords(lineSeg, breaks, has) {
    const maxIndex = lineSeg.relEnd;
    // Upper bound on the number of candidates examined.
    const maxAttempts = 1000;
    // Best known path to the end of the segment, keyed by starting index.
    const knownPathsByIndex = new Map();
    /**
     * Create a set of possible candidate to consider
     * @param p - prev candidate that lead to this one
     * @param i - offset within the string
     * @param bi - current index into the set of breaks
     * @param currentCost - current cost accrued
     */
    function makeCandidates(p, i, bi, currentCost) {
        const len = maxIndex;
        // Skip breaks that start before the current position.
        while (bi < breaks.length && breaks[bi].offset < i) {
            bi += 1;
        }
        if (bi >= breaks.length) {
            return [];
        }
        const br = breaks[bi];
        function c(bp) {
            // Estimate of the cost still to come, used to order the candidates.
            const d = bp.length < 2 ? len - i : (bp[0] - i) * 0.5 + len - bp[1];
            const ec = currentCost + d;
            return {
                p,
                i,
                bi,
                bp,
                c: currentCost,
                ec,
                text: undefined,
            };
        }
        return br.breaks.map(c);
    }
    /** Build a word entry for `text` at `offset`, recording whether `has` accepts it. */
    function checkTextOffset(text, offset) {
        const valid = has({ text, offset });
        return {
            text,
            offset,
            isFound: valid,
        };
    }
    /** Order candidates by estimated cost, then by the larger index `i` first. */
    function compare(a, b) {
        return a.ec - b.ec || b.i - a.i;
    }
    /** Walk the `n` links from `node` and collect the words along the way. */
    function pathToWords(node) {
        const results = [];
        for (let p = node; p; p = p.n) {
            if (p.text) {
                results.push(p.text);
            }
        }
        return results;
    }
    /**
     * Walk back from `candidate` through its predecessors, prepending each one to
     * `path` and recording the result in `knownPathsByIndex`.
     * @returns the head of the extended path, or `undefined` if an equal or
     *   cheaper path was already known at one of the indexes.
     */
    function addToKnownPaths(candidate, path) {
        for (let can = candidate; can !== undefined; can = can.p) {
            const t = can.text;
            const i = can.i;
            // Cost is the number of characters in words that were not found.
            const cost = (!t || t.isFound ? 0 : t.text.length) + (path?.c ?? 0);
            const exitingPath = knownPathsByIndex.get(i);
            // Keep going only if this is a better candidate than the existing path
            if (exitingPath && exitingPath.c <= cost) {
                return undefined;
            }
            const node = {
                n: path,
                i,
                c: cost,
                text: t,
            };
            knownPathsByIndex.set(i, node);
            path = node;
        }
        return path;
    }
    // Start with the worst case: every character belongs to an unknown word.
    let maxCost = lineSeg.relEnd - lineSeg.relStart;
    const candidates = new PairingHeap(compare);
    const text = lineSeg.line.text;
    candidates.append(makeCandidates(undefined, lineSeg.relStart, 0, 0));
    let attempts = 0;
    let bestPath;
    while (maxCost && candidates.length && attempts++ < maxAttempts) {
        /** Best Candidate Index */
        const best = candidates.dequeue();
        if (!best || best.c >= maxCost) {
            continue;
        }
        // Does it have a split?
        if (best.bp.length) {
            // yes
            const i = best.bp[0];
            const j = best.bp[1];
            const t = i > best.i ? checkTextOffset(text.slice(best.i, i), best.i) : undefined;
            const cost = !t || t.isFound ? 0 : t.text.length;
            const mc = maxIndex - j;
            best.c += cost;
            best.ec = best.c + mc;
            best.text = t;
            const possiblePath = knownPathsByIndex.get(j);
            if (possiblePath) {
                // A path from `j` to the end is already known: attach this candidate to it.
                const f = addToKnownPaths(best, possiblePath);
                bestPath = !bestPath || (f && f.c < bestPath.c) ? f : bestPath;
            }
            else if (best.c < maxCost) {
                const c = makeCandidates(t ? best : best.p, j, best.bi + 1, best.c);
                candidates.append(c);
            }
        }
        else {
            // It is a pass through
            const c = makeCandidates(best.p, best.i, best.bi + 1, best.c);
            candidates.append(c);
            if (!c.length) {
                // No breaks remain: the rest of the segment is the final word.
                const t = maxIndex > best.i ? checkTextOffset(text.slice(best.i, maxIndex), best.i) : undefined;
                const cost = !t || t.isFound ? 0 : t.text.length;
                best.c += cost;
                best.ec = best.c;
                best.text = t;
                const segText = t || best.p?.text || checkTextOffset('', best.i);
                const can = t ? { ...best, text: segText } : { ...best, ...best.p, text: segText };
                const f = addToKnownPaths(can, undefined);
                bestPath = !bestPath || (f && f.c < bestPath.c) ? f : bestPath;
            }
        }
        // Tighten the bound so more expensive candidates are discarded.
        if (bestPath && bestPath.c < maxCost) {
            maxCost = bestPath.c;
        }
    }
    return pathToWords(bestPath);
}

/** Flatten the given lists of breaks into one list ordered by ascending offset. */
function mergeSortedBreaks(...maps) {
    return maps.flat().sort((a, b) => a.offset - b.offset);
}

export const __testing__ = {
    generateWordBreaks,
    findNextWordText,
};
//# sourceMappingURL=wordSplitter.js.map