UNPKG

fast-check

Version:

Property based testing framework for JavaScript (like QuickCheck)

321 lines (320 loc) 13.3 kB
import { safeIndexOf } from '../../../utils/globals.js';
import { TokenizerBlockMode, readFrom } from './ReadRegex.js';

// Captured once at module load so later tampering with String.fromCodePoint cannot affect us.
const safeStringFromCodePoint = String.fromCodePoint;

/**
 * Remove and return the last token of `tokens`.
 *
 * @param {Array<object>} tokens - Tokens accumulated so far (mutated).
 * @returns {object} The token that was last in the list.
 * @throws {Error} When there is no previous token (e.g. a quantifier with nothing to quantify).
 */
function safePop(tokens) {
  const previous = tokens.pop();
  if (previous === undefined) {
    throw new Error('Unable to extract token preceding the currently parsed one');
  }
  return previous;
}

/**
 * Tell whether `char` is an ASCII decimal digit.
 *
 * @param {string | undefined} char - Character to check; `undefined` yields `false`.
 * @returns {boolean}
 */
function isDigit(char) {
  return char >= '0' && char <= '9';
}

/**
 * Code point of the first character of `text`, or -1 when `text` is empty.
 *
 * An explicit `undefined` check is used instead of `|| -1` because 0 (NUL) is
 * a valid code point and must not be replaced by the -1 sentinel.
 *
 * @param {string} text
 * @returns {number}
 */
function firstCodePointOrMinusOne(text) {
  const codePoint = text.codePointAt(0);
  return codePoint === undefined ? -1 : codePoint;
}

/**
 * Build a `Char` token of kind `simple`, where symbol and value are both `char`.
 *
 * @param {string} char - The character as it should be matched.
 * @param {boolean} [escaped] - Copied verbatim onto the token (left `undefined` when omitted).
 * @returns {object} The `Char` token.
 */
function simpleChar(char, escaped) {
  return {
    type: 'Char',
    kind: 'simple',
    symbol: char,
    value: char,
    codePoint: firstCodePointOrMinusOne(char),
    escaped,
  };
}

/**
 * Build a `Char` token of kind `meta` for an escape that denotes one precise character.
 *
 * @param {string} block - Raw escape sequence as written in the regex (stored as `value`).
 * @param {string} symbol - The character the escape stands for.
 * @returns {object} The `Char` token.
 */
function metaEscapedChar(block, symbol) {
  return {
    type: 'Char',
    kind: 'meta',
    symbol,
    value: block,
    codePoint: firstCodePointOrMinusOne(symbol),
  };
}

/**
 * Collapse a list of tokens into a single token.
 *
 * @param {Array<object>} tokens - Tokens to collapse.
 * @param {boolean} [allowEmpty] - When truthy, an empty list yields `undefined` instead of throwing.
 * @returns {object | undefined} An `Alternative` wrapping `tokens` when there are several,
 *   the only token when there is one, `undefined` when empty and `allowEmpty` is truthy.
 * @throws {Error} When `tokens` is empty and `allowEmpty` is falsy.
 */
function toSingleToken(tokens, allowEmpty) {
  if (tokens.length > 1) {
    return {
      type: 'Alternative',
      expressions: tokens,
    };
  }
  if (!allowEmpty && tokens.length === 0) {
    throw new Error(`Unsupported no token`);
  }
  return tokens[0];
}

/**
 * Convert one lexical block into a `Char` token.
 *
 * Blocks starting with a backslash are decoded according to the character
 * following it; any other block is treated as a simple character.
 *
 * @param {string} block - Raw block text.
 * @returns {object} The `Char` token.
 * @throws {Error} For `\p`/`\P` unicode property escapes, which are not implemented.
 */
function blockToCharToken(block) {
  if (block[0] === '\\') {
    const next = block[1];
    switch (next) {
      case 'x': {
        if (block === '\\x') {
          // No hexadecimal digits follow: treat as an escaped 'x', like the bare '\u' case below.
          // Without this guard, parsing '' would give NaN and fromCodePoint would throw.
          return simpleChar('x', true);
        }
        const allDigits = block.substring(2);
        const codePoint = Number.parseInt(allDigits, 16);
        const symbol = safeStringFromCodePoint(codePoint);
        return { type: 'Char', kind: 'hex', symbol, value: block, codePoint };
      }
      case 'u': {
        if (block === '\\u') {
          return simpleChar('u', true);
        }
        // Either \u{H...} (digits between braces) or \uHHHH (digits right after 'u').
        const allDigits = block[2] === '{' ? block.substring(3, block.length - 1) : block.substring(2);
        const codePoint = Number.parseInt(allDigits, 16);
        const symbol = safeStringFromCodePoint(codePoint);
        return { type: 'Char', kind: 'unicode', symbol, value: block, codePoint };
      }
      case '0': {
        return metaEscapedChar(block, '\0');
      }
      case 'n': {
        return metaEscapedChar(block, '\n');
      }
      case 'f': {
        return metaEscapedChar(block, '\f');
      }
      case 'r': {
        return metaEscapedChar(block, '\r');
      }
      case 't': {
        return metaEscapedChar(block, '\t');
      }
      case 'v': {
        return metaEscapedChar(block, '\v');
      }
      case 'w':
      case 'W':
      case 'd':
      case 'D':
      case 's':
      case 'S':
      case 'b':
      case 'B': {
        // These escapes do not stand for one given character: no symbol, no code point.
        return { type: 'Char', kind: 'meta', symbol: undefined, value: block, codePoint: Number.NaN };
      }
      default: {
        if (isDigit(next)) {
          const allDigits = block.substring(1);
          const codePoint = Number(allDigits);
          const symbol = safeStringFromCodePoint(codePoint);
          return { type: 'Char', kind: 'decimal', symbol, value: block, codePoint };
        }
        if (block.length > 2 && (next === 'p' || next === 'P')) {
          throw new Error(`UnicodeProperty not implemented yet!`);
        }
        const char = block.substring(1);
        return simpleChar(char, true);
      }
    }
  }
  return simpleChar(block);
}

/**
 * Tokenize `regexSource` and append the resulting tokens to `tokens`.
 *
 * The source is consumed block by block through `readFrom` (imported from
 * ReadRegex.js). When at least one top-level `|` is met, everything pushed
 * into `tokens` by this call is folded into one left-nested `Disjunction`.
 *
 * @param {Array<object>} tokens - Output list (mutated).
 * @param {string} regexSource - Regex source, or the content of a group when recursing.
 * @param {boolean} unicodeMode - Whether the regex carries the `u` flag.
 * @param {{ lastIndex: number, named: Map<string, number> }} groups - Shared registry of the
 *   capturing groups opened so far (mutated when new capturing groups are met).
 * @throws {Error} On unsupported constructs or on a quantifier without preceding token.
 */
function pushTokens(tokens, regexSource, unicodeMode, groups) {
  let disjunctions = null;
  for (
    let index = 0, block = readFrom(regexSource, index, unicodeMode, TokenizerBlockMode.Full);
    index !== regexSource.length;
    index += block.length, block = readFrom(regexSource, index, unicodeMode, TokenizerBlockMode.Full)
  ) {
    const firstInBlock = block[0];
    switch (firstInBlock) {
      case '|': {
        if (disjunctions === null) {
          disjunctions = [];
        }
        // Everything accumulated so far is one branch; an empty branch is recorded as null.
        disjunctions.push(toSingleToken(tokens.splice(0), true) || null);
        break;
      }
      case '.': {
        tokens.push({ type: 'Char', kind: 'meta', symbol: block, value: block, codePoint: Number.NaN });
        break;
      }
      case '*':
      case '+': {
        const previous = safePop(tokens);
        tokens.push({
          type: 'Repetition',
          expression: previous,
          quantifier: { type: 'Quantifier', kind: firstInBlock, greedy: true },
        });
        break;
      }
      case '?': {
        const previous = safePop(tokens);
        if (previous.type === 'Repetition') {
          // '?' right after a quantifier marks that quantifier as lazy.
          previous.quantifier.greedy = false;
          tokens.push(previous);
        } else {
          tokens.push({
            type: 'Repetition',
            expression: previous,
            quantifier: { type: 'Quantifier', kind: firstInBlock, greedy: true },
          });
        }
        break;
      }
      case '{': {
        if (block === '{') {
          // A lone '{' block is a literal character, not a range quantifier.
          tokens.push(simpleChar(block));
          break;
        }
        const previous = safePop(tokens);
        const quantifierText = block.substring(1, block.length - 1);
        const quantifierTokens = quantifierText.split(',');
        const from = Number(quantifierTokens[0]);
        // {n} => to = n ; {n,} => to = undefined ; {n,m} => to = m
        const to =
          quantifierTokens.length === 1
            ? from
            : quantifierTokens[1].length !== 0
              ? Number(quantifierTokens[1])
              : undefined;
        tokens.push({
          type: 'Repetition',
          expression: previous,
          quantifier: { type: 'Quantifier', kind: 'Range', greedy: true, from, to },
        });
        break;
      }
      case '[': {
        const blockContent = block.substring(1, block.length - 1);
        const subTokens = [];
        let negative = undefined;
        let previousWasSimpleDash = false;
        for (
          let subIndex = 0, subBlock = readFrom(blockContent, subIndex, unicodeMode, TokenizerBlockMode.Character);
          subIndex !== blockContent.length;
          subIndex += subBlock.length,
            subBlock = readFrom(blockContent, subIndex, unicodeMode, TokenizerBlockMode.Character)
        ) {
          if (subIndex === 0 && subBlock === '^') {
            negative = true;
            continue;
          }
          const newToken = blockToCharToken(subBlock);
          if (subBlock === '-') {
            // Kept as a plain character for now; turned into a range if an operand follows.
            subTokens.push(newToken);
            previousWasSimpleDash = true;
          } else {
            const operand1Token = subTokens.length >= 2 ? subTokens[subTokens.length - 2] : undefined;
            if (previousWasSimpleDash && operand1Token !== undefined && operand1Token.type === 'Char') {
              // Replace the trailing "<char> -" pair by the range <char>-<newToken>.
              subTokens.pop();
              subTokens.pop();
              subTokens.push({ type: 'ClassRange', from: operand1Token, to: newToken });
            } else {
              subTokens.push(newToken);
            }
            previousWasSimpleDash = false;
          }
        }
        tokens.push({ type: 'CharacterClass', expressions: subTokens, negative });
        break;
      }
      case '(': {
        const blockContent = block.substring(1, block.length - 1);
        const subTokens = [];
        if (blockContent[0] === '?') {
          if (blockContent[1] === ':') {
            // (?:...) non-capturing group
            pushTokens(subTokens, blockContent.substring(2), unicodeMode, groups);
            tokens.push({
              type: 'Group',
              capturing: false,
              expression: toSingleToken(subTokens),
            });
          } else if (blockContent[1] === '=' || blockContent[1] === '!') {
            // (?=...) or (?!...) lookahead
            pushTokens(subTokens, blockContent.substring(2), unicodeMode, groups);
            tokens.push({
              type: 'Assertion',
              kind: 'Lookahead',
              negative: blockContent[1] === '!' ? true : undefined,
              assertion: toSingleToken(subTokens),
            });
          } else if (blockContent[1] === '<' && (blockContent[2] === '=' || blockContent[2] === '!')) {
            // (?<=...) or (?<!...) lookbehind
            pushTokens(subTokens, blockContent.substring(3), unicodeMode, groups);
            tokens.push({
              type: 'Assertion',
              kind: 'Lookbehind',
              negative: blockContent[2] === '!' ? true : undefined,
              assertion: toSingleToken(subTokens),
            });
          } else {
            // (?<name>...) named capturing group
            const chunks = blockContent.split('>');
            if (chunks.length < 2 || chunks[0][1] !== '<') {
              throw new Error(`Unsupported regex content found at ${JSON.stringify(block)}`);
            }
            // The index is reserved before recursing so that outer groups get lower numbers.
            const groupIndex = ++groups.lastIndex;
            const nameRaw = chunks[0].substring(2);
            groups.named.set(nameRaw, groupIndex);
            // The body may itself contain '>' characters: glue the remaining chunks back.
            pushTokens(subTokens, chunks.slice(1).join('>'), unicodeMode, groups);
            tokens.push({
              type: 'Group',
              capturing: true,
              nameRaw,
              name: nameRaw,
              number: groupIndex,
              expression: toSingleToken(subTokens),
            });
          }
        } else {
          // (...) unnamed capturing group
          const groupIndex = ++groups.lastIndex;
          pushTokens(subTokens, blockContent, unicodeMode, groups);
          tokens.push({
            type: 'Group',
            capturing: true,
            number: groupIndex,
            expression: toSingleToken(subTokens),
          });
        }
        break;
      }
      default: {
        if (block === '^') {
          tokens.push({ type: 'Assertion', kind: block });
        } else if (block === '$') {
          tokens.push({ type: 'Assertion', kind: block });
        } else if (block[0] === '\\' && isDigit(block[1])) {
          const reference = Number(block.substring(1));
          // Groups are numbered from 1: a zero reference (e.g. \0) can never be a
          // backreference and has to be read as a character escape instead.
          if (reference > 0 && (unicodeMode || reference <= groups.lastIndex)) {
            tokens.push({ type: 'Backreference', kind: 'number', number: reference, reference });
          } else {
            tokens.push(blockToCharToken(block));
          }
        } else if (block[0] === '\\' && block[1] === 'k' && block.length !== 2) {
          // \k<name>: drop the leading "\k<" and the trailing ">".
          const referenceRaw = block.substring(3, block.length - 1);
          tokens.push({
            type: 'Backreference',
            kind: 'name',
            number: groups.named.get(referenceRaw) || 0,
            referenceRaw,
            reference: referenceRaw,
          });
        } else {
          tokens.push(blockToCharToken(block));
        }
        break;
      }
    }
  }
  if (disjunctions !== null) {
    // Whatever follows the last '|' is the final branch.
    disjunctions.push(toSingleToken(tokens.splice(0), true) || null);
    // Fold branches left to right: a|b|c => Disjunction(Disjunction(a, b), c).
    let currentDisjunction = {
      type: 'Disjunction',
      left: disjunctions[0],
      right: disjunctions[1],
    };
    for (let index = 2; index < disjunctions.length; ++index) {
      currentDisjunction = {
        type: 'Disjunction',
        left: currentDisjunction,
        right: disjunctions[index],
      };
    }
    tokens.push(currentDisjunction);
  }
}

/**
 * Tokenize a regular expression into a tree of tokens.
 *
 * @param {RegExp} regex - Regular expression to tokenize; its `source` and `flags` are read.
 * @returns {object} The root token.
 * @throws {Error} When the source yields no token or contains an unsupported construct.
 */
export function tokenizeRegex(regex) {
  const unicodeMode = safeIndexOf([...regex.flags], 'u') !== -1;
  const regexSource = regex.source;
  const tokens = [];
  pushTokens(tokens, regexSource, unicodeMode, { lastIndex: 0, named: new Map() });
  return toSingleToken(tokens);
}