@dashevo/re2-wasm
Version:
Google's RE2 library distributed as a WASM module
611 lines (582 loc) • 17.8 kB
text/typescript
/* Copyright 2021 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import {EventEmitter} from 'events';
// eslint-disable-next-line node/no-unpublished-import
import {WrappedRE2, InternalMatchResult} from '../wasm/re2.js';
// eslint-disable-next-line node/no-unpublished-require
const asyncWrappedWasm = require('../wasm/re2.js');
export interface RE2ExecArray extends Array<string | undefined> {
index: number;
input: string;
groups?: {
[key: string]: string;
};
}
export interface RE2MatchArray extends Array<string | undefined> {
index?: number;
input?: string;
groups?: {
[key: string]: string;
};
}
const ALPHA_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
const HEX = '0123456789ABCDEF';
const LOADED_EVENT = 'LOADED';
const events = new EventEmitter();
let compiledWasm: any;
let isLoading = false;
function isHexadecimal(char: string): boolean {
return HEX.indexOf(char.toUpperCase()) !== -1;
}
async function compileWasmModule() {
isLoading = true;
compiledWasm = await asyncWrappedWasm();
isLoading = false;
events.emit(LOADED_EVENT);
}
/**
* Translate a string from Node RegExp syntax RE2 syntax. The algorithm is
* translated from
* https://github.com/uhop/node-re2/blob/master/lib/new.cc#L21-L142
* @param pattern
* @param multiline
*/
function translateRegExp(pattern: string, multiline: boolean): string {
const result: string[] = [];
if (pattern === '') {
return '(?:)';
} else if (multiline) {
result.push('(?m)');
}
for (let i = 0; i < pattern.length; ) {
if (pattern[i] === '\\') {
if (i + 1 < pattern.length) {
switch (pattern[i + 1]) {
case '\\':
// Consume "\\", output "\\"
result.push('\\\\');
i += 2;
break;
case 'c':
if (i + 2 < pattern.length) {
const alphaIndex = ALPHA_UPPER.indexOf(pattern[i + 2]) + 1;
if (alphaIndex >= 0) {
// Consume "\c[upper case character]", output "\x[hex digit][hex digit]"
result.push(
'\\x',
HEX[Math.floor(alphaIndex / 16)],
HEX[alphaIndex % 16]
);
i += 3;
break;
}
}
// Consume "\c", output "\c"
result.push('\\c');
i += 2;
break;
case 'u':
if (i + 2 < pattern.length) {
const ch2 = pattern[i + 2];
if (isHexadecimal(ch2)) {
// Consume "\u[hex digit]", output "\x{[hex digit]"
result.push('\\x{');
result.push(ch2);
i += 3;
// Consume and output up to 3 more hex digits
for (
let j = 0;
j < 3 && i < pattern.length && isHexadecimal(pattern[i]);
i++, j++
) {
result.push(pattern[i]);
}
// Output "}"
result.push('}');
break;
} else if (ch2 === '{') {
// Consume "\u" followed by "{", output "\x"
// The default case handles the subsequent characters
result.push('\\x');
i += 2;
break;
}
}
// Consume and output "\u"
result.push('\\u');
i += 2;
break;
default:
// Consume and output "\[char]"
result.push('\\', pattern[i + 1]);
i += 2;
}
continue;
}
} else if (pattern[i] === '/') {
// Consume "/"" and output "\/"
// An existing "\/" would have been handled by the above default case
result.push('\\/');
i += 1;
continue;
} else if (pattern.substring(i, i + 3) === '(?<') {
if (pattern[i + 3] !== '=' && pattern[i + 3] !== '!') {
// Consume "(?<" and output "(?P<"
result.push('(?P<');
i += 3;
continue;
}
}
// Consume and output the next character
result.push(pattern[i]);
i += 1;
}
return result.join('');
}
/**
* Escape a RegExp pattern by ensuring that any instance of "/" in the string
* is preceded by an odd number of backslashes.
* @param pattern
*/
function escapeRegExp(pattern: string): string {
return pattern.replace(/(^|[^\\])((?:\\\\)*)\//g, '$1$2\\/');
}
/* This class should implement the RegExp interface, but it can't because of
* https://github.com/microsoft/TypeScript/issues/42307 */
class RE2 {
private _global = false;
private _ignoreCase = false;
private _multiline = false;
private _dotAll = false;
private _unicode = false;
private _sticky = false;
lastIndex = 0;
private pattern = '(?:)';
private wrapper: WrappedRE2;
private groupNames: {[group: number]: string} = {};
private namedGroups: {[name: string]: number} = {};
constructor(pattern: string | RegExp | RE2, flags?: string) {
if (typeof pattern !== 'string') {
if (pattern instanceof RegExp || pattern instanceof RE2) {
flags = flags ?? pattern.flags;
pattern = pattern.source;
} else {
if (pattern === undefined) {
pattern = '(?:)';
} else {
pattern = pattern + '';
}
}
}
if (pattern === '') {
pattern = '(?:)';
}
pattern = escapeRegExp(pattern);
flags = flags ?? '';
for (const flag of flags) {
switch (flag) {
case 'g':
this._global = true;
break;
case 'i':
this._ignoreCase = true;
break;
case 'm':
this._multiline = true;
break;
case 's':
this._dotAll = true;
break;
case 'u':
this._unicode = true;
break;
case 'y':
this._sticky = true;
break;
}
}
if (!this._unicode) {
throw new Error(
'RE2 only works in unicode mode. The "u" flag must be passed when constructing a RE2 instance'
);
}
this.pattern = pattern;
this.wrapper = new compiledWasm.WrappedRE2(
translateRegExp(pattern, this._multiline),
this._ignoreCase,
this._multiline,
this._dotAll
);
if (!this.wrapper.ok()) {
throw new SyntaxError(
`Invalid regular expression: /${pattern}/${flags}: ${this.wrapper.error()}`
);
}
// Verify that all named groups have unique names
const groupNames = this.wrapper.capturingGroupNames();
const groupNumbers = groupNames.keys();
for (let i = 0; i < groupNumbers.size(); i++) {
const num = groupNumbers.get(i);
const name = groupNames.get(num);
if (name in this.namedGroups) {
throw new SyntaxError(
`Invalid regular expression: /${pattern}/${flags}: Duplicate capture group name`
);
}
this.groupNames[num] = name;
this.namedGroups[name] = num;
}
}
get source() {
return this.pattern;
}
get internalSource() {
return this.wrapper.pattern();
}
get flags() {
return (
(this._global ? 'g' : '') +
(this._ignoreCase ? 'i' : '') +
(this._multiline ? 'm' : '') +
(this._dotAll ? 's' : '') +
(this._unicode ? 'u' : '') +
(this._sticky ? 'y' : '')
);
}
get global() {
return this._global;
}
get ignoreCase() {
return this._ignoreCase;
}
get multiline() {
return this._multiline;
}
get dotAll() {
return this._dotAll;
}
get unicode() {
return this._unicode;
}
get sticky() {
return this._sticky;
}
toString() {
return `/${this.pattern}/${this.flags}`;
}
private getMaybeStickyIndex() {
if (this._global || this._sticky) {
return this.lastIndex;
} else {
return 0;
}
}
private isMatchSuccessful(
match: InternalMatchResult,
searchStart: number
): boolean {
return match.index === searchStart || (!this._sticky && match.index >= 0);
}
private maybeUpdateLastIndex(match: InternalMatchResult, start: number) {
if (this._global || this._sticky) {
if (this.isMatchSuccessful(match, start)) {
this.lastIndex = match.index + match.match.length;
} else {
this.lastIndex = 0;
}
}
}
private getNamedGroups(match: InternalMatchResult): {[name: string]: string} {
const groups: {[name: string]: string} = {};
for (const [groupName, groupNum] of Object.entries(this.namedGroups)) {
if (match.groups[groupNum - 1] !== undefined) {
groups[groupName] = match.groups[groupNum - 1]!;
}
}
return groups;
}
exec(input: string): RE2ExecArray | null {
if (typeof input !== 'string') {
input = input + '';
}
const startIndex = this.getMaybeStickyIndex();
const match = this.wrapper.match(input, startIndex, true);
this.maybeUpdateLastIndex(match, startIndex);
if (!this.isMatchSuccessful(match, startIndex)) {
return null;
}
const result: RE2ExecArray = [match.match, ...match.groups] as RE2ExecArray;
result.index = match.index;
result.input = input;
const groups = this.getNamedGroups(match);
if (Object.keys(groups).length > 0) {
result.groups = groups;
}
return result;
}
test(input: string): boolean {
if (typeof input !== 'string') {
input = input + '';
}
const startIndex = this.getMaybeStickyIndex();
const match = this.wrapper.match(input, this.getMaybeStickyIndex(), false);
this.maybeUpdateLastIndex(match, startIndex);
return this.isMatchSuccessful(match, startIndex);
}
compile(): this {
// This method is deprecated on RegExp, so it is intentionally not implemented here
throw new Error(
'Deprecated RegExp method compile is not implemented in RE2.'
);
}
[Symbol.match](input: string): RE2MatchArray | null {
if (typeof input !== 'string') {
input = input + '';
}
if (this._global) {
const result: string[] = [];
let nextIndex = 0;
let success: boolean;
do {
const match = this.wrapper.match(input, nextIndex, false);
success =
match.index === nextIndex || (!this._sticky && match.index >= 0);
if (success) {
result.push(match.match);
nextIndex = match.index + match.match.length;
}
} while (success);
if (result.length === 0) {
return null;
} else {
return result;
}
} else {
const startIndex = this.getMaybeStickyIndex();
const match = this.wrapper.match(input, startIndex, true);
this.maybeUpdateLastIndex(match, startIndex);
if (!this.isMatchSuccessful(match, startIndex)) {
return null;
}
const result: RE2MatchArray = [match.match, ...match.groups];
result.index = match.index;
result.input = input;
const groups = this.getNamedGroups(match);
if (Object.keys(groups).length > 0) {
result.groups = groups;
}
return result;
}
}
match(input: string): RE2MatchArray | null {
return this[Symbol.match](input);
}
/**
* Outputs the replacement for the matched part of the string
* @param input
* @param match
* @param replacer
*/
private replaceMatch(
input: string,
match: InternalMatchResult,
// eslint-disable-next-line @typescript-eslint/no-explicit-any
replacer: string | ((substring: string, ...args: any[]) => string)
): string {
if (typeof replacer === 'string') {
let result = '';
for (let i = 0; i < replacer.length; i++) {
if (replacer[i] === '$') {
switch (replacer[i + 1]) {
case '$':
result += '$';
i++;
break;
case '&':
result += match.match;
i++;
break;
case '`':
result += input.substring(0, match.index);
i++;
break;
case "'":
result += input.substring(match.index + match.match.length);
i++;
break;
case '<': {
const endCaret = replacer.indexOf('>', i);
if (endCaret < 0) {
throw new Error('Invalid named group replacement');
}
const groupName = replacer.substring(i + 2, endCaret);
if (groupName in this.namedGroups) {
result += match.groups[this.namedGroups[groupName] - 1] ?? '';
}
i = endCaret;
break;
}
default: {
let groupNum: number;
if ('123456789'.includes(replacer[i + 1])) {
if ('0123456789'.includes(replacer[i + 2])) {
// Subtract 1 because groups are 1-indexed in replacement strings
groupNum =
Number.parseInt(replacer.substring(i + 1, i + 3)) - 1;
i += 2;
} else {
// Subtract 1 because groups are 1-indexed in replacement strings
groupNum = Number.parseInt(replacer[i + 1]) - 1;
i++;
}
} else {
throw new Error('Invalid replacement string');
}
if (groupNum < match.groups.length) {
result += match.groups[groupNum] ?? '';
} else {
result += '$' + groupNum;
}
}
}
} else {
result += replacer[i];
}
}
return result;
} else {
return replacer(
match.match,
...match.groups,
match.index,
input,
this.getNamedGroups(match)
);
}
}
[Symbol.replace](
input: string,
// eslint-disable-next-line @typescript-eslint/no-explicit-any
replacer: string | ((substring: string, ...args: any[]) => string)
): string {
if (typeof input !== 'string') {
input = input + '';
}
if (typeof replacer !== 'function') {
replacer = replacer + '';
}
if (this._global) {
let result = '';
let nextIndex = 0;
let success: boolean;
do {
const match = this.wrapper.match(input, nextIndex, true);
success =
match.index === nextIndex || (!this._sticky && match.index >= 0);
if (success) {
result +=
input.substring(nextIndex, match.index) +
this.replaceMatch(input, match, replacer);
nextIndex = match.index + match.match.length;
}
} while (success);
result += input.substring(nextIndex);
this.lastIndex = 0;
return result;
} else {
const startIndex = this.getMaybeStickyIndex();
const match = this.wrapper.match(input, startIndex, true);
this.maybeUpdateLastIndex(match, startIndex);
if (this.isMatchSuccessful(match, startIndex)) {
return (
input.substring(0, match.index) +
this.replaceMatch(input, match, replacer) +
input.substring(match.index + match.match.length)
);
} else {
return input;
}
}
}
replace(
input: string,
// eslint-disable-next-line @typescript-eslint/no-explicit-any
replacer: string | ((substring: string, ...args: any[]) => string)
): string {
return this[Symbol.replace](input, replacer);
}
[Symbol.search](input: string): number {
if (typeof input !== 'string') {
input = input + '';
}
const result = this.wrapper.match(input, 0, false).index;
if (this._sticky && result !== 0) {
return -1;
} else {
return result;
}
}
search(input: string): number {
return this[Symbol.search](input);
}
[Symbol.split](input: string, limit?: number): (string | undefined)[] {
if (typeof input !== 'string') {
input = input + '';
}
const output = [];
let nextIndex = 0;
limit = limit ?? Infinity;
while (output.length < limit) {
const nextMatch = this.wrapper.match(input, nextIndex, true);
if (nextMatch.index >= 0) {
if (nextMatch.match.length === 0) {
output.push(input.substring(nextIndex, nextIndex + 1));
nextIndex = nextIndex + 1;
} else {
output.push(input.substring(nextIndex, nextMatch.index));
nextIndex = nextMatch.index + nextMatch.match.length;
}
for (const group of nextMatch.groups) {
if (output.length >= limit) {
break;
}
output.push(group);
}
} else {
output.push(input.substring(nextIndex));
break;
}
}
return output;
}
split(input: string, limit?: number): (string | undefined)[] {
return this[Symbol.split](input, limit);
}
}
export default async function InitializeRe2() {
if (!compiledWasm) {
await new Promise<void>(resolve => {
if (isLoading) {
events.once(LOADED_EVENT, () => {
resolve();
});
} else {
compileWasmModule().then(() => {
resolve();
});
}
});
}
return RE2;
}