UNPKG

@microsoft/recognizers-text-sequence

Version:

recognizers-text-sequence provides robust recognition and resolution of series entities like phone numbers, URLs, and e-mail and IP addresses.

github.com/Microsoft/Recognizers-Text

Microsoft/Recognizers-Text

1,393 lines (1,204 loc) • 343 kB

JavaScript

(function (global, factory) { typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports) : typeof define === 'function' && define.amd ? define(['exports'], factory) : (factory((global.microsoftRecognizersTextSequence = {}))); }(this, (function (exports) { 'use strict'; function unwrapExports (x) { return x && x.__esModule && Object.prototype.hasOwnProperty.call(x, 'default') ? x['default'] : x; } function createCommonjsModule(fn, module) { return module = { exports: {} }, fn(module, module.exports), module.exports; } var culture = createCommonjsModule(function (module, exports) { "use strict"; // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. Object.defineProperty(exports, "__esModule", { value: true }); class Culture { constructor(cultureName, cultureCode) { this.cultureName = cultureName; this.cultureCode = cultureCode; } static getSupportedCultureCodes() { return Culture.supportedCultures.map(c => c.cultureCode); } static mapToNearestLanguage(cultureCode) { if (cultureCode !== undefined) { cultureCode = cultureCode.toLowerCase(); let supportedCultureCodes = Culture.getSupportedCultureCodes(); if (supportedCultureCodes.indexOf(cultureCode) < 0) { let culturePrefix = cultureCode.split('-')[0].trim(); supportedCultureCodes.forEach(function (supportedCultureCode) { if (supportedCultureCode.startsWith(culturePrefix)) { cultureCode = supportedCultureCode; } }); } } return cultureCode; } } Culture.English = "en-us"; Culture.EnglishOthers = "en-*"; Culture.Chinese = "zh-cn"; Culture.Spanish = "es-es"; Culture.Portuguese = "pt-br"; Culture.French = "fr-fr"; Culture.German = "de-de"; Culture.Japanese = "ja-jp"; Culture.Dutch = "nl-nl"; Culture.Italian = "it-it"; Culture.supportedCultures = [ new Culture("English", Culture.English), new Culture("EnglishOthers", Culture.EnglishOthers), new Culture("Chinese", Culture.Chinese), new Culture("Spanish", Culture.Spanish), new Culture("Portuguese", Culture.Portuguese), new Culture("French", Culture.French), new Culture("German", Culture.German), new Culture("Japanese", Culture.Japanese), new Culture("Dutch", Culture.Dutch), new Culture("Italian", Culture.Italian) ]; exports.Culture = Culture; class CultureInfo { static getCultureInfo(cultureCode) { return new CultureInfo(cultureCode); } constructor(cultureName) { this.code = cultureName; } } exports.CultureInfo = CultureInfo; }); unwrapExports(culture); var xregexp = createCommonjsModule(function (module, exports) { "use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.default = void 0; function _slicedToArray(arr, i) { return _arrayWithHoles(arr) || _iterableToArrayLimit(arr, i) || _nonIterableRest(); } function _nonIterableRest() { throw new TypeError("Invalid attempt to destructure non-iterable instance"); } function _iterableToArrayLimit(arr, i) { var _arr = []; var _n = true; var _d = false; var _e = undefined; try { for (var _i = arr[Symbol.iterator](), _s; !(_n = (_s = _i.next()).done); _n = true) { _arr.push(_s.value); if (i && _arr.length === i) break; } } catch (err) { _d = true; _e = err; } finally { try { if (!_n && _i["return"] != null) _i["return"](); } finally { if (_d) throw _e; } } return _arr; } function _arrayWithHoles(arr) { if (Array.isArray(arr)) return arr; } /*! * XRegExp 4.2.0 * <xregexp.com> * Steven Levithan (c) 2007-present MIT License */ /** * XRegExp provides augmented, extensible regular expressions. You get additional regex syntax and * flags, beyond what browsers support natively. XRegExp is also a regex utility belt with tools to * make your client-side grepping simpler and more powerful, while freeing you from related * cross-browser inconsistencies. */ // ==--------------------------== // Private stuff // ==--------------------------== // Property name used for extended regex instance data var REGEX_DATA = 'xregexp'; // Optional features that can be installed and uninstalled var features = { astral: false, namespacing: false }; // Native methods to use and restore ('native' is an ES3 reserved keyword) var nativ = { exec: RegExp.prototype.exec, test: RegExp.prototype.test, match: String.prototype.match, replace: String.prototype.replace, split: String.prototype.split }; // Storage for fixed/extended native methods var fixed = {}; // Storage for regexes cached by `XRegExp.cache` var regexCache = {}; // Storage for pattern details cached by the `XRegExp` constructor var patternCache = {}; // Storage for regex syntax tokens added internally or by `XRegExp.addToken` var tokens = []; // Token scopes var defaultScope = 'default'; var classScope = 'class'; // Regexes that match native regex syntax, including octals var nativeTokens = { // Any native multicharacter token in default scope, or any single character 'default': /\\(?:0(?:[0-3][0-7]{0,2}|[4-7][0-7]?)?|[1-9]\d*|x[\dA-Fa-f]{2}|u(?:[\dA-Fa-f]{4}|{[\dA-Fa-f]+})|c[A-Za-z]|[\s\S])|\(\?(?:[:=!]|<[=!])|[?*+]\?|{\d+(?:,\d*)?}\??|[\s\S]/, // Any native multicharacter token in character class scope, or any single character 'class': /\\(?:[0-3][0-7]{0,2}|[4-7][0-7]?|x[\dA-Fa-f]{2}|u(?:[\dA-Fa-f]{4}|{[\dA-Fa-f]+})|c[A-Za-z]|[\s\S])|[\s\S]/ }; // Any backreference or dollar-prefixed character in replacement strings var replacementToken = /\$(?:{([\w$]+)}|<([\w$]+)>|(\d\d?|[\s\S]))/g; // Check for correct `exec` handling of nonparticipating capturing groups var correctExecNpcg = nativ.exec.call(/()??/, '')[1] === undefined; // Check for ES6 `flags` prop support var hasFlagsProp = /x/.flags !== undefined; // Shortcut to `Object.prototype.toString` var _ref = {}, toString = _ref.toString; function hasNativeFlag(flag) { // Can't check based on the presence of properties/getters since browsers might support such // properties even when they don't support the corresponding flag in regex construction (tested // in Chrome 48, where `'unicode' in /x/` is true but trying to construct a regex with flag `u` // throws an error) var isSupported = true; try { // Can't use regex literals for testing even in a `try` because regex literals with // unsupported flags cause a compilation error in IE new RegExp('', flag); } catch (exception) { isSupported = false; } return isSupported; } // Check for ES6 `u` flag support var hasNativeU = hasNativeFlag('u'); // Check for ES6 `y` flag support var hasNativeY = hasNativeFlag('y'); // Tracker for known flags, including addon flags var registeredFlags = { g: true, i: true, m: true, u: hasNativeU, y: hasNativeY }; /** * Attaches extended data and `XRegExp.prototype` properties to a regex object. * * @private * @param {RegExp} regex Regex to augment. * @param {Array} captureNames Array with capture names, or `null`. * @param {String} xSource XRegExp pattern used to generate `regex`, or `null` if N/A. * @param {String} xFlags XRegExp flags used to generate `regex`, or `null` if N/A. * @param {Boolean} [isInternalOnly=false] Whether the regex will be used only for internal * operations, and never exposed to users. For internal-only regexes, we can improve perf by * skipping some operations like attaching `XRegExp.prototype` properties. * @returns {RegExp} Augmented regex. */ function augment(regex, captureNames, xSource, xFlags, isInternalOnly) { regex[REGEX_DATA] = { captureNames: captureNames }; if (isInternalOnly) { return regex; } // Can't auto-inherit these since the XRegExp constructor returns a nonprimitive value if (regex.__proto__) { regex.__proto__ = XRegExp.prototype; } else { for (var p in XRegExp.prototype) { // An `XRegExp.prototype.hasOwnProperty(p)` check wouldn't be worth it here, since this // is performance sensitive, and enumerable `Object.prototype` or `RegExp.prototype` // extensions exist on `regex.prototype` anyway regex[p] = XRegExp.prototype[p]; } } regex[REGEX_DATA].source = xSource; // Emulate the ES6 `flags` prop by ensuring flags are in alphabetical order regex[REGEX_DATA].flags = xFlags ? xFlags.split('').sort().join('') : xFlags; return regex; } /** * Removes any duplicate characters from the provided string. * * @private * @param {String} str String to remove duplicate characters from. * @returns {String} String with any duplicate characters removed. */ function clipDuplicates(str) { return nativ.replace.call(str, /([\s\S])(?=[\s\S]*\1)/g, ''); } /** * Copies a regex object while preserving extended data and augmenting with `XRegExp.prototype` * properties. The copy has a fresh `lastIndex` property (set to zero). Allows adding and removing * flags g and y while copying the regex. * * @private * @param {RegExp} regex Regex to copy. * @param {Object} [options] Options object with optional properties: * - `addG` {Boolean} Add flag g while copying the regex. * - `addY` {Boolean} Add flag y while copying the regex. * - `removeG` {Boolean} Remove flag g while copying the regex. * - `removeY` {Boolean} Remove flag y while copying the regex. * - `isInternalOnly` {Boolean} Whether the copied regex will be used only for internal * operations, and never exposed to users. For internal-only regexes, we can improve perf by * skipping some operations like attaching `XRegExp.prototype` properties. * - `source` {String} Overrides `<regex>.source`, for special cases. * @returns {RegExp} Copy of the provided regex, possibly with modified flags. */ function copyRegex(regex, options) { if (!XRegExp.isRegExp(regex)) { throw new TypeError('Type RegExp expected'); } var xData = regex[REGEX_DATA] || {}; var flags = getNativeFlags(regex); var flagsToAdd = ''; var flagsToRemove = ''; var xregexpSource = null; var xregexpFlags = null; options = options || {}; if (options.removeG) { flagsToRemove += 'g'; } if (options.removeY) { flagsToRemove += 'y'; } if (flagsToRemove) { flags = nativ.replace.call(flags, new RegExp("[".concat(flagsToRemove, "]+"), 'g'), ''); } if (options.addG) { flagsToAdd += 'g'; } if (options.addY) { flagsToAdd += 'y'; } if (flagsToAdd) { flags = clipDuplicates(flags + flagsToAdd); } if (!options.isInternalOnly) { if (xData.source !== undefined) { xregexpSource = xData.source; } // null or undefined; don't want to add to `flags` if the previous value was null, since // that indicates we're not tracking original precompilation flags if (xData.flags != null) { // Flags are only added for non-internal regexes by `XRegExp.globalize`. Flags are never // removed for non-internal regexes, so don't need to handle it xregexpFlags = flagsToAdd ? clipDuplicates(xData.flags + flagsToAdd) : xData.flags; } } // Augment with `XRegExp.prototype` properties, but use the native `RegExp` constructor to avoid // searching for special tokens. That would be wrong for regexes constructed by `RegExp`, and // unnecessary for regexes constructed by `XRegExp` because the regex has already undergone the // translation to native regex syntax regex = augment(new RegExp(options.source || regex.source, flags), hasNamedCapture(regex) ? xData.captureNames.slice(0) : null, xregexpSource, xregexpFlags, options.isInternalOnly); return regex; } /** * Converts hexadecimal to decimal. * * @private * @param {String} hex * @returns {Number} */ function dec(hex) { return parseInt(hex, 16); } /** * Returns a pattern that can be used in a native RegExp in place of an ignorable token such as an * inline comment or whitespace with flag x. This is used directly as a token handler function * passed to `XRegExp.addToken`. * * @private * @param {String} match Match arg of `XRegExp.addToken` handler * @param {String} scope Scope arg of `XRegExp.addToken` handler * @param {String} flags Flags arg of `XRegExp.addToken` handler * @returns {String} Either '' or '(?:)', depending on which is needed in the context of the match. */ function getContextualTokenSeparator(match, scope, flags) { if ( // No need to separate tokens if at the beginning or end of a group match.input[match.index - 1] === '(' || match.input[match.index + match[0].length] === ')' || // No need to separate tokens if before or after a `|` match.input[match.index - 1] === '|' || match.input[match.index + match[0].length] === '|' || // No need to separate tokens if at the beginning or end of the pattern match.index < 1 || match.index + match[0].length >= match.input.length || // No need to separate tokens if at the beginning of a noncapturing group or lookahead. // The way this is written relies on: // - The search regex matching only 3-char strings. // - Although `substr` gives chars from the end of the string if given a negative index, // the resulting substring will be too short to match. Ex: `'abcd'.substr(-1, 3) === 'd'` nativ.test.call(/^\(\?[:=!]/, match.input.substr(match.index - 3, 3)) || // Avoid separating tokens when the following token is a quantifier isQuantifierNext(match.input, match.index + match[0].length, flags)) { return ''; } // Keep tokens separated. This avoids e.g. inadvertedly changing `\1 1` or `\1(?#)1` to `\11`. // This also ensures all tokens remain as discrete atoms, e.g. it avoids converting the syntax // error `(? :` into `(?:`. return '(?:)'; } /** * Returns native `RegExp` flags used by a regex object. * * @private * @param {RegExp} regex Regex to check. * @returns {String} Native flags in use. */ function getNativeFlags(regex) { return hasFlagsProp ? regex.flags : // Explicitly using `RegExp.prototype.toString` (rather than e.g. `String` or concatenation // with an empty string) allows this to continue working predictably when // `XRegExp.proptotype.toString` is overridden nativ.exec.call(/\/([a-z]*)$/i, RegExp.prototype.toString.call(regex))[1]; } /** * Determines whether a regex has extended instance data used to track capture names. * * @private * @param {RegExp} regex Regex to check. * @returns {Boolean} Whether the regex uses named capture. */ function hasNamedCapture(regex) { return !!(regex[REGEX_DATA] && regex[REGEX_DATA].captureNames); } /** * Converts decimal to hexadecimal. * * @private * @param {Number|String} dec * @returns {String} */ function hex(dec) { return parseInt(dec, 10).toString(16); } /** * Checks whether the next nonignorable token after the specified position is a quantifier. * * @private * @param {String} pattern Pattern to search within. * @param {Number} pos Index in `pattern` to search at. * @param {String} flags Flags used by the pattern. * @returns {Boolean} Whether the next nonignorable token is a quantifier. */ function isQuantifierNext(pattern, pos, flags) { return nativ.test.call(flags.indexOf('x') !== -1 ? // Ignore any leading whitespace, line comments, and inline comments /^(?:\s|#[^#\n]*|\(\?#[^)]*\))*(?:[?*+]|{\d+(?:,\d*)?})/ : // Ignore any leading inline comments /^(?:\(\?#[^)]*\))*(?:[?*+]|{\d+(?:,\d*)?})/, pattern.slice(pos)); } /** * Determines whether a value is of the specified type, by resolving its internal [[Class]]. * * @private * @param {*} value Object to check. * @param {String} type Type to check for, in TitleCase. * @returns {Boolean} Whether the object matches the type. */ function isType(value, type) { return toString.call(value) === "[object ".concat(type, "]"); } /** * Adds leading zeros if shorter than four characters. Used for fixed-length hexadecimal values. * * @private * @param {String} str * @returns {String} */ function pad4(str) { while (str.length < 4) { str = "0".concat(str); } return str; } /** * Checks for flag-related errors, and strips/applies flags in a leading mode modifier. Offloads * the flag preparation logic from the `XRegExp` constructor. * * @private * @param {String} pattern Regex pattern, possibly with a leading mode modifier. * @param {String} flags Any combination of flags. * @returns {Object} Object with properties `pattern` and `flags`. */ function prepareFlags(pattern, flags) { // Recent browsers throw on duplicate flags, so copy this behavior for nonnative flags if (clipDuplicates(flags) !== flags) { throw new SyntaxError("Invalid duplicate regex flag ".concat(flags)); } // Strip and apply a leading mode modifier with any combination of flags except g or y pattern = nativ.replace.call(pattern, /^\(\?([\w$]+)\)/, function ($0, $1) { if (nativ.test.call(/[gy]/, $1)) { throw new SyntaxError("Cannot use flag g or y in mode modifier ".concat($0)); } // Allow duplicate flags within the mode modifier flags = clipDuplicates(flags + $1); return ''; }); // Throw on unknown native or nonnative flags var _iteratorNormalCompletion = true; var _didIteratorError = false; var _iteratorError = undefined; try { for (var _iterator = flags[Symbol.iterator](), _step; !(_iteratorNormalCompletion = (_step = _iterator.next()).done); _iteratorNormalCompletion = true) { var flag = _step.value; if (!registeredFlags[flag]) { throw new SyntaxError("Unknown regex flag ".concat(flag)); } } } catch (err) { _didIteratorError = true; _iteratorError = err; } finally { try { if (!_iteratorNormalCompletion && _iterator.return != null) { _iterator.return(); } } finally { if (_didIteratorError) { throw _iteratorError; } } } return { pattern: pattern, flags: flags }; } /** * Prepares an options object from the given value. * * @private * @param {String|Object} value Value to convert to an options object. * @returns {Object} Options object. */ function prepareOptions(value) { var options = {}; if (isType(value, 'String')) { XRegExp.forEach(value, /[^\s,]+/, function (match) { options[match] = true; }); return options; } return value; } /** * Registers a flag so it doesn't throw an 'unknown flag' error. * * @private * @param {String} flag Single-character flag to register. */ function registerFlag(flag) { if (!/^[\w$]$/.test(flag)) { throw new Error('Flag must be a single character A-Za-z0-9_$'); } registeredFlags[flag] = true; } /** * Runs built-in and custom regex syntax tokens in reverse insertion order at the specified * position, until a match is found. * * @private * @param {String} pattern Original pattern from which an XRegExp object is being built. * @param {String} flags Flags being used to construct the regex. * @param {Number} pos Position to search for tokens within `pattern`. * @param {Number} scope Regex scope to apply: 'default' or 'class'. * @param {Object} context Context object to use for token handler functions. * @returns {Object} Object with properties `matchLength`, `output`, and `reparse`; or `null`. */ function runTokens(pattern, flags, pos, scope, context) { var i = tokens.length; var leadChar = pattern[pos]; var result = null; var match; var t; // Run in reverse insertion order while (i--) { t = tokens[i]; if (t.leadChar && t.leadChar !== leadChar || t.scope !== scope && t.scope !== 'all' || t.flag && !(flags.indexOf(t.flag) !== -1)) { continue; } match = XRegExp.exec(pattern, t.regex, pos, 'sticky'); if (match) { result = { matchLength: match[0].length, output: t.handler.call(context, match, scope, flags), reparse: t.reparse }; // Finished with token tests break; } } return result; } /** * Enables or disables implicit astral mode opt-in. When enabled, flag A is automatically added to * all new regexes created by XRegExp. This causes an error to be thrown when creating regexes if * the Unicode Base addon is not available, since flag A is registered by that addon. * * @private * @param {Boolean} on `true` to enable; `false` to disable. */ function setAstral(on) { features.astral = on; } /** * Adds named capture groups to the `groups` property of match arrays. See here for details: * https://github.com/tc39/proposal-regexp-named-groups * * @private * @param {Boolean} on `true` to enable; `false` to disable. */ function setNamespacing(on) { features.namespacing = on; } /** * Returns the object, or throws an error if it is `null` or `undefined`. This is used to follow * the ES5 abstract operation `ToObject`. * * @private * @param {*} value Object to check and return. * @returns {*} The provided object. */ function toObject(value) { // null or undefined if (value == null) { throw new TypeError('Cannot convert null or undefined to object'); } return value; } // ==--------------------------== // Constructor // ==--------------------------== /** * Creates an extended regular expression object for matching text with a pattern. Differs from a * native regular expression in that additional syntax and flags are supported. The returned object * is in fact a native `RegExp` and works with all native methods. * * @class XRegExp * @constructor * @param {String|RegExp} pattern Regex pattern string, or an existing regex object to copy. * @param {String} [flags] Any combination of flags. * Native flags: * - `g` - global * - `i` - ignore case * - `m` - multiline anchors * - `u` - unicode (ES6) * - `y` - sticky (Firefox 3+, ES6) * Additional XRegExp flags: * - `n` - explicit capture * - `s` - dot matches all (aka singleline) * - `x` - free-spacing and line comments (aka extended) * - `A` - astral (requires the Unicode Base addon) * Flags cannot be provided when constructing one `RegExp` from another. * @returns {RegExp} Extended regular expression object. * @example * * // With named capture and flag x * XRegExp(`(?<year> [0-9]{4} ) -? # year * (?<month> [0-9]{2} ) -? # month * (?<day> [0-9]{2} ) # day`, 'x'); * * // Providing a regex object copies it. Native regexes are recompiled using native (not XRegExp) * // syntax. Copies maintain extended data, are augmented with `XRegExp.prototype` properties, and * // have fresh `lastIndex` properties (set to zero). * XRegExp(/regex/); */ function XRegExp(pattern, flags) { if (XRegExp.isRegExp(pattern)) { if (flags !== undefined) { throw new TypeError('Cannot supply flags when copying a RegExp'); } return copyRegex(pattern); } // Copy the argument behavior of `RegExp` pattern = pattern === undefined ? '' : String(pattern); flags = flags === undefined ? '' : String(flags); if (XRegExp.isInstalled('astral') && !(flags.indexOf('A') !== -1)) { // This causes an error to be thrown if the Unicode Base addon is not available flags += 'A'; } if (!patternCache[pattern]) { patternCache[pattern] = {}; } if (!patternCache[pattern][flags]) { var context = { hasNamedCapture: false, captureNames: [] }; var scope = defaultScope; var output = ''; var pos = 0; var result; // Check for flag-related errors, and strip/apply flags in a leading mode modifier var applied = prepareFlags(pattern, flags); var appliedPattern = applied.pattern; var appliedFlags = applied.flags; // Use XRegExp's tokens to translate the pattern to a native regex pattern. // `appliedPattern.length` may change on each iteration if tokens use `reparse` while (pos < appliedPattern.length) { do { // Check for custom tokens at the current position result = runTokens(appliedPattern, appliedFlags, pos, scope, context); // If the matched token used the `reparse` option, splice its output into the // pattern before running tokens again at the same position if (result && result.reparse) { appliedPattern = appliedPattern.slice(0, pos) + result.output + appliedPattern.slice(pos + result.matchLength); } } while (result && result.reparse); if (result) { output += result.output; pos += result.matchLength || 1; } else { // Get the native token at the current position var _XRegExp$exec = XRegExp.exec(appliedPattern, nativeTokens[scope], pos, 'sticky'), _XRegExp$exec2 = _slicedToArray(_XRegExp$exec, 1), token = _XRegExp$exec2[0]; output += token; pos += token.length; if (token === '[' && scope === defaultScope) { scope = classScope; } else if (token === ']' && scope === classScope) { scope = defaultScope; } } } patternCache[pattern][flags] = { // Use basic cleanup to collapse repeated empty groups like `(?:)(?:)` to `(?:)`. Empty // groups are sometimes inserted during regex transpilation in order to keep tokens // separated. However, more than one empty group in a row is never needed. pattern: nativ.replace.call(output, /(?:\(\?:\))+/g, '(?:)'), // Strip all but native flags flags: nativ.replace.call(appliedFlags, /[^gimuy]+/g, ''), // `context.captureNames` has an item for each capturing group, even if unnamed captures: context.hasNamedCapture ? context.captureNames : null }; } var generated = patternCache[pattern][flags]; return augment(new RegExp(generated.pattern, generated.flags), generated.captures, pattern, flags); } // Add `RegExp.prototype` to the prototype chain XRegExp.prototype = /(?:)/; // ==--------------------------== // Public properties // ==--------------------------== /** * The XRegExp version number as a string containing three dot-separated parts. For example, * '2.0.0-beta-3'. * * @static * @memberOf XRegExp * @type String */ XRegExp.version = '4.2.0'; // ==--------------------------== // Public methods // ==--------------------------== // Intentionally undocumented; used in tests and addons XRegExp._clipDuplicates = clipDuplicates; XRegExp._hasNativeFlag = hasNativeFlag; XRegExp._dec = dec; XRegExp._hex = hex; XRegExp._pad4 = pad4; /** * Extends XRegExp syntax and allows custom flags. This is used internally and can be used to * create XRegExp addons. If more than one token can match the same string, the last added wins. * * @memberOf XRegExp * @param {RegExp} regex Regex object that matches the new token. * @param {Function} handler Function that returns a new pattern string (using native regex syntax) * to replace the matched token within all future XRegExp regexes. Has access to persistent * properties of the regex being built, through `this`. Invoked with three arguments: * - The match array, with named backreference properties. * - The regex scope where the match was found: 'default' or 'class'. * - The flags used by the regex, including any flags in a leading mode modifier. * The handler function becomes part of the XRegExp construction process, so be careful not to * construct XRegExps within the function or you will trigger infinite recursion. * @param {Object} [options] Options object with optional properties: * - `scope` {String} Scope where the token applies: 'default', 'class', or 'all'. * - `flag` {String} Single-character flag that triggers the token. This also registers the * flag, which prevents XRegExp from throwing an 'unknown flag' error when the flag is used. * - `optionalFlags` {String} Any custom flags checked for within the token `handler` that are * not required to trigger the token. This registers the flags, to prevent XRegExp from * throwing an 'unknown flag' error when any of the flags are used. * - `reparse` {Boolean} Whether the `handler` function's output should not be treated as * final, and instead be reparseable by other tokens (including the current token). Allows * token chaining or deferring. * - `leadChar` {String} Single character that occurs at the beginning of any successful match * of the token (not always applicable). This doesn't change the behavior of the token unless * you provide an erroneous value. However, providing it can increase the token's performance * since the token can be skipped at any positions where this character doesn't appear. * @example * * // Basic usage: Add \a for the ALERT control code * XRegExp.addToken( * /\\a/, * () => '\\x07', * {scope: 'all'} * ); * XRegExp('\\a[\\a-\\n]+').test('\x07\n\x07'); // -> true * * // Add the U (ungreedy) flag from PCRE and RE2, which reverses greedy and lazy quantifiers. * // Since `scope` is not specified, it uses 'default' (i.e., transformations apply outside of * // character classes only) * XRegExp.addToken( * /([?*+]|{\d+(?:,\d*)?})(\??)/, * (match) => `${match[1]}${match[2] ? '' : '?'}`, * {flag: 'U'} * ); * XRegExp('a+', 'U').exec('aaa')[0]; // -> 'a' * XRegExp('a+?', 'U').exec('aaa')[0]; // -> 'aaa' */ XRegExp.addToken = function (regex, handler, options) { options = options || {}; var _options = options, optionalFlags = _options.optionalFlags; if (options.flag) { registerFlag(options.flag); } if (optionalFlags) { optionalFlags = nativ.split.call(optionalFlags, ''); var _iteratorNormalCompletion2 = true; var _didIteratorError2 = false; var _iteratorError2 = undefined; try { for (var _iterator2 = optionalFlags[Symbol.iterator](), _step2; !(_iteratorNormalCompletion2 = (_step2 = _iterator2.next()).done); _iteratorNormalCompletion2 = true) { var flag = _step2.value; registerFlag(flag); } } catch (err) { _didIteratorError2 = true; _iteratorError2 = err; } finally { try { if (!_iteratorNormalCompletion2 && _iterator2.return != null) { _iterator2.return(); } } finally { if (_didIteratorError2) { throw _iteratorError2; } } } } // Add to the private list of syntax tokens tokens.push({ regex: copyRegex(regex, { addG: true, addY: hasNativeY, isInternalOnly: true }), handler: handler, scope: options.scope || defaultScope, flag: options.flag, reparse: options.reparse, leadChar: options.leadChar }); // Reset the pattern cache used by the `XRegExp` constructor, since the same pattern and flags // might now produce different results XRegExp.cache.flush('patterns'); }; /** * Caches and returns the result of calling `XRegExp(pattern, flags)`. On any subsequent call with * the same pattern and flag combination, the cached copy of the regex is returned. * * @memberOf XRegExp * @param {String} pattern Regex pattern string. * @param {String} [flags] Any combination of XRegExp flags. * @returns {RegExp} Cached XRegExp object. * @example * * while (match = XRegExp.cache('.', 'gs').exec(str)) { * // The regex is compiled once only * } */ XRegExp.cache = function (pattern, flags) { if (!regexCache[pattern]) { regexCache[pattern] = {}; } return regexCache[pattern][flags] || (regexCache[pattern][flags] = XRegExp(pattern, flags)); }; // Intentionally undocumented; used in tests XRegExp.cache.flush = function (cacheName) { if (cacheName === 'patterns') { // Flush the pattern cache used by the `XRegExp` constructor patternCache = {}; } else { // Flush the regex cache populated by `XRegExp.cache` regexCache = {}; } }; /** * Escapes any regular expression metacharacters, for use when matching literal strings. The result * can safely be used at any point within a regex that uses any flags. * * @memberOf XRegExp * @param {String} str String to escape. * @returns {String} String with regex metacharacters escaped. * @example * * XRegExp.escape('Escaped? <.>'); * // -> 'Escaped\?\ <\.>' */ XRegExp.escape = function (str) { return nativ.replace.call(toObject(str), /[-\[\]{}()*+?.,\\^$|#\s]/g, '\\$&'); }; /** * Executes a regex search in a specified string. Returns a match array or `null`. If the provided * regex uses named capture, named backreference properties are included on the match array. * Optional `pos` and `sticky` arguments specify the search start position, and whether the match * must start at the specified position only. The `lastIndex` property of the provided regex is not * used, but is updated for compatibility. Also fixes browser bugs compared to the native * `RegExp.prototype.exec` and can be used reliably cross-browser. * * @memberOf XRegExp * @param {String} str String to search. * @param {RegExp} regex Regex to search with. * @param {Number} [pos=0] Zero-based index at which to start the search. * @param {Boolean|String} [sticky=false] Whether the match must start at the specified position * only. The string `'sticky'` is accepted as an alternative to `true`. * @returns {Array} Match array with named backreference properties, or `null`. * @example * * // Basic use, with named backreference * let match = XRegExp.exec('U+2620', XRegExp('U\\+(?<hex>[0-9A-F]{4})')); * match.hex; // -> '2620' * * // With pos and sticky, in a loop * let pos = 2, result = [], match; * while (match = XRegExp.exec('<1><2><3><4>5<6>', /<(\d)>/, pos, 'sticky')) { * result.push(match[1]); * pos = match.index + match[0].length; * } * // result -> ['2', '3', '4'] */ XRegExp.exec = function (str, regex, pos, sticky) { var cacheKey = 'g'; var addY = false; var fakeY = false; var match; addY = hasNativeY && !!(sticky || regex.sticky && sticky !== false); if (addY) { cacheKey += 'y'; } else if (sticky) { // Simulate sticky matching by appending an empty capture to the original regex. The // resulting regex will succeed no matter what at the current index (set with `lastIndex`), // and will not search the rest of the subject string. We'll know that the original regex // has failed if that last capture is `''` rather than `undefined` (i.e., if that last // capture participated in the match). fakeY = true; cacheKey += 'FakeY'; } regex[REGEX_DATA] = regex[REGEX_DATA] || {}; // Shares cached copies with `XRegExp.match`/`replace` var r2 = regex[REGEX_DATA][cacheKey] || (regex[REGEX_DATA][cacheKey] = copyRegex(regex, { addG: true, addY: addY, source: fakeY ? "".concat(regex.source, "|()") : undefined, removeY: sticky === false, isInternalOnly: true })); pos = pos || 0; r2.lastIndex = pos; // Fixed `exec` required for `lastIndex` fix, named backreferences, etc. match = fixed.exec.call(r2, str); // Get rid of the capture added by the pseudo-sticky matcher if needed. An empty string means // the original regexp failed (see above). if (fakeY && match && match.pop() === '') { match = null; } if (regex.global) { regex.lastIndex = match ? r2.lastIndex : 0; } return match; }; /** * Executes a provided function once per regex match. Searches always start at the beginning of the * string and continue until the end, regardless of the state of the regex's `global` property and * initial `lastIndex`. * * @memberOf XRegExp * @param {String} str String to search. * @param {RegExp} regex Regex to search with. * @param {Function} callback Function to execute for each match. Invoked with four arguments: * - The match array, with named backreference properties. * - The zero-based match index. * - The string being traversed. * - The regex object being used to traverse the string. * @example * * // Extracts every other digit from a string * const evens = []; * XRegExp.forEach('1a2345', /\d/, (match, i) => { * if (i % 2) evens.push(+match[0]); * }); * // evens -> [2, 4] */ XRegExp.forEach = function (str, regex, callback) { var pos = 0; var i = -1; var match; while (match = XRegExp.exec(str, regex, pos)) { // Because `regex` is provided to `callback`, the function could use the deprecated/ // nonstandard `RegExp.prototype.compile` to mutate the regex. However, since `XRegExp.exec` // doesn't use `lastIndex` to set the search position, this can't lead to an infinite loop, // at least. Actually, because of the way `XRegExp.exec` caches globalized versions of // regexes, mutating the regex will not have any effect on the iteration or matched strings, // which is a nice side effect that brings extra safety. callback(match, ++i, str, regex); pos = match.index + (match[0].length || 1); } }; /** * Copies a regex object and adds flag `g`. The copy maintains extended data, is augmented with * `XRegExp.prototype` properties, and has a fresh `lastIndex` property (set to zero). Native * regexes are not recompiled using XRegExp syntax. * * @memberOf XRegExp * @param {RegExp} regex Regex to globalize. * @returns {RegExp} Copy of the provided regex with flag `g` added. * @example * * const globalCopy = XRegExp.globalize(/regex/); * globalCopy.global; // -> true */ XRegExp.globalize = function (regex) { return copyRegex(regex, { addG: true }); }; /** * Installs optional features according to the specified options. Can be undone using * `XRegExp.uninstall`. * * @memberOf XRegExp * @param {Object|String} options Options object or string. * @example * * // With an options object * XRegExp.install({ * // Enables support for astral code points in Unicode addons (implicitly sets flag A) * astral: true, * * // Adds named capture groups to the `groups` property of matches * namespacing: true * }); * * // With an options string * XRegExp.install('astral namespacing'); */ XRegExp.install = function (options) { options = prepareOptions(options); if (!features.astral && options.astral) { setAstral(true); } if (!features.namespacing && options.namespacing) { setNamespacing(true); } }; /** * Checks whether an individual optional feature is installed. * * @memberOf XRegExp * @param {String} feature Name of the feature to check. One of: * - `astral` * - `namespacing` * @returns {Boolean} Whether the feature is installed. * @example * * XRegExp.isInstalled('astral'); */ XRegExp.isInstalled = function (feature) { return !!features[feature]; }; /** * Returns `true` if an object is a regex; `false` if it isn't. This works correctly for regexes * created in another frame, when `instanceof` and `constructor` checks would fail. * * @memberOf XRegExp * @param {*} value Object to check. * @returns {Boolean} Whether the object is a `RegExp` object. * @example * * XRegExp.isRegExp('string'); // -> false * XRegExp.isRegExp(/regex/i); // -> true * XRegExp.isRegExp(RegExp('^', 'm')); // -> true * XRegExp.isRegExp(XRegExp('(?s).')); // -> true */ XRegExp.isRegExp = function (value) { return toString.call(value) === '[object RegExp]'; }; // isType(value, 'RegExp'); /** * Returns the first matched string, or in global mode, an array containing all matched strings. * This is essentially a more convenient re-implementation of `String.prototype.match` that gives * the result types you actually want (string instead of `exec`-style array in match-first mode, * and an empty array instead of `null` when no matches are found in match-all mode). It also lets * you override flag g and ignore `lastIndex`, and fixes browser bugs. * * @memberOf XRegExp * @param {String} str String to search. * @param {RegExp} regex Regex to search with. * @param {String} [scope='one'] Use 'one' to return the first match as a string. Use 'all' to * return an array of all matched strings. If not explicitly specified and `regex` uses flag g, * `scope` is 'all'. * @returns {String|Array} In match-first mode: First match as a string, or `null`. In match-all * mode: Array of all matched strings, or an empty array. * @example * * // Match first * XRegExp.match('abc', /\w/); // -> 'a' * XRegExp.match('abc', /\w/g, 'one'); // -> 'a' * XRegExp.match('abc', /x/g, 'one'); // -> null * * // Match all * XRegExp.match('abc', /\w/g); // -> ['a', 'b', 'c'] * XRegExp.match('abc', /\w/, 'all'); // -> ['a', 'b', 'c'] * XRegExp.match('abc', /x/, 'all'); // -> [] */ XRegExp.match = function (str, regex, scope) { var global = regex.global && scope !== 'one' || scope === 'all'; var cacheKey = (global ? 'g' : '') + (regex.sticky ? 'y' : '') || 'noGY'; regex[REGEX_DATA] = regex[REGEX_DATA] || {}; // Shares cached copies with `XRegExp.exec`/`replace` var r2 = regex[REGEX_DATA][cacheKey] || (regex[REGEX_DATA][cacheKey] = copyRegex(regex, { addG: !!global, removeG: scope === 'one', isInternalOnly: true })); var result = nativ.match.call(toObject(str), r2); if (regex.global) { regex.lastIndex = scope === 'one' && result ? // Can't use `r2.lastIndex` since `r2` is nonglobal in this case result.index + result[0].length : 0; } return global ? result || [] : result && result[0]; }; /** * Retrieves the matches from searching a string using a chain of regexes that successively search * within previous matches. The provided `chain` array can contain regexes and or objects with * `regex` and `backref` properties. When a backreference is specified, the named or numbered * backreference is passed forward to the next regex or returned. * * @memberOf XRegExp * @param {String} str String to search. * @param {Array} chain Regexes that each search for matches within preceding results. * @returns {Array} Matches by the last regex in the chain, or an empty array. * @example * * // Basic usage; matches numbers within <b> tags * XRegExp.matchChain('1 <b>2</b> 3 <b>4 a 56</b>', [ * XRegExp('(?is)<b>.*?</b>'), * /\d+/ * ]); * // -> ['2', '4', '56'] * * // Passing forward and returning specific backreferences * html = '<a href="http://xregexp.com/api/">XRegExp</a>\ * <a href="http://www.google.com/">Google</a>'; * XRegExp.matchChain(html, [ * {regex: /<a href="([^"]+)">/i, backref: 1}, * {regex: XRegExp('(?i)^https?://(?<domain>[^/?#]+)'), backref: 'domain'} * ]); * // -> ['xregexp.com', 'www.google.com'] */ XRegExp.matchChain = function (str, chain) { return function recurseChain(values, level) { var item = chain[level].regex ? chain[level] : { regex: chain[level] }; var matches = []; function addMatch(match) { if (item.backref) { var ERR_UNDEFINED_GROUP = "Backreference to undefined group: ".concat(item.backref); var isNamedBackref = isNaN(item.backref); if (isNamedBackref && XRegExp.isInstalled('namespacing')) { // `groups` has `null` as prototype, so using `in` instead of `hasOwnProperty` if (!(item.backref in match.groups)) { throw new ReferenceError(ERR_UNDEFINED_GROUP); } } else if (!match.hasOwnProperty(item.backref)) { throw new ReferenceError(ERR_UNDEFINED_GROUP); } var backrefValue = isNamedBackref && XRegExp.isInstalled('namespacing') ? match.groups[item.backref] : match[item.backref]; matches.push(backrefValue || ''); } else { matches.push(match[0]); } } var _iteratorNormalCompletion3 = true; var _didIteratorError3 = false; var _iteratorError3 = undefined; try { for (var _iterator3 = values[Symbol.iterator](), _step3; !(_iteratorNormalCompletion3 = (_step3 = _iterator3.next()).done); _iteratorNormalCompletion3 = true) { var value = _step3.value; XRegExp.forEach(value, item.regex, addMatch); } } catch (err) { _didIteratorError3 = true; _iteratorError3 = err; } finally { try { if (!_iteratorNormalCompletion3 && _iterator3.return != null) { _iterator3.return(); } } finally { if (_didIteratorError3) { throw _iteratorError3; } } } return level === chain.length - 1 || !matches.length ? matches : recurseChain(matches, level + 1); }([str], 0); }; /** * Returns a new string with one or all matches of a pattern replaced. The pattern can be a string * or regex, and the replacement can be a string or a function to be called for each match. To * perform a global search and replace, use the optional `scope` argument or include flag g if using * a regex. Replacement strings can use `${n}` or `$<n>` for named and numbered backreferences. * Replacement functions can use named backreferences via `arguments[0].name`. Also fixes browser * bugs compared to the native `String.prototype.replace` and can be used reliably cross-browser. * * @memberOf XRegExp * @param {String} str String to search. * @param {RegExp|String} search Search pattern to be replaced. * @param {String|Function} replacement Replacement string or a function invoked to create it. * Replacement strings can include special replacement syntax: * - $$ - Inserts a literal $ character. * - $&, $0 - Inserts the matched substring. * - $` - Inserts the string that precedes the matched substring (left context). * - $' - Inserts the string that follows the matched substring (right context). * - $n, $nn - Where n/nn are digits referencing an existent capturing group, inserts * backreference n/nn. * - ${n}, $<n> - Where n is a name or any number of digits that reference an existent capturing * group, inserts backreference n. * Replacement functions are invoked with three or more arguments: * - The matched substring (corresponds to $& above). Named backreferences are accessible as * properties of this first argument. * - 0..n arguments, one for each backreference (corresponding to $1, $2, etc. above). * - The zero-based index of the match within the total search string. * - The total string being searched. * @param {String} [scope='one'] Use 'one' to replace the first match only, or 'all'. If not * explicitly specified and using a regex with flag g, `scope` is 'all'. * @returns {String} New string with one or all matches replaced. * @example * * // Regex search, using named backreferences in replacement string * const name = XRegExp('(?<first>\\w+) (?<last>\\w+)'); * XRegExp.replace('John Smith', name, '$<last>, $<first>'); * // -> 'Smith, John' * * // Regex search, using named backreferences in replacement function * XRegExp.replace('John Smith', name, (match) => `${match.last}, ${match.first}`); * // -> 'Smith, John' * * // String search, with replace-all * XRegExp.replace('RegExp builds RegExps', 'RegExp', 'XRegExp', 'all'); * // -> 'XRegExp builds XRegExps' */ XRegExp.replace = function (str, search, replacement, scope) { var isRegex = XRegExp.isRegExp(search); var global = search.global && scope !== 'one' || scope === 'all'; var cacheKey = (global ? 'g' : '') + (search.sticky ? 'y' : '') || 'noGY'; var s2 = search; if (isRegex) { search[REGEX_DATA] = search[REGEX_DATA] || {}; // Shares cached copies with `XRegExp.exec`/`match`. Since a copy is used, `search`'s // `lastIndex` isn't updated *during* replacement iterations s2 = search[REGEX_DATA][cacheKey] || (search[REGEX_DATA][cacheKey] = copyRegex(search, { addG: !!global, removeG: scope === 'one', isInternalOnly: true })); } else if (global) { s2 = new RegExp(XRegExp.escape(String(search)), 'g'); } // Fixed `replace` required for named backreferences, etc. var result = fixed.replace.call(toObject(str), s2, replacement); if (isRegex && search.global) { // Fixes IE, Safari bug (last tested IE 9, Safari 5.1) search.lastIndex = 0; } return result; }; /** * Performs batch processing of string replacements. Used like `XRegExp.replace`, but accepts an * array of replacement details. Later replacements operate on the output of earlier replacements. * Replacement details are accepted as an array with a regex or string to search for, the * replacement string or function, and an optional scope of 'one' or 'all'. Uses the XRegExp * replacement text syntax, which supports named backreference properties via `${name}` or * `$<name>`. * * @memberOf XRegExp * @param {String} str String to search. * @param {Array} replacements Array of replacement detail arrays. * @returns {String} New string with all replacements. * @example * * str = XRegExp.replaceEach(str, [ * [XRegExp('(?<name>a)'), 'z${name}'], * [/b/gi, 'y'], * [/c/g, 'x', 'one'], // scope 'one' overrides /g * [/d/, 'w', 'all'], // scope 'all' overrides lack of /g * ['e', 'v', 'all'], // scope 'all' allows replace-all for strings * [/f/g, ($0) => $0.toUpperCase()] * ]); */ XRegExp.replaceEach = function (str, replacements) { var _iteratorNormalCompletion4 = true; var _didIteratorError4 = false; var _iteratorError4 = undefined; try { for (var _iterator4 = replacements[Symbol.iterator](), _step4; !(_iteratorNormalCompletion4 = (_step4 = _iterator4.next()).done); _iteratorNormalCompletion4 = true) { var r = _step4.value; str = XRegExp.replace(str, r[0], r[1], r[2]); } } catch (err) { _didIteratorError4 = true; _iteratorError4 = err; } finally { try { if (!_iteratorNormalCompletion4 && _iterator4.return != null) { _iterator4.return(); } } finally { if (_didIteratorError4) { throw _iteratorError4; } } } return str; }; /** * Splits a string into an array of strings using a regex or string separator.