UNPKG

codemirror-mode-pcre

Version:

Perl Compatible Regular Expressions (PCRE) mode for CodeMirror

github.com/xavierog/codemirror-mode-pcre

xavierog/codemirror-mode-pcre

902 lines (844 loc) • 33.9 kB

JavaScript

// Declare global variables to avoid warnings in JSHint
/* global CodeMirror, define */
(function (mod) {
  if (typeof exports === "object" && typeof module === "object") // CommonJS
    mod(require("codemirror/lib/codemirror"));
  else if (typeof define === "function" && define.amd) // AMD
    define(["codemirror/lib/codemirror"], mod);
  else // Plain browser env
    mod(CodeMirror);
})(function (CodeMirror) {
  "use strict";

  /**
   * CodeMirror mode for Perl Compatible Regular Expressions (PCRE).
   *
   * The mode keeps a stack of context names (state.context) and a parallel stack of per-context
   * scratch objects (state.context_state). Every style string returned to CodeMirror is the
   * space-separated list of active contexts, optionally followed by a token-specific style.
   *
   * Recognised mode option:
   * - extended (default: true): whether '#' starts a comment that runs to the end of the line.
   */
  CodeMirror.defineMode('pcre', function(editor_options, mode_options) {
    // Default settings:
    var options = {
      extended: true,
    };
    // Override default settings with user-provided settings
    // (guarded so that a missing mode_options argument does not throw):
    if (mode_options && 'extended' in mode_options) options.extended = Boolean(mode_options.extended);

    // Opening delimiters and their closing counterpart; any other character closes itself.
    var delimiters = {
      '<': '>',
      '[': ']',
      '{': '}',
      '(': ')',
    };

    // Behaviour of alphanumeric characters after a backslash character (normal context):
    var backslash_in_normal_context = {
      '0': 'non-printing-character',
      '1': 'backreference',
      '2': 'backreference',
      '3': 'backreference',
      '4': 'backreference',
      '5': 'backreference',
      '6': 'backreference',
      '7': 'backreference',
      '8': 'backreference',
      '9': 'backreference',
      'A': 'anchor', // \A start of subject
      'B': 'anchor', // \B not a word boundary
      'C': 'generic-character-type', // \C one data unit, even in UTF mode (best avoided)
      'D': 'generic-character-type', // \D any character that is not a decimal digit
      'E': 'err no-error-message', // \E ends \Q but never matches 'E' -- PCRE does not emit any error message
      'F': '', // \F matches F
      'G': 'anchor', // \G first matching position in subject
      'H': 'generic-character-type', // \H any character that is not a horizontal white space character
      'I': '', // \I matches I
      'J': '', // \J matches J
      'K': 'anchor', // \K reset start of match (neither an anchor nor a simple assertion)
      'L': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
      'M': '', // \M matches M
      'N': 'generic-character-type', // \N a character that is not a newline
      'O': '', // \O matches O
      'P': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence
      'Q': 'escaped-sequence-start', // \Q starts \Q...\E escape sequences.
      'R': 'generic-character-type', // \R a newline sequence
      'S': 'generic-character-type', // \S any character that is not a white space character
      'T': '', // \T matches T
      'U': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
      'V': 'generic-character-type', // \V any character that is not a vertical white space character
      'W': 'generic-character-type', // \W any "non-word" character
      'X': 'generic-character-type', // \X a Unicode extended grapheme cluster
      'Y': '', // \Y matches Y
      'Z': 'anchor', // \Z matches at the end of the subject; also matches before a newline at the end of the subject
      'a': 'non-printing-character', // \a alarm, that is, the BEL character (hex 07)
      'b': 'anchor', // \b word boundary
      'c': 'err backslash-c-at-end-of-pattern', // \cx "control-x", where x is any ASCII character
      'd': 'generic-character-type', // \d any decimal digit
      'e': 'non-printing-character', // \e escape (hex 1B)
      'f': 'non-printing-character', // \f form feed (hex 0C)
      'g': 'err a-number-reference-must-not-be-zero', // a numbered reference must not be zero
      'h': 'generic-character-type', // \h any horizontal white space character
      'i': '', // \i matches i
      'j': '', // \j matches j
      'k': 'err backslash-k-is-not-followed-by-a-name', // \k is not followed by a [...] name
      'l': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
      'm': '', // \m matches m
      'n': 'non-printing-character', // \n linefeed (hex 0A)
      'o': '', // \o matches o
      'p': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence
      'q': '', // \q matches q
      'r': 'non-printing-character', // \r carriage return (hex 0D)
      's': 'generic-character-type', // \s any white space character
      't': 'non-printing-character', // \t tab (hex 09)
      'u': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
      'v': 'generic-character-type', // \v any vertical white space character
      'w': 'generic-character-type', // any "word" character
      'x': 'non-printing-character', // binary zero (or x if PCRE_JAVASCRIPT_COMPAT)
      'y': '', // \y matches y
      'z': 'anchor', // \z end of subject
    };

    // Behaviour of alphanumeric characters after a backslash character (character class context, i.e. [...]):
    var backslash_in_character_class = {
      '0': 'non-printing-character', // octal code
      '1': 'non-printing-character', // octal code
      '2': 'non-printing-character', // octal code
      '3': 'non-printing-character', // octal code
      '4': 'non-printing-character', // octal code
      '5': 'non-printing-character', // octal code
      '6': 'non-printing-character', // octal code
      '7': 'non-printing-character', // octal code
      '8': '', // \8 matches 8
      '9': '', // \9 matches 9
      'A': '', // \A matches A
      'B': '', // \B matches B -- \B, \R, and \X are not special inside a character class.
      'C': '', // \C matches C
      'D': 'generic-character-type', // \D any character that is not a decimal digit
      'E': 'err no-error-message', // \E ends \Q but never matches 'E' -- PCRE does not emit any error message
      'F': '', // \F matches F
      'G': '', // \G matches G
      'H': 'generic-character-type', // \H any character that is not a horizontal white space character
      'I': '', // \I matches I
      'J': '', // \J matches J
      'K': '', // \K matches K
      'L': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
      'M': '', // \M matches M
      'N': 'err backslash-n-is-not-supported-in-a-class', // \N is not allowed in a character class.
      'O': '', // \O matches O
      'P': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence
      'Q': 'escaped-sequence-start', // \Q starts \Q...\E escape sequences.
      'R': '', // \R matches R -- \B, \R, and \X are not special inside a character class.
      'S': 'generic-character-type', // \S any character that is not a white space character
      'T': '', // \T matches T
      'U': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
      'V': 'generic-character-type', // \V any character that is not a vertical white space character
      'W': 'generic-character-type', // \W any "non-word" character
      'X': '', // \X matches X -- \B, \R, and \X are not special inside a character class.
      'Y': '', // \Y matches Y
      'Z': '', // \Z matches Z
      'a': 'non-printing-character', // \a alarm, that is, the BEL character (hex 07)
      'b': 'non-printing-character', // inside a character class, \b is interpreted as the backspace character (hex 08)
      'c': 'err backslash-c-at-end-of-pattern', // \cx "control-x", where x is any ASCII character
      'd': 'generic-character-type', // \d any decimal digit
      'e': 'non-printing-character', // \e escape (hex 1B)
      'f': 'non-printing-character', // \f form feed (hex 0C)
      'g': '', // \g matches g
      'h': 'generic-character-type', // \h any horizontal white space character
      'i': '', // \i matches i
      'j': '', // \j matches j
      'k': '', // \k matches k
      'l': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
      'm': '', // \m matches m
      'n': 'non-printing-character', // \n linefeed (hex 0A)
      'o': '', // \o matches o
      'p': 'err malformed-backslash-p-sequence', // malformed \P or \p sequence
      'q': '', // \q matches q
      'r': 'non-printing-character', // \r carriage return (hex 0D)
      's': 'generic-character-type', // \s any white space character
      't': 'non-printing-character', // \t tab (hex 09)
      'u': 'err unsupported-escape-sequence', // PCRE does not support \L, \l, \N{name}, \U, or \u
      'v': 'generic-character-type', // \v any vertical white space character
      'w': 'generic-character-type', // any "word" character
      'x': 'non-printing-character', // binary zero (or x if PCRE_JAVASCRIPT_COMPAT)
      'y': '', // \y matches y
      'z': '', // \z matches z
    };

    // Property names accepted inside \p{...} and \P{...}; only key membership is tested by this mode.
    var backslask_p_properties = {
      // GENERAL CATEGORY PROPERTIES FOR \p and \P
      'C': 'Other',
      'Cc': 'Control',
      'Cf': 'Format',
      'Cn': 'Unassigned',
      'Co': 'Private use',
      'Cs': 'Surrogate',
      'L': 'Letter',
      'Ll': 'Lower case letter',
      'Lm': 'Modifier letter',
      'Lo': 'Other letter',
      'Lt': 'Title case letter',
      'Lu': 'Upper case letter',
      'L&': 'Ll, Lu, or Lt',
      'M': 'Mark',
      'Mc': 'Spacing mark',
      'Me': 'Enclosing mark',
      'Mn': 'Non-spacing mark',
      'N': 'Number',
      'Nd': 'Decimal number',
      'Nl': 'Letter number',
      'No': 'Other number',
      'P': 'Punctuation',
      'Pc': 'Connector punctuation',
      'Pd': 'Dash punctuation',
      'Pe': 'Close punctuation',
      'Pf': 'Final punctuation',
      'Pi': 'Initial punctuation',
      'Po': 'Other punctuation',
      'Ps': 'Open punctuation',
      'S': 'Symbol',
      'Sc': 'Currency symbol',
      'Sk': 'Modifier symbol',
      'Sm': 'Mathematical symbol',
      'So': 'Other symbol',
      'Z': 'Separator',
      'Zl': 'Line separator',
      'Zp': 'Paragraph separator',
      'Zs': 'Space separator',
      // PCRE SPECIAL CATEGORY PROPERTIES FOR \p and \P
      'Xan': 'Alphanumeric: union of properties L and N',
      'Xps': 'POSIX space: property Z or tab, NL, VT, FF, CR',
      'Xsp': 'Perl space: property Z or tab, NL, VT, FF, CR',
      'Xuc': 'Univerally-named character: one that can be represented by a Universal Character Name',
      'Xwd': 'Perl word: property Xan or underscore',
      // SCRIPT NAMES FOR \p AND \P
      'Arabic': true,
      'Armenian': true,
      'Avestan': true,
      'Balinese': true,
      'Bamum': true,
      'Bassa_Vah': true,
      'Batak': true,
      'Bengali': true,
      'Bopomofo': true,
      'Brahmi': true,
      'Braille': true,
      'Buginese': true,
      'Buhid': true,
      'Canadian_Aboriginal': true,
      'Carian': true,
      'Caucasian_Albanian': true,
      'Chakma': true,
      'Cham': true,
      'Cherokee': true,
      'Common': true,
      'Coptic': true,
      'Cuneiform': true,
      'Cypriot': true,
      'Cyrillic': true,
      'Deseret': true,
      'Devanagari': true,
      'Duployan': true,
      'Egyptian_Hieroglyphs': true,
      'Elbasan': true,
      'Ethiopic': true,
      'Georgian': true,
      'Glagolitic': true,
      'Gothic': true,
      'Grantha': true,
      'Greek': true,
      'Gujarati': true,
      'Gurmukhi': true,
      'Han': true,
      'Hangul': true,
      'Hanunoo': true,
      'Hebrew': true,
      'Hiragana': true,
      'Imperial_Aramaic': true,
      'Inherited': true,
      'Inscriptional_Pahlavi': true,
      'Inscriptional_Parthian': true,
      'Javanese': true,
      'Kaithi': true,
      'Kannada': true,
      'Katakana': true,
      'Kayah_Li': true,
      'Kharoshthi': true,
      'Khmer': true,
      'Khojki': true,
      'Khudawadi': true,
      'Lao': true,
      'Latin': true,
      'Lepcha': true,
      'Limbu': true,
      'Linear_A': true,
      'Linear_B': true,
      'Lisu': true,
      'Lycian': true,
      'Lydian': true,
      'Mahajani': true,
      'Malayalam': true,
      'Mandaic': true,
      'Manichaean': true,
      'Meetei_Mayek': true,
      'Mende_Kikakui': true,
      'Meroitic_Cursive': true,
      'Meroitic_Hieroglyphs': true,
      'Miao': true,
      'Modi': true,
      'Mongolian': true,
      'Mro': true,
      'Myanmar': true,
      'Nabataean': true,
      'New_Tai_Lue': true,
      'Nko': true,
      'Ogham': true,
      'Ol_Chiki': true,
      'Old_Italic': true,
      'Old_North_Arabian': true,
      'Old_Permic': true,
      'Old_Persian': true,
      'Old_South_Arabian': true,
      'Old_Turkic': true,
      'Oriya': true,
      'Osmanya': true,
      'Pahawh_Hmong': true,
      'Palmyrene': true,
      'Pau_Cin_Hau': true,
      'Phags_Pa': true,
      'Phoenician': true,
      'Psalter_Pahlavi': true,
      'Rejang': true,
      'Runic': true,
      'Samaritan': true,
      'Saurashtra': true,
      'Sharada': true,
      'Shavian': true,
      'Siddham': true,
      'Sinhala': true,
      'Sora_Sompeng': true,
      'Sundanese': true,
      'Syloti_Nagri': true,
      'Syriac': true,
      'Tagalog': true,
      'Tagbanwa': true,
      'Tai_Le': true,
      'Tai_Tham': true,
      'Tai_Viet': true,
      'Takri': true,
      'Tamil': true,
      'Telugu': true,
      'Thaana': true,
      'Thai': true,
      'Tibetan': true,
      'Tifinagh': true,
      'Tirhuta': true,
      'Ugaritic': true,
      'Vai': true,
      'Warang_Citi': true,
      'Yi': true,
    };
    var backslash_p_regex_string = '[pP]\\{\\^?([\\w&]+)\\}';
    var backslash_p_regex = new RegExp(backslash_p_regex_string);

    // Names accepted between "[:" and ":]" inside a character class; only key membership is tested.
    var posix_named_sets = {
      'alnum': 'alphanumeric',
      'alpha': 'alphabetic',
      'ascii': '0-127',
      'blank': 'space or tab',
      'cntrl': 'control character',
      'digit': 'decimal digit',
      'graph': 'printing, excluding space',
      'lower': 'lower case letter',
      'print': 'printing, including space',
      'punct': 'printing, excluding alphanumeric',
      'space': 'white space',
      'upper': 'upper case letter',
      'word': 'same as \\w',
      'xdigit': 'hexadecimal digit',
    };
    // Include '<' and '>' to spot errors such as [a[:<:]b]
    var posix_named_sets_regex_string = '\\[:\\^?([\\w<>]+):]';
    var posix_named_sets_regex = new RegExp(posix_named_sets_regex_string);

    var callout_regex_string = '\\(\\?C(\\d{0,3})\\)';
    var callout_regex = new RegExp(callout_regex_string);

    var assertion_regex_string = '\\(\\?<?[=!]';
    var assertion_regex = new RegExp(assertion_regex_string);

    var condition_callout_regex_string = callout_regex_string + assertion_regex_string;
    var condition_callout_regex = new RegExp(condition_callout_regex_string);

    // (?i) caseless
    // (?J) allow duplicate names
    // (?m) multiline
    // (?s) single line (dotall)
    // (?U) default ungreedy (lazy)
    // (?x) extended (ignore white space)
    // (?-...) unset option(s)
    // + combinations e.g. (?im-sx) or (?iJm-s-U-x)
    var options_regex_string = '(?:-?[iJmsUx]+)+';

    // Standalone option sequence, e.g. (?x-i)
    var option_sequence_regex_string = '\\(\\?' + options_regex_string + '\\)';
    var option_sequence_regex = new RegExp(option_sequence_regex_string);

    // Start of non-capturing group with options, e.g. (?i-U:
    var group_options_regex_string = '\\(\\?' + options_regex_string + ':';
    var group_options_regex = new RegExp(group_options_regex_string);

    // Helper functions:

    /** Return the closing delimiter matching ch, or ch itself when it has no distinct counterpart. */
    function delimiter(ch) {
      return (ch in delimiters) ? delimiters[ch] : ch;
    }

    /** Return the name of the innermost context, or false when the context stack is empty. */
    function current(state) {
      if (!state.context.length) return false;
      return state.context[state.context.length - 1];
    }

    /** Advance the stream over a run of word characters or, failing that, over a single character. */
    function consume(stream) {
      // As a nested mode, we should not consume too much so as to let the nesting mode in charge.
      // That said, eating \w is usually safe:
      if (!stream.match(/\w+/)) stream.next();
    }

    /**
     * Build a style string: every context on the stack, space-separated, followed by token (if truthy).
     */
    function all_tokens(state, token) {
      var result = state.context.join(' ');
      if (token) {
        // Avoid leading spaces as they confuse matchbrackets (see issue #4):
        if (result) result += ' ';
        result += token;
      }
      return result;
    }

    /**
     * Enter a new context.
     * The returned style is computed BEFORE the push, so it does not include new_context.
     * new_context_state defaults to a fresh empty object.
     */
    function push(state, new_context, new_context_state, token) {
      var ret = all_tokens(state, token);
      state.context.push(new_context);
      state.context_state.push(new_context_state || {});
      return ret;
    }

    /**
     * Leave the innermost context.
     * The returned style still ends with the popped context name (plus token, if truthy), so the
     * characters that close a construct are styled like the construct itself.
     */
    function pop(state, token) {
      var current_context = state.context.pop();
      state.context_state.pop();
      if (token) current_context += ' ' + token;
      return all_tokens(state, current_context);
    }

    /** Return the scratch object attached to the innermost context (undefined if the stack is empty). */
    function current_context_state(state) {
      return state.context_state[state.context_state.length - 1];
    }

    /** Reset the accumulated name and enter the 'name' context; returns the style computed by push(). */
    function expect_name(state) {
      state.name_value = '';
      return push(state, 'name');
    }

    /**
     * Record, in the innermost context's scratch object, the characters that must close it.
     * end_string is split into an array of single characters stored as `expected`.
     */
    function expect_end(state, end_string) {
      var context_state = current_context_state(state);
      var end_string_array = [];
      for (var i = 0; i < end_string.length; ++ i) end_string_array.push(end_string[i]);
      context_state.expected = end_string_array;
      return context_state;
    }

    /**
     * If the innermost context is waiting for closing characters, consume one character and compare it
     * with the next expected one.
     * Returns false when nothing is expected (the stream is left untouched); otherwise returns a style:
     * the context is popped once the last expected character has been read, and an error style is
     * returned on mismatch (the expected character has been removed from the queue in that case too).
     */
    function read_expected_end(stream, state) {
      var expected, expected_ch, ch;
      expected = current_context_state(state).expected;
      if (expected && expected.length) {
        expected_ch = expected.shift();
        ch = stream.next();
        if (ch === expected_ch) {
          if (!expected.length) {
            return pop(state);
          }
          return all_tokens(state);
        }
        return all_tokens(state, 'err erroneous-end-of-token');
      }
      else {
        return false;
      }
    }

    /**
     * Tokenize an escape sequence; the stream is positioned on the backslash.
     * Returns a token-specific style (without context names) or, for constructs that need a dedicated
     * context (\g{name}, \k<name>, \g<name>, ...), the style returned by push(). In the latter case only
     * the backslash is consumed here and the pushed context handles the remainder.
     */
    function handle_backslash(stream, state) {
      stream.eat('\\');
      if (!stream.peek()) return 'err backslash-at-end-of-pattern';

      // The backslash character has several uses. Firstly, if it is followed by a character that is not a number
      // or a letter, it takes away any special meaning that character may have.
      if (stream.match(/[^0-9a-zA-Z]/)) return 'escaped-character';

      // \Q is used to start an escaped sequence:
      if (stream.match('Q') && current(state) !== 'escaped-sequence') {
        push(state, 'escaped-sequence');
        return 'escaped-sequence-start';
      }

      // \cx "control-x", where x is any ASCII character
      if (stream.match(/c[ -~]/)) return 'non-printing-character';
      // \0dd character with octal code 0dd
      if (stream.match(/0[0-7]{0,2}/)) return 'non-printing-character';
      // \ddd character with octal code ddd, or back reference
      if (stream.match(/[1-7][0-7]{1,2}/)) return 'non-printing-character';
      // \o{ddd..} character with octal code ddd..
      if (stream.match(/o\{[0-7]+\}/)) return 'non-printing-character';
      // \x{hhh..} character with hex code hhh.. (non-JavaScript mode)
      if (stream.match(/x\{[0-9a-fA-F]+}/)) return 'non-printing-character';
      // \xhh character with hex code hh
      if (stream.match(/x[0-9a-fA-F]{0,2}/)) return 'non-printing-character';
      // \uhhhh character with hex code hhhh (JavaScript mode only)
      if (stream.match(/u[0-9a-fA-F]{4}/)) return 'non-printing-character';

      // \p{...} and \P{...}:
      var rem = stream.match(backslash_p_regex);
      if (rem) {
        if (rem[1] in backslask_p_properties) return 'generic-character-type';
        else return 'err unknown-property-name-after-p';
      }

      var in_character_class = (current(state) === 'character-class');

      // Nothing in this condition can be found in a character class:
      if (!in_character_class) {
        // The sequence \g followed by an unsigned or a negative number, optionally enclosed in braces, is an
        // absolute or relative back reference. A named back reference can be coded as \g{name}.
        if (stream.match(/g-?[0-9]+/)) return 'backreference';
        if (stream.match(/g\{-?[0-9]+\}/)) return 'backreference';
        if (stream.match(/g\{/, false)) return push(state, 'backreference');

        // \k<name> reference by name (Perl)
        // \k'name' reference by name (Perl)
        // \k{name} reference by name (.NET)
        if (stream.match(/k[<'{]/, false)) return push(state, 'backreference');

        if (stream.match(/[0-9]+/)) return 'backreference';

        // For compatibility with Oniguruma, the non-Perl syntax \g followed by a name or a number enclosed either
        // in angle brackets or single quotes, is an alternative syntax for referencing a subpattern as a
        // "subroutine".
        if (stream.match(/g<[-+]?[0-9]+>/)) return 'subroutine';
        if (stream.match(/g'[-+]?[0-9]+'/)) return 'subroutine';
        if (stream.match(/g[<']/, false)) return push(state, 'subroutine');
      }

      // At this stage, we have looked for:
      // - a backslash followed by nothing
      // - a backslash followed by a single non-alphanumeric character
      // - a backslash followed by 1 or more characters to achieve a special, context-dependent meaning
      // Look for a backslash followed by a single alphanumeric character:
      var backslash_p = in_character_class ? backslash_in_character_class : backslash_in_normal_context;
      return backslash_p[stream.next()];
    }

    /**
     * Tokenize (part of) a group/reference name and accumulate it in state.name_value.
     * The 'name' context is popped as soon as the next character is not a word character (or the line
     * ends); errors are reported for a bad first character and for names longer than 32 characters.
     */
    function handle_name(stream, state) {
      var ret, rem, consume_limit;
      var ch = stream.next();
      // Names must start with a non-digit.
      if (!state.name_value.length && (!ch.match(/\w/) || ch.match(/\d/))) {
        ret = 'err erroneous-start-of-name';
        consume_limit = 0;
      }
      // Names consist of up to 32 alphanumeric characters and underscores.
      else if (state.name_value.length > 31) {
        ret = 'err name-too-long';
        consume_limit = -1; // negative: swallow every remaining word character as part of the error
      }
      else consume_limit = 32 - state.name_value.length - 1;

      state.name_value += ch;
      if (consume_limit < 0) {
        if ((rem = stream.match(/^\w+/))) state.name_value += rem[0];
      }
      else while (consume_limit --) {
        if ((rem = stream.match(/^\w/))) state.name_value += rem[0];
        else break;
      }

      var next_char = stream.peek();
      if (!next_char || !next_char.match(/\w/)) return pop(state, ret);
      return all_tokens(state, ret);
    }

    /**
     * Try to read a callout at the current position.
     * Returns 'callout', an error style when the number is 256 or more, or false when there is no callout.
     */
    function handle_callout(stream, state) {
      // (?C) callout
      // (?Cn) callout with data n
      var rem = stream.match(callout_regex);
      if (rem) {
        return Number(rem[1]) < 256 ? 'callout' : 'err erroneous-callout-number';
      }
      return false;
    }

    /** Tokenize the R, Rn or R&name part of a recursion condition (context 'condition-subroutine'). */
    function handle_condition_subroutines(stream, state) {
      if (stream.peek() === ')') {
        pop(state);
        return tokenBase(stream, state);
      }
      stream.eat('R');
      if (stream.eat('&')) return expect_name(state);
      stream.match(/\d+/);
      return pop(state);
    }

    /**
     * Tokenize what lies between "(?(" and the ")" closing the condition (context 'condition').
     * condition_state.ok is set once the condition has been recognised; on the next call the context is
     * popped and tokenization resumes normally.
     */
    function handle_conditions(stream, state) {
      var condition_state = current_context_state(state);
      var expected_end = read_expected_end(stream, state);
      if (expected_end) return expected_end;

      if (condition_state.ok) {
        pop(state);
        return tokenBase(stream, state);
      }

      // (?(DEFINE)... define subpattern for reference
      if (stream.match(/DEFINE(?=\))/)) {
        return pop(state, 'define');
      }

      // (?(R)... overall recursion condition
      // (?(Rn)... specific group recursion condition
      // (?(R&name)...) specific recursion condition
      if (stream.match(/R(\d+|&\w+|)\)/, false)) {
        condition_state.ok = true;
        push(state, 'condition-subroutine');
        return tokenBase(stream, state);
      }

      // (?(n)... absolute reference condition
      // (?(+n)... relative reference condition
      // (?(-n)... relative reference condition
      if (stream.match(/(-|\+|)\d+/)) {
        condition_state.ok = true;
        return all_tokens(state, 'backreference');
      }

      var rem = stream.match(/([<'])/);
      if (rem) {
        condition_state.ok = false;
        expect_end(state, delimiter(rem[1]));
        return expect_name(state);
      }

      if (stream.match(/\w+/, false)) {
        condition_state.ok = true;
        // the "name" state will handle everything for us
        return expect_name(state);
      }

      // If the condition is not in any of the above formats, it must be an assertion. This may be a positive or
      // negative lookahead or lookbehind assertion.
      if (stream.match(/\?<?[=!]/)) {
        condition_state.ok = true;
        // the "group" state will handle everything for us
        // Ensure "group" leaves the closing parenthesis untouched so "start-group" can consume it:
        var group_options = {'leave_closing_parenthesis': true};
        return push(state, 'group' + (++ state.group_level), group_options, 'start-group');
      }

      stream.next();
      return all_tokens(state, 'err erroneous-condition');
    }

    /**
     * Tokenize the opening of a "(?..." group that needs more than one step (context 'start-group'):
     * named groups, option shorthands such as (?i-x: and conditions (optionally preceded by a callout).
     * The scratch fields option_shorthand and condition_callout act as small step counters (1, then 2).
     */
    function handle_start_group(stream, state) {
      var start_group_state = current_context_state(state);
      var expected_end = read_expected_end(stream, state);
      if (expected_end) return expected_end;

      var rem;
      if (start_group_state.option_shorthand === 1) {
        // A shorthand option was spotted, handle it:
        start_group_state.option_shorthand = 2;
        stream.match(/[^:]+/);
        return all_tokens(state, 'option-sequence');
      }
      if (start_group_state.option_shorthand === 2) {
        // A shorthand option was handled, finish the job:
        stream.eat(':');
        return pop(state);
      }

      if (start_group_state.condition_callout === 1) {
        // A pre-condition callout was spotted, handle it:
        start_group_state.condition_callout = 2;
        return all_tokens(state, handle_callout(stream, state));
      }
      if (start_group_state.condition_callout === 2) {
        // A pre-condition callout was handled, resume
        stream.eat('(');
        expect_end(state, ')');
        return push(state, 'condition');
      }

      // (?<name>...) named capturing group (Perl)
      // (?'name'...) named capturing group (Perl)
      // (?P<name>...) named capturing group (Python)
      rem = stream.match(/\(\?P?([<'])/);
      if (rem) {
        expect_end(state, delimiter(rem[1]));
        return expect_name(state);
      }

      // Same as (?: but with options, e.g. (?x-i:
      if (stream.match(group_options_regex, false)) {
        // As a convenient shorthand, if any option settings are required at the start of a non-capturing
        // subpattern, the option letters may appear between the "?" and the ":".
        stream.match('(?');
        start_group_state.option_shorthand = 1;
        return all_tokens(state);
      }

      // "(?(" typically marks the start of a condition: (?(condition)yes-pattern|no-pattern)
      if (stream.match('(?') && stream.peek() === '(') {
        // An explicit callout may be set just before an assertion condition: (?(?C7)(?<!abc)def|ghi)
        start_group_state.condition_callout = (stream.match(condition_callout_regex, false)) ? 1 : 2;
        return all_tokens(state);
      }

      stream.next();
      return all_tokens(state, 'err erroneous-start-of-start-group');
    }

    /** Tokenize a named back reference: \k<name>, \k'name', \k{name}, \g{name} or (?P=name). */
    function handle_backreference(stream, state) {
      var expected_end = read_expected_end(stream, state);
      if (expected_end) return expected_end;

      var rem = stream.match(/k([<'{])/) || stream.match(/g(\{)/) || stream.match(/(\()\?P=/);
      if (rem) {
        expect_end(state, delimiter(rem[1]));
        return expect_name(state);
      }

      stream.next();
      return all_tokens(state, 'err erroneous-backreference');
    }

    /** Tokenize a named subroutine call: \g<name>, \g'name', (?&name) or (?P>name). */
    function handle_subroutine(stream, state) {
      var expected_end = read_expected_end(stream, state);
      if (expected_end) return expected_end;

      var rem = stream.match(/g([<'])/) || stream.match(/(\()\?(P>|&)/);
      if (rem) {
        expect_end(state, delimiter(rem[1]));
        return expect_name(state);
      }

      stream.next();
      return all_tokens(state, 'err erroneous-subroutine');
    }

    /** Tokenize the argument of a backtracking verb such as (*MARK:name): a name followed by ')'. */
    function handle_verb(stream, state) {
      var expected_end = read_expected_end(stream, state);
      if (expected_end) return expected_end;

      expect_end(state, ')');
      return expect_name(state);
    }

    /**
     * Update state.extended from an option sequence such as "(?x-i)".
     * Letters found after a '-' are being unset; characters other than '-' and 'x' are ignored.
     */
    function update_options(state, options) {
      // We are only interested in x (extended mode).
      var enable = true, new_state = null, i = 0, c = null;
      for (; i < options.length; ++i) {
        c = options[i];
        if (c === '-') enable = false;
        else if (c === 'x') new_state = enable;
      }
      if (new_state !== null) state.extended = new_state;
    }

    /**
     * Main tokenizer: dispatches to the handler of the innermost context, or tokenizes the next
     * construct of the pattern. Returns the style string for the consumed characters.
     */
    function tokenBase(stream, state) {
      var rem, ret; // stand for Regular Expression Match and RETurn, respectively.

      // Get current state, current char, next char:
      var ch = stream.peek();
      if (!ch) return;

      var current_state = current(state);
      var group_state;

      if (current_state === 'name') return handle_name(stream, state);
      if (current_state === 'condition') return handle_conditions(stream, state);
      if (current_state === 'condition-subroutine') return handle_condition_subroutines(stream, state);
      if (current_state === 'start-group') return handle_start_group(stream, state);
      if (current_state === 'backreference') return handle_backreference(stream, state);
      if (current_state === 'subroutine') return handle_subroutine(stream, state);
      if (current_state === 'verb') return handle_verb(stream, state);

      if (current_state === 'escaped-sequence') {
        if (stream.match('\\E')) return pop(state, 'escaped-sequence-end');
        consume(stream);
        return all_tokens(state);
      }

      // Escaped characters:
      if (stream.match(/\\./, false)) return all_tokens(state, handle_backslash(stream, state));

      if (stream.match('[', false)) {
        if (current_state !== 'character-class') {
          if (stream.match(posix_named_sets_regex)) {
            return all_tokens(state, 'err posix-outside-class-unsupported');
          }
          // In the POSIX.2 compliant library that was included in 4.4BSD Unix, the ugly syntax [[:<:]] and
          // [[:>:]] is used for matching "start of word" and "end of word".
          if (stream.match('[[:<:]]') || stream.match('[[:>:]]')) return all_tokens(state, 'anchor');

          // At this stage, we do have a new character class:
          push(state, 'character-class');
          stream.eat('[');
          stream.eat('^');
          // If a closing square bracket is required as a member of the class, it should be the first data
          // character in the class (after an initial circumflex, if present) or escaped with a backslash.
          // Note: ']' should be on the same line as '[', even in extended mode.
          stream.eat(']');
          return all_tokens(state);
        }
      }

      if (current_state === 'character-class') {
        rem = stream.match(posix_named_sets_regex);
        if (rem) {
          if (rem[1] in posix_named_sets) return all_tokens(state, 'generic-character-type');
          else return all_tokens(state, 'err unknown-posix-class-name');
        }
        if (stream.eat(']')) return pop(state);
        consume(stream);
        return all_tokens(state);
      }

      // Regular comments in extended mode:
      if (state.extended && stream.eat('#')) {
        stream.skipToEnd();
        return 'comment';
      }

      // Quantifiers in braces. The opening brace is part of the pattern so that a literal '{' (one that
      // does not start a quantifier) is not consumed here and cannot be glued to the following token.
      // exactly n:
      if (stream.match(/\{\d+\}/)) return all_tokens(state, 'quantifier');
      // "at least n, no more than m" and "n or more", greedy, possessive or lazy:
      if (stream.match(/\{\d+,\d*\}[+?]?/)) return all_tokens(state, 'quantifier');

      if (stream.eat('|')) {
        return all_tokens(state, 'alternation');
      }

      if (stream.peek() === '(') {
        if (stream.match(/\(\*(?:UTF(?:8|16|32|)|UCP|NO_AUTO_POSSESS|NO_START_OPT)\)/))
          return all_tokens(state, 'option-sequence');
        if (stream.match(/\(\*LIMIT_(?:RECURSION|MATCH)=[0-9]+\)/))
          return all_tokens(state, 'option-sequence');
        // Newline convention + what \R matches:
        if (stream.match(/\(\*(?:CR|LF|CRLF|ANYCRLF|ANY|BSR_(?:ANYCRLF|UNICODE))\)/))
          return all_tokens(state, 'option-sequence');
        // Backtracking control:
        if (stream.match(/\(\*(?:ACCEPT|FAIL|F|COMMIT|PRUNE|SKIP|THEN)\)/))
          return all_tokens(state, 'verb');
        if (stream.match(/\(\*(?:MARK|PRUNE|SKIP|THEN|):/))
          return push(state, 'verb', {}, 'verb');

        rem = stream.match(option_sequence_regex);
        if (rem) {
          update_options(state, rem[0]);
          return all_tokens(state, 'option-sequence');
        }

        // (?#....) comment (not nestable)
        if (stream.match(/\(\?#[^)]*\)/)) return all_tokens(state, 'comment');

        // (?P=name) reference by name (Python)
        if (stream.match(/\(\?P=/, false)) return push(state, 'backreference');

        // (?&name) call subpattern by name (Perl)
        // (?P>name) call subpattern by name (Python)
        if (stream.match(/\(\?(P>|&)/, false)) return push(state, 'subroutine');

        // (?n) call subpattern by absolute number
        // (?+n) call subpattern by relative number
        // (?-n) call subpattern by relative number
        if (stream.match(/\(\?(\-|\+|)\d+\)/)) return all_tokens(state, 'subroutine');

        // (?R) recurse whole pattern
        if (stream.match('(?R)')) return all_tokens(state, 'subroutine');

        // Callouts:
        var callout = handle_callout(stream, state);
        if (callout) return all_tokens(state, callout);

        // At this stage, we have a new group:
        ++ state.group_level;
        group_state = 'group' + state.group_level;
        push(state, group_state);

        // (?=...) positive look ahead
        // (?!...) negative look ahead
        // (?<=...) positive look behind
        // (?<!...) negative look behind
        if (stream.match(assertion_regex)) return all_tokens(state, 'start-group');

        // (?:...) non-capturing group
        // (?|...) non-capturing group; reset group numbers for capturing groups in each alternative
        // (?>...) atomic, non-capturing group
        if (stream.match(/\(\?[:|>]/)) return all_tokens(state, 'start-group');

        if (stream.match('(?', false)) {
          push(state, 'start-group');
          return tokenBase(stream, state);
        }

        stream.eat('(');
        return all_tokens(state, 'start-group');
      }

      if (stream.peek() === ')') {
        if (current_state && current_state.match(/^group/)) {
          ret = 'start-group'; // formerly 'end-group' but that used to confuse matchbrackets (see issue #4)
          if (current_context_state(state).leave_closing_parenthesis) ret = '';
          else stream.next();
          -- state.group_level;
          return pop(state, ret);
        }
        stream.next();
        return all_tokens(state, 'err unmatched-closing-parenthesis');
      }

      // Anchors
      if (stream.eat('^') || stream.eat('$')) return all_tokens(state, 'anchor');

      if (stream.eat('.')) return all_tokens(state, 'generic-character-type');

      // Quantifiers: 0 or 1, 0 or more, 1 or more, greedy:
      if (stream.eat('?') || stream.eat('*') || stream.eat('+')) {
        // Handle possessive and lazy variants:
        stream.eat(/[+?]/);
        return all_tokens(state, 'quantifier');
      }

      consume(stream);
      return all_tokens(state);
    }

    /** Create the initial mode state: empty context stacks and the configured extended flag. */
    function startState() {
      return {
        context: [],
        context_state: [],
        group_level: 0,
        name_value: '',
        extended: options.extended,
      };
    }

    /**
     * Copy a mode state. Each per-context scratch object is copied key by key, and its `expected`
     * array is duplicated because read_expected_end() mutates it in place.
     */
    function copyState(o) { // o = original
      var i, oo, oc, key, c = startState(); // c = copy, oo = original object, oc = object copy
      for (i = 0; i < o.context_state.length; ++i) {
        oo = o.context_state[i];
        oc = {};
        for (key in oo) oc[key] = (key === 'expected') ? oo[key].slice() : oo[key];
        c.context_state.push(oc);
      }
      c.context = o.context.slice();
      c.group_level = o.group_level;
      c.name_value = o.name_value;
      c.extended = o.extended;
      return c;
    }

    return {
      startState: startState,
      copyState: copyState,
      token: tokenBase,
    };
  });

  CodeMirror.defineMIME('text/x-regex', 'pcre');
  CodeMirror.defineMIME('text/x-pcre-regex', 'pcre');
});