UNPKG

highlight.js

Version:

Syntax highlighting with language autodetection.

highlightjs.org

highlightjs/highlight.js

589 lines (560 loc) • 12.6 kB

JavaScript

/**
 * Performs a shallow merge of multiple objects into one.
 *
 * The result is a fresh object with a `null` prototype; keys from later
 * objects override keys from earlier ones. Nested values are shared with the
 * inputs, not copied.
 *
 * @template T
 * @param {T} original
 * @param {Record<string,any>[]} objects
 * @returns {T} a single new object
 */
function inherit(original, ...objects) {
  /** @type Record<string,any> */
  const result = Object.create(null);

  for (const key in original) {
    result[key] = original[key];
  }
  objects.forEach(function(obj) {
    for (const key in obj) {
      result[key] = obj[key];
    }
  });
  return /** @type {T} */ (result);
}

/**
 * Returns the pattern text of a regular expression (or the string itself).
 *
 * @param {RegExp | string} re
 * @returns {string | null} `null` for any falsy input; `Array#join` later
 *   renders that as an empty string, so callers may pass optional pieces.
 */
function source(re) {
  if (!re) return null;
  if (typeof re === "string") return re;

  return re.source;
}

/**
 * Wraps a pattern in a positive lookahead group.
 *
 * @param {RegExp | string} re
 * @returns {string}
 */
function lookahead(re) {
  return concat('(?=', re, ')');
}

/**
 * Concatenates the sources of all passed expressions, in order.
 *
 * @param {...(RegExp | string)} args
 * @returns {string}
 */
function concat(...args) {
  const joined = args.map((x) => source(x)).join("");
  return joined;
}

/**
 * Removes a trailing plain-object "options" argument from `args` (in place)
 * and returns it; returns an empty object when there is none.
 *
 * @param {Array<RegExp | string | Record<string,any>>} args
 * @returns {Record<string,any>}
 */
function stripOptionsFromArgs(args) {
  const opts = args[args.length - 1];

  // `typeof null === 'object'`, so null must be excluded explicitly before
  // touching `.constructor`, otherwise a trailing null argument would throw.
  if (opts !== null && typeof opts === 'object' && opts.constructor === Object) {
    args.splice(args.length - 1, 1);
    return opts;
  } else {
    return {};
  }
}

/**
 * Any of the passed expressions may match.
 *
 * Creates a huge this | this | that | that match. The group is non-capturing
 * unless a trailing `{ capture: true }` options object is passed.
 *
 * @param {(RegExp | string)[]} args
 * @returns {string}
 */
function either(...args) {
  const opts = stripOptionsFromArgs(args);
  const joined = '('
    + (opts.capture ? "" : "?:")
    + args.map((x) => source(x)).join("|") + ")";
  return joined;
}

// Identifier pattern that also allows a leading underscore.
const UNDERSCORE_IDENT_RE = '[a-zA-Z_]\\w*';

/**
 * Creates a comment mode.
 *
 * The returned mode always gets two extra children appended to `contains`:
 * a `doctag` matcher (TODO:, FIXME:, ...) and a "looks like English prose"
 * matcher.
 *
 * @param {string | RegExp} begin
 * @param {string | RegExp} end
 * @param {Mode | {}} [modeOptions]
 * @returns {Partial<Mode>}
 */
const COMMENT = function(begin, end, modeOptions = {}) {
  const mode = inherit(
    {
      scope: 'comment',
      begin,
      end,
      contains: []
    },
    modeOptions
  );
  mode.contains.push({
    scope: 'doctag',
    // hack to avoid the space from being included. the space is necessary to
    // match here to prevent the plain text rule below from gobbling up doctags
    begin: '[ ]*(?=(TODO|FIXME|NOTE|BUG|OPTIMIZE|HACK|XXX):)',
    end: /(TODO|FIXME|NOTE|BUG|OPTIMIZE|HACK|XXX):/,
    excludeBegin: true,
    relevance: 0
  });
  const ENGLISH_WORD = either(
    // list of common 1 and 2 letter words in English
    "I",
    "a",
    "is",
    "so",
    "us",
    "to",
    "at",
    "if",
    "in",
    "it",
    "on",
    // note: this is not an exhaustive list of contractions, just popular ones
    /[A-Za-z]+['](d|ve|re|ll|t|s|n)/, // contractions - can't we'd they're let's, etc
    /[A-Za-z]+[-][a-z]+/, // `no-way`, etc.
    /[A-Za-z][a-z]{2,}/ // allow capitalized words at beginning of sentences
  );
  // looking like plain text, more likely to be a comment
  mode.contains.push(
    {
      // TODO: how to include ", (, ) without breaking grammars that use these for
      // comment delimiters?
      // begin: /[ ]+([()"]?([A-Za-z'-]{3,}|is|a|I|so|us|[tT][oO]|at|if|in|it|on)[.]?[()":]?([.][ ]|[ ]|\))){3}/
      // ---

      // this tries to find sequences of 3 english words in a row (without any
      // "programming" type syntax) this gives us a strong signal that we've
      // TRULY found a comment - vs perhaps scanning with the wrong language.
      // It's possible to find something that LOOKS like the start of the
      // comment - but then if there is no readable text - good chance it is a
      // false match and not a comment.
      //
      // for a visual example please see:
      // https://github.com/highlightjs/highlight.js/issues/2827

      begin: concat(
        /[ ]+/, // necessary to prevent us gobbling up doctags like /* @author Bob Mcgill */
        '(',
        ENGLISH_WORD,
        /[.]?[:]?([.][ ]|[ ])/,
        '){3}') // look for 3 words in a row
    }
  );
  return mode;
};

/*
Language: Python
Description: Python is an interpreted, object-oriented, high-level programming language with dynamic semantics.
Website: https://www.python.org
Category: common
*/

/**
 * Builds the Python language definition.
 *
 * @param {object} hljs - highlighter instance; this grammar reads
 *   `BACKSLASH_ESCAPE`, `APOS_STRING_MODE`, `QUOTE_STRING_MODE` and
 *   `HASH_COMMENT_MODE` from it.
 * @returns {object} language definition with `name`, `aliases`, `keywords`,
 *   `illegal` and `contains`.
 */
function python(hljs) {
  const RESERVED_WORDS = [
    'and',
    'as',
    'assert',
    'async',
    'await',
    'break',
    'class',
    'continue',
    'def',
    'del',
    'elif',
    'else',
    'except',
    'finally',
    'for',
    'from',
    'global',
    'if',
    'import',
    'in',
    'is',
    'lambda',
    'nonlocal|10',
    'not',
    'or',
    'pass',
    'raise',
    'return',
    'try',
    'while',
    'with',
    'yield'
  ];

  const BUILT_INS = [
    '__import__',
    'abs',
    'all',
    'any',
    'ascii',
    'bin',
    'bool',
    'breakpoint',
    'bytearray',
    'bytes',
    'callable',
    'chr',
    'classmethod',
    'compile',
    'complex',
    'delattr',
    'dict',
    'dir',
    'divmod',
    'enumerate',
    'eval',
    'exec',
    'filter',
    'float',
    'format',
    'frozenset',
    'getattr',
    'globals',
    'hasattr',
    'hash',
    'help',
    'hex',
    'id',
    'input',
    'int',
    'isinstance',
    'issubclass',
    'iter',
    'len',
    'list',
    'locals',
    'map',
    'max',
    'memoryview',
    'min',
    'next',
    'object',
    'oct',
    'open',
    'ord',
    'pow',
    'print',
    'property',
    'range',
    'repr',
    'reversed',
    'round',
    'set',
    'setattr',
    'slice',
    'sorted',
    'staticmethod',
    'str',
    'sum',
    'super',
    'tuple',
    'type',
    'vars',
    'zip'
  ];

  const LITERALS = [
    '__debug__',
    'Ellipsis',
    'False',
    'None',
    'NotImplemented',
    'True'
  ];

  // https://docs.python.org/3/library/typing.html
  // TODO: Could these be supplemented by a CamelCase matcher in certain
  // contexts, leaving these remaining only for relevance hinting?
  const TYPES = [
    "Any",
    "Callable",
    "Coroutine",
    "Dict",
    "List",
    "Literal",
    "Generic",
    "Optional",
    "Sequence",
    "Set",
    "Tuple",
    "Type",
    "Union"
  ];

  const KEYWORDS = {
    $pattern: /[A-Za-z]\w+|__\w+__/,
    keyword: RESERVED_WORDS,
    built_in: BUILT_INS,
    literal: LITERALS,
    type: TYPES
  };

  // Interactive interpreter prompts (`>>> ` and `... `) at line start.
  const PROMPT = {
    className: 'meta',
    begin: /^(>>>|\.\.\.) /
  };

  // `{...}` replacement field inside an f-string; its `contains` is assigned
  // further down, once STRING and NUMBER exist.
  const SUBST = {
    className: 'subst',
    begin: /\{/,
    end: /\}/,
    keywords: KEYWORDS,
    illegal: /#/
  };

  // `{{` inside an f-string is an escaped brace, not the start of a SUBST.
  const LITERAL_BRACKET = {
    begin: /\{\{/,
    relevance: 0
  };

  const STRING = {
    className: 'string',
    contains: [ hljs.BACKSLASH_ESCAPE ],
    variants: [
      {
        begin: /([uU]|[bB]|[rR]|[bB][rR]|[rR][bB])?'''/,
        end: /'''/,
        contains: [
          hljs.BACKSLASH_ESCAPE,
          PROMPT
        ],
        relevance: 10
      },
      {
        begin: /([uU]|[bB]|[rR]|[bB][rR]|[rR][bB])?"""/,
        end: /"""/,
        contains: [
          hljs.BACKSLASH_ESCAPE,
          PROMPT
        ],
        relevance: 10
      },
      {
        begin: /([fF][rR]|[rR][fF]|[fF])'''/,
        end: /'''/,
        contains: [
          hljs.BACKSLASH_ESCAPE,
          PROMPT,
          LITERAL_BRACKET,
          SUBST
        ]
      },
      {
        begin: /([fF][rR]|[rR][fF]|[fF])"""/,
        end: /"""/,
        contains: [
          hljs.BACKSLASH_ESCAPE,
          PROMPT,
          LITERAL_BRACKET,
          SUBST
        ]
      },
      {
        begin: /([uU]|[rR])'/,
        end: /'/,
        relevance: 10
      },
      {
        begin: /([uU]|[rR])"/,
        end: /"/,
        relevance: 10
      },
      {
        begin: /([bB]|[bB][rR]|[rR][bB])'/,
        end: /'/
      },
      {
        begin: /([bB]|[bB][rR]|[rR][bB])"/,
        end: /"/
      },
      {
        begin: /([fF][rR]|[rR][fF]|[fF])'/,
        end: /'/,
        contains: [
          hljs.BACKSLASH_ESCAPE,
          LITERAL_BRACKET,
          SUBST
        ]
      },
      {
        begin: /([fF][rR]|[rR][fF]|[fF])"/,
        end: /"/,
        contains: [
          hljs.BACKSLASH_ESCAPE,
          LITERAL_BRACKET,
          SUBST
        ]
      },
      hljs.APOS_STRING_MODE,
      hljs.QUOTE_STRING_MODE
    ]
  };

  // https://docs.python.org/3.9/reference/lexical_analysis.html#numeric-literals
  const digitpart = '[0-9](_?[0-9])*';
  const pointfloat = `(\\b(${digitpart}))?\\.(${digitpart})|\\b(${digitpart})\\.`;
  const NUMBER = {
    className: 'number',
    relevance: 0,
    variants: [
      // exponentfloat, pointfloat
      // https://docs.python.org/3.9/reference/lexical_analysis.html#floating-point-literals
      // optionally imaginary
      // https://docs.python.org/3.9/reference/lexical_analysis.html#imaginary-literals
      // Note: no leading \b because floats can start with a decimal point
      // and we don't want to mishandle e.g. `fn(.5)`,
      // no trailing \b for pointfloat because it can end with a decimal point
      // and we don't want to mishandle e.g. `0..hex()`; this should be safe
      // because both MUST contain a decimal point and so cannot be confused with
      // the interior part of an identifier
      {
        begin: `(\\b(${digitpart})|(${pointfloat}))[eE][+-]?(${digitpart})[jJ]?\\b`
      },
      {
        begin: `(${pointfloat})[jJ]?`
      },

      // decinteger, bininteger, octinteger, hexinteger
      // https://docs.python.org/3.9/reference/lexical_analysis.html#integer-literals
      // optionally "long" in Python 2
      // https://docs.python.org/2.7/reference/lexical_analysis.html#integer-and-long-integer-literals
      // decinteger is optionally imaginary
      // https://docs.python.org/3.9/reference/lexical_analysis.html#imaginary-literals
      {
        begin: '\\b([1-9](_?[0-9])*|0+(_?0)*)[lLjJ]?\\b'
      },
      {
        begin: '\\b0[bB](_?[01])+[lL]?\\b'
      },
      {
        begin: '\\b0[oO](_?[0-7])+[lL]?\\b'
      },
      {
        begin: '\\b0[xX](_?[0-9a-fA-F])+[lL]?\\b'
      },

      // imagnumber (digitpart-based)
      // https://docs.python.org/3.9/reference/lexical_analysis.html#imaginary-literals
      {
        begin: `\\b(${digitpart})[jJ]\\b`
      }
    ]
  };

  // `# type: ...` comments: keywords are highlighted inside them, except in
  // the `# type:` prefix itself and in any nested `#` comment.
  const COMMENT_TYPE = {
    className: "comment",
    begin: lookahead(/# type:/),
    end: /$/,
    keywords: KEYWORDS,
    contains: [
      { // prevent keywords from coloring `type`
        begin: /# type:/
      },
      // comment within a datatype comment includes no keywords
      {
        begin: /#/,
        end: /\b\B/, // never matches on its own; the mode ends with its parent
        endsWithParent: true
      }
    ]
  };

  const PARAMS = {
    className: 'params',
    variants: [
      // Exclude params in functions without params
      {
        className: "",
        begin: /\(\s*\)/,
        skip: true
      },
      {
        begin: /\(/,
        end: /\)/,
        excludeBegin: true,
        excludeEnd: true,
        keywords: KEYWORDS,
        contains: [
          'self',
          PROMPT,
          NUMBER,
          STRING,
          hljs.HASH_COMMENT_MODE
        ]
      }
    ]
  };
  SUBST.contains = [
    STRING,
    NUMBER,
    PROMPT
  ];

  return {
    name: 'Python',
    aliases: [
      'py',
      'gyp',
      'ipython'
    ],
    keywords: KEYWORDS,
    // `->` is deliberately NOT listed: it is valid Python (return annotations,
    // e.g. `def f() -> int:`) and must not mark the code as illegal.
    illegal: /(<\/|\?)|=>/,
    contains: [
      PROMPT,
      NUMBER,
      {
        // very common convention
        begin: /\bself\b/
      },
      {
        // eat "if" prior to string so that it won't accidentally be
        // labeled as an f-string
        beginKeywords: "if",
        relevance: 0
      },
      STRING,
      COMMENT_TYPE,
      hljs.HASH_COMMENT_MODE,
      {
        match: [
          // leading \b so identifiers that merely end in "def" (`undef x`)
          // are not treated as function definitions
          /\bdef/,
          /\s+/,
          UNDERSCORE_IDENT_RE
        ],
        scope: {
          1: "keyword",
          3: "title.function"
        },
        contains: [ PARAMS ]
      },
      {
        variants: [
          {
            match: [
              // leading \b so identifiers that merely end in "class"
              // (`subclass x`) are not treated as class definitions
              /\bclass/,
              /\s+/,
              UNDERSCORE_IDENT_RE,
              /\s*/,
              /\(\s*/,
              UNDERSCORE_IDENT_RE,
              /\s*\)/
            ],
          },
          {
            match: [
              /\bclass/,
              /\s+/,
              UNDERSCORE_IDENT_RE
            ],
          }
        ],
        scope: {
          1: "keyword",
          3: "title.class",
          6: "title.class.inherited",
        }
      },
      {
        // decorators
        className: 'meta',
        begin: /^[\t ]*@/,
        end: /(?=#)|$/,
        contains: [
          NUMBER,
          PARAMS,
          STRING
        ]
      }
    ]
  };
}

export default python;