highlight.js
Version:
Syntax highlighting with language autodetection.
589 lines (560 loc) • 12.6 kB
JavaScript
/**
* @param {string} value
* @returns {string}
*/
/**
* performs a shallow merge of multiple objects into one
*
* @template T
* @param {T} original
* @param {Record<string,any>[]} objects
* @returns {T} a single new object
*/
function inherit(original, ...objects) {
/** @type Record<string,any> */
const result = Object.create(null);
for (const key in original) {
result[key] = original[key];
}
objects.forEach(function(obj) {
for (const key in obj) {
result[key] = obj[key];
}
});
return /** @type {T} */ (result);
}
/**
* @param {string} value
* @returns {RegExp}
* */
/**
* @param {RegExp | string } re
* @returns {string}
*/
function source(re) {
if (!re) return null;
if (typeof re === "string") return re;
return re.source;
}
/**
* @param {RegExp | string } re
* @returns {string}
*/
function lookahead(re) {
return concat('(?=', re, ')');
}
/**
* @param {...(RegExp | string) } args
* @returns {string}
*/
function concat(...args) {
const joined = args.map((x) => source(x)).join("");
return joined;
}
function stripOptionsFromArgs(args) {
const opts = args[args.length - 1];
if (typeof opts === 'object' && opts.constructor === Object) {
args.splice(args.length - 1, 1);
return opts;
} else {
return {};
}
}
/**
* Any of the passed expresssions may match
*
* Creates a huge this | this | that | that match
* @param {(RegExp | string)[] } args
* @returns {string}
*/
function either(...args) {
const opts = stripOptionsFromArgs(args);
const joined = '(' +
(opts.capture ? "" : "?:") +
args.map((x) => source(x)).join("|") + ")";
return joined;
}
const UNDERSCORE_IDENT_RE = '[a-zA-Z_]\\w*';
/**
* Creates a comment mode
*
* @param {string | RegExp} begin
* @param {string | RegExp} end
* @param {Mode | {}} [modeOptions]
* @returns {Partial<Mode>}
*/
const COMMENT = function(begin, end, modeOptions = {}) {
const mode = inherit(
{
scope: 'comment',
begin,
end,
contains: []
},
modeOptions
);
mode.contains.push({
scope: 'doctag',
// hack to avoid the space from being included. the space is necessary to
// match here to prevent the plain text rule below from gobbling up doctags
begin: '[ ]*(?=(TODO|FIXME|NOTE|BUG|OPTIMIZE|HACK|XXX):)',
end: /(TODO|FIXME|NOTE|BUG|OPTIMIZE|HACK|XXX):/,
excludeBegin: true,
relevance: 0
});
const ENGLISH_WORD = either(
// list of common 1 and 2 letter words in English
"I",
"a",
"is",
"so",
"us",
"to",
"at",
"if",
"in",
"it",
"on",
// note: this is not an exhaustive list of contractions, just popular ones
/[A-Za-z]+['](d|ve|re|ll|t|s|n)/, // contractions - can't we'd they're let's, etc
/[A-Za-z]+[-][a-z]+/, // `no-way`, etc.
/[A-Za-z][a-z]{2,}/ // allow capitalized words at beginning of sentences
);
// looking like plain text, more likely to be a comment
mode.contains.push(
{
// TODO: how to include ", (, ) without breaking grammars that use these for
// comment delimiters?
// begin: /[ ]+([()"]?([A-Za-z'-]{3,}|is|a|I|so|us|[tT][oO]|at|if|in|it|on)[.]?[()":]?([.][ ]|[ ]|\))){3}/
// ---
// this tries to find sequences of 3 english words in a row (without any
// "programming" type syntax) this gives us a strong signal that we've
// TRULY found a comment - vs perhaps scanning with the wrong language.
// It's possible to find something that LOOKS like the start of the
// comment - but then if there is no readable text - good chance it is a
// false match and not a comment.
//
// for a visual example please see:
// https://github.com/highlightjs/highlight.js/issues/2827
begin: concat(
/[ ]+/, // necessary to prevent us gobbling up doctags like /* @author Bob Mcgill */
'(',
ENGLISH_WORD,
/[.]?[:]?([.][ ]|[ ])/,
'){3}') // look for 3 words in a row
}
);
return mode;
};
COMMENT('//', '$');
COMMENT('/\\*', '\\*/');
COMMENT('#', '$');
/*
Language: Python
Description: Python is an interpreted, object-oriented, high-level programming language with dynamic semantics.
Website: https://www.python.org
Category: common
*/
function python(hljs) {
const RESERVED_WORDS = [
'and',
'as',
'assert',
'async',
'await',
'break',
'class',
'continue',
'def',
'del',
'elif',
'else',
'except',
'finally',
'for',
'from',
'global',
'if',
'import',
'in',
'is',
'lambda',
'nonlocal|10',
'not',
'or',
'pass',
'raise',
'return',
'try',
'while',
'with',
'yield'
];
const BUILT_INS = [
'__import__',
'abs',
'all',
'any',
'ascii',
'bin',
'bool',
'breakpoint',
'bytearray',
'bytes',
'callable',
'chr',
'classmethod',
'compile',
'complex',
'delattr',
'dict',
'dir',
'divmod',
'enumerate',
'eval',
'exec',
'filter',
'float',
'format',
'frozenset',
'getattr',
'globals',
'hasattr',
'hash',
'help',
'hex',
'id',
'input',
'int',
'isinstance',
'issubclass',
'iter',
'len',
'list',
'locals',
'map',
'max',
'memoryview',
'min',
'next',
'object',
'oct',
'open',
'ord',
'pow',
'print',
'property',
'range',
'repr',
'reversed',
'round',
'set',
'setattr',
'slice',
'sorted',
'staticmethod',
'str',
'sum',
'super',
'tuple',
'type',
'vars',
'zip'
];
const LITERALS = [
'__debug__',
'Ellipsis',
'False',
'None',
'NotImplemented',
'True'
];
// https://docs.python.org/3/library/typing.html
// TODO: Could these be supplemented by a CamelCase matcher in certain
// contexts, leaving these remaining only for relevance hinting?
const TYPES = [
"Any",
"Callable",
"Coroutine",
"Dict",
"List",
"Literal",
"Generic",
"Optional",
"Sequence",
"Set",
"Tuple",
"Type",
"Union"
];
const KEYWORDS = {
$pattern: /[A-Za-z]\w+|__\w+__/,
keyword: RESERVED_WORDS,
built_in: BUILT_INS,
literal: LITERALS,
type: TYPES
};
const PROMPT = {
className: 'meta',
begin: /^(>>>|\.\.\.) /
};
const SUBST = {
className: 'subst',
begin: /\{/,
end: /\}/,
keywords: KEYWORDS,
illegal: /#/
};
const LITERAL_BRACKET = {
begin: /\{\{/,
relevance: 0
};
const STRING = {
className: 'string',
contains: [ hljs.BACKSLASH_ESCAPE ],
variants: [
{
begin: /([uU]|[bB]|[rR]|[bB][rR]|[rR][bB])?'''/,
end: /'''/,
contains: [
hljs.BACKSLASH_ESCAPE,
PROMPT
],
relevance: 10
},
{
begin: /([uU]|[bB]|[rR]|[bB][rR]|[rR][bB])?"""/,
end: /"""/,
contains: [
hljs.BACKSLASH_ESCAPE,
PROMPT
],
relevance: 10
},
{
begin: /([fF][rR]|[rR][fF]|[fF])'''/,
end: /'''/,
contains: [
hljs.BACKSLASH_ESCAPE,
PROMPT,
LITERAL_BRACKET,
SUBST
]
},
{
begin: /([fF][rR]|[rR][fF]|[fF])"""/,
end: /"""/,
contains: [
hljs.BACKSLASH_ESCAPE,
PROMPT,
LITERAL_BRACKET,
SUBST
]
},
{
begin: /([uU]|[rR])'/,
end: /'/,
relevance: 10
},
{
begin: /([uU]|[rR])"/,
end: /"/,
relevance: 10
},
{
begin: /([bB]|[bB][rR]|[rR][bB])'/,
end: /'/
},
{
begin: /([bB]|[bB][rR]|[rR][bB])"/,
end: /"/
},
{
begin: /([fF][rR]|[rR][fF]|[fF])'/,
end: /'/,
contains: [
hljs.BACKSLASH_ESCAPE,
LITERAL_BRACKET,
SUBST
]
},
{
begin: /([fF][rR]|[rR][fF]|[fF])"/,
end: /"/,
contains: [
hljs.BACKSLASH_ESCAPE,
LITERAL_BRACKET,
SUBST
]
},
hljs.APOS_STRING_MODE,
hljs.QUOTE_STRING_MODE
]
};
// https://docs.python.org/3.9/reference/lexical_analysis.html#numeric-literals
const digitpart = '[0-9](_?[0-9])*';
const pointfloat = `(\\b(${digitpart}))?\\.(${digitpart})|\\b(${digitpart})\\.`;
const NUMBER = {
className: 'number',
relevance: 0,
variants: [
// exponentfloat, pointfloat
// https://docs.python.org/3.9/reference/lexical_analysis.html#floating-point-literals
// optionally imaginary
// https://docs.python.org/3.9/reference/lexical_analysis.html#imaginary-literals
// Note: no leading \b because floats can start with a decimal point
// and we don't want to mishandle e.g. `fn(.5)`,
// no trailing \b for pointfloat because it can end with a decimal point
// and we don't want to mishandle e.g. `0..hex()`; this should be safe
// because both MUST contain a decimal point and so cannot be confused with
// the interior part of an identifier
{
begin: `(\\b(${digitpart})|(${pointfloat}))[eE][+-]?(${digitpart})[jJ]?\\b`
},
{
begin: `(${pointfloat})[jJ]?`
},
// decinteger, bininteger, octinteger, hexinteger
// https://docs.python.org/3.9/reference/lexical_analysis.html#integer-literals
// optionally "long" in Python 2
// https://docs.python.org/2.7/reference/lexical_analysis.html#integer-and-long-integer-literals
// decinteger is optionally imaginary
// https://docs.python.org/3.9/reference/lexical_analysis.html#imaginary-literals
{
begin: '\\b([1-9](_?[0-9])*|0+(_?0)*)[lLjJ]?\\b'
},
{
begin: '\\b0[bB](_?[01])+[lL]?\\b'
},
{
begin: '\\b0[oO](_?[0-7])+[lL]?\\b'
},
{
begin: '\\b0[xX](_?[0-9a-fA-F])+[lL]?\\b'
},
// imagnumber (digitpart-based)
// https://docs.python.org/3.9/reference/lexical_analysis.html#imaginary-literals
{
begin: `\\b(${digitpart})[jJ]\\b`
}
]
};
const COMMENT_TYPE = {
className: "comment",
begin: lookahead(/# type:/),
end: /$/,
keywords: KEYWORDS,
contains: [
{ // prevent keywords from coloring `type`
begin: /# type:/
},
// comment within a datatype comment includes no keywords
{
begin: /#/,
end: /\b\B/,
endsWithParent: true
}
]
};
const PARAMS = {
className: 'params',
variants: [
// Exclude params in functions without params
{
className: "",
begin: /\(\s*\)/,
skip: true
},
{
begin: /\(/,
end: /\)/,
excludeBegin: true,
excludeEnd: true,
keywords: KEYWORDS,
contains: [
'self',
PROMPT,
NUMBER,
STRING,
hljs.HASH_COMMENT_MODE
]
}
]
};
SUBST.contains = [
STRING,
NUMBER,
PROMPT
];
return {
name: 'Python',
aliases: [
'py',
'gyp',
'ipython'
],
keywords: KEYWORDS,
illegal: /(<\/|->|\?)|=>/,
contains: [
PROMPT,
NUMBER,
{
// very common convention
begin: /\bself\b/
},
{
// eat "if" prior to string so that it won't accidentally be
// labeled as an f-string
beginKeywords: "if",
relevance: 0
},
STRING,
COMMENT_TYPE,
hljs.HASH_COMMENT_MODE,
{
match: [
/def/, /\s+/,
UNDERSCORE_IDENT_RE
],
scope: {
1: "keyword",
3: "title.function"
},
contains: [ PARAMS ]
},
{
variants: [
{
match: [
/class/, /\s+/,
UNDERSCORE_IDENT_RE, /\s*/,
/\(\s*/, UNDERSCORE_IDENT_RE,/\s*\)/
],
},
{
match: [
/class/, /\s+/,
UNDERSCORE_IDENT_RE
],
}
],
scope: {
1: "keyword",
3: "title.class",
6: "title.class.inherited",
}
},
{
className: 'meta',
begin: /^[\t ]*@/,
end: /(?=#)|$/,
contains: [
NUMBER,
PARAMS,
STRING
]
}
]
};
}
export default python;