@eagleoutice/tree-sitter-r
Version:
R grammar for tree-sitter
718 lines (618 loc) • 24.4 kB
JavaScript
// ---------------------------------------------------------------------------------------
// R has an operator precedence table defined here:
// https://github.com/wch/r-source/blob/0a8f53a7ba47463f1c938dd3e2c2acc7a2d3a1c2/src/main/gram.y#L419-L441
//
// Our precedence table defined in `PREC` follows the R precedence order pretty closely.
// However, we sometimes need to adjust the associativity, see `NOTE ON PREC.RIGHT` below.
//
// We also don't need the `ELSE` or `PIPEBIND` precedence specifications from the table.
// We handle `ELSE` within the `if` node using a special external, and `=>` didn't make it
// into a release version of R (it is off by default) and we think it is unlikely to make
// a return.
//
// Note that if a precedence rank is unspecified in a rule, it can be assumed to be 0.
// ---------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------
// NOTE ON PREC.RIGHT:
//
// A few things in this table are left associative in R's grammar, but we are forced to
// make them right associative to get the behavior we want. This includes:
// - $, @
// - ::, :::
//
// We are forced to do this because we want these nodes to have an `optional()` RHS.
// While `dplyr::` isn't parsable R code, we want it to be recognized as a `::` node with
// a known package name, as this helps us generate completions.
//
// The trailing `optional()` then means there are two interpretations of `foo::bar`:
// - [foo::][bar] == [namespace][identifier]
// - [foo::bar] == [namespace]
//
// We want to force the latter, which means that the namespace rule has to be right
// associative to "prefer matching a rule that ends later".
//
// In practice we don't think this will matter for the rules we've had to swap the order
// on, since they are typically the only things at their numeric precedence rank.
// ---------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------
// NOTE ON OPERATORS:
//
// For `unary_operator`, `binary_operator`, `extract_operator`, and `namespace_operator`,
// the way these have been grouped is based on the semantic definition of each kind of
// operator group. Specifically:
//
// <unary> <expr>
// <expr> <binary> <expr>
// <expr> <extract> <symbol>
// <symbol> <namespace> <symbol>
//
// In theory, we could have gone further to, say, split out comparison and arithmetic
// operators from the binary operator group, or split out the pipe as its own rule.
// However, this is all rather arbitrary, and we decided it was best to stop the grouping
// at semantic meaning, and let end consumers layer on additional behavior as needed by
// creating more granular groups based on the `operator` field.
// ---------------------------------------------------------------------------------------
// ---------------------------------------------------------------------------------------
// NOTE ON NEWLINES BETWEEN PARENTHESES:
//
// Typically, we handle newlines explicitly in the R grammar, as they often do have
// contextual meaning.
//
// However, the external scanner will proactively consume `\n` if we are within a scope
// defined by a `(`, `[`, or `[[`. This is why you don't see `repeat($.newline)` within
// the `parameters` node, or after the `(` of an `if_statement` node - newlines will have
// been consumed already!
// ---------------------------------------------------------------------------------------
const PREC = {
// #
// NOTE: If we don't put comments at a negative rank, then `"#"` will treat the `#` as
// the start of a comment rather than being part of the string.
COMMENT: { ASSOC: prec, RANK: -1},
// {, (
// NOTE: If we understand correctly, brace and parenthesis blocks are given the same
// precedence as all other general expressions. Note that they are not tagged with an
// explicit `%prec` in the links below, and the last terminal nodes of `}` and `)` do
// not have a `%left` or `%right` precedence assigned in the table above (the last
// terminal node is how Bison assigns precedence to a rule by default), so we are left
// to assume they have default precedence.
// https://github.com/wch/r-source/blob/0a8f53a7ba47463f1c938dd3e2c2acc7a2d3a1c2/src/main/gram.y#L467-L468
// https://www.gnu.org/software/bison/manual/html_node/How-Precedence.html
BLOCK: { ASSOC: prec, RANK: 0 },
// ..i
// NOTE: If we don't put `dot_dot_i` at a positive rank, then `..1` will get
// treated as an `identifier` because the rules for `identifier` and
// `dot_dot_i` both match `..1`, but `identifier` physically comes first in
// the grammar so it will otherwise have prioritiy if both have rank 0.
DOT_DOT_I: { ASSOC: prec, RANK: 1 },
// ?
HELP: { ASSOC: prec.left, RANK: 1 },
// function() {}
// while() {}
// for() {}
// repeat {}
FUNCTION_OR_LOOP: { ASSOC: prec.left, RANK: 2 },
// if {}
IF: { ASSOC: prec.right, RANK: 3 },
// <-, <<-, :=
LEFT_ASSIGN: { ASSOC: prec.right, RANK: 4 },
// =
EQUALS_ASSIGN: { ASSOC: prec.right, RANK: 5 },
// ->, ->>
RIGHT_ASSIGN: { ASSOC: prec.left, RANK: 6 },
// ~
TILDE: { ASSOC: prec.left, RANK: 7 },
// |, ||
OR: { ASSOC: prec.left, RANK: 8 },
// &, &&
AND: { ASSOC: prec.left, RANK: 9 },
// !
UNARY_NOT: { ASSOC: prec.left, RANK: 10 },
// >, >=, <, <=, ==, !=
// NOTE: These are nonassoc in R's grammar, but we have to specify
// associativity to generate the grammar, and left seems correct.
COMPARISON: { ASSOC: prec.left, RANK: 11 },
// +, -
PLUS_MINUS: { ASSOC: prec.left, RANK: 12 },
// *, /
MULTIPLY_DIVIDE: { ASSOC: prec.left, RANK: 13 },
// %>%, %<>%, |>
SPECIAL_OR_PIPE: { ASSOC: prec.left, RANK: 14 },
// :
COLON: { ASSOC: prec.left, RANK: 15 },
// +, -
UNARY_PLUS_MINUS: { ASSOC: prec.left, RANK: 16 },
// ^, **
EXPONENTIATE: { ASSOC: prec.right, RANK: 17 },
// $, @
// NOTE: See `NOTE ON PREC.RIGHT` above
EXTRACT: { ASSOC: prec.right, RANK: 18 },
// ::, :::
// NOTE: See `NOTE ON PREC.RIGHT` above
NAMESPACE: { ASSOC: prec.right, RANK: 19 },
// match(1, 2), [, [[
// NOTE: We aren't entirely sure how Bison works for calls and subsets. Practically,
// we need calls to have high precedence so things like `function(x, y, z) match(x, y)`
// don't get parsed as a call with function `function(x, y, z) match` and arguments of
// `(x, y)`. In the Bison grammar, there is no `%prec` specified for these rules, and
// the last terminal nodes of `]` and `)` don't have an assigned precedence in the
// table, so in theory they have the same precedence as general R expressions, but that
// obviously isn't the case. Possibly this has to do with Bison's lookahead that
// tree-sitter doesn't do.
// https://github.com/wch/r-source/blob/0a8f53a7ba47463f1c938dd3e2c2acc7a2d3a1c2/src/main/gram.y#L501
// https://github.com/wch/r-source/blob/0a8f53a7ba47463f1c938dd3e2c2acc7a2d3a1c2/src/main/gram.y#L507-L508
// https://github.com/wch/r-source/blob/0a8f53a7ba47463f1c938dd3e2c2acc7a2d3a1c2/src/main/gram.y#L441
CALL: { ASSOC: prec.right, RANK: 20 },
}
module.exports = grammar({
name: 'r',
extras: $ => [
$.comment,
/\s/
],
externals: $ => [
$._start,
$._newline,
$._semicolon,
$._raw_string_literal,
// Don't use `_external` variants directly. Instead use their aliased versions.
$._external_else,
$._external_open_parenthesis,
$._external_close_parenthesis,
$._external_open_brace,
$._external_close_brace,
$._external_open_bracket,
$._external_close_bracket,
$._external_open_bracket2,
$._external_close_bracket2,
// If a syntax error is encountered, the first thing to occur is that our external
// scanner is called with all symbols marked valid. We really don't want this, so
// we include a sentinel external to detect this and decline to handle.
$._error_sentinel
],
word: $ => $.identifier,
rules: {
// Top-level rules.
// The zero width `$._start` ensures that `program` starts at `(0, 0)`.
program: $ => seq(
$._start,
repeat(choice($._expression, $._semicolon, $._newline))
),
// Function definitions.
function_definition: $ => withPrec(PREC.FUNCTION_OR_LOOP, seq(
field("name", choice("\\", "function")),
repeat($._newline),
field("parameters", $.parameters),
repeat($._newline),
field("body", $._expression)
)),
// NOTE: We include "(" and ")" as part of the rule here to allow
// tree-sitter to create a "parameters" node in the AST even when
// no parameters are declared for a function.
// NOTE: See `NOTE ON NEWLINES BETWEEN PARENTHESES` above
parameters: $ => seq(
field("open", $._open_parenthesis),
optional(seq(
field("parameter", $.parameter),
repeat(seq($.comma, field("parameter", $.parameter)))
)),
field("close", $._close_parenthesis)
),
parameter: $ => choice(
$._parameter_with_default,
$._parameter_without_default
),
_parameter_with_default: $ => seq(
$._parameter_name,
"=",
optional(field("default", $._expression))
),
_parameter_without_default: $ => $._parameter_name,
_parameter_name: $ => field("name", $._identifier_or_dots_or_dot_dot_i),
// Control flow.
// NOTE: See `NOTE ON NEWLINES BETWEEN PARENTHESES` above
if_statement: $ => withPrec(PREC.IF, seq(
"if",
repeat($._newline),
field("open", $._open_parenthesis),
field("condition", $._expression),
field("close", $._close_parenthesis),
repeat($._newline),
field("consequence", $._expression),
// No `repeat($._newline)` here. Specially handled in the scanner instead.
optional(seq(
$._else,
repeat($._newline),
field("alternative", $._expression)
))
)),
// NOTE: See `NOTE ON NEWLINES BETWEEN PARENTHESES` above
for_statement: $ => withPrec(PREC.FUNCTION_OR_LOOP, seq(
"for",
repeat($._newline),
field("open", $._open_parenthesis),
field("variable", $._identifier_or_dots_or_dot_dot_i),
"in",
field("sequence", $._expression),
field("close", $._close_parenthesis),
repeat($._newline),
field("body", $._expression)
)),
// NOTE: See `NOTE ON NEWLINES BETWEEN PARENTHESES` above
while_statement: $ => withPrec(PREC.FUNCTION_OR_LOOP, seq(
"while",
repeat($._newline),
field("open", $._open_parenthesis),
field("condition", $._expression),
field("close", $._close_parenthesis),
repeat($._newline),
field("body", $._expression)
)),
repeat_statement: $ => withPrec(PREC.FUNCTION_OR_LOOP, seq(
"repeat",
repeat($._newline),
field("body", $._expression)
)),
// Blocks.
braced_expression: $ => withPrec(PREC.BLOCK, seq(
field("open", $._open_brace),
repeat(field("body", choice($._expression, $._semicolon, $._newline))),
field("close", $._close_brace)
)),
// NOTE: R requires exactly 1 expression within parentheses
// NOTE: See `NOTE ON NEWLINES BETWEEN PARENTHESES` above
parenthesized_expression: $ => withPrec(PREC.BLOCK, seq(
field("open", $._open_parenthesis),
field("body", $._expression),
field("close", $._close_parenthesis)
)),
// Function calls and subsetting.
call: $ => withPrec(PREC.CALL, seq(
field("function", $._expression),
field("arguments", alias($.call_arguments, $.arguments))
)),
subset: $ => withPrec(PREC.CALL, seq(
field("function", $._expression),
field("arguments", alias($.subset_arguments, $.arguments))
)),
subset2: $ => withPrec(PREC.CALL, seq(
field("function", $._expression),
field("arguments", alias($.subset2_arguments, $.arguments))
)),
// The actual matching rules for arguments in each of the above.
//
// Complicated by the fact that we want to support any number
// of sequential commas, but two arguments must be separated
// by at least one comma.
//
// We accomplish this by using `delimSep1()`, but we make the
// actual `rule` itself `optional()`.
//
// # Allowed
// fn()
// fn(,)
// fn(,,)
// fn(a)
// fn(a, b)
// fn(,,a,,b,,)
//
// # Not allowed
// fn(a b)
//
// NOTE: See `NOTE ON NEWLINES BETWEEN PARENTHESES` above
call_arguments: $ => seq(
field("open", $._open_parenthesis),
delimSep1(optional(field("argument", $.argument)), $.comma),
field("close", $._close_parenthesis)
),
subset_arguments: $ => seq(
field("open", $._open_bracket),
delimSep1(optional(field("argument", $.argument)), $.comma),
field("close", $._close_bracket)
),
subset2_arguments: $ => seq(
field("open", $._open_bracket2),
delimSep1(optional(field("argument", $.argument)), $.comma),
field("close", $._close_bracket2)
),
// An argument; either named or unnamed.
argument: $ => choice(
$._argument_named,
$._argument_unnamed
),
_argument_named: $ => seq(
field("name", $._argument_name_string_or_identifier_or_dots_or_dot_dot_i),
"=",
optional($._argument_value)
),
_argument_unnamed: $ => $._argument_value,
_argument_value: $ => field("value", $._expression),
// Operators
// NOTE: See `NOTE ON OPERATORS` above
// NOTE: Newlines are allowed after all unary operators
unary_operator: $ => {
const table = [
["?", PREC.HELP],
["~", PREC.TILDE],
["!", PREC.UNARY_NOT],
["+", PREC.UNARY_PLUS_MINUS],
["-", PREC.UNARY_PLUS_MINUS]
];
return choice(...table.map(([operator, prec]) => prec.ASSOC(prec.RANK, seq(
field("operator", operator),
repeat($._newline),
field("rhs", $._expression)
))))
},
// NOTE: Expressions are allowed on either side of the operator
binary_operator: $ => {
const table = [
["?", PREC.HELP],
["~", PREC.TILDE],
["<-", PREC.LEFT_ASSIGN],
["<<-", PREC.LEFT_ASSIGN],
[":=", PREC.LEFT_ASSIGN],
["->", PREC.RIGHT_ASSIGN],
["->>", PREC.RIGHT_ASSIGN],
["=", PREC.EQUALS_ASSIGN],
["|", PREC.OR],
["&", PREC.AND],
["||", PREC.OR],
["&&", PREC.AND],
["<", PREC.COMPARISON],
["<=", PREC.COMPARISON],
[">", PREC.COMPARISON],
[">=", PREC.COMPARISON],
["==", PREC.COMPARISON],
["!=", PREC.COMPARISON],
["+", PREC.PLUS_MINUS],
["-", PREC.PLUS_MINUS],
["*", PREC.MULTIPLY_DIVIDE],
["/", PREC.MULTIPLY_DIVIDE],
["**", PREC.EXPONENTIATE],
["^", PREC.EXPONENTIATE],
// Special infix operator
// Regex: Between two `%`, anything but another `%`, `\`, or `\n`.
// Includes primitives `%%` and `%/%`.
// Alias is used for targeting in `highlights.scm`.
// TODO: This could probably be fine tuned to disallow more things.
[alias(/%[^%\\\n]*%/, "special"), PREC.SPECIAL_OR_PIPE],
["|>", PREC.SPECIAL_OR_PIPE],
[":", PREC.COLON]
];
return choice(...table.map(([operator, prec]) => prec.ASSOC(prec.RANK, seq(
field("lhs", $._expression),
field("operator", operator),
repeat($._newline),
field("rhs", $._expression)
))))
},
// NOTE: Expression on LHS, string/identifier/dots/dot_dot_i on RHS
extract_operator: $ => {
const table = [
["$", PREC.EXTRACT],
["@", PREC.EXTRACT]
];
return choice(...table.map(([operator, prec]) => prec.ASSOC(prec.RANK, seq(
field("lhs", $._expression),
field("operator", operator),
repeat($._newline),
optional(field("rhs", $._string_or_identifier_or_dots_or_dot_dot_i))
))))
},
// NOTE: No newlines are allowed. String/identifier/dots/dot_dot_i on both LHS and RHS.
namespace_operator: $ => {
const table = [
["::", PREC.NAMESPACE],
[":::", PREC.NAMESPACE]
];
return choice(...table.map(([operator, prec]) => prec.ASSOC(prec.RANK, seq(
field("lhs", $._string_or_identifier_or_dots_or_dot_dot_i),
field("operator", operator),
optional(field("rhs", $._string_or_identifier_or_dots_or_dot_dot_i))
))))
},
// Numeric literals.
integer: $ => seq($._float_literal, "L"),
complex: $ => seq($._float_literal, "i"),
float: $ => $._float_literal,
_hex_literal: $ => /0[xX][0-9a-fA-F]+/,
_number_literal: $ => /(?:(?:\d+(?:\.\d*)?)|(?:\.\d+))(?:[eE][+-]?\d*)?/,
_float_literal: $ => choice($._hex_literal, $._number_literal),
// Strings.
string: $ => choice(
$._raw_string_literal,
$._single_quoted_string,
$._double_quoted_string
),
// TODO: Raw string contents, something like this, where `_raw_string_open`,
// `_raw_string_close`, and `_raw_string_content` are externals.
// _raw_string_literal: $ => seq(
// $._raw_string_open,
// optional(field("content", alias($._raw_string_content, $.string_content))),
// $._raw_string_close
// ),
// Explanation is:
// - Between two quote characters, allow either:
// - Anything except `'` (or `"`) or `\`
// - An escape sequence
_single_quoted_string: $ => seq(
'\'',
optional(field("content", alias($._single_quoted_string_content, $.string_content))),
'\''
),
_double_quoted_string: $ => seq(
'"',
optional(field("content", alias($._double_quoted_string_content, $.string_content))),
'"'
),
_single_quoted_string_content: $ => repeat1(choice(
/[^'\\]+/,
$.escape_sequence
)),
_double_quoted_string_content: $ => repeat1(choice(
/[^"\\]+/,
$.escape_sequence
)),
escape_sequence: $ => token.immediate(seq(
'\\',
choice(
/[^0-9xuU]/,
/[0-7]{1,3}/,
/x[0-9a-fA-F]{1,2}/,
/u[0-9a-fA-F]{1,4}/,
/u\{[0-9a-fA-F]{1,4}\}/,
/U[0-9a-fA-F]{1,8}/,
/U\{[0-9a-fA-F]{1,8}\}/
)
)),
// Identifiers.
// NOTE: `_` isn't a valid way to start an R identifier, but we are a little
// lax here and parse it anyways. One reason is because want to support a lone `_` as
// the pipe placeholder identifier. It could be included as a separate `"_"` choice,
// but then `_foo` parses as two identifiers: `_` and `foo`, making it impossible to
// check that `_foo` is an invalid identifier. It seems simpler to parse `_foo` as a
// single identifier, and then let downstream consumers do further checks on the
// validity as needed (#71).
// NOTE: Due to the linked tree-sitter discussion, if `_identifier` and
// `_quoted_identifier` are their own hidden rules, then we can't detect error
// recovered `identifier`s as missing with `ts_node_is_missing()`. The workaround used
// here inlines the regexes, and wraps the `choice()` call in a single terminal
// `token()` so `identifier` can still be used as the `word` rule.
// https://github.com/tree-sitter/tree-sitter/issues/3332
identifier: $ => {
const _identifier = /[\p{XID_Start}._][\p{XID_Continue}.]*/;
const _quoted_identifier = /`((?:\\(.|\n))|[^`\\])*`/;
return token(
choice(
_identifier,
_quoted_identifier
)
)
},
// Identifier-ish, but useful enough to be their own nodes
dots: $ => "...",
dot_dot_i: $ => token(withPrec(PREC.DOT_DOT_I, /[.][.]\d+/)),
// NOTE: Technically R allows `...` and `..1` anywhere we want an `$.identifier`,
// but practically it can be useful for downstream consumers to have separate
// nodes for these particular constructs. Our compromise is to keep these as separate
// nodes, but then use this in most places we want an identifier.
_identifier_or_dots_or_dot_dot_i: $ => choice(
$.identifier,
$.dots,
$.dot_dot_i
),
// NOTE: Having this as an actual node (rather than inlining the `choice()`) somehow
// ends up allowing better error recovery in a few cases
_string_or_identifier_or_dots_or_dot_dot_i: $ => choice(
$.string,
$.identifier,
$.dots,
$.dot_dot_i
),
// NOTE: This is exactly `_string_or_identifier_or_dots_or_dot_dot_i` but with
// a precedence of 1. It seems like we have to set the `prec(1, )` on the `choice()`
// directly, we can't reuse `_string_or_identifier_or_dots_or_dot_dot_i` here
// otherwise `tree-sitter generate` throws an unresolved conflict error.
//
// This is only for use in `_argument_named`.
//
// Since `_argument_unnamed` can be an arbitrary `_expression` (with precedence 0)
// which includes `string`, `identifier`, `dots`, and `dot_dot_i`, there is an
// ambiguity between:
// - Starting the `value` of an `_argument_unnamed`
// - Starting the `name` of an `_argument_named`
//
// We set a higher precedence here to try and match `_argument_named` first.
_argument_name_string_or_identifier_or_dots_or_dot_dot_i: $ => prec(1, choice(
$.string,
$.identifier,
$.dots,
$.dot_dot_i
)),
// Keywords.
// We define keywords as those contained in `?Reserved`, i.e. it must be a reserved
// word in R's parser to be considered here. If a keyword from `?Reserved` is already
// mentioned within a wider rule (like "if" and "function"), then it is not included
// again here. Grammar consumers can choose to highlight more words as required. We
// make a single exception for `return`, which is not in `?Reserved` but is SO special
// that we decided to include it as a keyword.
return: $ => "return",
next: $ => "next",
break: $ => "break",
true: $ => "TRUE",
false: $ => "FALSE",
null: $ => "NULL",
inf: $ => "Inf",
nan: $ => "NaN",
na: $ => choice(
"NA",
"NA_integer_",
"NA_real_",
"NA_complex_",
"NA_character_"
),
// A general R expression.
_expression: $ => choice(
$.function_definition,
$.if_statement,
$.for_statement,
$.while_statement,
$.repeat_statement,
$.braced_expression,
$.parenthesized_expression,
$.call,
$.subset,
$.subset2,
$.unary_operator,
$.binary_operator,
$.extract_operator,
$.namespace_operator,
$.integer,
$.complex,
$.float,
$.string,
$.identifier,
$.dots,
$.dot_dot_i,
$.return,
$.next,
$.break,
$.true,
$.false,
$.null,
$.inf,
$.nan,
$.na
),
// Comments.
comment: $ => token(withPrec(PREC.COMMENT, /#.*/)),
// Commas. We include these in the AST so we can figure out the
// argument call position. This is necessary given how R tolerates
// missing arguments in function calls.
comma: $ => ",",
// Provide aliasing of some key externals.
// This gives `highlights.scm` something to target for
// `@punctuation.bracket` and `@keyword`.
_else: $ => alias($._external_else, "else"),
_open_parenthesis: $ => alias($._external_open_parenthesis, "("),
_close_parenthesis: $ => alias($._external_close_parenthesis, ")"),
_open_brace: $ => alias($._external_open_brace, "{"),
_close_brace: $ => alias($._external_close_brace, "}"),
_open_bracket: $ => alias($._external_open_bracket, "["),
_close_bracket: $ => alias($._external_close_bracket, "]"),
_open_bracket2: $ => alias($._external_open_bracket2, "[["),
_close_bracket2: $ => alias($._external_close_bracket2, "]]")
}
})
function withPrec(prec, rule) {
return prec.ASSOC(prec.RANK, rule)
}
// Creates a rule to match one or more of the `rule`s separated by a `delim`
//
// Used in argument list generation
function delimSep1(rule, delim) {
return seq(
rule,
repeat(seq(delim, rule))
)
}