UNPKG

@wardbrian/tree-sitter-stan

Version:

A tree-sitter grammar for modern Stan programs

1,028 lines (885 loc) 26 kB
const PREC = { ASSIGN: -2, CONDITIONAL: -1, DEFAULT: 0, LOR: 2, LAND: 3, EQ: 4, NEQ: 4, LT: 5, LEQ: 5, GT: 5, GEQ: 5, SUB: 6, ADD: 6, MOD: 7, DIV: 7, MULT: 7, LEFT_DIV: 8, EL_DIV: 9, EL_MULT: 9, PLUS: 10, MINUS: 10, LNEG: 10, EXPON: 11, BRACKET: 12, INDEX: 12, PAREN: 12, FUNCTION: 12, TRANSPOSE: 12, RANGE: 13, }; function commaSep2(rule) { return seq( rule, repeat1(seq(',', rule)), ); } function commaSep1(rule) { return seq( rule, repeat(seq(',', rule)), ); } // comma separated variables function commaSep(rule) { return optional(commaSep1(rule)); } function decl($, type_rule, expr_rule) { return seq( type_rule, commaSep1(seq(field("name", $.identifier) , optional(seq('=', expr_rule)))), ';'); } const operators = { infix: { arithmetic: [ ['^', PREC.EXPON, prec.right], ['.^', PREC.EXPON, prec.right], ['.*', PREC.EL_MULT, prec.left], ['./', PREC.EL_DIV, prec.left], ['\\', PREC.LEFT_DIV, prec.left], ['*', PREC.MULT, prec.left], ['%', PREC.MOD, prec.left], ['/', PREC.DIV, prec.left], ['%/%', PREC.DIV, prec.left], ['+', PREC.ADD, prec.left], ['.+', PREC.ADD, prec.left], ['-', PREC.SUB, prec.left], ['.-', PREC.SUB, prec.left], ], logical: [ ['>=', PREC.GEQ, prec.left], ['>', PREC.GT, prec.left], ['<=', PREC.LEQ, prec.left], ['<', PREC.LT, prec.left], ['!=', PREC.NEQ, prec.left], ['==', PREC.EQ, prec.left], ['&&', PREC.LAND, prec.left], ['||', PREC.LOR, prec.left], ], }, prefix: [ ['+', PREC.PLUS], ['-', PREC.MINUS], ['!', PREC.LNEG], ], postfix: [ ['\'', PREC.TRANSPOSE], ], }; function infixExpressions(ops, expression) { return ops.map(([op, number, func]) => func(number, seq(expression, op, expression))); } function prefixExpressions(ops, expression) { return ops.map(([op, number]) => prec.right(number, seq(op, expression))); } function postfixExpressions(ops, expression) { return ops.map(([op, number]) => prec.left(number, seq(expression, op))); } module.exports = grammar({ name: 'stan', extras: $ => [ /\s/, $.comment, $.preproc_include, ], inline: $ => [ $._statement, 
$._vardecl_or_statement, ], conflicts: $ => [ [$.array_expression, $.block_statement], [$._common_expression, $.lhs], ], word: $ => $.identifier, rules: { // The production rules of the context-free grammar program: $ => seq( optional($.functions), optional($.data), optional($.transformed_data), optional($.parameters), optional($.transformed_parameters), optional($.model), optional($.generated_quantities), ), functions: $ => seq( 'functions', '{', repeat($.function_definition), '}', ), data: $ => seq( 'data', '{', repeat($.top_var_decl_no_assign), '}', ), transformed_data: $ => seq( token('transformed data'), '{', repeat($._top_vardecl_or_statement), '}', ), parameters: $ => seq( 'parameters', '{', repeat($.top_var_decl_no_assign), '}', ), transformed_parameters: $ => seq( token('transformed parameters'), '{', repeat($._top_vardecl_or_statement), '}', ), model: $ => prec(1, seq( 'model', '{', repeat($._vardecl_or_statement), '}', )), generated_quantities: $ => seq( token('generated quantities'), '{', repeat($._top_vardecl_or_statement), '}', ), var_decl: $ => decl($, $._sized_higher_type, $._expression), top_var_decl: $ => choice(decl($, $._topvar_higher_type, $._expression)), top_var_decl_no_assign: $ => choice(decl($, $._topvar_higher_type, "<<<UNREACHABLE TOKEN>>>"), $.empty_statement), _vardecl_or_statement: $ => choice( $._statement, $.var_decl ), _top_vardecl_or_statement: $ => choice( $._statement, $.top_var_decl ), // Function declaration function_definition: $ => seq( $.return_type, $.function_declarator, field("body", $._statement), ), function_declarator: $ => prec(1, seq( field("name", $.identifier), $.parameter_list, )), parameter_list: $ => prec.dynamic(1, seq( '(', commaSep($.parameter_declaration), ')', )), parameter_declaration: $ => seq( optional('data'), $.unsized_type, field("parameter", $.identifier), ), return_type: $ => choice( 'void', $.unsized_type, ), unsized_type: $ => choice( seq("array", $.unsized_dims, $.basic_type), seq("array", 
$.unsized_dims, $.unsized_tuple_type), $.basic_type, $.unsized_tuple_type), unsized_tuple_type: $ => seq( 'tuple', '(', commaSep2($.unsized_type), ')', ), unsized_dims: $ => seq( '[', repeat(','), ']', ), sized_basic_type: $ => prec.dynamic(1, choice( 'int', 'real', 'complex', seq('vector', '[', $._expression, ']'), seq('row_vector', '[', $._expression, ']'), seq('matrix', '[', $._expression, ',', $._expression, ']'), seq('complex_vector', '[', $._expression, ']'), seq('complex_row_vector', '[', $._expression, ']'), seq('complex_matrix', '[', $._expression, ',', $._expression, ']'), )), // unfortunate duplication with topvar types _sized_higher_type: $ => choice( $._sized_array_type, $.sized_tuple_type, $.sized_basic_type ), _sized_array_type: $ => choice(seq( $.arr_dims, $.sized_basic_type ), seq($.arr_dims, $.sized_tuple_type) ), sized_tuple_type: $ => seq( 'tuple', '(', commaSep2($._sized_higher_type), ')', ), /* eslint-disable no-unused-vars */ basic_type: $ => prec.dynamic(1, choice( 'int', 'real', 'complex', 'vector', 'row_vector', 'matrix', 'complex_vector', 'complex_row_vector', 'complex_matrix', )), /* eslint-enable no-unused-vars */ top_var_type: $ => choice( $.int_type, $.real_type, $.complex_type, $.vector_type, $.ordered_type, $.positive_ordered_type, $.simplex_type, $.unit_vector_type, $.sum_to_zero_vector_type, $.sum_to_zero_matrix_type, $.row_vector_type, $.matrix_type, $.complex_vector_type, $.complex_row_vector_type, $.complex_matrix_type, $.cholesky_factor_cov_type, $.cholesky_factor_corr_type, $.cov_matrix_type, $.corr_matrix_type, $.row_stochastic_matrix_type, $.column_stochastic_matrix_type, ), _topvar_higher_type: $ => choice( $._topvar_array_type, $.topvar_tuple_type, $.top_var_type ), _topvar_array_type: $ => seq($.arr_dims, choice( $.top_var_type, $.topvar_tuple_type) ), topvar_tuple_type: $ => seq( 'tuple', '(', commaSep2($._topvar_higher_type), ')', ), arr_dims: $ => seq( 'array', '[', commaSep($._expression), ']', ), int_type: $ => seq( 
'int', optional($._range_constraint), ), real_type: $ => seq( 'real', optional($.type_constraint), ), complex_type: $ => "complex", vector_type: $ => seq( 'vector', optional($.type_constraint), '[', $._expression, ']', ), ordered_type: $ => seq( 'ordered', '[', $._expression, ']', ), positive_ordered_type: $ => seq( 'positive_ordered', '[', $._expression, ']', ), simplex_type: $ => seq( 'simplex', '[', $._expression, ']', ), unit_vector_type: $ => seq( 'unit_vector', '[', $._expression, ']', ), sum_to_zero_vector_type: $ => seq( 'sum_to_zero_vector', '[', $._expression, ']', ), sum_to_zero_matrix_type: $ => seq( 'sum_to_zero_vector', '[', $._expression, ',', $._expression, ']', ), row_vector_type: $ => seq( 'row_vector', optional($.type_constraint), '[', $._expression, ']', ), matrix_type: $ => seq( 'matrix', optional($.type_constraint), '[', $._expression, ',', $._expression, ']', ), complex_vector_type: $ => seq( 'complex_vector', '[', $._expression, ']', ), complex_row_vector_type: $ => seq( 'complex_row_vector', '[', $._expression, ']', ), complex_matrix_type: $ => seq( 'complex_matrix', '[', $._expression, ',', $._expression, ']', ), cholesky_factor_corr_type: $ => seq( 'cholesky_factor_corr', '[', $._expression, ']', ), cholesky_factor_cov_type: $ => seq( 'cholesky_factor_cov', '[', $._expression, optional( seq( ',', $._expression, ), ), ']', ), corr_matrix_type: $ => seq( 'corr_matrix', '[', $._expression, ']', ), column_stochastic_matrix_type: $ => seq( 'column_stochastic_matrix', '[', $._expression, ',', $._expression, ']', ), row_stochastic_matrix_type: $ => seq( 'row_stochastic_matrix', '[', $._expression, ',', $._expression, ']', ), cov_matrix_type: $ => seq( 'cov_matrix', '[', $._expression, ']', ), type_constraint: $ => prec(PREC.RANGE, choice( $._range_constraint, seq("<", $.offset_mult, ">" ) ) ), // this parses differently than Stan. 
Stan does not allow // ANY <> expressions inside the _range_constraint: $ => prec(PREC.RANGE, choice( $.range_lower, $.range_upper, $.range_lower_upper, $.range_upper_lower, )), /* eslint-enable no-unused-vars */ range_lower_upper: $ => prec(PREC.RANGE, seq( '<', 'lower', '=', $._range_expression, ',', 'upper', '=', $._range_expression, '>', )), range_upper_lower: $ => prec(PREC.RANGE, seq( '<', 'upper', '=', $._range_expression, ',', 'lower', '=', $._range_expression, '>', )), range_lower: $ => prec(PREC.RANGE, seq( '<', 'lower', '=', $._range_expression, '>', )), range_upper: $ => prec(PREC.RANGE, seq( '<', 'upper', '=', $._range_expression, '>', )), offset_mult: $ => prec(PREC.RANGE, choice( seq("offset", "=", $._range_expression, ",", "multiplier", "=", $._range_expression), seq("multiplier", "=", $._range_expression, ",", "offset", "=", $._range_expression), seq("offset", "=", $._range_expression), seq("multiplier", "=", $._range_expression), )), /* eslint-disable no-unused-vars */ identifier: $ => /[A-Za-z][A-Za-z0-9_]*/, /* eslint-enable no-unused-vars */ // Expressions _expression: $ => choice( $._range_expression, $.conditional_expression, $.infix_op_expression, $.prefix_op_expression, $.postfix_op_expression, $.indexed_expression, ), // in order to deal with ambiguity of <> range expressions // cannot directly include logical infix operators. 
_range_expression: $ => choice( $._common_expression, alias($.infix_op_range_expression, $.infix_op_expression), alias($.prefix_op_range_expression, $.prefix_op_expression), alias($.postfix_op_range_expression, $.postfix_op_expression), alias($.indexed_range_expression, $.indexed_expression), ), // range constraints only allow a subset of expressions _common_expression: $ => choice( $.integer_literal, $.real_literal, $.imag_literal, $.variable_expression, $.array_expression, $.tuple_expression, $.vector_expression, $.function_expression, $.distr_expression, $.tuple_projection, $.parenthized_expression, ), variable_expression: $ => $.identifier, conditional_expression: $ => prec.right(PREC.CONDITIONAL, seq( $._expression, '?', $._expression, ':', $._expression, )), array_expression: $ => seq( '{', commaSep($._expression), '}', ), tuple_expression: $ => seq( '(', commaSep2($._expression), ')', ), tuple_projection: $ => seq( $._common_expression, '.', field("idx", token.immediate(/[0-9]+/)) ), vector_expression: $ => prec(PREC.BRACKET, seq( '[', commaSep($._expression), ']', )), // operator expressions outside of range constraints infix_op_expression: $ => { const ops = operators.infix.arithmetic.concat(operators.infix.logical); return choice( ...infixExpressions(ops, $._expression), ); }, prefix_op_expression: $ => choice(...prefixExpressions(operators.prefix, $._expression)), postfix_op_expression: $ => choice(...postfixExpressions(operators.postfix, $._expression)), // trick used for call expression in c grammar indexed_expression: $ => prec.left(PREC.INDEX, seq( $._expression, '[', optional($.index), repeat(seq(',', optional($.index))), ']', )), // expressions inside of range constraints infix_op_range_expression: $ => choice( ...infixExpressions(operators.infix.arithmetic, $._range_expression), ), prefix_op_range_expression: $ => choice( ...prefixExpressions(operators.prefix, $._range_expression), ), postfix_op_range_expression: $ => choice( 
...postfixExpressions(operators.postfix, $._range_expression), ), // trick used for call expression in c grammar indexed_range_expression: $ => prec.left(PREC.INDEX, seq( $._range_expression, '[', optional($.index), repeat(seq(',', optional($.index))), ']', )), function_expression: $ => seq( field("name", $.identifier), $.argument_list, ), argument_list: $ => prec.dynamic(1, seq( '(', commaSep($._expression), ')', )), distr_expression: $ => prec(PREC.FUNCTION, seq( field("name", $.identifier), $.distr_argument_list, )), distr_argument_list: $ => prec.dynamic(1, seq( '(', $._expression, '|', commaSep($._expression), ')', )), parenthized_expression: $ => seq( '(', $._expression, ')', ), index: $ => choice( $._expression, $.colon_expression, $.left_colon_expression, $.right_colon_expression, ":", ), colon_expression: $ => seq($._expression, ':', $._expression), left_colon_expression: $ => seq(':', $._expression), right_colon_expression: $ => seq($._expression, ':'), /* eslint-disable no-unused-vars */ integer_literal: $ => /[0-9]+(_[0-9]+)*/, /* eslint-enable no-unused-vars */ // can't use token because of https://github.com/tree-sitter/tree-sitter/issues/449 exp_literal: $ => seq( token.immediate(/[eE][+-]?/), $.integer_literal ), real_literal: $ => choice( seq($.integer_literal, token.immediate("."), optional($.integer_literal), optional($.exp_literal)), seq(".", $.integer_literal, optional($.exp_literal)), seq($.integer_literal, $.exp_literal), ), imag_literal: $ => seq( choice( $.integer_literal, $.real_literal, ), token.immediate("i") ), // Statements _statement: $ => choice( $.empty_statement, $.assignment_statement, $.sampling_statement, $.function_statement, $.log_prob_statement, $.target_statement, $.jacobian_statement, $.break_statement, $.continue_statement, $.print_statement, $.reject_statement, $.fatal_error_statement, $.return_statement, $.if_statement, $.while_statement, $.for_statement, $.block_statement, $.profile_statement, ), /* eslint-disable 
no-unused-vars */ empty_statement: $ => ';', /* eslint-enable no-unused-vars */ assignment_statement: $ => prec(PREC.ASSIGN, seq( $.lhs, $.assignment_op, $._expression, ';', )), lhs: $ => choice( $.variable_expression, $.indexed_lhs, $.tuple_proj_lhs, $.tuple_pack_lhs, ), indexed_lhs: $ => prec.left(PREC.INDEX, seq( $.lhs, '[', commaSep1($.index), ']' )), tuple_pack_lhs: $ => seq( '(', commaSep1($.lhs), ')' ), tuple_proj_lhs: $ => seq( $.lhs, '.', field("idx", token.immediate(/[0-9]+/)), ), /* eslint-disable no-unused-vars */ assignment_op: $ => choice( '=', '+=', '-=', '*=', '/=', '.*=', './=', ), /* eslint-enable no-unused-vars */ function_statement: $ => prec(PREC.FUNCTION, seq( field("name", $.identifier), $.argument_list, ';', )), sampling_statement: $ => seq( $._expression, '~', field("name", $.identifier), '(', commaSep($._expression), ')', optional($._truncation), ';', ), /* eslint-disable no-unused-vars */ _truncation: $ => choice( $.lower_upper_truncation, $.lower_truncation, $.upper_truncation, $.empty_truncation, ), /* eslint-enable no-unused-vars */ /* eslint-disable no-unused-vars */ lower_upper_truncation: $ => seq( 'T', '[', $._expression, ',', $._expression, ']', ), /* eslint-enable no-unused-vars */ /* eslint-disable no-unused-vars */ lower_truncation: $ => seq( 'T', '[', $._expression, ',', ']', ), /* eslint-enable no-unused-vars */ /* eslint-disable no-unused-vars */ upper_truncation: $ => seq( 'T', '[', ',', $._expression, ']', ), /* eslint-enable no-unused-vars */ /* eslint-disable no-unused-vars */ empty_truncation: $ => seq( 'T', '[', ',', ']', ), /* eslint-enable no-unused-vars */ log_prob_statement: $ => seq( 'increment_log_prob', '(', $._expression, ')', ';', ), target_statement: $ => seq( 'target', '+=', $._expression, ';', ), jacobian_statement: $ => seq( 'jacobian', '+=', $._expression, ';', ), /* eslint-disable no-unused-vars */ break_statement: $ => seq('break', ';'), /* eslint-enable no-unused-vars */ /* eslint-disable 
no-unused-vars */ continue_statement: $ => seq('continue', ';'), /* eslint-enable no-unused-vars */ print_statement: $ => seq( 'print', '(', commaSep(choice($._expression, $.string_literal)), ')', ';', ), // currently nested quotes not allowed /* eslint-disable no-unused-vars */ string_literal: $ => /"[^\"]*"/, /* eslint-enable no-unused-vars */ reject_statement: $ => seq( 'reject', '(', optional(commaSep(choice($._expression, $.string_literal))), ')', ';', ), fatal_error_statement: $ => seq( 'fatal_error', '(', optional(commaSep(choice($._expression, $.string_literal))), ')', ';', ), return_statement: $ => seq( 'return', optional($._expression), ';', ), // right precedence needed to resolve ambiguity if_statement: $ => prec.right(1, seq( 'if', '(', $._expression, ')', $._vardecl_or_statement, // implicit else if optional(seq('else', $._statement)), )), while_statement: $ => prec(1, seq( 'while', '(', $._expression, ')', $._vardecl_or_statement, )), for_statement: $ => prec(1, seq( 'for', '(', field("loopvar", $.identifier), 'in', $._expression, optional(seq(':', $._expression)), ')', $._vardecl_or_statement, )), block_statement: $ => seq( '{', repeat($._vardecl_or_statement), '}', ), profile_statement: $ => seq( "profile", "(", $.string_literal, ")", '{', repeat($._vardecl_or_statement), '}', ), /* eslint-disable no-unused-vars */ preproc_include: $ => seq( field("directive", '#include'), // could be more specific /[<'\"]?/, field("file", $.preproc_file), /[>'\"]?/, ), /* eslint-enable no-unused-vars */ preproc_file: $ => token.immediate(/.*/), /* eslint-disable no-unused-vars */ comment: $ => token(choice( seq('//', /.*/), // http://stackoverflow.com/questions/13014947/regex-to-match-a-c-style-multiline-comment/36328890#36328890 seq( '/*', /[^*]*\*+([^/*][^*]*\*+)*/, '/', ), )), /* eslint-enable no-unused-vars */ }, });