@sap/cds-compiler
CDS (Core Data Services) compiler and backends
// Lexer for CDL grammar
// The lexer only cares about potential keywords, not the exact list. That is, it
// sets the `keyword` property for all non-delimited `Id` tokens.
// General remarks about regular expressions in node.js (or in general):
//
// - Alternatives in a regexp are tried from left to right, not longest-match
//   first as with scanner generators!
// - Beware of regular expressions that fail (or match just one char) after
//   having tested k characters of the input.  A regexp with a non-optional
//   match or assertion after a loop (Kleene star) could lead to a lexer
//   execution time of O(n*n).  Therefore, the regexps for strings etc. only
//   cover the opening delimiter.
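//
// Illustrative example (assumed input, not code from this file): lexing
//   'a 'a 'a …
// with a complete-string rule like /'[^'\n]*'/ would re-test overlapping
// tails on every failed match, i.e. O(n*n) overall; matching only the
// opening /'/ and finding the closing delimiter separately (see `string`
// below) keeps the scan linear.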
'use strict';
const { Location } = require('../base/location'); // TODO main: add tokenIndex
const rules = [ // must not contain capturing groups!
{ type: comment, re: '/[*/]' },
// token type = token text (`type: null`):
{ type: null, re: '[-+*?()\\[\\]{},;:/@#]|\\.(?:\\.\\.?)?|<[=>]?|>=?|=[=>]?|!=|\\|\\|' },
{ type: ident, re: '[$_\\p{ID_Start}][$\\p{ID_Continue}\u200C\u200D]*|!\\[|"' },
{ type: string, re: '[\'"]|`(?:``)?' }, // strings, template literal without …${}
{ type: 'Number', re: '\\d+(?:\\.\\d+)?(?:e[-+]?\\d+)?' },
{ type: 'IllegalToken', re: '\\S' }, // must be last
];
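// Each rule contributes exactly one capturing group to the combined regexp
// below, so the index of the matched group identifies the rule.  Flag `i`
// covers e.g. an upper-case exponent in numbers, `u` enables the \p{…}
// property escapes, and `g` allows incremental scanning via `lastIndex`.
// The `§` probe below asserts that no rule sneaked in an extra capturing group.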
const rulesRegexp = new RegExp( `(${ rules.map( r => r.re ).join( ')|(' ) })`, 'iugm' );
if (rulesRegexp.exec( '§' )[rules.length] !== '§')
throw Error( 'Invalid capturing group in rules regexp' );
const newlineRegexp = /\n/g; // TODO: \r?, PS, LS
const commentRegexps = { '//': /$/gm, '/*': /\*\//g };
const stringRegexps = { "'": /'|$/gm, '`': /[`\\]/g, '```': /```|\\/g };
const identRegexps = { '![': /\]|$/gm, '"': /"|$/gm };
const quotedLiterals = [ 'date', 'time', 'timestamp', 'x' ];
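// The regexp tables above are keyed by the opening delimiter.  For tokens
// that must end on the same line (line comments, '…' strings, delimited ids),
// the `m` flag plus the `$` alternative stops the scan at the end of the
// line, so an unterminated token cannot swallow the rest of the input.
// `quotedLiterals` lists the keywords that can directly precede a '…' string
// to form a QuotedLiteral such as date'2024-01-01'.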
class Token {
type;
text;
keyword;
location;
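  // set later by the parser: e.g. 'keyword', 'token', or a syntactic
  // category (see `isIdentifier` below)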
parsedAs;
get isIdentifier() { // compatibility method
return this.parsedAs !== 'keyword' && this.parsedAs !== 'token' && this.parsedAs;
}
get tokenIndex() {
return this.location.tokenIndex;
}
}
class Lexer {
constructor( file, input ) {
this.file = file;
this.input = input; // string
this.linePositions = undefined;
this.location = undefined;
}
characterPos( line, col ) {
return this.linePositions[line - 1] + col - 1;
}
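  // Fill `parser.tokens`, `parser.docComments`, and `parser.comments` from
  // `this.input` and append a final EOF token; `parser` must provide an
  // `error` method for diagnostics.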
tokenize( parser ) {
this.linePositions = [ 0 ];
parser.tokens = [];
    parser.docComments = [];
    parser.comments = []; // read below (tokenIndex) and filled by `comment`
newlineRegexp.lastIndex = 0;
while (newlineRegexp.test( this.input ))
this.linePositions.push( newlineRegexp.lastIndex );
const { file } = this;
let line = 1;
rulesRegexp.lastIndex = 0;
let match;
// eslint-disable-next-line no-cond-assign
while (match = rulesRegexp.exec( this.input )) {
let text = match[0];
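      // all groups except that of the matched rule are undefined, so the
      // position of `text` in the match array identifies the rule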
const group = match.indexOf( text, 1 ) - 1;
let type = rules[group].type || text;
const pos = match.index;
while (pos >= this.linePositions[line])
++line;
const col = pos - this.linePositions[line - 1] + 1;
this.location = {
__proto__: Location.prototype,
file,
line,
col,
endLine: line,
endCol: col + text.length,
        // remark: end positions of multi-line tokens must be set by the rule function
tokenIndex: parser.tokens.length + parser.docComments.length + parser.comments.length,
};
let keyword;
if (typeof type !== 'function' || // eslint-disable-next-line sonarjs/no-nested-assignment
([ type, text, keyword ] = type( text, this, parser, pos )) && type) {
parser.tokens.push( {
__proto__: Token.prototype,
type,
text,
keyword,
location: this.location,
parsedAs: undefined,
} );
}
}
line = this.linePositions.length;
const endCol = this.input.length - this.linePositions[line - 1] + 1;
const location = {
__proto__: Location.prototype,
file,
line,
col: endCol,
endLine: line,
endCol,
tokenIndex: parser.tokens.length + parser.docComments.length + parser.comments.length,
};
parser.tokens.push( {
__proto__: Token.prototype,
type: 'EOF',
text: '',
keyword: false,
location,
parsedAs: undefined,
} );
}
}
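// Rule functions get (matched text, lexer, parser, start offset) and return
// [ type, text, keyword ] for a token to be pushed, or [] if the match was
// consumed otherwise (comment, doc comment, quoted-literal continuation).
// They must advance `rulesRegexp.lastIndex` past the consumed characters.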
function comment( text, lexer, parser, beg ) {
const re = commentRegexps[text];
re.lastIndex = rulesRegexp.lastIndex;
if (!re.test( lexer.input )) {
// eslint-disable-next-line cds-compiler/message-texts
parser.error( 'syntax-missing-token-end', lexer.location,
      { '#': 'comment', code: '/*', newcode: '*/' }, {
comment: 'Comments starting with $(CODE) must end with $(NEWCODE)',
} );
}
else if (text === '/*' && lexer.input.charAt( rulesRegexp.lastIndex ) === '*' &&
rulesRegexp.lastIndex + 2 < re.lastIndex) { // not just `/**/`
parser.docComments.push( {
__proto__: Token.prototype,
type: 'DocComment',
text: lexer.input.substring( beg, re.lastIndex ),
keyword: false,
location: lexer.location,
parsedAs: undefined,
} );
adaptEndLocation( lexer, re.lastIndex ); // also works after push ?
}
else { // TODO: only attach with option `attachTokens` ?
parser.comments.push( {
__proto__: Token.prototype,
type: 'Comment',
text: lexer.input.substring( beg, re.lastIndex ),
keyword: false,
location: lexer.location,
parsedAs: undefined,
} );
adaptEndLocation( lexer, re.lastIndex ); // also works after push ?
}
rulesRegexp.lastIndex = re.lastIndex || lexer.input.length;
return [];
}
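// Scan the rest of a string literal: '…' with '' as escaped quote, or a
// template literal `…` / ```…``` with backslash escapes.  A date / time /
// timestamp / x keyword directly in front of a '…' string is merged with it
// into a single QuotedLiteral token.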
function string( text, lexer, parser, beg ) {
let prefix = null;
const re = stringRegexps[text];
re.lastIndex = rulesRegexp.lastIndex;
let esc = 0;
if (text !== "'") { // single or triple back-quote
while (re.test( lexer.input ) && lexer.input[re.lastIndex - 1] === '\\')
esc = ++re.lastIndex;
}
else { // try with previous date/time/timestamp/x
prefix = parser.tokens[parser.tokens.length - 1];
if (prefix && (prefix.location.endLine !== lexer.location.line ||
prefix.location.endCol !== lexer.location.col ||
!quotedLiterals.includes( prefix.keyword )))
prefix = null;
while (re.test( lexer.input ) && lexer.input[re.lastIndex] === "'")
esc = ++re.lastIndex;
}
let keyword;
const { lastIndex } = re;
if (!lastIndex || // reached EOF with template literal
lexer.input[lastIndex - 1] !== lexer.input[beg] || esc === lastIndex) {
const before = (lastIndex) ? 'string' : 'multi';
// eslint-disable-next-line cds-compiler/message-texts
parser.error( 'syntax-missing-token-end', lexer.location,
{ '#': before, newcode: text }, {
string: 'The string literal must end with $(NEWCODE) before the end of line',
multi: 'The multi-line string literal must end with $(NEWCODE)',
} );
keyword = 0;
// TODO: set parsedAs to 0 → no further error if string is not expected?
prefix = null; // no combination with date/time/…
}
rulesRegexp.lastIndex = lastIndex || lexer.input.length;
adaptEndLocation( lexer, rulesRegexp.lastIndex );
if (!prefix)
return [ 'String', lexer.input.substring( beg, rulesRegexp.lastIndex ), keyword ];
prefix.type = 'QuotedLiteral';
prefix.text += lexer.input.substring( beg, rulesRegexp.lastIndex );
prefix.keyword = undefined;
prefix.location.endLine = lexer.location.endLine;
prefix.location.endCol = lexer.location.endCol;
return [];
}
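// For a non-delimited identifier, return it as `Id` with its potential
// keyword (the lower-cased text); for a delimited identifier ![…] or "…",
// scan up to the closing delimiter, which is escaped by doubling it.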
function ident( text, lexer, parser, beg ) {
if (!Object.hasOwn( identRegexps, text ))
return [ 'Id', text, text.toLowerCase() ];
const re = identRegexps[text];
const close = (text === '"') ? '"' : ']';
re.lastIndex = rulesRegexp.lastIndex;
let esc = 0;
while (re.test( lexer.input ) && lexer.input[re.lastIndex] === close)
esc = ++re.lastIndex;
let keyword;
const { lastIndex } = re;
if (lexer.input[lastIndex - 1] !== close || esc === lastIndex) {
// eslint-disable-next-line cds-compiler/message-texts
parser.error( 'syntax-missing-token-end', lexer.location,
{ '#': 'ident', newcode: close }, {
ident: 'The delimited id must end with $(NEWCODE) before the end of line',
} );
keyword = 0;
// TODO: set parsedAs to 0 → no further error if string is not expected?
}
rulesRegexp.lastIndex = lastIndex || lexer.input.length;
adaptEndLocation( lexer, rulesRegexp.lastIndex );
return [ 'Id', lexer.input.substring( beg, rulesRegexp.lastIndex ), keyword ];
}
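// Extend the end position of the current token location to the input
// offset `pos` (used for multi-line comments and strings).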
function adaptEndLocation( lexer, pos ) {
let { line } = lexer.location;
while (pos >= lexer.linePositions[line])
++line;
lexer.location.endLine = line;
lexer.location.endCol = pos - lexer.linePositions[line - 1] + 1;
}
Lexer.Token = Token;
module.exports = Lexer;
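
// Example usage (a minimal sketch; in the compiler, `parser` is the CDL
// parser object, which also provides proper diagnostics):
//   const Lexer = require('./lexer');
//   const parser = { error: (...args) => console.error( ...args ) };
//   new Lexer( 'demo.cds', "entity E { key id : Integer; }" ).tokenize( parser );
//   // parser.tokens now ends with an EOF token; `entity`, `key` etc. carry
//   // their lower-cased text in the `keyword` property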