@sap/cds-compiler
Version:
CDS (Core Data Services) compiler and backends
526 lines (483 loc) • 15.8 kB
JavaScript
'use strict';
const {
isWhitespaceOrNewLineOnly,
isWhitespaceCharacterNoNewline,
cdlNewLineRegEx,
} = require('./textUtils');
const { CompilerAssertion } = require('../base/error');
const { Location } = require('../base/location');
/**
* Strips and counts the indentation from the given string.
* This function is similar to the one in docCommentParser.js, but
* has special handling for the first and last line of the string.
*
* @example
* | hello
* | world
* | foo bar
* becomes
* | hello
* | world
* | foo bar
*
* @param {string} str String prior to newline-normalization and escape parsing.
* @returns {[string, number]} The indentation-stripped string and the number
* of whitespace characters removed.
*/
function stripIndentation( str ) {
if (str === '')
return [ '', 0 ];
// Note: We have to check all newline characters, as the string is not normalized, yet.
const lines = str.split(cdlNewLineRegEx);
const n = lines.length;
const hasTrailingLineBreak = cdlNewLineRegEx.test(str[str.length - 1]);
if (hasTrailingLineBreak) {
// Shortcut:
// If there is a trailing line break, it means that ``` is on newline and
// therefore the indentation to remove is 0.
// Remove the last newline, which may be CRLF.
return [ lines.slice(0, -1).join('\n'), 0 ];
}
const minIndent = lines.reduce((min, line, index) => {
// Note: Last line is the line containing ```. There, we always count the indentation,
// even if blank. For all other lines, blank lines are ignored.
if (isWhitespaceOrNewLineOnly(line) && index !== (n - 1))
return min;
let count = 0;
const length = Math.min(min, line.length);
while (count < length && isWhitespaceCharacterNoNewline(line[count]))
count++;
return Math.min(min, count);
}, Number.MAX_SAFE_INTEGER);
for (let i = 0; i < n; ++i) {
// Note: Line may be empty and have fewer characters than `min`.
// In that case, slice() returns an empty string.
lines[i] = lines[i].slice(minIndent);
}
// Remove trailing last line, if there was nothing else in that line.
if (lines[n - 1] === '')
lines.pop();
return [ lines.join('\n'), minIndent ];
}
class MultiLineStringParser {
constructor(cdsParser, token) {
this.parser = cdsParser; // for message functions
this.token = token;
this.str = token.text; // Copy because .text is a getter
if (this.str[0] !== '`' || this.str[this.str.length - 1] !== '`')
// eslint-disable-next-line @stylistic/max-len
throw new CompilerAssertion('Invalid multi-line string sequence: Require string to be surrounded by back-ticks!');
this.output = [];
this.isTextBlock = this.str.startsWith('```');
this._indentation = 0;
// For message locations
this._lineInString = 0;
this._currentLineBreakIndex = 0;
if (this.isTextBlock) {
this.i = 3;
this.end = this.str.length - 3;
}
else {
this.i = 1;
this.end = this.str.length - 1;
}
}
/**
* Parse the token's text and return it.
*
* @return {string}
*/
parse() {
if (this.str.length === 2)
return ''; // Nothing to do: ``
if (this.isTextBlock) {
// If there are no line breaks, emit an error as normal single-back-tick
// strings should be used instead. Because the first line is skipped,
// there is no text without at least one line break.
if (!cdlNewLineRegEx.test(this.str)) {
const loc = this._locationForCharacters(this.end, 1);
this.parser.error('syntax-invalid-text-block', loc);
return '';
}
this._skipOptionalLanguageIdentifierLine();
// Indentation needs to be stripped _before_ escape sequences are parsed and
// _after_ the first line is skipped, because otherwise `\n` in the string
// will interfere with calculating indentation and the language identifier
// is not part of the actual string.
// Because of message locations, we still need to keep track of indentation count
// and need to update the cursor and end position as well as the currentLineBreakIndex.
const [ str, indent ] = stripIndentation(this.str.slice(this.i, -3));
this.str = str;
this._indentation = indent;
this.i = 0;
this.end = this.str.length;
// this._lineInString is > 0, but having this._currentLineBreakIndex = 0 would be incorrect,
// as the line break isn't the first character in the indentation-stripped string
this._currentLineBreakIndex = -1;
}
// Note: Index is at first character of string
do {
switch (this._current()) {
case this._matchLineBreakAtCurrentChar():
this.output.push('\n');
break;
case '\\':
this._move();
this._innerEscape();
break;
case '$':
if (this._lookahead() === '{') {
const loc = this._locationForCharacters(this.i, 2);
this.parser.error('syntax-missing-escape', loc,
{ '#': 'placeholder', code: '${', newcode: '\\${' });
}
this.output.push(this.str[this.i]);
break;
default:
this.output.push(this.str[this.i]);
break;
}
} while (this._move());
return this.output.join('');
}
/**
* Parse the escape sequence after the first '\'.
*
* @private
*/
_innerEscape() {
switch (this._current()) {
case this._matchLineBreakAtCurrentChar():
// Don't add to output -> line break is escaped
break;
case 'b': // backspace
this.output.push('\b');
break;
case 'f': // form feed
this.output.push('\f');
break;
case 'v': // vertical tabulator
this.output.push('\v');
break;
case 'r': // carriage return
this.output.push('\r');
break;
case 'n': // line feed
this.output.push('\n');
break;
case 't': // tab
this.output.push('\t');
break;
case '\\':
case '"':
case '\'':
case '`':
case '$':
this.output.push(this._current());
break;
case 'x':
this._parseHexEscape('x', 2);
break;
case 'u':
if (this._lookahead() === '{')
this._parseBracedUnicodeEscape();
else
this._parseHexEscape('u', 4);
break;
case '0': // null terminator
if (!/^\d$/.test(this._lookahead())) {
this.output.push('\0');
break;
}
// Let the default case handle octal representation.
// fallthrough
default: {
this.output.push(this._current());
const loc = this._locationForCharacters(this.i - 1, 2);
if (/\s/.test(this._current())) {
this.parser.error('syntax-invalid-escape', loc, { '#': 'whitespace' });
}
else if (/\d/.test(this._current())) {
this.parser.error('syntax-invalid-escape', loc, { '#': 'octal' });
}
else {
const code = this._makeCode(`\\${ this._current() }`);
this.parser.message('syntax-unknown-escape', loc, { '#': 'std', code });
}
break;
}
}
}
/**
* Parse the given hexadecimal string to a unicode code-point.
*
* @param {string} codePoint Code-point represented as hexadecimal string, e.g. 'ABCD'.
* @private
*/
_parseHexCodePoint(codePoint) {
// Notes:
// It isn't possible to get an invalid code point with the \u0000
// syntax variant as the first invalid code point is \u{110000}
// and an empty `codePoint` is only possible with the braced variant.
const reportInvalidCodePoint = () => {
const code = this._makeCode(`\\u{${ codePoint }}`);
const loc = this._locationForCharacters(this.i - codePoint.length, codePoint.length);
this.parser.error('syntax-invalid-escape', loc, { '#': 'codepoint', code });
};
const n = Number.parseInt(codePoint, 16);
if (Number.isNaN(n)) {
reportInvalidCodePoint();
return;
}
try {
this.output.push(String.fromCodePoint(n));
}
catch {
// RangeError is thrown if number isn't a valid code point
reportInvalidCodePoint();
}
}
/**
* Parse a hex escape-sequence. Useful for unicode escapes and hex escapes.
* Cursor is at the `x`: `\x00`
* ^
* or at the `u`: `\u0000`
* ^
* @param {string} mode Either `x` or `u`. Used for error messages.
* @param {number} count Number of expected hexadecimal numbers
* @private
*/
_parseHexEscape(mode, count) {
let codePoint = '';
for (let j = 0; j < count; ++j) {
if (!this._eos() && /^[\dA-Fa-f]$/.test(this._lookahead())) {
this._move();
codePoint += this._current();
}
else {
break;
}
}
if (codePoint.length === count) {
this._parseHexCodePoint(codePoint);
}
else {
const loc = this._locationForCharacters(this.i + 1, 1);
const code = this._eos(this.i + 1) ? `\\${ mode }${ codePoint }` : `\\${ mode }${ codePoint }${ this._lookahead() }`;
this.parser.error('syntax-invalid-escape', loc,
{ '#': 'hex-count', count, code: this._makeCode(code) });
}
}
/**
* Parse a unicode escape-sequence with braces.
* Cursor is at the `u`: `\u{0000}`
* ^
* @private
*/
_parseBracedUnicodeEscape() {
let codePoint = '';
this._move(); // 'u'
while (!this._eos()) {
if (/^[\dA-Fa-f]$/.test(this._lookahead())) {
this._move();
codePoint += this._current();
}
else if (this._lookahead() === '}') {
break;
}
else if (!this._eos(this.i + 1)) {
const loc = this._locationForCharacters(this.i + 1, 1); // Point to the exact character
const code = this._makeCode(`\\u{${ codePoint }${ this._lookahead() }…}`);
this.parser.error('syntax-invalid-escape', loc, { '#': 'unicode-hex', code });
return;
}
else {
break;
}
}
if (this._lookahead() === '}') {
this._move();
this._parseHexCodePoint(codePoint);
}
else {
const loc = this._locationForCharacters(this.i, 1);
this.parser.error('syntax-invalid-escape', loc, { '#': 'unicode-brace' });
}
}
/**
* This function skips the language identifier, i.e. until the next line.
* After this function, the cursor will be at the character _after_ the newline.
*
* @private
*/
_skipOptionalLanguageIdentifierLine() {
while (!this._eos()) {
switch (this._current()) {
case this._matchLineBreakAtCurrentChar():
this._move();
return;
case '\\': {
// Do not allow an escape in the language identifier. If at the line's end, users
// may expect the identifier to span more than the first line, which is _not_ the case.
const loc = this._locationForCharacters(this.i, 1);
this.parser.error('syntax-invalid-escape', loc, { '#': 'language-identifier' });
this._move();
break;
}
default:
this._move();
break;
}
}
}
/**
* Consume a line-break Character. Because CDS is close to JavaScript, we
* also support LS and PS. This function also ensures that CRLF (`\r\n`) is
* recognized as a single character.
* We increase the line number for LF (`\n`) for correct message locations.
*
* This function returns the input character, so that it can be used
* in a switch-case.
*
* @returns {string|null}
* @private
*/
_matchLineBreakAtCurrentChar() {
// Only increase line number for \n, because ANTLR does the same
// TODO: Is this still the case with redepage?
switch (this._current()) {
case '\r':
if (this._lookahead() === '\n') {
this._move(); // \r\n is normalized
this._lineInString++;
this._currentLineBreakIndex = this.i;
}
return '\r';
case '\n':
this._lineInString++;
this._currentLineBreakIndex = this.i;
// fallthrough
case '\u2028': // LS
case '\u2029': // PS
return this._current();
default: break;
}
return null;
}
/**
* Move the cursor to the next character _if_ we're not at the end.
*
* @private
* @returns {boolean} `true` if we're not at the end
*/
_move() {
if (this.i < this.end) { // Don't move past last char and `
++this.i;
}
return this.i < this.end;
}
/**
* Returns `true` if we're at the end of the string
*
* @param {Number} [i=this.i] Index to check for EOS
* @private
* @returns {boolean}
*/
_eos(i = this.i) {
// end-of-string -> char before `
return i >= this.end;
}
/**
* Get the next character without increasing the cursor.
* @note Does not check for `eos()`
*
* @private
* @returns {string}
*/
_lookahead() {
return this.str[this.i + 1];
}
/**
* Get the current character without increasing the cursor.
*
* @private
* @returns {string}
*/
_current() {
return this.str[this.i];
}
/**
* Get the previous character without decreasing the cursor.
*
* @private
* @returns {string}
*/
_previous() {
return this.str[this.i - 1];
}
/**
* Get message location for the given cursor position inside the string.
*
* @param {Number} i Cursor position
* @param {Number} width Width of the location
* @private
* @returns {CSN.Location}
*/
_locationForCharacters(i, width) {
return {
__proto__: Location.prototype,
file: this.parser.filename,
line: this.token.line + this._lineInString,
endLine: this.token.line + this._lineInString,
col: this._lineInString > 0
? i - this._currentLineBreakIndex + this._indentation
: this.token.column + i + 1,
endCol: this._lineInString > 0
? i - this._currentLineBreakIndex + width + this._indentation
: this.token.column + i + width + 1,
};
}
/**
* For text messages, escape the given string for $(CODE).
* Escaping is required to avoid line breaks in compiler messages, e.g.
* if \u000<LF> is the code, the line-feed must be escaped.
*
* @param {string} code
* @private
*/
_makeCode(code) {
// For characters that may be rendered as newline,
// see <https://www.unicode.org/reports/tr14/tr14-32.html>.
//
// Note: Unicode class `General_Category=Line_Separator` does not work for '\n'.
//
// U+000A: Line Feed (short: LF)
// U+000B: Vertical Tab (short: VT)
// U+000C: Form Feed (short: FF)
// U+000D: Carriage Return (short: CR)
// U+0085: Next Line (short: NEL)
// U+2028: Line Separator (short: LS)
// U+2029: Paragraph Separator (short: PS)
//
// For Visualization, see <https://en.wikipedia.org/wiki/Newline#Unicode>
// U+23CE: ⏎
// eslint-disable-next-line no-control-regex
const allNewLineCharacters = /[\u{000A}\u{000B}\u{000C}\u{000D}\u{0085}\u{2028}\u{2029}]/ug;
return code.replace(allNewLineCharacters, '\u{23CE}');
}
}
/**
* Parse a back-tick string and return it. This includes escape
* sequences, newlines, etc.
*
* Does _not_ modify the token's text.
*
* @param {object} token
*/
function parseMultiLineStringLiteral( token ) {
const p = new MultiLineStringParser(this, token);
return p.parse();
}
module.exports = {
parseMultiLineStringLiteral,
};