UNPKG

aws-ddk-core

Version:

The AWS DataOps Development Kit is an open source development framework for customers that build data workflows and modern data architecture on AWS.

github.com/awslabs/aws-ddk/tree/main

awslabs/aws-ddk

720 lines (707 loc) • 24.1 kB

JavaScript

'use strict'; var cst = require('./cst.js'); /* START -> stream stream directive -> line-end -> stream indent + line-end -> stream [else] -> line-start line-end comment -> line-end newline -> . input-end -> END line-start doc-start -> doc doc-end -> stream [else] -> indent -> block-start block-start seq-item-start -> block-start explicit-key-start -> block-start map-value-start -> block-start [else] -> doc doc line-end -> line-start spaces -> doc anchor -> doc tag -> doc flow-start -> flow -> doc flow-end -> error -> doc seq-item-start -> error -> doc explicit-key-start -> error -> doc map-value-start -> doc alias -> doc quote-start -> quoted-scalar -> doc block-scalar-header -> line-end -> block-scalar(min) -> line-start [else] -> plain-scalar(false, min) -> doc flow line-end -> flow spaces -> flow anchor -> flow tag -> flow flow-start -> flow -> flow flow-end -> . seq-item-start -> error -> flow explicit-key-start -> flow map-value-start -> flow alias -> flow quote-start -> quoted-scalar -> flow comma -> flow [else] -> plain-scalar(true, 0) -> flow quoted-scalar quote-end -> . [else] -> quoted-scalar block-scalar(min) newline + peek(indent < min) -> . [else] -> block-scalar(min) plain-scalar(is-flow, min) scalar-end(is-flow) -> . peek(newline + (indent < min)) -> . [else] -> plain-scalar(min) */ function isEmpty(ch) { switch (ch) { case undefined: case ' ': case '\n': case '\r': case '\t': return true; default: return false; } } const hexDigits = new Set('0123456789ABCDEFabcdef'); const tagChars = new Set("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-#;/?:@&=+$_.!~*'()"); const flowIndicatorChars = new Set(',[]{}'); const invalidAnchorChars = new Set(' ,[]{}\n\r\t'); const isNotAnchorChar = (ch) => !ch || invalidAnchorChars.has(ch); /** * Splits an input string into lexical tokens, i.e. smaller strings that are * easily identifiable by `tokens.tokenType()`. * * Lexing starts always in a "stream" context. Incomplete input may be buffered * until a complete token can be emitted. * * In addition to slices of the original input, the following control characters * may also be emitted: * * - `\x02` (Start of Text): A document starts with the next token * - `\x18` (Cancel): Unexpected end of flow-mode (indicates an error) * - `\x1f` (Unit Separator): Next token is a scalar value * - `\u{FEFF}` (Byte order mark): Emitted separately outside documents */ class Lexer { constructor() { /** * Flag indicating whether the end of the current buffer marks the end of * all input */ this.atEnd = false; /** * Explicit indent set in block scalar header, as an offset from the current * minimum indent, so e.g. set to 1 from a header `|2+`. Set to -1 if not * explicitly set. */ this.blockScalarIndent = -1; /** * Block scalars that include a + (keep) chomping indicator in their header * include trailing empty lines, which are otherwise excluded from the * scalar's contents. */ this.blockScalarKeep = false; /** Current input */ this.buffer = ''; /** * Flag noting whether the map value indicator : can immediately follow this * node within a flow context. */ this.flowKey = false; /** Count of surrounding flow collection levels. */ this.flowLevel = 0; /** * Minimum level of indentation required for next lines to be parsed as a * part of the current scalar value. */ this.indentNext = 0; /** Indentation level of the current line. */ this.indentValue = 0; /** Position of the next \n character. */ this.lineEndPos = null; /** Stores the state of the lexer if reaching the end of incpomplete input */ this.next = null; /** A pointer to `buffer`; the current position of the lexer. */ this.pos = 0; } /** * Generate YAML tokens from the `source` string. If `incomplete`, * a part of the last line may be left as a buffer for the next call. * * @returns A generator of lexical tokens */ *lex(source, incomplete = false) { if (source) { if (typeof source !== 'string') throw TypeError('source is not a string'); this.buffer = this.buffer ? this.buffer + source : source; this.lineEndPos = null; } this.atEnd = !incomplete; let next = this.next ?? 'stream'; while (next && (incomplete || this.hasChars(1))) next = yield* this.parseNext(next); } atLineEnd() { let i = this.pos; let ch = this.buffer[i]; while (ch === ' ' || ch === '\t') ch = this.buffer[++i]; if (!ch || ch === '#' || ch === '\n') return true; if (ch === '\r') return this.buffer[i + 1] === '\n'; return false; } charAt(n) { return this.buffer[this.pos + n]; } continueScalar(offset) { let ch = this.buffer[offset]; if (this.indentNext > 0) { let indent = 0; while (ch === ' ') ch = this.buffer[++indent + offset]; if (ch === '\r') { const next = this.buffer[indent + offset + 1]; if (next === '\n' || (!next && !this.atEnd)) return offset + indent + 1; } return ch === '\n' || indent >= this.indentNext || (!ch && !this.atEnd) ? offset + indent : -1; } if (ch === '-' || ch === '.') { const dt = this.buffer.substr(offset, 3); if ((dt === '---' || dt === '...') && isEmpty(this.buffer[offset + 3])) return -1; } return offset; } getLine() { let end = this.lineEndPos; if (typeof end !== 'number' || (end !== -1 && end < this.pos)) { end = this.buffer.indexOf('\n', this.pos); this.lineEndPos = end; } if (end === -1) return this.atEnd ? this.buffer.substring(this.pos) : null; if (this.buffer[end - 1] === '\r') end -= 1; return this.buffer.substring(this.pos, end); } hasChars(n) { return this.pos + n <= this.buffer.length; } setNext(state) { this.buffer = this.buffer.substring(this.pos); this.pos = 0; this.lineEndPos = null; this.next = state; return null; } peek(n) { return this.buffer.substr(this.pos, n); } *parseNext(next) { switch (next) { case 'stream': return yield* this.parseStream(); case 'line-start': return yield* this.parseLineStart(); case 'block-start': return yield* this.parseBlockStart(); case 'doc': return yield* this.parseDocument(); case 'flow': return yield* this.parseFlowCollection(); case 'quoted-scalar': return yield* this.parseQuotedScalar(); case 'block-scalar': return yield* this.parseBlockScalar(); case 'plain-scalar': return yield* this.parsePlainScalar(); } } *parseStream() { let line = this.getLine(); if (line === null) return this.setNext('stream'); if (line[0] === cst.BOM) { yield* this.pushCount(1); line = line.substring(1); } if (line[0] === '%') { let dirEnd = line.length; let cs = line.indexOf('#'); while (cs !== -1) { const ch = line[cs - 1]; if (ch === ' ' || ch === '\t') { dirEnd = cs - 1; break; } else { cs = line.indexOf('#', cs + 1); } } while (true) { const ch = line[dirEnd - 1]; if (ch === ' ' || ch === '\t') dirEnd -= 1; else break; } const n = (yield* this.pushCount(dirEnd)) + (yield* this.pushSpaces(true)); yield* this.pushCount(line.length - n); // possible comment this.pushNewline(); return 'stream'; } if (this.atLineEnd()) { const sp = yield* this.pushSpaces(true); yield* this.pushCount(line.length - sp); yield* this.pushNewline(); return 'stream'; } yield cst.DOCUMENT; return yield* this.parseLineStart(); } *parseLineStart() { const ch = this.charAt(0); if (!ch && !this.atEnd) return this.setNext('line-start'); if (ch === '-' || ch === '.') { if (!this.atEnd && !this.hasChars(4)) return this.setNext('line-start'); const s = this.peek(3); if ((s === '---' || s === '...') && isEmpty(this.charAt(3))) { yield* this.pushCount(3); this.indentValue = 0; this.indentNext = 0; return s === '---' ? 'doc' : 'stream'; } } this.indentValue = yield* this.pushSpaces(false); if (this.indentNext > this.indentValue && !isEmpty(this.charAt(1))) this.indentNext = this.indentValue; return yield* this.parseBlockStart(); } *parseBlockStart() { const [ch0, ch1] = this.peek(2); if (!ch1 && !this.atEnd) return this.setNext('block-start'); if ((ch0 === '-' || ch0 === '?' || ch0 === ':') && isEmpty(ch1)) { const n = (yield* this.pushCount(1)) + (yield* this.pushSpaces(true)); this.indentNext = this.indentValue + 1; this.indentValue += n; return yield* this.parseBlockStart(); } return 'doc'; } *parseDocument() { yield* this.pushSpaces(true); const line = this.getLine(); if (line === null) return this.setNext('doc'); let n = yield* this.pushIndicators(); switch (line[n]) { case '#': yield* this.pushCount(line.length - n); // fallthrough case undefined: yield* this.pushNewline(); return yield* this.parseLineStart(); case '{': case '[': yield* this.pushCount(1); this.flowKey = false; this.flowLevel = 1; return 'flow'; case '}': case ']': // this is an error yield* this.pushCount(1); return 'doc'; case '*': yield* this.pushUntil(isNotAnchorChar); return 'doc'; case '"': case "'": return yield* this.parseQuotedScalar(); case '|': case '>': n += yield* this.parseBlockScalarHeader(); n += yield* this.pushSpaces(true); yield* this.pushCount(line.length - n); yield* this.pushNewline(); return yield* this.parseBlockScalar(); default: return yield* this.parsePlainScalar(); } } *parseFlowCollection() { let nl, sp; let indent = -1; do { nl = yield* this.pushNewline(); if (nl > 0) { sp = yield* this.pushSpaces(false); this.indentValue = indent = sp; } else { sp = 0; } sp += yield* this.pushSpaces(true); } while (nl + sp > 0); const line = this.getLine(); if (line === null) return this.setNext('flow'); if ((indent !== -1 && indent < this.indentNext && line[0] !== '#') || (indent === 0 && (line.startsWith('---') || line.startsWith('...')) && isEmpty(line[3]))) { // Allowing for the terminal ] or } at the same (rather than greater) // indent level as the initial [ or { is technically invalid, but // failing here would be surprising to users. const atFlowEndMarker = indent === this.indentNext - 1 && this.flowLevel === 1 && (line[0] === ']' || line[0] === '}'); if (!atFlowEndMarker) { // this is an error this.flowLevel = 0; yield cst.FLOW_END; return yield* this.parseLineStart(); } } let n = 0; while (line[n] === ',') { n += yield* this.pushCount(1); n += yield* this.pushSpaces(true); this.flowKey = false; } n += yield* this.pushIndicators(); switch (line[n]) { case undefined: return 'flow'; case '#': yield* this.pushCount(line.length - n); return 'flow'; case '{': case '[': yield* this.pushCount(1); this.flowKey = false; this.flowLevel += 1; return 'flow'; case '}': case ']': yield* this.pushCount(1); this.flowKey = true; this.flowLevel -= 1; return this.flowLevel ? 'flow' : 'doc'; case '*': yield* this.pushUntil(isNotAnchorChar); return 'flow'; case '"': case "'": this.flowKey = true; return yield* this.parseQuotedScalar(); case ':': { const next = this.charAt(1); if (this.flowKey || isEmpty(next) || next === ',') { this.flowKey = false; yield* this.pushCount(1); yield* this.pushSpaces(true); return 'flow'; } } // fallthrough default: this.flowKey = false; return yield* this.parsePlainScalar(); } } *parseQuotedScalar() { const quote = this.charAt(0); let end = this.buffer.indexOf(quote, this.pos + 1); if (quote === "'") { while (end !== -1 && this.buffer[end + 1] === "'") end = this.buffer.indexOf("'", end + 2); } else { // double-quote while (end !== -1) { let n = 0; while (this.buffer[end - 1 - n] === '\\') n += 1; if (n % 2 === 0) break; end = this.buffer.indexOf('"', end + 1); } } // Only looking for newlines within the quotes const qb = this.buffer.substring(0, end); let nl = qb.indexOf('\n', this.pos); if (nl !== -1) { while (nl !== -1) { const cs = this.continueScalar(nl + 1); if (cs === -1) break; nl = qb.indexOf('\n', cs); } if (nl !== -1) { // this is an error caused by an unexpected unindent end = nl - (qb[nl - 1] === '\r' ? 2 : 1); } } if (end === -1) { if (!this.atEnd) return this.setNext('quoted-scalar'); end = this.buffer.length; } yield* this.pushToIndex(end + 1, false); return this.flowLevel ? 'flow' : 'doc'; } *parseBlockScalarHeader() { this.blockScalarIndent = -1; this.blockScalarKeep = false; let i = this.pos; while (true) { const ch = this.buffer[++i]; if (ch === '+') this.blockScalarKeep = true; else if (ch > '0' && ch <= '9') this.blockScalarIndent = Number(ch) - 1; else if (ch !== '-') break; } return yield* this.pushUntil(ch => isEmpty(ch) || ch === '#'); } *parseBlockScalar() { let nl = this.pos - 1; // may be -1 if this.pos === 0 let indent = 0; let ch; loop: for (let i = this.pos; (ch = this.buffer[i]); ++i) { switch (ch) { case ' ': indent += 1; break; case '\n': nl = i; indent = 0; break; case '\r': { const next = this.buffer[i + 1]; if (!next && !this.atEnd) return this.setNext('block-scalar'); if (next === '\n') break; } // fallthrough default: break loop; } } if (!ch && !this.atEnd) return this.setNext('block-scalar'); if (indent >= this.indentNext) { if (this.blockScalarIndent === -1) this.indentNext = indent; else { this.indentNext = this.blockScalarIndent + (this.indentNext === 0 ? 1 : this.indentNext); } do { const cs = this.continueScalar(nl + 1); if (cs === -1) break; nl = this.buffer.indexOf('\n', cs); } while (nl !== -1); if (nl === -1) { if (!this.atEnd) return this.setNext('block-scalar'); nl = this.buffer.length; } } // Trailing insufficiently indented tabs are invalid. // To catch that during parsing, we include them in the block scalar value. let i = nl + 1; ch = this.buffer[i]; while (ch === ' ') ch = this.buffer[++i]; if (ch === '\t') { while (ch === '\t' || ch === ' ' || ch === '\r' || ch === '\n') ch = this.buffer[++i]; nl = i - 1; } else if (!this.blockScalarKeep) { do { let i = nl - 1; let ch = this.buffer[i]; if (ch === '\r') ch = this.buffer[--i]; const lastChar = i; // Drop the line if last char not more indented while (ch === ' ') ch = this.buffer[--i]; if (ch === '\n' && i >= this.pos && i + 1 + indent > lastChar) nl = i; else break; } while (true); } yield cst.SCALAR; yield* this.pushToIndex(nl + 1, true); return yield* this.parseLineStart(); } *parsePlainScalar() { const inFlow = this.flowLevel > 0; let end = this.pos - 1; let i = this.pos - 1; let ch; while ((ch = this.buffer[++i])) { if (ch === ':') { const next = this.buffer[i + 1]; if (isEmpty(next) || (inFlow && flowIndicatorChars.has(next))) break; end = i; } else if (isEmpty(ch)) { let next = this.buffer[i + 1]; if (ch === '\r') { if (next === '\n') { i += 1; ch = '\n'; next = this.buffer[i + 1]; } else end = i; } if (next === '#' || (inFlow && flowIndicatorChars.has(next))) break; if (ch === '\n') { const cs = this.continueScalar(i + 1); if (cs === -1) break; i = Math.max(i, cs - 2); // to advance, but still account for ' #' } } else { if (inFlow && flowIndicatorChars.has(ch)) break; end = i; } } if (!ch && !this.atEnd) return this.setNext('plain-scalar'); yield cst.SCALAR; yield* this.pushToIndex(end + 1, true); return inFlow ? 'flow' : 'doc'; } *pushCount(n) { if (n > 0) { yield this.buffer.substr(this.pos, n); this.pos += n; return n; } return 0; } *pushToIndex(i, allowEmpty) { const s = this.buffer.slice(this.pos, i); if (s) { yield s; this.pos += s.length; return s.length; } else if (allowEmpty) yield ''; return 0; } *pushIndicators() { switch (this.charAt(0)) { case '!': return ((yield* this.pushTag()) + (yield* this.pushSpaces(true)) + (yield* this.pushIndicators())); case '&': return ((yield* this.pushUntil(isNotAnchorChar)) + (yield* this.pushSpaces(true)) + (yield* this.pushIndicators())); case '-': // this is an error case '?': // this is an error outside flow collections case ':': { const inFlow = this.flowLevel > 0; const ch1 = this.charAt(1); if (isEmpty(ch1) || (inFlow && flowIndicatorChars.has(ch1))) { if (!inFlow) this.indentNext = this.indentValue + 1; else if (this.flowKey) this.flowKey = false; return ((yield* this.pushCount(1)) + (yield* this.pushSpaces(true)) + (yield* this.pushIndicators())); } } } return 0; } *pushTag() { if (this.charAt(1) === '<') { let i = this.pos + 2; let ch = this.buffer[i]; while (!isEmpty(ch) && ch !== '>') ch = this.buffer[++i]; return yield* this.pushToIndex(ch === '>' ? i + 1 : i, false); } else { let i = this.pos + 1; let ch = this.buffer[i]; while (ch) { if (tagChars.has(ch)) ch = this.buffer[++i]; else if (ch === '%' && hexDigits.has(this.buffer[i + 1]) && hexDigits.has(this.buffer[i + 2])) { ch = this.buffer[(i += 3)]; } else break; } return yield* this.pushToIndex(i, false); } } *pushNewline() { const ch = this.buffer[this.pos]; if (ch === '\n') return yield* this.pushCount(1); else if (ch === '\r' && this.charAt(1) === '\n') return yield* this.pushCount(2); else return 0; } *pushSpaces(allowTabs) { let i = this.pos - 1; let ch; do { ch = this.buffer[++i]; } while (ch === ' ' || (allowTabs && ch === '\t')); const n = i - this.pos; if (n > 0) { yield this.buffer.substr(this.pos, n); this.pos = i; } return n; } *pushUntil(test) { let i = this.pos; let ch = this.buffer[i]; while (!test(ch)) ch = this.buffer[++i]; return yield* this.pushToIndex(i, false); } } exports.Lexer = Lexer;