UNPKG

aws-ddk-core

Version:

The AWS DataOps Development Kit is an open source development framework for customers that build data workflows and modern data architecture on AWS.

720 lines (707 loc) 24.1 kB
'use strict'; var cst = require('./cst.js'); /* START -> stream stream directive -> line-end -> stream indent + line-end -> stream [else] -> line-start line-end comment -> line-end newline -> . input-end -> END line-start doc-start -> doc doc-end -> stream [else] -> indent -> block-start block-start seq-item-start -> block-start explicit-key-start -> block-start map-value-start -> block-start [else] -> doc doc line-end -> line-start spaces -> doc anchor -> doc tag -> doc flow-start -> flow -> doc flow-end -> error -> doc seq-item-start -> error -> doc explicit-key-start -> error -> doc map-value-start -> doc alias -> doc quote-start -> quoted-scalar -> doc block-scalar-header -> line-end -> block-scalar(min) -> line-start [else] -> plain-scalar(false, min) -> doc flow line-end -> flow spaces -> flow anchor -> flow tag -> flow flow-start -> flow -> flow flow-end -> . seq-item-start -> error -> flow explicit-key-start -> flow map-value-start -> flow alias -> flow quote-start -> quoted-scalar -> flow comma -> flow [else] -> plain-scalar(true, 0) -> flow quoted-scalar quote-end -> . [else] -> quoted-scalar block-scalar(min) newline + peek(indent < min) -> . [else] -> block-scalar(min) plain-scalar(is-flow, min) scalar-end(is-flow) -> . peek(newline + (indent < min)) -> . [else] -> plain-scalar(min) */ function isEmpty(ch) { switch (ch) { case undefined: case ' ': case '\n': case '\r': case '\t': return true; default: return false; } } const hexDigits = new Set('0123456789ABCDEFabcdef'); const tagChars = new Set("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-#;/?:@&=+$_.!~*'()"); const flowIndicatorChars = new Set(',[]{}'); const invalidAnchorChars = new Set(' ,[]{}\n\r\t'); const isNotAnchorChar = (ch) => !ch || invalidAnchorChars.has(ch); /** * Splits an input string into lexical tokens, i.e. smaller strings that are * easily identifiable by `tokens.tokenType()`. * * Lexing starts always in a "stream" context. Incomplete input may be buffered * until a complete token can be emitted. * * In addition to slices of the original input, the following control characters * may also be emitted: * * - `\x02` (Start of Text): A document starts with the next token * - `\x18` (Cancel): Unexpected end of flow-mode (indicates an error) * - `\x1f` (Unit Separator): Next token is a scalar value * - `\u{FEFF}` (Byte order mark): Emitted separately outside documents */ class Lexer { constructor() { /** * Flag indicating whether the end of the current buffer marks the end of * all input */ this.atEnd = false; /** * Explicit indent set in block scalar header, as an offset from the current * minimum indent, so e.g. set to 1 from a header `|2+`. Set to -1 if not * explicitly set. */ this.blockScalarIndent = -1; /** * Block scalars that include a + (keep) chomping indicator in their header * include trailing empty lines, which are otherwise excluded from the * scalar's contents. */ this.blockScalarKeep = false; /** Current input */ this.buffer = ''; /** * Flag noting whether the map value indicator : can immediately follow this * node within a flow context. */ this.flowKey = false; /** Count of surrounding flow collection levels. */ this.flowLevel = 0; /** * Minimum level of indentation required for next lines to be parsed as a * part of the current scalar value. */ this.indentNext = 0; /** Indentation level of the current line. */ this.indentValue = 0; /** Position of the next \n character. */ this.lineEndPos = null; /** Stores the state of the lexer if reaching the end of incpomplete input */ this.next = null; /** A pointer to `buffer`; the current position of the lexer. */ this.pos = 0; } /** * Generate YAML tokens from the `source` string. If `incomplete`, * a part of the last line may be left as a buffer for the next call. * * @returns A generator of lexical tokens */ *lex(source, incomplete = false) { if (source) { if (typeof source !== 'string') throw TypeError('source is not a string'); this.buffer = this.buffer ? this.buffer + source : source; this.lineEndPos = null; } this.atEnd = !incomplete; let next = this.next ?? 'stream'; while (next && (incomplete || this.hasChars(1))) next = yield* this.parseNext(next); } atLineEnd() { let i = this.pos; let ch = this.buffer[i]; while (ch === ' ' || ch === '\t') ch = this.buffer[++i]; if (!ch || ch === '#' || ch === '\n') return true; if (ch === '\r') return this.buffer[i + 1] === '\n'; return false; } charAt(n) { return this.buffer[this.pos + n]; } continueScalar(offset) { let ch = this.buffer[offset]; if (this.indentNext > 0) { let indent = 0; while (ch === ' ') ch = this.buffer[++indent + offset]; if (ch === '\r') { const next = this.buffer[indent + offset + 1]; if (next === '\n' || (!next && !this.atEnd)) return offset + indent + 1; } return ch === '\n' || indent >= this.indentNext || (!ch && !this.atEnd) ? offset + indent : -1; } if (ch === '-' || ch === '.') { const dt = this.buffer.substr(offset, 3); if ((dt === '---' || dt === '...') && isEmpty(this.buffer[offset + 3])) return -1; } return offset; } getLine() { let end = this.lineEndPos; if (typeof end !== 'number' || (end !== -1 && end < this.pos)) { end = this.buffer.indexOf('\n', this.pos); this.lineEndPos = end; } if (end === -1) return this.atEnd ? this.buffer.substring(this.pos) : null; if (this.buffer[end - 1] === '\r') end -= 1; return this.buffer.substring(this.pos, end); } hasChars(n) { return this.pos + n <= this.buffer.length; } setNext(state) { this.buffer = this.buffer.substring(this.pos); this.pos = 0; this.lineEndPos = null; this.next = state; return null; } peek(n) { return this.buffer.substr(this.pos, n); } *parseNext(next) { switch (next) { case 'stream': return yield* this.parseStream(); case 'line-start': return yield* this.parseLineStart(); case 'block-start': return yield* this.parseBlockStart(); case 'doc': return yield* this.parseDocument(); case 'flow': return yield* this.parseFlowCollection(); case 'quoted-scalar': return yield* this.parseQuotedScalar(); case 'block-scalar': return yield* this.parseBlockScalar(); case 'plain-scalar': return yield* this.parsePlainScalar(); } } *parseStream() { let line = this.getLine(); if (line === null) return this.setNext('stream'); if (line[0] === cst.BOM) { yield* this.pushCount(1); line = line.substring(1); } if (line[0] === '%') { let dirEnd = line.length; let cs = line.indexOf('#'); while (cs !== -1) { const ch = line[cs - 1]; if (ch === ' ' || ch === '\t') { dirEnd = cs - 1; break; } else { cs = line.indexOf('#', cs + 1); } } while (true) { const ch = line[dirEnd - 1]; if (ch === ' ' || ch === '\t') dirEnd -= 1; else break; } const n = (yield* this.pushCount(dirEnd)) + (yield* this.pushSpaces(true)); yield* this.pushCount(line.length - n); // possible comment this.pushNewline(); return 'stream'; } if (this.atLineEnd()) { const sp = yield* this.pushSpaces(true); yield* this.pushCount(line.length - sp); yield* this.pushNewline(); return 'stream'; } yield cst.DOCUMENT; return yield* this.parseLineStart(); } *parseLineStart() { const ch = this.charAt(0); if (!ch && !this.atEnd) return this.setNext('line-start'); if (ch === '-' || ch === '.') { if (!this.atEnd && !this.hasChars(4)) return this.setNext('line-start'); const s = this.peek(3); if ((s === '---' || s === '...') && isEmpty(this.charAt(3))) { yield* this.pushCount(3); this.indentValue = 0; this.indentNext = 0; return s === '---' ? 'doc' : 'stream'; } } this.indentValue = yield* this.pushSpaces(false); if (this.indentNext > this.indentValue && !isEmpty(this.charAt(1))) this.indentNext = this.indentValue; return yield* this.parseBlockStart(); } *parseBlockStart() { const [ch0, ch1] = this.peek(2); if (!ch1 && !this.atEnd) return this.setNext('block-start'); if ((ch0 === '-' || ch0 === '?' || ch0 === ':') && isEmpty(ch1)) { const n = (yield* this.pushCount(1)) + (yield* this.pushSpaces(true)); this.indentNext = this.indentValue + 1; this.indentValue += n; return yield* this.parseBlockStart(); } return 'doc'; } *parseDocument() { yield* this.pushSpaces(true); const line = this.getLine(); if (line === null) return this.setNext('doc'); let n = yield* this.pushIndicators(); switch (line[n]) { case '#': yield* this.pushCount(line.length - n); // fallthrough case undefined: yield* this.pushNewline(); return yield* this.parseLineStart(); case '{': case '[': yield* this.pushCount(1); this.flowKey = false; this.flowLevel = 1; return 'flow'; case '}': case ']': // this is an error yield* this.pushCount(1); return 'doc'; case '*': yield* this.pushUntil(isNotAnchorChar); return 'doc'; case '"': case "'": return yield* this.parseQuotedScalar(); case '|': case '>': n += yield* this.parseBlockScalarHeader(); n += yield* this.pushSpaces(true); yield* this.pushCount(line.length - n); yield* this.pushNewline(); return yield* this.parseBlockScalar(); default: return yield* this.parsePlainScalar(); } } *parseFlowCollection() { let nl, sp; let indent = -1; do { nl = yield* this.pushNewline(); if (nl > 0) { sp = yield* this.pushSpaces(false); this.indentValue = indent = sp; } else { sp = 0; } sp += yield* this.pushSpaces(true); } while (nl + sp > 0); const line = this.getLine(); if (line === null) return this.setNext('flow'); if ((indent !== -1 && indent < this.indentNext && line[0] !== '#') || (indent === 0 && (line.startsWith('---') || line.startsWith('...')) && isEmpty(line[3]))) { // Allowing for the terminal ] or } at the same (rather than greater) // indent level as the initial [ or { is technically invalid, but // failing here would be surprising to users. const atFlowEndMarker = indent === this.indentNext - 1 && this.flowLevel === 1 && (line[0] === ']' || line[0] === '}'); if (!atFlowEndMarker) { // this is an error this.flowLevel = 0; yield cst.FLOW_END; return yield* this.parseLineStart(); } } let n = 0; while (line[n] === ',') { n += yield* this.pushCount(1); n += yield* this.pushSpaces(true); this.flowKey = false; } n += yield* this.pushIndicators(); switch (line[n]) { case undefined: return 'flow'; case '#': yield* this.pushCount(line.length - n); return 'flow'; case '{': case '[': yield* this.pushCount(1); this.flowKey = false; this.flowLevel += 1; return 'flow'; case '}': case ']': yield* this.pushCount(1); this.flowKey = true; this.flowLevel -= 1; return this.flowLevel ? 'flow' : 'doc'; case '*': yield* this.pushUntil(isNotAnchorChar); return 'flow'; case '"': case "'": this.flowKey = true; return yield* this.parseQuotedScalar(); case ':': { const next = this.charAt(1); if (this.flowKey || isEmpty(next) || next === ',') { this.flowKey = false; yield* this.pushCount(1); yield* this.pushSpaces(true); return 'flow'; } } // fallthrough default: this.flowKey = false; return yield* this.parsePlainScalar(); } } *parseQuotedScalar() { const quote = this.charAt(0); let end = this.buffer.indexOf(quote, this.pos + 1); if (quote === "'") { while (end !== -1 && this.buffer[end + 1] === "'") end = this.buffer.indexOf("'", end + 2); } else { // double-quote while (end !== -1) { let n = 0; while (this.buffer[end - 1 - n] === '\\') n += 1; if (n % 2 === 0) break; end = this.buffer.indexOf('"', end + 1); } } // Only looking for newlines within the quotes const qb = this.buffer.substring(0, end); let nl = qb.indexOf('\n', this.pos); if (nl !== -1) { while (nl !== -1) { const cs = this.continueScalar(nl + 1); if (cs === -1) break; nl = qb.indexOf('\n', cs); } if (nl !== -1) { // this is an error caused by an unexpected unindent end = nl - (qb[nl - 1] === '\r' ? 2 : 1); } } if (end === -1) { if (!this.atEnd) return this.setNext('quoted-scalar'); end = this.buffer.length; } yield* this.pushToIndex(end + 1, false); return this.flowLevel ? 'flow' : 'doc'; } *parseBlockScalarHeader() { this.blockScalarIndent = -1; this.blockScalarKeep = false; let i = this.pos; while (true) { const ch = this.buffer[++i]; if (ch === '+') this.blockScalarKeep = true; else if (ch > '0' && ch <= '9') this.blockScalarIndent = Number(ch) - 1; else if (ch !== '-') break; } return yield* this.pushUntil(ch => isEmpty(ch) || ch === '#'); } *parseBlockScalar() { let nl = this.pos - 1; // may be -1 if this.pos === 0 let indent = 0; let ch; loop: for (let i = this.pos; (ch = this.buffer[i]); ++i) { switch (ch) { case ' ': indent += 1; break; case '\n': nl = i; indent = 0; break; case '\r': { const next = this.buffer[i + 1]; if (!next && !this.atEnd) return this.setNext('block-scalar'); if (next === '\n') break; } // fallthrough default: break loop; } } if (!ch && !this.atEnd) return this.setNext('block-scalar'); if (indent >= this.indentNext) { if (this.blockScalarIndent === -1) this.indentNext = indent; else { this.indentNext = this.blockScalarIndent + (this.indentNext === 0 ? 1 : this.indentNext); } do { const cs = this.continueScalar(nl + 1); if (cs === -1) break; nl = this.buffer.indexOf('\n', cs); } while (nl !== -1); if (nl === -1) { if (!this.atEnd) return this.setNext('block-scalar'); nl = this.buffer.length; } } // Trailing insufficiently indented tabs are invalid. // To catch that during parsing, we include them in the block scalar value. let i = nl + 1; ch = this.buffer[i]; while (ch === ' ') ch = this.buffer[++i]; if (ch === '\t') { while (ch === '\t' || ch === ' ' || ch === '\r' || ch === '\n') ch = this.buffer[++i]; nl = i - 1; } else if (!this.blockScalarKeep) { do { let i = nl - 1; let ch = this.buffer[i]; if (ch === '\r') ch = this.buffer[--i]; const lastChar = i; // Drop the line if last char not more indented while (ch === ' ') ch = this.buffer[--i]; if (ch === '\n' && i >= this.pos && i + 1 + indent > lastChar) nl = i; else break; } while (true); } yield cst.SCALAR; yield* this.pushToIndex(nl + 1, true); return yield* this.parseLineStart(); } *parsePlainScalar() { const inFlow = this.flowLevel > 0; let end = this.pos - 1; let i = this.pos - 1; let ch; while ((ch = this.buffer[++i])) { if (ch === ':') { const next = this.buffer[i + 1]; if (isEmpty(next) || (inFlow && flowIndicatorChars.has(next))) break; end = i; } else if (isEmpty(ch)) { let next = this.buffer[i + 1]; if (ch === '\r') { if (next === '\n') { i += 1; ch = '\n'; next = this.buffer[i + 1]; } else end = i; } if (next === '#' || (inFlow && flowIndicatorChars.has(next))) break; if (ch === '\n') { const cs = this.continueScalar(i + 1); if (cs === -1) break; i = Math.max(i, cs - 2); // to advance, but still account for ' #' } } else { if (inFlow && flowIndicatorChars.has(ch)) break; end = i; } } if (!ch && !this.atEnd) return this.setNext('plain-scalar'); yield cst.SCALAR; yield* this.pushToIndex(end + 1, true); return inFlow ? 'flow' : 'doc'; } *pushCount(n) { if (n > 0) { yield this.buffer.substr(this.pos, n); this.pos += n; return n; } return 0; } *pushToIndex(i, allowEmpty) { const s = this.buffer.slice(this.pos, i); if (s) { yield s; this.pos += s.length; return s.length; } else if (allowEmpty) yield ''; return 0; } *pushIndicators() { switch (this.charAt(0)) { case '!': return ((yield* this.pushTag()) + (yield* this.pushSpaces(true)) + (yield* this.pushIndicators())); case '&': return ((yield* this.pushUntil(isNotAnchorChar)) + (yield* this.pushSpaces(true)) + (yield* this.pushIndicators())); case '-': // this is an error case '?': // this is an error outside flow collections case ':': { const inFlow = this.flowLevel > 0; const ch1 = this.charAt(1); if (isEmpty(ch1) || (inFlow && flowIndicatorChars.has(ch1))) { if (!inFlow) this.indentNext = this.indentValue + 1; else if (this.flowKey) this.flowKey = false; return ((yield* this.pushCount(1)) + (yield* this.pushSpaces(true)) + (yield* this.pushIndicators())); } } } return 0; } *pushTag() { if (this.charAt(1) === '<') { let i = this.pos + 2; let ch = this.buffer[i]; while (!isEmpty(ch) && ch !== '>') ch = this.buffer[++i]; return yield* this.pushToIndex(ch === '>' ? i + 1 : i, false); } else { let i = this.pos + 1; let ch = this.buffer[i]; while (ch) { if (tagChars.has(ch)) ch = this.buffer[++i]; else if (ch === '%' && hexDigits.has(this.buffer[i + 1]) && hexDigits.has(this.buffer[i + 2])) { ch = this.buffer[(i += 3)]; } else break; } return yield* this.pushToIndex(i, false); } } *pushNewline() { const ch = this.buffer[this.pos]; if (ch === '\n') return yield* this.pushCount(1); else if (ch === '\r' && this.charAt(1) === '\n') return yield* this.pushCount(2); else return 0; } *pushSpaces(allowTabs) { let i = this.pos - 1; let ch; do { ch = this.buffer[++i]; } while (ch === ' ' || (allowTabs && ch === '\t')); const n = i - this.pos; if (n > 0) { yield this.buffer.substr(this.pos, n); this.pos = i; } return n; } *pushUntil(test) { let i = this.pos; let ch = this.buffer[i]; while (!test(ch)) ch = this.buffer[++i]; return yield* this.pushToIndex(i, false); } } exports.Lexer = Lexer;