fracturedjsonjs
Version:
JSON formatter that produces highly readable but fairly compact output
355 lines (354 loc) • 15.2 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.TokenGenerator = void 0;
const ScannerState_1 = require("./ScannerState");
const TokenType_1 = require("./TokenType");
/**
* Converts a sequence of characters into a sequence of JSON tokens. There's no guarantee that the tokens make
* sense - just that they're lexically correct.
*/
function* TokenGenerator(inputJson) {
const state = new ScannerState_1.ScannerState(inputJson);
while (true) {
if (state.AtEnd())
return;
// With the exception of whitespace, all of the characters examined in the switch below will send us to
// a function that will potentially read more characters and either return the appropriate token, or
// throw a FracturedJsonError. If there is no error, state.Current() will be pointing to the character
// *after* the last one in the token that was read.
//
// Note that we're comparing the numeric (UTF16, I guess) form of the character to constants - or as close
// as we can reasonable come to them. The alternative is to create a new single-character string at every
// step and then do string comparisons. I'm assuming the numbers are faster, but who knows.
const ch = state.Current();
switch (ch) {
case _codeSpace:
case _codeTab:
case _codeCR:
// Regular unremarkable whitespace.
state.Advance(true);
break;
case _codeLF:
// If a line contained only whitespace, return a blank line. Note that we're ignoring CRs. If
// we get a Window's style CRLF, we throw away the CR, and then trigger on the LF just like we would
// for Unix.
if (!state.NonWhitespaceSinceLastNewline)
yield state.MakeToken(TokenType_1.TokenType.BlankLine, "\n");
state.NewLine();
// If this new line turns out to be nothing but whitespace, we want to report the blank line
// token as starting at the beginning of the line. Otherwise you get into \r\n vs. \n issues.
state.SetTokenStart();
break;
case _codeOpenCurly:
yield ProcessSingleChar(state, "{", TokenType_1.TokenType.BeginObject);
break;
case _codeCloseCurly:
yield ProcessSingleChar(state, "}", TokenType_1.TokenType.EndObject);
break;
case _codeOpenSquare:
yield ProcessSingleChar(state, "[", TokenType_1.TokenType.BeginArray);
break;
case _codeCloseSquare:
yield ProcessSingleChar(state, "]", TokenType_1.TokenType.EndArray);
break;
case _codeColon:
yield ProcessSingleChar(state, ":", TokenType_1.TokenType.Colon);
break;
case _codeComma:
yield ProcessSingleChar(state, ",", TokenType_1.TokenType.Comma);
break;
case _codeLittleT:
yield ProcessKeyword(state, "true", TokenType_1.TokenType.True);
break;
case _codeLittleF:
yield ProcessKeyword(state, "false", TokenType_1.TokenType.False);
break;
case _codeLittleN:
yield ProcessKeyword(state, "null", TokenType_1.TokenType.Null);
break;
case _codeSlash:
yield ProcessComment(state);
break;
case _codeQuote:
yield ProcessString(state);
break;
case _codeMinus:
yield ProcessNumber(state);
break;
default:
if (!isDigit(ch))
state.Throw("Unexpected character");
yield ProcessNumber(state);
break;
}
}
}
exports.TokenGenerator = TokenGenerator;
function ProcessSingleChar(state, symbol, type) {
state.SetTokenStart();
const token = state.MakeToken(type, symbol);
state.Advance(false);
return token;
}
function ProcessKeyword(state, keyword, type) {
state.SetTokenStart();
for (let i = 1; i < keyword.length; ++i) {
if (state.AtEnd())
state.Throw("Unexpected end of input while processing keyword");
state.Advance(false);
if (state.Current() !== keyword.charCodeAt(i))
state.Throw("Unexpected keyword");
}
const token = state.MakeToken(type, keyword);
state.Advance(false);
return token;
}
function ProcessComment(state) {
state.SetTokenStart();
if (state.AtEnd())
state.Throw("Unexpected end of input while processing comment");
state.Advance(false);
let isBlockComment = false;
if (state.Current() === _codeStar)
isBlockComment = true;
else if (state.Current() !== _codeSlash)
state.Throw("Bad character for start of comment");
state.Advance(false);
let lastCharWasAsterisk = false;
while (true) {
if (state.AtEnd()) {
// If the input ends while we're in the middle of a block comment, treat it as an error. If it
// ends in the middle of a line comment, treat the comment as valid.
if (isBlockComment)
state.Throw("Unexpected end of input while processing comment");
else
return state.MakeTokenFromBuffer(TokenType_1.TokenType.LineComment, true);
}
const ch = state.Current();
if (ch === _codeLF) {
state.NewLine();
if (!isBlockComment)
return state.MakeTokenFromBuffer(TokenType_1.TokenType.LineComment, true);
continue;
}
state.Advance(false);
if (ch === _codeSlash && lastCharWasAsterisk)
return state.MakeTokenFromBuffer(TokenType_1.TokenType.BlockComment);
lastCharWasAsterisk = (ch === _codeStar);
}
}
function ProcessString(state) {
state.SetTokenStart();
state.Advance(false);
let lastCharBeganEscape = false;
let expectedHexCount = 0;
while (true) {
if (state.AtEnd())
state.Throw("Unexpected end of input while processing string");
const ch = state.Current();
if (expectedHexCount > 0) {
if (!isHex(ch))
state.Throw("Bad unicode escape in string");
expectedHexCount -= 1;
state.Advance(false);
continue;
}
// Only certain characters are allowed after backslashes. The only ones that affect us here are
// \u, which needs to be followed by 4 hex digits, and \", which should not end the string.
if (lastCharBeganEscape) {
if (!isLegalAfterBackslash(ch))
state.Throw("Bad escaped character in string");
if (ch === _codeLittleU)
expectedHexCount = 4;
lastCharBeganEscape = false;
state.Advance(false);
continue;
}
if (isControl(ch))
state.Throw("Control characters are not allowed in strings");
state.Advance(false);
if (ch === _codeQuote)
return state.MakeTokenFromBuffer(TokenType_1.TokenType.String);
if (ch === _codeBackSlash)
lastCharBeganEscape = true;
}
}
function ProcessNumber(state) {
state.SetTokenStart();
let phase = NumberPhase.Beginning;
while (true) {
const ch = state.Current();
let handling = CharHandling.ValidAndConsumed;
switch (phase) {
case NumberPhase.Beginning:
if (ch === _codeMinus)
phase = NumberPhase.PastLeadingSign;
else if (ch === _codeZero)
phase = NumberPhase.PastWhole;
else if (isDigit(ch))
phase = NumberPhase.PastFirstDigitOfWhole;
else
handling = CharHandling.InvalidatesToken;
break;
case NumberPhase.PastLeadingSign:
if (!isDigit(ch))
handling = CharHandling.InvalidatesToken;
else if (ch === _codeZero)
phase = NumberPhase.PastWhole;
else
phase = NumberPhase.PastFirstDigitOfWhole;
break;
// We've started with a 1-9 and more digits are welcome.
case NumberPhase.PastFirstDigitOfWhole:
if (ch === _codeDecimal)
phase = NumberPhase.PastDecimalPoint;
else if (ch === _codeLittleE || ch === _codeBigE)
phase = NumberPhase.PastE;
else if (!isDigit(ch))
handling = CharHandling.StartOfNewToken;
break;
// We started with a 0. Another digit at this point would not be part of this token.
case NumberPhase.PastWhole:
if (ch === _codeDecimal)
phase = NumberPhase.PastDecimalPoint;
else if (ch === _codeLittleE || ch === _codeBigE)
phase = NumberPhase.PastE;
else
handling = CharHandling.StartOfNewToken;
break;
case NumberPhase.PastDecimalPoint:
if (isDigit(ch))
phase = NumberPhase.PastFirstDigitOfFractional;
else
handling = CharHandling.InvalidatesToken;
break;
case NumberPhase.PastFirstDigitOfFractional:
if (ch === _codeLittleE || ch === _codeBigE)
phase = NumberPhase.PastE;
else if (!isDigit(ch))
handling = CharHandling.StartOfNewToken;
break;
// An E must be followed by either a digit or +/-
case NumberPhase.PastE:
if (ch === _codePlus || ch === _codeMinus)
phase = NumberPhase.PastExpSign;
else if (isDigit(ch))
phase = NumberPhase.PastFirstDigitOfExponent;
else
handling = CharHandling.InvalidatesToken;
break;
// E and a +/- must still be followed by one or more digits.
case NumberPhase.PastExpSign:
if (isDigit(ch))
phase = NumberPhase.PastFirstDigitOfExponent;
else
handling = CharHandling.InvalidatesToken;
break;
case NumberPhase.PastFirstDigitOfExponent:
if (!isDigit(ch))
handling = CharHandling.StartOfNewToken;
break;
}
if (handling === CharHandling.InvalidatesToken)
state.Throw("Bad character while processing number");
if (handling === CharHandling.StartOfNewToken) {
// We're done processing the number, and the enumerator is pointed to the character after it.
return state.MakeTokenFromBuffer(TokenType_1.TokenType.Number);
}
if (!state.AtEnd()) {
state.Advance(false);
continue;
}
// We've reached the end of the input. Figure out if we read a complete token or not.
switch (phase) {
case NumberPhase.PastFirstDigitOfWhole:
case NumberPhase.PastWhole:
case NumberPhase.PastFirstDigitOfFractional:
case NumberPhase.PastFirstDigitOfExponent:
return state.MakeTokenFromBuffer(TokenType_1.TokenType.Number);
default:
state.Throw("Unexpected end of input while processing number");
break;
}
}
}
// Number versions of various important characters. I assume it's quicker to compare against these than doing
// a bunch of single-character string compared. But it's possible the JS engine has some slick optimizations for
// that case.
const _codeSpace = " ".charCodeAt(0);
const _codeLF = "\n".charCodeAt(0);
const _codeCR = "\r".charCodeAt(0);
const _codeTab = "\t".charCodeAt(0);
const _codeSlash = "/".charCodeAt(0);
const _codeStar = "*".charCodeAt(0);
const _codeBackSlash = "\\".charCodeAt(0);
const _codeQuote = "\"".charCodeAt(0);
const _codeOpenCurly = "{".charCodeAt(0);
const _codeCloseCurly = "}".charCodeAt(0);
const _codeOpenSquare = "[".charCodeAt(0);
const _codeCloseSquare = "]".charCodeAt(0);
const _codeColon = ":".charCodeAt(0);
const _codeComma = ",".charCodeAt(0);
const _codePlus = "+".charCodeAt(0);
const _codeMinus = "-".charCodeAt(0);
const _codeDecimal = ".".charCodeAt(0);
const _codeZero = "0".charCodeAt(0);
const _codeNine = "9".charCodeAt(0);
const _codeLittleA = "a".charCodeAt(0);
const _codeBigA = "A".charCodeAt(0);
const _codeLittleB = "b".charCodeAt(0);
const _codeLittleE = "e".charCodeAt(0);
const _codeBigE = "E".charCodeAt(0);
const _codeLittleF = "f".charCodeAt(0);
const _codeBigF = "F".charCodeAt(0);
const _codeLittleN = "n".charCodeAt(0);
const _codeLittleR = "r".charCodeAt(0);
const _codeLittleT = "t".charCodeAt(0);
const _codeLittleU = "u".charCodeAt(0);
function isDigit(charCode) {
return charCode >= _codeZero && charCode <= _codeNine;
}
function isHex(charCode) {
return (charCode >= _codeZero && charCode <= _codeNine)
|| (charCode >= _codeLittleA && charCode <= _codeLittleF)
|| (charCode >= _codeBigA && charCode <= _codeBigF);
}
function isLegalAfterBackslash(charCode) {
switch (charCode) {
case _codeQuote:
case _codeBackSlash:
case _codeSlash:
case _codeLittleB:
case _codeLittleF:
case _codeLittleN:
case _codeLittleR:
case _codeLittleT:
case _codeLittleU:
return true;
default:
return false;
}
}
function isControl(charCode) {
return (charCode >= 0x00 && charCode <= 0x1F)
|| (charCode === 0x7F)
|| (charCode >= 0x80 && charCode <= 0x9F);
}
var NumberPhase;
(function (NumberPhase) {
NumberPhase[NumberPhase["Beginning"] = 0] = "Beginning";
NumberPhase[NumberPhase["PastLeadingSign"] = 1] = "PastLeadingSign";
NumberPhase[NumberPhase["PastFirstDigitOfWhole"] = 2] = "PastFirstDigitOfWhole";
NumberPhase[NumberPhase["PastWhole"] = 3] = "PastWhole";
NumberPhase[NumberPhase["PastDecimalPoint"] = 4] = "PastDecimalPoint";
NumberPhase[NumberPhase["PastFirstDigitOfFractional"] = 5] = "PastFirstDigitOfFractional";
NumberPhase[NumberPhase["PastE"] = 6] = "PastE";
NumberPhase[NumberPhase["PastExpSign"] = 7] = "PastExpSign";
NumberPhase[NumberPhase["PastFirstDigitOfExponent"] = 8] = "PastFirstDigitOfExponent";
})(NumberPhase || (NumberPhase = {}));
var CharHandling;
(function (CharHandling) {
CharHandling[CharHandling["InvalidatesToken"] = 0] = "InvalidatesToken";
CharHandling[CharHandling["ValidAndConsumed"] = 1] = "ValidAndConsumed";
CharHandling[CharHandling["StartOfNewToken"] = 2] = "StartOfNewToken";
})(CharHandling || (CharHandling = {}));