// @anchan828/json-ast
// Version: (not specified)
// JSON parser AST utilities
// Listing metadata: 484 lines (483 loc) • 15.1 kB
// Language: JavaScript
import { error } from "./error.js";
import { JsonPosition } from "./position.js";
import { cannotTokenizeSymbol } from "./tokenizeErrorTypes.js";
import { JsonTokenTypes } from "./types.js";
/**
 * Single-character punctuation recognised by the tokenizer, keyed by the
 * literal character and mapped to the token type emitted for it.
 */
const charTokens = {
  // Object delimiters
  "{": JsonTokenTypes.LEFT_BRACE,
  "}": JsonTokenTypes.RIGHT_BRACE,
  // Array delimiters
  "[": JsonTokenTypes.LEFT_BRACKET,
  "]": JsonTokenTypes.RIGHT_BRACKET,
  // Separators
  ":": JsonTokenTypes.COLON,
  ",": JsonTokenTypes.COMMA,
};
/**
 * Literal keywords, keyed by their source spelling and mapped to the token
 * type emitted when that spelling is found in the input.
 */
const keywordsTokens = {
  "true": JsonTokenTypes.TRUE,
  "false": JsonTokenTypes.FALSE,
  "null": JsonTokenTypes.NULL,
};
/**
 * States of the string scanner.
 */
const stringStates = {
  // Nothing consumed yet; an opening quote is expected.
  "_START_": 0,
  // Inside the string: the next character is content, a backslash or the closing quote.
  "START_QUOTE_OR_CHAR": 1,
  // A backslash was just consumed; the next character selects the escape.
  "ESCAPE": 2,
};
/**
 * Escape letters whose decoded form is a control character rather than the
 * letter itself (the character following the backslash is the key).
 */
const symbolSubstitutes = {
  "b": "\b", // Backspace
  "f": "\f", // Form feed
  "n": "\n", // New line
  "r": "\r", // Carriage return
  "t": "\t", // Horizontal tab
};
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar#escape_sequences
/**
 * Characters that may legally follow a backslash inside a string.
 * The string scanner only tests key membership (`char in escapes`).
 */
const escapes = {
  '"': 0, // Quotation mark
  "\\": 1, // Reverse solidus
  "/": 2, // Solidus
  "b": 3, // Backspace
  "f": 4, // Form feed
  "n": 5, // New line
  "r": 6, // Carriage return
  "t": 7, // Horizontal tab
  "u": 8, // 4 hexadecimal digits
};
// Support regex: also accept the character-class escapes \d \D \w \W \s \S.
// Each class letter is stored under its position in this list.
for (const [position, classLetter] of ["d", "D", "w", "W", "s", "S"].entries()) {
  escapes[classLetter] = position;
}
/**
 * States of the number scanner.
 */
const numberStates = {
  "_START_": 0, // nothing consumed yet
  "MINUS": 1, // leading "-" consumed
  "ZERO": 2, // integer part is a single "0"
  "DIGIT": 3, // inside an integer part that started with 1-9
  "POINT": 4, // decimal point consumed, fraction digit required
  "DIGIT_FRACTION": 5, // inside the fraction digits
  "EXP": 6, // "e"/"E" consumed
  "EXP_DIGIT_OR_SIGN": 7, // exponent sign or exponent digit consumed
};
// HELPERS
/**
 * Tells whether `char` is a non-zero decimal digit.
 *
 * @param {string} char - Character to classify.
 * @returns {boolean} True when `char` sorts between "1" and "9" inclusive.
 */
function isDigit1to9(char) {
  const atLeastOne = char >= "1";
  return atLeastOne && char <= "9";
}
/**
 * Tells whether `char` is a decimal digit.
 *
 * @param {string} char - Character to classify.
 * @returns {boolean} True when `char` sorts between "0" and "9" inclusive.
 */
function isDigit(char) {
  const atLeastZero = char >= "0";
  return atLeastZero && char <= "9";
}
/**
 * Tells whether `char` is an ASCII letter.
 *
 * @param {string} char - Character to classify.
 * @returns {boolean} True for "a"-"z" and "A"-"Z".
 */
function isLetter(char) {
  if (char >= "a" && char <= "z") {
    return true;
  }
  return char >= "A" && char <= "Z";
}
/**
 * Tells whether `char` is a hexadecimal digit (either letter case).
 *
 * @param {string} char - Character to classify.
 * @returns {boolean} True for "0"-"9", "a"-"f" and "A"-"F".
 */
function isHex(char) {
  if (isDigit(char)) {
    return true;
  }
  const isLowerHexLetter = char >= "a" && char <= "f";
  const isUpperHexLetter = char >= "A" && char <= "F";
  return isLowerHexLetter || isUpperHexLetter;
}
/**
 * Tells whether `char` introduces a number's exponent part.
 *
 * @param {string} char - Character to classify.
 * @returns {boolean} True for "e" or "E".
 */
function isExp(char) {
  switch (char) {
    case "e":
    case "E":
      return true;
    default:
      return false;
  }
}
// PARSERS
/**
 * Consumes a single whitespace unit at `index`: a line terminator
 * (CR, LF or CRLF), a tab or a space.
 *
 * @param {string} source - Full input text.
 * @param {number} index - Offset of the character to inspect.
 * @param {number} line - Current line number.
 * @param {number} column - Current column number.
 * @returns {{index: number, line: number, column: number} | null}
 *   The position after the whitespace, or null when the character at
 *   `index` is not whitespace.
 */
function parseWhitespace(source, index, line, column) {
  switch (source.charAt(index)) {
    case "\r": {
      // A lone CR ends the line; CR immediately followed by LF (CRLF)
      // is consumed as one terminator.
      const terminatorLength = source.charAt(index + 1) === "\n" ? 2 : 1;
      return { index: index + terminatorLength, line: line + 1, column: 1 };
    }
    case "\n":
      // LF line ending.
      return { index: index + 1, line: line + 1, column: 1 };
    case "\t":
    case " ":
      return { index: index + 1, line, column: column + 1 };
    default:
      return null;
  }
}
/**
 * Tries to read a comment starting at `index`: either a line comment
 * (`// ...` up to the end of the line) or a block comment
 * (slash-star ... star-slash).
 *
 * @param {string} source - Full input text.
 * @param {number} index - Offset where the comment would start.
 * @param {number} line - Line number at `index`.
 * @param {number} column - Column number at `index`.
 * @returns {{type: *, value: string, index: number, line: number, column: number} | null}
 *   A COMMENT token whose `value` is the text between the delimiters and
 *   whose `index`/`line`/`column` describe the position just after the
 *   comment; null when no complete comment starts at `index`
 *   (including an unterminated block comment).
 */
function parseComment(source, index, line, column) {
  if (source.charAt(index) !== "/") {
    return null;
  }
  const marker = source.charAt(index + 1);
  if (marker === "/") {
    return parseLineComment(source, index, line, column);
  }
  if (marker === "*") {
    return parseBlockComment(source, index, line, column);
  }
  return null;
}

/**
 * Reads a `//` comment whose opening slashes start at `index`.
 * The terminating line break (CR, LF or CRLF), if any, is consumed as part
 * of the token but is not included in its value.
 */
function parseLineComment(source, index, line, column) {
  const sourceLength = source.length;
  const textStart = index + 2;
  let cursor = textStart;
  while (cursor < sourceLength) {
    const char = source.charAt(cursor);
    if (char === "\r" || char === "\n") {
      break;
    }
    cursor++;
  }
  const value = source.substring(textStart, cursor);
  if (cursor >= sourceLength) {
    // Comment runs to the end of input: stay on the same line and advance
    // the column by the number of characters consumed.
    return {
      type: JsonTokenTypes.COMMENT,
      value: value,
      index: sourceLength,
      line: line,
      column: column + (sourceLength - index),
    };
  }
  // CR immediately followed by LF is one terminator, not two.
  const isCrLf = source.charAt(cursor) === "\r" && source.charAt(cursor + 1) === "\n";
  return {
    type: JsonTokenTypes.COMMENT,
    value: value,
    index: cursor + (isCrLf ? 2 : 1),
    line: line + 1,
    column: 1,
  };
}

/**
 * Reads a block comment whose opening slash-star starts at `index`.
 * Line and column are tracked through the comment body so the reported end
 * position is accurate. Returns null when the closing star-slash is missing.
 */
function parseBlockComment(source, index, line, column) {
  const sourceLength = source.length;
  const textStart = index + 2;
  let cursor = textStart;
  let currentLine = line;
  // Account for the two characters of the opening delimiter.
  let currentColumn = column + 2;
  while (cursor < sourceLength) {
    const char = source.charAt(cursor);
    if (char === "*" && source.charAt(cursor + 1) === "/") {
      return {
        type: JsonTokenTypes.COMMENT,
        value: source.substring(textStart, cursor),
        index: cursor + 2,
        line: currentLine,
        column: currentColumn + 2,
      };
    }
    if (char === "\r") {
      if (source.charAt(cursor + 1) === "\n") {
        // CRLF counts as a single line break.
        cursor++;
      }
      currentLine++;
      currentColumn = 1;
    }
    else if (char === "\n") {
      currentLine++;
      currentColumn = 1;
    }
    else {
      currentColumn++;
    }
    cursor++;
  }
  return null;
}
/**
 * Tries to read one punctuation character listed in `charTokens`.
 *
 * @param {string} source - Full input text.
 * @param {number} index - Offset of the character to inspect.
 * @param {number} line - Line number at `index`.
 * @param {number} column - Column number at `index`.
 * @returns {{type: *, line: number, column: number, index: number, value: undefined} | null}
 *   The punctuation token with the position just after it, or null when the
 *   character at `index` is not a key of `charTokens`.
 */
function parseChar(source, index, line, column) {
  const symbol = source.charAt(index);
  if (!(symbol in charTokens)) {
    return null;
  }
  return {
    type: charTokens[symbol],
    line,
    column: column + 1,
    index: index + 1,
    value: undefined,
  };
}
/**
 * Tries to read one of the literal keywords listed in `keywordsTokens`.
 *
 * The keyword must end at a word boundary: when the spelling is directly
 * followed by a letter, digit or underscore (for example `nullable` or
 * `true_1`), the text is a longer identifier and nothing is matched here.
 *
 * @param {string} source - Full input text.
 * @param {number} index - Offset where the keyword would start.
 * @param {number} line - Line number at `index`.
 * @param {number} column - Column number at `index`.
 * @returns {{type: *, line: number, column: number, index: number, value: null} | null}
 *   The keyword token with the position just after it, or null when no
 *   keyword starts at `index`.
 */
function parseKeyword(source, index, line, column) {
  const matched = Object.keys(keywordsTokens).find((name) => source.startsWith(name, index));
  if (!matched) {
    return null;
  }
  const endIndex = index + matched.length;
  // charAt returns "" past the end of input, which is not an identifier character.
  const following = source.charAt(endIndex);
  if (isLetter(following) || isDigit(following) || following === "_") {
    return null;
  }
  return {
    type: keywordsTokens[matched],
    line: line,
    column: column + matched.length,
    index: endIndex,
    value: null,
  };
}
/**
 * Tries to read an identifier: a letter or underscore followed by any
 * number of letters, digits or underscores.
 *
 * @param {string} source - Full input text.
 * @param {number} index - Offset where the identifier would start.
 * @param {number} line - Line number at `index`.
 * @param {number} column - Column number at `index`.
 * @returns {{type: *, line: number, column: number, index: number, value: string} | null}
 *   The identifier token (its `value` is the identifier text) with the
 *   position just after it, or null when the character at `index` cannot
 *   start an identifier.
 */
function parseIdentifier(source, index, line, column) {
  const head = source.charAt(index);
  if (!isLetter(head) && head !== "_") {
    return null;
  }
  const isWordChar = (ch) => isLetter(ch) || ch === "_" || isDigit(ch);
  // The head is already validated; extend over the remaining word characters.
  let end = index + 1;
  while (end < source.length && isWordChar(source.charAt(end))) {
    end++;
  }
  return {
    type: JsonTokenTypes.IDENTIFIER,
    line,
    column: column + end - index,
    index: end,
    value: source.slice(index, end),
  };
}
/**
 * Tries to read a double-quoted string starting at `index`.
 *
 * @param {string} source - Full input text.
 * @param {number} index - Offset where the opening quote would be.
 * @param {number} line - Line number at `index`.
 * @param {number} column - Column number at `index`.
 * @returns {{type: *, value: string, decoded: (string|null), line: number, index: number, column: number} | null | undefined}
 *   On success, a STRING token: `value` is the raw text between the quotes
 *   (escape sequences kept as written), `decoded` is the text with escapes
 *   resolved, or null when it equals `value`; `index`/`column` point just
 *   after the closing quote.
 *   Returns null when the character at `index` is not a quote, when a
 *   backslash is followed by a character not listed in `escapes`, or when
 *   `\u` is not followed by four hex digits.
 *   Returns undefined when the input ends before the closing quote
 *   (including when `index` is already at or past the end).
 */
function parseString(source, index, line, column) {
  const sourceLength = source.length;
  const startIndex = index;
  if (startIndex >= sourceLength) {
    return undefined;
  }
  if (source.charAt(startIndex) !== '"') {
    return null;
  }
  let raw = "";
  let decoded = "";
  let cursor = startIndex + 1;
  while (cursor < sourceLength) {
    const char = source.charAt(cursor);
    cursor++;
    if (char === '"') {
      return {
        type: JsonTokenTypes.STRING,
        value: raw,
        decoded: decoded !== raw ? decoded : null,
        line,
        index: cursor,
        column: column + cursor - startIndex,
      };
    }
    if (char !== "\\") {
      // Ordinary character: identical in raw and decoded form.
      raw += char;
      decoded += char;
      continue;
    }
    // Escape sequence: the backslash is kept in the raw text only.
    raw += char;
    if (cursor >= sourceLength) {
      break;
    }
    const escapeChar = source.charAt(cursor);
    if (!(escapeChar in escapes)) {
      return null;
    }
    raw += escapeChar;
    cursor++;
    if (escapeChar !== "u") {
      decoded += escapeChar in symbolSubstitutes ? symbolSubstitutes[escapeChar] : escapeChar;
      continue;
    }
    // \u must be followed by exactly four hexadecimal digits.
    let hex = "";
    for (let offset = 0; offset < 4; offset++) {
      const hexChar = source.charAt(cursor + offset);
      if (!hexChar || !isHex(hexChar)) {
        return null;
      }
      hex += hexChar;
    }
    cursor += 4;
    raw += hex;
    decoded += String.fromCodePoint(parseInt(hex, 16));
  }
  // Ran out of input before the closing quote.
  return undefined;
}
/**
 * Tries to read a number starting at `index`: an optional minus, an integer
 * part ("0" or a 1-9 digit followed by digits), an optional fraction and an
 * optional exponent.
 *
 * The scanner remembers the end of the longest prefix that is a complete
 * number, so trailing incomplete parts (for example the "." in "1." or the
 * "e+" in "1e+") are left unconsumed.
 *
 * @param {string} source - Full input text.
 * @param {number} index - Offset where the number would start.
 * @param {number} line - Line number at `index`.
 * @param {number} column - Column number at `index`.
 * @returns {{type: *, value: string, line: number, index: number, column: number} | null}
 *   A NUMBER token whose `value` is the matched source text and whose
 *   `index`/`column` point just after it, or null when no number starts at
 *   `index`.
 */
function parseNumber(source, index, line, column) {
  const sourceLength = source.length;
  const startIndex = index;
  // End offset (exclusive) of the longest complete number seen so far.
  let passedValueIndex = index;
  let state = numberStates._START_;
  iterator: while (index < sourceLength) {
    const char = source.charAt(index);
    switch (state) {
      case numberStates._START_:
        if (char === "-") {
          state = numberStates.MINUS;
        }
        else if (char === "0") {
          passedValueIndex = index + 1;
          state = numberStates.ZERO;
        }
        else if (isDigit1to9(char)) {
          passedValueIndex = index + 1;
          state = numberStates.DIGIT;
        }
        else {
          return null;
        }
        break;
      case numberStates.MINUS:
        if (char === "0") {
          passedValueIndex = index + 1;
          state = numberStates.ZERO;
        }
        else if (isDigit1to9(char)) {
          passedValueIndex = index + 1;
          state = numberStates.DIGIT;
        }
        else {
          return null;
        }
        break;
      case numberStates.ZERO:
        // A leading zero cannot be followed by more integer digits.
        if (char === ".") {
          state = numberStates.POINT;
        }
        else if (isExp(char)) {
          state = numberStates.EXP;
        }
        else {
          break iterator;
        }
        break;
      case numberStates.DIGIT:
        if (isDigit(char)) {
          passedValueIndex = index + 1;
        }
        else if (char === ".") {
          state = numberStates.POINT;
        }
        else if (isExp(char)) {
          state = numberStates.EXP;
        }
        else {
          break iterator;
        }
        break;
      case numberStates.POINT:
        if (isDigit(char)) {
          passedValueIndex = index + 1;
          state = numberStates.DIGIT_FRACTION;
        }
        else {
          break iterator;
        }
        break;
      case numberStates.DIGIT_FRACTION:
        if (isDigit(char)) {
          passedValueIndex = index + 1;
        }
        else if (isExp(char)) {
          state = numberStates.EXP;
        }
        else {
          break iterator;
        }
        break;
      case numberStates.EXP:
        if (char === "+" || char === "-") {
          state = numberStates.EXP_DIGIT_OR_SIGN;
        }
        else if (isDigit(char)) {
          passedValueIndex = index + 1;
          state = numberStates.EXP_DIGIT_OR_SIGN;
        }
        else {
          break iterator;
        }
        break;
      case numberStates.EXP_DIGIT_OR_SIGN:
        if (isDigit(char)) {
          passedValueIndex = index + 1;
        }
        else {
          break iterator;
        }
        break;
    }
    index++;
  }
  // Compare against the start offset, not 0: when nothing was matched
  // (for example a lone "-" at the end of input) an empty token that does
  // not advance the caller's position must not be produced.
  if (passedValueIndex > startIndex) {
    return {
      type: JsonTokenTypes.NUMBER,
      value: source.substring(startIndex, passedValueIndex),
      line: line,
      index: passedValueIndex,
      column: column + passedValueIndex - startIndex,
    };
  }
  return null;
}
/**
 * Options applied by `tokenize` when the caller does not override them.
 */
const defaultSettings = {
  // When true, every token carries a `position` describing where it was found.
  "verbose": true,
};
/**
 * Splits `source` into a flat list of tokens.
 *
 * Whitespace is skipped. At every other position the parsers are tried in a
 * fixed order (comment, punctuation, keyword, identifier, string, number)
 * and the first one that matches produces the token. When none matches,
 * `error` is called with a `cannotTokenizeSymbol` message for the offending
 * character.
 *
 * @param {string} source - Text to tokenize.
 * @param {{verbose?: boolean}} [settings] - Overrides for `defaultSettings`.
 *   With `verbose` enabled each token gets a `position` (a `JsonPosition`
 *   built from the token's start and end line/column/index).
 * @returns {Array<{type: *, value: *, decoded: *, position?: *}>} The tokens
 *   in source order.
 */
export function tokenize(source, settings) {
  const options = Object.assign({}, defaultSettings, settings);
  // Order matters: the first parser returning a truthy result wins.
  const parsers = [parseComment, parseChar, parseKeyword, parseIdentifier, parseString, parseNumber];
  const sourceLength = source.length;
  const tokens = [];
  let index = 0;
  let line = 1;
  let column = 1;
  while (index < sourceLength) {
    const whitespace = parseWhitespace(source, index, line, column);
    if (whitespace) {
      ({ index, line, column } = whitespace);
      continue;
    }
    let matched = null;
    for (const parse of parsers) {
      matched = parse(source, index, line, column);
      if (matched) {
        break;
      }
    }
    if (!matched) {
      error(cannotTokenizeSymbol(source.charAt(index), line, column), source, line, column);
      continue;
    }
    const token = { type: matched.type, value: matched.value, decoded: matched.decoded };
    if (options.verbose) {
      // Start position is the current cursor; end position comes from the parser.
      token.position = new JsonPosition(line, column, index, matched.line, matched.column, matched.index);
    }
    tokens.push(token);
    ({ index, line, column } = matched);
  }
  return tokens;
}