jsonrepair
Version:
Repair broken JSON documents
818 lines (799 loc) • 26.7 kB
JavaScript
import { JSONRepairError } from '../utils/JSONRepairError.js';
import { isControlCharacter, isDelimiter, isDigit, isDoubleQuote, isDoubleQuoteLike, isFunctionNameChar, isFunctionNameCharStart, isHex, isQuote, isSingleQuote, isSingleQuoteLike, isSpecialWhitespace, isStartOfValue, isUnquotedStringDelimiter, isValidStringCharacter, isWhitespace, isWhitespaceExceptNewline, regexUrlChar, regexUrlStart } from '../utils/stringUtils.js';
import { createInputBuffer } from './buffer/InputBuffer.js';
import { createOutputBuffer } from './buffer/OutputBuffer.js';
import { Caret, createStack, StackType } from './stack.js';
// Maps a raw control character to the two-character escape sequence that is
// written to the output when that character is found unescaped inside a string.
const controlCharacters = {
'\b': '\\b',
'\f': '\\f',
'\n': '\\n',
'\r': '\\r',
'\t': '\\t'
};
// Map with all single-character escape codes that may follow a backslash
// inside a string. In this file the map is only used as a membership test
// (a lookup that yields `undefined` means "not a valid escape").
const escapeCharacters = {
'"': '"',
'\\': '\\',
'/': '/',
b: '\b',
f: '\f',
n: '\n',
r: '\r',
t: '\t'
// note that \u is handled separately in parseString()
};
/**
 * Create a streaming JSON repair core.
 *
 * Input is fed in chunks via `transform(chunk)` and finished with `flush()`;
 * repaired output is handed to `onData` through the output buffer.
 *
 * @param {Object} _ref - Options object.
 * @param {Function} _ref.onData - Passed to the output buffer as its `write` callback.
 * @param {number} [_ref.bufferSize=65536] - Passed to the output buffer and also used
 *   as the amount of input that is kept unparsed as look-ahead while streaming.
 * @param {number} [_ref.chunkSize=65536] - Passed to the output buffer and also used
 *   as the step size when flushing processed input.
 * @returns {{transform: Function, flush: Function}} The streaming interface.
 */
export function jsonrepairCore(_ref) {
let {
onData,
bufferSize = 65536,
chunkSize = 65536
} = _ref;
const input = createInputBuffer();
const output = createOutputBuffer({
write: onData,
bufferSize,
chunkSize
});
// current parse position in the input
let i = 0;
// input position up to which the input buffer has been flushed
let iFlushed = 0;
// parser state: the kind of container being parsed and the caret position in it
const stack = createStack();
/**
 * Flush processed input in steps of `chunkSize`, always keeping at least
 * `bufferSize + chunkSize` characters before the current position `i`.
 */
function flushInputBuffer() {
  const threshold = i - bufferSize - chunkSize;
  while (threshold > iFlushed) {
    iFlushed = iFlushed + chunkSize;
    input.flush(iFlushed);
  }
}
/**
 * Append a chunk of input and parse as far as possible while keeping
 * `bufferSize` characters of look-ahead, then flush processed input.
 *
 * @param {string} chunk - The next piece of input, pushed onto the input buffer.
 */
function transform(chunk) {
  input.push(chunk);
  for (;;) {
    const hasLookahead = i < input.currentLength() - bufferSize;
    if (!hasLookahead || !parse()) {
      break;
    }
  }
  flushInputBuffer();
}
/**
 * Close the input, parse everything that is left, and flush the output buffer.
 */
function flush() {
  input.close();
  let progressed;
  do {
    // keep stepping until the parser reports there is nothing more to process
    progressed = parse();
  } while (progressed);
  output.flush();
}
/**
 * Run a single parsing step.
 *
 * Leading whitespace and comments are consumed first; then the current stack
 * type and caret position select which parsers/repairs are attempted, in order.
 *
 * @returns {*} The result of the first parser that succeeded, or a falsy value
 *   when nothing could be processed (callers loop until the result is falsy).
 */
function parse() {
  parseWhitespaceAndSkipComments();

  const type = stack.type;

  if (type === StackType.object) {
    const caret = stack.caret;
    if (caret === Caret.beforeKey) {
      return skipEllipsis() || parseObjectKey() || parseUnexpectedColon() || parseRepairTrailingComma() || parseRepairObjectEndOrComma();
    }
    if (caret === Caret.beforeValue) {
      return parseValue() || parseRepairMissingObjectValue();
    }
    if (caret === Caret.afterValue) {
      return parseObjectComma() || parseObjectEnd() || parseRepairObjectEndOrComma();
    }
    return false;
  }

  if (type === StackType.array) {
    const caret = stack.caret;
    if (caret === Caret.beforeValue) {
      return skipEllipsis() || parseValue() || parseRepairTrailingComma() || parseRepairArrayEnd();
    }
    if (caret === Caret.afterValue) {
      return parseArrayComma() || parseArrayEnd() || parseRepairMissingComma() || parseRepairArrayEnd();
    }
    return false;
  }

  if (type === StackType.ndJson) {
    const caret = stack.caret;
    if (caret === Caret.beforeValue) {
      return parseValue() || parseRepairTrailingComma();
    }
    if (caret === Caret.afterValue) {
      return parseArrayComma() || parseRepairMissingComma() || parseRepairNdJsonEnd();
    }
    return false;
  }

  if (type === StackType.functionCall) {
    const caret = stack.caret;
    if (caret === Caret.beforeValue) {
      return parseValue();
    }
    if (caret === Caret.afterValue) {
      return parseFunctionCallEnd();
    }
    return false;
  }

  if (type === StackType.root) {
    const caret = stack.caret;
    if (caret === Caret.beforeValue) {
      return parseRootStart();
    }
    if (caret === Caret.afterValue) {
      return parseRootEnd();
    }
    return false;
  }

  return false;
}
/**
 * Try every value parser in priority order and stop at the first that succeeds.
 *
 * @returns {*} The result of the first successful parser, otherwise the
 *   (falsy) result of the last parser tried.
 */
function parseValue() {
  const parsers = [
    parseObjectStart,
    parseArrayStart,
    parseString,
    parseNumber,
    parseKeywords,
    parseRepairUnquotedString,
    parseRepairRegex
  ];
  let result;
  for (const tryParse of parsers) {
    result = tryParse();
    if (result) {
      break;
    }
  }
  return result;
}
/**
 * Parse the start of an object `{`, skipping a leading ellipsis and a leading
 * comma. An immediately following `}` closes the (empty) object right away;
 * otherwise an object frame is pushed with the caret before the first key.
 *
 * @returns {*} The result of the stack operation, or false when there is no `{`.
 */
function parseObjectStart() {
  if (!parseCharacter('{')) {
    return false;
  }
  parseWhitespaceAndSkipComments();
  skipEllipsis();
  const skippedLeadingComma = skipCharacter(',');
  if (skippedLeadingComma) {
    parseWhitespaceAndSkipComments();
  }
  return parseCharacter('}')
    ? stack.update(Caret.afterValue)
    : stack.push(StackType.object, Caret.beforeKey);
}
/**
 * Parse the start of an array `[`, skipping a leading ellipsis and a leading
 * comma. An immediately following `]` closes the (empty) array right away;
 * otherwise an array frame is pushed with the caret before the first value.
 *
 * @returns {*} The result of the stack operation, or false when there is no `[`.
 */
function parseArrayStart() {
  if (!parseCharacter('[')) {
    return false;
  }
  parseWhitespaceAndSkipComments();
  skipEllipsis();
  const skippedLeadingComma = skipCharacter(',');
  if (skippedLeadingComma) {
    parseWhitespaceAndSkipComments();
  }
  return parseCharacter(']')
    ? stack.update(Caret.afterValue)
    : stack.push(StackType.array, Caret.beforeValue);
}
/**
 * Repair an unquoted string value by wrapping it in quotes, or recognise a
 * function call (a name followed by `(`) and push a functionCall frame.
 * The symbol `undefined` is replaced with `null`.
 *
 * @returns {*} The result of the stack operation, or false when no symbol
 *   could be read at the current position.
 */
function parseRepairUnquotedString() {
  let nameEnd = i;

  if (isFunctionNameCharStart(input.charAt(nameEnd))) {
    while (!input.isEnd(nameEnd) && isFunctionNameChar(input.charAt(nameEnd))) {
      nameEnd++;
    }

    let afterName = nameEnd;
    while (isWhitespace(input, afterName)) {
      afterName++;
    }

    if (input.charAt(afterName) === '(') {
      // repair a MongoDB function call like NumberLong("2")
      // repair a JSONP function call like callback({...});
      i = afterName + 1;
      return stack.push(StackType.functionCall, Caret.beforeValue);
    }
  }

  const delimiterIndex = findNextDelimiter(false, nameEnd);
  if (delimiterIndex === null) {
    return false;
  }

  let symbolEnd = delimiterIndex;

  // an url like "https://..." would otherwise be cut off at the "//" (parsed as a comment)
  const looksLikeUrl =
    input.charAt(symbolEnd - 1) === ':' &&
    regexUrlStart.test(input.substring(i, symbolEnd + 2));
  if (looksLikeUrl) {
    while (!input.isEnd(symbolEnd) && regexUrlChar.test(input.charAt(symbolEnd))) {
      symbolEnd++;
    }
  }

  const symbol = input.substring(i, symbolEnd);
  i = symbolEnd;

  const repaired = symbol === 'undefined' ? 'null' : JSON.stringify(symbol);
  output.push(repaired);

  if (input.charAt(i) === '"') {
    // the start quote was missing but the end quote is present: skip the end quote
    i += 1;
  }

  return stack.update(Caret.afterValue);
}
/**
 * Repair a regular expression literal like /foo/ by turning it into a JSON
 * string that contains the literal text, including the slashes.
 *
 * The text is serialised with JSON.stringify so that backslashes, double
 * quotes and control characters inside the regex are escaped and the output
 * is always a valid JSON string that round-trips to the original regex text.
 *
 * @returns {*} The result of stack.update(Caret.afterValue) when a regex was
 *   consumed, otherwise false.
 */
function parseRepairRegex() {
  if (input.charAt(i) !== '/') {
    return false;
  }

  const start = i;
  i++;

  // scan up to the closing slash; a slash preceded by a backslash does not close
  while (!input.isEnd(i) && (input.charAt(i) !== '/' || input.charAt(i - 1) === '\\')) {
    i++;
  }

  // consume the closing slash, but do not step past the end of an unterminated regex
  if (!input.isEnd(i)) {
    i++;
  }

  output.push(JSON.stringify(input.substring(start, i)));
  return stack.update(Caret.afterValue);
}
/**
 * Repair a missing object value by emitting `null` in its place.
 *
 * @returns {*} The result of stack.update(Caret.afterValue).
 */
function parseRepairMissingObjectValue() {
  const placeholder = 'null';
  output.push(placeholder);
  return stack.update(Caret.afterValue);
}
/**
 * Repair a trailing comma: when the output ends with a comma (ignoring
 * whitespace), strip that comma and move the caret to after the value.
 *
 * @returns {*} The result of stack.update(Caret.afterValue), or false when
 *   the output does not end with a comma.
 */
function parseRepairTrailingComma() {
  const hasTrailingComma = output.endsWithIgnoringWhitespace(',');
  if (!hasTrailingComma) {
    return false;
  }
  output.stripLastOccurrence(',');
  return stack.update(Caret.afterValue);
}
/**
 * Raise an "Object key expected" error when a colon is found where a key
 * should be.
 *
 * @returns {boolean} Always false when no error is thrown.
 */
function parseUnexpectedColon() {
  const atColon = input.charAt(i) === ':';
  if (atColon) {
    throwObjectKeyExpected();
  }
  return false;
}
/**
 * Raise an error for a position where no value could be parsed: "unexpected
 * end" at the end of the input, "unexpected character" otherwise.
 *
 * @returns {boolean} false (only reached if the selected thrower returns).
 */
function parseUnexpectedEnd() {
  const raise = input.isEnd(i) ? throwUnexpectedEnd : throwUnexpectedCharacter;
  raise();
  return false;
}
/**
 * Parse an object key (quoted or unquoted) followed by a colon. A missing
 * colon is repaired when a value follows or when the input ends here.
 *
 * @returns {*} The result of stack.update(Caret.beforeValue), or false when
 *   no key was found.
 */
function parseObjectKey() {
  if (!(parseString() || parseUnquotedKey())) {
    return false;
  }

  parseWhitespaceAndSkipComments();

  if (parseCharacter(':')) {
    // expect a value after the :
    return stack.update(Caret.beforeValue);
  }

  const truncatedText = input.isEnd(i);
  if (isStartOfValue(input.charAt(i)) || truncatedText) {
    // repair missing colon
    output.insertBeforeLastWhitespace(':');
    return stack.update(Caret.beforeValue);
  }

  throwColonExpected();
  return false;
}
/**
 * Parse a comma between object entries and expect a key next.
 *
 * @returns {*} The result of stack.update(Caret.beforeKey), or false when
 *   there is no comma.
 */
function parseObjectComma() {
  return parseCharacter(',') ? stack.update(Caret.beforeKey) : false;
}
/**
 * Parse the closing brace of an object and pop the object frame.
 *
 * @returns {*} The result of stack.pop(), or false when there is no `}`.
 */
function parseObjectEnd() {
  return parseCharacter('}') ? stack.pop() : false;
}
/**
 * Repair an object that is followed by something unexpected:
 * - a `{` means the object was not closed: strip a trailing comma and insert `}`
 * - any other start of a value means a comma is missing: insert `,`
 * - otherwise the closing brace is missing: insert `}`
 *
 * @returns {*} The result of the stack operation that was performed.
 */
function parseRepairObjectEndOrComma() {
  const next = input.charAt(i);
  const startsNewObject = next === '{';

  if (!startsNewObject && !input.isEnd(i) && isStartOfValue(next)) {
    // repair missing comma
    output.insertBeforeLastWhitespace(',');
    return stack.update(Caret.beforeKey);
  }

  if (startsNewObject) {
    // repair trailing comma before closing the object
    output.stripLastOccurrence(',');
  }

  // repair missing closing brace
  output.insertBeforeLastWhitespace('}');
  return stack.pop();
}
/**
 * Parse a comma between array items and expect a value next.
 *
 * @returns {*} The result of stack.update(Caret.beforeValue), or false when
 *   there is no comma.
 */
function parseArrayComma() {
  return parseCharacter(',') ? stack.update(Caret.beforeValue) : false;
}
/**
 * Parse the closing bracket of an array and pop the array frame.
 *
 * @returns {*} The result of stack.pop(), or false when there is no `]`.
 */
function parseArrayEnd() {
  return parseCharacter(']') ? stack.pop() : false;
}
/**
 * Repair a missing comma between two values: when a new value starts at the
 * current position, insert a comma before the preceding whitespace.
 *
 * @returns {*} The result of stack.update(Caret.beforeValue), or false when
 *   no value starts here.
 */
function parseRepairMissingComma() {
  const valueFollows = !input.isEnd(i) && isStartOfValue(input.charAt(i));
  if (!valueFollows) {
    return false;
  }
  output.insertBeforeLastWhitespace(',');
  return stack.update(Caret.beforeValue);
}
/**
 * Repair a missing closing bracket by inserting `]` and popping the array frame.
 *
 * @returns {*} The result of stack.pop().
 */
function parseRepairArrayEnd() {
  const closingBracket = ']';
  output.insertBeforeLastWhitespace(closingBracket);
  return stack.pop();
}
/**
 * Close the root-level array that wraps newline delimited JSON once the end
 * of the input is reached; anything else at this point is an error.
 *
 * @returns {*} The result of stack.pop() at the end of the input.
 */
function parseRepairNdJsonEnd() {
  if (!input.isEnd(i)) {
    throwUnexpectedEnd();
    return false; // only reached if the thrower returns
  }
  output.push('\n]');
  return stack.pop();
}
/**
 * Finish a function call wrapper: skip an optional `)` and, only when that
 * was present, an optional `;`. The functionCall frame is popped either way.
 *
 * @returns {*} The result of stack.pop().
 */
function parseFunctionCallEnd() {
  const closed = skipCharacter(')');
  if (closed) {
    skipCharacter(';');
  }
  return stack.pop();
}
/**
 * Parse the root value, first skipping an optional opening Markdown code fence.
 * Raises an error when no value can be parsed.
 *
 * @returns {*} The result of parseValue() when a value was parsed.
 */
function parseRootStart() {
  const openingFences = ['```', '[```', '{```'];
  parseMarkdownCodeBlock(openingFences);
  const parsed = parseValue();
  return parsed || parseUnexpectedEnd();
}
/**
 * Handle whatever follows the root value:
 * - skip an optional closing Markdown code fence
 * - a new value after a comma or newline is treated as newline delimited
 *   JSON and wrapped in a root-level array
 * - a trailing comma is removed
 * - redundant closing braces/brackets are skipped
 * Any other remaining character raises an "unexpected character" error.
 *
 * @returns {*} The result of the stack operation, or false when the end of
 *   the input is reached.
 */
function parseRootEnd() {
  const closingFences = ['```', '```]', '```}'];
  parseMarkdownCodeBlock(closingFences);

  const parsedComma = parseCharacter(',');
  parseWhitespaceAndSkipComments();

  const looksLikeNdJson =
    isStartOfValue(input.charAt(i)) &&
    (output.endsWithIgnoringWhitespace(',') || output.endsWithIgnoringWhitespace('\n'));

  if (looksLikeNdJson) {
    // start of a new value after the end of the root level value:
    // turn the document into a root level array
    if (!parsedComma) {
      // repair missing comma
      output.insertBeforeLastWhitespace(',');
    }
    output.unshift('[\n');
    return stack.push(StackType.ndJson, Caret.beforeValue);
  }

  if (parsedComma) {
    // repair: remove trailing comma
    output.stripLastOccurrence(',');
    return stack.update(Caret.afterValue);
  }

  // repair redundant end braces and brackets
  for (let c = input.charAt(i); c === '}' || c === ']'; c = input.charAt(i)) {
    i++;
    parseWhitespaceAndSkipComments();
  }

  if (!input.isEnd(i)) {
    throwUnexpectedCharacter();
  }
  return false;
}
/**
 * Consume whitespace (copied to the output) and comments (dropped) at the
 * current position, alternating until a comment is not followed by whitespace
 * or no comment is found.
 *
 * @param {boolean} [skipNewline=true] - Forwarded to parseWhitespace(); when
 *   false, newlines are not treated as whitespace.
 * @returns {boolean} true when the input position advanced.
 */
function parseWhitespaceAndSkipComments(skipNewline = true) {
  const before = i;
  parseWhitespace(skipNewline);
  while (parseComment() && parseWhitespace(skipNewline)) {
    // keep alternating between comments and whitespace
  }
  return i > before;
}
/**
 * Consume a run of whitespace and write it to the output. Regular whitespace
 * is copied as is; special whitespace characters are replaced with a space.
 *
 * @param {boolean} skipNewline - When truthy newlines count as whitespace,
 *   otherwise the run stops at a newline.
 * @returns {boolean} true when at least one character was consumed.
 */
function parseWhitespace(skipNewline) {
  const matchesWhitespace = skipNewline ? isWhitespace : isWhitespaceExceptNewline;
  let collected = '';

  for (;;) {
    if (matchesWhitespace(input, i)) {
      collected += input.charAt(i);
    } else if (isSpecialWhitespace(input, i)) {
      // repair special whitespace
      collected += ' ';
    } else {
      break;
    }
    i++;
  }

  if (collected.length === 0) {
    return false;
  }
  output.push(collected);
  return true;
}
/**
 * Skip a block comment or a line comment at the current position.
 * Comments are dropped: nothing is written to the output.
 *
 * For a block comment, the opener is skipped before searching for the
 * terminator, so that the `*` of the opener cannot be paired with a directly
 * following `/` (a slash, star, slash sequence opens a comment, it does not
 * close one). When the input ends before the terminator is found, the
 * position stops at the end of the input instead of running past it.
 *
 * A line comment is skipped up to, but not including, the next newline.
 *
 * @returns {boolean} true when a comment was skipped, otherwise false.
 */
function parseComment() {
  if (input.charAt(i) !== '/') {
    return false;
  }

  const next = input.charAt(i + 1);

  if (next === '*') {
    // repair block comment by skipping it: first step over the opener
    i += 2;
    while (!input.isEnd(i) && !atEndOfBlockComment(i)) {
      i++;
    }
    if (!input.isEnd(i)) {
      // step over the terminator; an unterminated comment has none to skip
      i += 2;
    }
    return true;
  }

  if (next === '/') {
    // repair line comment by skipping it
    while (!input.isEnd(i) && input.charAt(i) !== '\n') {
      i++;
    }
    return true;
  }

  return false;
}
/**
 * Find and skip a Markdown code fence, like ``` or ```json, including an
 * optional language specifier and any whitespace/comments that follow it.
 *
 * @param {string[]} blocks - The fence spellings to look for, in priority order.
 * @returns {boolean} true when a fence was skipped.
 */
function parseMarkdownCodeBlock(blocks) {
  if (!skipMarkdownCodeBlock(blocks)) {
    return false;
  }

  if (isFunctionNameCharStart(input.charAt(i))) {
    // strip the optional language specifier like "json"
    while (!input.isEnd(i) && isFunctionNameChar(input.charAt(i))) {
      i++;
    }
  }

  parseWhitespaceAndSkipComments();
  return true;
}
/**
 * Skip the first of the given fence spellings that matches the input at the
 * current position.
 *
 * @param {string[]} blocks - Candidate fence strings, tried in order.
 * @returns {boolean} true when one matched and was skipped.
 */
function skipMarkdownCodeBlock(blocks) {
  const matched = blocks.find((block) => input.substring(i, i + block.length) === block);
  if (matched === undefined) {
    return false;
  }
  i += matched.length;
  return true;
}
/**
 * Consume the given character when it is at the current position and copy it
 * to the output.
 *
 * @param {string} char - The character to match.
 * @returns {boolean} true when the character was consumed.
 */
function parseCharacter(char) {
  const current = input.charAt(i);
  if (current !== char) {
    return false;
  }
  output.push(current);
  i++;
  return true;
}
/**
 * Consume the given character when it is at the current position, without
 * writing it to the output.
 *
 * @param {string} char - The character to match.
 * @returns {boolean} true when the character was skipped.
 */
function skipCharacter(char) {
  const matched = input.charAt(i) === char;
  if (matched) {
    i += 1;
  }
  return matched;
}
/**
 * Skip a single backslash at the current position, if present.
 *
 * @returns {boolean} true when a backslash was skipped.
 */
function skipEscapeCharacter() {
  const backslash = '\\';
  return skipCharacter(backslash);
}
/**
 * Skip an ellipsis (three dots) as in "[1,2,3,...]", "[1,2,3,...,9]" or
 * "[...,7,8,9]", or a similar construct in objects, together with an
 * optional comma directly after it.
 *
 * @returns {boolean} true when an ellipsis was removed.
 */
function skipEllipsis() {
  parseWhitespaceAndSkipComments();

  const atEllipsis =
    input.charAt(i) === '.' &&
    input.charAt(i + 1) === '.' &&
    input.charAt(i + 2) === '.';
  if (!atEllipsis) {
    return false;
  }

  // repair: remove the ellipsis (three dots) and optionally a comma
  i += 3;
  parseWhitespaceAndSkipComments();
  skipCharacter(',');
  return true;
}
/**
 * Parse a string enclosed by double quotes "...". Can contain escaped quotes.
 * Repair strings enclosed in single quotes or special quotes.
 * Repair an escaped string (a string whose quotes are preceded by a backslash).
 *
 * The function can run in two stages:
 * - First, it assumes the string has a valid end quote
 * - If it turns out that the string does not have a valid end quote followed
 *   by a delimiter (which should be the case), the function runs again in a
 *   more conservative way, stopping the string at the first next delimiter
 *   and fixing the string by inserting a quote there, or stopping at a
 *   stop index detected in the first iteration.
 *
 * Before a retry, the input position and the output are rewound to where
 * they were when the string started.
 *
 * Arguments are read via `arguments`:
 * @param {boolean} [stopAtDelimiter=false] - End the string at the first
 *   unquoted-string delimiter and insert the missing end quote there.
 * @param {number} [stopAtIndex=-1] - Input index at which the string must
 *   end; the missing end quote is inserted there.
 * @returns {*} The result of stack.update(Caret.afterValue) when a string was
 *   parsed, or false when there is no quote at the current position.
 * @throws {JSONRepairError} On an invalid unicode escape that is not at the
 *   end of the input, or on a character rejected by isValidStringCharacter.
 */
function parseString() {
let stopAtDelimiter = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : false;
let stopAtIndex = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : -1;
let skipEscapeChars = input.charAt(i) === '\\';
if (skipEscapeChars) {
// repair: remove the first escape character
i++;
skipEscapeChars = true;
}
if (isQuote(input.charAt(i))) {
// double quotes are correct JSON,
// single quotes come from JavaScript for example, we assume it will have a correct single end quote too
// otherwise, we will match any double-quote-like start with a double-quote-like end,
// or any single-quote-like start with a single-quote-like end
const isEndQuote = isDoubleQuote(input.charAt(i)) ? isDoubleQuote : isSingleQuote(input.charAt(i)) ? isSingleQuote : isSingleQuoteLike(input.charAt(i)) ? isSingleQuoteLike : isDoubleQuoteLike;
// remember the input and output positions at the start quote so a retry can rewind to them
const iBefore = i;
const oBefore = output.length();
output.push('"');
i++;
while (true) {
if (input.isEnd(i)) {
// end of text, we have a missing quote somewhere
const iPrev = prevNonWhitespaceIndex(i - 1);
if (!stopAtDelimiter && isDelimiter(input.charAt(iPrev))) {
// if the text ends with a delimiter, like ["hello],
// so the missing end quote should be inserted before this delimiter
// retry parsing the string, stopping at the first next delimiter
i = iBefore;
output.remove(oBefore);
return parseString(true);
}
// repair missing quote
output.insertBeforeLastWhitespace('"');
return stack.update(Caret.afterValue);
}
if (i === stopAtIndex) {
// use the stop index detected in the first iteration, and repair end quote
output.insertBeforeLastWhitespace('"');
return stack.update(Caret.afterValue);
}
if (isEndQuote(input.charAt(i))) {
// end quote
// let us check what is before and after the quote to verify whether this is a legit end quote
// remember where the quote is in the input and the output, to be able to revert to it
const iQuote = i;
const oQuote = output.length();
output.push('"');
i++;
parseWhitespaceAndSkipComments(false);
if (stopAtDelimiter || input.isEnd(i) || isDelimiter(input.charAt(i)) || isQuote(input.charAt(i)) || isDigit(input.charAt(i))) {
// The quote is followed by the end of the text, a delimiter, or a next value
// so the quote is indeed the end of the string
parseConcatenatedString();
return stack.update(Caret.afterValue);
}
const iPrevChar = prevNonWhitespaceIndex(iQuote - 1);
const prevChar = input.charAt(iPrevChar);
if (prevChar === ',') {
// A comma followed by a quote, like '{"a":"b,c,"d":"e"}'.
// We assume that the quote is a start quote, and that the end quote
// should have been located right before the comma but is missing.
i = iBefore;
output.remove(oBefore);
return parseString(false, iPrevChar);
}
if (isDelimiter(prevChar)) {
// This is not the right end quote: it is preceded by a delimiter,
// and NOT followed by a delimiter. So, there is an end quote missing
// parse the string again and then stop at the first next delimiter
i = iBefore;
output.remove(oBefore);
return parseString(true);
}
// revert to right after the quote but before any whitespace, and continue parsing the string
output.remove(oQuote + 1);
i = iQuote + 1;
// repair unescaped quote
output.insertAt(oQuote, '\\');
} else if (stopAtDelimiter && isUnquotedStringDelimiter(input.charAt(i))) {
// we're in the mode to stop the string at the first delimiter
// because there is an end quote missing
// test start of an url like "https://..." (this would be parsed as a comment)
if (input.charAt(i - 1) === ':' && regexUrlStart.test(input.substring(iBefore + 1, i + 2))) {
while (!input.isEnd(i) && regexUrlChar.test(input.charAt(i))) {
output.push(input.charAt(i));
i++;
}
}
// repair missing quote
output.insertBeforeLastWhitespace('"');
parseConcatenatedString();
return stack.update(Caret.afterValue);
} else if (input.charAt(i) === '\\') {
// handle escaped content like \n or \u2605
const char = input.charAt(i + 1);
const escapeChar = escapeCharacters[char];
if (escapeChar !== undefined) {
// a valid two-character escape sequence: copy it unchanged
output.push(input.substring(i, i + 2));
i += 2;
} else if (char === 'u') {
// count the hex digits after "\u"; j ends at 6 when all four are present
let j = 2;
while (j < 6 && isHex(input.charAt(i + j))) {
j++;
}
if (j === 6) {
output.push(input.substring(i, i + 6));
i += 6;
} else if (input.isEnd(i + j)) {
// repair invalid or truncated unicode char at the end of the text
// by removing the unicode char and ending the string here
i += j;
} else {
throwInvalidUnicodeCharacter();
}
} else {
// repair invalid escape character: remove it
output.push(char);
i += 2;
}
} else {
// handle regular characters
const char = input.charAt(i);
if (char === '"' && input.charAt(i - 1) !== '\\') {
// repair unescaped double quote
output.push(`\\${char}`);
i++;
} else if (isControlCharacter(char)) {
// unescaped control character
output.push(controlCharacters[char]);
i++;
} else {
if (!isValidStringCharacter(char)) {
throwInvalidCharacter(char);
}
output.push(char);
i++;
}
}
if (skipEscapeChars) {
// repair: skipped escape character (nothing to do)
skipEscapeCharacter();
}
}
}
return false;
}
/**
 * Repair concatenated strings like "hello" + "world" by merging them into a
 * single string "helloworld".
 *
 * @returns {boolean} true when at least one `+` was processed.
 */
function parseConcatenatedString() {
  let joinCount = 0;

  parseWhitespaceAndSkipComments();

  for (; input.charAt(i) === '+'; joinCount++) {
    i++;
    parseWhitespaceAndSkipComments();

    // repair: remove the end quote of the first string
    output.stripLastOccurrence('"', true);

    const secondStart = output.length();
    if (parseString()) {
      // repair: remove the start quote of the second string
      output.remove(secondStart, secondStart + 1);
    } else {
      // repair: the + is not followed by a string, so restore the end quote
      output.insertBeforeLastWhitespace('"');
    }
  }

  return joinCount > 0;
}
/**
 * Parse a number like 2.4 or 2.4e6.
 *
 * Repairs:
 * - a number cut off right after "-", "." or an exponent marker gets a "0" appended
 * - a number with leading zeros like "00789" is emitted as a string, because
 *   leading zeros can carry meaning and must not be dropped
 *
 * When the text turns out not to be a number, the position is restored so
 * that another parser can handle it.
 *
 * @returns {*} The result of stack.update(Caret.afterValue) when a number was
 *   parsed, otherwise false.
 */
function parseNumber() {
  const start = i;

  const skipDigits = () => {
    while (isDigit(input.charAt(i))) {
      i++;
    }
  };
  const repairTruncated = () => {
    repairNumberEndingWithNumericSymbol(start);
    return stack.update(Caret.afterValue);
  };
  const backtrack = () => {
    i = start;
    return false;
  };

  // optional minus sign
  if (input.charAt(i) === '-') {
    i++;
    if (atEndOfNumber()) {
      return repairTruncated();
    }
    if (!isDigit(input.charAt(i))) {
      return backtrack();
    }
  }

  // integer part; leading zeros are accepted here and dealt with at the end
  skipDigits();

  // optional fraction
  if (input.charAt(i) === '.') {
    i++;
    if (atEndOfNumber()) {
      return repairTruncated();
    }
    if (!isDigit(input.charAt(i))) {
      return backtrack();
    }
    skipDigits();
  }

  // optional exponent
  const exponentMarker = input.charAt(i);
  if (exponentMarker === 'e' || exponentMarker === 'E') {
    i++;
    const sign = input.charAt(i);
    if (sign === '-' || sign === '+') {
      i++;
    }
    if (atEndOfNumber()) {
      return repairTruncated();
    }
    if (!isDigit(input.charAt(i))) {
      return backtrack();
    }
    skipDigits();
  }

  // if we're not at the end of the number by this point, allow this to be parsed as another type
  if (!atEndOfNumber()) {
    return backtrack();
  }

  if (i <= start) {
    return false;
  }

  // repair a number with leading zeros like "00789"
  const num = input.substring(start, i);
  const hasInvalidLeadingZero = /^0\d/.test(num);
  output.push(hasInvalidLeadingZero ? `"${num}"` : num);
  return stack.update(Caret.afterValue);
}
/**
 * Parse the keywords true, false, null.
 * Repair the Python keywords True, False, None.
 *
 * @returns {*} The result of the first matching parseKeyword() call, or false.
 */
function parseKeywords() {
  const keywords = [
    ['true', 'true'],
    ['false', 'false'],
    ['null', 'null'],
    // repair Python keywords True, False, None
    ['True', 'true'],
    ['False', 'false'],
    ['None', 'null']
  ];

  let result = false;
  for (const [name, value] of keywords) {
    result = parseKeyword(name, value);
    if (result) {
      break;
    }
  }
  return result;
}
/**
 * Consume the keyword `name` when the input at the current position starts
 * with it, and write `value` to the output.
 *
 * @param {string} name - The keyword text to look for in the input.
 * @param {string} value - The text to emit in its place.
 * @returns {*} The result of stack.update(Caret.afterValue), or false when
 *   the keyword does not match.
 */
function parseKeyword(name, value) {
  const end = i + name.length;
  if (input.substring(i, end) !== name) {
    return false;
  }
  output.push(value);
  i += name.length;
  return stack.update(Caret.afterValue);
}
/**
 * Repair an unquoted object key by emitting it as a quoted JSON string.
 * Trailing whitespace is not included in the key.
 *
 * @returns {*} The result of stack.update(Caret.afterValue), or false when
 *   no key text was found.
 */
function parseUnquotedKey() {
  const delimiterIndex = findNextDelimiter(true, i);
  if (delimiterIndex === null) {
    return false;
  }

  // go back to prevent getting trailing whitespaces in the key
  let keyEnd = delimiterIndex;
  while (isWhitespace(input, keyEnd - 1) && keyEnd > i) {
    keyEnd--;
  }

  const key = input.substring(i, keyEnd);
  output.push(JSON.stringify(key));
  i = keyEnd;

  if (input.charAt(i) === '"') {
    // the start quote was missing but the end quote is present: skip the end quote
    i++;
  }

  // there is no state Caret.afterKey, therefore afterValue is used here
  return stack.update(Caret.afterValue);
}
/**
 * Find the index at which an unquoted symbol ends: the end of the input, an
 * unquoted-string delimiter, a quote, or (for keys only) a colon.
 *
 * Note that the symbol can end with whitespace: the scan stops at the next
 * delimiter. A slash is allowed inside the symbol to support repairing
 * regular expressions.
 *
 * @param {boolean} isKey - When truthy, a colon also ends the symbol.
 * @param {number} start - Input index at which to start scanning.
 * @returns {number|null} The end index when it lies beyond the current parse
 *   position, otherwise null.
 */
function findNextDelimiter(isKey, start) {
  const stopsAt = (pos) => {
    if (input.isEnd(pos)) {
      return true;
    }
    const c = input.charAt(pos);
    return isUnquotedStringDelimiter(c) || isQuote(c) || (isKey && c === ':');
  };

  let cursor = start;
  while (!stopsAt(cursor)) {
    cursor++;
  }

  if (cursor > i) {
    return cursor;
  }
  return null;
}
/**
 * Walk backwards from `start` over whitespace.
 *
 * @param {number} start - Input index at which to start.
 * @returns {number} The index of the first non-whitespace character at or
 *   before `start`, or 0 when the walk reaches the first index.
 */
function prevNonWhitespaceIndex(start) {
  let index = start;
  for (; index > 0 && isWhitespace(input, index); index--) {
    // step back over whitespace
  }
  return index;
}
/**
 * Test whether a number can end at the current position: at the end of the
 * input, at a delimiter, or at whitespace.
 *
 * @returns {*} A truthy value when a number can end here.
 */
function atEndOfNumber() {
  if (input.isEnd(i)) {
    return true;
  }
  return isDelimiter(input.charAt(i)) || isWhitespace(input, i);
}
/**
 * Repair a number that was cut off after a '.', '-' or 'e' by appending a
 * zero, which changes the number no more than needed to make it valid JSON.
 *
 * @param {number} start - Input index at which the number starts.
 */
function repairNumberEndingWithNumericSymbol(start) {
  const truncated = input.substring(start, i);
  output.push(`${truncated}0`);
}
/**
 * @param {string} char - The offending character, included in the message.
 * @throws {JSONRepairError} Always, with the current input position.
 */
function throwInvalidCharacter(char) {
  const message = `Invalid character ${JSON.stringify(char)}`;
  throw new JSONRepairError(message, i);
}
/**
 * @throws {JSONRepairError} Always, naming the character at the current
 *   input position.
 */
function throwUnexpectedCharacter() {
  const found = JSON.stringify(input.charAt(i));
  throw new JSONRepairError(`Unexpected character ${found}`, i);
}
/**
 * @throws {JSONRepairError} Always, with the current input position.
 */
function throwUnexpectedEnd() {
  const message = 'Unexpected end of json string';
  throw new JSONRepairError(message, i);
}
/**
 * @throws {JSONRepairError} Always, with the current input position.
 */
function throwObjectKeyExpected() {
  const message = 'Object key expected';
  throw new JSONRepairError(message, i);
}
/**
 * @throws {JSONRepairError} Always, with the current input position.
 */
function throwColonExpected() {
  const message = 'Colon expected';
  throw new JSONRepairError(message, i);
}
/**
 * @throws {JSONRepairError} Always, quoting the six characters that start at
 *   the current input position.
 */
function throwInvalidUnicodeCharacter() {
  const end = i + 6;
  const chars = input.substring(i, end);
  const message = `Invalid unicode character "${chars}"`;
  throw new JSONRepairError(message, i);
}
/**
 * Test whether a block comment terminator (star followed by slash) starts at
 * the given index.
 *
 * @param {number} i - Input index to test (shadows the parser position).
 * @returns {boolean} true when the terminator starts at index `i`.
 */
function atEndOfBlockComment(i) {
  if (input.charAt(i) !== '*') {
    return false;
  }
  return input.charAt(i + 1) === '/';
}
// Public streaming interface: `transform` takes a chunk of input,
// `flush` closes the input and writes out the remaining output.
return {
transform,
flush
};
}
//# sourceMappingURL=core.js.map