UNPKG

jsonrepair

Version:

Repair broken JSON documents

github.com/josdejong/jsonrepair

josdejong/jsonrepair

904 lines (858 loc) • 30.2 kB

JavaScript

(function (global, factory) { typeof exports === 'object' && typeof module !== 'undefined' ? factory(exports) : typeof define === 'function' && define.amd ? define(['exports'], factory) : (global = typeof globalThis !== 'undefined' ? globalThis : global || self, factory(global.JSONRepair = {})); })(this, (function (exports) { 'use strict'; class JSONRepairError extends Error { constructor(message, position) { super(`${message} at position ${position}`); this.position = position; } } const codeSpace = 0x20; // " " const codeNewline = 0xa; // "\n" const codeTab = 0x9; // "\t" const codeReturn = 0xd; // "\r" const codeNonBreakingSpace = 0xa0; const codeEnQuad = 0x2000; const codeHairSpace = 0x200a; const codeNarrowNoBreakSpace = 0x202f; const codeMediumMathematicalSpace = 0x205f; const codeIdeographicSpace = 0x3000; function isHex(char) { return /^[0-9A-Fa-f]$/.test(char); } function isDigit(char) { return char >= '0' && char <= '9'; } function isValidStringCharacter(char) { // note that the valid range is between \u{0020} and \u{10ffff}, // but in JavaScript it is not possible to create a code point larger than // \u{10ffff}, so there is no need to test for that here. return char >= '\u0020'; } function isDelimiter(char) { return ',:[]/{}()\n+'.includes(char); } function isFunctionNameCharStart(char) { return char >= 'a' && char <= 'z' || char >= 'A' && char <= 'Z' || char === '_' || char === '$'; } function isFunctionNameChar(char) { return char >= 'a' && char <= 'z' || char >= 'A' && char <= 'Z' || char === '_' || char === '$' || char >= '0' && char <= '9'; } // matches "https://" and other schemas const regexUrlStart = /^(http|https|ftp|mailto|file|data|irc):\/\/$/; // matches all valid URL characters EXCEPT "[", "]", and ",", since that are important JSON delimiters const regexUrlChar = /^[A-Za-z0-9-._~:/?#@!$&'()*+;=]$/; function isUnquotedStringDelimiter(char) { return ',[]/{}\n+'.includes(char); } function isStartOfValue(char) { return isQuote(char) || regexStartOfValue.test(char); } // alpha, number, minus, or opening bracket or brace const regexStartOfValue = /^[[{\w-]$/; function isControlCharacter(char) { return char === '\n' || char === '\r' || char === '\t' || char === '\b' || char === '\f'; } /** * Check if the given character is a whitespace character like space, tab, or * newline */ function isWhitespace(text, index) { const code = text.charCodeAt(index); return code === codeSpace || code === codeNewline || code === codeTab || code === codeReturn; } /** * Check if the given character is a whitespace character like space or tab, * but NOT a newline */ function isWhitespaceExceptNewline(text, index) { const code = text.charCodeAt(index); return code === codeSpace || code === codeTab || code === codeReturn; } /** * Check if the given character is a special whitespace character, some * unicode variant */ function isSpecialWhitespace(text, index) { const code = text.charCodeAt(index); return code === codeNonBreakingSpace || code >= codeEnQuad && code <= codeHairSpace || code === codeNarrowNoBreakSpace || code === codeMediumMathematicalSpace || code === codeIdeographicSpace; } /** * Test whether the given character is a quote or double quote character. * Also tests for special variants of quotes. */ function isQuote(char) { // the first check double quotes, since that occurs most often return isDoubleQuoteLike(char) || isSingleQuoteLike(char); } /** * Test whether the given character is a double quote character. * Also tests for special variants of double quotes. */ function isDoubleQuoteLike(char) { return char === '"' || char === '\u201c' || char === '\u201d'; } /** * Test whether the given character is a double quote character. * Does NOT test for special variants of double quotes. */ function isDoubleQuote(char) { return char === '"'; } /** * Test whether the given character is a single quote character. * Also tests for special variants of single quotes. */ function isSingleQuoteLike(char) { return char === "'" || char === '\u2018' || char === '\u2019' || char === '\u0060' || char === '\u00b4'; } /** * Test whether the given character is a single quote character. * Does NOT test for special variants of single quotes. */ function isSingleQuote(char) { return char === "'"; } /** * Strip last occurrence of textToStrip from text */ function stripLastOccurrence(text, textToStrip) { let stripRemainingText = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : false; const index = text.lastIndexOf(textToStrip); return index !== -1 ? text.substring(0, index) + (stripRemainingText ? '' : text.substring(index + 1)) : text; } function insertBeforeLastWhitespace(text, textToInsert) { let index = text.length; if (!isWhitespace(text, index - 1)) { // no trailing whitespaces return text + textToInsert; } while (isWhitespace(text, index - 1)) { index--; } return text.substring(0, index) + textToInsert + text.substring(index); } function removeAtIndex(text, start, count) { return text.substring(0, start) + text.substring(start + count); } /** * Test whether a string ends with a newline or comma character and optional whitespace */ function endsWithCommaOrNewline(text) { return /[,\n][ \t\r]*$/.test(text); } const controlCharacters = { '\b': '\\b', '\f': '\\f', '\n': '\\n', '\r': '\\r', '\t': '\\t' }; // map with all escape characters const escapeCharacters = { '"': '"', '\\': '\\', '/': '/', b: '\b', f: '\f', n: '\n', r: '\r', t: '\t' // note that \u is handled separately in parseString() }; /** * Repair a string containing an invalid JSON document. * For example changes JavaScript notation into JSON notation. * * Example: * * try { * const json = "{name: 'John'}" * const repaired = jsonrepair(json) * console.log(repaired) * // '{"name": "John"}' * } catch (err) { * console.error(err) * } * */ function jsonrepair(text) { let i = 0; // current index in text let output = ''; // generated output parseMarkdownCodeBlock(['```', '[```', '{```']); const processed = parseValue(); if (!processed) { throwUnexpectedEnd(); } parseMarkdownCodeBlock(['```', '```]', '```}']); const processedComma = parseCharacter(','); if (processedComma) { parseWhitespaceAndSkipComments(); } if (isStartOfValue(text[i]) && endsWithCommaOrNewline(output)) { // start of a new value after end of the root level object: looks like // newline delimited JSON -> turn into a root level array if (!processedComma) { // repair missing comma output = insertBeforeLastWhitespace(output, ','); } parseNewlineDelimitedJSON(); } else if (processedComma) { // repair: remove trailing comma output = stripLastOccurrence(output, ','); } // repair redundant end quotes while (text[i] === '}' || text[i] === ']') { i++; parseWhitespaceAndSkipComments(); } if (i >= text.length) { // reached the end of the document properly return output; } throwUnexpectedCharacter(); function parseValue() { parseWhitespaceAndSkipComments(); const processed = parseObject() || parseArray() || parseString() || parseNumber() || parseKeywords() || parseUnquotedString(false) || parseRegex(); parseWhitespaceAndSkipComments(); return processed; } function parseWhitespaceAndSkipComments() { let skipNewline = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : true; const start = i; let changed = parseWhitespace(skipNewline); do { changed = parseComment(); if (changed) { changed = parseWhitespace(skipNewline); } } while (changed); return i > start; } function parseWhitespace(skipNewline) { const _isWhiteSpace = skipNewline ? isWhitespace : isWhitespaceExceptNewline; let whitespace = ''; while (true) { if (_isWhiteSpace(text, i)) { whitespace += text[i]; i++; } else if (isSpecialWhitespace(text, i)) { // repair special whitespace whitespace += ' '; i++; } else { break; } } if (whitespace.length > 0) { output += whitespace; return true; } return false; } function parseComment() { // find a block comment '/* ... */' if (text[i] === '/' && text[i + 1] === '*') { // repair block comment by skipping it while (i < text.length && !atEndOfBlockComment(text, i)) { i++; } i += 2; return true; } // find a line comment '// ...' if (text[i] === '/' && text[i + 1] === '/') { // repair line comment by skipping it while (i < text.length && text[i] !== '\n') { i++; } return true; } return false; } function parseMarkdownCodeBlock(blocks) { // find and skip over a Markdown fenced code block: // ``` ... ``` // or // ```json ... ``` if (skipMarkdownCodeBlock(blocks)) { if (isFunctionNameCharStart(text[i])) { // strip the optional language specifier like "json" while (i < text.length && isFunctionNameChar(text[i])) { i++; } } parseWhitespaceAndSkipComments(); return true; } return false; } function skipMarkdownCodeBlock(blocks) { parseWhitespace(true); for (const block of blocks) { const end = i + block.length; if (text.slice(i, end) === block) { i = end; return true; } } return false; } function parseCharacter(char) { if (text[i] === char) { output += text[i]; i++; return true; } return false; } function skipCharacter(char) { if (text[i] === char) { i++; return true; } return false; } function skipEscapeCharacter() { return skipCharacter('\\'); } /** * Skip ellipsis like "[1,2,3,...]" or "[1,2,3,...,9]" or "[...,7,8,9]" * or a similar construct in objects. */ function skipEllipsis() { parseWhitespaceAndSkipComments(); if (text[i] === '.' && text[i + 1] === '.' && text[i + 2] === '.') { // repair: remove the ellipsis (three dots) and optionally a comma i += 3; parseWhitespaceAndSkipComments(); skipCharacter(','); return true; } return false; } /** * Parse an object like '{"key": "value"}' */ function parseObject() { if (text[i] === '{') { output += '{'; i++; parseWhitespaceAndSkipComments(); // repair: skip leading comma like in {, message: "hi"} if (skipCharacter(',')) { parseWhitespaceAndSkipComments(); } let initial = true; while (i < text.length && text[i] !== '}') { let processedComma; if (!initial) { processedComma = parseCharacter(','); if (!processedComma) { // repair missing comma output = insertBeforeLastWhitespace(output, ','); } parseWhitespaceAndSkipComments(); } else { processedComma = true; initial = false; } skipEllipsis(); const processedKey = parseString() || parseUnquotedString(true); if (!processedKey) { if (text[i] === '}' || text[i] === '{' || text[i] === ']' || text[i] === '[' || text[i] === undefined) { // repair trailing comma output = stripLastOccurrence(output, ','); } else { throwObjectKeyExpected(); } break; } parseWhitespaceAndSkipComments(); const processedColon = parseCharacter(':'); const truncatedText = i >= text.length; if (!processedColon) { if (isStartOfValue(text[i]) || truncatedText) { // repair missing colon output = insertBeforeLastWhitespace(output, ':'); } else { throwColonExpected(); } } const processedValue = parseValue(); if (!processedValue) { if (processedColon || truncatedText) { // repair missing object value output += 'null'; } else { throwColonExpected(); } } } if (text[i] === '}') { output += '}'; i++; } else { // repair missing end bracket output = insertBeforeLastWhitespace(output, '}'); } return true; } return false; } /** * Parse an array like '["item1", "item2", ...]' */ function parseArray() { if (text[i] === '[') { output += '['; i++; parseWhitespaceAndSkipComments(); // repair: skip leading comma like in [,1,2,3] if (skipCharacter(',')) { parseWhitespaceAndSkipComments(); } let initial = true; while (i < text.length && text[i] !== ']') { if (!initial) { const processedComma = parseCharacter(','); if (!processedComma) { // repair missing comma output = insertBeforeLastWhitespace(output, ','); } } else { initial = false; } skipEllipsis(); const processedValue = parseValue(); if (!processedValue) { // repair trailing comma output = stripLastOccurrence(output, ','); break; } } if (text[i] === ']') { output += ']'; i++; } else { // repair missing closing array bracket output = insertBeforeLastWhitespace(output, ']'); } return true; } return false; } /** * Parse and repair Newline Delimited JSON (NDJSON): * multiple JSON objects separated by a newline character */ function parseNewlineDelimitedJSON() { // repair NDJSON let initial = true; let processedValue = true; while (processedValue) { if (!initial) { // parse optional comma, insert when missing const processedComma = parseCharacter(','); if (!processedComma) { // repair: add missing comma output = insertBeforeLastWhitespace(output, ','); } } else { initial = false; } processedValue = parseValue(); } if (!processedValue) { // repair: remove trailing comma output = stripLastOccurrence(output, ','); } // repair: wrap the output inside array brackets output = `[\n${output}\n]`; } /** * Parse a string enclosed by double quotes "...". Can contain escaped quotes * Repair strings enclosed in single quotes or special quotes * Repair an escaped string * * The function can run in two stages: * - First, it assumes the string has a valid end quote * - If it turns out that the string does not have a valid end quote followed * by a delimiter (which should be the case), the function runs again in a * more conservative way, stopping the string at the first next delimiter * and fixing the string by inserting a quote there, or stopping at a * stop index detected in the first iteration. */ function parseString() { let stopAtDelimiter = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : false; let stopAtIndex = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : -1; let skipEscapeChars = text[i] === '\\'; if (skipEscapeChars) { // repair: remove the first escape character i++; skipEscapeChars = true; } if (isQuote(text[i])) { // double quotes are correct JSON, // single quotes come from JavaScript for example, we assume it will have a correct single end quote too // otherwise, we will match any double-quote-like start with a double-quote-like end, // or any single-quote-like start with a single-quote-like end const isEndQuote = isDoubleQuote(text[i]) ? isDoubleQuote : isSingleQuote(text[i]) ? isSingleQuote : isSingleQuoteLike(text[i]) ? isSingleQuoteLike : isDoubleQuoteLike; const iBefore = i; const oBefore = output.length; let str = '"'; i++; while (true) { if (i >= text.length) { // end of text, we are missing an end quote const iPrev = prevNonWhitespaceIndex(i - 1); if (!stopAtDelimiter && isDelimiter(text.charAt(iPrev))) { // if the text ends with a delimiter, like ["hello], // so the missing end quote should be inserted before this delimiter // retry parsing the string, stopping at the first next delimiter i = iBefore; output = output.substring(0, oBefore); return parseString(true); } // repair missing quote str = insertBeforeLastWhitespace(str, '"'); output += str; return true; } if (i === stopAtIndex) { // use the stop index detected in the first iteration, and repair end quote str = insertBeforeLastWhitespace(str, '"'); output += str; return true; } if (isEndQuote(text[i])) { // end quote // let us check what is before and after the quote to verify whether this is a legit end quote const iQuote = i; const oQuote = str.length; str += '"'; i++; output += str; parseWhitespaceAndSkipComments(false); if (stopAtDelimiter || i >= text.length || isDelimiter(text[i]) || isQuote(text[i]) || isDigit(text[i])) { // The quote is followed by the end of the text, a delimiter, // or a next value. So the quote is indeed the end of the string. parseConcatenatedString(); return true; } const iPrevChar = prevNonWhitespaceIndex(iQuote - 1); const prevChar = text.charAt(iPrevChar); if (prevChar === ',') { // A comma followed by a quote, like '{"a":"b,c,"d":"e"}'. // We assume that the quote is a start quote, and that the end quote // should have been located right before the comma but is missing. i = iBefore; output = output.substring(0, oBefore); return parseString(false, iPrevChar); } if (isDelimiter(prevChar)) { // This is not the right end quote: it is preceded by a delimiter, // and NOT followed by a delimiter. So, there is an end quote missing // parse the string again and then stop at the first next delimiter i = iBefore; output = output.substring(0, oBefore); return parseString(true); } // revert to right after the quote but before any whitespace, and continue parsing the string output = output.substring(0, oBefore); i = iQuote + 1; // repair unescaped quote str = `${str.substring(0, oQuote)}\\${str.substring(oQuote)}`; } else if (stopAtDelimiter && isUnquotedStringDelimiter(text[i])) { // we're in the mode to stop the string at the first delimiter // because there is an end quote missing // test start of an url like "https://..." (this would be parsed as a comment) if (text[i - 1] === ':' && regexUrlStart.test(text.substring(iBefore + 1, i + 2))) { while (i < text.length && regexUrlChar.test(text[i])) { str += text[i]; i++; } } // repair missing quote str = insertBeforeLastWhitespace(str, '"'); output += str; parseConcatenatedString(); return true; } else if (text[i] === '\\') { // handle escaped content like \n or \u2605 const char = text.charAt(i + 1); const escapeChar = escapeCharacters[char]; if (escapeChar !== undefined) { str += text.slice(i, i + 2); i += 2; } else if (char === 'u') { let j = 2; while (j < 6 && isHex(text[i + j])) { j++; } if (j === 6) { str += text.slice(i, i + 6); i += 6; } else if (i + j >= text.length) { // repair invalid or truncated unicode char at the end of the text // by removing the unicode char and ending the string here i = text.length; } else { throwInvalidUnicodeCharacter(); } } else { // repair invalid escape character: remove it str += char; i += 2; } } else { // handle regular characters const char = text.charAt(i); if (char === '"' && text[i - 1] !== '\\') { // repair unescaped double quote str += `\\${char}`; i++; } else if (isControlCharacter(char)) { // unescaped control character str += controlCharacters[char]; i++; } else { if (!isValidStringCharacter(char)) { throwInvalidCharacter(char); } str += char; i++; } } if (skipEscapeChars) { // repair: skipped escape character (nothing to do) skipEscapeCharacter(); } } } return false; } /** * Repair concatenated strings like "hello" + "world", change this into "helloworld" */ function parseConcatenatedString() { let processed = false; parseWhitespaceAndSkipComments(); while (text[i] === '+') { processed = true; i++; parseWhitespaceAndSkipComments(); // repair: remove the end quote of the first string output = stripLastOccurrence(output, '"', true); const start = output.length; const parsedStr = parseString(); if (parsedStr) { // repair: remove the start quote of the second string output = removeAtIndex(output, start, 1); } else { // repair: remove the + because it is not followed by a string output = insertBeforeLastWhitespace(output, '"'); } } return processed; } /** * Parse a number like 2.4 or 2.4e6 */ function parseNumber() { const start = i; if (text[i] === '-') { i++; if (atEndOfNumber()) { repairNumberEndingWithNumericSymbol(start); return true; } if (!isDigit(text[i])) { i = start; return false; } } // Note that in JSON leading zeros like "00789" are not allowed. // We will allow all leading zeros here though and at the end of parseNumber // check against trailing zeros and repair that if needed. // Leading zeros can have meaning, so we should not clear them. while (isDigit(text[i])) { i++; } if (text[i] === '.') { i++; if (atEndOfNumber()) { repairNumberEndingWithNumericSymbol(start); return true; } if (!isDigit(text[i])) { i = start; return false; } while (isDigit(text[i])) { i++; } } if (text[i] === 'e' || text[i] === 'E') { i++; if (text[i] === '-' || text[i] === '+') { i++; } if (atEndOfNumber()) { repairNumberEndingWithNumericSymbol(start); return true; } if (!isDigit(text[i])) { i = start; return false; } while (isDigit(text[i])) { i++; } } // if we're not at the end of the number by this point, allow this to be parsed as another type if (!atEndOfNumber()) { i = start; return false; } if (i > start) { // repair a number with leading zeros like "00789" const num = text.slice(start, i); const hasInvalidLeadingZero = /^0\d/.test(num); output += hasInvalidLeadingZero ? `"${num}"` : num; return true; } return false; } /** * Parse keywords true, false, null * Repair Python keywords True, False, None */ function parseKeywords() { return parseKeyword('true', 'true') || parseKeyword('false', 'false') || parseKeyword('null', 'null') || // repair Python keywords True, False, None parseKeyword('True', 'true') || parseKeyword('False', 'false') || parseKeyword('None', 'null'); } function parseKeyword(name, value) { if (text.slice(i, i + name.length) === name) { output += value; i += name.length; return true; } return false; } /** * Repair an unquoted string by adding quotes around it * Repair a MongoDB function call like NumberLong("2") * Repair a JSONP function call like callback({...}); */ function parseUnquotedString(isKey) { // note that the symbol can end with whitespaces: we stop at the next delimiter // also, note that we allow strings to contain a slash / in order to support repairing regular expressions const start = i; if (isFunctionNameCharStart(text[i])) { while (i < text.length && isFunctionNameChar(text[i])) { i++; } let j = i; while (isWhitespace(text, j)) { j++; } if (text[j] === '(') { // repair a MongoDB function call like NumberLong("2") // repair a JSONP function call like callback({...}); i = j + 1; parseValue(); if (text[i] === ')') { // repair: skip close bracket of function call i++; if (text[i] === ';') { // repair: skip semicolon after JSONP call i++; } } return true; } } while (i < text.length && !isUnquotedStringDelimiter(text[i]) && !isQuote(text[i]) && (!isKey || text[i] !== ':')) { i++; } // test start of an url like "https://..." (this would be parsed as a comment) if (text[i - 1] === ':' && regexUrlStart.test(text.substring(start, i + 2))) { while (i < text.length && regexUrlChar.test(text[i])) { i++; } } if (i > start) { // repair unquoted string // also, repair undefined into null // first, go back to prevent getting trailing whitespaces in the string while (isWhitespace(text, i - 1) && i > 0) { i--; } const symbol = text.slice(start, i); output += symbol === 'undefined' ? 'null' : JSON.stringify(symbol); if (text[i] === '"') { // we had a missing start quote, but now we encountered the end quote, so we can skip that one i++; } return true; } } function parseRegex() { if (text[i] === '/') { const start = i; i++; while (i < text.length && (text[i] !== '/' || text[i - 1] === '\\')) { i++; } i++; output += `"${text.substring(start, i)}"`; return true; } } function prevNonWhitespaceIndex(start) { let prev = start; while (prev > 0 && isWhitespace(text, prev)) { prev--; } return prev; } function atEndOfNumber() { return i >= text.length || isDelimiter(text[i]) || isWhitespace(text, i); } function repairNumberEndingWithNumericSymbol(start) { // repair numbers cut off at the end // this will only be called when we end after a '.', '-', or 'e' and does not // change the number more than it needs to make it valid JSON output += `${text.slice(start, i)}0`; } function throwInvalidCharacter(char) { throw new JSONRepairError(`Invalid character ${JSON.stringify(char)}`, i); } function throwUnexpectedCharacter() { throw new JSONRepairError(`Unexpected character ${JSON.stringify(text[i])}`, i); } function throwUnexpectedEnd() { throw new JSONRepairError('Unexpected end of json string', text.length); } function throwObjectKeyExpected() { throw new JSONRepairError('Object key expected', i); } function throwColonExpected() { throw new JSONRepairError('Colon expected', i); } function throwInvalidUnicodeCharacter() { const chars = text.slice(i, i + 6); throw new JSONRepairError(`Invalid unicode character "${chars}"`, i); } } function atEndOfBlockComment(text, i) { return text[i] === '*' && text[i + 1] === '/'; } exports.JSONRepairError = JSONRepairError; exports.jsonrepair = jsonrepair; })); //# sourceMappingURL=jsonrepair.js.map