UNPKG

@toolsycc/json-repair

Version:

A robust utility to repair JSON strings - fix malformed or broken JSON, especially from LLM output like ChatGPT.

314 lines (313 loc) 12.2 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.extractJsonFromText = extractJsonFromText; exports.repairJson = repairJson; /** * Attempts to extract a valid JSON block (between braces or brackets) from a string */ function extractJsonFromText(input) { let cleaned = input.trim() .replace(/^```json\s*/i, '') .replace(/```$/, '') .trim(); const firstBrace = cleaned.indexOf('{'); const firstBracket = cleaned.indexOf('['); // We look for the first "{" or "[" to guess the beginning of the JSON const start = firstBrace >= 0 && (firstBrace < firstBracket || firstBracket === -1) ? firstBrace : firstBracket; // If not found, we return the input as is if (start === -1) return input; // We traverse the string from "start" until the braces/brackets are balanced const stack = []; let end = start; for (let i = start; i < cleaned.length; i++) { const char = cleaned[i]; if (char === '{' || char === '[') { stack.push(char); } else if (char === '}' || char === ']') { const last = stack.pop(); if (!last || (char === '}' && last !== '{') || (char === ']' && last !== '[')) { // Imbalance, we stop break; } // If the stack is empty, we've closed everything, we can stop if (stack.length === 0) { end = i + 1; break; } } } return cleaned.substring(start, end); } /** * Repairs any broken string fragments * (simple case: {foo: bar} on a single line). */ function fixBrokenStringValues(json) { return json .split('\n') .map((line) => { const start = line.indexOf('{'); const end = line.lastIndexOf('}'); if (start === -1 || end === -1 || start >= end || line.includes(',')) return line; const content = line.slice(start + 1, end); const colon = content.indexOf(':'); if (colon === -1) return line; const key = content.slice(0, colon).trim(); let value = content.slice(colon + 1).trim(); if (value.includes('"')) { if (!value.startsWith('"')) value = '"' + value; if (!value.endsWith('"')) value = value + '"'; // We escape unprotected inner quotes value = value.slice(1, -1).replace(/(?<!\\)"/g, '\\"'); value = '"' + value + '"'; } return `{${key}: ${value}}`; }) .join('\n'); } /** * Escapes unprotected quotes inside values already between quotes. */ function escapeInnerQuotesInStrings(json) { return json.replace(/"([^"\n\\]*?)"/g, (match, content) => { // We only modify if we find unescaped quotes if (/[^\\]"/.test(content)) { const safe = content.replace(/([^\\])"/g, '$1\\"'); return `"${safe}"`; } return match; }); } /** * Naive balancing of braces and brackets: * - Removes "excess" closing brackets if there was no corresponding opening bracket. * - Adds missing closing brackets at the end of the string if necessary. */ function balanceJsonBrackets(input) { let curlyCount = 0; // counting for { } let squareCount = 0; // counting for [ ] const chars = [...input]; for (let i = 0; i < chars.length; i++) { const c = chars[i]; if (c === '{') { curlyCount++; } else if (c === '}') { // We have a '}' when there's no '{' open => we remove it if (curlyCount > 0) curlyCount--; else chars[i] = ''; } else if (c === '[') { squareCount++; } else if (c === ']') { // Same principle for brackets if (squareCount > 0) squareCount--; else chars[i] = ''; } } // Add missing closing brackets if openCount > 0 let result = chars.join(''); if (curlyCount > 0) { result += '}'.repeat(curlyCount); } if (squareCount > 0) { result += ']'.repeat(squareCount); } return result; } /** * Replaces (in values between quotes) real \n, \r, \t * with their escaped versions \\n, \\r, \\t. * * This approach is naive: if someone had already written \n, it becomes \\n. * But for your specific test (and most cases), it's sufficient. */ function escapeControlCharsInsideStrings(json) { let inString = false; let escaped = false; const chars = [...json]; for (let i = 0; i < chars.length; i++) { const c = chars[i]; // Detects quotes that open/close a string if (c === '"' && !escaped) { inString = !inString; escaped = false; } else if (c === '\\' && !escaped) { // Backslash => the next character is escaped escaped = true; } else { if (inString) { // If we're inside a string, we replace real \n, \r, \t if (c === '\n') { chars[i] = '\\n'; } else if (c === '\r') { chars[i] = '\\r'; } else if (c === '\t') { chars[i] = '\\t'; } } escaped = false; } } return chars.join(''); } /** * Removes comments that contain a closing brace without a matching opening brace * This helps with cases like {name: Seb // test comments} */ function removeCommentsWithUnmatchedBraces(json) { // First, find all comment positions const commentMatches = [...json.matchAll(/\/\/.*/g)]; let result = json; // Process each comment from the end to avoid index shifting for (let i = commentMatches.length - 1; i >= 0; i--) { const match = commentMatches[i]; const commentStart = match.index; const commentText = match[0]; // Check if the comment contains a closing brace if (commentText.includes('}')) { // Find the position of the closing brace in the comment const braceInComment = commentText.indexOf('}'); // Keep the closing brace and everything after it const afterBrace = commentText.substring(braceInComment); // Replace the comment with just the closing brace and what follows result = result.substring(0, commentStart) + afterBrace + result.substring(commentStart + commentText.length); } } return result; } /** * Repairs a potentially incorrect JSON string by applying various * transformations (extraction, removal of trailing commas, quoting, etc.). * Returns either a JSON string (default) or a JS object (if `returnObject` is true). */ function repairJson(input, options = {}) { try { // First, we test if it's already parseable as is const parsed = JSON.parse(input); if (!options.returnObject) { let json = JSON.stringify(parsed); if (options.encodeAscii) { json = json.replace(/[^\x00-\x7F]/g, (c) => '\\u' + c.charCodeAt(0).toString(16).padStart(4, '0')); } return json; } return parsed; } catch (e) { if (options.logging) { console.warn('JSON.parse failed:', e); console.warn('Trying to repair JSON...'); } let fixed = input; // 1. Optional extraction if (options.extractJson) { fixed = extractJsonFromText(fixed); if (options.logging) console.log('Extracted JSON block:', fixed); } // 2. Handle comments with unmatched braces fixed = removeCommentsWithUnmatchedBraces(fixed); if (options.logging) console.log('Handled comments with unmatched braces:', fixed); // 3. Removal of remaining comments fixed = fixed.replace(/\/\/.*|\/\*[\s\S]*?\*\//g, ''); if (options.logging) console.log('Removed remaining comments:', fixed); // 4. Removal of trailing commas fixed = fixed.replace(/,\s*([\]}])/g, '$1'); if (options.logging) console.log('Removed trailing commas:', fixed); // 5. Adding quotes around unquoted keys fixed = fixed.replace(/([{,]\s*)([a-zA-Z_][a-zA-Z0-9_]*)(\s*:)/g, '$1"$2"$3'); if (options.logging) console.log('Quoted keys:', fixed); // 6. Conversion of single quotes to double quotes fixed = fixed.replace(/'([^']*)'/g, '"$1"'); if (options.logging) console.log('Converted single to double quotes:', fixed); // 7. Replacement of invalid literals (NaN, undefined, Infinity, -Infinity) fixed = fixed.replace(/\b(NaN|undefined|Infinity|-Infinity)\b/g, 'null'); if (options.logging) console.log('Replaced invalid literals:', fixed); // 8. Replacement of capitalized versions fixed = fixed.replace(/\bTrue\b/g, 'true') .replace(/\bTRUE\b/g, 'true') .replace(/\bFalse\b/g, 'false') // or just /false\b/i if you want to handle everything .replace(/\bFALSE\b/g, 'false') // or just /false\b/i if you want to handle everything .replace(/\bNull\b/g, 'null') .replace(/\bNULL\b/g, 'null'); if (options.logging) console.log('Replaced capitalized True/False/Null:', fixed); // 9. Quoting unquoted values (except true/false/null) fixed = fixed.replace(/:\s*(?!true\b|false\b|null\b)([a-zA-Z_][a-zA-Z0-9_]*)\s*([\}\],])/g, ': "$1"$2'); if (options.logging) console.log('Quoted unquoted string values:', fixed); // 10. Fix the case where we have : John", i.e. an unquoted word followed by a quote fixed = fixed.replace(/:\s*([a-zA-Z0-9_]+)"([\}\],])/g, ': "$1"$2'); if (options.logging) console.log('Fixed missing opening quote for values:', fixed); // 11. Fix any lines like {foo: bar} fixed = fixBrokenStringValues(fixed); if (options.logging) console.log('Fixed broken string values:', fixed); // 12. Escape unprotected quotes in strings fixed = escapeInnerQuotesInStrings(fixed); if (options.logging) console.log('Escaped inner unquoted double quotes:', fixed); // 13. Balancing of braces/brackets (if we consider that in safeMode we don't want to "overcorrect" too much, we can condition it) if (!options.safeMode) { fixed = balanceJsonBrackets(fixed); if (options.logging) console.log('Balanced brackets:', fixed); } // 14. Escape real line breaks, tabs, etc. in strings fixed = escapeControlCharsInsideStrings(fixed); if (options.logging) console.log('Escaped control chars:', fixed); // Last attempt try { const repaired = JSON.parse(fixed); if (!options.returnObject) { let json = JSON.stringify(repaired); if (options.encodeAscii) { json = json.replace(/[^\x00-\x7F]/g, (c) => '\\u' + c.charCodeAt(0).toString(16).padStart(4, '0')); } return json; } return repaired; } catch (err) { if (options.logging) { console.warn('[json-repair] Repair failed:', err); } const preview = input.slice(0, 100).replace(/\n/g, ' ').trim(); // short preview const baseMessage = `[json-repair] Failed to parse repaired JSON.`; const details = err instanceof Error ? err.message : 'Unknown parsing error'; const combined = `${baseMessage} ${details} Input was: "${preview}..."`; if (options.safeMode) { throw new Error(baseMessage); } else { throw new Error(combined); } } } }