UNPKG

llm-json-fix

Version:

Fix malformed JSON outputs from Large Language Models (LLMs)

445 lines (444 loc) 16.2 kB
import { UnrepairableJSONError } from '../utils/errors';
import { applyLLMSpecificFixes } from '../utils/llmPatterns';
import { formatPosition, isWhitespace, isLineTerminator, isDigit, isAlpha } from '../utils/stringUtils';

/**
 * Fix malformed JSON from LLM outputs.
 *
 * Runs the optional LLM-specific pre-processing pass (`applyLLMSpecificFixes`)
 * and then hands the result to {@link jsonrepair}.
 *
 * @param {string} text Potentially broken JSON text
 * @param {object} [options] Configuration options
 * @param {boolean} [options.applyModelSpecificFixes=true] Run `applyLLMSpecificFixes` first
 * @param {string} [options.model='general'] Accepted for API consistency; not used here
 * @param {boolean} [options.preserveComments=false] Forwarded to `jsonrepair`
 * @param {boolean} [options.verbose=false] Forwarded to `jsonrepair`
 * @returns {string} Repaired JSON string
 * @throws {UnrepairableJSONError} If the JSON cannot be repaired
 */
export function fixLLMJson(text, options = {}) {
  const {
    applyModelSpecificFixes = true,
    // Keep model in destructuring for API consistency even though unused
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    model = 'general',
    preserveComments = false,
    verbose = false
  } = options;

  // Apply LLM-specific fixes first, if enabled.
  // Note: when applying model-specific fixes, comments are always stripped
  // because they're likely part of the LLM's explanatory text. User-specified
  // comments are preserved in the JSON repair step if requested.
  const result = applyModelSpecificFixes ? applyLLMSpecificFixes(text) : text;

  // Now apply standard JSON repair logic
  return jsonrepair(result, { preserveComments, verbose });
}

/**
 * Repair invalid JSON documents.
 * Core implementation based on jsonrepair library with enhancements.
 *
 * The input is scanned once, character by character. Recognised tokens are
 * copied (or normalised) into the output; unrecognised characters are dropped.
 * Structures and strings still open at the end of the input are closed.
 *
 * @param {string} text The JSON document containing errors
 * @param {object} [options] Repair options
 * @param {boolean} [options.preserveComments=false] Copy `//` and block comments to the output instead of dropping them
 * @param {boolean} [options.verbose=false] Record a description of every repair
 * @returns {string} Repaired JSON as string (`''` for empty input)
 * @throws {UnrepairableJSONError} If the scanner revisits an index (loop guard)
 */
export function jsonrepair(text, options = {}) {
  if (text === '') {
    return '';
  }

  const { preserveComments = false, verbose = false } = options;

  // JSON number grammar: no leading '+', no leading zeros, digits on both
  // sides of the decimal point.
  const JSON_NUMBER = /^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$/;

  // Bare words that map to JSON literals, matched case-insensitively on the
  // WHOLE word (so `nullable` or `nonexistent` are not mistaken for literals).
  const LITERALS = new Map([
    ['true', { json: 'true', message: 'Normalized to lowercase true' }],
    ['false', { json: 'false', message: 'Normalized to lowercase false' }],
    ['null', { json: 'null', message: 'Normalized to lowercase null' }],
    ['none', { json: 'null', message: 'Converted Python None to null' }]
  ]);

  let i = 0; // current index in text
  let output = ''; // generated output
  const processedIndices = new Set(); // indices already visited by the main loop (loop guard)
  let indentation = 0; // current nesting depth, used when re-emitting newlines
  const stack = []; // 'object' | 'array' for every structure currently open
  let inString = false; // true while consuming the body of an unterminated string

  // NOTE(review): entries are collected when `verbose` is set but are not
  // returned or logged anywhere in this function.
  const changes = [];

  function trackChange(message) {
    if (verbose) {
      changes.push({ index: i, message });
    }
  }

  /**
   * Get the next character after `i` that is not whitespace, or `null` when
   * only whitespace remains.
   */
  function nextNonWhitespaceCharacter() {
    let j = i + 1;
    while (j < text.length && isWhitespace(text[j])) {
      j++;
    }
    return j < text.length ? text[j] : null;
  }

  /** Characters accepted inside an unquoted word / property name. */
  function isWordCharacter(char) {
    return isAlpha(char) || isDigit(char) || char === '_';
  }

  /**
   * Convert a quoted string (quotes included) into a JSON string with double
   * quotes. Existing escape sequences are kept as they are, except `\'`, which
   * is not a legal JSON escape and becomes a plain `'`. Unescaped double
   * quotes inside the content are escaped.
   */
  function normalizeString(str) {
    // Remove the first and last quote
    const content = str.slice(1, -1);
    let normalized = '"';

    for (let j = 0; j < content.length; j++) {
      const char = content[j];

      if (char === '\\') {
        const escaped = content[j + 1];
        if (escaped === undefined) {
          // Lone trailing backslash: emit it as an escaped backslash
          normalized += '\\\\';
        } else if (escaped === "'") {
          normalized += "'";
        } else {
          normalized += `\\${escaped}`;
        }
        j++; // the escaped character has been consumed as well
        continue;
      }

      normalized += char === '"' ? '\\"' : char;
    }

    return `${normalized}"`;
  }

  // Process characters one by one
  while (i < text.length) {
    // Prevent infinite loops by tracking processed indices
    if (processedIndices.has(i)) {
      throw new UnrepairableJSONError(`Infinite loop detected at ${formatPosition(text, i)}`, i);
    }
    processedIndices.add(i);

    const char = text[i];

    if (inString) {
      // We're inside a string whose closing quote was not found up front
      if (char === '\\') {
        if (i + 1 < text.length) {
          // Just include the escape and the next character
          output += char + text[i + 1];
          i += 2;
        } else {
          // Dangling backslash at the very end: drop it and close the string
          output += '"';
          trackChange('Added missing closing double quote at end of text');
          inString = false;
          i++;
        }
        continue;
      }

      if (char === '"') {
        inString = false; // end of string
      }
      output += char;
      i++;
      continue;
    }

    // Handle whitespace between tokens
    if (isWhitespace(char)) {
      i++;
      // Include newlines in the output, but skip other whitespace
      if (isLineTerminator(char)) {
        output += indentation > 0 ? `\n${' '.repeat(indentation)}` : '\n';
      }
      continue;
    }

    // Handle comments
    if (char === '/' && i + 1 < text.length) {
      const nextChar = text[i + 1];

      if (nextChar === '/') {
        // Single-line comment
        if (preserveComments) {
          // Keep the comment in the output
          while (i < text.length && !isLineTerminator(text[i])) {
            output += text[i];
            i++;
          }
          trackChange('Preserved single-line comment');
        } else {
          // Remove the comment
          trackChange('Removing single-line comment');
          i += 2;
          while (i < text.length && !isLineTerminator(text[i])) {
            i++;
          }
        }
        continue;
      }

      if (nextChar === '*') {
        // Multi-line comment
        if (preserveComments) {
          // Keep the multi-line comment in the output
          output += '/*';
          i += 2;
          while (i + 1 < text.length && !(text[i] === '*' && text[i + 1] === '/')) {
            output += text[i];
            i++;
          }
          if (i + 1 < text.length) {
            output += '*/';
            i += 2;
          }
          trackChange('Preserved multi-line comment');
        } else {
          // Remove the comment
          trackChange('Removing multi-line comment');
          i += 2;
          while (i + 1 < text.length && !(text[i] === '*' && text[i + 1] === '/')) {
            i++;
          }
          i += 2;
        }
        continue;
      }
    }

    if (char === '{' || char === '[') {
      // Start of an object or array
      stack.push(char === '{' ? 'object' : 'array');
      output += char;
      indentation++;
      i++;
      continue;
    }

    if (char === '}' || char === ']') {
      // End of an object or array
      if (stack.length === 0) {
        // Unmatched closing bracket, remove it
        trackChange(`Removing unmatched closing ${char === '}' ? 'curly brace' : 'square bracket'}`);
        i++;
        continue;
      }

      const currentStructure = stack.pop();
      const expectedClosing = currentStructure === 'object' ? '}' : ']';
      if (char !== expectedClosing) {
        // Mismatched closing bracket
        trackChange(`Replacing ${char} with ${expectedClosing}`);
      }
      output += expectedClosing;
      indentation--;
      i++;
      continue;
    }

    if (char === ',') {
      // A comma followed only by a closing bracket, or by nothing at all, is
      // a trailing comma and must not reach the output.
      const next = nextNonWhitespaceCharacter();
      if (next === '}' || next === ']' || next === null) {
        trackChange('Removing trailing comma');
      } else {
        output += char;
      }
      i++;
      continue;
    }

    if (char === '"' || char === "'") {
      // Start of a string
      const stringStartIndex = i;
      const quoteType = char;
      i++; // i now points at the first character of the string content

      // Find the end of the string
      let endIndex = i;
      let isEscaped = false;
      while (endIndex < text.length) {
        if (text[endIndex] === '\\') {
          // Toggle so that an escaped backslash does not escape what follows
          isEscaped = !isEscaped;
          endIndex++;
          continue;
        }
        if (text[endIndex] === quoteType && !isEscaped) {
          // Found the end of the string
          break;
        }
        isEscaped = false;
        endIndex++;
      }

      if (endIndex < text.length) {
        // Complete string found
        const stringContent = text.substring(stringStartIndex, endIndex + 1);
        if (quoteType === "'") {
          // Convert to double quotes
          output += normalizeString(stringContent);
          trackChange('Converted single quotes to double quotes in string');
        } else {
          output += stringContent;
        }
        i = endIndex + 1;
        continue;
      }

      // Unclosed string: emit the opening quote and let the `inString` branch
      // consume the content. `i` already points at the first content
      // character, so it must not be advanced again here.
      inString = true;
      if (quoteType === "'") {
        trackChange('Converted single quote to double quote and treating as unclosed string');
      }
      output += '"';
      continue;
    }

    if (char === ':') {
      // Property separator in an object
      output += char;
      i++;
      continue;
    }

    if (isDigit(char) || char === '-' || char === '+' || char === '.') {
      // Number: greedily collect everything that can be part of one
      let numberStr = '';
      while (
        i < text.length &&
        (isDigit(text[i]) ||
          text[i] === '-' ||
          text[i] === '+' ||
          text[i] === '.' ||
          text[i] === 'e' ||
          text[i] === 'E')
      ) {
        numberStr += text[i];
        i++;
      }

      if (JSON_NUMBER.test(numberStr)) {
        output += numberStr;
      } else {
        // Invalid number format (leading '+', leading zeros, '.5', '1.', ...):
        // re-serialise whatever JavaScript can parse from it.
        trackChange('Fixing invalid number format');
        const parsed = Number.parseFloat(numberStr);
        if (Number.isNaN(parsed)) {
          output += '0';
          trackChange('Replaced invalid number with 0');
        } else {
          output += JSON.stringify(parsed);
        }
      }
      continue;
    }

    // Handle bare words: unquoted property names and literals
    if (isAlpha(char)) {
      let wordEnd = i;
      while (wordEnd < text.length && isWordCharacter(text[wordEnd])) {
        wordEnd++;
      }
      const word = text.slice(i, wordEnd);

      // Unquoted property names (common in JavaScript objects). This is
      // checked before the literals so that keys such as `nullable` or `true`
      // are quoted rather than rewritten.
      if (stack[stack.length - 1] === 'object') {
        let afterWord = wordEnd;
        while (afterWord < text.length && isWhitespace(text[afterWord])) {
          afterWord++;
        }
        if (text[afterWord] === ':') {
          i = afterWord + 1;
          output += `"${word}":`;
          trackChange('Added quotes around property name');
          continue;
        }
      }

      // Literals like true, false, null or Python constants True, False, None
      const literal = LITERALS.get(word.toLowerCase());
      i = wordEnd;
      if (literal !== undefined) {
        output += literal.json;
        trackChange(literal.message);
      } else {
        // Unrecognized token: drop the whole word
        trackChange('Skipping unrecognized token');
      }
      continue;
    }

    // Handle other characters
    if (char === '=') {
      // Sometimes used instead of colon
      const next = nextNonWhitespaceCharacter();
      if (next !== '=') {
        // Single equals, likely a mistake for a colon
        output += ':';
        trackChange('Replaced = with :');
        i++;
        continue;
      }
    }

    // Skip any other characters we don't recognize
    i++;
  }

  // A string that was still open when the input ended needs its closing quote
  // before any enclosing structure is closed.
  if (inString) {
    output += '"';
    trackChange('Added missing closing double quote at end of text');
  }

  // Handle unclosed structures
  while (stack.length > 0) {
    const currentStructure = stack.pop();
    const closingChar = currentStructure === 'object' ? '}' : ']';
    output += closingChar;
    trackChange(`Added missing closing ${closingChar}`);
  }

  return output;
}