llm-json-fix
Version:
Fix malformed JSON outputs from Large Language Models (LLMs)
449 lines (448 loc) • 16.5 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.fixLLMJson = fixLLMJson;
exports.jsonrepair = jsonrepair;
const errors_1 = require("../utils/errors");
const llmPatterns_1 = require("../utils/llmPatterns");
const stringUtils_1 = require("../utils/stringUtils");
/**
 * Entry point for repairing JSON emitted by an LLM.
 *
 * The text is optionally run through `applyLLMSpecificFixes` first and is then
 * handed to `jsonrepair` together with the `preserveComments` and `verbose`
 * settings.
 *
 * @param text Potentially broken JSON text
 * @param options Configuration options: `applyModelSpecificFixes` (default
 *   true), `model` (default 'general', accepted but not used here),
 *   `preserveComments` (default false) and `verbose` (default false)
 * @returns Repaired JSON string
 * @throws {UnrepairableJSONError} If the JSON cannot be repaired
 */
function fixLLMJson(text, options = {}) {
    const {
        applyModelSpecificFixes: runLLMFixes = true,
        // `model` stays in the destructuring for API consistency even though
        // nothing below reads it.
        // eslint-disable-next-line @typescript-eslint/no-unused-vars
        model: unusedModel = 'general',
        preserveComments: keepComments = false,
        verbose: logChanges = false,
    } = options;
    // The LLM-specific pass runs before the generic repair and is skipped
    // entirely when disabled, in which case the raw text is repaired as-is.
    const candidate = runLLMFixes
        ? (0, llmPatterns_1.applyLLMSpecificFixes)(text)
        : text;
    return jsonrepair(candidate, {
        preserveComments: keepComments,
        verbose: logChanges,
    });
}
/**
 * Repair an invalid JSON document in a single forward pass over the text.
 *
 * Repairs performed:
 * - strips (or, with `preserveComments`, keeps) `//` and block comments
 * - converts single-quoted strings to double-quoted JSON strings
 * - closes a string that is still open at the end of the input
 * - drops trailing commas before `}` / `]` or at the end of the input
 * - replaces a mismatched closing bracket, drops unmatched ones and appends
 *   the closers for structures that are still open
 * - normalizes `true` / `false` / `null` in any letter case and Python `None`
 * - quotes bare property names inside objects
 * - rewrites malformed numbers (`+5`, `.5`, `007`, `1.2.3`, ...)
 * - turns a single `=` into `:`
 *
 * Whitespace between tokens is dropped except for line terminators, which are
 * emitted as a newline followed by one space per open structure.
 *
 * @param text The JSON document containing errors
 * @param options Repair options: `preserveComments` (default false) keeps
 *   comments in the output; `verbose` (default false) records a description of
 *   each repair internally
 * @returns Repaired JSON as string (the empty string is returned unchanged)
 * @throws {UnrepairableJSONError} If the scanner revisits an input position,
 *   which would otherwise loop forever
 */
function jsonrepair(text, options = {}) {
    if (text === '') {
        return '';
    }
    const { preserveComments = false, verbose = false } = options;
    const { isWhitespace, isLineTerminator, isDigit, isAlpha } = stringUtils_1;
    // Strict JSON number grammar: no leading '+', no leading zeros, digits
    // required on both sides of the decimal point.
    const JSON_NUMBER = /^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$/;
    // Whole-word literals, keyed by their lower-cased spelling.
    const LITERALS = new Map([
        ['true', { json: 'true', message: 'Normalized to lowercase true' }],
        ['false', { json: 'false', message: 'Normalized to lowercase false' }],
        ['null', { json: 'null', message: 'Normalized to lowercase null' }],
        ['none', { json: 'null', message: 'Converted Python None to null' }],
    ]);
    let i = 0; // current index in text
    let output = ''; // generated output
    const processedIndices = new Set(); // visited loop positions (loop guard)
    let indentation = 0; // number of currently open structures
    const stack = []; // 'object' / 'array' for every open structure
    let inString = false; // true while copying an unterminated string
    // Repair log. It is filled only when `verbose` is set and is neither
    // returned nor printed by this function.
    const changes = [];
    function trackChange(message) {
        if (verbose) {
            changes.push({ index: i, message });
        }
    }
    /**
     * Return the first non-whitespace character after index `i`, or null when
     * only whitespace (or nothing) remains.
     */
    function nextNonWhitespaceCharacter() {
        let j = i + 1;
        while (j < text.length && isWhitespace(text[j])) {
            j++;
        }
        return j < text.length ? text[j] : null;
    }
    /** Characters allowed in a bare word (literal or unquoted property name). */
    function isWordCharacter(ch) {
        return isAlpha(ch) || isDigit(ch) || ch === '_';
    }
    /**
     * Convert a complete quoted string (quotes included) into a double-quoted
     * JSON string. Existing escape sequences are kept as they are, except
     * `\'`, which is not a JSON escape and becomes a plain apostrophe.
     * Unescaped double quotes in the content are escaped.
     */
    function normalizeString(str) {
        const content = str.slice(1, -1);
        let normalized = '"';
        for (let j = 0; j < content.length; j++) {
            const ch = content[j];
            if (ch === '\\' && j + 1 < content.length) {
                const escaped = content[j + 1];
                normalized += escaped === "'" ? "'" : `\\${escaped}`;
                j++; // the escaped character has been consumed as well
            }
            else if (ch === '"') {
                normalized += '\\"';
            }
            else {
                normalized += ch;
            }
        }
        return `${normalized}"`;
    }
    // Process the input token by token
    while (i < text.length) {
        // Every branch below must advance `i`; seeing an index twice means a
        // branch failed to do so.
        if (processedIndices.has(i)) {
            throw new errors_1.UnrepairableJSONError(`Infinite loop detected at ${(0, stringUtils_1.formatPosition)(text, i)}`, i);
        }
        processedIndices.add(i);
        const char = text[i];
        if (inString) {
            // Copying the body of a string whose closing quote was not found
            if (char === '\\') {
                if (i + 1 < text.length) {
                    // Copy the escape sequence as one unit
                    output += char + text[i + 1];
                    i += 2;
                }
                else {
                    // Dangling backslash at the very end: drop it; the string
                    // is closed after the loop.
                    i++;
                }
            }
            else {
                output += char;
                if (char === '"') {
                    inString = false;
                }
                i++;
            }
            continue;
        }
        // Whitespace between tokens: only line terminators are kept
        if (isWhitespace(char)) {
            i++;
            if (isLineTerminator(char)) {
                output += '\n' + ' '.repeat(Math.max(indentation, 0));
            }
            continue;
        }
        // Comments
        if (char === '/' && i + 1 < text.length) {
            const nextChar = text[i + 1];
            if (nextChar === '/') {
                if (preserveComments) {
                    // Copy everything up to (not including) the line terminator
                    while (i < text.length && !isLineTerminator(text[i])) {
                        output += text[i];
                        i++;
                    }
                    trackChange('Preserved single-line comment');
                }
                else {
                    trackChange('Removing single-line comment');
                    i += 2;
                    while (i < text.length && !isLineTerminator(text[i])) {
                        i++;
                    }
                }
                continue;
            }
            if (nextChar === '*') {
                if (preserveComments) {
                    output += '/*';
                    i += 2;
                    while (i + 1 < text.length && !(text[i] === '*' && text[i + 1] === '/')) {
                        output += text[i];
                        i++;
                    }
                    if (i + 1 < text.length) {
                        output += '*/';
                        i += 2;
                    }
                    trackChange('Preserved multi-line comment');
                }
                else {
                    trackChange('Removing multi-line comment');
                    i += 2;
                    while (i + 1 < text.length && !(text[i] === '*' && text[i + 1] === '/')) {
                        i++;
                    }
                    // Skip the terminator; for an unterminated comment this
                    // simply moves past the end of the input.
                    i += 2;
                }
                continue;
            }
        }
        if (char === '{' || char === '[') {
            stack.push(char === '{' ? 'object' : 'array');
            output += char;
            indentation++;
            i++;
            continue;
        }
        if (char === '}' || char === ']') {
            if (stack.length === 0) {
                // Nothing is open: the bracket is dropped
                trackChange(`Removing unmatched closing ${char === '}' ? 'curly brace' : 'square bracket'}`);
                i++;
                continue;
            }
            // Always close the innermost open structure with its own closer,
            // whatever bracket the input actually used.
            const expectedClosing = stack.pop() === 'object' ? '}' : ']';
            if (char !== expectedClosing) {
                trackChange(`Replacing ${char} with ${expectedClosing}`);
            }
            output += expectedClosing;
            indentation--;
            i++;
            continue;
        }
        if (char === ',') {
            const next = nextNonWhitespaceCharacter();
            if (next === null || next === '}' || next === ']') {
                // Trailing comma (before a closer or at the end of a
                // truncated document): drop it
                trackChange('Removing trailing comma');
            }
            else {
                output += char;
            }
            i++;
            continue;
        }
        if (char === '"' || char === "'") {
            const quoteType = char;
            // Look ahead for the matching unescaped closing quote
            let endIndex = i + 1;
            let isEscaped = false;
            while (endIndex < text.length) {
                const candidate = text[endIndex];
                if (candidate === '\\') {
                    isEscaped = !isEscaped;
                }
                else if (candidate === quoteType && !isEscaped) {
                    break;
                }
                else {
                    isEscaped = false;
                }
                endIndex++;
            }
            if (endIndex < text.length) {
                // Complete string
                const stringContent = text.substring(i, endIndex + 1);
                if (quoteType === "'") {
                    output += normalizeString(stringContent);
                    trackChange('Converted single quotes to double quotes in string');
                }
                else {
                    output += stringContent;
                }
                i = endIndex + 1;
            }
            else {
                // Unclosed string: emit the opening quote and let the
                // in-string branch copy the body starting at the very next
                // character.
                inString = true;
                output += '"';
                if (quoteType === "'") {
                    trackChange('Converted single quote to double quote and treating as unclosed string');
                }
                i++;
            }
            continue;
        }
        if (char === ':') {
            output += char;
            i++;
            continue;
        }
        if (isDigit(char) || char === '-' || char === '+' || char === '.') {
            // Collect everything that can belong to a number token
            const numberStart = i;
            while (i < text.length && (isDigit(text[i]) || '-+.eE'.includes(text[i]))) {
                i++;
            }
            const numberStr = text.slice(numberStart, i);
            if (JSON_NUMBER.test(numberStr)) {
                output += numberStr;
            }
            else {
                trackChange('Fixing invalid number format');
                // parseFloat reads the longest valid numeric prefix and never
                // throws; NaN means there was no usable prefix at all.
                const parsed = Number.parseFloat(numberStr);
                if (!Number.isNaN(parsed)) {
                    output += JSON.stringify(parsed);
                }
                else {
                    output += '0';
                    trackChange('Replaced invalid number with 0');
                }
            }
            continue;
        }
        if (isAlpha(char)) {
            // Read the whole bare word so that literals are matched as words,
            // not as prefixes (`nullable` is not `null`).
            let wordEnd = i;
            while (wordEnd < text.length && isWordCharacter(text[wordEnd])) {
                wordEnd++;
            }
            const word = text.slice(i, wordEnd);
            const literal = LITERALS.get(word.toLowerCase());
            if (literal !== undefined) {
                output += literal.json;
                trackChange(literal.message);
                i = wordEnd;
                continue;
            }
            // Inside an object, a bare word followed by a colon is an
            // unquoted property name.
            if (stack[stack.length - 1] === 'object') {
                let afterWord = wordEnd;
                while (afterWord < text.length && isWhitespace(text[afterWord])) {
                    afterWord++;
                }
                if (text[afterWord] === ':') {
                    output += `"${word}":`;
                    trackChange('Added quotes around property name');
                    i = afterWord + 1;
                    continue;
                }
            }
            // Anything else is dropped as a whole word
            trackChange('Skipping unrecognized token');
            i = wordEnd;
            continue;
        }
        if (char === '=' && nextNonWhitespaceCharacter() !== '=') {
            // A single equals sign is taken as a mistyped colon
            output += ':';
            trackChange('Replaced = with :');
            i++;
            continue;
        }
        // Any other character is skipped
        i++;
    }
    // Close a string that ran to the end of the input
    if (inString) {
        output += '"';
        trackChange('Added missing closing double quote at end of text');
    }
    // Close structures that are still open, innermost first
    while (stack.length > 0) {
        const closingChar = stack.pop() === 'object' ? '}' : ']';
        output += closingChar;
        trackChange(`Added missing closing ${closingChar}`);
    }
    return output;
}