UNPKG

llm-json-fix

Version:

Fix malformed JSON outputs from Large Language Models (LLMs)

github.com/thantsinoo/llm-json-fix

thantsinoo/llm-json-fix

865 lines (852 loc) • 29.1 kB

JavaScript

(function webpackUniversalModuleDefinition(root, factory) { if(typeof exports === 'object' && typeof module === 'object') module.exports = factory(); else if(typeof define === 'function' && define.amd) define([], factory); else if(typeof exports === 'object') exports["LLMJSONFix"] = factory(); else root["LLMJSONFix"] = factory(); })(this, () => { return /******/ (() => { // webpackBootstrap /******/ "use strict"; /******/ // The require scope /******/ var __webpack_require__ = {}; /******/ /************************************************************************/ /******/ /* webpack/runtime/define property getters */ /******/ (() => { /******/ // define getter functions for harmony exports /******/ __webpack_require__.d = (exports, definition) => { /******/ for(var key in definition) { /******/ if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) { /******/ Object.defineProperty(exports, key, { enumerable: true, get: definition[key] }); /******/ } /******/ } /******/ }; /******/ })(); /******/ /******/ /* webpack/runtime/hasOwnProperty shorthand */ /******/ (() => { /******/ __webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop)) /******/ })(); /******/ /******/ /* webpack/runtime/make namespace object */ /******/ (() => { /******/ // define __esModule on exports /******/ __webpack_require__.r = (exports) => { /******/ if(typeof Symbol !== 'undefined' && Symbol.toStringTag) { /******/ Object.defineProperty(exports, Symbol.toStringTag, { value: 'Module' }); /******/ } /******/ Object.defineProperty(exports, '__esModule', { value: true }); /******/ }; /******/ })(); /******/ /************************************************************************/ var __webpack_exports__ = {}; // ESM COMPAT FLAG __webpack_require__.r(__webpack_exports__); // EXPORTS __webpack_require__.d(__webpack_exports__, { AmbiguousRepairError: () => (/* reexport */ AmbiguousRepairError), BufferLimitExceededError: () => (/* reexport */ BufferLimitExceededError), LLMJSONFixError: () => (/* reexport */ LLMJSONFixError), UnrepairableJSONError: () => (/* reexport */ UnrepairableJSONError), fixLLMJson: () => (/* reexport */ fixLLMJson) }); ;// ./src/utils/errors.ts /** * Custom error thrown by the LLM JSON Fix library */ class LLMJSONFixError extends Error { /** * Create a new LLMJSONFixError * @param message The error message * @param position The position in the text where the error occurred (if available) */ constructor(message, position) { super(message); this.position = position; this.name = 'LLMJSONFixError'; // Maintain proper stack trace in V8 engines if (Error.captureStackTrace) { Error.captureStackTrace(this, LLMJSONFixError); } } } /** * Error thrown when a repair operation could not be completed */ class UnrepairableJSONError extends LLMJSONFixError { constructor(message, position) { super(message, position); this.name = 'UnrepairableJSONError'; } } /** * Error thrown when the input JSON is too complex or ambiguous to be repaired */ class AmbiguousRepairError extends LLMJSONFixError { constructor(message, position) { super(message, position); this.name = 'AmbiguousRepairError'; } } /** * Error thrown when repair operation exceeds buffer limits */ class BufferLimitExceededError extends LLMJSONFixError { constructor(message, position) { super(message, position); this.name = 'BufferLimitExceededError'; } } ;// ./src/utils/llmPatterns.ts /** * Patterns for detecting and handling LLM-specific JSON issues */ /** * Detects markdown code blocks and extracts the content * @param text Input text that may contain markdown code blocks * @returns The content of the first JSON code block, or null if none found */ function extractJsonFromMarkdown(text) { // Standard markdown code block with json const jsonBlockRegex = /```(?:json)?\s*\n([\s\S]*?)\n```/; const match = text.match(jsonBlockRegex); if (match && match[1]) { return match[1].trim(); } // For other types of code blocks that might contain JSON const genericBlockRegex = /```(?:\w*)?\s*\n([\s\S]*?)\n```/; const genericMatch = text.match(genericBlockRegex); if (genericMatch && genericMatch[1]) { const content = genericMatch[1].trim(); // Check if content looks like JSON if (/^\s*[{[]/.test(content)) { return content; } } return null; } /** * Removes explanatory text that LLMs often include before/after JSON output */ function stripExplanatoryText(text) { // First, try to extract JSON from markdown const jsonFromMarkdown = extractJsonFromMarkdown(text); if (jsonFromMarkdown) { return jsonFromMarkdown; } // If we can identify JSON object or array, extract it const jsonPattern = /(\{[\s\S]*\}|\[[\s\S]*\])/; const match = text.match(jsonPattern); if (match && match[1]) { return match[1]; } // If we can't easily extract JSON, just return the original return text; } /** * Fixes code block annotations that may have been erroneously included in the output */ function stripCodeBlockAnnotations(text) { return text .replace(/^```json\s*\n/gm, '') .replace(/\n```\s*$/gm, '') .replace(/^```\s*\n/gm, ''); } /** * Removes trailing ellipses, which are often used by LLMs to indicate truncation * or continuation that isn't actual JSON data */ function removeEllipses(text) { // Use more specific patterns to avoid unintended replacements let result = text; // Replace ellipses at the end of arrays result = result.replace(/,\s*\.\.\.(\s*])/g, '$1'); // Replace ellipses at the end of objects result = result.replace(/,\s*\.\.\.(\s*})/g, '$1'); // Replace quoted ellipses result = result.replace(/,\s*["']\.\.\.["'](\s*[\]}])/g, '$1'); // Replace trailing ellipses at the end of lines result = result.replace(/,\s*\.\.\.$/gm, ''); return result; } /** * Remove explanations that LLMs sometimes insert as comments */ function stripLLMComments(text) { // Remove both JSON comments and natural language explanation patterns return text // Standard JSON comments .replace(/\/\/.*$/gm, '') .replace(/\/\*[\s\S]*?\*\//g, '') // Explanatory notes often added by LLMs in parentheses .replace(/\(\s*Note:.*?\)/g, '') .replace(/\[\s*Note:.*?\]/g, ''); } /** * Handle partial property names or incomplete strings at the end of the text */ function fixTruncatedContent(text) { // Remove anything after the last balanced closing bracket const balancedText = balanceJsonStructure(text); // If we handled it through balancing, return that if (balancedText !== text) { return balancedText; } // Otherwise, look for trailing fragments and remove them return text // Remove partial property at the end .replace(/,\s*"[^"]*$/g, '') .replace(/,\s*'[^']*$/g, '') // Remove trailing commas .replace(/,(\s*[\]}])/g, '$1'); } /** * Try to balance JSON by fixing unclosed brackets */ function balanceJsonStructure(text) { const stack = []; let inString = false; let escapeNext = false; // First pass: analyze the structure for (let i = 0; i < text.length; i++) { const char = text[i]; if (escapeNext) { escapeNext = false; continue; } if (char === '\\') { escapeNext = true; continue; } if (char === '"' && !inString) { inString = true; continue; } if (char === '"' && inString) { inString = false; continue; } if (inString) { continue; } if (char === '{' || char === '[') { stack.push({ char, position: i }); } else if (char === '}') { if (stack.length > 0 && stack[stack.length - 1].char === '{') { stack.pop(); } else { // Unmatched closing bracket return text.substring(0, i); } } else if (char === ']') { if (stack.length > 0 && stack[stack.length - 1].char === '[') { stack.pop(); } else { // Unmatched closing bracket return text.substring(0, i); } } } // If we have unclosed brackets, close them if (stack.length > 0) { let result = text; // Close remaining brackets in reverse order for (let i = stack.length - 1; i >= 0; i--) { const openingBracket = stack[i].char; const closingBracket = openingBracket === '{' ? '}' : ']'; result += closingBracket; } return result; } return text; } /** * Fix incorrect handling of nested JSON strings */ function fixNestedJsonStrings(text) { // Look for cases where stringified JSON has escaped quotes inside strings // This is a simplistic approach - a complete solution would need to parse and rebuild return text .replace(/"{/g, '{') .replace(/}"/g, '}') .replace(/"\[/g, '[') .replace(/\]"/g, ']') // Fix escaped quotes that shouldn't be escaped .replace(/\\"/g, '"') .replace(/\\'/g, "'"); } /** * Main function to apply all LLM-specific fixes */ function applyLLMSpecificFixes(text) { // Apply fixes in an order that makes sense let result = text; // First try to extract JSON if in markdown const extracted = extractJsonFromMarkdown(result); if (extracted) { result = extracted; } else { // Otherwise strip code block annotations result = stripCodeBlockAnnotations(result); } // Remove comments and explanations result = stripLLMComments(result); result = stripExplanatoryText(result); // Fix structural issues result = removeEllipses(result); result = fixNestedJsonStrings(result); result = fixTruncatedContent(result); return result; } ;// ./src/utils/stringUtils.ts /** * Checks if a character is a whitespace character */ function isWhitespace(char) { return /\s/.test(char); } /** * Checks if a character is a line terminator */ function isLineTerminator(char) { return char === '\n' || char === '\r'; } /** * Checks if a character is a digit */ function isDigit(char) { return /[0-9]/.test(char); } /** * Checks if a character is a letter */ function isAlpha(char) { return /[a-zA-Z]/.test(char); } /** * Checks if a character can be part of a valid identifier */ function isIdentifierChar(char) { return isAlpha(char) || isDigit(char) || char === '_' || char === '$'; } /** * Get the position in the text as line:column */ function getPositionDetails(text, index) { const lines = text.slice(0, index).split('\n'); const line = lines.length; const column = lines[lines.length - 1].length + 1; return { line, column }; } /** * Format a position as a string */ function formatPosition(text, index) { const { line, column } = getPositionDetails(text, index); return `line ${line}, column ${column}`; } /** * Extracts text that is likely markdown-formatted code blocks */ function extractMarkdownCodeBlocks(text) { const codeBlockRegex = /```(?:json)?([^`]+)```/g; const matches = text.match(codeBlockRegex); return matches ? Array.from(matches) : null; } /** * Normalizes different quote styles to standard double quotes */ function normalizeQuotes(text) { // Replace fancy quotes with standard double quotes return text .replace(/[""]/g, '"') .replace(/['']/g, "'"); } /** * Checks if text contains common markdown indicators */ function containsMarkdown(text) { const markdownIndicators = [ /```/, // Code blocks /^#+\s+/m, // Headers /\*\*.+\*\*/, // Bold /\*.+\*/, // Italic /\[.+\]\(.+\)/ // Links ]; return markdownIndicators.some(pattern => pattern.test(text)); } /** * Checks if the text appears to be LLM formatted output with natural language */ function isLLMStyleOutput(text) { // Look for patterns common in LLM outputs const llmPatterns = [ /here(?:'|')?s\s+(?:the|an?|your)\s+(?:json|output|response)/i, /I(?:'|')?(?:ve|ll|m)\s+(?:generated|created|provided)/i, /```json/i, /^\s*[\w\s]+:\s*$/m // Explanatory labels followed by a colon ]; return llmPatterns.some(pattern => pattern.test(text)); } ;// ./src/regular/jsonFix.ts /** * Fix malformed JSON from LLM outputs * * @param text Potentially broken JSON text * @param options Configuration options * @returns Repaired JSON string * @throws {UnrepairableJSONError} If the JSON cannot be repaired */ function fixLLMJson(text, options = {}) { const { applyModelSpecificFixes = true, // Keep model in destructuring for API consistency even though unused // eslint-disable-next-line @typescript-eslint/no-unused-vars model = 'general', preserveComments = false, verbose = false } = options; // Apply LLM-specific fixes first, if enabled let result = text; if (applyModelSpecificFixes) { // Note: When applying model-specific fixes, we always strip comments // because they're likely part of the LLM's explanatory text. // We'll preserve user-specified comments in the JSON repair step if requested. result = applyLLMSpecificFixes(text); } // Now apply standard JSON repair logic return jsonrepair(result, { preserveComments, verbose }); } /** * Repair invalid JSON documents * Core implementation based on jsonrepair library with enhancements * * @param text The JSON document containing errors * @param options Repair options * @returns Repaired JSON as string * @throws {UnrepairableJSONError} If the JSON cannot be repaired */ function jsonrepair(text, options = {}) { if (text === '') { return ''; } const { preserveComments = false, verbose = false } = options; let i = 0; // current index in text let output = ''; // generated output const processedIndices = new Set(); // to track processed indices and prevent infinite loops let indentation = 0; // current indentation level // object stack to track the type of objects we're currently in const stack = []; // whether we're currently in a string let inString = false; // for verbose logging const changes = []; function trackChange(message) { if (verbose) { changes.push({ index: i, message }); } } /** * Get the next character that is not a whitespace character */ function nextNonWhitespaceCharacter() { let j = i + 1; while (j < text.length && isWhitespace(text[j])) { j++; } if (j >= text.length) { return null; } return text[j]; } /** * Convert a JavaScript string with single or double quotes into a JSON string * with double quotes and proper escaping of special characters. */ function normalizeString(str) { // Remove the first and last quote const content = str.slice(1, -1); let isEscaped = false; let normalized = '"'; for (let j = 0; j < content.length; j++) { const char = content[j]; if (isEscaped) { if (char !== "'" && char !== '"' && char !== '\\' && char !== '/') { // Maintain existing escapes except for quotes and forward slashes normalized += '\\'; } normalized += char; isEscaped = false; continue; } if (char === '\\') { isEscaped = true; normalized += '\\'; continue; } if (char === '"') { normalized += '\\'; } normalized += char; } normalized += '"'; return normalized; } // Process characters one by one while (i < text.length) { // Prevent infinite loops by tracking processed indices if (processedIndices.has(i)) { throw new UnrepairableJSONError(`Infinite loop detected at ${formatPosition(text, i)}`, i); } processedIndices.add(i); const char = text[i]; if (inString) { // We're inside a string if (char === '\\') { // Escape character if (i + 1 < text.length) { // Just include the escape and the next character output += char + text[i + 1]; i += 2; continue; } else { // String is not closed, escape at the end output += '"'; trackChange('Added missing closing double quote at end of text'); inString = false; i++; continue; } } else if (char === '"') { // End of string output += char; inString = false; i++; continue; } else { // Regular character inside a string output += char; i++; continue; } } // Handle whitespace between tokens if (isWhitespace(char)) { i++; // Include newlines in the output, but skip other whitespace if (isLineTerminator(char)) { if (indentation > 0) { output += '\n' + ' '.repeat(indentation); } else { output += '\n'; } } continue; } // Handle comments if preserveComments is false if (char === '/' && i + 1 < text.length) { const nextChar = text[i + 1]; if (nextChar === '/') { // Single-line comment if (preserveComments) { // Keep the comment in the output while (i < text.length && !isLineTerminator(text[i])) { output += text[i]; i++; } trackChange('Preserved single-line comment'); continue; } else { // Remove the comment trackChange('Removing single-line comment'); i += 2; while (i < text.length && !isLineTerminator(text[i])) { i++; } continue; } } else if (nextChar === '*') { // Multi-line comment if (preserveComments) { // Keep the multi-line comment in the output output += '/*'; i += 2; while (i + 1 < text.length && !(text[i] === '*' && text[i + 1] === '/')) { output += text[i]; i++; } if (i + 1 < text.length) { output += '*/'; i += 2; } trackChange('Preserved multi-line comment'); continue; } else { // Remove the comment trackChange('Removing multi-line comment'); i += 2; while (i + 1 < text.length && !(text[i] === '*' && text[i + 1] === '/')) { i++; } i += 2; continue; } } } if (char === '{') { // Start of an object stack.push('object'); output += char; indentation++; i++; continue; } if (char === '[') { // Start of an array stack.push('array'); output += char; indentation++; i++; continue; } if (char === '}' || char === ']') { // End of an object or array if (stack.length === 0) { // Unmatched closing bracket, remove it trackChange(`Removing unmatched closing ${char === '}' ? 'curly brace' : 'square bracket'}`); i++; continue; } const currentStructure = stack.pop(); const expectedClosing = currentStructure === 'object' ? '}' : ']'; if (char !== expectedClosing) { // Mismatched closing bracket trackChange(`Replacing ${char} with ${expectedClosing}`); output += expectedClosing; } else { output += char; } indentation--; i++; continue; } if (char === ',') { // Handle trailing commas const next = nextNonWhitespaceCharacter(); if (next === '}' || next === ']') { // Trailing comma; skip it trackChange('Removing trailing comma'); i++; continue; } output += char; i++; continue; } if (char === '"' || char === "'") { // Start of a string const stringStartIndex = i; const quoteType = char; i++; // Find the end of the string let endIndex = i; let isEscaped = false; while (endIndex < text.length) { if (text[endIndex] === '\\') { // Skip the next character isEscaped = !isEscaped; endIndex++; continue; } if (text[endIndex] === quoteType && !isEscaped) { // Found the end of the string break; } isEscaped = false; endIndex++; } if (endIndex < text.length) { // Complete string found const stringContent = text.substring(stringStartIndex, endIndex + 1); if (quoteType === "'") { // Convert to double quotes const normalized = normalizeString(stringContent); output += normalized; trackChange('Converted single quotes to double quotes in string'); } else { output += stringContent; } i = endIndex + 1; continue; } else { // Unclosed string inString = true; if (quoteType === "'") { // Convert to a double quote output += '"'; trackChange('Converted single quote to double quote and treating as unclosed string'); } else { output += char; } i++; continue; } } if (char === ':') { // Property separator in an object output += char; i++; continue; } if (isDigit(char) || char === '-' || char === '+' || char === '.') { // Number let numberStr = ''; // eslint-disable-next-line @typescript-eslint/no-unused-vars const start = i; // Extract the number while (i < text.length && (isDigit(text[i]) || text[i] === '-' || text[i] === '+' || text[i] === '.' || text[i] === 'e' || text[i] === 'E')) { numberStr += text[i]; i++; } // Validate and fix the number if (/^-?\d+(\.\d+)?([eE][+-]?\d+)?$/.test(numberStr)) { output += numberStr; } else { // Invalid number format trackChange('Fixing invalid number format'); try { // Try to parse it as a JavaScript number and convert back to a valid JSON number const parsed = parseFloat(numberStr); if (!isNaN(parsed)) { output += JSON.stringify(parsed); } else { // Fallback for numbers that can't be parsed output += '0'; trackChange('Replaced invalid number with 0'); } } catch (e) { output += '0'; trackChange('Replaced invalid number with 0'); } } continue; } // Handle special tokens if (isAlpha(char)) { // Check for literals like true, false, null // or Python constants like True, False, None const remaining = text.substring(i); if (/^true/i.test(remaining)) { output += 'true'; i += remaining.match(/^true/i)[0].length; trackChange('Normalized to lowercase true'); continue; } if (/^false/i.test(remaining)) { output += 'false'; i += remaining.match(/^false/i)[0].length; trackChange('Normalized to lowercase false'); continue; } if (/^null/i.test(remaining)) { output += 'null'; i += remaining.match(/^null/i)[0].length; trackChange('Normalized to lowercase null'); continue; } if (/^none/i.test(remaining)) { output += 'null'; i += remaining.match(/^none/i)[0].length; trackChange('Converted Python None to null'); continue; } // Check for unquoted property names (common in JavaScript objects) if (stack[stack.length - 1] === 'object') { let propertyName = ''; const startIndex = i; // Extract the property name while (i < text.length && (isAlpha(text[i]) || isDigit(text[i]) || text[i] === '_')) { propertyName += text[i]; i++; } // Skip whitespace while (i < text.length && isWhitespace(text[i])) { i++; } // Check if we have a colon after the property name if (i < text.length && text[i] === ':') { output += `"${propertyName}":`; trackChange('Added quotes around property name'); i++; continue; } else { // Not a property, reset position i = startIndex; } } // Unrecognized token trackChange('Skipping unrecognized token'); i++; continue; } // Handle other characters if (char === '=') { // Sometimes used instead of colon const next = nextNonWhitespaceCharacter(); if (next !== '=') { // Single equals, likely a mistake for a colon output += ':'; trackChange('Replaced = with :'); i++; continue; } } // Skip any other characters we don't recognize i++; } // Handle unclosed structures while (stack.length > 0) { const currentStructure = stack.pop(); const closingChar = currentStructure === 'object' ? '}' : ']'; output += closingChar; trackChange(`Added missing closing ${closingChar}`); } return output; } ;// ./src/index.ts /** * LLM JSON Fix - A library for fixing malformed JSON outputs from LLMs * * @packageDocumentation */ // Export the main API /******/ return __webpack_exports__; /******/ })() ; });