/*
 * llm-json-fix
 * Version: (not captured in this listing)
 * Fix malformed JSON outputs from Large Language Models (LLMs)
 * 865 lines (852 loc) • 29.1 kB
 * JavaScript
 */
// UMD wrapper: runs `factory` once and publishes its return value through
// whichever module mechanism is detected below, checked in this order.
(function webpackUniversalModuleDefinition(root, factory) {
// Both `exports` and `module` are objects: assign to module.exports.
if(typeof exports === 'object' && typeof module === 'object')
module.exports = factory();
// An AMD-style `define` is present: register the factory with no dependencies.
else if(typeof define === 'function' && define.amd)
define([], factory);
// Only `exports` is an object: attach the result under the "LLMJSONFix" key.
else if(typeof exports === 'object')
exports["LLMJSONFix"] = factory();
// Fallback: attach the result to `root` (the `this` value passed in below).
else
root["LLMJSONFix"] = factory();
})(this, () => {
return /******/ (() => { // webpackBootstrap
/******/ "use strict";
/******/ // The require scope
/******/ // Plain object that the runtime helpers below (d, o, r) are attached to.
/******/ var __webpack_require__ = {};
/******/
/************************************************************************/
/******/ /* webpack/runtime/define property getters */
/******/ (() => {
/******/ // define getter functions for harmony exports
/******/ // For every own key of `definition` that `exports` does not already own,
/******/ // install an enumerable accessor whose getter is definition[key].
/******/ // The getter runs on property access, not at registration time.
/******/ __webpack_require__.d = (exports, definition) => {
/******/ for(var key in definition) {
/******/ if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {
/******/ Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });
/******/ }
/******/ }
/******/ };
/******/ })();
/******/
/******/ /* webpack/runtime/hasOwnProperty shorthand */
/******/ (() => {
/******/ // Own-property test that goes through Object.prototype, so it also works
/******/ // for objects that shadow or lack their own hasOwnProperty.
/******/ __webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))
/******/ })();
/******/
/******/ /* webpack/runtime/make namespace object */
/******/ (() => {
/******/ // define __esModule on exports
/******/ // Also tags the object with Symbol.toStringTag = 'Module' when Symbol.toStringTag exists.
/******/ __webpack_require__.r = (exports) => {
/******/ if(typeof Symbol !== 'undefined' && Symbol.toStringTag) {
/******/ Object.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });
/******/ }
/******/ Object.defineProperty(exports, '__esModule', { value: true });
/******/ };
/******/ })();
/******/
/************************************************************************/
// Object returned by the bundle factory; it becomes the library's public API.
var __webpack_exports__ = {};
// ESM COMPAT FLAG
// Marks the exports object with __esModule (and a 'Module' toStringTag when available).
__webpack_require__.r(__webpack_exports__);
// EXPORTS
// Each entry is a getter function. Because the getters only run when a property
// is read, they can name classes and functions that are declared further down.
__webpack_require__.d(__webpack_exports__, {
AmbiguousRepairError: () => (/* reexport */ AmbiguousRepairError),
BufferLimitExceededError: () => (/* reexport */ BufferLimitExceededError),
LLMJSONFixError: () => (/* reexport */ LLMJSONFixError),
UnrepairableJSONError: () => (/* reexport */ UnrepairableJSONError),
fixLLMJson: () => (/* reexport */ fixLLMJson)
});
;// ./src/utils/errors.ts
/**
 * Base error type for the LLM JSON Fix library.
 *
 * Besides the usual `message`, every instance exposes a `position` property
 * holding whatever was passed as the second constructor argument.
 */
class LLMJSONFixError extends Error {
    /**
     * @param message The error message
     * @param position The position in the text where the error occurred (if available)
     */
    constructor(message, position) {
        super(message);
        // Same assignment order as before: position first, then name.
        Object.assign(this, { position, name: 'LLMJSONFixError' });
        // Error.captureStackTrace is not available in every engine, hence the guard.
        if (Error.captureStackTrace) {
            Error.captureStackTrace(this, LLMJSONFixError);
        }
    }
}
/**
 * Signals that a repair operation could not be completed.
 * Accepts the same (message, position) arguments as LLMJSONFixError.
 */
class UnrepairableJSONError extends LLMJSONFixError {
    constructor(message, position) {
        super(message, position);
        // Replace the name assigned by the base constructor.
        Object.assign(this, { name: 'UnrepairableJSONError' });
    }
}
/**
 * Signals that the input JSON is too complex or ambiguous to be repaired.
 * Accepts the same (message, position) arguments as LLMJSONFixError.
 */
class AmbiguousRepairError extends LLMJSONFixError {
    constructor(message, position) {
        super(message, position);
        // Replace the name assigned by the base constructor.
        Object.assign(this, { name: 'AmbiguousRepairError' });
    }
}
/**
 * Signals that a repair operation exceeded its buffer limits.
 * Accepts the same (message, position) arguments as LLMJSONFixError.
 */
class BufferLimitExceededError extends LLMJSONFixError {
    constructor(message, position) {
        super(message, position);
        // Replace the name assigned by the base constructor.
        Object.assign(this, { name: 'BufferLimitExceededError' });
    }
}
;// ./src/utils/llmPatterns.ts
/**
* Patterns for detecting and handling LLM-specific JSON issues
*/
/**
 * Pulls the body out of the first fenced markdown code block that can hold JSON.
 *
 * A fence that is unlabelled or labelled `json` wins outright. Failing that,
 * the first fence with any label is accepted only when its trimmed body
 * starts with `{` or `[`.
 *
 * @param text Input text that may contain markdown code blocks
 * @returns The trimmed block body, or null when no suitable block exists
 */
function extractJsonFromMarkdown(text) {
    const jsonFence = text.match(/```(?:json)?\s*\n([\s\S]*?)\n```/);
    const jsonBody = jsonFence ? jsonFence[1] : '';
    if (jsonBody) {
        return jsonBody.trim();
    }
    const anyFence = text.match(/```(?:\w*)?\s*\n([\s\S]*?)\n```/);
    if (!anyFence || !anyFence[1]) {
        return null;
    }
    const candidate = anyFence[1].trim();
    // Only trust a foreign-language fence when its body looks like JSON.
    return /^\s*[{[]/.test(candidate) ? candidate : null;
}
/**
 * Drops the prose that LLMs tend to put around a JSON payload.
 *
 * Order of preference: the body of a markdown code block, then the widest
 * `{...}` or `[...]` span found in the text, then the text unchanged.
 */
function stripExplanatoryText(text) {
    const fenced = extractJsonFromMarkdown(text);
    if (fenced) {
        return fenced;
    }
    // Greedy match: from the first opening bracket to the last matching closer type.
    const outermost = text.match(/(\{[\s\S]*\}|\[[\s\S]*\])/);
    return outermost && outermost[1] ? outermost[1] : text;
}
/**
 * Removes stray markdown fence lines from the text: "```json" openers,
 * closing "```" lines, and bare "```" openers, in that order.
 */
function stripCodeBlockAnnotations(text) {
    const withoutJsonOpeners = text.replace(/^```json\s*\n/gm, '');
    const withoutClosers = withoutJsonOpeners.replace(/\n```\s*$/gm, '');
    return withoutClosers.replace(/^```\s*\n/gm, '');
}
/**
 * Strips "..." placeholders that LLMs use to signal truncation or continuation
 * and that are not real JSON data.
 *
 * The rewrites run in sequence, each on the result of the previous one.
 */
function removeEllipses(text) {
    const rewrites = [
        // ", ..." right before a closing square bracket
        [/,\s*\.\.\.(\s*])/g, '$1'],
        // ", ..." right before a closing curly brace
        [/,\s*\.\.\.(\s*})/g, '$1'],
        // a quoted "..." element right before either closer
        [/,\s*["']\.\.\.["'](\s*[\]}])/g, '$1'],
        // ", ..." at the end of a line
        [/,\s*\.\.\.$/gm, '']
    ];
    return rewrites.reduce((current, [pattern, replacement]) => current.replace(pattern, replacement), text);
}
/**
 * Removes comments and "Note:" asides that LLMs insert into JSON output.
 *
 * Line comments and block comments are removed only when they start outside a
 * quoted string, so values such as "https://example.com" or "a/*b" survive.
 * Strings are recognised per line (single or double quoted, backslash
 * escapes honoured); a quote with no partner on the same line protects nothing.
 *
 * Parenthesised or bracketed "Note: ..." asides are removed afterwards, as
 * before, wherever they occur.
 *
 * @param text Text that may contain comments
 * @returns The text with comments and Note asides removed
 */
function stripLLMComments(text) {
    // Alternation order matters: at any position a string literal is tried
    // first and returned untouched; only otherwise can a comment match.
    const stringOrComment = /("(?:\\.|[^"\\\n])*"|'(?:\\.|[^'\\\n])*')|\/\/.*$|\/\*[\s\S]*?\*\//gm;
    const withoutComments = text.replace(stringOrComment, (match, literal) => (literal === undefined ? '' : literal));
    return withoutComments
        // Explanatory notes often added by LLMs in parentheses or brackets
        .replace(/\(\s*Note:.*?\)/g, '')
        .replace(/\[\s*Note:.*?\]/g, '');
}
/**
 * Cleans up text that was cut off mid-document.
 *
 * Steps, in order:
 *  1. If the text ends inside an unterminated string whose opening quote
 *     directly follows a comma, drop that fragment together with the comma
 *     (a half-written property or element cannot be completed).
 *  2. Balance the brackets with balanceJsonStructure.
 *  3. Remove commas that sit directly before a closing bracket.
 *
 * Steps 1 and 3 skip over complete string literals, so a value such as
 * "x, " or ",}" is not mistaken for a fragment or a trailing comma.
 *
 * @param text Possibly truncated JSON text
 * @returns The cleaned-up text
 */
function fixTruncatedContent(text) {
    /** Returns `source` without a dangling ", \"partial" tail, or unchanged. */
    function dropDanglingFragment(source) {
        let quote = null; // quote character of the string we are inside, if any
        let openedAt = -1;
        let escaped = false;
        for (let i = 0; i < source.length; i++) {
            const char = source[i];
            if (quote === null) {
                if (char === '"' || char === "'") {
                    quote = char;
                    openedAt = i;
                }
                continue;
            }
            if (escaped) {
                escaped = false;
            }
            else if (char === '\\') {
                escaped = true;
            }
            else if (char === quote) {
                quote = null;
            }
        }
        if (quote === null) {
            return source;
        }
        const head = source.slice(0, openedAt).replace(/\s+$/, '');
        // Only a string that starts a new element/property is discarded;
        // a truncated value (after ':' or '[') is left for the balancer.
        return head.charAt(head.length - 1) === ',' ? head.slice(0, -1) : source;
    }
    const balanced = balanceJsonStructure(dropDanglingFragment(text));
    // First alternative consumes whole string literals so commas inside them are kept.
    return balanced.replace(/("(?:[^"\\]|\\[\s\S])*")|,(\s*[\]}])/g, (match, literal, closer) => (literal !== undefined ? literal : closer));
}
/**
 * Balances the curly and square brackets of a JSON-like text.
 *
 * - Brackets inside double-quoted strings are ignored; a backslash makes the
 *   scanner skip the following character.
 * - At the first closing bracket that does not match the innermost open one,
 *   the text is cut just before that bracket.
 * - Any structures still open are then closed in reverse order. If the text
 *   ends inside a string at that point, the string is terminated first (and
 *   a dangling trailing backslash is dropped) so the closers do not end up
 *   inside the string.
 *
 * Text with no open structures is returned unchanged apart from the cut.
 *
 * @param text JSON-like text
 * @returns The balanced text
 */
function balanceJsonStructure(text) {
    const closerFor = { '{': '}', '[': ']' };
    const openStack = [];
    let inString = false;
    let escapeNext = false;
    let cutoff = text.length;
    for (let i = 0; i < text.length; i++) {
        const char = text[i];
        if (escapeNext) {
            escapeNext = false;
            continue;
        }
        if (char === '\\') {
            escapeNext = true;
            continue;
        }
        if (char === '"') {
            inString = !inString;
            continue;
        }
        if (inString) {
            continue;
        }
        if (char === '{' || char === '[') {
            openStack.push(char);
        }
        else if (char === '}' || char === ']') {
            if (openStack.length > 0 && closerFor[openStack[openStack.length - 1]] === char) {
                openStack.pop();
            }
            else {
                // Unmatched closing bracket: everything from here on is discarded.
                cutoff = i;
                break;
            }
        }
    }
    let result = text.substring(0, cutoff);
    if (openStack.length === 0) {
        return result;
    }
    // inString can only still be true when the scan reached the end of the text.
    if (inString) {
        if (escapeNext) {
            // A lone trailing backslash would escape the quote added next.
            result = result.slice(0, -1);
        }
        result += '"';
    }
    for (let i = openStack.length - 1; i >= 0; i--) {
        result += closerFor[openStack[i]];
    }
    return result;
}
/**
 * Repairs JSON that was stringified one level too many.
 *
 * Careful pass (preferred):
 *  - A complete double-quoted literal is left alone, except that the invalid
 *    escape \' inside it becomes a plain apostrophe.
 *  - A literal in value position whose decoded content is itself a valid JSON
 *    object or array is replaced by that content. Property keys (literals
 *    followed by ':') are never unwrapped.
 *  - An escaped quote found outside any literal (e.g. {\"a\": 1}) is unescaped.
 *
 * Legacy fallback: if the careful result is not valid JSON but the original
 * blanket replacements ("{ -> {, }" -> }, "[ -> [, ]" -> ], \" -> ", \' -> ')
 * do yield valid JSON, that result is returned. This keeps inputs such as
 * {"data": "{"a": 1}"} working.
 *
 * @param text JSON-like text
 * @returns The text with nested JSON strings repaired where possible
 */
function fixNestedJsonStrings(text) {
    function parses(candidate) {
        try {
            JSON.parse(candidate);
            return true;
        }
        catch (error) {
            // Not valid JSON: an expected outcome here, reported as false.
            return false;
        }
    }
    /** Decoded object/array text held inside a string literal, or null. */
    function unwrapEmbeddedJson(literal) {
        if (!parses(literal)) {
            return null;
        }
        const decoded = JSON.parse(literal).trim();
        return /^[{[]/.test(decoded) && parses(decoded) ? decoded : null;
    }
    function isFollowedByColon(index) {
        let k = index;
        while (k < text.length && /\s/.test(text[k])) {
            k++;
        }
        return text[k] === ':';
    }
    // Either a complete double-quoted literal, or a stray escaped quote.
    const literalOrEscapedQuote = /"(?:[^"\\]|\\[\s\S])*"|\\["']/g;
    const careful = text.replace(literalOrEscapedQuote, (token, offset) => {
        if (token[0] === '\\') {
            return token[1];
        }
        if (!isFollowedByColon(offset + token.length)) {
            const embedded = unwrapEmbeddedJson(token);
            if (embedded !== null) {
                return embedded;
            }
        }
        // Walk escape pairs so that \\' (escaped backslash + apostrophe) is untouched.
        return token.replace(/\\([\s\S])/g, (pair, escaped) => (escaped === "'" ? "'" : pair));
    });
    if (parses(careful)) {
        return careful;
    }
    const legacy = text
        .replace(/"{/g, '{')
        .replace(/}"/g, '}')
        .replace(/"\[/g, '[')
        .replace(/\]"/g, ']')
        .replace(/\\"/g, '"')
        .replace(/\\'/g, "'");
    return parses(legacy) ? legacy : careful;
}
/**
 * Runs every LLM-specific clean-up over the text, in a fixed order.
 *
 * The text is first reduced to the body of a markdown code block when one is
 * found; otherwise stray fence lines are stripped. The remaining stages then
 * run one after another, each receiving the previous stage's result.
 */
function applyLLMSpecificFixes(text) {
    const unfenced = extractJsonFromMarkdown(text) || stripCodeBlockAnnotations(text);
    const stages = [
        // Remove comments and explanations
        stripLLMComments,
        stripExplanatoryText,
        // Fix structural issues
        removeEllipses,
        fixNestedJsonStrings,
        fixTruncatedContent
    ];
    return stages.reduce((current, stage) => stage(current), unfenced);
}
;// ./src/utils/stringUtils.ts
/**
 * Tells whether the given character matches the regex whitespace class `\s`.
 */
function isWhitespace(char) {
    const whitespacePattern = /\s/;
    return whitespacePattern.test(char);
}
/**
 * Tells whether the given character is a line feed or a carriage return.
 */
function isLineTerminator(char) {
    switch (char) {
        case '\n':
        case '\r':
            return true;
        default:
            return false;
    }
}
/**
 * Tells whether the given character is an ASCII digit (0-9).
 */
function isDigit(char) {
    const digitPattern = /[0-9]/;
    return digitPattern.test(char);
}
/**
 * Tells whether the given character is an ASCII letter (a-z or A-Z).
 */
function isAlpha(char) {
    const letterPattern = /[a-zA-Z]/;
    return letterPattern.test(char);
}
/**
 * Tells whether the given character may appear in an identifier:
 * a letter, a digit, an underscore or a dollar sign.
 */
function isIdentifierChar(char) {
    if (char === '_' || char === '$') {
        return true;
    }
    return isAlpha(char) || isDigit(char);
}
/**
 * Converts a character index into a 1-based line and column.
 *
 * Only '\n' counts as a line break. The column is the number of characters
 * between the last line break before `index` and `index`, plus one.
 *
 * @returns An object of the form { line, column }
 */
function getPositionDetails(text, index) {
    const preceding = text.slice(0, index);
    let line = 1;
    for (let at = preceding.indexOf('\n'); at !== -1; at = preceding.indexOf('\n', at + 1)) {
        line += 1;
    }
    // lastIndexOf yields -1 on the first line, which makes the column length + 1.
    const column = preceding.length - preceding.lastIndexOf('\n');
    return { line, column };
}
/**
 * Renders a character index as the text "line L, column C",
 * using the values computed by getPositionDetails.
 */
function formatPosition(text, index) {
    const where = getPositionDetails(text, index);
    return `line ${where.line}, column ${where.column}`;
}
/**
 * Collects every backtick-fenced span in the text.
 *
 * Each returned entry is the complete matched span, fence markers included.
 * A span must contain at least one character and no backtick between fences.
 *
 * @returns An array of matched spans, or null when there is none
 */
function extractMarkdownCodeBlocks(text) {
    const fences = text.match(/```(?:json)?([^`]+)```/g);
    if (!fences) {
        return null;
    }
    return [...fences];
}
/**
 * Replaces typographic ("curly") quotation marks with their ASCII equivalents.
 *
 * U+201C and U+201D become a straight double quote; U+2018 and U+2019 become
 * a straight single quote. ASCII quotes are left as they are.
 *
 * The character classes are written as Unicode escapes on purpose: with
 * literal ASCII quotes in them the function would replace each quote with
 * itself and do nothing.
 *
 * @param text Text that may contain typographic quotes
 * @returns The text with typographic quotes replaced
 */
function normalizeQuotes(text) {
    return text
        .replace(/[\u201C\u201D]/g, '"')
        .replace(/[\u2018\u2019]/g, "'");
}
/**
 * Tells whether the text shows any of a handful of markdown markers:
 * a code fence, a header line, bold, italic, or a link.
 */
function containsMarkdown(text) {
    const indicators = [
        /```/, // Code blocks
        /^#+\s+/m, // Headers
        /\*\*.+\*\*/, // Bold
        /\*.+\*/, // Italic
        /\[.+\]\(.+\)/ // Links
    ];
    for (const indicator of indicators) {
        if (indicator.test(text)) {
            return true;
        }
    }
    return false;
}
/**
 * Tells whether the text looks like conversational LLM output rather than
 * bare JSON.
 *
 * It returns true when any of these is found:
 *  - a phrase like "here's the JSON" / "heres your output"
 *  - a phrase like "I've generated" / "I'll provided" / "I'm created"
 *  - a "```json" fence
 *  - a line consisting only of a label followed by a colon
 *
 * The apostrophe in the first two phrases may be the ASCII ' or the
 * typographic U+2019, or be missing altogether.
 *
 * @param text Text to inspect
 * @returns true when one of the patterns matches
 */
function isLLMStyleOutput(text) {
    const llmPatterns = [
        /here['\u2019]?s\s+(?:the|an?|your)\s+(?:json|output|response)/i,
        /I['\u2019]?(?:ve|ll|m)\s+(?:generated|created|provided)/i,
        /```json/i,
        /^\s*[\w\s]+:\s*$/m // Explanatory labels followed by a colon
    ];
    return llmPatterns.some(pattern => pattern.test(text));
}
;// ./src/regular/jsonFix.ts
/**
 * Repairs malformed JSON produced by an LLM.
 *
 * When `applyModelSpecificFixes` is on (the default), the text first goes
 * through applyLLMSpecificFixes; the result is then handed to jsonrepair
 * together with the `preserveComments` and `verbose` options.
 *
 * @param text Potentially broken JSON text
 * @param options Configuration options: applyModelSpecificFixes (default true),
 *   model (default 'general', accepted but not used), preserveComments
 *   (default false), verbose (default false)
 * @returns Repaired JSON string
 * @throws {UnrepairableJSONError} If the JSON cannot be repaired
 */
function fixLLMJson(text, options = {}) {
    const {
        applyModelSpecificFixes = true,
        // Accepted for API consistency; nothing reads it yet.
        // eslint-disable-next-line @typescript-eslint/no-unused-vars
        model = 'general',
        preserveComments = false,
        verbose = false
    } = options;
    // The LLM-specific stage always strips comments, since they are likely part
    // of the model's explanatory text; preserveComments only affects jsonrepair.
    const candidate = applyModelSpecificFixes ? applyLLMSpecificFixes(text) : text;
    return jsonrepair(candidate, { preserveComments, verbose });
}
/**
 * Repairs an invalid JSON document with a single left-to-right scan.
 *
 * What the scan does:
 *  - Whitespace between tokens is dropped, except that each line terminator
 *    is emitted as '\n' followed by one space per open structure.
 *  - Line and block comments are removed, or copied through when
 *    `preserveComments` is set.
 *  - Closing brackets with no open structure are dropped; a closer of the
 *    wrong kind is replaced by the one that matches the innermost structure.
 *  - A comma directly before a closing bracket is dropped.
 *  - Strings, single or double quoted, are emitted double quoted. Raw control
 *    characters are escaped, \' becomes ', and a bare " inside a
 *    single-quoted string is escaped. A string still open at the end of the
 *    input is kept in full and closed.
 *  - Number-like tokens that are not valid JSON numbers (leading zeros, '+5',
 *    '.5', ...) are re-serialised via parseFloat, or replaced by 0.
 *  - Bare words are read as whole tokens: inside an object a word followed by
 *    ':' or '=' becomes a quoted key; true/false/null/None in any letter case
 *    become JSON literals; any other word is skipped.
 *  - A single '=' is emitted as ':'.
 *  - Structures still open at the end are closed.
 *
 * @param text The JSON document containing errors
 * @param options Repair options: preserveComments (default false),
 *   verbose (default false)
 * @returns Repaired JSON as string ('' for empty input)
 * @throws {UnrepairableJSONError} If the scan position fails to advance
 */
function jsonrepair(text, options = {}) {
    if (text === '') {
        return '';
    }
    const { preserveComments = false, verbose = false } = options;
    const controlEscapes = { '\b': '\\b', '\f': '\\f', '\n': '\\n', '\r': '\\r', '\t': '\\t' };
    // A Map rather than an object so words like "constructor" cannot hit the prototype.
    const literalTokens = new Map([
        ['true', 'true'],
        ['false', 'false'],
        ['null', 'null'],
        ['none', 'null']
    ]);
    // JSON number grammar: no leading zeros, no leading '+', digits on both sides of '.'.
    const validNumber = /^-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?$/;
    let i = 0; // current index in text
    let output = ''; // generated output
    let indentation = 0; // current nesting depth, used after line breaks
    let previousIndex = -1; // index seen by the previous loop iteration
    // stack of 'object' / 'array' markers for the structures currently open
    const stack = [];
    // NOTE(review): collected only when verbose is set, and never returned or
    // printed by this function.
    const changes = [];
    function trackChange(message) {
        if (verbose) {
            changes.push({ index: i, message });
        }
    }
    /**
     * Get the next character after index i that is not whitespace, or null.
     */
    function nextNonWhitespaceCharacter() {
        let j = i + 1;
        while (j < text.length && isWhitespace(text[j])) {
            j++;
        }
        if (j >= text.length) {
            return null;
        }
        return text[j];
    }
    /**
     * JSON escape sequence for a raw control character (code below 0x20).
     */
    function escapeControlCharacter(char) {
        if (controlEscapes[char]) {
            return controlEscapes[char];
        }
        return `\\u${('0000' + char.charCodeAt(0).toString(16)).slice(-4)}`;
    }
    /**
     * Turn the raw text between two quotes into the body of a JSON string.
     * Escape pairs are consumed together so an escaped backslash is never
     * mistaken for the start of another escape.
     */
    function normalizeStringBody(body) {
        let normalized = '';
        for (let j = 0; j < body.length; j++) {
            const current = body[j];
            if (current === '\\') {
                if (j + 1 >= body.length) {
                    // Dangling backslash at the end of the input: drop it so
                    // it cannot escape the closing quote.
                    break;
                }
                const escaped = body[++j];
                if (escaped === "'") {
                    // \' is not a JSON escape; the apostrophe needs none.
                    normalized += "'";
                }
                else if (escaped < ' ') {
                    normalized += escapeControlCharacter(escaped);
                }
                else {
                    normalized += '\\' + escaped;
                }
            }
            else if (current === '"') {
                // Only reachable inside single-quoted strings.
                normalized += '\\"';
            }
            else if (current < ' ') {
                normalized += escapeControlCharacter(current);
            }
            else {
                normalized += current;
            }
        }
        return normalized;
    }
    // Process the text token by token
    while (i < text.length) {
        // Every branch below must move i forward; fail loudly if one does not.
        if (i <= previousIndex) {
            throw new UnrepairableJSONError(`Infinite loop detected at ${formatPosition(text, i)}`, i);
        }
        previousIndex = i;
        const char = text[i];
        // Handle whitespace between tokens
        if (isWhitespace(char)) {
            i++;
            // Include newlines in the output, but skip other whitespace
            if (isLineTerminator(char)) {
                if (indentation > 0) {
                    output += '\n' + ' '.repeat(indentation);
                }
                else {
                    output += '\n';
                }
            }
            continue;
        }
        // Handle comments
        if (char === '/' && i + 1 < text.length) {
            const nextChar = text[i + 1];
            if (nextChar === '/') {
                // Single-line comment
                if (preserveComments) {
                    while (i < text.length && !isLineTerminator(text[i])) {
                        output += text[i];
                        i++;
                    }
                    trackChange('Preserved single-line comment');
                    continue;
                }
                trackChange('Removing single-line comment');
                i += 2;
                while (i < text.length && !isLineTerminator(text[i])) {
                    i++;
                }
                continue;
            }
            if (nextChar === '*') {
                // Multi-line comment
                if (preserveComments) {
                    output += '/*';
                    i += 2;
                    while (i + 1 < text.length && !(text[i] === '*' && text[i + 1] === '/')) {
                        output += text[i];
                        i++;
                    }
                    if (i + 1 < text.length) {
                        output += '*/';
                        i += 2;
                    }
                    trackChange('Preserved multi-line comment');
                    continue;
                }
                trackChange('Removing multi-line comment');
                i += 2;
                while (i + 1 < text.length && !(text[i] === '*' && text[i + 1] === '/')) {
                    i++;
                }
                i += 2;
                continue;
            }
        }
        if (char === '{') {
            // Start of an object
            stack.push('object');
            output += char;
            indentation++;
            i++;
            continue;
        }
        if (char === '[') {
            // Start of an array
            stack.push('array');
            output += char;
            indentation++;
            i++;
            continue;
        }
        if (char === '}' || char === ']') {
            // End of an object or array
            if (stack.length === 0) {
                // Unmatched closing bracket, remove it
                trackChange(`Removing unmatched closing ${char === '}' ? 'curly brace' : 'square bracket'}`);
                i++;
                continue;
            }
            const currentStructure = stack.pop();
            const expectedClosing = currentStructure === 'object' ? '}' : ']';
            if (char !== expectedClosing) {
                // Mismatched closing bracket
                trackChange(`Replacing ${char} with ${expectedClosing}`);
            }
            output += expectedClosing;
            indentation--;
            i++;
            continue;
        }
        if (char === ',') {
            const next = nextNonWhitespaceCharacter();
            if (next === '}' || next === ']') {
                // Trailing comma; skip it
                trackChange('Removing trailing comma');
                i++;
                continue;
            }
            output += char;
            i++;
            continue;
        }
        if (char === '"' || char === "'") {
            // String: find the first unescaped quote of the same kind
            const quoteType = char;
            let endIndex = i + 1;
            let isEscaped = false;
            while (endIndex < text.length) {
                const current = text[endIndex];
                if (current === '\\') {
                    isEscaped = !isEscaped;
                }
                else if (current === quoteType && !isEscaped) {
                    break;
                }
                else {
                    isEscaped = false;
                }
                endIndex++;
            }
            const terminated = endIndex < text.length;
            // For an unterminated string the body is everything up to the end
            // of the input, starting with the character right after the quote.
            const body = text.substring(i + 1, endIndex);
            output += `"${normalizeStringBody(body)}"`;
            if (quoteType === "'") {
                trackChange('Converted single quotes to double quotes in string');
            }
            if (!terminated) {
                trackChange('Added missing closing double quote at end of text');
            }
            i = terminated ? endIndex + 1 : text.length;
            continue;
        }
        if (char === ':') {
            // Property separator in an object
            output += char;
            i++;
            continue;
        }
        if (isDigit(char) || char === '-' || char === '+' || char === '.') {
            // Number: gather every character that can belong to one
            let numberStr = '';
            while (i < text.length &&
                (isDigit(text[i]) ||
                    text[i] === '-' ||
                    text[i] === '+' ||
                    text[i] === '.' ||
                    text[i] === 'e' ||
                    text[i] === 'E')) {
                numberStr += text[i];
                i++;
            }
            if (validNumber.test(numberStr)) {
                output += numberStr;
            }
            else {
                trackChange('Fixing invalid number format');
                // parseFloat reads the longest numeric prefix and never throws.
                const parsed = parseFloat(numberStr);
                if (!Number.isNaN(parsed)) {
                    output += JSON.stringify(parsed);
                }
                else {
                    output += '0';
                    trackChange('Replaced invalid number with 0');
                }
            }
            continue;
        }
        if (isAlpha(char) || char === '_' || char === '$') {
            // Bare word: read the whole token before deciding what it is, so
            // that keywords are never recognised in the middle of a word.
            let end = i;
            while (end < text.length && isIdentifierChar(text[end])) {
                end++;
            }
            const word = text.substring(i, end);
            let after = end;
            while (after < text.length && isWhitespace(text[after])) {
                after++;
            }
            const follower = text[after];
            const inObject = stack[stack.length - 1] === 'object';
            if (inObject && follower === ':') {
                output += `"${word}":`;
                trackChange('Added quotes around property name');
                i = after + 1;
                continue;
            }
            if (inObject && follower === '=') {
                // Leave the '=' in place; it is turned into ':' further down.
                output += `"${word}"`;
                trackChange('Added quotes around property name');
                i = after;
                continue;
            }
            const literal = literalTokens.get(word.toLowerCase());
            if (literal !== undefined) {
                output += literal;
                if (word.toLowerCase() === 'none') {
                    trackChange('Converted Python None to null');
                }
                else {
                    trackChange(`Normalized to lowercase ${literal}`);
                }
                i = end;
                continue;
            }
            // Unrecognized token
            trackChange('Skipping unrecognized token');
            i = end;
            continue;
        }
        if (char === '=') {
            // Sometimes used instead of colon
            const next = nextNonWhitespaceCharacter();
            if (next !== '=') {
                // Single equals, likely a mistake for a colon
                output += ':';
                trackChange('Replaced = with :');
                i++;
                continue;
            }
        }
        // Skip any other characters we don't recognize
        i++;
    }
    // Handle unclosed structures
    while (stack.length > 0) {
        const currentStructure = stack.pop();
        const closingChar = currentStructure === 'object' ? '}' : ']';
        output += closingChar;
        trackChange(`Added missing closing ${closingChar}`);
    }
    return output;
}
;// ./src/index.ts
/**
* LLM JSON Fix - A library for fixing malformed JSON outputs from LLMs
*
* @packageDocumentation
*/
// Export the main API
/******/ return __webpack_exports__;
/******/ })()
;
});