UNPKG

llm-json-fix

Version:

Fix malformed JSON outputs from Large Language Models (LLMs)

206 lines (205 loc) 6.75 kB
/**
 * Patterns for detecting and handling LLM-specific JSON issues
 */

/** Characters that end a line for the purposes of `//` comments. */
const LINE_TERMINATOR = /[\n\r\u2028\u2029]/;

/**
 * Detects markdown code blocks and extracts the content
 * @param {string} text Input text that may contain markdown code blocks
 * @returns {string|null} The trimmed content of the first JSON code block,
 *   or null if none found
 */
export function extractJsonFromMarkdown(text) {
  // Fence that is either untagged or tagged `json`
  const jsonBlockRegex = /```(?:json)?\s*\n([\s\S]*?)\n```/;
  const match = text.match(jsonBlockRegex);
  if (match && match[1]) {
    return match[1].trim();
  }

  // Fences tagged with some other language may still carry JSON
  const genericBlockRegex = /```(?:\w*)?\s*\n([\s\S]*?)\n```/;
  const genericMatch = text.match(genericBlockRegex);
  if (genericMatch && genericMatch[1]) {
    const content = genericMatch[1].trim();
    // Only accept the block when it starts like a JSON object or array
    if (/^\s*[{[]/.test(content)) {
      return content;
    }
  }

  return null;
}

/**
 * Removes explanatory text that LLMs often include before/after JSON output
 * @param {string} text Raw model output
 * @returns {string} The fenced code block content if one is found, otherwise
 *   the span from the first `{`/`[` to the last matching-type closer, otherwise
 *   the input unchanged
 */
export function stripExplanatoryText(text) {
  // First, try to extract JSON from markdown
  const jsonFromMarkdown = extractJsonFromMarkdown(text);
  if (jsonFromMarkdown) {
    return jsonFromMarkdown;
  }

  // Greedy match: first opening bracket through the last closing bracket
  const jsonPattern = /(\{[\s\S]*\}|\[[\s\S]*\])/;
  const match = text.match(jsonPattern);
  if (match && match[1]) {
    return match[1];
  }

  // If we can't easily extract JSON, just return the original
  return text;
}

/**
 * Fixes code block annotations that may have been erroneously included in the output
 * @param {string} text Text that may still contain stray ``` fence lines
 * @returns {string} Text with opening/closing fence markers removed
 */
export function stripCodeBlockAnnotations(text) {
  return text
    .replace(/^```json\s*\n/gm, '')
    .replace(/\n```\s*$/gm, '')
    .replace(/^```\s*\n/gm, '');
}

/**
 * Removes trailing ellipses, which are often used by LLMs to indicate truncation
 * or continuation that isn't actual JSON data
 * @param {string} text JSON-like text
 * @returns {string} Text with `, ...` placeholders removed
 */
export function removeEllipses(text) {
  // Use specific patterns to avoid unintended replacements
  let result = text;
  // Ellipses at the end of arrays
  result = result.replace(/,\s*\.\.\.(\s*])/g, '$1');
  // Ellipses at the end of objects
  result = result.replace(/,\s*\.\.\.(\s*})/g, '$1');
  // Quoted ellipses used as a final element
  result = result.replace(/,\s*["']\.\.\.["'](\s*[\]}])/g, '$1');
  // Trailing ellipses at the end of lines
  result = result.replace(/,\s*\.\.\.$/gm, '');
  return result;
}

/**
 * Removes `//` line comments and closed block comments that appear outside
 * double-quoted strings, leaving string contents (e.g. URLs) untouched.
 * @param {string} text JSON-like text
 * @returns {string} Text without comments
 */
function removeCommentsOutsideStrings(text) {
  const kept = [];
  let inString = false;
  let i = 0;

  while (i < text.length) {
    const char = text[i];

    if (inString) {
      const next = text[i + 1];
      if (char === '\\' && next !== undefined && !LINE_TERMINATOR.test(next)) {
        // Copy the escape pair verbatim so an escaped quote does not end the string
        kept.push(char, next);
        i += 2;
        continue;
      }
      // JSON strings cannot span lines, so a line break also ends string state;
      // this keeps one stray quote from disabling stripping for the rest of the text
      if (char === '"' || LINE_TERMINATOR.test(char)) {
        inString = false;
      }
      kept.push(char);
      i += 1;
      continue;
    }

    if (char === '"') {
      inString = true;
      kept.push(char);
      i += 1;
      continue;
    }

    if (char === '/' && text[i + 1] === '/') {
      // Drop everything up to (not including) the line terminator
      i += 2;
      while (i < text.length && !LINE_TERMINATOR.test(text[i])) {
        i += 1;
      }
      continue;
    }

    if (char === '/' && text[i + 1] === '*') {
      const end = text.indexOf('*/', i + 2);
      if (end !== -1) {
        i = end + 2;
        continue;
      }
      // Unterminated block comment: fall through and keep the text as-is
    }

    kept.push(char);
    i += 1;
  }

  return kept.join('');
}

/**
 * Remove explanations that LLMs sometimes insert as comments
 * @param {string} text JSON-like text
 * @returns {string} Text without `//` and block comments (outside strings)
 *   and without `(Note: ...)` / `[Note: ...]` annotations
 */
export function stripLLMComments(text) {
  return removeCommentsOutsideStrings(text)
    // Explanatory notes often added by LLMs in parentheses or brackets
    .replace(/\(\s*Note:.*?\)/g, '')
    .replace(/\[\s*Note:.*?\]/g, '');
}

/**
 * Handle partial property names or incomplete strings at the end of the text
 * @param {string} text JSON-like text, possibly cut off mid-document
 * @returns {string} The balanced text when balancing changed anything,
 *   otherwise the text with a trailing partial property and trailing commas removed
 */
export function fixTruncatedContent(text) {
  const balancedText = balanceJsonStructure(text);
  // If balancing altered the text, that result is returned as-is
  if (balancedText !== text) {
    return balancedText;
  }

  // Otherwise, look for trailing fragments and remove them
  return text
    // Remove partial property at the end
    .replace(/,\s*"[^"]*$/g, '')
    .replace(/,\s*'[^']*$/g, '')
    // Remove trailing commas
    .replace(/,(\s*[\]}])/g, '$1');
}

/**
 * Try to balance JSON by fixing unclosed brackets
 * @param {string} text JSON-like text
 * @returns {string} The text cut just before the first unmatched closing
 *   bracket, or the text with closers appended for every bracket left open,
 *   or the text unchanged when it is already balanced
 */
export function balanceJsonStructure(text) {
  const openers = [];
  let inString = false;
  let escapeNext = false;

  for (let i = 0; i < text.length; i++) {
    const char = text[i];

    if (escapeNext) {
      escapeNext = false;
      continue;
    }
    if (char === '\\') {
      escapeNext = true;
      continue;
    }
    if (char === '"') {
      inString = !inString;
      continue;
    }
    if (inString) {
      continue;
    }

    if (char === '{' || char === '[') {
      openers.push(char);
    } else if (char === '}' || char === ']') {
      const expectedOpener = char === '}' ? '{' : '[';
      if (openers.length > 0 && openers[openers.length - 1] === expectedOpener) {
        openers.pop();
      } else {
        // Unmatched closing bracket: keep only what precedes it
        return text.substring(0, i);
      }
    }
  }

  if (openers.length === 0) {
    return text;
  }

  // Close remaining brackets innermost-first
  let result = text;
  for (let i = openers.length - 1; i >= 0; i--) {
    result += openers[i] === '{' ? '}' : ']';
  }
  return result;
}

/**
 * Fix incorrect handling of nested JSON strings
 *
 * NOTE: this is a simplistic, purely textual approach. It removes every quote
 * adjacent to a bracket and unescapes every `\"` and `\'`, so it also rewrites
 * legitimately escaped quotes inside ordinary string values.
 * @param {string} text JSON-like text that may embed stringified JSON
 * @returns {string} Text with quotes around nested objects/arrays removed and
 *   escaped quotes unescaped
 */
export function fixNestedJsonStrings(text) {
  return text
    .replace(/"{/g, '{')
    .replace(/}"/g, '}')
    .replace(/"\[/g, '[')
    .replace(/\]"/g, ']')
    // Unescape quotes
    .replace(/\\"/g, '"')
    .replace(/\\'/g, "'");
}

/**
 * Main function to apply all LLM-specific fixes
 * @param {string} text Raw model output
 * @returns {string} Text after extraction, comment removal and structural fixes
 */
export function applyLLMSpecificFixes(text) {
  let result = text;

  // Prefer the content of a markdown code block; otherwise drop stray fences
  const extracted = extractJsonFromMarkdown(result);
  if (extracted) {
    result = extracted;
  } else {
    result = stripCodeBlockAnnotations(result);
  }

  // Remove comments and explanations
  result = stripLLMComments(result);
  result = stripExplanatoryText(result);

  // Fix structural issues
  result = removeEllipses(result);
  result = fixNestedJsonStrings(result);
  result = fixTruncatedContent(result);

  return result;
}