llm-json-fix
Version:
Fix malformed JSON outputs from Large Language Models (LLMs)
217 lines (216 loc) • 7.24 kB
JavaScript
;
/**
* Patterns for detecting and handling LLM-specific JSON issues
*/
// Sets the `__esModule` flag on the CommonJS exports object (presumably read by
// module-interop helpers in consumers; those consumers are not visible in this file).
Object.defineProperty(exports, "__esModule", { value: true });
// Public API of this module. Every name assigned here is a function declaration
// that appears further down; function declarations are hoisted, so referencing
// them before their textual definition is safe.
exports.extractJsonFromMarkdown = extractJsonFromMarkdown;
exports.stripExplanatoryText = stripExplanatoryText;
exports.stripCodeBlockAnnotations = stripCodeBlockAnnotations;
exports.removeEllipses = removeEllipses;
exports.stripLLMComments = stripLLMComments;
exports.fixTruncatedContent = fixTruncatedContent;
exports.balanceJsonStructure = balanceJsonStructure;
exports.fixNestedJsonStrings = fixNestedJsonStrings;
exports.applyLLMSpecificFixes = applyLLMSpecificFixes;
/**
 * Detects markdown code blocks and extracts the content of the most likely
 * JSON block.
 *
 * Fences are paired in document order (opening fence, body, closing fence), so
 * the closing fence of one block is never mistaken for the opening fence of the
 * text that follows it.
 *
 * Selection rules:
 *  1. the first non-empty block tagged `json` (any letter case) or untagged wins;
 *  2. otherwise the first block carrying any other tag whose content starts
 *     with `{` or `[` is used.
 *
 * @param text Input text that may contain markdown code blocks
 * @returns The trimmed content of the selected block, or null if none found
 */
function extractJsonFromMarkdown(text) {
    // group 1: info-string tag (may be empty); group 2: block body.
    // The opener must be followed only by horizontal whitespace up to the line
    // break, so inline mentions such as "use ``` for code" are not openers.
    const fencedBlock = /```([^\s`]*)[^\S\n]*\n([\s\S]*?)\n```/g;
    let jsonLikeFallback = null;
    let block = fencedBlock.exec(text);
    while (block !== null) {
        const tag = block[1].toLowerCase();
        const content = block[2].trim();
        if (content !== '') {
            if (tag === '' || tag === 'json') {
                return content;
            }
            // Remember the first foreign-tagged block that looks like JSON, but
            // keep scanning in case a json/untagged block appears later.
            if (jsonLikeFallback === null && /^[{[]/.test(content)) {
                jsonLikeFallback = content;
            }
        }
        block = fencedBlock.exec(text);
    }
    return jsonLikeFallback;
}
/**
 * Drops the prose an LLM tends to wrap around its JSON answer.
 *
 * Tries, in order: the content of a markdown code block (via
 * extractJsonFromMarkdown); the widest `{...}` or `[...]` span found in the
 * text; and finally the untouched input when neither is present.
 *
 * @param text Raw model output
 * @returns The extracted JSON-looking fragment, or the original text
 */
function stripExplanatoryText(text) {
    const fenced = extractJsonFromMarkdown(text);
    if (fenced) {
        return fenced;
    }
    // Greedy on purpose: spans from the first opener to the last matching closer.
    const bracketed = text.match(/(\{[\s\S]*\}|\[[\s\S]*\])/);
    const hasFragment = bracketed !== null && Boolean(bracketed[1]);
    return hasFragment ? bracketed[1] : text;
}
/**
 * Removes stray markdown fence lines that were left in the output.
 *
 * Three substitutions run in sequence: a leading "```json" line, a closing
 * "```" that ends a line, and a bare leading "```" line.
 *
 * @param text Text that may still carry fence markers
 * @returns The text with those fence markers removed
 */
function stripCodeBlockAnnotations(text) {
    const fenceMarkers = [
        /^```json\s*\n/gm,
        /\n```\s*$/gm,
        /^```\s*\n/gm,
    ];
    let cleaned = text;
    for (const marker of fenceMarkers) {
        cleaned = cleaned.replace(marker, '');
    }
    return cleaned;
}
/**
 * Deletes "..." placeholders that LLMs append after a comma to signal
 * "and so on"; they are not JSON data.
 *
 * The rewrites are applied in a fixed order, each on the result of the
 * previous one.
 *
 * @param text JSON-like text
 * @returns The text without comma-prefixed ellipsis placeholders
 */
function removeEllipses(text) {
    const rewrites = [
        // ", ..." right before the end of an array
        [/,\s*\.\.\.(\s*])/g, '$1'],
        // ", ..." right before the end of an object
        [/,\s*\.\.\.(\s*})/g, '$1'],
        // quoted placeholder element before a closer
        [/,\s*["']\.\.\.["'](\s*[\]}])/g, '$1'],
        // ", ..." dangling at the end of a line
        [/,\s*\.\.\.$/gm, ''],
    ];
    return rewrites.reduce(
        (current, [pattern, replacement]) => current.replace(pattern, replacement),
        text,
    );
}
/**
 * Removes explanations that LLMs sometimes insert as comments: line comments
 * (double slash to end of line), slash-star block comments, and
 * "(Note: ...)" / "[Note: ...]" asides.
 *
 * Complete double-quoted string literals are matched first and copied through
 * verbatim, so values such as "https://example.com" keep their `//`. A string
 * is only recognised when it opens and closes on the same line; an
 * unterminated quote is not protected and the rest of its line is scanned for
 * comments as plain text.
 *
 * @param text JSON-like text that may contain comments
 * @returns The text with comments outside string literals removed
 */
function stripLLMComments(text) {
    // Alternation order matters: at any position a string literal is tried
    // before the comment forms, which is what shields string contents.
    const stringOrComment = /"(?:[^"\\\n]|\\.)*"|\/\/.*$|\/\*[\s\S]*?\*\/|\(\s*Note:.*?\)|\[\s*Note:.*?\]/gm;
    return text.replace(stringOrComment, (token) => (token[0] === '"' ? token : ''));
}
/**
 * Repairs text that was cut off mid-document.
 *
 * Steps, always applied in this order:
 *  1. If the text ends inside a double-quoted string that started a new
 *     element or property (i.e. it is preceded by a comma), that dangling
 *     fragment and its comma are dropped. A dangling single-quoted fragment
 *     after a comma is dropped as well. An unterminated string in any other
 *     position is left as is and handed to balanceJsonStructure.
 *  2. balanceJsonStructure is applied to the result.
 *  3. Trailing commas directly before `]` or `}` are removed.
 *
 * Steps 1 and 3 skip over complete double-quoted strings, so commas that are
 * part of a string value are never touched.
 *
 * @param text Possibly truncated JSON-like text
 * @returns The repaired text
 */
function fixTruncatedContent(text) {
    // Find the opening quote of a string that is still open at end of input.
    // A backslash escapes the next character, mirroring balanceJsonStructure.
    let openQuoteAt = -1;
    let escaped = false;
    for (let i = 0; i < text.length; i++) {
        const ch = text[i];
        if (escaped) {
            escaped = false;
        }
        else if (ch === '\\') {
            escaped = true;
        }
        else if (ch === '"') {
            openQuoteAt = openQuoteAt === -1 ? i : -1;
        }
    }
    let repaired = text;
    if (openQuoteAt !== -1) {
        const head = text.slice(0, openQuoteAt);
        const danglingComma = /,\s*$/.exec(head);
        if (danglingComma !== null) {
            repaired = head.slice(0, danglingComma.index);
        }
    }
    else {
        // No open double-quoted string here, and the fragment may not contain
        // a double quote, so the matched comma cannot sit inside a string.
        repaired = text.replace(/,\s*'[^'"]*$/, '');
    }
    repaired = balanceJsonStructure(repaired);
    // Complete strings are matched first and returned unchanged; only commas
    // outside of them that directly precede a closer are removed.
    return repaired.replace(/"(?:[^"\\]|\\[\s\S])*"|,(\s*[\]}])/g, (match, closer) => (closer === undefined ? match : closer));
}
/**
 * Makes the bracket structure of JSON-like text balanced.
 *
 * The text is scanned once. Characters inside double-quoted strings are
 * ignored, and a backslash escapes the character after it.
 *  - At the first closing bracket that does not match the innermost open
 *    bracket, the text is cut just before that bracket.
 *  - If the scan reaches the end inside a string, the string is terminated
 *    first (a dangling trailing backslash is dropped), because closers
 *    appended inside an open string would not close anything.
 *  - Every bracket still open is then closed, innermost first.
 *
 * @param text JSON-like text, possibly truncated or with stray closers
 * @returns The text with balanced `{}` / `[]`; the input itself when it is
 *          already balanced
 */
function balanceJsonStructure(text) {
    const closerFor = { '{': '}', '[': ']' };
    const openers = [];
    let inString = false;
    let escapeNext = false;
    let cutAt = -1;
    for (let i = 0; i < text.length; i++) {
        const char = text[i];
        if (escapeNext) {
            escapeNext = false;
            continue;
        }
        if (char === '\\') {
            escapeNext = true;
            continue;
        }
        if (char === '"') {
            inString = !inString;
            continue;
        }
        if (inString) {
            continue;
        }
        if (char === '{' || char === '[') {
            openers.push(char);
        }
        else if (char === '}' || char === ']') {
            const innermost = openers[openers.length - 1];
            if (openers.length > 0 && closerFor[innermost] === char) {
                openers.pop();
            }
            else {
                // Unmatched closing bracket: everything from here on is dropped.
                cutAt = i;
                break;
            }
        }
    }
    let result = cutAt === -1 ? text : text.substring(0, cutAt);
    // The scan only stops early outside of strings, so an open string can
    // only be left over when the whole text was consumed.
    if (cutAt === -1 && inString) {
        if (escapeNext) {
            // A trailing lone backslash would escape the quote added below.
            result = result.slice(0, -1);
        }
        result += '"';
    }
    for (let i = openers.length - 1; i >= 0; i--) {
        result += closerFor[openers[i]];
    }
    return result;
}
/**
 * Inlines JSON that was embedded as a string value, e.g.
 * {"a": "{\"b\": 1}"} becomes {"a": {"b": 1}}.
 *
 * Strategy:
 *  1. Every complete string literal whose decoded content is itself a valid
 *     JSON object or array is replaced by that content (recursively). All
 *     other strings, including ones that merely contain escaped quotes or
 *     start with a bracket, are left untouched.
 *  2. If the result parses as JSON it is returned. If it does not but the
 *     input did, the input is returned unchanged, so valid JSON is never
 *     damaged.
 *  3. Only when neither parses (the text is still malformed) are the original
 *     blunt heuristics applied: drop quotes adjacent to brackets and unescape
 *     quotes. These can alter string contents and are a best-effort fallback.
 *
 * @param text JSON-like text
 * @returns The text with nested JSON strings inlined
 */
function fixNestedJsonStrings(text) {
    // A parse failure is an expected outcome here, not an error to report.
    const parses = (candidate) => {
        try {
            JSON.parse(candidate);
            return true;
        }
        catch (err) {
            return false;
        }
    };
    const unwrapped = text.replace(/"(?:[^"\\]|\\[\s\S])*"/g, (literal) => {
        let decoded;
        try {
            decoded = JSON.parse(literal);
        }
        catch (err) {
            // Not a well-formed JSON string (raw newline, bad escape): keep it.
            return literal;
        }
        const inner = decoded.trim();
        if (!/^[{[]/.test(inner) || !parses(inner)) {
            return literal;
        }
        // inner is valid JSON, so the recursive call never reaches the fallback.
        return fixNestedJsonStrings(inner);
    });
    if (parses(unwrapped)) {
        return unwrapped;
    }
    if (parses(text)) {
        return text;
    }
    // Fallback for text that is still malformed (e.g. truncated, or an object
    // wrapped in quotes without escaping).
    return text
        .replace(/"{/g, '{')
        .replace(/}"/g, '}')
        .replace(/"\[/g, '[')
        .replace(/\]"/g, ']')
        .replace(/\\"/g, '"')
        .replace(/\\'/g, "'");
}
/**
 * Entry point that runs every LLM-specific repair in sequence.
 *
 * The input is first reduced to the content of a markdown code block when one
 * can be extracted; otherwise leftover fence markers are stripped. The result
 * then flows through the remaining repairs in a fixed order: comment removal,
 * explanatory-text removal, ellipsis removal, nested-string repair, and
 * truncation repair.
 *
 * @param text Raw model output
 * @returns The text after all repairs have been applied
 */
function applyLLMSpecificFixes(text) {
    const fenced = extractJsonFromMarkdown(text);
    const unfenced = fenced ? fenced : stripCodeBlockAnnotations(text);
    // Order matters: each step consumes the previous step's output.
    const repairs = [
        stripLLMComments,
        stripExplanatoryText,
        removeEllipses,
        fixNestedJsonStrings,
        fixTruncatedContent,
    ];
    return repairs.reduce((current, repair) => repair(current), unfenced);
}