@solvers-hub/llm-json
Version:
A TypeScript SDK to extract and correct JSON from LLM outputs
263 lines (262 loc) • 9.96 kB
JavaScript
"use strict";
// Flag the CommonJS exports object with `__esModule`
// (presumably emitted by the TypeScript compiler for ES-module interop — confirm against the build).
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder for the export; the class itself is assigned at the bottom of the file.
exports.JsonExtractor = void 0;
// Source of JsonCorrector.correctJson, which the extractor calls when `attemptCorrection` is enabled.
const corrector_1 = require("./corrector");
// Source of SchemaValidator, which the extractor instantiates when `schemas` are configured.
const validator_1 = require("./validator");
/**
 * Extracts JSON values, and the free text around them, from a string such as
 * raw LLM output.
 *
 * Extraction strategy, in order:
 *  1. Fenced markdown code blocks (``` or ```json).
 *  2. Brace-delimited `{ ... }` spans found anywhere in the text.
 *
 * Optional behaviour is controlled through the constructor options:
 *  - `attemptCorrection`: run `JsonCorrector.correctJson` on text that fails to parse.
 *  - `schemas`: validate every extracted value with `SchemaValidator.validateAll`.
 */
class JsonExtractor {
  /**
   * Creates a new instance of JsonExtractor.
   * @param options - Configuration options for extraction. Reads
   *   `attemptCorrection` (defaults to `false`) and `schemas`.
   */
  constructor(options = {}) {
    this.schemaValidator = null;
    this.options = {
      attemptCorrection: false,
      ...options,
    };
    // A validator is only needed when there is at least one schema to check against.
    if (this.options.schemas && this.options.schemas.length > 0) {
      this.schemaValidator = new validator_1.SchemaValidator();
    }
  }

  /**
   * Extract JSON and text from a string input.
   * @param input - The input string that may contain JSON.
   * @returns An object with `text` (non-JSON fragments) and `json` (parsed
   *   values). `validatedJson` holds schema validation results; on the
   *   code-block and whole-input paths it is present only when non-empty,
   *   on the brace-scanning path it is always present (possibly `[]`).
   */
  extract(input) {
    if (!input || typeof input !== 'string') {
      return { text: [], json: [] };
    }

    // 1. Fenced code blocks take precedence over everything else.
    const codeBlocksResult = this.extractJsonFromCodeBlocks(input);
    if (codeBlocksResult.json.length > 0) {
      // Validate here as well so code-block JSON is treated like every other path.
      const validatedJson = this.validateJson(codeBlocksResult.json);
      return {
        ...codeBlocksResult,
        ...(validatedJson.length > 0 && { validatedJson }),
      };
    }

    // 2. No usable code blocks: scan for brace-delimited candidates.
    const jsonBlocks = this.findJsonBlocks(input);

    // If the scan found nothing but the whole input looks like an object, try it as one value.
    const trimmed = input.trim();
    if (jsonBlocks.length === 0 && trimmed.startsWith('{') && trimmed.endsWith('}')) {
      try {
        const { corrected } = this.options.attemptCorrection
          ? corrector_1.JsonCorrector.correctJson(trimmed)
          : { corrected: trimmed };
        const parsed = JSON.parse(corrected);
        const validatedJson = this.validateJson([parsed]);
        return {
          text: [],
          json: [parsed],
          ...(validatedJson.length > 0 && { validatedJson }),
        };
      } catch (e) {
        // Deliberate best effort: correction or parsing failed, so fall
        // through to the regular block processing below.
      }
    }

    // 3. Parse the candidates and collect the text between them.
    const parsedBlocks = this.parseJsonBlocks(jsonBlocks);
    const textBlocks = this.extractTextBlocks(input, jsonBlocks);
    const extractedJson = parsedBlocks.map((block) => block.parsed).filter(Boolean);
    const validatedJson = this.validateJson(extractedJson);
    return {
      text: textBlocks,
      json: extractedJson,
      validatedJson,
    };
  }

  /**
   * Validates JSON objects against provided schemas.
   * @param jsonObjects - The JSON objects to validate.
   * @returns Array of validation results; empty when no validator/schemas are
   *   configured or when `jsonObjects` is empty.
   */
  validateJson(jsonObjects) {
    const schemas = this.options.schemas;
    if (!this.schemaValidator || !schemas || schemas.length === 0 || !jsonObjects.length) {
      return [];
    }
    return this.schemaValidator.validateAll(jsonObjects, schemas);
  }

  /**
   * Extract JSON from markdown code blocks.
   * @param input - The input string that may contain code blocks.
   * @returns An object containing arrays of extracted text and JSON. `json`
   *   is empty when no well-formed fenced block parses.
   */
  extractJsonFromCodeBlocks(input) {
    // A fence must be followed by a newline, and the closing fence must be
    // preceded by one; the capture group is the block's content.
    const fencePattern = /```(?:json)?[\s]*\n([\s\S]*?)\n[\s]*```/g;
    const matches = [];
    let match;
    // exec() loop kept (rather than matchAll) for older runtimes.
    while ((match = fencePattern.exec(input)) !== null) {
      matches.push(match);
    }

    if (matches.length === 0) {
      // Badly formatted fences (content on the fence line) and fences preceded
      // by whitespace are not treated as JSON code blocks: report the input as text.
      const hasBadlyFormattedFence = /```(?:json)?([^`\n][\s\S]*?)```/.test(input);
      const hasIndentedFence = /[\s]+```/.test(input);
      if (hasBadlyFormattedFence || hasIndentedFence) {
        return { text: [input], json: [] };
      }
      return { text: [], json: [] };
    }

    const jsonBlocks = [];
    const blockRanges = [];
    for (const found of matches) {
      const [fullMatch, jsonContent] = found;
      const startIndex = found.index;
      const endIndex = startIndex + fullMatch.length - 1; // inclusive
      const content = jsonContent.trim();
      // Empty fences contribute no JSON candidate...
      if (content) {
        jsonBlocks.push({ raw: content, startIndex, endIndex });
      }
      // ...but the whole fence is still excluded from the text output.
      blockRanges.push({ raw: fullMatch, startIndex, endIndex });
    }

    const parsedBlocks = this.parseJsonBlocks(jsonBlocks);
    const textBlocks = this.extractTextBlocks(input, blockRanges);
    return {
      text: textBlocks,
      // filter(Boolean) drops falsy parsed values (null, 0, false, "").
      json: parsedBlocks.map((block) => block.parsed).filter(Boolean),
    };
  }

  /**
   * Find potential JSON blocks in the input string.
   *
   * Braces inside double-quoted string literals are ignored when matching, so
   * a value such as `{"a": "}"}` is found as one block. If that string-aware
   * pass finds no closing brace (for example because of an unterminated
   * string), plain brace counting is used instead.
   *
   * @param input - The input string to search for JSON.
   * @returns Array of detected JSON blocks (`raw`, `startIndex`, inclusive `endIndex`).
   */
  findJsonBlocks(input) {
    const jsonBlocks = [];
    let currentIndex = 0;
    while (currentIndex < input.length) {
      const openBraceIndex = input.indexOf('{', currentIndex);
      if (openBraceIndex === -1) {
        break;
      }
      let closeBraceIndex = this.findMatchingBrace(input, openBraceIndex, true);
      if (closeBraceIndex === -1) {
        closeBraceIndex = this.findMatchingBrace(input, openBraceIndex, false);
      }
      if (closeBraceIndex === -1) {
        // Unbalanced opener: retry from the next character.
        currentIndex = openBraceIndex + 1;
        continue;
      }
      jsonBlocks.push({
        raw: input.substring(openBraceIndex, closeBraceIndex + 1),
        startIndex: openBraceIndex,
        endIndex: closeBraceIndex,
      });
      currentIndex = closeBraceIndex + 1;
    }
    return jsonBlocks;
  }

  /**
   * Internal helper: locate the `}` that balances the `{` at `openIndex`.
   * @param input - The text being scanned.
   * @param openIndex - Index of the opening `{`.
   * @param skipStrings - When true, braces inside double-quoted strings
   *   (honouring backslash escapes) are not counted.
   * @returns Index of the matching `}`, or -1 when there is none.
   */
  findMatchingBrace(input, openIndex, skipStrings) {
    let depth = 0;
    let inString = false;
    for (let i = openIndex; i < input.length; i++) {
      const ch = input[i];
      if (inString) {
        if (ch === '\\') {
          i++; // skip the escaped character, e.g. \" or \\
        } else if (ch === '"') {
          inString = false;
        }
        continue;
      }
      if (skipStrings && ch === '"') {
        inString = true;
      } else if (ch === '{') {
        depth++;
      } else if (ch === '}') {
        depth--;
        if (depth === 0) {
          return i;
        }
      }
    }
    return -1;
  }

  /**
   * Parse the JSON blocks and attempt correction if enabled.
   * Each block that parses gets its value stored on `block.parsed`.
   * @param blocks - The JSON blocks to parse.
   * @returns Array containing only the blocks that were successfully parsed.
   */
  parseJsonBlocks(blocks) {
    const parsedBlocks = [];
    for (const block of blocks) {
      try {
        block.parsed = JSON.parse(block.raw);
        parsedBlocks.push(block);
      } catch (error) {
        // Unparseable blocks are dropped unless correction is enabled and succeeds.
        if (!this.options.attemptCorrection) {
          continue;
        }
        const correctedBlock = this.attemptJsonCorrection(block, error);
        if (correctedBlock.parsed) {
          parsedBlocks.push(correctedBlock);
        }
      }
    }
    return parsedBlocks;
  }

  /**
   * Attempt to correct malformed JSON.
   * @param block - The JSON block to correct; mutated in place on success
   *   (`parsed` and `wasCorrected` are set).
   * @param error - The parsing error (currently unused).
   * @returns The same block, with `parsed` set only if correction produced parseable JSON.
   */
  attemptJsonCorrection(block, error) {
    const { corrected, wasCorrected } = corrector_1.JsonCorrector.correctJson(block.raw);
    if (wasCorrected) {
      try {
        block.parsed = JSON.parse(corrected);
        block.wasCorrected = true;
      } catch (e) {
        // Even the corrected text is not valid JSON; leave the block unparsed.
      }
    }
    return block;
  }

  /**
   * Extract text blocks from the input, excluding JSON blocks.
   * @param input - The original input string.
   * @param jsonBlocks - The blocks to exclude (`startIndex` / inclusive `endIndex`).
   * @returns Array of trimmed, non-empty text fragments in input order; the
   *   whole input when there are no blocks to exclude.
   */
  extractTextBlocks(input, jsonBlocks) {
    if (jsonBlocks.length === 0) {
      return [input];
    }
    const textBlocks = [];
    const pushIfNotBlank = (fragment) => {
      const trimmed = fragment.trim();
      if (trimmed) {
        textBlocks.push(trimmed);
      }
    };

    // Walk the blocks in document order, emitting the gap before each one.
    const sortedBlocks = [...jsonBlocks].sort((a, b) => a.startIndex - b.startIndex);
    let lastEndIndex = 0;
    for (const block of sortedBlocks) {
      if (block.startIndex > lastEndIndex) {
        pushIfNotBlank(input.substring(lastEndIndex, block.startIndex));
      }
      lastEndIndex = block.endIndex + 1;
    }

    // Trailing text after the final block.
    if (lastEndIndex < input.length) {
      pushIfNotBlank(input.substring(lastEndIndex));
    }
    return textBlocks;
  }
}
// Publish the class as this CommonJS module's `JsonExtractor` export.
exports.JsonExtractor = JsonExtractor;