/*
 * json-repair-js
 * Version: (not specified)
 * JavaScript library to repair broken/invalid JSON strings, especially from LLM outputs.
 * Original file listing: 261 lines (206 loc), 7.43 kB, JavaScript.
 */
const { JsonContext, ContextValues } = require('./JsonContext');
// Characters accepted as string quotes: double quote and single quote.
const STRING_DELIMITERS = ['"', "'"];
// Character codes that JsonParser.skipWhitespace() steps over.
const WHITESPACE = new Set([0x20, 0x09, 0x0A, 0x0D]); // space, tab, newline, return
// Character codes of the two quote characters.
// NOTE(review): not referenced by the code visible in this file — confirm before removing.
const QUOTES = new Set([0x22, 0x27]); // " and '
/**
 * Lenient recursive-descent parser that locates the first JSON object or array
 * inside arbitrary text (for example an LLM answer wrapped in markdown) and
 * repairs common defects while reading it: missing colons, missing or stray
 * commas, unquoted keys/values, single-quoted strings, missing values and
 * unterminated containers or strings.
 *
 * Parsing state lives on the instance (`index`, `context`, `logger`), so an
 * instance is meant to parse one string once.
 */
class JsonParser {
    /**
     * @param {string} [jsonStr=""] - Text that contains the (possibly broken) JSON.
     * @param {boolean} [logging=false] - When true, repair notes are collected
     *   and `parse()` returns them next to the result.
     */
    constructor(jsonStr = "", logging = false) {
        this.jsonStr = jsonStr;
        this.index = 0;
        this.context = new JsonContext();
        this.logging = logging;
        this.logger = [];
    }

    /**
     * Records a repair note together with up to 10 characters of input on each
     * side of the current position. Does nothing unless logging is enabled.
     *
     * @param {string} text - Human-readable description of the repair.
     */
    log(text) {
        if (!this.logging) return;
        const window = 10;
        const start = Math.max(this.index - window, 0);
        const end = Math.min(this.index + window, this.jsonStr.length);
        const context = this.jsonStr.slice(start, end);
        this.logger.push({ text, context });
    }

    /**
     * Skips everything up to the first `{` or `[` (markdown ``` fences are
     * stepped over as a unit) and parses the value that starts there.
     *
     * @returns {*} `""` when the input contains no `{` or `[` (also when
     *   logging is on); otherwise the parsed value, or `[value, logEntries]`
     *   when logging is enabled.
     */
    parse() {
        let foundJson = false;
        while (this.index < this.jsonStr.length) {
            const char = this.peek();

            // Handle code blocks in markdown/text
            if (char === "`" && this.jsonStr.startsWith("```", this.index)) {
                this.index += 3;
                continue;
            }
            // Look for JSON start
            if (char === "{" || char === "[") {
                foundJson = true;
                break;
            }
            this.index++;
        }
        if (!foundJson) {
            return "";
        }
        const result = this.parseValue();
        return this.logging ? [result, this.logger] : result;
    }

    /**
     * Parses whatever value starts at the current position.
     *
     * A closing `}` or `]` is NOT consumed: it means the value is missing, and
     * the bracket must stay available for the container it closes. Any other
     * unexpected character is skipped.
     *
     * @returns {*} The parsed value, or `""` when no value could be read.
     */
    parseValue() {
        this.skipWhitespace();
        const char = this.peek();
        if (!char) return "";

        if (char === "{") return this.parseObject();
        if (char === "[") return this.parseArray();
        if (STRING_DELIMITERS.includes(char)) return this.parseString();
        if (/[-0-9]/.test(char)) return this.parseNumber();
        if (/[a-zA-Z]/.test(char)) {
            // Bare words are either JSON literals or unquoted strings.
            return this.coerceLiteral(this.parseUnquotedString());
        }
        if (char === "}" || char === "]") {
            this.log("Missing value, using empty string");
            return "";
        }
        this.index++;
        return "";
    }

    /**
     * Parses an object whose `{` is at the current position.
     *
     * @returns {Object} The parsed object; keys are always created as own
     *   properties.
     */
    parseObject() {
        const obj = {};
        this.index++; // skip {
        while (this.index < this.jsonStr.length) {
            this.skipWhitespace();
            const next = this.peek();

            if (next === "}") {
                this.index++;
                break;
            }
            if (next === ",") {
                // e.g. {"a": 1,, "b": 2}
                this.log("Stray comma in object, skipping it");
                this.index++;
                continue;
            }

            // Parse key
            this.context.set(ContextValues.OBJECT_KEY);
            const keyIsQuoted = STRING_DELIMITERS.includes(next);
            const key = this.parseString();
            if (key === "" && !keyIsQuoted) {
                // Nothing that can serve as a key: give up on this object, but
                // do not leave the key context behind.
                this.context.reset();
                break;
            }
            this.skipWhitespace();

            // Handle missing colon
            if (this.peek() !== ":") {
                this.log("Missing colon after key, adding it");
            } else {
                this.index++; // skip :
            }
            this.skipWhitespace();

            // Parse value
            this.context.reset();
            this.context.set(ContextValues.OBJECT_VALUE);
            const value = this.parseValue();
            this.context.reset();

            // defineProperty instead of obj[key] = value: a "__proto__" key in
            // untrusted input must become a plain property, not replace the
            // prototype of the result.
            Object.defineProperty(obj, key, {
                value,
                writable: true,
                enumerable: true,
                configurable: true,
            });

            this.skipWhitespace();
            // Handle comma
            if (this.peek() === ",") {
                this.index++;
            }
        }
        return obj;
    }

    /**
     * Parses an array whose `[` is at the current position.
     *
     * @returns {Array} The parsed elements.
     */
    parseArray() {
        const arr = [];
        this.index++; // skip [
        this.context.set(ContextValues.ARRAY);
        while (this.index < this.jsonStr.length) {
            this.skipWhitespace();
            const next = this.peek();

            if (next === undefined) {
                // Only whitespace was left: truncated array, nothing to add.
                break;
            }
            if (next === "]") {
                this.index++;
                break;
            }
            if (next === "}") {
                // The bracket belongs to an enclosing object; leave it there.
                this.log("Mismatched '}' inside array, closing the array");
                break;
            }
            arr.push(this.parseValue());
            this.skipWhitespace();
            // Handle comma
            if (this.peek() === ",") {
                this.index++;
            }
        }
        this.context.reset();
        return arr;
    }

    /**
     * Parses a quoted string, or an unquoted run of words ending at `,`, `}`,
     * `]` or `:`.
     *
     * Quoted content is returned verbatim with escape sequences decoded.
     * Unquoted content is trimmed; when the current context is an object
     * value it is additionally converted to a number or to
     * `true`/`false`/`null` if it spells one.
     *
     * @returns {*} The string, or the converted value as described above.
     */
    parseString() {
        // Skip leading whitespace before looking for a quote, so that a
        // quoted string preceded by blanks is still recognised as quoted.
        let char = this.peek();
        while (char && /\s/.test(char)) {
            this.index++;
            char = this.peek();
        }
        if (STRING_DELIMITERS.includes(char)) {
            return this.readQuotedString(char);
        }

        // For unquoted strings, collect until delimiter
        let stringAcc = "";
        while (this.index < this.jsonStr.length) {
            char = this.peek();
            if ([",", "}", "]", ":"].includes(char)) {
                break;
            }
            if (/\s/.test(char)) {
                // Whitespace between words becomes a single space per
                // character, unless a delimiter follows directly.
                const nextChar = this.jsonStr[this.index + 1];
                if (stringAcc && nextChar !== undefined && !/[,}\]:]/.test(nextChar)) {
                    stringAcc += " ";
                }
            } else {
                stringAcc += char;
            }
            this.index++;
        }

        const trimmed = stringAcc.trim();
        // Convert value types for object values (an empty token stays "",
        // Number("") would otherwise turn it into 0).
        if (trimmed !== "" && this.context.current === ContextValues.OBJECT_VALUE) {
            const num = Number(trimmed);
            if (!Number.isNaN(num)) return num;
            return this.coerceLiteral(trimmed);
        }
        return trimmed;
    }

    /**
     * Reads a quoted string whose opening quote is at the current position.
     *
     * @param {string} quote - The quote character that opened the string.
     * @returns {string} The decoded content. If the closing quote is missing
     *   the rest of the input is used and surrounding whitespace is trimmed.
     */
    readQuotedString(quote) {
        let acc = "";
        this.index++; // skip opening quote
        while (this.index < this.jsonStr.length) {
            const char = this.peek();
            if (char === "\\") {
                acc += this.readEscapeSequence();
                continue;
            }
            this.index++;
            if (char === quote) {
                return acc;
            }
            acc += char;
        }
        this.log("Missing closing quote, adding it");
        return acc.trim();
    }

    /**
     * Decodes the escape sequence that starts with the backslash at the
     * current position and advances past it.
     *
     * @returns {string} The decoded character(s). Unknown or malformed
     *   escapes are kept verbatim, backslash included.
     */
    readEscapeSequence() {
        const escaped = this.jsonStr[this.index + 1];
        if (escaped === undefined) {
            // Lone backslash at the very end of the input.
            this.index++;
            return "\\";
        }
        if (escaped === "u") {
            const hex = this.jsonStr.slice(this.index + 2, this.index + 6);
            if (/^[0-9a-fA-F]{4}$/.test(hex)) {
                this.index += 6;
                return String.fromCharCode(Number.parseInt(hex, 16));
            }
        }
        this.index += 2;
        switch (escaped) {
            case "n": return "\n";
            case "t": return "\t";
            case "r": return "\r";
            case "b": return "\b";
            case "f": return "\f";
            case '"':
            case "'":
            case "\\":
            case "/":
                return escaped;
            default:
                return "\\" + escaped;
        }
    }

    /**
     * Maps the words `true`, `false` and `null` (any letter case) to the
     * corresponding JSON literal.
     *
     * @param {string} word - Bare token read from the input.
     * @returns {boolean|null|string} The literal, or `word` unchanged.
     */
    coerceLiteral(word) {
        switch (word.toLowerCase()) {
            case "true": return true;
            case "false": return false;
            case "null": return null;
            default: return word;
        }
    }

    /**
     * Reads a run of number characters (`-`, digits, `.`, `e`, `E`, and `+`
     * directly after an exponent marker).
     *
     * @returns {number|string} The number, or the raw text when it is not a
     *   valid number (for example `2024-01-15`).
     */
    parseNumber() {
        let numStr = "";

        while (this.index < this.jsonStr.length) {
            const char = this.peek();
            const isExponentSign = char === "+" && /[eE]$/.test(numStr);
            if (!isExponentSign && !/[-0-9.eE]/.test(char)) break;
            numStr += char;
            this.index++;
        }
        const num = Number(numStr);
        return Number.isNaN(num) ? numStr : num;
    }

    /**
     * Reads a single bare word: everything up to whitespace or one of
     * `,`, `}`, `]`, `:`.
     *
     * @returns {string} The word, possibly empty.
     */
    parseUnquotedString() {
        let str = "";

        while (this.index < this.jsonStr.length) {
            const char = this.peek();
            if ([",", "}", "]", ":"].includes(char) || /\s/.test(char)) break;
            str += char;
            this.index++;
        }
        return str;
    }

    /**
     * Advances past spaces, tabs, newlines and carriage returns.
     */
    skipWhitespace() {
        while (this.index < this.jsonStr.length) {
            const code = this.jsonStr.charCodeAt(this.index);
            if (!WHITESPACE.has(code)) break;
            this.index++;
        }
    }

    /**
     * @returns {string|undefined} The character at the current position, or
     *   `undefined` at end of input.
     */
    peek() {
        return this.jsonStr[this.index];
    }
}
// CommonJS export: the JsonParser class itself is the module's export value.
module.exports = JsonParser;