UNPKG

jaison

Version:

A robust, fault-tolerant JSON parser engineered specifically for handling malformed JSON output from AI systems and language models. Supports Chinese punctuation and international characters.

433 lines (379 loc) 14.4 kB
/**
 * Optimized parser for tokenized JSON-like structures.
 *
 * Consumes the flat token array produced by the tokenizer and builds plain
 * JavaScript values. The parser is deliberately tolerant: missing values
 * become `null`, stray commas are skipped, and unclosed containers are
 * returned as-is instead of raising an error.
 */
const {
  TRUE_ALIAS,
  FALSE_ALIAS,
  NULL_ALIAS,
  UNDEFINED_ALIAS,
  TOKEN_TYPES,
  CONTAINER_TYPES,
  RADIX
} = require('./constants');

/**
 * Parse tokens into JavaScript objects/values.
 *
 * Parsing stops at the first complete top-level value: either a primitive
 * seen outside any container or the closing bracket of the outermost
 * container. Tokens after that point are not inspected.
 *
 * @param {Array} tokens - Array of token objects from the tokenizer. Each
 *   token is read through `type` and `value`; number tokens may also carry
 *   `radix`, `isNegative` and `isPositive`.
 * @returns {*} Parsed JavaScript value, or `undefined` when the token list
 *   yields neither a value nor a container.
 * @throws {Error} On a token that cannot be an object key, on punctuation
 *   outside any container, on an unmatched closing bracket, or on an
 *   identifier that is not a recognized constant alias.
 */
function parse(tokens) {
  const stacks = [];
  let currentContainer = null;
  const tokenLength = tokens.length;

  for (let i = 0; i < tokenLength; i++) {
    const token = tokens[i];

    // Brackets open or close a container.
    if (token.type === TOKEN_TYPES.BRACKET) {
      const result = handleBracketToken(token, currentContainer, stacks, tokens, i, tokenLength);
      if (result.shouldReturn) {
        return result.value;
      }
      if (result.newContainer !== undefined) {
        currentContainer = result.newContainer;
      }
      if (result.skipNext) {
        i++; // Skip the separator comma that follows the closed container
      }
      continue;
    }

    // Inside an object with no pending key, the token must be a key.
    if (
      currentContainer &&
      currentContainer.type === CONTAINER_TYPES.OBJECT &&
      currentContainer._key === undefined
    ) {
      const keyResult = handleObjectKey(token);
      if (keyResult.isKey) {
        currentContainer._key = keyResult.key;
        continue;
      } else if (keyResult.shouldContinue) {
        continue;
      } else if (keyResult.shouldThrow) {
        throw new Error(`Unexpected token "${token.value}" when expecting a key in an object`);
      }
    }

    // Punctuation never produces a value; it is consumed or rejected here.
    if (token.type === TOKEN_TYPES.PUNCTUATION) {
      const punctuationResult = handlePunctuationToken(
        token,
        tokens,
        i,
        tokenLength,
        currentContainer,
        stacks
      );
      if (punctuationResult.shouldThrow) {
        throw new Error('Unexpected punctuation outside of object or array context');
      }
      if (punctuationResult.shouldContinue) {
        continue;
      }
    }

    // Everything else is a value token.
    const value = parseValueToken(token);
    const assignResult = assignValue(value, currentContainer, tokens, i, tokenLength);
    if (assignResult.shouldReturn) {
      return assignResult.value;
    }
    if (assignResult.skipNext) {
      i++; // Skip the separator comma that follows the value
    }
  }

  // Input ended before the outermost container was closed.
  return handleUnclosedContainers(stacks, currentContainer);
}

/**
 * Store `value` under `key` as an own property of `target`.
 *
 * A plain `target[key] = value` with the key "__proto__" invokes the
 * inherited accessor: it swaps the object's prototype (or drops the entry
 * for primitive values) instead of storing data. Parsed input may contain
 * that key, so it is defined explicitly as an ordinary data property, which
 * is what `JSON.parse` does as well.
 *
 * @param {Object} target - Object being populated
 * @param {string} key - Property name
 * @param {*} value - Value to store
 */
function setOwnProperty(target, key, value) {
  if (key === '__proto__') {
    Object.defineProperty(target, key, {
      value,
      writable: true,
      enumerable: true,
      configurable: true
    });
  } else {
    target[key] = value;
  }
}

/**
 * Store `value` under the container's pending key and clear that key.
 *
 * @param {Object} container - Object container record holding `_key`
 * @param {*} value - Value to store
 */
function assignPendingKey(container, value) {
  setOwnProperty(container.value, container._key, value);
  // Assigning undefined is equivalent to deleting for every `_key` check here.
  container._key = undefined;
}

/**
 * Whether `container` is an object container that is waiting for a value.
 *
 * @param {Object} container - Current container context (may be null)
 * @returns {boolean} True when a key has been read but not yet assigned
 */
function hasPendingKey(container) {
  return Boolean(
    container &&
    container.type === CONTAINER_TYPES.OBJECT &&
    container._key !== undefined
  );
}

/**
 * Link a freshly created child value into its parent container.
 *
 * An object parent receives the child under its pending key; an array parent
 * gets it appended. An object parent without a pending key, or no parent at
 * all, leaves the child unattached.
 *
 * @param {Object} parent - Current container context (may be null)
 * @param {Object|Array} childValue - The new container's value
 */
function attachToParent(parent, childValue) {
  if (!parent) {
    return;
  }
  if (hasPendingKey(parent)) {
    assignPendingKey(parent, childValue);
  } else if (parent.type === CONTAINER_TYPES.ARRAY) {
    parent.value.push(childValue);
  }
}

/**
 * Handle bracket tokens for container creation and closing.
 *
 * @param {Object} token - Current token
 * @param {Object} currentContainer - Current container context
 * @param {Array} stacks - Container stack
 * @param {Array} tokens - All tokens
 * @param {number} index - Current token index
 * @param {number} tokenLength - Total token count
 * @returns {Object} Result object with action instructions. Bracket values
 *   other than `{`, `}`, `[`, `]` yield `{ shouldReturn: false }`.
 * @throws {Error} When a closing bracket has no open container
 */
function handleBracketToken(token, currentContainer, stacks, tokens, index, tokenLength) {
  switch (token.value) {
    case '{':
      return createObjectContainer(currentContainer, stacks);
    case '}':
      return closeContainer(stacks, tokens, index, tokenLength, 'Unmatched closing brace "}"');
    case '[':
      return createArrayContainer(currentContainer, stacks);
    case ']':
      return closeContainer(stacks, tokens, index, tokenLength, 'Unmatched closing bracket "]"');
    default:
      return { shouldReturn: false };
  }
}

/**
 * Create a new object container, attach it to its parent and push it on
 * the stack.
 *
 * @param {Object} currentContainer - Current container context
 * @param {Array} stacks - Container stack
 * @returns {Object} Result object carrying the new container
 */
function createObjectContainer(currentContainer, stacks) {
  const newContainer = { type: CONTAINER_TYPES.OBJECT, value: {}, _key: undefined };
  attachToParent(currentContainer, newContainer.value);
  stacks.push(newContainer);
  return { shouldReturn: false, newContainer };
}

/**
 * Create a new array container, attach it to its parent and push it on
 * the stack.
 *
 * @param {Object} currentContainer - Current container context
 * @param {Array} stacks - Container stack
 * @returns {Object} Result object carrying the new container
 */
function createArrayContainer(currentContainer, stacks) {
  const newContainer = { type: CONTAINER_TYPES.ARRAY, value: [] };
  attachToParent(currentContainer, newContainer.value);
  stacks.push(newContainer);
  return { shouldReturn: false, newContainer };
}

/**
 * Close the innermost open container.
 *
 * The closing bracket is not checked against the container type, so `]` can
 * close an object and `}` can close an array.
 *
 * @param {Array} stacks - Container stack
 * @param {Array} tokens - All tokens
 * @param {number} index - Current token index
 * @param {number} tokenLength - Total token count
 * @param {string} errorMessage - Error message if unmatched
 * @returns {Object} Result object: the finished top-level value, or the
 *   parent container to resume plus whether to skip a following comma
 * @throws {Error} With `errorMessage` when no container is open
 */
function closeContainer(stacks, tokens, index, tokenLength, errorMessage) {
  if (stacks.length === 0) {
    throw new Error(errorMessage);
  }

  const completedContainer = stacks.pop();
  if (stacks.length === 0) {
    // The outermost container just closed: parsing is complete.
    return { shouldReturn: true, value: completedContainer.value };
  }

  const skipNext = index + 1 < tokenLength && tokens[index + 1].value === ',';
  return { shouldReturn: false, newContainer: stacks[stacks.length - 1], skipNext };
}

/**
 * Interpret a token that appears where an object key is expected.
 *
 * @param {Object} token - Current token
 * @returns {Object} `{ isKey, key }` for string, number and identifier
 *   tokens; `{ shouldContinue }` for a comma; `{ shouldThrow }` otherwise
 */
function handleObjectKey(token) {
  switch (token.type) {
    case TOKEN_TYPES.STRING:
      return { isKey: true, key: parseStringValue(token) };

    case TOKEN_TYPES.NUMBER:
    case TOKEN_TYPES.IDENTIFIER:
      // Number keys keep their explicit sign as part of the key text.
      if (token.type === TOKEN_TYPES.NUMBER && token.isNegative) {
        return { isKey: true, key: '-' + token.value };
      }
      if (token.type === TOKEN_TYPES.NUMBER && token.isPositive) {
        return { isKey: true, key: '+' + token.value };
      }
      return { isKey: true, key: token.value };

    case TOKEN_TYPES.PUNCTUATION:
      // Consecutive commas inside an object are skipped.
      if (token.value === ',') {
        return { shouldContinue: true };
      }
      return { shouldThrow: true };

    default:
      return { shouldThrow: true };
  }
}

/**
 * Handle punctuation tokens.
 *
 * A comma inside an array stands for a missing element and appends `null`.
 * A colon followed by `,`, `}` or the end of input stands for a missing
 * value and assigns `null` to the pending key. Any other punctuation is
 * consumed without effect.
 *
 * @param {Object} token - Current token
 * @param {Array} tokens - All tokens
 * @param {number} index - Current token index
 * @param {number} tokenLength - Total token count
 * @param {Object} currentContainer - Current container context
 * @param {Array} stacks - Container stack
 * @returns {Object} `{ shouldThrow }` outside any container, otherwise
 *   `{ shouldContinue }`
 */
function handlePunctuationToken(token, tokens, index, tokenLength, currentContainer, stacks) {
  if (stacks.length === 0) {
    return { shouldThrow: true };
  }

  if (token.value === ',') {
    if (currentContainer && currentContainer.type === CONTAINER_TYPES.ARRAY) {
      currentContainer.value.push(null);
    }
    return { shouldContinue: true };
  }

  if (token.value === ':') {
    const hasNext = index + 1 < tokenLength;
    const valueMissing =
      !hasNext || tokens[index + 1].value === ',' || tokens[index + 1].value === '}';
    if (valueMissing && hasPendingKey(currentContainer)) {
      assignPendingKey(currentContainer, null);
    }
  }

  return { shouldContinue: true };
}

/**
 * Parse value from token.
 *
 * @param {Object} token - Token to parse
 * @returns {*} Parsed value; `null` for token types other than identifier,
 *   string and number
 */
function parseValueToken(token) {
  switch (token.type) {
    case TOKEN_TYPES.IDENTIFIER:
      return parseIdentifierValue(token);
    case TOKEN_TYPES.STRING:
      return parseStringValue(token);
    case TOKEN_TYPES.NUMBER:
      return parseNumberValue(token);
    default:
      return null;
  }
}

/**
 * Parse identifier value with alias support. Matching is case-insensitive.
 *
 * @param {Object} token - Identifier token
 * @returns {boolean|null|undefined} The constant the identifier stands for
 * @throws {Error} When the identifier is not in any alias set
 */
function parseIdentifierValue(token) {
  const tokenValue = token.value;
  const tokenValueLower = tokenValue.toLowerCase();

  if (TRUE_ALIAS.has(tokenValueLower)) {
    return true;
  }
  if (FALSE_ALIAS.has(tokenValueLower)) {
    return false;
  }
  if (NULL_ALIAS.has(tokenValueLower)) {
    return null;
  }
  if (UNDEFINED_ALIAS.has(tokenValueLower)) {
    return undefined;
  }
  throw new Error(`Unexpected identifier "${tokenValue}" in value position. Only recognized constants (true, false, null, undefined, etc.) are allowed.`);
}

/**
 * Parse string value with control character escaping.
 *
 * The token text is handed to `JSON.parse`, which rejects raw control
 * characters inside strings, so characters in the range 0x00-0x1F are first
 * rewritten as `\uXXXX` escapes.
 *
 * @param {Object} token - String token
 * @returns {string} Parsed string value
 * @throws {SyntaxError} When `JSON.parse` rejects the escaped text
 */
function parseStringValue(token) {
  const escaped = token.value.replace(/[\x00-\x1F]/g, function (match) {
    const code = match.charCodeAt(0);
    return '\\u' + ('000' + code.toString(16)).slice(-4);
  });
  return JSON.parse(escaped);
}

/**
 * Parse number value with radix support.
 *
 * @param {Object} token - Number token
 * @returns {number} Parsed number value, negated when `token.isNegative`
 */
function parseNumberValue(token) {
  const numValue = token.value;
  let value;

  if (token.radix === RADIX.HEXADECIMAL) {
    // parseInt with radix 16 accepts an optional 0x prefix itself.
    value = parseInt(numValue, 16);
  } else if (token.radix === RADIX.OCTAL) {
    // Drop the two-character prefix before parsing.
    value = parseInt(numValue.slice(2), 8);
  } else if (token.radix === RADIX.BINARY) {
    // Drop the two-character prefix before parsing.
    value = parseInt(numValue.slice(2), 2);
  } else {
    // Decimal, including scientific notation.
    value = parseFloat(numValue);
  }

  // An explicit positive sign needs no adjustment; only negation is applied.
  if (token.isNegative) {
    value = -value;
  }
  return value;
}

/**
 * Assign parsed value to container or return as top-level value.
 *
 * @param {*} value - Parsed value
 * @param {Object} currentContainer - Current container context
 * @param {Array} tokens - All tokens
 * @param {number} index - Current token index
 * @param {number} tokenLength - Total token count
 * @returns {Object} `{ shouldReturn: true, value }` when there is no open
 *   container, otherwise `{ shouldReturn: false, skipNext }` where
 *   `skipNext` reports whether the next token is a comma
 */
function assignValue(value, currentContainer, tokens, index, tokenLength) {
  if (!currentContainer) {
    return { shouldReturn: true, value };
  }

  if (currentContainer.type === CONTAINER_TYPES.OBJECT) {
    assignPendingKey(currentContainer, value);
  } else {
    currentContainer.value.push(value);
  }

  const skipNext = index + 1 < tokenLength && tokens[index + 1].value === ',';
  return { shouldReturn: false, skipNext };
}

/**
 * Handle unclosed containers at end of parsing.
 *
 * @param {Array} stacks - Container stack
 * @param {Object} currentContainer - Current container context
 * @returns {*} Value of the outermost open container, or `undefined` when
 *   no container is open
 */
function handleUnclosedContainers(stacks, currentContainer) {
  if (stacks.length === 0) {
    return undefined;
  }

  // A key that never received a value is completed with null.
  if (hasPendingKey(currentContainer)) {
    assignPendingKey(currentContainer, null);
  }
  return stacks[0].value;
}

module.exports = { parse };