jaison
A robust, fault-tolerant JSON parser engineered specifically for handling malformed JSON output from AI systems and language models. Supports Chinese punctuation and international characters.
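A minimal usage sketch for the tokenizer source shown below (the file name tokenizer.js is an assumption for illustration; the TOKEN_TYPES and RADIX constants come from ./constants and their exact values are not shown here):

const { tokenize } = require('./tokenizer'); // path assumed for this sketch
// Typical LLM-style output: single quotes, a full-width '，', a comment and a hex literal
const tokens = tokenize("{'name': 'Ada'， /* note */ id: 0x1F}");
// tokens is an array of plain objects such as
//   { type: TOKEN_TYPES.STRING, value: '"name"' } and
//   { type: TOKEN_TYPES.NUMBER, value: '0x1F', radix: RADIX.HEXADECIMAL, ... }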
JavaScript
/**
* Optimized tokenizer for JSON-like strings
* Replaces complex regex with manual character-by-character parsing
*/
const { TOKEN_TYPES, RADIX } = require('./constants');
/**
* Tokenize JSON string using optimized manual parsing
* @param {string} jsonString - Input string to tokenize
* @returns {Array} Array of token objects
*/
function tokenize(jsonString) {
  const tokens = [];
  const len = jsonString.length;
  let pos = 0;
  while (pos < len) {
    const char = jsonString[pos];
    // Skip whitespace - optimized check
    if (char <= ' ') {
      pos++;
      continue;
    }
    // Skip comments (both single-line and multi-line)
    if (char === '/') {
      const commentResult = skipComment(jsonString, pos);
      if (commentResult.isComment) {
        pos = commentResult.newPos;
        continue;
      }
    }
    // String tokens (both double and single quotes)
    if (char === '"' || char === "'") {
      const stringToken = parseStringToken(jsonString, pos);
      tokens.push(stringToken.token);
      pos = stringToken.newPos;
      continue;
    }
    // Bracket tokens
    if (char === '{' || char === '}' || char === '[' || char === ']') {
      tokens.push({
        type: TOKEN_TYPES.BRACKET,
        value: char
      });
      pos++;
      continue;
    }
    // Punctuation tokens (including Chinese full-width punctuation)
    if (char === ',' || char === ':' || char === '：' || char === '，') {
      const punctuationToken = parsePunctuationToken(char);
      tokens.push(punctuationToken);
      pos++;
      continue;
    }
    // Number tokens (with proper radix and negative sign tracking)
    // Support: -123, +123, .123, 123
    if ((char >= '0' && char <= '9') || char === '-' || char === '+' || char === '.') {
      const numberToken = parseNumberToken(jsonString, pos);
      tokens.push(numberToken.token);
      pos = numberToken.newPos;
      continue;
    }
    // Identifier tokens
    const identifierToken = parseIdentifierToken(jsonString, pos);
    if (identifierToken.token) {
      tokens.push(identifierToken.token);
    }
    pos = identifierToken.newPos;
  }
  return tokens;
}
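// Illustrative example (not part of the original source): with the TOKEN_TYPES and
// RADIX values assumed to come from ./constants, a call like
//
//   tokenize("{'name': \"Bob\"，count: -12}")
//
// produces, in order: a '{' bracket token, a string token with value '"name"'
// (single quotes normalized), a ':' punctuation token, a string token '"Bob"',
// a ',' punctuation token (normalized from the full-width '，'), an identifier
// token 'count', a ':' punctuation token, a number token with value '12',
// radix RADIX.DECIMAL and isNegative: true, and a '}' bracket token.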
/**
* Parse string token with proper escape handling for both single and double quotes
* @param {string} jsonString - Input string
* @param {number} startPos - Starting position
* @returns {Object} Token and new position
*/
function parseStringToken(jsonString, startPos) {
  const len = jsonString.length;
  const quoteChar = jsonString[startPos]; // Can be " or '
  let pos = startPos + 1; // Skip opening quote
  while (pos < len) {
    const c = jsonString[pos];
    if (c === quoteChar) {
      pos++; // Include closing quote
      break;
    } else if (c === '\\') {
      pos += 2; // Skip escape sequence
    } else {
      pos++;
    }
  }
  let tokenValue = jsonString.slice(startPos, pos);
  // Handle missing closing quote (same as original logic)
  if (!tokenValue.endsWith(quoteChar)) {
    tokenValue += quoteChar;
  }
  // Normalize single quotes to double quotes for JSON compatibility
  if (quoteChar === "'") {
    // Properly handle single quote strings by escaping internal double quotes
    // and converting outer quotes
    const innerContent = tokenValue.slice(1, -1); // Remove outer quotes
    // Manually scan and escape unescaped double quotes
    let escapedContent = '';
    for (let i = 0; i < innerContent.length; i++) {
      const char = innerContent[i];
      if (char === '"') {
        // Check if this quote is already escaped
        if (i === 0 || innerContent[i - 1] !== '\\') {
          escapedContent += '\\"'; // Escape unescaped double quote
        } else {
          escapedContent += char; // Keep already escaped quote
        }
      } else {
        escapedContent += char;
      }
    }
    tokenValue = '"' + escapedContent + '"';
  }
  return {
    token: {
      type: TOKEN_TYPES.STRING,
      value: tokenValue
    },
    newPos: pos
  };
}
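// Illustrative example (not part of the original source): for the ten-character
// input "'say \"hi\"'" (a single-quoted literal containing double quotes),
// parseStringToken(input, 0) consumes through the closing quote and returns
// newPos 10; token.value is the original content re-wrapped in double quotes
// with each inner double quote backslash-escaped, i.e. a literal the standard
// JSON grammar will accept.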
/**
* Parse punctuation token with Chinese character normalization
* @param {string} char - Punctuation character
* @returns {Object} Token object
*/
function parsePunctuationToken(char) {
  let normalizedPunctuation = char;
  if (char === '：') {
    normalizedPunctuation = ':';
  } else if (char === '，') {
    normalizedPunctuation = ',';
  }
  return {
    type: TOKEN_TYPES.PUNCTUATION,
    value: normalizedPunctuation
  };
}
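// Illustrative example (not part of the original source):
//   parsePunctuationToken('：') -> { type: TOKEN_TYPES.PUNCTUATION, value: ':' }
//   parsePunctuationToken(',')  -> { type: TOKEN_TYPES.PUNCTUATION, value: ',' }
// ASCII ':' and ',' pass through unchanged; full-width '：' and '，' are normalized.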
/**
* Parse number token with radix detection and negative sign tracking
* @param {string} jsonString - Input string
* @param {number} startPos - Starting position
* @returns {Object} Token and new position
*/
function parseNumberToken(jsonString, startPos) {
  const len = jsonString.length;
  let pos = startPos;
  let isNegative = false;
  let isPositive = false;
  let radix = RADIX.DECIMAL;
  // Handle sign
  if (jsonString[pos] === '-') {
    isNegative = true;
    pos++;
  } else if (jsonString[pos] === '+') {
    isPositive = true;
    pos++;
  }
  // Check if this is a valid number start after sign or dot
  if (pos >= len) {
    // Just a sign, treat as identifier
    return parseIdentifierToken(jsonString, startPos);
  }
  const firstChar = jsonString[pos];
  // Handle numbers starting with dot (.123)
  if (jsonString[startPos] === '.' || (pos > startPos && firstChar === '.')) {
    // Must be followed by digits
    if (firstChar === '.' && pos + 1 < len && jsonString[pos + 1] >= '0' && jsonString[pos + 1] <= '9') {
      // Valid decimal like .123
      pos++; // skip the dot
      while (pos < len && /[\d.eE+-]/.test(jsonString[pos])) pos++;
    } else {
      // Invalid, treat as identifier
      return parseIdentifierToken(jsonString, startPos);
    }
  } else if (firstChar >= '0' && firstChar <= '9') {
    // Determine number format and radix
    if (firstChar === '0' && pos < len - 1) {
      const nextChar = jsonString[pos + 1];
      if (nextChar === 'x' || nextChar === 'X') {
        // Hexadecimal
        radix = RADIX.HEXADECIMAL;
        pos += 2;
        while (pos < len && /[0-9a-fA-F]/.test(jsonString[pos])) pos++;
      } else if (nextChar === 'o' || nextChar === 'O') {
        // Octal
        radix = RADIX.OCTAL;
        pos += 2;
        while (pos < len && /[0-7]/.test(jsonString[pos])) pos++;
      } else if (nextChar === 'b' || nextChar === 'B') {
        // Binary
        radix = RADIX.BINARY;
        pos += 2;
        while (pos < len && /[01]/.test(jsonString[pos])) pos++;
      } else {
        // Regular decimal starting with 0
        pos++;
        while (pos < len && /[\d.eE+-]/.test(jsonString[pos])) pos++;
      }
    } else {
      // Regular decimal number
      while (pos < len && /[\d.eE+-]/.test(jsonString[pos])) pos++;
    }
  } else {
    // Not a valid number, treat as identifier
    return parseIdentifierToken(jsonString, startPos);
  }
  const numberValue = jsonString.slice(isNegative || isPositive ? startPos + 1 : startPos, pos);
  return {
    token: {
      type: TOKEN_TYPES.NUMBER,
      value: numberValue,
      radix: radix,
      isNegative: isNegative,
      isPositive: isPositive
    },
    newPos: pos
  };
}
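// Illustrative examples (not part of the original source), assuming the RADIX
// constants in ./constants name the usual bases:
//   parseNumberToken('-0xFF', 0) -> token { value: '0xFF', radix: RADIX.HEXADECIMAL,
//                                           isNegative: true, isPositive: false }, newPos 5
//   parseNumberToken('.5,', 0)   -> token { value: '.5', radix: RADIX.DECIMAL, ... }, newPos 2
// Note that the sign is stripped from value and reported via isNegative / isPositive.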
/**
* Parse identifier token
* @param {string} jsonString - Input string
* @param {number} startPos - Starting position
* @returns {Object} Token and new position
*/
function parseIdentifierToken(jsonString, startPos) {
  const len = jsonString.length;
  let pos = startPos;
  while (pos < len && !/["{}[\],:：，\s]/.test(jsonString[pos])) {
    pos++;
  }
  if (pos > startPos) {
    return {
      token: {
        type: TOKEN_TYPES.IDENTIFIER,
        value: jsonString.slice(startPos, pos)
      },
      newPos: pos
    };
  } else {
    return {
      token: null,
      newPos: pos + 1 // Skip unrecognized character
    };
  }
}
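// Illustrative example (not part of the original source):
//   parseIdentifierToken('true,1', 0) -> { token: { type: TOKEN_TYPES.IDENTIFIER,
//                                           value: 'true' }, newPos: 4 }
// Scanning stops at quotes, brackets, (full-width) punctuation and whitespace,
// so bare words such as true, null, NaN or unquoted keys become identifier tokens.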
/**
* Skip comment tokens (both single-line and multi-line)
* @param {string} jsonString - Input string
* @param {number} startPos - Starting position (should be '/')
* @returns {Object} isComment flag and new position
*/
function skipComment(jsonString, startPos) {
  const len = jsonString.length;
  if (startPos + 1 >= len) {
    return { isComment: false, newPos: startPos + 1 };
  }
  const nextChar = jsonString[startPos + 1];
  // Single-line comment: //
  if (nextChar === '/') {
    let pos = startPos + 2;
    // Skip until end of line or end of string
    while (pos < len && jsonString[pos] !== '\n' && jsonString[pos] !== '\r') {
      pos++;
    }
    // Skip the newline character if present
    if (pos < len && (jsonString[pos] === '\n' || jsonString[pos] === '\r')) {
      pos++;
      // Handle Windows-style \r\n
      if (pos < len && jsonString[pos - 1] === '\r' && jsonString[pos] === '\n') {
        pos++;
      }
    }
    return { isComment: true, newPos: pos };
  }
  // Multi-line comment: /* ... */
  if (nextChar === '*') {
    let pos = startPos + 2;
    // Skip until */ or end of string
    while (pos < len - 1) {
      if (jsonString[pos] === '*' && jsonString[pos + 1] === '/') {
        pos += 2; // Skip the closing */
        break;
      }
      pos++;
    }
    // If we reached the end without finding the closing */, treat as an unterminated
    // comment. When a closing */ was found, pos has already advanced past it, so the
    // check looks at pos - 2 and pos - 1.
    if (pos >= len - 1 && !(jsonString[pos - 2] === '*' && jsonString[pos - 1] === '/')) {
      pos = len; // Skip to end of string
    }
    return { isComment: true, newPos: pos };
  }
  // Not a comment, just a forward slash
  return { isComment: false, newPos: startPos };
}
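// Illustrative examples (not part of the original source):
//   skipComment('// note\n{}', 0) -> { isComment: true, newPos: 8 }  (past the newline)
//   skipComment('/* x */ 1', 0)   -> { isComment: true, newPos: 7 }  (just past the */)
//   skipComment('/a', 0)          -> { isComment: false, newPos: 0 } (a lone slash)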
module.exports = {
  tokenize
};