full-json-extractor
Version:
Brute-forces all possible highest-level (outermost) JSON candidates, with pruning to keep performance fast for reasonably sized payloads (< 1 MB).
180 lines • 5.59 kB
JavaScript
;
// Interop helper for default imports: a module flagged with `__esModule` is
// returned untouched, anything else is wrapped so that it can be read
// through a `default` property. An already-installed `this.__importDefault`
// takes precedence when one exists.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
// Flag this CommonJS module as transpiled ES-module output and expose
// `extractJsons` as its only export.
Object.defineProperty(exports, "__esModule", { value: true });
exports.extractJsons = extractJsons;
// Local module providing the `Queue` class that findValidJsons instantiates.
const queue_1 = require("./queue");
// The default export of interval-tree-1d is called as a factory by
// findValidJsons to build the tree of already-accepted ranges.
const interval_tree_1d_1 = __importDefault(require("interval-tree-1d"));
// The two brace characters the scanner functions compare input against.
const LBRACE = "{";
const RBRACE = "}";
/**
 * Error type thrown by this module for invalid extractor arguments
 * (generateLimit throws it when given an unknown `limit` value).
 *
 * `name` is set explicitly so that logs, `String(err)` and stack traces show
 * "JsonParseError" instead of the generic "Error" inherited from the base class.
 */
class JsonParseError extends Error {
    /**
     * @param {string} [message] human-readable description of the failure
     * @param {{ cause?: unknown }} [options] forwarded unchanged to `Error`
     */
    constructor(message, options) {
        super(message, options);
        this.name = "JsonParseError";
    }
}
/**
 * Builds the string key under which a candidate position is memoized.
 *
 * @param memoPosition object carrying `left` and `right` values
 * @returns the two values joined by a dash, e.g. "0-3"
 */
function convertMemoPositionToKey(memoPosition) {
    const { left, right } = memoPosition;
    return `${left}-${right}`;
}
/**
 * Records the index of every brace character in the input, in ascending order.
 *
 * The scan is purely character based: it does not track string literals, so
 * braces that appear inside quoted text are recorded as well.
 *
 * @param input text to scan
 * @returns `prefix` holds the indices of every opening brace and `suffix`
 *          the indices of every closing brace
 */
function generateBracesPrefixAndSufix(input) {
    const prefix = [];
    const suffix = [];
    for (let i = 0; i < input.length; i++) {
        const char = input[i];
        if (char === LBRACE) {
            prefix.push(i);
        }
        else if (char === RBRACE) {
            suffix.push(i);
        }
    }
    return {
        prefix,
        suffix,
    };
}
/**
 * Reports whether the tree already holds an interval that strictly encloses
 * [low, high], i.e. one that starts before `low` and ends after `high`.
 *
 * @param tree interval tree exposing `queryInterval(low, high, visit)`
 * @param low  start of the range being tested
 * @param high end of the range being tested
 * @returns true when at least one stored interval strictly contains the range
 */
function queryIntervalSync(tree, low, high) {
    let intervalExists = false;
    tree.queryInterval(low, high, ([left, right]) => {
        if (left < low && high < right) {
            intervalExists = true;
            // interval-tree-1d stops the traversal as soon as the visitor
            // returns a truthy value; a bare `return` would keep visiting
            // every remaining intersecting interval for no benefit.
            return true;
        }
        return undefined;
    });
    return intervalExists;
}
/**
 * Cheap structural pre-check run before handing a slice to JSON.parse.
 *
 * Walks input[left..right] counting braces that sit outside double-quoted
 * text (a backslash causes the following character to be skipped). The slice
 * is rejected when a closing brace appears with nothing open, when anything
 * outside quoted text follows the point where the depth first returned to
 * zero (more than one candidate in the slice), or when braces are still open
 * at the end.
 *
 * @param input full text being analysed
 * @param left  index of the first character of the slice
 * @param right index of the last character of the slice (inclusive)
 * @param limit limit mode forwarded to generateLimit; once the scan index
 *              reaches the value it returns, the slice is accepted as-is
 * @returns true when the slice passes the pre-check, false otherwise
 */
function isBalancedWithOneJson(input, left, right, limit) {
    const scanEnd = generateLimit(input, left, limit);
    let depth = 0;
    let insideString = false;
    let skipNext = false;
    let sawDepthZero = false;
    for (let pos = left; pos <= right; pos += 1) {
        // Scan budget exhausted: give the slice the benefit of the doubt.
        if (pos >= scanEnd) {
            return true;
        }
        if (skipNext) {
            skipNext = false;
            continue;
        }
        const ch = input[pos];
        if (ch === "\\") {
            skipNext = true;
            continue;
        }
        if (ch === '"') {
            insideString = !insideString;
            continue;
        }
        if (insideString) {
            continue;
        }
        switch (ch) {
            case LBRACE:
                depth += 1;
                break;
            case RBRACE:
                depth -= 1;
                break;
            default:
                break;
        }
        if (depth < 0) {
            return false;
        }
        if (depth !== 0) {
            continue;
        }
        // Depth is zero here: fine the first time, a second hit means the
        // slice holds more than a single top-level object.
        if (sawDepthZero) {
            return false;
        }
        sawDepthZero = true;
    }
    return depth === 0;
}
/**
 * Searches every (opening brace, closing brace) pairing for slices that parse
 * as JSON, widest candidates first.
 *
 * A queue drives a breadth-first walk that starts at the first opening brace
 * paired with the last closing brace. A candidate that fails is narrowed in
 * two ways: by moving to the previous closing brace, and by moving to the next
 * opening brace. A Set of position keys keeps any pairing from being queued
 * twice, and an interval tree of accepted ranges lets candidates nested
 * strictly inside an already-parsed object be skipped.
 *
 * @param braceLocations `prefix` / `suffix` arrays of opening and closing brace indices
 * @param input full text the indices refer to
 * @param limit limit mode forwarded to isBalancedWithOneJson
 * @returns the parsed values, in the order they were accepted
 */
function findValidJsons({ prefix, suffix }, input, limit) {
    const acceptedRanges = (0, interval_tree_1d_1.default)();
    const widest = { left: 0, right: suffix.length - 1 };
    const pending = new queue_1.Queue(widest);
    const seenKeys = new Set([convertMemoPositionToKey(widest)]);
    const parsed = [];
    // Queue a candidate unless the same index pair has been queued before.
    const schedule = (candidate) => {
        const key = convertMemoPositionToKey(candidate);
        if (seenKeys.has(key)) {
            return;
        }
        pending.enqueue(candidate);
        seenKeys.add(key);
    };
    while (pending.length()) {
        const current = pending.dequeue();
        const openIdx = current.left;
        const closeIdx = current.right;
        const start = prefix[openIdx];
        const end = suffix[closeIdx];
        // Closing brace sits before the opening one: nothing to slice.
        if (end < start) {
            continue;
        }
        // Already covered by a larger object that parsed successfully.
        if (queryIntervalSync(acceptedRanges, start, end)) {
            continue;
        }
        let accepted = false;
        try {
            if (isBalancedWithOneJson(input, start, end, limit)) {
                parsed.push(JSON.parse(input.slice(start, end + 1)));
                acceptedRanges.insert([start, end]);
                accepted = true;
            }
        }
        catch (error) {
            // A slice that is not JSON is expected; anything else is a real failure.
            if (!(error instanceof SyntaxError)) {
                throw error;
            }
        }
        if (accepted) {
            continue;
        }
        // Narrow from the right first, then from the left; indices are clamped
        // so they never leave the brace arrays.
        schedule({
            left: openIdx,
            right: closeIdx >= 1 ? closeIdx - 1 : closeIdx,
        });
        schedule({
            left: openIdx + 1 < prefix.length ? openIdx + 1 : openIdx,
            right: closeIdx,
        });
    }
    return parsed;
}
/**
 * Computes the index at which the brace pre-check stops scanning.
 *
 * @param input full text being analysed
 * @param left  index where the scan begins
 * @param limit "log2" gives `left` plus ceil(log2(input.length)); "none"
 *              gives `input.length`
 * @returns the index at which scanning stops
 * @throws JsonParseError when `limit` is neither "log2" nor "none"
 */
function generateLimit(input, left, limit) {
    if (limit === "log2") {
        const budget = Math.ceil(Math.log2(input.length));
        return left + budget;
    }
    if (limit === "none") {
        return input.length;
    }
    throw new JsonParseError("unknown limit type provided");
}
/**
 * Extracts the JSON objects embedded in a piece of text.
 *
 * @param input text to search; empty or missing input yields an empty array
 * @param limit pre-check mode. With 'log2' the brace pre-check gives up after
 *              roughly log2(n) characters, which helps on malformed data with
 *              many braces. With 'none' (the default) the pre-check scans the
 *              whole candidate, which allows early rejection when the text
 *              holds many separate JSON objects.
 * @returns the parsed objects, or an empty array when the text has no opening
 *          brace or no closing brace
 */
function extractJsons(input, limit = "none") {
    const hasContent = Boolean(input?.length);
    if (!hasContent) {
        return [];
    }
    const braces = generateBracesPrefixAndSufix(input);
    const openCount = braces.prefix.length;
    const closeCount = braces.suffix.length;
    // Without at least one brace of each kind no object can exist.
    if (!openCount || !closeCount) {
        return [];
    }
    return findValidJsons(braces, input, limit);
}
//# sourceMappingURL=extractor.js.map