UNPKG

qb-json-next

Version:

fast (~300 MB/sec) and simple incremental JSON parsing that can start and stop at any byte offset

446 lines (402 loc) 17.8 kB
// Software License Agreement (ISC License)
//
// Copyright (c) 2023, Matthew Voss
//
// Permission to use, copy, modify, and/or distribute this software for
// any purpose with or without fee is hereby granted, provided that the
// above copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

// @ts-check

// possible values for ps.pos(ition).
// Note that LSB (0x7F) are reserved for token ascii value.
const POS = {
  A_BF: 0x080,   // in array, before first value
  A_BV: 0x100,   // in array, before value
  A_AV: 0x180,   // in array, after value
  O_BF: 0x200,   // in object, before first key
  O_BK: 0x280,   // in object, before key
  O_AK: 0x300,   // in object, after key
  O_BV: 0x380,   // in object, before value
  O_AV: 0x400,   // in object, after value
}

// Possible values for ps.tok(en).  All but string and decimal are represented
// by the first ascii byte encountered
const TOK = {
  ARR: 91,        // [    array start
  ARR_END: 93,    // ]    array end
  DEC: 100,       // d    a decimal value starting with: -, 0, 1, ..., 9
  FAL: 102,       // f    false
  // INT: 105     // i    integer, reserved token
  NUL: 110,       // n    null
  STR: 115,       // s    a string value starting with "
  TRU: 116,       // t    true
  // UNT: 117,    // u    unsigned integer, reserved token
  // BYT: 120     // x    byte, reserved token
  OBJ: 123,       // {    object start
  OBJ_END: 125,   // }    object end
}

// For an unexpected or illegal value, or if src limit is reached before a value is complete, ps.tok will be zero
// and ps.ecode will be one of the following
const ECODE = {
  BAD_VALUE: 66,    // 'B'  encountered invalid byte or series of bytes
  TRUNC_DEC: 68,    // 'D'  end of buffer was a decimal ending with a digit (0-9). it is *possibly* unfinished
  KEY_NO_VAL: 75,   // 'K'  object key complete, but value did not start
  TRUNCATED: 84,    // 'T'  key or value was unfinished at end of buffer
  UNEXPECTED: 85,   // 'U'  encountered a recognized token in wrong place/context
}

// ASCII flags
const NON_TOKEN = 1         // '\b\f\n\t\r ,:',
const DELIM = 2             // '\b\f\n\t\r ,:{}[]',
const DECIMAL_END = 4       // '0123456789',
const DECIMAL_ASCII = 8     // '-0123456789+.eE',
const NO_LEN_TOKENS = 16    // 'tfn[]{}()',

//       0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
//    -----------------------------------------------------------------------------------
// 0  |  NUL  SOH  STX  ETX  EOT  ENQ  ACK  BEL  BS   TAB  LF   VT   FF   CR   SO   SI  |  // 0
// 1  |  DLE  DC1  DC2  DC3  DC4  NAK  SYN  ETB  CAN  EM   SUB  ESC  FS   GS   RS   US  |  // 1
// 2  |  SPC  !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /   |  // 2
// 3  |  0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?   |  // 3
// 4  |  @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O   |  // 4
// 5  |  P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _   |  // 5
// 6  |  `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o   |  // 6
// 7  |  p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~        |  // 7
//    -----------------------------------------------------------------------------------

// CMAP was lovingly crafted by https://github.com/quicbit-js/qb-json-next/blob/master/export/generate-maps.js
// Each entry is the OR of the ASCII flags above that apply to that byte value.
const CMAP = [
//0     1     2     3     4     5     6     7     8     9     A     B     C     D     E     F
  0,    0,    0,    0,    0,    0,    0,    0,    0x03, 0x03, 0x03, 0,    0x03, 0x03, 0,    0,    // 0
  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    // 1
  0x03, 0,    0,    0,    0,    0,    0,    0,    0x10, 0x10, 0,    0x08, 0x03, 0x08, 0x08, 0,    // 2
  0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x0C, 0x03, 0,    0,    0,    0,    0,    // 3
  0,    0,    0,    0,    0,    0x08, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    // 4
  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0x12, 0,    0x12, 0,    0,    // 5
  0,    0,    0,    0,    0,    0x08, 0x10, 0,    0,    0,    0,    0,    0,    0,    0x10, 0,    // 6
  0,    0,    0,    0,    0x10, 0,    0,    0,    0,    0,    0,    0x12, 0,    0x12, 0,    0,    // 7
  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    // 8
  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    // 9
  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    // A
  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    // B
  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    // C
  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    // D
  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    // E
  0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    // F
]

// convert {first-ascii-char: remaining-ascii-string} to {first-ascii-byte: remaining-ascii-bytes}
function ascii_to_bytes (strings) {
  return Object.keys(strings).reduce(
    /** @type {function(number[][], string): number[][]} */
    function (a, c) {
      a[c.charCodeAt(0)] = strings[c].split('').map(function (c) { return c.charCodeAt(0) })
      return a
    }, [])
}

// bytes that must follow the first byte of each literal token (false, true, null)
const TOK_BYTES = ascii_to_bytes({ f: 'alse', t: 'rue', n: 'ull' })

// reverse lookup: position code -> position name (sparse array indexed by POS value)
const POS2NAME = Object.keys(POS).reduce(function (/** @type {string[]}*/ a, /** @type {string} */ n) { a[POS[n]] = n; return a }, [])

// Build the state-transition table used by next().  The table is indexed by (position | token-byte) and
// holds the resulting position, or zero when the token is not allowed at that position.
function pos_map () {
  const ret = []
  const max = 0x400 + 0xFF            // max pos + max ascii
  for (let i = 0; i <= max; i++) {
    ret[i] = 0
  }

  // pos_pairs is generated by utils.js
  // flat list of [ (pos | tok), next-pos, ... ]
  const pos_pairs = [
    219,128,221,384,228,384,230,384,238,384,243,384,244,384,251,512,
    347,128,356,384,358,384,366,384,371,384,372,384,379,512,428,256,
    477,384,627,768,637,384,755,768,826,896,987,128,996,1024,998,1024,
    1006,1024,1011,1024,1012,1024,1019,512,1068,640,1149,384,
  ]
  for (let i = 0; i < pos_pairs.length; i += 2) {
    ret[pos_pairs[i]] = pos_pairs[i + 1]
  }
  return ret
}

const POS_MAP = pos_map()

// skip as many bytes of src that match bsrc, up to lim.
// return (byte offset) after all bytes from bsrc are matched or -(byte offset) of first
// unmatched byte, if unmatched.
function skip_bytes (src, off, lim, bsrc) {
  let blen = bsrc.length
  if (blen > lim - off) { blen = lim - off }
  let i = 0
  // check the bound first so that bytes at or beyond lim are never compared
  while (i < blen && bsrc[i] === src[i + off]) { i++ }
  return i === bsrc.length ? i + off : -(i + off)
}

// Skip string content starting at off (the first byte after the opening quote), up to lim.
// return (byte offset) just after the closing quote, or -(byte offset) where scanning stopped if no
// unescaped closing quote was found before lim.
function skip_str (src, off, lim) {
  let i = off
  while (i < lim) {
    if (src[i] === 34) {
      // Count the run of backslashes directly before this quote.  The scan never looks before off, so
      // bytes outside of the [off, lim) range cannot escape a quote.
      let nesc = 0
      while (i - 1 - nesc >= off && src[i - 1 - nesc] === 92) { nesc++ }    // \ BACKSLASH escape
      if (nesc % 2 === 0) {
        // an even number of backslashes are escaped backslashes - the quote itself is not escaped
        return i + 1  // skip quote
      }
    }
    i++
  }
  return -i
}

// Skip decimal ascii bytes (-0123456789+.eE) starting at off, up to lim.
// return (byte offset) of the delimiter that ended the decimal, or -(byte offset) where scanning stopped
// if lim was reached or the following byte is not a delimiter.
function skip_dec (src, off, lim) {
  while (off < lim && (CMAP[src[off]] & DECIMAL_ASCII)) { off++ }
  return (off < lim && (CMAP[src[off]] & DELIM)) ? off : -off
}

//
// switch ps.src to ps.next_src if conditions are right (ps.src is null or is complete without errors)
// return true if the switch was made, false if not.
//
function next_src (ps) {
  if (ps.ecode || (ps.src && ps.vlim < ps.lim)) {
    return false
  }
  if (ps.next_src.length === 0) {
    return false
  }
  ps.soff += ps.src && ps.src.length || 0
  ps.src = ps.next_src
  ps.next_src = []
  ps.koff = ps.klim = ps.voff = ps.vlim = ps.tok = ps.ecode = 0
  ps.lim = ps.src.length
  return true
}

// Lazy-initialize an object's properties to hold all ParseState values/defaults.  The object is modified in place
// to support legacy usage.  The object is also returned as a typed ParseState to support type clarity with
// TypeScript and documentation.
// Though functionally equivalent, use the returned object to show type-clarity.
function init (ps) {
  ps.soff = ps.soff || 0                // prior src offset.  e.g. ps.soff + ps.vlim = total byte offset from start
  ps.src = ps.src || []
  ps.lim = ps.lim == null ? ps.src.length : ps.lim
  ps.koff = ps.koff || ps.soff
  ps.klim = ps.klim || ps.koff
  ps.voff = ps.voff || ps.klim
  ps.vlim = ps.vlim || ps.voff
  ps.tok = ps.tok || 0
  ps.stack = ps.stack || []
  ps.pos = ps.pos || POS.A_BF
  ps.ecode = ps.ecode || 0
  ps.vcount = ps.vcount || 0
  ps.line = ps.line || 1                // 1-based: incremented by next() for each new-line byte
  ps.lineoff = ps.lineoff || 0
  ps.next_src = ps.next_src || []
  if (ps.next_src.length) {
    next_src(ps)
  }
  return ps
}

// Handle cases where tokenization has stopped due to unexpected
// or invalid bytes or running out of buffer.  If smooth buffer
// transition is possible, seamless transition is executed with next_src.
// If not, ecode is updated to facilitate further handling.  Err handling is
// invoked for bad or invalid bytes.
function end_src (ps, opt) {
  switch (ps.ecode) {
    case 0:
      if (ps.pos === POS.O_AK || ps.pos === POS.O_BV) {
        ps.ecode = ECODE.KEY_NO_VAL
      } else {
        if (ps.next_src && next_src(ps)) { return next(ps) }
      }
      break
    case ECODE.BAD_VALUE: case ECODE.UNEXPECTED:
      ps.tok = 0
      if (opt && (typeof opt.err === 'function')) {
        opt.err(ps)
        return ps.tok      // the handler may have updated ps.tok
      } else {
        checke(ps)         // throws error
      }
    // any other ecode is just sticky (prevents progress)
  }
  return ps.tok = 0
}

// Handle a negative ps.vlim (returned by one of the skip functions).  Restores vlim to a positive offset and
// sets ecode to TRUNC_DEC/TRUNCATED if the limit was reached, or to BAD_VALUE (including the offending byte
// in the value range) if it was not.
function handle_neg (ps, opt) {
  ps.vlim = -ps.vlim
  if (ps.vlim >= ps.lim) {
    ps.ecode =
      ps.tok === TOK.DEC && (CMAP[ps.src[ps.vlim - 1]] & DECIMAL_END)
        ? ECODE.TRUNC_DEC
        : ECODE.TRUNCATED
  } else {
    ps.ecode = ECODE.BAD_VALUE
    ps.vlim++
  }
  return end_src(ps, opt)
}

// Handle a recognized token found at a position where it is not allowed.
function handle_unexp (ps, opt) {
  if (ps.vlim < 0) { ps.vlim = -ps.vlim }
  ps.ecode = ECODE.UNEXPECTED
  return end_src(ps, opt)
}

// Default error handler.  Throws an error with the given message and parse_state as a property of the error.
function err (msg, ps) {
  // ps.line is reported as-is: init() starts it at 1 and next() increments it for every new-line byte, so it
  // already is the 1-based line number.
  const ctx = '(line ' + ps.line + ', col ' + (ps.soff + ps.voff - ps.lineoff) + ', tokstr ' + tokstr(ps, true) + ')'
  /** @type {*} */
  const e = new Error(msg + ': ' + ctx)
  e.parse_state = ps
  throw e
}

//
// PUBLIC API
//

/**
 * Return the abbreviated string name for a Position integer code.
 *
 * @param {number} pos Position integer state-code as stored in ParseState.pos
 * @returns {string} Abbreviated string name for the code, or '???' if the code is not a known position
 */
function posname (pos) { return POS2NAME[pos] || '???' }

/**
 * Create and return a new ParseState object.
 *
 * @param {*} src the bytes to parse, stored as ParseState.src
 * @returns {*} a new, initialized ParseState
 */
function ps (src) {
  const ret = { src: src }
  return init(ret)
}

// Parses next token from ps.src, *very quickly*.  Call this function repeatedly to tokenize JSON buffers
// passing in the same ParseState which is updated in place to the next key/value in the buffer.
//
//    ps    object holding parse state context/position and current token to be updated to next token state.
//    opt   optional override for error handling.  If opt.err is a function, it is called with ps instead of
//          throwing an error for bad values and unexpected tokens.
//
// Return the token value successfully parsed TOK.string, TOK.number... or zero if incomplete or error
//
// For Example:
//
//    const next = require('qb-json-next')
//    const someJSON = '{"a": "some",\n "b": "json to parse", \n "c": [1.1, 2.5, 33]}'
//    const ps = next.ps(Buffer.from(someJSON))
//    while (next(ps)) {
//      console.log(next.tokstr(ps))
//    }
function next (ps, opt) {
  if (!ps.pos) { init(ps) }
  if (ps.ecode !== 0) {                               // ecode is sticky (requires intentional fix)
    return ps.tok = 0
  }
  ps.koff = ps.klim = ps.voff = ps.vlim
  let pos1 = ps.pos
  while (ps.vlim < ps.lim) {
    ps.voff = ps.vlim
    ps.tok = ps.src[ps.vlim++]
    switch (ps.tok) {
      case 10:                                          // new-line
        ps.lineoff = ps.soff + ps.vlim
        ps.line++
        continue

      case 13:                                          // carriage return
        ps.lineoff = ps.soff + ps.vlim
        continue

      case 8: case 9: case 12: case 32:                 // other white-space
        continue

      case 44:                                          // ,    COMMA
      case 58:                                          // :    COLON
        pos1 = POS_MAP[ps.pos | ps.tok]
        if (pos1 === 0) { ps.voff = ps.vlim - 1; return handle_unexp(ps, opt) }
        ps.pos = pos1
        continue

      case 34:                                          // "    QUOTE
        ps.tok = 115                                    // s for string
        ps.vlim = skip_str(ps.src, ps.vlim, ps.lim)
        pos1 = POS_MAP[ps.pos | ps.tok]
        if (pos1 === 0) return handle_unexp(ps, opt)
        if (pos1 === POS.O_AK) {
          // key
          ps.koff = ps.voff
          if (ps.vlim > 0) {
            ps.pos = pos1; ps.klim = ps.voff = ps.vlim; continue
          } else {
            ps.klim = ps.voff = -ps.vlim; return handle_neg(ps, opt)
          }
        } else {
          // value
          if (ps.vlim > 0) { ps.pos = pos1; ps.vcount++; return ps.tok } else return handle_neg(ps, opt)
        }

      case 102:                                         // f    false
      case 110:                                         // n    null
      case 116:                                         // t    true
        ps.vlim = skip_bytes(ps.src, ps.vlim, ps.lim, TOK_BYTES[ps.tok])
        pos1 = POS_MAP[ps.pos | ps.tok]
        if (pos1 === 0) return handle_unexp(ps, opt)
        if (ps.vlim > 0) { ps.pos = pos1; ps.vcount++; return ps.tok } else return handle_neg(ps, opt)

      case 48: case 49: case 50: case 51: case 52:      // 0-4    digits
      case 53: case 54: case 55: case 56: case 57:      // 5-9    digits
      case 45:                                          // '-'    ('+' is not legal here)
        ps.tok = 100                                    // d for decimal
        ps.vlim = skip_dec(ps.src, ps.vlim, ps.lim)
        pos1 = POS_MAP[ps.pos | ps.tok]
        if (pos1 === 0) return handle_unexp(ps, opt)
        if (ps.vlim > 0) { ps.pos = pos1; ps.vcount++; return ps.tok } else return handle_neg(ps, opt)

      case 91:                                          // [    ARRAY START
      case 123:                                         // {    OBJECT START
        pos1 = POS_MAP[ps.pos | ps.tok]
        if (pos1 === 0) return handle_unexp(ps, opt)
        ps.pos = pos1
        ps.stack.push(ps.tok)
        return ps.tok

      case 93:                                          // ]    ARRAY END
        if (POS_MAP[ps.pos | ps.tok] === 0) return handle_unexp(ps, opt)
        ps.stack.pop()
        // position is "after value" of whichever container is now on top of the stack
        ps.pos = ps.stack[ps.stack.length - 1] === 123 ? POS.O_AV : POS.A_AV
        ps.vcount++; return ps.tok

      case 125:                                         // }    OBJECT END
        if (POS_MAP[ps.pos | ps.tok] === 0) return handle_unexp(ps, opt)
        ps.stack.pop()
        ps.pos = ps.stack[ps.stack.length - 1] === 123 ? POS.O_AV : POS.A_AV
        ps.vcount++; return ps.tok

      default:
        --ps.vlim
        ps.ecode = ECODE.BAD_VALUE
        return end_src(ps, opt)
    }
  }

  // reached src limit without error or truncation
  if (CMAP[ps.tok] & NON_TOKEN) {
    ps.voff = ps.vlim
  }
  return end_src(ps, opt)
}

// Convenience function to throw error if parse state shows unexpected or invalid value encountered.
// Throw error is the default behavior of next(), but can be overridden with the err override
// option:
//
//    next(ps, { err: (ps) => {handle-error-my-way...} })
function checke (ps) {
  if (ps.ecode === ECODE.UNEXPECTED) { err('unexpected token at ' + ps.voff + '..' + ps.vlim, ps) }
  if (ps.ecode === ECODE.BAD_VALUE) { err('bad value at ' + ps.voff + '..' + ps.vlim, ps) }
}

// Return the parse state as a brief string.
//    ps        Parse state that was updated by calling next(ps)
//    detail    (optional) Pass true to print more detail including the position state and stack context
function tokstr (ps, detail) {
  const keystr = ps.koff === ps.klim ? '' : 'k' + (ps.klim - ps.koff) + '@' + ps.koff + ':'
  const vlen = (ps.vlim === ps.voff || (CMAP[ps.tok] & NO_LEN_TOKENS)) ? '' : ps.vlim - ps.voff

  const tchar = ps.tok && String.fromCharCode(ps.tok) || '!'
  let ret = keystr + tchar + vlen + '@' + ps.voff
  if (ps.ecode) {
    ret += ':' + String.fromCharCode(ps.ecode)
  }
  if (detail) {
    ret += ':' + posname(ps.pos)
    if (ps.stack && ps.stack.length) {
      ret += ':' + ps.stack.map(function (c) { return String.fromCharCode(c) }).join('')
    }
  }
  return ret
}

next.checke = checke
next.next = next
next.posname = posname
next.ps = ps
next.tokstr = tokstr

next.ECODE = ECODE
next.POS = POS
next.TOK = TOK

next._skip_bytes = skip_bytes
next._skip_dec = skip_dec
next._skip_str = skip_str
next._TOK_BYTES = TOK_BYTES
next._init = init

module.exports = next