UNPKG

sax-wasm

Version:

An extremely fast JSX, HTML and XML parser written in Rust compiled to WebAssembly for Node and the Web

694 lines (693 loc) 23.2 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.readPosition = exports.readU32 = exports.readString = exports.SAXParser = exports.Tag = exports.Text = exports.ProcInst = exports.Attribute = exports.Position = exports.Reader = exports.AttributeType = exports.SaxEventType = void 0; /** * An enum representing the events that can be * subscribed to on the parser. Multiple events * are subscribed to by using the bitwise or operator. * * @example * ```ts * // Subscribe to both the Text and OpenTag events. * const parser = new SaxParser(SaxEventType.Text | SaxEventType.OpenTag); * ``` * Event subscriptions can be updated between write operations. * * Note that minimizing the number of events will have a * slight performance improvement which becomes more noticeable * on very large documents. */ exports.SaxEventType = { Text: 0b1, ProcessingInstruction: 0b10, Declaration: 0b100, Doctype: 0b1000, Comment: 0b10000, OpenTagStart: 0b100000, Attribute: 0b1000000, OpenTag: 0b10000000, CloseTag: 0b100000000, Cdata: 0b1000000000, }; /** * Represents the different types of attributes. */ var AttributeType; (function (AttributeType) { AttributeType[AttributeType["NoValue"] = 0] = "NoValue"; AttributeType[AttributeType["JSX"] = 1] = "JSX"; AttributeType[AttributeType["NoQuotes"] = 2] = "NoQuotes"; AttributeType[AttributeType["SingleQuoted"] = 4] = "SingleQuoted"; AttributeType[AttributeType["DoubleQuoted"] = 8] = "DoubleQuoted"; })(AttributeType || (exports.AttributeType = AttributeType = {})); /** * Abstract class for decoding SAX event data. * * @template T - The type of detail to be read. */ class Reader { data; memory; cache = {}; #dataView; get dataView() { return this.#dataView ??= new Uint8Array(this.memory.buffer); } /** * Creates a new Reader instance. * * @param data - The data buffer containing the event data. * @param ptr - The initial pointer position. * @param memory - The WebAssembly memory instance. */ constructor(data, memory) { this.data = data; this.memory = memory; } } exports.Reader = Reader; /** * Class representing the line and character * integers for entities that are encountered * in the document. */ class Position { line; character; /** * Creates a new Position instance. * * @param line - The line number. * @param character - The character position. */ constructor(line, character) { this.line = line; this.character = character; } } exports.Position = Position; /** * Represents an attribute in the XML data. * * This class decodes the Attribute data sent across * the FFI boundary. Encoded data has the following schema: * * 1. AttributeType - byte position 0 (1 bytes) * 2. name_length - length of the 'name' Text - byte position 1-4 (4 bytes) * 3. 'name' bytes - byte position 5-name_length (name_length bytes) * 4. 'value' bytes - byte position name_length-n (n bytes) */ class Attribute extends Reader { static LENGTH = 168; type; name; value; constructor(data, memory) { super(data, memory); this.name = new Text(new Uint8Array(data.buffer, data.byteOffset, Text.LENGTH), memory); this.value = new Text(new Uint8Array(data.buffer, data.byteOffset + Text.LENGTH, Text.LENGTH), memory); this.type = data[144]; } /** * Gets the byte offsets representing the * start and end byte in the data */ get byteOffsets() { return (this.cache.byteOffsets ??= { start: readU64(this.data, 152), end: readU64(this.data, 160) }); } /** * @inheritDoc */ toJSON() { const { name, value, type, byteOffsets } = this; return { name: name.toJSON(), value: value.toJSON(), type, byteOffsets }; } /** * Converts the attribute to a string representation. * * @returns A string representing the attribute. */ toString() { const { name, value } = this; return this.type === AttributeType.JSX ? `${name}="{${value}}"` : `${name}="${value}"`; } } exports.Attribute = Attribute; /** * Represents a processing instruction in the XML data. * * This class decodes the processing instruction data sent across the FFI boundary. * The encoded data has the following schema: * * 1. Start position (line and character) - byte positions 0-7 (8 bytes) * 2. End position (line and character) - byte positions 8-15 (8 bytes) * 3. Target length - byte positions 16-19 (4 bytes) * 4. Target bytes - byte positions 20-(20 + target length - 1) (target length bytes) * 5. Content bytes - byte positions (20 + target length)-(end of buffer) (remaining bytes) * * The `ProcInst` class decodes this data into its respective fields: `start`, `end`, `target`, and `content`. * * # Fields * * * `start` - The start position of the processing instruction. * * `end` - The end position of the processing instruction. * * `target` - The target of the processing instruction. * * `content` - The content of the processing instruction. * * # Arguments * * * `buffer` - The buffer containing the processing instruction data. * * `ptr` - The initial pointer position. */ class ProcInst extends Reader { static LENGTH = 186; target; content; constructor(data, memory) { super(data, memory); this.target = new Text(new Uint8Array(data.buffer, data.byteOffset + 32, Text.LENGTH), memory); this.content = new Text(new Uint8Array(data.buffer, data.byteOffset + 32 + Text.LENGTH, Text.LENGTH), memory); } /** * Gets the start position of the processing instruction. * * @returns The start position of the processing instruction. */ get start() { return (this.cache.start || (this.cache.start = (0, exports.readPosition)(this.data, 0))); } /** * Gets the start position of the processing instruction. * * @returns The start position of the processing instruction. */ get end() { return (this.cache.end || (this.cache.end = (0, exports.readPosition)(this.data, 16))); } /** * Gets the byte offsets representing the * start and end byte in the data */ get byteOffsets() { return (this.cache.byteOffsets ??= { start: readU64(this.data, 16), end: readU64(this.data, 24), }); } /** * Converts the processing instruction to a JSON object. * * @returns A JSON object representing the processing instruction. */ toJSON() { const { start, end, target, content, byteOffsets } = this; return { start, end, target: target.toJSON(), content: content.toJSON(), byteOffsets }; } /** * @inheritdoc */ toString() { const { target, content } = this; return `<? ${target} ${content} ?>`; } } exports.ProcInst = ProcInst; /** * Represents a text node in the XML data. * * This class decodes the text node data sent across the FFI boundary * into its respective fields: `start`, `end`, and `value`. */ class Text extends Reader { static LENGTH = 72; /** * Gets the start position of the text node. * * @returns The start position of the text node. */ get start() { return this.cache.start || (this.cache.start = (0, exports.readPosition)(this.data, 24)); } /** * Gets the end position of the text node. * * @returns The end position of the text node. */ get end() { return this.cache.end || (this.cache.end = (0, exports.readPosition)(this.data, 40)); } /** * Gets the value of the text node. * * @returns The value of the text node. */ get value() { if (this.cache.value) { return this.cache.value; } const vecPtr = (0, exports.readU32)(this.data, 12); const valueLen = (0, exports.readU32)(this.data, 16); return (this.cache.value = (0, exports.readString)(this.dataView, vecPtr, valueLen)); } /** * Gets the byte offsets representing the * start and end byte in the data */ get byteOffsets() { return (this.cache.byteOffsets ??= { start: readU64(this.data, 56), end: readU64(this.data, 64) }); } /** * Converts the text node to a JSON object. * * @returns A JSON object representing the text node. */ toJSON() { const { start, end, value, byteOffsets } = this; return { start, end, value, byteOffsets }; } /** * Converts the text node to a string representation. * * @returns A string representing the text node. */ toString() { return this.value; } } exports.Text = Text; /** * Represents a tag in the XML data. * * This class decodes the tag data sent across the FFI boundary * into its respective fields: `openStart`, `openEnd`, `closeStart`, * `closeEnd`, `selfClosing`, `name`, `attributes`, and `textNodes`. */ class Tag extends Reader { static LENGTH = 128; /** * Gets the start position of the tag opening. * * @returns The start position of the tag opening. */ get openStart() { return (this.cache.openStart || (this.cache.openStart = (0, exports.readPosition)(this.data, 40))); } /** * Gets the end position of the tag opening. * * @returns The end position of the tag opening. */ get openEnd() { return (this.cache.openEnd || (this.cache.openEnd = (0, exports.readPosition)(this.data, 56))); } /** * Gets the start position of the tag closing. * * @returns The start position of the tag closing. */ get closeStart() { return (this.cache.closeStart || (this.cache.closeStart = (0, exports.readPosition)(this.data, 72))); } /** * Gets the end position of the tag closing. * * @returns The end position of the tag closing. */ get closeEnd() { return (this.cache.closeEnd || (this.cache.closeEnd = (0, exports.readPosition)(this.data, 88))); } /** * Gets the self-closing flag of the tag. * * @returns The self-closing flag of the tag. */ get selfClosing() { return !!this.data[36]; } /** * Gets the name of the tag. * * @returns The name of the tag. */ get name() { if (this.cache.name) { return this.cache.name; } const vecPtr = (0, exports.readU32)(this.data, 4); const valueLen = (0, exports.readU32)(this.data, 8); return (this.cache.name = (0, exports.readString)(this.dataView, vecPtr, valueLen)); } /** * Gets the attributes of the tag. * * @returns An array of attributes of the tag. * @see Attribute */ get attributes() { if (this.cache.attributes) { return this.cache.attributes; } // starting location of the attribute block let ptr = (0, exports.readU32)(this.data, 16); const numAttrs = (0, exports.readU32)(this.data, 20); const attributes = []; for (let i = 0; i < numAttrs; i++) { const attrVecData = new Uint8Array(this.dataView.buffer, ptr, Attribute.LENGTH); attributes[i] = new Attribute(attrVecData, this.memory); ptr += Attribute.LENGTH; } return (this.cache.attributes = attributes); } /** * Gets the text nodes within the tag. * * @returns An array of text nodes within the tag. * @see Text */ get textNodes() { if (this.cache.textNodes) { return this.cache.textNodes; } // starting location of the text nodes block let ptr = (0, exports.readU32)(this.data, 28); const numTextNodes = (0, exports.readU32)(this.data, 32); const textNodes = []; for (let i = 0; i < numTextNodes; i++) { const textVecData = new Uint8Array(this.dataView.buffer, ptr, Text.LENGTH); textNodes[i] = new Text(textVecData, this.memory); ptr += Text.LENGTH; } return (this.cache.textNodes = textNodes); } /** * Gets the byte offsets representing the * start and end byte in the data */ get byteOffsets() { return (this.cache.byteOffsets ??= { start: readU64(this.data, 112), end: readU64(this.data, 120) }); } /** * Converts the tag to a JSON object. * * @returns A JSON object representing the tag. */ toJSON() { const { openStart, openEnd, closeStart, closeEnd, name, attributes, textNodes, selfClosing, byteOffsets } = this; return { openStart, openEnd, closeStart, closeEnd, name, attributes: attributes.map(a => a.toJSON()), textNodes: textNodes.map(t => t.toJSON()), selfClosing, byteOffsets, }; } get value() { return this.name; } } exports.Tag = Tag; class SAXParser { static textDecoder = new TextDecoder(); events; wasmSaxParser; eventHandler; createDetailConstructor(Constructor) { return (memoryBuffer, ptr) => { return new Constructor(new Uint8Array(memoryBuffer, ptr, Constructor.LENGTH), this.wasmSaxParser.memory); }; } eventConstructors = []; writeBuffer; constructor(events = 0) { const self = this; // Initialize a fast lookup table for event constructors to avoid Map lookups per event. this.eventConstructors[exports.SaxEventType.Attribute] = this.createDetailConstructor(Attribute); this.eventConstructors[exports.SaxEventType.ProcessingInstruction] = this.createDetailConstructor(ProcInst); this.eventConstructors[exports.SaxEventType.OpenTag] = this.createDetailConstructor(Tag); this.eventConstructors[exports.SaxEventType.CloseTag] = this.createDetailConstructor(Tag); this.eventConstructors[exports.SaxEventType.OpenTagStart] = this.createDetailConstructor(Tag); this.eventConstructors[exports.SaxEventType.Text] = this.createDetailConstructor(Text); this.eventConstructors[exports.SaxEventType.Cdata] = this.createDetailConstructor(Text); this.eventConstructors[exports.SaxEventType.Comment] = this.createDetailConstructor(Text); this.eventConstructors[exports.SaxEventType.Doctype] = this.createDetailConstructor(Text); this.eventConstructors[exports.SaxEventType.Declaration] = this.createDetailConstructor(Text); Object.defineProperties(this, { events: { get: () => ~~events, set: (value) => { if (events === ~~value) { return; } events = ~~value; if (self.wasmSaxParser) { self.wasmSaxParser.parser(events); } }, configurable: false, enumerable: true, }, }); } /** * Parses the XML data from a readable stream. * * This function takes a readable stream of `Uint8Array` chunks and processes them using the SAX parser. * It yields events and their details as they are parsed. * * # Arguments * * * `reader` - A readable stream reader for `Uint8Array` chunks. * * # Returns * * * An async generator yielding tuples of `SaxEventType` and `Detail`. * * # Examples * * ```ts * // Node.js example * import { createReadStream } from 'fs'; * import { resolve as pathResolve } from 'path'; * import { Readable } from 'stream'; * import { SAXParser, SaxEventType, Detail } from 'sax-wasm'; * * (async () => { * const parser = new SAXParser(SaxEventType.Text | SaxEventType.OpenTag); * const options = { encoding: 'utf8' }; * const readable = createReadStream(pathResolve('path/to/your.xml'), options); * const webReadable = Readable.toWeb(readable); * * for await (const [event, detail] of parser.parse(webReadable.getReader())) { * // Do something with these * } * })(); * * // Browser example * import { SAXParser, SaxEventType, Detail } from 'sax-wasm'; * * (async () => { * const parser = new SAXParser(SaxEventType.Text | SaxEventType.OpenTag); * const response = await fetch('path/to/your.xml'); * const reader = response.body.getReader(); * * for await (const [event, detail] of parser.parse(reader)) { * // Do something with these * } * })(); * ``` */ async *parse(reader) { let eventAggregator = []; this.eventHandler = function (event, detail) { eventAggregator.push([event, detail]); }; while (true) { const chunk = await reader.read(); if (chunk.done) { return this.end(); } this.write(chunk.value); if (eventAggregator.length) { for (const event of eventAggregator) { yield event; } eventAggregator.length = 0; } } } /** * Writes a chunk of data to the parser. * * This function takes a `Uint8Array` chunk and processes it using the SAX parser. * * # Arguments * * * `chunk` - A `Uint8Array` chunk representing the data to be parsed. * * # Examples * * ```ts * // Node.js example * import { createReadStream } from 'node:fs'; * import { resolve as pathResolve } from 'node:path'; * import { Readable } from 'stream'; * import { SAXParser, SaxEventType } from 'sax-wasm'; * * (async () => { * const parser = new SAXParser(SaxEventType.Text | SaxEventType.OpenTag); * await parser.prepareWasm(fetch('path/to/your.wasm')); * const options = { encoding: 'utf8' }; * const readable = createReadStream(pathResolve(__dirname + '/xml.xml'), options); * const webReadable = Readable.toWeb(readable); * * for await (const chunk of webReadable.getReader()) { * parser.write(chunk); * } * parser.end(); * })(); * * // Browser example * import { SAXParser, SaxEventType } from 'sax-wasm'; * * (async () => { * const parser = new SAXParser(SaxEventType.Text | SaxEventType.OpenTag); * await parser.prepareWasm(fetch('path/to/your.wasm')); * const response = await fetch('path/to/your.xml'); * const reader = response.body.getReader(); * * while (true) { * const { done, value } = await reader.read(); * if (done) break; * parser.write(value); * } * parser.end(); * })(); * ``` */ write(chunk) { if (!this.wasmSaxParser) { return; } const { write, memory: { buffer } } = this.wasmSaxParser; // Allocations within the WASM process // invalidate reference to the memory buffer. // We check for this and create a new Uint8Array // with the new memory buffer reference if needed. // **NOTE** These allocations can slow down parsing // if they become excessive. Consider adjusting the // highWaterMark in the options up or down to find the optimal // memory allocation to prevent too many new Uint8Array instances. if (this.writeBuffer?.buffer !== buffer) { this.writeBuffer = new Uint8Array(buffer); } this.writeBuffer.set(chunk, 4); write(4, chunk.byteLength); } /** * Ends the parsing process. * * This function signals the end of the parsing process notifies * the WASM binary to flush buffers and normalize. */ end() { this.writeBuffer = undefined; this.wasmSaxParser?.end(); } async prepareWasm(saxWasm) { const env = { memory: new WebAssembly.Memory({ initial: 10, shared: true, maximum: 150 }), table: new WebAssembly.Table({ initial: 1, element: 'anyfunc' }), event_listener: this.eventTrap }; const result = (saxWasm instanceof Uint8Array ? await WebAssembly.instantiate(saxWasm, { env }) : await WebAssembly.instantiateStreaming(saxWasm, { env })); if (result && typeof this.events === 'number') { const { parser } = this.wasmSaxParser = result.instance.exports; parser(this.events); return true; } throw new Error(`Failed to instantiate the parser.`); } eventTrap = (event, ptr) => { if (!this.wasmSaxParser || !this.eventHandler) { return; } const memoryBuffer = this.wasmSaxParser.memory.buffer; let detail; const ctor = this.eventConstructors[event]; if (ctor) { detail = ctor(memoryBuffer, ptr); } else { throw new Error("No reader for this event type"); } this.eventHandler(event, detail); }; } exports.SAXParser = SAXParser; const readString = (data, offset, length) => SAXParser.textDecoder.decode(data.subarray(offset, offset + length)); exports.readString = readString; let cachedDataBuffer = null; let cachedDataView = null; const dataViewFor = (array) => { const buffer = array.buffer; if (buffer !== cachedDataBuffer) { cachedDataBuffer = buffer; cachedDataView = new DataView(buffer); } return cachedDataView; }; const readU32 = (uint8Array, ptr) => { const view = dataViewFor(uint8Array); return view.getUint32(uint8Array.byteOffset + ptr, true); }; exports.readU32 = readU32; /** * Reads a u64 as a javascript number. This * will limit precision to 2⁵³ - 1 or 53 bits * or Number.MAX_SAFE_INTEGER or an XML document * that's 8,388,608 GB in size. * * When working with strings in JS, 64 bit * bigints don't make sense because string * length limits will be encountered before * reaching these values. * * @param uint8Array The data to read the u64 from * @param ptr The offset to start at * @returns number */ function readU64(uint8Array, ptr = 0) { const view = dataViewFor(uint8Array); const lo = view.getUint32(uint8Array.byteOffset + ptr, true); const hi = view.getUint32(uint8Array.byteOffset + ptr + 4, true); return lo + hi * 0x1_0000_0000; } const readPosition = (uint8Array, ptr = 0) => { const line = readU64(uint8Array, ptr); const character = readU64(uint8Array, ptr + 8); return new Position(line, character); }; exports.readPosition = readPosition;