// oxc-parser: Oxc Parser Node API
import os from 'node:os';
import {
getBufferOffset,
parseAsyncRaw as parseAsyncRawBinding,
parseSyncRaw as parseSyncRawBinding,
} from '../bindings.mjs';
import { BUFFER_ALIGN, BUFFER_SIZE, IS_TS_FLAG_POS } from '../generated/constants.mjs';
import { rawTransferSupported } from './supported.mjs';
// Throw an error if running on a platform which raw transfer doesn't support.
//
// Note: This module is lazy-loaded only when the user calls `parseSync` or `parseAsync` with
// `experimentalRawTransfer` or `experimentalLazy` options, or calls `experimentalGetLazyVisitor`.
if (!rawTransferSupported()) {
throw new Error(
'`experimentalRawTransfer` and `experimentalLazy` options are not supported ' +
'on 32-bit or big-endian systems, versions of NodeJS prior to v22.0.0, ' +
'versions of Deno prior to v2.0.0, or other runtimes',
);
}
/**
* Parse JS/TS source synchronously on the current thread using raw transfer.
*
* Convert the buffer returned by Rust to a JS object with provided `convert` function.
*
* This function contains logic shared by both `parseSyncRaw` and `parseSyncLazy`.
*
* @param {string} filename - Filename
* @param {string} sourceText - Source text of file
* @param {Object|undefined} options - Parsing options
* @param {function} convert - Function to convert the buffer returned from Rust into a JS object
* @returns {Object} - The return value of `convert`
*/
export function parseSyncRawImpl(filename, sourceText, options, convert) {
const { buffer, sourceByteLen } = prepareRaw(sourceText);
parseSyncRawBinding(filename, buffer, sourceByteLen, options);
return convert(buffer, sourceText, sourceByteLen);
}
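// Illustrative sketch of how a wrapper might call `parseSyncRawImpl` (the real `parseSyncRaw`
// lives elsewhere in this package; `deserializeAst` below is a hypothetical stand-in for the
// actual `convert` function, not part of this module's API):
//
// ```js
// function parseSyncRaw(filename, sourceText, options) {
//   return parseSyncRawImpl(filename, sourceText, options, (buffer, sourceText, sourceByteLen) => {
//     const ast = deserializeAst(buffer, sourceText, sourceByteLen);
//     // Recycle the buffer once the AST has been read out of it
//     returnBufferToCache(buffer);
//     return ast;
//   });
// }
// ```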
// Users should not schedule more async tasks than there are available CPUs, as it hurts performance,
// but it's a common mistake in async JS code to do exactly that.
//
// That anti-pattern looks like this when applied to Oxc:
//
// ```js
// const asts = await Promise.all(
// files.map(
// async (filename) => {
// const sourceText = await fs.readFile(filename, 'utf8');
// const ast = await oxc.parseAsync(filename, sourceText);
// return ast;
// }
// )
// );
// ```
//
// In most cases, that would just result in somewhat degraded performance and higher memory use,
// because sources are loaded into memory prematurely.
//
// However, raw transfer uses a 6 GiB buffer for each parsing operation.
// Most of the memory pages in those buffers are never touched, so this does not consume a huge amount
// of physical memory, but it does still consume virtual memory.
//
// If we allowed creating a large number of 6 GiB buffers simultaneously, it would quickly consume
// virtual memory space and risk memory exhaustion. The code above would exhaust the entire bottom half
// (heap) of a 48-bit virtual memory space if `files.length >= 21_845`, which is not an unrealistic
// number in real-world code.
//
// To guard against this possibility, we implement a simple queue.
// No more than `os.availableParallelism()` files can be parsed simultaneously, and any further calls to
// `parseAsyncRaw` will be put in a queue, to execute once other tasks complete.
//
// Fall back to `os.cpus().length` on versions of NodeJS prior to v18.14.0, which do not support
// `os.availableParallelism`.
let availableCores = os.availableParallelism ? os.availableParallelism() : os.cpus().length;
const queue = [];
/**
* Parse JS/TS source asynchronously using raw transfer.
*
* Convert the buffer returned by Rust to a JS object with provided `convert` function.
*
* Queues up parsing operations if there are more concurrent calls than CPU cores (see above).
*
* This function contains logic shared by both `parseAsyncRaw` and `parseAsyncLazy`.
*
* @param {string} filename - Filename
* @param {string} sourceText - Source text of file
* @param {Object|undefined} options - Parsing options
* @param {function} convert - Function to convert the buffer returned from Rust into a JS object
* @returns {Object} - The return value of `convert`
*/
export async function parseAsyncRawImpl(filename, sourceText, options, convert) {
// Wait for a free CPU core if all CPUs are currently busy.
//
// Note: `availableCores` is NOT decremented if the task has to wait in the queue first,
// and NOT incremented when parsing completes and the next task in the queue is run.
//
// This is to avoid a race condition if `parseAsyncRaw` is called in the microtask gap between
// `resolve` being called below and the promise resolving here. In that case the new task could
// start running, and then the promise resolves, and the queued task also starts running.
// We'd then have `availableParallelism() + 1` tasks running simultaneously. Potentially, this could
// happen repeatedly, with the number of tasks running simultaneously ever-increasing.
if (availableCores === 0) {
// All CPU cores are busy. Put this task in queue and wait for capacity to become available.
await new Promise((resolve, _) => {
queue.push(resolve);
});
} else {
// A CPU core is available. Mark core as busy, and run parsing now.
availableCores--;
}
// Parse
const { buffer, sourceByteLen } = prepareRaw(sourceText);
await parseAsyncRawBinding(filename, buffer, sourceByteLen, options);
const data = convert(buffer, sourceText, sourceByteLen);
// Free the CPU core
if (queue.length > 0) {
// Some further tasks waiting in queue. Run the next one.
// Do not increment `availableCores` (see above).
const resolve = queue.shift();
resolve();
} else {
// No tasks waiting in queue. This CPU is now free.
availableCores++;
}
return data;
}
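// With this queue in place, a caller can start parses for many files at once and the number
// actually running in parallel stays capped by the initial `availableCores`. Illustrative only
// (`convert` here is a stand-in for a real deserializer function):
//
// ```js
// import fs from 'node:fs/promises';
//
// const asts = await Promise.all(
//   filenames.map(async (filename) => {
//     const sourceText = await fs.readFile(filename, 'utf8');
//     // At most `availableCores` of these run concurrently; the rest wait in `queue`
//     return parseAsyncRawImpl(filename, sourceText, undefined, convert);
//   }),
// );
// ```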
const ARRAY_BUFFER_SIZE = BUFFER_SIZE + BUFFER_ALIGN;
const ONE_GIB = 1 << 30;
// We keep a cache of buffers for raw transfer, so we can reuse them as much as possible.
//
// When processing multiple files, it's ideal if we can reuse an existing buffer, as it's more likely to
// be warm in CPU cache, it avoids allocations, and it saves work for the garbage collector.
//
// However, we also don't want to keep a load of large buffers around indefinitely using up memory,
// if they're not going to be used again.
//
// We have no knowledge of the pattern in which the user may process files over time (could be lots in
// quick succession, or occasional files in a long-running process). So we try to use a flexible caching
// strategy which is adaptable to many usage patterns.
//
// We use a 2-tier cache.
// Tier 1 uses strong references, tier 2 uses weak references.
//
// When parsing is complete and the buffer is no longer in use, push it to `buffers` (tier 1 cache).
// Set a timer to clear the cache when no activity for 10 seconds.
//
// When the timer expires, move all the buffers from tier 1 cache into `oldBuffers` (tier 2).
// They are stored there as `WeakRef`s, so the garbage collector is free to reclaim them.
//
// On the next call to `parseSync` or `parseAsync`, promote any buffers in tier 2 cache which were not
// already garbage collected back into tier 1 cache. This is on the assumption that parsing one file
// indicates that parsing as a whole is an ongoing process, and there will likely be further calls to
// `parseSync` / `parseAsync` in the future.
//
// The weak tier 2 cache exists because V8 does not necessarily free memory as soon as it's able to be
// freed. We don't want to block it from freeing memory, but if it hasn't done that yet, there's no
// point creating a new buffer when one already exists.
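//
// Rough lifecycle of a single buffer, for illustration:
//
// t = 0s:  `prepareRaw` takes a buffer from `buffers` (or creates one), and parsing runs.
// t = 0s:  `returnBufferToCache` pushes the buffer back into `buffers`; the 10 second timer starts.
// t = 10s: Timer fires. `clearBuffersCache` moves the buffer into `oldBuffers` as a `WeakRef`.
// t = 25s: `prepareRaw` runs again. If the GC has not collected the buffer yet, it's revived
//          into `buffers` and reused; otherwise a new buffer is created.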
const CLEAR_BUFFERS_TIMEOUT = 10_000; // 10 seconds
const buffers = [], oldBuffers = [];
let clearBuffersTimeout = null;
const textEncoder = new TextEncoder();
/**
* Get a buffer (from cache if possible), and copy source text into it.
*
* @param {string} sourceText - Source text of file
* @returns {Object} - Object of form `{ buffer, sourceByteLen }`.
* - `buffer`: `Uint8Array` with the source text written at its start, into which Rust writes the raw AST.
* - `sourceByteLen`: Length of source text in UTF-8 bytes
* (which may not be equal to `sourceText.length` if source contains non-ASCII characters).
*/
export function prepareRaw(sourceText) {
// Cancel timeout for clearing buffers
if (clearBuffersTimeout !== null) {
clearTimeout(clearBuffersTimeout);
clearBuffersTimeout = null;
}
// Revive any discarded buffers which have not yet been garbage collected
if (oldBuffers.length > 0) {
const revivedBuffers = [];
for (let oldBuffer of oldBuffers) {
oldBuffer = oldBuffer.deref();
if (oldBuffer !== undefined) revivedBuffers.push(oldBuffer);
}
oldBuffers.length = 0;
if (revivedBuffers.length > 0) buffers.unshift(...revivedBuffers);
}
// Reuse existing buffer, or create a new one
const buffer = buffers.length > 0 ? buffers.pop() : createBuffer();
// Write source into start of buffer.
// `TextEncoder` cannot write into a `Uint8Array` larger than 1 GiB,
// so create a view into buffer of this size to write into.
const sourceBuffer = new Uint8Array(buffer.buffer, buffer.byteOffset, ONE_GIB);
const { read, written: sourceByteLen } = textEncoder.encodeInto(sourceText, sourceBuffer);
if (read !== sourceText.length) throw new Error('Failed to write source text into buffer');
return { buffer, sourceByteLen };
}
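// For reference, `TextEncoder.encodeInto` returns how many UTF-16 code units it consumed (`read`)
// and how many UTF-8 bytes it wrote (`written`), which is why `sourceByteLen` can differ from
// `sourceText.length`. A small standalone example:
//
// ```js
// const { read, written } = new TextEncoder().encodeInto('café', new Uint8Array(16));
// // read === 4 (code units of 'café'), written === 5 ('é' is 2 bytes in UTF-8)
// ```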
/**
* Get whether the AST in the buffer is JS or TS.
* Rust side sets a `bool` at this position in the buffer, which is `true` if TS.
*
* @param {Uint8Array} buffer - Buffer containing AST in raw form
* @returns {boolean} - `true` if AST is JS, `false` if TS
*/
export function isJsAst(buffer) {
return buffer[IS_TS_FLAG_POS] === 0;
}
/**
* Return buffer to cache, to be reused.
* Set a timer to clear buffers.
*
* @param {Uint8Array} buffer - Buffer
* @returns {undefined}
*/
export function returnBufferToCache(buffer) {
buffers.push(buffer);
if (clearBuffersTimeout !== null) clearTimeout(clearBuffersTimeout);
clearBuffersTimeout = setTimeout(clearBuffersCache, CLEAR_BUFFERS_TIMEOUT);
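// `unref` the timer so this pending timeout alone doesn't keep the Node.js process alive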
clearBuffersTimeout.unref();
}
/**
* Downgrade buffers in tier 1 cache (`buffers`) to tier 2 (`oldBuffers`)
* so they can be garbage collected.
*
* @returns {undefined}
*/
function clearBuffersCache() {
clearBuffersTimeout = null;
for (const buffer of buffers) {
oldBuffers.push(new WeakRef(buffer));
}
buffers.length = 0;
}
/**
* Create a `Uint8Array` which is 2 GiB in size, with its start aligned on 4 GiB.
*
* Achieve this by creating a 6 GiB `ArrayBuffer`, getting the offset within it that's aligned to 4 GiB,
* chopping off that number of bytes from the start, and shortening to 2 GiB.
*
* It's always possible to obtain a 2 GiB slice aligned on 4 GiB within a 6 GiB buffer,
* no matter how the 6 GiB buffer is aligned.
*
* Note: On systems with virtual memory, this only consumes 6 GiB of *virtual* memory.
* It does not consume physical memory until data is actually written to the `Uint8Array`.
* Physical memory consumed corresponds to the quantity of data actually written.
*
* @returns {Uint8Array} - Buffer
*/
function createBuffer() {
const arrayBuffer = new ArrayBuffer(ARRAY_BUFFER_SIZE);
const offset = getBufferOffset(new Uint8Array(arrayBuffer));
const buffer = new Uint8Array(arrayBuffer, offset, BUFFER_SIZE);
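// Attach extra typed-array views over the same region of memory, so 32-bit and 64-bit values can be
// read from the buffer without creating new views on each access (presumably used by the deserializer
// elsewhere in this package)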
buffer.uint32 = new Uint32Array(arrayBuffer, offset, BUFFER_SIZE / 4);
buffer.float64 = new Float64Array(arrayBuffer, offset, BUFFER_SIZE / 8);
return buffer;
}
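// Worked example of the alignment arithmetic (assuming, per the description above, that
// `BUFFER_SIZE` is 2 GiB and `BUFFER_ALIGN` is 4 GiB, so `ARRAY_BUFFER_SIZE` is 6 GiB):
//
// Suppose the 6 GiB `ArrayBuffer` happens to start at address 0x1_2345_6000.
// The next 4 GiB boundary is 0x2_0000_0000, so `offset` = 0xDCBA_A000 (~3.45 GiB).
// `offset` is always < 4 GiB, so at least 6 GiB - 4 GiB = 2 GiB of the `ArrayBuffer` remains
// after the boundary - enough for the 2 GiB `Uint8Array` view.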