ocsv
High-performance RFC 4180 compliant CSV parser with lazy mode for Bun. Parse 10M rows in 5s with minimal memory.
/**
* OCSV - High-performance CSV Parser
*
* A fast, RFC 4180 compliant CSV parser written in Odin with Bun FFI bindings.
* Achieves 61.25 MB/s throughput (56% of native) with zero memory leaks.
*
* @module ocsv
*/
import { dlopen, FFIType, ptr, toArrayBuffer } from "bun:ffi";
import { resolve, dirname } from "path";
import { fileURLToPath } from "url";
import { existsSync } from "fs";
import os from "os";
/**
* Error codes from the parser
*/
export const ParseErrorCode = {
NONE: 0,
INVALID_INPUT: 1,
UNTERMINATED_QUOTE: 2,
INVALID_ESCAPE: 3,
ROW_TOO_LARGE: 4,
MEMORY_ERROR: 5,
IO_ERROR: 6,
};
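// Illustrative reverse lookup (not an exported helper): map a numeric code
// from OcsvError.code back to its symbolic name.
//
// const codeName = Object.keys(ParseErrorCode)
//   .find((key) => ParseErrorCode[key] === err.code); // e.g. "UNTERMINATED_QUOTE"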
/**
* Custom error class for OCSV parsing errors
*
* @extends Error
* @example
* try {
* parser.parse(malformedCSV);
* } catch (err) {
* if (err instanceof OcsvError) {
* console.error(`Parse error at line ${err.line}, column ${err.column}: ${err.message}`);
* console.error(`Error code: ${err.code}`);
* }
* }
*/
export class OcsvError extends Error {
/**
* Create a new OCSV parsing error
* @param {string} message - Error message
* @param {number} code - Error code from ParseErrorCode
* @param {number} line - Line number where error occurred (1-indexed)
* @param {number} column - Column number where error occurred (1-indexed)
*/
constructor(message, code, line, column) {
super(message);
this.name = "OcsvError";
this.code = code;
this.line = line;
this.column = column;
}
}
/**
* Lazy row accessor - reads field data on demand
*
* Provides efficient access to individual fields without materializing
* the entire row in memory. Ideal for processing large files where you
* only need specific columns.
*
* @example
* const result = parseCSV(data, { mode: 'lazy' });
* const row = result.getRow(100);
* console.log(row.getField(0)); // Get first field
* console.log(row.toArray()); // Materialize entire row
* result.destroy();
*/
export class LazyRow {
/**
* Create a new lazy row accessor (internal use only)
* @private
* @param {bigint} parser - Pointer to native parser
* @param {number} rowIndex - Zero-based row index
*/
constructor(parser, rowIndex) {
this.parser = parser;
this.rowIndex = rowIndex;
this._fieldCount = null;
}
/**
* Get the number of fields in this row
* @type {number}
*/
get fieldCount() {
if (this._fieldCount === null) {
this._fieldCount = lib.symbols.ocsv_get_field_count(this.parser, this.rowIndex);
}
return this._fieldCount;
}
/**
* Get a specific field by index
* @param {number} fieldIndex - Zero-based field index
* @returns {string|null} Field value, or null if index out of bounds
*
* @example
* const row = result.getRow(5);
* const name = row.getField(0); // First column
* const age = row.getField(1); // Second column
*/
getField(fieldIndex) {
if (fieldIndex < 0 || fieldIndex >= this.fieldCount) {
return null;
}
return lib.symbols.ocsv_get_field(this.parser, this.rowIndex, fieldIndex) || "";
}
/**
* Materialize the entire row as an array
* @returns {string[]} Array of field values
*
* @example
* const row = result.getRow(10);
* const fields = row.toArray();
* console.log(fields); // ['Alice', '30', 'NYC']
*/
toArray() {
const result = [];
for (let i = 0; i < this.fieldCount; i++) {
result.push(this.getField(i));
}
return result;
}
/**
* Convert row to object using provided headers
* @param {string[]} headers - Column headers
* @returns {Object<string, string>} Object mapping headers to field values
*
* @example
* const headers = ['name', 'age', 'city'];
* const row = result.getRow(10);
* const obj = row.toObject(headers);
* console.log(obj); // { name: 'Alice', age: '30', city: 'NYC' }
*/
toObject(headers) {
const obj = {};
for (let i = 0; i < headers.length; i++) {
obj[headers[i]] = this.getField(i);
}
return obj;
}
}
/**
* Lazy result accessor - provides on-demand row access
*
* Returned when using `mode: 'lazy'`. Allows iteration over CSV rows
* without materializing the entire dataset in memory. Perfect for
* processing massive files (100M+ rows) with minimal memory footprint.
*
* **IMPORTANT:** You MUST call `destroy()` when done to free native memory.
*
* @example Basic usage
* const result = parseCSV(data, { mode: 'lazy' });
* try {
* const row = result.getRow(100);
* console.log(row.toArray());
* } finally {
* result.destroy(); // REQUIRED!
* }
*
* @example Iteration
* const result = parseCSV(data, { mode: 'lazy', hasHeader: true });
* try {
* for (const row of result) {
* const obj = row.toObject(result.headers);
* console.log(obj);
* }
* } finally {
* result.destroy();
* }
*/
export class LazyResult {
/**
* Create a new lazy result accessor (internal use only)
* @private
* @param {bigint} parser - Pointer to native parser
* @param {number} rowCount - Total number of data rows (excluding header)
* @param {string[]|null} headers - Header row if hasHeader was true
* @param {ParseOptions} options - Original parse options
*/
constructor(parser, rowCount, headers, options) {
this.parser = parser;
this.rowCount = rowCount;
this.headers = headers;
this.options = options;
this._destroyed = false;
}
/**
* Get a specific row by index (lazy access)
* @param {number} rowIndex - Zero-based row index (excluding header)
* @returns {LazyRow|null} Row accessor, or null if index out of bounds
* @throws {Error} If LazyResult has been destroyed
*
* @example
* const result = parseCSV(data, { mode: 'lazy' });
* const row = result.getRow(1000); // Access row 1000 directly
* console.log(row.getField(2)); // Get column 2
* result.destroy();
*/
getRow(rowIndex) {
if (this._destroyed) {
throw new Error("LazyResult has been destroyed");
}
if (rowIndex < 0 || rowIndex >= this.rowCount) {
return null;
}
// Offset by 1 if we have headers
const actualRowIndex = this.headers ? rowIndex + 1 : rowIndex;
return new LazyRow(this.parser, actualRowIndex);
}
/**
* Iterate over all rows (supports for...of loops)
* @generator
* @yields {LazyRow} Each row in the dataset
*
* @example
* const result = parseCSV(data, { mode: 'lazy' });
* try {
* for (const row of result) {
* console.log(row.toArray());
* }
* } finally {
* result.destroy();
* }
*/
*[Symbol.iterator]() {
for (let i = 0; i < this.rowCount; i++) {
yield this.getRow(i);
}
}
/**
* Destroy the lazy result and free native memory
*
* **IMPORTANT:** You MUST call this method when done with lazy results.
* Failure to call destroy() will cause memory leaks in native code.
*
* @example
* const result = parseCSV(data, { mode: 'lazy' });
* try {
* // Use result...
* } finally {
* result.destroy(); // Always destroy in finally block
* }
*/
destroy() {
if (!this._destroyed) {
lib.symbols.ocsv_parser_destroy(this.parser);
this._destroyed = true;
}
}
}
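// Illustrative pattern (not part of the library API): wrap lazy parsing in a
// helper so destroy() runs even when the callback throws.
//
// function withLazyCSV(csvText, options, fn) {
//   const result = parseCSV(csvText, { ...options, mode: 'lazy' });
//   try {
//     return fn(result);
//   } finally {
//     result.destroy();
//   }
// }
//
// const total = withLazyCSV(csvText, { hasHeader: true }, (res) => res.rowCount);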
/**
* Detect the current platform and architecture
* @returns {string} Platform string in format: platform-arch
*/
function getPlatform() {
const platform = os.platform();
const arch = os.arch();
return `${platform}-${arch}`;
}
/**
* Get the path to the native library
* @returns {string} Absolute path to the library
*/
function getLibraryPath() {
const __dirname = dirname(fileURLToPath(import.meta.url));
const platform = getPlatform();
// Determine library name by platform
let libName;
if (platform.startsWith('darwin')) {
libName = 'libocsv.dylib';
} else if (platform.startsWith('linux')) {
libName = 'libocsv.so';
} else if (platform.startsWith('win32')) {
libName = 'ocsv.dll';
} else {
throw new Error(`Unsupported platform: ${platform}`);
}
// Try prebuilds directory first
const prebuiltPath = resolve(__dirname, '..', 'prebuilds', platform, libName);
if (existsSync(prebuiltPath)) {
return prebuiltPath;
}
// Fall back to root directory (for development)
const devPath = resolve(__dirname, '..', libName);
if (existsSync(devPath)) {
return devPath;
}
throw new Error(`Could not find library for platform ${platform}. Tried:\n ${prebuiltPath}\n ${devPath}`);
}
// Load the shared library
const libPath = getLibraryPath();
const lib = dlopen(libPath, {
ocsv_parser_create: {
returns: FFIType.ptr,
},
ocsv_parser_destroy: {
args: [FFIType.ptr],
returns: FFIType.void,
},
ocsv_parse_string: {
args: [FFIType.ptr, FFIType.cstring, FFIType.i32],
returns: FFIType.i32,
},
ocsv_get_row_count: {
args: [FFIType.ptr],
returns: FFIType.i32,
},
ocsv_get_field_count: {
args: [FFIType.ptr, FFIType.i32],
returns: FFIType.i32,
},
ocsv_get_field: {
args: [FFIType.ptr, FFIType.i32, FFIType.i32],
returns: FFIType.cstring,
},
// Configuration setters (Phase 1)
ocsv_set_delimiter: {
args: [FFIType.ptr, FFIType.u8],
returns: FFIType.i32,
},
ocsv_set_quote: {
args: [FFIType.ptr, FFIType.u8],
returns: FFIType.i32,
},
ocsv_set_escape: {
args: [FFIType.ptr, FFIType.u8],
returns: FFIType.i32,
},
ocsv_set_skip_empty_lines: {
args: [FFIType.ptr, FFIType.bool],
returns: FFIType.i32,
},
ocsv_set_comment: {
args: [FFIType.ptr, FFIType.u8],
returns: FFIType.i32,
},
ocsv_set_trim: {
args: [FFIType.ptr, FFIType.bool],
returns: FFIType.i32,
},
ocsv_set_relaxed: {
args: [FFIType.ptr, FFIType.bool],
returns: FFIType.i32,
},
ocsv_set_max_row_size: {
args: [FFIType.ptr, FFIType.i32],
returns: FFIType.i32,
},
ocsv_set_from_line: {
args: [FFIType.ptr, FFIType.i32],
returns: FFIType.i32,
},
ocsv_set_to_line: {
args: [FFIType.ptr, FFIType.i32],
returns: FFIType.i32,
},
ocsv_set_skip_lines_with_error: {
args: [FFIType.ptr, FFIType.bool],
returns: FFIType.i32,
},
// Error getters (Phase 1)
ocsv_has_error: {
args: [FFIType.ptr],
returns: FFIType.bool,
},
ocsv_get_error_code: {
args: [FFIType.ptr],
returns: FFIType.i32,
},
ocsv_get_error_line: {
args: [FFIType.ptr],
returns: FFIType.i32,
},
ocsv_get_error_column: {
args: [FFIType.ptr],
returns: FFIType.i32,
},
ocsv_get_error_message: {
args: [FFIType.ptr],
returns: FFIType.cstring,
},
ocsv_get_error_count: {
args: [FFIType.ptr],
returns: FFIType.i32,
},
// Bulk extraction methods (Phase 1 & 2)
ocsv_rows_to_json: {
args: [FFIType.ptr],
returns: FFIType.cstring,
},
ocsv_rows_to_packed_buffer: {
args: [FFIType.ptr, FFIType.ptr],
returns: FFIType.ptr,
},
});
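// Every lib.symbols.* entry above is a direct FFI call into the native
// library. Illustration only (the Parser class below is the supported way
// to drive these symbols):
//   const p = lib.symbols.ocsv_parser_create();
//   // ... parse and read fields via the other symbols ...
//   lib.symbols.ocsv_parser_destroy(p);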
/**
* Deserialize packed binary buffer to 2D array (internal helper)
* @private
* @param {bigint|number} bufferPtr - Pointer to packed buffer
* @param {number} bufferSize - Size of buffer in bytes
* @returns {string[][]} 2D array of strings [row][field]
*/
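// Packed buffer layout, as read below (little-endian throughout; offsets are
// inferred from this deserializer, so treat them as documentation of the JS
// side rather than a normative spec of the native format):
//   bytes 0-3    u32  magic       0x4F435356 ("OCSV")
//   bytes 4-7    u32  version     (currently 1)
//   bytes 8-11   u32  rowCount
//   bytes 12-15  u32  fieldCount
//   bytes 16-23  u64  totalBytes  (must equal the buffer size)
//   bytes 24-..  u32  rowOffsets[rowCount]
//   per field, at each row offset: u16 byte length, then that many UTF-8 bytes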
function _deserializePackedBuffer(bufferPtr, bufferSize) {
// Convert pointer to ArrayBuffer (zero-copy)
const arrayBuffer = toArrayBuffer(bufferPtr, 0, bufferSize);
const view = new DataView(arrayBuffer);
const bytes = new Uint8Array(arrayBuffer);
// Read header
const magic = view.getUint32(0, true);
if (magic !== 0x4F435356) { // "OCSV"
throw new Error(`Invalid magic number: 0x${magic.toString(16)}`);
}
const version = view.getUint32(4, true);
if (version !== 1) {
throw new Error(`Unsupported version: ${version}`);
}
const rowCount = view.getUint32(8, true);
const fieldCount = view.getUint32(12, true);
const totalBytes = view.getBigUint64(16, true);
// Validate buffer size
if (BigInt(bufferSize) !== totalBytes) {
throw new Error(`Buffer size mismatch: expected ${totalBytes}, got ${bufferSize}`);
}
// Read row offsets
const rowOffsets = new Uint32Array(rowCount);
for (let i = 0; i < rowCount; i++) {
rowOffsets[i] = view.getUint32(24 + i * 4, true);
}
// Deserialize rows
const rows = new Array(rowCount);
const decoder = new TextDecoder('utf-8');
for (let i = 0; i < rowCount; i++) {
const row = new Array(fieldCount);
let offset = rowOffsets[i];
for (let j = 0; j < fieldCount; j++) {
// Read field length (u16)
const length = view.getUint16(offset, true);
offset += 2;
// Zero-copy string extraction
if (length > 0) {
const fieldBytes = bytes.subarray(offset, offset + length);
row[j] = decoder.decode(fieldBytes);
offset += length;
} else {
row[j] = "";
}
}
rows[i] = row;
}
return rows;
}
/**
* Configuration options for CSV parsing
* @typedef {Object} ParseOptions
* @property {string} [delimiter=','] - Field delimiter character
* @property {string} [quote='"'] - Quote character for escaping
* @property {string} [escape='"'] - Escape character
* @property {boolean} [skipEmptyLines=false] - Skip empty lines
* @property {string} [comment='#'] - Comment line prefix (use empty string to disable)
* @property {boolean} [trim=false] - Trim whitespace from fields
* @property {boolean} [relaxed=false] - Enable relaxed parsing mode (allows some RFC violations)
* @property {number} [maxRowSize=1048576] - Maximum row size in bytes (default: 1MB)
* @property {number} [fromLine=0] - Start parsing from line N (0 = start from beginning)
* @property {number} [toLine=-1] - Stop parsing at line N (-1 = parse all lines)
* @property {boolean} [skipLinesWithError=false] - Skip lines that fail to parse
* @property {boolean} [hasHeader=false] - Whether the first row is a header
* @property {string} [mode='auto'] - Parsing mode: 'auto' (default), 'packed', 'bulk', 'field', or 'lazy'
* - 'auto': Automatically select best mode based on data size (recommended)
* - 'packed': Use packed buffer (fastest, 61.25 MB/s, best for >1K rows)
* - 'bulk': Use bulk JSON (fast, 40 MB/s, good for 100-1K rows)
* - 'field': Use field-by-field (slower, 30 MB/s, fine for <100 rows)
* - 'lazy': Use lazy evaluation (on-demand row access, requires manual cleanup)
*/
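// Example: a fully specified options object. Every field is optional; the
// values shown here are the documented defaults.
//
// const options = {
//   delimiter: ',', quote: '"', escape: '"',
//   skipEmptyLines: false, comment: '#', trim: false,
//   relaxed: false, maxRowSize: 1048576,
//   fromLine: 0, toLine: -1, skipLinesWithError: false,
//   hasHeader: false, mode: 'auto',
// };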
/**
* Result of CSV parsing
* @typedef {Object} ParseResult
* @property {string[]} [headers] - Header row (if hasHeader was true)
* @property {string[][]} rows - Array of rows, each row is an array of fields
* @property {number} rowCount - Total number of rows parsed (excluding header)
*/
/**
* CSV Parser class with automatic memory management
*
* @example
* const parser = new Parser();
* try {
* const result = parser.parse('a,b,c\n1,2,3');
* console.log(result.rows); // [['a','b','c'], ['1','2','3']]
* } finally {
* parser.destroy();
* }
*/
export class Parser {
/**
* Create a new CSV parser
*/
constructor() {
this.parser = lib.symbols.ocsv_parser_create();
if (!this.parser) {
throw new Error("Failed to create parser");
}
}
/**
* Apply configuration options to the parser (Phase 1)
* @private
* @param {ParseOptions} options - Configuration options
*/
_applyConfig(options) {
if (options.delimiter !== undefined) {
const code = options.delimiter.charCodeAt(0);
lib.symbols.ocsv_set_delimiter(this.parser, code);
}
if (options.quote !== undefined) {
const code = options.quote.charCodeAt(0);
lib.symbols.ocsv_set_quote(this.parser, code);
}
if (options.escape !== undefined) {
const code = options.escape.charCodeAt(0);
lib.symbols.ocsv_set_escape(this.parser, code);
}
if (options.skipEmptyLines !== undefined) {
lib.symbols.ocsv_set_skip_empty_lines(this.parser, options.skipEmptyLines);
}
if (options.comment !== undefined) {
// Use charCode 0 to disable comments
const code = options.comment.length > 0 ? options.comment.charCodeAt(0) : 0;
lib.symbols.ocsv_set_comment(this.parser, code);
}
if (options.trim !== undefined) {
lib.symbols.ocsv_set_trim(this.parser, options.trim);
}
if (options.relaxed !== undefined) {
lib.symbols.ocsv_set_relaxed(this.parser, options.relaxed);
}
if (options.maxRowSize !== undefined) {
lib.symbols.ocsv_set_max_row_size(this.parser, options.maxRowSize);
}
if (options.fromLine !== undefined) {
lib.symbols.ocsv_set_from_line(this.parser, options.fromLine);
}
if (options.toLine !== undefined) {
lib.symbols.ocsv_set_to_line(this.parser, options.toLine);
}
if (options.skipLinesWithError !== undefined) {
lib.symbols.ocsv_set_skip_lines_with_error(this.parser, options.skipLinesWithError);
}
}
/**
* Parse a CSV string and return the data
* @param {string} data - CSV data to parse
* @param {ParseOptions} [options={}] - Parsing options
* @returns {ParseResult} Parsed CSV data
* @throws {OcsvError} If parsing fails
*/
parse(data, options = {}) {
// Apply configuration before parsing
this._applyConfig(options);
const buffer = Buffer.from(data + '\0');
// Pass the UTF-8 byte length (not data.length, which counts UTF-16 code
// units) so multi-byte characters are not truncated by the native parser.
const parseResult = lib.symbols.ocsv_parse_string(this.parser, ptr(buffer), Buffer.byteLength(data));
// Check for errors after parsing
if (parseResult !== 0 || lib.symbols.ocsv_has_error(this.parser)) {
const errorCode = lib.symbols.ocsv_get_error_code(this.parser);
const errorLine = lib.symbols.ocsv_get_error_line(this.parser);
const errorColumn = lib.symbols.ocsv_get_error_column(this.parser);
const errorMessage = lib.symbols.ocsv_get_error_message(this.parser) || "CSV parsing failed";
throw new OcsvError(errorMessage, errorCode, errorLine, errorColumn);
}
const rowCount = lib.symbols.ocsv_get_row_count(this.parser);
// Determine parsing mode
const mode = options.mode || 'auto';
// Handle lazy mode (special case - no auto-selection)
if (mode === 'lazy') {
return this._parseLazy(rowCount, options);
}
// Auto-select best mode based on row count
let selectedMode = mode;
if (mode === 'auto') {
if (rowCount > 1000) {
selectedMode = 'packed'; // Best for large files
} else if (rowCount > 100) {
selectedMode = 'bulk'; // Good for medium files
} else {
selectedMode = 'field'; // Fine for small files
}
}
// Execute selected mode
switch (selectedMode) {
case 'packed':
return this._parsePacked(options);
case 'bulk':
return this._parseBulk(options);
case 'field':
return this._parseEager(rowCount, options);
default:
// Fall back to eager mode for unknown mode strings
return this._parseEager(rowCount, options);
}
}
/**
* Parse in lazy mode - returns LazyResult with on-demand row access
* @private
* @param {number} rowCount - Total number of rows
* @param {ParseOptions} options - Parsing options
* @returns {LazyResult} Lazy result accessor
*/
_parseLazy(rowCount, options) {
let headers = null;
// Handle header row if requested
if (options.hasHeader && rowCount > 0) {
// Extract headers eagerly using direct FFI
// This is a small overhead (~50-100μs) but ensures reliability
const fieldCount = lib.symbols.ocsv_get_field_count(this.parser, 0);
headers = new Array(fieldCount);
for (let i = 0; i < fieldCount; i++) {
headers[i] = lib.symbols.ocsv_get_field(this.parser, 0, i) || "";
}
// Create LazyResult starting from row 1 (data rows only)
// Note: User will call getRow(0) to access first data row
return new LazyResult(
this.parser,
rowCount - 1, // Exclude header from count
headers,
options
);
}
// No header
return new LazyResult(
this.parser,
rowCount,
null,
options
);
}
/**
* Parse in eager mode - materializes all rows into arrays
* @private
* @param {number} rowCount - Total number of rows
* @param {ParseOptions} options - Parsing options
* @returns {ParseResult} Eager result with all rows
*/
_parseEager(rowCount, options) {
const rows = [];
for (let i = 0; i < rowCount; i++) {
const fieldCount = lib.symbols.ocsv_get_field_count(this.parser, i);
const row = [];
for (let j = 0; j < fieldCount; j++) {
// ocsv_get_field returns a cstring which Bun automatically converts to string
const field = lib.symbols.ocsv_get_field(this.parser, i, j);
row.push(field || "");
}
rows.push(row);
}
const result = {
rows,
rowCount: options.hasHeader ? rowCount - 1 : rowCount,
};
if (options.hasHeader && rows.length > 0) {
result.headers = rows.shift();
}
return result;
}
/**
* Parse using packed buffer format (Phase 2 - fastest)
* @private
* @param {ParseOptions} [options={}] - Parsing options
* @returns {ParseResult} Parsed result with all rows
*/
_parsePacked(options = {}) {
const sizeBuffer = new Int32Array(1);
const bufferPtr = lib.symbols.ocsv_rows_to_packed_buffer(this.parser, ptr(sizeBuffer));
if (!bufferPtr || sizeBuffer[0] <= 0) {
return { rows: [], rowCount: 0 };
}
const rows = _deserializePackedBuffer(bufferPtr, sizeBuffer[0]);
const result = { rows, rowCount: rows.length };
// Honor hasHeader so packed mode matches the documented ParseResult shape
if (options.hasHeader && rows.length > 0) {
result.headers = rows.shift();
result.rowCount = rows.length;
}
return result;
}
/**
* Parse using bulk JSON serialization (Phase 1 - fast)
* @private
* @param {ParseOptions} [options={}] - Parsing options
* @returns {ParseResult} Parsed result with all rows
*/
_parseBulk(options = {}) {
const jsonStr = lib.symbols.ocsv_rows_to_json(this.parser);
if (!jsonStr) {
return { rows: [], rowCount: 0 };
}
const rows = JSON.parse(jsonStr);
const result = { rows, rowCount: rows.length };
// Honor hasHeader so bulk mode matches the documented ParseResult shape
if (options.hasHeader && rows.length > 0) {
result.headers = rows.shift();
result.rowCount = rows.length;
}
return result;
}
/**
* Parse a CSV file
* @param {string} path - Path to CSV file
* @param {ParseOptions} [options={}] - Parsing options
* @returns {Promise<ParseResult>} Parsed CSV data
*/
async parseFile(path, options = {}) {
const file = Bun.file(path);
const text = await file.text();
return this.parse(text, options);
}
/**
* Destroy the parser and free all memory
* Must be called when done with the parser
*/
destroy() {
if (this.parser) {
lib.symbols.ocsv_parser_destroy(this.parser);
this.parser = null;
}
}
}
/**
* Convenience function to parse CSV string
* Automatically manages parser lifecycle (except in lazy mode)
*
* @param {string} data - CSV data
* @param {ParseOptions} [options={}] - Parsing options
* @returns {ParseResult | LazyResult} Parsed CSV data
*
* @example Eager mode (automatic cleanup)
* import { parseCSV } from 'ocsv';
*
* const result = parseCSV('name,age\nJohn,30\nJane,25', { hasHeader: true });
* console.log(result.headers); // ['name', 'age']
* console.log(result.rows); // [['John', '30'], ['Jane', '25']]
*
* @example Lazy mode (manual cleanup required)
* const result = parseCSV(data, { mode: 'lazy' });
* try {
* const row = result.getRow(5000);
* console.log(row.toArray());
* } finally {
* result.destroy(); // MUST call destroy()
* }
*/
export function parseCSV(data, options = {}) {
const parser = new Parser();
// Check for lazy mode
if (options.mode === 'lazy') {
// IMPORTANT: Do NOT destroy parser!
// LazyResult owns it now - user must call result.destroy()
return parser.parse(data, options);
}
// Eager mode: cleanup immediately
try {
return parser.parse(data, options);
} finally {
parser.destroy();
}
}
/**
* Convenience function to parse CSV file
* Automatically manages parser lifecycle
*
* @param {string} path - Path to CSV file
* @param {ParseOptions} [options={}] - Parsing options
* @returns {Promise<ParseResult>} Parsed CSV data
*
* @example
* import { parseCSVFile } from 'ocsv';
*
* const result = await parseCSVFile('./data.csv', { hasHeader: true });
* console.log(`Parsed ${result.rowCount} rows`);
*/
export async function parseCSVFile(path, options = {}) {
const parser = new Parser();
try {
return await parser.parseFile(path, options);
} finally {
parser.destroy();
}
}
// Export for backwards compatibility
export { Parser as OCSVParser };
// Export advanced performance functions (Phase 2)
export { parseCSVPacked, parseCSVBulk } from "./simple.ts";