ocsv

High-performance, RFC 4180-compliant CSV parser with a lazy mode for Bun. Parses 10M rows in ~5 seconds with minimal memory.
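A minimal quick-start, following the usage note in the bindings file below (the CSV literal is illustrative):

    import { parseCSV } from "./bindings/simple.ts";

    const rows = parseCSV("name,age\nAlice,30\nBob,25");
    // [["name", "age"], ["Alice", "30"], ["Bob", "25"]]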

/**
 * OCSV - Minimal FFI Bindings
 *
 * Ultra-simple bindings with zero abstraction between JavaScript and Odin FFI.
 * No classes, no wrappers - just direct function calls.
 *
 * Usage:
 *   import { parseCSV } from './bindings/simple.ts'
 *   const rows = parseCSV(csvData)
 */

import { dlopen, FFIType, suffix, toArrayBuffer, ptr } from "bun:ffi";
import { fileURLToPath } from "url";
import { dirname, join } from "path";

// Get library path relative to this file
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Determine prebuilds directory based on platform and architecture
const getPrebuildPath = (): string => {
  const platform = process.platform;
  const arch = process.arch;

  let prebuildDir: string;
  let libName: string;

  if (platform === "darwin") {
    prebuildDir = arch === "arm64" ? "darwin-arm64" : "darwin-x64";
    libName = "libocsv.dylib";
  } else if (platform === "linux") {
    prebuildDir = "linux-x64";
    libName = "libocsv.so";
  } else if (platform === "win32") {
    prebuildDir = "win32-x64";
    libName = "ocsv.dll";
  } else {
    throw new Error(`Unsupported platform: ${platform}`);
  }

  return join(__dirname, "..", "prebuilds", prebuildDir, libName);
};

const libPath = getPrebuildPath();

// Load OCSV library
const lib = dlopen(libPath, {
  ocsv_parser_create: {
    returns: FFIType.ptr,
  },
  ocsv_parser_destroy: {
    args: [FFIType.ptr],
  },
  ocsv_parse_string: {
    args: [FFIType.ptr, FFIType.cstring, FFIType.i32],
    returns: FFIType.i32,
  },
  ocsv_get_row_count: {
    args: [FFIType.ptr],
    returns: FFIType.i32,
  },
  ocsv_get_field_count: {
    args: [FFIType.ptr, FFIType.i32],
    returns: FFIType.i32,
  },
  ocsv_get_field: {
    args: [FFIType.ptr, FFIType.i32, FFIType.i32],
    returns: FFIType.cstring,
  },
  // Bulk memory access for performance
  ocsv_rows_to_json: {
    args: [FFIType.ptr],
    returns: FFIType.cstring,
  },
  ocsv_free_json_string: {
    args: [FFIType.cstring],
  },
  // Phase 2: Packed buffer (zero-copy)
  ocsv_rows_to_packed_buffer: {
    args: [FFIType.ptr, FFIType.ptr],
    returns: FFIType.ptr,
  },
});

// Direct FFI exports (for advanced users)
export const ffi = lib.symbols;
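// Illustrative sketch (not part of the library): driving the raw `ffi` symbols
// directly, using only functions declared in the dlopen table above. This
// mirrors what the wrapper functions below do internally; the create/destroy
// pairing matters because the parser owns native memory.
//
//   const parser = ffi.ocsv_parser_create();
//   try {
//     const buf = Buffer.from("a,b\n1,2");
//     if (ffi.ocsv_parse_string(parser, buf, buf.length) !== 0) {
//       throw new Error("Failed to parse CSV");
//     }
//     const rowCount = ffi.ocsv_get_row_count(parser); // 2
//     const field = ffi.ocsv_get_field(parser, 0, 1);  // "b"
//   } finally {
//     ffi.ocsv_parser_destroy(parser);
//   }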
/**
 * Parse CSV data and return all rows as a 2D array
 *
 * @param csvData - CSV string to parse
 * @returns 2D array of strings [row][field]
 *
 * @example
 * const rows = parseCSV("name,age\nAlice,30\nBob,25")
 * // [["name", "age"], ["Alice", "30"], ["Bob", "25"]]
 */
export function parseCSV(csvData: string): string[][] {
  const parser = ffi.ocsv_parser_create();

  try {
    // Parse
    const buffer = Buffer.from(csvData);
    const result = ffi.ocsv_parse_string(parser, buffer, buffer.length);
    if (result !== 0) {
      throw new Error("Failed to parse CSV");
    }

    // Get dimensions
    const rowCount = ffi.ocsv_get_row_count(parser);
    const rows: string[][] = [];

    // Extract all rows and fields
    for (let i = 0; i < rowCount; i++) {
      const fieldCount = ffi.ocsv_get_field_count(parser, i);
      const row: string[] = [];

      for (let j = 0; j < fieldCount; j++) {
        const field = ffi.ocsv_get_field(parser, i, j);
        row.push(field || "");
      }

      rows.push(row);
    }

    return rows;
  } finally {
    ffi.ocsv_parser_destroy(parser);
  }
}

/**
 * Parse CSV and return with separate header and rows
 *
 * @param csvData - CSV string to parse
 * @returns { header: string[], rows: string[][] }
 *
 * @example
 * const { header, rows } = parseCSVWithHeader("name,age\nAlice,30")
 * // header: ["name", "age"]
 * // rows: [["Alice", "30"]]
 */
export function parseCSVWithHeader(csvData: string): { header: string[]; rows: string[][] } {
  const allRows = parseCSV(csvData);

  if (allRows.length === 0) {
    return { header: [], rows: [] };
  }

  return {
    header: allRows[0],
    rows: allRows.slice(1),
  };
}

/**
 * Parse CSV and return as array of objects using first row as keys
 *
 * @param csvData - CSV string to parse
 * @returns Array of objects { [columnName]: value }
 *
 * @example
 * const records = parseCSVToObjects("name,age\nAlice,30\nBob,25")
 * // [{ name: "Alice", age: "30" }, { name: "Bob", age: "25" }]
 */
export function parseCSVToObjects(csvData: string): Record<string, string>[] {
  const { header, rows } = parseCSVWithHeader(csvData);

  return rows.map(row => {
    const obj: Record<string, string> = {};
    header.forEach((key, i) => {
      obj[key] = row[i] || "";
    });
    return obj;
  });
}

/**
 * Get CSV dimensions without extracting all data
 *
 * @param csvData - CSV string to parse
 * @returns { rows: number, avgFields: number }
 *
 * @example
 * const { rows, avgFields } = getCSVDimensions(csvData)
 * console.log(`CSV has ${rows} rows with ~${avgFields} fields each`)
 */
export function getCSVDimensions(csvData: string): { rows: number; avgFields: number } {
  const parser = ffi.ocsv_parser_create();

  try {
    const buffer = Buffer.from(csvData);
    const result = ffi.ocsv_parse_string(parser, buffer, buffer.length);
    if (result !== 0) {
      throw new Error("Failed to parse CSV");
    }

    const rowCount = ffi.ocsv_get_row_count(parser);
    if (rowCount === 0) {
      return { rows: 0, avgFields: 0 };
    }

    // Sample some rows to get average field count
    const samples = Math.min(10, rowCount);
    let totalFields = 0;
    for (let i = 0; i < samples; i++) {
      const idx = Math.floor((i / samples) * rowCount);
      totalFields += ffi.ocsv_get_field_count(parser, idx);
    }

    return {
      rows: rowCount,
      avgFields: Math.round(totalFields / samples),
    };
  } finally {
    ffi.ocsv_parser_destroy(parser);
  }
}
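// Worked example of the sampling above (illustrative): for a 1,000-row file,
// samples = min(10, 1000) = 10 and idx = floor((i / 10) * 1000), so
// getCSVDimensions probes rows 0, 100, 200, ..., 900 - ten evenly spaced
// rows instead of scanning all 1,000 to estimate avgFields.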
/**
 * Parse CSV and get specific row by index
 *
 * @param csvData - CSV string to parse
 * @param rowIndex - Row index (0-based)
 * @returns Array of field values or null if index out of bounds
 *
 * @example
 * const row = getRow(csvData, 5) // Get 6th row
 */
export function getRow(csvData: string, rowIndex: number): string[] | null {
  const parser = ffi.ocsv_parser_create();

  try {
    const buffer = Buffer.from(csvData);
    const result = ffi.ocsv_parse_string(parser, buffer, buffer.length);
    if (result !== 0) {
      throw new Error("Failed to parse CSV");
    }

    const rowCount = ffi.ocsv_get_row_count(parser);
    if (rowIndex < 0 || rowIndex >= rowCount) {
      return null;
    }

    const fieldCount = ffi.ocsv_get_field_count(parser, rowIndex);
    const row: string[] = [];

    for (let j = 0; j < fieldCount; j++) {
      const field = ffi.ocsv_get_field(parser, rowIndex, j);
      row.push(field || "");
    }

    return row;
  } finally {
    ffi.ocsv_parser_destroy(parser);
  }
}

/**
 * Parse CSV and get specific field value
 *
 * @param csvData - CSV string to parse
 * @param rowIndex - Row index (0-based)
 * @param fieldIndex - Field index (0-based)
 * @returns Field value or null if indices out of bounds
 *
 * @example
 * const value = getField(csvData, 1, 2) // Row 1, Column 2
 */
export function getField(csvData: string, rowIndex: number, fieldIndex: number): string | null {
  const parser = ffi.ocsv_parser_create();

  try {
    const buffer = Buffer.from(csvData);
    const result = ffi.ocsv_parse_string(parser, buffer, buffer.length);
    if (result !== 0) {
      throw new Error("Failed to parse CSV");
    }

    const rowCount = ffi.ocsv_get_row_count(parser);
    if (rowIndex < 0 || rowIndex >= rowCount) {
      return null;
    }

    const fieldCount = ffi.ocsv_get_field_count(parser, rowIndex);
    if (fieldIndex < 0 || fieldIndex >= fieldCount) {
      return null;
    }

    return ffi.ocsv_get_field(parser, rowIndex, fieldIndex) || "";
  } finally {
    ffi.ocsv_parser_destroy(parser);
  }
}

/**
 * Parse CSV using bulk JSON serialization (high performance)
 *
 * This function minimizes FFI overhead by serializing all rows to JSON
 * in one FFI call, then parsing in JavaScript. Significantly faster than
 * field-by-field extraction for large datasets.
 *
 * @param csvData - CSV string to parse
 * @returns 2D array of strings [row][field]
 *
 * @example
 * const rows = parseCSVBulk("name,age\nAlice,30\nBob,25")
 * // [["name", "age"], ["Alice", "30"], ["Bob", "25"]]
 */
export function parseCSVBulk(csvData: string): string[][] {
  const parser = ffi.ocsv_parser_create();

  try {
    // Parse CSV
    const buffer = Buffer.from(csvData);
    const result = ffi.ocsv_parse_string(parser, buffer, buffer.length);
    if (result !== 0) {
      throw new Error("Failed to parse CSV");
    }

    // Get all rows as JSON (ONE FFI call instead of N×M calls)
    const jsonStr = ffi.ocsv_rows_to_json(parser);
    if (!jsonStr) {
      return [];
    }

    // Parse JSON in JavaScript
    // Note: jsonStr is automatically converted to a JavaScript string by Bun FFI.
    // The memory is managed by the Odin parser and will be freed when
    // parser_destroy is called.
    const rows = JSON.parse(jsonStr);
    return rows;
  } finally {
    ffi.ocsv_parser_destroy(parser);
  }
}

/**
 * Deserialize packed binary buffer to 2D array (internal helper)
 *
 * @param bufferPtr - Pointer to packed buffer
 * @param bufferSize - Size of buffer in bytes
 * @returns 2D array of strings [row][field]
 *
 * Binary format:
 *   Header (24 bytes):
 *     0-3:   magic (0x4F435356 "OCSV")
 *     4-7:   version (1)
 *     8-11:  row_count (u32)
 *     12-15: field_count (u32)
 *     16-23: total_bytes (u64)
 *
 *   Row Offsets (row_count × 4 bytes):
 *     24+i*4: offset to row i data
 *
 *   Field Data (variable length):
 *     [length: u16][data: UTF-8 bytes]
 */
function deserializePackedBuffer(bufferPtr: number | bigint, bufferSize: number): string[][] {
  // Convert pointer to ArrayBuffer (zero-copy)
  const arrayBuffer = toArrayBuffer(bufferPtr, 0, bufferSize);
  const view = new DataView(arrayBuffer);
  const bytes = new Uint8Array(arrayBuffer);

  // Read header
  const magic = view.getUint32(0, true);
  if (magic !== 0x4F435356) {
    throw new Error(`Invalid magic number: 0x${magic.toString(16)}`);
  }

  const version = view.getUint32(4, true);
  if (version !== 1) {
    throw new Error(`Unsupported version: ${version}`);
  }

  const rowCount = view.getUint32(8, true);
  const fieldCount = view.getUint32(12, true);
  const totalBytes = view.getBigUint64(16, true);

  // Validate buffer size
  if (BigInt(bufferSize) !== totalBytes) {
    throw new Error(`Buffer size mismatch: expected ${totalBytes}, got ${bufferSize}`);
  }

  // Read row offsets
  const rowOffsets = new Uint32Array(rowCount);
  for (let i = 0; i < rowCount; i++) {
    rowOffsets[i] = view.getUint32(24 + i * 4, true);
  }

  // Deserialize rows
  const rows: string[][] = new Array(rowCount);
  const decoder = new TextDecoder('utf-8');

  for (let i = 0; i < rowCount; i++) {
    const row: string[] = new Array(fieldCount);
    let offset = rowOffsets[i];

    for (let j = 0; j < fieldCount; j++) {
      // Read field length (u16)
      const length = view.getUint16(offset, true);
      offset += 2;

      // Zero-copy string extraction
      if (length > 0) {
        const fieldBytes = bytes.subarray(offset, offset + length);
        row[j] = decoder.decode(fieldBytes);
        offset += length;
      } else {
        row[j] = "";
      }
    }

    rows[i] = row;
  }

  return rows;
}
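// Worked example of the packed layout (illustrative, CSV input is made up):
// for "a,b\n1,2" - 2 rows x 2 fields, each field 1 byte of UTF-8:
//
//   bytes  0-23  header: magic 0x4F435356, version 1, row_count 2,
//                field_count 2, total_bytes 44
//   bytes 24-31  row offsets: row 0 -> 32, row 1 -> 38
//   bytes 32-37  row 0: [len=1]"a" [len=1]"b"  (u16 length + data per field)
//   bytes 38-43  row 1: [len=1]"1" [len=1]"2"
//
// total_bytes = 24 + 2*4 + 2*2*(2+1) = 44, which is exactly what the size
// check in deserializePackedBuffer validates against.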
/**
 * Parse CSV data using packed buffer (zero-copy, highest performance)
 *
 * This is the fastest parsing method, using binary serialization and zero-copy
 * deserialization for minimal FFI overhead.
 *
 * @param csvData - CSV string to parse
 * @returns 2D array of strings [row][field]
 *
 * @example
 * const rows = parseCSVPacked("name,age\nAlice,30\nBob,25")
 * // [["name", "age"], ["Alice", "30"], ["Bob", "25"]]
 */
export function parseCSVPacked(csvData: string): string[][] {
  const parser = ffi.ocsv_parser_create();

  try {
    // Parse CSV
    const buffer = Buffer.from(csvData);
    const result = ffi.ocsv_parse_string(parser, buffer, buffer.length);
    if (result !== 0) {
      throw new Error("Failed to parse CSV");
    }

    // Get packed buffer (ONE FFI call)
    const sizeBuffer = new Int32Array(1);
    const bufferPtr = ffi.ocsv_rows_to_packed_buffer(parser, ptr(sizeBuffer));
    if (!bufferPtr || sizeBuffer[0] <= 0) {
      return [];
    }

    // Deserialize in JavaScript (zero-copy)
    return deserializePackedBuffer(bufferPtr, sizeBuffer[0]);
  } finally {
    ffi.ocsv_parser_destroy(parser);
  }
}
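Of the three parse paths in this file, parseCSV extracts field by field (one FFI call per field), parseCSVBulk collapses extraction into a single JSON round-trip, and parseCSVPacked skips JSON entirely via the zero-copy packed buffer. A minimal sketch of picking a path by input size; the 1 MB cutoff is an invented tuning knob, not something this file prescribes:

    // Hypothetical dispatcher - the threshold is an assumption, tune by benchmarking.
    function parseCSVAuto(csvData: string): string[][] {
      return csvData.length < 1_000_000
        ? parseCSV(csvData)        // small input: simple per-field extraction
        : parseCSVPacked(csvData); // large input: zero-copy packed buffer
    }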