
@doeixd/csv-utils


Utilities for csv files / arrays of objects

1,252 lines (1,115 loc) 175 kB
/**
 * @fileoverview A production-ready TypeScript library for CSV manipulation,
 * featuring robust error handling, strong typing, and a fluent interface.
 */
import fs from 'node:fs';
import path from 'node:path';
import { parse as parseCSV, stringify as stringifyCSV } from 'csv/sync';
import { parse as parseCSVAsync, stringify as stringifyCSVAsync } from 'csv';
import { distance as levenshteinDistance } from 'fastest-levenshtein';
import { get as lodashGet } from 'lodash';
import { createHeaderMapFns, HeaderMap, RetryOptions } from './headers';
import {
  StandardSchemaV1,
  tryValidateStandardSchemaSync,
  tryValidateStandardSchemaAsync,
  CSVSchemaConfig,
  RowValidationResult
} from './schema';
import { Readable, Transform, Writable, pipeline as streamPipeline } from 'node:stream';
import { type Transform as NodeTransform } from 'node:stream';
import { promisify } from 'node:util';
import { Worker, isMainThread, parentPort, workerData } from 'node:worker_threads';
import os from 'node:os';

export * from './headers';
export * from './schema';
export * from './standalone';

/**
 * Options for creating a CSV instance from pre-existing data.
 */
export interface CSVFromDataOptions<T extends Record<string, any>> {
  /**
   * Schema to validate the provided data objects against.
   * If schema validation fails and mode is 'error', an error will be thrown.
   * If mode is 'filter', invalid objects will be removed.
   * If mode is 'keep', validationResults will be populated on the CSV instance.
   */
  schema?: CSVSchemaConfig<T>;
  // We are deliberately OMITTING customCasts here for fromData
  // as their primary design is for string inputs from CSV parsing.
  // Coercion of existing object properties should ideally be handled by
  // the schema itself (e.g., Zod's coerce) or a separate transformation step.
}

/**
 * Error class for CSV-related operations
 */
export class CSVError extends Error {
  constructor(message: string, public readonly cause?: unknown) {
    super(message);
    this.name = 'CSVError';
    // Maintains proper stack trace for where the error was thrown
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, CSVError);
    }
  }
}

export type CsvParseOptions = Parameters<typeof parseCSV>[1];
type CsvParseInternalOptions = Exclude<Parameters<typeof parseCSV>[1], null | undefined>;

export type CsvStringifyOptions<T = any> =
  | Parameters<typeof stringifyCSV>[1]
  | {
      stringifyOptions?: Parameters<typeof stringifyCSV>[1];
      headerMap?: HeaderMap<T>;
    };

/**
 * Context for casting functions, similar to csv-parse's CastingContext
 */
export interface CastingContext {
  /** Column name or index */
  column: string | number;
  /** Is it the header row? (Usually false for custom casting stage) */
  header: boolean;
  /** Index of the field in the record */
  index: number;
  /** Line number in the source */
  lines: number;
  /** Number of records parsed so far */
  records: number;
  /** Count of empty lines */
  empty_lines: number;
  /** Count of rows with inconsistent field lengths */
  invalid_field_length: number;
  /** Is the field quoted? */
  quoting: boolean;
}

/**
 * Functions for testing if a value should be cast and performing the casting
 */
export type CastTestFunction = (value: string, context: CastingContext) => boolean;
export type CastParseFunction<TargetType> = (value: string, context: CastingContext) => TargetType;

/**
 * Definition of a caster that converts string values to a target type
 */
export interface Caster<TargetType> {
  /**
   * Tests if a string value is a candidate for this caster
   * @param value The string value from the CSV cell (after csv-parse's initial processing)
   * @param context An object containing column name, line number, etc.
   * @returns True if this caster should attempt to parse the value
   */
  test: CastTestFunction;

  /**
   * Parses the string value into the target type
   * Called only if `test` returns true
   * @param value The string value to parse
   * @param context An object containing column name, line number, etc.
   * @returns The parsed value of TargetType
   * @throws If parsing fails and strict error handling is desired
   */
  parse: CastParseFunction<TargetType>;
}

/**
 * Set of type-specific casters to apply to CSV values
 */
export interface CustomCastDefinition {
  string?: Caster<string>;
  number?: Caster<number>;
  boolean?: Caster<boolean>;
  date?: Caster<Date>;
  object?: Caster<object>;
  array?: Caster<any[]>;
  null?: Caster<null>;
}

/**
 * Configuration for column-specific casting
 */
export type ColumnCastConfig<T extends Record<string, any>> =
  | {
      [K in keyof T]?: keyof CustomCastDefinition | Caster<T[K]> | (keyof CustomCastDefinition | Caster<any>)[];
    }
  | {
      [columnName: string]: keyof CustomCastDefinition | Caster<any> | (keyof CustomCastDefinition | Caster<any>)[];
    };
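
/*
 * Illustrative sketch (not part of the library source): a minimal Caster<Date>
 * written against the interfaces above. The DD/MM/YYYY format and the
 * 'orderDate' column name are hypothetical examples.
 *
 *   const ddmmyyyyCaster: Caster<Date> = {
 *     // Only attempt the cast when the raw cell looks like DD/MM/YYYY
 *     test: (value) => /^\d{2}\/\d{2}\/\d{4}$/.test(value),
 *     parse: (value, context) => {
 *       const [day, month, year] = value.split('/').map(Number);
 *       const date = new Date(year, month - 1, day);
 *       if (isNaN(date.getTime())) {
 *         throw new Error(`Invalid date "${value}" in column ${context.column}`);
 *       }
 *       return date;
 *     },
 *   };
 *
 *   // Applied per column, so only 'orderDate' is affected:
 *   const columnCasts: ColumnCastConfig<{ orderDate: Date }> = { orderDate: ddmmyyyyCaster };
 */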

/**
 * CSV reading options
 */
export interface CSVReadOptions<T extends Record<string, any>> {
  /** File system options for reading the file */
  fsOptions?: {
    encoding?: BufferEncoding;
    flag?: string;
    mode?: number;
    autoClose?: boolean;
    emitClose?: boolean;
    start?: number;
    end?: number;
    highWaterMark?: number;
  };
  csvOptions?: Parameters<typeof parseCSV>[1];
  /** Optional transform function to apply to raw file content */
  transform?: (content: string) => string;
  /** Flag to indicate the input is raw data rather than a filename */
  rawData?: boolean;
  /** Optional header mapping configuration */
  headerMap?: HeaderMap<T>;
  /** Options for retrying failed operations */
  retry?: RetryOptions;
  /** Enable basic validation of data against expected schema */
  validateData?: boolean;
  /** Enable standard schema validation of data */
  schema?: CSVSchemaConfig<T>;
  allowEmptyValues?: boolean;

  /**
   * Controls the extraction of initial lines as an "additional header" (preamble).
   * These lines are stored in `csvInstance.additionalHeader`.
   *
   * - If `number > 0`: Specifies the exact number of lines to extract as the preamble.
   *   Data parsing will start after these lines, unless `csvOptions.from_line` (or `from`)
   *   is set and points to an even later line.
   *
   * - If `true`: Enables preamble extraction *if* `csvOptions.from_line` (or `from`)
   *   is set to a value greater than 1. The preamble will consist of `csvOptions.from_line - 1` lines.
   *   If `csvOptions.from_line` is not set or is 1, no preamble is extracted with `true`.
   *
   * - If `false`, `0`, or `undefined`: No preamble is extracted.
   */
  saveAdditionalHeader?: boolean | number;

  /**
   * Optional CSV parsing options specifically for the preamble (additional header) lines.
   * If provided, these options will be used when parsing the preamble.
   * If not provided, relevant low-level parsing options (like delimiter, quote, escape)
   * might be inherited from the main `csvOptions` by default (see implementation for details),
   * or a very basic parsing configuration will be used.
   *
   * **Important:** Options like `columns`, `from_line`, `to_line` will be overridden
   * internally for preamble extraction. You should primarily use this for options
   * like `delimiter`, `quote`, `escape`, `record_delimiter`, `ltrim`, `rtrim`, `bom`.
   */
  additionalHeaderParseOptions?: Parameters<typeof parseCSV>[1];

  /**
   * Custom type casting options that are applied after csv-parse's built-in casting
   * but before data validation
   */
  customCasts?: {
    /**
     * A global set of custom casting definitions.
     * These are tried if a column doesn't have a specific rule in `columnCasts`.
     */
    definitions?: CustomCastDefinition;
    /**
     * Per-column casting rules. The key is the column name (after initial parsing and header mapping).
     * The value can be:
     * - A string key of a caster defined in `definitions` (e.g., 'number', 'date').
     * - A custom Caster object `{ test, parse }`.
     * - An array of the above, tried in order until one succeeds.
     * If not specified for a column, or if all rules fail, the value remains as parsed by `csv-parse`.
     */
    columnCasts?: ColumnCastConfig<T>;
    /**
     * What to do if a specific cast's `parse` function throws an error.
     * - 'error': Propagate the error, failing the CSV loading. (default)
     * - 'null': Set the value to null.
     * - 'original': Keep the original string value (as received by the custom caster).
     */
    onCastError?: 'error' | 'null' | 'original';
  };
}

/**
 * CSV writing options
 */
export interface CSVWriteOptions<T = any> {
  /** Additional header content to prepend to the CSV */
  additionalHeader?: string;
  /** Options for stringifying the CSV */
  stringifyOptions?: Parameters<typeof stringifyCSV>[1];
  /** Whether to use streaming for large files */
  streaming?: boolean;
  /** Optional header mapping configuration */
  headerMap?: HeaderMap<T>;
  /** Threshold for using streaming (number of rows) */
  streamingThreshold?: number;
  /** Options for retrying failed operations */
  retry?: RetryOptions;
}

/**
 * Options for working with CSV streams and generators
 */
export interface CSVStreamOptions<T> {
  /** CSV parsing options */
  csvOptions?: Parameters<typeof parseCSVAsync>[0];
  /** Options for transforming rows */
  transform?: (row: any) => T;
  /** Batch size for processing */
  batchSize?: number;
  /** Optional header mapping configuration */
  headerMap?: HeaderMap<T>;
  /** Options for retrying failed operations */
  retry?: RetryOptions;
  /** Buffers rows before yielding to improve performance with very large files */
  useBuffering?: boolean;
  /** Buffer size when useBuffering is true */
  bufferSize?: number;
}

/**
 * Result type for similarity matches
 */
export interface SimilarityMatch<T> {
  row: T;
  dist: number;
}

// Type definitions for callbacks
export type ComparisonCallback<T> = (row: T) => boolean;
export type ModificationCallback<T> = (row: T) => Partial<T>;
export type TransformCallback<T, R> = (row: T) => R;
export type EqualityCallback<T> = (a: T, b: T) => boolean;
export type MergeCallback<T, E> = (a: T, b: E) => T;

// Aggregation and sorting types
export type AggregateOperation = 'sum' | 'avg' | 'min' | 'max' | 'count';
export type SortDirection = 'asc' | 'desc';
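
/*
 * Illustrative sketch (not part of the library source): CSVReadOptions combining
 * preamble extraction with per-column casting. The file name 'report.csv', its
 * two-line preamble, and the row shape are hypothetical.
 *
 *   const report = CSV.fromFile<{ id: string; total: number }>('report.csv', {
 *     // Keep the first two non-data lines in report.additionalHeader
 *     saveAdditionalHeader: 2,
 *     customCasts: {
 *       definitions: {
 *         number: {
 *           test: (v) => /^-?\d+(\.\d+)?$/.test(v.trim()),
 *           parse: (v) => Number(v),
 *         },
 *       },
 *       columnCasts: { total: 'number' }, // only the 'total' column is cast
 *       onCastError: 'original',          // keep the raw string if parsing throws
 *     },
 *   });
 */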

/**
 * Core class for CSV data manipulation with a fluent interface
 */
export class CSV<T extends Record<string, any>> {
  /** Validation results if schema validation was used with 'keep' mode */
  readonly validationResults?: RowValidationResult<T>[];

  private constructor(
    private readonly data: T[],
    readonly additionalHeader?: string,
    validationResults?: RowValidationResult<T>[]
  ) {
    this.validationResults = validationResults;
  }

  /**
   * Helper function to implement retry logic
   * @param operation - Function to retry
   * @param errorMessage - Error message if all retries fail
   * @param retryOptions - Retry configuration
   * @returns Result of the operation
   * @throws {CSVError} If operation fails after all retries
   */
  private static async retryOperation<R>(
    operation: () => Promise<R>,
    errorMessage: string,
    retryOptions?: RetryOptions
  ): Promise<R> {
    const maxRetries = retryOptions?.maxRetries ?? 3;
    const baseDelay = retryOptions?.baseDelay ?? 100;
    const logRetries = retryOptions?.logRetries ?? false;

    let lastError: Error | null = null;

    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        return await operation();
      } catch (error) {
        lastError = error instanceof Error ? error : new Error(String(error));

        if (attempt < maxRetries) {
          // Calculate delay with exponential backoff
          const delay = Math.pow(2, attempt) * baseDelay;

          if (logRetries) {
            console.warn(`Retry attempt ${attempt + 1}/${maxRetries} after ${delay}ms`);
          }

          // Wait before retrying
          await new Promise(resolve => setTimeout(resolve, delay));
        }
      }
    }

    throw new CSVError(`${errorMessage} after ${maxRetries} attempts`, lastError);
  }

  /**
   * Synchronous version of retry operation
   * @param operation - Function to retry
   * @param errorMessage - Error message if all retries fail
   * @param retryOptions - Retry configuration
   * @returns Result of the operation
   * @throws {CSVError} If operation fails after all retries
   */
  private static retryOperationSync<R>(
    operation: () => R,
    errorMessage: string,
    retryOptions?: RetryOptions
  ): R {
    const maxRetries = retryOptions?.maxRetries ?? 3;
    const baseDelay = retryOptions?.baseDelay ?? 100;
    const logRetries = retryOptions?.logRetries ?? false;

    let lastError: Error | null = null;

    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        return operation();
      } catch (error) {
        lastError = error instanceof Error ? error : new Error(String(error));

        if (attempt < maxRetries) {
          // Calculate delay with exponential backoff
          const delay = Math.pow(2, attempt) * baseDelay;

          if (logRetries) {
            console.warn(`Retry attempt ${attempt + 1}/${maxRetries} after ${delay}ms`);
          }
        }
      }
    }

    throw new CSVError(`${errorMessage} after ${maxRetries} attempts`, lastError);
  }
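
  /*
   * Observation on the retry arithmetic above (editorial note, not library docs):
   * attempt n waits 2^n * baseDelay ms, so the defaults (maxRetries: 3,
   * baseDelay: 100) retry after 100 ms, 200 ms, and 400 ms before a CSVError is
   * thrown. The synchronous variant computes and logs the same delay but, having
   * no way to await, proceeds to the next attempt immediately.
   */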

  /**
   * Validates data using a standard schema configuration.
   * This method can validate at both row and column levels.
   *
   * @param data - The data to validate
   * @param schemaConfig - The schema configuration
   * @param baseLineNumber - The base line number for error reporting
   * @returns A tuple containing: [validatedData, validationResults]
   * @private
   */
  /**
   * Validates data synchronously using a standard schema configuration.
   * This method throws an error if the schema is configured for async validation.
   *
   * @param data - The data to validate
   * @param schemaConfig - The schema configuration
   * @param baseLineNumber - The base line number for error reporting
   * @returns A tuple containing: [validatedData, validationResults]
   * @private
   */
  private static _validateWithSchemaSync<T extends Record<string, any>>(
    data: Record<string, any>[],
    schemaConfig: CSVSchemaConfig<T>,
    baseLineNumber: number = 1
  ): [T[], RowValidationResult<T>[]] {
    const { rowSchema, columnSchemas, validationMode = 'error', useAsync = false } = schemaConfig;

    // If schema is configured for async validation, throw an error in this sync method
    if (useAsync) {
      throw new CSVError(
        "Asynchronous schema validation is not supported in synchronous CSV methods. Use an async method (e.g., fromFileAsync, validateAsync) or set useAsync: false."
      );
    }

    const validationResults: RowValidationResult<T>[] = [];
    const validatedData: T[] = [];

    // Function to validate a row synchronously
    const validateRowSync = (row: Record<string, any>, index: number): RowValidationResult<T> => {
      const result: RowValidationResult<T> = { originalRow: row, valid: true };

      // Validate row using row schema if provided
      if (rowSchema) {
        const rowValidation = tryValidateStandardSchemaSync(rowSchema, row);
        if (rowValidation.issues) {
          result.valid = false;
          result.rowIssues = [...rowValidation.issues];
        } else {
          result.validatedRow = rowValidation.value as T;
        }
      } else {
        // If no row schema, original row is the validated row
        result.validatedRow = row as T;
      }

      // Validate individual columns if provided
      if (columnSchemas) {
        result.columnIssues = {};
        for (const [column, schema] of Object.entries(columnSchemas)) {
          if (!schema) continue;
          const columnValidation = tryValidateStandardSchemaSync(schema, row[column]);
          if (columnValidation.issues) {
            result.valid = false;
            result.columnIssues[column] = [...columnValidation.issues];
            // If row validation succeeded but column validation failed, update the column value in validated row
            if (result.validatedRow && !result.rowIssues) {
              if ('value' in columnValidation) {
                (result.validatedRow as any)[column] = columnValidation.value;
              } else {
                // Column validation failed, set to null/undefined or keep original
                (result.validatedRow as any)[column] = null;
              }
            }
          } else if (result.validatedRow) {
            // Update the validated value in the result
            (result.validatedRow as any)[column] = columnValidation.value;
          }
        }

        // If no column issues, remove the empty object
        if (Object.keys(result.columnIssues).length === 0) {
          delete result.columnIssues;
        }
      }

      return result;
    };

    // Process each row synchronously
    const results = data.map(validateRowSync);

    // Process validation results
    for (const result of results) {
      validationResults.push(result);

      if (result.valid && result.validatedRow) {
        validatedData.push(result.validatedRow);
      } else if (validationMode === 'error') {
        // Collect all validation issues for better error reporting
        const issues: string[] = [];
        if (result.rowIssues) {
          issues.push(`Row validation issues: ${result.rowIssues.map(i => i.message).join(', ')}`);
        }
        if (result.columnIssues) {
          for (const [column, columnIssues] of Object.entries(result.columnIssues)) {
            issues.push(`Column '${column}' validation issues: ${columnIssues.map(i => i.message).join(', ')}`);
          }
        }
        throw new CSVError(`CSV validation failed: ${issues.join('; ')}`);
      } else if (validationMode === 'filter') {
        // Skip invalid rows - don't add to validatedData
        continue;
      } else if (validationMode === 'keep' && !result.valid) {
        // Keep even invalid rows in the resulting dataset
        validatedData.push(result.originalRow as T);
      }
    }

    return [validatedData, validationResults];
  }
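
  /*
   * Illustrative sketch (not part of the library source) of the three
   * validationMode behaviours, assuming a hypothetical Standard Schema
   * `userSchema` and a raw CSV string `raw`:
   *
   *   const kept = CSV.fromString<User>(raw, {
   *     schema: { rowSchema: userSchema, validationMode: 'keep' },
   *   });
   *   // 'error'  -> the first invalid row throws a CSVError listing row/column issues
   *   // 'filter' -> invalid rows are dropped from the resulting instance
   *   // 'keep'   -> invalid rows are kept, and kept.validationResults records each issue
   */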

  /**
   * For backward compatibility - redirects to the appropriate sync or async validation method
   * @private
   */
  private static _validateWithSchema<T extends Record<string, any>>(
    data: Record<string, any>[],
    schemaConfig: CSVSchemaConfig<T>,
    baseLineNumber: number = 1
  ): [T[], RowValidationResult<T>[]] {
    // For backward compatibility, if useAsync is true, we'll still return empty results
    // as this was the previous behavior. The async method will need to be called afterward.
    if (schemaConfig.useAsync) {
      return [[], []];
    } else {
      return this._validateWithSchemaSync(data, schemaConfig, baseLineNumber);
    }
  }

  /**
   * Create a CSV instance from a file
   * @param filename - Path to the CSV file
   * @param options - Reading options
   * @returns A new CSV instance
   * @throws {CSVError} If file reading or parsing fails after retries
   * @example
   * ```typescript
   * // Basic usage
   * const users = CSV.fromFile<User>('users.csv');
   *
   * // With header mapping
   * const users = CSV.fromFile<User>('users.csv', {
   *   headerMap: {
   *     'user_id': 'id',
   *     'first_name': 'profile.firstName'
   *   }
   * });
   *
   * // With retry options
   * const users = CSV.fromFile<User>('users.csv', {
   *   retry: { maxRetries: 5, logRetries: true }
   * });
   *
   * // With schema validation using Zod
   * import { z } from 'zod';
   *
   * const userSchema = z.object({
   *   id: z.string().min(1),
   *   name: z.string().min(1),
   *   email: z.string().email().optional()
   * });
   *
   * // TypeScript type derived from the schema
   * type User = z.infer<typeof userSchema>;
   *
   * const users = CSV.fromFile<User>('users.csv', {
   *   schema: {
   *     rowSchema: userSchema,
   *     columnSchemas: {
   *       email: z.string().email()
   *     },
   *     validationMode: 'filter'
   *   }
   * });
   * ```
   */
  static fromFile<T extends Record<string, any>>(
    filename: string,
    options: CSVReadOptions<T> = {}
  ): CSV<T> {
    const operation = () => {
      const resolvedPath = path.resolve(filename);
      const fileData = fs.readFileSync(
        resolvedPath,
        options.fsOptions?.encoding as BufferEncoding || 'utf-8'
      );
      const rawFullContent = fileData.toString();
      let fileAdditionalHeader = '';

      const getCsvFromLineValue = (csvOpts?: CsvParseOptions): number | undefined => {
        if (!csvOpts) return undefined;
        const fromVal = csvOpts.from ?? csvOpts.from_line ?? csvOpts.fromLine;
        return typeof fromVal === 'number' && fromVal >= 1 ? fromVal : undefined;
      };

      let numPreambleLinesToExtract: number | undefined = undefined;
      const userSpecifiedFromForData = getCsvFromLineValue(options.csvOptions);

      if (typeof options.saveAdditionalHeader === 'number' && options.saveAdditionalHeader > 0) {
        numPreambleLinesToExtract = options.saveAdditionalHeader;
      } else if (options.saveAdditionalHeader === true && userSpecifiedFromForData && userSpecifiedFromForData > 1) {
        numPreambleLinesToExtract = userSpecifiedFromForData - 1;
      }

      if (numPreambleLinesToExtract && numPreambleLinesToExtract > 0) {
        const basePreambleOpts: CsvParseInternalOptions = options.additionalHeaderParseOptions
          ? { ...options.additionalHeaderParseOptions }
          : {};

        if (!options.additionalHeaderParseOptions && options.csvOptions) {
          const RELEVANT_LOW_LEVEL_KEYS: (keyof CsvParseInternalOptions)[] =
            ['delimiter', 'quote', 'escape', 'record_delimiter', 'recordDelimiter', 'ltrim', 'rtrim', 'trim', 'bom'];
          RELEVANT_LOW_LEVEL_KEYS.forEach(key => {
            let valueToInherit: any = undefined;
            if (options.csvOptions) {
              if (key === 'record_delimiter' || key === 'recordDelimiter') {
                valueToInherit = options.csvOptions.record_delimiter ?? options.csvOptions.recordDelimiter;
              } else {
                valueToInherit = options.csvOptions[key as keyof typeof options.csvOptions];
              }
            }
            if (valueToInherit !== undefined && !(key in basePreambleOpts)) {
              (basePreambleOpts as any)[key] = valueToInherit;
            }
          });
        }

        const finalAdditionalHeaderParseOptions: CsvParseInternalOptions = {
          /* ... as before ... */
          ...basePreambleOpts,
          columns: false,
          to: numPreambleLinesToExtract,
          from: undefined,
          from_line: undefined,
          fromLine: undefined,
          to_line: undefined,
          toLine: undefined,
          skip_empty_lines: undefined,
          skipEmptyLines: undefined,
          skip_records_with_error: undefined,
          skipRecordsWithError: undefined,
          skip_records_with_empty_values: undefined,
          skipRecordsWithEmptyValues: undefined,
          comment: undefined,
          on_record: undefined,
          onRecord: undefined,
          auto_parse: undefined,
          autoParse: undefined,
          cast: undefined,
          cast_date: undefined,
          castDate: undefined,
          objname: undefined,
          info: undefined,
          raw: undefined,
          relax_column_count: undefined,
          relaxColumnCount: undefined,
          relax_column_count_less: undefined,
          relaxColumnCountLess: undefined,
          relax_column_count_more: undefined,
          relaxColumnCountMore: undefined,
        };
        Object.keys(finalAdditionalHeaderParseOptions).forEach(k =>
          (finalAdditionalHeaderParseOptions as any)[k] === undefined &&
          delete (finalAdditionalHeaderParseOptions as any)[k]);

        const parsedPreambleRows: unknown[][] = parseCSV(rawFullContent, finalAdditionalHeaderParseOptions);
        const actualPreambleHasContent = parsedPreambleRows.some((row: unknown[]) =>
          Array.isArray(row) && row.some((cell: unknown) =>
            cell !== null && cell !== undefined && String(cell).trim().length > 0));

        if (actualPreambleHasContent) {
          /* ... stringify preamble ... */
          let tempPreamble = stringifyCSV(parsedPreambleRows);
          const originalTotalLines = rawFullContent.split('\n').length;
          if (numPreambleLinesToExtract >= originalTotalLines) {
            if (tempPreamble.endsWith('\n') && !rawFullContent.endsWith('\n')) tempPreamble = tempPreamble.slice(0, -1);
            else if (!tempPreamble.endsWith('\n') && rawFullContent.endsWith('\n') && rawFullContent.length > 0) tempPreamble += '\n';
          } else if (tempPreamble.length > 0 && !tempPreamble.endsWith('\n')) tempPreamble += '\n';
          fileAdditionalHeader = tempPreamble;
        }
      }

      const contentForMainParsing = options.transform
        ? options.transform(rawFullContent.trim())
        : rawFullContent.trim();

      // Explicitly set columns: true as the default, then spread user options for clarity and consistency
      // This ensures the desired default behavior while still allowing user overrides if needed
      const finalMainParserOptions: CsvParseInternalOptions = { columns: true, ...(options.csvOptions || {}) };

      const fromLineForData = getCsvFromLineValue(options.csvOptions);
      if (fromLineForData !== undefined) {
        finalMainParserOptions.from_line = fromLineForData;
      }
      delete finalMainParserOptions.from;
      delete finalMainParserOptions.fromLine;

      if (numPreambleLinesToExtract && numPreambleLinesToExtract > 0) {
        const startDataAfterPreamble = numPreambleLinesToExtract + 1;
        if (!finalMainParserOptions.from_line || finalMainParserOptions.from_line < startDataAfterPreamble) {
          finalMainParserOptions.from_line = startDataAfterPreamble;
        }
      }
      if (finalMainParserOptions.from_line !== undefined && finalMainParserOptions.from_line < 1) {
        delete finalMainParserOptions.from_line;
      }

      const toLineForData = finalMainParserOptions.to_line ?? finalMainParserOptions.toLine;
      if (toLineForData !== undefined && finalMainParserOptions.to === undefined) {
        finalMainParserOptions.to = toLineForData;
      }
      delete finalMainParserOptions.to_line;
      delete finalMainParserOptions.toLine;

      // If headerMap is provided, ensure columns is true
      if (options.headerMap) {
        finalMainParserOptions.columns = true; // headerMap requires objects from csv-parse
      }
      // We've already set columns: true as default in finalMainParserOptions

      // Initial parsing of the data
      const dataAfterCsvParse = parseCSV(contentForMainParsing, finalMainParserOptions) as any[];

      // Process the data with the shared function (apply header mapping and custom casting)
      const baseLineNumber = finalMainParserOptions.from_line || 1;
      const parsedData = this.processCSVData<T>(dataAfterCsvParse, options as CSVReadOptions<T & Record<string, any>>, baseLineNumber);

      // Basic structural validation
      if (options.validateData && parsedData.length > 0) {
        const firstDataRowActualLine = finalMainParserOptions.from_line || 1;
        if (finalMainParserOptions.columns !== false) {
          if (!(parsedData[0] && typeof parsedData[0] === 'object')) {
            throw new CSVError(`Expected object rows for validation, but first parsed record (approx. file line ${firstDataRowActualLine}) is not an object.`);
          }
          const sampleKeys = Object.keys(parsedData[0]);
          parsedData.forEach((row, i) => {
            if (!(row && typeof row === 'object')) {
              throw new CSVError(`Row at approx. file line ${firstDataRowActualLine + i} (parsed index ${i}) is not an object as expected.`);
            }
            const relaxCount = finalMainParserOptions.relax_column_count ?? finalMainParserOptions.relaxColumnCount;
            if (Object.keys(row).length !== sampleKeys.length && !relaxCount) {
              throw new CSVError(`Row at approx. file line ${firstDataRowActualLine + i} (parsed index ${i}) has inconsistent column count. Expected ${sampleKeys.length}, got ${Object.keys(row).length}.`);
            }
          });
        }
      }

      // Apply schema validation if configured
      if (options.schema && parsedData.length > 0) {
        // If schema specifies async validation, throw an error as fromFile is synchronous
        if (options.schema.useAsync) {
          throw new CSVError(
            "Asynchronous schema validation is not supported in fromFile. Use fromFileAsync instead, or set useAsync: false in your schema configuration."
          );
        }

        // Ensure synchronous validation is used
        const syncSchema: CSVSchemaConfig<T> = { ...options.schema, useAsync: false };
        const [validatedData, validationResults] = this._validateWithSchemaSync(
          parsedData,
          syncSchema,
          baseLineNumber
        );

        // Return validated data and include validation results
        return new CSV<T>(validatedData, fileAdditionalHeader, validationResults);
      }

      // No schema validation, return parsed data as-is
      return new CSV<T>(parsedData as T[], fileAdditionalHeader);
    };

    // Use retry logic if configured
    if (options.retry) {
      return this.retryOperationSync(
        operation,
        `Failed to read or parse CSV file: ${filename}`,
        options.retry
      );
    } else {
      try {
        return operation();
      } catch (error) {
        throw new CSVError(
          `Failed to read or parse CSV file: ${filename}`,
          error instanceof Error ? error : new Error(String(error))
        );
      }
    }
  }
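
  /*
   * Illustrative sketch (not part of the library source): loader failures are
   * wrapped in CSVError with the underlying error preserved on `cause`. The
   * file name is hypothetical.
   *
   *   try {
   *     const users = CSV.fromFile<User>('missing.csv');
   *   } catch (err) {
   *     if (err instanceof CSVError) {
   *       console.error(err.message); // "Failed to read or parse CSV file: missing.csv"
   *       console.error(err.cause);   // e.g. the original file-system error
   *     }
   *   }
   */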
  /**
   * Create a CSV instance from an array of objects, with optional schema validation.
   *
   * @param data - Array of objects representing CSV-like rows.
   * @param options - Optional configuration, primarily for schema validation.
   * @returns A new CSV instance.
   * @throws {CSVError} If schema validation is mode 'error' and fails, or if an async schema is used incorrectly.
   */
  static fromData<T extends Record<string, any>>(
    data: (T | Record<string, any>)[], // Input can be Record<string, any> to allow for validation to type T
    options?: CSVFromDataOptions<T>
  ): CSV<T> {
    // Create a new array with copies of the input objects to ensure immutability of the input `data` array
    let processedData: Record<string, any>[] = Array.isArray(data) ? data.map(row => ({ ...row })) : [];
    let finalAdditionalHeader: string | undefined = undefined; // Not applicable for fromData
    let validationResults: RowValidationResult<T>[] | undefined = undefined;

    // Apply schema validation if configured
    if (options?.schema && processedData.length > 0) {
      // If schema specifies async validation, throw an error as fromData is synchronous
      if (options.schema.useAsync) {
        throw new CSVError(
          "Asynchronous schema validation is not supported in the synchronous fromData method. " +
          "Validate separately using csvInstance.validateAsync() or ensure your schema and useAsync:false are set for synchronous validation."
        );
      }

      // Ensure synchronous validation is used for this synchronous method
      const syncSchema: CSVSchemaConfig<T> = {
        ...options.schema,
        useAsync: false, // Force synchronous validation path
      };

      try {
        const [validatedDataOutput, valRes] = this._validateWithSchemaSync(
          processedData, // Pass the current data
          syncSchema
        );
        processedData = validatedDataOutput; // Update processedData with the validated (and possibly filtered/transformed) data
        validationResults = valRes;
      } catch (error) {
        // _validateWithSchemaSync throws CSVError directly if validationMode is 'error' and a failure occurs.
        // Re-throw if it's already a CSVError, otherwise wrap it.
        if (error instanceof CSVError) {
          throw error;
        }
        throw new CSVError(
          'Schema validation failed during CSV.fromData',
          error instanceof Error ? error : new Error(String(error))
        );
      }
    }

    // The final data should conform to T after validation (or be T[] if no validation)
    return new CSV<T>(processedData as T[], finalAdditionalHeader, validationResults);
  }
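
  /*
   * Illustrative sketch (not part of the library source): building an instance
   * from in-memory rows. The row shape and `userSchema` are hypothetical.
   *
   *   const csv = CSV.fromData<User>(
   *     [{ id: '1', name: 'Ada' }, { id: '', name: '' }],
   *     { schema: { rowSchema: userSchema, validationMode: 'filter' } }
   *   );
   *   // The input array is never mutated: fromData shallow-copies each row,
   *   // and with 'filter' the invalid second row is dropped.
   */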

  /**
   * Create a CSV instance from a string
   * @param csvString - CSV content as a string
   * @param options - CSV reading options including custom casting
   * @returns A new CSV instance
   * @throws {CSVError} If parsing fails
   */
  /**
   * Common function to process parsed CSV data with header mapping and custom casting
   * @param dataAfterCsvParse - The initially parsed data from csv-parse
   * @param options - Reading options including custom casting
   * @param baseLineNumber - Base line number for error reporting (usually 1 or the value of from_line)
   * @returns Processed data with header mapping and custom casting applied
   */
  private static processCSVData<T>(
    dataAfterCsvParse: any[],
    options: CSVReadOptions<T & Record<string, any>>,
    baseLineNumber: number = 1
  ): any[] {
    // Apply header mapping if specified
    let processedData: any[] = dataAfterCsvParse;
    if (options.headerMap && dataAfterCsvParse.length > 0 && typeof dataAfterCsvParse[0] === 'object') {
      const { fromRowArr } = createHeaderMapFns<T & Record<string, any>>(options.headerMap);
      processedData = dataAfterCsvParse.map(row => fromRowArr(row));
    }

    // Apply custom casting if specified
    if (options.customCasts && processedData.length > 0) {
      const { definitions, columnCasts, onCastError = 'error' } = options.customCasts;

      // Only apply custom casting to object-based rows
      if (typeof processedData[0] === 'object' && processedData[0] !== null) {
        return processedData.map((row: Record<string, any>, rowIndex: number) => {
          const newRow = { ...row };
          for (const columnName in row) {
            if (Object.prototype.hasOwnProperty.call(row, columnName)) {
              const originalValue = row[columnName];

              // Prepare string input for custom casters
              let valueToTestAndParse: string;
              if (typeof originalValue === 'string') {
                valueToTestAndParse = originalValue;
              } else if (originalValue === null) {
                valueToTestAndParse = 'null';
              } else if (originalValue === undefined) {
                valueToTestAndParse = 'undefined';
              } else {
                valueToTestAndParse = String(originalValue);
              }

              // Build casting context
              const context: CastingContext = {
                column: columnName,
                header: false,
                index: Object.keys(row).indexOf(columnName),
                lines: baseLineNumber + rowIndex,
                records: rowIndex,
                empty_lines: 0,
                invalid_field_length: 0,
                quoting: false
              };

              let castSuccessful = false;
              let castedValue: any = originalValue; // Default to original value

              // Function to apply a caster
              const applyCaster = (caster: Caster<any>): boolean => {
                if (caster.test(valueToTestAndParse, context)) {
                  try {
                    castedValue = caster.parse(valueToTestAndParse, context);
                    castSuccessful = true;
                    return true; // Caster applied successfully
                  } catch (e) {
                    if (onCastError === 'error') {
                      throw new CSVError(
                        `Custom cast failed for column "${columnName}" at line ${context.lines}, value: "${valueToTestAndParse}". Error: ${(e as Error).message}`,
                        e
                      );
                    } else if (onCastError === 'null') {
                      castedValue = null;
                    } else { // 'original'
                      castedValue = originalValue;
                    }
                    return true; // Caster was attempted but failed/handled
                  }
                }
                return false; // Caster test failed
              };

              // 1. Try column-specific casters first
              if (columnCasts && columnCasts[columnName as string]) {
                const columnRule = columnCasts[columnName as string];
                const rulesToTry = Array.isArray(columnRule) ? columnRule : [columnRule];
                for (const rule of rulesToTry) {
                  let casterToUse: Caster<any> | undefined;
                  if (typeof rule === 'string' && definitions && definitions[rule as keyof CustomCastDefinition]) {
                    casterToUse = definitions[rule as keyof CustomCastDefinition];
                  } else if (typeof rule === 'object' && rule !== null && 'test' in rule && 'parse' in rule) {
                    casterToUse = rule as Caster<any>;
                  }
                  if (casterToUse && applyCaster(casterToUse)) {
                    break; // First successful caster wins
                  }
                }
              }

              // 2. If no column-specific caster succeeded, try global casters
              if (!castSuccessful && definitions) {
                // Predefined order for more predictable behavior
                const orderedGlobalKeys: (keyof CustomCastDefinition)[] = [
                  'null', 'boolean', 'number', 'date', 'object', 'array', 'string'
                ];
                for (const defKey of orderedGlobalKeys) {
                  const globalCaster = definitions[defKey];
                  if (globalCaster && applyCaster(globalCaster)) {
                    break; // First successful global caster wins
                  }
                }
              }

              // Set the potentially modified value in the new row
              newRow[columnName] = castedValue;
            }
          }
          return newRow as T;
        });
      }
    }

    // No header mapping or custom casting needed, or non-object rows
    return processedData as (T & Record<string, any>)[];
  }
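
  /*
   * Observation on the casting pipeline above (editorial note, not library docs):
   * column-specific rules in `columnCasts` are tried first, and only if none
   * succeeds are the global `definitions` tried, in the fixed order null,
   * boolean, number, date, object, array, string. The first caster whose
   * `test` passes wins.
   */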
  static fromString<T extends Record<string, any>>(
    csvString: string,
    options: CSVReadOptions<T> = { csvOptions: { columns: true } }
  ): CSV<T> {
    try {
      // Set columns: true by default to ensure consistent behavior, then allow user overrides
      const csvOptions = { columns: true, ...options.csvOptions };
      let dataAfterCsvParse = parseCSV(csvString, csvOptions) as any[];

      // Process the data with the shared function
      const parsedData = this.processCSVData<T>(dataAfterCsvParse, options);

      // Apply schema validation if configured
      if (options.schema && parsedData.length > 0) {
        // If schema specifies async validation, throw an error as fromString is synchronous
        if (options.schema.useAsync) {
          throw new CSVError(
            "Asynchronous schema validation is not supported in fromString. Use fromFileAsync or validateAsync instead, or set useAsync: false in your schema configuration."
          );
        }

        // Ensure synchronous validation is used
        const syncSchema: CSVSchemaConfig<T> = { ...options.schema, useAsync: false };
        const [validatedData, validationResults] = this._validateWithSchemaSync(
          parsedData,
          syncSchema
        );

        // Return validated data and include validation results
        return new CSV<T>(validatedData, undefined, validationResults);
      }

      return new CSV<T>(parsedData as T[]);
    } catch (error) {
      throw new CSVError('Failed to parse CSV string', error instanceof Error ? error : new Error(String(error)));
    }
  }

  /**
   * Create a CSV instance from a readable stream
   * @param stream - Readable stream containing CSV data
   * @param options - CSV reading options including custom casting
   * @returns Promise resolving to a new CSV instance
   * @throws {CSVError} If parsing fails
   */
  static async fromStream<T extends Record<string, any>>(
    stream: NodeJS.ReadableStream,
    options: CSVReadOptions<T> | { columns?: boolean } = { columns: true }
  ): Promise<CSV<T>> {
    try {
      // Handle the case where options is just simple options
      const readOptions: CSVReadOptions<T> = 'csvOptions' in options
        ? options as CSVReadOptions<T>
        : { csvOptions: options } as CSVReadOptions<T>;
      // Consistent approach: Set columns: true by default for proper object-based parsing
      const csvParseOptions = { columns: true, ...(readOptions.csvOptions || {}) };

      return new Promise((resolve, reject) => {
        const data: any[] = [];
        const parser = parseCSVAsync(csvParseOptions);

        parser.on('readable', () => {
          let record;
          while ((record = parser.read()) !== null) {
            data.push(record);
          }
        });

        parser.on('error', (err) => {
          reject(new CSVError('Failed to parse CSV stream', err instanceof Error ? err : new Error(String(err))));
        });

        parser.on('end', () => {
          try {
            // Process the data with the shared function (apply header mapping and custom casting)
            const processedData = this.processCSVData<T>(data, readOptions as CSVReadOptions<T & Record<string, any>>);

            // Apply schema validation if configured
            if (readOptions.schema && processedData.length > 0) {
              try {
                const [validatedData, validationResults] = this._validateWithSchema(
                  processedData,
                  readOptions.schema
                );
                // Return validated data and include validation results
                resolve(new CSV<T>(validatedData, undefined, validationResults));
              } catch (validationError) {
                reject(new CSVError('CSV validation failed', validationError instanceof Error ? validationError : new Error(String(validationError))));
              }
            } else {
              // No schema validation, return processed data as-is
              resolve(new CSV<T>(processedData as T[]));
            }
          } catch (error) {
            reject(new CSVError('Failed to process CSV stream data', error instanceof Error ? error : new Error(String(error))));
          }
        });

        stream.pipe(parser);
      });
    } catch (error) {
      throw new CSVError('Failed to parse CSV stream', error instanceof Error ? error : new Error(String(error)));
    }
  }
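
  /*
   * Illustrative sketch (not part of the library source): parsing from any
   * readable stream, e.g. a file stream. The file name is hypothetical.
   *
   *   import fs from 'node:fs';
   *
   *   const stream = fs.createReadStream('users.csv');
   *   const csv = await CSV.fromStream<User>(stream); // columns: true by default
   */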
  /**
   * Create a CSV instance from a file asynchronously using streams
   * @param filename - Path to the CSV file
   * @param options - Reading options including custom casting
   * @returns Promise resolving to a new CSV instance
   * @throws {CSVError} If file reading or parsing fails
   */
  static async fromFileAsync<T extends Record<string, any>>(
    filename: string,
    options: CSVReadOptions<T> = {}
  ): Promise<CSV<T>> {
    try {
      const resolvedPath = path.resolve(filename);
      const stream = fs.createReadStream(resolvedPath, options.fsOptions);

      // Check if the schema might need async validation
      if (options.schema && options.schema.useAsync === undefined) {
        // Set useAsync to true for fromFileAsync to ensure proper handling of potentially async schemas
        const schemaWithAsyncOption = { ...options.schema, useAsync: true };

        // Pass the options to fromStream with updated schema
        return CSV.fromStream<T>(stream, {
          ...options,
          csvOptions: options.csvOptions || { columns: true },
          schema: schemaWithAsyncOption
        });
      }

      // Pass the options to fromStream as csvOptions to ensure it has the right type
      return CSV.fromStream<T>(stream, {
        csvOptions: options.csvOptions || { columns: true },
        ...options
      });
    } catch (error) {
      throw new CSVError(
        `Failed to read or parse CSV file asynchronously: ${filename}`,
        error instanceof Error ? error : new Error(String(error))
      );
    }
  }
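
  /*
   * Illustrative sketch (not part of the library source): the async loader is
   * the right entry point for schemas that validate asynchronously, since it
   * defaults useAsync to true when the schema leaves it unset.
   *
   *   const users = await CSV.fromFileAsync<User>('users.csv', {
   *     schema: { rowSchema: userSchema, validationMode: 'filter' },
   *   });
   */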
  /**
   * Write the current data to a CSV file
   * @param filename - Destination file path
   * @param options - Writing options
   * @throws {CSVError} If writing fails after retries
   * @example
   * ```typescript
   * // Basic writing
   * users.writeToFile('users_export.csv');
   *
   * // With header mapping
   * users.writeToFile('users_export.csv', {
   *   headerMap: {
   *     'id': 'ID',
   *     'profile.firstName': 'First Name',
   *     'profile.lastName': 'Last Name'
   *   }
   * });
   *
   * // With streaming for large files
   * users.writeToFile('users_export.csv', {
   *   streaming: true,
   *   streamingThreshold: 500 // Default is 1000
   * });
   * ```
   */
  writeToFile(filename: string, options: CSVWriteOptions<T> = {}): void {
    const operation = () => {
      const outputPath = filename.endsWith('.csv') ? filename : `${filename}.csv`;
      const streamingThreshold = options.streamingThreshold || 1000;

      // Apply header mapping if provided
      if (options.headerMap) {
        const stringifyOptions = options.stringifyOptions || { header: true };
        const headers = Array.isArray(stringifyOptions.header)
          ? stringifyOptions.header
          : Object.keys(this.data[0] || {});
        const { toRowArr } = createHeaderMapFns<T>(options.headerMap);

        // Handle streaming with header map
        if (options.streaming && this.data.length > streamingThreshold) {
          // Use streaming for large datasets with header mapping
          const headerToPrepend = options.additionalHeader ?? this.additionalHeader ?? '';
          const writable = fs.createWriteStream(outputPath, { encoding: 'utf-8' });
          if (headerToPrepend) {
            writable.write(headerToPrepend);
          }

          // Create transform stream for header mapping
          const headerMapTransform = new Transform({
            objectMode: true,
            transform(chunk, encoding, callback) {
              try {
                const mappedRow = toRowArr(chunk, headers);
                callback(null, mappedRow);
              } catch (error) {
                callback(error as Error);
              }
            }
          });

          // Create stringifier with appropriate options
          // If header is true, make sure it's handled correctly in stringifyOptions
          const csvStringifyOptions = {
            ...stringifyOptions,
            header: Array.isArray(stringifyOptions.header) ? headers : stringifyOptions.header
          };

          // Create a transform stream using stringifyCSVAsync
          // Use a type assertion to help TypeScript understand this is valid
          // The CSV module expects options as the first argument
          // The 'as any' casting is necessary because csvStringifyOptions' structure (with header property that
          // could be boolean or string[]) may not perfectly align with stringifyCSVAsync's expected type
          const stringifier = stringifyCSVAsync(csvStringifyOptions as any) as Transform;

          // Add a simple error handler to the stringifier
          stringifier.on('error', (err) => {
            console.error('CSV stringification error:', err);
          });

          // Create pipeline
          Readable.from(this.data)
            .pipe(headerMapTransform)
            .pipe(stringifier)
            .pipe(writable);
          return;
        } else {
          // Standard in-memory processing for smaller datasets
          // Transform the data through the header map
          const rows = this.data.map(item => toRowArr(item, headers));

          // Add headers as the first row if needed
          if (stringifyOptions.header === true) {
            rows.unshift(headers);
          }

          // Use a custom stringifier without the header option since we've manually handled it
          const csvString = rows.map(row =>
            row.map(cell => {
              if (cell === null || cell === undefined) return '';
              return typeof cell === 'string' && (cell.includes(',') || cell.includes('"') || cell.includes('\n'))
                ? `"${cell.replace(/"/g, '""')}"`
                : String(cell);
            }).join(',')
          ).join('\n');

          fs.writeFileSync(
            outputPath,
            (options.additionalHeader ?? this.additionalHeader ?? '') + csvString,
            'utf-8'
          );
          return;
        }
      }

      // Standard CSV writing without header mapping
      if (options.streaming && this.data.length > streamingThreshold) {
        // Use streaming for large datasets
        // Use a type assertion to help TypeScript understand this is valid
        const stringifier = stringifyCSVAsync(
          options.stringifyOptions || { header: true }
        ) as Transform;
        const readable = Readable.from(this.data);
        const writable = fs.createWriteStream(outputPath, { encoding: 'utf-8' });

        const headerToPrepend = options.additionalHeader ?? this.additionalHeader ?? '';
        if (headerToPrepend) {
          writable.write(headerToPrepend);
        }

        readable.pipe(stringifier).pipe(writable);
      } else {
        // Use synchronous version for smaller datasets
        const csvString = stringifyCSV(
          this.data,
          options.stringifyOptions || { header: true }
        );
        fs.writeFileSync(
          outputPath,
          (options.additionalHeader ?? this.additionalHeader ?? '') + csvString,
          'utf-8'
        );
      }
    };

    // Use retry logic if configured
    if (