@doeixd/csv-utils
Utilities for CSV files and arrays of objects
/**
* @fileoverview A production-ready TypeScript library for CSV manipulation,
* featuring robust error handling, strong typing, and a fluent interface.
*/
import fs from 'node:fs';
import path from 'node:path';
import { parse as parseCSV, stringify as stringifyCSV } from 'csv/sync';
import { parse as parseCSVAsync, stringify as stringifyCSVAsync } from 'csv';
import { distance as levenshteinDistance } from 'fastest-levenshtein';
import { get as lodashGet } from 'lodash';
import { createHeaderMapFns, HeaderMap, RetryOptions } from './headers'
import { StandardSchemaV1, tryValidateStandardSchemaSync, tryValidateStandardSchemaAsync, CSVSchemaConfig, RowValidationResult } from './schema'
import { Readable, Transform, Writable, pipeline as streamPipeline, type Transform as NodeTransform } from 'node:stream';
import { promisify } from 'node:util';
import { Worker, isMainThread, parentPort, workerData } from 'node:worker_threads';
import os from 'node:os';
export * from './headers'
export * from './schema'
export * from './standalone'
/**
* Options for creating a CSV instance from pre-existing data.
*/
export interface CSVFromDataOptions<T extends Record<string, any>> {
/**
* Schema to validate the provided data objects against.
* If schema validation fails and mode is 'error', an error will be thrown.
* If mode is 'filter', invalid objects will be removed.
* If mode is 'keep', validationResults will be populated on the CSV instance.
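*
* @example
* ```typescript
* // A minimal sketch, assuming a standard-schema-compatible `userSchema`
* // (e.g. built with Zod) and a matching hypothetical `User` type:
* const csv = CSV.fromData<User>(rows, {
*   schema: { rowSchema: userSchema, validationMode: 'filter' }
* });
* ```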
*/
schema?: CSVSchemaConfig<T>;
// We are deliberately OMITTING customCasts here for fromData
// as their primary design is for string inputs from CSV parsing.
// Coercion of existing object properties should ideally be handled by
// the schema itself (e.g., Zod's coerce) or a separate transformation step.
}
/**
* Error class for CSV-related operations
*/
export class CSVError extends Error {
constructor(message: string, public readonly cause?: unknown) {
super(message);
this.name = 'CSVError';
// Maintains proper stack trace for where the error was thrown
if (Error.captureStackTrace) {
Error.captureStackTrace(this, CSVError);
}
}
}
export type CsvParseOptions = Parameters<typeof parseCSV>[1];
type CsvParseInternalOptions = Exclude<Parameters<typeof parseCSV>[1], null | undefined>
/**
* Options for stringifying CSV output: either raw csv-stringify options, or
* an object wrapping them together with an optional header map.
*/
export type CsvStringifyOptions<T = any> = Parameters<typeof stringifyCSV>[1] | {
stringifyOptions?: Parameters<typeof stringifyCSV>[1];
headerMap?: HeaderMap<T>;
};
/**
* Context for casting functions, similar to csv-parse's CastingContext
*/
export interface CastingContext {
/** Column name or index */
column: string | number;
/** Is it the header row? (Usually false for custom casting stage) */
header: boolean;
/** Index of the field in the record */
index: number;
/** Line number in the source */
lines: number;
/** Number of records parsed so far */
records: number;
/** Count of empty lines */
empty_lines: number;
/** Count of rows with inconsistent field lengths */
invalid_field_length: number;
/** Is the field quoted? */
quoting: boolean;
}
/**
* Functions for testing if a value should be cast and performing the casting
*/
export type CastTestFunction = (value: string, context: CastingContext) => boolean;
export type CastParseFunction<TargetType> = (value: string, context: CastingContext) => TargetType;
/**
* Definition of a caster that converts string values to a target type
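*
* @example
* ```typescript
* // A minimal sketch: a caster that turns "yes"/"no" cells into booleans.
* const yesNoCaster: Caster<boolean> = {
*   test: (value) => /^(yes|no)$/i.test(value.trim()),
*   parse: (value) => value.trim().toLowerCase() === 'yes'
* };
* ```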
*/
export interface Caster<TargetType> {
/**
* Tests if a string value is a candidate for this caster
* @param value The string value from the CSV cell (after csv-parse's initial processing)
* @param context An object containing column name, line number, etc.
* @returns True if this caster should attempt to parse the value
*/
test: CastTestFunction;
/**
* Parses the string value into the target type
* Called only if `test` returns true
* @param value The string value to parse
* @param context An object containing column name, line number, etc.
* @returns The parsed value of TargetType
* @throws If parsing fails and strict error handling is desired
*/
parse: CastParseFunction<TargetType>;
}
/**
* Set of type-specific casters to apply to CSV values
*/
export interface CustomCastDefinition {
string?: Caster<string>;
number?: Caster<number>;
boolean?: Caster<boolean>;
date?: Caster<Date>;
object?: Caster<object>;
array?: Caster<any[]>;
null?: Caster<null>;
}
/**
* Configuration for column-specific casting
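*
* @example
* ```typescript
* // A minimal sketch, assuming a hypothetical `User` row type: `age` uses the
* // global 'number' caster, while `joined` uses an inline custom caster.
* const columnCasts: ColumnCastConfig<User> = {
*   age: 'number',
*   joined: { test: (v) => /^\d{4}-\d{2}-\d{2}$/.test(v), parse: (v) => new Date(v) }
* };
* ```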
*/
export type ColumnCastConfig<T extends Record<string, any>> = {
[K in keyof T]?: keyof CustomCastDefinition | Caster<T[K]> | (keyof CustomCastDefinition | Caster<any>)[];
} | {
[columnName: string]: keyof CustomCastDefinition | Caster<any> | (keyof CustomCastDefinition | Caster<any>)[];
};
/**
* CSV reading options
*/
export interface CSVReadOptions<T extends Record<string, any>> {
/** File system options for reading the file */
fsOptions?: {
encoding?: BufferEncoding;
flag?: string;
mode?: number;
autoClose?: boolean;
emitClose?: boolean;
start?: number;
end?: number;
highWaterMark?: number;
};
/** Options passed through to csv-parse */
csvOptions?: Parameters<typeof parseCSV>[1];
/** Optional transform function to apply to raw file content */
transform?: (content: string) => string;
/** Flag to indicate the input is raw data rather than a filename */
rawData?: boolean;
/** Optional header mapping configuration */
headerMap?: HeaderMap<T>;
/** Options for retrying failed operations */
retry?: RetryOptions;
/** Enable basic validation of data against expected schema */
validateData?: boolean;
/** Enable standard schema validation of data */
schema?: CSVSchemaConfig<T>;
/** Whether empty values are allowed without triggering validation errors */
allowEmptyValues?: boolean;
/**
* Controls the extraction of initial lines as an "additional header" (preamble).
* These lines are stored in `csvInstance.additionalHeader`.
*
* - If `number > 0`: Specifies the exact number of lines to extract as the preamble.
* Data parsing will start after these lines, unless `csvOptions.from_line` (or `from`)
* is set and points to an even later line.
*
* - If `true`: Enables preamble extraction *if* `csvOptions.from_line` (or `from`)
* is set to a value greater than 1. The preamble will consist of `csvOptions.from_line - 1` lines.
* If `csvOptions.from_line` is not set or is 1, no preamble is extracted with `true`.
*
* - If `false`, `0`, or `undefined`: No preamble is extracted.
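*
* @example
* ```typescript
* // A minimal sketch: capture the first two lines of a report as a preamble
* // (hypothetical file and row type), then parse the table that follows.
* const csv = CSV.fromFile<Row>('report.csv', { saveAdditionalHeader: 2 });
* console.log(csv.additionalHeader); // the two preamble lines, re-stringified
* ```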
*/
saveAdditionalHeader?: boolean | number;
/**
* Optional CSV parsing options specifically for the preamble (additional header) lines.
* If provided, these options will be used when parsing the preamble.
* If not provided, relevant low-level parsing options (like delimiter, quote, escape)
* might be inherited from the main `csvOptions` by default (see implementation for details),
* or a very basic parsing configuration will be used.
*
* **Important:** Options like `columns`, `from_line`, `to_line` will be overridden
* internally for preamble extraction. You should primarily use this for options
* like `delimiter`, `quote`, `escape`, `record_delimiter`, `ltrim`, `rtrim`, `bom`.
*/
additionalHeaderParseOptions?: Parameters<typeof parseCSV>[1];
/**
* Custom type casting options that are applied after csv-parse's built-in casting
* but before data validation
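*
* @example
* ```typescript
* // A minimal sketch: a global number caster plus a per-column rule, with
* // failed casts set to null (assumes a hypothetical `User` type).
* const users = CSV.fromFile<User>('users.csv', {
*   customCasts: {
*     definitions: {
*       number: {
*         test: (v) => v.trim() !== '' && !Number.isNaN(Number(v)),
*         parse: (v) => Number(v)
*       }
*     },
*     columnCasts: { age: 'number' },
*     onCastError: 'null'
*   }
* });
* ```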
*/
customCasts?: {
/**
* A global set of custom casting definitions.
* These are tried if a column doesn't have a specific rule in `columnCasts`.
*/
definitions?: CustomCastDefinition;
/**
* Per-column casting rules. The key is the column name (after initial parsing and header mapping).
* The value can be:
* - A string key of a caster defined in `definitions` (e.g., 'number', 'date').
* - A custom Caster object `{ test, parse }`.
* - An array of the above, tried in order until one succeeds.
* If not specified for a column, or if all rules fail, the value remains as parsed by `csv-parse`.
*/
columnCasts?: ColumnCastConfig<T>;
/**
* What to do if a specific cast's `parse` function throws an error.
* - 'error': Propagate the error, failing the CSV loading. (default)
* - 'null': Set the value to null.
* - 'original': Keep the original string value (as received by the custom caster).
*/
onCastError?: 'error' | 'null' | 'original';
};
}
/**
* CSV writing options
*/
export interface CSVWriteOptions<T = any> {
/** Additional header content to prepend to the CSV */
additionalHeader?: string;
/** Options for stringifying the CSV */
stringifyOptions?: Parameters<typeof stringifyCSV>[1];
/** Whether to use streaming for large files */
streaming?: boolean;
/** Optional header mapping configuration */
headerMap?: HeaderMap<T>;
/** Threshold for using streaming (number of rows) */
streamingThreshold?: number;
/** Options for retrying failed operations */
retry?: RetryOptions;
}
/**
* Options for working with CSV streams and generators
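*
* @example
* ```typescript
* // A minimal sketch of stream options (the `transform` and the `User` row
* // type are illustrative assumptions):
* const streamOpts: CSVStreamOptions<User> = {
*   csvOptions: { columns: true },
*   transform: (row) => ({ ...row, id: String(row.id) }),
*   batchSize: 500
* };
* ```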
*/
export interface CSVStreamOptions<T> {
/** CSV parsing options */
csvOptions?: Parameters<typeof parseCSVAsync>[0];
/** Options for transforming rows */
transform?: (row: any) => T;
/** Batch size for processing */
batchSize?: number;
/** Optional header mapping configuration */
headerMap?: HeaderMap<T>;
/** Options for retrying failed operations */
retry?: RetryOptions;
/** Buffers rows before yielding to improve performance with very large files */
useBuffering?: boolean;
/** Buffer size when useBuffering is true */
bufferSize?: number;
}
/**
* Result type for similarity matches
*/
export interface SimilarityMatch<T> {
/** The matched row */
row: T;
/** Edit distance between the query and the matched value (lower is more similar) */
dist: number;
}
// Type definitions for callbacks
export type ComparisonCallback<T> = (row: T) => boolean;
export type ModificationCallback<T> = (row: T) => Partial<T>;
export type TransformCallback<T, R> = (row: T) => R;
export type EqualityCallback<T> = (a: T, b: T) => boolean;
export type MergeCallback<T, E> = (a: T, b: E) => T;
// Aggregation and sorting types
export type AggregateOperation = 'sum' | 'avg' | 'min' | 'max' | 'count';
export type SortDirection = 'asc' | 'desc';
/**
* Core class for CSV data manipulation with a fluent interface
*/
export class CSV<T extends Record<string, any>> {
/** Validation results if schema validation was used with 'keep' mode */
readonly validationResults?: RowValidationResult<T>[];
private constructor(
private readonly data: T[],
readonly additionalHeader?: string,
validationResults?: RowValidationResult<T>[]
) {
this.validationResults = validationResults;
}
/**
* Helper function to implement retry logic
* @param operation - Function to retry
* @param errorMessage - Error message if all retries fail
* @param retryOptions - Retry configuration
* @returns Result of the operation
* @throws {CSVError} If operation fails after all retries
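*
* @example
* // With the defaults (maxRetries: 3, baseDelay: 100), the backoff delays
* // between attempts are 100ms, 200ms, then 400ms (2^attempt * baseDelay).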
*/
private static async retryOperation<R>(
operation: () => Promise<R>,
errorMessage: string,
retryOptions?: RetryOptions
): Promise<R> {
const maxRetries = retryOptions?.maxRetries ?? 3;
const baseDelay = retryOptions?.baseDelay ?? 100;
const logRetries = retryOptions?.logRetries ?? false;
let lastError: Error | null = null;
for (let attempt = 0; attempt <= maxRetries; attempt++) {
try {
return await operation();
} catch (error) {
lastError = error instanceof Error ? error : new Error(String(error));
if (attempt < maxRetries) {
// Calculate delay with exponential backoff
const delay = Math.pow(2, attempt) * baseDelay;
if (logRetries) {
console.warn(`Retry attempt ${attempt + 1}/${maxRetries} after ${delay}ms`);
}
// Wait before retrying
await new Promise(resolve => setTimeout(resolve, delay));
}
}
}
throw new CSVError(`${errorMessage} after ${maxRetries + 1} attempts`, lastError);
}
/**
* Synchronous version of retry operation
* @param operation - Function to retry
* @param errorMessage - Error message if all retries fail
* @param retryOptions - Retry configuration
* @returns Result of the operation
* @throws {CSVError} If operation fails after all retries
*/
private static retryOperationSync<R>(
operation: () => R,
errorMessage: string,
retryOptions?: RetryOptions
): R {
const maxRetries = retryOptions?.maxRetries ?? 3;
const baseDelay = retryOptions?.baseDelay ?? 100;
const logRetries = retryOptions?.logRetries ?? false;
let lastError: Error | null = null;
for (let attempt = 0; attempt <= maxRetries; attempt++) {
try {
return operation();
} catch (error) {
lastError = error instanceof Error ? error : new Error(String(error));
if (attempt < maxRetries) {
// Calculate delay with exponential backoff
const delay = Math.pow(2, attempt) * baseDelay;
if (logRetries) {
console.warn(`Retry attempt ${attempt + 1}/${maxRetries} after ${delay}ms`);
}
// Block synchronously for the backoff delay; Atomics.wait is permitted on
// the main thread in Node.js, unlike in browsers.
Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, delay);
}
}
}
throw new CSVError(`${errorMessage} after ${maxRetries + 1} attempts`, lastError);
}
/**
* Validates data synchronously using a standard schema configuration.
* Validation can run at both the row and column level. This method throws
* if the schema is configured for async validation (`useAsync: true`).
*
* @param data - The data to validate
* @param schemaConfig - The schema configuration
* @param baseLineNumber - The base line number for error reporting
* @returns A tuple containing: [validatedData, validationResults]
* @private
*/
private static _validateWithSchemaSync<T extends Record<string, any>>(
data: Record<string, any>[],
schemaConfig: CSVSchemaConfig<T>,
baseLineNumber: number = 1
): [T[], RowValidationResult<T>[]] {
const { rowSchema, columnSchemas, validationMode = 'error', useAsync = false } = schemaConfig;
// If schema is configured for async validation, throw an error in this sync method
if (useAsync) {
throw new CSVError(
"Asynchronous schema validation is not supported in synchronous CSV methods. Use an async method (e.g., fromFileAsync, validateAsync) or set useAsync: false."
);
}
const validationResults: RowValidationResult<T>[] = [];
const validatedData: T[] = [];
// Function to validate a row synchronously
const validateRowSync = (row: Record<string, any>, index: number): RowValidationResult<T> => {
const result: RowValidationResult<T> = {
originalRow: row,
valid: true
};
// Validate row using row schema if provided
if (rowSchema) {
const rowValidation = tryValidateStandardSchemaSync(rowSchema, row);
if (rowValidation.issues) {
result.valid = false;
result.rowIssues = [...rowValidation.issues];
} else {
result.validatedRow = rowValidation.value as T;
}
} else {
// If no row schema, original row is the validated row
result.validatedRow = row as T;
}
// Validate individual columns if provided
if (columnSchemas) {
result.columnIssues = {};
for (const [column, schema] of Object.entries(columnSchemas)) {
if (!schema) continue;
const columnValidation = tryValidateStandardSchemaSync(schema, row[column]);
if (columnValidation.issues) {
result.valid = false;
result.columnIssues[column] = [...columnValidation.issues];
// If row validation succeeded but column validation failed, update the column value in validated row
if (result.validatedRow && !result.rowIssues) {
if ('value' in columnValidation) {
(result.validatedRow as any)[column] = columnValidation.value;
} else {
// Column validation failed, set to null/undefined or keep original
(result.validatedRow as any)[column] = null;
}
}
} else if (result.validatedRow) {
// Update the validated value in the result
(result.validatedRow as any)[column] = columnValidation.value;
}
}
// If no column issues, remove the empty object
if (Object.keys(result.columnIssues).length === 0) {
delete result.columnIssues;
}
}
return result;
};
// Process each row synchronously
const results = data.map(validateRowSync);
// Process validation results
for (const result of results) {
validationResults.push(result);
if (result.valid && result.validatedRow) {
validatedData.push(result.validatedRow);
} else if (validationMode === 'error') {
// Collect all validation issues for better error reporting
const issues: string[] = [];
if (result.rowIssues) {
issues.push(`Row validation issues: ${result.rowIssues.map(i => i.message).join(', ')}`);
}
if (result.columnIssues) {
for (const [column, columnIssues] of Object.entries(result.columnIssues)) {
issues.push(`Column '${column}' validation issues: ${columnIssues.map(i => i.message).join(', ')}`);
}
}
throw new CSVError(`CSV validation failed: ${issues.join('; ')}`);
} else if (validationMode === 'filter') {
// Skip invalid rows - don't add to validatedData
continue;
} else if (validationMode === 'keep' && !result.valid) {
// Keep even invalid rows in the resulting dataset
validatedData.push(result.originalRow as T);
}
}
return [validatedData, validationResults];
}
/**
* For backward compatibility - redirects to the appropriate sync or async validation method
* @private
*/
private static _validateWithSchema<T extends Record<string, any>>(
data: Record<string, any>[],
schemaConfig: CSVSchemaConfig<T>,
baseLineNumber: number = 1
): [T[], RowValidationResult<T>[]] {
// For backward compatibility, if useAsync is true, we'll still return empty results
// as this was the previous behavior. The async method will need to be called afterward.
if (schemaConfig.useAsync) {
return [[], []];
} else {
return this._validateWithSchemaSync(data, schemaConfig, baseLineNumber);
}
}
/**
* Create a CSV instance from a file
* @param filename - Path to the CSV file
* @param options - Reading options
* @returns A new CSV instance
* @throws {CSVError} If file reading or parsing fails after retries
* @example
* ```typescript
* // Basic usage
* const users = CSV.fromFile<User>('users.csv');
*
* // With header mapping
* const users = CSV.fromFile<User>('users.csv', {
* headerMap: {
* 'user_id': 'id',
* 'first_name': 'profile.firstName'
* }
* });
*
* // With retry options
* const users = CSV.fromFile<User>('users.csv', {
* retry: { maxRetries: 5, logRetries: true }
* });
*
* // With schema validation using Zod
* import { z } from 'zod';
*
* const userSchema = z.object({
* id: z.string().min(1),
* name: z.string().min(1),
* email: z.string().email().optional()
* });
*
* // TypeScript type derived from the schema
* type User = z.infer<typeof userSchema>;
*
* const users = CSV.fromFile<User>('users.csv', {
* schema: {
* rowSchema: userSchema,
* columnSchemas: {
* email: z.string().email()
* },
* validationMode: 'filter'
* }
* });
* ```
*/
static fromFile<T extends Record<string, any>>(
filename: string,
options: CSVReadOptions<T> = {}
): CSV<T> {
const operation = () => {
const resolvedPath = path.resolve(filename);
const fileData = fs.readFileSync(
resolvedPath,
options.fsOptions?.encoding as BufferEncoding || 'utf-8'
);
const rawFullContent = fileData.toString();
let fileAdditionalHeader = '';
const getCsvFromLineValue = (csvOpts?: CsvParseOptions): number | undefined => {
if (!csvOpts) return undefined;
const fromVal = csvOpts.from ?? csvOpts.from_line ?? csvOpts.fromLine;
return typeof fromVal === 'number' && fromVal >= 1 ? fromVal : undefined;
};
let numPreambleLinesToExtract: number | undefined = undefined;
const userSpecifiedFromForData = getCsvFromLineValue(options.csvOptions);
if (typeof options.saveAdditionalHeader === 'number' && options.saveAdditionalHeader > 0) {
numPreambleLinesToExtract = options.saveAdditionalHeader;
} else if (options.saveAdditionalHeader === true && userSpecifiedFromForData && userSpecifiedFromForData > 1) {
numPreambleLinesToExtract = userSpecifiedFromForData - 1;
}
if (numPreambleLinesToExtract && numPreambleLinesToExtract > 0) {
const basePreambleOpts: CsvParseInternalOptions = options.additionalHeaderParseOptions
? { ...options.additionalHeaderParseOptions } : {};
if (!options.additionalHeaderParseOptions && options.csvOptions) {
const RELEVANT_LOW_LEVEL_KEYS: (keyof CsvParseInternalOptions)[] = ['delimiter', 'quote', 'escape', 'record_delimiter', 'recordDelimiter', 'ltrim', 'rtrim', 'trim', 'bom'];
RELEVANT_LOW_LEVEL_KEYS.forEach(key => {
let valueToInherit: any = undefined;
if (options.csvOptions) {
if (key === 'record_delimiter' || key === 'recordDelimiter') { valueToInherit = options.csvOptions.record_delimiter ?? options.csvOptions.recordDelimiter; }
else { valueToInherit = options.csvOptions[key as keyof typeof options.csvOptions]; }
}
if (valueToInherit !== undefined && !(key in basePreambleOpts)) { (basePreambleOpts as any)[key] = valueToInherit; }
});
}
// Force array output and restrict parsing to the preamble lines, overriding
// any record-level options inherited from the main csvOptions.
const finalAdditionalHeaderParseOptions: CsvParseInternalOptions = {
...basePreambleOpts, columns: false, to: numPreambleLinesToExtract,
from: undefined, from_line: undefined, fromLine: undefined, to_line: undefined, toLine: undefined,
skip_empty_lines: undefined, skipEmptyLines: undefined, skip_records_with_error: undefined, skipRecordsWithError: undefined,
skip_records_with_empty_values: undefined, skipRecordsWithEmptyValues: undefined, comment: undefined,
on_record: undefined, onRecord: undefined, auto_parse: undefined, autoParse: undefined, cast: undefined,
cast_date: undefined, castDate: undefined, objname: undefined, info: undefined, raw: undefined,
relax_column_count: undefined, relaxColumnCount: undefined, relax_column_count_less: undefined, relaxColumnCountLess: undefined,
relax_column_count_more: undefined, relaxColumnCountMore: undefined,
};
Object.keys(finalAdditionalHeaderParseOptions).forEach(k => (finalAdditionalHeaderParseOptions as any)[k] === undefined && delete (finalAdditionalHeaderParseOptions as any)[k]);
const parsedPreambleRows: unknown[][] = parseCSV(rawFullContent, finalAdditionalHeaderParseOptions);
const actualPreambleHasContent = parsedPreambleRows.some((row: unknown[]) => Array.isArray(row) && row.some((cell: unknown) => cell !== null && cell !== undefined && String(cell).trim().length > 0));
if (actualPreambleHasContent) {
// Re-stringify the preamble, matching the original content's trailing-newline behavior
let tempPreamble = stringifyCSV(parsedPreambleRows);
const originalTotalLines = rawFullContent.split('\n').length;
if (numPreambleLinesToExtract >= originalTotalLines) {
if (tempPreamble.endsWith('\n') && !rawFullContent.endsWith('\n')) tempPreamble = tempPreamble.slice(0, -1);
else if (!tempPreamble.endsWith('\n') && rawFullContent.endsWith('\n') && rawFullContent.length > 0) tempPreamble += '\n';
} else if (tempPreamble.length > 0 && !tempPreamble.endsWith('\n')) tempPreamble += '\n';
fileAdditionalHeader = tempPreamble;
}
}
const contentForMainParsing = options.transform ? options.transform(rawFullContent.trim()) : rawFullContent.trim();
// Explicitly set columns: true as the default, then spread user options for clarity and consistency
// This ensures the desired default behavior while still allowing user overrides if needed
const finalMainParserOptions: CsvParseInternalOptions = {
columns: true,
...(options.csvOptions || {})
};
const fromLineForData = getCsvFromLineValue(options.csvOptions);
if (fromLineForData !== undefined) { finalMainParserOptions.from_line = fromLineForData; }
delete finalMainParserOptions.from; delete finalMainParserOptions.fromLine;
if (numPreambleLinesToExtract && numPreambleLinesToExtract > 0) {
const startDataAfterPreamble = numPreambleLinesToExtract + 1;
if (!finalMainParserOptions.from_line || finalMainParserOptions.from_line < startDataAfterPreamble) {
finalMainParserOptions.from_line = startDataAfterPreamble;
}
}
if (finalMainParserOptions.from_line !== undefined && finalMainParserOptions.from_line < 1) {
delete finalMainParserOptions.from_line;
}
const toLineForData = finalMainParserOptions.to_line ?? finalMainParserOptions.toLine;
if (toLineForData !== undefined && finalMainParserOptions.to === undefined) { finalMainParserOptions.to = toLineForData; }
delete finalMainParserOptions.to_line; delete finalMainParserOptions.toLine;
// If headerMap is provided, ensure columns is true
if (options.headerMap) {
finalMainParserOptions.columns = true; // headerMap requires objects from csv-parse
}
// We've already set columns: true as default in finalMainParserOptions
// Initial parsing of the data
const dataAfterCsvParse = parseCSV(contentForMainParsing, finalMainParserOptions) as any[];
// Process the data with the shared function (apply header mapping and custom casting)
const baseLineNumber = finalMainParserOptions.from_line || 1;
const parsedData = this.processCSVData<T>(dataAfterCsvParse, options as CSVReadOptions<T & Record<string, any>>, baseLineNumber);
// Basic structural validation
if (options.validateData && parsedData.length > 0) {
const firstDataRowActualLine = finalMainParserOptions.from_line || 1;
if (finalMainParserOptions.columns !== false) {
if (!(parsedData[0] && typeof parsedData[0] === 'object')) {
throw new CSVError(`Expected object rows for validation, but first parsed record (approx. file line ${firstDataRowActualLine}) is not an object.`);
}
const sampleKeys = Object.keys(parsedData[0]);
parsedData.forEach((row, i) => {
if (!(row && typeof row === 'object')) { throw new CSVError(`Row at approx. file line ${firstDataRowActualLine + i} (parsed index ${i}) is not an object as expected.`); }
const relaxCount = finalMainParserOptions.relax_column_count ?? finalMainParserOptions.relaxColumnCount;
if (Object.keys(row).length !== sampleKeys.length && !relaxCount) { throw new CSVError(`Row at approx. file line ${firstDataRowActualLine + i} (parsed index ${i}) has inconsistent column count. Expected ${sampleKeys.length}, got ${Object.keys(row).length}.`); }
});
}
}
// Apply schema validation if configured
if (options.schema && parsedData.length > 0) {
// If schema specifies async validation, throw an error as fromFile is synchronous
if (options.schema.useAsync) {
throw new CSVError(
"Asynchronous schema validation is not supported in fromFile. Use fromFileAsync instead, or set useAsync: false in your schema configuration."
);
}
// Ensure synchronous validation is used
const syncSchema: CSVSchemaConfig<T> = {
...options.schema,
useAsync: false
};
const [validatedData, validationResults] = this._validateWithSchemaSync(
parsedData,
syncSchema,
baseLineNumber
);
// Return validated data and include validation results
return new CSV<T>(validatedData, fileAdditionalHeader, validationResults);
}
// No schema validation, return parsed data as-is
return new CSV<T>(parsedData as T[], fileAdditionalHeader);
}
// Use retry logic if configured
if (options.retry) {
return this.retryOperationSync(
operation,
`Failed to read or parse CSV file: ${filename}`,
options.retry
);
} else {
try {
return operation();
} catch (error) {
throw new CSVError(
`Failed to read or parse CSV file: ${filename}`,
error instanceof Error ? error : new Error(String(error))
);
}
}
}
/**
* Create a CSV instance from an array of objects, with optional schema validation.
*
* @param data - Array of objects representing CSV-like rows.
* @param options - Optional configuration, primarily for schema validation.
* @returns A new CSV instance.
* @throws {CSVError} If schema validation is mode 'error' and fails, or if an async schema is used incorrectly.
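* @example
* ```typescript
* // A minimal sketch with inline rows:
* const csv = CSV.fromData([
*   { id: '1', name: 'Ada' },
*   { id: '2', name: 'Grace' }
* ]);
* ```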
*/
static fromData<T extends Record<string, any>>(
data: (T | Record<string, any>)[], // Input can be Record<string, any> to allow for validation to type T
options?: CSVFromDataOptions<T>
): CSV<T> {
// Create a new array with copies of the input objects to ensure immutability of the input `data` array
let processedData: Record<string, any>[] = Array.isArray(data) ? data.map(row => ({ ...row })) : [];
let finalAdditionalHeader: string | undefined = undefined; // Not applicable for fromData
let validationResults: RowValidationResult<T>[] | undefined = undefined;
// Apply schema validation if configured
if (options?.schema && processedData.length > 0) {
// If schema specifies async validation, throw an error as fromData is synchronous
if (options.schema.useAsync) {
throw new CSVError(
"Asynchronous schema validation is not supported in the synchronous fromData method. " +
"Validate separately using csvInstance.validateAsync() or ensure your schema and useAsync:false are set for synchronous validation."
);
}
// Ensure synchronous validation is used for this synchronous method
const syncSchema: CSVSchemaConfig<T> = {
...options.schema,
useAsync: false, // Force synchronous validation path
};
try {
const [validatedDataOutput, valRes] = this._validateWithSchemaSync(
processedData, // Pass the current data
syncSchema
);
processedData = validatedDataOutput; // Update processedData with the validated (and possibly filtered/transformed) data
validationResults = valRes;
} catch (error) {
// _validateWithSchemaSync throws CSVError directly if validationMode is 'error' and a failure occurs.
// Re-throw if it's already a CSVError, otherwise wrap it.
if (error instanceof CSVError) {
throw error;
}
throw new CSVError(
'Schema validation failed during CSV.fromData',
error instanceof Error ? error : new Error(String(error))
);
}
}
// The final data should conform to T after validation (or be T[] if no validation)
return new CSV<T>(processedData as T[], finalAdditionalHeader, validationResults);
}
/**
* Common function to process parsed CSV data with header mapping and custom casting
* @param dataAfterCsvParse - The initially parsed data from csv-parse
* @param options - Reading options including custom casting
* @param baseLineNumber - Base line number for error reporting (usually 1 or the value of from_line)
* @returns Processed data with header mapping and custom casting applied
*/
private static processCSVData<T>(
dataAfterCsvParse: any[],
options: CSVReadOptions<T & Record<string, any>>,
baseLineNumber: number = 1
): any[] {
// Apply header mapping if specified
let processedData: any[] = dataAfterCsvParse;
if (options.headerMap && dataAfterCsvParse.length > 0 && typeof dataAfterCsvParse[0] === 'object') {
const { fromRowArr } = createHeaderMapFns<T & Record<string, any>>(options.headerMap);
processedData = dataAfterCsvParse.map(row => fromRowArr(row));
}
// Apply custom casting if specified
if (options.customCasts && processedData.length > 0) {
const { definitions, columnCasts, onCastError = 'error' } = options.customCasts;
// Only apply custom casting to object-based rows
if (typeof processedData[0] === 'object' && processedData[0] !== null) {
return processedData.map((row: Record<string, any>, rowIndex: number) => {
const newRow = { ...row };
for (const columnName in row) {
if (Object.prototype.hasOwnProperty.call(row, columnName)) {
const originalValue = row[columnName];
// Prepare string input for custom casters
let valueToTestAndParse: string;
if (typeof originalValue === 'string') {
valueToTestAndParse = originalValue;
} else if (originalValue === null) {
valueToTestAndParse = 'null';
} else if (originalValue === undefined) {
valueToTestAndParse = 'undefined';
} else {
valueToTestAndParse = String(originalValue);
}
// Build casting context
const context: CastingContext = {
column: columnName,
header: false,
index: Object.keys(row).indexOf(columnName),
lines: baseLineNumber + rowIndex,
records: rowIndex,
empty_lines: 0,
invalid_field_length: 0,
quoting: false
};
let castSuccessful = false;
let castedValue: any = originalValue; // Default to original value
// Function to apply a caster
const applyCaster = (caster: Caster<any>): boolean => {
if (caster.test(valueToTestAndParse, context)) {
try {
castedValue = caster.parse(valueToTestAndParse, context);
castSuccessful = true;
return true; // Caster applied successfully
} catch (e) {
if (onCastError === 'error') {
throw new CSVError(
`Custom cast failed for column "${columnName}" at line ${context.lines}, value: "${valueToTestAndParse}". Error: ${(e as Error).message}`,
e
);
} else if (onCastError === 'null') {
castedValue = null;
} else { // 'original'
castedValue = originalValue;
}
return true; // Caster was attempted but failed/handled
}
}
return false; // Caster test failed
};
// 1. Try column-specific casters first
if (columnCasts && columnCasts[columnName as string]) {
const columnRule = columnCasts[columnName as string];
const rulesToTry = Array.isArray(columnRule) ? columnRule : [columnRule];
for (const rule of rulesToTry) {
let casterToUse: Caster<any> | undefined;
if (typeof rule === 'string' && definitions && definitions[rule as keyof CustomCastDefinition]) {
casterToUse = definitions[rule as keyof CustomCastDefinition];
} else if (typeof rule === 'object' && rule !== null && 'test' in rule && 'parse' in rule) {
casterToUse = rule as Caster<any>;
}
if (casterToUse && applyCaster(casterToUse)) {
break; // First successful caster wins
}
}
}
// 2. If no column-specific caster succeeded, try global casters
if (!castSuccessful && definitions) {
// Predefined order for more predictable behavior
const orderedGlobalKeys: (keyof CustomCastDefinition)[] = [
'null', 'boolean', 'number', 'date', 'object', 'array', 'string'
];
for (const defKey of orderedGlobalKeys) {
const globalCaster = definitions[defKey];
if (globalCaster && applyCaster(globalCaster)) {
break; // First successful global caster wins
}
}
}
// Set the potentially modified value in the new row
newRow[columnName] = castedValue;
}
}
return newRow as T;
});
}
}
// No header mapping or custom casting needed, or non-object rows
return processedData as (T & Record<string, any>)[];
}
/**
* Create a CSV instance from a string
* @param csvString - CSV content as a string
* @param options - CSV reading options including custom casting
* @returns A new CSV instance
* @throws {CSVError} If parsing fails
*/
static fromString<T extends Record<string, any>>(
csvString: string,
options: CSVReadOptions<T> = { csvOptions: { columns: true } }
): CSV<T> {
try {
// Set columns: true by default to ensure consistent behavior, then allow user overrides
const csvOptions = {
columns: true,
...options.csvOptions
};
let dataAfterCsvParse = parseCSV(csvString, csvOptions) as any[];
// Process the data with the shared function
const parsedData = this.processCSVData<T>(dataAfterCsvParse, options);
// Apply schema validation if configured
if (options.schema && parsedData.length > 0) {
// If schema specifies async validation, throw an error as fromString is synchronous
if (options.schema.useAsync) {
throw new CSVError(
"Asynchronous schema validation is not supported in fromString. Use fromFileAsync or validateAsync instead, or set useAsync: false in your schema configuration."
);
}
// Ensure synchronous validation is used
const syncSchema: CSVSchemaConfig<T> = {
...options.schema,
useAsync: false
};
const [validatedData, validationResults] = this._validateWithSchemaSync(
parsedData,
syncSchema
);
// Return validated data and include validation results
return new CSV<T>(validatedData, undefined, validationResults);
}
return new CSV<T>(parsedData as T[]);
} catch (error) {
throw new CSVError('Failed to parse CSV string', error instanceof Error ? error : new Error(String(error)));
}
}
/**
* Create a CSV instance from a readable stream
* @param stream - Readable stream containing CSV data
* @param options - CSV reading options including custom casting
* @returns Promise resolving to a new CSV instance
* @throws {CSVError} If parsing fails
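* @example
* ```typescript
* // A minimal sketch (hypothetical path and row type):
* const stream = fs.createReadStream('users.csv');
* const users = await CSV.fromStream<User>(stream);
* ```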
*/
static async fromStream<T extends Record<string, any>>(
stream: NodeJS.ReadableStream,
options: CSVReadOptions<T> | { columns?: boolean } = { columns: true }
): Promise<CSV<T>> {
try {
// Handle the case where options is just simple options
const readOptions: CSVReadOptions<T> = 'csvOptions' in options
? options as CSVReadOptions<T>
: { csvOptions: options } as CSVReadOptions<T>;
// Consistent approach: Set columns: true by default for proper object-based parsing
const csvParseOptions = {
columns: true,
...(readOptions.csvOptions || {})
};
return new Promise((resolve, reject) => {
const data: any[] = [];
const parser = parseCSVAsync(csvParseOptions);
parser.on('readable', () => {
let record;
while ((record = parser.read()) !== null) {
data.push(record);
}
});
parser.on('error', (err) => {
reject(new CSVError('Failed to parse CSV stream', err instanceof Error ? err : new Error(String(err))));
});
parser.on('end', () => {
try {
// Process the data with the shared function (apply header mapping and custom casting)
const processedData = this.processCSVData<T>(data, readOptions as CSVReadOptions<T & Record<string, any>>);
// Apply schema validation if configured
if (readOptions.schema && processedData.length > 0) {
try {
const [validatedData, validationResults] = this._validateWithSchema(
processedData,
readOptions.schema
);
// Return validated data and include validation results
resolve(new CSV<T>(validatedData, undefined, validationResults));
} catch (validationError) {
reject(new CSVError('CSV validation failed', validationError instanceof Error ? validationError : new Error(String(validationError))));
}
} else {
// No schema validation, return processed data as-is
resolve(new CSV<T>(processedData as T[]));
}
} catch (error) {
reject(new CSVError('Failed to process CSV stream data', error instanceof Error ? error : new Error(String(error))));
}
});
stream.pipe(parser);
});
} catch (error) {
throw new CSVError('Failed to parse CSV stream', error instanceof Error ? error : new Error(String(error)));
}
}
/**
* Create a CSV instance from a file asynchronously using streams
* @param filename - Path to the CSV file
* @param options - Reading options including custom casting
* @returns Promise resolving to a new CSV instance
* @throws {CSVError} If file reading or parsing fails
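* @example
* ```typescript
* // A minimal sketch (hypothetical file and row type):
* const users = await CSV.fromFileAsync<User>('users.csv', {
*   csvOptions: { columns: true }
* });
* ```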
*/
static async fromFileAsync<T extends Record<string, any>>(
filename: string,
options: CSVReadOptions<T> = {}
): Promise<CSV<T>> {
try {
const resolvedPath = path.resolve(filename);
const stream = fs.createReadStream(resolvedPath, options.fsOptions);
// Check if the schema might need async validation
if (options.schema && options.schema.useAsync === undefined) {
// Set useAsync to true for fromFileAsync to ensure proper handling of potentially async schemas
const schemaWithAsyncOption = {
...options.schema,
useAsync: true
};
// Pass the options to fromStream with updated schema
return CSV.fromStream<T>(stream, {
...options,
csvOptions: options.csvOptions || { columns: true },
schema: schemaWithAsyncOption
});
}
// Pass the options to fromStream, spreading first so the csvOptions default
// below is not clobbered by an undefined csvOptions key on options
return CSV.fromStream<T>(stream, {
...options,
csvOptions: options.csvOptions || { columns: true }
});
} catch (error) {
throw new CSVError(
`Failed to read or parse CSV file asynchronously: ${filename}`,
error instanceof Error ? error : new Error(String(error))
);
}
}
/**
* Write the current data to a CSV file
* @param filename - Destination file path
* @param options - Writing options
* @throws {CSVError} If writing fails after retries
* @example
* ```typescript
* // Basic writing
* users.writeToFile('users_export.csv');
*
* // With header mapping
* users.writeToFile('users_export.csv', {
* headerMap: {
* 'id': 'ID',
* 'profile.firstName': 'First Name',
* 'profile.lastName': 'Last Name'
* }
* });
*
* // With streaming for large files
* users.writeToFile('users_export.csv', {
* streaming: true,
* streamingThreshold: 500 // Default is 1000
* });
* ```
*/
writeToFile(filename: string, options: CSVWriteOptions<T> = {}): void {
const operation = () => {
const outputPath = filename.endsWith('.csv')
? filename
: `${filename}.csv`;
const streamingThreshold = options.streamingThreshold || 1000;
// Apply header mapping if provided
if (options.headerMap) {
const stringifyOptions = options.stringifyOptions || { header: true };
const headers = Array.isArray(stringifyOptions.header)
? stringifyOptions.header
: Object.keys(this.data[0] || {});
const { toRowArr } = createHeaderMapFns<T>(options.headerMap);
// Handle streaming with header map
if (options.streaming && this.data.length > streamingThreshold) {
// Use streaming for large datasets with header mapping
const headerToPrepend = options.additionalHeader ?? this.additionalHeader ?? '';
const writable = fs.createWriteStream(outputPath, { encoding: 'utf-8' });
if (headerToPrepend) {
writable.write(headerToPrepend);
}
// Create transform stream for header mapping
const headerMapTransform = new Transform({
objectMode: true,
transform(chunk, encoding, callback) {
try {
const mappedRow = toRowArr(chunk, headers);
callback(null, mappedRow);
} catch (error) {
callback(error as Error);
}
}
});
// Create stringifier with appropriate options
// If header is true, make sure it's handled correctly in stringifyOptions
const csvStringifyOptions = {
...stringifyOptions,
header: Array.isArray(stringifyOptions.header) ? headers : stringifyOptions.header
};
// Create a transform stream using stringifyCSVAsync
// Use a type assertion to help TypeScript understand this is valid
// The CSV module expects options as the first argument
// The 'as any' casting is necessary because csvStringifyOptions' structure (with header property that
// could be boolean or string[]) may not perfectly align with stringifyCSVAsync's expected type
const stringifier = stringifyCSVAsync(csvStringifyOptions as any) as Transform;
// Add a simple error handler to the stringifier
stringifier.on('error', (err) => {
console.error('CSV stringification error:', err);
});
// Create pipeline
Readable.from(this.data)
.pipe(headerMapTransform)
.pipe(stringifier)
.pipe(writable);
return;
} else {
// Standard in-memory processing for smaller datasets
// Transform the data through the header map
const rows = this.data.map(item => toRowArr(item, headers));
// Add headers as the first row if needed
if (stringifyOptions.header === true) {
rows.unshift(headers);
}
// Manually stringify without the header option (handled above); note this
// simple writer assumes a comma delimiter
const csvString = rows.map(row =>
row.map(cell => {
if (cell === null || cell === undefined) return '';
return typeof cell === 'string' && (cell.includes(',') || cell.includes('"') || cell.includes('\n') || cell.includes('\r'))
? `"${cell.replace(/"/g, '""')}"`
: String(cell);
}).join(',')
).join('\n');
fs.writeFileSync(
outputPath,
(options.additionalHeader ?? this.additionalHeader ?? '') + csvString,
'utf-8'
);
return;
}
}
// Standard CSV writing without header mapping
if (options.streaming && this.data.length > streamingThreshold) {
// Use streaming for large datasets
// Use a type assertion to help TypeScript understand this is valid
const stringifier = stringifyCSVAsync(
options.stringifyOptions || { header: true }
) as Transform;
const readable = Readable.from(this.data);
const writable = fs.createWriteStream(outputPath, { encoding: 'utf-8' });
const headerToPrepend = options.additionalHeader ?? this.additionalHeader ?? '';
if (headerToPrepend) {
writable.write(headerToPrepend);
}
readable.pipe(stringifier).pipe(writable);
} else {
// Use synchronous version for smaller datasets
const csvString = stringifyCSV(
this.data,
options.stringifyOptions || { header: true }
);
fs.writeFileSync(
outputPath,
(options.additionalHeader ?? this.additionalHeader ?? '') + csvString,
'utf-8'
);
}
};
// Use retry logic if configured
if (