UNPKG

csv-parse

Version:

CSV parsing implementing the Node.js `stream.Transform` API

csv.js.org/parse

adaltas/node-csv

1,461 lines (1,447 loc) • 68.1 kB

JavaScript

'use strict';

/**
 * Error type thrown by the parser and by option normalization.
 *
 * Every enumerable property of each `contexts` object is copied onto the
 * error instance: buffers are decoded to strings, other values are deep
 * copied through a JSON round trip.
 *
 * @param {string} code - Machine readable error code, exposed as `error.code`.
 * @param {string|string[]} message - Message, or fragments joined with a space.
 * @param {object} [options] - Parser options; only `encoding` is read, to
 *   decode buffer values. May be omitted.
 * @param {...object} contexts - Objects whose properties are copied onto the error.
 */
class CsvError extends Error {
  constructor(code, message, options, ...contexts) {
    if (Array.isArray(message)) message = message.join(" ").trim();
    super(message);
    if (Error.captureStackTrace !== undefined) {
      Error.captureStackTrace(this, CsvError);
    }
    this.code = code;
    // `options` is omitted by some callers and `encoding` may be `null`;
    // `toString(undefined)` decodes as utf8 whereas `toString(null)` throws.
    const encoding =
      options == null || options.encoding == null
        ? undefined
        : options.encoding;
    for (const context of contexts) {
      for (const key in context) {
        const value = context[key];
        if (Buffer.isBuffer(value)) {
          this[key] = value.toString(encoding);
        } else if (value == null) {
          this[key] = value;
        } else {
          // Values without a JSON representation (functions, symbols)
          // serialize to `undefined`, which `JSON.parse` would reject.
          const serialized = JSON.stringify(value);
          this[key] =
            serialized === undefined ? undefined : JSON.parse(serialized);
        }
      }
    }
  }
}

/**
 * Tell whether `obj` is a non-null, non-array object.
 *
 * @param {*} obj - Value to test.
 * @returns {boolean}
 */
const is_object = function (obj) {
  return typeof obj === "object" && obj !== null && !Array.isArray(obj);
};

/**
 * Normalize a user provided `columns` array into column descriptors.
 *
 * - `undefined`, `null` and `false` become `{ disabled: true }`;
 * - strings and numbers become `{ name: "<value>" }`;
 * - object literals are kept as-is but must carry a string `name`.
 *
 * @param {Array} columns - Raw column definitions.
 * @returns {object[]} One descriptor per input position.
 * @throws {CsvError} `CSV_OPTION_COLUMNS_MISSING_NAME` or
 *   `CSV_INVALID_COLUMN_DEFINITION` on an unsupported definition.
 */
const normalize_columns_array = function (columns) {
  const normalizedColumns = [];
  // Index loop on purpose: holes in a sparse array count as disabled columns.
  for (let i = 0, l = columns.length; i < l; i++) {
    const column = columns[i];
    if (column === undefined || column === null || column === false) {
      normalizedColumns[i] = { disabled: true };
    } else if (typeof column === "string" || typeof column === "number") {
      normalizedColumns[i] = { name: `${column}` };
    } else if (is_object(column)) {
      if (typeof column.name !== "string") {
        throw new CsvError("CSV_OPTION_COLUMNS_MISSING_NAME", [
          "Option columns missing name:",
          `property "name" is required at position ${i}`,
          "when column is an object literal",
        ]);
      }
      normalizedColumns[i] = column;
    } else {
      throw new CsvError("CSV_INVALID_COLUMN_DEFINITION", [
        "Invalid column definition:",
        "expect a string or a literal object,",
        `got ${JSON.stringify(column)} at position ${i}`,
      ]);
    }
  }
  return normalizedColumns;
};

/**
 * Growable byte buffer; capacity doubles whenever it is exhausted.
 *
 * `length` is the number of meaningful bytes, `size` the allocated capacity.
 */
class ResizeableBuffer {
  /**
   * @param {number} [size=100] - Initial capacity in bytes.
   */
  constructor(size = 100) {
    this.size = size;
    this.length = 0;
    this.buf = Buffer.allocUnsafe(size);
  }

  /**
   * Insert a byte or a whole buffer before the current content.
   *
   * @param {number|Buffer} val - Byte value or buffer to insert.
   */
  prepend(val) {
    if (Buffer.isBuffer(val)) {
      const length = this.length + val.length;
      if (length >= this.size) {
        // Grow until the content fits with one spare byte; a single
        // doubling is not enough when `val` is larger than the buffer.
        this.resize(length + 1);
      }
      const previous = this.buf;
      this.buf = Buffer.allocUnsafe(this.size);
      val.copy(this.buf, 0);
      previous.copy(this.buf, val.length, 0, this.length);
      this.length = length;
    } else {
      const length = this.length;
      if (length === this.size) {
        this.resize();
      }
      // Shift the content one byte to the right; copyWithin handles overlap.
      this.buf.copyWithin(1, 0, length);
      this.buf[0] = val;
      this.length = length + 1;
    }
  }

  /**
   * Add one byte after the current content.
   *
   * @param {number} val - Byte value to append.
   */
  append(val) {
    if (this.length === this.size) {
      this.resize();
    }
    this.buf[this.length++] = val;
  }

  /**
   * @returns {Buffer} An independent copy of the meaningful bytes.
   */
  clone() {
    return Buffer.from(this.buf.subarray(0, this.length));
  }

  /**
   * Double the capacity, repeatedly if needed, keeping the content.
   *
   * @param {number} [minSize=0] - Capacity that must be reached at least.
   */
  resize(minSize = 0) {
    const length = this.length;
    // A zero capacity could never grow by doubling; start from one.
    let size = Math.max(this.size, 1) * 2;
    while (size < minSize) {
      size *= 2;
    }
    const buf = Buffer.allocUnsafe(size);
    this.buf.copy(buf, 0, 0, Math.min(length, this.buf.length));
    this.size = size;
    this.buf = buf;
  }

  /**
   * Decode the content.
   *
   * @param {string} [encoding] - When set, a string decoded with that
   *   encoding is returned; otherwise a copy of the raw bytes is returned.
   * @returns {string|Uint8Array}
   */
  toString(encoding) {
    const content = this.buf.subarray(0, this.length);
    if (encoding) {
      return content.toString(encoding);
    }
    return Uint8Array.prototype.slice.call(content);
  }

  /**
   * @returns {string} The content decoded as utf8.
   */
  toJSON() {
    return this.toString("utf8");
  }

  /** Drop the content while keeping the allocated capacity. */
  reset() {
    this.length = 0;
  }
}
/**
 * Build the initial parser state from normalized options.
 *
 * @param {object} options - Options as returned by `normalize_options`.
 * @returns {object} Fresh state, including the trim characters encoded
 *   under `options.encoding` and the look-ahead size `needMoreDataSize`.
 */
const init_state = function (options) {
  // `Buffer.from(string, encoding)` falls back to utf8 when `encoding` is
  // not a string, whereas `buf.toString(null)` throws: resolve it once.
  const encoding =
    typeof options.encoding === "string" ? options.encoding : "utf8";
  // ECMAScript WhiteSpace + LineTerminator codepoints, encoded under
  // `options.encoding`. Aligns trimming with `String.prototype.trim()`.
  // https://tc39.es/ecma262/#sec-white-space
  // https://tc39.es/ecma262/#sec-line-terminators
  const timchars = [
    // Basic Latin
    0x0020, // SPACE
    0x0009, // CHARACTER TABULATION (HT)
    0x000a, // LINE FEED (LF)
    0x000d, // CARRIAGE RETURN (CR)
    0x000c, // FORM FEED (FF)
    0x000b, // LINE TABULATION (VT)
    // Latin-1 Supplement
    0x00a0, // NO-BREAK SPACE (NBSP)
    // Ogham
    0x1680, // OGHAM SPACE MARK
    // General Punctuation
    0x2000, // EN QUAD
    0x2001, // EM QUAD
    0x2002, // EN SPACE
    0x2003, // EM SPACE
    0x2004, // THREE-PER-EM SPACE
    0x2005, // FOUR-PER-EM SPACE
    0x2006, // SIX-PER-EM SPACE
    0x2007, // FIGURE SPACE
    0x2008, // PUNCTUATION SPACE
    0x2009, // THIN SPACE
    0x200a, // HAIR SPACE
    0x2028, // LINE SEPARATOR
    0x2029, // PARAGRAPH SEPARATOR
    0x202f, // NARROW NO-BREAK SPACE (NNBSP)
    0x205f, // MEDIUM MATHEMATICAL SPACE (MMSP)
    0x3000, // IDEOGRAPHIC SPACE
    0xfeff, // ZERO WIDTH NO-BREAK SPACE (BOM)
  ].reduce((acc, codepoint) => {
    const char = String.fromCharCode(codepoint);
    const encoded = Buffer.from(char, encoding);
    // Keep only codepoints the target encoding can represent. Single-byte
    // encodings do not fail on the others: `latin1`/`ascii` truncate to the
    // low byte (U+2028 becomes `(`), which would make unrelated input bytes
    // trimmable. A decode round trip detects every such substitution.
    if (encoded.toString(encoding) !== char) {
      return acc;
    }
    acc.push(encoded);
    return acc;
  }, []);
  // First-byte lookup table for `__isCharTrimable`. Non-whitespace bytes
  // (the common case) bail out in O(1) without scanning every timchar.
  const timcharFirstBytes = new Uint8Array(256);
  for (const t of timchars) timcharFirstBytes[t[0]] = 1;
  return {
    bomSkipped: false,
    bufBytesStart: 0,
    castField: options.cast_function,
    commenting: false,
    delimiterBufPrevious: undefined,
    delimiterDiscovered: false,
    // Current error encountered by a record
    error: undefined,
    enabled: options.from_line === 1,
    escaping: false,
    escapeIsQuote:
      Buffer.isBuffer(options.escape) &&
      Buffer.isBuffer(options.quote) &&
      Buffer.compare(options.escape, options.quote) === 0,
    // columns can be `false`, `true`, `Array`
    expectedRecordLength: Array.isArray(options.columns)
      ? options.columns.length
      : undefined,
    field: new ResizeableBuffer(20),
    firstLineToHeaders: options.cast_first_line_to_header,
    needMoreDataSize: Math.max(
      // Skip if the remaining buffer smaller than comment
      options.comment !== null ? options.comment.length : 0,
      // Skip if the remaining buffer can be delimiter
      ...(options.delimiter
        ? options.delimiter.map((delimiter) => delimiter.length)
        : []),
      // Auto discovery of delimiter is limited to 1 character
      options.delimiter_auto ? 1 : 0,
      // Skip if the remaining buffer can be escape sequence
      options.quote !== null ? options.quote.length : 0,
      // Skip if the remaining buffer can be a multi-byte trim character
      ...timchars.map((t) => t.length),
    ),
    previousBuf: undefined,
    quoting: false,
    stop: false,
    rawBuffer: new ResizeableBuffer(100),
    record: [],
    recordHasError: false,
    record_length: 0,
    recordDelimiterMaxLength:
      options.record_delimiter.length === 0
        ? 0
        : Math.max(...options.record_delimiter.map((v) => v.length)),
    trimChars: [
      Buffer.from(" ", options.encoding)[0],
      Buffer.from("\t", options.encoding)[0],
    ],
    wasQuoting: false,
    wasRowDelimiter: false,
    timchars: timchars,
    timcharFirstBytes: timcharFirstBytes,
  };
};

/**
 * Convert a camelCase identifier to snake_case.
 *
 * @param {string} str - Identifier such as `fromLine`.
 * @returns {string} Every uppercase ASCII letter replaced by `_` followed by
 *   its lowercase form, e.g. `from_line`.
 */
const underscore = function (str) {
  return str.replace(/([A-Z])/g, function (_, match) {
    return "_" + match.toLowerCase();
  });
};
/**
 * Validate user options and convert them to their internal representation.
 *
 * Keys are converted from camelCase to snake_case, defaults are applied and
 * string based options (`comment`, `delimiter`, `escape`, `quote`,
 * `record_delimiter`) are converted to buffers using `options.encoding`.
 *
 * @param {object} [opts] - Raw user options, in camelCase or snake_case.
 * @returns {object} A new object holding the normalized options.
 * @throws {CsvError|Error} When an option carries an unsupported value.
 */
const normalize_options = function (opts) {
  const options = {};
  // Merge with user options
  for (const opt in opts) {
    options[underscore(opt)] = opts[opt];
  }

  // Boolean flag: `undefined` and `null` default to `false`.
  const normalize_boolean = function (name) {
    const value = options[name];
    if (typeof value === "boolean") return;
    if (value === undefined || value === null) {
      options[name] = false;
      return;
    }
    throw new Error(
      `Invalid Option: ${name} must be a boolean, got ${JSON.stringify(value)}`,
    );
  };

  // Flag which is either activated with `true` or left disabled.
  const normalize_true_only = function (name) {
    const value = options[name];
    if (value === undefined || value === null || value === false) {
      options[name] = false;
    } else if (value !== true) {
      throw new Error(
        `Invalid Option: ${name} must be true, got ${JSON.stringify(value)}`,
      );
    }
  };

  // Integer option, also accepted as a numeric string. Errors report the
  // value as received, whatever the casing of the key used by the caller.
  const normalize_integer = function (name, fallback, min, expectation) {
    const received = options[name];
    if (received === undefined || received === null) {
      options[name] = fallback;
      return;
    }
    if (received === fallback) return;
    let value = received;
    if (typeof value === "string" && /\d+/.test(value)) {
      value = parseInt(value, 10);
    }
    if (!Number.isInteger(value)) {
      throw new Error(
        `Invalid Option: ${name} must be an integer, got ${JSON.stringify(received)}`,
      );
    }
    if (value < min) {
      throw new Error(
        `Invalid Option: ${name} must be ${expectation}, got ${JSON.stringify(received)}`,
      );
    }
    options[name] = value;
  };

  // Normalize option `encoding`
  // Note: defined first because other options depends on it
  // to convert chars/strings into buffers.
  if (options.encoding === undefined || options.encoding === true) {
    options.encoding = "utf8";
  } else if (options.encoding === null || options.encoding === false) {
    options.encoding = null;
  } else if (typeof options.encoding !== "string") {
    throw new CsvError(
      "CSV_INVALID_OPTION_ENCODING",
      [
        "Invalid option encoding:",
        "encoding must be a string or null to return a buffer,",
        `got ${JSON.stringify(options.encoding)}`,
      ],
      options,
    );
  }

  // Normalize option `bom`
  if (
    options.bom === undefined ||
    options.bom === null ||
    options.bom === false
  ) {
    options.bom = false;
  } else if (options.bom !== true) {
    throw new CsvError(
      "CSV_INVALID_OPTION_BOM",
      [
        "Invalid option bom:",
        "bom must be true,",
        `got ${JSON.stringify(options.bom)}`,
      ],
      options,
    );
  }

  // Normalize option `cast`
  options.cast_function = null;
  if (
    options.cast === undefined ||
    options.cast === null ||
    options.cast === false ||
    options.cast === ""
  ) {
    options.cast = undefined;
  } else if (typeof options.cast === "function") {
    options.cast_function = options.cast;
    options.cast = true;
  } else if (options.cast !== true) {
    throw new CsvError(
      "CSV_INVALID_OPTION_CAST",
      [
        "Invalid option cast:",
        "cast must be true or a function,",
        `got ${JSON.stringify(options.cast)}`,
      ],
      options,
    );
  }

  // Normalize option `cast_date`
  if (
    options.cast_date === undefined ||
    options.cast_date === null ||
    options.cast_date === false ||
    options.cast_date === ""
  ) {
    options.cast_date = false;
  } else if (options.cast_date === true) {
    options.cast_date = function (value) {
      const date = Date.parse(value);
      return !Number.isNaN(date) ? new Date(date) : value;
    };
  } else if (typeof options.cast_date !== "function") {
    throw new CsvError(
      "CSV_INVALID_OPTION_CAST_DATE",
      [
        "Invalid option cast_date:",
        "cast_date must be true or a function,",
        `got ${JSON.stringify(options.cast_date)}`,
      ],
      options,
    );
  }

  // Normalize option `columns`
  options.cast_first_line_to_header = undefined;
  if (options.columns === true) {
    // Fields in the first line are converted as-is to columns
    options.cast_first_line_to_header = undefined;
  } else if (typeof options.columns === "function") {
    options.cast_first_line_to_header = options.columns;
    options.columns = true;
  } else if (Array.isArray(options.columns)) {
    options.columns = normalize_columns_array(options.columns);
  } else if (
    options.columns === undefined ||
    options.columns === null ||
    options.columns === false
  ) {
    options.columns = false;
  } else {
    throw new CsvError(
      "CSV_INVALID_OPTION_COLUMNS",
      [
        "Invalid option columns:",
        "expect an array, a function or true,",
        `got ${JSON.stringify(options.columns)}`,
      ],
      options,
    );
  }

  // Normalize option `group_columns_by_name`
  if (
    options.group_columns_by_name === undefined ||
    options.group_columns_by_name === null ||
    options.group_columns_by_name === false
  ) {
    options.group_columns_by_name = false;
  } else if (options.group_columns_by_name !== true) {
    throw new CsvError(
      "CSV_INVALID_OPTION_GROUP_COLUMNS_BY_NAME",
      [
        "Invalid option group_columns_by_name:",
        "expect an boolean,",
        `got ${JSON.stringify(options.group_columns_by_name)}`,
      ],
      options,
    );
  } else if (options.columns === false) {
    throw new CsvError(
      "CSV_INVALID_OPTION_GROUP_COLUMNS_BY_NAME",
      [
        "Invalid option group_columns_by_name:",
        "the `columns` mode must be activated.",
      ],
      options,
    );
  }

  // Normalize option `comment`
  if (
    options.comment === undefined ||
    options.comment === null ||
    options.comment === false ||
    options.comment === ""
  ) {
    options.comment = null;
  } else {
    if (typeof options.comment === "string") {
      options.comment = Buffer.from(options.comment, options.encoding);
    }
    if (!Buffer.isBuffer(options.comment)) {
      throw new CsvError(
        "CSV_INVALID_OPTION_COMMENT",
        [
          "Invalid option comment:",
          "comment must be a buffer or a string,",
          `got ${JSON.stringify(options.comment)}`,
        ],
        options,
      );
    }
  }

  // Normalize option `comment_no_infix`
  if (
    options.comment_no_infix === undefined ||
    options.comment_no_infix === null ||
    options.comment_no_infix === false
  ) {
    options.comment_no_infix = false;
  } else if (options.comment_no_infix !== true) {
    throw new CsvError(
      "CSV_INVALID_OPTION_COMMENT",
      [
        "Invalid option comment_no_infix:",
        "value must be a boolean,",
        `got ${JSON.stringify(options.comment_no_infix)}`,
      ],
      options,
    );
  }

  // Normalize option `delimiter_auto`
  if (
    options.delimiter_auto === undefined ||
    options.delimiter_auto === null ||
    options.delimiter_auto === false
  ) {
    options.delimiter_auto = false;
  } else if (options.delimiter_auto === true) {
    options.delimiter_auto = {};
  } else if (!is_object(options.delimiter_auto)) {
    throw new CsvError(
      "CSV_INVALID_OPTION_DELIMITER_AUTO",
      [
        "Invalid option delimiter_auto:",
        "delimiter_auto must be a boolean or a configuration object,",
        `got ${JSON.stringify(options.delimiter_auto)}`,
      ],
      options,
    );
  }
  if (options.delimiter_auto) {
    const auto = options.delimiter_auto;
    if (auto.preferred === undefined) {
      // Weight applied to the score of usual delimiters, keyed by char code
      auto.preferred = {
        [",".charCodeAt(0)]: 1.8,
        ["\t".charCodeAt(0)]: 1.8,
        [";".charCodeAt(0)]: 1.6,
        [" ".charCodeAt(0)]: 1.6,
        [":".charCodeAt(0)]: 1.5,
        [".".charCodeAt(0)]: 1.4,
        ["/".charCodeAt(0)]: 1.4,
      };
    } else if (!is_object(auto.preferred)) {
      throw new CsvError(
        "CSV_INVALID_OPTION_DELIMITER_AUTO",
        [
          "Invalid option delimiter_auto:",
          "preferred must be an object,",
          `got ${JSON.stringify(auto.preferred)}`,
        ],
        options,
      );
    }
    if (auto.score === undefined) {
      auto.score = (info, config) => {
        return (
          (info.total - info.std) * (config.preferred[info.char_code] || 1)
        );
      };
    } else if (typeof auto.score !== "function") {
      throw new CsvError(
        "CSV_INVALID_OPTION_DELIMITER_AUTO",
        [
          "Invalid option delimiter_auto:",
          "score must be a function,",
          `got ${JSON.stringify(auto.score)}`,
        ],
        options,
      );
    }
    if (auto.size === undefined) {
      auto.size = 2048;
    } else if (typeof auto.size !== "number") {
      throw new CsvError(
        "CSV_INVALID_OPTION_DELIMITER_AUTO",
        [
          "Invalid option delimiter_auto:",
          "size must be a number,",
          `got ${JSON.stringify(auto.size)}`,
        ],
        options,
      );
    }
  }

  // Normalize option `delimiter`
  const delimiter_json = JSON.stringify(options.delimiter);
  if (options.delimiter_auto !== false) {
    // The delimiter is discovered from the data, nothing to validate yet
    options.delimiter = [];
  }
  if (!Array.isArray(options.delimiter)) {
    if (
      options.delimiter === undefined ||
      options.delimiter === null ||
      options.delimiter === false
    ) {
      options.delimiter = Buffer.from(",", options.encoding);
    }
    options.delimiter = [options.delimiter];
  }
  options.delimiter = options.delimiter.map(function (delimiter) {
    if (typeof delimiter === "string") {
      delimiter = Buffer.from(delimiter, options.encoding);
    }
    if (!Buffer.isBuffer(delimiter) || delimiter.length === 0) {
      throw new CsvError(
        "CSV_INVALID_OPTION_DELIMITER",
        [
          "Invalid option delimiter:",
          "delimiter must be a non empty string or buffer or array of string|buffer,",
          `got ${delimiter_json}`,
        ],
        options,
      );
    }
    return delimiter;
  });

  // Normalize option `escape`
  if (options.escape === undefined || options.escape === true) {
    options.escape = Buffer.from('"', options.encoding);
  } else if (typeof options.escape === "string") {
    options.escape = Buffer.from(options.escape, options.encoding);
  } else if (options.escape === null || options.escape === false) {
    options.escape = null;
  }
  if (options.escape !== null && !Buffer.isBuffer(options.escape)) {
    throw new Error(
      `Invalid Option: escape must be a buffer, a string or a boolean, got ${JSON.stringify(options.escape)}`,
    );
  }

  // Normalize option `from`
  normalize_integer("from", 1, 0, "a positive integer");

  // Normalize option `from_line`
  normalize_integer("from_line", 1, 1, "a positive integer greater than 0");

  // Normalize options `ignore_last_delimiters`
  if (
    options.ignore_last_delimiters === undefined ||
    options.ignore_last_delimiters === null
  ) {
    options.ignore_last_delimiters = false;
  } else if (typeof options.ignore_last_delimiters === "number") {
    options.ignore_last_delimiters = Math.floor(options.ignore_last_delimiters);
    if (options.ignore_last_delimiters === 0) {
      options.ignore_last_delimiters = false;
    }
  } else if (typeof options.ignore_last_delimiters !== "boolean") {
    throw new CsvError(
      "CSV_INVALID_OPTION_IGNORE_LAST_DELIMITERS",
      [
        "Invalid option `ignore_last_delimiters`:",
        "the value must be a boolean value or an integer,",
        `got ${JSON.stringify(options.ignore_last_delimiters)}`,
      ],
      options,
    );
  }
  if (options.ignore_last_delimiters === true && options.columns === false) {
    throw new CsvError(
      "CSV_IGNORE_LAST_DELIMITERS_REQUIRES_COLUMNS",
      [
        "The option `ignore_last_delimiters`",
        "requires the activation of the `columns` option",
      ],
      options,
    );
  }

  // Normalize option `info`
  normalize_true_only("info");

  // Normalize option `max_record_size`
  if (
    options.max_record_size === undefined ||
    options.max_record_size === null ||
    options.max_record_size === false
  ) {
    options.max_record_size = 0;
  } else {
    const received = options.max_record_size;
    let size = received;
    if (typeof size === "string" && /\d+/.test(size)) {
      size = parseInt(size, 10);
    }
    // Validate after the conversion: strings such as "abc1" or "-5"
    // contain digits but do not yield a usable size.
    if (!Number.isInteger(size) || size < 0) {
      throw new Error(
        `Invalid Option: max_record_size must be a positive integer, got ${JSON.stringify(received)}`,
      );
    }
    options.max_record_size = size;
  }

  // Normalize option `objname`
  if (
    options.objname === undefined ||
    options.objname === null ||
    options.objname === false
  ) {
    options.objname = undefined;
  } else if (Buffer.isBuffer(options.objname)) {
    if (options.objname.length === 0) {
      throw new Error(`Invalid Option: objname must be a non empty buffer`);
    }
    // The buffer is kept as-is when no encoding is defined
    if (options.encoding !== null) {
      options.objname = options.objname.toString(options.encoding);
    }
  } else if (typeof options.objname === "string") {
    if (options.objname.length === 0) {
      throw new Error(`Invalid Option: objname must be a non empty string`);
    }
  } else if (typeof options.objname !== "number") {
    throw new Error(
      `Invalid Option: objname must be a string or a buffer, got ${options.objname}`,
    );
  }
  if (options.objname !== undefined) {
    if (typeof options.objname === "number") {
      if (options.columns !== false) {
        throw new Error(
          "Invalid Option: objname index cannot be combined with columns or be defined as a field",
        );
      }
    } else if (options.columns === false) {
      // A string or a buffer
      throw new Error(
        "Invalid Option: objname field must be combined with columns or be defined as an index",
      );
    }
  }

  // Normalize option `on_record`
  if (options.on_record === undefined || options.on_record === null) {
    options.on_record = undefined;
  } else if (typeof options.on_record !== "function") {
    throw new CsvError(
      "CSV_INVALID_OPTION_ON_RECORD",
      [
        "Invalid option `on_record`:",
        "expect a function,",
        `got ${JSON.stringify(options.on_record)}`,
      ],
      options,
    );
  }

  // Normalize option `on_skip`, validated only: no default is installed here
  if (
    options.on_skip !== undefined &&
    options.on_skip !== null &&
    typeof options.on_skip !== "function"
  ) {
    throw new Error(
      `Invalid Option: on_skip must be a function, got ${JSON.stringify(options.on_skip)}`,
    );
  }

  // Normalize option `quote`
  if (
    options.quote === null ||
    options.quote === false ||
    options.quote === ""
  ) {
    options.quote = null;
  } else {
    if (options.quote === undefined || options.quote === true) {
      options.quote = Buffer.from('"', options.encoding);
    } else if (typeof options.quote === "string") {
      options.quote = Buffer.from(options.quote, options.encoding);
    }
    if (!Buffer.isBuffer(options.quote)) {
      throw new Error(
        `Invalid Option: quote must be a buffer or a string, got ${JSON.stringify(options.quote)}`,
      );
    }
  }

  // Normalize option `raw`
  normalize_true_only("raw");

  // Normalize option `record_delimiter`
  if (options.record_delimiter === undefined) {
    options.record_delimiter = [];
  } else if (
    typeof options.record_delimiter === "string" ||
    Buffer.isBuffer(options.record_delimiter)
  ) {
    if (options.record_delimiter.length === 0) {
      throw new CsvError(
        "CSV_INVALID_OPTION_RECORD_DELIMITER",
        [
          "Invalid option `record_delimiter`:",
          "value must be a non empty string or buffer,",
          `got ${JSON.stringify(options.record_delimiter)}`,
        ],
        options,
      );
    }
    options.record_delimiter = [options.record_delimiter];
  } else if (!Array.isArray(options.record_delimiter)) {
    throw new CsvError(
      "CSV_INVALID_OPTION_RECORD_DELIMITER",
      [
        "Invalid option `record_delimiter`:",
        "value must be a string, a buffer or array of string|buffer,",
        `got ${JSON.stringify(options.record_delimiter)}`,
      ],
      options,
    );
  }
  options.record_delimiter = options.record_delimiter.map(function (rd, i) {
    if (typeof rd !== "string" && !Buffer.isBuffer(rd)) {
      throw new CsvError(
        "CSV_INVALID_OPTION_RECORD_DELIMITER",
        [
          "Invalid option `record_delimiter`:",
          "value must be a string, a buffer or array of string|buffer",
          `at index ${i},`,
          `got ${JSON.stringify(rd)}`,
        ],
        options,
      );
    } else if (rd.length === 0) {
      throw new CsvError(
        "CSV_INVALID_OPTION_RECORD_DELIMITER",
        [
          "Invalid option `record_delimiter`:",
          "value must be a non empty string or buffer",
          `at index ${i},`,
          `got ${JSON.stringify(rd)}`,
        ],
        options,
      );
    }
    if (typeof rd === "string") {
      rd = Buffer.from(rd, options.encoding);
    }
    return rd;
  });

  // Normalize the boolean flags, in their historical validation order
  normalize_boolean("relax_column_count");
  normalize_boolean("relax_column_count_less");
  normalize_boolean("relax_column_count_more");
  normalize_boolean("relax_quotes");
  normalize_boolean("skip_empty_lines");
  normalize_boolean("skip_records_with_empty_values");
  normalize_boolean("skip_records_with_error");
  normalize_boolean("rtrim");
  normalize_boolean("ltrim");
  normalize_boolean("trim");

  // Normalize options `trim`, `ltrim` and `rtrim`: `trim` activates both
  // sides unless one was explicitly disabled under its snake_case key.
  if (options.trim === true && opts.ltrim !== false) {
    options.ltrim = true;
  } else if (options.ltrim !== true) {
    options.ltrim = false;
  }
  if (options.trim === true && opts.rtrim !== false) {
    options.rtrim = true;
  } else if (options.rtrim !== true) {
    options.rtrim = false;
  }

  // Normalize option `to`, `-1` means no limit
  normalize_integer("to", -1, 1, "a positive integer greater than 0");

  // Normalize option `to_line`, `-1` means no limit
  normalize_integer("to_line", -1, 1, "a positive integer greater than 0");

  return options;
};
// Discussed in [issue #400](https://github.com/adaltas/node-csv/issues/400)
// See https://github.com/python/cpython/blob/ea1b1c579f600cc85d145c60862b2e6b98701b24/Lib/csv.py#L349
/**
 * Guess the field delimiter of a CSV sample.
 *
 * For each of the first 127 char codes, the occurrences are counted per
 * line and a score is computed with `options.score`; the character with the
 * highest score is returned. Characters with a code of 127 or more are never
 * candidates and are ignored.
 *
 * @param {string|Buffer|Array<Array<string>>} records - Raw sample, or
 *   records whose first field holds the full line.
 * @param {object} [options] - `delimiter_auto` configuration (`preferred`,
 *   `score`); defaults are obtained from `normalize_options` when omitted.
 * @returns {string} The dominant character.
 * @throws Rethrows the error returned by the parser on an unparsable sample.
 */
const delimiter_discover = function (records, options) {
  // Normalize the configuration
  if (!options) {
    ({ delimiter_auto: options } = normalize_options({ delimiter_auto: true }));
  }
  // Convert String to Buffer
  if (typeof records === "string") {
    records = Buffer.from(records);
  }
  // Convert Buffer to an array of records; without any delimiter, each
  // record holds a single field made of the whole line
  if (Buffer.isBuffer(records)) {
    const parsed = [];
    const parser = transform({ delimiter: [] });
    const error = parser.parse(
      records,
      true,
      (record) => parsed.push(record),
      () => {},
    );
    if (error !== undefined) throw error;
    records = parsed;
  }
  // Info array initialization, 127 entries, one per char code
  const info = Array.from({ length: 127 }, () => ({ lines: [] }));
  // Traverse each records, count occurrences per char code
  records.forEach(([record], line) => {
    for (let i = 0, l = record.length; i < l; i++) {
      const code = record.charCodeAt(i);
      // Non-ASCII characters have no entry in the table
      if (code >= info.length) continue;
      const { lines } = info[code];
      // `lines` stays sparse: only lines containing the character are set
      lines[line] = lines[line] === undefined ? 1 : lines[line] + 1;
    }
  });
  // Traverse each char code, compute the score
  info.forEach((entry, code) => {
    entry.char_code = code;
    entry.std = std(entry.lines);
    entry.total = entry.lines.reduce((acc, val) => acc + val, 0);
    entry.preferred = !!options.preferred[code];
    entry.score = options.score(entry, options);
  });
  // Extract the dominant character; on equal scores the last one wins
  const result = info.reduce(
    (acc, entry) => (acc.score > entry.score ? acc : entry),
    {},
  );
  return String.fromCharCode(result.char_code);
};
acc : info), {}, ); return String.fromCharCode(result.char_code); }; const std = function (array) { const n = array.length; if (n === 0) return 0; const mean = array.reduce((a, b) => a + b) / n; return Math.sqrt( array.map((x) => Math.pow(x - mean, 2)).reduce((a, b) => a + b) / n, ); }; const isRecordEmpty = function (record) { return record.every( (field) => field == null || (field.toString && field.toString().trim() === ""), ); }; const cr = 13; // `\r`, carriage return, 0x0D in hexadécimal, 13 in decimal const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal const boms = { // Note, the following are equals: // Buffer.from("\ufeff") // Buffer.from([239, 187, 191]) // Buffer.from('EFBBBF', 'hex') utf8: Buffer.from([239, 187, 191]), // Note, the following are equals: // Buffer.from "\ufeff", 'utf16le // Buffer.from([255, 254]) utf16le: Buffer.from([255, 254]), }; const transform = function (original_options = {}) { const info = { bytes: 0, bytes_records: 0, comment_lines: 0, empty_lines: 0, invalid_field_length: 0, lines: 1, records: 0, }; const options = normalize_options(original_options); return { info: info, original_options: original_options, options: options, state: init_state(options), __needMoreData: function (i, bufLen, end) { if (end) return false; const { encoding, escape, quote } = this.options; const { quoting, needMoreDataSize, recordDelimiterMaxLength } = this.state; const numOfCharLeft = bufLen - i - 1; const requiredLength = Math.max( needMoreDataSize, // Skip if the remaining buffer smaller than record delimiter // If "record_delimiter" is yet to be discovered: // 1. It is equals to `[]` and "recordDelimiterMaxLength" equals `0` // 2. We set the length to windows line ending in the current encoding // Note, that encoding is known from user or bom discovery at that point // recordDelimiterMaxLength, recordDelimiterMaxLength === 0 ? 
Buffer.from("\r\n", encoding).length : recordDelimiterMaxLength, // Skip if remaining buffer can be an escaped quote quoting ? (escape === null ? 0 : escape.length) + quote.length : 0, // Skip if remaining buffer can be record delimiter following the closing quote quoting ? quote.length + recordDelimiterMaxLength : 0, ); return numOfCharLeft < requiredLength; }, // Central parser implementation parse: function (nextBuf, end, push, close) { const { bom, comment_no_infix, delimiter_auto, encoding, from_line, ltrim, max_record_size, raw, relax_quotes, rtrim, skip_empty_lines, to, to_line, } = this.options; let { comment, escape, quote, record_delimiter } = this.options; const { bomSkipped, delimiterDiscovered, delimiterBufPrevious, rawBuffer, escapeIsQuote, } = this.state; // Automatic delimiter discovery if (!delimiterDiscovered && delimiter_auto) { let delimiterBuf; if (delimiterBufPrevious === undefined) { delimiterBuf = nextBuf; } else if ( delimiterBufPrevious !== undefined && nextBuf === undefined ) { delimiterBuf = delimiterBufPrevious; } else { delimiterBuf = Buffer.concat([delimiterBufPrevious, nextBuf]); } // Ensure that nextBuf is not concatenated a second time during buffer reconciliation nextBuf = undefined; // this.delimiterBufPrevious = delimiterBuf; if (end || delimiterBuf.length > delimiter_auto.size) { this.options.delimiter = [ Buffer.from( delimiter_discover(delimiterBuf, this.options.delimiter_auto), ), ]; this.state.previousBuf = delimiterBuf; this.state.delimiterBufPrevious = undefined; this.state.delimiterDiscovered = true; } else { this.state.delimiterBufPrevious = delimiterBuf; return; } } // Previous buffers reconciliation const { previousBuf } = this.state; let buf; if (previousBuf === undefined) { if (nextBuf === undefined) { // Handle empty string close(); return; } else { buf = nextBuf; } } else if (previousBuf !== undefined && nextBuf === undefined) { buf = previousBuf; } else { buf = Buffer.concat([previousBuf, nextBuf]); } // Handle 
UTF BOM if (bomSkipped === false) { if (bom === false) { this.state.bomSkipped = true; } else if (buf.length < 3) { // No enough data if (end === false) { // Wait for more data this.state.previousBuf = buf; return; } } else { for (const encoding in boms) { if (boms[encoding].compare(buf, 0, boms[encoding].length) === 0) { // Skip BOM const bomLength = boms[encoding].length; this.state.bufBytesStart += bomLength; buf = buf.slice(bomLength); // Renormalize original options with the new encoding const options = normalize_options({ ...this.original_options, encoding: encoding, }); // Properties are merged with the existing options instance for (const key in options) { this.options[key] = options[key]; } // Options will re-evaluate the Buffer with the new encoding ({ comment, escape, quote } = this.options); break; } } this.state.bomSkipped = true; } } const bufLen = buf.length; let pos; for (pos = 0; pos < bufLen; pos++) { // Ensure we get enough space to look ahead // There should be a way to move this out of the loop if (this.__needMoreData(pos, bufLen, end)) { break; } if (this.state.wasRowDelimiter === true) { this.info.lines++; this.state.wasRowDelimiter = false; } if (to_line !== -1 && this.info.lines > to_line) { this.state.stop = true; close(); return; } // Auto discovery of record_delimiter, unix, mac and windows supported if (this.state.quoting === false && record_delimiter.length === 0) { const record_delimiterCount = this.__autoDiscoverRecordDelimiter( buf, pos, ); if (record_delimiterCount) { record_delimiter = this.options.record_delimiter; } } const chr = buf[pos]; if (raw === true) { rawBuffer.append(chr); } if ( (chr === cr || chr === nl) && this.state.wasRowDelimiter === false ) { this.state.wasRowDelimiter = true; } // Previous char was a valid escape char // treat the current char as a regular char if (this.state.escaping === true) { this.state.escaping = false; } else { // Escape is only active inside quoted fields // We are quoting, the char is an 
escape chr and there is a chr to escape // if(escape !== null && this.state.quoting === true && chr === escape && pos + 1 < bufLen){ if ( escape !== null && this.state.quoting === true && this.__isEscape(buf, pos, chr) && pos + escape.length < bufLen ) { if (escapeIsQuote) { if (this.__isQuote(buf, pos + escape.length)) { this.state.escaping = true; pos += escape.length - 1; continue; } } else { this.state.escaping = true; pos += escape.length - 1; continue; } } // Not currently escaping and chr is a quote // TODO: need to compare bytes instead of single char if (this.state.commenting === false && this.__isQuote(buf, pos)) { if (this.state.quoting === true) { const nextChr = buf[pos + quote.length]; const isNextChrTrimable = rtrim && this.__isCharTrimable(buf, pos + quote.length); const isNextChrComment = comment !== null && this.__compareBytes(comment, buf, pos + quote.length, nextChr); const isNextChrDelimiter = this.__isDelimiter( buf, pos + quote.length, nextChr, ); const isNextChrRecordDelimiter = record_delimiter.length === 0 ? 
this.__autoDiscoverRecordDelimiter(buf, pos + quote.length) : this.__isRecordDelimiter(nextChr, buf, pos + quote.length); // Escape a quote // Treat next char as a regular character if ( escape !== null && this.__isEscape(buf, pos, chr) && this.__isQuote(buf, pos + escape.length) ) { pos += escape.length - 1; } else if ( !nextChr || isNextChrDelimiter || isNextChrRecordDelimiter || isNextChrComment || isNextChrTrimable ) { this.state.quoting = false; this.state.wasQuoting = true; pos += quote.length - 1; continue; } else if (relax_quotes === false) { const err = this.__error( new CsvError( "CSV_INVALID_CLOSING_QUOTE", [ "Invalid Closing Quote:", `got "${String.fromCharCode(nextChr)}"`, `at line ${this.info.lines}`, "instead of delimiter, record delimiter, trimable character", "(if activated) or comment", ], this.options, this.__infoField(), ), ); if (err !== undefined) return err; } else { this.state.quoting = false; this.state.wasQuoting = true; this.state.field.prepend(quote); pos += quote.length - 1; } } else { if (this.state.field.length !== 0) { // In relax_quotes mode, treat opening quote preceded by chrs as regular if (relax_quotes === false) { const info = this.__infoField(); const bom = Object.keys(boms) .map((b) => boms[b].equals(this.state.field.toString()) ? b : false, ) .filter(Boolean)[0]; const err = this.__error( new CsvError( "INVALID_OPENING_QUOTE", [ "Invalid Opening Quote:", `a quote is found on field ${JSON.stringify(info.column)} at line ${info.lines}, value is ${JSON.stringify(this.state.field.toString(encoding))}`, bom ? 
`(${bom} bom)` : undefined, ], this.options, info, { field: this.state.field, }, ), ); if (err !== undefined) return err; } } else { this.state.quoting = true; pos += quote.length - 1; continue; } } } if (this.state.quoting === false) { const recordDelimiterLength = this.__isRecordDelimiter( chr, buf, pos, ); if (recordDelimiterLength !== 0) { // Do not emit comments which take a full line const skipCommentLine = this.state.commenting && this.state.wasQuoting === false && this.state.record.length === 0 && this.state.field.length === 0; if (skipCommentLine) { this.info.comment_lines++; // Skip full comment line } else { // Activate records emission if above from_line if ( this.state.enabled === false && this.info.lines + (this.state.wasRowDelimiter === true ? 1 : 0) >= from_line ) { this.state.enabled = true; this.__resetField(); this.__resetRecord(); pos += recordDelimiterLength - 1; continue; } // Skip if line is empty and skip_empty_lines activated if ( skip_empty_lines === true && this.state.wasQuoting === false && this.state.record.length === 0 && this.state.field.length === 0 ) { this.info.empty_lines++; pos += recordDelimiterLength - 1; continue; } this.info.bytes = this.state.bufBytesStart + pos; const errField = this.__onField(); if (errField !== undefined) return errField; this.info.bytes = this.state.bufBytesStart + pos + recordDelimiterLength; const errRecord = this.__onRecord(push); if (errRecord !== undefined) return errRecord; if (to !== -1 && this.info.records >= to) { this.state.stop = true; close(); return; } } this.state.commenting = false; pos += recordDelimiterLength - 1; continue; } if (this.state.commenting) { continue; } if ( comment !== null && (comment_no_infix === false || (this.state.record.length === 0 && this.state.field.length === 0)) ) { const commentCount = this.__compareBytes(comment, buf, pos, chr); if (commentCount !== 0) { this.state.commenting = true; continue; } } const delimiterLength = this.__isDelimiter(buf, pos, chr); if 
(delimiterLength !== 0) { this.info.bytes = this.state.bufBytesStart + pos; const errField = this.__onField(); if (errField !== undefined) return errField; pos += delimiterLength - 1; continue; } } } if (this.state.commenting === false) { if ( max_record_size !== 0 && this.state.record_length + this.state.field.length > max_record_size ) { return this.__error( new CsvError( "CSV_MAX_RECORD_SIZE", [ "Max Record Size:", "record exceed the maximum number of tolerated bytes", `of ${max_record_size}`, `at line ${this.info.lines}`, ], this.options, this.__infoField(), ), ); } } const lappend = ltrim === false || this.state.quoting === true || this.state.field.length !== 0 ||