csv-parse
Version:
CSV parsing implementing the Node.js `stream.Transform` API
1,461 lines (1,447 loc) • 68.1 kB
JavaScript
'use strict';
/**
 * Error type thrown by the parser and by option normalization.
 *
 * @param {string} code - Machine readable error code, exposed as `error.code`.
 * @param {string|string[]} message - Error message; an array is joined with
 *   spaces and trimmed.
 * @param {object} [options] - Parser options; only `encoding` is read, to
 *   decode Buffer values found in the contexts.
 * @param {...object} contexts - Objects whose enumerable properties are copied
 *   onto the error instance.
 */
class CsvError extends Error {
  constructor(code, message, options, ...contexts) {
    if (Array.isArray(message)) message = message.join(" ").trim();
    super(message);
    if (Error.captureStackTrace !== undefined) {
      Error.captureStackTrace(this, CsvError);
    }
    this.code = code;
    // `options` is omitted by some callers (e.g. column normalization), so it
    // must not be dereferenced blindly. A missing or null encoding falls back
    // to utf8.
    const encoding =
      options !== undefined && options !== null && options.encoding
        ? options.encoding
        : "utf8";
    for (const context of contexts) {
      for (const key in context) {
        const value = context[key];
        if (Buffer.isBuffer(value)) {
          this[key] = value.toString(encoding);
        } else if (value == null) {
          this[key] = value;
        } else {
          // Snapshot the value so later mutations of the parser state do not
          // alter the error. Values JSON cannot represent (functions,
          // symbols) serialize to `undefined`; keep them as-is instead of
          // letting `JSON.parse(undefined)` throw a SyntaxError.
          const json = JSON.stringify(value);
          this[key] = json === undefined ? value : JSON.parse(json);
        }
      }
    }
  }
}
/**
 * Tell whether a value is a non-null, non-array object.
 *
 * @param {*} obj - Value to inspect.
 * @returns {boolean} `true` for object values other than `null` and arrays.
 */
const is_object = function (obj) {
  if (obj === null || Array.isArray(obj)) {
    return false;
  }
  return typeof obj === "object";
};
/**
 * Convert a user supplied list of column definitions into column objects.
 *
 * - `undefined`, `null` and `false` become `{ disabled: true }`;
 * - strings and numbers become `{ name: "<value>" }`;
 * - object literals are kept as-is but must carry a string `name`.
 *
 * @param {Array} columns - Column definitions.
 * @returns {object[]} One column object per input position.
 * @throws {CsvError} `CSV_OPTION_COLUMNS_MISSING_NAME` or
 *   `CSV_INVALID_COLUMN_DEFINITION` for unsupported definitions.
 */
const normalize_columns_array = function (columns) {
  const total = columns.length;
  const normalizedColumns = [];
  for (let position = 0; position < total; position++) {
    const definition = columns[position];
    if (
      definition === undefined ||
      definition === null ||
      definition === false
    ) {
      normalizedColumns[position] = { disabled: true };
      continue;
    }
    const kind = typeof definition;
    if (kind === "string" || kind === "number") {
      normalizedColumns[position] = { name: String(definition) };
      continue;
    }
    if (!is_object(definition)) {
      throw new CsvError("CSV_INVALID_COLUMN_DEFINITION", [
        "Invalid column definition:",
        "expect a string or a literal object,",
        `got ${JSON.stringify(definition)} at position ${position}`,
      ]);
    }
    if (typeof definition.name !== "string") {
      throw new CsvError("CSV_OPTION_COLUMNS_MISSING_NAME", [
        "Option columns missing name:",
        `property "name" is required at position ${position}`,
        "when column is an object literal",
      ]);
    }
    normalizedColumns[position] = definition;
  }
  return normalizedColumns;
};
/**
 * Growable byte buffer backed by a Node.js `Buffer`.
 *
 * `size` is the allocated capacity, `length` the number of bytes in use. The
 * capacity grows by doubling whenever it is exhausted.
 */
class ResizeableBuffer {
  /**
   * @param {number} [size=100] - Initial capacity in bytes.
   */
  constructor(size = 100) {
    this.size = size;
    this.length = 0;
    this.buf = Buffer.allocUnsafe(size);
  }
  /**
   * Insert data in front of the current content.
   *
   * @param {Buffer|number} val - A buffer, or a single byte value.
   */
  prepend(val) {
    if (Buffer.isBuffer(val)) {
      const length = this.length + val.length;
      if (length >= this.size) {
        // Keep doubling until the combined content fits. A single doubling is
        // not enough when `val` is larger than the current capacity.
        let size = Math.max(this.size, 1);
        while (length >= size) {
          size *= 2;
        }
        this.size = size;
      }
      const previous = this.buf;
      this.buf = Buffer.allocUnsafe(this.size);
      val.copy(this.buf, 0);
      // Only the bytes in use are meaningful, the rest is uninitialized.
      previous.copy(this.buf, val.length, 0, this.length);
      this.length = length;
    } else {
      const length = this.length++;
      if (length === this.size) {
        this.resize();
      }
      // Shift the existing bytes one position to the right.
      const buf = this.clone();
      this.buf[0] = val;
      buf.copy(this.buf, 1, 0, length);
    }
  }
  /**
   * Add a single byte at the end of the content.
   *
   * @param {number} val - Byte value.
   */
  append(val) {
    const length = this.length++;
    if (length === this.size) {
      this.resize();
    }
    this.buf[length] = val;
  }
  /**
   * @returns {Buffer} A copy of the bytes currently in use.
   */
  clone() {
    return Buffer.from(this.buf.slice(0, this.length));
  }
  /**
   * Double the capacity, preserving the content.
   */
  resize() {
    const length = this.length;
    // `Math.max` lets a zero-sized buffer grow as well.
    this.size = Math.max(this.size * 2, 1);
    const buf = Buffer.allocUnsafe(this.size);
    this.buf.copy(buf, 0, 0, length);
    this.buf = buf;
  }
  /**
   * @param {string} [encoding] - Encoding used to decode the content.
   * @returns {string|Uint8Array} The decoded string when an encoding is
   *   provided, otherwise a copy of the raw bytes.
   */
  toString(encoding) {
    if (encoding) {
      return this.buf.slice(0, this.length).toString(encoding);
    } else {
      return Uint8Array.prototype.slice.call(this.buf.slice(0, this.length));
    }
  }
  /**
   * @returns {string} The content decoded as utf8.
   */
  toJSON() {
    return this.toString("utf8");
  }
  /**
   * Discard the content while keeping the allocated memory.
   */
  reset() {
    this.length = 0;
  }
}
/**
 * Build the initial parser state from normalized options.
 *
 * @param {object} options - Options as returned by `normalize_options`.
 * @returns {object} A fresh state object.
 */
const init_state = function (options) {
  // A null encoding means "return buffers"; bytes are then handled as utf8.
  const encoding = options.encoding || "utf8";
  // ECMAScript WhiteSpace + LineTerminator codepoints, encoded under
  // `options.encoding`. Aligns trimming with `String.prototype.trim()`.
  // https://tc39.es/ecma262/#sec-white-space
  // https://tc39.es/ecma262/#sec-line-terminators
  //
  // Codepoints unrepresentable in the target encoding are dropped. Depending
  // on the encoder they are either substituted (`?`) or truncated to their
  // low byte: under Node's `latin1`/`ascii`, U+2028 becomes `(` and U+202F
  // becomes `/`. Keeping those would cause ordinary bytes of the input to be
  // trimmed, so only codepoints surviving an encode/decode round trip are
  // retained.
  const timchars = [
    // Basic Latin
    0x0020, // SPACE
    0x0009, // CHARACTER TABULATION (HT)
    0x000a, // LINE FEED (LF)
    0x000d, // CARRIAGE RETURN (CR)
    0x000c, // FORM FEED (FF)
    0x000b, // LINE TABULATION (VT)
    // Latin-1 Supplement
    0x00a0, // NO-BREAK SPACE (NBSP)
    // Ogham
    0x1680, // OGHAM SPACE MARK
    // General Punctuation
    0x2000, // EN QUAD
    0x2001, // EM QUAD
    0x2002, // EN SPACE
    0x2003, // EM SPACE
    0x2004, // THREE-PER-EM SPACE
    0x2005, // FOUR-PER-EM SPACE
    0x2006, // SIX-PER-EM SPACE
    0x2007, // FIGURE SPACE
    0x2008, // PUNCTUATION SPACE
    0x2009, // THIN SPACE
    0x200a, // HAIR SPACE
    0x2028, // LINE SEPARATOR
    0x2029, // PARAGRAPH SEPARATOR
    0x202f, // NARROW NO-BREAK SPACE (NNBSP)
    0x205f, // MEDIUM MATHEMATICAL SPACE (MMSP)
    // CJK Symbols and Punctuation
    0x3000, // IDEOGRAPHIC SPACE
    // Arabic Presentation Forms-B
    0xfeff, // ZERO WIDTH NO-BREAK SPACE (BOM)
  ].reduce((acc, codepoint) => {
    const char = String.fromCharCode(codepoint);
    const encoded = Buffer.from(char, options.encoding);
    if (encoded.length !== 0 && encoded.toString(encoding) === char) {
      acc.push(encoded);
    }
    return acc;
  }, []);
  // First-byte lookup table for `__isCharTrimable`. Non-whitespace bytes
  // (the common case) bail out in O(1) without scanning every timchar.
  const timcharFirstBytes = new Uint8Array(256);
  for (const t of timchars) timcharFirstBytes[t[0]] = 1;
  return {
    bomSkipped: false,
    bufBytesStart: 0,
    castField: options.cast_function,
    commenting: false,
    delimiterBufPrevious: undefined,
    delimiterDiscovered: false,
    // Current error encountered by a record
    error: undefined,
    enabled: options.from_line === 1,
    escaping: false,
    escapeIsQuote:
      Buffer.isBuffer(options.escape) &&
      Buffer.isBuffer(options.quote) &&
      Buffer.compare(options.escape, options.quote) === 0,
    // columns can be `false`, `true`, `Array`
    expectedRecordLength: Array.isArray(options.columns)
      ? options.columns.length
      : undefined,
    field: new ResizeableBuffer(20),
    firstLineToHeaders: options.cast_first_line_to_header,
    needMoreDataSize: Math.max(
      // Skip if the remaining buffer smaller than comment
      options.comment !== null ? options.comment.length : 0,
      // Skip if the remaining buffer can be delimiter
      ...(options.delimiter
        ? options.delimiter.map((delimiter) => delimiter.length)
        : []),
      // Auto discovery of delimiter is limited to 1 character
      options.delimiter_auto ? 1 : 0,
      // Skip if the remaining buffer can be escape sequence
      options.quote !== null ? options.quote.length : 0,
      // Skip if the remaining buffer can be a multi-byte trim character
      ...timchars.map((t) => t.length),
    ),
    previousBuf: undefined,
    quoting: false,
    stop: false,
    rawBuffer: new ResizeableBuffer(100),
    record: [],
    recordHasError: false,
    record_length: 0,
    recordDelimiterMaxLength:
      options.record_delimiter.length === 0
        ? 0
        : Math.max(...options.record_delimiter.map((v) => v.length)),
    trimChars: [
      Buffer.from(" ", options.encoding)[0],
      Buffer.from("\t", options.encoding)[0],
    ],
    wasQuoting: false,
    wasRowDelimiter: false,
    timchars: timchars,
    timcharFirstBytes: timcharFirstBytes,
  };
};
/**
 * Convert a camelCase identifier to snake_case.
 *
 * Every ASCII uppercase letter is replaced by an underscore followed by its
 * lowercase form.
 *
 * @param {string} str - Identifier to convert.
 * @returns {string} The converted identifier.
 */
const underscore = function (str) {
  return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
};
/**
 * Validate user options and convert them to their internal representation.
 *
 * Option names are accepted in camelCase or snake_case and are returned in
 * snake_case. Strings used to match bytes of the input (`comment`,
 * `delimiter`, `escape`, `quote`, `record_delimiter`) are converted to
 * buffers using the `encoding` option.
 *
 * @param {object} opts - User supplied options.
 * @returns {object} A new object holding the normalized options.
 * @throws {CsvError|Error} When an option holds an unsupported value.
 */
const normalize_options = function (opts) {
  const options = {};
  // Merge with user options
  for (const opt in opts) {
    options[underscore(opt)] = opts[opt];
  }
  // Normalize option `encoding`
  // Note: defined first because other options depends on it
  // to convert chars/strings into buffers.
  if (options.encoding === undefined || options.encoding === true) {
    options.encoding = "utf8";
  } else if (options.encoding === null || options.encoding === false) {
    options.encoding = null;
  } else if (
    typeof options.encoding !== "string" &&
    options.encoding !== null
  ) {
    throw new CsvError(
      "CSV_INVALID_OPTION_ENCODING",
      [
        "Invalid option encoding:",
        "encoding must be a string or null to return a buffer,",
        `got ${JSON.stringify(options.encoding)}`,
      ],
      options,
    );
  }
  // Normalize option `bom`
  if (
    options.bom === undefined ||
    options.bom === null ||
    options.bom === false
  ) {
    options.bom = false;
  } else if (options.bom !== true) {
    throw new CsvError(
      "CSV_INVALID_OPTION_BOM",
      [
        "Invalid option bom:",
        "bom must be true,",
        `got ${JSON.stringify(options.bom)}`,
      ],
      options,
    );
  }
  // Normalize option `cast`
  options.cast_function = null;
  if (
    options.cast === undefined ||
    options.cast === null ||
    options.cast === false ||
    options.cast === ""
  ) {
    options.cast = undefined;
  } else if (typeof options.cast === "function") {
    options.cast_function = options.cast;
    options.cast = true;
  } else if (options.cast !== true) {
    throw new CsvError(
      "CSV_INVALID_OPTION_CAST",
      [
        "Invalid option cast:",
        "cast must be true or a function,",
        `got ${JSON.stringify(options.cast)}`,
      ],
      options,
    );
  }
  // Normalize option `cast_date`
  if (
    options.cast_date === undefined ||
    options.cast_date === null ||
    options.cast_date === false ||
    options.cast_date === ""
  ) {
    options.cast_date = false;
  } else if (options.cast_date === true) {
    options.cast_date = function (value) {
      const date = Date.parse(value);
      return !isNaN(date) ? new Date(date) : value;
    };
  } else if (typeof options.cast_date !== "function") {
    throw new CsvError(
      "CSV_INVALID_OPTION_CAST_DATE",
      [
        "Invalid option cast_date:",
        "cast_date must be true or a function,",
        `got ${JSON.stringify(options.cast_date)}`,
      ],
      options,
    );
  }
  // Normalize option `columns`
  options.cast_first_line_to_header = undefined;
  if (options.columns === true) {
    // Fields in the first line are converted as-is to columns
    options.cast_first_line_to_header = undefined;
  } else if (typeof options.columns === "function") {
    options.cast_first_line_to_header = options.columns;
    options.columns = true;
  } else if (Array.isArray(options.columns)) {
    options.columns = normalize_columns_array(options.columns);
  } else if (
    options.columns === undefined ||
    options.columns === null ||
    options.columns === false
  ) {
    options.columns = false;
  } else {
    throw new CsvError(
      "CSV_INVALID_OPTION_COLUMNS",
      [
        "Invalid option columns:",
        "expect an array, a function or true,",
        `got ${JSON.stringify(options.columns)}`,
      ],
      options,
    );
  }
  // Normalize option `group_columns_by_name`
  if (
    options.group_columns_by_name === undefined ||
    options.group_columns_by_name === null ||
    options.group_columns_by_name === false
  ) {
    options.group_columns_by_name = false;
  } else if (options.group_columns_by_name !== true) {
    throw new CsvError(
      "CSV_INVALID_OPTION_GROUP_COLUMNS_BY_NAME",
      [
        "Invalid option group_columns_by_name:",
        "expect an boolean,",
        `got ${JSON.stringify(options.group_columns_by_name)}`,
      ],
      options,
    );
  } else if (options.columns === false) {
    throw new CsvError(
      "CSV_INVALID_OPTION_GROUP_COLUMNS_BY_NAME",
      [
        "Invalid option group_columns_by_name:",
        "the `columns` mode must be activated.",
      ],
      options,
    );
  }
  // Normalize option `comment`
  if (
    options.comment === undefined ||
    options.comment === null ||
    options.comment === false ||
    options.comment === ""
  ) {
    options.comment = null;
  } else {
    if (typeof options.comment === "string") {
      options.comment = Buffer.from(options.comment, options.encoding);
    }
    if (!Buffer.isBuffer(options.comment)) {
      throw new CsvError(
        "CSV_INVALID_OPTION_COMMENT",
        [
          "Invalid option comment:",
          "comment must be a buffer or a string,",
          `got ${JSON.stringify(options.comment)}`,
        ],
        options,
      );
    }
  }
  // Normalize option `comment_no_infix`
  if (
    options.comment_no_infix === undefined ||
    options.comment_no_infix === null ||
    options.comment_no_infix === false
  ) {
    options.comment_no_infix = false;
  } else if (options.comment_no_infix !== true) {
    throw new CsvError(
      "CSV_INVALID_OPTION_COMMENT",
      [
        "Invalid option comment_no_infix:",
        "value must be a boolean,",
        `got ${JSON.stringify(options.comment_no_infix)}`,
      ],
      options,
    );
  }
  // Normalize option `delimiter_auto`
  if (
    options.delimiter_auto === undefined ||
    options.delimiter_auto === null ||
    options.delimiter_auto === false
  ) {
    options.delimiter_auto = false;
  } else if (options.delimiter_auto === true) {
    options.delimiter_auto = {};
  } else if (!is_object(options.delimiter_auto)) {
    throw new CsvError(
      "CSV_INVALID_OPTION_DELIMITER_AUTO",
      [
        "Invalid option delimiter_auto:",
        "delimiter_auto must be a boolean or a configuration object,",
        `got ${JSON.stringify(options.delimiter_auto)}`,
      ],
      options,
    );
  }
  if (options.delimiter_auto) {
    if (options.delimiter_auto.preferred === undefined) {
      // Score multipliers, keyed by character code.
      options.delimiter_auto.preferred = {
        [",".charCodeAt(0)]: 1.8,
        ["\t".charCodeAt(0)]: 1.8,
        [";".charCodeAt(0)]: 1.6,
        [" ".charCodeAt(0)]: 1.6,
        [":".charCodeAt(0)]: 1.5,
        [".".charCodeAt(0)]: 1.4,
        ["/".charCodeAt(0)]: 1.4,
      };
    } else if (!is_object(options.delimiter_auto.preferred)) {
      throw new CsvError(
        "CSV_INVALID_OPTION_DELIMITER_AUTO",
        [
          "Invalid option delimiter_auto:",
          "preferred must be an object,",
          `got ${JSON.stringify(options.delimiter_auto.preferred)}`,
        ],
        options,
      );
    }
    if (options.delimiter_auto.score === undefined) {
      options.delimiter_auto.score = (info, options) => {
        return (
          (info.total - info.std) * (options.preferred[info.char_code] || 1)
        );
      };
    } else if (typeof options.delimiter_auto.score !== "function") {
      throw new CsvError(
        "CSV_INVALID_OPTION_DELIMITER_AUTO",
        [
          "Invalid option delimiter_auto:",
          "score must be a function,",
          `got ${JSON.stringify(options.delimiter_auto.score)}`,
        ],
        options,
      );
    }
    if (options.delimiter_auto.size === undefined) {
      options.delimiter_auto.size = 2048;
    } else if (typeof options.delimiter_auto.size !== "number") {
      throw new CsvError(
        "CSV_INVALID_OPTION_DELIMITER_AUTO",
        [
          "Invalid option delimiter_auto:",
          "size must be a number,",
          `got ${JSON.stringify(options.delimiter_auto.size)}`,
        ],
        options,
      );
    }
  }
  // Normalize option `delimiter`
  // Serialized before conversion so that errors report the user value.
  const delimiter_json = JSON.stringify(options.delimiter);
  if (options.delimiter_auto !== false) {
    // The delimiter is discovered from the data, any user value is ignored.
    options.delimiter = [];
  }
  if (!Array.isArray(options.delimiter)) {
    if (
      options.delimiter === undefined ||
      options.delimiter === null ||
      options.delimiter === false
    ) {
      options.delimiter = Buffer.from(",", options.encoding);
    }
    options.delimiter = [options.delimiter];
  }
  options.delimiter = options.delimiter.map(function (delimiter) {
    if (typeof delimiter === "string") {
      delimiter = Buffer.from(delimiter, options.encoding);
    }
    if (!Buffer.isBuffer(delimiter) || delimiter.length === 0) {
      throw new CsvError(
        "CSV_INVALID_OPTION_DELIMITER",
        [
          "Invalid option delimiter:",
          "delimiter must be a non empty string or buffer or array of string|buffer,",
          `got ${delimiter_json}`,
        ],
        options,
      );
    }
    return delimiter;
  });
  // Normalize option `escape`
  if (options.escape === undefined || options.escape === true) {
    options.escape = Buffer.from('"', options.encoding);
  } else if (typeof options.escape === "string") {
    options.escape = Buffer.from(options.escape, options.encoding);
  } else if (options.escape === null || options.escape === false) {
    options.escape = null;
  }
  if (options.escape !== null) {
    if (!Buffer.isBuffer(options.escape)) {
      throw new Error(
        `Invalid Option: escape must be a buffer, a string or a boolean, got ${JSON.stringify(options.escape)}`,
      );
    }
  }
  // Normalize option `from`
  if (options.from === undefined || options.from === null) {
    options.from = 1;
  } else {
    if (typeof options.from === "string" && /\d+/.test(options.from)) {
      options.from = parseInt(options.from);
    }
    if (Number.isInteger(options.from)) {
      if (options.from < 0) {
        throw new Error(
          `Invalid Option: from must be a positive integer, got ${JSON.stringify(opts.from)}`,
        );
      }
    } else {
      throw new Error(
        `Invalid Option: from must be an integer, got ${JSON.stringify(options.from)}`,
      );
    }
  }
  // Normalize option `from_line`
  if (options.from_line === undefined || options.from_line === null) {
    options.from_line = 1;
  } else {
    // Keep the user value for error reporting. Reading `opts.from_line`
    // yields `undefined` when the option was provided as `fromLine`.
    const from_line_raw = options.from_line;
    if (
      typeof options.from_line === "string" &&
      /\d+/.test(options.from_line)
    ) {
      options.from_line = parseInt(options.from_line);
    }
    if (Number.isInteger(options.from_line)) {
      if (options.from_line <= 0) {
        throw new Error(
          `Invalid Option: from_line must be a positive integer greater than 0, got ${JSON.stringify(from_line_raw)}`,
        );
      }
    } else {
      throw new Error(
        `Invalid Option: from_line must be an integer, got ${JSON.stringify(from_line_raw)}`,
      );
    }
  }
  // Normalize options `ignore_last_delimiters`
  if (
    options.ignore_last_delimiters === undefined ||
    options.ignore_last_delimiters === null
  ) {
    options.ignore_last_delimiters = false;
  } else if (typeof options.ignore_last_delimiters === "number") {
    options.ignore_last_delimiters = Math.floor(options.ignore_last_delimiters);
    if (options.ignore_last_delimiters === 0) {
      options.ignore_last_delimiters = false;
    }
  } else if (typeof options.ignore_last_delimiters !== "boolean") {
    throw new CsvError(
      "CSV_INVALID_OPTION_IGNORE_LAST_DELIMITERS",
      [
        "Invalid option `ignore_last_delimiters`:",
        "the value must be a boolean value or an integer,",
        `got ${JSON.stringify(options.ignore_last_delimiters)}`,
      ],
      options,
    );
  }
  if (options.ignore_last_delimiters === true && options.columns === false) {
    throw new CsvError(
      "CSV_IGNORE_LAST_DELIMITERS_REQUIRES_COLUMNS",
      [
        "The option `ignore_last_delimiters`",
        "requires the activation of the `columns` option",
      ],
      options,
    );
  }
  // Normalize option `info`
  if (
    options.info === undefined ||
    options.info === null ||
    options.info === false
  ) {
    options.info = false;
  } else if (options.info !== true) {
    throw new Error(
      `Invalid Option: info must be true, got ${JSON.stringify(options.info)}`,
    );
  }
  // Normalize option `max_record_size`
  if (
    options.max_record_size === undefined ||
    options.max_record_size === null ||
    options.max_record_size === false
  ) {
    options.max_record_size = 0;
  } else if (
    Number.isInteger(options.max_record_size) &&
    options.max_record_size >= 0
  ) {
    // Already a valid value, nothing to do
  } else if (
    typeof options.max_record_size === "string" &&
    /\d+/.test(options.max_record_size)
  ) {
    // The pattern is not anchored, so values such as "-5" or "abc1" reach
    // this branch; validate the parsed result instead of storing a negative
    // number or NaN.
    const max_record_size = parseInt(options.max_record_size);
    if (!Number.isInteger(max_record_size) || max_record_size < 0) {
      throw new Error(
        `Invalid Option: max_record_size must be a positive integer, got ${JSON.stringify(options.max_record_size)}`,
      );
    }
    options.max_record_size = max_record_size;
  } else {
    throw new Error(
      `Invalid Option: max_record_size must be a positive integer, got ${JSON.stringify(options.max_record_size)}`,
    );
  }
  // Normalize option `objname`
  if (
    options.objname === undefined ||
    options.objname === null ||
    options.objname === false
  ) {
    options.objname = undefined;
  } else if (Buffer.isBuffer(options.objname)) {
    if (options.objname.length === 0) {
      throw new Error(`Invalid Option: objname must be a non empty buffer`);
    }
    // With a null encoding, the buffer is kept as-is
    if (options.encoding !== null) {
      options.objname = options.objname.toString(options.encoding);
    }
  } else if (typeof options.objname === "string") {
    if (options.objname.length === 0) {
      throw new Error(`Invalid Option: objname must be a non empty string`);
    }
    // Great, nothing to do
  } else if (typeof options.objname !== "number") {
    throw new Error(
      `Invalid Option: objname must be a string or a buffer, got ${options.objname}`,
    );
  }
  if (options.objname !== undefined) {
    if (typeof options.objname === "number") {
      if (options.columns !== false) {
        throw Error(
          "Invalid Option: objname index cannot be combined with columns or be defined as a field",
        );
      }
    } else {
      // A string or a buffer
      if (options.columns === false) {
        throw Error(
          "Invalid Option: objname field must be combined with columns or be defined as an index",
        );
      }
    }
  }
  // Normalize option `on_record`
  if (options.on_record === undefined || options.on_record === null) {
    options.on_record = undefined;
  } else if (typeof options.on_record !== "function") {
    throw new CsvError(
      "CSV_INVALID_OPTION_ON_RECORD",
      [
        "Invalid option `on_record`:",
        "expect a function,",
        `got ${JSON.stringify(options.on_record)}`,
      ],
      options,
    );
  }
  // Normalize option `on_skip`
  if (
    options.on_skip !== undefined &&
    options.on_skip !== null &&
    typeof options.on_skip !== "function"
  ) {
    throw new Error(
      `Invalid Option: on_skip must be a function, got ${JSON.stringify(options.on_skip)}`,
    );
  }
  // Normalize option `quote`
  if (
    options.quote === null ||
    options.quote === false ||
    options.quote === ""
  ) {
    options.quote = null;
  } else {
    if (options.quote === undefined || options.quote === true) {
      options.quote = Buffer.from('"', options.encoding);
    } else if (typeof options.quote === "string") {
      options.quote = Buffer.from(options.quote, options.encoding);
    }
    if (!Buffer.isBuffer(options.quote)) {
      throw new Error(
        `Invalid Option: quote must be a buffer or a string, got ${JSON.stringify(options.quote)}`,
      );
    }
  }
  // Normalize option `raw`
  if (
    options.raw === undefined ||
    options.raw === null ||
    options.raw === false
  ) {
    options.raw = false;
  } else if (options.raw !== true) {
    throw new Error(
      `Invalid Option: raw must be true, got ${JSON.stringify(options.raw)}`,
    );
  }
  // Normalize option `record_delimiter`
  if (options.record_delimiter === undefined) {
    // An empty list means the record delimiter is yet to be discovered
    options.record_delimiter = [];
  } else if (
    typeof options.record_delimiter === "string" ||
    Buffer.isBuffer(options.record_delimiter)
  ) {
    if (options.record_delimiter.length === 0) {
      throw new CsvError(
        "CSV_INVALID_OPTION_RECORD_DELIMITER",
        [
          "Invalid option `record_delimiter`:",
          "value must be a non empty string or buffer,",
          `got ${JSON.stringify(options.record_delimiter)}`,
        ],
        options,
      );
    }
    options.record_delimiter = [options.record_delimiter];
  } else if (!Array.isArray(options.record_delimiter)) {
    throw new CsvError(
      "CSV_INVALID_OPTION_RECORD_DELIMITER",
      [
        "Invalid option `record_delimiter`:",
        "value must be a string, a buffer or array of string|buffer,",
        `got ${JSON.stringify(options.record_delimiter)}`,
      ],
      options,
    );
  }
  options.record_delimiter = options.record_delimiter.map(function (rd, i) {
    if (typeof rd !== "string" && !Buffer.isBuffer(rd)) {
      throw new CsvError(
        "CSV_INVALID_OPTION_RECORD_DELIMITER",
        [
          "Invalid option `record_delimiter`:",
          "value must be a string, a buffer or array of string|buffer",
          `at index ${i},`,
          `got ${JSON.stringify(rd)}`,
        ],
        options,
      );
    } else if (rd.length === 0) {
      throw new CsvError(
        "CSV_INVALID_OPTION_RECORD_DELIMITER",
        [
          "Invalid option `record_delimiter`:",
          "value must be a non empty string or buffer",
          `at index ${i},`,
          `got ${JSON.stringify(rd)}`,
        ],
        options,
      );
    }
    if (typeof rd === "string") {
      rd = Buffer.from(rd, options.encoding);
    }
    return rd;
  });
  // Normalize the strictly boolean options: `undefined` and `null` default to
  // `false`, any other non boolean value is rejected. The list order defines
  // which option is reported first when several are invalid.
  for (const name of [
    "relax_column_count",
    "relax_column_count_less",
    "relax_column_count_more",
    "relax_quotes",
    "skip_empty_lines",
    "skip_records_with_empty_values",
    "skip_records_with_error",
    "rtrim",
    "ltrim",
    "trim",
  ]) {
    const value = options[name];
    if (value === undefined || value === null) {
      options[name] = false;
    } else if (typeof value !== "boolean") {
      throw new Error(
        `Invalid Option: ${name} must be a boolean, got ${JSON.stringify(value)}`,
      );
    }
  }
  // Normalize options `trim`, `ltrim` and `rtrim`
  // `trim` activates both sides unless one is explicitly set to `false`.
  if (options.trim === true && opts.ltrim !== false) {
    options.ltrim = true;
  } else if (options.ltrim !== true) {
    options.ltrim = false;
  }
  if (options.trim === true && opts.rtrim !== false) {
    options.rtrim = true;
  } else if (options.rtrim !== true) {
    options.rtrim = false;
  }
  // Normalize option `to`
  if (options.to === undefined || options.to === null) {
    options.to = -1;
  } else if (options.to !== -1) {
    if (typeof options.to === "string" && /\d+/.test(options.to)) {
      options.to = parseInt(options.to);
    }
    if (Number.isInteger(options.to)) {
      if (options.to <= 0) {
        throw new Error(
          `Invalid Option: to must be a positive integer greater than 0, got ${JSON.stringify(opts.to)}`,
        );
      }
    } else {
      throw new Error(
        `Invalid Option: to must be an integer, got ${JSON.stringify(opts.to)}`,
      );
    }
  }
  // Normalize option `to_line`
  if (options.to_line === undefined || options.to_line === null) {
    options.to_line = -1;
  } else if (options.to_line !== -1) {
    // Keep the user value for error reporting. Reading `opts.to_line` yields
    // `undefined` when the option was provided as `toLine`.
    const to_line_raw = options.to_line;
    if (typeof options.to_line === "string" && /\d+/.test(options.to_line)) {
      options.to_line = parseInt(options.to_line);
    }
    if (Number.isInteger(options.to_line)) {
      if (options.to_line <= 0) {
        throw new Error(
          `Invalid Option: to_line must be a positive integer greater than 0, got ${JSON.stringify(to_line_raw)}`,
        );
      }
    } else {
      throw new Error(
        `Invalid Option: to_line must be an integer, got ${JSON.stringify(to_line_raw)}`,
      );
    }
  }
  return options;
};
// Discussed in [issue #400](https://github.com/adaltas/node-csv/issues/400)
// See https://github.com/python/cpython/blob/ea1b1c579f600cc85d145c60862b2e6b98701b24/Lib/csv.py#L349
/**
 * Guess the field delimiter of a CSV sample.
 *
 * Each candidate character is scored from how often it occurs and how
 * regularly it is distributed across lines; the character with the highest
 * score wins. Only character codes 0 to 126 are candidates.
 *
 * @param {string|Buffer|Array<Array<string>>} records - The sample, either
 *   raw or already split into records whose first field holds the line.
 * @param {object} [options] - The normalized `delimiter_auto` configuration
 *   (`preferred` and `score`). Defaults are used when omitted.
 * @returns {string} The discovered delimiter.
 * @throws {Error} Any error raised while splitting a raw sample into records.
 */
const delimiter_discover = function (records, options) {
  // Normalize the configuration
  if (!options) {
    ({ delimiter_auto: options } = normalize_options({ delimiter_auto: true }));
  }
  // Convert String to Buffer
  if (typeof records === "string") {
    records = Buffer.from(records);
  }
  // Convert Buffer to an array of records
  if (Buffer.isBuffer(records)) {
    const lines = [];
    // Without delimiter, each line is returned as a single field
    const parser = transform({ delimiter: [] });
    const push = (record) => lines.push(record);
    const close = () => {};
    const error = parser.parse(records, true, push, close);
    if (error !== undefined) throw error;
    records = lines;
  }
  // Info array initialization, 127 entries, one per char code
  const info = Array.from({ length: 127 }, () => ({ lines: [] }));
  // Traverse each records, count occurrences per char code
  records.forEach(([record], line) => {
    // An empty record holds no field to inspect
    if (typeof record !== "string") return;
    for (let i = 0, l = record.length; i < l; i++) {
      const candidate = info[record.charCodeAt(i)];
      // Characters outside of the table (code >= 127, e.g. accented letters)
      // cannot be delimiters, ignore them instead of failing.
      if (candidate === undefined) continue;
      // Count the character frequency
      if (candidate.lines[line] === undefined) candidate.lines[line] = 0;
      candidate.lines[line]++;
    }
  });
  // Traverse each char code, compute the score
  info.forEach((candidate, i) => {
    candidate.char_code = i;
    candidate.std = std(candidate.lines);
    candidate.total = candidate.lines.reduce((acc, val) => acc + val, 0);
    candidate.preferred = !!options.preferred[i];
    candidate.score = options.score(candidate, options);
  });
  // Extract the dominant character; on equal scores the later one wins
  const result = info.reduce(
    (acc, candidate) => (acc.score > candidate.score ? acc : candidate),
    {},
  );
  return String.fromCharCode(result.char_code);
};
/**
 * Compute the population standard deviation of a list of numbers.
 *
 * @param {number[]} array - The values.
 * @returns {number} The standard deviation, `0` for an empty list.
 */
const std = function (array) {
  const count = array.length;
  if (count === 0) {
    return 0;
  }
  const sum = (total, value) => total + value;
  const mean = array.reduce(sum) / count;
  const squaredDeviations = array.map((value) => (value - mean) ** 2);
  const variance = squaredDeviations.reduce(sum) / count;
  return Math.sqrt(variance);
};
/**
 * Tell whether every field of a record is blank.
 *
 * A field is blank when it is `null`, `undefined`, or when its string
 * representation only contains whitespace.
 *
 * @param {Array} record - The fields of the record.
 * @returns {boolean} `true` when no field carries content.
 */
const isRecordEmpty = function (record) {
  const hasContent = (field) => {
    if (field == null) {
      return false;
    }
    // Values which cannot be converted to a string are considered content
    if (!field.toString) {
      return true;
    }
    return field.toString().trim() !== "";
  };
  return !record.some(hasContent);
};
const cr = 13; // `\r`, carriage return, 0x0D in hexadecimal, 13 in decimal
const nl = 10; // `\n`, newline, 0x0A in hexadecimal, 10 in decimal
// Byte order marks, keyed by the encoding they announce.
const boms = {
// Note, the following are equal:
// Buffer.from("\ufeff")
// Buffer.from([239, 187, 191])
// Buffer.from('EFBBBF', 'hex')
utf8: Buffer.from([239, 187, 191]),
// Note, the following are equal:
// Buffer.from("\ufeff", "utf16le")
// Buffer.from([255, 254])
utf16le: Buffer.from([255, 254]),
};
const transform = function (original_options = {}) {
const info = {
bytes: 0,
bytes_records: 0,
comment_lines: 0,
empty_lines: 0,
invalid_field_length: 0,
lines: 1,
records: 0,
};
const options = normalize_options(original_options);
return {
info: info,
original_options: original_options,
options: options,
state: init_state(options),
__needMoreData: function (i, bufLen, end) {
if (end) return false;
const { encoding, escape, quote } = this.options;
const { quoting, needMoreDataSize, recordDelimiterMaxLength } =
this.state;
const numOfCharLeft = bufLen - i - 1;
const requiredLength = Math.max(
needMoreDataSize,
// Skip if the remaining buffer smaller than record delimiter
// If "record_delimiter" is yet to be discovered:
// 1. It is equals to `[]` and "recordDelimiterMaxLength" equals `0`
// 2. We set the length to windows line ending in the current encoding
// Note, that encoding is known from user or bom discovery at that point
// recordDelimiterMaxLength,
recordDelimiterMaxLength === 0
? Buffer.from("\r\n", encoding).length
: recordDelimiterMaxLength,
// Skip if remaining buffer can be an escaped quote
quoting ? (escape === null ? 0 : escape.length) + quote.length : 0,
// Skip if remaining buffer can be record delimiter following the closing quote
quoting ? quote.length + recordDelimiterMaxLength : 0,
);
return numOfCharLeft < requiredLength;
},
// Central parser implementation
parse: function (nextBuf, end, push, close) {
const {
bom,
comment_no_infix,
delimiter_auto,
encoding,
from_line,
ltrim,
max_record_size,
raw,
relax_quotes,
rtrim,
skip_empty_lines,
to,
to_line,
} = this.options;
let { comment, escape, quote, record_delimiter } = this.options;
const {
bomSkipped,
delimiterDiscovered,
delimiterBufPrevious,
rawBuffer,
escapeIsQuote,
} = this.state;
// Automatic delimiter discovery
if (!delimiterDiscovered && delimiter_auto) {
let delimiterBuf;
if (delimiterBufPrevious === undefined) {
delimiterBuf = nextBuf;
} else if (
delimiterBufPrevious !== undefined &&
nextBuf === undefined
) {
delimiterBuf = delimiterBufPrevious;
} else {
delimiterBuf = Buffer.concat([delimiterBufPrevious, nextBuf]);
}
// Ensure that nextBuf is not concatenated a second time during buffer reconciliation
nextBuf = undefined;
// this.delimiterBufPrevious = delimiterBuf;
if (end || delimiterBuf.length > delimiter_auto.size) {
this.options.delimiter = [
Buffer.from(
delimiter_discover(delimiterBuf, this.options.delimiter_auto),
),
];
this.state.previousBuf = delimiterBuf;
this.state.delimiterBufPrevious = undefined;
this.state.delimiterDiscovered = true;
} else {
this.state.delimiterBufPrevious = delimiterBuf;
return;
}
}
// Previous buffers reconciliation
const { previousBuf } = this.state;
let buf;
if (previousBuf === undefined) {
if (nextBuf === undefined) {
// Handle empty string
close();
return;
} else {
buf = nextBuf;
}
} else if (previousBuf !== undefined && nextBuf === undefined) {
buf = previousBuf;
} else {
buf = Buffer.concat([previousBuf, nextBuf]);
}
// Handle UTF BOM
if (bomSkipped === false) {
if (bom === false) {
this.state.bomSkipped = true;
} else if (buf.length < 3) {
// Not enough data
if (end === false) {
// Wait for more data
this.state.previousBuf = buf;
return;
}
} else {
for (const encoding in boms) {
if (boms[encoding].compare(buf, 0, boms[encoding].length) === 0) {
// Skip BOM
const bomLength = boms[encoding].length;
this.state.bufBytesStart += bomLength;
buf = buf.slice(bomLength);
// Renormalize original options with the new encoding
const options = normalize_options({
...this.original_options,
encoding: encoding,
});
// Properties are merged with the existing options instance
for (const key in options) {
this.options[key] = options[key];
}
// Options will re-evaluate the Buffer with the new encoding
({ comment, escape, quote } = this.options);
break;
}
}
this.state.bomSkipped = true;
}
}
const bufLen = buf.length;
let pos;
for (pos = 0; pos < bufLen; pos++) {
// Ensure we get enough space to look ahead
// There should be a way to move this out of the loop
if (this.__needMoreData(pos, bufLen, end)) {
break;
}
if (this.state.wasRowDelimiter === true) {
this.info.lines++;
this.state.wasRowDelimiter = false;
}
if (to_line !== -1 && this.info.lines > to_line) {
this.state.stop = true;
close();
return;
}
// Auto discovery of record_delimiter, unix, mac and windows supported
if (this.state.quoting === false && record_delimiter.length === 0) {
const record_delimiterCount = this.__autoDiscoverRecordDelimiter(
buf,
pos,
);
if (record_delimiterCount) {
record_delimiter = this.options.record_delimiter;
}
}
const chr = buf[pos];
if (raw === true) {
rawBuffer.append(chr);
}
if (
(chr === cr || chr === nl) &&
this.state.wasRowDelimiter === false
) {
this.state.wasRowDelimiter = true;
}
// Previous char was a valid escape char
// treat the current char as a regular char
if (this.state.escaping === true) {
this.state.escaping = false;
} else {
// Escape is only active inside quoted fields
// We are quoting, the char is an escape chr and there is a chr to escape
// if(escape !== null && this.state.quoting === true && chr === escape && pos + 1 < bufLen){
if (
escape !== null &&
this.state.quoting === true &&
this.__isEscape(buf, pos, chr) &&
pos + escape.length < bufLen
) {
if (escapeIsQuote) {
if (this.__isQuote(buf, pos + escape.length)) {
this.state.escaping = true;
pos += escape.length - 1;
continue;
}
} else {
this.state.escaping = true;
pos += escape.length - 1;
continue;
}
}
// Not currently escaping and chr is a quote
// TODO: need to compare bytes instead of single char
if (this.state.commenting === false && this.__isQuote(buf, pos)) {
if (this.state.quoting === true) {
const nextChr = buf[pos + quote.length];
const isNextChrTrimable =
rtrim && this.__isCharTrimable(buf, pos + quote.length);
const isNextChrComment =
comment !== null &&
this.__compareBytes(comment, buf, pos + quote.length, nextChr);
const isNextChrDelimiter = this.__isDelimiter(
buf,
pos + quote.length,
nextChr,
);
const isNextChrRecordDelimiter =
record_delimiter.length === 0
? this.__autoDiscoverRecordDelimiter(buf, pos + quote.length)
: this.__isRecordDelimiter(nextChr, buf, pos + quote.length);
// Escape a quote
// Treat next char as a regular character
if (
escape !== null &&
this.__isEscape(buf, pos, chr) &&
this.__isQuote(buf, pos + escape.length)
) {
pos += escape.length - 1;
} else if (
!nextChr ||
isNextChrDelimiter ||
isNextChrRecordDelimiter ||
isNextChrComment ||
isNextChrTrimable
) {
this.state.quoting = false;
this.state.wasQuoting = true;
pos += quote.length - 1;
continue;
} else if (relax_quotes === false) {
const err = this.__error(
new CsvError(
"CSV_INVALID_CLOSING_QUOTE",
[
"Invalid Closing Quote:",
`got "${String.fromCharCode(nextChr)}"`,
`at line ${this.info.lines}`,
"instead of delimiter, record delimiter, trimable character",
"(if activated) or comment",
],
this.options,
this.__infoField(),
),
);
if (err !== undefined) return err;
} else {
this.state.quoting = false;
this.state.wasQuoting = true;
this.state.field.prepend(quote);
pos += quote.length - 1;
}
} else {
if (this.state.field.length !== 0) {
// In relax_quotes mode, treat opening quote preceded by chrs as regular
if (relax_quotes === false) {
const info = this.__infoField();
const bom = Object.keys(boms)
.map((b) =>
boms[b].equals(this.state.field.toString()) ? b : false,
)
.filter(Boolean)[0];
const err = this.__error(
new CsvError(
"INVALID_OPENING_QUOTE",
[
"Invalid Opening Quote:",
`a quote is found on field ${JSON.stringify(info.column)} at line ${info.lines}, value is ${JSON.stringify(this.state.field.toString(encoding))}`,
bom ? `(${bom} bom)` : undefined,
],
this.options,
info,
{
field: this.state.field,
},
),
);
if (err !== undefined) return err;
}
} else {
this.state.quoting = true;
pos += quote.length - 1;
continue;
}
}
}
if (this.state.quoting === false) {
const recordDelimiterLength = this.__isRecordDelimiter(
chr,
buf,
pos,
);
if (recordDelimiterLength !== 0) {
// Do not emit comments which take a full line
const skipCommentLine =
this.state.commenting &&
this.state.wasQuoting === false &&
this.state.record.length === 0 &&
this.state.field.length === 0;
if (skipCommentLine) {
this.info.comment_lines++;
// Skip full comment line
} else {
// Activate records emission if above from_line
if (
this.state.enabled === false &&
this.info.lines +
(this.state.wasRowDelimiter === true ? 1 : 0) >=
from_line
) {
this.state.enabled = true;
this.__resetField();
this.__resetRecord();
pos += recordDelimiterLength - 1;
continue;
}
// Skip if line is empty and skip_empty_lines activated
if (
skip_empty_lines === true &&
this.state.wasQuoting === false &&
this.state.record.length === 0 &&
this.state.field.length === 0
) {
this.info.empty_lines++;
pos += recordDelimiterLength - 1;
continue;
}
this.info.bytes = this.state.bufBytesStart + pos;
const errField = this.__onField();
if (errField !== undefined) return errField;
this.info.bytes =
this.state.bufBytesStart + pos + recordDelimiterLength;
const errRecord = this.__onRecord(push);
if (errRecord !== undefined) return errRecord;
if (to !== -1 && this.info.records >= to) {
this.state.stop = true;
close();
return;
}
}
this.state.commenting = false;
pos += recordDelimiterLength - 1;
continue;
}
if (this.state.commenting) {
continue;
}
if (
comment !== null &&
(comment_no_infix === false ||
(this.state.record.length === 0 &&
this.state.field.length === 0))
) {
const commentCount = this.__compareBytes(comment, buf, pos, chr);
if (commentCount !== 0) {
this.state.commenting = true;
continue;
}
}
const delimiterLength = this.__isDelimiter(buf, pos, chr);
if (delimiterLength !== 0) {
this.info.bytes = this.state.bufBytesStart + pos;
const errField = this.__onField();
if (errField !== undefined) return errField;
pos += delimiterLength - 1;
continue;
}
}
}
if (this.state.commenting === false) {
if (
max_record_size !== 0 &&
this.state.record_length + this.state.field.length > max_record_size
) {
return this.__error(
new CsvError(
"CSV_MAX_RECORD_SIZE",
[
"Max Record Size:",
"record exceed the maximum number of tolerated bytes",
`of ${max_record_size}`,
`at line ${this.info.lines}`,
],
this.options,
this.__infoField(),
),
);
}
}
const lappend =
ltrim === false ||
this.state.quoting === true ||
this.state.field.length !== 0 ||