// csv-parse
// Version:
// CSV parsing implementing the Node.js `stream.Transform` API
// 725 lines (698 loc) • 24.6 kB
// JavaScript
// Generated by CoffeeScript 2.3.2
// # CSV Parser
// This module provides a CSV parser tested and used against large datasets. Over the years, it has been enhanced and is now full of useful options.
// Please look at the [project website](https://csv.js.org/parse/) for additional information.
var Parser, StringDecoder, isObjLiteral, stream, util;
stream = require('stream');
util = require('util');
({StringDecoder} = require('string_decoder'));
// ## Usage
// Callback approach, for ease of use:
// `parse(data, [options], callback)`
// [Node.js Stream API](http://nodejs.org/api/stream.html), for maximum power:
// `parse([options], [callback])`
// Create a parser and, optionally, feed it `data` and collect its output.
//
// Accepted call shapes:
// * `parse(data, options, callback)`
// * `parse(data, callback)`, `parse(data, options)`, `parse(options, callback)`
// * `parse(options)`, `parse(callback)`, `parse()`
//
// * `data` String | Buffer — written to the parser on the next tick, then the parser is ended
// * `options` Object — handed to `new Parser(options)`
// * `callback` Function — called once, either as `callback(err)` or as
//   `callback(null, records)`; `records` is an array, or an object keyed by the
//   first element of each record when `options.objname` is set
//
// Returns the `Parser` instance. Invalid arguments are reported through the
// callback when one was supplied and thrown otherwise; with three arguments a
// non-function callback is always thrown.
module.exports = function(...args) {
  let data;
  let options;
  let callback;
  if (args.length === 3) {
    [data, options, callback] = args;
    if (typeof callback !== 'function') {
      throw Error(`Invalid callback argument: ${JSON.stringify(callback)}`);
    }
    if (!(typeof data === 'string' || Buffer.isBuffer(data))) {
      return callback(Error(`Invalid data argument: ${JSON.stringify(data)}`));
    }
  } else if (args.length === 2) {
    const [first, second] = args;
    let err;
    // 1st arg is data:string or options:object
    if (typeof first === 'string' || Buffer.isBuffer(first)) {
      data = first;
    } else if (isObjLiteral(first)) {
      options = first;
    } else {
      err = `Invalid first argument: ${JSON.stringify(first)}`;
    }
    // 2nd arg is options:object or callback:function
    if (typeof second === 'function') {
      callback = second;
    } else if (isObjLiteral(second)) {
      if (options) {
        err = 'Invalid arguments: got options twice as first and second arguments';
      } else {
        options = second;
      }
    } else {
      // The message used to blame the first argument here.
      err = `Invalid second argument: ${JSON.stringify(second)}`;
    }
    if (err) {
      if (!callback) {
        throw Error(err);
      }
      return callback(Error(err));
    }
  } else if (args.length === 1) {
    if (typeof args[0] === 'function') {
      callback = args[0];
    } else {
      options = args[0];
    }
  }
  if (options == null) {
    options = {};
  }
  const parser = new Parser(options);
  if (data != null) {
    // Defer the write so the caller can attach listeners to the returned parser first.
    process.nextTick(function() {
      parser.write(data);
      return parser.end();
    });
  }
  if (callback) {
    let called = false;
    const chunks = options.objname ? {} : [];
    parser.on('readable', function() {
      let chunk;
      while ((chunk = parser.read())) {
        if (options.objname) {
          chunks[chunk[0]] = chunk[1];
        } else {
          chunks.push(chunk);
        }
      }
    });
    parser.on('error', function(err) {
      // Honour the "callback is called once" contract even if several errors are emitted.
      if (called) {
        return;
      }
      called = true;
      return callback(err);
    });
    parser.on('end', function() {
      if (!called) {
        called = true;
        return callback(null, chunks);
      }
    });
  }
  return parser;
};
// ## `Parser([options])`
// Options are documented [here](http://csv.js.org/parse/options/).
Parser = function(options = {}) {
  // Work on a private copy so the caller's object is left untouched.
  this.options = {};
  for (const name in options) {
    this.options[name] = options[name];
  }
  // Records are emitted as arrays/objects, so the stream is always in object mode.
  this.options.objectMode = true;
  stream.Transform.call(this, this.options);
  const settings = this.options;
  // Assign `value` only when the option is currently null or undefined.
  const fallback = function(name, value) {
    if (settings[name] == null) {
      settings[name] = value;
    }
  };
  fallback('rowDelimiter', null);
  // A single delimiter is normalised to a one-element list.
  if (typeof settings.rowDelimiter === 'string') {
    settings.rowDelimiter = [settings.rowDelimiter];
  }
  fallback('delimiter', ',');
  // Any defined-but-falsy quote (null, false, 0, '') becomes the empty string.
  if (settings.quote !== void 0 && !settings.quote) {
    settings.quote = '';
  }
  fallback('quote', '"');
  fallback('escape', '"');
  fallback('columns', null);
  fallback('comment', '');
  fallback('objname', false);
  fallback('trim', false);
  fallback('ltrim', false);
  fallback('rtrim', false);
  // `auto_parse` and `auto_parse_date` are copied over `cast` and `cast_date` when provided.
  if (settings.auto_parse != null) {
    settings.cast = settings.auto_parse;
  }
  fallback('cast', false);
  if (settings.auto_parse_date != null) {
    settings.cast_date = settings.auto_parse_date;
  }
  fallback('cast_date', false);
  if (settings.cast_date === true) {
    // Default date caster: values `Date.parse` cannot read are returned unchanged.
    settings.cast_date = function(value) {
      const timestamp = Date.parse(value);
      return isNaN(timestamp) ? value : new Date(timestamp);
    };
  }
  fallback('relax', false);
  fallback('relax_column_count', false);
  fallback('skip_empty_lines', false);
  fallback('max_limit_on_data_read', 128000);
  fallback('skip_lines_with_empty_values', false);
  fallback('skip_lines_with_error', false);
  // Counters
  // lines = count + skipped_line_count + empty_line_count
  this.lines = 0; // Number of lines encountered in the source dataset
  this.count = 0; // Number of records being processed
  this.skipped_line_count = 0; // Number of records skipped due to errors
  this.empty_line_count = 0; // Number of empty lines
  // Matchers used by the built-in cast
  this.is_int = /^(\-|\+)?([1-9]+[0-9]*)$/;
  this.is_float = function(value) {
    return (value - parseFloat(value) + 1) >= 0; // Borrowed from jquery
  };
  // Internal private state
  this._ = {
    decoder: new StringDecoder(),
    quoting: false,
    commenting: false,
    field: null,
    nextChar: null,
    closingQuote: 0,
    line: [],
    chunks: [],
    rawBuf: '',
    buf: '',
    // Length of the longest configured row delimiter; undefined until one is known.
    rowDelimiterMaxLength: settings.rowDelimiter ? Math.max(...settings.rowDelimiter.map((rd) => rd.length)) : void 0,
    lineHasError: false,
    isEnded: false
  };
  return this;
};
// ## Internal API
// The Parser implements a [`stream.Transform` class](https://nodejs.org/api/stream.html#stream_class_stream_transform).
// ### Events
// The library extends the Node [EventEmitter][event] class and emits all the events of the Writable and Readable [Stream API](http://nodejs.org/api/stream.html).
// Set up prototype inheritance so `Parser` instances get the `stream.Transform` methods.
util.inherits(Parser, stream.Transform);
// For extra flexibility, you can get access to the original Parser class: `require('csv-parse').Parser`.
// The constructor is attached as a property of the exported function.
module.exports.Parser = Parser;
// ### `_transform(chunk, encoding, callback)`
// * `chunk` Buffer | String
// The chunk to be transformed. Will always be a buffer unless the decodeStrings option was set to false.
// * `encoding` String
// If the chunk is a string, then this is the encoding type. (Ignore if decodeStrings chunk is a buffer.)
// * `callback` Function
// Call this function (optionally with an error argument) when you are done processing the supplied chunk.
// Implementation of the [`stream.Transform` API](https://nodejs.org/api/stream.html#stream_class_stream_transform)
// Decode `chunk` when it is a Buffer, run it through `__write` on a later turn
// of the event loop (`setImmediate`) and report the outcome through `callback`.
// * `chunk` Buffer | String — data to parse
// * `encoding` String — not used by this implementation
// * `callback` Function — called with the parsing error, or with no argument on success
// A parsing error is handed to `callback` rather than emitted by hand: the
// stream then raises `error` itself and the pending write is always settled.
Parser.prototype._transform = function(chunk, encoding, callback) {
  return setImmediate(() => {
    const text = chunk instanceof Buffer ? this._.decoder.write(chunk) : chunk;
    const err = this.__write(text, false);
    if (err) {
      return callback(err);
    }
    return callback();
  });
};
// `stream.Transform` flush hook: finish parsing through `__flush` and pass
// whatever it returns to `callback` as its first argument.
Parser.prototype._flush = function(callback) {
  const outcome = this.__flush();
  return callback(outcome);
};
// Finish parsing: push the decoder's remaining characters through `__write`
// with `end` set, then deal with what is left in the parser state.
// Returns the error from `__write`, the result of `this.error(...)` when a
// quoted field is still open, the result of `__push` when a record is pending,
// and `undefined` otherwise.
Parser.prototype.__flush = function() {
  const writeError = this.__write(this._.decoder.end(), true);
  if (writeError) {
    return writeError;
  }
  if (this._.quoting) {
    return this.error(`Quoted field not terminated at line ${this.lines + 1}`);
  }
  if (this._.line.length > 0) {
    return this.__push(this._.line);
  }
};
// Turn one parsed `line` (array of field values) into a record and push it
// downstream.
// * `line` Array — the fields of the record that was just completed
// Returns an Error when the record must abort parsing, a falsy value (`null`
// or `undefined`) otherwise. Every failure is *returned*, never thrown: this
// runs inside `setImmediate`/flush, where a throw would not reach the caller.
Parser.prototype.__push = function(line) {
  // Once the `to` limit ended the stream, later records are dropped.
  if (this._.isEnded) {
    return;
  }
  if (this.options.skip_lines_with_empty_values && line.join('').trim() === '') {
    return;
  }
  if (this.options.columns === true) {
    // The first record is the header: keep it as the column list and emit nothing.
    this.options.columns = line;
    return;
  }
  if (typeof this.options.columns === 'function') {
    // User-defined header mapping: its result replaces the `columns` option.
    let columns;
    try {
      columns = this.options.columns.call(null, line);
    } catch (error) {
      if (error) {
        return error;
      }
    }
    this.options.columns = columns;
    return;
  }
  // The first non-empty record (or the header) fixes the expected field count.
  if (!this._.line_length && line.length > 0) {
    this._.line_length = this.options.columns ? this.options.columns.length : line.length;
  }
  // Don't check the column count on empty lines
  if (line.length === 1 && line[0] === '') {
    this.empty_line_count++;
  } else if (line.length !== this._.line_length) {
    // Don't check the column count with relax_column_count
    if (this.options.relax_column_count) {
      this.count++;
      this.skipped_line_count++;
    } else if (this.options.columns != null) {
      // `this.error` returns null in skip mode, in which case the record is silently dropped.
      return this.error(`Number of columns on line ${this.lines} does not match header`);
    } else {
      return this.error(`Number of columns is inconsistent on line ${this.lines}`);
    }
  } else {
    this.count++;
  }
  let record;
  if (this.options.columns != null) {
    const lineAsColumns = {};
    for (let i = 0; i < line.length; i++) {
      const columnName = this.options.columns[i];
      // undefined, null and false column names mean "skip this field".
      if (columnName === void 0 || columnName === null || columnName === false) {
        continue;
      }
      if (typeof columnName !== 'string') {
        // Returned like every other failure so it reaches the stream's error path.
        return Error(`Invalid column name ${JSON.stringify(columnName)}`);
      }
      lineAsColumns[columnName] = line[i];
    }
    if (this.options.objname) {
      record = [lineAsColumns[this.options.objname], lineAsColumns];
    } else {
      record = lineAsColumns;
    }
  } else {
    record = line;
  }
  if (this.count < this.options.from) {
    return;
  }
  if (this.options.raw) {
    this.push({
      raw: this._.rawBuf,
      row: record
    });
    this._.rawBuf = '';
  } else {
    this.push(record);
  }
  if (this.listenerCount('record')) {
    this.emit('record', record);
  }
  // When `to` is reached, end the readable side and ignore any future calls
  if (this.count >= this.options.to) {
    this._.isEnded = true;
    // Do not hand the boolean result of `push` back to callers, which treat
    // any truthy return value as an error.
    this.push(null);
    return null;
  }
  return null;
};
// ### `__write(chars, end)`
// Core of the parser: scan `chars` (prefixed with the unread tail kept in
// `this._.buf` by the previous call) one character at a time, build fields in
// `this._.field`, collect them in `this._.line` and hand each completed record
// to `__push`.
// * `chars` String — newly decoded text
// * `end` Boolean — `true` when no more input will follow; disables the
//   look-ahead guard and flushes the pending field
// Returns a truthy value (an Error created here, or whatever `this.error` /
// `__push` returned) when parsing stops, and `null` otherwise.
Parser.prototype.__write = function(chars, end) {
var areNextCharsDelimiter, areNextCharsRowDelimiters, cast, char, err, escapeIsQuote, i, isDelimiter, isEscape, isNextCharAComment, isNextCharTrimable, isQuote, isRowDelimiter, isRowDelimiterLength, is_float, is_int, l, ltrim, nextCharPos, ref, ref1, ref2, ref3, ref4, ref5, ref6, remainingBuffer, rowDelimiter, rtrim, wasCommenting;
// `this.is_int` may be either a function or a RegExp; call it accordingly.
is_int = (value) => {
if (typeof this.is_int === 'function') {
return this.is_int(value);
} else {
return this.is_int.test(value);
}
};
// Same dispatch for `this.is_float`.
is_float = (value) => {
if (typeof this.is_float === 'function') {
return this.is_float(value);
} else {
return this.is_float.test(value);
}
};
// Convert a raw field value according to the `cast` option.
// Missing `context` properties (quoting, lines, count, index, header, column)
// are filled from the current parser state before the value is converted.
cast = (value, context = {}) => {
if (!this.options.cast) {
return value;
}
if (context.quoting == null) {
context.quoting = !!this._.closingQuote;
}
if (context.lines == null) {
context.lines = this.lines;
}
if (context.count == null) {
context.count = this.count;
}
if (context.index == null) {
context.index = this._.line.length;
}
// context.header ?= if @options.column and @lines is 1 and @count is 0 then true else false
// `columns` is still `true` only until `__push` has replaced it with the header line.
if (context.header == null) {
context.header = this.options.columns === true;
}
// Column name when the column list is an array, field index otherwise.
if (context.column == null) {
context.column = Array.isArray(this.options.columns) ? this.options.columns[context.index] : context.index;
}
// A user-supplied cast function takes over entirely.
if (typeof this.options.cast === 'function') {
return this.options.cast(value, context);
}
// Built-in cast: integer, then float, then the optional date caster.
if (is_int(value)) {
value = parseInt(value);
} else if (is_float(value)) {
value = parseFloat(value);
} else if (this.options.cast_date) {
value = this.options.cast_date(value, context);
}
return value;
};
// `trim` enables trimming on both sides.
ltrim = this.options.trim || this.options.ltrim;
rtrim = this.options.trim || this.options.rtrim;
// Resume from the characters left unread by the previous call.
chars = this._.buf + chars;
l = chars.length;
i = 0;
if (this.lines === 0 && 0xFEFF === chars.charCodeAt(0)) {
// Strip BOM header
i++;
}
while (i < l) {
// Ensure we get enough space to look ahead
// While more input may follow, stop before a tail that could be the beginning
// of a multi-character token; the unread tail is saved in `this._.buf` below.
if (!end) {
remainingBuffer = chars.substr(i, l - i);
// (i+1000 >= l) or
// Stop when no row delimiter is known yet and fewer than 3 characters remain
// Skip if the remaining buffer can be comment
// Skip if the remaining buffer can be row delimiter
if ((!this.options.rowDelimiter && i + 3 > l) || (!this._.commenting && l - i < this.options.comment.length && this.options.comment.substr(0, l - i) === remainingBuffer) || (this.options.rowDelimiter && l - i < this._.rowDelimiterMaxLength && this.options.rowDelimiter.some(function(rd) {
return rd.substr(0, l - i) === remainingBuffer;
// Skip if the remaining buffer can be row delimiter following the closing quote
})) || (this.options.rowDelimiter && this._.quoting && l - i < (this.options.quote.length + this._.rowDelimiterMaxLength) && this.options.rowDelimiter.some((rd) => {
return (this.options.quote + rd).substr(0, l - i) === remainingBuffer;
// Skip if the remaining buffer can be delimiter
// Skip if the remaining buffer can be escape sequence
})) || (l - i <= this.options.delimiter.length && this.options.delimiter.substr(0, l - i) === remainingBuffer) || (l - i <= this.options.escape.length && this.options.escape.substr(0, l - i) === remainingBuffer)) {
break;
}
}
// Use the character cached in `nextChar` when there is one, then cache the
// following character (null at the end of the buffer).
char = this._.nextChar ? this._.nextChar : chars.charAt(i);
this._.nextChar = l > i + 1 ? chars.charAt(i + 1) : null;
if (this.options.raw) {
this._.rawBuf += char;
}
// Auto discovery of rowDelimiter, unix, mac and windows supported
if (this.options.rowDelimiter == null) {
nextCharPos = i;
rowDelimiter = null;
// First empty line
if (!this._.quoting && (char === '\n' || char === '\r')) {
rowDelimiter = char;
nextCharPos += 1;
} else if (this._.quoting && char === this.options.quote && ((ref = this._.nextChar) === '\n' || ref === '\r')) {
// Closing quote immediately followed by a line break
rowDelimiter = this._.nextChar;
nextCharPos += 2;
}
if (rowDelimiter) {
// "\r" followed by "\n" is taken as a single two-character delimiter
if (rowDelimiter === '\r' && chars.charAt(nextCharPos) === '\n') {
rowDelimiter += '\n';
}
// The discovered delimiter is stored in the options and used from now on
this.options.rowDelimiter = [rowDelimiter];
this._.rowDelimiterMaxLength = rowDelimiter.length;
}
}
// Parse the current character
// Note, shouldn't we have sth like chars.substr(i, @options.escape.length)
if (!this._.commenting && char === this.options.escape) {
// Make sure the escape is really here for escaping:
// If escape is same as quote, and escape is first char of a field
// and it's not quoted, then it is a quote
// Next char should be an escape or a quote
escapeIsQuote = this.options.escape === this.options.quote;
isEscape = this._.nextChar === this.options.escape;
isQuote = this._.nextChar === this.options.quote;
if (!(escapeIsQuote && !this._.field && !this._.quoting) && (isEscape || isQuote)) {
// Drop the escape character and append the escaped one to the field
i++;
char = this._.nextChar;
this._.nextChar = chars.charAt(i + 1);
if (this._.field == null) {
this._.field = '';
}
this._.field += char;
// Since we're skipping the next one, better add it now if in raw mode.
if (this.options.raw) {
this._.rawBuf += char;
}
i++;
continue;
}
}
// Char match quote
if (!this._.commenting && char === this.options.quote) {
// NOTE(review): `this.error` returns null when `skip_lines_with_error` is set;
// this then returns before the unread characters are stored in `this._.buf`
// — confirm that dropping the rest of the chunk is intended.
if (this._.acceptOnlyEmptyChars && (char !== ' ' && char !== '\t')) {
return this.error('Only trimable characters are accepted after quotes');
}
if (this._.quoting) {
// Make sure a closing quote is followed by a delimiter
// If we have a next character and
// it isn't a rowDelimiter and
// it isn't a column delimiter and
// it isn't the beginning of a comment
// Otherwise, if this is not "relax" mode, throw an error
isNextCharTrimable = rtrim && ((ref1 = this._.nextChar) === ' ' || ref1 === '\t');
areNextCharsRowDelimiters = this.options.rowDelimiter && this.options.rowDelimiter.some(function(rd) {
return chars.substr(i + 1, rd.length) === rd;
});
areNextCharsDelimiter = chars.substr(i + 1, this.options.delimiter.length) === this.options.delimiter;
isNextCharAComment = this._.nextChar === this.options.comment;
if ((this._.nextChar != null) && !isNextCharTrimable && !areNextCharsRowDelimiters && !areNextCharsDelimiter && !isNextCharAComment) {
if (this.options.relax) {
// Relax mode: leave quoting and put the opening quote back in front of the field
this._.quoting = false;
if (this._.field) {
this._.field = `${this.options.quote}${this._.field}`;
}
} else {
// In skip mode `this.error` returns null and parsing carries on
if (err = this.error(`Invalid closing quote at line ${this.lines + 1}; found ${JSON.stringify(this._.nextChar)} instead of delimiter ${JSON.stringify(this.options.delimiter)}`)) {
return err;
}
}
} else if ((this._.nextChar != null) && isNextCharTrimable) {
// Closing quote followed by whitespace while right-trimming: from here on
// only spaces and tabs are accepted until the next delimiter
i++;
this._.quoting = false;
this._.closingQuote = this.options.quote.length;
this._.acceptOnlyEmptyChars = true;
continue;
} else {
// Regular closing quote
i++;
this._.quoting = false;
this._.closingQuote = this.options.quote.length;
// The quote was the very last character of the input: flush the field now
if (end && i === l) {
this._.line.push(cast(this._.field || ''));
this._.field = null;
}
continue;
}
} else if (!this._.field) {
// Quote at the start of a field (or after only trimmed content): start quoting
this._.quoting = true;
i++;
continue;
} else if ((this._.field != null) && !this.options.relax) {
// Quote in the middle of an unquoted field
if (err = this.error(`Invalid opening quote at line ${this.lines + 1}`)) {
return err;
}
}
}
// Otherwise, treat quote as a regular character
isRowDelimiter = this.options.rowDelimiter && this.options.rowDelimiter.some(function(rd) {
return chars.substr(i, rd.length) === rd;
});
// Count a line on every row delimiter and on the last character of the final input
if (isRowDelimiter || (end && i === l - 1)) {
this.lines++;
}
// Set the commenting flag
wasCommenting = false;
if (!this._.commenting && !this._.quoting && this.options.comment && chars.substr(i, this.options.comment.length) === this.options.comment) {
this._.commenting = true;
} else if (this._.commenting && isRowDelimiter) {
// A comment runs until the next row delimiter
wasCommenting = true;
this._.commenting = false;
}
isDelimiter = chars.substr(i, this.options.delimiter.length) === this.options.delimiter;
// After a closing quote followed by whitespace, only spaces and tabs may
// appear before the next delimiter or row delimiter
if (this._.acceptOnlyEmptyChars) {
if (isDelimiter || isRowDelimiter) {
this._.acceptOnlyEmptyChars = false;
} else {
if (char === ' ' || char === '\t') {
i++;
continue;
} else {
// NOTE(review): same early-return caveat as above when `this.error` returns null
return this.error('Only trimable characters are accepted after quotes');
}
}
}
if (!this._.commenting && !this._.quoting && (isDelimiter || isRowDelimiter)) {
// Length of the first configured row delimiter matching at position `i`
if (isRowDelimiter) {
isRowDelimiterLength = this.options.rowDelimiter.filter(function(rd) {
return chars.substr(i, rd.length) === rd;
})[0].length;
}
// Empty lines
// Skipped when they end a comment or when `skip_empty_lines` is set
if (isRowDelimiter && this._.line.length === 0 && (this._.field == null)) {
if (wasCommenting || this.options.skip_empty_lines) {
i += isRowDelimiterLength;
this._.nextChar = chars.charAt(i);
continue;
}
}
// Right-trim unquoted fields only
if (rtrim) {
if (!this._.closingQuote) {
this._.field = (ref2 = this._.field) != null ? ref2.trimRight() : void 0;
}
}
this._.line.push(cast(this._.field || ''));
this._.closingQuote = 0;
this._.field = null;
// End of field
// Ensure that the delimiter doesn't match as well the rowDelimiter
if (isDelimiter && !isRowDelimiter) {
i += this.options.delimiter.length;
this._.nextChar = chars.charAt(i);
// Input ends right after a delimiter: add the trailing empty field and close the record
if (end && !this._.nextChar) {
isRowDelimiter = true;
this._.line.push('');
}
}
if (isRowDelimiter) { // End of record
// Lines flagged by `this.error` in skip mode are dropped instead of pushed
if (!this._.lineHasError) {
err = this.__push(this._.line);
if (err) {
return err;
}
}
if (this._.lineHasError) {
this._.lineHasError = false;
}
// Some cleanup for the next record
this._.line = [];
i += isRowDelimiterLength;
this._.nextChar = chars.charAt(i);
continue;
}
} else if (!this._.commenting && !this._.quoting && (char === ' ' || char === '\t')) {
if (this._.field == null) {
// Left trim unless we are quoting or field already filled
this._.field = '';
}
if (!(ltrim && !this._.field)) {
this._.field += char;
}
i++;
} else if (!this._.commenting) {
// Any other character outside a comment is appended to the current field
if (this._.field == null) {
this._.field = '';
}
this._.field += char;
i++;
} else {
// Inside a comment: ignore the character
i++;
}
// Safety limits. These errors are created directly rather than through
// `this.error`, so `skip_lines_with_error` does not apply to them.
if (!this._.commenting && ((ref3 = this._.field) != null ? ref3.length : void 0) > this.options.max_limit_on_data_read) {
return Error(`Field exceeds max_limit_on_data_read setting (${this.options.max_limit_on_data_read}) ${JSON.stringify(this.options.delimiter)}`);
}
// Here the limit is compared with the number of fields in the current line
if (!this._.commenting && ((ref4 = this._.line) != null ? ref4.length : void 0) > this.options.max_limit_on_data_read) {
return Error(`Row delimiter not found in the file ${JSON.stringify(this.options.rowDelimiter)}`);
}
}
// Flush remaining fields and lines
if (end) {
// Empty final input still counts as one line
if (l === 0) {
this.lines++;
}
if (this._.field != null) {
if (rtrim) {
if (!this._.closingQuote) {
this._.field = (ref5 = this._.field) != null ? ref5.trimRight() : void 0;
}
}
this._.line.push(cast(this._.field || ''));
this._.field = null;
}
// NOTE(review): `this._.field` is always null or undefined at this point (it was
// reset just above when it held a value), so this check cannot trigger.
if (((ref6 = this._.field) != null ? ref6.length : void 0) > this.options.max_limit_on_data_read) {
return Error(`Delimiter not found in the file ${JSON.stringify(this.options.delimiter)}`);
}
if (this._.line.length > this.options.max_limit_on_data_read) {
return Error(`Row delimiter not found in the file ${JSON.stringify(this.options.rowDelimiter)}`);
}
}
// Store un-parsed chars for next call
this._.buf = chars.substr(i);
return null;
};
// Build an Error from `msg` and decide how it is reported.
// * Without `skip_lines_with_error` the Error is returned for the caller to propagate.
// * With it, the current line is flagged through `this._.lineHasError`, a
//   `skip` event carrying the Error is emitted once per flagged line, and
//   `null` is returned.
Parser.prototype.error = function(msg) {
  const failure = Error(msg);
  if (!this.options.skip_lines_with_error) {
    return failure;
  }
  if (!this._.lineHasError) {
    this._.lineHasError = true;
    this.emit('skip', failure);
  }
  return null;
};
// ## Utils
// Tell whether `_obj` is a plain object, i.e. one written as a literal or
// created with `Object.create(null)`, as opposed to an array, a class
// instance, a Buffer, a function or a primitive.
// * `_obj` any value
// Returns `true` when the prototype of `_obj` is the root of its prototype
// chain (or when it has no prototype at all), `false` otherwise.
// Declared as a hoisted function; it coexists with the `var isObjLiteral`
// declaration at the top of the file.
function isObjLiteral(_obj) {
  if (typeof _obj !== 'object' || _obj === null || Array.isArray(_obj)) {
    return false;
  }
  const proto = Object.getPrototypeOf(_obj);
  // Objects without a prototype are accepted.
  if (proto === null) {
    return true;
  }
  // A literal's prototype sits directly on the chain root (`Object.prototype`
  // of its realm), whose own prototype is null.
  return Object.getPrototypeOf(proto) === null;
}