UNPKG

csv-parse

Version:

CSV parsing implementing the Node.js `stream.Transform` API

725 lines (698 loc) 24.6 kB
// Generated by CoffeeScript 2.3.2 // # CSV Parser // This module provides a CSV parser tested and used against large datasets. Over the year, it has been enhance and is now full of useful options. // Please look at the [project website](https://csv.js.org/parse/) for additional information. var Parser, StringDecoder, isObjLiteral, stream, util; stream = require('stream'); util = require('util'); ({StringDecoder} = require('string_decoder')); // ## Usage // Callback approach, for ease of use: // `parse(data, [options], callback)` // [Node.js Stream API](http://nodejs.org/api/stream.html), for maximum of power: // `parse([options], [callback])` module.exports = function() { var callback, called, chunks, data, err, options, parser; if (arguments.length === 3) { data = arguments[0]; options = arguments[1]; callback = arguments[2]; if (typeof callback !== 'function') { throw Error(`Invalid callback argument: ${JSON.stringify(callback)}`); } if (!(typeof data === 'string' || Buffer.isBuffer(arguments[0]))) { return callback(Error(`Invalid data argument: ${JSON.stringify(data)}`)); } } else if (arguments.length === 2) { // 1st arg is data:string or options:object if (typeof arguments[0] === 'string' || Buffer.isBuffer(arguments[0])) { data = arguments[0]; } else if (isObjLiteral(arguments[0])) { options = arguments[0]; } else { err = `Invalid first argument: ${JSON.stringify(arguments[0])}`; } // 2nd arg is options:object or callback:function if (typeof arguments[1] === 'function') { callback = arguments[1]; } else if (isObjLiteral(arguments[1])) { if (options) { err = 'Invalid arguments: got options twice as first and second arguments'; } else { options = arguments[1]; } } else { err = `Invalid first argument: ${JSON.stringify(arguments[1])}`; } if (err) { if (!callback) { throw Error(err); } else { return callback(Error(err)); } } } else if (arguments.length === 1) { if (typeof arguments[0] === 'function') { callback = arguments[0]; } else { options = arguments[0]; } } if (options == null) { options = {}; } parser = new Parser(options); if (data != null) { process.nextTick(function() { parser.write(data); return parser.end(); }); } if (callback) { called = false; chunks = options.objname ? {} : []; parser.on('readable', function() { var chunk, results; results = []; while (chunk = parser.read()) { if (options.objname) { results.push(chunks[chunk[0]] = chunk[1]); } else { results.push(chunks.push(chunk)); } } return results; }); parser.on('error', function(err) { called = true; return callback(err); }); parser.on('end', function() { if (!called) { return callback(null, chunks); } }); } return parser; }; // ## `Parser([options])` // Options are documented [here](http://csv.js.org/parse/options/). Parser = function(options = {}) { var base, base1, base10, base11, base12, base13, base14, base15, base16, base17, base2, base3, base4, base5, base6, base7, base8, base9, k, v; // @options = options this.options = {}; for (k in options) { v = options[k]; this.options[k] = v; } this.options.objectMode = true; stream.Transform.call(this, this.options); if ((base = this.options).rowDelimiter == null) { base.rowDelimiter = null; } if (typeof this.options.rowDelimiter === 'string') { this.options.rowDelimiter = [this.options.rowDelimiter]; } if ((base1 = this.options).delimiter == null) { base1.delimiter = ','; } if (this.options.quote !== void 0 && !this.options.quote) { this.options.quote = ''; } if ((base2 = this.options).quote == null) { base2.quote = '"'; } if ((base3 = this.options).escape == null) { base3.escape = '"'; } if ((base4 = this.options).columns == null) { base4.columns = null; } if ((base5 = this.options).comment == null) { base5.comment = ''; } if ((base6 = this.options).objname == null) { base6.objname = false; } if ((base7 = this.options).trim == null) { base7.trim = false; } if ((base8 = this.options).ltrim == null) { base8.ltrim = false; } if ((base9 = this.options).rtrim == null) { base9.rtrim = false; } if (this.options.auto_parse != null) { this.options.cast = this.options.auto_parse; } if ((base10 = this.options).cast == null) { base10.cast = false; } if (this.options.auto_parse_date != null) { this.options.cast_date = this.options.auto_parse_date; } if ((base11 = this.options).cast_date == null) { base11.cast_date = false; } if (this.options.cast_date === true) { this.options.cast_date = function(value) { var m; m = Date.parse(value); if (!isNaN(m)) { value = new Date(m); } return value; }; } if ((base12 = this.options).relax == null) { base12.relax = false; } if ((base13 = this.options).relax_column_count == null) { base13.relax_column_count = false; } if ((base14 = this.options).skip_empty_lines == null) { base14.skip_empty_lines = false; } if ((base15 = this.options).max_limit_on_data_read == null) { base15.max_limit_on_data_read = 128000; } if ((base16 = this.options).skip_lines_with_empty_values == null) { base16.skip_lines_with_empty_values = false; } if ((base17 = this.options).skip_lines_with_error == null) { base17.skip_lines_with_error = false; } // Counters // lines = count + skipped_line_count + empty_line_count this.lines = 0; // Number of lines encountered in the source dataset this.count = 0; // Number of records being processed this.skipped_line_count = 0; // Number of records skipped due to errors this.empty_line_count = 0; // Number of empty lines // Constants this.is_int = /^(\-|\+)?([1-9]+[0-9]*)$/; // @is_float = /^(\-|\+)?([0-9]+(\.[0-9]+)([eE][0-9]+)?|Infinity)$/ // @is_float = /^(\-|\+)?((([0-9])|([1-9]+[0-9]*))(\.[0-9]+)([eE][0-9]+)?|Infinity)$/ this.is_float = function(value) { return (value - parseFloat(value) + 1) >= 0; // Borrowed from jquery }; // Internal private state this._ = { decoder: new StringDecoder(), quoting: false, commenting: false, field: null, nextChar: null, closingQuote: 0, line: [], chunks: [], rawBuf: '', buf: '', rowDelimiterMaxLength: this.options.rowDelimiter ? Math.max(...this.options.rowDelimiter.map(function(v) { return v.length; })) : void 0, lineHasError: false, isEnded: false }; return this; }; // ## Internal API // The Parser implement a [`stream.Transform` class](https://nodejs.org/api/stream.html#stream_class_stream_transform). // ### Events // The library extends Node [EventEmitter][event] class and emit all the events of the Writable and Readable [Stream API](http://nodejs.org/api/stream.html). util.inherits(Parser, stream.Transform); // For extra flexibility, you can get access to the original Parser class: `require('csv-parse').Parser`. module.exports.Parser = Parser; // ### `_transform(chunk, encoding, callback)` // * `chunk` Buffer | String // The chunk to be transformed. Will always be a buffer unless the decodeStrings option was set to false. // * `encoding` String // If the chunk is a string, then this is the encoding type. (Ignore if decodeStrings chunk is a buffer.) // * `callback` Function // Call this function (optionally with an error argument) when you are done processing the supplied chunk. // Implementation of the [`stream.Transform` API](https://nodejs.org/api/stream.html#stream_class_stream_transform) Parser.prototype._transform = function(chunk, encoding, callback) { return setImmediate(() => { var err; if (chunk instanceof Buffer) { chunk = this._.decoder.write(chunk); } err = this.__write(chunk, false); if (err) { return this.emit('error', err); } return callback(); }); }; Parser.prototype._flush = function(callback) { return callback(this.__flush()); }; Parser.prototype.__flush = function() { var err; err = this.__write(this._.decoder.end(), true); if (err) { return err; } if (this._.quoting) { err = this.error(`Quoted field not terminated at line ${this.lines + 1}`); return err; } if (this._.line.length > 0) { return this.__push(this._.line); } }; Parser.prototype.__push = function(line) { var call_column_udf, columnName, columns, err, field, i, j, len, lineAsColumns, record; if (this._.isEnded) { return; } if (this.options.skip_lines_with_empty_values && line.join('').trim() === '') { return; } record = null; if (this.options.columns === true) { this.options.columns = line; return; } else if (typeof this.options.columns === 'function') { call_column_udf = function(fn, line) { var columns, err; try { columns = fn.call(null, line); return [null, columns]; } catch (error) { err = error; return [err]; } }; [err, columns] = call_column_udf(this.options.columns, line); if (err) { return err; } this.options.columns = columns; return; } if (!this._.line_length && line.length > 0) { this._.line_length = this.options.columns ? this.options.columns.length : line.length; } // Dont check column count on empty lines if (line.length === 1 && line[0] === '') { this.empty_line_count++; } else if (line.length !== this._.line_length) { // Dont check column count with relax_column_count if (this.options.relax_column_count) { this.count++; this.skipped_line_count++; } else if (this.options.columns != null) { // Suggest: Inconsistent header and column numbers: header is 1 and number of columns is 1 on line 1 err = this.error(`Number of columns on line ${this.lines} does not match header`); return err; } else { err = this.error(`Number of columns is inconsistent on line ${this.lines}`); return err; } } else { this.count++; } if (this.options.columns != null) { lineAsColumns = {}; for (i = j = 0, len = line.length; j < len; i = ++j) { field = line[i]; columnName = this.options.columns[i]; if (columnName === void 0 || columnName === null || columnName === false) { continue; } if (typeof columnName !== 'string') { throw Error(`Invalid column name ${JSON.stringify(columnName)}`); } lineAsColumns[columnName] = field; } if (this.options.objname) { record = [lineAsColumns[this.options.objname], lineAsColumns]; } else { record = lineAsColumns; } } else { record = line; } if (this.count < this.options.from) { return; } if (this.options.raw) { this.push({ raw: this._.rawBuf, row: record }); this._.rawBuf = ''; } else { this.push(record); } if (this.listenerCount('record')) { this.emit('record', record); } // When to is reached set ignore any future calls if (this.count >= this.options.to) { this._.isEnded = true; return this.push(null); } return null; }; Parser.prototype.__write = function(chars, end) { var areNextCharsDelimiter, areNextCharsRowDelimiters, cast, char, err, escapeIsQuote, i, isDelimiter, isEscape, isNextCharAComment, isNextCharTrimable, isQuote, isRowDelimiter, isRowDelimiterLength, is_float, is_int, l, ltrim, nextCharPos, ref, ref1, ref2, ref3, ref4, ref5, ref6, remainingBuffer, rowDelimiter, rtrim, wasCommenting; is_int = (value) => { if (typeof this.is_int === 'function') { return this.is_int(value); } else { return this.is_int.test(value); } }; is_float = (value) => { if (typeof this.is_float === 'function') { return this.is_float(value); } else { return this.is_float.test(value); } }; cast = (value, context = {}) => { if (!this.options.cast) { return value; } if (context.quoting == null) { context.quoting = !!this._.closingQuote; } if (context.lines == null) { context.lines = this.lines; } if (context.count == null) { context.count = this.count; } if (context.index == null) { context.index = this._.line.length; } // context.header ?= if @options.column and @lines is 1 and @count is 0 then true else false if (context.header == null) { context.header = this.options.columns === true; } if (context.column == null) { context.column = Array.isArray(this.options.columns) ? this.options.columns[context.index] : context.index; } if (typeof this.options.cast === 'function') { return this.options.cast(value, context); } if (is_int(value)) { value = parseInt(value); } else if (is_float(value)) { value = parseFloat(value); } else if (this.options.cast_date) { value = this.options.cast_date(value, context); } return value; }; ltrim = this.options.trim || this.options.ltrim; rtrim = this.options.trim || this.options.rtrim; chars = this._.buf + chars; l = chars.length; i = 0; if (this.lines === 0 && 0xFEFF === chars.charCodeAt(0)) { // Strip BOM header i++; } while (i < l) { // Ensure we get enough space to look ahead if (!end) { remainingBuffer = chars.substr(i, l - i); // (i+1000 >= l) or // Skip if the remaining buffer can be comment // Skip if the remaining buffer can be row delimiter if ((!this.options.rowDelimiter && i + 3 > l) || (!this._.commenting && l - i < this.options.comment.length && this.options.comment.substr(0, l - i) === remainingBuffer) || (this.options.rowDelimiter && l - i < this._.rowDelimiterMaxLength && this.options.rowDelimiter.some(function(rd) { return rd.substr(0, l - i) === remainingBuffer; // Skip if the remaining buffer can be row delimiter following the closing quote })) || (this.options.rowDelimiter && this._.quoting && l - i < (this.options.quote.length + this._.rowDelimiterMaxLength) && this.options.rowDelimiter.some((rd) => { return (this.options.quote + rd).substr(0, l - i) === remainingBuffer; // Skip if the remaining buffer can be delimiter // Skip if the remaining buffer can be escape sequence })) || (l - i <= this.options.delimiter.length && this.options.delimiter.substr(0, l - i) === remainingBuffer) || (l - i <= this.options.escape.length && this.options.escape.substr(0, l - i) === remainingBuffer)) { break; } } char = this._.nextChar ? this._.nextChar : chars.charAt(i); this._.nextChar = l > i + 1 ? chars.charAt(i + 1) : null; if (this.options.raw) { this._.rawBuf += char; } // Auto discovery of rowDelimiter, unix, mac and windows supported if (this.options.rowDelimiter == null) { nextCharPos = i; rowDelimiter = null; // First empty line if (!this._.quoting && (char === '\n' || char === '\r')) { rowDelimiter = char; nextCharPos += 1; } else if (this._.quoting && char === this.options.quote && ((ref = this._.nextChar) === '\n' || ref === '\r')) { rowDelimiter = this._.nextChar; nextCharPos += 2; } if (rowDelimiter) { if (rowDelimiter === '\r' && chars.charAt(nextCharPos) === '\n') { rowDelimiter += '\n'; } this.options.rowDelimiter = [rowDelimiter]; this._.rowDelimiterMaxLength = rowDelimiter.length; } } // Parse that damn char // Note, shouldn't we have sth like chars.substr(i, @options.escape.length) if (!this._.commenting && char === this.options.escape) { // Make sure the escape is really here for escaping: // If escape is same as quote, and escape is first char of a field // and it's not quoted, then it is a quote // Next char should be an escape or a quote escapeIsQuote = this.options.escape === this.options.quote; isEscape = this._.nextChar === this.options.escape; isQuote = this._.nextChar === this.options.quote; if (!(escapeIsQuote && !this._.field && !this._.quoting) && (isEscape || isQuote)) { i++; char = this._.nextChar; this._.nextChar = chars.charAt(i + 1); if (this._.field == null) { this._.field = ''; } this._.field += char; // Since we're skipping the next one, better add it now if in raw mode. if (this.options.raw) { this._.rawBuf += char; } i++; continue; } } // Char match quote if (!this._.commenting && char === this.options.quote) { if (this._.acceptOnlyEmptyChars && (char !== ' ' && char !== '\t')) { return this.error('Only trimable characters are accepted after quotes'); } if (this._.quoting) { // Make sure a closing quote is followed by a delimiter // If we have a next character and // it isnt a rowDelimiter and // it isnt an column delimiter and // it isnt the begining of a comment // Otherwise, if this is not "relax" mode, throw an error isNextCharTrimable = rtrim && ((ref1 = this._.nextChar) === ' ' || ref1 === '\t'); areNextCharsRowDelimiters = this.options.rowDelimiter && this.options.rowDelimiter.some(function(rd) { return chars.substr(i + 1, rd.length) === rd; }); areNextCharsDelimiter = chars.substr(i + 1, this.options.delimiter.length) === this.options.delimiter; isNextCharAComment = this._.nextChar === this.options.comment; if ((this._.nextChar != null) && !isNextCharTrimable && !areNextCharsRowDelimiters && !areNextCharsDelimiter && !isNextCharAComment) { if (this.options.relax) { this._.quoting = false; if (this._.field) { this._.field = `${this.options.quote}${this._.field}`; } } else { if (err = this.error(`Invalid closing quote at line ${this.lines + 1}; found ${JSON.stringify(this._.nextChar)} instead of delimiter ${JSON.stringify(this.options.delimiter)}`)) { return err; } } } else if ((this._.nextChar != null) && isNextCharTrimable) { i++; this._.quoting = false; this._.closingQuote = this.options.quote.length; this._.acceptOnlyEmptyChars = true; continue; } else { i++; this._.quoting = false; this._.closingQuote = this.options.quote.length; if (end && i === l) { this._.line.push(cast(this._.field || '')); this._.field = null; } continue; } } else if (!this._.field) { this._.quoting = true; i++; continue; } else if ((this._.field != null) && !this.options.relax) { if (err = this.error(`Invalid opening quote at line ${this.lines + 1}`)) { return err; } } } // Otherwise, treat quote as a regular character isRowDelimiter = this.options.rowDelimiter && this.options.rowDelimiter.some(function(rd) { return chars.substr(i, rd.length) === rd; }); if (isRowDelimiter || (end && i === l - 1)) { this.lines++; } // Set the commenting flag wasCommenting = false; if (!this._.commenting && !this._.quoting && this.options.comment && chars.substr(i, this.options.comment.length) === this.options.comment) { this._.commenting = true; } else if (this._.commenting && isRowDelimiter) { wasCommenting = true; this._.commenting = false; } isDelimiter = chars.substr(i, this.options.delimiter.length) === this.options.delimiter; if (this._.acceptOnlyEmptyChars) { if (isDelimiter || isRowDelimiter) { this._.acceptOnlyEmptyChars = false; } else { if (char === ' ' || char === '\t') { i++; continue; } else { return this.error('Only trimable characters are accepted after quotes'); } } } if (!this._.commenting && !this._.quoting && (isDelimiter || isRowDelimiter)) { if (isRowDelimiter) { isRowDelimiterLength = this.options.rowDelimiter.filter(function(rd) { return chars.substr(i, rd.length) === rd; })[0].length; } // Empty lines if (isRowDelimiter && this._.line.length === 0 && (this._.field == null)) { if (wasCommenting || this.options.skip_empty_lines) { i += isRowDelimiterLength; this._.nextChar = chars.charAt(i); continue; } } if (rtrim) { if (!this._.closingQuote) { this._.field = (ref2 = this._.field) != null ? ref2.trimRight() : void 0; } } this._.line.push(cast(this._.field || '')); this._.closingQuote = 0; this._.field = null; // End of field // Ensure that the delimiter doesnt match as well the rowDelimiter if (isDelimiter && !isRowDelimiter) { i += this.options.delimiter.length; this._.nextChar = chars.charAt(i); if (end && !this._.nextChar) { isRowDelimiter = true; this._.line.push(''); } } if (isRowDelimiter) { // End of record if (!this._.lineHasError) { err = this.__push(this._.line); if (err) { return err; } } if (this._.lineHasError) { this._.lineHasError = false; } // Some cleanup for the next record this._.line = []; i += isRowDelimiterLength; this._.nextChar = chars.charAt(i); continue; } } else if (!this._.commenting && !this._.quoting && (char === ' ' || char === '\t')) { if (this._.field == null) { // Left trim unless we are quoting or field already filled this._.field = ''; } if (!(ltrim && !this._.field)) { this._.field += char; } i++; } else if (!this._.commenting) { if (this._.field == null) { this._.field = ''; } this._.field += char; i++; } else { i++; } if (!this._.commenting && ((ref3 = this._.field) != null ? ref3.length : void 0) > this.options.max_limit_on_data_read) { return Error(`Field exceeds max_limit_on_data_read setting (${this.options.max_limit_on_data_read}) ${JSON.stringify(this.options.delimiter)}`); } if (!this._.commenting && ((ref4 = this._.line) != null ? ref4.length : void 0) > this.options.max_limit_on_data_read) { return Error(`Row delimiter not found in the file ${JSON.stringify(this.options.rowDelimiter)}`); } } // Flush remaining fields and lines if (end) { if (l === 0) { this.lines++; } if (this._.field != null) { if (rtrim) { if (!this._.closingQuote) { this._.field = (ref5 = this._.field) != null ? ref5.trimRight() : void 0; } } this._.line.push(cast(this._.field || '')); this._.field = null; } if (((ref6 = this._.field) != null ? ref6.length : void 0) > this.options.max_limit_on_data_read) { return Error(`Delimiter not found in the file ${JSON.stringify(this.options.delimiter)}`); } if (this._.line.length > this.options.max_limit_on_data_read) { return Error(`Row delimiter not found in the file ${JSON.stringify(this.options.rowDelimiter)}`); } } // Store un-parsed chars for next call this._.buf = chars.substr(i); return null; }; Parser.prototype.error = function(msg) { var err; err = Error(msg); if (!this.options.skip_lines_with_error) { return err; } else { if (!this._.lineHasError) { this._.lineHasError = true; this.emit('skip', err); } } return null; }; // ## Utils isObjLiteral = function(_obj) { var _test; _test = _obj; if (typeof _obj !== 'object' || _obj === null || Array.isArray(_obj)) { return false; } else { return (function() { while (!false) { if (Object.getPrototypeOf(_test = Object.getPrototypeOf(_test)) === null) { break; } } return Object.getPrototypeOf(_obj === _test); })(); } };