csv-reader
Version:
A CSV stream reader, with many many features, and ability to work with the largest datasets
304 lines (242 loc) • 9.81 kB
JavaScript
const Stream = require('stream');
const Util = require('util');
/**
* @const
* @type {RegExp}
*/
const PARSE_FLOAT_TEST = /^[-+]?\d+(?:\.\d*)?(?:[eE]\+\d+)?$|^(?:\d+)?\.\d+(?:e+\d+)?$|^[-+]?Infinity$|^[-+]?NaN$/;
const Transform = Stream.Transform;
/**
* @param {Object?} options
* @param {string} [options.delimiter=','] - Specify what is the CSV delimeter
* @param {boolean} [options.multiline=true] - Support Excel-like multiline CSV
* @param {boolean} [options.allowQuotes=true] - Allow quotation marks to wrap columns
* @param {boolean} [options.skipEmptyLines=false] - Should empty lines be automatically skipped?
* @param {boolean} [options.parseNumbers=false] - Automatically parse numbers (with a . as the decimal separator)
* @param {boolean} [options.parseBooleans=false] - Automatically parse booleans (strictly lowercase `true` and `false`)
* @param {boolean} [options.ltrim=false] - Automatically left-trims columns
* @param {boolean} [options.rtrim=false] - Automatically right-trims columns
* @param {boolean} [options.trim=false] - If true, then both 'ltrim' and 'rtrim' are set to true
* @param {boolean} [options.skipHeader=false] - If true, then skip the first header row <i>[deprecated]</i>
* @param {boolean} [options.skipLines=0] - Number of lines to skip (if `skipHeader` is `true`, then this gets +1)
* @param {boolean} [options.asObject=false] - If true, each row will be converted automatically to an object based
* on the header. This adds `1` to `skipLines`.
* @param {number} [options.headerLine=0] - Line number of the header (skipLines will be lines skipped after the header line)
* @returns {CsvReadableStream}
* @constructor
*/
const CsvReadableStream = function (options) {
options = options || {};
//noinspection JSUndefinedPropertyAssignment
options.objectMode = true;
if (!(this instanceof CsvReadableStream)) {
return new CsvReadableStream(options);
}
let data = null,
dataIndex = null,
nextIndex = null,
dataLen = null,
column = '',
columnCount = 0,
lastLineEndCR = false,
lookForBOM = true,
isQuoted = false,
rowCount = 0;
const multiline = !!options.multiline || typeof options.multiline === 'undefined',
delimiter = options.delimiter != null ? options.delimiter.toString() || ',' : ',',
allowQuotes = !!options.allowQuotes || typeof options.allowQuotes === 'undefined',
skipEmptyLines = !!options.skipEmptyLines,
parseNumbers = !!options.parseNumbers,
parseBooleans = !!options.parseBooleans,
ltrim = !!options.ltrim || !!options.trim,
rtrim = !!options.rtrim || !!options.trim,
trim = ltrim && rtrim,
asObject = !!options.asObject,
skipLines = options.headerLine ? options.headerLine + (options.skipLines || 0) : (options.skipLines || 0) + (options.skipHeader || asObject ? 1 : 0),
headerLine = (options.headerLine - 1 || 0),
postProcessingEnabled = parseNumbers || parseBooleans || ltrim || rtrim;
let headerRow = [];
/** @type {*[]|Object<string,*>} */
let columns = asObject === true ? {} : [];
const postProcessColumn = function (column) {
if (trim) {
column = column.trim();
} else if (ltrim) {
column = column.replace(/^\s+/, '');
} else if (rtrim) {
column = column.replace(/\s+$/, '');
}
if (parseBooleans) {
if (column === 'true') {
return true;
}
if (column === 'false') {
return false;
}
}
if (parseNumbers) {
if (PARSE_FLOAT_TEST.test(column)) {
return parseFloat(column);
}
}
return column;
};
this._processChunk = function (newData) {
if (newData) {
if (data) {
data = data.substring(dataIndex) + newData;
} else {
data = newData;
}
dataLen = data.length;
dataIndex = 0;
// Node doesn't strip BOMs, that's in user's land
if (lookForBOM) {
if (newData.charCodeAt(0) === 0xfeff) {
dataIndex++;
}
lookForBOM = false;
}
}
let isFinishedLine = false;
const rowIndex = rowCount;
for (; dataIndex < dataLen; dataIndex++) {
const c = data[dataIndex];
if (c === '\n' || c === '\r') {
if (!isQuoted || !multiline) {
if (lastLineEndCR && c === '\n') {
lastLineEndCR = false;
continue;
}
lastLineEndCR = c === '\r';
dataIndex++;
isFinishedLine = true;
rowCount++;
if (!multiline) {
isQuoted = false;
}
break;
}
}
if (isQuoted) {
if (c === '"') {
nextIndex = dataIndex + 1;
// Do we have enough data to peek at the next character?
if (nextIndex >= dataLen && !this._isStreamDone) {
// Wait for more data to arrive
break;
}
if (nextIndex < dataLen && data[nextIndex] === '"') {
column += '"';
dataIndex++;
} else {
isQuoted = false;
}
} else {
column += c;
}
} else {
if (c === delimiter) {
if (rowIndex === headerLine) {
headerRow.push(column.trim());
}
if (column.length > 0 && postProcessingEnabled === true) {
column = postProcessColumn(column);
}
if (asObject === true) {
columns[headerRow[columnCount]] = column;
} else {
columns.push(column);
}
column = '';
columnCount++;
} else if (c === '"' && allowQuotes) {
if (column.length) {
column += c;
} else {
isQuoted = true;
}
} else {
column += c;
}
}
}
if (dataIndex === dataLen) {
data = null;
}
if (isFinishedLine || (data === null && this._isStreamDone === true)) {
if (columnCount > 0 ||
column.length > 0 ||
data !== null ||
!this._isStreamDone) {
const isEmptyRow = columnCount === 1 && column.length === 0;
// Process last column
if (rowIndex === headerLine) {
headerRow.push(column.trim());
this.emit('header', headerRow);
}
if (column.length > 0 && postProcessingEnabled === true) {
column = postProcessColumn(column);
}
if (asObject === true) {
columns[headerRow[columnCount]] = column;
} else {
columns.push(column);
}
// Commit this row
let row = columns;
// Clear row state data
columns = asObject === true ? {} : [];
column = '';
columnCount = 0;
isQuoted = false;
if (rowIndex >= skipLines) {
// Is this row full or empty?
if (isEmptyRow === false || skipEmptyLines === false) {
// Emit the parsed row
//noinspection JSUnresolvedFunction
this.push(row);
}
}
// Look to see if there are more rows in available data
this._processChunk();
} else {
// We just ran into a newline at the end of the file, ignore it
}
} else {
if (data) {
// Let more data come in.
// We are probably waiting for a "peek" at the next character
} else {
// We have probably hit end of file.
// Let the end event come in.
}
}
};
Transform.call(this, options);
};
Util.inherits(CsvReadableStream, Transform);
//noinspection JSUnusedGlobalSymbols
CsvReadableStream.prototype._transform = function (chunk, enc, cb) {
try {
this._processChunk(chunk);
cb();
} catch (err) {
cb(err);
}
};
//noinspection JSUnusedGlobalSymbols
CsvReadableStream.prototype._flush = function (cb) {
try {
this._isStreamDone = true;
this._processChunk();
cb();
} catch (err) {
cb(err);
}
};
/**
* @module
* @type {CsvReadableStream}
*/
module.exports = CsvReadableStream;