js-csvparser
Version:
Fast and feature rich CSV parser with great auto detection for line ending and delimiter.
606 lines (517 loc) • 24.9 kB
JavaScript
(function (global, factory) {
typeof exports === 'object' && typeof module !== 'undefined' ? module.exports = factory() :
typeof define === 'function' && define.amd ? define(factory) :
(global.CSVParser = factory());
}(this, function () { 'use strict';
// util
// code snippets from https://github.com/moment/moment
function daysInMonth(year, month) {
return new Date(Date.UTC(year, month + 1, 0)).getUTCDate();
}
function checkOverflow(year, month, day, hour, minute, second) {
if (month && (month < 0 || month > 11)) return true;
if (day && (day < 0 || day > daysInMonth(year, month))) return true;
if (hour && (hour < 0 || hour > 24 || (hour === 24 && (minute !== 0 || second !== 0)))) return true;
if (minute && (minute < 0 || minute > 59)) return true;
if (second && (second < 0 || second > 59)) return true;
return false;
}
function parseTwoDigitYear(input) {
return Number(input) + (Number(input) > 68 ? 1900 : 2000);
}
// Code from http://stackoverflow.com/questions/3561493/is-there-a-regexp-escape-function-in-javascript
function unescapeFormat(s) {
return regexEscape(s.replace('\\', '').replace(/\\(\[)|\\(\])|\[([^\]\[]*)\]|\\(.)/g, function (matched, p1, p2, p3, p4) {
return p1 || p2 || p3 || p4;
}));
}
function regexEscape(s) {
return s.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&');
}
// export
function isDate(dateString, formatString) {
if (getDate(dateString, formatString) === 'NaD') {
return false;
}
return true;
}
function getDate(dateString, formatString) {
dateString = '' + dateString;
let formattingTokens = /(\[[^\[]*\])|(\\)?(yyyy|yy|mm|m|dd|d|HH|H|MM|M|SS|S|.)/g;
//let formattingTokens = /(\[[^\[]*\])|(\\)?([Hh]mm(ss)?|Mo|mm?m?m?|Do|DDDo|dd?d?d?|ddd?d?|do?|w[o|w]?|W[o|W]?|Qo?|YYYYYY|YYYYY|yyyy|yy|gg(ggg?)?|GG(GGG?)?|e|E|a|A|hh?|HH?|kk?|mm?|ss?|S{1,9}|x|X|zz?|ZZ?|.)/g;
let tokens = formatString.match(formattingTokens) || [];
let second = 0, minute = 0, hour = 0, day = null, month = null, year = null;
let regex;
for (let i = 0; i < tokens.length; i++) {
let token = tokens[i];
if (token === 'd' || token === 'm' || token === 'H' || token === 'M' || token === 'S') {
regex = /\d\d?/;
} else if (token === 'dd' || token === 'mm' || token === 'yy' || token === 'HH' || token === 'MM' || token === 'SS') {
regex = /\d\d/;
} else if (token === 'yyyy') {
regex = /\d{4}/;
} else {
regex = new RegExp(unescapeFormat(token));
}
let parsedInput = (dateString.match(regex) || [])[0];
if (parsedInput) {
// don't parse dates within text strings
let skipped = dateString.substr(0, dateString.indexOf(parsedInput));
if (skipped.length > 0) {
return 'NaD';
}
dateString = dateString.slice(dateString.indexOf(parsedInput) + parsedInput.length);
//console.log(parsedInput);
if (token === 'S' || token === 'SS') {
second = Number(parsedInput);
} else if (token === 'M' || token === 'MM') {
minute = Number(parsedInput);
} else if (token === 'H' || token === 'HH') {
hour = Number(parsedInput);
} else if (token === 'd' || token === 'dd') {
day = Number(parsedInput);
} else if (token === 'm' || token === 'mm') {
month = Number(parsedInput) - 1;
} else if (token === 'yy') {
year = parseTwoDigitYear(parsedInput);
} else if (token === 'yyyy') {
year = parsedInput.length === 2 ? parseTwoDigitYear(parsedInput) : Number(parsedInput);
}
}
}
if (year===null || month===null || day===null || checkOverflow(year, month, day, hour, minute, second)) {
return 'NaD';
} else {
return new Date(Date.UTC(year, month, day, hour, minute, second));
}
}
function detectDecimalDelimiter(input) {
input = input.trim();
let c = input.split(',').length - 1;
if (c > 1) return '.'; // '123,456,789' or '123,456,789.12'
if (input.indexOf(' ') >= 0) return ','; // '123 456'
if (input.indexOf('،') >= 0) return '.'; // '123،456'
if (input.indexOf('\'') >= 0) return '.'; // '123\'456'
let d = input.split('.').length - 1;
if (c === 1 && d === 1) { // '123,456.789' or '1.234,45'
let ci = input.lastIndexOf(',');
let di = input.lastIndexOf('.');
if (di > ci) return '.';
else return ',';
}
if (c + d === 1) {
let ci = input.indexOf(',');
let di = input.indexOf('.');
let len = input.length;
if (ci !== -1 && len - ci !== 4) return ',';
if (di !== -1 && len - di !== 4) return '.';
}
return 'ambiguous';
}
// default delimiter '.'
function findDecimal(output) {
for (let i = 0; i < output.length; i++) {
for (let j = 0; j < output[i].length; j++) {
if (/^[0-9.,' ،-]+$/.test(output[i][j])) {
let delimiter = detectDecimalDelimiter(output[i][j]);
if (delimiter !== 'ambiguous') {
return delimiter;
}
}
}
}
return '.';
}
function unionOptions(defaultOptions, options) {
for (let prop in options) {
if (typeof options[prop] === 'object') {
if (typeof defaultOptions[prop] !== 'undefined') {
unionOptions(defaultOptions[prop], options[prop]);
}
} else {
defaultOptions[prop] = options[prop];
}
}
return defaultOptions;
}
function parse(data, options) {
let defaultOptions = {
delimiter: ',',
lineEnding: '\n',
comment: '#',
maxRows: 0,
maxColumns: {
numberOfColumns: -1,
cutRemaining: false
}
};
// union options and defaultOptions
options = unionOptions(defaultOptions, options);
let out = [], row = [];
let len = data.length;
if (len === 0) return out;
loop1:
for (let start = 0, end = 0, rows = 0, columns = 0; ; end++) {
if (end >= len) { // EOF
//if (start < end)
row.push(data.substring(start, end));
if (row.length > 0) out.push(row);
break;
}
if (data[end] === options.delimiter) { // field
columns++;
if (columns === options.maxColumns.numberOfColumns) { // max columns reached
if (options.maxColumns.cutRemaining) { // find next row
row.push(data.substring(start, end));
end = nextLineOrEOF(end); // search for next line or EOF
} else { //
end = nextLineOrEOF(end); // search for next line or EOF
row.push(data.substring(start, end));
}
out.push(row);
row = [];
columns = 0;
rows++;
if (end === len || rows === options.maxRows) break; // EOF or max rows
if (options.lineEnding === '\r\n') end = end + 1;
} else {
row.push(data.substring(start, end));
}
start = end + 1;
continue;
}
if (data[end] === options.lineEnding || (options.lineEnding === '\r\n' && data[end] === '\r' && data[end + 1] === '\n')) { // line, the case '\r\n' is covered
row.push(data.substring(start, end));
out.push(row);
row = [];
columns = 0;
rows++;
if (rows === options.maxRows) break;
if (options.lineEnding === '\r\n') end = end + 1;
start = end + 1;
continue;
}
if (startOrNewLine(end) && data[end] === options.comment) { // comment
end = nextLineOrEOF(end); // search for next line or EOF
if (end === len) break; // EOF
if (options.lineEnding === '\r\n') end = end + 1;
start = end + 1;
continue;
}
if (data[end] === '"') { // first quote of quoted field, or misplaced quote
if (startOrNewLine(end) || data[end - 1] === options.delimiter) { // first quote of quoted field
start = end = end + 1;
// search for closing quote
loop2:
for (; ; end++) {
if (end === len) { // misplaced quote, EOF
row.push(data.substring(start, end));
out.push(row);
break loop1;
}
if (data[end] === '"') {
if (end === len - 1) { // EOF
row.push(data.substring(start, end).replace(/""/g, '"'));
out.push(row);
break loop1;
}
if (data[end + 1] === '"') { // escape quote
end = end + 1;
continue;
}
if (data[end + 1] === options.delimiter) { // field
columns++;
if (columns === options.maxColumns.numberOfColumns) { // max columns reached
if (options.maxColumns.cutRemaining) { // find next row
row.push(data.substring(start, end).replace(/""/g, '"'));
end = nextLineOrEOF(end); // search for next line or EOF
} else { //
end = nextLineOrEOF(end); // search for next line or EOF
row.push(data.substring(start, data[end - 1] === '"' ? (end - 1) : end).replace(/""/g, '"').replace(/","/g, ','));
}
out.push(row);
row = [];
columns = 0;
rows++;
if (end === len || rows === options.maxRows) break loop1;
if (options.lineEnding === '\r\n') end = end + 1;
} else {
row.push(data.substring(start, end).replace(/""/g, '"'));
end = end + 1;
}
start = end + 1;
break;
}
if (data[end + 1] === options.lineEnding || (options.lineEnding === '\r\n' && data[end + 1] === '\r' && data[end + 2] === '\n')) { // line, the case '\r\n' is covered
row.push(data.substring(start, end).replace(/""/g, '"'));
out.push(row);
row = [];
columns = 0;
rows++;
if (rows === options.maxRows) break loop1;
if (options.lineEnding === '\r\n') end = end + 1;
end = end + 1;
start = end + 1;
break;
}
}
}
} else {
if (data[end + 1] === options.delimiter || end === len - 1) { // misplaced quote at end of field (might EOF)
end = end + 1;
columns++;
if (columns === options.maxColumns.numberOfColumns) { // max columns reached
if (options.maxColumns.cutRemaining) { // find next row
row.push(data.substring(start, end));
end = nextLineOrEOF(end); // search for next line or EOF
} else { //
end = nextLineOrEOF(end); // search for next line or EOF
row.push(data.substring(start, end));
}
out.push(row);
row = [];
columns = 0;
rows++;
if (end === len || rows === options.maxRows) break; // EOF or max rows
if (options.lineEnding === '\r\n') end = end + 1;
} else {
row.push(data.substring(start, end));
}
start = end + 1;
continue;
}
}
}
}
function startOrNewLine(end) { // search for next line or EOF
if (end === 0 || data[end - 1] === options.lineEnding || (end > 1 && options.lineEnding === '\r\n' && data[end - 2] === '\r' && data[end - 1] === '\n')) return true;
return false;
}
function nextLineOrEOF(end) { // search for next line or EOF
for (; end !== len && !(data[end] === options.lineEnding || (options.lineEnding === '\r\n' && data[end] === '\r' && data[end + 1] === '\n')); end++);
return end;
}
return out;
}
const RECORD_SEP = String.fromCharCode(30);
const UNIT_SEP = String.fromCharCode(31);
function detectDelimiter(data, lineEnding) {
let delimiters = [',', ';', '\t', '|', RECORD_SEP, UNIT_SEP];
let res = [];
for (let i = 0; i < delimiters.length; i++) {
let example = parse(data, { delimiter: delimiters[i], lineEnding: lineEnding, maxRows: 10 });
//console.log(example);
let fields = 0, first = 0, delta = 1000;
let firstRow = false, secondRow = false;
for (let j = 0; j < example.length; j++) {
// skip empty rows
if (example[j] === '') {
continue;
}
let fieldCount = example[j].length;
fields += fieldCount;
// first non-empty row
if (!firstRow) {
firstRow = true;
first = example[j].length;
}
// second non-empty row
if (!secondRow) {
secondRow = true;
delta = Math.abs(first - fieldCount);
}
}
res.push([first, delta, fields, i]);
}
//console.log(res);
// order by (max fields first row - desc) (difference first second row - asc) (max fields - desc)
res.sort(function (a, b) {
let a0 = a[0], b0 = b[0];
let a1 = a[1], b1 = b[1];
let a2 = a[2], b2 = b[2];
if (a0 < b0) return 1;
if (a0 > b0) return -1;
if (a1 < b1) return -1;
if (a1 > b1) return 1;
if (a2 < b2) return 1;
if (a2 > b2) return -1;
return 0;
});
return delimiters[res[0][3]];
}
function detectLineEnding(data) {
data = data.substr(0, 1024 * 1024); // max length 1 MB
data = data.replace(/"[^"]*"/g, ''); // replace all quoted fields
//console.log(data);
let n = data.split('\n');
let r = data.split('\r');
let rn = data.split('\r\n');
let arr = [{ type: '\n', data: n, length: n.length - rn.length }, { type: '\r', data: r, length: r.length - rn.length }, { type: '\r\n', data: rn, length: rn.length - 1 }];
arr.sort(function (a, b) {
return b.length - a.length;
});
//console.log(arr);
if (arr[0].length == 0)
return '\n'; // default, no line breaks
if (arr[0].length == arr[1].length) // mixed line breaks
return arr[0].data[0].length <= arr[1].data[0].length ? arr[0].type : arr[1].type; // use first occurrence
return arr[0].type;
}
function csvparse(data, options) {
let defaultOptions = {
delimiter: 'auto',
lineEnding: 'auto',
comment: '#',
convertToTypes: {
convert: false,
decimalDelimiter: 'auto',
dateFormat: 'yyyy-mm-dd'
},
skipEmptyLines: false,
maxRows: 0,
maxColumns: {
numberOfColumns: -1,
cutRemaining: false
},
header: 'auto'
};
// union options and defaultOptions
options = unionOptions(defaultOptions, options);
if (options.lineEnding === 'auto') options.lineEnding = detectLineEnding(data);
if (options.delimiter === 'auto') options.delimiter = detectDelimiter(data, options.lineEnding);
let output = parse(data, options);
if (options.skipEmptyLines) {
for (let i = 0; i < output.length; i++) {
if (output[i].length === 1 && output[i][0] === '') {
output.splice(i--, 1);
}
}
}
if (options.convertToTypes.convert) {
if (options.convertToTypes.decimalDelimiter === 'auto') options.convertToTypes.decimalDelimiter = findDecimal(output);
let reg;
if (options.convertToTypes.decimalDelimiter === '.') {
reg = /[,' ،]/g;
} else {
reg = /[\.' ،]/g;
}
for (let i = 0; i < output.length; i++) {
for (let j = 0; j < output[i].length; j++) {
let value = output[i][j];
if (value === 'true' || value === 'TRUE') { // Boolean
output[i][j] = true;
} else if (value === 'false' || value === 'FALSE') { // Boolean
output[i][j] = false;
} else if (isDate(value, options.convertToTypes.dateFormat)) { // Date
output[i][j] = getDate(value, options.convertToTypes.dateFormat);
} else if (/[0-9]{2}\.[0-9]{2}\./.test(value)) { // maybe a date, eg. 01.01.
; // do nothing
} else {
value = value.replace(reg, '');
value = value.replace(options.convertToTypes.decimalDelimiter, '.');
if (value.endsWith('-')) value = '-' + value.substring(0, value.length - 1);
if (value !== '' && !isNaN(Number(value))) { // Number
output[i][j] = Number(value);
} else { // String
; // do nothing
}
}
}
}
}
let header = [];
if (output.length === 1) options.header = 0;
if (options.header === 'auto') {
let temp = [];
let firstRowColumns = 0;
for (let i = 0, j = 0; i < output.length; i++) {
if (i === 0) firstRowColumns = output[i].length;
if (output[i].length === firstRowColumns) { // only use rows with the same number of columns like the first row
let t = [];
for (let j = 0; j < output[i].length; j++) {
t.push(output[i][j]);
}
temp.push(t);
j++;
}
if (j == 20) break; // no more than 20 rows are used for detection
}
if (!options.convertToTypes.convert) {
let reg = /[,' ،.]/g;
for (let i = 0; i < temp.length; i++) {
for (let j = 0; j < temp[i].length; j++) {
let value = temp[i][j];
if (value === 'true' || value === 'TRUE') { // Boolean
temp[i][j] = true;
} else if (value === 'false' || value === 'FALSE') { // Boolean
temp[i][j] = false;
} else if (/[0-9]{2}\.[0-9]{2}\./.test(value)) { // maybe a date, eg. 01.01.
; // do nothing
} else {
value = checkDate(value);
if (value.date) {
temp[i][j] = value.value;
} else {
value = value.value;
value = value.replace(reg, '');
if (value.endsWith('-')) value = '-' + value.substring(0, value.length - 1);
if (value !== '' && !isNaN(Number(value))) { // Number
temp[i][j] = Number(value);
} else { // String
; // do nothing
}
}
}
}
}
}
let headerRows = 0;
let rows = temp.length,
columns = rows ? temp[0].length : 0,
relColumns = 0;
for (let i = 0; i < columns; i++) {
let stringRows = 0,
otherRows = 0;
for (let j = 0; j < rows; j++) {
if (typeof temp[j][i] === 'string') stringRows += 1;
else {
otherRows += 1;
break;
}
}
if (stringRows>0 && stringRows !== rows) {
relColumns += 1;
headerRows += stringRows
}
}
if (relColumns) headerRows /= relColumns;
headerRows = Math.round(headerRows);
options.header = headerRows;
}
if (options.header) { // positive number or true
if (options.header === true) options.header = 1;
header = output.slice(0, options.header);
output.splice(0, options.header);
}
return { options: options, data: output, header: header};
}
function checkDate (value) {
// some formats according to https://en.wikipedia.org/wiki/Date_format_by_country
let formats = ['dd.mm.yy', 'd.m.yy',
'dd.mm.yyyy', 'd.m.yyyy',
'dd/mm/yy', 'd/m/yy',
'dd/mm/yyyy', 'd/m/yyyy',
'dd-mm-yy', 'd-m-yy',
'dd-mm-yyyy', 'd-m-yyyy',
'yyyy-mm-dd', 'yy-mm-dd',
'yyyy/mm/dd', 'yy/mm/dd',
'yyyy mm dd',
'mm/dd/yy', 'm/d/yy',
'mm/dd/yyyy', 'm/d/yyyy']
for (let format of formats) {
if (isDate(value, format)) return { date: true, value: getDate(value, format) };
}
return { date: false, value: value };
}
return csvparse;
}));