// monetdb-import
// MonetDB automated data import for NodeJS
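//
// A minimal usage sketch (illustrative only; the connection options shown are
// assumptions that depend on the underlying "monetdb" driver's configuration):
//
//   var Importer = require("monetdb-import")();
//   var imp = new Importer(
//       { dbname: "demo", user: "monetdb", password: "monetdb" }, // or { conn: existingConnection }
//       "/tmp/data.csv",                                          // file to import (importOptions omitted)
//       "mytable"                                                 // table name (schema defaults to "sys")
//   );
//   imp.import(function(err, result) {
//       if (err) return console.error(err);
//       console.log(result.importedRows + " rows imported");
//   });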
var fs = require("fs");
var Q = require("q");
var csvParse = require("csv-parse");
var fileExists = require("file-exists");
var isBinary = require("isbinaryfile");
var MonetDB = require("monetdb")();
var CSVSniffer = require("csv-sniffer")();
// Private functions that are not tied to the Importer object and thus do not use the this keyword
function __typeCheck(type, valueToCheck, optional) {
var correct = typeof(valueToCheck) == type;
if(optional) {
// If the variable is optional, it may also be undefined or null
correct = correct || valueToCheck === undefined || valueToCheck === null;
}
if(!correct) {
throw new Error("Invalid argument type received; expected "+type+
", but received "+typeof(valueToCheck));
}
}
function __typeToDbType(type) {
switch(type) {
case "string": return "STRING"; break;
case "float": return "DOUBLE"; break;
case "integer": return "BIGINT"; break;
default: return "STRING"; break;
}
}
function __labelFn(i) {
return "c"+i;
}
function __labelTransformFn(label) {
return label.toLowerCase()
.replace(/\s/g, "_")
.replace(/'/g, "")
.replace(/"/g, "")
.replace(/\n/g, " | ")
.replace(/\r/g, "");
}
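// For illustration: with the default transform above, a header cell like
// "First Name" becomes "first_name", and any quote characters are stripped.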
module.exports = function() {
function Importer(dbOptions, importOptions, filepath, schemaname, tablename, delimiters) {
// Shift if importOptions is missing
if(typeof(importOptions) != "object") {
// optional importOptions not provided, shift parameters
delimiters = tablename;
tablename = schemaname;
schemaname = filepath;
filepath = importOptions;
importOptions = {};
}
// Shift if schemaname is missing
if(typeof(tablename) != "string") {
delimiters = tablename;
tablename = schemaname;
schemaname = "sys";
}
if(!fileExists(filepath)) {
throw new Error("File '" + filepath + "' could not be found. Please check the path and try again.");
}
if(isBinary.sync(filepath)) {
throw new Error("File '" + filepath + "' appears to be binary. We can only deal with regular text files.");
}
__typeCheck("object", dbOptions);
__typeCheck("object", importOptions, true);
__typeCheck("string", filepath);
__typeCheck("string", schemaname, true);
__typeCheck("string", tablename);
__typeCheck("object", delimiters, true);
// private variables
var _conn = null;
var _closeConn = false; // set to true if we create a connection ourselves
var _filepath = filepath;
var _schemaname = schemaname;
var _tablename = tablename;
var _importOptions = importOptions;
var _bestEffort = false;
var _labelFn = __labelFn;
var _labelTransformFn = __labelTransformFn;
var _sqlLogFn = console.log;
var _sniffer = new CSVSniffer(delimiters);
var _sample = null;
// private functions
function _query(query) {
_sqlLogFn && _sqlLogFn(query);
return _conn.query(query, false);
}
function _getSample() {
if(_sample) {
return Q.when(_sample);
}
return Q.nfcall(fs.stat, _filepath).then(function(stat) {
return Q.nfcall(fs.open, _filepath, "r").then(function(fd) {
var bytesToRead = stat.size;
if(_importOptions.sampleSize > 0 && _importOptions.sampleSize < stat.size) {
bytesToRead = _importOptions.sampleSize;
}
var buf = Buffer.alloc(bytesToRead); // Buffer.alloc replaces the deprecated Buffer constructor
return Q.nfcall(fs.read, fd, buf, 0, bytesToRead, 0).then(function() {
_sample = buf.toString();
return _sample;
}).fin(function() {
// close the descriptor that was opened just for taking the sample
fs.close(fd, function() {});
});
});
});
}
function _getTablename() {
return '"'+_schemaname+'"."'+_tablename+'"';
}
var that = this;
// privileged functions (public, but with access to the private variables)
this.sniff = function(sniffOptions, fn) {
if(typeof(sniffOptions) != "object") {
// optional sniffOptions not provided, shift parameters
fn = sniffOptions;
sniffOptions = null;
}
__typeCheck("object", sniffOptions, true);
__typeCheck("function", fn);
_getSample().then(function(sample) {
try {
var sniffResult = _sniffer.sniff(sample, sniffOptions);
// sniffResult calculated... make sure labels are appropriate for insertion in the database
Importer.prepareLabels(sniffResult, {labelFn: _labelFn, labelTransformFn: _labelTransformFn});
fn(null, sniffResult);
} catch(err) {
fn("Failed to sniff file "+_filepath+" ("+err+")");
}
}, function(err) {
fn("Could not sample file "+_filepath+" ("+err+")")
});
};
this.import = function(sniffResult, fn) {
// Check arguments and shift if necessary
if(typeof(sniffResult) != "object") {
fn = sniffResult;
sniffResult = null;
}
__typeCheck("object", sniffResult, true);
__typeCheck("function", fn);
if(!sniffResult) {
sniffResult = that.sniffQ();
}
var databaseCheckPromise = _query("SELECT COUNT(*) FROM "+_getTablename()).then(function() {
// Query succeeded, meaning table exists...
throw new Error("Table "+_getTablename()+" already exists!");
}, function() {
// Failure, which is exactly what we need! Return true to resolve the promise
return true;
});
var nrLines; // will be filled in on promise resolve
var nrCols;
var types = []; // contains the newly computed types while walking over the entire file
var tableCreated = false; // set after CREATE TABLE succeeds, so a failed import only drops a table we created ourselves
Q.spread([
Q.when(sniffResult),
databaseCheckPromise
], function(sniffR) {
sniffResult = sniffR; // Store it in the enclosing scope so we don't have to pass it on to the next promises
// We have to walk through the entire file to figure out:
// 1) Number of lines in the file, so we have an upper bound to give to the copy into statement
// 2) Most occurring number of columns
// 3) Actual column types (csv-sniffer is based on a sample of the file)
var deferred = Q.defer();
var nrRecords = 0;
var nrColsDict = []; // e.g. {15: 60, 16: 1029, 17: 99, 18: 1, 99: 1} -> we would choose 16, the most frequent column count
var parseOptions = {
delimiter: sniffResult.delimiter,
rowDelimiter: sniffResult.newlineStr,
quote: sniffResult.quoteChar,
skipEmptyLines: true,
trim: true,
escape: "\\",
relax: true
};
var csvParser = csvParse(parseOptions);
csvParser.on("readable", function () {
var record;
var l;
while (record = csvParser.read()) {
if(nrRecords++ == 0 && sniffResult.hasHeader) {
// skip first record, if it is a header according to the sniff result
continue;
}
// Update nrColsDict
l = record.length;
if (nrColsDict[l] === undefined) nrColsDict[l] = 1;
else nrColsDict[l]++;
// Update types dict
record && record.forEach(function(val, i) {
types[i] = _sniffer.getAccumulatedType(val, types[i]);
});
}
});
csvParser.on("error", function (err) {
deferred.reject(err.message);
});
csvParser.on("finish", function () {
var maxNrCols = -1;
nrCols = -1;
nrColsDict.forEach(function (n, i) {
if (n && n > maxNrCols) {
maxNrCols = n;
nrCols = i;
}
});
nrLines = csvParser.lines;
deferred.resolve();
});
fs.createReadStream(_filepath).pipe(csvParser);
return deferred.promise;
}).then(function() {
// Make sure we have the right amount of labels, and that the labels are valid (in case they came from the outside)
Importer.prepareLabels(sniffResult, {labelFn: _labelFn, labelTransformFn: _labelTransformFn, nrCols: nrCols});
// Create a table that can be used to store the file
var labelsQuoted = sniffResult.labels.map(function(label) {
return '"'+label.replace('"', "")+'"';
});
return _query(
"CREATE TABLE "+_getTablename()+" ("+
labelsQuoted.map(function(col, i) {
return col+" "+__typeToDbType(types[i]);
}).join(",\n")+
")"
).then(function() {
tableCreated = true; // remember that we created the table, so the failure handler is allowed to drop it
}).fail(function(err) {
throw new Error("Could not create database table "+_getTablename()+" ("+err+")");
});
}).then(function() {
// Table is in place; we can now do the actual import
var offset = sniffResult.hasHeader?"2":"1";
var delimiterStr = null;
if(sniffResult.delimiter) {
delimiterStr = "'"+sniffResult.delimiter+"'";
if(sniffResult.newlineStr) {
delimiterStr += ", '"+sniffResult.newlineStr.replace("\r", "\\r").replace("\n", "\\n")+"'";
if(sniffResult.quoteChar) {
delimiterStr += ", '"+sniffResult.quoteChar.replace("'", "\\'")+"'";
}
}
}
var lockedStr = _importOptions.locked ? " LOCKED" : "";
var bestEffortStr = _bestEffort ? " BEST EFFORT" : "";
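// For illustration, with nrLines = 1000, a comma delimiter, "\n" line endings,
// '"' quoting, a header row and the default import options, the generated
// statement looks roughly like (table and file names are hypothetical):
//   CALL sys.clearrejects();
//   COPY 1000 OFFSET 2 RECORDS
//   INTO "sys"."mytable"
//   FROM ('/tmp/data.csv')
//   DELIMITERS ',', '\n', '"'
//   NULL AS '' LOCKED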
return _query( // Note: nrLines >= actual nr of records in input, but this is ok since MonetDB only expects an upper bound.
"CALL sys.clearrejects();\n"+
"COPY "+nrLines+" OFFSET "+offset+" RECORDS \n"+
"INTO "+_getTablename()+" \n"+
"FROM ('"+_filepath+"') \n"+
(delimiterStr ? "DELIMITERS "+delimiterStr+"\n" : "")+
"NULL AS '" + _importOptions.nullString + "'" + lockedStr + bestEffortStr);
}).then(function() {
if(!_bestEffort) return;
// import succeeded and best effort used;
// get the reject result, but do not fail when something fails, instead return no rejects
var rejectsLimit = parseInt(_importOptions.rejectsLimit);
return _query("SELECT * FROM sys.rejects LIMIT " + (rejectsLimit > 0 ? rejectsLimit : 100)).then(function (result) {
return result.data.map(function (row) {
return row.reduce(function (o, v, i) {
o[result.structure[i].column] = v;
return o;
}, {});
});
}, function () {
return [];
});
}).then(function(rejects) {
var result = {};
if(rejects) result.rejects = rejects;
// Attach counts as well, but do not fail the import if any of these fails
return Q.allSettled([
_query("SELECT COUNT(DISTINCT rowid) FROM sys.rejects"),
_query("SELECT COUNT(*) FROM " + _getTablename())
]).then(function(d) {
result.rejectedRows = d[0].state === "fulfilled" ? d[0].value.data[0][0] : -1;
result.importedRows = d[1].state === "fulfilled" ? d[1].value.data[0][0] : -1;
// However, if the importedRows count is zero, we consider the import to have failed
if(result.importedRows == 0) {
throw new Error("All of the rows in your file failed to import");
}
return result;
});
}).then(function(result) {
fn && fn(null, result);
}, function(err) {
_query("DROP TABLE "+_getTablename());
fn && fn("Import failed. Reason: "+err);
}).fin(function() {
if(_closeConn) {
_conn.close();
}
}).done();
};
this.bestEffort = function(b) {
_bestEffort = !!b;
};
this.setLabelFn = function(fn) {
__typeCheck("function", fn);
_labelFn = fn;
};
this.setLabelTransformFn = function(fn) {
__typeCheck("function", fn);
_labelTransformFn = fn;
};
this.setSqlLogFn = function(fn) {
__typeCheck("function", fn, true);
_sqlLogFn = fn;
};
// initialize database connection
if(dbOptions.conn && dbOptions.conn.query) {
_conn = dbOptions.conn;
} else {
delete dbOptions.conn;
_closeConn = true; // indicate that the connection should be closed afterwards
_conn = new MonetDB(dbOptions);
_conn.connect().fail(function(err) {
_sqlLogFn && _sqlLogFn(err);
});
}
// initialize default options
if(_importOptions.sampleSize === undefined) _importOptions.sampleSize = 0;
if(_importOptions.locked === undefined) _importOptions.locked = true;
if(_importOptions.nullString === undefined) _importOptions.nullString = "";
}
// Static function that prepares the labels in the sniff result for insertion into the database,
// by performing the following:
// - Use the records array in the sniff result to determine the max number of columns in any row
// - Extend the labels array to include enough values for the col max found in the previous step,
// or the number of cols provided in opts.nrCols, by using the label fn
// - Apply label transform fn to all labels
// - Replace any empty values with a label generated with the label fn
// - Extend any duplicate values with '(i)'
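// For illustration: with opts.nrCols = 3 and labels ["First Name", "First Name", ""],
// the default label functions produce ["first_name", "first_name(1)", "c3"].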
Importer.prepareLabels = function(sniffResult, opts) {
if(!opts) opts = {};
if(!opts.labelFn) {
opts.labelFn = __labelFn;
}
if(!opts.labelTransformFn) {
opts.labelTransformFn = __labelTransformFn;
}
if (!sniffResult.labels) sniffResult.labels = [];
var nrCols = 0;
if(opts.nrCols === undefined) {
if (sniffResult.records) {
// determine the maximum number of columns found in any of the records
var colCountOccurrences = [];
sniffResult.records.forEach(function (cols) {
if (!colCountOccurrences[cols.length]) {
colCountOccurrences[cols.length] = 1;
} else {
++colCountOccurrences[cols.length];
}
});
Object.keys(colCountOccurrences).forEach(function (d) {
nrCols = Math.max(nrCols, parseInt(d));
});
}
} else {
nrCols = parseInt(opts.nrCols);
if(isNaN(nrCols)) nrCols = 0;
}
for(var i=0; i<nrCols; ++i) {
if (typeof(sniffResult.labels[i]) != "string") {
sniffResult.labels[i] = opts.labelFn((i + 1));
}
}
sniffResult.labels = sniffResult.labels.slice(0, nrCols);
// Transform labels and replace empty labels with labels created with the labelFn
sniffResult.labels = sniffResult.labels.map(function(label, i) {
var transformed = opts.labelTransformFn(label).trim();
return transformed.length ? transformed : opts.labelFn(i + 1);
});
// Duplicate elimination
var labelDict = {};
sniffResult.labels = sniffResult.labels.map(function(label) {
var res = label;
var count = labelDict[label];
if(count === undefined) {
count = labelDict[label] = 0;
} else {
res += "(" + count + ")";
}
labelDict[label] = ++count;
return res;
});
};
// Q Integration
["sniff", "import"].forEach(function(funToQ) {
var funQ = funToQ + 'Q';
Importer.prototype[funQ] = function() {
return Q.npost(this, funToQ, arguments);
}
});
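// This exposes promise-returning variants of the callback-based methods, so an
// Importer instance can, for example, also be used as:
//   imp.sniffQ().then(function(sniffResult) { return imp.importQ(sniffResult); });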
return Importer;
};