// qminer: A C++ based data analytics platform for processing large-scale
// real-time streams containing structured and unstructured data.
/**
 * Copyright (c) 2015, Jozef Stefan Institute, Quintelligence d.o.o. and contributors
 * All rights reserved.
 *
 * This source code is licensed under the FreeBSD license found in the
 * LICENSE file in the root directory of this source tree.
 */

var nodefs = require('fs');
var util = require('util');

module.exports = exports = function (pathQmBinary) {
    var qm = require(pathQmBinary); // this loads only the C++ functions of qm
    var fs = qm.fs;
    exports = qm;

    //!STARTJSDOC

    //==================================================================
    // BASE
    //==================================================================

    /**
    * @typedef {object} BaseLoadCSVParam
    * The parameter given to {@link module:qm.Base#loadCSV}.
    * @property {string} file - The name of the input file.
    * @property {string} store - Name of the store which will be created.
    * @property {module:qm.Base} base - QMiner base object that creates the store.
    * @property {string} [delimiter = ','] - Optional delimiter.
    * @property {string} [quote = '"'] - Optional character to escape values that contain a delimiter.
    * @property {Array.<string>} [ignoreFields = []] - Optional list of field names to ignore.
    */

    /**
    * Loads the store from a CSV file.
    * @param {module:qm~BaseLoadCSVParam} opts - Options object.
    * @param {function} [callback] - Callback function, called on errors and when the procedure finishes.
    */
    exports.Base.prototype.loadCSV = function (opts, callback) {
        if (opts.delimiter == null) opts.delimiter = ',';
        if (opts.quote == null) opts.quote = '"';
        if (opts.ignoreFields == null) opts.ignoreFields = [];
        if (opts.file == null) throw new Error('Missing parameter file!');

        if (callback == null) {
            callback = function (e) {
                if (e != null) {
                    // console.log(e.stack);
                }
            }
        }

        try {
            var base = this;

            var fname = opts.file;
            var storeName = opts.store;

            var fieldTypes = null;
            var store = null;
            var buff = [];

            var ignoreFields = {};
            for (var i = 0; i < opts.ignoreFields.length; i++)
                ignoreFields[opts.ignoreFields[i]] = null;

            // read the CSV file and fill the store
            var headers = null;

            function transformLine(line) {
                var transformed = {};

                for (var i = 0; i < line.length; i++) {
                    var header = headers[i];
                    var value = line[i];

                    if (fieldTypes != null && fieldTypes[header] != null) {
                        transformed[header] = fieldTypes[header] == 'float' ? parseFloat(value) : value;
                    } else {
                        transformed[header] = (isNaN(value) || value.length == 0) ? value : parseFloat(value);
                    }
                }

                return transformed;
            }

            function initFieldTypes(data) {
                if (fieldTypes == null) fieldTypes = {};

                for (var key in data) {
                    var val = data[key];
                    if (fieldTypes[key] == null) {
                        if (val.length == 0)
                            fieldTypes[key] = null;
                        else if (isNaN(val))
                            fieldTypes[key] = 'string';
                        else
                            fieldTypes[key] = 'float';
                    }
                }
            }

            function fieldTypesInitialized() {
                if (fieldTypes == null) return false;

                for (var key in fieldTypes) {
                    if (fieldTypes[key] == null)
                        return false;
                }

                return true;
            }

            function getUninitializedFlds() {
                var result = [];

                for (var key in fieldTypes) {
                    if (fieldTypes[key] == null)
                        result.push(key);
                }

                return result;
            }

            function createStore(rec) {
                try {
                    var storeDef = {
                        name: storeName,
                        fields: []
                    };

                    for (var fieldName in rec) {
                        storeDef.fields.push({
                            name: fieldName,
                            type: fieldTypes[fieldName],
                            "null": true
                        });
                    }

                    base.createStore(storeDef);
                    store = base.store(storeName);

                    // insert all the records in the buffer into the store
                    buff.forEach(function (data) {
                        store.push(data);
                    });
                } catch (e) {
                    callback(e);
                }
            }

            var line = 0;

            var fin = new fs.FIn(fname);
            fs.readCsvLines(fin, {
                onLine: function (lineArr) {
                    try {
                        if (line++ == 0) {
                            // the first line contains the headers
                            headers = [];
                            for (var i = 0; i < lineArr.length; i++) {
                                // remove invalid characters
                                headers.push(lineArr[i].replace(/\s+/g, '_').replace(/\.|%|\(|\)|\/|-|\+/g, ''));
                            }
                        } else {
                            var data = transformLine(lineArr);

                            if (fieldTypes == null) initFieldTypes(data);

                            if (store == null && fieldTypesInitialized()) createStore(data);
                            else if (!fieldTypesInitialized()) initFieldTypes(data);

                            if (store != null) {
                                store.push(data);
                            } else {
                                buff.push(data);
                            }
                        }
                    } catch (e) {
                        callback(e);
                    }
                },
                onEnd: function () {
                    // finished
                    if (callback != null) {
                        if (!fieldTypesInitialized()) {
                            var fieldNames = getUninitializedFlds();
                            callback(new Error('Finished with uninitialized fields: ' +
                                JSON.stringify(fieldNames) + ', add them to the ignore list!'));
                            return;
                        } else {
                            callback(undefined, store);
                        }
                    }
                }
            });

            fin.close();
        } catch (e) {
            callback(e);
        }
    };
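
    // A minimal usage sketch for Base.loadCSV; the file name 'people.csv' and
    // store name 'People' are hypothetical:
    //
    //   var qm = require('qminer');
    //   var base = new qm.Base({ mode: 'createClean' });
    //   base.loadCSV({ file: 'people.csv', store: 'People' }, function (err, store) {
    //       if (err != null) { console.log(err.message); return; }
    //       console.log('Loaded ' + store.length + ' records into ' + store.name);
    //   });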

    //==================================================================
    // STORE
    //==================================================================

    /**
    * Adds a stream aggregate trigger to a store.
    * @param {object} trigger - The trigger object containing the method {@link module:qm.StreamAggr#onAdd} and the optional
    * methods {@link module:qm.StreamAggr#onUpdate} and {@link module:qm.StreamAggr#onDelete}.
    */
    exports.Store.prototype.addTrigger = function (trigger) {
        // name is automatically generated
        // saveJson isn't needed
        var Callbacks = {
            onAdd: trigger.onAdd,
            saveJson: function (limit) { return {}; }
        };
        if (trigger.onUpdate != undefined) { Callbacks["onUpdate"] = trigger.onUpdate; }
        if (trigger.onDelete != undefined) { Callbacks["onDelete"] = trigger.onDelete; }
        var streamAggr = new exports.StreamAggr(this.base, Callbacks, this.name);
    }

    /**
    * Adds a stream aggregate to the store. For a usage example see the {@link module:qm.StreamAggr} constructor example.
    * @param {(module:qm~StreamAggregator | function)} params - Constructor arguments. There are two argument types:
    * <br>1. a {@link module:qm~StreamAggregator} object containing the schema of the stream aggregate,
    * <br>2. a function/JavaScript class defining the operations of the stream aggregate.
    */
    exports.Store.prototype.addStreamAggr = function (params) {
        return new exports.StreamAggr(this.base, params, this.name);
    }
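
    // A minimal sketch of Store.addTrigger; it assumes a store named 'People'
    // already exists in `base` (store and field names are hypothetical):
    //
    //   var people = base.store('People');
    //   people.addTrigger({
    //       onAdd: function (rec) { console.log('added record ' + rec.$id); },
    //       onDelete: function (rec) { console.log('deleted record ' + rec.$id); }
    //   });
    //   people.push({ Name: 'Ada' }); // fires onAdd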

    /**
    * Inspects the store.
    * @param {number} depth - The depth of inspection, i.e. how many times to recurse while formatting the store object.
    * @returns {string} String representation of the store.
    */
    exports.Store.prototype.inspect = function (depth) {
        var d = (depth == null) ? 0 : depth;
        return util.inspect(this, { depth: d, 'customInspect': false });
    }

    /**
    * Loads the given file line by line, parses each line to JSON and pushes it to the store.
    * @param {String} file - Name of the JSON line file.
    * @param {Number} [limit] - Maximal number of records to load from the file.
    * @returns {number} Number of records loaded from the file.
    */
    exports.Store.prototype.loadJson = function (file, limit) {
        var fin = fs.openRead(file);
        var count = 0;
        while (!fin.eof) {
            var line = fin.readLine();
            if (line == "") { continue; }
            try {
                var rec = JSON.parse(line);
                this.push(rec);
                count++;
                if (limit != undefined && count == limit) { break; }
            } catch (err) {
                throw new Error("Error parsing line number: " + count + ", line content: [" + line + "]: " + err);
            }
        }
        return count;
    }

    //==================================================================
    // RECORD SET
    //==================================================================

    /**
    * Stores the record set as a CSV file.
    *
    * @param {Object} opts - Arguments.
    * @property {String} opts.fname - Name of the output file.
    * @property {Boolean} [opts.includeHeaders = true] - Indicates whether to include the header in the first line.
    * @property {String} [opts.timestampType = 'timestamp'] - Date format. Possible options:
    * <br>1. `'ISO'` - Datetime fields will be printed as ISO dates,
    * <br>2. `'timestamp'` - Datetime fields will be printed as timestamps.
    * @property {String} [opts.escapeChar = '"'] - Character which escapes quotes.
    */
    exports.RecSet.prototype.saveCsv = function (opts) {
        if (opts == null || opts.fname == null) throw new Error('Missing parameter fname!');
        if (opts.includeHeaders == null) opts.includeHeaders = true;
        if (opts.timestampType == null) opts.timestampType = 'timestamp';
        if (opts.escapeChar == null) opts.escapeChar = '"';

        var escapeStr = opts.escapeChar + '"';

        // read field descriptions
        var fields = this.store.fields;
        var fieldDesc = [];
        for (var i = 0; i < fields.length; i++) {
            var desc = fields[i];
            var type = desc.type;

            if (type != 'float' && type != 'int' && type != 'bool' && type != 'datetime' && type != 'string')
                throw new Error('Invalid field type: ' + type);
            if (desc.internal) continue;

            fieldDesc.push({ name: desc.name, type: desc.type });
        }

        var nFields = fieldDesc.length;
        var useTimestamp = opts.timestampType != 'ISO';

        var fout = new fs.FOut(opts.fname);

        // write the headers
        if (opts.includeHeaders) {
            var headerLine = '';
            for (var i = 0; i < nFields; i++) {
                headerLine += '"' + fieldDesc[i].name.replace(/"/g, escapeStr) + '"';
                if (i < nFields - 1) headerLine += ',';
            }
            fout.writeLine(headerLine);
        }

        // write the lines
        var len = this.length;
        var recN = 0;
        this.each(function (rec) {
            var line = '';
            for (var i = 0; i < nFields; i++) {
                var fldVal = rec[fieldDesc[i].name];
                var type = fieldDesc[i].type;

                if (fldVal != null) {
                    if (type == 'float' || type == 'int' || type == 'bool') {
                        line += fldVal;
                    } else if (type == 'datetime') {
                        line += useTimestamp ? fldVal.getTime() : fldVal.toISOString();
                    } else if (type == 'string') {
                        line += '"' + fldVal.replace(/"/g, escapeStr) + '"';
                    } else {
                        throw new Error('Invalid type of field: ' + type);
                    }
                }

                if (i < nFields - 1) line += ',';
            }

            if (recN++ < len - 1)
                fout.writeLine(line);
            else
                fout.write(line);
        });

        fout.flush();
        fout.close();
    }
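
    // Minimal sketches for Store.loadJson and RecSet.saveCsv; the file names and
    // the store name 'People' are hypothetical (loadJson expects one JSON record per line):
    //
    //   var people = base.store('People');
    //   var count = people.loadJson('people.jsonl', 1000); // load at most 1000 records
    //   people.allRecords.saveCsv({ fname: 'people.csv', timestampType: 'ISO' });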

    //==================================================================
    // CIRCULAR RECORD BUFFER
    //==================================================================

    /**
    * @classdesc Circular buffer for storing records. The size of the buffer is defined at
    * construction and is denoted in number of records. When the buffer is full, old records
    * are removed from the buffer and new records are stored in their place. A callback is
    * called on adding and on deleting. Records are stored by their IDs.
    * @class
    * @param {Object} [params] - Constructor parameters.
    * @param {module:qm.Store} params.store - Store for the records in the buffer.
    * @param {number} params.size - Size of the buffer (number of records).
    * @param {function} [params.onAdd] - Callback executed when a new record is
    * added to the buffer. The callback is given two parameters:
    * <br>`rec` - The record. Type {@link module:qm.Record}.
    * <br>`circRecBuff` - The circular record buffer instance. Type {@link module:qm.CircularRecordBuffer}.
    * @param {function} [params.onDelete] - Callback executed when a record is removed
    * from the buffer. The callback is given two parameters:
    * <br>`rec` - The record. Type {@link module:qm.Record}.
    * <br>`circRecBuff` - The circular record buffer instance. Type {@link module:qm.CircularRecordBuffer}.
    * @example
    * // TODO
    */
    exports.CircularRecordBuffer = function (params) {
        // check that we have all the necessary parameters
        if (params.store == undefined) { throw new Error("CircularRecordBuffer requires store in constructor"); }
        if (!(params.store instanceof qm.Store)) { throw new Error("CircularRecordBuffer requires store in constructor: " + params.store); }
        if (params.size == undefined) { throw new Error("CircularRecordBuffer requires size in constructor"); }
        if (!(params.size >= 1)) { throw new Error("CircularRecordBuffer requires a positive size in constructor"); }
        // parameters
        this.store = params.store;
        this.size = params.size;
        this.buffer = new qm.la.IntVector();
        this.next = 0;
        // callbacks
        this.onAdd = (params.onAdd == undefined) ? function () {} : params.onAdd;
        this.onDelete = (params.onDelete == undefined) ? function () {} : params.onDelete;

        /**
        * Loads the circular buffer from an input stream. Assumes store, onAdd and onDelete
        * were already initialized in the constructor.
        * @param {module:fs.FIn} fin - Input stream.
        * @example
        * // TODO
        */
        this.load = function (fin) {
            var finParam = fin.readJson();
            this.size = finParam.size;
            this.next = finParam.next;
            this.buffer.load(fin);
        }

        /**
        * Saves the circular buffer to the output stream. Does not save the store or the onAdd
        * and onDelete callbacks.
        * @param {module:fs.FOut} fout - Output stream.
        * @returns {module:fs.FOut} The output stream `fout`.
        * @example
        * // TODO
        */
        this.save = function (fout) {
            fout.writeJson({
                size: this.size,
                next: this.next
            });
            this.buffer.save(fout);
            return fout;
        }

        /**
        * Adds a new record to the buffer.
        * @param {module:qm.Record} rec - New record.
        * @example
        * // TODO
        */
        this.push = function (rec) {
            if (this.buffer.length < this.size) {
                // the buffer is not full yet, just add the new element
                this.buffer.push(rec.$id);
                this.onAdd(rec, this);
            } else {
                // the buffer is full: first delete the oldest record,
                var oldRec = this.store[this.buffer[this.next]];
                this.onDelete(oldRec, this);
                // then remember the new record in its place,
                this.buffer[this.next] = rec.$id;
                this.onAdd(rec, this);
                // and move the pointer to the oldest record forwards
                this.next++;
                if (this.next == this.size) {
                    this.next = 0;
                }
            }
        }
    }
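
    // A minimal sketch of CircularRecordBuffer; it assumes a populated store
    // `people` (the store and variable names are hypothetical):
    //
    //   var buff = new qm.CircularRecordBuffer({
    //       store: people,
    //       size: 100,
    //       onAdd: function (rec, buffer) { console.log('buffered ' + rec.$id); },
    //       onDelete: function (rec, buffer) { console.log('evicted ' + rec.$id); }
    //   });
    //   people.allRecords.each(function (rec) { buff.push(rec); });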

    //==================================================================
    // FEATURE SPACE
    //==================================================================

    //#- `qm.FeatureSpace.getSparseVectorFeatures(spVec)` -- Return an array of feature
    //# names, based on the feature space, for which the elements of the sparse feature
    //# vector `spVec` are non-zero.
    exports.FeatureSpace.prototype.getSparseVectorFeatures = function (spVec) {
        // get the index vector
        var idxVec = spVec.idxVec();

        var cols = [];
        for (var elN = 0; elN < idxVec.length; elN++) {
            cols.push(this.getFeature(idxVec[elN]));
        }
        return cols;
    }
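
    // A minimal sketch of FeatureSpace.getSparseVectorFeatures; the extractor
    // definition and the store name 'People' are hypothetical:
    //
    //   var ftrSpace = new qm.FeatureSpace(base, { type: 'text', source: 'People', field: 'Name' });
    //   ftrSpace.updateRecords(base.store('People').allRecords);
    //   var spVec = ftrSpace.extractSparseVector(base.store('People').first);
    //   console.log(ftrSpace.getSparseVectorFeatures(spVec)); // names of the non-zero features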

    //==================================================================
    // EXPORTS
    //==================================================================

    // deprecated, kept for backwards compatibility
    exports.load = function () {
        var _obj = {};

        _obj.jsonFileLimit = function (store, file, limit) {
            return store.loadJson(file, limit);
        }

        _obj.jsonFile = function (store, file) {
            return store.loadJson(file);
        }

        return _obj;
    }();

    exports.delLock = function () {
        if (nodefs.existsSync('lock')) {
            try {
                nodefs.unlinkSync('lock');
            } catch (e) {
                console.log(e);
            }
        }
    }

    exports.rmDir = function rmDir(dirPath) {
        try {
            var files = nodefs.readdirSync(dirPath);
        } catch (e) {
            return;
        }
        if (files.length > 0) {
            for (var i = 0; i < files.length; i++) {
                var filePath = dirPath + '/' + files[i];
                if (nodefs.statSync(filePath).isFile())
                    nodefs.unlinkSync(filePath);
                else
                    rmDir(filePath); // recurse into the subdirectory
            }
        }
        nodefs.rmdirSync(dirPath);
    };

    function forbidConstructor(obj) {
        var proto = obj.prototype;
        obj = function () {
            throw new Error('constructor is private, ' + obj.prototype.constructor.name + ' is factory based.');
        }
        obj.prototype = proto;
        return obj;
    }

    // Forbid constructors that would crash node - these objects are factory constructed
    exports.Store = forbidConstructor(exports.Store);
    exports.RecSet = forbidConstructor(exports.RecSet);

    //!ENDJSDOC

    return exports;
}