UNPKG

rdd.js

Version:

A Node.js library for processing files and data in the style of Spark RDDs.

237 lines (192 loc) 5.48 kB
const fs = require('fs');
const cs = require('co-stream');
const glob = require('glob');
const MultiStream = require('multistream');
const csv = require('csv');

// Number of buffered characters after which lineWriter flushes a chunk downstream.
const LINE_WRITER_FLUSH_CHARS = 131072;

/**
 * Extracts the key of a key/value item.
 * An item is either a `[key, value]` array or an object with a `key` property.
 */
const getKey = item => (Array.isArray(item) ? item[0] : item.key);

/**
 * Extracts the value of a key/value item.
 * An item is either a `[key, value]` array or an object with a `value`
 * (or, as a fallback, `val`) property.
 */
const getValue = item => {
  if (Array.isArray(item)) {
    return item[1];
  }
  // Only fall back to `val` when `value` is absent; a plain `||` would also
  // discard legitimate falsy values such as 0, '' or false.
  return item.value !== undefined && item.value !== null ? item.value : item.val;
};

/**
 * Opens the file(s) designated by `path` as one readable stream.
 *
 * @param {string|string[]} path - A glob pattern, or an explicit list of paths.
 *   Entries that are not regular files are skipped.
 * @returns {stream.Readable} A single file stream, or a MultiStream
 *   concatenating all matched files in order.
 * @throws {Error} When no regular file is matched. `fs.statSync` errors for
 *   non-existent explicit paths propagate unchanged.
 */
function pathToStream(path) {
  const candidates = Array.isArray(path) ? path : glob.sync(path);
  const streams = candidates
    .filter(p => fs.statSync(p).isFile())
    .map(p => fs.createReadStream(p));

  if (streams.length === 0) {
    // Report what the caller asked for: for a glob pattern the list of
    // matches is empty and would carry no information.
    const requested = Array.isArray(path) ? path.join(',') : path;
    throw new Error('No file matched for path: [' + requested + ']');
  }
  if (streams.length === 1) {
    return streams[0];
  }
  // `new` works with both function-style and class-style MultiStream exports.
  return new MultiStream(streams);
}

/**
 * Creates a through stream that joins incoming items into newline-terminated
 * text, emitting a chunk roughly every LINE_WRITER_FLUSH_CHARS characters.
 *
 * @param {function(*): *} [lineTransform] - Optional conversion applied to
 *   every item before it is written (e.g. JSON.stringify).
 * @returns {stream.Duplex} The co-stream `through` stream.
 */
function lineWriter(lineTransform) {
  let buf = [];
  let bufLen = 0;

  // Emits everything buffered so far; the trailing '' makes join() end the
  // chunk with a newline.
  const flush = target => {
    buf.push('');
    target.queue(buf.join('\n'));
    buf = [];
    bufLen = 0;
  };

  return cs.through(function (line) {
    const out = lineTransform ? lineTransform(line) : line;
    buf.push(out);
    // Array#join renders null/undefined as '' and stringifies everything
    // else, so measure the same way. `out.length` alone is undefined for
    // numbers (turning the counter into NaN) and throws for null/undefined.
    bufLen += out === undefined || out === null ? 0 : String(out).length;
    if (bufLen >= LINE_WRITER_FLUSH_CHARS) {
      flush(this);
    }
  }, function () {
    // Test the buffer itself, not the character count: pending empty lines
    // have zero length but must still be written.
    if (buf.length > 0) {
      flush(this);
    }
    this.queue(null);
  });
}

/**
 * A lazily evaluated, chainable description of a stream pipeline.
 *
 * Transformations (map, flatMap, filter) return a new RDD and do no work.
 * Actions (reduce, count, collect, forEach, saveAs*) build the stream and
 * return a promise.
 */
class RDD {
  /** Creates an RDD whose source is `cs.fromIterable(iterable)`. */
  static fromIterable(iterable) {
    return new RDD(() => cs.fromIterable(iterable));
  }

  /** Creates an RDD over the file(s) at `path`, split with `cs.split`. */
  static fromTextFile(path) {
    return new RDD(() => pathToStream(path), { text: true });
  }

  /**
   * Creates an RDD over line-delimited JSON file(s): each non-blank line is
   * parsed with JSON.parse, blank lines are mapped to null.
   */
  static fromJsonFile(path) {
    return new RDD(() =>
      pathToStream(path)
        .pipe(cs.split())
        .pipe(cs.object.map(line => (line.trim() ? JSON.parse(line) : null)))
    );
  }

  /**
   * Creates an RDD over CSV file(s).
   * @param {string|string[]} path - Glob pattern or list of paths.
   * @param {object} [opt={ columns: true }] - Options passed to `csv.parse`.
   */
  static fromCsvFile(path, opt = { columns: true }) {
    return new RDD(() => pathToStream(path).pipe(csv.parse(opt)));
  }

  /**
   * @param {function(): (stream.Readable|Promise)} fnGetStream - Produces the
   *   source stream (or a promise of it) each time an action runs.
   * @param {object} [opt] - `{ text: true }` prepends `cs.split` as first op.
   */
  constructor(fnGetStream, opt) {
    this._ops = [];
    this._opt = {};
    if (opt && opt.text) {
      this._ops.push(cs.split);
    }
    this._fnGetStream = fnGetStream;
  }

  /**
   * Builds the pipeline and returns its output.
   * When the stream getter returns a stream, this is the final stream. When it
   * returns a promise, this is an object exposing only `pipe(dest)`, which
   * returns a promise of the piped destination.
   */
  toStream() {
    return this._opsToStream();
  }

  /** Sets the `parallel` option on this RDD and returns the same RDD. */
  parallel(p) {
    this._opt.parallel = p;
    return this;
  }

  /** Returns a new RDD with `cs.object.map(mapFn)` appended. */
  map(mapFn) {
    return this._push(() => cs.object.map(mapFn, this._opt));
  }

  /** Returns a new RDD with `cs.object.flatMap(mapFn)` appended. */
  flatMap(mapFn) {
    return this._push(() => cs.object.flatMap(mapFn, this._opt));
  }

  /** Returns a new RDD with `cs.object.filter(filterFn)` appended. */
  filter(filterFn) {
    return this._push(() => cs.object.filter(filterFn, this._opt));
  }

  /**
   * Folds all items into a single value.
   *
   * @param {function(*, *): *} reduceFn - `(accumulator, item) => accumulator`.
   * @param {*} [initVal] - Initial accumulator. When undefined or null the
   *   first item seeds the accumulator and reduceFn is not called for it.
   * @returns {Promise<*>} The final accumulator (`initVal` if no item arrived).
   */
  reduce(reduceFn, initVal) {
    const seedWithFirstItem = initVal === undefined || initVal === null;
    let first = true;
    let acc = initVal;

    const sink = cs.object.each(item => {
      if (first && seedWithFirstItem) {
        first = false;
        acc = item;
      } else {
        acc = reduceFn(acc, item);
      }
    }, this._opt);

    this._opsToStream().pipe(sink);
    return this._streamToPromise(sink, () => acc);
  }

  /** Resolves with the number of items. */
  count() {
    return this.reduce(m => m + 1, 0);
  }

  /** Resolves with an array of all items. */
  collect() {
    return this.reduce((m, i) => {
      m.push(i);
      return m;
    }, []);
  }

  /** Calls `fn` for every item; resolves with undefined when done. */
  forEach(fn) {
    const sink = cs.object.each(fn, this._opt);
    this._opsToStream().pipe(sink);
    return this._streamToPromise(sink, () => {});
  }

  /** Writes every item as one line of `path`; resolves when the file is written. */
  saveAsTextFile(path) {
    return this._saveThrough(path, () => lineWriter());
  }

  /** Writes every item as one JSON line of `path`; resolves when the file is written. */
  saveAsJsonFile(path) {
    return this._saveThrough(path, () => lineWriter(item => JSON.stringify(item)));
  }

  /**
   * Writes all items to `path` as CSV.
   * @param {string} path - Output file.
   * @param {object} [opt={ header: true }] - Options passed to `csv.stringify`.
   */
  saveAsCsvFile(path, opt = { header: true }) {
    return this._saveThrough(path, () => csv.stringify(opt));
  }

  /**
   * Groups items by key and folds each group.
   *
   * @param {function(*): *} keyFn - Computes the grouping key of an item.
   * @param {function(*, *): *} reduceFn - `(accumulator, value) => accumulator`.
   * @param {*} [initVal] - Initial accumulator per key. When undefined or
   *   null, the first value of a key seeds its accumulator.
   * @param {function(*): *} [valueFn] - Computes the value to fold; defaults
   *   to the item itself.
   * @returns {RDD} An RDD built from a promise of the resulting Map.
   *   NOTE(review): relies on cs.fromIterable accepting a promise - confirm.
   */
  reduceBy(keyFn, reduceFn, initVal, valueFn) {
    const groups = new Map();
    const hasInit = initVal !== undefined && initVal !== null;

    const sink = cs.object.each(item => {
      const key = keyFn(item);
      const value = valueFn ? valueFn(item) : item;
      if (groups.has(key)) {
        groups.set(key, reduceFn(groups.get(key), value));
      } else {
        groups.set(key, hasInit ? reduceFn(initVal, value) : value);
      }
    }, this._opt);

    this._opsToStream().pipe(sink);
    return RDD.fromIterable(this._streamToPromise(sink, () => groups));
  }

  /** Groups items into arrays keyed by `keyFn(item)`. */
  groupBy(keyFn) {
    // concat([item]) rather than concat(item): items that are arrays must
    // not be flattened into the group.
    return this.reduceBy(keyFn, (m, item) => m.concat([item]), []);
  }

  /** reduceBy for key/value items (see getKey / getValue). */
  reduceByKey(reduceFn, initVal) {
    return this.reduceBy(getKey, reduceFn, initVal, getValue);
  }

  /** Groups the values of key/value items into arrays per key. */
  groupByKey() {
    return this.reduceByKey((m, v) => m.concat([v]), []);
  }

  /**
   * Pipes the pipeline output through a freshly created encoder stream into
   * a file.
   *
   * The encoder is connected to the file first and the source is piped into
   * the encoder afterwards, so exactly one `pipe` call is made on the value
   * returned by _opsToStream(). In the promise case that value's `pipe`
   * returns a Promise, on which a second chained `.pipe` would throw.
   *
   * @param {string} path - Output file.
   * @param {function(): stream.Duplex} makeEncoder - Creates the encoder.
   * @returns {Promise<undefined>} Settles with the file write stream.
   */
  _saveThrough(path, makeEncoder) {
    const writer = fs.createWriteStream(path);
    const encoder = makeEncoder();
    encoder.pipe(writer);
    this._opsToStream().pipe(encoder);
    return this._streamToPromise(writer, () => {});
  }

  /**
   * Returns a promise resolved with `valueGetter()` once `stream` emits
   * 'end' or 'finish', and rejected when it emits 'error'. Both completion
   * events are observed; a promise settles only once, so seeing both is
   * harmless.
   */
  _streamToPromise(stream, valueGetter) {
    return new Promise((resolve, reject) => {
      stream.on('end', () => resolve(valueGetter()));
      stream.on('finish', () => resolve(valueGetter()));
      stream.on('error', err => reject(err));
    });
  }

  /**
   * Returns a new RDD sharing this RDD's source with `op` appended.
   * The new RDD starts with an empty option object; ops created by
   * map/flatMap/filter read the options of the RDD they were called on at
   * the time the stream is built.
   */
  _push(op) {
    const ret = new RDD(this._fnGetStream);
    ret._ops = [].concat(this._ops, [op]);
    return ret;
  }

  /** Pipes `stream` through a fresh instance of every recorded op, in order. */
  _applyOps(stream) {
    let current = stream;
    for (const op of this._ops) {
      current = current.pipe(op());
    }
    return current;
  }

  /**
   * Invokes the stream getter and attaches all ops.
   * @returns {stream.Readable|{pipe: function(stream.Writable): Promise}}
   *   The final stream, or - when the getter returned a promise - an object
   *   whose `pipe(dest)` resolves to the piped destination once the source
   *   is available.
   */
  _opsToStream() {
    const source = this._fnGetStream();
    if (source.then) {
      // NOTE(review): a rejection of `source` is not forwarded to `dest`, so
      // an action waiting on `dest` will not settle in that case.
      return {
        pipe: dest => source.then(stream => this._applyOps(stream).pipe(dest)),
      };
    }
    return this._applyOps(source);
  }
}

module.exports = RDD;