files-sets
Version:
nodejs命令行对用行分隔的文件做集合运算
159 lines (149 loc) • 5.32 kB
JavaScript
//传进来的文件默认是一个个集合
const fs = require("fs");
const EventEmitter = require("events");
const readLine = require('lei-stream').readLine;
const et = new EventEmitter();
const path = require('path');
// Paths of the files to process. The two defaults are overwritten with the
// --base / --compare arguments when the "intersection" command runs.
const fileArr = ['./white_list_users.txt', './22_23.txt'];
// Sets handed to queuePush(), one per fully-read file (memory mode).
const queue = [];
// keysArr[i] is the set of lines read from fileArr[i] (memory mode).
const keysArr = [];
// Number of files that must finish before the 'allEnd' event is emitted.
let queueEndLength = 0;
// Running totals maintained by the chunked mode.
let intersectionCount = 0;
let handleCount = 0;
// Chunk of base-file lines currently held in memory (chunked mode).
const currentSet = new Set();
// Taken once at module load; elapsed-time reports are measured from here.
const startTime = Date.now();
// Number of distinct base-file lines per chunk; defaults to 12,000,000.
let oneSize = 1200 * 10000;
// Registers the "intersection" sub-command. Reading `.argv` at the end of the
// chain is what triggers argument parsing and, through it, the handler below.
let argv = require('yargs').command("intersection", '求交集', function(yargs) {
  // Parse the sub-command's own options: -b/--base and -c/--compare are
  // required, -m/--mode and -s/--size are optional.
  const options = yargs.reset()
    .demand(['b', 'c'])
    .describe({'b': "需要计算的基础集合的文件路径", "c": "需要计算的对比集合的文件路径"})
    .alias("b", "base")
    .alias('c', 'compare')
    .help("h")
    .alias("h", "help")
    .option("m", {
      alias: "mode",
      default: false,
      describe: "是否开启内存比较模式,内存比较模式很快,但是不适合大文件,超过500m的文件建议不要开启这个模式",
      type: 'string'
    })
    .option("s", {
      alias: "size",
      default: oneSize,
      describe: "每次加载多少条内容到内存中,默认是12000000",
      type: 'string'
    })
    .argv;

  fileArr[0] = options.b;
  fileArr[1] = options.c;

  // The option is declared as a string, so the literal "false" has to be
  // mapped by hand; any other value goes through normal truthiness.
  const useMemoryMode = options.m === 'false' ? false : Boolean(options.m);

  if (options.s) {
    const requestedSize = Number(options.s);
    // NaN would make the `size >= oneSize` chunk test always false (the whole
    // base file ends up in memory) and a value <= 0 would make it always true
    // (the compare file is rescanned after every line), so refuse both.
    if (Number.isNaN(requestedSize) || requestedSize <= 0) {
      console.error(`size参数无效(${options.s}),必须是大于0的数字`);
      process.exitCode = 1;
      return;
    }
    oneSize = requestedSize;
  }

  // The chunked (slower, memory-bounded) implementation is the default.
  if (useMemoryMode) {
    intersectionSmall(fileArr);
  } else {
    intersectionBig(fileArr);
  }
}).help("h").alias("h", "help").argv;
// intersectionBig(fileArr)
/**
 * Chunked intersection of exactly two line-delimited files.
 *
 * Lines of fileArr[0] are accumulated in the module-level `currentSet`. Each
 * time it reaches `oneSize` distinct lines, calculate() scans fileArr[1]
 * against that chunk and the totals are updated before `next()` is called to
 * continue reading. The remaining lines are processed once the base file ends,
 * and a summary is printed.
 *
 * Note: de-duplication is per chunk, so a base line that appears in more than
 * one chunk is counted once per chunk in both totals.
 *
 * @param {string[]} fileArr - [basePath, comparePath]; any other length is
 *   reported with console.error and nothing is computed.
 */
function intersectionBig(fileArr) {
  console.log(`${new Date()}开始计算...`);
  queueEndLength = fileArr.length;
  if (queueEndLength !== 2) {
    console.error('必须是2个文件');
    return;
  }
  const basePath = fileArr[0];
  const comparePath = fileArr[1];

  // Intersects the current chunk with the compare file, folds the result into
  // the running totals, empties the chunk and passes calculate()'s result on.
  const flushChunk = () => calculate({sets: currentSet, path: comparePath}).then((data) => {
    handleCount += currentSet.size;
    currentSet.clear();
    intersectionCount += data.sets.size;
    return data;
  });

  // A failed chunk must not vanish as an unhandled rejection.
  const reportFailure = (err) => {
    console.error(err);
    process.exitCode = 1;
  };

  const baseStream = fs.createReadStream(basePath);
  readLine(baseStream).go((line, next) => {
    currentSet.add(line.toString());
    if (currentSet.size >= oneSize) {
      // next() is deferred until the chunk has been processed, so no further
      // base lines are added while the compare file is being scanned.
      flushChunk().then(() => {
        console.log(`${new Date()} 正在计算中...当前交集数为:${intersectionCount}个,已处理数据量${handleCount}`);
        next();
      }).catch(reportFailure);
    } else {
      next();
    }
  }, () => {
    // End of the base file: process whatever is left in the last chunk.
    flushChunk().then((data) => {
      const endTime = new Date().getTime();
      const takeTime = Math.floor((endTime - startTime) / (1000 * 60));
      console.log(`计算完成,总耗时${takeTime}分钟,总交集数量为${intersectionCount},总处理数据量为基础集合数据量为:${handleCount};对比的集合数据量:${data.handleCount}`);
    }).catch(reportFailure);
  });
}
/**
 * Intersects an in-memory set with the lines of a file.
 *
 * The file is read line by line with readLine(); every line is counted, and
 * lines that are members of `options.sets` are collected.
 *
 * @param {{sets: Set<string>, path: string}} options - `sets` is the set to
 *   test membership against, `path` is the file to scan.
 * @returns {Promise<{sets: Set<string>, handleCount: number}>} Resolves with
 *   the matching lines (`sets`) and the number of lines read (`handleCount`).
 *   Rejects with Error('参数错误') when `sets` or `path` is missing.
 */
function calculate(options) {
  const opts = options || {};
  const sets = opts.sets;
  // Named filePath so the module-level `path` import is not shadowed.
  const filePath = opts.path;
  return new Promise((resolve, reject) => {
    if (!sets || !filePath) {
      reject(new Error('参数错误'));
      return;
    }
    const intersectionSet = new Set();
    let count = 0;
    readLine(filePath).go((currentValue, next) => {
      // The sets passed in are filled with line.toString() values, so the
      // lookup key is normalised to a string the same way.
      const line = String(currentValue);
      if (sets.has(line)) {
        intersectionSet.add(line);
      }
      ++count;
      next();
    }, () => {
      resolve({sets: intersectionSet, handleCount: count});
    });
  });
}
/**
 * In-memory intersection: reads every file completely into its own Set
 * (stored in the module-level `keysArr`) and, once all files have been read,
 * prints the size of their common intersection and the elapsed minutes.
 *
 * @param {string[]} fileArr - Paths of the line-delimited files; an empty
 *   list is reported with console.error and nothing is computed.
 */
function intersectionSmall(fileArr) {
  queueEndLength = fileArr.length;
  if (queueEndLength < 1) {
    console.error('至少需要一个文件');
    return;
  }

  // Fired by queuePush() after the last file has been fully read.
  et.on('allEnd', () => {
    let common = keysArr[0];
    for (let i = 1; i < keysArr.length; ++i) {
      const other = keysArr[i];
      common = new Set([...common].filter((x) => other.has(x)));
    }
    console.log('交集数量:', common.size);
    const endTime = new Date().getTime();
    const minutes = Math.floor((endTime - startTime) / (1000 * 60));
    console.log('耗时' + minutes + "分钟");
  });

  fileArr.forEach((filePath, i) => {
    const lines = new Set();
    keysArr[i] = lines;
    readLine(filePath).go((line, next) => {
      lines.add(line.toString());
      next();
    }, () => {
      // Completion is reported directly instead of through an event named
      // after the path: with a path-named event, passing the same file twice
      // (or a file called "allEnd") triggers the listeners more than once and
      // the final computation starts before every file has been read.
      queuePush(lines);
    });
  });
}
/**
 * Records one finished file's set in the module-level `queue` and emits
 * 'allEnd' on `et` when the queue length equals `queueEndLength`.
 *
 * @param {*} data - Value to append to the queue.
 */
function queuePush(data) {
  // Array.prototype.push returns the new length of the array.
  const collected = queue.push(data);
  if (collected !== queueEndLength) {
    return;
  }
  et.emit('allEnd');
}