UNPKG

files-sets

Version:

nodejs命令行对用行分隔的文件做集合运算

159 lines (149 loc) 5.32 kB
#!/usr/bin/env node
// Each input file is treated as a set whose elements are the file's lines.
const fs = require("fs");
const EventEmitter = require("events");
const readLine = require('lei-stream').readLine;
const et = new EventEmitter();
const path = require('path');

// ---------------------------------------------------------------------------
// Module-level state shared by the two intersection strategies.
// ---------------------------------------------------------------------------

// Default input files; both entries are overwritten by the -b / -c options.
const fileArr = ['./white_list_users.txt', './22_23.txt'];
// Sets that have finished loading (in-memory mode only).
const queue = [];
// One Set per input file, indexed like the file list (in-memory mode only).
const keysArr = [];
// Number of files that must finish loading before the result is computed.
let queueEndLength = 0;
// Running totals reported by the chunked mode.
let intersectionCount = 0;
let handleCount = 0;
// Chunk of base-file lines currently held in memory (chunked mode only).
const currentSet = new Set();
const startTime = new Date().getTime();
// Maximum number of base-file lines per chunk; defaults to 12,000,000.
let oneSize = 1200 * 10000;

/**
 * Whole minutes elapsed since the script started.
 * @returns {number}
 */
function elapsedMinutes() {
  return Math.floor((new Date().getTime() - startTime) / (1000 * 60));
}

/**
 * Last-resort handler for failed promise chains: prints the error and makes
 * the process exit with a non-zero status instead of failing silently.
 * @param {Error} err
 */
function reportFailure(err) {
  console.error(err);
  process.exitCode = 1;
}

/**
 * Intersects the chunk currently held in `currentSet` with the compare file,
 * folds the outcome into the running totals and empties the chunk.
 * @param {string} comparePath - path of the compare file
 * @returns {Promise<{sets: Set<string>, handleCount: number}>} result of `calculate`
 */
function flushChunk(comparePath) {
  return calculate({ sets: currentSet, path: comparePath }).then((data) => {
    handleCount += currentSet.size;
    currentSet.clear();
    intersectionCount += data.sets.size;
    return data;
  });
}

/**
 * Chunked ("big file") intersection of exactly two files.
 *
 * The base file (fileArr[0]) is read line by line into `currentSet`. Whenever
 * the chunk reaches `oneSize` entries, reading is held back (by not calling
 * `next`) while the whole compare file (fileArr[1]) is streamed against the
 * chunk. Per-chunk intersection sizes are summed, so a value that lands in
 * several different chunks of the base file is counted once per chunk.
 *
 * @param {string[]} fileArr - [basePath, comparePath]
 */
function intersectionBig(fileArr) {
  console.log(`${new Date()}开始计算...`);
  queueEndLength = fileArr.length;
  if (queueEndLength !== 2) {
    console.error('必须是2个文件');
    return;
  }

  const comparePath = fileArr[1];
  const baseStream = fs.createReadStream(fileArr[0]);

  readLine(baseStream).go((line, next) => {
    currentSet.add(line.toString());
    if (currentSet.size < oneSize) {
      next();
      return;
    }
    // Chunk is full: process it before asking for the next line.
    flushChunk(comparePath)
      .then(() => {
        console.log(`${new Date()} 
正在计算中...当前交集数为:${intersectionCount}个,已处理数据量${handleCount}`);
        next();
      })
      .catch(reportFailure);
  }, () => {
    // End of the base file: process whatever is left in the last chunk.
    flushChunk(comparePath)
      .then((data) => {
        const takeTime = elapsedMinutes();
        console.log(`计算完成,总耗时${takeTime}分钟,总交集数量为${intersectionCount},总处理数据量为基础集合数据量为:${handleCount};对比的集合数据量:${data.handleCount}`);
      })
      .catch(reportFailure);
  });
}

/**
 * Streams the file at `options.path` and collects every line that is a
 * member of `options.sets`.
 *
 * @param {{sets: Set<string>, path: string}} options
 * @returns {Promise<{sets: Set<string>, handleCount: number}>} the matching
 *   lines and the number of lines read; rejects with an Error when `sets` or
 *   `path` is missing.
 */
function calculate(options) {
  const settings = options || {};
  const sets = settings.sets;
  const comparePath = settings.path;

  return new Promise((resolve, reject) => {
    if (!sets || !comparePath) {
      reject(new Error('参数错误'));
      return;
    }

    const intersectionSet = new Set();
    let count = 0;
    readLine(comparePath).go((currentValue, next) => {
      // Normalise exactly like the base set is populated (line.toString()),
      // so membership does not depend on the value type the reader yields.
      const key = String(currentValue);
      if (sets.has(key)) {
        intersectionSet.add(key);
      }
      ++count;
      next();
    }, () => {
      resolve({ sets: intersectionSet, handleCount: count });
    });
  });
}

/**
 * In-memory intersection: loads every file into its own Set and, once all
 * files have been read, prints the size of their common intersection and the
 * elapsed time.
 *
 * @param {string[]} fileArr - paths of the files to intersect (at least one)
 */
function intersectionSmall(fileArr) {
  queueEndLength = fileArr.length;
  if (queueEndLength < 1) {
    console.error('至少需要一个文件');
    // Nothing to read: without this return the function would wait forever.
    return;
  }

  et.once('allEnd', () => {
    let common = keysArr[0];
    for (let i = 1; i < keysArr.length; ++i) {
      const other = keysArr[i];
      const narrowed = new Set();
      for (const value of common) {
        if (other.has(value)) {
          narrowed.add(value);
        }
      }
      common = narrowed;
    }
    console.log('交集数量:', common.size);
    console.log(`耗时${elapsedMinutes()}分钟`);
  });

  fileArr.forEach((file, i) => {
    const lines = new Set();
    keysArr[i] = lines;
    readLine(file).go((line, next) => {
      lines.add(line.toString());
      next();
    }, () => {
      // Report completion directly rather than through an event named after
      // the path: path-named events fire twice when the same file is given
      // twice, and could collide with the 'allEnd' event.
      queuePush(lines);
    });
  });
}

/**
 * Records one fully loaded set and emits 'allEnd' once every expected file
 * has been loaded.
 * @param {Set<string>} data
 */
function queuePush(data) {
  queue.push(data);
  if (queue.length === queueEndLength) {
    et.emit('allEnd');
  }
}

// ---------------------------------------------------------------------------
// Command-line wiring. Reading `.argv` is what triggers argument parsing.
// ---------------------------------------------------------------------------
const argv = require('yargs')
  .command("intersection", '求交集', function (yargs) {
    const args = yargs.reset()
      .demand(['b', 'c'])
      .describe({
        'b': "需要计算的基础集合的文件路径",
        "c": "需要计算的对比集合的文件路径"
      })
      .alias("b", "base")
      .alias('c', 'compare')
      .help("h")
      .alias("h", "help")
      .option("m", {
        alias: "mode",
        default: false,
        describe: "是否开启内存比较模式,内存比较模式很快,但是不适合大文件,超过500m的文件建议不要开启这个模式",
        type: 'string'
      })
      .option("s", {
        alias: "size",
        default: oneSize,
        describe: "每次加载多少条内容到内存中,默认是12000000",
        type: 'string'
      })
      .argv;

    fileArr[0] = args.b;
    fileArr[1] = args.c;

    // Any truthy value other than the literal string 'false' selects the
    // in-memory mode.
    const useMemoryMode = args.m !== 'false' && Boolean(args.m);

    if (args.s) {
      const requestedSize = Number(args.s);
      // A NaN or non-positive chunk size would silently disable chunking
      // (NaN) or trigger a full compare-file scan per line (<= 0).
      if (!Number.isFinite(requestedSize) || requestedSize <= 0) {
        console.error('size 必须是大于0的数字');
        process.exitCode = 1;
        return;
      }
      oneSize = requestedSize;
    }

    // The slower, chunked strategy is the default.
    if (useMemoryMode) {
      intersectionSmall(fileArr);
    } else {
      intersectionBig(fileArr);
    }
  })
  .help("h")
  .alias("h", "help")
  .argv;