iterparse
Version:
Delightful data parsing
198 lines (197 loc) • 8.02 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.fileGroupBy = void 0;
const tslib_1 = require("tslib");
const ix_1 = require("ix");
const tmp_1 = require("tmp");
const fs_extra_1 = require("fs-extra");
const P = tslib_1.__importStar(require("ts-prime"));
const helpers_1 = require("./helpers");
/**
 * Read-only presentation wrapper around a progress record.
 *
 * The wrapped object is read lazily: every call to `toString()` / `toJSON()`
 * reflects the counters and timestamps the record holds at that moment.
 */
class GroupingProgressDisplay {
    /**
     * @param {object} progress - Record exposing `state`, the `grouped*` /
     *   `readed*` counters and the `parsing*` / `reading*` timestamps.
     */
    constructor(progress) {
        this.progress = progress;
    }
    /**
     * Average throughput in whole bytes per second.
     *
     * The rate is computed directly in seconds; flooring a bytes-per-millisecond
     * value first would report 0 for anything slower than 1 byte/ms and would
     * round every result down to a multiple of 1000.
     *
     * @param {number} bytes - Bytes processed so far.
     * @param {number} startTime - Epoch milliseconds at which the phase started.
     * @param {number} now - Current epoch milliseconds.
     * @returns {number} Floored bytes per second; 0 when no time has elapsed
     *   or when the inputs do not produce a number.
     */
    static bytesPerSecond(bytes, startTime, now) {
        const elapsedMs = now - startTime;
        // Guards against division by zero (Infinity) and against NaN timestamps.
        if (!(elapsedMs > 0)) {
            return 0;
        }
        return Math.floor((bytes * 1000) / elapsedMs) || 0;
    }
    /**
     * Human-readable single-line summary for the current state.
     *
     * @returns {string} Status line for the IDLE, GROUPING or (any other) reading state.
     */
    toString() {
        const state = this.progress.state;
        if (state === 'IDLE') {
            return `Grouping idle...`;
        }
        const json = this.toJSON();
        if (state === 'GROUPING') {
            return `Grouping, Items: ${this.progress.groupedItems.toLocaleString()}, Total Groups: ${json.groupedGroups}, Grouped Size: ${helpers_1.formatBytes(json.groupedBytes)}, Memory: ${helpers_1.formatBytes(json.memory)}`;
        }
        // With nothing grouped there is nothing to read; avoid printing "NaN%".
        const percent = json.groupedItems > 0
            ? (json.readedItems / json.groupedItems) * 100
            : 0;
        return `Reading, Progress: ${percent.toFixed(2)}%, Groups: ${json.readedGroups}/${json.groupedGroups}, Memory: ${helpers_1.formatBytes(json.memory)}`;
    }
    /**
     * Snapshot of the progress record plus derived throughput figures.
     *
     * @returns {object} Plain object with the state, grouping/reading rates in
     *   bytes per second, current heap usage, timestamps and raw counters.
     */
    toJSON() {
        const now = Date.now();
        const record = this.progress;
        return {
            state: record.state,
            groupingBytesPerSecond: GroupingProgressDisplay.bytesPerSecond(record.groupedBytes, record.parsingStartTime, now),
            // The reading rate is derived from the reading counters and the reading start time.
            readingBytesPerSecond: GroupingProgressDisplay.bytesPerSecond(record.readedBytes, record.readingStartTime, now),
            memory: process.memoryUsage().heapUsed,
            groupingStartTime: record.parsingStartTime,
            groupingStopTime: record.parsingStopTime,
            readingStartTime: record.readingStartTime,
            readingStopTime: record.readingStopTime,
            groupedItems: record.groupedItems,
            groupedBytes: record.groupedBytes,
            groupedGroups: record.groupedGroups,
            readedItems: record.readedItems,
            readedBytes: record.readedBytes,
            readedGroups: record.readedGroups
        };
    }
}
/**
 * Mutable counters and timestamps describing a two-phase job
 * ('GROUPING' followed by 'READING').
 *
 * A new instance starts in the 'IDLE' state with all counters at zero and all
 * four timestamps set to the construction time.
 */
class GroupingProgress {
    constructor() {
        this.groupedBytes = 0;
        this.groupedItems = 0;
        this.groupedGroups = 0;
        this.readedBytes = 0;
        this.readedGroups = 0;
        this.readedItems = 0;
        this.state = 'IDLE';
        const createdAt = Date.now();
        this.parsingStartTime = createdAt;
        this.parsingStopTime = createdAt;
        this.readingStartTime = createdAt;
        this.readingStopTime = createdAt;
    }
    /**
     * Switches to the given phase and records its start time.
     *
     * @param {'GROUPING'|'READING'} action - Phase being started.
     * @throws {Error} If `action` is neither 'GROUPING' nor 'READING'.
     */
    start(action) {
        if (action === 'GROUPING') {
            this.state = 'GROUPING';
            this.parsingStartTime = Date.now();
            return;
        }
        if (action === 'READING') {
            this.state = 'READING';
            this.readingStartTime = Date.now();
            return;
        }
        throw new Error(`Action ${action} not allowed`);
    }
    /**
     * Records the stop time of the given phase. The `state` field is left untouched.
     *
     * @param {'GROUPING'|'READING'} action - Phase being stopped.
     * @throws {Error} If `action` is neither 'GROUPING' nor 'READING'.
     */
    stop(action) {
        if (action === 'GROUPING') {
            this.parsingStopTime = Date.now();
            return;
        }
        if (action === 'READING') {
            this.readingStopTime = Date.now();
            return;
        }
        throw new Error(`Action ${action} not allowed`);
    }
    /** Adds `chunk` to the grouped byte counter. */
    addChunk(chunk) {
        this.groupedBytes += chunk;
    }
    /** Adds `group` to the grouped group counter. */
    addGroup(group) {
        this.groupedGroups += group;
    }
    /** Adds `count` (default 1) to the grouped item counter. */
    addItem(count = 1) {
        this.groupedItems += count;
    }
    /** Adds `chunk` to the read byte counter. */
    readChunk(chunk) {
        this.readedBytes += chunk;
    }
    /** Adds `group` to the read group counter. */
    readGroup(group) {
        this.readedGroups += group;
    }
    /** Adds `count` (default 1) to the read item counter. */
    readItem(count = 1) {
        this.readedItems += count;
    }
    /**
     * Overwrites the grouped counters with absolute values.
     *
     * Only fields that are present (not `null`/`undefined`) are applied, so an
     * explicit `0` resets the corresponding counter instead of being ignored.
     *
     * @param {{currentSize?: number, items?: number}} data - New absolute values.
     */
    set(data) {
        if (data.currentSize != null) {
            this.groupedBytes = data.currentSize;
        }
        if (data.items != null) {
            this.groupedItems = data.items;
        }
    }
}
/**
 * Groups the items of `args.source` by key, using a temporary file as backing
 * storage for the serialized items.
 *
 * Phase 1 (GROUPING): every source item is JSON-serialized and appended to a
 * temporary file; only the `[offset, byteLength]` pair of each item is kept in
 * memory, bucketed by group key.
 * Phase 2 (READING): for each group, its items are read back from the file,
 * parsed, and emitted as `{ key, items }`.
 *
 * @param {object} args
 * @param {*} args.source - Items to group; iterated through tslib's
 *   `__asyncValues` (the down-levelled `for await`).
 * @param {Function} args.groupingFn - Called with each item; `toString()` of
 *   its result is used as the group key.
 * @param {Function} [args.progress] - Optional callback receiving a
 *   `GroupingProgressDisplay`; called periodically and at phase boundaries.
 * @param {number} [args.progressFrequency] - Period of the progress callback in
 *   milliseconds; falls back to 1000 when falsy.
 * @returns {*} The value of `AsyncIterable.from(...).finally(...)` from `ix`,
 *   wrapping the generator that emits `{ key, items }` objects.
 * @throws {Error} (while iterating) if an item read back from the file cannot
 *   be parsed as JSON.
 */
function fileGroupBy(args) {
    const progress = new GroupingProgress();
    let interval;
    // Invokes the optional progress callback (with `args` as receiver) on a fresh display wrapper.
    const reportProgress = () => {
        if (args.progress != null) {
            args.progress(new GroupingProgressDisplay(progress));
        }
    };
    // Down-levelled async generator: `yield tslib_1.__await(x)` stands for
    // `await x`, and `yield yield tslib_1.__await(x)` stands for `yield x`.
    function groupProcess() {
        return tslib_1.__asyncGenerator(this, arguments, function* groupProcess_1() {
            var e_1, _c;
            const tmpFile = tmp_1.tmpNameSync();
            const encoding = 'utf8';
            const fd = yield tslib_1.__await(fs_extra_1.open(tmpFile, 'a+'));
            try {
                const groupFileMap = {
                    // group key -> list of [offset, byteLength] pairs inside the temp file
                    groups: new Map(),
                    // byte offset at which the next item will be appended
                    lastPosition: 0,
                };
                interval = setInterval(() => {
                    if (progress.state === 'IDLE')
                        return;
                    reportProgress();
                }, args.progressFrequency || 1000);
                try {
                    for (var _d = tslib_1.__asyncValues(args.source), _e; _e = yield tslib_1.__await(_d.next()), !_e.done;) {
                        const value = _e.value;
                        // No group exists until the first item has been processed,
                        // so this marks the start of the grouping phase.
                        if (progress.groupedGroups === 0) {
                            progress.start('GROUPING');
                        }
                        const parsedValue = JSON.stringify(value);
                        const size = Buffer.byteLength(parsedValue, encoding);
                        const groupId = args.groupingFn(value).toString();
                        yield tslib_1.__await(fs_extra_1.appendFile(fd, parsedValue, { encoding }));
                        progress.addItem(1);
                        progress.addChunk(size);
                        const group = groupFileMap.groups.get(groupId);
                        if (group == null) {
                            progress.addGroup(1);
                        }
                        const newGroup = group || [];
                        newGroup.push([groupFileMap.lastPosition, size]);
                        groupFileMap.groups.set(groupId, newGroup);
                        groupFileMap.lastPosition = groupFileMap.lastPosition + size;
                    }
                }
                catch (e_1_1) { e_1 = { error: e_1_1 }; }
                finally {
                    try {
                        // Give the source iterator a chance to clean up when the loop exits early.
                        if (_e && !_e.done && (_c = _d.return)) yield tslib_1.__await(_c.call(_d));
                    }
                    finally { if (e_1) throw e_1.error; }
                }
                reportProgress();
                progress.start('READING');
                for (const [groupId, mapData] of groupFileMap.groups) {
                    progress.readGroup(1);
                    // All items of one group are read concurrently and materialized in memory.
                    const items = yield tslib_1.__await(Promise.all(mapData.map(async ([location, size]) => {
                        progress.readItem(1);
                        progress.readChunk(size);
                        const buffer = Buffer.alloc(size);
                        await fs_extra_1.read(fd, buffer, 0, size, location);
                        const item = P.canFail(() => {
                            return JSON.parse(buffer.toString(encoding));
                        });
                        if (P.isError(item)) {
                            throw new Error(`Critical error: something went wrong in grouping process`);
                        }
                        return item;
                    })));
                    yield yield tslib_1.__await({
                        key: groupId,
                        items
                    });
                }
                clearInterval(interval);
                reportProgress();
            }
            finally {
                // Runs on completion, on error, and when the generator is closed
                // early: stop the timer and release the temporary file.
                clearInterval(interval);
                // Cleanup is best-effort so that a failing close/unlink never
                // masks the outcome (result or error) of the grouping itself.
                yield tslib_1.__await(fs_extra_1.close(fd).catch(() => undefined));
                yield tslib_1.__await(fs_extra_1.unlink(tmpFile).catch(() => undefined));
            }
        });
    }
    return ix_1.AsyncIterable.from(groupProcess()).finally(() => {
        if (interval == null)
            return;
        clearInterval(interval);
    });
}
exports.fileGroupBy = fileGroupBy;