@crawlee/core
The scalable web crawling and scraping library for JavaScript/Node.js. It enables the development of data extraction and web automation jobs with headless Chrome and Puppeteer, among other tools.
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.serializeArray = serializeArray;
exports.deserializeArray = deserializeArray;
exports.createDeserialize = createDeserialize;
const tslib_1 = require("tslib");
const node_stream_1 = require("node:stream");
const node_util_1 = tslib_1.__importDefault(require("node:util"));
const node_zlib_1 = tslib_1.__importDefault(require("node:zlib"));
const ow_1 = tslib_1.__importDefault(require("ow"));
const StreamArray_1 = tslib_1.__importDefault(require("stream-json/streamers/StreamArray"));
const pipeline = node_util_1.default.promisify(node_stream_1.pipeline);
/**
 * Transforms an array of items to JSON in a streaming
 * fashion to save memory. It operates in batches to speed
 * up the process.
 * @internal
 */
class ArrayToJson extends node_stream_1.Readable {
    constructor(data, options = {}) {
        super({
            ...options,
            autoDestroy: true,
            emitClose: true,
        });
        // Class fields emitted by the TypeScript compiler as defineProperty calls.
        Object.defineProperty(this, "data", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: data
        });
        Object.defineProperty(this, "offset", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: 0
        });
        Object.defineProperty(this, "batchSize", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: void 0
        });
        const { batchSize = 10000 } = options;
        this.batchSize = batchSize;
        this.data = data;
        this.push('[');
    }
    _read() {
        try {
            const items = this.data.slice(this.offset, this.offset + this.batchSize);
            if (items.length) {
                const json = JSON.stringify(items);
                // Strip brackets to flatten the batch.
                const itemString = json.substring(1, json.length - 1);
                if (this.offset > 0)
                    this.push(',', 'utf8');
                this.push(itemString, 'utf8');
                this.offset += this.batchSize;
            }
            else {
                // All batches emitted: close the JSON array and end the stream.
                this.push(']');
                this.push(null);
            }
        }
        catch (err) {
            this.emit('error', err);
        }
    }
}
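/*
 * Illustrative sketch (not part of the module): streaming an array through
 * ArrayToJson produces the same text as JSON.stringify, batch by batch. The
 * `data` and `sink` names are hypothetical and used only for this example,
 * which would run inside an async function.
 *
 *   const data = Array.from({ length: 5 }, (_, i) => ({ id: i }));
 *   let out = '';
 *   const sink = new node_stream_1.Writable({
 *       decodeStrings: false,
 *       write(chunk, _enc, callback) { out += chunk; callback(); },
 *   });
 *   await pipeline(new ArrayToJson(data, { batchSize: 2 }), sink);
 *   // out === JSON.stringify(data)
 */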
/**
 * Takes an array of values, which can be anything from entries in a Dataset
 * to Requests in a RequestList, and Gzip-compresses them to a Buffer in a
 * memory-efficient way (streaming one by one). Ideally, the largest chunk of
 * memory consumed will be the final compressed Buffer. This could be further
 * improved by outputting a Stream, if and when apify-client supports streams.
 * @internal
 */
async function serializeArray(data) {
    (0, ow_1.default)(data, ow_1.default.array);
    const { chunks, collector } = createChunkCollector();
    await pipeline(new ArrayToJson(data), node_zlib_1.default.createGzip(), collector);
    return Buffer.concat(chunks);
}
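/*
 * Illustrative sketch (not part of the module): serializeArray gzips an array
 * into a single Buffer. The `records` name is hypothetical.
 *
 *   const records = [{ url: 'https://example.com' }, { url: 'https://example.org' }];
 *   const compressed = await serializeArray(records);
 *   // `compressed` holds the gzipped JSON array; for large inputs it is much
 *   // smaller than the uncompressed JSON string.
 */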
/**
 * Decompresses a Buffer previously created with serializeArray (technically,
 * any JSON that is an Array) and collects it into an Array of values
 * in a memory-efficient way (streaming the array items one by one instead
 * of creating a fully decompressed buffer -> full JSON -> full Array all
 * in memory at once). Could be further optimized to ingest a Stream if and
 * when apify-client supports streams.
 * @internal
 */
async function deserializeArray(compressedData) {
    (0, ow_1.default)(compressedData, ow_1.default.uint8Array);
    const { chunks, collector } = createChunkCollector({ fromValuesStream: true });
    await pipeline(
        node_stream_1.Readable.from([compressedData]),
        node_zlib_1.default.createGunzip(),
        StreamArray_1.default.withParser(),
        collector,
    );
    return chunks;
}
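/*
 * Illustrative sketch (not part of the module): deserializeArray reverses
 * serializeArray. Continuing the hypothetical `records`/`compressed` example
 * above:
 *
 *   const restored = await deserializeArray(compressed);
 *   // `restored` is a plain array that deep-equals `records`.
 */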
/**
 * Creates a stream that decompresses a Buffer previously created with
 * serializeArray (technically, any JSON that is an Array) and emits its
 * items one by one in a memory-efficient way, instead of creating a fully
 * decompressed buffer -> full JSON -> full Array all in memory at once.
 * Could be further optimized to ingest a Stream if and when apify-client
 * supports streams.
 * @internal
 */
function createDeserialize(compressedData) {
    (0, ow_1.default)(compressedData, ow_1.default.uint8Array);
    const streamArray = StreamArray_1.default.withParser();
    const destination = pluckValue(streamArray);
    (0, node_stream_1.pipeline)(
        node_stream_1.Readable.from([compressedData]),
        node_zlib_1.default.createGunzip(),
        destination,
        // @ts-expect-error Something's wrong here, the types are wrong but tests fail if we correct the code to make them right
        (err) => destination.emit(err),
    );
    return destination;
}
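/*
 * Illustrative sketch (not part of the module): unlike deserializeArray,
 * createDeserialize returns an object-mode stream, so items can be consumed
 * one at a time without holding the whole array in memory. Reuses the
 * hypothetical `compressed` Buffer from the examples above.
 *
 *   const itemStream = createDeserialize(compressed);
 *   for await (const item of itemStream) {
 *       // handle a single deserialized item here
 *   }
 */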
function createChunkCollector(options = {}) {
    const { fromValuesStream = false } = options;
    const chunks = [];
    const collector = new node_stream_1.Writable({
        decodeStrings: false,
        objectMode: fromValuesStream,
        write(chunk, _nil, callback) {
            chunks.push(fromValuesStream ? chunk.value : chunk);
            callback();
        },
        writev(chunkObjects, callback) {
            const buffers = chunkObjects.map(({ chunk }) => {
                return fromValuesStream ? chunk.value : chunk;
            });
            // Append the batched payloads to the shared chunks array.
            chunks.push(...buffers);
            callback();
        },
    });
    return { collector, chunks };
}
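/*
 * Note (not part of the module): with default options the collector above
 * gathers raw gzip Buffers, which serializeArray joins with Buffer.concat;
 * with `fromValuesStream: true` it runs in object mode and keeps only the
 * `.value` of each `{ key, value }` pair emitted by stream-json's StreamArray,
 * which is what deserializeArray returns.
 */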
function pluckValue(streamArray) {
    const realPush = streamArray.push.bind(streamArray);
    // StreamArray emits `{ key, value }` pairs; forward only the `value` downstream.
    streamArray.push = (obj) => realPush(obj && obj.value);
    return streamArray;
}
//# sourceMappingURL=serialization.js.map