UNPKG

s2-tools

Version:

A collection of geospatial tools primarily designed for WGS84, Web Mercator, and S2.

132 lines 5.03 kB
import { mergeSortedChunks } from './mergeSortedChunks';
import { sortChunk } from './sortChunk';
import { availableParallelism, tmpdir } from 'os';
import { createReadStream, createWriteStream } from 'fs';
import { stat, unlink } from 'fs/promises';

/**
 * Byte stride used to turn `maxHeap` (a key count) into a byte range of a `.keys` file.
 * NOTE(review): presumably the fixed on-disk size of one key record as read by `sortChunk`;
 * confirm against that module before changing.
 */
const KEY_RECORD_BYTES = 16;

/**
 * Sorts key files using external sorting: the `.keys` file of every input is split into
 * byte-range chunks, each chunk is sorted into a temporary file, the sorted chunks are merged
 * into `output`, and finally the inputs' `.values` files are appended to `${output}.values`.
 * @param inputs - a list of input files without their extensions. e.g. './file1', './file2', './file3'
 * @param output - output path without extension (values are written to `${output}.values`)
 * @param maxHeap - max number of keys held in memory per chunk
 * @param threadCount - number of workers; workers are only used when it is not 1 and there are more than 10 chunks
 * @param tmpDir - temporary directory handed to the chunk sorter
 */
export async function externalSort(inputs, output, maxHeap = 100_000, threadCount = 1, tmpDir = tmpdir()) {
  // 1) Get the size of the input
  const sizes = await getSizes(inputs);
  // 2) Build chunk list
  const chunks = buildChunks(sizes, tmpDir, maxHeap);
  // 3) Sort chunks - using either workers or single threaded
  let sortedFiles = [];
  try {
    if (threadCount === 1 || chunks.length <= 10) {
      for (const chunk of chunks) sortedFiles.push(await sortChunk(chunk));
    } else {
      sortedFiles = await sortChunksWithWorkers(chunks, threadCount);
    }
    // 4) Merge chunks
    // NOTE(review): a merge failure is only logged and execution continues, as in the original.
    // Confirm whether callers rely on this before turning it into a rejection.
    await mergeSortedChunks(sortedFiles, output).catch((err) => console.error(err));
    await mergeValues(output, sizes);
  } catch (err) {
    // Best-effort removal of the temporary chunk files produced so far; the original error wins.
    await Promise.allSettled(sortedFiles.map((file) => unlink(file)));
    throw err;
  }
  // 5) Cleanup
  for (const file of sortedFiles) await unlink(file);
}

/**
 * Collects the key/value file sizes of every input and the running byte offset of each
 * input's values within the concatenation of all values files (in input order).
 * @param inputs - a list of input paths without extensions
 * @returns - a list of `{ name, input, keySize, valueSize, valueOffset }` records
 */
async function getSizes(inputs) {
  const sizes = [];
  let valueOffset = 0;
  for (const input of inputs) {
    // The values file is optional; the keys file is required (a missing one rejects).
    const valueSize = await fileSizeOrZero(`${input}.values`);
    const { size: keySize } = await stat(`${input}.keys`);
    sizes.push({ name: input.split('/').pop(), input, keySize, valueSize, valueOffset });
    valueOffset += valueSize;
  }
  return sizes;
}

/**
 * Size of a file in bytes, or 0 when the file does not exist.
 * A single `stat` replaces the exists-then-stat pair, so there is no window in which the
 * file can disappear between the two calls.
 * @param path - file to measure
 * @returns - the byte size, or 0 if the file is missing
 */
async function fileSizeOrZero(path) {
  try {
    const { size } = await stat(path);
    return size;
  } catch (err) {
    if (err?.code === 'ENOENT') return 0;
    throw err;
  }
}

/**
 * Splits every input's `.keys` file into consecutive byte ranges of at most
 * `maxHeap * KEY_RECORD_BYTES` bytes.
 * @param fileSizes - a list of file names and sizes
 * @param outDir - output directory to store temporary sorted files
 * @param maxHeap - max number of keys in memory
 * @returns - a list of chunks `{ name, input, outDir, start, end, valueOffset }`
 */
function buildChunks(fileSizes, outDir, maxHeap) {
  const chunkBytes = maxHeap * KEY_RECORD_BYTES;
  // A zero, negative or NaN stride would never advance `start` and loop until memory runs out.
  if (!(chunkBytes > 0)) {
    throw new RangeError(`maxHeap must be a positive number, received ${maxHeap}`);
  }
  const chunks = [];
  for (const { name, input, keySize, valueOffset } of fileSizes) {
    for (let start = 0; start < keySize; start += chunkBytes) {
      const end = Math.min(start + chunkBytes, keySize);
      chunks.push({ name, input: `${input}.keys`, outDir, start, end, valueOffset });
    }
  }
  return chunks;
}

/**
 * Sorts chunks on a pool of workers. Every worker receives one chunk at a time and is handed
 * the next pending chunk each time it reports a sorted file.
 * @param chunks - a list of chunks (not modified)
 * @param tc - user defined thread count
 * @returns - a list of sorted files, in completion order
 */
async function sortChunksWithWorkers(chunks, tc) {
  if (chunks.length === 0) return [];
  // Work on a private queue so the caller's array is left intact.
  const pending = [...chunks];
  // Never start fewer than one worker: with zero workers nothing would ever settle the promise.
  // `|| 1` also covers a NaN thread count.
  const threads = Math.max(1, Math.floor(Math.min(tc, availableParallelism(), pending.length)) || 1);
  const sortedFiles = [];
  const workers = [];
  try {
    await new Promise((resolve, reject) => {
      let running = threads;
      for (let i = 0; i < threads; i++) {
        const worker = new Worker(new URL('./worker', import.meta.url).href, { type: 'module' });
        workers.push(worker);
        // Without this handler a crashing worker would leave the promise pending forever.
        worker.onerror = (event) => {
          reject(event?.error ?? new Error(event?.message ?? 'sort worker failed'));
        };
        /** @param msg - a sorted file */
        worker.onmessage = (msg) => {
          sortedFiles.push(msg.data);
          const next = pending.shift();
          if (next !== undefined) {
            worker.postMessage(next);
            return;
          }
          worker.terminate();
          running--;
          if (running === 0) resolve();
        };
        worker.postMessage(pending.shift());
      }
    });
  } finally {
    // Stops the remaining workers when one of them failed; a repeat call after success is harmless.
    for (const worker of workers) worker.terminate();
  }
  return sortedFiles;
}

/**
 * Merge the values files since the sorted key indexes have been merged as well.
 * Inputs are appended to `${output}.values` in ascending `valueOffset` order; an input equal to
 * `output` is skipped because the stream is opened in append mode on top of its existing values.
 * Resolves once the output has been flushed.
 * NOTE(review): nothing is copied when there is a single input, even if it differs from
 * `output` — kept as in the original; confirm this is intended.
 * @param output - output path without extension
 * @param sizes - list of unique input values (not modified)
 */
async function mergeValues(output, sizes) {
  if (sizes.length <= 1) return;
  // Sort a copy: Array.prototype.sort works in place and `sizes` belongs to the caller.
  const values = [...sizes]
    .sort((a, b) => a.valueOffset - b.valueOffset)
    .filter((c) => c.input !== output && c.valueSize > 0)
    .map((c) => c.input);
  if (values.length === 0) return;
  const writeStream = createWriteStream(`${output}.values`, { flags: 'a' }); // Open output file in append mode
  // One listener for the whole merge: a write error rejects whichever step is awaiting below.
  const writeFailed = new Promise((_, reject) => writeStream.once('error', reject));
  writeFailed.catch(() => {}); // observed through Promise.race; avoids an unhandled rejection
  try {
    for (const value of values) {
      const readStream = createReadStream(`${value}.values`); // Create a read stream for each file
      const drained = new Promise((resolve, reject) => {
        readStream.once('end', resolve); // Resolve when reading ends
        readStream.once('error', reject); // Reject on error
      });
      readStream.pipe(writeStream, { end: false }); // Keep the writer open for the next file
      try {
        await Promise.race([drained, writeFailed]);
      } finally {
        readStream.destroy(); // releases the descriptor if the writer failed mid-copy
      }
    }
    const finished = new Promise((resolve) => writeStream.once('finish', resolve));
    writeStream.end(); // Close the write stream
    await Promise.race([finished, writeFailed]); // wait until everything is flushed
  } catch (err) {
    writeStream.destroy();
    throw err;
  }
}
//# sourceMappingURL=index.js.map