UNPKG

js-stream-dataset-json

Version:
912 lines 37.1 kB
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const fs_1 = __importDefault(require("fs"));
const promises_1 = __importDefault(require("fs/promises"));
const readline_1 = __importDefault(require("readline"));
const zlib_1 = __importDefault(require("zlib"));
const JSONStream_1 = __importDefault(require("JSONStream"));
/**
 * Streaming reader/writer for Dataset-JSON files.
 *
 * Three on-disk layouts are handled:
 *  - JSON: a single JSON object whose `rows` attribute holds the records;
 *  - NDJSON: the first line is the metadata object, each following line is one row;
 *  - compressed (`.dsjc`): gzip-compressed NDJSON.
 */
class DatasetJson {
    /**
     * Create a reader/writer bound to a single file.
     * @constructor
     * @param filePath - Path to the file.
     * @param options - Configuration options
     * @param options.encoding - File encoding. Default is 'utf8'.
     * @param options.isNdJson - Force NDJSON format. If not specified, detected from file extension (.ndjson).
     * @param options.isCompressed - Force compressed format. If not specified, detected from file extension (.dsjc).
     * @param options.checkExists - Throw error if file does not exist. Default is false.
     * @throws Error if the encoding is not supported, or if checkExists is true and the file is missing.
     */
    constructor(filePath, options) {
        // Required attributes
        this.requiredAttributes = [
            'datasetJSONCreationDateTime',
            'datasetJSONVersion',
            'records',
            'name',
            'label',
            'columns',
        ];
        // Write queue management
        this.writeQueueDrain = Promise.resolve();
        /**
         * Auxiliary function to verify if required elements are parsed;
         * @param item - Map of attribute name to a "was parsed" flag.
         * @return True if all required attributes are present, otherwise false.
         */
        this.checkAttributesParsed = (item) => {
            return this.requiredAttributes.every((key) => item[key] === true);
        };
        this.filePath = filePath;
        this.currentPosition = 0;
        const { encoding = 'utf8', checkExists = false } = options || {};
        this.encoding = encoding;
        this.isFirstWrite = true;
        // If option isNdJson is not specified, try to detect it from the file extension;
        if (options?.isNdJson === undefined) {
            this.isNdJson = this.filePath.toLowerCase().endsWith('.ndjson');
        }
        else {
            this.isNdJson = options.isNdJson;
        }
        // If option isCompressed is not specified, try to detect it from the file extension;
        if (options?.isCompressed === undefined) {
            this.isCompressed = this.filePath.toLowerCase().endsWith('.dsjc');
        }
        else {
            this.isCompressed = options.isCompressed;
        }
        // A compressed file always uses the NDJSON layout
        if (this.isCompressed) {
            this.isNdJson = true;
        }
        this.allRowsRead = false;
        this.metadataLoaded = false;
        this.metadata = {
            datasetJSONCreationDateTime: '',
            datasetJSONVersion: '',
            records: -1,
            name: '',
            label: '',
            columns: [],
        };
        // Get all possible encoding values from BufferEncoding type
        const validEncodings = [
            'ascii',
            'utf8',
            'utf16le',
            'ucs2',
            'base64',
            'latin1',
        ];
        // Check encoding
        if (!validEncodings.includes(this.encoding)) {
            throw new Error(`Unsupported encoding ${this.encoding}`);
        }
        // Check if file exists;
        if (!fs_1.default.existsSync(this.filePath)) {
            if (checkExists === true) {
                throw new Error(`Could not read file ${this.filePath}`);
            }
            else {
                this.stats = null;
                this.stream = null;
            }
        }
        else {
            this.stats = fs_1.default.statSync(this.filePath);
            this.stream = null;
        }
    }
    /**
     * Check if the file was modified since the stats were captured.
     * @return True if file has changed, otherwise false.
     */
    async fileChanged() {
        const stats = await promises_1.default.stat(this.filePath);
        if (this.stats !== null && stats.mtimeMs !== this.stats.mtimeMs) {
            return true;
        }
        return false;
    }
    /**
     * Get Dataset-JSON metadata
     * @return An object with file metadata.
     */
    async getMetadata() {
        // If the file did not change, use the metadata obtained earlier;
        if (!(await this.fileChanged()) && this.metadataLoaded === true) {
            return this.metadata;
        }
        if (this.isNdJson) {
            return this.getNdjsonMetadata();
        }
        return this.getJsonMetadata();
    }
    /**
     * Get Dataset-JSON metadata when the file is in JSON format.
     * @return An object with file metadata. The promise rejects when required
     * attributes are missing or when the file cannot be read or parsed.
     */
    async getJsonMetadata() {
        return new Promise((resolve, reject) => {
            this.metadataLoaded = false;
            // Metadata for ItemGroup
            const metadata = {
                datasetJSONCreationDateTime: '',
                datasetJSONVersion: '',
                records: -1,
                name: '',
                label: '',
                columns: [],
                studyOID: '',
                metaDataVersionOID: '',
            };
            const parsedMetadata = {
                datasetJSONCreationDateTime: false,
                datasetJSONVersion: false,
                dbLastModifiedDateTime: false,
                fileOID: false,
                originator: false,
                sourceSystem: false,
                itemGroupOID: false,
                columns: false,
                records: false,
                name: false,
                label: false,
                studyOID: false,
                metaDataVersionOID: false,
                metaDataRef: false,
            };
            // Restart stream
            if (this.currentPosition !== 0 || this.stream?.destroyed || this.stream === null) {
                if (this.stream !== null && !this.stream?.destroyed) {
                    this.stream?.destroy();
                }
                this.stream = fs_1.default.createReadStream(this.filePath, {
                    encoding: this.encoding,
                });
            }
            if (this.stream === null) {
                reject(new Error('Could not create read stream for file ' + this.filePath));
                return;
            }
            // Copy every known metadata attribute found in a header/footer object
            const collectAttributes = (data) => {
                Object.keys(data).forEach((key) => {
                    if (Object.prototype.hasOwnProperty.call(parsedMetadata, key)) {
                        // eslint-disable-next-line @typescript-eslint/no-explicit-any
                        metadata[key] = data[key];
                        parsedMetadata[key] = true;
                    }
                });
            };
            // Publish the metadata once all required attributes are present
            const finishIfComplete = () => {
                if (this.checkAttributesParsed(parsedMetadata)) {
                    this.metadataLoaded = true;
                    this.metadata = metadata;
                    resolve(metadata);
                    this.stream?.destroy();
                }
            };
            // Without error listeners a read or parse failure would either crash
            // the process or leave the promise pending forever
            this.stream.on('error', reject);
            this.stream
                .pipe(JSONStream_1.default.parse('rows..*', (data, nodePath) => {
                return { path: nodePath, value: data };
            }))
                .on('error', reject)
                .on('end', () => {
                // Check if all required attributes are parsed after the file is fully loaded;
                if (!this.checkAttributesParsed(parsedMetadata)) {
                    const notParsed = Object.keys(parsedMetadata).filter((key) => !parsedMetadata[key] && this.requiredAttributes.includes(key));
                    reject(new Error('Could not find required metadata elements ' + notParsed.join(', ')));
                    // Do not fall through: metadata must not be flagged as loaded
                    return;
                }
                this.metadataLoaded = true;
                this.metadata = metadata;
                resolve(metadata);
            })
                .on('header', (data) => {
                // In correctly formed Dataset-JSON, all metadata attributes are present before rows
                collectAttributes(data);
                finishIfComplete();
            })
                .on('footer', (data) => {
                // If not all required metadata attributes were found before rows, check if they are present after
                collectAttributes(data);
                finishIfComplete();
            });
        });
    }
    /**
     * Get Dataset-JSON metadata when the file is in NDJSON format.
     * @return An object with file metadata. The promise rejects when the first
     * line is missing, is not valid JSON, or lacks required attributes.
     */
    async getNdjsonMetadata() {
        return new Promise((resolve, reject) => {
            this.metadataLoaded = false;
            // All metadata is stored in the first line of the file
            const metadata = {
                datasetJSONCreationDateTime: '',
                datasetJSONVersion: '',
                records: -1,
                name: '',
                label: '',
                columns: [],
                studyOID: '',
                metaDataVersionOID: '',
            };
            const parsedMetadata = {
                datasetJSONCreationDateTime: false,
                datasetJSONVersion: false,
                dbLastModifiedDateTime: false,
                fileOID: false,
                originator: false,
                sourceSystem: false,
                itemGroupOID: false,
                columns: false,
                records: false,
                name: false,
                label: false,
                studyOID: false,
                metaDataVersionOID: false,
                metaDataRef: false,
            };
            // Restart stream
            if (this.stream === null || this.currentPosition !== 0 || this.stream?.destroyed) {
                if (this.stream !== null && !this.stream?.destroyed) {
                    this.stream?.destroy();
                }
                if (this.isCompressed) {
                    const rawStream = fs_1.default.createReadStream(this.filePath);
                    const gunzip = zlib_1.default.createGunzip();
                    this.stream = rawStream.pipe(gunzip);
                }
                else {
                    this.stream = fs_1.default.createReadStream(this.filePath, {
                        encoding: this.encoding,
                    });
                }
            }
            if (this.stream === null) {
                reject(new Error('Could not create read stream for file ' + this.filePath));
                return;
            }
            this.stream.on('error', reject);
            this.rlStream = readline_1.default.createInterface({
                input: this.stream,
                crlfDelay: Infinity,
            });
            // Only the first line carries metadata; readline may still emit
            // buffered lines after close(), so later lines are ignored
            let firstLineHandled = false;
            const stopReading = () => {
                if (this.rlStream !== undefined) {
                    this.rlStream.close();
                }
                this.stream?.destroy();
            };
            this.rlStream.on('line', (line) => {
                if (firstLineHandled) {
                    return;
                }
                firstLineHandled = true;
                let data;
                try {
                    data = JSON.parse(line);
                }
                catch (err) {
                    // A throw inside an event handler would be uncaught
                    reject(new Error('Could not parse metadata in file ' + this.filePath + ': ' + err.message));
                    stopReading();
                    return;
                }
                // Fill metadata with parsed attributes
                Object.keys(data).forEach((key) => {
                    if (Object.prototype.hasOwnProperty.call(parsedMetadata, key)) {
                        // eslint-disable-next-line @typescript-eslint/no-explicit-any
                        metadata[key] = data[key];
                        parsedMetadata[key] = true;
                    }
                });
                // Check if all required elements were parsed
                if (this.checkAttributesParsed(parsedMetadata)) {
                    this.metadataLoaded = true;
                    this.metadata = metadata;
                    resolve(metadata);
                }
                else {
                    const notParsed = Object.keys(parsedMetadata).filter((key) => !parsedMetadata[key] && this.requiredAttributes.includes(key));
                    reject(new Error('Could not find required metadata elements: ' + notParsed.join(', ')));
                }
                stopReading();
            });
            this.rlStream.on('close', () => {
                // An empty file produces no 'line' event; settle instead of hanging
                if (!firstLineHandled) {
                    reject(new Error('Could not find metadata line in file ' + this.filePath));
                }
            });
        });
    }
    /**
     * Read observations.
     * @param props.start - The first row number to read (0-based). Default is 0.
     * @param props.length - The number of records to read. If omitted, reads to the end of the file.
     * @param props.type - The type of the returned object ('array' or 'object'). Default is 'array'.
     * @param props.filterColumns - The list of columns to return. If empty, all columns are returned.
     * @param props.filter - A filter class object used to filter data records when reading the dataset.
     * @return An array of observations.
     */
    async getData(props) {
        // Check if metadata is loaded
        if (this.metadataLoaded === false) {
            await this.getMetadata();
        }
        let { filterColumns = [] } = props;
        // Convert filterColumns to lowercase for case-insensitive comparison
        filterColumns = filterColumns.map((item) => item.toLowerCase());
        // Check if metadata is loaded
        if (this.metadata.columns.length === 0 || this.metadata.records === -1) {
            return Promise.reject(new Error('Metadata is not loaded or there are no columns'));
        }
        const { start = 0, length } = props;
        // Check if start and length are valid
        if ((typeof length === 'number' && length <= 0) || start < 0 || start > this.metadata.records) {
            return Promise.reject(new Error('Invalid start/length parameter values'));
        }
        // Reset all rows read flag
        this.allRowsRead = false;
        if (this.isNdJson) {
            return this.getNdjsonData({ ...props, filterColumns });
        }
        return this.getJsonData({ ...props, filterColumns });
    }
    /**
     * Build a function converting a raw row (array of values) into the requested shape.
     * Column selection is resolved once here, not once per row.
     * @param type - 'array' or 'object'.
     * @param filterColumns - Lower-cased column names to keep. If empty, all columns are kept.
     * @return A mapping function, or null when the type is not supported.
     */
    createRowMapper(type, filterColumns) {
        const columns = this.metadata.columns;
        if (type === 'array') {
            if (filterColumns.length === 0) {
                return (row) => row;
            }
            const keptIndices = new Set(filterColumns.map((column) => columns.findIndex((item) => item.name.toLowerCase() === column.toLowerCase())));
            return (row) => row.filter((_, index) => keptIndices.has(index));
        }
        if (type === 'object') {
            const selected = columns
                .map((item, index) => ({ name: item.name, index }))
                .filter(({ name }) => filterColumns.length === 0 || filterColumns.includes(name.toLowerCase()));
            return (row) => {
                const obj = {};
                for (const { name, index } of selected) {
                    obj[name] = row[index];
                }
                return obj;
            };
        }
        return null;
    }
    /**
     * Read observations from a file in JSON format.
     * Continues the existing paused stream when the requested start is at or
     * after the current position; otherwise the file is re-read from the top.
     * @param props - Same as for getData, with filterColumns already lower-cased.
     * @return An array of observations.
     */
    async getJsonData(props) {
        // Default type to array;
        const { start = 0, length, type = 'array', filter } = props;
        const mapRow = this.createRowMapper(type, props.filterColumns);
        return new Promise((resolve, reject) => {
            // If possible, continue reading existing stream, otherwise recreate it.
            let currentPosition = this.currentPosition;
            if (this.stream === null || this.stream.destroyed || currentPosition > start) {
                if (this.stream !== null && !this.stream.destroyed) {
                    this.stream.destroy();
                }
                this.stream = fs_1.default.createReadStream(this.filePath, {
                    encoding: this.encoding,
                });
                currentPosition = 0;
                this.parser = JSONStream_1.default.parse(['rows', true], (data, nodePath) => {
                    return { path: nodePath, value: data };
                });
                this.stream.pipe(this.parser);
            }
            if (this.parser === undefined) {
                reject(new Error('Could not create JSON parser'));
                return;
            }
            const parser = this.parser;
            const stream = this.stream;
            const currentData = [];
            let filteredRecords = 0;
            const isFiltered = filter !== undefined;
            // Error listeners live only for the duration of this request so that
            // they do not pile up when the same stream serves several requests
            const onError = (err) => {
                detachErrorListeners();
                stream.destroy();
                reject(err);
            };
            const detachErrorListeners = () => {
                parser.removeListener('error', onError);
                stream.removeListener('error', onError);
            };
            parser.on('error', onError);
            stream.on('error', onError);
            parser
                .on('end', () => {
                detachErrorListeners();
                this.currentPosition = currentPosition;
                this.allRowsRead = true;
                resolve(currentData);
            })
                .on('data', (data) => {
                currentPosition += 1;
                // Rows before start are always skipped, also when length is not specified
                const inWindow = currentPosition > start &&
                    (length === undefined ||
                        (isFiltered ? filteredRecords < length : currentPosition <= start + length));
                if (inWindow && mapRow !== null && (!isFiltered || filter.filterRow(data.value))) {
                    if (isFiltered) {
                        filteredRecords += 1;
                    }
                    currentData.push(mapRow(data.value));
                }
                if (length !== undefined &&
                    (isFiltered ? filteredRecords === length : currentPosition === start + length)) {
                    // Pause the stream and remove current event listeners
                    parser.pause();
                    parser.removeAllListeners('end');
                    parser.removeAllListeners('data');
                    detachErrorListeners();
                    this.currentPosition = currentPosition;
                    resolve(currentData);
                }
            });
            // Resume the stream if it was paused
            if (parser.paused) {
                parser.resume();
            }
        });
    }
    /**
     * Read observations from a file in NDJSON (optionally gzip-compressed) format.
     * @param props - Same as for getData, with filterColumns already lower-cased.
     * @return An array of observations.
     */
    async getNdjsonData(props) {
        return new Promise((resolve, reject) => {
            // Default type to array;
            const { start = 0, length, type = 'array', filter } = props;
            const mapRow = this.createRowMapper(type, props.filterColumns);
            // If possible, continue reading existing stream, otherwise recreate it.
            let currentPosition = this.currentPosition;
            if (this.stream === null || this.stream.destroyed || currentPosition > start) {
                if (this.stream !== null && !this.stream.destroyed) {
                    this.stream.destroy();
                }
                if (this.isCompressed) {
                    const rawStream = fs_1.default.createReadStream(this.filePath);
                    const gunzip = zlib_1.default.createGunzip();
                    this.stream = rawStream.pipe(gunzip);
                }
                else {
                    this.stream = fs_1.default.createReadStream(this.filePath, {
                        encoding: this.encoding,
                    });
                }
                currentPosition = 0;
                this.rlStream = readline_1.default.createInterface({
                    input: this.stream,
                    crlfDelay: Infinity,
                });
            }
            if (this.rlStream === undefined) {
                reject(new Error('Could not create readline stream'));
                return;
            }
            const currentData = [];
            let isFirstLine = true;
            let filteredRecords = 0;
            // Set when reading is stopped on purpose (requested length reached or a
            // parse error), as opposed to the input reaching its end
            let stoppedEarly = false;
            const isFiltered = filter !== undefined;
            const stopReading = () => {
                if (this.rlStream !== undefined) {
                    this.rlStream.close();
                }
                this.stream?.destroy();
            };
            this.stream.once('error', (err) => {
                reject(err);
            });
            this.rlStream
                .on('line', (line) => {
                // readline can still emit buffered lines after close(); ignore them
                if (stoppedEarly) {
                    return;
                }
                if (currentPosition === 0 && isFirstLine) {
                    // First line contains metadata, so skip it when reading the data
                    isFirstLine = false;
                    return;
                }
                currentPosition += 1;
                // Rows before start are always skipped, also when length is not specified
                const inWindow = currentPosition > start &&
                    (length === undefined ||
                        (isFiltered ? filteredRecords < length : currentPosition <= start + length));
                if (inWindow && line.length > 0) {
                    let data;
                    try {
                        data = JSON.parse(line);
                    }
                    catch (err) {
                        // Reject before closing: the 'close' handler resolves the promise
                        stoppedEarly = true;
                        reject(new Error('Could not parse row ' + currentPosition + ': ' + err.message));
                        stopReading();
                        return;
                    }
                    if (mapRow !== null && (!isFiltered || filter.filterRow(data))) {
                        if (isFiltered) {
                            filteredRecords += 1;
                        }
                        currentData.push(mapRow(data));
                    }
                }
                if (length !== undefined &&
                    (isFiltered ? filteredRecords === length : currentPosition === start + length)) {
                    // When pausing readline, it does not stop immediately and can emit extra lines,
                    // so pausing approach is not yet implemented
                    stoppedEarly = true;
                    this.currentPosition = currentPosition;
                    stopReading();
                    resolve(currentData);
                }
            })
                .on('error', (err) => {
                reject(err);
            })
                .on('close', () => {
                this.currentPosition = currentPosition;
                // All rows are read when the input ended by itself or when the
                // last record was reached; stopping one row short does not count
                if (!stoppedEarly || currentPosition >= this.metadata.records) {
                    this.allRowsRead = true;
                }
                resolve(currentData);
            });
        });
    }
    /**
     * Read observations as an iterable.
     * @param props.start - The first row number to read. Default is 0.
     * @param props.bufferLength - The number of records to read in a chunk. Default is 1000.
     * @param props.type - The type of the returned object.
     * @param props.filterColumns - The list of columns to return. If empty, all columns are returned.
     * @return An async iterable yielding one observation at a time.
     */
    async *readRecords(props) {
        // Check if metadata is loaded
        if (this.metadataLoaded === false) {
            await this.getMetadata();
        }
        const { start = 0, bufferLength = 1000, type, filterColumns, } = props || {};
        let currentPosition = start;
        while (true) {
            const data = await this.getData({
                start: currentPosition,
                length: bufferLength,
                type,
                filterColumns,
            });
            yield* data;
            // Stop when the file is exhausted or when no progress was made
            if (this.allRowsRead === true || data.length === 0 || this.currentPosition <= currentPosition) {
                break;
            }
            currentPosition = this.currentPosition;
        }
    }
    /**
     * Get unique values observations.
     * @param props.columns - The list of variables for which to obtain the unique observations.
     * @param props.limit - The maximum number of values to store. 0 - no limit.
     * @param props.bufferLength - The number of records to read in a chunk.
     * @param props.sort - Controls whether to sort the unique values.
     * @param props.addCount - Controls whether to count the occurrences of each value.
     * @return An object keyed by column name, each entry holding `values` and `counts`.
     */
    async getUniqueValues(props) {
        const { limit = 0, bufferLength = 1000, sort = true, addCount = false } = props;
        let { columns } = props;
        const result = {};
        // Check if metadata is loaded
        if (this.metadataLoaded === false) {
            await this.getMetadata();
        }
        const notFoundColumns = [];
        // Use the case of the columns as specified in the metadata
        columns = columns.map((item) => {
            const column = this.metadata.columns.find((column) => column.name.toLowerCase() === item.toLowerCase());
            if (column === undefined) {
                notFoundColumns.push(item);
                return '';
            }
            return column.name;
        });
        if (notFoundColumns.length > 0) {
            return Promise.reject(new Error(`Columns ${notFoundColumns.join(', ')} not found`));
        }
        // Store number of unique values found
        const uniqueCount = {};
        // Per-column set of values already seen, for constant-time membership checks
        const seenValues = {};
        columns.forEach((column) => {
            uniqueCount[column] = 0;
            seenValues[column] = new Set();
        });
        for await (const row of this.readRecords({
            bufferLength,
            type: 'object',
            filterColumns: columns,
        })) {
            columns.forEach((column) => {
                if (result[column] === undefined) {
                    result[column] = { values: [], counts: {} };
                }
                if (limit === 0 || uniqueCount[column] < limit) {
                    const value = row[column];
                    if (!seenValues[column].has(value)) {
                        seenValues[column].add(value);
                        result[column].values.push(value);
                        uniqueCount[column] += 1;
                    }
                    if (addCount) {
                        const valueId = value === null ? 'null' : String(value);
                        const counts = result[column].counts;
                        counts[valueId] = counts[valueId] > 0 ? counts[valueId] + 1 : 1;
                    }
                }
            });
            // Check if all unique values are found
            const isFinished = limit !== 0 && Object.keys(uniqueCount).every((key) => uniqueCount[key] >= limit);
            if (isFinished) {
                break;
            }
        }
        // Sort result
        if (sort) {
            Object.keys(result).forEach((key) => {
                // NOTE: the default sort compares values as strings, so numeric
                // values come out in lexicographic order
                result[key].values.sort();
                // Counts cannot be properly sorted as it is an object, so it has to be sorted once transformed to array
            });
        }
        return result;
    }
    /**
     * Helper method to safely write data to stream with backpressure handling.
     * Writes are serialized through a promise queue so that their order is preserved.
     * @param data - String data to write
     * @throws Error if there is no active write stream.
     */
    async writeWithBackpressure(data) {
        if (!this.writeStream) {
            // Without a stream the 'drain' event never comes and the write would hang
            throw new Error('No active write stream. Call create first.');
        }
        // Create new Promise for this write operation
        const writeOperation = this.writeQueueDrain.then(() => {
            return new Promise((resolve, reject) => {
                const stream = this.writeStream;
                if (!stream) {
                    reject(new Error('Write stream was closed before the data could be written'));
                    return;
                }
                if (!stream.write(data)) {
                    stream.once('drain', () => resolve());
                }
                else {
                    resolve();
                }
            });
        });
        // Update queue with current operation; a failed write must not block later writes
        this.writeQueueDrain = writeOperation.catch(() => undefined);
        // Wait for this write to complete
        await writeOperation;
    }
    /**
     * Write data to the file
     * @param props.metadata - Dataset metadata (required for the create action)
     * @param props.data - Data to write
     * @param props.action - Write action: create, write, or finalize
     * @param props.options - Write options (prettify, highWaterMark, indentSize, compressionLevel)
     * @throws Error if metadata is missing on create, or if there is no active write stream.
     */
    async write(props) {
        const { metadata, data, action, options = {} } = props;
        const { highWaterMark = 16384, // 16KB default
        indentSize = 2, compressionLevel = 9, } = options;
        // Check if the file already exists;
        if (action === 'create') {
            if (fs_1.default.existsSync(this.filePath)) {
                // Remove the file
                fs_1.default.unlinkSync(this.filePath);
                // Reset read stream
                if (this.stream && !this.stream.destroyed) {
                    this.stream.destroy();
                    this.stream = null;
                    this.stats = null;
                }
            }
        }
        let { prettify = false } = options;
        // In case of compressed file, prettify must be false
        if (this.isCompressed && prettify) {
            prettify = false;
        }
        if (action === 'create') {
            if (!metadata) {
                throw new Error('Metadata is required for create action');
            }
            this.writeMode = this.isNdJson ? 'ndjson' : 'json';
            this.isFirstWrite = true;
            if (this.isCompressed) {
                // Create gzip stream
                const outputStream = fs_1.default.createWriteStream(this.filePath, {
                    encoding: this.encoding,
                    highWaterMark,
                });
                this.outputStream = outputStream;
                const gzip = zlib_1.default.createGzip({ level: compressionLevel });
                gzip.pipe(outputStream);
                this.writeStream = gzip;
            }
            else {
                this.writeStream = fs_1.default.createWriteStream(this.filePath, {
                    encoding: this.encoding,
                    highWaterMark,
                });
            }
            if (this.writeMode === 'json') {
                let initialStr = prettify
                    ? JSON.stringify(metadata, null, indentSize)
                    : JSON.stringify(metadata);
                // Remove closing brace so that the rows array can be appended
                initialStr = initialStr.slice(0, -1);
                // In case of prettify, remove last new line
                if (prettify && initialStr.endsWith('\n')) {
                    initialStr = initialStr.slice(0, -1);
                }
                // Add rows array opening
                initialStr =
                    initialStr +
                        (prettify ? ',\n' + ' '.repeat(indentSize) + '"rows": [' : ',"rows":[');
                await this.writeWithBackpressure(initialStr);
            }
            else {
                await this.writeWithBackpressure(JSON.stringify(metadata) + '\n');
            }
            if (data) {
                await this.write({ data, action: 'write', options });
            }
        }
        else if (action === 'write') {
            if (!this.writeStream) {
                throw new Error('No active write stream. Call create first.');
            }
            if (!data || !data.length) {
                return;
            }
            let rowBuffer = '';
            if (this.writeMode === 'json') {
                for (let i = 0; i < data.length; i++) {
                    // Every row except the very first one is preceded by a comma
                    const prefix = this.isFirstWrite && i === 0 ? '' : ',';
                    const rowStr = prettify
                        ? prefix + '\n' + ' '.repeat(indentSize * 2) + JSON.stringify(data[i])
                        : prefix + JSON.stringify(data[i]);
                    rowBuffer += rowStr;
                }
            }
            else {
                for (const row of data) {
                    rowBuffer += JSON.stringify(row) + '\n';
                }
            }
            await this.writeWithBackpressure(rowBuffer);
            this.isFirstWrite = false;
        }
        else if (action === 'finalize') {
            if (!this.writeStream) {
                throw new Error('No active write stream. Call create first.');
            }
            if (data) {
                await this.write({ data, action: 'write', options });
            }
            if (this.writeMode === 'json') {
                await this.writeWithBackpressure(prettify ? '\n' + ' '.repeat(indentSize) + ']\n}\n' : ']}\n');
            }
            // Wait for all writes to complete and close stream
            await new Promise((resolve, reject) => {
                this.writeStream?.on('error', reject);
                if (this.isCompressed) {
                    // The file is complete only when the underlying file stream
                    // finishes, not when the gzip stream does
                    this.outputStream?.on('error', reject);
                    this.outputStream?.on('finish', () => {
                        resolve();
                    });
                }
                else {
                    this.writeStream?.on('finish', () => {
                        resolve();
                    });
                }
                this.writeStream?.end(() => {
                    this.writeStream = undefined;
                    this.outputStream = undefined;
                    this.writeMode = undefined;
                    this.isFirstWrite = true;
                });
            });
        }
    }
    /**
     * Close all open streams and reset state
     */
    async close() {
        // Clean up read streams
        if (this.stream && !this.stream.destroyed) {
            this.stream.destroy();
            this.stream = null;
        }
        if (this.rlStream) {
            this.rlStream.close();
            this.rlStream = undefined;
        }
        // Clean up parser
        if (this.parser) {
            this.parser.removeAllListeners();
            this.parser = undefined;
        }
        // Clean up write streams
        if (this.writeStream) {
            await new Promise((resolve) => {
                this.writeStream?.end(() => {
                    resolve();
                });
            });
            this.writeStream = undefined;
        }
        if (this.outputStream) {
            this.outputStream = undefined;
        }
        // Reset state variables
        this.currentPosition = 0;
        this.allRowsRead = false;
        this.writeMode = undefined;
        this.isFirstWrite = true;
        // Reset write queue
        this.writeQueueDrain = Promise.resolve();
    }
    /**
     * Write data to file in one operation
     * @param props.metadata - Dataset metadata
     * @param props.data - Data to write
     * @param props.options - Write options (prettify, highWaterMark)
     */
    async writeData(props) {
        const { metadata, data, options } = props;
        // Create file and write metadata
        await this.write({
            metadata,
            action: 'create',
            options,
        });
        // Write data if provided
        if (data?.length) {
            await this.write({
                data,
                action: 'write',
                options,
            });
        }
        // Finalize the file
        await this.write({
            action: 'finalize',
            options,
        });
    }
}
exports.default = DatasetJson;
//# sourceMappingURL=datasetJson.js.map