// js-stream-dataset-json — Stream Dataset-JSON files
"use strict";
// TypeScript-generated interop helper: wraps a CommonJS export in
// { default: ... } unless it is already an ES module (__esModule set).
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
// Node built-ins: file system access, promise-based fs, line-by-line reading,
// and gzip (de)compression for '.dsjc' files.
const fs_1 = __importDefault(require("fs"));
const promises_1 = __importDefault(require("fs/promises"));
const readline_1 = __importDefault(require("readline"));
const zlib_1 = __importDefault(require("zlib"));
// Streaming JSON parser used to read the "rows" array without loading the whole file.
const JSONStream_1 = __importDefault(require("JSONStream"));
// Main class for dataset JSON;
class DatasetJson {
/**
* Read observations.
* @constructor
* @param filePath - Path to the file.
* @param options - Configuration options
* @param options.encoding - File encoding. Default is 'utf8'.
* @param options.isNdJson - Force NDJSON format. If not specified, detected from file extension.
* @param options.isCompressed - Force NDJSON format. If not specified, detected from file extension.
* @param options.checkExists - Throw error if file does not exist. Default is false.
*/
constructor(filePath, options) {
// Required attributes
this.requiredAttributes = [
'datasetJSONCreationDateTime',
'datasetJSONVersion',
'records',
'name',
'label',
'columns',
];
// Write queue management
this.writeQueueDrain = Promise.resolve();
/**
* Auxilary function to verify if required elements are parsed;
* @return True if all required attributes are present, otherwise false.
*/
this.checkAttributesParsed = (item) => {
return this.requiredAttributes.every((key) => item[key] === true);
};
this.filePath = filePath;
this.currentPosition = 0;
const { encoding = 'utf8', checkExists = false } = options || {};
this.encoding = encoding;
this.isFirstWrite = true;
// If option isNdjson is not specified, try to detect it from the file extension;
if (options?.isNdJson === undefined) {
this.isNdJson = this.filePath.toLowerCase().endsWith('.ndjson');
}
else {
this.isNdJson = options.isNdJson;
}
// If option isCompressed is not specified, try to detect it from the file extension;
if (options?.isCompressed === undefined) {
this.isCompressed = this.filePath.toLowerCase().endsWith('.dsjc');
}
else {
this.isCompressed = options.isCompressed;
}
// In case of compressed file, change the NDJSON format is used
if (this.isCompressed) {
this.isNdJson = true;
}
this.allRowsRead = false;
this.metadataLoaded = false;
this.metadata = {
datasetJSONCreationDateTime: '',
datasetJSONVersion: '',
records: -1,
name: '',
label: '',
columns: [],
};
// Get all possible encoding values from BufferEncoding type
const validEncodings = [
'ascii',
'utf8',
'utf16le',
'ucs2',
'base64',
'latin1',
];
// Check encoding
if (!validEncodings.includes(this.encoding)) {
throw new Error(`Unsupported encoding ${this.encoding}`);
}
// Check if file exists;
if (!fs_1.default.existsSync(this.filePath)) {
if (checkExists === true) {
throw new Error(`Could not read file ${this.filePath}`);
}
else {
this.stats = null;
this.stream = null;
}
}
else {
this.stats = fs_1.default.statSync(this.filePath);
this.stream = null;
}
}
/**
* Check if the file was modified
* @return True if file has changed, otherwise false.
*/
async fileChanged() {
const stats = await promises_1.default.stat(this.filePath);
if (this.stats !== null && stats.mtimeMs !== this.stats.mtimeMs) {
return true;
}
return false;
}
/**
* Get Dataset-JSON metadata
* @return An object with file metadata.
*/
async getMetadata() {
// If the file did not change, use the metadata obtained during initialization;
if (!(await this.fileChanged()) && this.metadataLoaded === true) {
return this.metadata;
}
else {
if (this.isNdJson) {
return this.getNdjsonMetadata();
}
else {
return this.getJsonMetadata();
}
}
}
/**
* Get Dataset-JSON metadata when the file is in JSON format.
* @return An object with file metadata.
*/
async getJsonMetadata() {
return new Promise((resolve, reject) => {
this.metadataLoaded = false;
// Metadata for ItemGroup
const metadata = {
datasetJSONCreationDateTime: '',
datasetJSONVersion: '',
records: -1,
name: '',
label: '',
columns: [],
studyOID: '',
metaDataVersionOID: '',
};
const parsedMetadata = {
datasetJSONCreationDateTime: false,
datasetJSONVersion: false,
dbLastModifiedDateTime: false,
fileOID: false,
originator: false,
sourceSystem: false,
itemGroupOID: false,
columns: false,
records: false,
name: false,
label: false,
studyOID: false,
metaDataVersionOID: false,
metaDataRef: false,
};
// Restart stream
if (this.currentPosition !== 0 ||
this.stream?.destroyed ||
this.stream === null) {
if (this.stream !== null && !this.stream?.destroyed) {
this.stream?.destroy();
}
this.stream = fs_1.default.createReadStream(this.filePath, {
encoding: this.encoding,
});
}
if (this.stream === null) {
reject(new Error('Could not create read stream for file ' + this.filePath));
return;
}
this.stream
.pipe(JSONStream_1.default.parse('rows..*', (data, nodePath) => {
return { path: nodePath, value: data };
}))
.on('end', () => {
// Check if all required attributes are parsed after the file is fully loaded;
if (!this.checkAttributesParsed(parsedMetadata)) {
const notParsed = Object.keys(parsedMetadata).filter((key) => !parsedMetadata[key] &&
this.requiredAttributes.includes(key));
reject(new Error('Could not find required metadata elements ' +
notParsed.join(', ')));
}
this.metadataLoaded = true;
this.metadata = metadata;
resolve(metadata);
})
.on('header', (data) => {
// In correctly formed Dataset-JSON, all metadata attributes are present before rows
Object.keys(data).forEach((key) => {
if (Object.keys(parsedMetadata).includes(key)) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
metadata[key] =
data[key];
parsedMetadata[key] = true;
}
});
// Check if all required elements were parsed
if (this.checkAttributesParsed(parsedMetadata)) {
this.metadataLoaded = true;
this.metadata = metadata;
resolve(metadata);
this.stream?.destroy();
}
})
.on('footer', (data) => {
// If not all required metadata attributes were found before rows, check if they are present after
Object.keys(data).forEach((key) => {
if (Object.keys(parsedMetadata).includes(key)) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
metadata[key] =
data[key];
parsedMetadata[key] = true;
}
});
// Check if all required elements were parsed
if (this.checkAttributesParsed(parsedMetadata)) {
this.metadataLoaded = true;
this.metadata = metadata;
resolve(metadata);
this.stream?.destroy();
}
});
});
}
/**
* Get Dataset-JSON metadata when the file is in NDJSON format.
* @return An object with file metadata.
*/
async getNdjsonMetadata() {
return new Promise((resolve, reject) => {
this.metadataLoaded = false;
// All metadata is stored in the first line of the file
const metadata = {
datasetJSONCreationDateTime: '',
datasetJSONVersion: '',
records: -1,
name: '',
label: '',
columns: [],
studyOID: '',
metaDataVersionOID: '',
};
const parsedMetadata = {
datasetJSONCreationDateTime: false,
datasetJSONVersion: false,
dbLastModifiedDateTime: false,
fileOID: false,
originator: false,
sourceSystem: false,
itemGroupOID: false,
columns: false,
records: false,
name: false,
label: false,
studyOID: false,
metaDataVersionOID: false,
metaDataRef: false,
};
// Restart stream
if (this.stream === null ||
this.currentPosition !== 0 ||
this.stream?.destroyed) {
if (this.stream !== null && !this.stream?.destroyed) {
this.stream?.destroy();
}
if (this.isCompressed) {
const rawStream = fs_1.default.createReadStream(this.filePath);
const gunzip = zlib_1.default.createGunzip();
this.stream = rawStream.pipe(gunzip);
}
else {
this.stream = fs_1.default.createReadStream(this.filePath, {
encoding: this.encoding,
});
}
}
if (this.stream === null) {
reject(new Error('Could not create read stream for file ' + this.filePath));
return;
}
this.rlStream = readline_1.default.createInterface({
input: this.stream,
crlfDelay: Infinity,
});
this.rlStream.on('line', (line) => {
const data = JSON.parse(line);
// Fill metadata with parsed attributes
Object.keys(data).forEach((key) => {
if (Object.keys(parsedMetadata).includes(key)) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
metadata[key] =
data[key];
parsedMetadata[key] = true;
}
});
// Check if all required elements were parsed
if (this.checkAttributesParsed(parsedMetadata)) {
this.metadataLoaded = true;
this.metadata = metadata;
resolve(metadata);
}
else {
const notParsed = Object.keys(parsedMetadata).filter((key) => !parsedMetadata[key] &&
this.requiredAttributes.includes(key));
reject(new Error('Could not find required metadata elements: ' +
notParsed.join(', ')));
}
if (this.rlStream !== undefined) {
this.rlStream.close();
}
this.stream?.destroy();
});
});
}
/**
* Read observations.
* @param start - The first row number to read.
* @param length - The number of records to read.
* @param type - The type of the returned object.
* @param filterColumns - The list of columns to return when type is object. If empty, all columns are returned.
* @param filter - A filter class object used to filter data records when reading the dataset.
* @return An array of observations.
*/
async getData(props) {
// Check if metadata is loaded
if (this.metadataLoaded === false) {
await this.getMetadata();
}
let { filterColumns = [] } = props;
// Convert filterColumns to lowercase for case-insensitive comparison
filterColumns = filterColumns.map((item) => item.toLowerCase());
// Check if metadata is loaded
if (this.metadata.columns.length === 0 ||
this.metadata.records === -1) {
return Promise.reject(new Error('Metadata is not loaded or there are no columns'));
}
const { start = 0, length } = props;
// Check if start and length are valid
if ((typeof length === 'number' && length <= 0) ||
start < 0 ||
start > this.metadata.records) {
return Promise.reject(new Error('Invalid start/length parameter values'));
}
// Reset all rows read flag
this.allRowsRead = false;
if (this.isNdJson) {
return this.getNdjsonData({ ...props, filterColumns });
}
else {
return this.getJsonData({ ...props, filterColumns });
}
}
/**
 * Read observations from a JSON (non-NDJSON) file.
 * Reuses the existing read stream/parser when the requested window starts at
 * or after the current position; otherwise the stream is recreated from the
 * beginning of the file.
 * @param props.start - The first row number to read.
 * @param props.length - The number of records to read (all remaining rows when undefined).
 * @param props.type - 'array' (default) or 'object' row representation.
 * @param props.filterColumns - Lowercased column names to keep (empty = all columns).
 * @param props.filter - Optional filter; its filterRow(row) decides which rows are kept.
 * @return Promise resolving to the rows read.
 */
async getJsonData(props) {
// Default type to array;
const { start = 0, length, type = 'array', filter } = props;
const filterColumns = props.filterColumns;
// Positions of the requested columns in the metadata (-1 when not found).
const filterColumnIndeces = filterColumns.map((column) => this.metadata.columns.findIndex((item) => item.name.toLowerCase() === column.toLowerCase()));
return new Promise((resolve, reject) => {
// Validate parameters
const columnNames = [];
if (type === 'object') {
columnNames.push(...this.metadata.columns.map((item) => item.name));
}
// If possible, continue reading existing stream, otherwise recreate it.
let currentPosition = this.currentPosition;
if (this.stream === null ||
this.stream.destroyed ||
currentPosition > start) {
if (this.stream !== null && !this.stream.destroyed) {
this.stream.destroy();
}
this.stream = fs_1.default.createReadStream(this.filePath, {
encoding: this.encoding,
});
currentPosition = 0;
// Parse only the elements of the top-level "rows" array.
this.parser = JSONStream_1.default.parse(['rows', true], (data, nodePath) => {
return { path: nodePath, value: data };
});
this.stream.pipe(this.parser);
}
if (this.parser === undefined) {
reject(new Error('Could not create JSON parser'));
return;
}
const currentData = [];
let filteredRecords = 0;
const isFiltered = filter !== undefined;
// NOTE(review): stream/parser 'error' events are not handled here; a read or
// parse error would leave this promise pending — confirm and consider rejecting.
this.parser
.on('end', () => {
// Whole file consumed: remember the position and flag all rows as read.
this.currentPosition = currentPosition;
this.allRowsRead = true;
resolve(currentData);
})
.on('data', (data) => {
currentPosition += 1;
// Keep the row when it falls into the requested window; with a filter
// the window is counted in matching rows rather than file rows.
if (length === undefined ||
(currentPosition > start &&
(isFiltered
? filteredRecords < length
: currentPosition <= start + length))) {
if (!isFiltered || filter.filterRow(data.value)) {
if (type === 'array') {
if (isFiltered) {
filteredRecords += 1;
}
if (filterColumnIndeces.length === 0) {
currentData.push(data.value);
}
else {
// Keep only indeces specified in filterColumnIndeces
currentData.push(data.value.filter((_, index) => filterColumnIndeces.includes(index)));
}
}
else if (type === 'object') {
// Convert the row array into a { columnName: value } object.
const obj = {};
if (filterColumns.length === 0) {
columnNames.forEach((name, index) => {
obj[name] = data.value[index];
});
}
else {
// Keep only attributes specified in filterColumns
columnNames.forEach((name, index) => {
if (filterColumns.includes(name.toLowerCase())) {
obj[name] = data.value[index];
}
});
}
if (isFiltered) {
filteredRecords += 1;
}
currentData.push(obj);
}
}
}
// Requested window complete: pause the parser so the next call can
// resume from this position without re-reading the file.
if (length !== undefined &&
(isFiltered
? filteredRecords === length
: currentPosition === start + length) &&
this.parser !== undefined) {
const parser = this.parser;
// Pause the stream and remove current event listeners
parser.pause();
parser.removeAllListeners('end');
parser.removeAllListeners('data');
this.currentPosition = currentPosition;
resolve(currentData);
}
});
// Resume the stream if it was paused by a previous call.
if (this.parser.paused) {
// Continue emitting rows from the paused position.
this.parser.resume();
}
});
}
/**
 * Read observations from an NDJSON (optionally gzip-compressed) file.
 * The first line of the file holds metadata and is skipped; every following
 * non-empty line is one data row. Reuses the existing stream when the
 * requested window starts at or after the current position; otherwise the
 * stream is recreated from the beginning of the file.
 * @param props.start - The first row number to read.
 * @param props.length - The number of records to read (all remaining rows when undefined).
 * @param props.type - 'array' (default) or 'object' row representation.
 * @param props.filterColumns - Lowercased column names to keep (empty = all columns).
 * @param props.filter - Optional filter; its filterRow(row) decides which rows are kept.
 * @return Promise resolving to the rows read.
 */
async getNdjsonData(props) {
return new Promise((resolve, reject) => {
// Default type to array;
const { start = 0, length, type = 'array', filter } = props;
const filterColumns = props.filterColumns;
// Positions of the requested columns in the metadata (-1 when not found).
const filterColumnIndeces = filterColumns.map((column) => this.metadata.columns.findIndex((item) => item.name.toLowerCase() === column.toLowerCase()));
// If possible, continue reading existing stream, otherwise recreate it.
let currentPosition = this.currentPosition;
if (this.stream === null ||
this.stream.destroyed ||
currentPosition > start) {
if (this.stream !== null && !this.stream.destroyed) {
this.stream.destroy();
}
if (this.isCompressed) {
// Compressed files are read through a gunzip pipe.
const rawStream = fs_1.default.createReadStream(this.filePath);
const gunzip = zlib_1.default.createGunzip();
this.stream = rawStream.pipe(gunzip);
}
else {
this.stream = fs_1.default.createReadStream(this.filePath, {
encoding: this.encoding,
});
}
currentPosition = 0;
this.rlStream = readline_1.default.createInterface({
input: this.stream,
crlfDelay: Infinity,
});
}
if (this.rlStream === undefined) {
reject(new Error('Could not create readline stream'));
return;
}
const columnNames = [];
if (type === 'object') {
columnNames.push(...this.metadata.columns.map((item) => item.name));
}
const currentData = [];
let isFirstLine = true;
let filteredRecords = 0;
const isFiltered = filter !== undefined;
this.rlStream
.on('line', (line) => {
if (currentPosition === 0 && isFirstLine) {
// First line contains metadata, so skip it when reading the data
isFirstLine = false;
return;
}
currentPosition += 1;
// Keep the row when it falls into the requested window; with a filter
// the window is counted in matching rows rather than file rows.
if ((length === undefined ||
(currentPosition > start &&
(isFiltered
? filteredRecords < length
: currentPosition <= start + length))) &&
line.length > 0) {
// NOTE(review): JSON.parse is not wrapped in try/catch; a malformed data
// line would throw inside the 'line' handler — confirm input guarantees.
const data = JSON.parse(line);
if (!isFiltered || filter.filterRow(data)) {
if (type === 'array') {
if (isFiltered) {
filteredRecords += 1;
}
if (filterColumnIndeces.length === 0) {
currentData.push(data);
}
else {
// Keep only indeces specified in filterColumnIndeces
currentData.push(data.filter((_, index) => filterColumnIndeces.includes(index)));
}
}
else if (type === 'object') {
// Convert the row array into a { columnName: value } object.
const obj = {};
if (filterColumns.length === 0) {
columnNames.forEach((name, index) => {
obj[name] = data[index];
});
}
else {
// Keep only attributes specified in filterColumns
columnNames.forEach((name, index) => {
if (filterColumns.includes(name.toLowerCase())) {
obj[name] = data[index];
}
});
}
if (isFiltered) {
filteredRecords += 1;
}
currentData.push(obj);
}
}
}
if (length !== undefined &&
(isFiltered
? filteredRecords === length
: currentPosition === start + length)) {
// When pausing readline, it does not stop immidiately and can emit extra lines,
// so pausing approach is not yet implemented; the stream is closed instead
// and recreated on the next call.
this.currentPosition = currentPosition;
if (this.rlStream !== undefined) {
this.rlStream.close();
}
this.stream?.destroy();
resolve(currentData);
}
})
.on('error', (err) => {
reject(err);
})
.on('close', () => {
this.currentPosition = currentPosition;
// NOTE(review): ">= records - 1" flags all rows as read even when the count
// is one short of the declared records value — confirm this is intentional.
if (currentPosition >= this.metadata.records - 1) {
this.allRowsRead = true;
}
resolve(currentData);
});
});
}
/**
* Read observations as an iterable.
* @param start - The first row number to read.
* @param bufferLength - The number of records to read in a chunk.
* @param type - The type of the returned object.
* @param filterColumns - The list of columns to return when type is object. If empty, all columns are returned.
* @return An iterable object.
*/
async *readRecords(props) {
// Check if metadata is loaded
if (this.metadataLoaded === false) {
await this.getMetadata();
}
const { start = 0, bufferLength = 1000, type, filterColumns, } = props || {};
let currentPosition = start;
while (true) {
const data = await this.getData({
start: currentPosition,
length: bufferLength,
type,
filterColumns,
});
yield* data;
if (this.allRowsRead === true || data.length === 0 || this.currentPosition <= currentPosition) {
break;
}
currentPosition = this.currentPosition;
}
}
/**
* Get unique values observations.
* @param columns - The list of variables for which to obtain the unique observations.
* @param limit - The maximum number of values to store. 0 - no limit.
* @param bufferLength - The number of records to read in a chunk.
* @param sort - Controls whether to sort the unique values.
* @return An array of observations.
*/
async getUniqueValues(props) {
const { limit = 0, bufferLength = 1000, sort = true, addCount = false } = props;
let { columns } = props;
const result = {};
// Check if metadata is loaded
if (this.metadataLoaded === false) {
await this.getMetadata();
}
const notFoundColumns = [];
// Use the case of the columns as specified in the metadata
columns = columns.map((item) => {
const column = this.metadata.columns.find((column) => column.name.toLowerCase() === item.toLowerCase());
if (column === undefined) {
notFoundColumns.push(item);
return '';
}
else {
return column.name;
}
});
if (notFoundColumns.length > 0) {
return Promise.reject(new Error(`Columns ${notFoundColumns.join(', ')} not found`));
}
// Store number of unique values found
const uniqueCount = {};
columns.forEach((column) => {
uniqueCount[column] = 0;
});
for await (const row of this.readRecords({
bufferLength,
type: 'object',
filterColumns: columns,
})) {
columns.forEach((column) => {
if (result[column] === undefined) {
result[column] = { values: [], counts: {} };
}
if ((limit === 0 || uniqueCount[column] < limit)) {
if (!result[column].values.includes(row[column])) {
result[column].values.push(row[column]);
uniqueCount[column] += 1;
}
if (addCount) {
const valueId = row[column] === null ? 'null' : String(row[column]);
result[column].counts[valueId] = result[column].counts[valueId] > 0 ? (result[column].counts[valueId] + 1) : 1;
}
}
});
// Check if all unique values are found
const isFinished = limit !== 0 && Object.keys(uniqueCount).every((key) => uniqueCount[key] >= limit);
if (isFinished) {
break;
}
}
// Sort result
if (sort) {
Object.keys(result).forEach((key) => {
result[key].values.sort();
// Counts cannot be properly sorted as it is an object, so it has to be sorted once transformed to array
});
}
return result;
}
/**
* Helper method to safely write data to stream with backpressure handling
* @param data - String data to write
*/
async writeWithBackpressure(data) {
// Create new Promise for this write operation
const writeOperation = this.writeQueueDrain.then(() => {
return new Promise((resolve) => {
if (!this.writeStream?.write(data)) {
this.writeStream?.once('drain', () => resolve());
}
else {
resolve();
}
});
});
// Update queue with current operation
this.writeQueueDrain = writeOperation;
// Wait for this write to complete
await writeOperation;
}
/**
 * Write data to the file.
 * 'create' opens the stream and writes metadata, 'write' appends rows,
 * 'finalize' closes the JSON structure (when applicable) and the stream.
 * @param props.metadata - Dataset metadata (required for the 'create' action)
 * @param props.data - Data to write (array of row arrays)
 * @param props.action - Write action: create, write, or finalize
 * @param props.options - Write options (prettify, highWaterMark, indentSize, compressionLevel)
 */
async write(props) {
const { metadata, data, action, options = {} } = props;
const { highWaterMark = 16384, // 16KB default
indentSize = 2, compressionLevel = 9, } = options;
// Check if the file already exists;
if (action === 'create') {
if (fs_1.default.existsSync(this.filePath)) {
// Remove the file
fs_1.default.unlinkSync(this.filePath);
// Reset read stream
if (this.stream && !this.stream.destroyed) {
this.stream.destroy();
this.stream = null;
this.stats = null;
}
}
}
let { prettify = false } = options;
// In case of compressed file, prettify must be false
if (this.isCompressed && prettify) {
prettify = false;
}
if (action === 'create') {
if (!metadata) {
throw new Error('Metadata is required for create action');
}
this.writeMode = this.isNdJson ? 'ndjson' : 'json';
this.isFirstWrite = true;
if (this.isCompressed) {
// Create gzip stream: data is written to the gzip transform, which
// pipes compressed output into the file stream.
const outputStream = fs_1.default.createWriteStream(this.filePath, {
encoding: this.encoding,
highWaterMark,
});
this.outputStream = outputStream;
const gzip = zlib_1.default.createGzip({ level: compressionLevel });
gzip.pipe(outputStream);
this.writeStream = gzip;
}
else {
this.writeStream = fs_1.default.createWriteStream(this.filePath, {
encoding: this.encoding,
highWaterMark,
});
}
if (this.writeMode === 'json') {
// Remove rows from metadata to avoid empty array
let initialStr = prettify
? JSON.stringify(metadata, null, indentSize)
: JSON.stringify(metadata);
// Remove closing brace and add rows array opening
initialStr = initialStr.slice(0, -1);
// In case of prettify, remove last new line
if (prettify && initialStr.endsWith('\n')) {
initialStr = initialStr.slice(0, -1);
}
// Add rows array opening
initialStr =
initialStr +
(prettify
? ',\n' + ' '.repeat(indentSize) + '"rows": ['
: ',"rows":[');
await this.writeWithBackpressure(initialStr);
}
else {
// NDJSON: metadata occupies the first line of the file.
await this.writeWithBackpressure(JSON.stringify(metadata) + '\n');
}
if (data) {
await this.write({ data, action: 'write', options });
}
}
else if (action === 'write') {
if (!this.writeStream) {
throw new Error('No active write stream. Call create first.');
}
if (!data || !data.length) {
return;
}
// Rows are serialized into one buffer and written in a single call.
let rowBuffer = '';
if (this.writeMode === 'json') {
for (let i = 0; i < data.length; i++) {
// The very first row of the file gets no leading comma.
const prefix = this.isFirstWrite && i === 0 ? '' : ',';
const rowStr = prettify
? prefix +
'\n' +
' '.repeat(indentSize * 2) +
JSON.stringify(data[i])
: prefix + JSON.stringify(data[i]);
rowBuffer += rowStr;
}
}
else {
for (const row of data) {
rowBuffer += JSON.stringify(row) + '\n';
}
}
await this.writeWithBackpressure(rowBuffer);
this.isFirstWrite = false;
}
else if (action === 'finalize') {
if (!this.writeStream) {
throw new Error('No active write stream. Call create first.');
}
if (data) {
await this.write({ data, action: 'write', options });
}
if (this.writeMode === 'json') {
// Close the rows array and the top-level object.
await this.writeWithBackpressure(prettify ? '\n' + ' '.repeat(indentSize) + ']\n}\n' : ']}\n');
}
// Wait for all writes to complete and close stream.
// NOTE(review): 'error' is only handled during finalize; write failures during
// earlier actions surface as unhandled stream errors — confirm acceptable.
await new Promise((resolve, reject) => {
this.writeStream?.on('error', reject);
if (this.isCompressed) {
// For compressed output, wait until the underlying file stream finishes.
this.outputStream?.on('finish', () => {
resolve();
});
}
else {
this.writeStream?.on('finish', () => {
resolve();
});
}
this.writeStream?.end(() => {
this.writeStream = undefined;
this.outputStream = undefined;
this.writeMode = undefined;
this.isFirstWrite = true;
});
});
}
}
/**
* Close all open streams and reset state
*/
async close() {
// Clean up read streams
if (this.stream && !this.stream.destroyed) {
this.stream.destroy();
this.stream = null;
}
if (this.rlStream) {
this.rlStream.close();
this.rlStream = undefined;
}
// Clean up parser
if (this.parser) {
this.parser.removeAllListeners();
this.parser = undefined;
}
// Clean up write streams
if (this.writeStream) {
await new Promise((resolve) => {
this.writeStream?.end(() => {
resolve();
});
});
this.writeStream = undefined;
}
if (this.outputStream) {
this.outputStream = undefined;
}
// Reset state variables
this.currentPosition = 0;
this.allRowsRead = false;
this.writeMode = undefined;
this.isFirstWrite = true;
// Reset write queue
this.writeQueueDrain = Promise.resolve();
}
/**
* Write data to file in one operation
* @param props.metadata - Dataset metadata
* @param props.data - Data to write
* @param props.options - Write options (prettify, highWaterMark)
*/
async writeData(props) {
const { metadata, data, options } = props;
// Create file and write metadata
await this.write({
metadata,
action: 'create',
options,
});
// Write data if provided
if (data?.length) {
await this.write({
data,
action: 'write',
options,
});
}
// Finalize the file
await this.write({
action: 'finalize',
options,
});
}
}
exports.default = DatasetJson;
//# sourceMappingURL=datasetJson.js.map