
xport-js


Node.js library to read SAS XPORT v5/v6 data transport files (*.xpt).

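A minimal usage sketch based on the Library class defined in this file. It assumes the package's default export resolves to this class (import path 'xport-js'); the file path is a placeholder, and option names follow the signatures shown below.

import Library from 'xport-js';

async function main (): Promise<void> {
    // Placeholder path to an existing XPT file
    const lib = new Library('/path/to/adsl.xpt');

    // Variable-level metadata in the default "xport" format
    const metadata = await lib.getMetadata();
    console.log(metadata);

    // Stream observations as objects ({ VAR1: value1, VAR2: value2, ... })
    for await (const obs of lib.read({ rowFormat: 'object' })) {
        console.log(obs);
    }
}

main();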
import Member from './member';
import { Header, Options, UniqueValues } from '../types/library';
import {
    DatasetMetadata as DatasetJsonMetadata,
    ItemDescription as DatasetJsonColumn,
} from 'js-stream-dataset-json';
import { createReadStream, createWriteStream } from 'fs';
import Filter, { ItemDataArray } from 'js-array-filter';
import path from 'path';

/**
 * @typedef {"array" | "object"} rowFormat
 */

class Library {
    members: Member[];
    created: object;
    modified: object;
    sasVersion: string;
    osVersion: string;
    pathToFile: string;
    header: Header;

    /**
     * Library associated with the XPORT file.
     * @param pathToFile Path to XPT file.
     */
    constructor (pathToFile: string) {
        this.pathToFile = pathToFile;
        this.members = [];
        this.header = {
            sasSymbol: [],
            sasLib: '',
            sasVer: '',
            sasOs: '',
            sasCreate: '',
            sasModified: ''
        };
    }

    /**
     * Parse library header information.
     *
     * 1. The first header record:
     *
     *    HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!
     *    000000000000000000000000000000
     *
     * 2. The first real header record, as a C structure:
     *
     *    struct REAL_HEADER {
     *        char sas_symbol[2][8]; / "SAS", twice /
     *        char saslib[8];        / "SASLIB" /
     *        char sasver[8];        / version of SAS used /
     *        char sas_os[8];        / operating system used /
     *        char blanks[24];
     *        char sas_create[16];   / datetime created /
     *    };
     *
     * 3. Second real header record:
     *
     *    ddMMMyy:hh:mm:ss
     *
     *    In this record, the string is the datetime modified. Most often,
     *    the datetime created and datetime modified are the same.
     *    Pad with ASCII blanks to 80 bytes. Note that only a 2-digit year
     *    appears; any program reading this year must be prepared to deal
     *    with dates in the 1900s or the 2000s.
     * @param dataBuffer Raw header - 3x80 bytes.
     */
    private parseHeader (dataBuffer: Buffer): void {
        // Skip the first 80 bytes until the first real header record
        const headerBuffer = dataBuffer.subarray(80, 3 * 80);
        const sasSymbol1 = headerBuffer.subarray(0, 8).toString('ascii').trim();
        const sasSymbol2 = headerBuffer.subarray(8, 16).toString('ascii').trim();
        const sasLib = headerBuffer.subarray(16, 24).toString('ascii').trim();
        const sasVer = headerBuffer.subarray(24, 32).toString('ascii').trim();
        const sasOs = headerBuffer.subarray(32, 40).toString('ascii').trim();
        const sasCreate = headerBuffer.subarray(64, 80).toString('ascii').trim();
        const sasModified = headerBuffer.subarray(80, 2 * 80).toString('ascii').trim();
        this.header.sasSymbol = [sasSymbol1, sasSymbol2];
        this.header.sasLib = sasLib;
        this.header.sasVer = sasVer;
        this.header.sasOs = sasOs;
        this.header.sasCreate = sasCreate;
        this.header.sasModified = sasModified;
    }

    public getHeader (): Header {
        return this.header;
    }

    private parseMembers (data: Buffer, obsStart: number): void {
        const member = new Member(obsStart);
        member.parseRaw(data);
        // Currently only one member is supported
        this.members.length = 0; // Clear the array while keeping the reference
        this.members.push(member); // Add the new member
    }

    /**
     * Get metadata information from XPORT file.
     * @param format Metadata format: "xport" (default) or "dataset-json1.1".
     */
    public async getMetadata<T extends "xport" | "dataset-json1.1">(
        format: T = "xport" as T
    ): Promise<T extends "dataset-json1.1" ? DatasetJsonMetadata : object> {
        // Get the header of the XPT containing metadata
        let data = Buffer.from([]);
        const stream = createReadStream(this.pathToFile);
        // Position of the first observation in the dataset
        let obsStart: number;
        for await (const chunk of stream) {
            data = Buffer.concat([data, chunk]);
            // Stop reading the header when the first observation record is met
            const obsIndex = data.toString('binary').indexOf(
                'HEADER RECORD*******OBS HEADER RECORD!!!!!!!000000000000000000000000000000'
            );
            if (obsIndex >= 0) {
                obsStart = obsIndex + 80;
                break;
            }
        }
        // Parse header - first 3x80 bytes
        this.parseHeader(data.subarray(0, 3 * 80));
        // Parse members - the rest
        this.parseMembers(data.subarray(3 * 80), obsStart);

        const result: object[] = [];
        Object.values(this.members).forEach((member: Member) => {
            member.variableOrder.forEach((varName: string) => {
                const variable = member.variables[varName];
                const varAttrs: { [key: string]: string|null|number } = {
                    dataset: member.name,
                    name: variable.name,
                    label: variable.label,
                    length: variable.length,
                    type: variable.type,
                };
                if (variable.formatName !== '') {
                    varAttrs.format = variable.formatName + variable.formatW + '.';
                    // Avoid formats like DATE9.0
                    if (variable.formatD !== '0') {
                        varAttrs.format += variable.formatD;
                    }
                }
                if (variable.informatName !== '') {
                    varAttrs.informat = variable.informatName + variable.informatW + '.';
                    if (variable.informatD !== '0') {
                        varAttrs.informat += variable.informatD;
                    }
                }
                result.push(varAttrs);
            });
        });

        if (format === 'xport') {
            return result as T extends "dataset-json1.1" ? DatasetJsonMetadata : object;
        } else if (format === 'dataset-json1.1') {
            if (this.members.length !== 1) {
                // throw(new Error('format only supports single dataset files'));
            }
            const currentMember = this.members[0];
            const records = currentMember.getRecordsNum(this.pathToFile);
            const updatedColumns: DatasetJsonColumn[] = result.map((column: { [key: string]: string }) => {
                const updateType = column.type === 'Char' ? 'string' : 'double';
                const updatedColumn: DatasetJsonColumn = {
                    itemOID: column.name,
                    name: column.name,
                    label: column.label,
                    dataType: updateType,
                    length: parseInt(column.length),
                    displayFormat: column.format,
                };
                return updatedColumn;
            });
            // Format metadata similar to the Dataset-JSON 1.1 spec
            const djMetadata: DatasetJsonMetadata = {
                datasetJSONCreationDateTime: currentMember.created,
                datasetJSONVersion: '',
                records,
                name: currentMember.name,
                label: currentMember.label,
                columns: updatedColumns,
                dbLastModifiedDateTime: currentMember.modified,
                sourceSystem: {
                    name: `${this.header.sasSymbol[0]} ${this.header.sasOs}`,
                    version: this.header.sasVer,
                }
            };
            return djMetadata;
        }
    }

    private getHeaderRecord (member: Member, options: Options): string[]|object {
        // If keep is used, flag which variables to skip
        let keep = options?.keep !== undefined ? options.keep : [];
        keep = keep.map(varName => varName.toUpperCase());
        const skip: boolean[] = [];
        member.variableOrder.forEach((varName: string) => {
            if (keep.length > 0 && !keep.includes(varName.toUpperCase())) {
                skip.push(true);
            } else {
                skip.push(false);
            }
        });
        if (options?.rowFormat === 'object') {
            const header: { [key: string]: string } = {};
            member.variableOrder.forEach((varName: string, index: number) => {
                if (!skip[index]) {
                    header[varName] = member.variables[varName].label;
                }
            });
            return header;
        } else {
            const header: string[] = [];
            member.variableOrder.forEach((varName: string, index: number) => {
                if (!skip[index]) {
                    header.push(varName);
                }
            });
            return header;
        }
    }

    /**
     * Read observations as an async iterable.
     * @param options Read options.
     * - **dsNames** List of dataset names to read; by default all datasets are read.
     * - **rowFormat** [default=array] Output observation format.
     *   <br> array: [value1, value2, value3, ...]
     *   <br> object: { var1: value1, var2: value2, var3: value3, ... }
     * - **keep** [default=[]] Array of variables to keep in the result (case-insensitive).
     * - **skipHeader** [default=false] Flag to control whether the first record contains variable names.
     * - **encoding** [default=binary] String encoding; the default is latin1 (binary).
     *   See the list of [encodings](https://nodejs.org/api/buffer.html#buffer_buffers_and_character_encodings) supported by Node.js.
     */
    public async * read (options?: Options): AsyncIterable<Array<number|string>|object> {
        // Check if metadata is already parsed
        if (Object.keys(this.members).length === 0) {
            await this.getMetadata();
        }
        for (let i = 0; i < Object.keys(this.members).length; i++) {
            const member = Object.values(this.members)[i];
            // Output header
            if (!options?.skipHeader) {
                yield this.getHeaderRecord(member, options);
            }
            for await (const obs of member.read(this.pathToFile, options)) {
                yield obs;
            }
        }
        /* TODO Add multiple dataset case
        Object.values(this.members).forEach((member: Member) => {
            let result = await member.read(this.pathToFile);
        });
        */
        return [];
    }

    /**
     * Get all observations. This method loads all records into memory;
     * for large datasets, the read method is suggested.
     * @param props Read options. See read method options for details.
     */
    public async getData(props: {
        start?: number;
        length?: number;
        type?: "object" | "array";
        filterColumns?: string[];
        filter?: Filter;
        skipHeader?: boolean;
        roundPrecision?: number;
    }): Promise<Array<Array<number|string>|object>> {
        const {
            start = 0,
            length,
            type = 'array',
            filter,
            skipHeader = true,
            filterColumns,
            roundPrecision
        } = props;
        const isFiltered = filter !== undefined;
        // Check if metadata is already parsed
        if (Object.keys(this.members).length === 0) {
            await this.getMetadata();
        }
        // Form options
        const options: Options = {
            rowFormat: type,
            keep: filterColumns,
            skipHeader: skipHeader,
            filter: filter,
            roundPrecision,
        };
        let currentObs = 0;
        const result = [];
        for (let i = 0; i < Object.keys(this.members).length; i++) {
            const member = Object.values(this.members)[i];
            // Output header
            if (!skipHeader) {
                result.push(this.getHeaderRecord(member, options));
            }
            for await (const obs of member.read(this.pathToFile, options)) {
                currentObs++;
                if (start !== undefined && currentObs <= start) {
                    // Skip until start
                    continue;
                }
                if (isFiltered) {
                    if (filter.filterRow(obs as ItemDataArray)) {
                        result.push(obs);
                    }
                } else {
                    result.push(obs);
                }
                if (length && result.length === length) {
                    // Stop when length is reached
                    break;
                }
            }
        }
        return result;
    }

    /**
     * Get unique values for the specified variables.
     * @param columns The list of variables for which to obtain the unique values.
     * @param limit The maximum number of values to store; 0 means no limit.
     * @param sort Controls whether to sort the unique values.
     * @return An object with unique values (and optionally counts) per column.
     */
    async getUniqueValues(props: {
        columns: string[];
        limit?: number;
        addCount?: boolean;
        sort?: boolean;
        roundPrecision?: number;
    }): Promise<UniqueValues> {
        const { limit = 0, addCount = false, sort = false, roundPrecision } = props;
        let { columns } = props;
        // Check if metadata is already parsed
        const metadata = await this.getMetadata('dataset-json1.1');
        const notFoundColumns: string[] = [];
        // Use the case of the columns as specified in the metadata
        columns = columns.map((item) => {
            const column = metadata.columns.find(
                (column) => column.name.toLowerCase() === item.toLowerCase()
            );
            if (column === undefined) {
                notFoundColumns.push(item);
                return '';
            } else {
                return column.name as string;
            }
        });

        if (notFoundColumns.length > 0) {
            return Promise.reject(
                new Error(`Columns ${notFoundColumns.join(', ')} not found`)
            );
        }

        // Store the number of unique values found
        const uniqueCount: { [name: string]: number } = {};
        columns.forEach((column) => {
            uniqueCount[column] = 0;
        });

        // Form options
        const options: Options = {
            rowFormat: "object",
            keep: columns,
            skipHeader: true,
            roundPrecision,
        };

        const result: UniqueValues = {};
        for (let i = 0; i < Object.keys(this.members).length; i++) {
            const member = Object.values(this.members)[i];
            for await (const obs of member.read(this.pathToFile, options)) {
                const obsObject = obs as { [key: string]: string|number|null };
                columns.forEach((column) => {
                    if (result[column] === undefined) {
                        result[column] = { values: [], counts: {} };
                    }
                    if (limit === 0 || uniqueCount[column] < limit) {
                        if (!result[column].values.includes(obsObject[column])) {
                            result[column].values.push(obsObject[column]);
                            uniqueCount[column] += 1;
                        }
                        if (addCount) {
                            const valueId = obsObject[column] === null ? 'null' : String(obsObject[column]);
                            result[column].counts[valueId] = result[column].counts[valueId] > 0
                                ? (result[column].counts[valueId] + 1)
                                : 1;
                        }
                    }
                });
                // Check if all columns are filled
                if (limit > 0 && Object.values(uniqueCount).every(count => count >= limit)) {
                    break;
                }
            }
        }

        // Sort values if required
        if (sort) {
            Object.keys(result).forEach((column) => {
                result[column].values.sort((a, b) => {
                    if (typeof a === 'string' && typeof b === 'string') {
                        return a.localeCompare(b);
                    } else if (typeof a === 'number' && typeof b === 'number') {
                        return a - b;
                    } else if (a === null && b === null) {
                        return 0;
                    } else if (a === null) {
                        return -1;
                    } else if (b === null) {
                        return 1;
                    } else {
                        return 0;
                    }
                });
            });
        }
        return result;
    }

    /**
     * Convert XPT to CSV files. Each dataset within the XPT file is written
     * to the outDir folder as a separate CSV file.
     * @param outDir Output folder.
     * @param options Read options. See read() method options.
     */
    public async toCsv (outDir: string, options?: Options): Promise<void> {
        for (let i = 0; i < Object.keys(this.members).length; i++) {
            const member: Member = Object.values(this.members)[i];
            // If a list of datasets is provided, skip those not in the list
            if (options?.dsNames !== undefined && options.dsNames.length > 0 &&
                !options.dsNames.map(dsName => dsName.toUpperCase()).includes(member.name.toUpperCase())
            ) {
                continue;
            }
            const writer = createWriteStream(path.join(outDir, member.name + '.csv'));
            // Force row format to be array
            const modifiedOptions: Options = { ...options, rowFormat: 'array' };
            // Print header
            if (!options?.skipHeader) {
                const header: string[] = this.getHeaderRecord(member, modifiedOptions) as string[];
                writer.write(header.join(',') + '\n');
            }
            for await (const obs of member.read(this.pathToFile, modifiedOptions)) {
                // Escape double quotes and quote values containing commas or quotes
                const escapedObs: Array<string|number> = (obs as Array<string|number>).map(elem => {
                    if (typeof elem === 'string' && /,|"/.test(elem)) {
                        return '"' + elem.replace(/"/g, '""') + '"';
                    } else {
                        return elem;
                    }
                });
                writer.write(escapedObs.join(',') + '\n');
            }
            writer.end();
        }
    }
}

export default Library;
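For reference, a hedged sketch of the bulk-read helpers defined above. It follows the getData, getUniqueValues, and toCsv signatures in this file; the file path, output folder, and column names are placeholders, and the output folder is assumed to exist.

import Library from 'xport-js';

async function example (): Promise<void> {
    const lib = new Library('/path/to/adsl.xpt'); // placeholder path

    // Load the first 100 observations into memory as objects
    const rows = await lib.getData({ start: 0, length: 100, type: 'object' });

    // Unique values (with counts) for selected variables; names are placeholders
    const unique = await lib.getUniqueValues({
        columns: ['USUBJID', 'ARM'],
        limit: 0,
        addCount: true,
        sort: true,
    });

    // Write each dataset in the file to ./out as a CSV file
    await lib.toCsv('./out');

    console.log(rows.length, unique);
}

example();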