UNPKG

@overture-stack/lyric

Version:
189 lines (188 loc) 8.11 kB
import bytes from 'bytes';
import { parse as csvParse } from 'csv-parse';
import firstline from 'firstline';
import fs from 'fs';
import { z } from 'zod';

import { parse } from '@overture-stack/lectern-client';

import { getSubmittedFileType } from '../services/submission/submissionFile.js';
import { BATCH_ERROR_TYPE } from './types.js';

/** File extensions accepted for submitted data files. */
export const SUPPORTED_FILE_EXTENSIONS = z.enum(['tsv', 'csv']);

/** Column separator character used by each supported file extension. */
export const columnSeparatorValue = {
  tsv: '\t',
  csv: ',',
};

/** Name of the column every submitted file must contain. */
const REQUIRED_HEADER = 'systemId';

/**
 * Determines the separator character for a given file based on its extension.
 * @param file The file whose extension determines the separator character.
 * @returns The separator character associated with the file extension, or `undefined` if
 * the file extension is invalid or unrecognized.
 */
export const getSeparatorCharacter = (file) => {
  const fileExtension = getSubmittedFileType(file);
  if (fileExtension.success) {
    return columnSeparatorValue[fileExtension.data];
  }
  return undefined;
};

/**
 * Maps a record array to an object with keys from headers, formatting each value for compatibility.
 * @param headers An array of header names, used as keys for the returned object.
 * @param record An array of values corresponding to each header, to be formatted and mapped.
 * @returns An object where each header in `headers` is a key, and each value is the
 * corresponding entry in `record` formatted for compatibility. Missing or empty entries
 * become an empty string.
 */
export const mapRecordToHeaders = (headers, record) => {
  return headers.reduce((obj, nextKey, index) => {
    // A record shorter than the header row yields `undefined`; normalise it to ''.
    const dataStr = record[index] || '';
    obj[nextKey] = formatForExcelCompatibility(dataStr);
    return obj;
  }, {});
};

/**
 * Reads only the first line of the file.
 * Useful when the file is too large and we're only interested in column names.
 * @param file A file we want to read (its `path` property is used)
 * @returns a string with the content of the first line of the file
 */
export const readHeaders = async (file) => {
  return firstline(file.path);
};

/**
 * Reads a text file and parses it to a JSON format.
 * Records are parsed to match schema field types.
 * Supported files: .tsv and .csv
 *
 * The first parsed row is used as the header row. Every following row is mapped to the
 * headers and parsed against `schema`; rows that fail parsing are still returned in
 * `records`, and their errors are reported in `errors` together with the row's line number.
 * The file at `file.path` is deleted once the parser closes.
 *
 * @param {Express.Multer.File} file A file to read
 * @param {Schema} schema Schema to parse data with
 * @returns A promise resolving to `{ records, errors }`. On a read or parse failure the
 * promise rejects with the `{ records, errors }` collected up to that point.
 * @throws {Error} If the file extension is not supported.
 */
export const readTextFile = async (file, schema) => {
  const returnRecords = [];
  const returnErrors = [];
  const separatorCharacter = getSeparatorCharacter(file);
  if (!separatorCharacter) {
    throw new Error('Invalid file Extension');
  }

  let headers = [];
  let lineNumber = 0;

  return new Promise((resolve, reject) => {
    const fileStream = fs.createReadStream(file.path);
    const stream = csvParse({ delimiter: separatorCharacter });

    // pipe() does not forward errors from the source stream. Without this handler a read
    // failure (e.g. missing file) would surface as an unhandled 'error' event and the
    // promise would never settle. Destroying the parser with the error routes the failure
    // through the parser's own 'error' and 'close' handlers below.
    fileStream.on('error', (error) => {
      stream.destroy(error);
    });

    stream.on('data', (record) => {
      lineNumber++;
      if (!headers.length) {
        // First row of the file holds the column names.
        headers = Object.values(record);
        return;
      }

      const mappedRecord = mapRecordToHeaders(headers, record);
      try {
        const parseSchemaResult = parse.parseRecordValues(mappedRecord, schema);
        // The record is kept whether or not it parsed cleanly; failures are reported separately.
        returnRecords.push(parseSchemaResult.data.record);
        if (!parseSchemaResult.success) {
          returnErrors.push({
            recordErrors: parseSchemaResult.data.errors,
            recordIndex: lineNumber,
          });
        }
        // TODO: Add batch processing (e.g. flush records/errors every 1000 lines).
      } catch (error) {
        console.error(`Catching error parsing data: ${error}`);
      }
    });

    stream.on('end', () => {
      resolve({ records: returnRecords, errors: returnErrors });
    });

    stream.on('close', () => {
      // Release the file descriptor if the parser stopped before the source was fully read.
      fileStream.destroy();
      // Best-effort removal of the uploaded file; a failure to delete is ignored.
      fs.unlink(file.path, () => {});
    });

    stream.on('error', () => {
      // Rejection payload is kept as a plain object for backward compatibility with callers.
      reject({ records: returnRecords, errors: returnErrors });
    });

    fileStream.pipe(stream);
  });
};

/**
 * Removes the extra double quotes that a tsv exported from Excel may add.
 * Excel might wrap a value in double quotes to indicate a string, and escape a double
 * quote inside the value by doubling it.
 * @param data The raw string value
 * @returns The trimmed value without the wrapping quotes and with `""` collapsed to `"`
 */
function formatForExcelCompatibility(data) {
  return data
    .trim()
    .replace(/^"/, '') // excel might add a beginning double quote to indicate string
    .replace(/"$/, '') // excel might add a trailing double quote to indicate string
    .replace(/""/g, '"') // excel might've used a second double quote to escape a double quote in a string
    .trim();
}

/**
 * Parses a size value into a number of bytes.
 * If the value is a number it is assumed to already be in bytes.
 * @param size The size to parse (e.g. '10MB' or a number)
 * @returns The size in bytes, or `0` when the value cannot be parsed
 */
export function getSizeInBytes(size) {
  return bytes.parse(size) || 0;
}

/**
 * Formats a file size from bytes to a specified unit with a defined precision.
 *
 * @param sizeInBytes - The file size in bytes to be formatted.
 * @param unit - The unit to which the size should be converted (e.g., 'MB', 'GB').
 * @param precision - The number of decimal places to include in the formatted output.
 * @returns The file size formatted as a string in the specified unit with the given precision. Returns null if sizeInBytes is not a Finite number.
 *
 */
export const formatByteSize = (sizeInBytes, unit, precision) => {
  // Returns null if sizeInBytes is not Finite
  return bytes.format(sizeInBytes, { unit, decimalPlaces: precision });
};

/**
 * Checks whether a header line contains a column with exactly the given name.
 * Each cell is normalised (trimmed, wrapping quotes removed) before comparison.
 * @param headerLine The first line of the file
 * @param separator The column separator; when missing, the whole line is treated as one column
 * @param columnName The column name to look for
 * @returns `true` if one of the header cells equals `columnName`
 */
const hasHeaderColumn = (headerLine, separator, columnName) => {
  const cells = separator ? headerLine.split(separator) : [headerLine];
  return cells.some((cell) => formatForExcelCompatibility(cell) === columnName);
};

/**
 * Processes an array of uploaded files, keeping files with a supported extension
 * (`.tsv` or `.csv`) that contain the required `systemId` column.
 *
 * @param {Express.Multer.File[]} files An array of `Express.Multer.File` objects representing the uploaded files.
 * @returns A `Promise<FileProcessingResult>` that resolves to an object containing two arrays:
 * - `validFiles`: Files that have a supported extension and contain the `systemId` header.
 * - `fileErrors`: One error per file that has an invalid extension, is missing the required
 *   `systemId` header, or could not be read.
 */
export async function processFiles(files) {
  const result = {
    validFiles: [],
    fileErrors: [],
  };

  for (const file of files) {
    try {
      if (!getSubmittedFileType(file).success) {
        result.fileErrors.push({
          type: BATCH_ERROR_TYPE.INVALID_FILE_EXTENSION,
          message: `File '${file.originalname}' has invalid file extension. File extension must be '${SUPPORTED_FILE_EXTENSIONS.options}'`,
          batchName: file.originalname,
        });
        continue;
      }

      const fileHeaders = await readHeaders(file);
      // Compare whole column names: a plain substring search on the header line would
      // also accept columns that merely contain the required name (e.g. 'systemIdentifier').
      if (hasHeaderColumn(fileHeaders, getSeparatorCharacter(file), REQUIRED_HEADER)) {
        result.validFiles.push(file);
      } else {
        result.fileErrors.push({
          type: BATCH_ERROR_TYPE.MISSING_REQUIRED_HEADER,
          message: `File '${file.originalname}' is missing the column 'systemId'`,
          batchName: file.originalname,
        });
      }
    } catch (error) {
      // Any failure while inspecting the file is reported as a read error for that file.
      result.fileErrors.push({
        type: BATCH_ERROR_TYPE.FILE_READ_ERROR,
        message: `Error reading file '${file.originalname}'`,
        batchName: file.originalname,
      });
    }
  }

  return result;
}