UNPKG

@hotglue/gluestick-ts

Version:

TypeScript version of the gluestick ETL library for the hotglue iPaaS platform

255 lines 10.1 kB
import fs from 'fs-extra';
import * as path from 'path';
import pl from 'nodejs-polars';

// Formats tried, in order, when a column is listed in `options.parseDates`.
const DATE_FORMATS = ['%Y-%m-%d %H:%M:%S%.f', '%Y-%m-%d'];

/**
 * Look up a stream's catalog entry by `stream` or `tap_stream_id`.
 *
 * @param {object|null|undefined} catalog - Parsed catalog.json content.
 * @param {string} stream - Stream name to search for.
 * @returns {object|undefined} The matching entry, or undefined when the
 *   catalog has no `streams` array or no entry matches.
 */
function findStream(catalog, stream) {
  const streams = catalog?.streams;
  if (!Array.isArray(streams)) {
    return undefined;
  }
  return streams.find((c) => c.stream === stream || c.tap_stream_id === stream);
}

/**
 * Map one catalog (JSON-schema style) property definition to a polars dtype.
 *
 * @param {object|undefined} colType - `schema.properties[col]` from the catalog.
 * @returns {*} A polars dtype; `pl.Utf8` whenever the type is missing,
 *   ambiguous (more than one non-null type) or not specifically handled.
 */
function polarsTypeFor(colType) {
  if (!colType) {
    return pl.Utf8;
  }
  // For `anyOf`, prefer the alternative that declares a format; without one
  // the column is treated as an opaque object and ends up as Utf8.
  const anyOfList = colType.anyOf || [];
  const effective = anyOfList.length > 0
    ? anyOfList.find((t) => t.format) || { type: 'object' }
    : colType;
  if (effective.format === 'date-time') {
    // Call the factory so the schema holds a dtype rather than a function;
    // 'us' matches the unit used for `parseDates` in `get()`.
    return pl.Datetime('us');
  }
  if (effective.type) {
    const nonNullTypes = Array.isArray(effective.type)
      ? effective.type.filter((t) => t !== 'null')
      : [effective.type];
    if (nonNullTypes.length === 1) {
      switch (nonNullTypes[0]) {
        case 'integer':
          return pl.Int64;
        case 'number':
          return pl.Float64;
        case 'boolean':
          return pl.Bool;
        default:
          return pl.Utf8;
      }
    }
  }
  return pl.Utf8;
}

/**
 * Indexes the CSV/parquet files found in an input directory by stream name
 * and loads them as polars DataFrames, optionally typed from `catalog.json`.
 */
export class Reader {
  static ROOT_DIR = process.env.ROOT_DIR || '.';
  static INPUT_DIR = `${Reader.ROOT_DIR}/sync-output`;
  root;
  dir;
  inputFiles;

  /**
   * @param {string} [dir] - Directory (or single file) to scan for inputs.
   * @param {string} [root] - Directory in which `catalog.json` is looked up.
   */
  constructor(dir = Reader.INPUT_DIR, root = Reader.ROOT_DIR) {
    this.root = root;
    this.dir = dir;
    this.inputFiles = this.readDirectories();
  }

  /** @returns {string} JSON array of the discovered stream names. */
  toString() {
    return JSON.stringify(Object.keys(this.inputFiles));
  }

  /** @returns {string[]} The discovered stream names. */
  keys() {
    return Object.keys(this.inputFiles);
  }

  /**
   * Load the file registered for `stream` as a DataFrame.
   *
   * @param {string} stream - Stream name (a key of `inputFiles`).
   * @param {object} [options] - For CSV the whole object is also forwarded to
   *   `pl.readCSV`. Keys read here:
   *   `catalogTypes` (apply dtypes derived from catalog.json),
   *   `parseDates` (CSV only: columns to parse as datetimes),
   *   `chunksize` (parquet only: not implemented, triggers a warning).
   * @returns {*} The DataFrame, or null when the stream is unknown, the file
   *   format is unsupported, or reading fails (the failure is logged).
   */
  get(stream, options = {}) {
    const filepath = this.inputFiles[stream];
    if (!filepath) {
      return null;
    }

    if (filepath.endsWith('.parquet')) {
      try {
        // TODO: Implement chunked parquet reading if chunksize is specified
        if (options.chunksize) {
          console.warn('Chunked parquet reading not yet implemented, reading full file');
        }
        let df = pl.readParquet(filepath);
        if (options.catalogTypes) {
          const catalog = this.readCatalog();
          if (catalog) {
            // Hand over the columns already loaded so the schema lookup does
            // not try to sniff a parquet file with the CSV reader.
            const schema = this.getSchemaFromCatalog(catalog, stream, df.columns);
            for (const [colName, polarsType] of Object.entries(schema)) {
              if (!df.columns.includes(colName)) {
                continue;
              }
              try {
                df = df.withColumn(pl.col(colName).cast(polarsType));
              } catch (error) {
                // Best effort: a column that cannot be cast keeps its type.
                console.warn(`Failed to cast column ${colName} to ${polarsType}:`, error);
              }
            }
          }
        }
        return df;
      } catch (error) {
        console.error(`Failed to read parquet file ${filepath}:`, error);
        return null;
      }
    }

    if (filepath.endsWith('.csv')) {
      try {
        const readOptions = { ...options };
        if (options.catalogTypes) {
          const catalog = this.readCatalog();
          if (catalog) {
            const schema = this.getSchemaFromCatalog(catalog, stream);
            if (Object.keys(schema).length > 0) {
              // NOTE(review): confirm the option key nodejs-polars expects
              // here (`dtype` vs `dtypes`); kept as in the original.
              readOptions.dtype = schema;
            }
          }
        }
        let df = pl.readCSV(filepath, readOptions);
        // Handle date parsing if specified
        if (readOptions.parseDates) {
          for (const dateCol of readOptions.parseDates) {
            let lastError = null;
            let parsed = false;
            for (const format of DATE_FORMATS) {
              try {
                df = df.withColumn(pl.col(dateCol).str.strptime(pl.Datetime('us'), format));
                parsed = true;
                break;
              } catch (error) {
                lastError = error;
              }
            }
            if (!parsed) {
              console.warn(`Failed to parse dates for column ${dateCol}: ${lastError}`);
            }
          }
        }
        return df;
      } catch (error) {
        console.error(`Failed to read CSV file ${filepath}:`, error);
        return null;
      }
    }

    console.warn(`Unsupported file format for ${filepath}`);
    return null;
  }

  /**
   * Return file-level metadata for a stream. Always an empty object in this
   * implementation; parquet files additionally log a warning.
   *
   * @param {string} stream - Stream name.
   * @returns {object} Always `{}`.
   * @throws {Error} When no file is registered for `stream`.
   */
  getMetadata(stream) {
    const filepath = this.inputFiles[stream];
    if (!filepath) {
      throw new Error(`There is no file for stream with name ${stream}.`);
    }
    if (filepath.endsWith('.parquet')) {
      // Note: nodejs-polars doesn't expose parquet metadata like PyArrow
      // This is a limitation we'll need to work around
      console.warn('Parquet metadata extraction not fully supported in nodejs-polars');
    }
    return {};
  }

  /**
   * Resolve the primary-key columns of a stream: first from parquet metadata
   * (`key_properties`, JSON encoded), otherwise from the catalog's root
   * breadcrumb `table-key-properties`.
   *
   * @param {string} stream - Stream name.
   * @returns {string[]} Key column names; empty when none can be determined.
   */
  getPk(stream) {
    const filepath = this.inputFiles[stream];
    if (filepath && filepath.endsWith('.parquet')) {
      const metadata = this.getMetadata(stream);
      if (metadata.key_properties) {
        try {
          // Parse the key_properties if it's stored as a string
          return JSON.parse(metadata.key_properties);
        } catch (error) {
          console.warn(`Failed to parse key_properties from parquet metadata:`, error);
        }
      }
    }

    // Fallback to catalog for both CSV and parquet files
    const keyProperties = [];
    const streamInfo = findStream(this.readCatalog(), stream);
    if (streamInfo && Array.isArray(streamInfo.metadata)) {
      // The entry with an empty breadcrumb is the one read for key properties.
      const rootEntry = streamInfo.metadata.find(
        (s) => Array.isArray(s.breadcrumb) && s.breadcrumb.length === 0,
      );
      const tableKeyProperties = rootEntry?.metadata?.['table-key-properties'];
      if (Array.isArray(tableKeyProperties)) {
        keyProperties.push(...tableKeyProperties);
      }
    }
    return keyProperties;
  }

  /**
   * Build the stream-name -> file-path index for `this.dir`.
   *
   * The stream name is the file name without its `.csv`/`.parquet` extension,
   * truncated at the first `-`. When several files map to the same name the
   * first one encountered wins.
   *
   * @param {string[]} [ignore] - Stream names to leave out of the index.
   * @returns {Object<string, string>} Map of stream name to file path.
   */
  readDirectories(ignore = []) {
    const results = {};
    const allFiles = [];

    if (fs.existsSync(this.dir)) {
      if (fs.statSync(this.dir).isDirectory()) {
        for (const entry of fs.readdirSync(this.dir)) {
          const filePath = path.join(this.dir, entry);
          const isDataFile = filePath.endsWith('.csv') || filePath.endsWith('.parquet');
          if (fs.statSync(filePath).isFile() && isDataFile) {
            allFiles.push(filePath);
          }
        }
      } else {
        // `dir` points at a single file: index it as-is.
        allFiles.push(this.dir);
      }
    }

    for (const file of allFiles) {
      let entityType = path.basename(file).replace(/\.(csv|parquet)$/, '');
      if (entityType.includes('-')) {
        entityType = entityType.split('-')[0];
      }
      if (!results[entityType] && !ignore.includes(entityType)) {
        results[entityType] = file;
      }
    }
    return results;
  }

  /**
   * Read and parse `<root>/catalog.json`.
   *
   * @returns {object|null} The parsed catalog, or null when the file does not
   *   exist or cannot be read/parsed (a warning is logged in that case).
   */
  readCatalog() {
    const catalogPath = path.join(this.root, 'catalog.json');
    if (!fs.existsSync(catalogPath)) {
      return null;
    }
    try {
      return JSON.parse(fs.readFileSync(catalogPath, 'utf8'));
    } catch (error) {
      console.warn(`Failed to read catalog: ${error}`);
      return null;
    }
  }

  /**
   * Derive a column -> polars dtype mapping for a stream from the catalog.
   *
   * @param {object} catalog - Parsed catalog.json content.
   * @param {string} stream - Stream name.
   * @param {string[]|null} [headers] - Column names of the file. When omitted
   *   they are read from the file itself, using the reader that matches its
   *   extension.
   * @returns {Object<string, *>} Mapping with one entry per column; columns
   *   unknown to the catalog map to `pl.Utf8`. Empty when the stream has no
   *   file, no catalog entry, or its headers cannot be read.
   */
  getSchemaFromCatalog(catalog, stream, headers = null) {
    const filepath = this.inputFiles[stream];
    if (!filepath) {
      return {};
    }

    let columnNames = headers;
    if (!Array.isArray(columnNames)) {
      try {
        columnNames = filepath.endsWith('.parquet')
          ? pl.readParquet(filepath).columns
          : pl.readCSV(filepath, { nRows: 0 }).columns;
      } catch (error) {
        console.warn(`Failed to read headers from ${filepath}:`, error);
        return {};
      }
    }

    const streamInfo = findStream(catalog, stream);
    if (!streamInfo) {
      return {};
    }

    const types = streamInfo.schema?.properties ?? {};
    const schema = {};
    for (const col of columnNames) {
      schema[col] = polarsTypeFor(types[col]);
    }
    return schema;
  }
}
//# sourceMappingURL=reader.js.map