@hotglue/gluestick-ts
Version:
TypeScript version of the gluestick ETL library for hotglue IPaaS platform
255 lines • 10.1 kB
JavaScript
import fs from 'fs-extra';
import * as path from 'path';
import pl from 'nodejs-polars';
export class Reader {
static ROOT_DIR = process.env.ROOT_DIR || '.';
static INPUT_DIR = `${Reader.ROOT_DIR}/sync-output`;
root;
dir;
inputFiles;
constructor(dir = Reader.INPUT_DIR, root = Reader.ROOT_DIR) {
this.root = root;
this.dir = dir;
this.inputFiles = this.readDirectories();
}
toString() {
return JSON.stringify(Object.keys(this.inputFiles));
}
keys() {
return Object.keys(this.inputFiles);
}
get(stream, options = {}) {
const filepath = this.inputFiles[stream];
if (!filepath) {
return null;
}
if (filepath.endsWith('.parquet')) {
try {
// TODO: Implement chunked parquet reading if chunksize is specified
if (options.chunksize) {
console.warn('Chunked parquet reading not yet implemented, reading full file');
}
let df = pl.readParquet(filepath);
// Apply catalog types if requested
if (options.catalogTypes) {
const catalog = this.readCatalog();
if (catalog) {
const schema = this.getSchemaFromCatalog(catalog, stream);
if (Object.keys(schema).length > 0) {
// Apply schema transformations to the DataFrame
for (const [colName, polarsType] of Object.entries(schema)) {
if (df.columns.includes(colName)) {
try {
df = df.withColumn(pl.col(colName).cast(polarsType));
}
catch (error) {
console.warn(`Failed to cast column ${colName} to ${polarsType}:`, error);
}
}
}
}
}
}
return df;
}
catch (error) {
console.error(`Failed to read parquet file ${filepath}:`, error);
return null;
}
}
if (filepath.endsWith('.csv')) {
const catalog = this.readCatalog();
let readOptions = { ...options };
if (catalog && options.catalogTypes) {
const schema = this.getSchemaFromCatalog(catalog, stream);
if (Object.keys(schema).length > 0) {
readOptions.dtype = schema;
}
}
try {
let df = pl.readCSV(filepath, readOptions);
// Handle date parsing if specified
if (readOptions.parseDates) {
for (const dateCol of readOptions.parseDates) {
try {
df = df.withColumn(pl.col(dateCol).str.strptime(pl.Datetime('us'), '%Y-%m-%d %H:%M:%S%.f'));
}
catch (e) {
// Try alternative date format
try {
df = df.withColumn(pl.col(dateCol).str.strptime(pl.Datetime('us'), '%Y-%m-%d'));
}
catch (e2) {
console.warn(`Failed to parse dates for column ${dateCol}: ${e2}`);
}
}
}
}
return df;
}
catch (error) {
console.error(`Failed to read CSV file ${filepath}:`, error);
return null;
}
}
console.warn(`Unsupported file format for ${filepath}`);
return null;
}
getMetadata(stream) {
const filepath = this.inputFiles[stream];
if (!filepath) {
throw new Error(`There is no file for stream with name ${stream}.`);
}
if (filepath.endsWith('.parquet')) {
try {
// Note: nodejs-polars doesn't expose parquet metadata like PyArrow
// This is a limitation we'll need to work around
console.warn('Parquet metadata extraction not fully supported in nodejs-polars');
return {};
}
catch (error) {
console.error(`Failed to read parquet metadata for ${filepath}:`, error);
return {};
}
}
return {};
}
getPk(stream) {
const keyProperties = [];
const filepath = this.inputFiles[stream];
if (filepath && filepath.endsWith('.parquet')) {
// Try to get key properties from parquet metadata
const metadata = this.getMetadata(stream);
if (metadata.key_properties) {
try {
// Parse the key_properties if it's stored as a string
return JSON.parse(metadata.key_properties);
}
catch (error) {
console.warn(`Failed to parse key_properties from parquet metadata:`, error);
}
}
}
// Fallback to catalog for both CSV and parquet files
const catalog = this.readCatalog();
if (catalog) {
const streamInfo = catalog.streams.find((c) => c.stream === stream || c.tap_stream_id === stream);
if (streamInfo && streamInfo.metadata) {
const breadcrumb = streamInfo.metadata.find((s) => s.breadcrumb.length === 0);
if (breadcrumb && breadcrumb.metadata) {
const tableKeyProperties = breadcrumb.metadata['table-key-properties'];
if (Array.isArray(tableKeyProperties)) {
keyProperties.push(...tableKeyProperties);
}
}
}
}
return keyProperties;
}
readDirectories(ignore = []) {
const results = {};
let allFiles = [];
if (fs.existsSync(this.dir) && fs.statSync(this.dir).isDirectory()) {
const entries = fs.readdirSync(this.dir);
for (const entry of entries) {
const filePath = path.join(this.dir, entry);
if (fs.statSync(filePath).isFile()) {
if (filePath.endsWith('.csv') || filePath.endsWith('.parquet')) {
allFiles.push(filePath);
}
}
}
}
else if (fs.existsSync(this.dir)) {
allFiles.push(this.dir);
}
for (const file of allFiles) {
const filename = path.basename(file);
let entityType = filename.replace(/\.(csv|parquet)$/, '');
if (entityType.includes('-')) {
entityType = entityType.split('-')[0];
}
if (!results[entityType] && !ignore.includes(entityType)) {
results[entityType] = file;
}
}
return results;
}
readCatalog() {
const catalogPath = path.join(this.root, 'catalog.json');
if (fs.existsSync(catalogPath)) {
try {
const catalogData = fs.readFileSync(catalogPath, 'utf8');
return JSON.parse(catalogData);
}
catch (error) {
console.warn(`Failed to read catalog: ${error}`);
return null;
}
}
return null;
}
getSchemaFromCatalog(catalog, stream) {
const filepath = this.inputFiles[stream];
if (!filepath) {
return {};
}
// Get headers from CSV
let headers = [];
try {
const df = pl.readCSV(filepath, { nRows: 0 });
headers = df.columns;
}
catch (error) {
console.warn(`Failed to read headers from ${filepath}:`, error);
return {};
}
const streamInfo = catalog.streams.find((c) => c.stream === stream || c.tap_stream_id === stream);
if (!streamInfo) {
return {};
}
const types = streamInfo.schema.properties;
const schema = {};
for (const col of headers) {
const colType = types[col];
if (colType) {
// Handle anyOf types
const anyOfList = colType.anyOf || [];
let finalColType = colType;
if (anyOfList.length > 0) {
const typeWithFormat = anyOfList.find((t) => t.format);
finalColType = typeWithFormat || { type: 'object' };
}
if (finalColType.format === 'date-time') {
schema[col] = pl.Datetime;
continue;
}
if (finalColType.type) {
const catalogType = Array.isArray(finalColType.type)
? finalColType.type.filter((t) => t !== 'null')
: [finalColType.type];
if (catalogType.length === 1) {
switch (catalogType[0]) {
case 'integer':
schema[col] = pl.Int64;
break;
case 'number':
schema[col] = pl.Float64;
break;
case 'boolean':
schema[col] = pl.Bool;
break;
default:
schema[col] = pl.Utf8;
break;
}
continue;
}
}
}
schema[col] = pl.Utf8;
}
return schema;
}
}
//# sourceMappingURL=reader.js.map