UNPKG

@hotglue/gluestick-ts

Version:

TypeScript version of the gluestick ETL library for hotglue IPaaS platform

307 lines 11.9 kB
import fs from 'fs-extra';
import * as path from 'path';
import pl from 'nodejs-polars';
import { Reader } from './reader.js';

/**
 * Singer JSON-schema fragments, keyed by the simplified column type names
 * returned by `getColumnType` (plus `array`, used for untyped list columns).
 */
const TYPE_MAPPING = {
    float: { type: ["number", "null"] },
    int: { type: ["integer", "null"] },
    bool: { type: ["boolean", "null"] },
    str: { type: ["string", "null"] },
    date: {
        format: "date-time",
        type: ["string", "null"],
    },
    array: { type: ["array", "null"], items: { type: ["object", "string", "null"] } },
};

/**
 * True for any non-null, non-array object (this includes Date instances).
 */
function isNonArrayObject(value) {
    return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/**
 * Infer a Singer schema for a column whose dtype has no TYPE_MAPPING entry,
 * by inspecting the column's non-null values.
 *
 * @param {Array} values - Raw column values (may contain null/undefined).
 * @param {boolean} recursiveTyping - When true, array columns get an `items`
 *   schema built from the merged keys of every object element; when false
 *   they get the generic TYPE_MAPPING.array schema.
 * @returns {object} Singer schema fragment for the column.
 */
function inferObjectColumnSchema(values, recursiveTyping) {
    const present = values.filter((v) => v != null);
    if (present.length === 0) {
        return TYPE_MAPPING.str;
    }
    const firstValue = present[0];
    if (Array.isArray(firstValue)) {
        if (!recursiveTyping) {
            return TYPE_MAPPING.array;
        }
        const merged = {};
        for (const row of present) {
            if (!Array.isArray(row)) {
                continue;
            }
            for (const item of row) {
                // Only merge object elements. Object.assign on a string or a
                // nested array would copy its indexed characters/elements in
                // as bogus properties named "0", "1", ...
                if (isNonArrayObject(item)) {
                    Object.assign(merged, item);
                }
            }
        }
        if (Object.keys(merged).length === 0) {
            // No object keys were found: describe the items as strings.
            return {
                items: TYPE_MAPPING.str,
                type: ["array", "null"],
            };
        }
        return { type: ["array", "null"], items: toSingerSchema(merged) };
    }
    if (isNonArrayObject(firstValue)) {
        return toSingerSchema(firstValue);
    }
    return TYPE_MAPPING.str;
}

/**
 * Replace a column with JSON-serialized strings when it holds objects or
 * arrays. Returns the frame unchanged when the column has no such values or
 * when the replacement fails (a warning is logged in that case).
 *
 * @param {object} frame - DataFrame to update.
 * @param {string} col - Column name.
 * @param {Array} values - Raw values of that column.
 * @returns {object} The updated (or original) DataFrame.
 */
function serializeComplexColumn(frame, col, values) {
    // typeof [] is 'object', so this test also covers arrays.
    const isComplex = (v) => v != null && typeof v === 'object';
    if (!values.some(isComplex)) {
        return frame;
    }
    try {
        const serializedValues = values.map((value) => (isComplex(value) ? JSON.stringify(deepConvertDatetimes(value)) : value));
        return frame.withColumn(pl.lit(serializedValues).alias(col));
    } catch (error) {
        console.warn(`Failed to serialize complex objects in column ${col}:`, error);
        return frame;
    }
}

/**
 * Build the Singer schema header for a DataFrame and return a DataFrame whose
 * values match that header.
 *
 * @param {object} df - Source DataFrame.
 * @param {boolean} allowObjects - When true, columns of unrecognized dtype get
 *   an inferred object/array schema; when false they are typed as strings and
 *   any object/array values are JSON-serialized.
 * @param {object} [schema] - Optional schema. If given and `catalogSchema` is
 *   false it is returned verbatim together with the untouched `df`.
 * @param {boolean} [catalogSchema=false] - When true (and `schema` is given),
 *   `schema.properties` overrides the inferred properties; columns that are
 *   not in the schema keep their inferred type.
 * @param {boolean} [recursiveTyping=true] - Forwarded to array-schema inference.
 * @returns {Array} A two-element array `[dataFrame, headerMap]`.
 */
export function genSingerHeader(df, allowObjects, schema, catalogSchema = false, recursiveTyping = true) {
    if (schema && !catalogSchema) {
        return [df, schema];
    }
    const headerMap = { type: ["object", "null"], properties: {} };
    let modifiedDf = df.clone();
    for (const col of df.columns) {
        const dtype = df.getColumn(col).dtype.toString().toLowerCase();
        const colType = getColumnType(dtype);
        if (colType === 'date') {
            // Convert datetime columns to ISO string format
            try {
                modifiedDf = modifiedDf.withColumn(pl.col(col).date.strftime("%Y-%m-%dT%H:%M:%S.%fZ"));
            } catch (error) {
                // If conversion fails, keep the column as it is
                console.warn(`Failed to convert datetime column ${col}:`, error);
            }
        }
        if (colType && TYPE_MAPPING[colType]) {
            headerMap.properties[col] = TYPE_MAPPING[colType];
        } else if (allowObjects) {
            headerMap.properties[col] = inferObjectColumnSchema(df.getColumn(col).toArray(), recursiveTyping);
        } else {
            headerMap.properties[col] = TYPE_MAPPING.str;
            // Serialize complex objects to strings for Singer compatibility
            modifiedDf = serializeComplexColumn(modifiedDf, col, df.getColumn(col).toArray());
        }
    }
    // Update schema using types from catalog and keeping extra columns
    if (catalogSchema && schema) {
        Object.assign(headerMap.properties, schema.properties);
    }
    return [modifiedDf, headerMap];
}

/**
 * Derive a Singer schema fragment from a sample JavaScript value.
 *
 * Objects are described recursively by their own enumerable keys, arrays by
 * their first element (empty arrays get string items), Dates as date-time
 * strings, booleans and numbers by their JS type (integers are distinguished
 * via Number.isInteger), and everything else as a nullable string.
 *
 * @param {*} input - Sample value.
 * @returns {object} Singer schema fragment.
 */
export function toSingerSchema(input) {
    if (input instanceof Date) {
        // Dates are written out as ISO strings by deepConvertDatetimes, so the
        // schema has to say "string", not "object".
        return { format: "date-time", type: ["string", "null"] };
    }
    if (Array.isArray(input)) {
        if (input.length > 0) {
            return { type: ["array", "null"], items: toSingerSchema(input[0]) };
        }
        return {
            items: { type: ["string", "null"] },
            type: ["array", "null"]
        };
    }
    if (typeof input === 'object' && input !== null) {
        const properties = {};
        for (const [k, v] of Object.entries(input)) {
            properties[k] = toSingerSchema(v);
        }
        return { type: ["object", "null"], properties };
    }
    if (typeof input === 'boolean') {
        return { type: ["boolean", "null"] };
    }
    if (typeof input === 'number') {
        return Number.isInteger(input)
            ? { type: ["integer", "null"] }
            : { type: ["number", "null"] };
    }
    return { type: ["string", "null"] };
}

/**
 * Recursively copy a value, replacing every Date with its ISO-8601 string.
 *
 * @param {*} value - Any value; arrays and objects are traversed.
 * @returns {*} A new array/object for containers, an ISO string for Dates,
 *   and the value itself for everything else.
 */
export function deepConvertDatetimes(value) {
    // Must run before the generic object branch: typeof a Date is 'object',
    // and Object.entries(date) is empty, so a Date would otherwise become {}.
    if (value instanceof Date) {
        return value.toISOString();
    }
    if (Array.isArray(value)) {
        return value.map((child) => deepConvertDatetimes(child));
    }
    if (typeof value === 'object' && value !== null) {
        const result = {};
        for (const [k, v] of Object.entries(value)) {
            result[k] = deepConvertDatetimes(v);
        }
        return result;
    }
    return value;
}

/**
 * Parse a value as JSON when it is a string.
 *
 * @param {*} x - Candidate value.
 * @returns {*} The parsed JSON for parseable strings; otherwise `x` unchanged.
 */
export function parseObjs(x) {
    if (typeof x !== 'string') {
        return x;
    }
    try {
        return JSON.parse(x);
    } catch (error) {
        // If JSON parsing fails, return the original string
        return x;
    }
}

/**
 * Look up a stream's schema in the catalog provided by `Reader`.
 *
 * @param {string} stream - Matched against each catalog entry's `stream` or
 *   `tap_stream_id`.
 * @returns {object} `{ type, properties }`, where every array-typed property
 *   is guaranteed to have an `items` entry.
 * @throws {Error} If there is no catalog, or no entry/schema for the stream.
 */
export function getCatalogSchema(stream) {
    const reader = new Reader();
    const catalog = reader['readCatalog'](); // Access private method
    if (!catalog) {
        throw new Error(`No catalog found`);
    }
    const streamInfo = (catalog.streams ?? []).find((s) => s.stream === stream || s.tap_stream_id === stream);
    if (!streamInfo || !streamInfo.schema) {
        throw new Error(`No schema found in catalog for stream ${stream}`);
    }
    const schema = streamInfo.schema;
    const relevantSchema = {
        type: schema.type || ["object", "null"],
        properties: schema.properties || {}
    };
    // Ensure every array type has an items dict
    for (const prop of Object.values(relevantSchema.properties)) {
        if (!prop || typeof prop !== 'object') {
            continue;
        }
        const propType = prop.type;
        const isArrayType = propType === "array" || (Array.isArray(propType) && propType.includes("array"));
        if (isArrayType && !prop.items) {
            prop.items = {};
        }
    }
    return relevantSchema;
}

/**
 * JSON-parse the string values of every column that the schema declares as
 * an object or array.
 *
 * @param {object} df - Source DataFrame (not modified; a clone is returned).
 * @param {object} schema - Schema whose `properties[col].type` is a type name
 *   or a list of type names.
 * @returns {object} A DataFrame with the affected columns replaced. A column
 *   whose replacement fails is left as-is and a warning is logged.
 */
export function parseDfCols(df, schema) {
    let modifiedDf = df.clone();
    for (const col of df.columns) {
        const colType = schema.properties[col]?.type || [];
        const typeArray = Array.isArray(colType) ? colType : [colType];
        // Check if column should contain objects or arrays
        const shouldParseObjects = typeArray.some((type) => ["object", "array"].includes(type));
        if (!shouldParseObjects) {
            continue;
        }
        try {
            const parsedValues = df.getColumn(col).toArray().map((value) => parseObjs(value));
            modifiedDf = modifiedDf.withColumn(pl.lit(parsedValues).alias(col));
        } catch (error) {
            console.warn(`Failed to parse objects in column ${col}:`, error);
        }
    }
    return modifiedDf;
}

/**
 * Write a DataFrame to a Singer-format file: one SCHEMA line, one RECORD line
 * per row, then one STATE line. The output is appended when the file already
 * exists.
 *
 * @param {object} df - DataFrame to export.
 * @param {string} stream - Stream name written on the SCHEMA/RECORD lines.
 * @param {string} outputDir - Directory for the output file (created if missing).
 * @param {object} [options]
 * @param {string[]} [options.keys=[]] - Written as `key_properties`.
 * @param {string} [options.filename="data.singer"] - Output file name.
 * @param {boolean} [options.allowObjects=false] - Forwarded to genSingerHeader.
 * @param {object} [options.schema] - Explicit schema; replaced by the catalog
 *   schema when catalog mode or `catalogStream` is in effect.
 * @param {boolean} [options.keepNullFields=false] - Keep null-valued fields in
 *   each record.
 * @param {string} [options.catalogStream] - Catalog stream name to take the
 *   schema from (defaults to `stream` in catalog mode).
 * @param {boolean} [options.recursiveTyping=true] - Forwarded to genSingerHeader.
 * @returns {void}
 */
export function toSinger(df, stream, outputDir, options = {}) {
    const { keys = [], filename = "data.singer", allowObjects = false, schema, keepNullFields = false, catalogStream, recursiveTyping = true } = options;
    // Catalog mode is switched on through the USE_CATALOG_SCHEMA env variable.
    const catalogSchema = process.env.USE_CATALOG_SCHEMA?.toLowerCase() === "true";
    let processedDf = df.clone();
    let finalSchema = schema;
    // Columns that are entirely null are NOT dropped from the frame here;
    // null values are filtered out of each record further down instead.
    if (catalogSchema || catalogStream) {
        const streamName = catalogStream || stream;
        finalSchema = getCatalogSchema(streamName);
        processedDf = parseDfCols(processedDf, finalSchema);
    }
    const [modifiedDf, headerMap] = genSingerHeader(processedDf, allowObjects || catalogSchema, finalSchema, catalogSchema, recursiveTyping);
    // Ensure output directory exists
    fs.ensureDirSync(outputDir);
    const outputPath = path.join(outputDir, filename);
    const lines = [];
    // Write singer format
    lines.push(JSON.stringify({
        type: "SCHEMA",
        stream: stream,
        schema: headerMap,
        key_properties: keys
    }));
    // Convert DataFrame to records and write each as a RECORD
    const dropNulls = !catalogSchema && !keepNullFields;
    for (const record of modifiedDf.toRecords()) {
        // Filter null values unless we want to keep them
        const kept = dropNulls
            ? Object.fromEntries(Object.entries(record).filter(([, value]) => value != null))
            : record;
        lines.push(JSON.stringify({
            type: "RECORD",
            stream: stream,
            record: deepConvertDatetimes(kept)
        }));
    }
    // Write state record
    lines.push(JSON.stringify({ type: "STATE", value: {} }));
    // Appending creates the file when it does not exist yet, so a single call
    // covers both the "new file" and "existing file" cases.
    fs.appendFileSync(outputPath, lines.map((line) => line + '\n').join(''));
}

/**
 * Map a lower-cased dtype string to a TYPE_MAPPING key.
 *
 * @param {string} dtype - Lower-cased dtype description.
 * @returns {string|null} 'date', 'float', 'int', 'bool' or 'str'; null when
 *   the dtype is nested (list/array/struct) or not recognized.
 */
function getColumnType(dtype) {
    // Nested dtypes mention their inner type (e.g. "list(int64)") and "struct"
    // itself contains "str", so they must be ruled out before the substring
    // checks below would mislabel them as scalars.
    if (dtype.includes('list') || dtype.includes('array') || dtype.includes('struct')) {
        return null;
    }
    if (dtype.includes('date') || dtype.includes('time')) {
        return 'date';
    }
    if (dtype.includes('float') || dtype.includes('double')) {
        return 'float';
    }
    if (dtype.includes('int')) {
        return 'int';
    }
    if (dtype.includes('bool')) {
        return 'bool';
    }
    if (dtype.includes('str') || dtype.includes('utf8')) {
        return 'str';
    }
    return null;
}
//# sourceMappingURL=singer.js.map