signalk-parquet

SignalK plugin to save marine data directly to Parquet files with regimen-based control
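
The compiled writer module, parquet-writer.js, is reproduced below. For orientation, here is a minimal usage sketch; the require path and data paths are assumptions (the file's actual location inside the package may differ), and the app object is a stand-in for the SignalK plugin app, of which only the debug method is used here:

// Usage sketch (assumed paths; not part of the package source).
const { ParquetWriter } = require('signalk-parquet/dist/parquet-writer'); // assumed location

const writer = new ParquetWriter({
    format: 'parquet',
    app: { debug: console.log }, // stand-in for the SignalK plugin app
});

// writeRecords() resolves to the path actually written, which can differ
// from the requested one (e.g. a .json fallback when @dsnp/parquetjs is
// not installed).
writer
    .writeRecords('/tmp/signalk/navigation.position/2024-06-01.parquet', [
        { received_timestamp: new Date().toISOString(), value: 42.1 },
    ])
    .then((written) => console.log(`wrote ${written}`))
    .catch((err) => console.error(err.message));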

"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.ParquetWriter = void 0; const fs = __importStar(require("fs-extra")); const path = __importStar(require("path")); // Try to import ParquetJS, fall back if not available // eslint-disable-next-line @typescript-eslint/no-explicit-any let parquet; try { parquet = require('@dsnp/parquetjs'); } catch (error) { parquet = null; } class ParquetWriter { constructor(options = { format: 'json' }) { this.format = options.format || 'json'; this.app = options.app; } async writeRecords(filepath, records) { try { await fs.ensureDir(path.dirname(filepath)); switch (this.format) { case 'json': return await this.writeJSON(filepath, records); case 'csv': return await this.writeCSV(filepath, records); case 'parquet': return await this.writeParquet(filepath, records); default: throw new Error(`Unsupported format: ${this.format}`); } } catch (error) { throw new Error(`Failed to write records: ${error.message}`); } } async writeJSON(filepath, records) { const jsonPath = filepath.replace(/\.(parquet|csv)$/, '.json'); await fs.writeJson(jsonPath, records, { spaces: 2 }); return jsonPath; } async writeCSV(filepath, records) { if (records.length === 0) return filepath; const csvPath = filepath.replace(/\.(parquet|json)$/, '.csv'); // Get all unique keys from all records const allKeys = new Set(); records.forEach(record => { Object.keys(record).forEach(key => allKeys.add(key)); }); const headers = Array.from(allKeys).sort(); const csvRows = [headers.join(',')]; records.forEach(record => { const row = headers.map(header => { // eslint-disable-next-line @typescript-eslint/no-explicit-any const value = record[header]; if (value === null || value === undefined) return ''; if (typeof value === 'string' && (value.includes(',') || value.includes('"'))) { return `"${value.replace(/"/g, '""')}"`; } return String(value); }); csvRows.push(row.join(',')); }); await fs.writeFile(csvPath, csvRows.join('\n')); return csvPath; } async writeParquet(filepath, records) { try { if (records.length === 0) { this.app?.debug('No records to write to Parquet file'); return filepath; } // Check if ParquetJS is available if (!parquet) { this.app?.debug('ParquetJS not available, falling back to JSON'); return await this.writeJSON(filepath, records); } // Use intelligent schema 
            const schema = this.createParquetSchema(records);
            // Create Parquet writer
            const writer = await parquet.ParquetWriter.openFile(schema, filepath);
            // Write records to Parquet file
            for (let i = 0; i < records.length; i++) {
                const record = records[i];
                // eslint-disable-next-line @typescript-eslint/no-explicit-any
                const cleanRecord = {};
                // Prepare record for typed Parquet schema
                const preparedRecord = this.prepareRecordForParquet(record, schema);
                Object.assign(cleanRecord, preparedRecord);
                await writer.appendRow(cleanRecord);
            }
            // Close the writer
            await writer.close();
            // Validate the written file
            const isValid = await this.validateParquetFile(filepath);
            if (!isValid) {
                // Move invalid file to quarantine and log
                const quarantineDir = path.join(path.dirname(filepath), 'quarantine');
                await fs.ensureDir(quarantineDir);
                const quarantineFile = path.join(quarantineDir, path.basename(filepath));
                await fs.move(filepath, quarantineFile);
                await this.logQuarantine(quarantineFile, 'write', 'File failed validation after write');
                throw new Error(`Parquet file failed validation after write, moved to quarantine: ${quarantineFile}`);
            }
            return filepath;
        }
        catch (error) {
            this.app?.debug(`❌ Parquet writing failed: ${error.message}`);
            this.app?.debug(`Error stack: ${error.stack}`);
            // Save to failed directory to maintain schema consistency
            const failedDir = path.join(path.dirname(filepath), 'failed');
            await fs.ensureDir(failedDir);
            const failedPath = path.join(failedDir, path.basename(filepath).replace('.parquet', '_FAILED.json'));
            this.app?.debug(`💾 Saving failed Parquet data as JSON to: ${failedPath}`);
            this.app?.debug('⚠️ This data will need manual conversion to maintain DuckDB schema consistency');
            await this.writeJSON(failedPath, records);
            // Throw error to alert system that Parquet writing is broken
            throw new Error(`Parquet writing failed for ${filepath}. Data saved to ${failedPath} for recovery.`);
        }
    }
    // Create Parquet schema based on sample records
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    createParquetSchema(records) {
        if (!parquet || records.length === 0) {
            this.app?.debug('createParquetSchema: No parquet lib or empty records, throwing error');
            throw new Error('Cannot create Parquet schema');
        }
        // Get all unique column names from all records
        const allColumns = new Set();
        records.forEach(record => {
            Object.keys(record).forEach(key => allColumns.add(key));
        });
        const columns = Array.from(allColumns).sort();
        const schemaFields = {};
        // Analyze each column to determine the best Parquet type
        columns.forEach(colName => {
            const values = records
                // eslint-disable-next-line @typescript-eslint/no-explicit-any
                .map(r => r[colName])
                .filter(v => v !== null && v !== undefined);
            if (values.length === 0) {
                // All null values, default to string
                schemaFields[colName] = { type: 'UTF8', optional: true };
                return;
            }
            const hasNumbers = values.some(v => typeof v === 'number');
            const hasStrings = values.some(v => typeof v === 'string');
            const hasBooleans = values.some(v => typeof v === 'boolean');
            const hasBigInts = values.some(v => typeof v === 'bigint');
            if (hasBigInts && !hasNumbers && !hasStrings && !hasBooleans) {
                // All BigInts - use UTF8 to be safe
                schemaFields[colName] = { type: 'UTF8', optional: true };
            }
            else if (hasNumbers && !hasStrings && !hasBooleans && !hasBigInts) {
                // All numbers - check if integers or floats
                const allIntegers = values.every(v => Number.isInteger(v));
                schemaFields[colName] = {
                    type: allIntegers ? 'INT64' : 'DOUBLE',
                    optional: true,
                };
            }
            else if (hasBooleans && !hasNumbers && !hasStrings && !hasBigInts) {
                schemaFields[colName] = { type: 'BOOLEAN', optional: true };
            }
            else {
                // Mixed types or strings - use UTF8
                schemaFields[colName] = { type: 'UTF8', optional: true };
            }
        });
        return new parquet.ParquetSchema(schemaFields);
    }
    // Prepare a record for typed Parquet writing
    prepareRecordForParquet(record,
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    schema) {
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const cleanRecord = {};
        const schemaFields = schema.schema;
        Object.keys(schemaFields).forEach(fieldName => {
            // eslint-disable-next-line @typescript-eslint/no-explicit-any
            const value = record[fieldName];
            const fieldType = schemaFields[fieldName].type;
            if (value === null || value === undefined) {
                cleanRecord[fieldName] = null;
            }
            else if (typeof value === 'bigint') {
                // Handle BigInt values by converting to appropriate type
                switch (fieldType) {
                    case 'DOUBLE':
                    case 'FLOAT':
                        cleanRecord[fieldName] = Number(value);
                        break;
                    case 'INT64':
                    case 'INT32':
                        // Convert BigInt to number if it fits in safe integer range
                        if (value <= Number.MAX_SAFE_INTEGER && value >= Number.MIN_SAFE_INTEGER) {
                            cleanRecord[fieldName] = Number(value);
                        }
                        else {
                            cleanRecord[fieldName] = value.toString();
                        }
                        break;
                    case 'UTF8':
                    default:
                        cleanRecord[fieldName] = value.toString();
                        break;
                }
            }
            else {
                switch (fieldType) {
                    case 'DOUBLE':
                    case 'FLOAT':
                        cleanRecord[fieldName] = typeof value === 'number' ? value : parseFloat(String(value));
                        break;
                    case 'INT64':
                    case 'INT32':
                        cleanRecord[fieldName] = typeof value === 'number' ? Math.round(value) : parseInt(String(value), 10);
                        break;
                    case 'BOOLEAN':
                        cleanRecord[fieldName] = typeof value === 'boolean' ? value : Boolean(value);
                        break;
                    case 'UTF8':
                    default:
                        if (typeof value === 'object') {
                            cleanRecord[fieldName] = JSON.stringify(value);
                        }
                        else {
                            cleanRecord[fieldName] = String(value);
                        }
                        break;
                }
            }
        });
        return cleanRecord;
    }
    // Merge multiple files (for daily consolidation like Python version)
    async mergeFiles(sourceFiles, targetFile) {
        try {
            const allRecords = [];
            for (const sourceFile of sourceFiles) {
                if (await fs.pathExists(sourceFile)) {
                    const ext = path.extname(sourceFile).toLowerCase();
                    if (ext === '.json') {
                        const records = await fs.readJson(sourceFile);
                        allRecords.push(...(Array.isArray(records) ? records : [records]));
                    }
                    else if (ext === '.parquet') {
                        // Read Parquet file
                        if (parquet) {
                            try {
                                const reader = await parquet.ParquetReader.openFile(sourceFile);
                                const cursor = reader.getCursor();
                                let record = null;
                                while ((record = await cursor.next())) {
                                    allRecords.push(record);
                                }
                                await reader.close();
                            }
                            catch (parquetError) {
                                this.app?.debug(`Failed to read Parquet file ${sourceFile}: ${parquetError.message}`);
                            }
                        }
                    }
                    else if (ext === '.csv') {
                        // Could implement CSV reading if needed
                        this.app?.debug(`CSV merging not implemented for ${sourceFile}`);
                    }
                }
            }
            if (allRecords.length > 0) {
                // Sort by timestamp
                allRecords.sort((a, b) => {
                    const timeA = a.received_timestamp || a.signalk_timestamp || '';
                    const timeB = b.received_timestamp || b.signalk_timestamp || '';
                    return timeA.localeCompare(timeB);
                });
                await this.writeRecords(targetFile, allRecords);
                return allRecords.length;
            }
            return 0;
        }
        catch (error) {
            throw new Error(`Failed to merge files: ${error.message}`);
        }
    }
    // Validate parquet file for corruption
    async validateParquetFile(filepath) {
        try {
            if (!parquet || !(await fs.pathExists(filepath))) {
                return false;
            }
            // Check file size (must be > 100 bytes as per existing logic)
            const stats = await fs.stat(filepath);
            const fileSize = stats.size;
            if (fileSize < 100) {
                this.app?.debug(`❌ Parquet file too small: ${filepath} (${fileSize} bytes)`);
                return false;
            }
            // Try to open and read the parquet file
            try {
                const reader = await parquet.ParquetReader.openFile(filepath);
                const cursor = reader.getCursor();
                // Try to read first record to verify file structure
                const firstRecord = await cursor.next();
                await reader.close();
                // Log file size for debugging (matches stat output format)
                this.app?.debug(`✅ Valid parquet file: ${fileSize.toString().padStart(12, ' ')} ${filepath}`);
                return firstRecord !== null;
            }
            catch (readError) {
                this.app?.debug(`❌ Parquet file read failed: ${filepath} - ${readError.message}`);
                return false;
            }
        }
        catch (error) {
            this.app?.debug(`❌ Parquet validation error: ${filepath} - ${error.message}`);
            return false;
        }
    }
    // Log quarantined files
    async logQuarantine(filepath, operation, reason) {
        try {
            const stats = await fs.stat(filepath);
            const logEntry = {
                timestamp: new Date().toISOString(),
                filepath,
                fileSize: stats.size,
                operation,
                reason,
                formattedSize: `${stats.size.toString().padStart(12, ' ')} ${filepath}`
            };
            const quarantineDir = path.dirname(filepath);
            const logFile = path.join(quarantineDir, 'quarantine.log');
            // Append to log file
            const logLine = `${logEntry.timestamp} | ${logEntry.operation} | ${logEntry.fileSize} bytes | ${logEntry.reason} | ${filepath}\n`;
            await fs.appendFile(logFile, logLine);
            this.app?.debug(`📋 Quarantine logged: ${logEntry.formattedSize}`);
        }
        catch (error) {
            this.app?.debug(`Failed to log quarantine entry: ${error.message}`);
        }
    }
    // Daily file consolidation (matching Python behavior)
    async consolidateDaily(dataDir, date, filenamePrefix = 'signalk_data') {
        try {
            const dateStr = date.toISOString().split('T')[0]; // YYYY-MM-DD
            const consolidatedFiles = [];
            // Walk through all topic directories
            const walkDir = async (dir) => {
                const items = await fs.readdir(dir);
                for (const item of items) {
                    const itemPath = path.join(dir, item);
                    const stat = await fs.stat(itemPath);
                    if (stat.isDirectory() && item !== 'processed') {
                        await walkDir(itemPath);
                    }
                    else if (item.includes(dateStr) && !item.includes('_consolidated')) {
                        // This is a file for our target date
                        const topicDir = path.dirname(itemPath);
                        const consolidatedFile = path.join(topicDir, `${filenamePrefix}_${dateStr}_consolidated.parquet`);
                        if (!consolidatedFiles.find(f => f.target === consolidatedFile)) {
                            consolidatedFiles.push({
                                target: consolidatedFile,
                                sources: [],
                            });
                        }
                        const entry = consolidatedFiles.find(f => f.target === consolidatedFile);
                        if (entry) {
                            entry.sources.push(itemPath);
                        }
                    }
                }
            };
            await walkDir(dataDir);
            // Consolidate each topic's files
            for (const entry of consolidatedFiles) {
                const recordCount = await this.mergeFiles(entry.sources, entry.target);
                this.app?.debug(`Consolidated ${entry.sources.length} files into ${entry.target} (${recordCount} records)`);
                // Validate consolidated parquet file
                const isValid = await this.validateParquetFile(entry.target);
                if (!isValid) {
                    // Move corrupt file to quarantine
                    const quarantineDir = path.join(path.dirname(entry.target), 'quarantine');
                    await fs.ensureDir(quarantineDir);
                    const quarantineFile = path.join(quarantineDir, path.basename(entry.target));
                    await fs.move(entry.target, quarantineFile);
                    // Log to quarantine log
                    await this.logQuarantine(quarantineFile, 'consolidation', 'File failed validation after consolidation');
                    this.app?.debug(`⚠️ Moved corrupt file to quarantine: ${quarantineFile}`);
                    continue; // Skip moving source files since consolidation failed
                }
                // Move source files to processed folder
                const processedDir = path.join(path.dirname(entry.target), 'processed');
                await fs.ensureDir(processedDir);
                for (const sourceFile of entry.sources) {
                    const basename = path.basename(sourceFile);
                    const processedFile = path.join(processedDir, basename);
                    await fs.move(sourceFile, processedFile);
                }
            }
            return consolidatedFiles.length;
        }
        catch (error) {
            throw new Error(`Failed to consolidate daily files: ${error.message}`);
        }
    }
}
exports.ParquetWriter = ParquetWriter;
//# sourceMappingURL=parquet-writer.js.map
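
A companion sketch for the daily consolidation path, under the same assumptions as the earlier example; the data-directory layout (one subdirectory per topic with date-stamped filenames) is inferred from consolidateDaily's directory walk:

// Consolidation sketch (assumed paths; not part of the package source).
const { ParquetWriter } = require('signalk-parquet/dist/parquet-writer'); // assumed location

async function consolidateYesterday() {
    const writer = new ParquetWriter({ format: 'parquet', app: { debug: console.log } });
    const yesterday = new Date(Date.now() - 24 * 60 * 60 * 1000);
    // Merges each topic's date-stamped files into one *_consolidated.parquet,
    // moves sources to processed/ on success, and quarantines corrupt output
    // (with a quarantine.log entry). Resolves to the number of topics touched.
    const count = await writer.consolidateDaily('/data/signalk-parquet', yesterday, 'signalk_data');
    console.log(`consolidated ${count} topic(s)`);
}

consolidateYesterday().catch((err) => console.error(err.message));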