signalk-parquet
Version:
SignalK plugin to save marine data directly to Parquet files with regimen-based control
464 lines • 21.4 kB
JavaScript
"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.ParquetWriter = void 0;
const fs = __importStar(require("fs-extra"));
const path = __importStar(require("path"));
// Try to import ParquetJS, fall back if not available
// eslint-disable-next-line @typescript-eslint/no-explicit-any
let parquet;
try {
parquet = require('@dsnp/parquetjs');
}
catch (error) {
parquet = null;
}
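// Writes SignalK records to JSON, CSV, or Parquet files, with schema
// inference, post-write validation, quarantine handling, and daily
// consolidation of per-topic files.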
class ParquetWriter {
constructor(options = { format: 'json' }) {
this.format = options.format || 'json';
this.app = options.app;
}
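// Write records to filepath in the configured format; returns the path
// actually written (the extension is adjusted to match the format).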
async writeRecords(filepath, records) {
try {
await fs.ensureDir(path.dirname(filepath));
switch (this.format) {
case 'json':
return await this.writeJSON(filepath, records);
case 'csv':
return await this.writeCSV(filepath, records);
case 'parquet':
return await this.writeParquet(filepath, records);
default:
throw new Error(`Unsupported format: ${this.format}`);
}
}
catch (error) {
throw new Error(`Failed to write records: ${error.message}`);
}
}
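// Write records as pretty-printed JSON, swapping the file extension to .json.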
async writeJSON(filepath, records) {
const jsonPath = filepath.replace(/\.(parquet|csv)$/, '.json');
await fs.writeJson(jsonPath, records, { spaces: 2 });
return jsonPath;
}
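// Write records as CSV; the header row is the sorted union of keys
// across all records.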
async writeCSV(filepath, records) {
if (records.length === 0)
return filepath;
const csvPath = filepath.replace(/\.(parquet|json)$/, '.csv');
// Get all unique keys from all records
const allKeys = new Set();
records.forEach(record => {
Object.keys(record).forEach(key => allKeys.add(key));
});
const headers = Array.from(allKeys).sort();
const csvRows = [headers.join(',')];
records.forEach(record => {
const row = headers.map(header => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const value = record[header];
if (value === null || value === undefined)
return '';
if (typeof value === 'string' &&
(value.includes(',') || value.includes('"') || value.includes('\n'))) {
return `"${value.replace(/"/g, '""')}"`;
}
return String(value);
});
csvRows.push(row.join(','));
});
await fs.writeFile(csvPath, csvRows.join('\n'));
return csvPath;
}
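// Write records as Parquet: falls back to JSON when ParquetJS is missing,
// quarantines files that fail post-write validation, and on any other error
// dumps the records to a 'failed' folder as JSON before rethrowing.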
async writeParquet(filepath, records) {
try {
if (records.length === 0) {
this.app?.debug('No records to write to Parquet file');
return filepath;
}
// Check if ParquetJS is available
if (!parquet) {
this.app?.debug('ParquetJS not available, falling back to JSON');
return await this.writeJSON(filepath, records);
}
// Use intelligent schema detection for optimal data types
const schema = this.createParquetSchema(records);
// Create Parquet writer
const writer = await parquet.ParquetWriter.openFile(schema, filepath);
// Write records to Parquet file
for (let i = 0; i < records.length; i++) {
const record = records[i];
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const cleanRecord = {};
// Prepare record for typed Parquet schema
const preparedRecord = this.prepareRecordForParquet(record, schema);
Object.assign(cleanRecord, preparedRecord);
await writer.appendRow(cleanRecord);
}
// Close the writer
await writer.close();
// Validate the written file
const isValid = await this.validateParquetFile(filepath);
if (!isValid) {
// Move invalid file to quarantine and log
const quarantineDir = path.join(path.dirname(filepath), 'quarantine');
await fs.ensureDir(quarantineDir);
const quarantineFile = path.join(quarantineDir, path.basename(filepath));
await fs.move(filepath, quarantineFile);
await this.logQuarantine(quarantineFile, 'write', 'File failed validation after write');
throw new Error(`Parquet file failed validation after write, moved to quarantine: ${quarantineFile}`);
}
return filepath;
}
catch (error) {
this.app?.debug(`❌ Parquet writing failed: ${error.message}`);
this.app?.debug(`Error stack: ${error.stack}`);
// Save to failed directory to maintain schema consistency
const failedDir = path.join(path.dirname(filepath), 'failed');
await fs.ensureDir(failedDir);
const failedPath = path.join(failedDir, path.basename(filepath).replace('.parquet', '_FAILED.json'));
this.app?.debug(`💾 Saving failed Parquet data as JSON to: ${failedPath}`);
this.app?.debug('⚠️ This data will need manual conversion to maintain DuckDB schema consistency');
await this.writeJSON(failedPath, records);
// Throw error to alert system that Parquet writing is broken
throw new Error(`Parquet writing failed for ${filepath}. Data saved to ${failedPath} for recovery.`);
}
}
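// Type inference: all-numeric columns become INT64 (integers) or DOUBLE,
// all-boolean columns BOOLEAN; BigInt, mixed, string, and all-null columns
// fall back to UTF8. Every field is marked optional.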
// Create Parquet schema based on sample records
// eslint-disable-next-line @typescript-eslint/no-explicit-any
createParquetSchema(records) {
if (!parquet || records.length === 0) {
this.app?.debug('createParquetSchema: No parquet lib or empty records, throwing error');
throw new Error('Cannot create Parquet schema');
}
// Get all unique column names from all records
const allColumns = new Set();
records.forEach(record => {
Object.keys(record).forEach(key => allColumns.add(key));
});
const columns = Array.from(allColumns).sort();
const schemaFields = {};
// Analyze each column to determine the best Parquet type
columns.forEach(colName => {
const values = records
// eslint-disable-next-line @typescript-eslint/no-explicit-any
.map(r => r[colName])
.filter(v => v !== null && v !== undefined);
if (values.length === 0) {
// All null values, default to string
schemaFields[colName] = { type: 'UTF8', optional: true };
return;
}
const hasNumbers = values.some(v => typeof v === 'number');
const hasStrings = values.some(v => typeof v === 'string');
const hasBooleans = values.some(v => typeof v === 'boolean');
const hasBigInts = values.some(v => typeof v === 'bigint');
if (hasBigInts && !hasNumbers && !hasStrings && !hasBooleans) {
// All BigInts - use UTF8 to be safe
schemaFields[colName] = { type: 'UTF8', optional: true };
}
else if (hasNumbers && !hasStrings && !hasBooleans && !hasBigInts) {
// All numbers - check if integers or floats
const allIntegers = values.every(v => Number.isInteger(v));
schemaFields[colName] = {
type: allIntegers ? 'INT64' : 'DOUBLE',
optional: true,
};
}
else if (hasBooleans && !hasNumbers && !hasStrings && !hasBigInts) {
schemaFields[colName] = { type: 'BOOLEAN', optional: true };
}
else {
// Mixed types or strings - use UTF8
schemaFields[colName] = { type: 'UTF8', optional: true };
}
});
return new parquet.ParquetSchema(schemaFields);
}
// Prepare a record for typed Parquet writing
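// Coerces each field to its schema type: BigInts outside the safe integer
// range become strings, and objects are JSON-stringified for UTF8 columns.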
prepareRecordForParquet(record,
// eslint-disable-next-line @typescript-eslint/no-explicit-any
schema
// eslint-disable-next-line @typescript-eslint/no-explicit-any
) {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const cleanRecord = {};
const schemaFields = schema.schema;
Object.keys(schemaFields).forEach(fieldName => {
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const value = record[fieldName];
const fieldType = schemaFields[fieldName].type;
if (value === null || value === undefined) {
cleanRecord[fieldName] = null;
}
else if (typeof value === 'bigint') {
// Handle BigInt values by converting to appropriate type
switch (fieldType) {
case 'DOUBLE':
case 'FLOAT':
cleanRecord[fieldName] = Number(value);
break;
case 'INT64':
case 'INT32':
// Convert BigInt to number if it fits in safe integer range
if (value <= Number.MAX_SAFE_INTEGER &&
value >= Number.MIN_SAFE_INTEGER) {
cleanRecord[fieldName] = Number(value);
}
else {
cleanRecord[fieldName] = value.toString();
}
break;
case 'UTF8':
default:
cleanRecord[fieldName] = value.toString();
break;
}
}
else {
switch (fieldType) {
case 'DOUBLE':
case 'FLOAT':
cleanRecord[fieldName] =
typeof value === 'number' ? value : parseFloat(String(value));
break;
case 'INT64':
case 'INT32':
cleanRecord[fieldName] =
typeof value === 'number'
? Math.round(value)
: parseInt(String(value), 10);
break;
case 'BOOLEAN':
cleanRecord[fieldName] =
typeof value === 'boolean' ? value : Boolean(value);
break;
case 'UTF8':
default:
if (typeof value === 'object') {
cleanRecord[fieldName] = JSON.stringify(value);
}
else {
cleanRecord[fieldName] = String(value);
}
break;
}
}
});
return cleanRecord;
}
// Merge multiple files (for daily consolidation like Python version)
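// Reads JSON and Parquet sources (CSV is not implemented), sorts the
// combined records by timestamp, writes them to targetFile, and returns
// the record count.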
async mergeFiles(sourceFiles, targetFile) {
try {
const allRecords = [];
for (const sourceFile of sourceFiles) {
if (await fs.pathExists(sourceFile)) {
const ext = path.extname(sourceFile).toLowerCase();
if (ext === '.json') {
const records = await fs.readJson(sourceFile);
allRecords.push(...(Array.isArray(records) ? records : [records]));
}
else if (ext === '.parquet') {
// Read Parquet file
if (parquet) {
try {
const reader = await parquet.ParquetReader.openFile(sourceFile);
const cursor = reader.getCursor();
let record = null;
while ((record = await cursor.next())) {
allRecords.push(record);
}
await reader.close();
}
catch (parquetError) {
this.app?.debug(`Failed to read Parquet file ${sourceFile}: ${parquetError.message}`);
}
}
}
else if (ext === '.csv') {
// Could implement CSV reading if needed
this.app?.debug(`CSV merging not implemented for ${sourceFile}`);
}
}
}
if (allRecords.length > 0) {
// Sort by timestamp
allRecords.sort((a, b) => {
const timeA = a.received_timestamp || a.signalk_timestamp || '';
const timeB = b.received_timestamp || b.signalk_timestamp || '';
return timeA.localeCompare(timeB);
});
await this.writeRecords(targetFile, allRecords);
return allRecords.length;
}
return 0;
}
catch (error) {
throw new Error(`Failed to merge files: ${error.message}`);
}
}
// Validate parquet file for corruption
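// A file passes validation if it exists, is at least 100 bytes, and its
// first record can be read back with ParquetReader.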
async validateParquetFile(filepath) {
try {
if (!parquet || !(await fs.pathExists(filepath))) {
return false;
}
// Check file size (must be > 100 bytes as per existing logic)
const stats = await fs.stat(filepath);
const fileSize = stats.size;
if (fileSize < 100) {
this.app?.debug(`❌ Parquet file too small: ${filepath} (${fileSize} bytes)`);
return false;
}
// Try to open and read the parquet file
try {
const reader = await parquet.ParquetReader.openFile(filepath);
const cursor = reader.getCursor();
// Try to read first record to verify file structure
const firstRecord = await cursor.next();
await reader.close();
// Log the file size for debugging (padded to match stat-style output)
this.app?.debug(`✅ Valid parquet file: ${fileSize.toString().padStart(12, ' ')} ${filepath}`);
return firstRecord !== null;
}
catch (readError) {
this.app?.debug(`❌ Parquet file read failed: ${filepath} - ${readError.message}`);
return false;
}
}
catch (error) {
this.app?.debug(`❌ Parquet validation error: ${filepath} - ${error.message}`);
return false;
}
}
// Log quarantined files
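// Appends a pipe-delimited entry to quarantine.log alongside the
// quarantined file.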
async logQuarantine(filepath, operation, reason) {
try {
const stats = await fs.stat(filepath);
const logEntry = {
timestamp: new Date().toISOString(),
filepath,
fileSize: stats.size,
operation,
reason,
formattedSize: `${stats.size.toString().padStart(12, ' ')} ${filepath}`
};
const quarantineDir = path.dirname(filepath);
const logFile = path.join(quarantineDir, 'quarantine.log');
// Append to log file
const logLine = `${logEntry.timestamp} | ${logEntry.operation} | ${logEntry.fileSize} bytes | ${logEntry.reason} | ${filepath}\n`;
await fs.appendFile(logFile, logLine);
this.app?.debug(`📋 Quarantine logged: ${logEntry.formattedSize}`);
}
catch (error) {
this.app?.debug(`Failed to log quarantine entry: ${error.message}`);
}
}
// Daily file consolidation (matching Python behavior)
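// Walks the data directory, merges each topic's files for the given date
// into a single *_consolidated.parquet, quarantines corrupt output, moves
// merged sources into a 'processed' folder, and returns the number of
// consolidated targets.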
async consolidateDaily(dataDir, date, filenamePrefix = 'signalk_data') {
try {
const dateStr = date.toISOString().split('T')[0]; // YYYY-MM-DD
const consolidatedFiles = [];
// Walk through all topic directories
const walkDir = async (dir) => {
const items = await fs.readdir(dir);
for (const item of items) {
const itemPath = path.join(dir, item);
const stat = await fs.stat(itemPath);
// Skip folders managed by this writer so quarantined/failed output
// is not swept back into consolidation
if (stat.isDirectory() &&
item !== 'processed' &&
item !== 'quarantine' &&
item !== 'failed') {
await walkDir(itemPath);
}
else if (item.includes(dateStr) &&
!item.includes('_consolidated')) {
// This is a file for our target date
const topicDir = path.dirname(itemPath);
const consolidatedFile = path.join(topicDir, `${filenamePrefix}_${dateStr}_consolidated.parquet`);
let entry = consolidatedFiles.find(f => f.target === consolidatedFile);
if (!entry) {
entry = { target: consolidatedFile, sources: [] };
consolidatedFiles.push(entry);
}
entry.sources.push(itemPath);
}
}
};
await walkDir(dataDir);
// Consolidate each topic's files
for (const entry of consolidatedFiles) {
const recordCount = await this.mergeFiles(entry.sources, entry.target);
this.app?.debug(`Consolidated ${entry.sources.length} files into ${entry.target} (${recordCount} records)`);
// Validate consolidated parquet file
const isValid = await this.validateParquetFile(entry.target);
if (!isValid) {
// Move corrupt file to quarantine
const quarantineDir = path.join(path.dirname(entry.target), 'quarantine');
await fs.ensureDir(quarantineDir);
const quarantineFile = path.join(quarantineDir, path.basename(entry.target));
await fs.move(entry.target, quarantineFile);
// Log to quarantine log
await this.logQuarantine(quarantineFile, 'consolidation', 'File failed validation after consolidation');
this.app?.debug(`⚠️ Moved corrupt file to quarantine: ${quarantineFile}`);
continue; // Skip moving source files since consolidation failed
}
// Move source files to processed folder
const processedDir = path.join(path.dirname(entry.target), 'processed');
await fs.ensureDir(processedDir);
for (const sourceFile of entry.sources) {
const basename = path.basename(sourceFile);
const processedFile = path.join(processedDir, basename);
await fs.move(sourceFile, processedFile);
}
}
return consolidatedFiles.length;
}
catch (error) {
throw new Error(`Failed to consolidate daily files: ${error.message}`);
}
}
}
exports.ParquetWriter = ParquetWriter;
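// Example usage (illustrative sketch; `app` is the SignalK server app object
// handed to plugins, used here only for debug logging, and the paths are
// placeholders):
//   const writer = new ParquetWriter({ format: 'parquet', app });
//   const written = await writer.writeRecords('/path/to/signalk_data.parquet', records);
//   await writer.consolidateDaily('/path/to/data', new Date(), 'signalk_data');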
//# sourceMappingURL=parquet-writer.js.map