UNPKG

signalk-parquet

Version:

SignalK plugin to save marine data directly to Parquet files with regimen-based control

github.com/motamman/signalk-parquet

motamman/signalk-parquet

626 lines • 25.1 kB

JavaScript

"use strict";
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.DataQueryService = void 0;
const duckdb = __importStar(require("@duckdb/node-api"));
const path = __importStar(require("path"));
const glob_1 = require("glob");

/** How long a cached query result stays valid, in milliseconds (5 minutes). */
const CACHE_TTL_MS = 5 * 60 * 1000;

/**
 * Render a value as a single-quoted SQL string literal, doubling any embedded
 * single quote so the value cannot terminate the literal early.
 * @param {*} value - Value to embed (converted with String()).
 * @returns {string} Quoted SQL literal.
 */
function toSqlStringLiteral(value) {
    return `'${String(value).replace(/'/g, "''")}'`;
}

/**
 * Build the comma-separated, quoted file list used inside read_parquet([...]).
 * @param {string[]} files - Parquet file paths.
 * @returns {string} SQL fragment.
 */
function buildFileListSql(files) {
    return files.map(toSqlStringLiteral).join(', ');
}

/**
 * Build the optional WHERE clause restricting received_timestamp to a range.
 * Bounds are passed through `new Date(...)` so that only ISO-8601 text
 * produced by toISOString() ever reaches the SQL string.
 * @param {{start: Date, end: Date}|undefined|null} timeRange - Inclusive range.
 * @returns {string} WHERE clause with a leading space, or '' without a range.
 * @throws {RangeError} If a bound is not a valid date.
 */
function buildTimeRangeFilter(timeRange) {
    if (!timeRange) {
        return '';
    }
    const startIso = new Date(timeRange.start).toISOString();
    const endIso = new Date(timeRange.end).toISOString();
    return ` WHERE received_timestamp >= '${startIso}' AND received_timestamp <= '${endIso}'`;
}

/**
 * Validate a requested sample size before it is interpolated into SQL.
 * @param {*} sampleSize - Requested number of rows.
 * @returns {number} Positive integer row limit.
 * @throws {RangeError} If the value is not a finite number >= 1.
 */
function normalizeSampleSize(sampleSize) {
    const limit = Math.floor(Number(sampleSize));
    if (!Number.isFinite(limit) || limit < 1) {
        throw new RangeError(`sampleSize must be a positive integer, got: ${sampleSize}`);
    }
    return limit;
}

/**
 * Convert a count returned by the database into a plain JS number.
 * NOTE(review): COUNT(*) is a BIGINT in DuckDB and the driver may surface it
 * as a JS BigInt, which JSON.stringify cannot serialize — hence the coercion.
 * @param {*} value - Raw count (number, bigint, null or undefined).
 * @returns {number} Finite number, 0 when missing or not convertible.
 */
function toSafeNumber(value) {
    if (value === null || value === undefined) {
        return 0;
    }
    const asNumber = Number(value);
    return Number.isFinite(asNumber) ? asNumber : 0;
}

/**
 * Median of an ascending-sorted, non-empty numeric array.
 * @param {number[]} sorted - Values sorted ascending.
 * @returns {number} Middle value, or the mean of the two middle values.
 */
function medianOfSorted(sorted) {
    const mid = Math.floor(sorted.length / 2);
    return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
}

/**
 * Remove cache entries whose TTL has elapsed so the cache cannot grow forever.
 * @param {Map<string, {timestamp: number, ttl: number}>} cache - Query cache.
 * @param {number} now - Current time in epoch milliseconds.
 */
function evictExpiredEntries(cache, now) {
    for (const [key, entry] of cache) {
        if (now - entry.timestamp >= entry.ttl) {
            cache.delete(key);
        }
    }
}

/**
 * Enhanced data query service specifically designed for AI analysis
 * Integrates with existing DuckDB infrastructure and adds intelligent sampling
 */
class DataQueryService {
    /**
     * @param {object} app - Host application; only `debug` and `error` loggers
     *   are used here, and it is forwarded to path discovery.
     * @param {string} outputDirectory - Root directory handed to path discovery.
     * @param {*} vesselContext - Stored as-is; not read inside this class.
     */
    constructor(app, outputDirectory, vesselContext) {
        this.queryCache = new Map();
        this.cacheLookups = 0;
        this.cacheHits = 0;
        this.app = app;
        this.outputDirectory = outputDirectory;
        this.vesselContext = vesselContext;
    }

    /**
     * Get intelligently sampled data for AI analysis
     * Balances data representativeness with Claude token limits
     * @param {object} request - `dataPath`, optional `timeRange`, `sampleSize`,
     *   `strategy` ('systematic' | 'recent' | 'representative' | 'random') and
     *   `disableSampling`.
     * @returns {Promise<{data: object[], summary: object}>} Rows plus summary.
     * @throws Re-throws any query error after logging it.
     */
    async getSampleData(request) {
        const startTime = Date.now();
        try {
            const files = await this.findDataFiles(request.dataPath, request.timeRange);
            if (files.length === 0) {
                return {
                    data: [],
                    summary: this.createEmptySummary(startTime, 0, 0)
                };
            }
            // Get total count first
            const countQuery = this.buildCountQuery(files, request.timeRange);
            const totalCount = await this.executeCountQuery(countQuery);
            let query;
            // If sampling is disabled or no sample size specified, return all data without sampling
            if (request.disableSampling || !request.sampleSize) {
                query = this.buildAllDataQuery(files, request.timeRange);
            }
            else {
                switch (request.strategy) {
                    case 'systematic':
                        query = this.buildSystematicSampleQuery(files, request.sampleSize, request.timeRange);
                        break;
                    case 'recent':
                        query = this.buildRecentSampleQuery(files, request.sampleSize, request.timeRange);
                        break;
                    case 'representative':
                        query = this.buildRepresentativeSampleQuery(files, request.sampleSize, request.timeRange);
                        break;
                    case 'random':
                    default:
                        query = this.buildRandomSampleQuery(files, request.sampleSize, request.timeRange);
                }
            }
            const data = await this.executeDuckDBQuery(query);
            const summary = await this.generateEnhancedSummary(data, startTime, totalCount, data.length, {
                originalCount: totalCount,
                sampleCount: data.length,
                samplingStrategy: request.strategy
            });
            this.app.debug(`Sample query completed: ${data.length}/${totalCount} records in ${Date.now() - startTime}ms`);
            return { data, summary };
        }
        catch (error) {
            this.app.error(`Sample data query failed: ${error.message}`);
            throw error;
        }
    }

    /**
     * Generate comprehensive data summary for analysis context
     * Column analysis is based on a random sample of up to 100 rows, so
     * `rowCount` reflects that sample while `rowsScanned` is the full count.
     * @param {string} dataPath - Data path to summarize.
     * @param {{start: Date, end: Date}} [timeRange] - Optional inclusive range.
     * @returns {Promise<object>} Summary object.
     * @throws Re-throws any query error after logging it.
     */
    async getDataSummary(dataPath, timeRange) {
        const startTime = Date.now();
        try {
            const files = await this.findDataFiles(dataPath, timeRange);
            if (files.length === 0) {
                return this.createEmptySummary(startTime, 0, 0);
            }
            // Get statistical summary using DuckDB aggregation functions
            const query = this.buildSummaryQuery(files, timeRange);
            const summaryData = await this.executeDuckDBQuery(query);
            // The aggregate query yields a single row; the number of records
            // scanned is its record_count column, not the number of result rows.
            const recordCount = toSafeNumber(summaryData[0]?.record_count);
            // Get sample data for column analysis
            const sampleQuery = this.buildRandomSampleQuery(files, 100, timeRange);
            const sampleData = await this.executeDuckDBQuery(sampleQuery);
            const summary = await this.generateEnhancedSummary(sampleData, startTime, recordCount, sampleData.length);
            this.app.debug(`Data summary completed for ${dataPath}: ${summary.rowCount} records`);
            return summary;
        }
        catch (error) {
            this.app.error(`Data summary failed: ${error.message}`);
            throw error;
        }
    }

    /**
     * Prepare correlation data across multiple paths
     * Each path is sampled systematically with at most 1000 rows.
     * @param {string[]} paths - Data paths to correlate.
     * @param {{start: Date, end: Date}} [timeRange] - Optional inclusive range.
     * @returns {Promise<object>} `{ paths, data, timeAlignment, correlationMatrix }`.
     * @throws Re-throws any query error after logging it.
     */
    async getCorrelationData(paths, timeRange) {
        try {
            this.app.debug(`Preparing correlation data for paths: ${paths.join(', ')}`);
            const correlationData = {};
            // Get data for each path
            for (const dataPath of paths) {
                const sampleRequest = {
                    dataPath,
                    sampleSize: 1000, // Limit for correlation analysis
                    timeRange,
                    strategy: 'systematic'
                };
                const { data } = await this.getSampleData(sampleRequest);
                correlationData[dataPath] = data;
            }
            // Time-align the data (simplified approach)
            const alignedData = this.alignTimeSeriesData(correlationData);
            return {
                paths,
                data: alignedData,
                timeAlignment: 'interpolated',
                correlationMatrix: this.calculateCorrelationMatrix(alignedData)
            };
        }
        catch (error) {
            this.app.error(`Correlation data preparation failed: ${error.message}`);
            throw error;
        }
    }

    /**
     * Get baseline data for anomaly detection
     * @param {object} request - `dataPath`, `baselineRange` and `threshold`
     *   (number of standard deviations for the bounds).
     * @returns {Promise<object[]>} Result of calculateStatisticalBaseline.
     * @throws Re-throws any query error after logging it.
     */
    async getAnomalyBaselineData(request) {
        try {
            const baselineRequest = {
                dataPath: request.dataPath,
                sampleSize: 2000, // Larger sample for baseline
                timeRange: request.baselineRange,
                strategy: 'systematic'
            };
            const { data } = await this.getSampleData(baselineRequest);
            // Calculate statistical baselines
            return this.calculateStatisticalBaseline(data, request.threshold);
        }
        catch (error) {
            this.app.error(`Baseline data calculation failed: ${error.message}`);
            throw error;
        }
    }

    /**
     * Find data files for a given path and time range
     * Best-effort: any failure is logged and reported as "no files".
     * @param {string} dataPath - Data path to look up.
     * @param {{start: Date, end: Date}} [timeRange] - Unused here; time
     *   filtering happens later in SQL.
     * @returns {Promise<string[]>} Parquet file paths, possibly empty.
     */
    async findDataFiles(dataPath, timeRange) {
        try {
            // Use the existing working path discovery logic
            const { getAvailablePaths } = await Promise.resolve().then(() => __importStar(require('./utils/path-discovery')));
            this.app.debug(`🔍 Using existing path discovery for: ${dataPath}`);
            this.app.debug(`🔍 Output directory: ${this.outputDirectory}`);
            // Get all available paths that actually exist
            const availablePaths = getAvailablePaths(this.outputDirectory, this.app);
            this.app.debug(`🔍 Found ${availablePaths.length} available data paths:`);
            availablePaths.forEach(pathInfo => {
                this.app.debug(`   ${pathInfo.path} -> ${pathInfo.directory} (${pathInfo.fileCount} files)`);
            });
            // Find the path that matches our requested dataPath
            const matchingPath = availablePaths.find(pathInfo => pathInfo.path === dataPath);
            if (!matchingPath) {
                this.app.debug(`🔍 No matching path found for: ${dataPath}`);
                this.app.debug(`🔍 Available paths: ${availablePaths.map(p => p.path).join(', ')}`);
                return [];
            }
            // Get all parquet files in the matching directory
            const files = await (0, glob_1.glob)(path.join(matchingPath.directory, '*.parquet'));
            this.app.debug(`🔍 Found ${files.length} parquet files in ${matchingPath.directory}`);
            if (files.length > 0) {
                this.app.debug(`🔍 Files: ${files.slice(0, 3).join(', ')}${files.length > 3 ? ` ... and ${files.length - 3} more` : ''}`);
            }
            // Return all files - time range filtering will be done in SQL
            return files;
        }
        catch (error) {
            this.app.debug(`File discovery failed for ${dataPath}: ${error.message}`);
            return [];
        }
    }

    /**
     * Build query to get all data without sampling
     * @param {string[]} files - Parquet file paths.
     * @param {{start: Date, end: Date}} [timeRange] - Optional inclusive range.
     * @returns {string} SQL text.
     */
    buildAllDataQuery(files, timeRange) {
        const fileList = buildFileListSql(files);
        // No ORDER BY to avoid any potential query optimization limits
        return `SELECT * FROM read_parquet([${fileList}], union_by_name=True)${buildTimeRangeFilter(timeRange)}`;
    }

    /**
     * Build count query to get total record count
     * @param {string[]} files - Parquet file paths.
     * @param {{start: Date, end: Date}} [timeRange] - Optional inclusive range.
     * @returns {string} SQL text producing one row with `total_count`.
     */
    buildCountQuery(files, timeRange) {
        const fileList = buildFileListSql(files);
        return `SELECT COUNT(*) as total_count FROM read_parquet([${fileList}], union_by_name=True)${buildTimeRangeFilter(timeRange)}`;
    }

    /**
     * Build systematic sampling query
     * Rows are ordered by received_timestamp, split into `sampleSize` equal
     * buckets, and the earliest row of each bucket is kept.
     * @param {string[]} files - Parquet file paths.
     * @param {number} sampleSize - Number of rows wanted (positive integer).
     * @param {{start: Date, end: Date}} [timeRange] - Optional inclusive range.
     * @returns {string} SQL text.
     * @throws {RangeError} If sampleSize is not a positive number.
     */
    buildSystematicSampleQuery(files, sampleSize, timeRange) {
        const fileList = buildFileListSql(files);
        const limit = normalizeSampleSize(sampleSize);
        // FLOOR keeps bucket ids in 0..limit-1; a bare CAST to INTEGER rounds
        // a fractional quotient, which yields uneven first and last buckets.
        return `
      WITH numbered_data AS (
        SELECT *,
               ROW_NUMBER() OVER (ORDER BY received_timestamp) as row_num,
               COUNT(*) OVER () as total_rows
        FROM read_parquet([${fileList}], union_by_name=True)${buildTimeRangeFilter(timeRange)}
      ),
      sampled_data AS (
        SELECT *,
               CAST(FLOOR((row_num - 1) * ${limit} / total_rows) AS INTEGER) as bucket
        FROM numbered_data
      )
      SELECT DISTINCT ON (bucket) *
      FROM sampled_data
      ORDER BY bucket, row_num
      LIMIT ${limit}`;
    }

    /**
     * Build recent sampling query - prioritizes most recent data
     * @param {string[]} files - Parquet file paths.
     * @param {number} sampleSize - Number of rows wanted (positive integer).
     * @param {{start: Date, end: Date}} [timeRange] - Optional inclusive range.
     * @returns {string} SQL text.
     * @throws {RangeError} If sampleSize is not a positive number.
     */
    buildRecentSampleQuery(files, sampleSize, timeRange) {
        const fileList = buildFileListSql(files);
        const limit = normalizeSampleSize(sampleSize);
        return `SELECT * FROM read_parquet([${fileList}], union_by_name=True)${buildTimeRangeFilter(timeRange)} ORDER BY received_timestamp DESC LIMIT ${limit}`;
    }

    /**
     * Build representative sampling query - balances across time periods
     * Uses up to 20 time buckets and draws random rows from each.
     * @param {string[]} files - Parquet file paths.
     * @param {number} sampleSize - Number of rows wanted (positive integer).
     * @param {{start: Date, end: Date}} [timeRange] - Optional inclusive range.
     * @returns {string} SQL text.
     * @throws {RangeError} If sampleSize is not a positive number.
     */
    buildRepresentativeSampleQuery(files, sampleSize, timeRange) {
        const fileList = buildFileListSql(files);
        const limit = normalizeSampleSize(sampleSize);
        const bucketCount = Math.min(limit, 20);
        const rowsPerBucket = Math.ceil(limit / 20);
        // Use time-stratified sampling
        return `
      WITH time_buckets AS (
        SELECT *,
               NTILE(${bucketCount}) OVER (ORDER BY received_timestamp) as time_bucket
        FROM read_parquet([${fileList}], union_by_name=True)${buildTimeRangeFilter(timeRange)}
      ),
      sampled_per_bucket AS (
        SELECT *,
               ROW_NUMBER() OVER (PARTITION BY time_bucket ORDER BY RANDOM()) as bucket_row
        FROM time_buckets
      )
      SELECT * FROM sampled_per_bucket
      WHERE bucket_row <= ${rowsPerBucket}
      ORDER BY received_timestamp
      LIMIT ${limit}`;
    }

    /**
     * Build random sampling query
     * @param {string[]} files - Parquet file paths.
     * @param {number} sampleSize - Number of rows wanted (positive integer).
     * @param {{start: Date, end: Date}} [timeRange] - Optional inclusive range.
     * @returns {string} SQL text.
     * @throws {RangeError} If sampleSize is not a positive number.
     */
    buildRandomSampleQuery(files, sampleSize, timeRange) {
        const fileList = buildFileListSql(files);
        const limit = normalizeSampleSize(sampleSize);
        return `SELECT * FROM read_parquet([${fileList}], union_by_name=True)${buildTimeRangeFilter(timeRange)} ORDER BY RANDOM() LIMIT ${limit}`;
    }

    /**
     * Build summary statistics query
     * @param {string[]} files - Parquet file paths.
     * @param {{start: Date, end: Date}} [timeRange] - Optional inclusive range.
     * @returns {string} SQL text producing a single aggregate row.
     */
    buildSummaryQuery(files, timeRange) {
        const fileList = buildFileListSql(files);
        return `
      SELECT
        COUNT(*) as record_count,
        MIN(received_timestamp) as min_timestamp,
        MAX(received_timestamp) as max_timestamp,
        COUNT(DISTINCT path) as unique_paths,
        AVG(CASE WHEN typeof(value) = 'DOUBLE' THEN value ELSE NULL END) as avg_numeric_value,
        MIN(CASE WHEN typeof(value) = 'DOUBLE' THEN value ELSE NULL END) as min_numeric_value,
        MAX(CASE WHEN typeof(value) = 'DOUBLE' THEN value ELSE NULL END) as max_numeric_value
      FROM read_parquet([${fileList}], union_by_name=True)${buildTimeRangeFilter(timeRange)}`;
    }

    /**
     * Execute DuckDB query and return results
     * Results are cached for CACHE_TTL_MS. The cache key is a 32-bit hash, so
     * the full query text is stored with each entry and compared on lookup to
     * rule out hash collisions.
     * @param {string} query - SQL text.
     * @returns {Promise<object[]>} Result rows as objects.
     * @throws Re-throws any DuckDB error after logging it.
     */
    async executeDuckDBQuery(query) {
        const cacheKey = this.generateCacheKey(query);
        const now = Date.now();
        this.cacheLookups += 1;
        // Check cache first
        const cached = this.queryCache.get(cacheKey);
        if (cached && cached.query === query && now - cached.timestamp < cached.ttl) {
            this.cacheHits += 1;
            this.app.debug(`Query cache hit: ${cacheKey.substring(0, 32)}...`);
            return cached.result;
        }
        let connection;
        try {
            const db = await duckdb.DuckDBInstance.create(':memory:');
            connection = await db.connect();
            const result = await connection.runAndReadAll(query);
            const rows = result.getRowObjects();
            // Drop stale entries before adding a new one so the map stays bounded
            evictExpiredEntries(this.queryCache, Date.now());
            // Cache result for 5 minutes
            this.queryCache.set(cacheKey, {
                query,
                result: rows,
                timestamp: Date.now(),
                ttl: CACHE_TTL_MS
            });
            return rows;
        }
        catch (error) {
            this.app.error(`DuckDB query failed: ${error.message}`);
            this.app.debug(`Failed query: ${query}`);
            throw error;
        }
        finally {
            // Release the connection on failure as well as on success
            connection?.disconnectSync();
        }
    }

    /**
     * Execute count query and return single number
     * @param {string} query - SQL text producing a `total_count` column.
     * @returns {Promise<number>} Count as a plain number, 0 if absent.
     */
    async executeCountQuery(query) {
        const result = await this.executeDuckDBQuery(query);
        return toSafeNumber(result[0]?.total_count);
    }

    /**
     * Generate enhanced data summary with performance metrics
     * @param {object[]} data - Rows to analyze.
     * @param {number} startTime - Epoch ms when the operation started.
     * @param {number} totalScanned - Reported as `rowsScanned`.
     * @param {number} totalReturned - Reported as `rowsReturned`.
     * @param {object} [samplingInfo] - Passed through unchanged.
     * @returns {Promise<object>} Summary object.
     */
    async generateEnhancedSummary(data, startTime, totalScanned, totalReturned, samplingInfo) {
        const executionTime = Date.now() - startTime;
        if (data.length === 0) {
            return this.createEmptySummary(startTime, totalScanned, totalReturned, samplingInfo);
        }
        // Extract time range. A comparator is required: the default sort
        // compares string forms, which does not order Dates chronologically.
        const timestamps = data
            .map(d => new Date(d.received_timestamp))
            .filter(d => !Number.isNaN(d.getTime()))
            .sort((a, b) => a.getTime() - b.getTime());
        const timeRange = timestamps.length > 0 ? {
            start: timestamps[0],
            end: timestamps[timestamps.length - 1]
        } : {
            start: new Date(),
            end: new Date()
        };
        // Analyze columns
        const columns = this.analyzeColumns(data);
        // Calculate statistics
        const statisticalSummary = this.calculateStatistics(data, columns);
        // Calculate data quality
        const dataQuality = this.calculateDataQuality(data, columns);
        return {
            rowCount: data.length,
            timeRange,
            columns,
            statisticalSummary,
            dataQuality,
            performanceMetrics: {
                executionTime,
                rowsScanned: totalScanned,
                rowsReturned: totalReturned,
                cacheHit: false // Not tracked per summary; see getCacheStats for totals
            },
            samplingInfo
        };
    }

    /**
     * Analyze data columns
     * @param {object[]} data - Rows to inspect.
     * @returns {object[]} One descriptor per key seen in any row: name,
     *   inferred type, null count, unique count and up to 5 sample values.
     */
    analyzeColumns(data) {
        const allKeys = new Set();
        data.forEach(record => {
            Object.keys(record).forEach(key => allKeys.add(key));
        });
        return Array.from(allKeys).map(key => {
            const values = data.map(d => d[key]).filter(v => v !== null && v !== undefined);
            return {
                name: key,
                type: this.inferDataType(values),
                nullCount: data.length - values.length,
                uniqueCount: new Set(values).size,
                sampleValues: values.slice(0, 5)
            };
        });
    }

    /**
     * Infer data type from values
     * @param {Array} values - Non-null values of one column.
     * @returns {string} 'unknown' | 'number' | 'boolean' | 'datetime' | 'string'.
     */
    inferDataType(values) {
        if (values.length === 0)
            return 'unknown';
        const types = new Set(values.map(v => typeof v));
        if (types.has('number'))
            return 'number';
        if (types.has('boolean'))
            return 'boolean';
        if (types.has('object') && values[0] instanceof Date)
            return 'datetime';
        return 'string';
    }

    /**
     * Calculate statistical summaries
     * Numeric columns get count/mean/median/min/max/stdDev (population).
     * Other columns get count plus the first and last observed value under
     * the `min`/`max` keys (these are not a true ordering-based min/max).
     * @param {object[]} data - Rows to analyze.
     * @param {object[]} columns - Descriptors from analyzeColumns.
     * @returns {object} Statistics keyed by column name.
     */
    calculateStatistics(data, columns) {
        const stats = {};
        columns.forEach(col => {
            const values = data.map(d => d[col.name]).filter(v => v !== null && v !== undefined);
            if (col.type === 'number') {
                const numericValues = values.filter(v => typeof v === 'number').sort((a, b) => a - b);
                if (numericValues.length > 0) {
                    const sum = numericValues.reduce((a, b) => a + b, 0);
                    const mean = sum / numericValues.length;
                    const variance = numericValues.reduce((acc, val) => acc + Math.pow(val - mean, 2), 0) / numericValues.length;
                    stats[col.name] = {
                        count: numericValues.length,
                        mean,
                        median: medianOfSorted(numericValues),
                        min: numericValues[0],
                        max: numericValues[numericValues.length - 1],
                        stdDev: Math.sqrt(variance)
                    };
                }
            }
            else {
                stats[col.name] = {
                    count: values.length,
                    min: values[0],
                    max: values[values.length - 1]
                };
            }
        });
        return stats;
    }

    /**
     * Calculate data quality metrics
     * `consistency` and `accuracy` are fixed placeholder values.
     * @param {object[]} data - Rows to analyze.
     * @param {object[]} columns - Descriptors from analyzeColumns.
     * @returns {object} `{ completeness, consistency, timeliness, accuracy }`.
     */
    calculateDataQuality(data, columns) {
        const totalFields = data.length * columns.length;
        const nullFields = columns.reduce((sum, col) => sum + col.nullCount, 0);
        const completeness = totalFields > 0 ? ((totalFields - nullFields) / totalFields) * 100 : 0;
        // Calculate timeliness based on latest timestamp
        const timeliness = this.calculateTimeliness(data);
        return {
            completeness,
            consistency: 85, // Simplified metric
            timeliness,
            accuracy: 90 // Simplified metric
        };
    }

    /**
     * Calculate timeliness metric
     * Starts at 100 and loses 2 points per hour of age of the newest record,
     * floored at 0.
     * @param {object[]} data - Rows with `received_timestamp`.
     * @returns {number} Score in the range 0..100 (0 when no valid timestamp).
     */
    calculateTimeliness(data) {
        if (data.length === 0)
            return 0;
        // Fold instead of Math.max(...array): spreading a very large array
        // into call arguments can exceed the engine's argument limit.
        let latestMs = -Infinity;
        for (const record of data) {
            const ms = new Date(record.received_timestamp).getTime();
            if (!Number.isNaN(ms) && ms > latestMs) {
                latestMs = ms;
            }
        }
        if (latestMs === -Infinity)
            return 0;
        const ageHours = (Date.now() - latestMs) / (1000 * 60 * 60);
        return Math.max(0, 100 - (ageHours * 2));
    }

    /**
     * Create empty summary for error cases
     * @param {number} startTime - Epoch ms when the operation started.
     * @param {number} scanned - Reported as `rowsScanned`.
     * @param {number} returned - Reported as `rowsReturned`.
     * @param {object} [samplingInfo] - Passed through unchanged.
     * @returns {object} Summary with zero rows and zeroed quality metrics.
     */
    createEmptySummary(startTime, scanned, returned, samplingInfo) {
        return {
            rowCount: 0,
            timeRange: { start: new Date(), end: new Date() },
            columns: [],
            statisticalSummary: {},
            dataQuality: {
                completeness: 0,
                consistency: 0,
                timeliness: 0,
                accuracy: 0
            },
            performanceMetrics: {
                executionTime: Date.now() - startTime,
                rowsScanned: scanned,
                rowsReturned: returned,
                cacheHit: false
            },
            samplingInfo
        };
    }

    /**
     * Generate cache key for query caching
     * @param {string} query - SQL text.
     * @returns {string} Base-36 rendering of a 32-bit rolling hash. Not
     *   collision-free; callers must confirm the query text on lookup.
     */
    generateCacheKey(query) {
        // Simple hash function for cache key
        let hash = 0;
        for (let i = 0; i < query.length; i++) {
            const char = query.charCodeAt(i);
            hash = ((hash << 5) - hash) + char;
            hash = hash & hash; // Convert to 32-bit integer
        }
        return hash.toString(36);
    }

    /**
     * Align time series data for correlation analysis
     * NOTE(review): currently a pass-through — no alignment or interpolation
     * is performed even though callers label the result 'interpolated'.
     * @param {object} data - Rows keyed by data path.
     * @returns {object} The same object, unchanged.
     */
    alignTimeSeriesData(data) {
        // Simplified time alignment - would implement proper interpolation in production
        return data;
    }

    /**
     * Calculate correlation matrix between datasets
     * NOTE(review): off-diagonal entries are random placeholders in
     * [0, 0.8), not real correlations, and the matrix is not symmetric.
     * Do not use for analysis until a real calculation replaces this.
     * @param {object} data - Rows keyed by data path.
     * @returns {number[][]} Square matrix with 1.0 on the diagonal.
     */
    calculateCorrelationMatrix(data) {
        const paths = Object.keys(data);
        const matrix = [];
        for (let i = 0; i < paths.length; i++) {
            matrix[i] = [];
            for (let j = 0; j < paths.length; j++) {
                // Simplified correlation calculation
                matrix[i][j] = i === j ? 1.0 : Math.random() * 0.8; // Placeholder
            }
        }
        return matrix;
    }

    /**
     * Calculate statistical baseline for anomaly detection
     * @param {object[]} data - Rows; only numeric `value` fields are used.
     * @param {number} threshold - Standard-deviation multiplier for the bounds.
     * @returns {object[]} Empty when there are no numeric values, otherwise a
     *   single entry with mean, population stdDev, bounds and p1/p5/p95/p99.
     */
    calculateStatisticalBaseline(data, threshold) {
        // Extract numeric values for baseline calculation
        const numericValues = data
            .map(d => d.value)
            .filter(v => typeof v === 'number')
            .sort((a, b) => a - b);
        if (numericValues.length === 0)
            return [];
        const mean = numericValues.reduce((a, b) => a + b, 0) / numericValues.length;
        const stdDev = Math.sqrt(numericValues.reduce((acc, val) => acc + Math.pow(val - mean, 2), 0) / numericValues.length);
        // Percentiles are picked by index from the sorted values (no interpolation)
        return [{
                mean,
                stdDev,
                lowerBound: mean - (threshold * stdDev),
                upperBound: mean + (threshold * stdDev),
                percentiles: {
                    p1: numericValues[Math.floor(numericValues.length * 0.01)],
                    p5: numericValues[Math.floor(numericValues.length * 0.05)],
                    p95: numericValues[Math.floor(numericValues.length * 0.95)],
                    p99: numericValues[Math.floor(numericValues.length * 0.99)]
                }
            }];
    }

    /**
     * Clear query cache
     */
    clearCache() {
        this.queryCache.clear();
        this.app.debug('Query cache cleared');
    }

    /**
     * Get cache statistics
     * @returns {{size: number, hitRate: number}} Entry count and the fraction
     *   of lookups served from cache since construction (0 before any lookup).
     */
    getCacheStats() {
        return {
            size: this.queryCache.size,
            hitRate: this.cacheLookups > 0 ? this.cacheHits / this.cacheLookups : 0
        };
    }
}
exports.DataQueryService = DataQueryService;
//# sourceMappingURL=data-query-service.js.map