signalk-parquet
Version:
SignalK plugin to save marine data directly to Parquet files with regimen-based control
626 lines • 25.1 kB
JavaScript
;
// Re-export helper: exposes `source[key]` on `target` under `alias` (or `key` when no
// alias is given). An already-defined helper on `this` wins over this local version.
var __createBinding = (this && this.__createBinding) || (function () {
    // Legacy engines without Object.create: plain value copy, no live binding.
    if (!Object.create) {
        return function (target, source, key, alias) {
            target[alias === undefined ? key : alias] = source[key];
        };
    }
    return function (target, source, key, alias) {
        var exportName = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        // The existing descriptor is reused only when it is an accessor on an ES module,
        // or a data property that is neither writable nor configurable.
        var reusable = descriptor && ("get" in descriptor
            ? source.__esModule
            : !(descriptor.writable || descriptor.configurable));
        if (!reusable) {
            // Otherwise install an enumerable getter that reads through to the source.
            descriptor = {
                enumerable: true,
                get: function () {
                    return source[key];
                }
            };
        }
        Object.defineProperty(target, exportName, descriptor);
    };
})();
// Attaches `defaultExport` to `namespace` as its "default" member.
// An already-defined helper on `this` wins over this local version.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        // Enumerable, but read-only and non-configurable (defineProperty defaults).
        return function (namespace, defaultExport) {
            Object.defineProperty(namespace, "default", { value: defaultExport, enumerable: true });
        };
    }
    // Legacy engines: plain assignment.
    return function (namespace, defaultExport) {
        namespace["default"] = defaultExport;
    };
})();
// Namespace-import helper: ES modules are returned untouched; anything else is wrapped
// in a fresh object that re-exports every own property except "default" and carries
// the original module as its "default" member.
var __importStar = (this && this.__importStar) || (function () {
    // Resolved on first call, then replaced by the chosen implementation.
    var listOwnKeys = function (obj) {
        listOwnKeys = Object.getOwnPropertyNames || function (source) {
            var names = [];
            for (var name in source) {
                if (Object.prototype.hasOwnProperty.call(source, name)) {
                    names.push(name);
                }
            }
            return names;
        };
        return listOwnKeys(obj);
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        var namespace = {};
        if (mod != null) {
            for (const key of listOwnKeys(mod)) {
                if (key !== "default") {
                    __createBinding(namespace, mod, key);
                }
            }
        }
        __setModuleDefault(namespace, mod);
        return namespace;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.DataQueryService = void 0;
const duckdb = __importStar(require("@duckdb/node-api"));
const path = __importStar(require("path"));
const glob_1 = require("glob");
/**
 * Data query service designed for AI analysis of SignalK Parquet data.
 *
 * Builds DuckDB SQL over the Parquet files discovered for a SignalK path, runs it on
 * an in-memory DuckDB instance, caches result sets for five minutes, and derives
 * summaries, correlation inputs and anomaly baselines from the returned rows.
 */
class DataQueryService {
    /**
     * @param {object} app - host application object; this class calls `app.debug` and
     *   `app.error` and forwards `app` to path discovery.
     * @param {string} outputDirectory - root directory passed to path discovery.
     * @param {*} vesselContext - stored on the instance; not read by this class.
     */
    constructor(app, outputDirectory, vesselContext) {
        this.queryCache = new Map();
        // Lookup counters backing getCacheStats().
        this.cacheHits = 0;
        this.cacheMisses = 0;
        this.app = app;
        this.outputDirectory = outputDirectory;
        this.vesselContext = vesselContext;
    }
    /**
     * Get sampled data for AI analysis, balancing representativeness against
     * model token limits.
     *
     * @param {object} request - `dataPath`, optional `timeRange` ({start, end}),
     *   optional `sampleSize`, optional `strategy` ('systematic' | 'recent' |
     *   'representative' | 'random'; anything else falls back to random) and optional
     *   `disableSampling`.
     * @returns {Promise<{data: object[], summary: object}>} rows plus their summary.
     * @throws rethrows any query error after logging it through `app.error`.
     */
    async getSampleData(request) {
        const startTime = Date.now();
        try {
            const files = await this.findDataFiles(request.dataPath, request.timeRange);
            if (files.length === 0) {
                return {
                    data: [],
                    summary: this.createEmptySummary(startTime, 0, 0)
                };
            }
            // Total row count in range, reported as "rows scanned".
            const countQuery = this.buildCountQuery(files, request.timeRange);
            const totalCount = await this.executeCountQuery(countQuery);
            let query;
            // Sampling disabled or no sample size: return every row in range.
            if (request.disableSampling || !request.sampleSize) {
                query = this.buildAllDataQuery(files, request.timeRange);
            }
            else {
                switch (request.strategy) {
                    case 'systematic':
                        query = this.buildSystematicSampleQuery(files, request.sampleSize, request.timeRange);
                        break;
                    case 'recent':
                        query = this.buildRecentSampleQuery(files, request.sampleSize, request.timeRange);
                        break;
                    case 'representative':
                        query = this.buildRepresentativeSampleQuery(files, request.sampleSize, request.timeRange);
                        break;
                    case 'random':
                    default:
                        query = this.buildRandomSampleQuery(files, request.sampleSize, request.timeRange);
                }
            }
            const data = await this.executeDuckDBQuery(query);
            const summary = await this.generateEnhancedSummary(data, startTime, totalCount, data.length, {
                originalCount: totalCount,
                sampleCount: data.length,
                samplingStrategy: request.strategy
            });
            this.app.debug(`Sample query completed: ${data.length}/${totalCount} records in ${Date.now() - startTime}ms`);
            return { data, summary };
        }
        catch (error) {
            this.app.error(`Sample data query failed: ${error.message}`);
            throw error;
        }
    }
    /**
     * Generate a data summary for analysis context.
     *
     * Column analysis and statistics are computed from a random sample of up to 100
     * rows; `performanceMetrics.rowsScanned` is the full record count in range.
     *
     * @param {string} dataPath - SignalK path to summarise.
     * @param {{start: Date, end: Date}} [timeRange] - optional time filter.
     * @returns {Promise<object>} summary object (empty summary when no files match).
     * @throws rethrows any query error after logging it through `app.error`.
     */
    async getDataSummary(dataPath, timeRange) {
        const startTime = Date.now();
        try {
            const files = await this.findDataFiles(dataPath, timeRange);
            if (files.length === 0) {
                return this.createEmptySummary(startTime, 0, 0);
            }
            const summaryRows = await this.executeDuckDBQuery(this.buildSummaryQuery(files, timeRange));
            // The aggregate query yields a single row; its record_count (not the number
            // of result rows) is how many records were scanned.
            const totalRecords = Number(summaryRows[0]?.record_count ?? 0);
            const sampleData = await this.executeDuckDBQuery(this.buildRandomSampleQuery(files, 100, timeRange));
            const summary = await this.generateEnhancedSummary(sampleData, startTime, totalRecords, sampleData.length);
            this.app.debug(`Data summary completed for ${dataPath}: ${summary.rowCount} records`);
            return summary;
        }
        catch (error) {
            this.app.error(`Data summary failed: ${error.message}`);
            throw error;
        }
    }
    /**
     * Prepare correlation data across multiple paths.
     *
     * Each path is sampled systematically (1000 rows at most) over the same range.
     *
     * @param {string[]} paths - SignalK paths to correlate.
     * @param {{start: Date, end: Date}} [timeRange] - optional time filter.
     * @returns {Promise<object>} `{ paths, data, timeAlignment, correlationMatrix }`.
     * @throws rethrows any query error after logging it through `app.error`.
     */
    async getCorrelationData(paths, timeRange) {
        try {
            this.app.debug(`Preparing correlation data for paths: ${paths.join(', ')}`);
            const correlationData = {};
            for (const dataPath of paths) {
                const sampleRequest = {
                    dataPath,
                    sampleSize: 1000, // Limit for correlation analysis
                    timeRange,
                    strategy: 'systematic'
                };
                const { data } = await this.getSampleData(sampleRequest);
                correlationData[dataPath] = data;
            }
            // NOTE(review): alignTimeSeriesData currently returns its input unchanged, so
            // 'interpolated' below describes the intent rather than what is performed.
            const alignedData = this.alignTimeSeriesData(correlationData);
            return {
                paths,
                data: alignedData,
                timeAlignment: 'interpolated',
                correlationMatrix: this.calculateCorrelationMatrix(alignedData)
            };
        }
        catch (error) {
            this.app.error(`Correlation data preparation failed: ${error.message}`);
            throw error;
        }
    }
    /**
     * Get baseline statistics for anomaly detection.
     *
     * @param {object} request - `dataPath`, `baselineRange` ({start, end}) and
     *   `threshold` (multiplier applied to the standard deviation).
     * @returns {Promise<object[]>} result of calculateStatisticalBaseline.
     * @throws rethrows any query error after logging it through `app.error`.
     */
    async getAnomalyBaselineData(request) {
        try {
            const baselineRequest = {
                dataPath: request.dataPath,
                sampleSize: 2000, // Larger sample for baseline
                timeRange: request.baselineRange,
                strategy: 'systematic'
            };
            const { data } = await this.getSampleData(baselineRequest);
            return this.calculateStatisticalBaseline(data, request.threshold);
        }
        catch (error) {
            this.app.error(`Baseline data calculation failed: ${error.message}`);
            throw error;
        }
    }
    /**
     * Find the Parquet files stored for a SignalK path.
     *
     * Best effort: discovery failures are logged through `app.error` and yield an
     * empty list instead of throwing.
     *
     * @param {string} dataPath - SignalK path to look up.
     * @param {{start: Date, end: Date}} [timeRange] - unused here; time filtering
     *   happens in SQL.
     * @returns {Promise<string[]>} matching `*.parquet` files, or `[]`.
     */
    async findDataFiles(dataPath, timeRange) {
        try {
            // Use the existing working path discovery logic
            const { getAvailablePaths } = await Promise.resolve().then(() => __importStar(require('./utils/path-discovery')));
            this.app.debug(`🔍 Using existing path discovery for: ${dataPath}`);
            this.app.debug(`🔍 Output directory: ${this.outputDirectory}`);
            const availablePaths = getAvailablePaths(this.outputDirectory, this.app);
            this.app.debug(`🔍 Found ${availablePaths.length} available data paths:`);
            for (const pathInfo of availablePaths) {
                this.app.debug(` ${pathInfo.path} -> ${pathInfo.directory} (${pathInfo.fileCount} files)`);
            }
            const matchingPath = availablePaths.find(pathInfo => pathInfo.path === dataPath);
            if (!matchingPath) {
                this.app.debug(`🔍 No matching path found for: ${dataPath}`);
                this.app.debug(`🔍 Available paths: ${availablePaths.map(p => p.path).join(', ')}`);
                return [];
            }
            // Search relative to the directory instead of embedding it in the pattern:
            // glob patterns treat "\" as an escape and "[", "*" etc. as metacharacters,
            // so a joined native path can silently match nothing.
            const files = await (0, glob_1.glob)('*.parquet', { cwd: matchingPath.directory, absolute: true });
            this.app.debug(`🔍 Found ${files.length} parquet files in ${matchingPath.directory}`);
            if (files.length > 0) {
                this.app.debug(`🔍 Files: ${files.slice(0, 3).join(', ')}${files.length > 3 ? ` ... and ${files.length - 3} more` : ''}`);
            }
            // Return all files - time range filtering will be done in SQL
            return files;
        }
        catch (error) {
            this.app.error(`File discovery failed for ${dataPath}: ${error.message}`);
            return [];
        }
    }
    /**
     * Render a value as a single-quoted SQL string literal, doubling embedded quotes.
     * @param {*} value - value to quote (stringified first).
     * @returns {string} SQL literal including the surrounding quotes.
     */
    quoteSqlString(value) {
        return `'${String(value).replace(/'/g, "''")}'`;
    }
    /**
     * Build the `read_parquet([...], union_by_name=True)` table expression.
     * @param {string[]} files - Parquet file paths.
     * @returns {string} SQL fragment usable after FROM.
     */
    buildParquetSource(files) {
        const fileList = files.map(file => this.quoteSqlString(file)).join(', ');
        return `read_parquet([${fileList}], union_by_name=True)`;
    }
    /**
     * Build the optional WHERE clause restricting `received_timestamp` to a range.
     * @param {{start: Date|string|number, end: Date|string|number}} [timeRange] -
     *   bounds; non-Date values are passed through `new Date(...)`.
     * @returns {string} `' WHERE ...'` (leading space included) or `''`.
     * @throws {RangeError} from `toISOString()` when a bound is not a valid date.
     */
    buildTimeRangeClause(timeRange) {
        if (!timeRange) {
            return '';
        }
        const toIso = (bound) => (bound instanceof Date ? bound : new Date(bound)).toISOString();
        return ` WHERE received_timestamp >= '${toIso(timeRange.start)}'` +
            ` AND received_timestamp <= '${toIso(timeRange.end)}'`;
    }
    /**
     * Coerce a requested sample size to a positive integer before it is spliced into SQL.
     * @param {*} sampleSize - requested number of rows.
     * @returns {number} the size rounded down to an integer.
     * @throws {RangeError} when the value is not a finite number >= 1.
     */
    normalizeSampleSize(sampleSize) {
        const size = Math.floor(Number(sampleSize));
        if (!Number.isFinite(size) || size < 1) {
            throw new RangeError(`sampleSize must be a positive number, got: ${String(sampleSize)}`);
        }
        return size;
    }
    /**
     * Build query to get all data without sampling.
     * @param {string[]} files
     * @param {{start: Date, end: Date}} [timeRange]
     * @returns {string} SQL text.
     */
    buildAllDataQuery(files, timeRange) {
        // No ORDER BY to avoid any potential query optimization limits
        return `SELECT * FROM ${this.buildParquetSource(files)}${this.buildTimeRangeClause(timeRange)}`;
    }
    /**
     * Build count query; the count is returned in column `total_count`.
     * @param {string[]} files
     * @param {{start: Date, end: Date}} [timeRange]
     * @returns {string} SQL text.
     */
    buildCountQuery(files, timeRange) {
        return `SELECT COUNT(*) as total_count FROM ${this.buildParquetSource(files)}${this.buildTimeRangeClause(timeRange)}`;
    }
    /**
     * Build systematic sampling query: rows are numbered by `received_timestamp`,
     * split into `sampleSize` buckets, and the first row of each bucket is kept.
     * The helper columns `row_num`, `total_rows` and `bucket` are part of the result.
     * @param {string[]} files
     * @param {number} sampleSize
     * @param {{start: Date, end: Date}} [timeRange]
     * @returns {string} SQL text.
     */
    buildSystematicSampleQuery(files, sampleSize, timeRange) {
        const size = this.normalizeSampleSize(sampleSize);
        return `
            WITH numbered_data AS (
                SELECT *,
                    ROW_NUMBER() OVER (ORDER BY received_timestamp) as row_num,
                    COUNT(*) OVER () as total_rows
                FROM ${this.buildParquetSource(files)}${this.buildTimeRangeClause(timeRange)}
            ),
            sampled_data AS (
                SELECT *,
                    CAST((row_num - 1) * ${size} / total_rows AS INTEGER) as bucket
                FROM numbered_data
            )
            SELECT DISTINCT ON (bucket) *
            FROM sampled_data
            ORDER BY bucket, row_num
            LIMIT ${size}`;
    }
    /**
     * Build recent sampling query - the newest `sampleSize` rows by `received_timestamp`.
     * @param {string[]} files
     * @param {number} sampleSize
     * @param {{start: Date, end: Date}} [timeRange]
     * @returns {string} SQL text.
     */
    buildRecentSampleQuery(files, sampleSize, timeRange) {
        const size = this.normalizeSampleSize(sampleSize);
        return `SELECT * FROM ${this.buildParquetSource(files)}${this.buildTimeRangeClause(timeRange)}` +
            ` ORDER BY received_timestamp DESC LIMIT ${size}`;
    }
    /**
     * Build representative sampling query - time-stratified: rows are split into at
     * most 20 NTILE buckets by `received_timestamp` and a random subset is taken from
     * each. The helper columns `time_bucket` and `bucket_row` are part of the result.
     * @param {string[]} files
     * @param {number} sampleSize
     * @param {{start: Date, end: Date}} [timeRange]
     * @returns {string} SQL text.
     */
    buildRepresentativeSampleQuery(files, sampleSize, timeRange) {
        const size = this.normalizeSampleSize(sampleSize);
        const bucketCount = Math.min(size, 20);
        const rowsPerBucket = Math.ceil(size / bucketCount);
        return `
            WITH time_buckets AS (
                SELECT *,
                    NTILE(${bucketCount}) OVER (ORDER BY received_timestamp) as time_bucket
                FROM ${this.buildParquetSource(files)}${this.buildTimeRangeClause(timeRange)}
            ),
            sampled_per_bucket AS (
                SELECT *, ROW_NUMBER() OVER (PARTITION BY time_bucket ORDER BY RANDOM()) as bucket_row
                FROM time_buckets
            )
            SELECT * FROM sampled_per_bucket
            WHERE bucket_row <= ${rowsPerBucket}
            ORDER BY received_timestamp
            LIMIT ${size}`;
    }
    /**
     * Build random sampling query.
     * @param {string[]} files
     * @param {number} sampleSize
     * @param {{start: Date, end: Date}} [timeRange]
     * @returns {string} SQL text.
     */
    buildRandomSampleQuery(files, sampleSize, timeRange) {
        const size = this.normalizeSampleSize(sampleSize);
        return `SELECT * FROM ${this.buildParquetSource(files)}${this.buildTimeRangeClause(timeRange)}` +
            ` ORDER BY RANDOM() LIMIT ${size}`;
    }
    /**
     * Build summary statistics query (single aggregate row).
     * @param {string[]} files
     * @param {{start: Date, end: Date}} [timeRange]
     * @returns {string} SQL text.
     */
    buildSummaryQuery(files, timeRange) {
        return `
            SELECT
                COUNT(*) as record_count,
                MIN(received_timestamp) as min_timestamp,
                MAX(received_timestamp) as max_timestamp,
                COUNT(DISTINCT path) as unique_paths,
                AVG(CASE WHEN typeof(value) = 'DOUBLE' THEN value ELSE NULL END) as avg_numeric_value,
                MIN(CASE WHEN typeof(value) = 'DOUBLE' THEN value ELSE NULL END) as min_numeric_value,
                MAX(CASE WHEN typeof(value) = 'DOUBLE' THEN value ELSE NULL END) as max_numeric_value
            FROM ${this.buildParquetSource(files)}${this.buildTimeRangeClause(timeRange)}`;
    }
    /**
     * Execute a DuckDB query and return its rows as objects.
     *
     * Results are cached for five minutes. Because random-sampling queries are cached
     * too, repeating one inside that window returns the same sample.
     *
     * @param {string} query - SQL text.
     * @returns {Promise<object[]>} result rows.
     * @throws rethrows any DuckDB error after logging it through `app.error`.
     */
    async executeDuckDBQuery(query) {
        const cacheKey = this.generateCacheKey(query);
        const cached = this.queryCache.get(cacheKey);
        if (cached) {
            const isFresh = Date.now() - cached.timestamp < cached.ttl;
            // The key is only a 32-bit hash, so confirm the entry belongs to this query.
            if (isFresh && cached.query === query) {
                this.cacheHits += 1;
                this.app.debug(`Query cache hit: ${cacheKey.substring(0, 32)}...`);
                return cached.result;
            }
            if (!isFresh) {
                this.queryCache.delete(cacheKey);
            }
        }
        this.cacheMisses += 1;
        let connection;
        try {
            const db = await duckdb.DuckDBInstance.create(':memory:');
            connection = await db.connect();
            const result = await connection.runAndReadAll(query);
            const rows = result.getRowObjects();
            this.pruneExpiredCacheEntries();
            // Cache result for 5 minutes
            this.queryCache.set(cacheKey, {
                query,
                result: rows,
                timestamp: Date.now(),
                ttl: 5 * 60 * 1000
            });
            return rows;
        }
        catch (error) {
            this.app.error(`DuckDB query failed: ${error.message}`);
            this.app.debug(`Failed query: ${query}`);
            throw error;
        }
        finally {
            // Release the connection on failure as well as on success.
            if (connection) {
                connection.disconnectSync();
            }
        }
    }
    /**
     * Drop cache entries whose time-to-live has elapsed so the cache cannot grow
     * without bound.
     */
    pruneExpiredCacheEntries() {
        const now = Date.now();
        for (const [key, entry] of this.queryCache) {
            if (now - entry.timestamp >= entry.ttl) {
                this.queryCache.delete(key);
            }
        }
    }
    /**
     * Execute a count query and return its `total_count` as a plain number.
     * @param {string} query - SQL producing a `total_count` column.
     * @returns {Promise<number>} the count, or 0 when no row is returned.
     */
    async executeCountQuery(query) {
        const [row] = await this.executeDuckDBQuery(query);
        // COUNT(*) may arrive as a bigint (the DuckDB Node API does this for BIGINT);
        // convert so the value stays JSON-serialisable and usable in arithmetic.
        return Number(row?.total_count ?? 0);
    }
    /**
     * Generate enhanced data summary with performance metrics.
     *
     * @param {object[]} data - rows to summarise.
     * @param {number} startTime - epoch ms when the operation started.
     * @param {number} totalScanned - reported as `performanceMetrics.rowsScanned`.
     * @param {number} totalReturned - reported as `performanceMetrics.rowsReturned`.
     * @param {object} [samplingInfo] - passed through unchanged.
     * @returns {Promise<object>} summary object.
     */
    async generateEnhancedSummary(data, startTime, totalScanned, totalReturned, samplingInfo) {
        const executionTime = Date.now() - startTime;
        if (data.length === 0) {
            return this.createEmptySummary(startTime, totalScanned, totalReturned, samplingInfo);
        }
        // Earliest/latest valid timestamp, compared numerically in one pass.
        let earliest = Infinity;
        let latest = -Infinity;
        for (const record of data) {
            const ms = new Date(record.received_timestamp).getTime();
            if (Number.isNaN(ms)) {
                continue;
            }
            if (ms < earliest) {
                earliest = ms;
            }
            if (ms > latest) {
                latest = ms;
            }
        }
        // Falls back to "now" for both bounds when no row has a parseable timestamp.
        const timeRange = Number.isFinite(earliest)
            ? { start: new Date(earliest), end: new Date(latest) }
            : { start: new Date(), end: new Date() };
        const columns = this.analyzeColumns(data);
        const statisticalSummary = this.calculateStatistics(data, columns);
        const dataQuality = this.calculateDataQuality(data, columns);
        return {
            rowCount: data.length,
            timeRange,
            columns,
            statisticalSummary,
            dataQuality,
            performanceMetrics: {
                executionTime,
                rowsScanned: totalScanned,
                rowsReturned: totalReturned,
                cacheHit: false // Would be true if from cache
            },
            samplingInfo
        };
    }
    /**
     * Analyze data columns: inferred type, null count, distinct count, first 5 values.
     * @param {object[]} data - rows.
     * @returns {object[]} one descriptor per key seen in any row.
     */
    analyzeColumns(data) {
        const allKeys = new Set();
        for (const record of data) {
            for (const key of Object.keys(record)) {
                allKeys.add(key);
            }
        }
        return Array.from(allKeys, key => {
            const values = data.map(d => d[key]).filter(v => v !== null && v !== undefined);
            return {
                name: key,
                type: this.inferDataType(values),
                nullCount: data.length - values.length,
                uniqueCount: new Set(values).size,
                sampleValues: values.slice(0, 5)
            };
        });
    }
    /**
     * Infer data type from values.
     * @param {Array} values - non-null values of one column.
     * @returns {string} 'unknown' for no values; 'number' or 'boolean' when any value
     *   has that JS type (number wins); 'datetime' when the first value is a Date;
     *   otherwise 'string'.
     */
    inferDataType(values) {
        if (values.length === 0) {
            return 'unknown';
        }
        const types = new Set(values.map(v => typeof v));
        if (types.has('number')) {
            return 'number';
        }
        if (types.has('boolean')) {
            return 'boolean';
        }
        if (types.has('object') && values[0] instanceof Date) {
            return 'datetime';
        }
        return 'string';
    }
    /**
     * Calculate statistical summaries per column.
     *
     * Numeric columns get count, mean, median, min, max and population standard
     * deviation (omitted entirely when no numeric value is present); other columns
     * get count, min and max.
     *
     * @param {object[]} data - rows.
     * @param {object[]} columns - descriptors with `name` and `type`.
     * @returns {object} statistics keyed by column name.
     */
    calculateStatistics(data, columns) {
        const stats = {};
        for (const col of columns) {
            const values = data.map(d => d[col.name]).filter(v => v !== null && v !== undefined);
            if (col.type !== 'number') {
                const { min, max } = this.findExtremes(values);
                stats[col.name] = { count: values.length, min, max };
                continue;
            }
            // NaN would make the comparator inconsistent, so it is excluded.
            const numericValues = values
                .filter(v => typeof v === 'number' && !Number.isNaN(v))
                .sort((a, b) => a - b);
            const count = numericValues.length;
            if (count === 0) {
                continue;
            }
            const mean = numericValues.reduce((a, b) => a + b, 0) / count;
            const variance = numericValues.reduce((acc, val) => acc + Math.pow(val - mean, 2), 0) / count;
            const mid = Math.floor(count / 2);
            // Even-length input: median is the average of the two middle values.
            const median = count % 2 === 0
                ? (numericValues[mid - 1] + numericValues[mid]) / 2
                : numericValues[mid];
            stats[col.name] = {
                count,
                mean,
                median,
                min: numericValues[0],
                max: numericValues[count - 1],
                stdDev: Math.sqrt(variance)
            };
        }
        return stats;
    }
    /**
     * Find the smallest and largest value of a non-numeric column.
     *
     * Values are compared with `<` / `>` when they are all strings, all bigints or
     * all Dates. Other inputs have no meaningful order, so the first and last values
     * are returned instead.
     *
     * @param {Array} values - non-null column values.
     * @returns {{min: *, max: *}} extremes (both undefined for an empty array).
     */
    findExtremes(values) {
        if (values.length === 0) {
            return { min: undefined, max: undefined };
        }
        const first = values[0];
        let sameKind = null;
        if (typeof first === 'string' || typeof first === 'bigint') {
            const kind = typeof first;
            sameKind = v => typeof v === kind;
        }
        else if (first instanceof Date) {
            sameKind = v => v instanceof Date;
        }
        if (!sameKind || !values.every(sameKind)) {
            return { min: first, max: values[values.length - 1] };
        }
        let min = first;
        let max = first;
        for (const value of values) {
            if (value < min) {
                min = value;
            }
            if (value > max) {
                max = value;
            }
        }
        return { min, max };
    }
    /**
     * Calculate data quality metrics.
     * @param {object[]} data - rows.
     * @param {object[]} columns - descriptors carrying `nullCount`.
     * @returns {{completeness: number, consistency: number, timeliness: number, accuracy: number}}
     *   `completeness` is the percentage of non-null fields; `consistency` and
     *   `accuracy` are fixed placeholder values, not measurements.
     */
    calculateDataQuality(data, columns) {
        const totalFields = data.length * columns.length;
        const nullFields = columns.reduce((sum, col) => sum + col.nullCount, 0);
        const completeness = totalFields > 0 ? ((totalFields - nullFields) / totalFields) * 100 : 0;
        const timeliness = this.calculateTimeliness(data);
        return {
            completeness,
            consistency: 85, // Simplified metric
            timeliness,
            accuracy: 90 // Simplified metric
        };
    }
    /**
     * Calculate timeliness metric from the newest `received_timestamp`.
     * @param {object[]} data - rows.
     * @returns {number} 100 minus two points per hour of age, clamped to [0, 100];
     *   0 when there is no row with a parseable timestamp.
     */
    calculateTimeliness(data) {
        // Running maximum instead of Math.max(...array): spreading a very large array
        // into a call can exceed the engine's argument limit.
        let latest = -Infinity;
        for (const record of data) {
            const ms = new Date(record.received_timestamp).getTime();
            if (!Number.isNaN(ms) && ms > latest) {
                latest = ms;
            }
        }
        if (!Number.isFinite(latest)) {
            return 0;
        }
        const ageHours = (Date.now() - latest) / (1000 * 60 * 60);
        // Upper clamp keeps future-dated records from scoring above 100.
        return Math.min(100, Math.max(0, 100 - (ageHours * 2)));
    }
    /**
     * Create an empty summary (no files / no rows).
     * @param {number} startTime - epoch ms when the operation started.
     * @param {number} scanned - reported as rowsScanned.
     * @param {number} returned - reported as rowsReturned.
     * @param {object} [samplingInfo] - passed through unchanged.
     * @returns {object} summary with zeroed metrics and "now" as its time range.
     */
    createEmptySummary(startTime, scanned, returned, samplingInfo) {
        return {
            rowCount: 0,
            timeRange: { start: new Date(), end: new Date() },
            columns: [],
            statisticalSummary: {},
            dataQuality: {
                completeness: 0,
                consistency: 0,
                timeliness: 0,
                accuracy: 0
            },
            performanceMetrics: {
                executionTime: Date.now() - startTime,
                rowsScanned: scanned,
                rowsReturned: returned,
                cacheHit: false
            },
            samplingInfo
        };
    }
    /**
     * Generate cache key for query caching: a 32-bit rolling hash (hash * 31 + char
     * code) of the query text, rendered in base 36.
     * @param {string} query - SQL text.
     * @returns {string} hash key; distinct queries can share a key.
     */
    generateCacheKey(query) {
        let hash = 0;
        for (let i = 0; i < query.length; i++) {
            const char = query.charCodeAt(i);
            hash = ((hash << 5) - hash) + char;
            hash = hash & hash; // Convert to 32-bit integer
        }
        return hash.toString(36);
    }
    /**
     * Align time series data for correlation analysis.
     * Currently a pass-through: the input is returned unchanged.
     * @param {object} data - rows keyed by path.
     * @returns {object} the same object.
     */
    alignTimeSeriesData(data) {
        // Simplified time alignment - would implement proper interpolation in production
        return data;
    }
    /**
     * Calculate the correlation matrix between datasets.
     *
     * Uses the finite numeric `value` field of each row, in the order given. Series
     * are paired by relative position (see pearsonCorrelation), which only
     * approximates time alignment.
     *
     * @param {object} data - rows keyed by path.
     * @returns {number[][]} square matrix in `Object.keys(data)` order; the diagonal
     *   is 1 and undetermined pairs are 0.
     */
    calculateCorrelationMatrix(data) {
        const paths = Object.keys(data);
        const series = paths.map(key => (Array.isArray(data[key]) ? data[key] : [])
            .map(record => (record == null ? undefined : record.value))
            .filter(value => typeof value === 'number' && Number.isFinite(value)));
        return paths.map((_, i) => paths.map((__, j) => (i === j ? 1.0 : this.pearsonCorrelation(series[i], series[j]))));
    }
    /**
     * Pearson correlation coefficient of two numeric series.
     *
     * When the lengths differ, both series are resampled to the shorter length by
     * relative position so that pairs come from comparable points in each sequence.
     *
     * @param {number[]} xs
     * @param {number[]} ys
     * @returns {number} coefficient in [-1, 1]; 0 when fewer than two pairs exist or
     *   either series has no variance.
     */
    pearsonCorrelation(xs, ys) {
        const n = Math.min(xs.length, ys.length);
        if (n < 2) {
            return 0;
        }
        const pick = (arr, k) => arr[Math.floor((k * arr.length) / n)];
        let sumX = 0;
        let sumY = 0;
        for (let k = 0; k < n; k++) {
            sumX += pick(xs, k);
            sumY += pick(ys, k);
        }
        const meanX = sumX / n;
        const meanY = sumY / n;
        let covariance = 0;
        let varianceX = 0;
        let varianceY = 0;
        for (let k = 0; k < n; k++) {
            const dx = pick(xs, k) - meanX;
            const dy = pick(ys, k) - meanY;
            covariance += dx * dy;
            varianceX += dx * dx;
            varianceY += dy * dy;
        }
        const denominator = Math.sqrt(varianceX * varianceY);
        return denominator === 0 ? 0 : covariance / denominator;
    }
    /**
     * Calculate statistical baseline for anomaly detection from the numeric `value`
     * field of the rows.
     * @param {object[]} data - rows.
     * @param {number} threshold - multiplier applied to the standard deviation to
     *   form the lower/upper bounds.
     * @returns {object[]} `[]` when no numeric value exists, otherwise a one-element
     *   array with mean, population stdDev, bounds and p1/p5/p95/p99.
     */
    calculateStatisticalBaseline(data, threshold) {
        const numericValues = data
            .map(d => d.value)
            .filter(v => typeof v === 'number')
            .sort((a, b) => a - b);
        const count = numericValues.length;
        if (count === 0) {
            return [];
        }
        const mean = numericValues.reduce((a, b) => a + b, 0) / count;
        const stdDev = Math.sqrt(numericValues.reduce((acc, val) => acc + Math.pow(val - mean, 2), 0) / count);
        // Percentile by index into the sorted values (no interpolation).
        const percentile = (fraction) => numericValues[Math.floor(count * fraction)];
        return [{
                mean,
                stdDev,
                lowerBound: mean - (threshold * stdDev),
                upperBound: mean + (threshold * stdDev),
                percentiles: {
                    p1: percentile(0.01),
                    p5: percentile(0.05),
                    p95: percentile(0.95),
                    p99: percentile(0.99)
                }
            }];
    }
    /**
     * Clear query cache. Hit/miss counters are left untouched.
     */
    clearCache() {
        this.queryCache.clear();
        this.app.debug('Query cache cleared');
    }
    /**
     * Get cache statistics.
     * @returns {{size: number, hitRate: number}} entry count and the fraction of
     *   executeDuckDBQuery lookups served from cache (0 before any lookup).
     */
    getCacheStats() {
        const lookups = this.cacheHits + this.cacheMisses;
        return {
            size: this.queryCache.size,
            hitRate: lookups === 0 ? 0 : this.cacheHits / lookups
        };
    }
}
exports.DataQueryService = DataQueryService;
//# sourceMappingURL=data-query-service.js.map