signalk-parquet
Version:
SignalK plugin to save marine data directly to Parquet files with regimen-based control
1,239 lines (1,053 loc) • 124 kB
text/typescript
import Anthropic from '@anthropic-ai/sdk';
import { ServerAPI } from '@signalk/server-api';
import { DataRecord, PluginState } from './types';
import { VesselContextManager } from './vessel-context';
import { getAvailablePaths } from './utils/path-discovery';
import * as fs from 'fs-extra';
import * as path from 'path';
import { DuckDBInstance } from '@duckdb/node-api';
// Claude AI Integration Types
/**
 * Connection and generation settings used to construct a ClaudeAnalyzer.
 */
export interface ClaudeAnalyzerConfig {
/** Anthropic API key; the ClaudeAnalyzer constructor throws when this is empty. */
apiKey: string;
/** Model identifier sent as `model` in the request built by analyzeData. */
model: 'claude-opus-4-1-20250805' | 'claude-opus-4-20250514' | 'claude-sonnet-4-20250514';
/** Forwarded as `max_tokens` in the request built by analyzeData. */
maxTokens: number;
/** Forwarded as `temperature` in the request built by analyzeData. */
temperature: number;
}
/**
 * Describes one analysis run requested from ClaudeAnalyzer.analyzeData.
 */
export interface AnalysisRequest {
/** One SignalK path, or several separated by commas (the loader splits on ','). */
dataPath: string;
/**
 * Selects the instruction section appended to the sampling-mode prompt.
 * 'correlation' has no dedicated section there and uses the generic request.
 */
analysisType: 'summary' | 'anomaly' | 'trend' | 'correlation' | 'custom';
/**
 * Window to analyze. When omitted, sampling mode queries the last 48 hours and
 * database-access mode tells the model to focus on the last 6 hours.
 */
timeRange?: { start: Date; end: Date };
/** Free-text request; used as the analysis request for 'custom' and in database-access mode. */
customPrompt?: string;
/** NOTE(review): not read by any code visible in this part of the file — confirm its consumer. */
context?: Record<string, any>;
/** Appended to each path as `path:method` unless it is 'average' (treated as the default). */
aggregationMethod?: string;
/** Forwarded as the `resolution` query parameter when non-empty; otherwise the history API chooses. */
resolution?: string;
/** When true, analyzeData routes to analyzeWithDatabaseAccess instead of the sampling flow. */
useDatabaseAccess?: boolean;
}
/**
 * A follow-up question tied to an earlier conversation.
 * NOTE(review): the code that consumes this type is outside the visible part
 * of the file — confirm how conversationId is resolved.
 */
export interface FollowUpRequest {
/** Identifier of the conversation the question belongs to. */
conversationId: string;
/** The follow-up question text. */
question: string;
}
/**
 * One anomaly reported by the model. The shape mirrors the `anomalies` entries
 * requested in the RESPONSE FORMAT section of the analysis prompt.
 */
export interface AnomalyDetection {
/** When the anomalous value occurred (the prompt asks for ISO 8601). */
timestamp: string;
/** The observed value as reported by the model. */
value: any;
/** Range the model considered normal for this value. */
expectedRange: { min: number; max: number };
/** Severity bucket assigned by the model. */
severity: 'low' | 'medium' | 'high';
/** Description of the anomaly (the anomaly prompt also asks for a potential cause). */
description: string;
/** Model-reported confidence; the prompt's example uses 0.9, i.e. presumably a 0..1 scale. */
confidence: number;
}
/**
 * Structured result of an analysis run; also the document written to the
 * analysis-history directory.
 */
export interface AnalysisResponse {
/** Generated as `analysis_<epoch ms>_<random suffix>`; doubles as the history file name. */
id: string;
/** Main analysis text (the raw model text when no JSON could be parsed). */
analysis: string;
/** Key findings; bullet points extracted from the text when the model did not return JSON. */
insights: string[];
/** Actionable recommendations, if any. */
recommendations?: string[];
/** Anomalies reported by the model, if any. */
anomalies?: AnomalyDetection[];
/** Model-reported confidence; parseAnalysisResponse falls back to 0.8. */
confidence: number;
/** Free-text data quality assessment. */
dataQuality: string;
/** ISO timestamp of when the response object was built. */
timestamp: string;
/** Token usage. NOTE(review): not populated by parseAnalysisResponse in the visible code. */
usage?: {
input_tokens: number;
output_tokens: number;
};
/** Echo of the request parameters plus the size of the analyzed data set. */
metadata: {
dataPath: string;
analysisType: string;
/** Number of records loaded before sampling (0 when unknown). */
recordCount: number;
timeRange?: { start: Date; end: Date };
useDatabaseAccess?: boolean;
};
}
/**
 * Statistical overview of a loaded record set, produced by
 * ClaudeAnalyzer.generateDataSummary and embedded in the analysis prompt.
 */
export interface DataSummary {
/** Number of records summarized. */
rowCount: number;
/** Earliest and latest `received_timestamp` in the record set. */
timeRange: { start: Date; end: Date };
/** One entry per distinct key found on any record. */
columns: ColumnInfo[];
/** Per-column statistics keyed by column name. */
statisticalSummary: Record<string, Statistics>;
/** Heuristic quality scores for the record set. */
dataQuality: DataQualityMetrics;
}
/**
 * Per-column profile computed by generateDataSummary.
 */
export interface ColumnInfo {
/** Record key this column was derived from. */
name: string;
/** `typeof` of the first non-null value ('undefined' when the column has none). */
type: string;
/** Number of records where the value is null or undefined. */
nullCount: number;
/** Number of distinct non-null values (Set semantics, so objects compare by reference). */
uniqueCount: number;
/** The first five non-null values, in record order. */
sampleValues: any[];
}
/**
 * Summary statistics for one column. mean/median/stdDev are only filled in for
 * columns whose detected type is 'number'.
 */
export interface Statistics {
/** Number of values the statistics were computed from. */
count: number;
/** Arithmetic mean (numeric columns only). */
mean?: number;
/** Median (numeric columns only). */
median?: number;
/** Smallest value for numeric columns; for other columns, the first sample value. */
min?: any;
/** Largest value for numeric columns; for other columns, the last sample value. */
max?: any;
/** Population standard deviation, i.e. variance divided by n (numeric columns only). */
stdDev?: number;
}
/**
 * Heuristic quality scores expressed as percentages. In calculateDataQuality
 * only completeness and timeliness are computed from the data; consistency and
 * accuracy are fixed placeholder values (85 and 90).
 */
export interface DataQualityMetrics {
completeness: number; // Percentage of non-null values
consistency: number; // Data format consistency
timeliness: number; // Data freshness
accuracy: number; // Estimated data accuracy
}
/**
 * Optional filters for listing available SignalK paths.
 * NOTE(review): the code that applies these filters is outside the visible
 * part of the file; the per-field notes below are the original author's.
 */
export interface AvailablePathsFilter {
vesselContext?: string; // 'vessels.self', 'vessels.*', 'vessels.urn:mrn:...'
pathPattern?: string; // regex pattern for path filtering
source?: string; // filter by data source
hasValue?: boolean; // only paths with current values
includeMetadata?: boolean; // include _sources, meta, etc.
maxDepth?: number; // maximum depth to traverse
}
/**
 * Description of one available SignalK path.
 * NOTE(review): the producer of these objects is outside the visible part of
 * the file; the per-field notes below are the original author's.
 */
export interface AvailablePathInfo {
path: string;
fullPath: string; // complete SignalK path including vessel context
vesselId?: string; // vessel ID if applicable
currentValue?: any; // current value if hasValue=true
source?: string; // data source info
lastUpdate?: string; // last update timestamp
}
/**
 * Runs Claude-based analysis over SignalK vessel data, either on a sample
 * fetched through the history REST API or by letting the model query the
 * database (see `AnalysisRequest.useDatabaseAccess`), and keeps a JSON
 * history of the results under the plugin's data directory.
 */
export class ClaudeAnalyzer {
/** Anthropic SDK client created in the constructor from `config.apiKey`. */
private client: Anthropic;
/** Settings supplied at construction (model, token limit, temperature). */
private config: ClaudeAnalyzerConfig;
/** SignalK server API, used for debug/error logging and the self context; optional. */
private app?: ServerAPI;
/** Base directory under which `analysis-history` is stored; history is disabled without it. */
private dataDirectory?: string;
/** Supplies the vessel context text embedded in prompts. */
private vesselContextManager: VesselContextManager;
/**
 * NOTE(review): not used in the visible part of the file — presumably maps a
 * conversation id to its message list for follow-up questions; confirm.
 */
private activeConversations: Map<string, Array<any>> = new Map();
/** Plugin state handed in at construction; NOTE(review): not read in the visible code. */
private state?: PluginState;
/**
 * Stores the supplied collaborators, creates the vessel context manager and
 * the Anthropic client.
 *
 * @param config API key and generation settings.
 * @param app Optional SignalK server API used for logging and context lookup.
 * @param dataDirectory Optional plugin data directory (needed for history).
 * @param state Optional plugin state.
 * @throws Error when `config.apiKey` is empty.
 */
constructor(config: ClaudeAnalyzerConfig, app?: ServerAPI, dataDirectory?: string, state?: PluginState) {
  // Keep references to everything the caller handed in.
  this.config = config;
  this.app = app;
  this.dataDirectory = dataDirectory;
  this.state = state;

  this.vesselContextManager = new VesselContextManager(app, dataDirectory);

  // Without a key no request can succeed, so refuse to build the client.
  const { apiKey } = config;
  if (!apiKey) {
    throw new Error('Claude API key is required for analysis functionality');
  }

  const defaultHeaders = { 'anthropic-version': '2023-06-01' };
  this.client = new Anthropic({ apiKey, defaultHeaders });
}
/**
 * Main analysis entry point: analyzes data and returns structured insights.
 *
 * With `request.useDatabaseAccess` set, the work is delegated to
 * analyzeWithDatabaseAccess. Otherwise the data is loaded and sampled, a
 * prompt is built, Claude is called once, and the parsed result is saved to
 * the analysis history before being returned.
 *
 * @param request What to analyze and how.
 * @returns The structured analysis result.
 * @throws Error with an `Analysis failed: …` message; an error whose message
 *         already contains that prefix is re-thrown unchanged so the prefix
 *         does not nest.
 */
async analyzeData(request: AnalysisRequest): Promise<AnalysisResponse> {
  try {
    this.app?.debug(`Starting Claude analysis: ${request.analysisType} for ${request.dataPath}${request.useDatabaseAccess ? ' (DATABASE ACCESS MODE)' : ' (SAMPLING MODE)'}`);

    // Route to appropriate analysis system
    if (request.useDatabaseAccess) {
      return await this.analyzeWithDatabaseAccess(request);
    }

    // Legacy system: prepare data, build the prompt, call Claude once.
    const data = await this.prepareDataForAnalysis(request);
    const prompt = this.buildAnalysisPrompt(data, request);
    const response = await this.client.messages.create({
      model: this.config.model,
      max_tokens: this.config.maxTokens,
      temperature: this.config.temperature,
      messages: [{
        role: 'user',
        content: prompt
      }]
    });

    const analysisResult = this.parseAnalysisResponse(response, request, data);

    // History persistence is best-effort; saveAnalysisToHistory never throws.
    await this.saveAnalysisToHistory(analysisResult);
    this.app?.debug(`Claude analysis completed: ${analysisResult.id}`);
    return analysisResult;
  } catch (error) {
    // A thrown value is not guaranteed to be an Error (it may be a string or
    // a plain object), so derive the message defensively instead of reading
    // `.message` blindly, which would crash this handler with a TypeError.
    const errorMessage = error instanceof Error ? error.message : String(error);
    this.app?.error(`Claude analysis failed: ${errorMessage}`);

    // Prevent recursive error messages: an already-prefixed Error is re-thrown as is.
    if (error instanceof Error && errorMessage.includes('Analysis failed:')) {
      throw error;
    }
    throw new Error(`Analysis failed: ${errorMessage}`);
  }
}
/**
 * Convenience wrapper around analyzeData for the predefined analysis types.
 *
 * @param dataPath SignalK path (or comma-separated paths) to analyze.
 * @param analysisType Analysis type name; it is not validated here.
 * @param timeRange Optional window to analyze.
 * @returns The result of analyzeData for the assembled request.
 */
async quickAnalysis(dataPath: string, analysisType: string, timeRange?: { start: Date; end: Date }): Promise<AnalysisResponse> {
  // No sample size is passed along - the data is fetched through the REST API.
  return this.analyzeData({
    dataPath,
    analysisType: analysisType as AnalysisRequest['analysisType'],
    timeRange,
  });
}
/**
 * Runs an 'anomaly' analysis and returns only the anomalies it reported.
 *
 * @param dataPath SignalK path (or comma-separated paths) to analyze.
 * @param timeRange Optional window to analyze.
 * @returns The reported anomalies, or an empty array when there are none.
 */
async detectAnomalies(dataPath: string, timeRange?: { start: Date; end: Date }): Promise<AnomalyDetection[]> {
  const { anomalies } = await this.analyzeData({
    dataPath,
    analysisType: 'anomaly',
    timeRange,
    customPrompt: 'Focus specifically on detecting anomalies and unusual patterns in this maritime data. Return detailed anomaly information.'
  });

  if (anomalies) {
    return anomalies;
  }
  return [];
}
/**
 * Loads the requested records and condenses them for the prompt.
 *
 * @param request The analysis request (path, time range, aggregation, resolution).
 * @returns An object with the statistical `summary`, the `sampleData` subset
 *          and `originalCount`, the number of records before sampling.
 * @throws Re-throws any failure after logging it.
 */
private async prepareDataForAnalysis(request: AnalysisRequest): Promise<any> {
  try {
    const records: any[] = await this.loadDataFromPath(
      request.dataPath,
      request.timeRange,
      request.aggregationMethod,
      request.resolution
    );

    const summary = this.generateDataSummary(records);

    // Large record sets get an even smaller sample to keep the prompt short.
    const isLargeDataset = records.length > 10000;
    const sampleLimit = isLargeDataset ? 20 : 50;
    const sampleData = this.sampleDataForAnalysis(records, sampleLimit);

    return { summary, sampleData, originalCount: records.length };
  } catch (error) {
    this.app?.error(`Failed to prepare data for analysis: ${(error as Error).message}`);
    throw error;
  }
}
/**
 * Loads records for one or more SignalK paths from the local history REST API
 * (`/api/history/values`) and flattens the tabular result into one DataRecord
 * per (row, value column).
 *
 * @param dataPath One path or several separated by commas; empty segments are ignored.
 * @param timeRange Window to query; defaults to the last 48 hours.
 * @param aggregationMethod Appended to each path as `path:method` unless it is
 *        'average', which the history API applies by default.
 * @param resolution Forwarded as the `resolution` parameter when non-empty.
 * @returns The flattened records. At most 10000 rows and 20 value columns of
 *          the API result are converted. On any failure a single placeholder
 *          record with `source: 'fallback-sample'` is returned instead of throwing.
 */
private async loadDataFromPath(dataPath: string, timeRange?: { start: Date; end: Date }, aggregationMethod?: string, resolution?: string): Promise<DataRecord[]> {
  try {
    // Use the existing REST API instead of custom query logic
    const baseUrl = `http://localhost:3000`; // Use default SignalK port

    // HistoryAPI supports "path:aggregateMethod" (e.g. "environment.outside.tempest.observations.solarRadiation:max");
    // the method is applied to every path individually. The callback
    // parameters deliberately avoid the name `path`, which would shadow the
    // imported `path` module, and blank segments (e.g. from a trailing comma)
    // are dropped so they do not turn into bare ":method" entries.
    const pathsWithAggregation = dataPath
      .split(',')
      .map(rawPath => rawPath.trim())
      .filter(trimmedPath => trimmedPath !== '')
      .map(trimmedPath =>
        aggregationMethod && aggregationMethod !== 'average'
          ? `${trimmedPath}:${aggregationMethod}`
          : trimmedPath // 'average' is the default, so no need to specify it
      )
      .join(',');

    // Build query parameters for the history API (only valid parameters).
    // The default window is derived from a single "now" so both ends agree.
    const defaultEnd = new Date();
    const defaultStart = new Date(defaultEnd.getTime() - 48 * 60 * 60 * 1000);
    const params = new URLSearchParams({
      paths: pathsWithAggregation,
      from: (timeRange ? timeRange.start : defaultStart).toISOString(),
      to: (timeRange ? timeRange.end : defaultEnd).toISOString()
    });

    // Set resolution if provided (empty string = auto/let HistoryAPI choose)
    if (resolution && resolution.trim() !== '') {
      params.set('resolution', resolution);
      console.log(`📊 CLAUDE ANALYZER: Using custom resolution: ${resolution}ms`);
    } else {
      console.log(`📊 CLAUDE ANALYZER: Using auto resolution (HistoryAPI will choose optimal bucketing)`);
    }

    const url = `${baseUrl}/api/history/values?${params}`;
    console.log(`🌐 CLAUDE ANALYZER: Making REST API call to ${url}`);
    console.log(`📊 CLAUDE ANALYZER: Using paths "${pathsWithAggregation}" ${aggregationMethod ? `with aggregation method "${aggregationMethod}"` : 'with default aggregation'}`);

    // Make HTTP request to the history API
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }
    const apiResult = await response.json() as any;
    console.log(`📊 CLAUDE ANALYZER (REST API): Raw API result:`, Object.keys(apiResult));

    // Convert API result to DataRecord format
    // API returns: { context, range, values: [{path, method}, ...], data: [[timestamp, value1, value2, ...], ...] }
    const records: DataRecord[] = [];
    if (apiResult.data && Array.isArray(apiResult.data) && apiResult.values && Array.isArray(apiResult.values)) {
      console.log(`🔍 CLAUDE ANALYZER: Processing ${apiResult.data.length} data rows with ${apiResult.values.length} value columns`);
      console.log(`🔍 CLAUDE ANALYZER: Value column info:`, apiResult.values);

      // Safety limits on how much of the result is converted.
      const maxRows = Math.min(apiResult.data.length, 10000);
      const maxCols = Math.min(apiResult.values.length, 20);
      if (maxRows < apiResult.data.length || maxCols < apiResult.values.length) {
        // Make the truncation visible instead of silently dropping data.
        this.app?.debug(`REST API result truncated to ${maxRows} of ${apiResult.data.length} rows and ${maxCols} of ${apiResult.values.length} value columns`);
      }

      for (let rowIndex = 0; rowIndex < maxRows; rowIndex++) {
        const row = apiResult.data[rowIndex];
        if (!Array.isArray(row) || row.length < 2) {
          continue;
        }
        const timestamp = row[0]; // First column is always timestamp

        // Data columns start at index 1; column i is described by values[i - 1].
        for (let colIndex = 1; colIndex < row.length && colIndex <= maxCols; colIndex++) {
          const value = row[colIndex];
          const valueInfo = apiResult.values[colIndex - 1];
          if (rowIndex < 3) {
            console.log(`🔍 Sample row ${rowIndex}, col ${colIndex}: timestamp=${timestamp}, path=${valueInfo?.path}, method=${valueInfo?.method}, value=${value}`);
          }
          records.push({
            received_timestamp: timestamp, // Use the actual timestamp from data as-is
            signalk_timestamp: timestamp, // Use the actual timestamp from data as-is
            path: valueInfo?.path || 'unknown',
            value: typeof value === 'bigint' ? Number(value) : value,
            context: this.app?.selfContext || 'unknown',
            source: 'rest-api',
            source_label: `REST API (${valueInfo?.method || 'default'})`,
            aggregation_method: valueInfo?.method
          } as DataRecord & { aggregation_method?: string });
        }
      }
    } else {
      console.log(`🔍 CLAUDE ANALYZER: No data array found in API result`);
    }

    console.log(`📊 CLAUDE ANALYZER (REST API): Loaded ${records.length} records for analysis from ${pathsWithAggregation}`);
    this.app?.debug(`REST API loaded ${records.length} records for analysis from ${pathsWithAggregation}`);
    return records;
  } catch (error) {
    // The thrown value is not guaranteed to be an Error.
    const message = error instanceof Error ? error.message : String(error);
    this.app?.error(`Failed to load data from ${dataPath}: ${message}`);

    // Fallback to sample data if query fails.
    // NOTE(review): this placeholder (value 0) is indistinguishable from real
    // data once it reaches the prompt, apart from `source`; consider
    // propagating the failure to the caller instead.
    const fallbackTimestamp = new Date().toISOString();
    const sampleData: DataRecord[] = [{
      received_timestamp: fallbackTimestamp,
      signalk_timestamp: fallbackTimestamp,
      context: 'vessels.self',
      path: dataPath,
      value: 0,
      source: 'fallback-sample'
    }];
    return sampleData;
  }
}
/**
 * Builds a statistical summary of the loaded records.
 *
 * For every key found on any record a column profile is produced. Columns
 * whose first non-null value is a number additionally get mean, median,
 * min, max and population standard deviation; other columns only get a count
 * plus their first and last sample values as min/max.
 *
 * @param data Records to summarize.
 * @returns The summary; an all-zero summary with an empty column list when
 *          `data` is empty.
 */
private generateDataSummary(data: DataRecord[]): DataSummary {
  if (data.length === 0) {
    return {
      rowCount: 0,
      timeRange: { start: new Date(), end: new Date() },
      columns: [],
      statisticalSummary: {},
      dataQuality: {
        completeness: 0,
        consistency: 0,
        timeliness: 0,
        accuracy: 0
      }
    };
  }

  // Extract the time range by comparing epoch milliseconds. (Sorting Date
  // objects without a comparator orders them by their string form, which is
  // not chronological.) Unparsable timestamps are skipped.
  let earliestMs = Infinity;
  let latestMs = -Infinity;
  data.forEach(record => {
    const ms = new Date(record.received_timestamp).getTime();
    if (Number.isNaN(ms)) {
      return;
    }
    if (ms < earliestMs) earliestMs = ms;
    if (ms > latestMs) latestMs = ms;
  });
  // With no parsable timestamp at all, fall back to "now" like the empty
  // case does, so later toISOString() calls cannot hit an Invalid Date.
  const timeRange = Number.isFinite(earliestMs)
    ? { start: new Date(earliestMs), end: new Date(latestMs) }
    : { start: new Date(), end: new Date() };

  // Analyze columns: the union of keys across all records.
  const allKeys = new Set<string>();
  data.forEach(record => {
    Object.keys(record).forEach(key => allKeys.add(key));
  });
  const columns: ColumnInfo[] = [];
  allKeys.forEach(key => {
    const values = data.map(d => (d as any)[key]).filter(v => v !== null && v !== undefined);
    columns.push({
      name: key,
      type: typeof values[0],
      nullCount: data.length - values.length,
      uniqueCount: new Set(values).size,
      sampleValues: values.slice(0, 5)
    });
  });

  // Calculate statistics for numeric values
  const statisticalSummary: Record<string, Statistics> = {};
  columns.forEach(col => {
    if (col.type === 'number') {
      const values: number[] = data.map(d => (d as any)[col.name]).filter(v => typeof v === 'number');
      if (values.length > 0) {
        values.sort((a, b) => a - b);
        const sum = values.reduce((a, b) => a + b, 0);
        const mean = sum / values.length;
        const variance = values.reduce((acc, val) => acc + Math.pow(val - mean, 2), 0) / values.length;

        // Median: middle element, or the mean of the two middle elements
        // when the count is even.
        const mid = Math.floor(values.length / 2);
        const median = values.length % 2 === 0
          ? (values[mid - 1] + values[mid]) / 2
          : values[mid];

        statisticalSummary[col.name] = {
          count: values.length,
          mean,
          median,
          min: values[0],
          max: values[values.length - 1],
          stdDev: Math.sqrt(variance)
        };
      }
    } else {
      // Non-numeric columns: min/max are simply the first and last sample
      // values, not true extrema.
      statisticalSummary[col.name] = {
        count: data.length - col.nullCount,
        min: col.sampleValues[0],
        max: col.sampleValues[col.sampleValues.length - 1]
      };
    }
  });

  // Calculate data quality metrics
  const dataQuality = this.calculateDataQuality(data, columns);

  return {
    rowCount: data.length,
    timeRange,
    columns,
    statisticalSummary,
    dataQuality
  };
}
/**
 * Calculates heuristic data quality metrics.
 *
 * @param data Records the columns were derived from.
 * @param columns Column profiles (their `nullCount` drives completeness).
 * @returns Completeness as the percentage of non-null fields (0 when there
 *          are no fields at all), timeliness from calculateTimeliness, and
 *          fixed placeholder values for consistency (85) and accuracy (90).
 */
private calculateDataQuality(data: DataRecord[], columns: ColumnInfo[]): DataQualityMetrics {
  const totalFields = data.length * columns.length;
  const nullFields = columns.reduce((sum, col) => sum + col.nullCount, 0);

  // Guard the division: with no records or no columns the ratio would be
  // 0/0 = NaN, which would then be rendered as "NaN% complete" in the prompt.
  const completeness = totalFields > 0
    ? ((totalFields - nullFields) / totalFields) * 100
    : 0;

  // Simple heuristics for other quality metrics
  const consistency = 85; // Placeholder - would analyze format consistency
  const timeliness = this.calculateTimeliness(data);
  const accuracy = 90; // Placeholder - would need validation data

  return {
    completeness,
    consistency,
    timeliness,
    accuracy
  };
}
/**
 * Scores data freshness from the newest `received_timestamp`.
 *
 * The score starts at 100 and drops by 2 per hour of age, clamped to the
 * range 0..100.
 *
 * @param data Records to inspect.
 * @returns The timeliness percentage; 0 when `data` is empty or no timestamp
 *          can be parsed.
 */
private calculateTimeliness(data: DataRecord[]): number {
  // Find the newest timestamp with a plain loop. Spreading a large array
  // into Math.max(...) can exceed the engine's argument limit, and a single
  // unparsable timestamp (NaN) would poison the whole maximum.
  let latestMs = -Infinity;
  for (const record of data) {
    const ms = new Date(record.received_timestamp).getTime();
    if (ms > latestMs) { // false for NaN, so invalid timestamps are skipped
      latestMs = ms;
    }
  }
  if (!Number.isFinite(latestMs)) {
    return 0;
  }

  const ageHours = (Date.now() - latestMs) / (1000 * 60 * 60);

  // Timeliness decreases as data gets older: 2% per hour. The upper clamp
  // keeps timestamps slightly in the future (clock skew) from scoring above 100.
  return Math.min(100, Math.max(0, 100 - (ageHours * 2)));
}
/**
 * Reduces a record set to a small sample so the prompt respects Claude token limits.
 *
 * Inputs that already fit within `maxSamples` are returned untouched.
 * Larger inputs are cut down to at most `min(maxSamples, 30)` records: the
 * most recent records (up to 5, taken from the end of `data`) are always
 * kept, and the remaining slots are filled with records spaced evenly over
 * the rest of the data, so the sample covers beginning, middle and end.
 *
 * @param data Records in their original order.
 * @param maxSamples Upper bound requested by the caller.
 * @returns The sampled records, in their original relative order.
 */
private sampleDataForAnalysis(data: DataRecord[], maxSamples: number): DataRecord[] {
  if (data.length <= maxSamples) {
    return data;
  }

  // Reduce max samples to limit token usage - be very aggressive for production
  const tokenSafeMaxSamples = Math.min(maxSamples, 30);
  if (tokenSafeMaxSamples <= 0) {
    return [];
  }

  // Reserve a few slots for the newest records up front. (Filling the whole
  // quota with strided samples first would leave no room for them.)
  const recentCount = Math.min(5, tokenSafeMaxSamples);
  const spreadCount = tokenSafeMaxSamples - recentCount;
  const olderLength = data.length - recentCount;

  // Evenly spaced picks over everything before the reserved tail. Because
  // data.length > maxSamples >= tokenSafeMaxSamples, olderLength > spreadCount,
  // so the stride is > 1 and every pick is a distinct index.
  const sampled: DataRecord[] = [];
  if (spreadCount > 0) {
    const stride = olderLength / spreadCount;
    for (let i = 0; i < spreadCount; i++) {
      sampled.push(data[Math.floor(i * stride)]);
    }
  }

  const recentRecords = data.slice(olderLength);
  return [...sampled, ...recentRecords];
}
/**
 * JSON.stringify variant that tolerates BigInt values by emitting them as
 * decimal strings.
 *
 * @param obj Value to serialize.
 * @param space Optional indentation passed through to JSON.stringify.
 * @returns The JSON text.
 */
private safeStringify(obj: any, space?: number): string {
  const bigintAsString = (_key: string, value: any) => {
    if (typeof value === 'bigint') {
      return value.toString();
    }
    return value;
  };
  return JSON.stringify(obj, bigintAsString, space);
}
/**
 * Produces bullet-style notes that tell Claude how the sample records are
 * laid out: which paths and aggregation methods occur, whether values live
 * in a 'value_json' column or in direct 'value_*' columns, and which
 * standard SignalK columns are present. Column checks look at the first
 * record only.
 *
 * @param sampleData Sampled records.
 * @returns The notes joined with newlines.
 */
private analyzeDataStructure(sampleData: any[]): string {
  if (!sampleData || sampleData.length === 0) {
    return "- No sample data available for structure analysis";
  }

  const head = sampleData[0];
  const lines: string[] = [];

  // Paths and aggregation methods seen anywhere in the sample.
  const pathList = Array.from(new Set(sampleData.map(entry => entry.path).filter(Boolean)));
  const methodList = Array.from(new Set(sampleData.map(entry => entry.aggregation_method).filter(Boolean)));

  if (pathList.length > 1) {
    lines.push(`- Multi-path analysis: ${pathList.length} different SignalK paths`);
    lines.push(`- Paths included: ${pathList.join(', ')}`);
  } else {
    lines.push(`- Single path analysis: ${pathList[0] || 'unknown'}`);
  }
  if (methodList.length > 0) {
    lines.push(`- Aggregation methods applied: ${methodList.join(', ')}`);
  }

  // Where do the values live on the first record?
  const jsonPresent = head.hasOwnProperty('value_json') && head.value_json !== null;
  const directKeys = Object.keys(head).filter(key => key.startsWith('value_') && key !== 'value_json');
  const directPresent = directKeys.length > 0;

  if (jsonPresent) {
    lines.push("- Data contains JSON objects in 'value_json' column");
    lines.push("- Main data values are stored as JSON objects (e.g., position data with longitude/latitude)");
    // Show the key names of one parsed sample when possible.
    try {
      const raw = head.value_json;
      const decoded = typeof raw === 'string' ? JSON.parse(raw) : raw;
      lines.push(`- JSON structure contains: ${Object.keys(decoded).join(', ')}`);
    } catch (e) {
      lines.push("- JSON values present but structure varies");
    }
  }

  if (directPresent) {
    const populatedDirectKeys = directKeys.filter(key => head[key] !== null);
    if (populatedDirectKeys.length > 0) {
      lines.push("- Data also contains direct value columns:");
      lines.push(` ${populatedDirectKeys.join(', ')}`);
    }
  }

  // Guidance for Claude, depending on which value layout was found.
  let guidance: string | undefined;
  if (jsonPresent) {
    guidance = directPresent
      ? "- ANALYSIS NOTE: Use 'value_json' for the primary data values, direct columns may be supplementary"
      : "- ANALYSIS NOTE: Primary data is in 'value_json' objects - parse this for meaningful values";
  } else if (directPresent) {
    guidance = "- ANALYSIS NOTE: Data values are in direct columns (value_longitude, value_latitude, etc.)";
  }
  if (guidance) {
    lines.push(guidance);
  }

  // Other well-known columns present on the first record.
  const knownColumns = ['received_timestamp', 'timestamp', 'context', 'path', 'source']
    .filter(column => head.hasOwnProperty(column));
  if (knownColumns.length > 0) {
    lines.push(`- Standard SignalK columns available: ${knownColumns.join(', ')}`);
  }

  return lines.length > 0 ? lines.join('\n') : "- Standard data structure detected";
}
/**
 * Builds the sampling-mode analysis prompt.
 *
 * The prompt consists of the vessel context, a data summary, structure notes,
 * the statistical summary and the sample records, followed by instructions
 * specific to `request.analysisType` and the JSON response format.
 *
 * `request.customPrompt` is used as the analysis request for the 'custom'
 * type (a generic request is substituted when it is missing or blank). For
 * every other type a non-blank `customPrompt` is appended as additional
 * instructions, so caller-supplied guidance such as the one from
 * detectAnomalies is not lost. Types without a dedicated section (e.g.
 * 'correlation') use the generic request.
 *
 * @param data Result of prepareDataForAnalysis (`summary` and `sampleData` are read).
 * @param request The analysis request.
 * @returns The complete prompt text.
 */
private buildAnalysisPrompt(data: any, request: AnalysisRequest): string {
  const { summary, sampleData } = data;
  const dataStructureNote = this.analyzeDataStructure(sampleData);
  const vesselContext = this.vesselContextManager.generateClaudeContext();

  // Treat a missing or whitespace-only custom prompt as "not provided".
  const customInstructions = request.customPrompt && request.customPrompt.trim() !== ''
    ? request.customPrompt
    : undefined;

  let prompt = `You are an expert maritime data analyst. Analyze the following SignalK vessel data and provide insights.
${vesselContext}
DATA SUMMARY:
- Path: ${request.dataPath}
- Records: ${summary.rowCount} (showing sample of ${sampleData.length})
- Time Range: ${summary.timeRange.start.toISOString()} to ${summary.timeRange.end.toISOString()}
- Data Quality: ${Math.round(summary.dataQuality.completeness)}% complete, ${Math.round(summary.dataQuality.accuracy)}% accuracy
DATA STRUCTURE NOTES:
${dataStructureNote}
STATISTICAL SUMMARY:
${this.safeStringify(summary.statisticalSummary, 2)}
SAMPLE DATA:
${this.safeStringify(sampleData, 2)}
`;

  // Add specific analysis instructions based on type
  switch (request.analysisType) {
    case 'summary':
      prompt += `
ANALYSIS REQUEST: Provide a comprehensive summary of this maritime data.
Focus on:
1. Overall trends and patterns
2. Operational insights
3. Performance indicators
4. Notable observations
5. Data quality assessment
Please structure your response as:
- Executive Summary (2-3 sentences)
- Key Insights (bullet points)
- Recommendations (actionable items)
- Data Quality Notes
`;
      break;
    case 'anomaly':
      prompt += `
ANALYSIS REQUEST: Detect anomalies and unusual patterns in this maritime data.
Focus on:
1. Statistical outliers
2. Unusual temporal patterns
3. Operational anomalies
4. Safety concerns
5. Equipment irregularities
For each anomaly found, specify:
- Timestamp
- Value and expected range
- Severity (low/medium/high)
- Description and potential cause
- Confidence level
`;
      break;
    case 'trend':
      prompt += `
ANALYSIS REQUEST: Analyze trends and patterns in this maritime data over time.
Focus on:
1. Temporal trends (increasing/decreasing/cyclical)
2. Seasonal patterns
3. Operational patterns
4. Performance trends
5. Predictive insights
Provide trend analysis with confidence levels and future projections where appropriate.
`;
      break;
    case 'custom':
      // Without a usable custom prompt, ask the generic question rather than
      // interpolating the literal text "undefined".
      prompt += `
ANALYSIS REQUEST: ${customInstructions ?? 'Analyze this maritime data and provide relevant insights for vessel operations.'}
IMPORTANT: When analyzing the data, note that:
- This data comes from the SignalK REST API with proper timestamp alignment
- Each record has a 'path' field indicating the SignalK data source
- Multiple paths may be included for correlation analysis (check 'path' field for each record)
- Aggregation methods (like 'max', 'ema', 'sma') are applied and noted in 'aggregation_method' field
- All timestamps are properly synchronized across different data sources
- Focus on the 'value' field for numerical data and 'path' field to distinguish data sources
- If 'value_json' contains objects, extract the meaningful values from these JSON structures
- Consider the SignalK data path context to understand what type of maritime data you're analyzing
Please provide detailed analysis addressing the specific request while considering maritime operations context.
`;
      break;
    default:
      prompt += `
ANALYSIS REQUEST: Analyze this maritime data and provide relevant insights for vessel operations.
`;
  }

  // For the predefined types the custom prompt is supplementary guidance;
  // append it so it actually reaches the model.
  if (request.analysisType !== 'custom' && customInstructions) {
    prompt += `
ADDITIONAL INSTRUCTIONS: ${customInstructions}
`;
  }

  prompt += `
RESPONSE FORMAT:
Please structure your response as JSON with the following format:
{
"analysis": "Main analysis text",
"insights": ["insight1", "insight2", ...],
"recommendations": ["recommendation1", "recommendation2", ...],
"anomalies": [{"timestamp": "ISO8601", "value": "actual", "expectedRange": {"min": 0, "max": 100}, "severity": "high", "description": "...", "confidence": 0.9}],
"confidence": 0.85,
"dataQuality": "assessment of data quality"
}
`;
  return prompt;
}
/**
 * Converts Claude's reply into an AnalysisResponse.
 *
 * The text of the first content block is searched for a JSON object (first
 * '{' to last '}'). When one parses, its fields are used; otherwise the raw
 * text becomes the analysis and its bullet points become the insights.
 * Missing or malformed fields fall back to defaults: empty lists,
 * confidence 0.8 and dataQuality "Analysis completed". A reported confidence
 * of 0 is preserved.
 *
 * @param response Reply from the Anthropic messages API (or a plain string).
 * @param request The originating request, echoed into `metadata`.
 * @param data Prepared data; `originalCount` is reported as `recordCount`.
 * @returns The structured response with a freshly generated id.
 * @throws Error with a `Response parsing failed: …` message on unexpected failures.
 */
private parseAnalysisResponse(response: any, request: AnalysisRequest, data: any): AnalysisResponse {
  try {
    let content = '';
    if (response.content && response.content[0] && response.content[0].text) {
      content = response.content[0].text;
    } else if (typeof response === 'string') {
      content = response;
    }

    // Try to extract JSON from the response
    let parsedResponse: any;
    const jsonMatch = content.match(/\{[\s\S]*\}/);
    if (jsonMatch) {
      try {
        parsedResponse = JSON.parse(jsonMatch[0]);
      } catch (parseError) {
        parsedResponse = undefined; // handled by the text fallback below
      }
    }
    if (parsedResponse === undefined || parsedResponse === null || typeof parsedResponse !== 'object') {
      // No usable JSON: derive a structured response from the plain text.
      parsedResponse = {
        analysis: content,
        insights: this.extractBulletPoints(content),
        recommendations: [],
        confidence: 0.8,
        dataQuality: "Analysis completed"
      };
    }

    // List fields must really be arrays for downstream consumers; a lone
    // string is wrapped, anything else becomes an empty list.
    const asList = (value: unknown): any[] => {
      if (Array.isArray(value)) return value;
      if (typeof value === 'string' && value.trim() !== '') return [value];
      return [];
    };

    // Only fall back to 0.8 when no finite number was reported, so an
    // explicit confidence of 0 is not replaced.
    const reportedConfidence = parsedResponse.confidence;
    const confidence = typeof reportedConfidence === 'number' && Number.isFinite(reportedConfidence)
      ? reportedConfidence
      : 0.8;

    // Generate unique ID
    const analysisId = `analysis_${Date.now()}_${Math.random().toString(36).substring(2, 11)}`;

    return {
      id: analysisId,
      analysis: parsedResponse.analysis || content,
      insights: asList(parsedResponse.insights),
      recommendations: asList(parsedResponse.recommendations),
      anomalies: Array.isArray(parsedResponse.anomalies) ? parsedResponse.anomalies : [],
      confidence,
      dataQuality: parsedResponse.dataQuality || "Analysis completed",
      timestamp: new Date().toISOString(),
      metadata: {
        dataPath: request.dataPath,
        analysisType: request.analysisType,
        recordCount: data.originalCount || 0,
        timeRange: request.timeRange,
        useDatabaseAccess: request.useDatabaseAccess
      }
    };
  } catch (error) {
    // The thrown value is not guaranteed to be an Error.
    const message = error instanceof Error ? error.message : String(error);
    this.app?.error(`Failed to parse Claude response: ${message}`);
    throw new Error(`Response parsing failed: ${message}`);
  }
}
/**
 * Collects the list items found in a block of text.
 *
 * A line counts as a list item when, after trimming, it starts with '-', '*'
 * or '•' followed by whitespace, or with a number, a dot and whitespace. The
 * leading marker is removed from each returned item.
 *
 * @param text Text to scan line by line.
 * @returns The list items in order of appearance.
 */
private extractBulletPoints(text: string): string[] {
  const bulletLine = /^[-*•]\s/;
  const numberedLine = /^\d+\.\s/;

  return text
    .split('\n')
    .map(rawLine => rawLine.trim())
    .filter(candidate => bulletLine.test(candidate) || numberedLine.test(candidate))
    .map(item => item.replace(/^[-*•]\s*/, '').replace(/^\d+\.\s*/, ''));
}
/**
 * Writes an analysis to `<dataDirectory>/analysis-history/<id>.json`.
 * Persistence is best-effort: any failure, including a missing data
 * directory, is logged and swallowed.
 *
 * @param analysis The analysis to store.
 */
private async saveAnalysisToHistory(analysis: AnalysisResponse): Promise<void> {
  try {
    const baseDirectory = this.dataDirectory;
    if (!baseDirectory) {
      throw new Error('No data directory configured for plugin');
    }

    // Make sure the history folder exists inside the plugin's data directory.
    const historyDir = path.join(baseDirectory, 'analysis-history');
    await fs.ensureDir(historyDir);

    const filepath = path.join(historyDir, `${analysis.id}.json`);
    await fs.writeJson(filepath, analysis, { spaces: 2 });
    this.app?.debug(`Analysis saved to history: ${filepath}`);
  } catch (error) {
    // Not critical - report and carry on.
    this.app?.error(`Failed to save analysis to history: ${(error as Error).message}`);
  }
}
/**
 * Reads stored analyses from the history directory.
 *
 * File names are sorted in descending order before reading; since ids embed
 * the creation time, this yields the most recent analyses first. Files that
 * cannot be read are skipped with a debug message.
 *
 * @param limit Maximum number of analyses to return (default 20).
 * @returns The analyses read; an empty array when there is no data
 *          directory, no history directory, or on failure.
 */
async getAnalysisHistory(limit: number = 20): Promise<AnalysisResponse[]> {
  try {
    const baseDirectory = this.dataDirectory;
    if (!baseDirectory) {
      return []; // No data directory configured, return empty history
    }

    const historyDir = path.join(baseDirectory, 'analysis-history');
    const historyExists = await fs.pathExists(historyDir);
    if (!historyExists) {
      return [];
    }

    const entries = await fs.readdir(historyDir);
    const newestFirst = entries.filter(name => name.endsWith('.json')).sort().reverse();
    const wanted = Math.min(limit, newestFirst.length);

    const history: AnalysisResponse[] = [];
    let index = 0;
    while (index < wanted) {
      const fileName = newestFirst[index];
      index += 1;
      try {
        history.push(await fs.readJson(path.join(historyDir, fileName)));
      } catch (error) {
        this.app?.debug(`Failed to read analysis file ${fileName}: ${(error as Error).message}`);
      }
    }
    return history;
  } catch (error) {
    this.app?.error(`Failed to get analysis history: ${(error as Error).message}`);
    return [];
  }
}
/**
 * Deletes one stored analysis from the history directory.
 *
 * The id is only accepted when `<id>.json` resolves to a file directly inside
 * the history directory, so ids containing path separators or `..` segments
 * cannot reach files elsewhere on disk.
 *
 * @param analysisId Id of the analysis to delete.
 * @returns `{ success: true }` on deletion; otherwise `success: false` with
 *          an `error` message (no data directory, invalid id, not found, or
 *          the underlying failure). Never throws.
 */
async deleteAnalysis(analysisId: string): Promise<{ success: boolean; error?: string }> {
  try {
    if (!this.dataDirectory) {
      return { success: false, error: 'No data directory configured for plugin' };
    }

    const historyDir = path.resolve(this.dataDirectory, 'analysis-history');
    const filePath = path.resolve(historyDir, `${analysisId}.json`);

    // Path-traversal guard: the resolved target must sit directly in historyDir.
    if (path.dirname(filePath) !== historyDir) {
      return { success: false, error: 'Invalid analysis id' };
    }

    if (!await fs.pathExists(filePath)) {
      return { success: false, error: 'Analysis not found' };
    }

    await fs.remove(filePath);
    this.app?.debug(`Deleted analysis: ${analysisId}`);
    return { success: true };
  } catch (error) {
    // The thrown value is not guaranteed to be an Error.
    const message = error instanceof Error ? error.message : String(error);
    this.app?.error(`Failed to delete analysis: ${message}`);
    return { success: false, error: message };
  }
}
/**
* Tony's approach: Direct database access analysis
* Claude can query the database interactively during analysis
*/
async analyzeWithDatabaseAccess(request: AnalysisRequest): Promise<AnalysisResponse> {
try {
this.app?.debug('🚀 Using direct database access');
// Ensure vessel context is loaded before generating context for Claude
await this.vesselContextManager.refreshVesselInfo();
const vesselContext = this.vesselContextManager.generateClaudeContext();
this.app?.debug(`🛥️ Vessel context for Claude (${vesselContext.length} chars):\n${vesselContext.substring(0, 500)}${vesselContext.length > 500 ? '...' : ''}`);
const schemaInfo = await this.getEnhancedSchemaForClaude();
this.app?.debug(`📊 Schema info for Claude (${schemaInfo.length} chars):\n${schemaInfo.substring(0, 1000)}${schemaInfo.length > 1000 ? '...' : ''}`);
// Debug: Log if schema is empty or suspicious
if (!schemaInfo || schemaInfo.length < 100) {
this.app?.error(`❌ Schema info appears empty or too short! Length: ${schemaInfo?.length || 0}`);
this.app?.error(`Schema content: "${schemaInfo}"`);
}
// Build time range guidance for Claude
let timeRangeGuidance = '';
if (request.timeRange) {
console.log(`🔍 REQUEST TIME RANGE DEBUG:`, {
userRequested: request.customPrompt || request.analysisType,
actualStart: request.timeRange.start.toISOString(),
actualEnd: request.timeRange.end.toISOString(),
calculatedHours: (request.timeRange.end.getTime() - request.timeRange.start.getTime()) / (1000 * 60 * 60)
});
timeRangeGuidance = `
ANALYSIS SCOPE: Focus your analysis on data between ${request.timeRange.start.toISOString().replace('.000Z', 'Z')} and ${request.timeRange.end.toISOString().replace('.000Z', 'Z')}.
IMPORTANT: Always include WHERE clauses in your SQL queries to filter results to this time range:
WHERE signalk_timestamp >= '${request.timeRange.start.toISOString().replace('.000Z', 'Z')}' AND signalk_timestamp <= '${request.timeRange.end.toISOString().replace('.000Z', 'Z')}'`;
} else {
// Default to recent data if no time range specified
const now = new Date();
const sixHoursAgo = new Date(now.getTime() - 6 * 60 * 60 * 1000);
timeRangeGuidance = `
TIME RANGE FOCUS: Since no specific time range was provided, focus on recent data (last 6 hours).
IMPORTANT: Always include WHERE clauses to limit results to recent data:
WHERE signalk_timestamp >= '${sixHoursAgo.toISOString().replace('.000Z', 'Z')}'`;
}
// Get system timezone for timestamp interpretation
const systemTimezone = Intl.DateTimeFormat().resolvedOptions().timeZone;
const now = new Date();
const timezoneOffset = -now.getTimezoneOffset() / 60; // Convert to hours from UTC
const initialPrompt = `You are an expert maritime data analyst with direct access to a comprehensive database.
IMPORTANT: Please use the vessel context information provided below for all analysis and responses. This vessel information is critical for accurate maritime analysis.
${vesselContext}
CRITICAL TIMESTAMP INFORMATION:
- ALL SignalK timestamps in the database are in UTC (ending with 'Z')
- System timezone: ${systemTimezone} (UTC${timezoneOffset >= 0 ? '+' : ''}${timezoneOffset})
- When interpreting times for the user, convert UTC timestamps to local time (${systemTimezone})
- Example: 2025-09-02T00:24:44Z (UTC) = ${new Date('2025-09-02T00:24:44Z').toLocaleString('en-US', {timeZone: systemTimezone, timeZoneName: 'short'})}
${schemaInfo}${timeRangeGuidance}
ANALYSIS REQUEST: ${request.customPrompt || 'Analyze maritime data and provide insights'}
You can query the database using the query_maritime_database function. Start by exploring the data to understand what's available, then provide comprehensive analysis.
REMEMBER:
- Always refer to and use the vessel context provided above (vessel name, dimensions, operational details, etc.) when analyzing data and providing recommendations.
- ALWAYS include time range WHERE clauses in your queries to avoid loading excessive historical data.
- Keep query results focused and relevant to the specified time period.
- CRITICAL: ONLY use the exact paths listed in the "AVAILABLE DATA PATHS" section. DO NOT make up or guess path names.
- If a path you want to use is not in the available paths list, it does not exist - inform the user instead of guessing.
CRITICAL FOR TOKEN EFFICIENCY:
- NEVER query raw individual records - ALWAYS use time bucketing and aggregation
- MANDATORY SQL pattern for all data queries:
SELECT
strftime(date_trunc('hour', signalk_timestamp::TIMESTAMP), '%Y-%m-%dT%H:%M:%SZ') as time_bucket,
AVG(CAST(value AS DOUBLE)) as avg_value,
MAX(CAST(value AS DOUBLE)) as max_value,
MIN(CAST(value AS DOUBLE)) as min_value,
COUNT(*) as record_count
FROM 'path/*.parquet'
WHERE signalk_timestamp >= 'start_time' AND signalk_timestamp <= 'end_time'
AND value IS NOT NULL
GROUP BY time_bucket
ORDER BY time_bucket
- Use date_trunc('minute', ...) for detailed analysis, date_trunc('hour', ...) for overviews
- NEVER return more than 100 time buckets per query
Focus on:
1. Current vessel status and recent activity
2. Patterns in navigation, weather, and performance data
3. Safety considerations and operational insights
4. Data quality and completeness assessment
Begin your analysis by querying relevant data within the specified time range.`;
this.app?.debug(`📝 Full prompt for Claude (${initialPrompt.length} chars):\n${initialPrompt.substring(0, 2000)}${initialPrompt.length > 2000 ? '...[TRUNCATED]' : ''}`);
// Save full prompt to file for debugging
const fs = require('fs');
const debugFile = `/tmp/claude-prompt-debug-${Date.now()}.txt`;
fs.writeFileSync(debugFile, `FULL CLAUDE PROMPT (${initialPrompt.length} chars):\n\n${initialPrompt}`);
this.app?.debug(`📄 Full prompt saved to: ${debugFile}`);
// Build the system context (rules, vessel context, schema info) separately from the user prompt defined below
const systemContext = `You are an expert maritime data analyst with direct access to a comprehensive database.
CRITICAL DATA INTEGRITY RULES - VIOLATION OF THESE RULES IS UNACCEPTABLE:
- NEVER fabricate, guess, or make up any data, coordinates, timestamps, or values
- If a query returns no data, you MUST say "No data available" or "Query returned no results"
- NEVER invent plausible-sounding but false information
- If you don't know something, explicitly state "I don't have this information"
- Financial and navigational decisions depend on accurate data - false information causes real harm
- NEVER create example data, sample values, or hypothetical scenarios when discussing real vessel data
- ALWAYS reference the specific query that provided any data you present
- If insufficient data exists for analysis, explicitly state this rather than creating synthetic data
MANDATORY DATA VALIDATION PROTOCOL - THESE STEPS ARE REQUIRED FOR EVERY RESPONSE:
1. BEFORE presenting ANY data, you MUST explicitly state: "Query executed: [exact SQL]"
2. BEFORE presenting ANY data, you MUST explicitly state: "Rows returned: [exact count]"
3. BEFORE creating ANY chart, you MUST explicitly state: "Data points for chart: [exact count from query]"
4. BEFORE analyzing time ranges, you MUST explicitly state: "Actual time range in data: [start] to [end]"
5. If user requests specific time window and data doesn't match, you MUST state: "Requested: [X], Available: [Y]"
MANDATORY TIME CALCULATION VALIDATION - NEVER USE DEFAULT TIME PERIODS:
1. When user requests specific time periods (72 hours, 3 days, 1 week, etc.), ALWAYS calculate exact timestamps
2. NEVER default to familiar periods like "yesterday" or "24 hours" when user specified different duration
3. ALWAYS verify: "User requested [X hours/days], calculating [X hours/days] back from [current time]"
4. ALWAYS state explicitly: "Time range calculated: [exact start timestamp] to [exact end timestamp]"
5. If user says "72 hours", you MUST calculate 72 hours, not 24 hours or any other duration
6. NEVER assume data might not be available - execute the exact request first, then report actual results
7. NEVER claim "database restrictions" or "time range restrictions" exist without first querying the requested range
8. Execute the full requested query first - only report actual limitations found in the results
9. CRITICAL: If user requests "7 days", you MUST query 7 days back from current time, not 1 day or 24 hours
10. CRITICAL: If user requests "1 week", you MUST query 7 days back from current time, not 1 day or 24 hours
11. NEVER say "this is the complete dataset" unless you actually queried the full requested time period
12. ABSOLUTELY FORBIDDEN: Do not mention "database time range restrictions" - no such restrictions exist
13. ABSOLUTELY FORBIDDEN: Do not say "I need to work within" any restrictions - query the full requested range
14. THERE ARE NO DATABASE LIMITATIONS - the database contains weeks of historical data
15. If user asks for 7 days, query FROM 7 days ago TO now - do not modify this range for any reason
MANDATORY QUERY EXECUTION RULE - NO EXCEPTIONS:
When user specifies ANY time frame (72 hours, 3 days, 1 week, 7 days, etc.):
1. Calculate the EXACT start and end timestamps for that period
2. Execute your database queries using those EXACT timestamps
3. NEVER modify, restrict, or reduce the time range for any reason
4. Query the database with the full requested range - period, no exceptions
CRITICAL: Before executing ANY query, you MUST:
1. Extract the EXACT time range from user request
2. Query the database to find the ACTUAL available data range
3. Use ONLY the available data range - DO NOT default to 24 hours
4. If requested range exceeds available data, use ALL available data and state the actual range used
QUERY EXECUTION RULES:
- For time-based requests, FIRST run: SELECT MIN(signalk_timestamp), MAX(signalk_timestamp) FROM relevant_table
- Use the full available range, not arbitrary subsets
- State actual data range used in response
- If user asks for "7 days" but only 3 days exist, use all 3 days and explain
- NEVER assume 24-hour periods. ALWAYS query for full available dataset first.
FORBIDDEN ACTIONS - THESE WILL RESULT IN IMMEDIATE FAILURE:
- Creating ASCII charts, text visualizations, or any fake visual representations
- Using terms like "trending", "pattern", or "shows" without showing exact data points
- Making statements about data without first showing the query that produced it
- Creating any visualization that isn't a proper Plotly.js JSON specification
- Presenting analysis conclusions without first showing raw query results
CHART EMBEDDING CAPABILITIES:
When you want to include charts in your response, add a Plotly.js JSON chart specification in a code block like this:
\`\`\`json
{
"type": "chart",
"title": "Speed Over Ground Trend",
"data": [
{
"x": ["12:00", "13:00", "14:00", "15:00"],
"y": [5.2, 6.1, 5.8, 7.3],
"name": "Speed Over Ground",
"type": "scatter",
"mode": "lines+markers",
"line": {"color": "#1976d2", "width": 2},
"marker": {"color": "#1976d2", "size": 6}
}
],
"layout": {
"title": "Speed Over Ground Trend",
"xaxis": {"title": "Time"},
"yaxis": {"title": "Speed (knots)"},
"showlegend": true
}
}
\`\`\`
SUPPORTED CHART TYPES:
- **Line Charts**: type: "scatter", mode: "lines+markers" or "lines"
- **Bar Charts**: type: "bar"
- **Scatter Plots**: type: "scatter", mode: "markers"
- **Wind Rose/Radar**: type: "scatterpolar" with r and theta values
- **Multiple Series**: Include multiple objects in the data array
- **Styling**: Use line.color, marker.color, line.width, etc.
Include this JSON when analysis would benefit from visualization.
CRITICAL CHART DATA RULES - CHARTS WITH FAKE DATA ARE FORBIDDEN:
- ONLY use data that comes from actual database query results
- NEVER fabricate, estimate, or interpolate data points
- NEVER extend data beyond what the query returned
- If you don't have enough data points for a meaningful chart, say so explicitly
- All chart data must be traceable to specific query results you executed
- Include a comment in your response explaining which query provided the chart data
MANDATORY CHART VALIDATION - REQUIRED BEFORE ANY CHART:
1. Count exact data points from query result
2. State: "Creating chart with [N] actual data points from query"
3. If query returns 5 rows, chart must have exactly 5 data points - NEVER MORE
4. If user asks for 72-hour window but data spans 24 hours, explicitly state the mismatch
5. NEVER fill gaps or extend trends - use only actual timestamps and values from database
6. Show first 3 and last 3 actual data rows before creating chart
7. Explicitly verify: "Chart data matches query results: [timestamp1: value1], [timestamp2: value2]..."
RESPONSE STRUCTURE REQUIREMENTS:
1. Always start with: "QUERY VALIDATION:"
2. Show the exact SQL executed
3. Show exact row count and time range
4. If creating chart, show sample data points
5. Only then provide analysis using that specific data
IMPORTANT: Please use the vessel context information provided below for all analysis and responses. This vessel information is critical for accurate maritime analysis.
${vesselContext}
${schemaInfo}${timeRangeGuidance}
CRITICAL FOR TOKEN EFFICIENCY:
- NEVER query raw individual records - ALWAYS use time bucketing and aggregation
- MANDATORY SQL pattern for all data queries:
SELECT
strftime(date_trunc('hour', signalk_timestamp::TIMESTAMP), '%Y-%m-%dT%H:%M:%SZ') as time_bucket,
AVG(CAST(value AS DOUBLE)) as avg_value,
MAX(CAST(value AS DOUBLE)) as max_value,
MIN(CAST(value AS DOUBLE)) as min_value,
COUNT(*) as record_count
FROM 'path/*.parquet'
WHERE signalk_timestamp >= 'start_time' AND signalk_timestamp <= 'end_time'
AND value IS NOT NULL
GROUP BY time_bucket
ORDER BY time_bucket
- Use date_trunc('minute', ...) for detailed analysis, date_trunc('hour', ...) for overviews
- NEVER return more than 100 time buckets per query
REMEMBER:
- Always refer to and use the vessel context provided above (vessel name, dimensions, operational details, etc.) when analyzing data and providing recommendations.
- ALWAYS include time range WHERE clauses in your queries to avoid loading excessive historical data.
- Keep query results focused and relevant to the specified time period.
- CRITICAL: ONLY use the exact paths listed in the "AVAILABLE DATA PATHS" section. DO NOT make up or guess path names.
- If a path you want to use is not in the available paths list, it does not exist - inform the user instead of guessing.
Focus on:
1. Current vessel status and recent activity
2. Patterns in navigation, weather, and performance data
3. Safety considerations and operational insights
4. Data quality and completeness assessment`;
this.app?.debug(`🔧 Final system context (${systemContext.length} chars):`);
this.app?.debug(`📋 System context preview:\n${systemContext.substring(0, 2000)}${systemContext.length > 2000 ? '...' : ''}`);
const userPrompt = `${request.customPrompt || 'Analyze maritime data and provide insights'}
Begin your analysis by querying relevant data within the specified time range.`;
// Start conversation with Claude with function calling capability
let conversationMessages: Array<any> = [{
role: 'user',
content: userPrompt
}];
let analysisResult = '';
let queryCount = 0;
const maxQueries = 10; // Allow more queries for thorough analysis
let totalTokenUsage = { input_tokens: 0, output_tokens: 0 };
// Check if user is requesting real-time data
const needsRealTimeData = this.checkForRealTimeKeywords(request.customPrompt || '', conversationMessages);
// Identify relevant regimens based on keywords
const relevantRegimens = this.identifyRelevantRegimens(request.customPrompt || '');
// Build tools array - always include database access
const availableTools: any[] = [{
name: 'query_maritime_database',
description: 'Execute SQL queries against the maritime Parquet database to explore and analyze data',
input_schema: {
type: 'object',
properties: {
sql: {
type: 'string',
description: 'SQL query to execute against the Parquet database'
},
purpose: {
type: 'string',
description: 'Brief description of what this query is trying to discover'
}
},
required: ['sql', 'purpose']
}
}];
// Add real-time SignalK data tool if keywords detected
if (needsRealTimeData) {
availableTools.push({
name: 'get_current_signalk_data',
description: 'Get current real-time SignalK data VALUES for specific known paths. Use this ONLY when you already know the specific paths and need their actual values. DO NOT use this for path discovery - use get_available_signalk_paths first to discover what paths exist.',
input_schema: {
type: 'object',
properties: {
paths: {
type: 'array',
items: { type: 'string' },
description: 'Array of SignalK paths to query (e.g., ["navigation.position", "navigation.speedOverGround"]). Leave empty to get all available current values.'
},
purpose: {
type: 'string',
description: 'Brief description of why you need this real-time data'
},
vesselContext: {
type: 'string',
description: 'Vessel context to query. Use "vessels.*" for ALL vessels (recommended for multi-vessel queries), "vessels.self" for own vessel, or "vessels.urn:mrn:imo:mmsi:123456789" for specific vessel. Defaults to vessels.self if not specified.'
}
},
required: ['purpose']
}
});
// Also add path discovery tool for targeted queries
availableTools.push({
name: 'get_available_signalk_paths',
description: 'DISCOVER what SignalK paths are currently available with filtering options. Use this for ALL path discovery queries like "list paths", "what data is available", "show available paths", etc. ALWAYS use this BEFORE get_current_signalk_data.',
input_schema: {
type: 'object',
properties: {
vesselContext: {
type: 'string',
description: 'Vessel context to query. Use "vessels.*" for ALL vessels, "vessels.self" for own vessel, or specific vessel ID. Defaults to vessels.self.'
},
pathPattern: {