signalk-parquet
Version:
SignalK plugin and webapp that archives SK data to Parquet files, with a regimen control system, advanced querying, Claude-integrated AI analysis, spatial capabilities, and a REST API.
226 lines • 8.56 kB
JavaScript
;
/**
 * Module-interop helper (appears to be compiler-emitted boilerplate): exposes
 * property `k` of `m` on `o` under the name `k2`, which defaults to `k`.
 * If `this.__createBinding` already exists, that implementation is reused.
 */
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// m's own descriptor is kept only when it is an accessor on an object flagged
// __esModule, or a data property that is neither writable nor configurable.
// In every other case (including a missing descriptor) it is replaced by an
// enumerable getter that reads m[k] on each access.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback used when Object.create is not available: copy the current value by plain assignment.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
/**
 * Module-interop helper: stores `v` as the "default" property of `o`.
 * If `this.__setModuleDefault` already exists, that implementation is reused.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
// Define "default" as an enumerable data property holding v.
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
// Fallback used when Object.create is not available: plain assignment.
o["default"] = v;
});
/**
 * Module-interop helper: wraps a required module in a namespace-style object.
 * A module already flagged `__esModule` is returned unchanged. Otherwise a new
 * object is built that re-exposes every own property of `mod` except "default"
 * (via __createBinding) and carries `mod` itself as its "default" property
 * (via __setModuleDefault).
 * If `this.__importStar` already exists, that implementation is reused.
 */
var __importStar = (this && this.__importStar) || (function () {
// ownKeys overwrites itself on first call with Object.getOwnPropertyNames, or
// with a for-in / hasOwnProperty fallback when that function is unavailable.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
// A null/undefined mod contributes no bindings; result still receives "default" below.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.getPathComponentSchema = getPathComponentSchema;
exports.clearSchemaCache = clearSchemaCache;
const node_api_1 = require("@duckdb/node-api");
const path = __importStar(require("path"));
const fs = __importStar(require("fs-extra"));
/**
 * Cache for path component schemas
 * Key: `${context}:${path}`
 * Value: the schema object stored by getPathComponentSchema, i.e.
 * `{ components, timestamp }`, where `timestamp` is the Date.now() value
 * taken when the lookup started.
 */
const schemaCache = new Map();
// Maximum age of a cache entry; getPathComponentSchema ignores older entries and rebuilds the schema.
const SCHEMA_CACHE_TTL_MS = 2 * 60 * 1000; // 2 minutes
/**
 * Get the component schema for an object-valued path across all parquet files.
 *
 * The directory `<dataDir>/<context as path>/<path segments...>` is scanned
 * recursively for .parquet files. Files that contain a plain `value` column
 * are treated as scalar data and skipped. From the remaining files the union
 * of all `value_*` columns is collected (excluding the metadata columns
 * value_json, value_units and value_description); the first file that
 * mentions a component decides its data type category.
 *
 * Successful results are cached per `${context}:${pathStr}` for
 * SCHEMA_CACHE_TTL_MS. `null` results are not cached.
 *
 * @param {string} dataDir - Root directory that holds the parquet tree.
 * @param {string} context - Context identifier, e.g. "vessels.urn:mrn:imo:mmsi:368396230".
 * @param {string} pathStr - Dot-separated path; every segment becomes one directory level.
 * @returns {Promise<{components: Map<string, {name: string, columnName: string, dataType: string}>, timestamp: number} | null>}
 *   The schema, or `null` when the directory does not exist, holds no parquet
 *   files, exposes no component columns, or an unexpected error occurred
 *   (errors are logged, never thrown).
 */
async function getPathComponentSchema(dataDir, context, pathStr) {
    const COMPONENT_PREFIX = 'value_';
    const cacheKey = `${context}:${pathStr}`;
    const now = Date.now();
    // Serve from cache while the entry is still fresh
    const cached = schemaCache.get(cacheKey);
    if (cached && now - cached.timestamp < SCHEMA_CACHE_TTL_MS) {
        return cached;
    }
    try {
        // Locate the directory that holds this path's parquet files
        const contextPath = toContextFilePath(context);
        const pathDir = path.join(dataDir, contextPath, ...pathStr.split('.'));
        if (!(await fs.pathExists(pathDir))) {
            return null;
        }
        const parquetFiles = await findParquetFiles(pathDir);
        if (parquetFiles.length === 0) {
            return null;
        }
        // Union of components over all files, keyed by component name
        const allComponents = new Map();
        const duckDB = await node_api_1.DuckDBInstance.create();
        const connection = await duckDB.connect();
        try {
            for (const filePath of parquetFiles) {
                try {
                    // Escape single quotes once; the path is embedded in a SQL string literal
                    const sqlFilePath = filePath.replace(/'/g, "''");
                    // A plain 'value' column marks the file as scalar data - skip it
                    const valueColQuery = `
                        SELECT name
                        FROM parquet_schema('${sqlFilePath}')
                        WHERE name = 'value'
                    `;
                    const valueColResult = await connection.runAndReadAll(valueColQuery);
                    if (valueColResult.getRowObjects().length > 0) {
                        continue;
                    }
                    // Coarse SQL pre-filter for component columns; metadata columns are excluded.
                    // In LIKE, '_' matches ANY single character, so this pattern also lets
                    // names such as 'values' through - the exact prefix check happens below.
                    const schemaQuery = `
                        SELECT name, type
                        FROM parquet_schema('${sqlFilePath}')
                        WHERE name LIKE 'value_%'
                        AND name NOT IN ('value_json', 'value_units', 'value_description')
                    `;
                    const result = await connection.runAndReadAll(schemaQuery);
                    for (const row of result.getRowObjects()) {
                        const columnName = row.name;
                        // Only a literal "value_" prefix denotes a component column
                        if (typeof columnName !== 'string' || !columnName.startsWith(COMPONENT_PREFIX)) {
                            continue;
                        }
                        const componentName = columnName.slice(COMPONENT_PREFIX.length);
                        // First file to mention a component wins
                        if (allComponents.has(componentName)) {
                            continue;
                        }
                        // A row without a usable type string must not abort the rest of this file
                        const columnType = row.type;
                        const dataType = typeof columnType === 'string'
                            ? inferDataTypeCategory(columnType)
                            : 'unknown';
                        allComponents.set(componentName, {
                            name: componentName,
                            columnName: columnName,
                            dataType: dataType,
                        });
                    }
                }
                catch (error) {
                    // Skip files with errors (corrupted, etc.)
                    console.warn(`[Schema Cache] Error reading schema from ${filePath}:`, error);
                }
            }
        }
        finally {
            // NOTE(review): only the connection is released here; confirm whether the
            // DuckDBInstance also needs explicit closing in the installed @duckdb/node-api version.
            connection.disconnectSync();
        }
        if (allComponents.size === 0) {
            // No value_* columns found - this is a simple scalar path
            return null;
        }
        const schema = {
            components: allComponents,
            timestamp: now,
        };
        schemaCache.set(cacheKey, schema);
        return schema;
    }
    catch (error) {
        console.error(`[Schema Cache] Error getting schema for ${pathStr}:`, error);
        return null;
    }
}
/**
 * Drop every cached path schema so that the next lookup rebuilds it from the
 * parquet files (handy in tests or after the stored data layout changed).
 * Logs a confirmation line once the cache is empty.
 */
function clearSchemaCache() {
    const notice = '[Schema Cache] Schema cache cleared';
    schemaCache.clear();
    console.log(notice);
}
/**
 * Infer a coarse data type category from a column type string.
 *
 * Matching is a case-insensitive substring search, checked in the order
 * numeric, string, boolean.
 *
 * @param {string} duckdbType - Type name as reported for the column.
 * @returns {'numeric' | 'string' | 'boolean' | 'unknown'} The category;
 *   'unknown' when nothing matches or when the input is not a string
 *   (e.g. null/undefined).
 */
function inferDataTypeCategory(duckdbType) {
    // A missing or non-string type cannot be classified; previously this threw on toUpperCase()
    if (typeof duckdbType !== 'string') {
        return 'unknown';
    }
    const typeUpper = duckdbType.toUpperCase();
    const mentionsAny = (markers) => markers.some((marker) => typeUpper.includes(marker));
    // Numeric types ('INT' also covers BIGINT, SMALLINT, TINYINT, INTEGER, INT32, ...)
    // NOTE(review): the substring 'INT' matches INTERVAL and INT96 as well - confirm that is acceptable.
    if (mentionsAny(['INT', 'DOUBLE', 'FLOAT', 'DECIMAL', 'NUMERIC', 'REAL'])) {
        return 'numeric';
    }
    // String types ('CHAR' also covers VARCHAR)
    // NOTE(review): Parquet physical type names such as BYTE_ARRAY match none of these
    // markers and end up as 'unknown' - confirm whether string columns should be detected.
    if (mentionsAny(['CHAR', 'TEXT', 'STRING', 'UTF8'])) {
        return 'string';
    }
    // Boolean
    if (typeUpper.includes('BOOL')) {
        return 'boolean';
    }
    return 'unknown';
}
/**
 * Map a context identifier onto its relative directory path.
 *
 * Dots separate directory levels and colons become underscores. For the
 * common two-segment form only the second segment has its colons replaced,
 * e.g. "vessels.urn:mrn:imo:mmsi:368396230" -> "vessels/urn_mrn_imo_mmsi_368396230".
 */
function toContextFilePath(context) {
    const segments = context.split('.');
    if (segments.length !== 2) {
        // Any other shape: every dot is a separator and every colon is replaced
        return segments.join('/').split(':').join('_');
    }
    const [root, identifier] = segments;
    const safeIdentifier = identifier.split(':').join('_');
    return [root, safeIdentifier].join('/');
}
/**
 * Collect the full paths of all .parquet files below a directory.
 *
 * The tree is walked depth-first in directory-listing order. Entries named
 * quarantine, processed, failed, repaired or claude-schemas are ignored, and
 * directories that cannot be read are skipped silently (best effort).
 */
async function findParquetFiles(dir) {
    const ignoredNames = new Set([
        'quarantine',
        'processed',
        'failed',
        'repaired',
        'claude-schemas',
    ]);
    const collected = [];
    const walk = async (folder) => {
        try {
            const listing = await fs.readdir(folder, { withFileTypes: true });
            for (const item of listing) {
                // Special directories never contribute files
                if (ignoredNames.has(item.name)) {
                    continue;
                }
                const itemPath = path.join(folder, item.name);
                if (item.isDirectory()) {
                    await walk(itemPath);
                }
                else if (item.isFile() && item.name.endsWith('.parquet')) {
                    collected.push(itemPath);
                }
            }
        }
        catch (error) {
            // Unreadable directory: leave it out and carry on
        }
    };
    await walk(dir);
    return collected;
}
//# sourceMappingURL=schema-cache.js.map