UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

313 lines 12.9 kB
"use strict"; /** * Parquet Parser Implementation * Supports .parquet files using hyparquet library */ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.ParquetParser = exports.ParquetDetector = void 0; exports.createParquetParser = createParquetParser; const fs_1 = require("fs"); const path = __importStar(require("path")); const data_parser_1 = require("./base/data-parser"); const types_1 = require("../core/types"); const logger_1 = require("../utils/logger"); /** * Parquet Format Detector */ class ParquetDetector { getSupportedExtensions() { return ['.parquet']; } getFormatName() { return 'parquet'; } async detect(filePath) { try { // Check extension first const ext = path.extname(filePath).toLowerCase(); const extensionScore = this.getSupportedExtensions().includes(ext) ? 0.4 : 0; if (extensionScore === 0) { return { format: 'parquet', confidence: 0, metadata: { reason: 'Unsupported extension' }, }; } // Try to read Parquet metadata without parsing full file const metadata = await this.readParquetMetadata(filePath); if (metadata.numRows >= 0) { const confidence = Math.min(0.98, extensionScore + 0.58); // High confidence for valid Parquet files return { format: 'parquet', confidence, metadata, estimatedRows: metadata.numRows, estimatedColumns: metadata.columnNames.length, encoding: 'utf8', suggestedOptions: { hasHeader: true, // Parquet always has schema-defined column names }, }; } return { format: 'parquet', confidence: extensionScore, metadata: { reason: 'No valid Parquet data found' }, }; } catch (error) { logger_1.logger.warn(`Parquet detection failed: ${error.message}`); return { format: 'parquet', confidence: 0, metadata: { error: error.message }, }; } } async readParquetMetadata(filePath) { try { // Check if file exists and get size const fileStats = await fs_1.promises.stat(filePath); // Dynamic import hyparquet (with type assertion due to incomplete type definitions) const hyparquet = (await Promise.resolve().then(() => __importStar(require('hyparquet')))); const { asyncBufferFromFile, parquetMetadataAsync, parquetSchema } = hyparquet; // Create async buffer for hyparquet const file = await asyncBufferFromFile(filePath); // Read metadata using hyparquet const metadata = await parquetMetadataAsync(file); const schema = parquetSchema(metadata); return { numRows: Number(metadata.num_rows), schema, columnNames: schema.children.map((child) => child.element.name), fileSize: fileStats.size, rowGroups: metadata.row_groups.length, compressionType: metadata.row_groups[0]?.columns[0]?.meta_data?.codec, }; } catch (error) { throw new types_1.DataPilotError(`Failed to read Parquet metadata: ${error.message}`, 'PARQUET_METADATA_ERROR', types_1.ErrorSeverity.MEDIUM, types_1.ErrorCategory.PARSING); } } } exports.ParquetDetector = ParquetDetector; /** * Parquet Parser Implementation */ class ParquetParser extends data_parser_1.BaseParser { headers = []; metadata = null; getSupportedExtensions() { return ['.parquet']; } getFormatName() { return 'parquet'; } async detect(filePath) { const detector = new ParquetDetector(); return detector.detect(filePath); } async *parse(filePath, options) { const mergedOptions = { ...this.options, ...options }; try { // Get file size for stats const fileStats = await fs_1.promises.stat(filePath); this.updateStats(fileStats.size, 0); // Dynamic import hyparquet (with type assertion due to incomplete type definitions) const hyparquet = (await Promise.resolve().then(() => __importStar(require('hyparquet')))); const { asyncBufferFromFile, parquetMetadataAsync, parquetSchema, parquetReadObjects } = hyparquet; // Create async buffer for hyparquet const file = await asyncBufferFromFile(filePath); // Read metadata first this.metadata = await this.readFileMetadata(file); this.headers = this.metadata.columnNames; logger_1.logger.info(`Parsing Parquet file: ${this.metadata.numRows} rows, ${this.headers.length} columns`); // Apply row limit if specified const maxRows = mergedOptions.maxRows || this.metadata.numRows; const rowLimit = Math.min(maxRows, this.metadata.numRows); // Read data using hyparquet with row filtering const parquetOptions = {}; if (rowLimit < this.metadata.numRows) { parquetOptions.rowStart = 0; parquetOptions.rowEnd = rowLimit; } // Add column filtering if specified (not implemented in current options but could be added) const data = await parquetReadObjects({ file, ...parquetOptions, }); // Convert to ParsedRow format let rowIndex = 0; for (const row of data) { if (this.aborted || rowIndex >= rowLimit) break; // Convert object to array matching headers order const rowData = this.headers.map((header) => { const value = row[header]; return this.formatValue(value); }); yield { index: rowIndex++, data: rowData, raw: JSON.stringify(row, (key, value) => typeof value === 'bigint' ? value.toString() : value), metadata: { originalType: 'parquet', rowGroups: this.metadata?.rowGroups, compressionType: this.metadata?.compressionType, columnCount: rowData.length, }, }; this.updateStats(0, 1); } } catch (error) { throw new types_1.DataPilotError(`Parquet parsing failed: ${error.message}`, 'PARQUET_PARSING_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.PARSING); } } async readFileMetadata(file) { try { const hyparquet = (await Promise.resolve().then(() => __importStar(require('hyparquet')))); const { parquetMetadataAsync, parquetSchema } = hyparquet; const metadata = await parquetMetadataAsync(file); const schema = parquetSchema(metadata); return { numRows: Number(metadata.num_rows), schema, columnNames: schema.children.map((child) => child.element.name), fileSize: 0, // Will be set by calling code rowGroups: metadata.row_groups.length, compressionType: metadata.row_groups[0]?.columns[0]?.meta_data?.codec, }; } catch (error) { throw new types_1.DataPilotError(`Failed to read Parquet file metadata: ${error.message}`, 'PARQUET_FILE_METADATA_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.PARSING); } } formatValue(value) { if (value === null || value === undefined) { return ''; } // Handle different data types from Parquet if (typeof value === 'object') { // Handle dates if (value instanceof Date) { return value.toISOString().split('T')[0]; // Return date as YYYY-MM-DD } // Handle complex objects (arrays, nested objects) if (Array.isArray(value)) { return value.map((item) => this.formatValue(item)).join(';'); } // Handle nested objects return JSON.stringify(value); } // Handle BigInt values (common in Parquet) if (typeof value === 'bigint') { return value.toString(); } // Handle boolean values if (typeof value === 'boolean') { return value.toString(); } // Handle numbers with proper precision if (typeof value === 'number') { // Preserve precision for decimals, avoid scientific notation for large integers return Number.isInteger(value) ? value.toString() : value.toPrecision(10).replace(/\.?0+$/, ''); } // Handle string values return String(value); } /** * Get detected headers for column mapping */ getHeaders() { return [...this.headers]; } /** * Get Parquet file metadata */ getMetadata() { return this.metadata; } /** * Get schema information from Parquet file */ async getSchema(filePath) { try { const hyparquet = (await Promise.resolve().then(() => __importStar(require('hyparquet')))); const { asyncBufferFromFile, parquetMetadataAsync, parquetSchema } = hyparquet; const file = await asyncBufferFromFile(filePath); const metadata = await parquetMetadataAsync(file); return parquetSchema(metadata); } catch (error) { throw new types_1.DataPilotError(`Failed to read Parquet schema: ${error.message}`, 'PARQUET_SCHEMA_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.PARSING); } } /** * Get row group information for optimization */ async getRowGroups(filePath) { try { const hyparquet = (await Promise.resolve().then(() => __importStar(require('hyparquet')))); const { asyncBufferFromFile, parquetMetadataAsync } = hyparquet; const file = await asyncBufferFromFile(filePath); const metadata = await parquetMetadataAsync(file); return metadata.row_groups.map((rg, index) => ({ index, numRows: Number(rg.num_rows), totalByteSize: Number(rg.total_byte_size), columns: rg.columns.map((col) => ({ name: col.meta_data.path_in_schema, type: col.meta_data.type, compression: col.meta_data.codec, })), })); } catch (error) { throw new types_1.DataPilotError(`Failed to read Parquet row groups: ${error.message}`, 'PARQUET_ROW_GROUPS_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.PARSING); } } } exports.ParquetParser = ParquetParser; /** * Factory function to create Parquet parser */ function createParquetParser(options) { return new ParquetParser(options); } //# sourceMappingURL=parquet-parser.js.map