/*
 * datapilot-cli
 * Version:
 * Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
 * 313 lines • 12.9 kB
 * JavaScript
 */
;
/**
* Parquet Parser Implementation
* Supports .parquet files using hyparquet library
*/
// TypeScript-emitted interop helper: re-exports property `k` of module `m`
// on object `o` under the (optionally renamed) key `k2`. When Object.create
// exists, it installs a live getter so the binding tracks the source module;
// otherwise it falls back to a one-time copy. Reuses an ambient
// `this.__createBinding` if one is already defined (e.g. by another bundle).
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Re-wrap in a getter unless the source is an ES module exposing its own
// accessor, or the descriptor is writable/configurable (plain data slot).
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
// Legacy environments: simple assignment (no live binding).
o[k2] = m[k];
}));
// TypeScript-emitted interop helper: attaches CommonJS module `v` as the
// `default` export of namespace object `o`, so `import x from '...'` works
// against CJS modules. Prefers a non-writable property when possible.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
// TypeScript-emitted interop helper implementing `import * as ns` semantics
// for CommonJS modules: copies every own key except "default" onto a fresh
// namespace object (via __createBinding) and sets the module itself as the
// `default` member. ES modules (`__esModule` flag) pass through unchanged.
var __importStar = (this && this.__importStar) || (function () {
// Lazily resolves the best available own-key enumerator once, then
// replaces itself with it for subsequent calls.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
// Mark this CommonJS module as transpiled-from-ESM for downstream interop.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare named exports; the real values are assigned after each
// definition below (createParquetParser relies on function hoisting).
exports.ParquetParser = exports.ParquetDetector = void 0;
exports.createParquetParser = createParquetParser;
// Node built-ins.
const fs_1 = require("fs");
const path = __importStar(require("path"));
// Project-local modules: base parser contract, shared error types, logger.
const data_parser_1 = require("./base/data-parser");
const types_1 = require("../core/types");
const logger_1 = require("../utils/logger");
/**
* Parquet Format Detector
*/
/**
 * Parquet Format Detector
 *
 * Scores how likely a given path is a Parquet file: a cheap extension check
 * first, then an attempt to read the Parquet footer metadata via hyparquet.
 * detect() never throws; failures are logged and reported as confidence 0.
 */
class ParquetDetector {
    /** File extensions this detector recognises. */
    getSupportedExtensions() {
        return ['.parquet'];
    }
    /** Canonical format name used in detection results. */
    getFormatName() {
        return 'parquet';
    }
    /**
     * Detect whether `filePath` looks like a Parquet file.
     * @param {string} filePath - Path to the candidate file.
     * @returns Detection result with confidence in [0, 0.98] plus metadata.
     */
    async detect(filePath) {
        try {
            const extension = path.extname(filePath).toLowerCase();
            const hasKnownExtension = this.getSupportedExtensions().includes(extension);
            const extensionScore = hasKnownExtension ? 0.4 : 0;
            // Guard: without a recognised extension we do not attempt to parse.
            if (extensionScore === 0) {
                return {
                    format: 'parquet',
                    confidence: 0,
                    metadata: { reason: 'Unsupported extension' },
                };
            }
            // Footer metadata is cheap to read and does not parse row data.
            const fileMetadata = await this.readParquetMetadata(filePath);
            if (fileMetadata.numRows >= 0) {
                // High confidence for valid Parquet files
                const confidence = Math.min(0.98, extensionScore + 0.58);
                return {
                    format: 'parquet',
                    confidence,
                    metadata: fileMetadata,
                    estimatedRows: fileMetadata.numRows,
                    estimatedColumns: fileMetadata.columnNames.length,
                    encoding: 'utf8',
                    suggestedOptions: {
                        hasHeader: true, // Parquet always has schema-defined column names
                    },
                };
            }
            return {
                format: 'parquet',
                confidence: extensionScore,
                metadata: { reason: 'No valid Parquet data found' },
            };
        }
        catch (error) {
            logger_1.logger.warn(`Parquet detection failed: ${error.message}`);
            return {
                format: 'parquet',
                confidence: 0,
                metadata: { error: error.message },
            };
        }
    }
    /**
     * Read Parquet footer metadata (row count, schema, row groups) without
     * materialising any row data.
     * @throws {DataPilotError} code PARQUET_METADATA_ERROR on any failure.
     */
    async readParquetMetadata(filePath) {
        try {
            // Stat first so a missing file fails before touching hyparquet.
            const fileStats = await fs_1.promises.stat(filePath);
            // Dynamic import hyparquet (with type assertion due to incomplete type definitions)
            const hyparquet = (await Promise.resolve().then(() => __importStar(require('hyparquet'))));
            const { asyncBufferFromFile, parquetMetadataAsync, parquetSchema } = hyparquet;
            const asyncFile = await asyncBufferFromFile(filePath);
            const rawMetadata = await parquetMetadataAsync(asyncFile);
            const schema = parquetSchema(rawMetadata);
            // Compression codec is taken from the first column of the first
            // row group; undefined when the file has no row groups.
            const firstColumnMeta = rawMetadata.row_groups[0]?.columns[0]?.meta_data;
            return {
                numRows: Number(rawMetadata.num_rows),
                schema,
                columnNames: schema.children.map((child) => child.element.name),
                fileSize: fileStats.size,
                rowGroups: rawMetadata.row_groups.length,
                compressionType: firstColumnMeta?.codec,
            };
        }
        catch (error) {
            throw new types_1.DataPilotError(`Failed to read Parquet metadata: ${error.message}`, 'PARQUET_METADATA_ERROR', types_1.ErrorSeverity.MEDIUM, types_1.ErrorCategory.PARSING);
        }
    }
}
exports.ParquetDetector = ParquetDetector;
/**
* Parquet Parser Implementation
*/
/**
 * Parquet Parser Implementation
 *
 * Streams rows out of a .parquet file via the hyparquet library. Rows are
 * yielded as objects whose `data` array follows the schema column order
 * discovered in the file footer.
 */
class ParquetParser extends data_parser_1.BaseParser {
    // Column names in schema order; populated by parse().
    headers = [];
    // Footer metadata of the most recently parsed file; null until parse() runs.
    metadata = null;
    getSupportedExtensions() {
        return ['.parquet'];
    }
    getFormatName() {
        return 'parquet';
    }
    /** Delegate format detection to ParquetDetector. */
    async detect(filePath) {
        const detector = new ParquetDetector();
        return detector.detect(filePath);
    }
    /**
     * Async generator yielding one ParsedRow per data row.
     * @param {string} filePath - Path to the .parquet file.
     * @param {object} [options] - Merged over this.options; supports maxRows.
     * @throws {DataPilotError} code PARQUET_PARSING_ERROR on any failure.
     */
    async *parse(filePath, options) {
        const mergedOptions = { ...this.options, ...options };
        try {
            // Get file size for stats
            const fileStats = await fs_1.promises.stat(filePath);
            this.updateStats(fileStats.size, 0);
            // Dynamic import hyparquet (with type assertion due to incomplete type definitions)
            const hyparquet = (await Promise.resolve().then(() => __importStar(require('hyparquet'))));
            const { asyncBufferFromFile, parquetReadObjects } = hyparquet;
            // Create async buffer for hyparquet
            const file = await asyncBufferFromFile(filePath);
            // Read metadata first
            this.metadata = await this.readFileMetadata(file);
            // readFileMetadata only sees the async buffer, so fill in the real
            // file size here (previously this was left at 0 and never set).
            this.metadata.fileSize = fileStats.size;
            this.headers = this.metadata.columnNames;
            logger_1.logger.info(`Parsing Parquet file: ${this.metadata.numRows} rows, ${this.headers.length} columns`);
            // Apply row limit if specified. Use ?? (not ||) so an explicit
            // maxRows of 0 yields zero rows instead of the whole file; clamp
            // to [0, numRows] to guard against negative limits.
            const maxRows = mergedOptions.maxRows ?? this.metadata.numRows;
            const rowLimit = Math.max(0, Math.min(maxRows, this.metadata.numRows));
            // Read data using hyparquet with row filtering
            const parquetOptions = {};
            if (rowLimit < this.metadata.numRows) {
                parquetOptions.rowStart = 0;
                parquetOptions.rowEnd = rowLimit;
            }
            // Add column filtering if specified (not implemented in current options but could be added)
            const data = await parquetReadObjects({
                file,
                ...parquetOptions,
            });
            // Convert to ParsedRow format
            let rowIndex = 0;
            for (const row of data) {
                if (this.aborted || rowIndex >= rowLimit)
                    break;
                // Convert object to array matching headers order
                const rowData = this.headers.map((header) => this.formatValue(row[header]));
                yield {
                    index: rowIndex++,
                    data: rowData,
                    // BigInt is not JSON-serialisable, so stringify it explicitly.
                    raw: JSON.stringify(row, (key, value) => typeof value === 'bigint' ? value.toString() : value),
                    metadata: {
                        originalType: 'parquet',
                        rowGroups: this.metadata?.rowGroups,
                        compressionType: this.metadata?.compressionType,
                        columnCount: rowData.length,
                    },
                };
                this.updateStats(0, 1);
            }
        }
        catch (error) {
            throw new types_1.DataPilotError(`Parquet parsing failed: ${error.message}`, 'PARQUET_PARSING_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.PARSING);
        }
    }
    /**
     * Read footer metadata from an already-open hyparquet async buffer.
     * fileSize is initialised to 0; parse() overwrites it from fs.stat.
     * @throws {DataPilotError} code PARQUET_FILE_METADATA_ERROR on failure.
     */
    async readFileMetadata(file) {
        try {
            const hyparquet = (await Promise.resolve().then(() => __importStar(require('hyparquet'))));
            const { parquetMetadataAsync, parquetSchema } = hyparquet;
            const metadata = await parquetMetadataAsync(file);
            const schema = parquetSchema(metadata);
            return {
                numRows: Number(metadata.num_rows),
                schema,
                columnNames: schema.children.map((child) => child.element.name),
                fileSize: 0, // overwritten by parse() with the stat'd size
                rowGroups: metadata.row_groups.length,
                compressionType: metadata.row_groups[0]?.columns[0]?.meta_data?.codec,
            };
        }
        catch (error) {
            throw new types_1.DataPilotError(`Failed to read Parquet file metadata: ${error.message}`, 'PARQUET_FILE_METADATA_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.PARSING);
        }
    }
    /**
     * Convert an arbitrary Parquet cell value to its string form:
     * null/undefined -> '', Date -> YYYY-MM-DD, arrays joined with ';',
     * nested objects JSON-encoded, BigInt/boolean/number via toString().
     */
    formatValue(value) {
        if (value === null || value === undefined) {
            return '';
        }
        // Handle different data types from Parquet
        if (typeof value === 'object') {
            // Handle dates
            if (value instanceof Date) {
                return value.toISOString().split('T')[0]; // Return date as YYYY-MM-DD
            }
            // Handle complex objects (arrays, nested objects)
            if (Array.isArray(value)) {
                return value.map((item) => this.formatValue(item)).join(';');
            }
            // Handle nested objects
            return JSON.stringify(value);
        }
        // Handle BigInt values (common in Parquet)
        if (typeof value === 'bigint') {
            return value.toString();
        }
        // Handle boolean values
        if (typeof value === 'boolean') {
            return value.toString();
        }
        // Handle numbers: toString() gives the shortest round-trip decimal
        // representation, so no precision is lost. (The previous
        // toPrecision(10) truncated digits and could emit scientific
        // notation for values with more than 10 significant figures.)
        if (typeof value === 'number') {
            return value.toString();
        }
        // Handle string values
        return String(value);
    }
    /**
     * Get detected headers for column mapping
     */
    getHeaders() {
        return [...this.headers];
    }
    /**
     * Get Parquet file metadata
     */
    getMetadata() {
        return this.metadata;
    }
    /**
     * Get schema information from Parquet file
     * @throws {DataPilotError} code PARQUET_SCHEMA_ERROR on failure.
     */
    async getSchema(filePath) {
        try {
            const hyparquet = (await Promise.resolve().then(() => __importStar(require('hyparquet'))));
            const { asyncBufferFromFile, parquetMetadataAsync, parquetSchema } = hyparquet;
            const file = await asyncBufferFromFile(filePath);
            const metadata = await parquetMetadataAsync(file);
            return parquetSchema(metadata);
        }
        catch (error) {
            throw new types_1.DataPilotError(`Failed to read Parquet schema: ${error.message}`, 'PARQUET_SCHEMA_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.PARSING);
        }
    }
    /**
     * Get row group information for optimization
     * @throws {DataPilotError} code PARQUET_ROW_GROUPS_ERROR on failure.
     */
    async getRowGroups(filePath) {
        try {
            const hyparquet = (await Promise.resolve().then(() => __importStar(require('hyparquet'))));
            const { asyncBufferFromFile, parquetMetadataAsync } = hyparquet;
            const file = await asyncBufferFromFile(filePath);
            const metadata = await parquetMetadataAsync(file);
            return metadata.row_groups.map((rg, index) => ({
                index,
                numRows: Number(rg.num_rows),
                totalByteSize: Number(rg.total_byte_size),
                columns: rg.columns.map((col) => ({
                    name: col.meta_data.path_in_schema,
                    type: col.meta_data.type,
                    compression: col.meta_data.codec,
                })),
            }));
        }
        catch (error) {
            throw new types_1.DataPilotError(`Failed to read Parquet row groups: ${error.message}`, 'PARQUET_ROW_GROUPS_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.PARSING);
        }
    }
}
exports.ParquetParser = ParquetParser;
/**
* Factory function to create Parquet parser
*/
/**
 * Factory function to create Parquet parser
 *
 * Thin convenience wrapper so callers need not reference the class directly.
 * (Must remain a function declaration: the export assignment above relies on
 * hoisting.)
 */
function createParquetParser(options) {
    const parser = new ParquetParser(options);
    return parser;
}
//# sourceMappingURL=parquet-parser.js.map