UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

252 lines 9.87 kB
"use strict";
/**
 * Parser Registry - Universal format detection and parser management
 */
// The three helpers below appear to be compiler-emitted CommonJS interop
// shims; their logic is kept exactly as in the original.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.globalParserRegistry = exports.ParserRegistry = void 0;
const fs_1 = require("fs");
const path = __importStar(require("path"));
const types_1 = require("../../core/types");
const logger_1 = require("../../utils/logger");
/**
 * Lowest detection confidence at which a parser is accepted.
 * Shared by getParser() and validateFile() so that both agree on whether a
 * file is parseable (a confidence of exactly this value is accepted).
 */
const MIN_ACCEPTED_CONFIDENCE = 0.5;
/**
 * Two confidences closer than this are treated as a tie when ranking
 * detection results; the tie is then broken by registration priority.
 */
const CONFIDENCE_TIE_MARGIN = 0.1;
/**
 * Central registry for all data parsers
 * Handles format detection and parser instantiation
 */
class ParserRegistry {
    registrations = new Map(); // format -> registration
    extensionMap = new Map(); // extension -> format[]
    /**
     * Register a parser for a specific format.
     *
     * Registering the same format again replaces its registration and does
     * not add duplicate entries to the extension lookup.
     *
     * @param {object} registration - Must provide `format` and `extensions`;
     *   other methods of this class also read `priority`, `detector.detect`
     *   and `parserFactory` from it.
     */
    register(registration) {
        const { format, extensions } = registration;
        // Register the parser
        this.registrations.set(format, registration);
        // Map extensions to format
        for (const ext of extensions) {
            const lowered = ext.toLowerCase();
            // Keys are always lower-case with a leading dot, matching path.extname().
            const normalized = lowered.startsWith('.') ? lowered : `.${lowered}`;
            let formats = this.extensionMap.get(normalized);
            if (!formats) {
                formats = [];
                this.extensionMap.set(normalized, formats);
            }
            // Guard against duplicates when a format is registered more than once.
            if (!formats.includes(format)) {
                formats.push(format);
            }
        }
        logger_1.logger.info(`Registered parser for format: ${format} (extensions: ${extensions.join(', ')})`);
    }
    /**
     * Auto-detect format and return appropriate parser.
     *
     * @param {string} filePath - File to inspect.
     * @param {object} [options] - If `options.format` is set, detection by
     *   extension is skipped and that format is used. The options object is
     *   forwarded to the parser factory in both paths.
     * @returns {Promise<object>} `{ parser, format, detection, registration }`
     * @throws {DataPilotError} `UNSUPPORTED_FORMAT` when no candidate reaches
     *   the minimum accepted confidence.
     */
    async getParser(filePath, options = {}) {
        // 1. Force format if specified
        if (options.format) {
            return await this.getParserByFormat(filePath, options.format, options);
        }
        // 2. Try extension-based detection first (fast)
        const extensionCandidates = await this.getCandidatesByExtension(filePath);
        // 3. Run content detection on candidates
        const detectionResults = await this.runContentDetection(filePath, extensionCandidates, options);
        // 4. Sort by confidence and priority
        // NOTE(review): because near-equal confidences are treated as ties, this
        // comparator is not a strict ordering when three or more candidates have
        // chained near-ties; behaviour is kept as-is to avoid changing selection.
        detectionResults.sort((a, b) => {
            const confidenceGap = b.detection.confidence - a.detection.confidence;
            // Primary: confidence
            if (Math.abs(confidenceGap) > CONFIDENCE_TIE_MARGIN) {
                return confidenceGap;
            }
            // Secondary: priority
            return b.registration.priority - a.registration.priority;
        });
        // 5. Return best match or throw error
        const best = detectionResults[0];
        if (!best || best.detection.confidence < MIN_ACCEPTED_CONFIDENCE) {
            throw new types_1.DataPilotError(this.buildUnsupportedFormatError(filePath, detectionResults), 'UNSUPPORTED_FORMAT', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.VALIDATION);
        }
        logger_1.logger.info(`Selected parser: ${best.format} (confidence: ${best.detection.confidence.toFixed(2)})`);
        return best;
    }
    /**
     * Get parser by specific format.
     *
     * The detector is still run so the result carries a `detection` entry,
     * but its confidence is not checked here.
     *
     * @param {string} filePath - File handed to the format's detector.
     * @param {string} format - Registered format name.
     * @param {object} [options] - Forwarded to the parser factory.
     * @returns {Promise<object>} `{ parser, format, detection, registration }`
     * @throws {DataPilotError} `UNSUPPORTED_FORMAT` when the format is not registered.
     */
    async getParserByFormat(filePath, format, options = {}) {
        const registration = this.registrations.get(format);
        if (!registration) {
            throw new types_1.DataPilotError(`Unsupported format: ${format}. Available formats: ${this.getSupportedFormats().join(', ')}`, 'UNSUPPORTED_FORMAT', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.VALIDATION);
        }
        const parser = registration.parserFactory(options);
        const detection = await registration.detector.detect(filePath);
        return {
            parser,
            format,
            detection,
            registration,
        };
    }
    /**
     * Get candidate formats based on file extension.
     *
     * @param {string} filePath - Path whose (lower-cased) extension is looked up.
     * @returns {Promise<object[]>} Matching registrations ordered by descending
     *   priority, or every registration (same ordering) when none match.
     */
    async getCandidatesByExtension(filePath) {
        const extension = path.extname(filePath).toLowerCase();
        const byPriorityDesc = (a, b) => b.priority - a.priority;
        const formatNames = this.extensionMap.get(extension) || [];
        const candidates = formatNames
            .map((format) => this.registrations.get(format))
            .filter((reg) => reg !== undefined)
            .sort(byPriorityDesc);
        // If no extension matches, try all parsers (lower priority)
        if (candidates.length === 0) {
            const allRegistrations = Array.from(this.registrations.values()).sort(byPriorityDesc);
            logger_1.logger.warn(`No parser found for extension ${extension}, trying all parsers`);
            return allRegistrations;
        }
        return candidates;
    }
    /**
     * Run content detection on candidate parsers.
     *
     * Candidates are probed one after another. A detector that throws is
     * logged and skipped; candidates reporting zero confidence are dropped.
     *
     * @param {string} filePath - File passed to each detector.
     * @param {object[]} candidates - Registrations to probe.
     * @param {object} [options] - Forwarded to each parser factory. When
     *   omitted the factory receives `undefined`, as if called with no argument.
     * @returns {Promise<object[]>} One `{ parser, format, detection, registration }`
     *   per candidate with confidence above zero, in candidate order.
     */
    async runContentDetection(filePath, candidates, options) {
        const results = [];
        for (const registration of candidates) {
            try {
                const detection = await registration.detector.detect(filePath);
                if (detection.confidence > 0) {
                    const parser = registration.parserFactory(options);
                    results.push({
                        parser,
                        format: registration.format,
                        detection,
                        registration,
                    });
                }
            }
            catch (error) {
                // A thrown value is not guaranteed to be an Error instance.
                const reason = error instanceof Error ? error.message : String(error);
                logger_1.logger.warn(`Detection failed for ${registration.format}: ${reason}`);
            }
        }
        return results;
    }
    /**
     * Build comprehensive error message for unsupported formats.
     *
     * @param {string} filePath - File that could not be matched.
     * @param {object[]} detectionResults - Results to summarise; only the
     *   first three are listed.
     * @returns {string} Multi-line, human-readable message.
     */
    buildUnsupportedFormatError(filePath, detectionResults) {
        const extension = path.extname(filePath);
        const supportedFormats = this.getSupportedFormats();
        const supportedExtensions = this.getSupportedExtensions();
        // path.extname() yields '' for extension-less paths; avoid a dangling label.
        let message = `Unsupported file format: ${extension || '(no extension)'}\n\n`;
        message += `Supported formats: ${supportedFormats.join(', ')}\n`;
        message += `Supported extensions: ${supportedExtensions.join(', ')}\n\n`;
        if (detectionResults.length > 0) {
            message += 'Detection results:\n';
            for (const result of detectionResults.slice(0, 3)) {
                message += ` - ${result.format}: ${(result.detection.confidence * 100).toFixed(1)}% confidence\n`;
            }
            message += '\n';
        }
        message += 'Suggestions:\n';
        message += ` - Check if the file is corrupted\n`;
        message += ` - Try specifying format explicitly: --format csv\n`;
        message += ` - Convert to a supported format first\n`;
        return message;
    }
    /**
     * Get all supported format names
     * @returns {string[]} Registered format names, sorted.
     */
    getSupportedFormats() {
        return Array.from(this.registrations.keys()).sort();
    }
    /**
     * Get all supported file extensions
     * @returns {string[]} Registered extensions (lower-case, dot-prefixed), sorted.
     */
    getSupportedExtensions() {
        return Array.from(this.extensionMap.keys()).sort();
    }
    /**
     * Get format information
     * @param {string} format - Format name.
     * @returns {object|undefined} The registration, or undefined if unknown.
     */
    getFormatInfo(format) {
        return this.registrations.get(format);
    }
    /**
     * Check if format is supported
     * @param {string} format - Format name.
     * @returns {boolean} True when a registration exists for the format.
     */
    isFormatSupported(format) {
        return this.registrations.has(format);
    }
    /**
     * Get statistics about registered parsers
     * @returns {{formatCount: number, extensionCount: number, formats: object[]}}
     *   Counts plus a `{ name, extensions, priority }` summary per registration.
     */
    getRegistryStats() {
        const formats = Array.from(this.registrations.values()).map((reg) => ({
            name: reg.format,
            extensions: reg.extensions,
            priority: reg.priority,
        }));
        return {
            formatCount: this.registrations.size,
            extensionCount: this.extensionMap.size,
            formats,
        };
    }
    /**
     * Validate file can be parsed by any registered parser.
     *
     * Never throws: any failure (for example the file not being accessible)
     * is reported as `{ supported: false, allResults: [] }`.
     *
     * @param {string} filePath - File to check.
     * @returns {Promise<{supported: boolean, bestMatch?: object, allResults: object[]}>}
     *   `allResults` is ordered by descending confidence; `bestMatch` is set
     *   only when it reaches the minimum accepted confidence.
     */
    async validateFile(filePath) {
        try {
            // Check if file exists
            await fs_1.promises.access(filePath);
            // Get all detection results
            const candidates = await this.getCandidatesByExtension(filePath);
            const allResults = await this.runContentDetection(filePath, candidates);
            // Find best match
            allResults.sort((a, b) => b.detection.confidence - a.detection.confidence);
            const bestMatch = allResults[0];
            // Same acceptance rule as getParser(): exactly the threshold counts.
            const supported = bestMatch !== undefined &&
                bestMatch.detection.confidence >= MIN_ACCEPTED_CONFIDENCE;
            return {
                supported,
                bestMatch: supported ? bestMatch : undefined,
                allResults,
            };
        }
        catch {
            // Deliberate best-effort: validation reports failure instead of throwing.
            return {
                supported: false,
                allResults: [],
            };
        }
    }
}
exports.ParserRegistry = ParserRegistry;
/**
 * Global parser registry instance
 */
exports.globalParserRegistry = new ParserRegistry();
//# sourceMappingURL=parser-registry.js.map