datapilot-cli
Version:
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
252 lines • 9.87 kB
JavaScript
;
/**
* Parser Registry - Universal format detection and parser management
*/
// Interop helper: re-exposes property `k` of module object `m` on `o` under the
// name `k2` (defaults to `k`). An already-defined helper on `this` is reused.
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // Legacy path: plain value copy, no live binding.
        return function (o, m, k, k2) {
            const alias = k2 === undefined ? k : k2;
            o[alias] = m[k];
        };
    }
    return function (o, m, k, k2) {
        const alias = k2 === undefined ? k : k2;
        let desc = Object.getOwnPropertyDescriptor(m, k);
        // Wrap in a forwarding getter when there is no own descriptor, when an
        // accessor comes from a non-ES module, or when a data property could
        // still change (writable or configurable).
        const needsGetter = !desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable);
        if (needsGetter) {
            desc = { enumerable: true, get: function () { return m[k]; } };
        }
        Object.defineProperty(o, alias, desc);
    };
})();
// Interop helper: attaches `v` to `o` as its "default" member. An
// already-defined helper on `this` is reused.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        // Defined as an enumerable data property; writable/configurable are
        // left at their defineProperty defaults.
        return function (o, v) {
            Object.defineProperty(o, "default", { enumerable: true, value: v });
        };
    }
    // Legacy path: plain assignment.
    return function (o, v) {
        o["default"] = v;
    };
})();
// Interop helper: turns a required module into a namespace-style object.
// ES modules (flagged with __esModule) are returned untouched; anything else is
// wrapped so that its own properties are re-exposed and the module itself
// becomes the "default" member.
var __importStar = (this && this.__importStar) || (function () {
    // Resolved on first use so the choice of key lister is made lazily.
    let listOwnKeys = null;
    const fallbackOwnKeys = function (target) {
        const names = [];
        for (const key in target) {
            if (Object.prototype.hasOwnProperty.call(target, key)) {
                names[names.length] = key;
            }
        }
        return names;
    };
    return function (mod) {
        if (mod && mod.__esModule) {
            return mod;
        }
        const result = {};
        if (mod != null) {
            if (listOwnKeys === null) {
                listOwnKeys = Object.getOwnPropertyNames || fallbackOwnKeys;
            }
            const keys = listOwnKeys(mod);
            for (let idx = 0; idx < keys.length; idx++) {
                // "default" is supplied separately below.
                if (keys[idx] !== "default") {
                    __createBinding(result, mod, keys[idx]);
                }
            }
        }
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.globalParserRegistry = exports.ParserRegistry = void 0;
const fs_1 = require("fs");
const path = __importStar(require("path"));
const types_1 = require("../../core/types");
const logger_1 = require("../../utils/logger");
/**
* Central registry for all data parsers
* Handles format detection and parser instantiation
*/
class ParserRegistry {
    /** Lowest detection confidence at which a parser is accepted as a match. */
    static MIN_CONFIDENCE = 0.5;
    /**
     * Candidates whose confidence is within this distance of the strongest
     * acceptable candidate are treated as tied and ranked by priority instead.
     */
    static CONFIDENCE_TIE_MARGIN = 0.1;
    /** format name -> registration object */
    registrations = new Map();
    /** normalized extension (lower-case, leading dot) -> list of format names */
    extensionMap = new Map();
    /**
     * Register a parser for a specific format.
     *
     * Registering the same format again replaces the stored registration. A
     * format is listed at most once per extension, so re-registration (or
     * listing both "csv" and ".CSV") never makes detection run twice.
     *
     * @param registration Object with `format`, `extensions`, and the
     *   `priority`, `detector` and `parserFactory` members used elsewhere in
     *   this class.
     */
    register(registration) {
        const { format, extensions = [] } = registration;
        this.registrations.set(format, registration);
        for (const ext of extensions) {
            const lowered = ext.toLowerCase();
            const normalized = lowered.startsWith('.') ? lowered : `.${lowered}`;
            let formats = this.extensionMap.get(normalized);
            if (!formats) {
                formats = [];
                this.extensionMap.set(normalized, formats);
            }
            if (!formats.includes(format)) {
                formats.push(format);
            }
        }
        logger_1.logger.info(`Registered parser for format: ${format} (extensions: ${extensions.join(', ')})`);
    }
    /**
     * Auto-detect the format of a file and return the matching parser.
     *
     * @param filePath Path of the file to inspect.
     * @param options Parse options. `options.format` forces a format; the whole
     *   object is forwarded to the parser factory.
     * @returns `{ parser, format, detection, registration }` for the selected parser.
     * @throws DataPilotError with code `UNSUPPORTED_FORMAT` when no candidate
     *   reaches `MIN_CONFIDENCE`.
     */
    async getParser(filePath, options = {}) {
        // 1. Forced format bypasses detection-based selection
        if (options.format) {
            return await this.getParserByFormat(filePath, options.format, options);
        }
        // 2. Extension-based candidates first (fast)
        const extensionCandidates = await this.getCandidatesByExtension(filePath);
        // 3. Content detection on the candidates; options reach the parser factory
        const detectionResults = await this.runContentDetection(filePath, extensionCandidates, options);
        // 4. Strongest confidence first, so the error report lists the most plausible formats
        detectionResults.sort((a, b) => b.detection.confidence - a.detection.confidence);
        // 5. Pick among acceptable candidates only, or fail with a descriptive error
        const best = this.selectBestResult(detectionResults);
        if (!best) {
            throw new types_1.DataPilotError(this.buildUnsupportedFormatError(filePath, detectionResults), 'UNSUPPORTED_FORMAT', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.VALIDATION);
        }
        logger_1.logger.info(`Selected parser: ${best.format} (confidence: ${best.detection.confidence.toFixed(2)})`);
        return best;
    }
    /**
     * Choose the winning detection result.
     *
     * Only results with confidence >= `MIN_CONFIDENCE` are eligible, so a
     * high-priority but low-confidence candidate can never displace a valid
     * one. Among eligible results within `CONFIDENCE_TIE_MARGIN` of the
     * strongest, the highest priority wins; remaining ties keep the
     * higher-confidence (earlier) entry.
     *
     * @param sortedResults Detection results sorted by confidence, descending.
     * @returns The selected result, or `undefined` when none is eligible.
     */
    selectBestResult(sortedResults) {
        const eligible = sortedResults.filter((result) => result.detection.confidence >= ParserRegistry.MIN_CONFIDENCE);
        if (eligible.length === 0) {
            return undefined;
        }
        const topConfidence = eligible[0].detection.confidence;
        let best = eligible[0];
        for (const candidate of eligible) {
            // Input is sorted, so everything after the first clear gap is out of contention.
            if (topConfidence - candidate.detection.confidence > ParserRegistry.CONFIDENCE_TIE_MARGIN) {
                break;
            }
            if ((candidate.registration.priority ?? 0) > (best.registration.priority ?? 0)) {
                best = candidate;
            }
        }
        return best;
    }
    /**
     * Get the parser for an explicitly named format.
     *
     * The format's detector is still run and its result is returned, but it is
     * not used to reject the file.
     *
     * @param filePath Path of the file to inspect.
     * @param format Registered format name.
     * @param options Parse options forwarded to the parser factory.
     * @throws DataPilotError with code `UNSUPPORTED_FORMAT` when the format is not registered.
     */
    async getParserByFormat(filePath, format, options = {}) {
        const registration = this.registrations.get(format);
        if (!registration) {
            throw new types_1.DataPilotError(`Unsupported format: ${format}. Available formats: ${this.getSupportedFormats().join(', ')}`, 'UNSUPPORTED_FORMAT', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.VALIDATION);
        }
        const parser = registration.parserFactory(options);
        const detection = await registration.detector.detect(filePath);
        return {
            parser,
            format,
            detection,
            registration,
        };
    }
    /**
     * Get candidate registrations based on the file extension, highest
     * priority first. When no registration claims the extension, every
     * registration is returned so content detection can decide.
     *
     * @param filePath Path whose extension is examined (case-insensitive).
     */
    async getCandidatesByExtension(filePath) {
        const extension = path.extname(filePath).toLowerCase();
        const byPriorityDesc = (a, b) => b.priority - a.priority;
        const formatNames = this.extensionMap.get(extension) || [];
        const candidates = formatNames
            .map((format) => this.registrations.get(format))
            .filter((reg) => reg !== undefined)
            .sort(byPriorityDesc);
        if (candidates.length > 0) {
            return candidates;
        }
        // No extension match: fall back to trying every registered parser
        const allRegistrations = Array.from(this.registrations.values()).sort(byPriorityDesc);
        logger_1.logger.warn(`No parser found for extension ${extension}, trying all parsers`);
        return allRegistrations;
    }
    /**
     * Run content detection for each candidate, in order.
     *
     * A detector (or factory) that throws is logged and skipped; results with
     * confidence <= 0 are dropped.
     *
     * @param filePath Path of the file to inspect.
     * @param candidates Registrations to try.
     * @param options Optional parse options forwarded to each parser factory.
     * @returns Array of `{ parser, format, detection, registration }`.
     */
    async runContentDetection(filePath, candidates, options) {
        const results = [];
        for (const registration of candidates) {
            try {
                const detection = await registration.detector.detect(filePath);
                if (detection.confidence > 0) {
                    const parser = registration.parserFactory(options);
                    results.push({
                        parser,
                        format: registration.format,
                        detection,
                        registration,
                    });
                }
            }
            catch (error) {
                // Thrown values are not guaranteed to be Error instances
                const reason = error instanceof Error ? error.message : String(error);
                logger_1.logger.warn(`Detection failed for ${registration.format}: ${reason}`);
            }
        }
        return results;
    }
    /**
     * Build the multi-line message used when no parser is acceptable.
     *
     * Lists supported formats and extensions, up to three detection results
     * (in the order given), and remediation suggestions.
     */
    buildUnsupportedFormatError(filePath, detectionResults) {
        const extension = path.extname(filePath);
        const supportedFormats = this.getSupportedFormats();
        const supportedExtensions = this.getSupportedExtensions();
        let message = `Unsupported file format: ${extension}\n\n`;
        message += `Supported formats: ${supportedFormats.join(', ')}\n`;
        message += `Supported extensions: ${supportedExtensions.join(', ')}\n\n`;
        if (detectionResults.length > 0) {
            message += 'Detection results:\n';
            for (const result of detectionResults.slice(0, 3)) {
                message += ` - ${result.format}: ${(result.detection.confidence * 100).toFixed(1)}% confidence\n`;
            }
            message += '\n';
        }
        message += 'Suggestions:\n';
        message += ` - Check if the file is corrupted\n`;
        message += ` - Try specifying format explicitly: --format csv\n`;
        message += ` - Convert to a supported format first\n`;
        return message;
    }
    /**
     * Get all registered format names, sorted.
     */
    getSupportedFormats() {
        return Array.from(this.registrations.keys()).sort();
    }
    /**
     * Get all registered (normalized) file extensions, sorted.
     */
    getSupportedExtensions() {
        return Array.from(this.extensionMap.keys()).sort();
    }
    /**
     * Get the registration stored for a format, or `undefined` if none.
     */
    getFormatInfo(format) {
        return this.registrations.get(format);
    }
    /**
     * Check whether a format has been registered.
     */
    isFormatSupported(format) {
        return this.registrations.has(format);
    }
    /**
     * Get summary statistics about registered parsers.
     *
     * @returns `{ formatCount, extensionCount, formats }`, where each entry of
     *   `formats` is `{ name, extensions, priority }`.
     */
    getRegistryStats() {
        const formats = Array.from(this.registrations.values()).map((reg) => ({
            name: reg.format,
            extensions: reg.extensions,
            priority: reg.priority,
        }));
        return {
            formatCount: this.registrations.size,
            extensionCount: this.extensionMap.size,
            formats,
        };
    }
    /**
     * Check whether a file can be parsed by any registered parser.
     *
     * Uses the same acceptance threshold and tie-breaking as `getParser`, so a
     * file reported as supported here is one `getParser` would accept.
     * Never throws: any failure (for example an inaccessible file) is reported
     * as `{ supported: false, allResults: [] }`.
     *
     * @param filePath Path of the file to check.
     * @returns `{ supported, bestMatch, allResults }`, with `allResults` sorted
     *   by confidence, descending.
     */
    async validateFile(filePath) {
        try {
            // Rejects when the file is not accessible
            await fs_1.promises.access(filePath);
            const candidates = await this.getCandidatesByExtension(filePath);
            const allResults = await this.runContentDetection(filePath, candidates);
            allResults.sort((a, b) => b.detection.confidence - a.detection.confidence);
            const bestMatch = this.selectBestResult(allResults);
            return {
                supported: bestMatch !== undefined,
                bestMatch,
                allResults,
            };
        }
        catch {
            // Deliberately best-effort: callers get a negative answer, not an exception
            return {
                supported: false,
                allResults: [],
            };
        }
    }
}
exports.ParserRegistry = ParserRegistry;
/**
 * Global parser registry instance.
 *
 * Created once when this module is loaded and exported for shared use. It is
 * constructed with no arguments, so it starts with no parsers registered;
 * formats have to be added through `register()`.
 */
exports.globalParserRegistry = new ParserRegistry();
//# sourceMappingURL=parser-registry.js.map