UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

229 lines 8.71 kB
"use strict";
/**
 * TSV Parser Implementation
 * Tab-separated values parser built on CSV parser foundation
 */

// ---------------------------------------------------------------------------
// TypeScript-emitted CommonJS interop helpers (generated code, kept as emitted).
// ---------------------------------------------------------------------------
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.TSVParser = exports.TSVDetector = void 0;
exports.createTSVParser = createTSVParser;
const csv_parser_adapter_1 = require("./adapters/csv-parser-adapter");
const logger_1 = require("../utils/logger");
const fs_1 = require("fs");
const path = __importStar(require("path"));

/** File extensions treated as TSV by both the detector and the parser. */
const TSV_EXTENSIONS = Object.freeze(['.tsv', '.tab']);
/** Number of bytes read from the start of a file for format detection. */
const SAMPLE_BYTES = 2048;
/** Maximum number of sampled lines whose tab counts are compared. */
const MAX_ANALYZED_LINES = 10;
/** Maximum number of sampled lines scanned when guessing the quote character. */
const MAX_QUOTE_LINES = 5;
/** Byte value of the line-feed character. */
const LF = 0x0a;

/**
 * TSV Format Detector
 *
 * Scores how likely a file is to be tab-separated by combining an extension
 * check with a structural analysis of the first bytes of the file.
 */
class TSVDetector {
    /**
     * @returns {string[]} A fresh array of the extensions this detector recognises.
     */
    getSupportedExtensions() {
        return [...TSV_EXTENSIONS];
    }

    /**
     * @returns {string} The format identifier, always `'tsv'`.
     */
    getFormatName() {
        return 'tsv';
    }

    /**
     * Inspect a file and report how confident we are that it is TSV.
     *
     * Never rejects: any failure (including I/O errors) is logged as a warning
     * and reported as a zero-confidence result carrying the error message.
     *
     * @param {string} filePath - Path of the file to inspect.
     * @returns {Promise<object>} Detection result with `format`, `confidence`
     *   and `metadata`; when tab separation is detected it also carries
     *   `encoding`, `estimatedRows`, `estimatedColumns` and `suggestedOptions`.
     */
    async detect(filePath) {
        try {
            // A matching extension contributes a fixed share of the confidence.
            const ext = path.extname(filePath).toLowerCase();
            const extensionScore = this.getSupportedExtensions().includes(ext) ? 0.4 : 0;

            // Read a sample of the file and look for tab-separated structure.
            const sample = await this.readSample(filePath, SAMPLE_BYTES);
            const analysis = await this.analyzeTSVStructure(sample);

            if (!analysis.isTabSeparated) {
                return {
                    format: 'tsv',
                    confidence: extensionScore,
                    metadata: { reason: 'No consistent tab separation detected' },
                };
            }

            // Detection is capped at 0.95 so it never claims full certainty.
            const confidence = Math.min(0.95, extensionScore + analysis.confidence);
            return {
                format: 'tsv',
                confidence,
                metadata: {
                    delimiter: '\t',
                    quote: analysis.quote,
                    hasHeader: analysis.hasHeader,
                    tabCount: analysis.avgTabsPerLine,
                    lineCount: analysis.lineCount,
                    columnCount: analysis.estimatedColumns,
                    estimatedColumns: analysis.estimatedColumns,
                },
                encoding: 'utf8',
                estimatedRows: analysis.lineCount,
                estimatedColumns: analysis.estimatedColumns,
                suggestedOptions: {
                    delimiter: '\t',
                    quote: analysis.quote,
                    hasHeader: analysis.hasHeader,
                    encoding: 'utf8',
                },
            };
        }
        catch (error) {
            // Anything can be thrown; do not assume an Error instance.
            const message = error?.message ?? String(error);
            logger_1.logger.warn(`TSV detection failed: ${message}`);
            return {
                format: 'tsv',
                confidence: 0,
                metadata: { error: message },
            };
        }
    }

    /**
     * Read up to `maxBytes` from the start of a file and decode them as UTF-8.
     *
     * When the read fills the whole buffer the sample most likely stops in the
     * middle of a line (and possibly in the middle of a multi-byte character).
     * In that case the sample is cut back to the last line feed so that a
     * partial final line cannot distort the per-line tab statistics.
     *
     * @param {string} filePath - Path of the file to sample.
     * @param {number} maxBytes - Maximum number of bytes to read.
     * @returns {Promise<string>} The decoded sample text.
     */
    async readSample(filePath, maxBytes) {
        const buffer = Buffer.alloc(maxBytes);
        const file = await fs_1.promises.open(filePath, 'r');
        try {
            const { bytesRead } = await file.read(buffer, 0, maxBytes, 0);
            let end = bytesRead;
            if (bytesRead > 0 && bytesRead === maxBytes) {
                const lastLineFeed = buffer.lastIndexOf(LF, bytesRead - 1);
                // Only trim when at least one complete line remains afterwards.
                if (lastLineFeed > 0) {
                    end = lastLineFeed + 1;
                }
            }
            return buffer.subarray(0, end).toString('utf8');
        }
        finally {
            await file.close();
        }
    }

    /**
     * Analyse sample text for a consistent tab-separated layout.
     *
     * Blank lines are ignored. Tab counts of the first few non-blank lines are
     * compared; the sample counts as tab-separated when the variance of those
     * counts is below 0.5 and there is at least one tab per line on average.
     *
     * @param {string} sample - Text taken from the start of the file.
     * @returns {Promise<object>} `{ isTabSeparated, confidence, quote, hasHeader,
     *   avgTabsPerLine, lineCount, estimatedColumns }`; `confidence` is clamped
     *   to the range [0, 1].
     */
    async analyzeTSVStructure(sample) {
        const lines = sample.split('\n').filter((line) => line.trim().length > 0);
        if (lines.length === 0) {
            return {
                isTabSeparated: false,
                confidence: 0,
                quote: '"',
                hasHeader: false,
                avgTabsPerLine: 0,
                lineCount: 0,
                estimatedColumns: 0,
            };
        }

        // Count tabs per line over the leading lines of the sample.
        const tabCounts = lines
            .slice(0, MAX_ANALYZED_LINES)
            .map((line) => (line.match(/\t/g) || []).length);
        const avgTabs = tabCounts.reduce((sum, count) => sum + count, 0) / tabCounts.length;
        const tabVariance = tabCounts.reduce((sum, count) => sum + Math.pow(count - avgTabs, 2), 0) / tabCounts.length;

        const isConsistent = tabVariance < 0.5; // Strict variance threshold
        const hasEnoughTabs = avgTabs >= 1; // At least one tab per line on average

        // Quotes are less common in TSV but possible.
        const quote = this.detectQuoteCharacter(lines);
        const hasHeader = this.detectHeaderRow(lines);

        let confidence = 0.3; // Base confidence for TSV detection
        if (isConsistent) confidence += 0.3;
        if (hasEnoughTabs) confidence += 0.2;
        if (avgTabs >= 2) confidence += 0.1; // Multiple columns
        if (tabVariance === 0) confidence += 0.1; // Perfect consistency
        // Penalties stack: high inconsistency is hit by both.
        if (tabVariance > 0.5) confidence -= 0.3;
        if (tabVariance > 1) confidence -= 0.4;
        // Stacked penalties can go negative and float sums can exceed 1.
        confidence = Math.max(0, Math.min(1, confidence));

        return {
            isTabSeparated: isConsistent && hasEnoughTabs,
            confidence,
            quote,
            hasHeader,
            avgTabsPerLine: avgTabs,
            lineCount: lines.length,
            estimatedColumns: Math.round(avgTabs) + 1, // Columns = tabs + 1
        };
    }

    /**
     * Guess the quote character from the first few lines.
     *
     * @param {string[]} lines - Non-blank sample lines.
     * @returns {string} `'"'` when double quotes are at least as frequent as
     *   single quotes (including when neither occurs), otherwise `"'"`.
     */
    detectQuoteCharacter(lines) {
        const text = lines.slice(0, MAX_QUOTE_LINES).join('');
        const doubleQuoteCount = text.split('"').length - 1;
        const singleQuoteCount = text.split("'").length - 1;
        return doubleQuoteCount >= singleQuoteCount ? '"' : "'";
    }

    /**
     * Heuristically decide whether the first line is a header row.
     *
     * The first line is split on tabs; it is treated as a header when more
     * than half of its cells are non-empty and non-numeric once surrounding
     * whitespace and quote characters are removed.
     *
     * @param {string[]} lines - Non-blank sample lines.
     * @returns {boolean} `true` when the first line looks like a header.
     */
    detectHeaderRow(lines) {
        if (lines.length === 0) {
            return false;
        }
        const firstLineCells = lines[0].split('\t');
        const nonNumericCells = firstLineCells.filter((cell) => {
            const trimmed = cell.trim().replace(/['"]/g, ''); // Remove quotes
            return trimmed !== '' && Number.isNaN(Number(trimmed));
        });
        return nonNumericCells.length > firstLineCells.length / 2;
    }
}
exports.TSVDetector = TSVDetector;

/**
 * TSV Parser Implementation
 * Extends the CSV parser adapter with tab-specific defaults.
 */
class TSVParser extends csv_parser_adapter_1.CSVParserAdapter {
    /**
     * @param {object} [options] - Parser options forwarded to the CSV adapter.
     *   `delimiter` is always overridden with a tab; `quote`, `hasHeader` and
     *   `encoding` default to `'"'`, `true` and `'utf8'`. A `null` value is
     *   treated like an omitted argument.
     */
    constructor(options = {}) {
        const userOptions = options ?? {};
        // NOTE(review): an empty-string `quote` or `encoding` falls back to the
        // default because of `||`; confirm whether disabling quoting should be
        // supported before changing this.
        super({
            ...userOptions,
            delimiter: '\t', // Always use tab for TSV
            quote: userOptions.quote || '"',
            hasHeader: userOptions.hasHeader ?? true,
            encoding: userOptions.encoding || 'utf8',
        });
    }

    /**
     * Override format detection to use the TSV detector.
     *
     * @param {string} filePath - Path of the file to inspect.
     * @returns {Promise<object>} The result of `TSVDetector#detect`.
     */
    async detect(filePath) {
        const detector = new TSVDetector();
        return detector.detect(filePath);
    }

    /**
     * Get supported file extensions.
     *
     * @returns {string[]} A fresh array of recognised extensions.
     */
    getSupportedExtensions() {
        return [...TSV_EXTENSIONS];
    }

    /**
     * Get format name.
     *
     * @returns {string} Always `'tsv'`.
     */
    getFormatName() {
        return 'tsv';
    }
}
exports.TSVParser = TSVParser;

/**
 * Factory function to create a TSV parser.
 *
 * @param {object} [options] - Options passed to the `TSVParser` constructor.
 * @returns {TSVParser} A new parser instance.
 */
function createTSVParser(options) {
    return new TSVParser(options);
}
//# sourceMappingURL=tsv-parser.js.map