UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

185 lines 6.18 kB
"use strict";
/**
 * Efficient encoding detection for CSV files
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.EncodingDetector = void 0;
const types_1 = require("./types");

/**
 * Build a BOM-less UTF-8 detection result with the given confidence.
 * Used both for positive UTF-8 hits and for "no match" results (confidence 0).
 *
 * @param {number} confidence - Confidence in the range 0..1.
 * @returns {{encoding: string, confidence: number, hasBOM: boolean, bomLength: number}}
 */
function utf8Result(confidence) {
    return {
        encoding: 'utf8',
        confidence,
        hasBOM: false,
        bomLength: 0,
    };
}

class EncodingDetector {
    /**
     * Detect encoding from a buffer sample.
     *
     * Strategy, in order: byte-order mark, UTF-8 validation plus byte
     * statistics, UTF-16 null-byte pattern, and finally a low-confidence
     * UTF-8 fallback.
     *
     * @param {Buffer} buffer - Leading bytes of the file (may end mid-character).
     * @returns {{encoding: string, confidence: number, hasBOM: boolean, bomLength: number}}
     */
    static detect(buffer) {
        // A BOM is authoritative: return immediately when one is found.
        const bomResult = this.detectBOM(buffer);
        if (bomResult.confidence === 1.0) {
            return bomResult;
        }

        // Statistical analysis for encoding detection
        const stats = this.analyzeBuffer(buffer);

        // The buffer is only a sample, so a multi-byte character cut off at
        // the very end must not disqualify otherwise valid UTF-8.
        if (this.isValidUTF8(buffer, true)) {
            const utf8Confidence = this.calculateUTF8Confidence(stats);
            if (utf8Confidence > 0.8) {
                return utf8Result(utf8Confidence);
            }
        }

        // Check for UTF-16
        const utf16Result = this.detectUTF16(buffer, stats);
        if (utf16Result.confidence > 0.7) {
            return utf16Result;
        }

        // Fallback to UTF-8 with lower confidence
        return utf8Result(0.5);
    }

    /**
     * Look for a byte-order mark at the start of the buffer.
     *
     * Patterns come from `ENCODING_PATTERNS` in `./types`; each entry is
     * expected to carry `encoding` and an optional `bom` byte sequence
     * (presumably a Buffer — it is compared with `Buffer#equals`).
     *
     * @param {Buffer} buffer - Bytes to inspect.
     * @returns {{encoding: string, confidence: number, hasBOM: boolean, bomLength: number}}
     *   Confidence is 1.0 when a BOM matched, otherwise 0.
     */
    static detectBOM(buffer) {
        for (const pattern of types_1.ENCODING_PATTERNS) {
            const bom = pattern.bom;
            if (!bom || buffer.length < bom.length) {
                continue;
            }
            // subarray() is the non-deprecated equivalent of Buffer#slice().
            if (buffer.subarray(0, bom.length).equals(bom)) {
                return {
                    // Node.js has no 'utf16be' Buffer encoding, so big-endian is
                    // reported as 'utf16le'.
                    // NOTE(review): big-endian content still has to be byte-swapped
                    // (e.g. Buffer#swap16) before decoding; that is not done here.
                    encoding: pattern.encoding === 'utf16be' ? 'utf16le' : pattern.encoding,
                    confidence: 1.0,
                    hasBOM: true,
                    bomLength: bom.length,
                };
            }
        }
        return utf8Result(0);
    }

    /**
     * Classify every byte of the buffer into exactly one bucket.
     *
     * - `nullBytes` (plus `evenNulls` / `oddNulls` by offset parity): 0x00
     * - `asciiChars`: printable ASCII 0x20..0x7E and the text whitespace
     *   bytes TAB, LF, CR
     * - `controlChars`: remaining C0 controls and DEL (0x7F)
     * - `nonAsciiChars`: 0x80..0xFF, of which `highBytes` counts 0xF0..0xFF
     *
     * @param {Buffer} buffer - Bytes to analyse.
     * @returns {{nullBytes: number, asciiChars: number, nonAsciiChars: number,
     *   controlChars: number, highBytes: number, evenNulls: number,
     *   oddNulls: number, total: number}}
     */
    static analyzeBuffer(buffer) {
        const stats = {
            nullBytes: 0,
            asciiChars: 0,
            nonAsciiChars: 0,
            controlChars: 0,
            highBytes: 0,
            evenNulls: 0,
            oddNulls: 0,
            total: buffer.length,
        };
        for (let i = 0; i < buffer.length; i++) {
            const byte = buffer[i];
            if (byte === 0) {
                stats.nullBytes++;
                if (i % 2 === 0) {
                    stats.evenNulls++;
                }
                else {
                    stats.oddNulls++;
                }
            }
            else if (byte === 0x09 ||
                byte === 0x0a ||
                byte === 0x0d ||
                (byte >= 0x20 && byte <= 0x7e)) {
                // Line breaks and tabs are ordinary text in CSV/TSV; leaving them
                // uncounted would deflate the ASCII ratio of short-row files.
                stats.asciiChars++;
            }
            else if (byte < 0x80) {
                // Remaining C0 control codes and DEL.
                stats.controlChars++;
            }
            else {
                stats.nonAsciiChars++;
                if (byte >= 0xf0) {
                    stats.highBytes++;
                }
            }
        }
        return stats;
    }

    /**
     * Check whether the buffer is a well-formed UTF-8 byte sequence.
     *
     * Rejects stray continuation bytes, overlong encodings (lead bytes
     * 0xC0/0xC1, E0 80..9F, F0 80..8F), UTF-16 surrogates (ED A0..BF) and
     * code points above U+10FFFF (F4 90.., lead bytes 0xF5..0xFF).
     *
     * @param {Buffer} buffer - Bytes to validate.
     * @param {boolean} [allowTruncatedTail=false] - When true, a multi-byte
     *   sequence that is cut off by the end of the buffer is accepted as long
     *   as the bytes that are present form a valid prefix. Useful when the
     *   buffer is a fixed-size sample of a larger file.
     * @returns {boolean} True when the buffer is valid UTF-8.
     */
    static isValidUTF8(buffer, allowTruncatedTail = false) {
        const length = buffer.length;
        let i = 0;
        while (i < length) {
            const lead = buffer[i];
            if (lead <= 0x7f) {
                // Single byte character
                i++;
                continue;
            }

            let size;
            // Allowed range of the first continuation byte; narrowed for the
            // lead bytes that would otherwise permit ill-formed sequences.
            let secondMin = 0x80;
            let secondMax = 0xbf;
            if (lead >= 0xc2 && lead <= 0xdf) {
                size = 2;
            }
            else if (lead >= 0xe0 && lead <= 0xef) {
                size = 3;
                if (lead === 0xe0) {
                    secondMin = 0xa0; // exclude overlong forms
                }
                else if (lead === 0xed) {
                    secondMax = 0x9f; // exclude surrogates U+D800..U+DFFF
                }
            }
            else if (lead >= 0xf0 && lead <= 0xf4) {
                size = 4;
                if (lead === 0xf0) {
                    secondMin = 0x90; // exclude overlong forms
                }
                else if (lead === 0xf4) {
                    secondMax = 0x8f; // exclude code points above U+10FFFF
                }
            }
            else {
                // Stray continuation byte, overlong lead (C0/C1) or F5..FF.
                return false;
            }

            const available = Math.min(size, length - i);
            for (let k = 1; k < available; k++) {
                const next = buffer[i + k];
                const min = k === 1 ? secondMin : 0x80;
                const max = k === 1 ? secondMax : 0xbf;
                if (next < min || next > max) {
                    return false;
                }
            }
            if (available < size) {
                // Sequence runs past the end of the buffer.
                return allowTruncatedTail;
            }
            i += size;
        }
        return true;
    }

    /**
     * Turn byte statistics into a UTF-8 confidence score.
     *
     * @param {object} stats - Result of `analyzeBuffer`.
     * @returns {number} 0 when null bytes are present, 0.3 when more than 10%
     *   of the bytes are control characters, otherwise 0.7..0.95 depending on
     *   the ASCII / non-ASCII mix.
     */
    static calculateUTF8Confidence(stats) {
        if (stats.nullBytes > 0) {
            return 0;
        }
        if (stats.total === 0) {
            // Nothing to judge; return the default score explicitly instead of
            // relying on NaN ratios failing every comparison below.
            return 0.7;
        }
        if (stats.controlChars > stats.total * 0.1) {
            return 0.3;
        }
        const asciiRatio = stats.asciiChars / stats.total;
        const nonAsciiRatio = stats.nonAsciiChars / stats.total;
        if (asciiRatio > 0.95) {
            return 0.95;
        }
        if (asciiRatio > 0.8 && nonAsciiRatio < 0.2) {
            return 0.9;
        }
        if (stats.highBytes === 0 && nonAsciiRatio < 0.3) {
            return 0.85;
        }
        return 0.7;
    }

    /**
     * Detect BOM-less UTF-16 from the distribution of null bytes.
     *
     * @param {Buffer} _buffer - Unused; kept for signature compatibility.
     * @param {object} stats - Result of `analyzeBuffer`.
     * @returns {{encoding: string, confidence: number, hasBOM: boolean, bomLength: number}}
     *   `utf16le` with confidence `min(0.9, 2 * nullRatio)` when at least 20%
     *   of the bytes are null and more than 80% of those share one offset
     *   parity; otherwise a confidence-0 result.
     */
    static detectUTF16(_buffer, stats) {
        if (stats.total === 0 || stats.nullBytes === 0) {
            return utf8Result(0);
        }

        // UTF-16 typically has many null bytes
        const nullRatio = stats.nullBytes / stats.total;
        if (nullRatio < 0.2) {
            return utf8Result(0);
        }

        // Check for alternating nulls (UTF-16 pattern). For ASCII-range text,
        // nulls at odd offsets indicate little-endian and nulls at even
        // offsets indicate big-endian.
        const evenNullRatio = stats.evenNulls / stats.nullBytes;
        const oddNullRatio = stats.oddNulls / stats.nullBytes;
        if (evenNullRatio > 0.8 || oddNullRatio > 0.8) {
            // Both byte orders are reported as 'utf16le' because Node.js has no
            // 'utf16be' Buffer encoding.
            // NOTE(review): big-endian content (even-offset nulls) still has to
            // be byte-swapped before decoding; that is not done here.
            return {
                encoding: 'utf16le',
                confidence: Math.min(0.9, nullRatio * 2),
                hasBOM: false,
                bomLength: 0,
            };
        }
        return utf8Result(0);
    }
}
exports.EncodingDetector = EncodingDetector;
//# sourceMappingURL=encoding-detector.js.map