datapilot-cli
Version:
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
229 lines • 8.71 kB
JavaScript
;
/**
* TSV Parser Implementation
* Tab-separated values parser built on CSV parser foundation
*/
// Compiler-emitted module-interop helper.
// Exposes property `k` of module object `m` on target object `o` under the name `k2`
// (`k2` defaults to `k`). If `this.__createBinding` already exists it is reused
// instead of defining a new helper.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Use a forwarding getter when `m` has no own descriptor for `k`, when the descriptor is an
// accessor but `m` is not flagged `__esModule`, or when it is a writable/configurable data property.
// Otherwise the existing descriptor is re-used unchanged.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback chosen when `Object.create` is unavailable: copy the current value by plain assignment.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// Compiler-emitted module-interop helper.
// Sets `o.default` to `v`: as an enumerable property defined via `Object.defineProperty`
// when `Object.create` exists, otherwise by plain assignment.
// An existing `this.__setModuleDefault` is reused when present.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
// Compiler-emitted module-interop helper backing `import * as x` style imports.
// Returns `mod` untouched when it is flagged `__esModule`; otherwise builds a new object that
// re-exposes every own property of `mod` except "default" (via __createBinding) and then sets
// `default` to `mod` itself (via __setModuleDefault).
// An existing `this.__importStar` is reused when present.
var __importStar = (this && this.__importStar) || (function () {
// `ownKeys` replaces itself on first call with `Object.getOwnPropertyNames`, or with a
// `for...in` + hasOwnProperty fallback when that function is missing.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
// `mod != null` guards both null and undefined; "default" is skipped here because it is set below.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.TSVParser = exports.TSVDetector = void 0;
exports.createTSVParser = createTSVParser;
const csv_parser_adapter_1 = require("./adapters/csv-parser-adapter");
const logger_1 = require("../utils/logger");
const fs_1 = require("fs");
const path = __importStar(require("path"));
/**
 * TSV Format Detector
 *
 * Estimates whether a file is tab-separated by combining a file-extension
 * check with a structural analysis of a small sample read from the start of
 * the file.
 */
class TSVDetector {
    /**
     * @returns {string[]} File extensions (lower-case, with leading dot) treated as TSV.
     */
    getSupportedExtensions() {
        return ['.tsv', '.tab'];
    }
    /**
     * @returns {string} Format identifier used in detection results.
     */
    getFormatName() {
        return 'tsv';
    }
    /**
     * Detect whether `filePath` looks like a TSV file.
     *
     * Never rejects: any error while reading or analysing the file is logged
     * as a warning and reported as a zero-confidence result.
     *
     * @param {string} filePath - Path of the file to inspect.
     * @returns {Promise<object>} Detection result with `format`, `confidence`
     *   and `metadata`; when a tab-separated structure is found it also
     *   carries `encoding`, `estimatedRows`, `estimatedColumns` and
     *   `suggestedOptions`. Row/line counts describe the sample only.
     */
    async detect(filePath) {
        try {
            // A matching extension contributes a fixed share of the confidence.
            const ext = path.extname(filePath).toLowerCase();
            const extensionScore = this.getSupportedExtensions().includes(ext) ? 0.4 : 0;
            // Read sample of file to detect tab-separated structure
            const sample = await this.readSample(filePath, 2048);
            const analysis = await this.analyzeTSVStructure(sample);
            if (analysis.isTabSeparated) {
                // Cap the combined score so detection is never reported as certain.
                const confidence = Math.min(0.95, extensionScore + analysis.confidence);
                return {
                    format: 'tsv',
                    confidence,
                    metadata: {
                        delimiter: '\t',
                        quote: analysis.quote,
                        hasHeader: analysis.hasHeader,
                        tabCount: analysis.avgTabsPerLine,
                        lineCount: analysis.lineCount,
                        columnCount: analysis.estimatedColumns,
                        estimatedColumns: analysis.estimatedColumns,
                    },
                    encoding: 'utf8',
                    estimatedRows: analysis.lineCount,
                    estimatedColumns: analysis.estimatedColumns,
                    suggestedOptions: {
                        delimiter: '\t',
                        quote: analysis.quote,
                        hasHeader: analysis.hasHeader,
                        encoding: 'utf8',
                    },
                };
            }
            return {
                format: 'tsv',
                confidence: extensionScore,
                metadata: { reason: 'No consistent tab separation detected' },
            };
        }
        catch (error) {
            logger_1.logger.warn(`TSV detection failed: ${error.message}`);
            return {
                format: 'tsv',
                confidence: 0,
                metadata: { error: error.message },
            };
        }
    }
    /**
     * Read up to `maxBytes` bytes from the start of a file as UTF-8 text.
     *
     * When the file continues beyond the bytes that were read, the trailing
     * partial line is dropped so the sample contains whole lines only. A cut
     * line carries an arbitrary number of tabs and would otherwise distort
     * the per-line tab statistics (wide-row files were rejected because of
     * it). If the truncated sample contains no newline at all it is returned
     * as read.
     *
     * @param {string} filePath - Path of the file to sample.
     * @param {number} maxBytes - Maximum number of bytes to read.
     * @returns {Promise<string>} The decoded sample.
     * @throws Propagates any error from opening, reading or stat-ing the file.
     */
    async readSample(filePath, maxBytes) {
        const buffer = Buffer.alloc(maxBytes);
        const file = await fs_1.promises.open(filePath, 'r');
        try {
            const { bytesRead } = await file.read(buffer, 0, maxBytes, 0);
            const text = buffer.subarray(0, bytesRead).toString('utf8');
            const { size } = await file.stat();
            if (size <= bytesRead) {
                // Whole file was read; the last line is complete even without a newline.
                return text;
            }
            const lastNewline = text.lastIndexOf('\n');
            return lastNewline === -1 ? text : text.slice(0, lastNewline + 1);
        }
        finally {
            await file.close();
        }
    }
    /**
     * Analyse sample text for a consistent tab-separated layout.
     *
     * Blank lines are ignored. Tab statistics use at most the first 10
     * non-empty lines; `lineCount` counts every non-empty line in the sample.
     *
     * @param {string} sample - Text sampled from the start of a file.
     * @returns {Promise<object>} `{ isTabSeparated, confidence, quote,
     *   hasHeader, avgTabsPerLine, lineCount, estimatedColumns }`.
     */
    async analyzeTSVStructure(sample) {
        const lines = sample.split('\n').filter((line) => line.trim().length > 0);
        if (lines.length === 0) {
            return {
                isTabSeparated: false,
                confidence: 0,
                quote: '"',
                hasHeader: false,
                avgTabsPerLine: 0,
                lineCount: 0,
                estimatedColumns: 0,
            };
        }
        // Count tabs per line
        const tabCounts = lines.slice(0, 10).map((line) => (line.match(/\t/g) || []).length);
        const avgTabs = tabCounts.reduce((sum, count) => sum + count, 0) / tabCounts.length;
        const tabVariance = tabCounts.reduce((sum, count) => sum + Math.pow(count - avgTabs, 2), 0) / tabCounts.length;
        // Check for consistent tab usage - stricter threshold for consistency
        const isConsistent = tabVariance < 0.5;
        const hasEnoughTabs = avgTabs >= 1; // At least one tab per line on average
        // Detect quote character (less common in TSV but possible)
        const quote = this.detectQuoteCharacter(lines);
        // Detect header row
        const hasHeader = this.detectHeaderRow(lines);
        // Calculate confidence
        let confidence = 0.3; // Base confidence for TSV detection
        if (isConsistent) {
            confidence += 0.3;
        }
        if (hasEnoughTabs) {
            confidence += 0.2;
        }
        if (avgTabs >= 2) {
            confidence += 0.1; // Multiple columns
        }
        if (tabVariance === 0) {
            confidence += 0.1; // Perfect consistency
        }
        // Penalties for inconsistency; both apply when the variance exceeds 1.
        if (tabVariance > 0.5) {
            confidence -= 0.3;
        }
        if (tabVariance > 1) {
            confidence -= 0.4;
        }
        return {
            isTabSeparated: isConsistent && hasEnoughTabs,
            confidence,
            quote,
            hasHeader,
            avgTabsPerLine: avgTabs,
            lineCount: lines.length,
            estimatedColumns: Math.round(avgTabs) + 1, // Columns = tabs + 1
        };
    }
    /**
     * Pick the quote character that occurs most often in the first 5 lines.
     *
     * @param {string[]} lines - Non-empty sample lines.
     * @returns {string} `'` when single quotes outnumber double quotes,
     *   otherwise `"` (also the result on a tie or when neither occurs).
     */
    detectQuoteCharacter(lines) {
        const joined = lines.slice(0, 5).join('');
        // Count occurrences of different quote characters
        const doubleQuoteCount = joined.split('"').length - 1;
        const singleQuoteCount = joined.split("'").length - 1;
        return doubleQuoteCount >= singleQuoteCount ? '"' : "'";
    }
    /**
     * Guess whether the first line is a header row.
     *
     * Heuristic: the first line counts as a header when more than half of its
     * tab-separated cells are non-empty and non-numeric after trimming and
     * removing quote characters.
     *
     * @param {string[]} lines - Non-empty sample lines.
     * @returns {boolean} True when the first line looks like a header.
     */
    detectHeaderRow(lines) {
        if (lines.length === 0) {
            return false;
        }
        const firstLineCells = lines[0].split('\t');
        const nonNumericCells = firstLineCells.filter((cell) => {
            const trimmed = cell.trim().replace(/['"]/g, ''); // Remove quotes
            return trimmed !== '' && Number.isNaN(Number(trimmed));
        });
        return nonNumericCells.length > firstLineCells.length / 2;
    }
}
exports.TSVDetector = TSVDetector;
/**
 * TSV Parser
 *
 * A CSVParserAdapter preconfigured for tab-separated input: the delimiter is
 * always a tab, and quote, header and encoding settings fall back to
 * TSV-friendly defaults when the caller leaves them out.
 */
class TSVParser extends csv_parser_adapter_1.CSVParserAdapter {
    /**
     * @param {object} [options] - Parser options. `delimiter` is ignored and
     *   replaced with a tab; a falsy `quote` or `encoding` falls back to `"`
     *   and `utf8`; a null/undefined `hasHeader` falls back to `true`.
     */
    constructor(options = {}) {
        // Start from the caller's options, then overlay the TSV-specific values.
        const merged = { ...options };
        merged.delimiter = '\t';
        merged.quote = options.quote || '"';
        merged.hasHeader = options.hasHeader ?? true;
        merged.encoding = options.encoding || 'utf8';
        super(merged);
    }
    /**
     * Run format detection with a fresh TSVDetector.
     *
     * @param {string} filePath - Path of the file to inspect.
     * @returns {Promise<object>} The detector's result.
     */
    async detect(filePath) {
        return new TSVDetector().detect(filePath);
    }
    /**
     * @returns {string[]} File extensions handled by this parser.
     */
    getSupportedExtensions() {
        const extensions = ['.tsv', '.tab'];
        return extensions;
    }
    /**
     * @returns {string} Name of the format handled by this parser.
     */
    getFormatName() {
        const formatName = 'tsv';
        return formatName;
    }
}
exports.TSVParser = TSVParser;
/**
 * Build a TSVParser instance.
 *
 * @param {object} [options] - Passed straight to the TSVParser constructor.
 * @returns {TSVParser} The newly constructed parser.
 */
function createTSVParser(options) {
    const parser = new TSVParser(options);
    return parser;
}
//# sourceMappingURL=tsv-parser.js.map