UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

454 lines 17.6 kB
"use strict"; /** * JSON Parser Implementation * Supports JSON arrays, objects, and JSONL (JSON Lines) format */ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); Object.defineProperty(exports, "__esModule", { value: true }); exports.JSONParser = exports.JSONDetector = void 0; exports.createJSONParser = createJSONParser; const fs_1 = require("fs"); const path = __importStar(require("path")); const data_parser_1 = require("./base/data-parser"); const types_1 = require("../core/types"); /** * JSON Format Detector */ class JSONDetector { getSupportedExtensions() { return ['.json', '.jsonl', '.ndjson']; } getFormatName() { return 'json'; } async detect(filePath) { try { // Check extension first const ext = path.extname(filePath).toLowerCase(); const extensionScore = this.getSupportedExtensions().includes(ext) ? 0.25 : 0; // Read sample of file const sample = await this.readSample(filePath, 2048); // Try parsing as JSONL first (more specific) const jsonlResult = await this.tryParseJSONL(sample); if (jsonlResult.success) { return { format: 'json', confidence: Math.min(0.9, extensionScore + 0.6), metadata: jsonlResult.metadata, estimatedRows: jsonlResult.metadata.estimatedRecords, suggestedOptions: { arrayMode: 'records', flattenObjects: true, }, }; } // Try parsing as JSON const jsonResult = await this.tryParseJSON(sample); if (jsonResult.success) { // Adjust confidence based on whether this is a partial parse const baseConfidence = jsonResult.metadata.partialParse ? 0.65 : 0.75; let confidence = Math.min(0.95, extensionScore + baseConfidence); // For non-partial parses with extension, ensure high confidence if (!jsonResult.metadata.partialParse && extensionScore > 0) { confidence = 0.95; } return { format: 'json', confidence, metadata: jsonResult.metadata, estimatedRows: jsonResult.metadata.estimatedRecords, suggestedOptions: { arrayMode: jsonResult.metadata.type === 'array' ? 'records' : 'values', flattenObjects: jsonResult.metadata.nestedLevels > 1, }, }; } // If JSON parsing fails and no JSONL content detected, return low confidence return { format: 'json', confidence: 0, metadata: { error: 'Not valid JSON or JSONL format' }, }; } catch (error) { return { format: 'json', confidence: 0, metadata: { error: error.message }, }; } } async readSample(filePath, maxBytes) { try { // For JSON detection, try to read the entire file first if it's not too large const stats = await fs_1.promises.stat(filePath); // If file is small enough, read it entirely for better detection if (stats.size <= maxBytes * 2) { return await fs_1.promises.readFile(filePath, 'utf8'); } // Otherwise read a sample from the beginning const buffer = Buffer.alloc(maxBytes); const file = await fs_1.promises.open(filePath, 'r'); try { const { bytesRead } = await file.read(buffer, 0, maxBytes, 0); return buffer.slice(0, bytesRead).toString('utf8'); } finally { await file.close(); } } catch (error) { // Fallback to buffer reading if file operations fail const buffer = Buffer.alloc(maxBytes); const file = await fs_1.promises.open(filePath, 'r'); try { const { bytesRead } = await file.read(buffer, 0, maxBytes, 0); return buffer.slice(0, bytesRead).toString('utf8'); } finally { await file.close(); } } } async tryParseJSON(sample) { try { // First check if it looks like JSON by examining structure const trimmed = sample.trim(); if (!trimmed.startsWith('{') && !trimmed.startsWith('[')) { return { success: false, metadata: { type: 'object' } }; } const parsed = JSON.parse(sample); if (Array.isArray(parsed)) { const elementTypes = parsed.slice(0, 10).map((item) => typeof item); return { success: true, metadata: { type: 'array', estimatedRecords: parsed.length, arrayElementTypes: [...new Set(elementTypes)], nestedLevels: this.calculateNestingLevel(parsed[0] || {}), }, }; } else if (typeof parsed === 'object' && parsed !== null) { return { success: true, metadata: { type: 'object', keys: Object.keys(parsed), estimatedRecords: 1, nestedLevels: this.calculateNestingLevel(parsed), }, }; } return { success: false, metadata: { type: 'object' } }; } catch (error) { // If parsing fails, check if it's malformed but looks like JSON const trimmed = sample.trim(); if (trimmed.startsWith('{') || trimmed.startsWith('[')) { // Check if it's incomplete vs completely malformed by looking at the end const isObject = trimmed.startsWith('{'); const properClosing = isObject ? trimmed.endsWith('}') : trimmed.endsWith(']'); if (!properClosing && trimmed.length > 50) { // For large files, look for JSON-like patterns even in samples const hasJsonPatterns = trimmed.includes('"') && (trimmed.includes(':') || trimmed.includes(',')); if (hasJsonPatterns) { // Likely incomplete JSON due to sampling - give it some confidence return { success: true, metadata: { type: trimmed.startsWith('[') ? 'array' : 'object', estimatedRecords: 1, nestedLevels: 1, keys: [], partialParse: true } }; } } } return { success: false, metadata: { type: 'object' } }; } } async tryParseJSONL(sample) { const lines = sample.split('\n').filter((line) => line.trim()); if (lines.length === 0) { return { success: false, metadata: { type: 'jsonl' } }; } // JSONL requires multiple lines - single line is likely regular JSON if (lines.length === 1) { return { success: false, metadata: { type: 'jsonl' } }; } let validLines = 0; let sampleKeys = []; for (const line of lines.slice(0, 10)) { try { const parsed = JSON.parse(line.trim()); if (typeof parsed === 'object' && parsed !== null) { validLines++; if (sampleKeys.length === 0) { sampleKeys = Object.keys(parsed); } } } catch { // Invalid JSON line } } const confidence = validLines / Math.min(lines.length, 10); if (confidence > 0.7) { return { success: true, metadata: { type: 'jsonl', keys: sampleKeys, estimatedRecords: lines.length, nestedLevels: 1, }, }; } return { success: false, metadata: { type: 'jsonl' } }; } calculateNestingLevel(obj, level = 0) { if (typeof obj !== 'object' || obj === null) { return level; } let maxLevel = level; for (const value of Object.values(obj)) { if (typeof value === 'object' && value !== null) { maxLevel = Math.max(maxLevel, this.calculateNestingLevel(value, level + 1)); } } return maxLevel; } } exports.JSONDetector = JSONDetector; /** * JSON Parser Implementation */ class JSONParser extends data_parser_1.BaseParser { headers = []; getSupportedExtensions() { return ['.json', '.jsonl', '.ndjson']; } getFormatName() { return 'json'; } async detect(filePath) { const detector = new JSONDetector(); return detector.detect(filePath); } async *parse(filePath, options) { const mergedOptions = { ...this.options, ...options }; try { const content = await fs_1.promises.readFile(filePath, 'utf8'); this.updateStats(Buffer.byteLength(content, 'utf8'), 0); // Determine JSON type const isJSONL = await this.isJSONLFormat(content); if (isJSONL) { yield* this.parseJSONL(content, mergedOptions); } else { yield* this.parseJSON(content, mergedOptions); } } catch (error) { throw new types_1.DataPilotError(`JSON parsing failed: ${error.message}`, 'JSON_PARSING_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.PARSING); } } async isJSONLFormat(content) { const lines = content.split('\n').slice(0, 5); let validJSONLines = 0; for (const line of lines) { if (line.trim()) { try { JSON.parse(line.trim()); validJSONLines++; } catch { // Not valid JSON line } } } return validJSONLines > 0 && validJSONLines === lines.filter((l) => l.trim()).length; } async *parseJSON(content, options) { try { const parsed = JSON.parse(content); if (Array.isArray(parsed)) { yield* this.parseJSONArray(parsed, options); } else if (typeof parsed === 'object' && parsed !== null) { yield* this.parseJSONObject(parsed, options); } else { throw new Error('JSON must be an object or array'); } } catch (error) { this.addError(0, `Invalid JSON: ${error.message}`, 'INVALID_JSON'); throw error; } } async *parseJSONArray(array, options) { const maxRows = options.maxRows || array.length; const flattenObjects = options.flattenObjects ?? true; // Extract headers from first element if (array.length > 0) { const firstItem = array[0]; if (typeof firstItem === 'object' && firstItem !== null) { this.headers = this.extractHeaders(firstItem, flattenObjects); } } for (let i = 0; i < Math.min(array.length, maxRows); i++) { if (this.aborted) break; const item = array[i]; const rowData = this.convertToRowData(item, flattenObjects); yield { index: i, data: rowData, raw: JSON.stringify(item), metadata: { originalType: typeof item }, }; this.updateStats(0, 1); } } async *parseJSONL(content, options) { const lines = content.split('\n').filter((line) => line.trim()); const maxRows = options.maxRows || lines.length; const flattenObjects = options.flattenObjects ?? true; // Extract headers from first line if (lines.length > 0) { try { const firstItem = JSON.parse(lines[0]); if (typeof firstItem === 'object' && firstItem !== null) { this.headers = this.extractHeaders(firstItem, flattenObjects); } } catch (error) { this.addError(0, `Invalid JSON in first line: ${error.message}`, 'INVALID_JSONL'); } } for (let i = 0; i < Math.min(lines.length, maxRows); i++) { if (this.aborted) break; const line = lines[i].trim(); if (!line) continue; try { const item = JSON.parse(line); const rowData = this.convertToRowData(item, flattenObjects); yield { index: i, data: rowData, raw: line, metadata: { originalType: typeof item }, }; this.updateStats(0, 1); } catch (error) { this.addError(i, `Invalid JSON line: ${error.message}`, 'INVALID_JSONL'); // Continue with next line } } } async *parseJSONObject(obj, options) { const flattenObjects = options.flattenObjects ?? true; this.headers = this.extractHeaders(obj, flattenObjects); const rowData = this.convertToRowData(obj, flattenObjects); yield { index: 0, data: rowData, raw: JSON.stringify(obj), metadata: { originalType: 'object' }, }; this.updateStats(0, 1); } extractHeaders(obj, flatten) { if (!flatten || typeof obj !== 'object' || obj === null) { return Object.keys(obj || {}); } const flattened = this.flattenObject(obj); return Object.keys(flattened); } convertToRowData(item, flatten) { if (typeof item !== 'object' || item === null) { return [String(item)]; } const data = flatten ? this.flattenObject(item) : item; // Ensure consistent column order using headers if (this.headers.length > 0) { return this.headers.map((header) => { const value = data[header]; return value !== undefined ? String(value) : ''; }); } return Object.values(data).map((v) => String(v)); } flattenObject(obj, prefix = '', separator = '.') { const flattened = {}; for (const [key, value] of Object.entries(obj)) { const newKey = prefix ? `${prefix}${separator}${key}` : key; if (value !== null && typeof value === 'object' && !Array.isArray(value)) { // Recursively flatten nested objects Object.assign(flattened, this.flattenObject(value, newKey, separator)); } else if (Array.isArray(value)) { // Convert arrays to string representation flattened[newKey] = JSON.stringify(value); } else { flattened[newKey] = value; } } return flattened; } /** * Get detected headers for column mapping */ getHeaders() { return [...this.headers]; } } exports.JSONParser = JSONParser; /** * Factory function to create JSON parser */ function createJSONParser(options) { return new JSONParser(options); } //# sourceMappingURL=json-parser.js.map