datapilot-cli
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
"use strict";
/**
* JSON Parser Implementation
* Supports JSON arrays, objects, and JSONL (JSON Lines) format
*/
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.JSONParser = exports.JSONDetector = void 0;
exports.createJSONParser = createJSONParser;
const fs_1 = require("fs");
const path = __importStar(require("path"));
const data_parser_1 = require("./base/data-parser");
const types_1 = require("../core/types");
/**
* JSON Format Detector
*/
class JSONDetector {
getSupportedExtensions() {
return ['.json', '.jsonl', '.ndjson'];
}
getFormatName() {
return 'json';
}
async detect(filePath) {
try {
// Check extension first
const ext = path.extname(filePath).toLowerCase();
const extensionScore = this.getSupportedExtensions().includes(ext) ? 0.25 : 0;
// Read sample of file
const sample = await this.readSample(filePath, 2048);
// Try parsing as JSONL first (more specific)
const jsonlResult = await this.tryParseJSONL(sample);
if (jsonlResult.success) {
return {
format: 'json',
confidence: Math.min(0.9, extensionScore + 0.6),
metadata: jsonlResult.metadata,
estimatedRows: jsonlResult.metadata.estimatedRecords,
suggestedOptions: {
arrayMode: 'records',
flattenObjects: true,
},
};
}
// Try parsing as JSON
const jsonResult = await this.tryParseJSON(sample);
if (jsonResult.success) {
// Adjust confidence based on whether this is a partial parse
const baseConfidence = jsonResult.metadata.partialParse ? 0.65 : 0.75;
let confidence = Math.min(0.95, extensionScore + baseConfidence);
// For non-partial parses with extension, ensure high confidence
if (!jsonResult.metadata.partialParse && extensionScore > 0) {
confidence = 0.95;
}
return {
format: 'json',
confidence,
metadata: jsonResult.metadata,
estimatedRows: jsonResult.metadata.estimatedRecords,
suggestedOptions: {
arrayMode: jsonResult.metadata.type === 'array' ? 'records' : 'values',
flattenObjects: jsonResult.metadata.nestedLevels > 1,
},
};
}
// Neither valid JSON nor JSONL content was detected, so report zero confidence
return {
format: 'json',
confidence: 0,
metadata: { error: 'Not valid JSON or JSONL format' },
};
}
catch (error) {
return {
format: 'json',
confidence: 0,
metadata: { error: error.message },
};
}
}
async readSample(filePath, maxBytes) {
try {
// For JSON detection, try to read the entire file first if it's not too large
const stats = await fs_1.promises.stat(filePath);
// If file is small enough, read it entirely for better detection
if (stats.size <= maxBytes * 2) {
return await fs_1.promises.readFile(filePath, 'utf8');
}
// Otherwise read a sample from the beginning
const buffer = Buffer.alloc(maxBytes);
const file = await fs_1.promises.open(filePath, 'r');
try {
const { bytesRead } = await file.read(buffer, 0, maxBytes, 0);
return buffer.slice(0, bytesRead).toString('utf8');
}
finally {
await file.close();
}
}
catch (error) {
// If stat() fails, fall back to reading a fixed-size sample from the start of the file
const buffer = Buffer.alloc(maxBytes);
const file = await fs_1.promises.open(filePath, 'r');
try {
const { bytesRead } = await file.read(buffer, 0, maxBytes, 0);
return buffer.slice(0, bytesRead).toString('utf8');
}
finally {
await file.close();
}
}
}
async tryParseJSON(sample) {
try {
// First check if it looks like JSON by examining structure
const trimmed = sample.trim();
if (!trimmed.startsWith('{') && !trimmed.startsWith('[')) {
return { success: false, metadata: { type: 'object' } };
}
const parsed = JSON.parse(sample);
if (Array.isArray(parsed)) {
const elementTypes = parsed.slice(0, 10).map((item) => typeof item);
return {
success: true,
metadata: {
type: 'array',
estimatedRecords: parsed.length,
arrayElementTypes: [...new Set(elementTypes)],
nestedLevels: this.calculateNestingLevel(parsed[0] || {}),
},
};
}
else if (typeof parsed === 'object' && parsed !== null) {
return {
success: true,
metadata: {
type: 'object',
keys: Object.keys(parsed),
estimatedRecords: 1,
nestedLevels: this.calculateNestingLevel(parsed),
},
};
}
return { success: false, metadata: { type: 'object' } };
}
catch (error) {
// If parsing fails, check if it's malformed but looks like JSON
const trimmed = sample.trim();
if (trimmed.startsWith('{') || trimmed.startsWith('[')) {
// Check if it's incomplete vs completely malformed by looking at the end
const isObject = trimmed.startsWith('{');
const properClosing = isObject ? trimmed.endsWith('}') : trimmed.endsWith(']');
if (!properClosing && trimmed.length > 50) {
// For large files, look for JSON-like patterns even in samples
const hasJsonPatterns = trimmed.includes('"') && (trimmed.includes(':') || trimmed.includes(','));
if (hasJsonPatterns) {
// Likely incomplete JSON due to sampling - give it some confidence
return {
success: true,
metadata: {
type: trimmed.startsWith('[') ? 'array' : 'object',
estimatedRecords: 1,
nestedLevels: 1,
keys: [],
partialParse: true
}
};
}
}
}
return { success: false, metadata: { type: 'object' } };
}
}
async tryParseJSONL(sample) {
const lines = sample.split('\n').filter((line) => line.trim());
if (lines.length === 0) {
return { success: false, metadata: { type: 'jsonl' } };
}
// JSONL requires multiple lines - single line is likely regular JSON
if (lines.length === 1) {
return { success: false, metadata: { type: 'jsonl' } };
}
let validLines = 0;
let sampleKeys = [];
for (const line of lines.slice(0, 10)) {
try {
const parsed = JSON.parse(line.trim());
if (typeof parsed === 'object' && parsed !== null) {
validLines++;
if (sampleKeys.length === 0) {
sampleKeys = Object.keys(parsed);
}
}
}
catch {
// Ignore lines that are not valid JSON
}
}
const confidence = validLines / Math.min(lines.length, 10);
if (confidence > 0.7) {
return {
success: true,
metadata: {
type: 'jsonl',
keys: sampleKeys,
estimatedRecords: lines.length,
nestedLevels: 1,
},
};
}
return { success: false, metadata: { type: 'jsonl' } };
}
calculateNestingLevel(obj, level = 0) {
if (typeof obj !== 'object' || obj === null) {
return level;
}
let maxLevel = level;
for (const value of Object.values(obj)) {
if (typeof value === 'object' && value !== null) {
maxLevel = Math.max(maxLevel, this.calculateNestingLevel(value, level + 1));
}
}
return maxLevel;
}
}
exports.JSONDetector = JSONDetector;
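/**
* Usage sketch (illustrative only, not part of the module): how the detector
* might be invoked directly. The file path is hypothetical; detect() resolves
* with a confidence between 0 (not JSON/JSONL) and 0.95 plus suggested parser
* options.
*
*   const detector = new JSONDetector();
*   const result = await detector.detect('./sample.jsonl'); // hypothetical path
*   if (result.confidence > 0.7) {
*     console.log(result.estimatedRows, result.suggestedOptions);
*   }
*/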
/**
* JSON Parser Implementation
*/
class JSONParser extends data_parser_1.BaseParser {
headers = [];
getSupportedExtensions() {
return ['.json', '.jsonl', '.ndjson'];
}
getFormatName() {
return 'json';
}
async detect(filePath) {
const detector = new JSONDetector();
return detector.detect(filePath);
}
async *parse(filePath, options) {
const mergedOptions = { ...this.options, ...options };
try {
const content = await fs_1.promises.readFile(filePath, 'utf8');
this.updateStats(Buffer.byteLength(content, 'utf8'), 0);
// Determine JSON type
const isJSONL = await this.isJSONLFormat(content);
if (isJSONL) {
yield* this.parseJSONL(content, mergedOptions);
}
else {
yield* this.parseJSON(content, mergedOptions);
}
}
catch (error) {
throw new types_1.DataPilotError(`JSON parsing failed: ${error.message}`, 'JSON_PARSING_ERROR', types_1.ErrorSeverity.HIGH, types_1.ErrorCategory.PARSING);
}
}
async isJSONLFormat(content) {
const lines = content.split('\n').slice(0, 5);
let validJSONLines = 0;
for (const line of lines) {
if (line.trim()) {
try {
JSON.parse(line.trim());
validJSONLines++;
}
catch {
// Not a valid JSON line; skip it
}
}
}
return validJSONLines > 0 && validJSONLines === lines.filter((l) => l.trim()).length;
}
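/**
* Illustration of the JSONL heuristic above (inputs are assumed, not from the
* source): the first few lines must each parse as standalone JSON for the
* content to be treated as JSON Lines.
*
*   '{"a":1}\n{"a":2}\n'   -> true  (every non-empty line is valid JSON)
*   '[{"a":1},\n{"a":2}]'  -> false (each line is a fragment of one document)
*/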
async *parseJSON(content, options) {
try {
const parsed = JSON.parse(content);
if (Array.isArray(parsed)) {
yield* this.parseJSONArray(parsed, options);
}
else if (typeof parsed === 'object' && parsed !== null) {
yield* this.parseJSONObject(parsed, options);
}
else {
throw new Error('JSON must be an object or array');
}
}
catch (error) {
this.addError(0, `Invalid JSON: ${error.message}`, 'INVALID_JSON');
throw error;
}
}
async *parseJSONArray(array, options) {
const maxRows = options.maxRows || array.length;
const flattenObjects = options.flattenObjects ?? true;
// Extract headers from first element
if (array.length > 0) {
const firstItem = array[0];
if (typeof firstItem === 'object' && firstItem !== null) {
this.headers = this.extractHeaders(firstItem, flattenObjects);
}
}
for (let i = 0; i < Math.min(array.length, maxRows); i++) {
if (this.aborted)
break;
const item = array[i];
const rowData = this.convertToRowData(item, flattenObjects);
yield {
index: i,
data: rowData,
raw: JSON.stringify(item),
metadata: { originalType: typeof item },
};
this.updateStats(0, 1);
}
}
async *parseJSONL(content, options) {
const lines = content.split('\n').filter((line) => line.trim());
const maxRows = options.maxRows || lines.length;
const flattenObjects = options.flattenObjects ?? true;
// Extract headers from first line
if (lines.length > 0) {
try {
const firstItem = JSON.parse(lines[0]);
if (typeof firstItem === 'object' && firstItem !== null) {
this.headers = this.extractHeaders(firstItem, flattenObjects);
}
}
catch (error) {
this.addError(0, `Invalid JSON in first line: ${error.message}`, 'INVALID_JSONL');
}
}
for (let i = 0; i < Math.min(lines.length, maxRows); i++) {
if (this.aborted)
break;
const line = lines[i].trim();
if (!line)
continue;
try {
const item = JSON.parse(line);
const rowData = this.convertToRowData(item, flattenObjects);
yield {
index: i,
data: rowData,
raw: line,
metadata: { originalType: typeof item },
};
this.updateStats(0, 1);
}
catch (error) {
this.addError(i, `Invalid JSON line: ${error.message}`, 'INVALID_JSONL');
// Continue with next line
}
}
}
async *parseJSONObject(obj, options) {
const flattenObjects = options.flattenObjects ?? true;
this.headers = this.extractHeaders(obj, flattenObjects);
const rowData = this.convertToRowData(obj, flattenObjects);
yield {
index: 0,
data: rowData,
raw: JSON.stringify(obj),
metadata: { originalType: 'object' },
};
this.updateStats(0, 1);
}
extractHeaders(obj, flatten) {
if (!flatten || typeof obj !== 'object' || obj === null) {
return Object.keys(obj || {});
}
const flattened = this.flattenObject(obj);
return Object.keys(flattened);
}
convertToRowData(item, flatten) {
if (typeof item !== 'object' || item === null) {
return [String(item)];
}
const data = flatten ? this.flattenObject(item) : item;
// Ensure consistent column order using headers
if (this.headers.length > 0) {
return this.headers.map((header) => {
const value = data[header];
return value !== undefined ? String(value) : '';
});
}
return Object.values(data).map((v) => String(v));
}
flattenObject(obj, prefix = '', separator = '.') {
const flattened = {};
for (const [key, value] of Object.entries(obj)) {
const newKey = prefix ? `${prefix}${separator}${key}` : key;
if (value !== null && typeof value === 'object' && !Array.isArray(value)) {
// Recursively flatten nested objects
Object.assign(flattened, this.flattenObject(value, newKey, separator));
}
else if (Array.isArray(value)) {
// Convert arrays to string representation
flattened[newKey] = JSON.stringify(value);
}
else {
flattened[newKey] = value;
}
}
return flattened;
}
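/**
* Flattening sketch (illustrative input, assumed for this example): nested
* objects become dot-separated keys and arrays are serialised to JSON strings.
*
*   flattenObject({ user: { name: 'Ada', tags: ['x', 'y'] }, id: 7 })
*   // => { 'user.name': 'Ada', 'user.tags': '["x","y"]', id: 7 }
*/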
/**
* Get detected headers for column mapping
*/
getHeaders() {
return [...this.headers];
}
}
exports.JSONParser = JSONParser;
/**
* Factory function to create a JSON parser
*/
function createJSONParser(options) {
return new JSONParser(options);
}
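/**
* End-to-end usage sketch (path and option values are assumed, not taken from
* the source): parse() is an async generator, so rows can be streamed with
* for-await inside an async context.
*
*   const parser = createJSONParser({ maxRows: 100, flattenObjects: true });
*   for await (const row of parser.parse('./data.json')) {
*     console.log(row.index, row.data); // data is an array of string values
*   }
*   console.log(parser.getHeaders()); // flattened column names
*/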
//# sourceMappingURL=json-parser.js.map