UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

457 lines 14.5 kB
/**
 * Section 5 type declarations: Data Engineering & Structural Insights.
 *
 * Covers schema optimization, transformation pipelines, scalability,
 * governance, and ML readiness assessment.
 *
 * NOTE(review): the trailing sourceMappingURL points at `types.d.ts.map`, so
 * this looks like compiler-emitted output — presumably lasting changes belong
 * in the originating source file; confirm before editing here.
 */

/** Result envelope: the engineering analysis plus warnings, run metrics, and metadata. */
export interface Section5Result {
  engineeringAnalysis: DataEngineeringAnalysis;
  warnings: Section5Warning[];
  performanceMetrics: Section5PerformanceMetrics;
  metadata: Section5Metadata;
}

/** A single warning, tagged with a category and a four-level severity. */
export interface Section5Warning {
  category:
    | 'schema'
    | 'transformation'
    | 'scalability'
    | 'performance'
    | 'security';
  severity: 'low' | 'medium' | 'high' | 'critical';
  message: string;
  impact: string;
  suggestion: string;
}

/** Numeric counters describing one analysis run. */
export interface Section5PerformanceMetrics {
  /** Elapsed analysis time; the name indicates milliseconds. */
  analysisTimeMs: number;
  transformationsEvaluated: number;
  schemaRecommendationsGenerated: number;
  mlFeaturesDesigned: number;
}

/** Descriptive metadata attached to a Section 5 result. */
export interface Section5Metadata {
  analysisApproach: string;
  /** NOTE(review): unit (rows vs. bytes) is not declared here — confirm with the producer. */
  sourceDatasetSize: number;
  engineeredFeatureCount: number;
  /** NOTE(review): score range is not declared here — confirm with the producer. */
  mlReadinessScore: number;
}

/** Aggregates the seven sub-analyses that make up the engineering analysis. */
export interface DataEngineeringAnalysis {
  schemaAnalysis: SchemaAnalysis;
  structuralIntegrity: StructuralIntegrityAnalysis;
  transformationPipeline: TransformationPipelineRecommendations;
  scalabilityAssessment: ScalabilityAssessment;
  dataGovernance: DataGovernanceConsiderations;
  mlReadiness: MLReadinessAssessment;
  knowledgeBaseOutput: KnowledgeBaseOutput;
}

/** Pairs the current schema profile with an optimized recommendation and related advice. */
export interface SchemaAnalysis {
  currentSchema: CurrentSchemaProfile;
  optimizedSchema: OptimizedSchemaRecommendation;
  dataTypeConversions: DataTypeConversion[];
  characterEncodingRecommendations: EncodingRecommendations;
  normalizationInsights: NormalizationInsights;
}

/** Profile of the schema as found: columns plus estimated size figures and encoding. */
export interface CurrentSchemaProfile {
  columns: SchemaColumn[];
  estimatedRowCount: number;
  estimatedSizeBytes: number;
  detectedEncoding: string;
}

/** Description of one column of the current schema. */
export interface SchemaColumn {
  originalName: string;
  detectedType: string;
  inferredSemanticType: string;
  /** NOTE(review): percentage scale (0-1 vs. 0-100) is not declared here — confirm. */
  nullabilityPercentage: number;
  /** NOTE(review): percentage scale (0-1 vs. 0-100) is not declared here — confirm. */
  uniquenessPercentage: number;
  sampleValues: string[];
}

/** Recommended target schema: DDL text, columns, indexes, and constraints. */
export interface OptimizedSchemaRecommendation {
  targetSystem: string;
  ddlStatement: string;
  columns: OptimizedColumn[];
  indexes: IndexRecommendation[];
  /**
   * NOTE(review): typed as `any[]`, so the element shape is unchecked.
   * Narrowing it would affect consumers; left unchanged here.
   */
  constraints: any[];
}

/** One column of the optimized schema, mapping an original name to an optimized one. */
export interface OptimizedColumn {
  originalName: string;
  optimizedName: string;
  recommendedType: string;
  constraints: string[];
  reasoning: string;
}

/** A proposed type change for a column, with a three-level risk rating. */
export interface DataTypeConversion {
  columnName: string;
  currentType: string;
  recommendedType: string;
  conversionLogic: string;
  reasoning: string;
  riskLevel: 'low' | 'medium' | 'high';
  exampleTransformation: string;
}

/** Detected versus recommended encoding, with collation advice and known issues. */
export interface EncodingRecommendations {
  detectedEncoding: string;
  recommendedEncoding: string;
  collationRecommendation: string;
  characterSetIssues: string[];
}

/** Redundancy findings together with normalization and denormalization notes. */
export interface NormalizationInsights {
  redundancyDetected: RedundancyAnalysis[];
  normalizationOpportunities: NormalizationOpportunity[];
  denormalizationJustifications: DenormalizationJustification[];
}

/** Key candidates, relationships, orphaned records, and an overall integrity score. */
export interface StructuralIntegrityAnalysis {
  primaryKeyCandidates: PrimaryKeyCandidate[];
  foreignKeyRelationships: ForeignKeyRelationship[];
  orphanedRecords: OrphanedRecordAnalysis[];
  dataIntegrityScore: DataIntegrityScore;
}

/**
 * A column considered as a primary key.
 * NOTE(review): the scale of the three numeric measures is not declared here — confirm.
 */
export interface PrimaryKeyCandidate {
  columnName: string;
  uniqueness: number;
  completeness: number;
  stability: number;
  confidence: 'low' | 'medium' | 'high';
  reasoning: string;
}

/** A column that references a column in another table, with a confidence level. */
export interface ForeignKeyRelationship {
  columnName: string;
  referencedTable: string;
  referencedColumn: string;
  confidence: 'low' | 'medium' | 'high';
  cardinality: string;
  integrityViolations: number;
  actionRecommendation: string;
}

/** Count and percentage of orphaned records for one described relationship. */
export interface OrphanedRecordAnalysis {
  relationshipDescription: string;
  orphanedCount: number;
  orphanedPercentage: number;
  impactAssessment: string;
  resolutionStrategy: string;
}

/** A numeric integrity score with its interpretation and contributing factors. */
export interface DataIntegrityScore {
  score: number;
  interpretation: string;
  factors: IntegrityFactor[];
}

/** One weighted factor, marked as a positive or negative influence. */
export interface IntegrityFactor {
  factor: string;
  impact: 'positive' | 'negative';
  weight: number;
  description: string;
}

/** Collects every category of transformation recommendation. */
export interface TransformationPipelineRecommendations {
  columnStandardization: ColumnStandardization[];
  missingValueStrategy: MissingValueStrategy[];
  outlierTreatment: OutlierTreatmentStrategy[];
  categoricalEncoding: CategoricalEncodingStrategy[];
  numericalTransformations: NumericalTransformationStrategy[];
  dateTimeFeatureEngineering: DateTimeEngineeringStrategy[];
  textProcessingPipeline: TextProcessingStrategy[];
  booleanFeatureCreation: BooleanFeatureCreation[];
  featureHashingRecommendations: FeatureHashingRecommendation[];
}

/** A rename from an original column name to a standardized one. */
export interface ColumnStandardization {
  originalName: string;
  standardizedName: string;
  namingConvention: string;
  reasoning: string;
}

/** The chosen way to handle missing values in one column. */
export interface MissingValueStrategy {
  columnName: string;
  strategy:
    | 'drop'
    | 'median'
    | 'mean'
    | 'mode'
    | 'forward_fill'
    | 'backward_fill'
    | 'interpolate'
    | 'model_based'
    | 'fixed_value';
  /** NOTE(review): values are `any`; the expected keys per strategy are not declared here. */
  parameters: Record<string, any>;
  flagColumn: string;
  reasoning: string;
  impact: string;
}

/** How outliers in one column are detected and treated. */
export interface OutlierTreatmentStrategy {
  columnName: string;
  detectionMethod: string;
  treatmentMethod: 'cap' | 'winsorize' | 'remove' | 'transform' | 'flag_only';
  /** NOTE(review): values are `any`; the expected keys per method are not declared here. */
  parameters: Record<string, any>;
  flagColumn: string;
  reasoning: string;
  expectedImpact: string;
}

/** The encoding selected for a categorical column and the columns it yields. */
export interface CategoricalEncodingStrategy {
  columnName: string;
  encodingMethod:
    | 'one_hot'
    | 'ordinal'
    | 'target'
    | 'binary'
    | 'hash'
    | 'leave_as_is';
  /** NOTE(review): values are `any`; the expected keys per method are not declared here. */
  parameters: Record<string, any>;
  resultingColumns: string[];
  reasoning: string;
  considerations: string[];
}

/** A list of numerical transformations recommended for one column. */
export interface NumericalTransformationStrategy {
  columnName: string;
  transformations: NumericalTransformation[];
  reasoning: string;
  mlConsiderations: string[];
}

/** One numerical transformation and the name of the column it produces. */
export interface NumericalTransformation {
  transformation:
    | 'log'
    | 'sqrt'
    | 'power'
    | 'box_cox'
    | 'yeo_johnson'
    | 'standard_scale'
    | 'min_max_scale'
    | 'robust_scale'
    | 'quantile_transform';
  /** NOTE(review): values are `any`; the expected keys per transformation are not declared here. */
  parameters: Record<string, any>;
  resultingColumnName: string;
  purpose: string;
}

/** Extracted and calculated features recommended for one date/time column. */
export interface DateTimeEngineeringStrategy {
  columnName: string;
  extractedFeatures: DateTimeFeature[];
  calculatedFeatures: CalculatedDateFeature[];
  reasoning: string;
}

/** A named feature together with its extraction method and purpose. */
export interface DateTimeFeature {
  featureName: string;
  extractionMethod: string;
  purpose: string;
}

/** A named feature calculated from one or more listed columns. */
export interface CalculatedDateFeature {
  featureName: string;
  calculationMethod: string;
  basedOnColumns: string[];
  purpose: string;
}

/** Cleaning steps and vectorization choice for one text column. */
export interface TextProcessingStrategy {
  columnName: string;
  cleaningSteps: TextCleaningStep[];
  vectorizationMethod:
    | 'tfidf'
    | 'count'
    | 'word2vec'
    | 'doc2vec'
    | 'bert'
    | 'none';
  /** NOTE(review): values are `any`; the expected keys per method are not declared here. */
  vectorizationParameters: Record<string, any>;
  resultingFeatureCount: number;
  considerations: string[];
}

/** One text-cleaning step with its description and purpose. */
export interface TextCleaningStep {
  step: string;
  description: string;
  purpose: string;
}

/** A boolean feature defined by creation logic over the listed columns. */
export interface BooleanFeatureCreation {
  featureName: string;
  creationLogic: string;
  basedOnColumns: string[];
  purpose: string;
  businessRule: string;
}

/** Feature-hashing advice for a column: current cardinality and recommended hash size. */
export interface FeatureHashingRecommendation {
  columnName: string;
  currentCardinality: number;
  recommendedHashSize: number;
  reasoning: string;
  tradeoffs: string[];
}

/** Current volume metrics plus scalability analysis and tuning recommendations. */
export interface ScalabilityAssessment {
  currentMetrics: DataVolumeMetrics;
  scalabilityAnalysis: ScalabilityAnalysis;
  indexingRecommendations: IndexRecommendation[];
  partitioningStrategies: PartitioningStrategy[];
  performanceOptimizations: PerformanceOptimization[];
}

/** Size and shape figures for the dataset. */
export interface DataVolumeMetrics {
  diskSizeMB: number;
  inMemorySizeMB: number;
  rowCount: number;
  columnCount: number;
  /** NOTE(review): unit and period of the growth rate are not declared here — confirm. */
  estimatedGrowthRate: number;
}

/** Current capability statement with projections, technology advice, and bottlenecks. */
export interface ScalabilityAnalysis {
  currentCapability: string;
  futureProjections: FutureProjection[];
  technologyRecommendations: TechnologyRecommendation[];
  bottleneckAnalysis: BottleneckAnalysis[];
}

/** A projection for one timeframe. */
export interface FutureProjection {
  timeframe: string;
  /** NOTE(review): unit of the projected size is not declared here — confirm. */
  projectedSize: number;
  projectedComplexity: string;
  recommendedApproach: string;
}

/** A recommended technology with its use case and a three-level complexity rating. */
export interface TechnologyRecommendation {
  technology: string;
  useCase: string;
  benefits: string[];
  considerations: string[];
  implementationComplexity: 'low' | 'medium' | 'high';
}

/** One bottleneck: the component, its limitation, the impact, and a mitigation. */
export interface BottleneckAnalysis {
  component: string;
  currentLimitation: string;
  impactOnPerformance: string;
  mitigationStrategy: string;
}

/** A recommended index of a given type over the listed columns. */
export interface IndexRecommendation {
  indexType: 'primary' | 'unique' | 'btree' | 'hash' | 'composite';
  columns: string[];
  purpose: string;
  expectedImpact: string;
  maintenanceConsiderations: string;
}

/** A recommended partitioning scheme of a given type over the listed columns. */
export interface PartitioningStrategy {
  partitionType: 'range' | 'list' | 'hash' | 'composite';
  partitionColumns: string[];
  reasoning: string;
  expectedBenefits: string[];
  implementationNotes: string[];
}

/** One optimization: the issue, the recommendation, and a three-level effort rating. */
export interface PerformanceOptimization {
  area: string;
  currentIssue: string;
  recommendation: string;
  expectedImprovement: string;
  implementationEffort: 'low' | 'medium' | 'high';
}

/** Collects sensitivity, freshness, versioning, lineage, retention, and compliance findings. */
export interface DataGovernanceConsiderations {
  sensitivityClassification: SensitivityClassification[];
  dataFreshnessAnalysis: DataFreshnessAnalysis;
  versioningRecommendations: VersioningRecommendation[];
  lineageConsiderations: LineageConsideration[];
  retentionPolicyRecommendations: RetentionPolicyRecommendation[];
  complianceConsiderations: ComplianceConsideration[];
}

/** Sensitivity level assigned to a column, with protection and access suggestions. */
export interface SensitivityClassification {
  columnName: string;
  sensitivityLevel: 'public' | 'internal' | 'confidential' | 'restricted';
  dataCategory: string;
  protectionRecommendations: string[];
  accessControlSuggestions: string[];
}

/** Freshness findings for the dataset. */
export interface DataFreshnessAnalysis {
  /** Explicitly nullable, unlike the other fields of this interface. */
  lastUpdateDetected: string | null;
  updateFrequencyEstimate: string;
  /** NOTE(review): score range is not declared here — confirm with the producer. */
  freshnessScore: number;
  implications: string[];
  recommendations: string[];
}

/** A versioning strategy with its implementation, benefits, and considerations. */
export interface VersioningRecommendation {
  strategy: string;
  implementation: string;
  benefits: string[];
  considerations: string[];
}

/** A lineage aspect with a recommendation and suggested tools. */
export interface LineageConsideration {
  aspect: string;
  recommendation: string;
  toolSuggestions: string[];
  implementationApproach: string;
}

/** A recommended retention period for one data category. */
export interface RetentionPolicyRecommendation {
  dataCategory: string;
  recommendedRetentionPeriod: string;
  reasoning: string;
  complianceFactors: string[];
}

/** A regulation, the columns it applies to, and the resulting requirements. */
export interface ComplianceConsideration {
  regulation: string;
  applicableColumns: string[];
  requirements: string[];
  recommendations: string[];
}

/** Overall ML readiness score with supporting factors, challenges, and feature matrix. */
export interface MLReadinessAssessment {
  /** NOTE(review): score range is not declared here — confirm with the producer. */
  overallScore: number;
  enhancingFactors: MLEnhancingFactor[];
  remainingChallenges: MLChallenge[];
  featurePreparationMatrix: FeaturePreparationEntry[];
  modelingConsiderations: ModelingConsideration[];
}

/** A factor rated with a three-level impact. */
export interface MLEnhancingFactor {
  factor: string;
  impact: 'high' | 'medium' | 'low';
  description: string;
}

/** A challenge rated with a three-level severity, plus mitigation and effort text. */
export interface MLChallenge {
  challenge: string;
  severity: 'high' | 'medium' | 'low';
  impact: string;
  mitigationStrategy: string;
  estimatedEffort: string;
}

/** One feature-matrix row linking a feature to its original column and preparation steps. */
export interface FeaturePreparationEntry {
  featureName: string;
  originalColumn: string;
  finalDataType: string;
  keyIssues: string[];
  engineeringSteps: string[];
  finalMLFeatureType: string;
  modelingNotes: string[];
}

/** A modeling aspect with its consideration, impact, and recommendations. */
export interface ModelingConsideration {
  aspect: string;
  consideration: string;
  impact: string;
  recommendations: string[];
}

/** Summary output: dataset profile plus schema, relationship, and transformation summaries. */
export interface KnowledgeBaseOutput {
  datasetProfile: DatasetProfile;
  schemaRecommendations: SchemaRecommendationSummary[];
  inferredRelationships: InferredRelationshipSummary[];
  keyTransformations: TransformationSummary[];
}

/** Headline figures for the analysed file. */
export interface DatasetProfile {
  fileName: string;
  /** NOTE(review): typed as a plain string; the date format is not declared here — confirm. */
  analysisDate: string;
  totalRows: number;
  totalColumnsOriginal: number;
  totalColumnsEngineeredForML: number;
  estimatedTechnicalDebtHours: number;
  mlReadinessScore: number;
}

/** Summary of the recommendation for one column, from original name to target name. */
export interface SchemaRecommendationSummary {
  columnNameOriginal: string;
  columnNameTarget: string;
  recommendedType: string;
  constraints: string[];
  transformations: string[];
}

/**
 * Summary of one inferred relationship.
 * Here `confidence` is a free-form string, not the 'low' | 'medium' | 'high'
 * union that ForeignKeyRelationship uses.
 */
export interface InferredRelationshipSummary {
  fromColumn: string;
  toTableColumn: string;
  relationshipType: string;
  confidence: string;
}

/** Summary of the transformation steps for one feature group. */
export interface TransformationSummary {
  featureGroup: string;
  steps: string[];
  impact: string;
}

/** A redundancy finding over the listed columns, with a recommended action. */
export interface RedundancyAnalysis {
  redundancyType: string;
  affectedColumns: string[];
  description: string;
  recommendedAction: string;
}

/** A normalization opportunity over the listed columns. */
export interface NormalizationOpportunity {
  opportunity: string;
  affectedColumns: string[];
  normalizedForm: string;
  benefits: string[];
  considerations: string[];
}

/** A justification for keeping the listed columns denormalized. */
export interface DenormalizationJustification {
  justification: string;
  affectedColumns: string[];
  reasoning: string;
  tradeoffs: string[];
}

/** An inferred SQL type with constraints and reasoning. */
export interface DatabaseTypeInference {
  sqlType: string;
  constraints: string[];
  reasoning: string;
}

/** ML factors, challenges, and considerations plus dimensionality recommendations. */
export interface PCAInsights {
  enhancingFactors: MLEnhancingFactor[];
  challenges: MLChallenge[];
  modelingConsiderations: ModelingConsideration[];
  /** Only `applicable` is required; every other member is optional. */
  dimensionalityRecommendations: {
    applicable: boolean;
    recommendedComponents?: number;
    varianceRetained?: number;
    dominantFeatures?: string[];
    implementationSteps?: string[];
  };
}

/** Configuration options for a Section 5 run. */
export interface Section5Config {
  enabledAnalyses: (
    | 'schema'
    | 'integrity'
    | 'transformations'
    | 'scalability'
    | 'governance'
    | 'ml_readiness'
  )[];
  targetDatabaseSystem: 'postgresql' | 'mysql' | 'sqlite' | 'generic_sql';
  mlFrameworkTarget: 'scikit_learn' | 'pytorch' | 'tensorflow' | 'generic';
  includeKnowledgeBase: boolean;
  governanceLevel: 'basic' | 'standard' | 'enterprise';
  performanceOptimizationLevel: 'basic' | 'moderate' | 'aggressive';
}

/** Progress report: current stage, percentage, message, and step counters. */
export interface Section5Progress {
  stage:
    | 'initialization'
    | 'schema_analysis'
    | 'integrity_analysis'
    | 'transformations'
    | 'scalability'
    | 'governance'
    | 'ml_readiness'
    | 'finalization';
  /** NOTE(review): percentage scale (0-1 vs. 0-100) is not declared here — confirm. */
  percentage: number;
  message: string;
  currentStep: number;
  totalSteps: number;
}
//# sourceMappingURL=types.d.ts.map