UNPKG

@finos/legend-extension-dsl-data-quality

Version:
747 lines (698 loc) 24.1 kB
/**
 * Copyright (c) 2020-present, Goldman Sachs
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import {
  type AbstractPureGraphManager,
  type ExecutionResult,
  type EXECUTION_SERIALIZATION_FORMAT,
  type ExecutionOptions,
  type GraphDataOrigin,
  type PureModel,
  type RawExecutionPlan,
  type RootGraphFetchTree,
  type V1_ExecutionResult,
  type V1_ParameterValue,
  type V1_PureModelContext,
  type V1_RootGraphFetchTree,
  type RawLambda,
  type V1_RawLambda,
  V1_getEngineSerializationFormat,
  LegendSDLC,
  PureClientVersion,
  V1_buildExecutionError,
  V1_buildExecutionResult,
  V1_ExecutionError,
  V1_GraphBuilderContextBuilder,
  V1_GraphTransformerContextBuilder,
  V1_LegendSDLC,
  V1_ProcessingContext,
  V1_Protocol,
  V1_PureGraphManager,
  V1_PureModelContextPointer,
  V1_pureModelContextPropSchema,
  V1_deserializeExecutionResult,
  V1_parameterValueModelSchema,
  V1_transformParameterValue,
  V1_transformRawLambda,
  V1_RemoteEngine,
  V1_rawLambdaModelSchema,
} from '@finos/legend-graph';
import { createModelSchema, list, optional, primitive } from 'serializr';
import {
  type PlainObject,
  assertErrorThrown,
  customListWithSchema,
  guaranteeNonNullable,
  guaranteeType,
  NetworkClientError,
  returnUndefOnError,
  SerializationFactory,
  UnsupportedOperationError,
  usingModelSchema,
} from '@finos/legend-shared';
import { DSL_DataQuality_PureGraphManagerExtension } from '../DSL_DataQuality_PureGraphManagerExtension.js';
import {
  V1_buildDataQualityGraphFetchTree,
  V1_transformRootGraphFetchTreeToDataQualityRootGraphFetchTree,
} from './transformation/V1_DSL_DataQuality_ValueSpecificationBuilderHelper.js';
import type { DataQualityRootGraphFetchTree } from '../../../../graph/metamodel/pure/packageableElements/data-quality/DataQualityGraphFetchTree.js';
import type {
  DataQualityRelationValidation,
  DQExecuteInputOptions,
  DQReconciliationInputOptions,
  DQValidationSuggestionInputOptions,
} from '../../../../graph/metamodel/pure/packageableElements/data-quality/DataQualityValidationConfiguration.js';

// Names passed to `getTraceData(...)` for each data-quality engine call.
const DQ_GENERATE_EXECUTION_PLAN = 'generate execution plan';
const DQ_EXECUTE_PLAN = 'execute plan';
const DQ_EXECUTE_DATA_PROFILING = 'execute data profiling';
const DQ_FETCH_RULE_SUGGESTIONS = 'fetch rule suggestions';
const DQ_DEBUG_EXECUTION_PLAN = 'debug execution plan';
const DQ_FETCH_PROPERTY_PATH_TREE = 'dq fetch property path tree';
const DQ_EXECUTE_RECONCILIATION = 'execute reconciliation';

/**
 * Request payload posted to the `/dataquality/execute`, `/generatePlan`,
 * `/debugPlan`, `/propertyPathTree` and `/profile` endpoints.
 */
export class V1_DQExecuteInput {
  clientVersion: string | undefined;
  model!: V1_PureModelContext;
  lambdaParameterValues: V1_ParameterValue[] = [];
  packagePath!: string;
  defectsLimit: number | undefined;
  queryLimit: number | undefined;
  // NOTE: not listed in the serialization schema below, so this field is
  // never part of the serialized payload.
  allValidationsChecked: boolean | undefined;
  validationName: string | undefined;
  runQuery: boolean | undefined;

  static readonly serialization = new SerializationFactory(
    createModelSchema(V1_DQExecuteInput, {
      clientVersion: optional(primitive()),
      model: V1_pureModelContextPropSchema,
      lambdaParameterValues: customListWithSchema(V1_parameterValueModelSchema),
      packagePath: primitive(),
      defectsLimit: optional(primitive()),
      queryLimit: optional(primitive()),
      validationName: optional(primitive()),
      runQuery: optional(primitive()),
    }),
  );
}

/**
 * Request payload used when asking for rule suggestions and when exporting
 * data profiling results; carries only the model, the element path and the
 * lambda parameter values.
 */
export class V1_DQRuleSuggestionInput {
  clientVersion: string | undefined;
  model!: V1_PureModelContext;
  lambdaParameterValues: V1_ParameterValue[] = [];
  packagePath!: string;

  static readonly serialization = new SerializationFactory(
    createModelSchema(V1_DQRuleSuggestionInput, {
      clientVersion: optional(primitive()),
      model: V1_pureModelContextPropSchema,
      lambdaParameterValues: customListWithSchema(V1_parameterValueModelSchema),
      packagePath: primitive(),
    }),
  );
}

/**
 * Request payload posted to the `/dataquality/reconciliation` endpoint.
 */
export class V1_DQReconciliationInput {
  clientVersion: string | undefined;
  model!: V1_PureModelContext;
  source!: V1_RawLambda;
  target!: V1_RawLambda;
  keys: string[] = [];
  colsForHash: string[] = [];
  defectLimit: number | undefined;
  queryLimit: number | undefined;
  aggregatedHash: boolean | undefined;
  sourceHashCol: string | undefined;
  targetHashCol: string | undefined;
  includeColumnValues: boolean | undefined;
  runSourceQuery: boolean | undefined;
  runTargetQuery: boolean | undefined;
  sourceLambdaParameterValues: V1_ParameterValue[] = [];
  targetLambdaParameterValues: V1_ParameterValue[] = [];

  static readonly serialization = new SerializationFactory(
    createModelSchema(V1_DQReconciliationInput, {
      clientVersion: optional(primitive()),
      model: V1_pureModelContextPropSchema,
      source: usingModelSchema(V1_rawLambdaModelSchema),
      target: usingModelSchema(V1_rawLambdaModelSchema),
      keys: list(primitive()),
      colsForHash: list(primitive()),
      defectLimit: optional(primitive()),
      queryLimit: optional(primitive()),
      aggregatedHash: optional(primitive()),
      sourceHashCol: optional(primitive()),
      targetHashCol: optional(primitive()),
      includeColumnValues: optional(primitive()),
      runSourceQuery: optional(primitive()),
      runTargetQuery: optional(primitive()),
      sourceLambdaParameterValues: customListWithSchema(
        V1_parameterValueModelSchema,
      ),
      targetLambdaParameterValues: customListWithSchema(
        V1_parameterValueModelSchema,
      ),
    }),
  );
}

/**
 * V1 implementation of the data-quality graph manager extension.
 *
 * Every engine-facing operation here requires `graphManager.engine` to be a
 * `V1_RemoteEngine`; `guaranteeType` is used to enforce this before the
 * engine server client is obtained.
 */
export class V1_DSL_Data_Quality_PureGraphManagerExtension extends DSL_DataQuality_PureGraphManagerExtension {
  declare graphManager: V1_PureGraphManager;

  /** Client version used when the caller's options do not provide one. */
  static readonly DEV_PROTOCOL_VERSION = PureClientVersion.VX_X_X;

  /**
   * @param graphManager must be a `V1_PureGraphManager`; enforced through
   * `guaranteeType`.
   */
  constructor(graphManager: AbstractPureGraphManager) {
    super(graphManager);
    this.graphManager = guaranteeType(graphManager, V1_PureGraphManager);
  }

  getSupportedProtocolVersion(): string {
    return PureClientVersion.V1_0_0;
  }

  /**
   * Builds a model context pointer for a graph that originates from SDLC.
   *
   * @param origin origin of the graph; only `LegendSDLC` is supported
   * @param clientVersion when provided, a `V1_Protocol` with this version is
   * attached to the pointer
   * @throws UnsupportedOperationError for any other kind of origin
   */
  private buildPureModelSDLCPointer(
    origin: GraphDataOrigin,
    clientVersion: string | undefined,
  ): V1_PureModelContextPointer {
    if (origin instanceof LegendSDLC) {
      return new V1_PureModelContextPointer(
        clientVersion
          ? new V1_Protocol(
              V1_PureGraphManager.PURE_PROTOCOL_NAME,
              clientVersion,
            )
          : undefined,
        new V1_LegendSDLC(origin.groupId, origin.artifactId, origin.versionId),
      );
    }
    throw new UnsupportedOperationError('Unsupported graph origin');
  }

  /**
   * Picks the model context to send to the engine: an SDLC pointer when the
   * graph has an origin, otherwise the full graph model data.
   */
  private buildModelContext(graph: PureModel): V1_PureModelContext {
    return graph.origin
      ? this.buildPureModelSDLCPointer(graph.origin, undefined)
      : this.graphManager.getFullGraphModelData(graph);
  }

  /**
   * Turns the text of an engine response into an `ExecutionResult`.
   *
   * The text is first handed to the engine's `parseExecutionResults`; if that
   * throws or yields nothing, the raw text itself is passed on to
   * `V1_deserializeExecutionResult`.
   */
  private buildExecutionResultFromText(resultInText: string): ExecutionResult {
    const rawExecutionResult =
      returnUndefOnError(() =>
        this.graphManager.engine.parseExecutionResults(
          resultInText,
          undefined,
        ),
      ) ?? resultInText;
    const v1_executionResult =
      V1_deserializeExecutionResult(rawExecutionResult);
    return V1_buildExecutionResult(v1_executionResult);
  }

  /**
   * Maps a caught value to the error that should be rethrown: a
   * `NetworkClientError` is converted into an execution error built from its
   * payload, any other error is returned untouched.
   */
  private toExecutionError(error: unknown): Error {
    assertErrorThrown(error);
    if (error instanceof NetworkClientError) {
      return V1_buildExecutionError(
        V1_ExecutionError.serialization.fromJson(
          error.payload as PlainObject<V1_ExecutionError>,
        ),
      );
    }
    return error;
  }

  /**
   * Posts a serialized execute input to `/dataquality/execute`.
   *
   * @param options.returnAsResponse forwarded as `skipProcessing`
   * @param options.serializationFormat converted with
   * `V1_getEngineSerializationFormat` when provided
   */
  private executeValidation = (
    input: PlainObject<V1_DQExecuteInput>,
    options?: {
      returnAsResponse?: boolean;
      serializationFormat?: EXECUTION_SERIALIZATION_FORMAT | undefined;
    },
  ): Promise<PlainObject<V1_ExecutionResult> | Response> => {
    // TODO: improve abstraction so that we do not need to access the engine server client directly
    const engineServerClient = guaranteeType(
      this.graphManager.engine,
      V1_RemoteEngine,
      'executeValidation is only supported by remote engine',
    ).getEngineServerClient();
    return engineServerClient.postWithTracing(
      engineServerClient.getTraceData(DQ_EXECUTE_PLAN),
      `${engineServerClient._pure()}/dataquality/execute`,
      input,
      {},
      undefined,
      {
        serializationFormat: options?.serializationFormat
          ? V1_getEngineSerializationFormat(options.serializationFormat)
          : undefined,
      },
      { enableCompression: true },
      {
        skipProcessing: Boolean(options?.returnAsResponse),
      },
    );
  };

  /** Runs validation and resolves to the response body as text. */
  private async runValidationAndReturnString(
    input: V1_DQExecuteInput,
  ): Promise<string> {
    const response = (await this.executeValidation(
      V1_DQExecuteInput.serialization.toJson(input),
      {
        returnAsResponse: true,
      },
    )) as Response;
    return response.text();
  }

  /**
   * Runs validation and resolves to the unprocessed response.
   *
   * @throws the execution error built from the payload when the request fails
   * with a `NetworkClientError`
   */
  private async export(
    input: V1_DQExecuteInput,
    options?: ExecutionOptions,
  ): Promise<Response> {
    try {
      return guaranteeNonNullable(
        (await this.executeValidation(
          V1_DQExecuteInput.serialization.toJson(input),
          {
            serializationFormat: options?.serializationFormat,
            returnAsResponse: true,
          },
        )) as Response,
      );
    } catch (error) {
      throw this.toExecutionError(error);
    }
  }

  /**
   * Populates (and returns) the given execute input from the graph, the
   * element path and the caller's options.
   *
   * - `queryLimit` is only set when `options.runQuery` is truthy
   * - `validationName` is only set when `options.allValidationsChecked` is
   *   falsy
   *
   * @param dqExecuteInput instance to fill in; it is mutated in place
   */
  createExecutionInput(
    graph: PureModel,
    packagePath: string,
    dqExecuteInput: V1_DQExecuteInput,
    options: DQExecuteInputOptions,
  ): V1_DQExecuteInput {
    dqExecuteInput.clientVersion =
      options.clientVersion ??
      V1_DSL_Data_Quality_PureGraphManagerExtension.DEV_PROTOCOL_VERSION;
    dqExecuteInput.model = this.buildModelContext(graph);
    dqExecuteInput.lambdaParameterValues = options.lambdaParameterValues
      ? options.lambdaParameterValues.map(V1_transformParameterValue)
      : [];
    dqExecuteInput.packagePath = packagePath;
    dqExecuteInput.defectsLimit = options.previewLimit;
    dqExecuteInput.runQuery = options.runQuery;
    if (options.runQuery) {
      dqExecuteInput.queryLimit = options.queryLimit;
    }
    if (!options.allValidationsChecked) {
      dqExecuteInput.validationName = options.validationName;
    }
    return dqExecuteInput;
  }

  /** Posts the execute input to `/dataquality/generatePlan`. */
  generatePlan = async (
    graph: PureModel,
    packagePath: string,
    options: DQExecuteInputOptions,
  ): Promise<RawExecutionPlan> => {
    const input = this.createExecutionInput(
      graph,
      packagePath,
      new V1_DQExecuteInput(),
      options,
    );
    const serializedInput = V1_DQExecuteInput.serialization.toJson(input);
    // TODO: improve abstraction so that we do not need to access the engine server client directly
    const engineServerClient = guaranteeType(
      this.graphManager.engine,
      V1_RemoteEngine,
      'generatePlan is only supported by remote engine',
    ).getEngineServerClient();
    return engineServerClient.postWithTracing(
      engineServerClient.getTraceData(DQ_GENERATE_EXECUTION_PLAN),
      `${engineServerClient._pure()}/dataquality/generatePlan`,
      serializedInput,
      {},
      undefined,
      undefined,
      { enableCompression: true },
    );
  };

  /**
   * Runs validation for the element at `packagePath` and builds an
   * `ExecutionResult` from the response text.
   *
   * @throws the execution error built from the payload when the request fails
   * with a `NetworkClientError`
   */
  execute = async (
    graph: PureModel,
    packagePath: string,
    options: DQExecuteInputOptions,
  ): Promise<ExecutionResult> => {
    const input = this.createExecutionInput(
      graph,
      packagePath,
      new V1_DQExecuteInput(),
      options,
    );
    try {
      const validationResultInText =
        await this.runValidationAndReturnString(input);
      return this.buildExecutionResultFromText(validationResultInText);
    } catch (error) {
      throw this.toExecutionError(error);
    }
  };

  /** Runs validation and resolves to the unprocessed response. */
  exportData = async (
    graph: PureModel,
    packagePath: string,
    options: DQExecuteInputOptions,
  ): Promise<Response> => {
    const input = this.createExecutionInput(
      graph,
      packagePath,
      new V1_DQExecuteInput(),
      options,
    );
    return this.export(input, options);
  };

  /**
   * Posts the execute input to `/dataquality/debugPlan`.
   *
   * @returns the plan together with the debug lines joined by newlines
   */
  debugExecutionPlanGeneration = async (
    graph: PureModel,
    packagePath: string,
    options: DQExecuteInputOptions,
  ): Promise<{ plan: RawExecutionPlan; debug: string }> => {
    const input = this.createExecutionInput(
      graph,
      packagePath,
      new V1_DQExecuteInput(),
      options,
    );
    const serializedInput = V1_DQExecuteInput.serialization.toJson(input);
    // TODO: improve abstraction so that we do not need to access the engine server client directly
    const engineServerClient = guaranteeType(
      this.graphManager.engine,
      V1_RemoteEngine,
      'debugExecutionPlanGeneration is only supported by remote engine',
    ).getEngineServerClient();
    const result: { plan: RawExecutionPlan; debug: string[] } =
      await engineServerClient.postWithTracing(
        engineServerClient.getTraceData(DQ_DEBUG_EXECUTION_PLAN),
        `${engineServerClient._pure()}/dataquality/debugPlan`,
        serializedInput,
        {},
        undefined,
        undefined,
        { enableCompression: true },
      );
    return {
      plan: result.plan,
      debug: result.debug.join('\n'),
    };
  };

  /**
   * Posts the execute input to `/dataquality/propertyPathTree` and builds a
   * data-quality root graph fetch tree from the returned protocol tree.
   */
  fetchStructuralValidations = async (
    graph: PureModel,
    packagePath: string,
    options: DQExecuteInputOptions,
  ): Promise<RootGraphFetchTree> => {
    // TODO: improve abstraction so that we do not need to access the engine server client directly
    const engineServerClient = guaranteeType(
      this.graphManager.engine,
      V1_RemoteEngine,
      'fetchStructuralValidations is only supported by remote engine',
    ).getEngineServerClient();
    const input = this.createExecutionInput(
      graph,
      packagePath,
      new V1_DQExecuteInput(),
      options,
    );
    const serializedInput = V1_DQExecuteInput.serialization.toJson(input);
    const V1_rootGraphFetchTree: V1_RootGraphFetchTree =
      await engineServerClient.postWithTracing(
        engineServerClient.getTraceData(DQ_FETCH_PROPERTY_PATH_TREE),
        `${engineServerClient._pure()}/dataquality/propertyPathTree`,
        serializedInput,
        {},
        undefined,
        undefined,
        { enableCompression: true },
      );
    const V1_dataQualityRootGraphFetchTree =
      V1_transformRootGraphFetchTreeToDataQualityRootGraphFetchTree(
        V1_rootGraphFetchTree,
      );
    const context = new V1_GraphBuilderContextBuilder(
      graph,
      graph,
      this.graphManager.graphBuilderExtensions,
      this.graphManager.logService,
    ).build();
    return V1_buildDataQualityGraphFetchTree(
      V1_dataQualityRootGraphFetchTree,
      context,
      undefined,
      [],
      new V1_ProcessingContext(''),
      true,
    ) as DataQualityRootGraphFetchTree;
  };

  /**
   * Posts a serialized input to `/dataquality/profile`.
   *
   * @param options.returnAsResponse forwarded as `skipProcessing`
   * @param options.serializationFormat converted with
   * `V1_getEngineSerializationFormat` when provided
   */
  private executeDataProfiling = (
    input:
      | PlainObject<V1_DQExecuteInput>
      | PlainObject<V1_DQRuleSuggestionInput>,
    options?: {
      returnAsResponse?: boolean;
      serializationFormat?: EXECUTION_SERIALIZATION_FORMAT | undefined;
    },
  ): Promise<PlainObject<V1_ExecutionResult> | Response> => {
    const engineServerClient = guaranteeType(
      this.graphManager.engine,
      V1_RemoteEngine,
      'executeDataProfiling is only supported by remote engine',
    ).getEngineServerClient();
    return engineServerClient.postWithTracing(
      engineServerClient.getTraceData(DQ_EXECUTE_DATA_PROFILING),
      `${engineServerClient._pure()}/dataquality/profile`,
      input,
      {},
      undefined,
      {
        serializationFormat: options?.serializationFormat
          ? V1_getEngineSerializationFormat(options.serializationFormat)
          : undefined,
      },
      { enableCompression: true },
      {
        skipProcessing: Boolean(options?.returnAsResponse),
      },
    );
  };

  /**
   * Runs data profiling for the element at `packagePath` and builds an
   * `ExecutionResult` from the response text.
   *
   * @throws the execution error built from the payload when the request fails
   * with a `NetworkClientError`
   */
  runDataProfiling = async (
    graph: PureModel,
    packagePath: string,
    options: DQExecuteInputOptions,
  ): Promise<ExecutionResult> => {
    const input = this.createExecutionInput(
      graph,
      packagePath,
      new V1_DQExecuteInput(),
      options,
    );
    try {
      const profilingResult = (await this.executeDataProfiling(
        V1_DQExecuteInput.serialization.toJson(input),
        {
          returnAsResponse: true,
        },
      )) as Response;
      const profilingResultInText = await profilingResult.text();
      return this.buildExecutionResultFromText(profilingResultInText);
    } catch (error) {
      throw this.toExecutionError(error);
    }
  };

  /**
   * Runs data profiling and resolves to the unprocessed response.
   *
   * @throws the execution error built from the payload when the request fails
   * with a `NetworkClientError`
   */
  exportDataProfiling = async (
    graph: PureModel,
    packagePath: string,
    options: DQExecuteInputOptions,
  ): Promise<Response> => {
    const input = new V1_DQRuleSuggestionInput();
    input.packagePath = packagePath;
    input.clientVersion =
      options.clientVersion ??
      V1_DSL_Data_Quality_PureGraphManagerExtension.DEV_PROTOCOL_VERSION;
    input.model = this.buildModelContext(graph);
    input.lambdaParameterValues = options.lambdaParameterValues
      ? options.lambdaParameterValues.map(V1_transformParameterValue)
      : [];
    try {
      return guaranteeNonNullable(
        (await this.executeDataProfiling(
          // serialize with the schema of the class that was instantiated
          V1_DQRuleSuggestionInput.serialization.toJson(input),
          {
            serializationFormat: options.serializationFormat,
            returnAsResponse: true,
          },
        )) as Response,
      );
    } catch (error) {
      throw this.toExecutionError(error);
    }
  };

  /** Posts a rule suggestion input to `/dataquality/ruleSuggestions`. */
  fetchValidationSuggestions = async (
    graph: PureModel,
    packagePath: string,
    options: DQValidationSuggestionInputOptions,
  ): Promise<DataQualityRelationValidation> => {
    const input = new V1_DQRuleSuggestionInput();
    input.packagePath = packagePath;
    input.clientVersion =
      options.clientVersion ??
      V1_DSL_Data_Quality_PureGraphManagerExtension.DEV_PROTOCOL_VERSION;
    input.model = this.buildModelContext(graph);
    input.lambdaParameterValues = options.lambdaParameterValues
      ? options.lambdaParameterValues.map(V1_transformParameterValue)
      : [];
    const engineServerClient = guaranteeType(
      this.graphManager.engine,
      V1_RemoteEngine,
      'fetchValidationSuggestions is only supported by remote engine',
    ).getEngineServerClient();
    return engineServerClient.postWithTracing(
      engineServerClient.getTraceData(DQ_FETCH_RULE_SUGGESTIONS),
      `${engineServerClient._pure()}/dataquality/ruleSuggestions`,
      // serialize with the schema of the class that was instantiated
      V1_DQRuleSuggestionInput.serialization.toJson(input),
      {},
      undefined,
      {},
      { enableCompression: true },
      {},
    );
  };

  /**
   * Transforms a metamodel raw lambda into its V1 protocol form using the
   * registered pure protocol processor plugins.
   */
  private rawLambdaToV1(lambda: RawLambda): V1_RawLambda {
    return V1_transformRawLambda(
      lambda,
      new V1_GraphTransformerContextBuilder(
        this.graphManager.pluginManager.getPureProtocolProcessorPlugins(),
      ).build(),
    );
  }

  /**
   * Builds the reconciliation request from the graph and the caller's
   * options.
   *
   * `options.limit` is sent as `queryLimit` when either the source or the
   * target query is being run, and as `defectLimit` otherwise.
   */
  private createReconciliationInput(
    graph: PureModel,
    options: DQReconciliationInputOptions,
  ): V1_DQReconciliationInput {
    const input = new V1_DQReconciliationInput();
    input.clientVersion =
      options.clientVersion ??
      V1_DSL_Data_Quality_PureGraphManagerExtension.DEV_PROTOCOL_VERSION;
    input.model = this.buildModelContext(graph);
    // Logical OR, not `??`: an explicit `runSourceQuery: false` must not hide
    // a truthy `runTargetQuery`.
    const runningSourceOrTargetQuery =
      Boolean(options.runSourceQuery) || Boolean(options.runTargetQuery);
    input.source = this.rawLambdaToV1(options.source);
    input.target = this.rawLambdaToV1(options.target);
    input.keys = options.keys;
    input.colsForHash = options.colsForHash;
    input.aggregatedHash = options.aggregatedHash;
    input.sourceHashCol = options.sourceHashCol;
    input.targetHashCol = options.targetHashCol;
    input.includeColumnValues = options.includeColumnValues;
    input.runSourceQuery = options.runSourceQuery;
    input.runTargetQuery = options.runTargetQuery;
    if (runningSourceOrTargetQuery) {
      input.queryLimit = options.limit;
    } else {
      input.defectLimit = options.limit;
    }
    if (options.sourceLambdaParameterValues) {
      input.sourceLambdaParameterValues =
        options.sourceLambdaParameterValues.map(V1_transformParameterValue);
    }
    if (options.targetLambdaParameterValues) {
      input.targetLambdaParameterValues =
        options.targetLambdaParameterValues.map(V1_transformParameterValue);
    }
    return input;
  }

  /** Runs reconciliation with the options exactly as given. */
  runReconciliation = async (
    graph: PureModel,
    options: DQReconciliationInputOptions,
  ): Promise<ExecutionResult> => {
    const input = this.createReconciliationInput(graph, options);
    return this.runReconciliationWithInput(input);
  };

  /**
   * Runs reconciliation with `runSourceQuery` forced to `true` and
   * `runTargetQuery` cleared.
   */
  runReconciliationSourceQuery = async (
    graph: PureModel,
    options: DQReconciliationInputOptions,
  ): Promise<ExecutionResult> => {
    const input = this.createReconciliationInput(graph, {
      ...options,
      runSourceQuery: true,
      runTargetQuery: undefined,
    });
    return this.runReconciliationWithInput(input);
  };

  /**
   * Runs reconciliation with `runTargetQuery` forced to `true` and
   * `runSourceQuery` cleared.
   */
  runReconciliationTargetQuery = async (
    graph: PureModel,
    options: DQReconciliationInputOptions,
  ): Promise<ExecutionResult> => {
    const input = this.createReconciliationInput(graph, {
      ...options,
      runSourceQuery: undefined,
      runTargetQuery: true,
    });
    return this.runReconciliationWithInput(input);
  };

  /**
   * Posts the reconciliation input to `/dataquality/reconciliation` and
   * builds an `ExecutionResult` from the response text.
   *
   * @throws the execution error built from the payload when the request fails
   * with a `NetworkClientError`
   */
  private async runReconciliationWithInput(
    input: V1_DQReconciliationInput,
  ): Promise<ExecutionResult> {
    try {
      const engineServerClient = guaranteeType(
        this.graphManager.engine,
        V1_RemoteEngine,
        'runReconciliation is only supported by remote engine',
      ).getEngineServerClient();
      const result = await engineServerClient.postWithTracing(
        engineServerClient.getTraceData(DQ_EXECUTE_RECONCILIATION),
        `${engineServerClient._pure()}/dataquality/reconciliation`,
        V1_DQReconciliationInput.serialization.toJson(input),
        {},
        undefined,
        undefined,
        { enableCompression: true },
        { skipProcessing: true },
      );
      const resultInText = await (result as Response).text();
      return this.buildExecutionResultFromText(resultInText);
    } catch (error) {
      throw this.toExecutionError(error);
    }
  }
}