@finos/legend-extension-dsl-data-quality
Version:
Legend extension for Data Quality
747 lines (698 loc) • 24.1 kB
text/typescript
/**
* Copyright (c) 2020-present, Goldman Sachs
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import {
type AbstractPureGraphManager,
type ExecutionResult,
type EXECUTION_SERIALIZATION_FORMAT,
type ExecutionOptions,
type GraphDataOrigin,
type PureModel,
type RawExecutionPlan,
type RootGraphFetchTree,
type V1_ExecutionResult,
type V1_ParameterValue,
type V1_PureModelContext,
type V1_RootGraphFetchTree,
type RawLambda,
type V1_RawLambda,
V1_getEngineSerializationFormat,
LegendSDLC,
PureClientVersion,
V1_buildExecutionError,
V1_buildExecutionResult,
V1_ExecutionError,
V1_GraphBuilderContextBuilder,
V1_GraphTransformerContextBuilder,
V1_LegendSDLC,
V1_ProcessingContext,
V1_Protocol,
V1_PureGraphManager,
V1_PureModelContextPointer,
V1_pureModelContextPropSchema,
V1_deserializeExecutionResult,
V1_parameterValueModelSchema,
V1_transformParameterValue,
V1_transformRawLambda,
V1_RemoteEngine,
V1_rawLambdaModelSchema,
} from '@finos/legend-graph';
import { createModelSchema, list, optional, primitive } from 'serializr';
import {
type PlainObject,
assertErrorThrown,
customListWithSchema,
guaranteeNonNullable,
guaranteeType,
NetworkClientError,
returnUndefOnError,
SerializationFactory,
UnsupportedOperationError,
usingModelSchema,
} from '@finos/legend-shared';
import { DSL_DataQuality_PureGraphManagerExtension } from '../DSL_DataQuality_PureGraphManagerExtension.js';
import {
V1_buildDataQualityGraphFetchTree,
V1_transformRootGraphFetchTreeToDataQualityRootGraphFetchTree,
} from './transformation/V1_DSL_DataQuality_ValueSpecificationBuilderHelper.js';
import type { DataQualityRootGraphFetchTree } from '../../../../graph/metamodel/pure/packageableElements/data-quality/DataQualityGraphFetchTree.js';
import type {
DataQualityRelationValidation,
DQExecuteInputOptions,
DQReconciliationInputOptions,
DQValidationSuggestionInputOptions,
} from '../../../../graph/metamodel/pure/packageableElements/data-quality/DataQualityValidationConfiguration.js';
// Activity names handed to `engineServerClient.getTraceData(...)` by the
// data quality graph manager extension below; one per kind of request it
// posts to the engine server.
const DQ_GENERATE_EXECUTION_PLAN = 'generate execution plan';
const DQ_EXECUTE_PLAN = 'execute plan';
const DQ_EXECUTE_DATA_PROFILING = 'execute data profiling';
const DQ_FETCH_RULE_SUGGESTIONS = 'fetch rule suggestions';
const DQ_DEBUG_EXECUTION_PLAN = 'debug execution plan';
const DQ_FETCH_PROPERTY_PATH_TREE = 'dq fetch property path tree';
const DQ_EXECUTE_RECONCILIATION = 'execute reconciliation';
/**
 * Request payload for the data quality calls that target a packageable
 * element by path (validation execution, plan generation, plan debugging,
 * property path tree, data profiling).
 *
 * Only the properties listed in the `serialization` schema are written to
 * JSON; properties wrapped in `optional(...)` are declared optional in that
 * schema.
 */
export class V1_DQExecuteInput {
clientVersion: string | undefined;
model!: V1_PureModelContext;
lambdaParameterValues: V1_ParameterValue[] = [];
packagePath!: string;
defectsLimit: number | undefined;
queryLimit: number | undefined;
// NOTE(review): this field has no entry in the schema below, so it is not
// part of the serialized payload — confirm whether that is intentional.
allValidationsChecked: boolean | undefined;
validationName: string | undefined;
runQuery: boolean | undefined;
static readonly serialization = new SerializationFactory(
createModelSchema(V1_DQExecuteInput, {
clientVersion: optional(primitive()),
model: V1_pureModelContextPropSchema,
lambdaParameterValues: customListWithSchema(V1_parameterValueModelSchema),
packagePath: primitive(),
defectsLimit: optional(primitive()),
queryLimit: optional(primitive()),
validationName: optional(primitive()),
runQuery: optional(primitive()),
}),
);
}
/**
 * Minimal request payload carrying only the client version, the model
 * context, the lambda parameter values and the path of the target element.
 * Instantiated in this file for the rule suggestion request and for the
 * data profiling export.
 */
export class V1_DQRuleSuggestionInput {
clientVersion: string | undefined;
model!: V1_PureModelContext;
lambdaParameterValues: V1_ParameterValue[] = [];
packagePath!: string;
static readonly serialization = new SerializationFactory(
createModelSchema(V1_DQRuleSuggestionInput, {
clientVersion: optional(primitive()),
model: V1_pureModelContextPropSchema,
lambdaParameterValues: customListWithSchema(V1_parameterValueModelSchema),
packagePath: primitive(),
}),
);
}
/**
 * Request payload for the reconciliation endpoint: a source lambda and a
 * target lambda together with the key columns, hash configuration, limits
 * and per-side lambda parameter values.
 *
 * Every declared property has a matching entry in the `serialization`
 * schema below.
 */
export class V1_DQReconciliationInput {
clientVersion: string | undefined;
model!: V1_PureModelContext;
source!: V1_RawLambda;
target!: V1_RawLambda;
keys: string[] = [];
colsForHash: string[] = [];
// Note: spelled `defectLimit` here, whereas V1_DQExecuteInput uses
// `defectsLimit`; both names are serialized as written.
defectLimit: number | undefined;
queryLimit: number | undefined;
aggregatedHash: boolean | undefined;
sourceHashCol: string | undefined;
targetHashCol: string | undefined;
includeColumnValues: boolean | undefined;
runSourceQuery: boolean | undefined;
runTargetQuery: boolean | undefined;
sourceLambdaParameterValues: V1_ParameterValue[] = [];
targetLambdaParameterValues: V1_ParameterValue[] = [];
static readonly serialization = new SerializationFactory(
createModelSchema(V1_DQReconciliationInput, {
clientVersion: optional(primitive()),
model: V1_pureModelContextPropSchema,
source: usingModelSchema(V1_rawLambdaModelSchema),
target: usingModelSchema(V1_rawLambdaModelSchema),
keys: list(primitive()),
colsForHash: list(primitive()),
defectLimit: optional(primitive()),
queryLimit: optional(primitive()),
aggregatedHash: optional(primitive()),
sourceHashCol: optional(primitive()),
targetHashCol: optional(primitive()),
includeColumnValues: optional(primitive()),
runSourceQuery: optional(primitive()),
runTargetQuery: optional(primitive()),
sourceLambdaParameterValues: customListWithSchema(
V1_parameterValueModelSchema,
),
targetLambdaParameterValues: customListWithSchema(
V1_parameterValueModelSchema,
),
}),
);
}
/**
 * V1 protocol implementation of the data quality graph manager extension.
 *
 * Every request in this class is posted through the engine server client of
 * a `V1_RemoteEngine`; when the graph manager's engine is of another type,
 * `guaranteeType` rejects the call with an "... is only supported by remote
 * engine" message.
 */
export class V1_DSL_Data_Quality_PureGraphManagerExtension extends DSL_DataQuality_PureGraphManagerExtension {
  declare graphManager: V1_PureGraphManager;

  /** Client version sent when the caller's options do not provide one. */
  static readonly DEV_PROTOCOL_VERSION = PureClientVersion.VX_X_X;

  constructor(graphManager: AbstractPureGraphManager) {
    super(graphManager);
    this.graphManager = guaranteeType(graphManager, V1_PureGraphManager);
  }

  getSupportedProtocolVersion(): string {
    return PureClientVersion.V1_0_0;
  }

  // ------------------------------ shared helpers ------------------------------

  /**
   * Returns the engine server client of the remote engine.
   *
   * @param operationName name used as the prefix of the error message raised
   * when the engine is not a `V1_RemoteEngine`
   */
  private getRemoteEngineServerClient(
    operationName: string,
  ): ReturnType<V1_RemoteEngine['getEngineServerClient']> {
    // TODO: improve abstraction so that we do not need to access the engine server client directly
    return guaranteeType(
      this.graphManager.engine,
      V1_RemoteEngine,
      `${operationName} is only supported by remote engine`,
    ).getEngineServerClient();
  }

  /**
   * Maps a caught value to the error that should be re-thrown: a
   * `NetworkClientError` has its payload deserialized as a
   * `V1_ExecutionError` and converted with `V1_buildExecutionError`; any
   * other error is returned untouched.
   */
  private toExecutionError(error: unknown): Error {
    assertErrorThrown(error);
    if (error instanceof NetworkClientError) {
      return V1_buildExecutionError(
        V1_ExecutionError.serialization.fromJson(
          error.payload as PlainObject<V1_ExecutionError>,
        ),
      );
    }
    return error;
  }

  /**
   * Builds an `ExecutionResult` from the text body of an engine response.
   * The text is first run through the engine's `parseExecutionResults`; if
   * that throws, the raw text itself is handed to the deserializer.
   */
  private buildExecutionResultFromText(resultInText: string): ExecutionResult {
    const rawExecutionResult =
      returnUndefOnError(() =>
        this.graphManager.engine.parseExecutionResults(resultInText, undefined),
      ) ?? resultInText;
    return V1_buildExecutionResult(
      V1_deserializeExecutionResult(rawExecutionResult),
    );
  }

  /**
   * Posts an execution-style request (compression enabled, optional
   * serialization format parameter, optional raw `Response` passthrough).
   *
   * @param operationName prefix for the "only supported by remote engine" error
   * @param traceActivity activity name passed to `getTraceData`
   * @param endpoint path appended after the client's `_pure()` base, e.g. `dataquality/execute`
   * @param input already-serialized request body
   * @param options `returnAsResponse` maps to `skipProcessing`;
   * `serializationFormat`, when set, is converted with
   * `V1_getEngineSerializationFormat` and sent as a request parameter
   */
  private postExecutionRequest(
    operationName: string,
    traceActivity: string,
    endpoint: string,
    input: PlainObject,
    options?: {
      returnAsResponse?: boolean;
      serializationFormat?: EXECUTION_SERIALIZATION_FORMAT | undefined;
    },
  ): Promise<PlainObject<V1_ExecutionResult> | Response> {
    const engineServerClient = this.getRemoteEngineServerClient(operationName);
    return engineServerClient.postWithTracing(
      engineServerClient.getTraceData(traceActivity),
      `${engineServerClient._pure()}/${endpoint}`,
      input,
      {},
      undefined,
      {
        serializationFormat: options?.serializationFormat
          ? V1_getEngineSerializationFormat(options.serializationFormat)
          : undefined,
      },
      { enableCompression: true },
      {
        skipProcessing: Boolean(options?.returnAsResponse),
      },
    );
  }

  /**
   * Builds a model context pointer for a graph whose origin is a
   * `LegendSDLC` project version.
   *
   * @throws UnsupportedOperationError for any other kind of origin
   */
  private buildPureModelSDLCPointer(
    origin: GraphDataOrigin,
    clientVersion: string | undefined,
  ): V1_PureModelContextPointer {
    if (origin instanceof LegendSDLC) {
      return new V1_PureModelContextPointer(
        clientVersion
          ? new V1_Protocol(
              V1_PureGraphManager.PURE_PROTOCOL_NAME,
              clientVersion,
            )
          : undefined,
        new V1_LegendSDLC(origin.groupId, origin.artifactId, origin.versionId),
      );
    }
    throw new UnsupportedOperationError('Unsupported graph origin');
  }

  // ------------------------------ validation ------------------------------

  /** Posts a serialized execute input to `dataquality/execute`. */
  private executeValidation = (
    input: PlainObject<V1_DQExecuteInput>,
    options?: {
      returnAsResponse?: boolean;
      serializationFormat?: EXECUTION_SERIALIZATION_FORMAT | undefined;
    },
  ): Promise<PlainObject<V1_ExecutionResult> | Response> =>
    this.postExecutionRequest(
      'executeValidation',
      DQ_EXECUTE_PLAN,
      'dataquality/execute',
      input,
      options,
    );

  /** Runs the validation and returns the response body as text. */
  private async runValidationAndReturnString(
    input: V1_DQExecuteInput,
  ): Promise<string> {
    return (
      (await this.executeValidation(
        V1_DQExecuteInput.serialization.toJson(input),
        {
          returnAsResponse: true,
        },
      )) as Response
    ).text();
  }

  /**
   * Runs the validation and returns the unprocessed `Response`, requesting
   * the serialization format given in `options` (if any).
   */
  private async export(
    input: V1_DQExecuteInput,
    options?: ExecutionOptions,
  ): Promise<Response> {
    try {
      return guaranteeNonNullable(
        (await this.executeValidation(
          V1_DQExecuteInput.serialization.toJson(input),
          {
            serializationFormat: options?.serializationFormat,
            returnAsResponse: true,
          },
        )) as Response,
      );
    } catch (error) {
      throw this.toExecutionError(error);
    }
  }

  /**
   * Populates `dqExecuteInput` in place from the graph and options, and
   * returns the same instance.
   *
   * - the model is an SDLC pointer when the graph has an origin, the full
   *   graph model data otherwise;
   * - `queryLimit` is only set when `options.runQuery` is truthy;
   * - `validationName` is only set when `options.allValidationsChecked` is
   *   falsy.
   */
  createExecutionInput(
    graph: PureModel,
    packagePath: string,
    dqExecuteInput: V1_DQExecuteInput,
    options: DQExecuteInputOptions,
  ): V1_DQExecuteInput {
    dqExecuteInput.clientVersion =
      options.clientVersion ??
      V1_DSL_Data_Quality_PureGraphManagerExtension.DEV_PROTOCOL_VERSION;
    dqExecuteInput.model = graph.origin
      ? this.buildPureModelSDLCPointer(graph.origin, undefined)
      : this.graphManager.getFullGraphModelData(graph);
    dqExecuteInput.lambdaParameterValues = options.lambdaParameterValues
      ? options.lambdaParameterValues.map(V1_transformParameterValue)
      : [];
    dqExecuteInput.packagePath = packagePath;
    dqExecuteInput.defectsLimit = options.previewLimit;
    dqExecuteInput.runQuery = options.runQuery;
    if (options.runQuery) {
      dqExecuteInput.queryLimit = options.queryLimit;
    }
    if (!options.allValidationsChecked) {
      dqExecuteInput.validationName = options.validationName;
    }
    return dqExecuteInput;
  }

  /** Posts the execute input to `dataquality/generatePlan` and returns the plan. */
  generatePlan = async (
    graph: PureModel,
    packagePath: string,
    options: DQExecuteInputOptions,
  ): Promise<RawExecutionPlan> => {
    const input = this.createExecutionInput(
      graph,
      packagePath,
      new V1_DQExecuteInput(),
      options,
    );
    const serializedInput = V1_DQExecuteInput.serialization.toJson(input);
    const engineServerClient = this.getRemoteEngineServerClient('generatePlan');
    return engineServerClient.postWithTracing(
      engineServerClient.getTraceData(DQ_GENERATE_EXECUTION_PLAN),
      `${engineServerClient._pure()}/dataquality/generatePlan`,
      serializedInput,
      {},
      undefined,
      undefined,
      { enableCompression: true },
    );
  };

  /**
   * Runs the validation and builds an `ExecutionResult` from the response
   * text. Network errors are re-thrown as execution errors.
   */
  execute = async (
    graph: PureModel,
    packagePath: string,
    options: DQExecuteInputOptions,
  ): Promise<ExecutionResult> => {
    const input = this.createExecutionInput(
      graph,
      packagePath,
      new V1_DQExecuteInput(),
      options,
    );
    try {
      const validationResultInText =
        await this.runValidationAndReturnString(input);
      return this.buildExecutionResultFromText(validationResultInText);
    } catch (error) {
      throw this.toExecutionError(error);
    }
  };

  /** Runs the validation and returns the unprocessed `Response` for export. */
  exportData = async (
    graph: PureModel,
    packagePath: string,
    options: DQExecuteInputOptions,
  ): Promise<Response> => {
    const input = this.createExecutionInput(
      graph,
      packagePath,
      new V1_DQExecuteInput(),
      options,
    );
    return this.export(input, options);
  };

  /**
   * Posts the execute input to `dataquality/debugPlan`; the `debug` lines of
   * the response are joined with newlines into a single string.
   */
  debugExecutionPlanGeneration = async (
    graph: PureModel,
    packagePath: string,
    options: DQExecuteInputOptions,
  ): Promise<{ plan: RawExecutionPlan; debug: string }> => {
    const input = this.createExecutionInput(
      graph,
      packagePath,
      new V1_DQExecuteInput(),
      options,
    );
    const serializedInput = V1_DQExecuteInput.serialization.toJson(input);
    const engineServerClient = this.getRemoteEngineServerClient(
      'debugExecutionPlanGeneration',
    );
    const result: { plan: RawExecutionPlan; debug: string[] } =
      await engineServerClient.postWithTracing(
        engineServerClient.getTraceData(DQ_DEBUG_EXECUTION_PLAN),
        `${engineServerClient._pure()}/dataquality/debugPlan`,
        serializedInput,
        {},
        undefined,
        undefined,
        { enableCompression: true },
      );
    return {
      plan: result.plan,
      debug: result.debug.join('\n'),
    };
  };

  /**
   * Fetches the property path tree from `dataquality/propertyPathTree`,
   * converts it to a data quality graph fetch tree and builds it against
   * `graph`.
   */
  fetchStructuralValidations = async (
    graph: PureModel,
    packagePath: string,
    options: DQExecuteInputOptions,
  ): Promise<RootGraphFetchTree> => {
    // resolve the client first so an unsupported engine is reported before
    // any input is built
    const engineServerClient = this.getRemoteEngineServerClient(
      'fetchStructuralValidations',
    );
    const input = this.createExecutionInput(
      graph,
      packagePath,
      new V1_DQExecuteInput(),
      options,
    );
    const serializedInput = V1_DQExecuteInput.serialization.toJson(input);
    const V1_rootGraphFetchTree: V1_RootGraphFetchTree =
      await engineServerClient.postWithTracing(
        engineServerClient.getTraceData(DQ_FETCH_PROPERTY_PATH_TREE),
        `${engineServerClient._pure()}/dataquality/propertyPathTree`,
        serializedInput,
        {},
        undefined,
        undefined,
        { enableCompression: true },
      );
    const V1_dataQualityRootGraphFetchTree =
      V1_transformRootGraphFetchTreeToDataQualityRootGraphFetchTree(
        V1_rootGraphFetchTree,
      );
    const context = new V1_GraphBuilderContextBuilder(
      graph,
      graph,
      this.graphManager.graphBuilderExtensions,
      this.graphManager.logService,
    ).build();
    return V1_buildDataQualityGraphFetchTree(
      V1_dataQualityRootGraphFetchTree,
      context,
      undefined,
      [],
      new V1_ProcessingContext(''),
      true,
    ) as DataQualityRootGraphFetchTree;
  };

  // ------------------------------ data profiling ------------------------------

  /** Posts a serialized input to `dataquality/profile`. */
  private executeDataProfiling = (
    input: PlainObject<V1_DQExecuteInput>,
    options?: {
      returnAsResponse?: boolean;
      serializationFormat?: EXECUTION_SERIALIZATION_FORMAT | undefined;
    },
  ): Promise<PlainObject<V1_ExecutionResult> | Response> =>
    this.postExecutionRequest(
      'executeDataProfiling',
      DQ_EXECUTE_DATA_PROFILING,
      'dataquality/profile',
      input,
      options,
    );

  /**
   * Runs data profiling and builds an `ExecutionResult` from the response
   * text. Network errors are re-thrown as execution errors.
   */
  runDataProfiling = async (
    graph: PureModel,
    packagePath: string,
    options: DQExecuteInputOptions,
  ): Promise<ExecutionResult> => {
    const input = this.createExecutionInput(
      graph,
      packagePath,
      new V1_DQExecuteInput(),
      options,
    );
    try {
      const profilingResult = (await this.executeDataProfiling(
        V1_DQExecuteInput.serialization.toJson(input),
        {
          returnAsResponse: true,
        },
      )) as Response;
      const profilingResultInText = await profilingResult.text();
      return this.buildExecutionResultFromText(profilingResultInText);
    } catch (error) {
      throw this.toExecutionError(error);
    }
  };

  /**
   * Runs data profiling and returns the unprocessed `Response`, requesting
   * `options.serializationFormat`. Network errors are re-thrown as execution
   * errors.
   */
  exportDataProfiling = async (
    graph: PureModel,
    packagePath: string,
    options: DQExecuteInputOptions,
  ): Promise<Response> => {
    const input = new V1_DQRuleSuggestionInput();
    input.packagePath = packagePath;
    input.clientVersion =
      options.clientVersion ??
      V1_DSL_Data_Quality_PureGraphManagerExtension.DEV_PROTOCOL_VERSION;
    input.model = graph.origin
      ? this.buildPureModelSDLCPointer(graph.origin, undefined)
      : this.graphManager.getFullGraphModelData(graph);
    input.lambdaParameterValues = options.lambdaParameterValues
      ? options.lambdaParameterValues.map(V1_transformParameterValue)
      : [];
    try {
      return guaranteeNonNullable(
        (await this.executeDataProfiling(
          // serialize with the schema of the class that was actually built
          V1_DQRuleSuggestionInput.serialization.toJson(input),
          {
            serializationFormat: options.serializationFormat,
            returnAsResponse: true,
          },
        )) as Response,
      );
    } catch (error) {
      throw this.toExecutionError(error);
    }
  };

  // ------------------------------ rule suggestions ------------------------------

  /** Posts a rule suggestion input to `dataquality/ruleSuggestions`. */
  fetchValidationSuggestions = async (
    graph: PureModel,
    packagePath: string,
    options: DQValidationSuggestionInputOptions,
  ): Promise<DataQualityRelationValidation> => {
    const input = new V1_DQRuleSuggestionInput();
    input.packagePath = packagePath;
    input.clientVersion =
      options.clientVersion ??
      V1_DSL_Data_Quality_PureGraphManagerExtension.DEV_PROTOCOL_VERSION;
    input.model = graph.origin
      ? this.buildPureModelSDLCPointer(graph.origin, undefined)
      : this.graphManager.getFullGraphModelData(graph);
    input.lambdaParameterValues = options.lambdaParameterValues
      ? options.lambdaParameterValues.map(V1_transformParameterValue)
      : [];
    const engineServerClient = this.getRemoteEngineServerClient(
      'fetchValidationSuggestions',
    );
    return engineServerClient.postWithTracing(
      engineServerClient.getTraceData(DQ_FETCH_RULE_SUGGESTIONS),
      `${engineServerClient._pure()}/dataquality/ruleSuggestions`,
      // serialize with the schema of the class that was actually built
      V1_DQRuleSuggestionInput.serialization.toJson(input),
      {},
      undefined,
      {},
      { enableCompression: true },
      {},
    );
  };

  // ------------------------------ reconciliation ------------------------------

  /** Transforms a metamodel raw lambda into its V1 protocol form. */
  private rawLambdaToV1(lambda: RawLambda): V1_RawLambda {
    return V1_transformRawLambda(
      lambda,
      new V1_GraphTransformerContextBuilder(
        this.graphManager.pluginManager.getPureProtocolProcessorPlugins(),
      ).build(),
    );
  }

  /**
   * Builds the reconciliation request from the graph and options.
   *
   * `options.limit` is sent as `queryLimit` when either the source or the
   * target query is being run on its own, and as `defectLimit` otherwise.
   */
  private createReconciliationInput(
    graph: PureModel,
    options: DQReconciliationInputOptions,
  ): V1_DQReconciliationInput {
    const input = new V1_DQReconciliationInput();
    input.clientVersion =
      options.clientVersion ??
      V1_DSL_Data_Quality_PureGraphManagerExtension.DEV_PROTOCOL_VERSION;
    input.model = graph.origin
      ? this.buildPureModelSDLCPointer(graph.origin, undefined)
      : this.graphManager.getFullGraphModelData(graph);
    // Logical OR, not `??`: an explicit `runSourceQuery: false` must not
    // hide `runTargetQuery: true`.
    const runningSourceOrTargetQuery =
      Boolean(options.runSourceQuery) || Boolean(options.runTargetQuery);
    input.source = this.rawLambdaToV1(options.source);
    input.target = this.rawLambdaToV1(options.target);
    input.keys = options.keys;
    input.colsForHash = options.colsForHash;
    input.aggregatedHash = options.aggregatedHash;
    input.sourceHashCol = options.sourceHashCol;
    input.targetHashCol = options.targetHashCol;
    input.includeColumnValues = options.includeColumnValues;
    input.runSourceQuery = options.runSourceQuery;
    input.runTargetQuery = options.runTargetQuery;
    if (runningSourceOrTargetQuery) {
      input.queryLimit = options.limit;
    } else {
      input.defectLimit = options.limit;
    }
    if (options.sourceLambdaParameterValues) {
      input.sourceLambdaParameterValues =
        options.sourceLambdaParameterValues.map(V1_transformParameterValue);
    }
    if (options.targetLambdaParameterValues) {
      input.targetLambdaParameterValues =
        options.targetLambdaParameterValues.map(V1_transformParameterValue);
    }
    return input;
  }

  /** Runs reconciliation with the options exactly as given. */
  runReconciliation = async (
    graph: PureModel,
    options: DQReconciliationInputOptions,
  ): Promise<ExecutionResult> => {
    const input = this.createReconciliationInput(graph, options);
    return this.runReconciliationWithInput(input);
  };

  /** Runs reconciliation with `runSourceQuery` forced on and `runTargetQuery` cleared. */
  runReconciliationSourceQuery = async (
    graph: PureModel,
    options: DQReconciliationInputOptions,
  ): Promise<ExecutionResult> => {
    const input = this.createReconciliationInput(graph, {
      ...options,
      runSourceQuery: true,
      runTargetQuery: undefined,
    });
    return this.runReconciliationWithInput(input);
  };

  /** Runs reconciliation with `runTargetQuery` forced on and `runSourceQuery` cleared. */
  runReconciliationTargetQuery = async (
    graph: PureModel,
    options: DQReconciliationInputOptions,
  ): Promise<ExecutionResult> => {
    const input = this.createReconciliationInput(graph, {
      ...options,
      runSourceQuery: undefined,
      runTargetQuery: true,
    });
    return this.runReconciliationWithInput(input);
  };

  /**
   * Posts the reconciliation input to `dataquality/reconciliation` and
   * builds an `ExecutionResult` from the response text. Network errors are
   * re-thrown as execution errors.
   */
  private async runReconciliationWithInput(
    input: V1_DQReconciliationInput,
  ): Promise<ExecutionResult> {
    try {
      const engineServerClient =
        this.getRemoteEngineServerClient('runReconciliation');
      const result = await engineServerClient.postWithTracing(
        engineServerClient.getTraceData(DQ_EXECUTE_RECONCILIATION),
        `${engineServerClient._pure()}/dataquality/reconciliation`,
        V1_DQReconciliationInput.serialization.toJson(input),
        {},
        undefined,
        undefined,
        { enableCompression: true },
        { skipProcessing: true },
      );
      const resultInText = await (result as Response).text();
      return this.buildExecutionResultFromText(resultInText);
    } catch (error) {
      throw this.toExecutionError(error);
    }
  }
}