Skip to content

Expect multicolumn column a must equal b or c

Expectation to check if column 'a' equals 'b' or 'c'.

ExpectMulticolumnColumnAMustEqualBOrC

Bases: MulticolumnMapExpectation

Expect that column 'a' is equal to column 'b' when 'b' is not empty; otherwise 'a' must be equal to column 'c'.

Parameters:

Name Type Description Default
column_list

The column names to evaluate.

required

Other Parameters:

Name Type Description
ignore_row_if

Defaults to "never".

result_format

Which output mode to use: BOOLEAN_ONLY, BASIC, COMPLETE, or SUMMARY. Default set to BASIC.

include_config

If True, then include the expectation config as part of the result object. Default set to True.

catch_exceptions

If True, then catch exceptions and include them as part of the result object. Default set to False.

Returns:

Type Description

An ExpectationSuiteValidationResult.

Source code in mkdocs/lakehouse_engine/packages/dq_processors/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c.py
class ExpectMulticolumnColumnAMustEqualBOrC(MulticolumnMapExpectation):
    """Expect that the column 'a' is equal to 'b' when this is not empty; otherwise 'a' must be equal to 'c'.

    The roles 'a', 'b' and 'c' are positional: the metric registered under
    `map_metric` reads them as the first, second and third entries of
    `column_list`, whatever the columns are actually called.

    Args:
        column_list: The column names to evaluate, in the order
            ['a', 'b', 'c'].

    Keyword Args:
        ignore_row_if: One of "all_values_are_missing",
            "any_value_is_missing" or "never". Default set to "never".
        result_format:  Which output mode to use:
            `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
            Default set to `BASIC`.
        include_config: If True, then include the expectation
            config as part of the result object.
            Default set to True.
        catch_exceptions: If True, then catch exceptions
            and include them as part of the result object.
            Default set to False.
        mostly: Listed in `success_keys`. Default set to 1.0.
            Presumably the minimum fraction of rows that must pass, as in
            other Great Expectations map expectations - TODO confirm.
        validation_regex_c: Regex that the values of column 'c' are matched
            against by the metric when 'a' is compared with 'c'.
            Default set to None.

    Returns:
        An ExpectationSuiteValidationResult.
    """  # noqa: E501

    # Expectation parameters and their defaults.
    ignore_row_if: Literal[
        "all_values_are_missing", "any_value_is_missing", "never"
    ] = "never"
    result_format: dict = {"result_format": "BASIC"}
    include_config: bool = True
    catch_exceptions: bool = False
    mostly: float = 1.0
    column_list: Any = None
    # NOTE(review): "validation_regex_b" is listed in `success_keys` below and
    # is read by the metric, but no matching attribute is declared here -
    # confirm whether that omission is intentional.
    validation_regex_c: Any = None

    # Example data and test cases for this expectation (the `_validate`
    # docstring below refers to these tests).
    examples = [
        {
            "dataset_name": "Test Dataset",
            "data": [
                {
                    "data": {
                        # Column "a" follows the rule in every row: it equals
                        # "b" where "b" is set and equals "c" where "b" is None.
                        "a": ["d001", "1000", "1001"],
                        "b": [None, "1000", "1001"],
                        "c": ["d001", "d002", "d002"],
                        # Column "d" breaks the rule in the second row:
                        # "b" is set ("1000") but "d" holds "d002".
                        "d": ["d001", "d002", "1001"],
                    },
                    "schemas": {
                        "spark": {
                            "a": "StringType",
                            "b": "StringType",
                            "c": "StringType",
                            "d": "StringType",
                        }
                    },
                }
            ],
            "tests": [
                {
                    # "d" plays the role of 'a' here, so the second row is
                    # expected to be reported as unexpected.
                    "title": "negative_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column_list": ["d", "b", "c"],
                        "validation_regex_c": "d[0-9]{3}$",
                        "result_format": {
                            "result_format": "COMPLETE",
                            "unexpected_index_column_names": ["d", "b", "c"],
                        },
                    },
                    "out": {
                        "success": False,
                        "unexpected_index_list": [
                            {
                                "d": "d002",
                                "b": "1000",
                                "c": "d002",
                            }
                        ],
                    },
                },
                {
                    # Every row of ("a", "b", "c") follows the rule.
                    "title": "positive_test",
                    "exact_match_out": False,
                    "include_in_gallery": True,
                    "in": {
                        "column_list": ["a", "b", "c"],
                        "validation_regex_c": "d[0-9]{3}$",
                        "result_format": {
                            "result_format": "COMPLETE",
                            "unexpected_index_column_names": ["a", "b", "c"],
                        },
                    },
                    "out": {"success": True},
                },
            ],
        },
    ]

    # Must match `condition_metric_name` of the metric provider that
    # implements the row-level condition.
    map_metric = "multicolumn_values.column_a_must_equal_b_or_c"
    success_keys = (
        "validation_regex_b",
        "validation_regex_c",
        "mostly",
    )  # type: ignore

    def _validate(
        self,
        metrics: Dict,
        runtime_configuration: Optional[dict] = None,
        execution_engine: Optional[ExecutionEngine] = None,
    ) -> Any:
        """Custom implementation of the GE _validate method.

        This method is used on the tests to validate both the result
        of the tests themselves and if the unexpected index list
        is correctly generated.
        The GE test logic does not do this validation, and thus
        we need to make it manually.

        Args:
            metrics: Test result metrics.
            runtime_configuration: Configuration used when running the expectation.
            execution_engine: Execution Engine where the expectation was run.

        Returns:
            Dictionary with the result of the validation.
        """
        # `validate_result` is defined outside this class; its return value is
        # not used here.
        validate_result(
            self,
            metrics,
        )

        # The returned result is the one produced by the parent implementation.
        return super()._validate(metrics, runtime_configuration, execution_engine)

MulticolumnCustomMetric

Bases: MulticolumnMapMetricProvider

Expectation metric definition.

This expectation asserts that column 'a' must be equal to column 'b' or column 'c'. In addition, it is possible to validate that column 'b' or 'c' matches a regex.

Source code in mkdocs/lakehouse_engine/packages/dq_processors/custom_expectations/expect_multicolumn_column_a_must_equal_b_or_c.py
class MulticolumnCustomMetric(MulticolumnMapMetricProvider):
    """Expectation metric definition.

    This expectation asserts that column 'a' must be equal to column 'b' or
    column 'c'. In addition, it is possible to validate that column 'b' or 'c'
    matches a regex.
    """

    condition_metric_name = "multicolumn_values.column_a_must_equal_b_or_c"
    condition_domain_keys = (
        "batch_id",
        "table",
        "column_list",
        "ignore_row_if",
    )

    # Optional regexes applied to columns 'b' and 'c' respectively.
    condition_value_keys = ("validation_regex_b", "validation_regex_c")

    @multicolumn_condition_partial(engine=SparkDFExecutionEngine)
    def _spark(
        self: MulticolumnMapMetricProvider, column_list: list, **kwargs: dict
    ) -> Any:
        """Build the Spark row-level condition for this expectation.

        A row satisfies the condition when 'a' is not null and either:

        - 'b' is not null, 'b' matches `validation_regex_b` and 'a' == 'b'; or
        - 'b' is null, 'c' matches `validation_regex_c` and 'a' == 'c'.

        Args:
            column_list: The columns to evaluate, read positionally:
                index 0 is 'a', index 1 is 'b' and index 2 is 'c'.
            **kwargs: May carry `validation_regex_b` and/or
                `validation_regex_c`. A regex that is missing or None falls
                back to ".*", i.e. no additional restriction.

        Returns:
            The combined Spark column expression described above.
        """
        # Treat an explicit None exactly like a missing key: handing None to
        # `rlike` yields a null condition, which would reject valid rows.
        validation_regex_b = kwargs.get("validation_regex_b")
        if validation_regex_b is None:
            validation_regex_b = ".*"

        validation_regex_c = kwargs.get("validation_regex_c")
        if validation_regex_c is None:
            validation_regex_c = ".*"

        # Index access (rather than unpacking) is kept on purpose, as the
        # original only relies on `column_list` supporting `[i]`.
        col_a = column_list[0]
        col_b = column_list[1]
        col_c = column_list[2]

        # 'b' is populated: 'a' has to mirror it.
        equals_b = (
            col_b.isNotNull() & (col_b.rlike(validation_regex_b)) & (col_a == col_b)
        )
        # 'b' is empty: 'a' has to mirror 'c' instead.
        equals_c = (
            (col_b.isNull()) & (col_c.rlike(validation_regex_c)) & (col_a == col_c)
        )

        return (col_a.isNotNull()) & (equals_b | equals_c)