Skip to content

Data maskers

Module with data masking transformers.

DataMaskers

Bases: object

Class containing data masking transformers.

Source code in mkdocs/lakehouse_engine/packages/transformers/data_maskers.py
class DataMaskers(object):
    """Class containing data masking transformers."""

    _logger = LoggingHandler(__name__).get_logger()

    @classmethod
    def hash_masker(
        cls,
        cols: List[str],
        approach: str = "SHA",
        num_bits: int = 256,
        suffix: str = "_hash",
    ) -> Callable:
        """Mask specific columns using a hashing approach.

        Args:
            cols: list of column names to mask.
            approach: hashing approach. Defaults to 'SHA'. There's "MURMUR3" as well.
            num_bits: number of bits of the SHA approach. Only applies to SHA approach.
            suffix: suffix to apply to new column name. Defaults to "_hash".
                Note: you can pass an empty suffix to have the original column replaced.

        Returns:
            A function to be called in .transform() spark function.

        {{get_example(method_name='hash_masker')}}
        """

        def inner(df: DataFrame) -> DataFrame:
            result_df = df
            for column_name in cols:
                # Choose the masking expression for the configured approach;
                # an unknown approach fails fast on the first column.
                if approach == "SHA":
                    masked_col = sha2(column_name, num_bits)
                elif approach == "MURMUR3":
                    masked_col = hash(column_name)
                else:
                    raise WrongArgumentsException("Hashing approach is not supported.")
                result_df = result_df.withColumn(column_name + suffix, masked_col)

            return result_df

        return inner

    @classmethod
    def column_dropper(cls, cols: List[str]) -> Callable:
        """Drop specific columns.

        Args:
            cols: list of column names to drop.

        Returns:
            A function to be called in .transform() spark function.

        {{get_example(method_name='column_dropper')}}
        """

        def inner(df: DataFrame) -> DataFrame:
            # Drop each requested column, one at a time, from the dataframe.
            remaining_df = df
            for column_name in cols:
                remaining_df = remaining_df.drop(column_name)

            return remaining_df

        return inner

column_dropper(cols) classmethod

Drop specific columns.

Parameters:

Name Type Description Default
cols List[str]

list of column names to drop.

required

Returns:

Type Description
Callable

A function to be called in .transform() spark function.

View Example of column_dropper (See full example here)
{
    "function": "column_dropper",
    "args": {
        "cols": [
            "customer",
            "article"
        ]
    }
}
Source code in mkdocs/lakehouse_engine/packages/transformers/data_maskers.py
@classmethod
def column_dropper(cls, cols: List[str]) -> Callable:
    """Drop specific columns.

    Args:
        cols: list of column names to drop.

    Returns:
        A function to be called in .transform() spark function.

    {{get_example(method_name='column_dropper')}}
    """

    def inner(df: DataFrame) -> DataFrame:
        # Drop each requested column, one at a time, from the dataframe.
        remaining_df = df
        for column_name in cols:
            remaining_df = remaining_df.drop(column_name)

        return remaining_df

    return inner

hash_masker(cols, approach='SHA', num_bits=256, suffix='_hash') classmethod

Mask specific columns using a hashing approach.

Parameters:

Name Type Description Default
cols List[str]

list of column names to mask.

required
approach str

hashing approach. Defaults to 'SHA'. There's "MURMUR3" as well.

'SHA'
num_bits int

number of bits of the SHA approach. Only applies to SHA approach.

256
suffix str

suffix to apply to new column name. Defaults to "_hash". Note: you can pass an empty suffix to have the original column replaced.

'_hash'

Returns:

Type Description
Callable

A function to be called in .transform() spark function.

View Example of hash_masker (See full example here)
{
    "function": "hash_masker",
    "args": {
        "cols": [
            "customer",
            "article"
        ]
    }
}
Source code in mkdocs/lakehouse_engine/packages/transformers/data_maskers.py
@classmethod
def hash_masker(
    cls,
    cols: List[str],
    approach: str = "SHA",
    num_bits: int = 256,
    suffix: str = "_hash",
) -> Callable:
    """Mask specific columns using a hashing approach.

    Args:
        cols: list of column names to mask.
        approach: hashing approach. Defaults to 'SHA'. There's "MURMUR3" as well.
        num_bits: number of bits of the SHA approach. Only applies to SHA approach.
        suffix: suffix to apply to new column name. Defaults to "_hash".
            Note: you can pass an empty suffix to have the original column replaced.

    Returns:
        A function to be called in .transform() spark function.

    {{get_example(method_name='hash_masker')}}
    """

    def inner(df: DataFrame) -> DataFrame:
        result_df = df
        for column_name in cols:
            # Choose the masking expression for the configured approach;
            # an unknown approach fails fast on the first column.
            if approach == "SHA":
                masked_col = sha2(column_name, num_bits)
            elif approach == "MURMUR3":
                masked_col = hash(column_name)
            else:
                raise WrongArgumentsException("Hashing approach is not supported.")
            result_df = result_df.withColumn(column_name + suffix, masked_col)

        return result_df

    return inner