Skip to content

Unions

Module with union transformers.

Unions

Bases: object

Class containing union transformers.

Source code in mkdocs/lakehouse_engine/packages/transformers/unions.py
class Unions:
    """Collection of transformers that stack dataframes on top of each other."""

    _logger = LoggingHandler(__name__).get_logger()

    @classmethod
    def union(
        cls,
        union_with: List[DataFrame],
        deduplication: bool = True,
    ) -> Callable:
        """Build a transformer that unions dataframes by column position.

        Columns are resolved by position (not by name). The incoming dataframe
        comes first, followed by each entry of `union_with` in order.

        Args:
            union_with: list of dataframes to union with the incoming one.
            deduplication: when True, `.distinct()` is applied to the unioned
                result; when False the result is returned as is.

        Returns:
            A function to be called in .transform() spark function.

        {{get_example(method_name='union')}}
        """

        def _stack(left: DataFrame, right: DataFrame) -> DataFrame:
            return left.union(right)

        def inner(df: DataFrame) -> DataFrame:
            # The incoming dataframe seeds the fold, so an empty `union_with`
            # simply yields `df` back.
            frames = [df] + union_with
            combined = reduce(_stack, frames)

            if deduplication:
                return combined.distinct()
            return combined

        return inner

    @classmethod
    def union_by_name(
        cls,
        union_with: List[DataFrame],
        deduplication: bool = True,
        allow_missing_columns: bool = True,
    ) -> Callable:
        """Build a transformer that unions dataframes by column name.

        Columns are resolved by name (not by position). The incoming dataframe
        comes first, followed by each entry of `union_with` in order.

        Args:
            union_with: list of dataframes to union with the incoming one.
            deduplication: when True, `.distinct()` is applied to the unioned
                result; when False the result is returned as is.
            allow_missing_columns: forwarded as `allowMissingColumns` to every
                `unionByName` call, to allow the union of DataFrames with
                different schemas.

        Returns:
            A function to be called in .transform() spark function.

        {{get_example(method_name='union_by_name')}}
        """

        def _merge(left: DataFrame, right: DataFrame) -> DataFrame:
            return left.unionByName(right, allowMissingColumns=allow_missing_columns)

        def inner(df: DataFrame) -> DataFrame:
            # The incoming dataframe seeds the fold, so an empty `union_with`
            # simply yields `df` back.
            frames = [df] + union_with
            combined = reduce(_merge, frames)

            if deduplication:
                return combined.distinct()
            return combined

        return inner

union(union_with, deduplication=True) classmethod

Union dataframes, resolving columns by position (not by name).

Parameters:

Name Type Description Default
union_with List[DataFrame]

list of dataframes to union.

required
deduplication bool

whether to perform deduplication of elements or not.

True

Returns:

Type Description
Callable

A function to be passed to Spark's DataFrame.transform().

Source code in mkdocs/lakehouse_engine/packages/transformers/unions.py
@classmethod
def union(
    cls,
    union_with: List[DataFrame],
    deduplication: bool = True,
) -> Callable:
    """Build a transformer that unions dataframes by column position.

    Columns are resolved by position (not by name). The incoming dataframe
    comes first, followed by each entry of `union_with` in order.

    Args:
        union_with: list of dataframes to union with the incoming one.
        deduplication: when True, `.distinct()` is applied to the unioned
            result; when False the result is returned as is.

    Returns:
        A function to be called in .transform() spark function.

    {{get_example(method_name='union')}}
    """

    def _stack(left: DataFrame, right: DataFrame) -> DataFrame:
        return left.union(right)

    def inner(df: DataFrame) -> DataFrame:
        # The incoming dataframe seeds the fold, so an empty `union_with`
        # simply yields `df` back.
        frames = [df] + union_with
        combined = reduce(_stack, frames)

        if deduplication:
            return combined.distinct()
        return combined

    return inner

union_by_name(union_with, deduplication=True, allow_missing_columns=True) classmethod

Union dataframes, resolving columns by name (not by position).

Parameters:

Name Type Description Default
union_with List[DataFrame]

list of dataframes to union.

required
deduplication bool

whether to perform deduplication of elements or not.

True
allow_missing_columns bool

allow the union of DataFrames with different schemas.

True

Returns:

Type Description
Callable

A function to be passed to Spark's DataFrame.transform().

Source code in mkdocs/lakehouse_engine/packages/transformers/unions.py
@classmethod
def union_by_name(
    cls,
    union_with: List[DataFrame],
    deduplication: bool = True,
    allow_missing_columns: bool = True,
) -> Callable:
    """Build a transformer that unions dataframes by column name.

    Columns are resolved by name (not by position). The incoming dataframe
    comes first, followed by each entry of `union_with` in order.

    Args:
        union_with: list of dataframes to union with the incoming one.
        deduplication: when True, `.distinct()` is applied to the unioned
            result; when False the result is returned as is.
        allow_missing_columns: forwarded as `allowMissingColumns` to every
            `unionByName` call, to allow the union of DataFrames with
            different schemas.

    Returns:
        A function to be called in .transform() spark function.

    {{get_example(method_name='union_by_name')}}
    """

    def _merge(left: DataFrame, right: DataFrame) -> DataFrame:
        return left.unionByName(right, allowMissingColumns=allow_missing_columns)

    def inner(df: DataFrame) -> DataFrame:
        # The incoming dataframe seeds the fold, so an empty `union_with`
        # simply yields `df` back.
        frames = [df] + union_with
        combined = reduce(_merge, frames)

        if deduplication:
            return combined.distinct()
        return combined

    return inner