Skip to content

Aggregators

Aggregators module.

Aggregators

Bases: object

Class containing all aggregation functions.

Source code in mkdocs/lakehouse_engine/packages/transformers/aggregators.py
class Aggregators(object):
    """Class containing all aggregation functions."""

    _logger = LoggingHandler(__name__).get_logger()

    @staticmethod
    def get_max_value(input_col: str, output_col: str = "latest") -> Callable:
        """Get the maximum value of a given column of a dataframe.

        The returned function aggregates the whole dataframe, producing the
        maximum of `input_col` under the column name `output_col`.

        Args:
            input_col: name of the input column.
            output_col: name of the output column (defaults to "latest").

        Returns:
            A function to be executed in the .transform() spark function.

        {{get_example(method_name='get_max_value')}}
        """

        def inner(df: DataFrame) -> DataFrame:
            # Aggregate directly on the column expression. Projecting the
            # column first would rename a nested field such as "a.b" to "b",
            # so resolving `input_col` by name a second time would fail.
            # NOTE(review): `max` is presumably pyspark.sql.functions.max
            # (imported at module level), not the builtin - confirm.
            return df.agg(max(col(input_col)).alias(output_col))

        return inner

get_max_value(input_col, output_col='latest') staticmethod

Get the maximum value of a given column of a dataframe.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| input_col | str | name of the input column. | required |
| output_col | str | name of the output column (defaults to "latest"). | 'latest' |

Returns:

| Type | Description |
| --- | --- |
| Callable | A function to be executed in the .transform() spark function. |

View Example of get_max_value (See full example here)
{
    "function": "get_max_value",
    "args": {
        "input_col": "extraction_date"
    }
}
Source code in mkdocs/lakehouse_engine/packages/transformers/aggregators.py
@staticmethod
def get_max_value(input_col: str, output_col: str = "latest") -> Callable:
    """Get the maximum value of a given column of a dataframe.

    The returned function aggregates the whole dataframe, producing the
    maximum of `input_col` under the column name `output_col`.

    Args:
        input_col: name of the input column.
        output_col: name of the output column (defaults to "latest").

    Returns:
        A function to be executed in the .transform() spark function.

    {{get_example(method_name='get_max_value')}}
    """

    def inner(df: DataFrame) -> DataFrame:
        # Aggregate directly on the column expression. Projecting the
        # column first would rename a nested field such as "a.b" to "b",
        # so resolving `input_col` by name a second time would fail.
        # NOTE(review): `max` is presumably pyspark.sql.functions.max
        # (imported at module level), not the builtin - confirm.
        return df.agg(max(col(input_col)).alias(output_col))

    return inner