Skip to content

Date transformers

Module containing date transformers.

DateTransformers

Bases: object

Class with set of transformers to transform dates in several forms.

Source code in mkdocs/lakehouse_engine/packages/transformers/date_transformers.py
class DateTransformers(object):
    """Class with set of transformers to transform dates in several forms."""

    _logger = LoggingHandler(__name__).get_logger()

    @staticmethod
    def add_current_date(output_col: str) -> Callable:
        """Add column with current date.

        The current date comes from the driver as a constant, not from every executor.

        Args:
            output_col: name of the output column.

        Returns:
            A function to be executed in the .transform() spark function.

        {{get_example(method_name='add_current_date')}}
        """

        def inner(df: DataFrame) -> DataFrame:
            return df.withColumn(output_col, lit(datetime.now()))

        return inner

    @staticmethod
    def convert_to_date(
        cols: List[str], source_format: Optional[str] = None
    ) -> Callable:
        """Convert multiple string columns with a source format into dates.

        Args:
            cols: list of names of the string columns to convert.
            source_format: dates source format (e.g., YYYY-MM-dd). [Check here](
                https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html).

        Returns:
            A function to be executed in the .transform() spark function.

        {{get_example(method_name='convert_to_date')}}
        """

        def inner(df: DataFrame) -> DataFrame:
            converted_df = df
            for c in cols:
                converted_df = converted_df.withColumn(
                    c, to_date(col(c), source_format)
                )

            return converted_df

        return inner

    @staticmethod
    def convert_to_timestamp(
        cols: List[str], source_format: Optional[str] = None
    ) -> Callable:
        """Convert multiple string columns with a source format into timestamps.

        Args:
            cols: list of names of the string columns to convert.
            source_format: dates source format (e.g., MM-dd-yyyy HH:mm:ss.SSS).
                [Check here](
                https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html).

        Returns:
            A function to be executed in the .transform() spark function.

        {{get_example(method_name='convert_to_timestamp')}}
        """

        def inner(df: DataFrame) -> DataFrame:
            converted_df = df
            for c in cols:
                converted_df = converted_df.withColumn(
                    c, to_timestamp(col(c), source_format)
                )

            return converted_df

        return inner

    @staticmethod
    def format_date(cols: List[str], target_format: Optional[str] = None) -> Callable:
        """Convert multiple date/timestamp columns into strings with the target format.

        Args:
            cols: list of names of the string columns to convert.
            target_format: strings target format (e.g., YYYY-MM-dd). [Check here](
                https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html).

        Returns:
            A function to be executed in the .transform() spark function.

        {{get_example(method_name='format_date')}}
        """

        def inner(df: DataFrame) -> DataFrame:
            converted_df = df
            for c in cols:
                converted_df = converted_df.withColumn(
                    c, date_format(col(c), target_format)
                )

            return converted_df

        return inner

    @staticmethod
    def get_date_hierarchy(cols: List[str], formats: Optional[dict] = None) -> Callable:
        """Create day/month/week/quarter/year hierarchy for the provided date columns.

        Uses Spark's extract function.

        Args:
            cols: list of names of the date columns to create the hierarchy.
            formats: dict with the correspondence between the hierarchy and the format
                to apply. [Check here](
                https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html).
                Example: {
                    "year": "year",
                    "month": "month",
                    "day": "day",
                    "week": "week",
                    "quarter": "quarter"
                }

        Returns:
            A function to be executed in the .transform() spark function.

        {{get_example(method_name='get_date_hierarchy')}}
        """
        if not formats:
            formats = {
                "year": "year",
                "month": "month",
                "day": "day",
                "week": "week",
                "quarter": "quarter",
            }

        def inner(df: DataFrame) -> DataFrame:
            transformer_df = df
            for c in cols:
                transformer_df = transformer_df.selectExpr(
                    "*",
                    f"extract({formats['day']} from {c}) as {c}_day",
                    f"extract({formats['month']} from {c}) as {c}_month",
                    f"extract({formats['week']} from {c}) as {c}_week",
                    f"extract({formats['quarter']} from {c}) as {c}_quarter",
                    f"extract({formats['year']} from {c}) as {c}_year",
                )

            return transformer_df

        return inner

add_current_date(output_col) staticmethod

Add column with current date.

The current date comes from the driver as a constant, not from every executor.

Parameters:

Name Type Description Default
output_col str

name of the output column.

required

Returns:

Type Description
Callable

A function to be executed in the .transform() spark function.

View Example of add_current_date (See full example here)
21{
22    "function": "add_current_date",
23    "args": {
24        "output_col": "curr_date"
25    }
26}
Source code in mkdocs/lakehouse_engine/packages/transformers/date_transformers.py
@staticmethod
def add_current_date(output_col: str) -> Callable:
    """Add column with current date.

    The current date comes from the driver as a constant, not from every executor.

    Args:
        output_col: name of the output column.

    Returns:
        A function to be executed in the .transform() spark function.

    {{get_example(method_name='add_current_date')}}
    """

    def inner(df: DataFrame) -> DataFrame:
        return df.withColumn(output_col, lit(datetime.now()))

    return inner

convert_to_date(cols, source_format=None) staticmethod

Convert multiple string columns with a source format into dates.

Parameters:

Name Type Description Default
cols List[str]

list of names of the string columns to convert.

required
source_format Optional[str]

dates source format (e.g., YYYY-MM-dd). Check here.

None

Returns:

Type Description
Callable

A function to be executed in the .transform() spark function.

View Example of convert_to_date (See full example here)
27{
28    "function": "convert_to_date",
29    "args": {
30        "cols": [
31            "order_date2"
32        ],
33        "source_format": "dd-MM-yyyy"
34    }
35}
Source code in mkdocs/lakehouse_engine/packages/transformers/date_transformers.py
@staticmethod
def convert_to_date(
    cols: List[str], source_format: Optional[str] = None
) -> Callable:
    """Convert multiple string columns with a source format into dates.

    Args:
        cols: list of names of the string columns to convert.
        source_format: dates source format (e.g., YYYY-MM-dd). [Check here](
            https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html).

    Returns:
        A function to be executed in the .transform() spark function.

    {{get_example(method_name='convert_to_date')}}
    """

    def inner(df: DataFrame) -> DataFrame:
        converted_df = df
        for c in cols:
            converted_df = converted_df.withColumn(
                c, to_date(col(c), source_format)
            )

        return converted_df

    return inner

convert_to_timestamp(cols, source_format=None) staticmethod

Convert multiple string columns with a source format into timestamps.

Parameters:

Name Type Description Default
cols List[str]

list of names of the string columns to convert.

required
source_format Optional[str]

dates source format (e.g., MM-dd-yyyy HH🇲🇲ss.SSS). Check here.

None

Returns:

Type Description
Callable

A function to be executed in the .transform() spark function.

View Example of convert_to_timestamp (See full example here)
41{
42    "function": "convert_to_timestamp",
43    "args": {
44        "cols": [
45            "ship_date"
46        ],
47        "source_format": "yyyy-dd-MM HH:mm:ss"
48    }
49}
Source code in mkdocs/lakehouse_engine/packages/transformers/date_transformers.py
@staticmethod
def convert_to_timestamp(
    cols: List[str], source_format: Optional[str] = None
) -> Callable:
    """Convert multiple string columns with a source format into timestamps.

    Args:
        cols: list of names of the string columns to convert.
        source_format: dates source format (e.g., MM-dd-yyyy HH:mm:ss.SSS).
            [Check here](
            https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html).

    Returns:
        A function to be executed in the .transform() spark function.

    {{get_example(method_name='convert_to_timestamp')}}
    """

    def inner(df: DataFrame) -> DataFrame:
        converted_df = df
        for c in cols:
            converted_df = converted_df.withColumn(
                c, to_timestamp(col(c), source_format)
            )

        return converted_df

    return inner

format_date(cols, target_format=None) staticmethod

Convert multiple date/timestamp columns into strings with the target format.

Parameters:

Name Type Description Default
cols List[str]

list of names of the string columns to convert.

required
target_format Optional[str]

strings target format (e.g., YYYY-MM-dd). Check here.

None

Returns:

Type Description
Callable

A function to be executed in the .transform() spark function.

View Example of format_date (See full example here)
48{
49    "function": "format_date",
50    "args": {
51        "cols": [
52            "order_date3",
53            "ship_date"
54        ],
55        "target_format": "yy-d-M"
56    }
57}
Source code in mkdocs/lakehouse_engine/packages/transformers/date_transformers.py
@staticmethod
def format_date(cols: List[str], target_format: Optional[str] = None) -> Callable:
    """Convert multiple date/timestamp columns into strings with the target format.

    Args:
        cols: list of names of the string columns to convert.
        target_format: strings target format (e.g., YYYY-MM-dd). [Check here](
            https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html).

    Returns:
        A function to be executed in the .transform() spark function.

    {{get_example(method_name='format_date')}}
    """

    def inner(df: DataFrame) -> DataFrame:
        converted_df = df
        for c in cols:
            converted_df = converted_df.withColumn(
                c, date_format(col(c), target_format)
            )

        return converted_df

    return inner

get_date_hierarchy(cols, formats=None) staticmethod

Create day/month/week/quarter/year hierarchy for the provided date columns.

Uses Spark's extract function.

Parameters:

Name Type Description Default
cols List[str]

list of names of the date columns to create the hierarchy.

required
formats Optional[dict]

dict with the correspondence between the hierarchy and the format to apply. Check here. Example: { "year": "year", "month": "month", "day": "day", "week": "week", "quarter": "quarter" }

None

Returns:

Type Description
Callable

A function to be executed in the .transform() spark function.

View Example of get_date_hierarchy (See full example here)
55{
56    "function": "get_date_hierarchy",
57    "args": {
58        "cols": [
59            "order_date2",
60            "ship_date2"
61        ]
62    }
63}
Source code in mkdocs/lakehouse_engine/packages/transformers/date_transformers.py
@staticmethod
def get_date_hierarchy(cols: List[str], formats: Optional[dict] = None) -> Callable:
    """Create day/month/week/quarter/year hierarchy for the provided date columns.

    Uses Spark's extract function.

    Args:
        cols: list of names of the date columns to create the hierarchy.
        formats: dict with the correspondence between the hierarchy and the format
            to apply. [Check here](
            https://docs.oracle.com/javase/10/docs/api/java/time/format/DateTimeFormatter.html).
            Example: {
                "year": "year",
                "month": "month",
                "day": "day",
                "week": "week",
                "quarter": "quarter"
            }

    Returns:
        A function to be executed in the .transform() spark function.

    {{get_example(method_name='get_date_hierarchy')}}
    """
    if not formats:
        formats = {
            "year": "year",
            "month": "month",
            "day": "day",
            "week": "week",
            "quarter": "quarter",
        }

    def inner(df: DataFrame) -> DataFrame:
        transformer_df = df
        for c in cols:
            transformer_df = transformer_df.selectExpr(
                "*",
                f"extract({formats['day']} from {c}) as {c}_day",
                f"extract({formats['month']} from {c}) as {c}_month",
                f"extract({formats['week']} from {c}) as {c}_week",
                f"extract({formats['quarter']} from {c}) as {c}_quarter",
                f"extract({formats['year']} from {c}) as {c}_year",
            )

        return transformer_df

    return inner