lakehouse_engine.transformers.null_handlers

Module with null handlers transformers.

 1"""Module with null handlers transformers."""
 2
 3from typing import Callable, List
 4
 5from pyspark.sql import DataFrame
 6
 7from lakehouse_engine.utils.logging_handler import LoggingHandler
 8
 9
class NullHandlers(object):
    """Collection of transformers that deal with null values."""

    _logger = LoggingHandler(__name__).get_logger()

    @classmethod
    def replace_nulls(
        cls,
        replace_on_nums: bool = True,
        default_num_value: int = -999,
        replace_on_strings: bool = True,
        default_string_value: str = "UNKNOWN",
        subset_cols: List[str] = None,
    ) -> Callable:
        """Build a transformer that fills nulls with default values.

        Args:
            replace_on_nums: whether nulls in numeric columns are replaced.
                Applies to ints, longs and floats.
            default_num_value: integer used as the numeric replacement.
            replace_on_strings: whether nulls in string columns are replaced.
            default_string_value: string used as the string replacement.
            subset_cols: columns to restrict the replacement to. When not
                provided, nulls in all columns are replaced as specified.

        Returns:
            A function to be passed to the spark .transform() function.
        """
        # Ordered fill steps: the numeric fill runs before the string fill,
        # and a step whose flag is falsy is skipped.
        fill_steps = (
            (replace_on_nums, default_num_value),
            (replace_on_strings, default_string_value),
        )

        def inner(df: DataFrame) -> DataFrame:
            filled = df
            for enabled, fill_value in fill_steps:
                if enabled:
                    filled = filled.na.fill(fill_value, subset_cols)
            return filled

        return inner
class NullHandlers:
class NullHandlers(object):
    """Collection of transformers that deal with null values."""

    _logger = LoggingHandler(__name__).get_logger()

    @classmethod
    def replace_nulls(
        cls,
        replace_on_nums: bool = True,
        default_num_value: int = -999,
        replace_on_strings: bool = True,
        default_string_value: str = "UNKNOWN",
        subset_cols: List[str] = None,
    ) -> Callable:
        """Build a transformer that fills nulls with default values.

        Args:
            replace_on_nums: whether nulls in numeric columns are replaced.
                Applies to ints, longs and floats.
            default_num_value: integer used as the numeric replacement.
            replace_on_strings: whether nulls in string columns are replaced.
            default_string_value: string used as the string replacement.
            subset_cols: columns to restrict the replacement to. When not
                provided, nulls in all columns are replaced as specified.

        Returns:
            A function to be passed to the spark .transform() function.
        """
        # Ordered fill steps: the numeric fill runs before the string fill,
        # and a step whose flag is falsy is skipped.
        fill_steps = (
            (replace_on_nums, default_num_value),
            (replace_on_strings, default_string_value),
        )

        def inner(df: DataFrame) -> DataFrame:
            filled = df
            for enabled, fill_value in fill_steps:
                if enabled:
                    filled = filled.na.fill(fill_value, subset_cols)
            return filled

        return inner

Class containing null handler transformers.

@classmethod
def replace_nulls( cls, replace_on_nums: bool = True, default_num_value: int = -999, replace_on_strings: bool = True, default_string_value: str = 'UNKNOWN', subset_cols: List[str] = None) -> Callable:
16    @classmethod
17    def replace_nulls(
18        cls,
19        replace_on_nums: bool = True,
20        default_num_value: int = -999,
21        replace_on_strings: bool = True,
22        default_string_value: str = "UNKNOWN",
23        subset_cols: List[str] = None,
24    ) -> Callable:
25        """Replace nulls in a dataframe.
26
27        Args:
28            replace_on_nums: if it is to replace nulls on numeric columns.
29                Applies to ints, longs and floats.
30            default_num_value: default integer value to use as replacement.
31            replace_on_strings: if it is to replace nulls on string columns.
32            default_string_value: default string value to use as replacement.
33            subset_cols: list of columns in which to replace nulls. If not
34                provided, all nulls in all columns will be replaced as specified.
35
36        Returns:
37            A function to be called in .transform() spark function.
38        """
39
40        def inner(df: DataFrame) -> DataFrame:
41            if replace_on_nums:
42                df = df.na.fill(default_num_value, subset_cols)
43            if replace_on_strings:
44                df = df.na.fill(default_string_value, subset_cols)
45
46            return df
47
48        return inner

Replace nulls in a dataframe.

Arguments:
  • replace_on_nums: whether to replace nulls in numeric columns. Applies to ints, longs and floats.
  • default_num_value: default integer value to use as the replacement.
  • replace_on_strings: whether to replace nulls in string columns.
  • default_string_value: default string value to use as the replacement.
  • subset_cols: list of columns in which to replace nulls. If not provided, nulls in all columns will be replaced as specified.
Returns:

A function to be passed to Spark's .transform() function.

View Example
21{
22    "function": "replace_nulls",
23    "args": {
24        "subset_cols": [
25            "amount"
26        ]
27    }
28}
View Full Acon