lakehouse_engine.transformers.regex_transformers

Regex transformers module.

 1"""Regex transformers module."""
 2
 3from typing import Callable
 4
 5from pyspark.sql import DataFrame
 6from pyspark.sql.functions import col, regexp_extract
 7
 8from lakehouse_engine.utils.logging_handler import LoggingHandler
 9
10
class RegexTransformers(object):
    """Collection of regex-based DataFrame transformers."""

    _logger = LoggingHandler(__name__).get_logger()

    @staticmethod
    def with_regex_value(
        input_col: str,
        output_col: str,
        regex: str,
        drop_input_col: bool = False,
        idx: int = 1,
    ) -> Callable:
        """Build a transformer that extracts a regex match into a new column.

        The extraction itself is delegated to regexp_extract.

        Args:
            input_col: name of the column the regex is applied to.
            output_col: name of the column receiving the extracted value.
            regex: regular expression handed to regexp_extract.
            drop_input_col: when True, input_col is dropped after extraction.
            idx: index forwarded to regexp_extract to select what is returned.

        Returns:
            A function that takes a DataFrame and returns the transformed
            DataFrame, to be executed in the .transform() spark function.
        """

        def inner(df: DataFrame) -> DataFrame:
            extracted = regexp_extract(col(input_col), regex, idx)
            result = df.withColumn(output_col, extracted)
            # Drop only after the new column exists, since it is derived
            # from input_col.
            return result.drop(input_col) if drop_input_col else result

        return inner
class RegexTransformers:
class RegexTransformers(object):
    """Namespace class grouping the regex transformer functions."""

    _logger = LoggingHandler(__name__).get_logger()

    @staticmethod
    def with_regex_value(
        input_col: str,
        output_col: str,
        regex: str,
        drop_input_col: bool = False,
        idx: int = 1,
    ) -> Callable:
        """Create a function adding a regexp_extract result as a column.

        Args:
            input_col: column whose values are matched against the regex.
            output_col: column in which the extracted value is stored.
            regex: regular expression passed to regexp_extract.
            drop_input_col: whether input_col is removed from the result.
            idx: index passed to regexp_extract to pick the value returned.

        Returns:
            A DataFrame -> DataFrame function to be executed in the
            .transform() spark function.
        """

        def inner(df: DataFrame) -> DataFrame:
            with_extraction = df.withColumn(
                output_col,
                regexp_extract(col(input_col), regex, idx),
            )

            # Guard clause: nothing more to do when the source column is kept.
            if not drop_input_col:
                return with_extraction

            return with_extraction.drop(input_col)

        return inner

Class containing all regex functions.

@staticmethod
def with_regex_value( input_col: str, output_col: str, regex: str, drop_input_col: bool = False, idx: int = 1) -> Callable:
17    @staticmethod
18    def with_regex_value(
19        input_col: str,
20        output_col: str,
21        regex: str,
22        drop_input_col: bool = False,
23        idx: int = 1,
24    ) -> Callable:
25        """Get the result of applying a regex to an input column (via regexp_extract).
26
27        Args:
28            input_col: name of the input column.
29            output_col: name of the output column.
30            regex: regular expression.
31            drop_input_col: whether to drop input_col or not.
32            idx: index to return.
33
34        Returns:
35             A function to be executed in the .transform() spark function.
36        """
37
38        def inner(df: DataFrame) -> DataFrame:
39            df = df.withColumn(output_col, regexp_extract(col(input_col), regex, idx))
40
41            if drop_input_col:
42                df = df.drop(input_col)
43
44            return df
45
46        return inner

Get the result of applying a regex to an input column (via regexp_extract).

Arguments:
  • input_col: name of the input column.
  • output_col: name of the output column.
  • regex: regular expression.
  • drop_input_col: whether to drop input_col or not.
  • idx: index of the regex group to return (passed to regexp_extract; defaults to 1).
Returns:

A function to be executed in the .transform() spark function.

View Example
40{
41    "function": "with_regex_value",
42    "args": {
43        "input_col": "lhe_extraction_filepath",
44        "output_col": "extraction_date",
45        "drop_input_col": true,
46        "regex": ".*WE_SO_SCL_(\\d+).csv"
47    }
48}
View Full Acon