lakehouse_engine.transformers.regex_transformers
Regex transformers module.
"""Regex transformers module."""

from typing import Callable

from pyspark.sql import DataFrame
from pyspark.sql.functions import col, regexp_extract

from lakehouse_engine.utils.logging_handler import LoggingHandler


class RegexTransformers(object):
    """Collection of regex-based transformer functions."""

    _logger = LoggingHandler(__name__).get_logger()

    @staticmethod
    def with_regex_value(
        input_col: str,
        output_col: str,
        regex: str,
        drop_input_col: bool = False,
        idx: int = 1,
    ) -> Callable:
        """Build a transformer that extracts a regex match into a column.

        The returned function applies regexp_extract to input_col and stores
        the result in output_col, optionally dropping input_col afterwards.

        Args:
            input_col: name of the column the regex is applied to.
            output_col: name of the column receiving the extracted value.
            regex: regular expression handed to regexp_extract.
            drop_input_col: when True, input_col is dropped after extraction.
            idx: group index handed to regexp_extract.

        Returns:
            A function taking a DataFrame and returning the transformed
            DataFrame, suitable for use with the .transform() spark function.
        """

        def inner(df: DataFrame) -> DataFrame:
            # Build the extraction expression first, then attach it.
            extracted = regexp_extract(col(input_col), regex, idx)
            result = df.withColumn(output_col, extracted)

            # The drop happens after the new column exists.
            return result.drop(input_col) if drop_input_col else result

        return inner
class
RegexTransformers:
class RegexTransformers(object):
    """Collection of regex-based transformer functions."""

    _logger = LoggingHandler(__name__).get_logger()

    @staticmethod
    def with_regex_value(
        input_col: str,
        output_col: str,
        regex: str,
        drop_input_col: bool = False,
        idx: int = 1,
    ) -> Callable:
        """Build a transformer that extracts a regex match into a column.

        The returned function applies regexp_extract to input_col and stores
        the result in output_col, optionally dropping input_col afterwards.

        Args:
            input_col: name of the column the regex is applied to.
            output_col: name of the column receiving the extracted value.
            regex: regular expression handed to regexp_extract.
            drop_input_col: when True, input_col is dropped after extraction.
            idx: group index handed to regexp_extract.

        Returns:
            A function taking a DataFrame and returning the transformed
            DataFrame, suitable for use with the .transform() spark function.
        """

        def inner(df: DataFrame) -> DataFrame:
            # Build the extraction expression first, then attach it.
            extracted = regexp_extract(col(input_col), regex, idx)
            result = df.withColumn(output_col, extracted)

            # The drop happens after the new column exists.
            return result.drop(input_col) if drop_input_col else result

        return inner
Class containing all regex functions.
@staticmethod
def
with_regex_value( input_col: str, output_col: str, regex: str, drop_input_col: bool = False, idx: int = 1) -> Callable:
@staticmethod
def with_regex_value(
    input_col: str,
    output_col: str,
    regex: str,
    drop_input_col: bool = False,
    idx: int = 1,
) -> Callable:
    """Build a transformer that extracts a regex match into a column.

    The returned function applies regexp_extract to input_col and stores
    the result in output_col, optionally dropping input_col afterwards.

    Args:
        input_col: name of the column the regex is applied to.
        output_col: name of the column receiving the extracted value.
        regex: regular expression handed to regexp_extract.
        drop_input_col: when True, input_col is dropped after extraction.
        idx: group index handed to regexp_extract.

    Returns:
        A function taking a DataFrame and returning the transformed
        DataFrame, suitable for use with the .transform() spark function.
    """

    def inner(df: DataFrame) -> DataFrame:
        # Build the extraction expression first, then attach it.
        extracted = regexp_extract(col(input_col), regex, idx)
        result = df.withColumn(output_col, extracted)

        # The drop happens after the new column exists.
        return result.drop(input_col) if drop_input_col else result

    return inner
Get the result of applying a regex to an input column (via regexp_extract).
Arguments:
- input_col: name of the input column.
- output_col: name of the output column.
- regex: regular expression.
- drop_input_col: whether to drop input_col or not.
- idx: index of the regex group to return (passed to regexp_extract).
Returns:
A function that takes a DataFrame and returns the transformed DataFrame, to be executed via the Spark .transform() function.