lakehouse_engine.transformers.column_reshapers

Module with column reshaping transformers.

  1"""Module with column reshaping transformers."""
  2
  3from collections import OrderedDict
  4from typing import Any, Callable, Dict, List, Optional
  5
  6import pyspark.sql.types as spark_types
  7from pyspark.sql import DataFrame
  8from pyspark.sql.avro.functions import from_avro
  9from pyspark.sql.functions import (
 10    col,
 11    explode_outer,
 12    expr,
 13    from_json,
 14    map_entries,
 15    struct,
 16    to_json,
 17)
 18
 19from lakehouse_engine.transformers.exceptions import WrongArgumentsException
 20from lakehouse_engine.utils.logging_handler import LoggingHandler
 21from lakehouse_engine.utils.schema_utils import SchemaUtils
 22
 23
 24class ColumnReshapers(object):
 25    """Class containing column reshaping transformers."""
 26
 27    _logger = LoggingHandler(__name__).get_logger()
 28
 29    @classmethod
 30    def cast(cls, cols: Dict[str, str]) -> Callable:
 31        """Cast specific columns into the designated type.
 32
 33        Args:
 34            cols: dict with columns and respective target types.
 35                Target types need to have the exact name of spark types:
 36                https://spark.apache.org/docs/latest/sql-ref-datatypes.html
 37
 38        Returns:
 39            A function to be called in .transform() spark function.
 40        """
 41
 42        def inner(df: DataFrame) -> DataFrame:
 43            cast_df = df
 44            for c, t in cols.items():
 45                cast_df = cast_df.withColumn(c, col(c).cast(getattr(spark_types, t)()))
 46
 47            return cast_df
 48
 49        return inner
 50
 51    @classmethod
 52    def column_selector(cls, cols: OrderedDict) -> Callable:
 53        """Select specific columns with specific output aliases.
 54
 55        Args:
 56            cols: dict with columns to select and respective aliases.
 57
 58        Returns:
 59            A function to be called in .transform() spark function.
 60        """
 61
 62        def inner(df: DataFrame) -> DataFrame:
 63            return df.select(*[col(c).alias(a) for c, a in cols.items()])
 64
 65        return inner
 66
 67    @classmethod
 68    def flatten_schema(
 69        cls,
 70        max_level: int = None,
 71        shorten_names: bool = False,
 72        alias: bool = True,
 73        num_chars: int = 7,
 74        ignore_cols: List = None,
 75    ) -> Callable:
 76        """Flatten the schema of the dataframe.
 77
 78        Args:
 79            max_level: level until which you want to flatten the schema.
 80                Default: None.
 81            shorten_names: whether to shorten the names of the prefixes
 82                of the fields being flattened or not. Default: False.
 83            alias: whether to define alias for the columns being flattened
 84                or not. Default: True.
 85            num_chars: number of characters to consider when shortening
 86                the names of the fields. Default: 7.
 87            ignore_cols: columns which you don't want to flatten.
 88                Default: None.
 89
 90        Returns:
 91            A function to be called in .transform() spark function.
 92        """
 93
 94        def inner(df: DataFrame) -> DataFrame:
 95            return df.select(
 96                SchemaUtils.schema_flattener(
 97                    schema=df.schema,
 98                    max_level=max_level,
 99                    shorten_names=shorten_names,
100                    alias=alias,
101                    num_chars=num_chars,
102                    ignore_cols=ignore_cols,
103                )
104            )
105
106        return inner
107
108    @classmethod
109    def explode_columns(
110        cls,
111        explode_arrays: bool = False,
112        array_cols_to_explode: List[str] = None,
113        explode_maps: bool = False,
114        map_cols_to_explode: List[str] = None,
115    ) -> Callable:
116        """Explode columns with types like ArrayType and MapType.
117
118        After it can be applied the flatten_schema transformation,
119        if we desired for example to explode the map (as we explode a StructType)
120        or to explode a StructType inside the array.
121        We recommend you to specify always the columns desired to explode
122        and not explode all columns.
123
124        Args:
125            explode_arrays: whether you want to explode array columns (True)
126                or not (False). Default: False.
127            array_cols_to_explode: array columns which you want to explode.
128                If you don't specify it will get all array columns and explode them.
129                Default: None.
130            explode_maps: whether you want to explode map columns (True)
131                or not (False). Default: False.
132            map_cols_to_explode: map columns which you want to explode.
133                If you don't specify it will get all map columns and explode them.
134                Default: None.
135
136        Returns:
137            A function to be called in .transform() spark function.
138        """
139
140        def inner(df: DataFrame) -> DataFrame:
141            if explode_arrays or (array_cols_to_explode is not None):
142                df = cls._explode_arrays(df, array_cols_to_explode)
143
144            if explode_maps or (map_cols_to_explode is not None):
145                df = cls._explode_maps(df, map_cols_to_explode)
146
147            return df
148
149        return inner
150
151    @classmethod
152    def _get_columns(
153        cls,
154        df: DataFrame,
155        data_type: Any,
156    ) -> List:
157        """Get a list of columns from the dataframe of the data types specified.
158
159        Args:
160            df: input dataframe.
161            data_type: data type specified.
162
163        Returns:
164            List of columns with the datatype specified.
165        """
166        cols = []
167        for field in df.schema.fields:
168            if isinstance(field.dataType, data_type):
169                cols.append(field.name)
170        return cols
171
172    @classmethod
173    def with_expressions(cls, cols_and_exprs: Dict[str, str]) -> Callable:
174        """Execute Spark SQL expressions to create the specified columns.
175
176        This function uses the Spark expr function. [Check here](
177        https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.expr.html).
178
179        Args:
180            cols_and_exprs: dict with columns and respective expressions to compute
181                (Spark SQL expressions).
182
183        Returns:
184            A function to be called in .transform() spark function.
185        """
186
187        def inner(df: DataFrame) -> DataFrame:
188            enriched_df = df
189            for c, e in cols_and_exprs.items():
190                enriched_df = enriched_df.withColumn(c, expr(e))
191
192            return enriched_df
193
194        return inner
195
196    @classmethod
197    def rename(cls, cols: Dict[str, str], escape_col_names: bool = True) -> Callable:
198        """Rename specific columns into the designated name.
199
200        Args:
201            cols: dict with columns and respective target names.
202            escape_col_names: whether to escape column names (e.g. `/BIC/COL1`) or not.
203                If True, it creates a column with the new name and drops the old one.
204                If False, uses the native withColumnRenamed Spark function.
205                Default: True.
206
207        Returns:
208            Function to be called in .transform() spark function.
209        """
210
211        def inner(df: DataFrame) -> DataFrame:
212            renamed_df = df
213            if escape_col_names:
214                for old_name, new_name in cols.items():
215                    renamed_df = renamed_df.withColumn(new_name, col(old_name))
216                    renamed_df = renamed_df.drop(old_name)
217            else:
218                for old_name, new_name in cols.items():
219                    renamed_df = renamed_df.withColumnRenamed(old_name, new_name)
220
221            return renamed_df
222
223        return inner
224
225    @classmethod
226    def from_avro(
227        cls,
228        schema: str = None,
229        key_col: str = "key",
230        value_col: str = "value",
231        options: dict = None,
232        expand_key: bool = False,
233        expand_value: bool = True,
234    ) -> Callable:
235        """Select all attributes from avro.
236
237        Args:
238            schema: the schema string.
239            key_col: the name of the key column.
240            value_col: the name of the value column.
241            options: extra options (e.g., mode: "PERMISSIVE").
242            expand_key: whether you want to expand the content inside the key
243                column or not. Default: false.
244            expand_value: whether you want to expand the content inside the value
245                column or not. Default: true.
246
247        Returns:
248            Function to be called in .transform() spark function.
249        """
250
251        def inner(df: DataFrame) -> DataFrame:
252            cols_to_select = [
253                column for column in df.columns if column not in [key_col, value_col]
254            ]
255
256            return df.select(
257                *cols_to_select,
258                key_col,
259                from_avro(col(value_col), schema, options if options else {}).alias(
260                    value_col
261                ),
262            ).select(
263                *cols_to_select,
264                f"{key_col}.*" if expand_key else key_col,
265                f"{value_col}.*" if expand_value else value_col,
266            )
267
268        return inner
269
270    @classmethod
271    def from_avro_with_registry(
272        cls,
273        schema_registry: str,
274        value_schema: str,
275        value_col: str = "value",
276        key_schema: str = None,
277        key_col: str = "key",
278        expand_key: bool = False,
279        expand_value: bool = True,
280    ) -> Callable:
281        """Select all attributes from avro using a schema registry.
282
283        Args:
284            schema_registry: the url to the schema registry.
285            value_schema: the name of the value schema entry in the schema registry.
286            value_col: the name of the value column.
287            key_schema: the name of the key schema entry in the schema
288                registry. Default: None.
289            key_col: the name of the key column.
290            expand_key: whether you want to expand the content inside the key
291                column or not. Default: false.
292            expand_value: whether you want to expand the content inside the value
293                column or not. Default: true.
294
295        Returns:
296            Function to be called in .transform() spark function.
297        """
298
299        def inner(df: DataFrame) -> DataFrame:
300            cols_to_select = [
301                column for column in df.columns if column not in [key_col, value_col]
302            ]
303
304            return df.select(  # type: ignore
305                *cols_to_select,
306                (
307                    from_avro(
308                        data=col(key_col),
309                        subject=key_schema,
310                        schemaRegistryAddress=schema_registry,  # type: ignore
311                    ).alias(key_col)
312                    if key_schema
313                    else key_col
314                ),
315                from_avro(
316                    data=col(value_col),
317                    subject=value_schema,
318                    schemaRegistryAddress=schema_registry,  # type: ignore
319                ).alias(value_col),
320            ).select(
321                *cols_to_select,
322                f"{key_col}.*" if expand_key else key_col,
323                f"{value_col}.*" if expand_value else value_col,
324            )
325
326        return inner
327
328    @classmethod
329    def from_json(
330        cls,
331        input_col: str,
332        schema_path: Optional[str] = None,
333        schema: Optional[dict] = None,
334        json_options: Optional[dict] = None,
335        drop_all_cols: bool = False,
336        disable_dbfs_retry: bool = False,
337    ) -> Callable:
338        """Convert a json string into a json column (struct).
339
340        The new json column can be added to the existing columns (default) or it can
341        replace all the others, being the only one to output. The new column gets the
342        same name as the original one suffixed with '_json'.
343
344        Args:
345            input_col: dict with columns and respective target names.
346            schema_path: path to the StructType schema (spark schema).
347            schema: dict with the StructType schema (spark schema).
348            json_options: options to parse the json value.
349            drop_all_cols: whether to drop all the input columns or not.
350                Defaults to False.
351            disable_dbfs_retry: optional flag to disable file storage dbfs.
352
353        Returns:
354            A function to be called in .transform() spark function.
355        """
356
357        def inner(df: DataFrame) -> DataFrame:
358            if schema_path:
359                json_schema = SchemaUtils.from_file(schema_path, disable_dbfs_retry)
360            elif schema:
361                json_schema = SchemaUtils.from_dict(schema)
362            else:
363                raise WrongArgumentsException(
364                    "A file or dict schema needs to be provided."
365                )
366
367            if drop_all_cols:
368                df_with_json = df.select(
369                    from_json(
370                        col(input_col).cast("string").alias(f"{input_col}_json"),
371                        json_schema,
372                        json_options if json_options else {},
373                    ).alias(f"{input_col}_json")
374                )
375            else:
376                df_with_json = df.select(
377                    "*",
378                    from_json(
379                        col(input_col).cast("string").alias(f"{input_col}_json"),
380                        json_schema,
381                        json_options if json_options else {},
382                    ).alias(f"{input_col}_json"),
383                )
384
385            return df_with_json
386
387        return inner
388
389    @classmethod
390    def to_json(
391        cls, in_cols: List[str], out_col: str, json_options: Optional[dict] = None
392    ) -> Callable:
393        """Convert dataframe columns into a json value.
394
395        Args:
396            in_cols: name(s) of the input column(s).
397                Example values:
398                "*" - all
399                columns; "my_col" - one column named "my_col";
400                "my_col1, my_col2" - two columns.
401            out_col: name of the output column.
402            json_options: options to parse the json value.
403
404        Returns:
405            A function to be called in .transform() spark function.
406        """
407
408        def inner(df: DataFrame) -> DataFrame:
409            return df.withColumn(
410                out_col, to_json(struct(*in_cols), json_options if json_options else {})
411            )
412
413        return inner
414
415    @classmethod
416    def _explode_arrays(cls, df: DataFrame, cols_to_explode: List[str]) -> DataFrame:
417        """Explode array columns from dataframe.
418
419        Args:
420            df: the dataframe to apply the explode operation.
421            cols_to_explode: list of array columns to perform explode.
422
423        Returns:
424            A dataframe with array columns exploded.
425        """
426        if cols_to_explode is None:
427            cols_to_explode = cls._get_columns(df, spark_types.ArrayType)
428
429        for column in cols_to_explode:
430            df = df.withColumn(column, explode_outer(column))
431
432        return df
433
434    @classmethod
435    def _explode_maps(cls, df: DataFrame, cols_to_explode: List[str]) -> DataFrame:
436        """Explode map columns from dataframe.
437
438        Args:
439            df: the dataframe to apply the explode operation.
440            cols_to_explode: list of map columns to perform explode.
441
442        Returns:
443            A dataframe with map columns exploded.
444        """
445        if cols_to_explode is None:
446            cols_to_explode = cls._get_columns(df, spark_types.MapType)
447
448        for column in cols_to_explode:
449            df = df.withColumn(column, explode_outer(map_entries(col(column))))
450
451        return df
class ColumnReshapers:

Class containing column reshaping transformers.
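Every public method returns a function meant to be passed to DataFrame.transform, so transformers can be chained. A minimal usage sketch (the SparkSession setup and the DataFrame contents are illustrative assumptions, not part of the library):

from pyspark.sql import SparkSession

from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

spark = SparkSession.builder.getOrCreate()
# Hypothetical input data, just to show the chaining pattern.
df = spark.createDataFrame([("100", "shirt")], ["code", "ARTICLE"])

reshaped_df = (
    df.transform(ColumnReshapers.cast({"code": "IntegerType"}))
    .transform(ColumnReshapers.rename({"ARTICLE": "article"}))
)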

@classmethod
def cast(cls, cols: Dict[str, str]) -> Callable:

Cast specific columns into the designated type.

Arguments:
  • cols: dict with columns and respective target types. Target types need to have the exact name of spark types: https://spark.apache.org/docs/latest/sql-ref-datatypes.html
Returns:

A function to be called in .transform() spark function.

Example:

{
    "function": "cast",
    "args": {
        "cols": {
            "code": "StringType"
        }
    }
}


@classmethod
def column_selector(cls, cols: collections.OrderedDict) -> Callable:

Select specific columns with specific output aliases.

Arguments:
  • cols: dict with columns to select and respective aliases.
Returns:

A function to be called in .transform() spark function.
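A minimal sketch of calling this transformer directly in Python; the column names "customer" and "customer_id" are illustrative assumptions:

from collections import OrderedDict

from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

# df: any pyspark DataFrame that contains a "customer" column (assumed to exist).
selected_df = df.transform(
    ColumnReshapers.column_selector(OrderedDict([("customer", "customer_id")]))
)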

@classmethod
def flatten_schema( cls, max_level: int = None, shorten_names: bool = False, alias: bool = True, num_chars: int = 7, ignore_cols: List = None) -> Callable:

Flatten the schema of the dataframe.

Arguments:
  • max_level: level until which you want to flatten the schema. Default: None.
  • shorten_names: whether to shorten the names of the prefixes of the fields being flattened or not. Default: False.
  • alias: whether to define alias for the columns being flattened or not. Default: True.
  • num_chars: number of characters to consider when shortening the names of the fields. Default: 7.
  • ignore_cols: columns which you don't want to flatten. Default: None.
Returns:

A function to be called in .transform() spark function.

Example:

{
    "function": "flatten_schema",
    "args": {
        "max_level": 2
    }
}


@classmethod
def explode_columns( cls, explode_arrays: bool = False, array_cols_to_explode: List[str] = None, explode_maps: bool = False, map_cols_to_explode: List[str] = None) -> Callable:

Explode columns with types like ArrayType and MapType.

After this, the flatten_schema transformation can be applied, for example to explode the map (as we explode a StructType) or to explode a StructType inside the array. We recommend always specifying the columns you want to explode instead of exploding all columns.

Arguments:
  • explode_arrays: whether you want to explode array columns (True) or not (False). Default: False.
  • array_cols_to_explode: array columns which you want to explode. If you don't specify it will get all array columns and explode them. Default: None.
  • explode_maps: whether you want to explode map columns (True) or not (False). Default: False.
  • map_cols_to_explode: map columns which you want to explode. If you don't specify it will get all map columns and explode them. Default: None.
Returns:

A function to be called in .transform() spark function.

Example:

{
    "function": "explode_columns",
    "args": {
        "explode_arrays": true
    }
}
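To illustrate the explode-then-flatten workflow mentioned above, a sketch assuming df has an array-of-struct column named "items" (a hypothetical name):

from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

exploded_df = (
    df.transform(
        ColumnReshapers.explode_columns(
            explode_arrays=True, array_cols_to_explode=["items"]
        )
    )
    # After exploding, each row holds a single struct, which flatten_schema expands.
    .transform(ColumnReshapers.flatten_schema(max_level=2))
)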


@classmethod
def with_expressions(cls, cols_and_exprs: Dict[str, str]) -> Callable:

Execute Spark SQL expressions to create the specified columns.

This function uses the Spark expr function. See https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.expr.html.

Arguments:
  • cols_and_exprs: dict with columns and respective expressions to compute (Spark SQL expressions).
Returns:

A function to be called in .transform() spark function.

Example:

{
    "function": "with_expressions",
    "args": {
        "cols_and_exprs": {
            "constant": "'just a constant'",
            "length_customer2": "length(customer2)"
        }
    }
}


@classmethod
def rename(cls, cols: Dict[str, str], escape_col_names: bool = True) -> Callable:

Rename specific columns into the designated name.

Arguments:
  • cols: dict with columns and respective target names.
  • escape_col_names: whether to escape column names (e.g. /BIC/COL1) or not. If True, it creates a column with the new name and drops the old one. If False, uses the native withColumnRenamed Spark function. Default: True.
Returns:

Function to be called in .transform() spark function.

Example:

{
    "function": "rename",
    "args": {
        "cols": {
            "ARTICLE": "article"
        }
    }
}


@classmethod
def from_avro( cls, schema: str = None, key_col: str = 'key', value_col: str = 'value', options: dict = None, expand_key: bool = False, expand_value: bool = True) -> Callable:

Select all attributes from avro.

Arguments:
  • schema: the schema string.
  • key_col: the name of the key column.
  • value_col: the name of the value column.
  • options: extra options (e.g., mode: "PERMISSIVE").
  • expand_key: whether you want to expand the content inside the key column or not. Default: false.
  • expand_value: whether you want to expand the content inside the value column or not. Default: true.
Returns:

Function to be called in .transform() spark function.
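A minimal sketch, assuming df was read from Kafka and therefore has binary "key" and "value" columns; the Avro schema below is a hypothetical schema for the value and requires the spark-avro package to be available:

from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

# Hypothetical Avro schema (JSON format) describing the value column.
value_schema = """
{"type": "record", "name": "sample", "fields": [{"name": "id", "type": "string"}]}
"""

decoded_df = df.transform(
    ColumnReshapers.from_avro(schema=value_schema, options={"mode": "PERMISSIVE"})
)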

@classmethod
def from_avro_with_registry( cls, schema_registry: str, value_schema: str, value_col: str = 'value', key_schema: str = None, key_col: str = 'key', expand_key: bool = False, expand_value: bool = True) -> Callable:

Select all attributes from avro using a schema registry.

Arguments:
  • schema_registry: the url to the schema registry.
  • value_schema: the name of the value schema entry in the schema registry.
  • value_col: the name of the value column.
  • key_schema: the name of the key schema entry in the schema registry. Default: None.
  • key_col: the name of the key column.
  • expand_key: whether you want to expand the content inside the key column or not. Default: false.
  • expand_value: whether you want to expand the content inside the value column or not. Default: true.
Returns:

Function to be called in .transform() spark function.
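A minimal sketch, again assuming a Kafka-style DataFrame with "key" and "value" columns; the registry address and subject names are hypothetical, and this transformer only works on runtimes whose from_avro supports the schema registry arguments used in the source above:

from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

decoded_df = df.transform(
    ColumnReshapers.from_avro_with_registry(
        schema_registry="https://my-schema-registry:8081",  # hypothetical address
        value_schema="my_topic-value",  # hypothetical subject names
        key_schema="my_topic-key",
        expand_key=True,
        expand_value=True,
    )
)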

@classmethod
def from_json( cls, input_col: str, schema_path: Optional[str] = None, schema: Optional[dict] = None, json_options: Optional[dict] = None, drop_all_cols: bool = False, disable_dbfs_retry: bool = False) -> Callable:

Convert a json string into a json column (struct).

The new json column can be added to the existing columns (default) or it can replace all the others, being the only one to output. The new column gets the same name as the original one suffixed with '_json'.

Arguments:
  • input_col: name of the input column containing the json string.
  • schema_path: path to the StructType schema (spark schema).
  • schema: dict with the StructType schema (spark schema).
  • json_options: options to parse the json value.
  • drop_all_cols: whether to drop all the input columns or not. Defaults to False.
  • disable_dbfs_retry: optional flag to disable file storage dbfs.
Returns:

A function to be called in .transform() spark function.

Example:

{
    "function": "from_json",
    "args": {
        "input_col": "sample",
        "schema": {
            "type": "struct",
            "fields": [
                {
                    "name": "field1",
                    "type": "string",
                    "nullable": true,
                    "metadata": {}
                },
                {
                    "name": "field2",
                    "type": "string",
                    "nullable": true,
                    "metadata": {}
                },
                {
                    "name": "field3",
                    "type": "double",
                    "nullable": true,
                    "metadata": {}
                },
                {
                    "name": "field4",
                    "type": {
                        "type": "struct",
                        "fields": [
                            {
                                "name": "field1",
                                "type": "string",
                                "nullable": true,
                                "metadata": {}
                            },
                            {
                                "name": "field2",
                                "type": "string",
                                "nullable": true,
                                "metadata": {}
                            }
                        ]
                    },
                    "nullable": true,
                    "metadata": {}
                }
            ]
        }
    }
}
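The same transformer can also be called directly in Python with a schema file instead of an inline schema; the path below is a hypothetical location:

from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

# df: any pyspark DataFrame with a "sample" column holding a json string (assumed).
parsed_df = df.transform(
    ColumnReshapers.from_json(
        input_col="sample",
        schema_path="s3://my-bucket/schemas/sample.json",  # hypothetical schema file
        drop_all_cols=True,
    )
)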


@classmethod
def to_json( cls, in_cols: List[str], out_col: str, json_options: Optional[dict] = None) -> Callable:

Convert dataframe columns into a json value.

Arguments:
  • in_cols: name(s) of the input column(s). Example values: "*" - all columns; "my_col" - one column named "my_col"; "my_col1, my_col2" - two columns.
  • out_col: name of the output column.
  • json_options: options to parse the json value.
Returns:

A function to be called in .transform() spark function.

Example:

{
    "function": "to_json",
    "args": {
        "in_cols": [
            "item",
            "amount"
        ],
        "out_col": "item_amount_json"
    }
}
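The same call expressed directly in Python (a minimal sketch; the column names match the example above):

from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

json_df = df.transform(
    ColumnReshapers.to_json(in_cols=["item", "amount"], out_col="item_amount_json")
)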