Minimal Example¶
This scenario illustrates the minimal configuration you can have to use dq_specs, using the required parameters: spec_id, input_id, dq_type, bucket and dq_functions.
Regarding the dq_functions, it uses 3 functions (retrieved from the expectations supported by GX), which check:
- expect_column_to_exist - if a column exists in the data;
- expect_table_row_count_to_be_between - if the row count of the data is between the defined interval;
- expect_table_column_count_to_be_between - if the number of columns in the data does not exceed the defined maximum value.
from lakehouse_engine.engine import load_data

# ACON (algorithm configuration) describing the whole load:
# read a CSV input from S3 -> validate it with dq_specs -> write the
# validated data to a bronze delta location.
acon = {
# Source definition: batch read of CSV files with a header row,
# '|' as the field delimiter and schema inference enabled.
"input_specs": [
{
"spec_id": "dummy_deliveries_source",
"read_type": "batch",
"data_format": "csv",
"options": {
"header": True,
"delimiter": "|",
"inferSchema": True,
},
"location": "s3://my_data_product_bucket/dummy_deliveries/",
}
],
# Data quality validation applied to the input above, using the three
# GX expectations described in the prose (column existence, row count
# range, max column count).
"dq_specs": [
{
"spec_id": "dq_validator",
"input_id": "dummy_deliveries_source",  # consumes the input spec above
"dq_type": "validator",
"bucket": "my_data_product_bucket",  # presumably where DQ results are stored — TODO confirm
# NOTE(review): tbl_to_derive_pk is not in the required-parameter
# list given in the prose above; presumably it names the table used
# to derive a primary key for the DQ process — confirm against the
# dq_specs documentation.
"tbl_to_derive_pk": "my_database.dummy_deliveries",
"dq_functions": [
# Column "salesorder" must exist in the data.
{"function": "expect_column_to_exist", "args": {"column": "salesorder"}},
# Row count must fall within the [15, 25] interval.
{"function": "expect_table_row_count_to_be_between", "args": {"min_value": 15, "max_value": 25}},
# Number of columns must not exceed 7.
{"function": "expect_table_column_count_to_be_between", "args": {"max_value": 7}},
],
}
],
# Sink definition: overwrite the bronze delta location with the data
# that passed through the DQ validator.
"output_specs": [
{
"spec_id": "dummy_deliveries_bronze",
"input_id": "dq_validator",  # consumes the dq spec's output
"write_type": "overwrite",
"data_format": "delta",
"location": "s3://my_data_product_bucket/bronze/dummy_deliveries_dq_template/",
}
],
}

# Execute the load described by the ACON.
load_data(acon=acon)