From 17b30730bfdcf1a80fd98ea6e801ee8d3cb2a7dd Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Tue, 4 Feb 2025 11:28:45 +0100 Subject: [PATCH 1/4] chore: added test fixture Signed-off-by: Luka Peschke --- .../split/followed_by_rename_pypika.yaml | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 server/tests/backends/fixtures/split/followed_by_rename_pypika.yaml diff --git a/server/tests/backends/fixtures/split/followed_by_rename_pypika.yaml b/server/tests/backends/fixtures/split/followed_by_rename_pypika.yaml new file mode 100644 index 0000000000..f68b33e965 --- /dev/null +++ b/server/tests/backends/fixtures/split/followed_by_rename_pypika.yaml @@ -0,0 +1,53 @@ +exclude: + - mongo + - pandas + - snowflake +step: + pipeline: + - name: convert + columns: + - brewing_date + dataType: text + - name: split + column: brewing_date + delimiter: '-' + numberColsToKeep: 2 + - name: rename + toRename: + - - brewing_date_1 + - renamed_column_1 + - - brewing_date_2 + - renamed_column_2 + - name: select + columns: + - renamed_column_1 + - renamed_column_2 +expected: + schema: + pandas_version: 1.5.0 + fields: + - name: renamed_column_1 + type: string + - name: renamed_column_2 + type: string + data: + - renamed_column_1: '2022' + renamed_column_2: '01' + - renamed_column_1: '2022' + renamed_column_2: '01' + - renamed_column_1: '2022' + renamed_column_2: '01' + - renamed_column_1: '2022' + renamed_column_2: '01' + - renamed_column_1: '2022' + renamed_column_2: '01' + - renamed_column_1: '2022' + renamed_column_2: '01' + - renamed_column_1: '2022' + renamed_column_2: '01' + - renamed_column_1: '2022' + renamed_column_2: '01' + - renamed_column_1: '2022' + renamed_column_2: '01' + - renamed_column_1: '2022' + renamed_column_2: '01' From 577b97ea6ad07085132f3f5c5345e9cc81a9c8be Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Tue, 4 Feb 2025 11:53:46 +0100 Subject: [PATCH 2/4] fix(server/pypika): fix aliases generated for split step [TCTC-10029] 
Signed-off-by: Luka Peschke --- .../translators/googlebigquery.py | 10 ++++---- .../test_sql_bigquery_translator_steps.py | 23 ------------------- server/uv.lock | 2 +- 3 files changed, 7 insertions(+), 28 deletions(-) diff --git a/server/src/weaverbird/backends/pypika_translator/translators/googlebigquery.py b/server/src/weaverbird/backends/pypika_translator/translators/googlebigquery.py index aa16d76f34..b43a24be45 100644 --- a/server/src/weaverbird/backends/pypika_translator/translators/googlebigquery.py +++ b/server/src/weaverbird/backends/pypika_translator/translators/googlebigquery.py @@ -142,6 +142,8 @@ def split( safe_offset = CustomFunction("SAFE_OFFSET", ["index"]) + splitted_cols_aliases = [] + # Sub-optimal, could do that in two sub_queries, one for splitting to a temp array col, and # another one to select eveything needed from the array col rather than splitting N times def gen_splitted_cols(): @@ -155,13 +157,13 @@ def gen_splitted_cols(): # # The IfNull is required because other backends use SPLIT_PART, which will return an # empty string rather than NULL - yield functions.IfNull(LiteralValue(f"{split_str}[{safe_offset_str}]"), "").as_( - f"{step.column}_{i + 1}" - ) + splitted_col_alias = f"{step.column}_{i + 1}" + splitted_cols_aliases.append(splitted_col_alias) + yield functions.IfNull(LiteralValue(f"{split_str}[{safe_offset_str}]"), "").as_(splitted_col_alias) splitted_cols = list(gen_splitted_cols()) query: QueryBuilder = prev_step_table.select(*columns, *splitted_cols) - return StepContext(query, columns + splitted_cols) + return StepContext(query, columns + splitted_cols_aliases) @classmethod def _date_trunc(cls, date_part: str, target_column: Field) -> Term: diff --git a/server/tests/backends/sql_translator_integration_tests/test_sql_bigquery_translator_steps.py b/server/tests/backends/sql_translator_integration_tests/test_sql_bigquery_translator_steps.py index 5c12ca21e5..5119701a95 100644 --- 
a/server/tests/backends/sql_translator_integration_tests/test_sql_bigquery_translator_steps.py +++ b/server/tests/backends/sql_translator_integration_tests/test_sql_bigquery_translator_steps.py @@ -1,26 +1,3 @@ -""" -BigQuery free DBs have tables that expire after 60 days. -If the table "beers.beers_tiny" is expired, re-create it: -- open the BigQuery console https://console.cloud.google.com/bigquery?project=biquery-integration-tests&ws=!1m4!1m3!3m2!1sbiquery-integration-tests!2sbeers -- use "create table", choose "Upload" and use the `beers-bigquery.csv` file available [here](https://github.com/ToucanToco/weaverbird/pull/1835#issuecomment-1647810149) - - name the table "beers" and check "Edit text" for the schema - - fill the schema with: - ``` - price_per_l:FLOAT, - alcohol_degree:FLOAT, - name:STRING, - cost:FLOAT, - beer_kind:STRING, - volume_ml:FLOAT, - brewing_date:DATE, - nullable_name:STRING - ``` -- run the query: -`` -`CREATE TABLE `beers.beers_tiny` AS SELECT * FROM `beers.beers` ORDER BY brewing_date LIMIT 10 -``` -""" - import json from io import StringIO from os import environ diff --git a/server/uv.lock b/server/uv.lock index 5197df6fb5..a91d19350f 100644 --- a/server/uv.lock +++ b/server/uv.lock @@ -2372,7 +2372,7 @@ wheels = [ [[package]] name = "weaverbird" -version = "0.49.0" +version = "0.50.0" source = { editable = "." 
} dependencies = [ { name = "pydantic" }, From a84d920708fe3c97bff741d836a7cb5ecdedb664 Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Tue, 4 Feb 2025 12:06:30 +0100 Subject: [PATCH 3/4] test(server/pypika): added fixture for TCTC-10030 as well Signed-off-by: Luka Peschke --- .../followed_by_replace_and_text_pypika.yaml | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 server/tests/backends/fixtures/split/followed_by_replace_and_text_pypika.yaml diff --git a/server/tests/backends/fixtures/split/followed_by_replace_and_text_pypika.yaml b/server/tests/backends/fixtures/split/followed_by_replace_and_text_pypika.yaml new file mode 100644 index 0000000000..f62cce91f2 --- /dev/null +++ b/server/tests/backends/fixtures/split/followed_by_replace_and_text_pypika.yaml @@ -0,0 +1,55 @@ +exclude: + - mongo + - pandas + - snowflake +step: + pipeline: + - name: convert + columns: + - brewing_date + dataType: text + - name: split + column: brewing_date + delimiter: '-' + numberColsToKeep: 2 + - name: replace + searchColumn: brewing_date_2 + toReplace: + - - '01' + - 'HELLO' + - name: text + newColumn: 'bye' + text: 'bye' + - name: select + columns: + - brewing_date_2 + - bye +expected: + schema: + pandas_version: 1.5.0 + fields: + - name: brewing_date_2 + type: string + - name: bye + type: string + data: + - brewing_date_2: 'HELLO' + bye: 'bye' + - brewing_date_2: 'HELLO' + bye: 'bye' + - brewing_date_2: 'HELLO' + bye: 'bye' + - brewing_date_2: 'HELLO' + bye: 'bye' + - brewing_date_2: 'HELLO' + bye: 'bye' + - brewing_date_2: 'HELLO' + bye: 'bye' + - brewing_date_2: 'HELLO' + bye: 'bye' + - brewing_date_2: 'HELLO' + bye: 'bye' + - brewing_date_2: 'HELLO' + bye: 'bye' + - brewing_date_2: 'HELLO' + bye: 'bye' From bace95e0ae051d3221365bdfc7671cccf4a13d5b Mon Sep 17 00:00:00 2001 From: Luka Peschke Date: Tue, 4 Feb 2025 12:11:30 +0100 Subject: [PATCH 4/4] docs: update changelog Signed-off-by: Luka Peschke --- server/CHANGELOG.md | 5 ++++- 1 file changed, 
4 insertions(+), 1 deletion(-) diff --git a/server/CHANGELOG.md b/server/CHANGELOG.md index f3ac1e0a3c..bfbf0ac618 100644 --- a/server/CHANGELOG.md +++ b/server/CHANGELOG.md @@ -1,8 +1,11 @@ # Changelog (weaverbird python package) - ## Unreleased +### Fixed + +- Pypika: The split step can now be followed by other steps for Google BigQuery + ## [0.50.0] - 2025-02-03 ### Changed