From 0d9acc17750909af4c834c3b1cc56670dcac8d0f Mon Sep 17 00:00:00 2001 From: Jim Crist-Harif Date: Mon, 9 Sep 2024 16:49:12 -0500 Subject: [PATCH] depr(api): deprecate filtering/expression projection in `Table.__getitem__` --- docs/how-to/extending/builtin.qmd | 2 +- docs/tutorials/ibis-for-pandas-users.qmd | 36 ++- docs/tutorials/ibis-for-sql-users.qmd | 64 +++--- .../apache-flink/1_single_feature.qmd | 4 +- .../bigquery/tests/system/test_client.py | 6 +- .../bigquery/tests/unit/test_compiler.py | 12 +- .../clickhouse/tests/test_aggregations.py | 2 +- ibis/backends/clickhouse/tests/test_client.py | 2 +- .../clickhouse/tests/test_functions.py | 2 +- ibis/backends/clickhouse/tests/test_select.py | 26 +-- ibis/backends/dask/tests/test_arrays.py | 4 +- ibis/backends/dask/tests/test_join.py | 36 +-- ibis/backends/dask/tests/test_operations.py | 16 +- ibis/backends/dask/tests/test_window.py | 10 +- ibis/backends/flink/tests/test_compiler.py | 4 +- .../impala/tests/test_bucket_histogram.py | 2 +- ibis/backends/impala/tests/test_client.py | 8 +- ibis/backends/impala/tests/test_ddl.py | 14 +- .../impala/tests/test_ddl_compilation.py | 2 +- ibis/backends/impala/tests/test_exprs.py | 34 +-- ibis/backends/impala/tests/test_in_not_in.py | 2 +- ibis/backends/impala/tests/test_partition.py | 6 +- ibis/backends/impala/tests/test_sql.py | 53 ++--- .../backends/impala/tests/test_value_exprs.py | 8 +- ibis/backends/impala/tests/test_window.py | 6 +- ibis/backends/mssql/tests/test_client.py | 2 +- ibis/backends/pandas/tests/test_arrays.py | 4 +- ibis/backends/pandas/tests/test_join.py | 48 ++-- ibis/backends/pandas/tests/test_operations.py | 14 +- ibis/backends/pandas/tests/test_window.py | 10 +- .../backends/postgres/tests/test_functions.py | 6 +- .../postgres/tests/test_geospatial.py | 12 +- ibis/backends/postgres/tests/test_json.py | 2 +- ibis/backends/postgres/tests/test_postgis.py | 2 +- ibis/backends/postgres/tests/test_string.py | 2 +- ibis/backends/postgres/tests/test_udf.py | 8 +- ibis/backends/pyspark/tests/test_array.py | 2 +- ibis/backends/pyspark/tests/test_ddl.py | 8 +- ibis/backends/pyspark/tests/test_null.py | 4 +- .../risingwave/tests/test_functions.py | 4 +- ibis/backends/risingwave/tests/test_json.py | 2 +- ibis/backends/tests/test_aggregation.py | 2 +- ibis/backends/tests/test_client.py | 2 +- ibis/backends/tests/test_generic.py | 37 ++-- ibis/backends/tests/test_interactive.py | 2 +- ibis/backends/tests/test_join.py | 26 +-- ibis/backends/tests/test_sql.py | 2 +- ibis/backends/tests/test_struct.py | 2 +- ibis/backends/tests/test_temporal.py | 6 +- ibis/backends/tests/test_window.py | 4 +- ibis/backends/tests/tpc/ds/test_queries.py | 11 +- ibis/backends/tests/tpc/h/test_queries.py | 28 +-- ibis/expr/tests/test_format.py | 18 +- ibis/expr/tests/test_newrels.py | 14 +- ibis/expr/tests/test_visualize.py | 8 +- ibis/expr/types/relations.py | 208 ++++-------------- ibis/tests/benchmarks/test_benchmarks.py | 10 +- ibis/tests/expr/test_analysis.py | 28 +-- ibis/tests/expr/test_analytics.py | 4 +- ibis/tests/expr/test_case.py | 2 +- ibis/tests/expr/test_format_sql_operations.py | 4 +- ibis/tests/expr/test_struct.py | 6 +- ibis/tests/expr/test_table.py | 163 +++++--------- ibis/tests/expr/test_value_exprs.py | 10 +- ibis/tests/expr/test_window_frames.py | 6 +- 65 files changed, 454 insertions(+), 640 deletions(-) diff --git a/docs/how-to/extending/builtin.qmd b/docs/how-to/extending/builtin.qmd index 3d3e0f9578b90..0fbb488a8bd24 100644 --- a/docs/how-to/extending/builtin.qmd +++ 
b/docs/how-to/extending/builtin.qmd
@@ -79,7 +79,7 @@ rest of the library:
 pkgs = ibis.read_parquet(
     "https://storage.googleapis.com/ibis-tutorial-data/pypi/2024-04-24/packages.parquet"
 )
-pandas_ish = pkgs[jw_sim(pkgs.name, "pandas") >= 0.9]
+pandas_ish = pkgs.filter(jw_sim(pkgs.name, "pandas") >= 0.9)
 pandas_ish
 ```

diff --git a/docs/tutorials/ibis-for-pandas-users.qmd b/docs/tutorials/ibis-for-pandas-users.qmd
index e0fc2f5908e54..8001f45386f15 100644
--- a/docs/tutorials/ibis-for-pandas-users.qmd
+++ b/docs/tutorials/ibis-for-pandas-users.qmd
@@ -126,13 +126,6 @@ Selecting columns is very similar to in pandas. In fact, you can use the same sy
 t[["one", "two"]]
 ```

-However, since row-level indexing is not supported in Ibis, the inner list is not necessary.
-
-
-```{python}
-t["one", "two"]
-```
-
 ## Selecting columns

 Selecting columns is done using the same syntax as in pandas `DataFrames`. You can use either
@@ -192,11 +185,11 @@ new_col = unnamed.name("new_col")
 new_col
 ```

-You can then add this column to the table using a projection.
+
+You can then add this column to the table using `mutate`:

 ```{python}
-proj = t["one", "two", new_col]
+proj = t.mutate(new_col)
 proj
 ```
@@ -301,10 +294,9 @@ penguins.limit(5)
 ### Filtering rows

 In addition to limiting the number of rows that are returned, it is possible to
-filter the rows using expressions. Expressions are constructed very similarly to
-the way they are in pandas. Ibis expressions are constructed from operations on
-columns in a table which return a boolean result. This result is then used to
-filter the table.
+filter the rows using expressions. This is done using the `filter` method in
+Ibis. Ibis expressions are constructed from operations on columns in a table
+which return a boolean result. This result is then used to filter the table.

 ```{python}
@@ -324,7 +316,7 @@ get 6 rows back.

 ```{python}
-filtered = penguins[expr]
+filtered = penguins.filter(expr)
 filtered
 ```

 Of course, the filtering expression can be applied inline as well.

 ```{python}
-filtered = penguins[penguins.bill_length_mm > 37.0]
+filtered = penguins.filter(penguins.bill_length_mm > 37.0)
 filtered
 ```

-Multiple filtering expressions can be combined into a single expression or chained onto existing
-table expressions.
+Multiple filtering expressions may be passed to a single call (filtering
+only rows where they're all true), or combined using the common boolean
+operators `&` and `|`. The expressions below are equivalent:

 ```{python}
-filtered = penguins[(penguins.bill_length_mm > 37.0) & (penguins.bill_depth_mm > 18.0)]
+filtered = penguins.filter(penguins.bill_length_mm > 37.0, penguins.bill_depth_mm > 18.0)
 filtered
 ```

-The code above will return the same rows as the code below.
-
-
 ```{python}
-filtered = penguins[penguins.bill_length_mm > 37.0][penguins.bill_depth_mm > 18.0]
+filtered = penguins.filter((penguins.bill_length_mm > 37.0) & (penguins.bill_depth_mm > 18.0))
 filtered
 ```
@@ -359,7 +349,7 @@ is greater than the mean.
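(Here `penguins.bill_length_mm.mean()` is a scalar aggregate expression; Ibis computes it for you, typically as a scalar subquery on SQL backends, so each row is compared against the mean over the whole table.)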
```{python} -filtered = penguins[penguins.bill_length_mm > penguins.bill_length_mm.mean()] +filtered = penguins.filter(penguins.bill_length_mm > penguins.bill_length_mm.mean()) filtered ``` diff --git a/docs/tutorials/ibis-for-sql-users.qmd b/docs/tutorials/ibis-for-sql-users.qmd index 534090bfce649..cbb9b4974d706 100644 --- a/docs/tutorials/ibis-for-sql-users.qmd +++ b/docs/tutorials/ibis-for-sql-users.qmd @@ -46,12 +46,6 @@ FROM my_data In Ibis, this is -```{python} -proj = t["two", "one"] -``` - -or - ```{python} proj = t.select("two", "one") ``` @@ -78,7 +72,7 @@ new_col = (t.three * 2).name("new_col") Now, we have: ```{python} -proj = t["two", "one", new_col] +proj = t.select("two", "one", new_col) ibis.to_sql(proj) ``` @@ -113,7 +107,7 @@ select all columns in a table using the `SELECT *` construct. To do this, use the table expression itself in a projection: ```{python} -proj = t[t] +proj = t.select(t) ibis.to_sql(proj) ``` @@ -121,7 +115,7 @@ This is how `mutate` is implemented. The example above `t.mutate(new_col=t.three * 2)` can be written as a normal projection: ```{python} -proj = t[t, new_col] +proj = t.select(t, new_col) ibis.to_sql(proj) ``` @@ -144,7 +138,7 @@ To write this with Ibis, it is: ```{python} diff = (t.two - t2.value).name("diff") -joined = t.join(t2, t.one == t2.key)[t, diff] +joined = t.join(t2, t.one == t2.key).select(t, diff) ``` And verify the generated SQL: @@ -188,19 +182,18 @@ ibis.to_sql(expr) ## Filtering / `WHERE` -You can add filter clauses to a table expression either by indexing with -`[]` (similar to pandas) or use the `filter` method: +You can add filter clauses to a table expression by using the `filter` method: ```{python} -filtered = t[t.two > 0] +filtered = t.filter(t.two > 0) ibis.to_sql(filtered) ``` -`filter` can take a list of expressions, which must all be satisfied for +`filter` can take multiple expressions, which must all be satisfied for a row to appear in the result: ```{python} -filtered = t.filter([t.two > 0, t.one.isin(["A", "B"])]) +filtered = t.filter(t.two > 0, t.one.isin(["A", "B"])) ibis.to_sql(filtered) ``` @@ -209,7 +202,7 @@ To compose boolean expressions with `AND` or `OR`, use the respective ```{python} cond = (t.two < 0) | ((t.two > 0) | t.one.isin(["A", "B"])) -filtered = t[cond] +filtered = t.filter(cond) ibis.to_sql(filtered) ``` @@ -617,7 +610,7 @@ ibis.to_sql(expr) ```{python} agged = ( - expr[expr.one.notnull()] + expr.filter(expr.one.notnull()) .group_by("is_valid") .aggregate(three_count=lambda t: t.three.notnull().sum()) ) @@ -632,7 +625,7 @@ keyword. 
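As a minimal sketch (reusing the `t` table defined earlier in this tutorial), a bare range check looks like:

```{python}
# keep only rows whose `two` column falls in [10, 50]
expr = t.filter(t.two.between(10, 50))
ibis.to_sql(expr)
```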
The result of `between` is boolean and can be used with any other boolean expression: ```{python} -expr = t[t.two.between(10, 50) & t.one.notnull()] +expr = t.filter(t.two.between(10, 50) & t.one.notnull()) ibis.to_sql(expr) ``` @@ -684,7 +677,7 @@ After one or more joins, you can reference any of the joined tables in a projection immediately after: ```{python} -expr = joined[t1, t2.value2] +expr = joined.select(t1, t2.value2) ibis.to_sql(expr) ``` @@ -692,7 +685,7 @@ If you need to compute an expression that involves both tables, you can do that also: ```{python} -expr = joined[t1.key1, (t1.value1 - t2.value2).name("diff")] +expr = joined.select(t1.key1, (t1.value1 - t2.value2).name("diff")) ibis.to_sql(expr) ``` @@ -800,7 +793,7 @@ In these case, we can specify a list of common join keys: ```{python} joined = t4.join(t5, ["key1", "key2", "key3"]) -expr = joined[t4, t5.value2] +expr = joined.select(t4, t5.value2) ibis.to_sql(expr) ``` @@ -808,7 +801,7 @@ You can mix the overlapping key names with other expressions: ```{python} joined = t4.join(t5, ["key1", "key2", t4.key3.left(4) == t5.key3.left(4)]) -expr = joined[t4, t5.value2] +expr = joined.select(t4, t5.value2) ibis.to_sql(expr) ``` @@ -885,7 +878,7 @@ cond = (events.user_id == purchases.user_id).any() This can now be used to filter `events`: ```{python} -expr = events[cond] +expr = events.filter(cond) ibis.to_sql(expr) ``` @@ -893,7 +886,7 @@ If you negate the condition, it will instead give you only event data from user *that have not made a purchase*: ```{python} -expr = events[-cond] +expr = events.filter(-cond) ibis.to_sql(expr) ``` @@ -916,7 +909,7 @@ you can write with Ibis: ```{python} cond = events.user_id.isin(purchases.user_id) -expr = events[cond] +expr = events.filter(cond) ibis.to_sql(expr) ``` @@ -941,7 +934,7 @@ WHERE value1 > ( With Ibis, the code is simpler and more pandas-like: ```{python} -expr = t1[t1.value1 > t2.value2.max()] +expr = t1.filter(t1.value1 > t2.value2.max()) ibis.to_sql(expr) ``` @@ -968,8 +961,8 @@ With Ibis, the code is similar, but you add the correlated filter to the average statistic: ```{python} -stat = t2[t1.key1 == t2.key3].value2.mean() -expr = t1[t1.value1 > stat] +stat = t2.filter(t1.key1 == t2.key3).value2.mean() +expr = t1.filter(t1.value1 > stat) ibis.to_sql(expr) ``` @@ -1118,7 +1111,7 @@ Ibis provides a `row_number()` function that allows you to do this: expr = purchases.mutate( row_number=ibis.row_number().over(group_by=[_.user_id], order_by=_.price) ) -expr = expr[_.row_number < 3] +expr = expr.filter(_.row_number < 3) ``` The output of this is a table with the three most expensive items that each user has purchased @@ -1149,7 +1142,7 @@ Ibis has a set of interval APIs that allow you to do date/time arithmetic. 
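Intervals are created with `ibis.interval` (e.g. `ibis.interval(years=1)`) and can be added to or subtracted from timestamp expressions such as `ibis.now()` or a timestamp column.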
For example: ```{python} -expr = events[events.ts > (ibis.now() - ibis.interval(years=1))] +expr = events.filter(events.ts > (ibis.now() - ibis.interval(years=1))) ibis.to_sql(expr) ``` @@ -1214,12 +1207,13 @@ purchases = ibis.table( metric = purchases.amount.sum().name("total") agged = purchases.group_by(["region", "kind"]).aggregate(metric) -left = agged[agged.kind == "foo"] -right = agged[agged.kind == "bar"] +left = agged.filter(agged.kind == "foo") +right = agged.filter(agged.kind == "bar") -result = left.join(right, left.region == right.region)[ - left.region, (left.total - right.total).name("diff") -] +result = ( + left.join(right, left.region == right.region) + .select(left.region, (left.total - right.total).name("diff")) +) ``` Ibis automatically creates a CTE for `agged`: diff --git a/docs/tutorials/open-source-software/apache-flink/1_single_feature.qmd b/docs/tutorials/open-source-software/apache-flink/1_single_feature.qmd index 86747c5d8e009..4fef3e407f6a0 100644 --- a/docs/tutorials/open-source-software/apache-flink/1_single_feature.qmd +++ b/docs/tutorials/open-source-software/apache-flink/1_single_feature.qmd @@ -184,7 +184,7 @@ transaction count over the past five hours may be useful features. Let’s write out each of these using Ibis API: ```{python} -user_trans_amt_last_360m_agg = source_table[ +user_trans_amt_last_360m_agg = source_table.select( source_table.user_id, # Calculate the average transaction amount over the past six hours source_table.amt.mean() @@ -207,7 +207,7 @@ user_trans_amt_last_360m_agg = source_table[ ) .name("user_trans_count_last_360min"), source_table.trans_date_trans_time, -] +) ``` `over()` creates an [over diff --git a/ibis/backends/bigquery/tests/system/test_client.py b/ibis/backends/bigquery/tests/system/test_client.py index c31a33bd86942..0ae05872a52ef 100644 --- a/ibis/backends/bigquery/tests/system/test_client.py +++ b/ibis/backends/bigquery/tests/system/test_client.py @@ -186,7 +186,7 @@ def test_scalar_param_partition_time(parted_alltypes): assert "PARTITIONTIME" in parted_alltypes.columns assert "PARTITIONTIME" in parted_alltypes.schema() param = ibis.param("timestamp('UTC')") - expr = parted_alltypes[param > parted_alltypes.PARTITIONTIME] + expr = parted_alltypes.filter(param > parted_alltypes.PARTITIONTIME) df = expr.execute(params={param: "2017-01-01"}) assert df.empty @@ -201,7 +201,7 @@ def test_parted_column(con, kind): def test_cross_project_query(public): table = public.table("posts_questions") - expr = table[table.tags.contains("ibis")][["title", "tags"]] + expr = table.filter(table.tags.contains("ibis"))[["title", "tags"]] n = 5 df = expr.limit(n).execute() assert len(df) == n @@ -231,7 +231,7 @@ def test_multiple_project_queries_execute(con): trips = con.table("trips", database="nyc-tlc.yellow").limit(5) predicate = posts_questions.tags == trips.rate_code cols = [posts_questions.title] - join = posts_questions.left_join(trips, predicate)[cols] + join = posts_questions.left_join(trips, predicate).select(cols) result = join.execute() assert list(result.columns) == ["title"] assert len(result) == 5 diff --git a/ibis/backends/bigquery/tests/unit/test_compiler.py b/ibis/backends/bigquery/tests/unit/test_compiler.py index e058cd214c864..660ee779d800e 100644 --- a/ibis/backends/bigquery/tests/unit/test_compiler.py +++ b/ibis/backends/bigquery/tests/unit/test_compiler.py @@ -151,11 +151,11 @@ def test_projection_fusion_only_peeks_at_immediate_parent(snapshot): ("val", "int64"), ] table = ibis.table(schema, name="unbound_table") - table 
= table[table.PARTITIONTIME < ibis.date("2017-01-01")] + table = table.filter(table.PARTITIONTIME < ibis.date("2017-01-01")) table = table.mutate(file_date=table.file_date.cast("date")) - table = table[table.file_date < ibis.date("2017-01-01")] + table = table.filter(table.file_date < ibis.date("2017-01-01")) table = table.mutate(XYZ=table.val * 2) - expr = table.join(table.view())[table] + expr = table.join(table.view()).select(table) snapshot.assert_match(to_sql(expr), "out.sql") @@ -276,7 +276,7 @@ class MockBackend(ibis.backends.bigquery.Backend): for _ in range(num_joins): # noqa: F402 table = table.mutate(dummy=ibis.literal("")) table_ = table.view() - table = table.left_join(table_, ["dummy"])[[table_]] + table = table.left_join(table_, ["dummy"]).select(table_) start = time.time() table.compile() @@ -417,9 +417,9 @@ def test_divide_by_zero(alltypes, op, snapshot): def test_identical_to(alltypes, snapshot): - expr = alltypes[ + expr = alltypes.filter( _.string_col.identical_to("a") & _.date_string_col.identical_to("b") - ] + ) snapshot.assert_match(to_sql(expr), "out.sql") diff --git a/ibis/backends/clickhouse/tests/test_aggregations.py b/ibis/backends/clickhouse/tests/test_aggregations.py index 6f376d263bab0..c9e5bb38c9ad5 100644 --- a/ibis/backends/clickhouse/tests/test_aggregations.py +++ b/ibis/backends/clickhouse/tests/test_aggregations.py @@ -163,7 +163,7 @@ def test_boolean_reduction(alltypes, op, df): def test_anonymous_aggregate(alltypes, df): t = alltypes - expr = t[t.double_col > t.double_col.mean()] + expr = t.filter(t.double_col > t.double_col.mean()) result = expr.execute().set_index("id") expected = df[df.double_col > df.double_col.mean()].set_index("id") tm.assert_frame_equal(result, expected, check_like=True) diff --git a/ibis/backends/clickhouse/tests/test_client.py b/ibis/backends/clickhouse/tests/test_client.py index 311889aea0917..270b305f3aebf 100644 --- a/ibis/backends/clickhouse/tests/test_client.py +++ b/ibis/backends/clickhouse/tests/test_client.py @@ -129,7 +129,7 @@ def test_sql_query_limits(alltypes): def test_embedded_identifier_quoting(alltypes): t = alltypes - expr = t[[(t.double_col * 2).name("double(fun)")]]["double(fun)"].sum() + expr = t.select((t.double_col * 2).name("double(fun)"))["double(fun)"].sum() expr.execute() diff --git a/ibis/backends/clickhouse/tests/test_functions.py b/ibis/backends/clickhouse/tests/test_functions.py index feb945d59b909..6a8d185e4d345 100644 --- a/ibis/backends/clickhouse/tests/test_functions.py +++ b/ibis/backends/clickhouse/tests/test_functions.py @@ -476,7 +476,7 @@ def my_add(a: int, b: int) -> int: ... 
n = 5 expr = ( - alltypes[alltypes.int_col == 1] + alltypes.filter(alltypes.int_col == 1) .limit(n) .int_col.collect() .map(lambda x: my_add(x, 1)) diff --git a/ibis/backends/clickhouse/tests/test_select.py b/ibis/backends/clickhouse/tests/test_select.py index 3087b15bbdeb2..364284191ab00 100644 --- a/ibis/backends/clickhouse/tests/test_select.py +++ b/ibis/backends/clickhouse/tests/test_select.py @@ -38,23 +38,23 @@ def time_right(con): def test_timestamp_extract_field(alltypes, assert_sql): t = alltypes.timestamp_col - expr = alltypes[ + expr = alltypes.select( t.year().name("year"), t.month().name("month"), t.day().name("day"), t.hour().name("hour"), t.minute().name("minute"), t.second().name("second"), - ] + ) assert_sql(expr) def test_isin_notin_in_select(alltypes, assert_sql): values = ["foo", "bar"] - filtered = alltypes[alltypes.string_col.isin(values)] + filtered = alltypes.filter(alltypes.string_col.isin(values)) assert_sql(filtered, "out1.sql") - filtered = alltypes[alltypes.string_col.notin(values)] + filtered = alltypes.filter(alltypes.string_col.notin(values)) assert_sql(filtered, "out2.sql") @@ -100,7 +100,7 @@ def test_simple_scalar_aggregates(alltypes, assert_sql): # Things like table.column.{sum, mean, ...}() table = alltypes - expr = table[table.int_col > 0].float_col.sum() + expr = table.filter(table.int_col > 0).float_col.sum() assert_sql(expr) @@ -152,7 +152,7 @@ def test_simple_scalar_aggregates(alltypes, assert_sql): def test_table_column_unbox(alltypes, assert_sql): m = alltypes.float_col.sum().name("total") - agged = alltypes[alltypes.int_col > 0].group_by("string_col").aggregate([m]) + agged = alltypes.filter(alltypes.int_col > 0).group_by("string_col").aggregate([m]) expr = agged.string_col assert_sql(expr) @@ -213,7 +213,7 @@ def test_simple_joins( ): t1, t2 = batting, awards_players pred = [t1[left_key] == t2[right_key]] - expr = getattr(t1, join_type)(t2, pred)[[t1]] + expr = getattr(t1, join_type)(t2, pred).select(t1) assert_sql(expr) @@ -226,7 +226,7 @@ def test_self_reference_simple(con, alltypes, assert_sql): def test_join_self_reference(con, alltypes, assert_sql): t1 = alltypes t2 = t1.view() - expr = t1.inner_join(t2, ["id"])[[t1]] + expr = t1.inner_join(t2, ["id"]).select(t1) assert_sql(expr) assert len(con.execute(expr)) @@ -261,7 +261,7 @@ def test_filter_predicates(diamonds): expr = diamonds for pred in predicates: - expr = expr[pred(expr)].select(expr) + expr = expr.filter(pred(expr)).select(expr) expr.execute() @@ -305,9 +305,9 @@ def test_join_with_external_table_errors(alltypes): ) alltypes = alltypes.mutate(b=alltypes.tinyint_col) - expr = alltypes.inner_join(external_table, ["b"])[ + expr = alltypes.inner_join(external_table, ["b"]).select( external_table.a, external_table.c, alltypes.id - ] + ) with pytest.raises(cc.driver.exceptions.DatabaseError): expr.execute() @@ -328,9 +328,9 @@ def test_join_with_external_table(alltypes, df): ) alltypes = alltypes.mutate(b=alltypes.tinyint_col) - expr = alltypes.inner_join(external_table, ["b"])[ + expr = alltypes.inner_join(external_table, ["b"]).select( external_table.a, external_table.c, alltypes.id - ] + ) result = expr.execute(external_tables={"external": external_df}) expected = df.assign(b=df.tinyint_col).merge(external_df, on="b")[["a", "c", "id"]] diff --git a/ibis/backends/dask/tests/test_arrays.py b/ibis/backends/dask/tests/test_arrays.py index 107cca5fedaa6..39d89ee7a1e05 100644 --- a/ibis/backends/dask/tests/test_arrays.py +++ b/ibis/backends/dask/tests/test_arrays.py @@ -59,7 +59,7 @@ def 
test_array_collect(t, df): def test_array_collect_rolling_partitioned(t, df): window = ibis.trailing_window(1, order_by=t.plain_int64) colexpr = t.plain_float64.collect().over(window) - expr = t["dup_strings", "plain_int64", colexpr.name("collected")] + expr = t.select("dup_strings", "plain_int64", colexpr.name("collected")) result = expr.compile() expected = dd.from_pandas( pd.DataFrame( @@ -134,7 +134,7 @@ def test_array_slice_scalar(client, start, stop): [1, 3, 4, 11, -11], ) def test_array_index(t, df, index): - expr = t[t.array_of_float64[index].name("indexed")] + expr = t.select(t.array_of_float64[index].name("indexed")) result = expr.execute() expected = pd.DataFrame( { diff --git a/ibis/backends/dask/tests/test_join.py b/ibis/backends/dask/tests/test_join.py index 75b1235d5182a..9614c00fd598b 100644 --- a/ibis/backends/dask/tests/test_join.py +++ b/ibis/backends/dask/tests/test_join.py @@ -30,9 +30,9 @@ @join_type def test_join(how, left, right, df1, df2): - expr = left.join(right, left.key == right.key, how=how)[ + expr = left.join(right, left.key == right.key, how=how).select( left, right.other_value, right.key3 - ] + ) result = expr.compile() expected = dd.merge(df1, df2, how=how, on="key") tm.assert_frame_equal( @@ -43,7 +43,7 @@ def test_join(how, left, right, df1, df2): @join_type def test_join_project_left_table(how, left, right, df1, df2): - expr = left.join(right, left.key == right.key, how=how)[left, right.key3] + expr = left.join(right, left.key == right.key, how=how).select(left, right.key3) result = expr.compile() expected = dd.merge(df1, df2, how=how, on="key")[list(left.columns) + ["key3"]] tm.assert_frame_equal( @@ -81,7 +81,7 @@ def test_join_with_duplicate_non_key_columns(how, left, right, df1, df2): @join_type def test_join_with_post_expression_selection(how, left, right, df1, df2): join = left.join(right, left.key == right.key, how=how) - expr = join[left.key, left.value, right.other_value] + expr = join.select(left.key, left.value, right.other_value) result = expr.compile() expected = dd.merge(df1, df2, on="key", how=how)[["key", "value", "other_value"]] tm.assert_frame_equal( @@ -96,8 +96,8 @@ def test_join_with_post_expression_filter(how, left): rhs = left[["key2", "value"]] joined = lhs.join(rhs, "key2", how=how) - projected = joined[lhs, rhs.value] - expr = projected[projected.value == 4] + projected = joined.select(lhs, rhs.value) + expr = projected.filter(projected.value == 4) result = expr.compile() df1 = lhs.compile() @@ -118,12 +118,12 @@ def test_multi_join_with_post_expression_filter(how, left, df1): rhs2 = left[["key2", "value"]].rename(value2="value") joined = lhs.join(rhs, "key2", how=how) - projected = joined[lhs, rhs.value] - filtered = projected[projected.value == 4] + projected = joined.select(lhs, rhs.value) + filtered = projected.filter(projected.value == 4) joined2 = filtered.join(rhs2, "key2") - projected2 = joined2[filtered.key, rhs2.value2] - expr = projected2[projected2.value2 == 3] + projected2 = joined2.select(filtered.key, rhs2.value2) + expr = projected2.filter(projected2.value2 == 3) result = expr.compile() @@ -145,7 +145,7 @@ def test_multi_join_with_post_expression_filter(how, left, df1): def test_join_with_non_trivial_key(how, left, right, df1, df2): # also test that the order of operands in the predicate doesn't matter join = left.join(right, right.key.length() == left.key.length(), how=how) - expr = join[left.key, left.value, right.other_value] + expr = join.select(left.key, left.value, right.other_value) result = 
expr.compile() expected = ( @@ -168,8 +168,8 @@ def test_join_with_non_trivial_key(how, left, right, df1, df2): def test_join_with_non_trivial_key_project_table(how, left, right, df1, df2): # also test that the order of operands in the predicate doesn't matter join = left.join(right, right.key.length() == left.key.length(), how=how) - expr = join[left, right.other_value] - expr = expr[expr.key.length() == 1] + expr = join.select(left, right.other_value) + expr = expr.filter(expr.key.length() == 1) result = expr.compile() expected = ( @@ -194,7 +194,7 @@ def test_join_with_project_right_duplicate_column(client, how, left, df1, df3): # also test that the order of operands in the predicate doesn't matter right = client.table("df3") join = left.join(right, ["key"], how=how) - expr = join[left.key, right.key2, right.other_value] + expr = join.select(left.key, right.key2, right.other_value) result = expr.compile() expected = ( @@ -216,7 +216,9 @@ def test_join_with_project_right_duplicate_column(client, how, left, df1, df3): @merge_asof_minversion def test_asof_join(time_left, time_right, time_df1, time_df2): - expr = time_left.asof_join(time_right, "time")[time_left, time_right.other_value] + expr = time_left.asof_join(time_right, "time").select( + time_left, time_right.other_value + ) result = expr.compile() expected = dd.merge_asof(time_df1, time_df2, on="time") tm.assert_frame_equal( @@ -229,9 +231,9 @@ def test_asof_join(time_left, time_right, time_df1, time_df2): def test_keyed_asof_join( time_keyed_left, time_keyed_right, time_keyed_df1, time_keyed_df2 ): - expr = time_keyed_left.asof_join(time_keyed_right, "time", predicates="key")[ + expr = time_keyed_left.asof_join(time_keyed_right, "time", predicates="key").select( time_keyed_left, time_keyed_right.other_value - ] + ) result = expr.compile() expected = dd.merge_asof(time_keyed_df1, time_keyed_df2, on="time", by="key") tm.assert_frame_equal( diff --git a/ibis/backends/dask/tests/test_operations.py b/ibis/backends/dask/tests/test_operations.py index cf6bd9a9eb040..d1979bec71497 100644 --- a/ibis/backends/dask/tests/test_operations.py +++ b/ibis/backends/dask/tests/test_operations.py @@ -32,7 +32,9 @@ def test_literal(client): def test_selection(t, df): - expr = t[((t.plain_strings == "a") | (t.plain_int64 == 3)) & (t.dup_strings == "d")] + expr = t.filter( + ((t.plain_strings == "a") | (t.plain_int64 == 3)) & (t.dup_strings == "d") + ) result = expr.compile() expected = df[ ((df.plain_strings == "a") | (df.plain_int64 == 3)) & (df.dup_strings == "d") @@ -56,12 +58,10 @@ def test_mutate(t, df): @pytest.mark.xfail(reason="TODO - windowing - #2553") def test_project_scope_does_not_override(t, df): col = t.plain_int64 - expr = t[ - [ - col.name("new_col"), - col.sum().over(ibis.window(group_by="dup_strings")).name("grouped"), - ] - ] + expr = t.select( + col.name("new_col"), + col.sum().over(ibis.window(group_by="dup_strings")).name("grouped"), + ) result = expr.compile() expected = dd.concat( [ @@ -402,7 +402,7 @@ def test_nullif_inf(con): def test_group_concat(t, df): expr = ( - t[t.dup_ints == 1] + t.filter(t.dup_ints == 1) .group_by(t.dup_strings) .aggregate(foo=t.dup_ints.group_concat(",")) ) diff --git a/ibis/backends/dask/tests/test_window.py b/ibis/backends/dask/tests/test_window.py index c8c116170300b..f810215e53a66 100644 --- a/ibis/backends/dask/tests/test_window.py +++ b/ibis/backends/dask/tests/test_window.py @@ -161,7 +161,7 @@ def test_players(players, players_df): def test_batting_filter_mean(batting, batting_df): - expr = 
batting[batting.G > batting.G.mean()] + expr = batting.filter(batting.G > batting.G.mean()) result = expr.execute() expected = ( batting_df[batting_df.G > batting_df.G.mean()].reset_index(drop=True).compute() @@ -348,7 +348,7 @@ def test_mutate_with_window_after_join(con, sort_kind): right = ibis.memtable(right_df) joined = left.outer_join(right, left.ints == right.group) - proj = joined[left, right.value] + proj = joined.select(left, right.value) expr = proj.group_by("ints").mutate(sum=proj.value.sum()) result = con.execute(expr) expected = pd.DataFrame( @@ -380,7 +380,7 @@ def test_mutate_scalar_with_window_after_join(npartitions): left, right = map(con.table, ("left", "right")) joined = left.outer_join(right, left.ints == right.group) - proj = joined[left, right.value] + proj = joined.select(left, right.value) expr = proj.mutate(sum=proj.value.sum(), const=ibis.literal(1)) result = expr.execute() result = result.sort_values(["ints", "value"]).reset_index(drop=True) @@ -415,8 +415,8 @@ def test_project_scalar_after_join(npartitions): left, right = map(con.table, ("left", "right")) joined = left.outer_join(right, left.ints == right.group) - proj = joined[left, right.value] - expr = proj[proj.value.sum().name("sum"), ibis.literal(1).name("const")] + proj = joined.select(left, right.value) + expr = proj.select(proj.value.sum().name("sum"), ibis.literal(1).name("const")) result = expr.execute().reset_index(drop=True) expected = pd.DataFrame( { diff --git a/ibis/backends/flink/tests/test_compiler.py b/ibis/backends/flink/tests/test_compiler.py index a39e0629ef6f4..ed7ea10773bac 100644 --- a/ibis/backends/flink/tests/test_compiler.py +++ b/ibis/backends/flink/tests/test_compiler.py @@ -37,9 +37,9 @@ def test_complex_projections(simple_table, assert_sql): def test_filter(simple_table, assert_sql): - expr = simple_table[ + expr = simple_table.filter( ((simple_table.c > 0) | (simple_table.c < 0)) & simple_table.g.isin(["A", "B"]) - ] + ) assert_sql(expr) diff --git a/ibis/backends/impala/tests/test_bucket_histogram.py b/ibis/backends/impala/tests/test_bucket_histogram.py index af1ca0591a14d..344b6ec99da50 100644 --- a/ibis/backends/impala/tests/test_bucket_histogram.py +++ b/ibis/backends/impala/tests/test_bucket_histogram.py @@ -84,6 +84,6 @@ def test_bucket_assign_labels(table, snapshot): labelled = size.tier.label( ["Under 0", "0 to 10", "10 to 25", "25 to 50"], nulls="error" ).name("tier2") - expr = size[labelled, size[1]] + expr = size.select(labelled, size[1]) snapshot.assert_match(translate(expr), "out.sql") diff --git a/ibis/backends/impala/tests/test_client.py b/ibis/backends/impala/tests/test_client.py index 680d6110c1c0c..212aaf3b98a43 100644 --- a/ibis/backends/impala/tests/test_client.py +++ b/ibis/backends/impala/tests/test_client.py @@ -88,16 +88,18 @@ def test_adapt_scalar_array_results(con, alltypes): def test_interactive_repr_call_failure(con): t = con.table("lineitem").limit(100000) - t = t[t, t.l_receiptdate.cast("timestamp").name("date")] + t = t.select(t, t.l_receiptdate.cast("timestamp").name("date")) keys = [t.date.year().name("year"), "l_linestatus"] filt = t.l_linestatus.isin(["F"]) - expr = t[filt].group_by(keys).aggregate(t.l_extendedprice.mean().name("avg_px")) + expr = ( + t.filter(filt).group_by(keys).aggregate(t.l_extendedprice.mean().name("avg_px")) + ) w2 = ibis.trailing_window(9, group_by=expr.l_linestatus, order_by=expr.year) metric = expr["avg_px"].mean().over(w2) - enriched = expr[expr, metric] + enriched = expr.select(expr, metric) with 
config.option_context("interactive", True): repr(enriched) diff --git a/ibis/backends/impala/tests/test_ddl.py b/ibis/backends/impala/tests/test_ddl.py index 71273d06624a7..fd12a69a0be96 100644 --- a/ibis/backends/impala/tests/test_ddl.py +++ b/ibis/backends/impala/tests/test_ddl.py @@ -159,19 +159,21 @@ def test_insert_validate_types(con, alltypes, test_data_db, temp_table): t = con.table(temp_table, database=db) - to_insert = expr[ + to_insert = expr.select( expr.tinyint_col, expr.smallint_col.name("int_col"), expr.string_col - ] + ) t.insert(to_insert.limit(10)) - to_insert = expr[ + to_insert = expr.select( expr.tinyint_col, expr.smallint_col.cast("int32").name("int_col"), expr.string_col, - ] + ) t.insert(to_insert.limit(10)) - to_insert = expr[expr.tinyint_col, expr.bigint_col.name("int_col"), expr.string_col] + to_insert = expr.select( + expr.tinyint_col, expr.bigint_col.name("int_col"), expr.string_col + ) limit_expr = to_insert.limit(10) with pytest.raises(com.IbisError): @@ -296,7 +298,7 @@ def test_query_delimited_file_directory(con, test_data_dir, temp_table): table = con.delimited_file(hdfs_path, schema, name=temp_table, delimiter=",") expr = ( - table[table.bar > 0] + table.filter(table.bar > 0) .group_by("foo") .aggregate( [ diff --git a/ibis/backends/impala/tests/test_ddl_compilation.py b/ibis/backends/impala/tests/test_ddl_compilation.py index 929075d92aa61..d6f386ec0cbe8 100644 --- a/ibis/backends/impala/tests/test_ddl_compilation.py +++ b/ibis/backends/impala/tests/test_ddl_compilation.py @@ -168,7 +168,7 @@ def _get_ddl_string(props): @pytest.fixture def expr(t): - return t[t.bigint_col > 0] + return t.filter(t.bigint_col > 0) def test_create_external_table_as(mockcon, snapshot): diff --git a/ibis/backends/impala/tests/test_exprs.py b/ibis/backends/impala/tests/test_exprs.py index 5b2557dd4fdd3..7472e49734487 100644 --- a/ibis/backends/impala/tests/test_exprs.py +++ b/ibis/backends/impala/tests/test_exprs.py @@ -17,7 +17,7 @@ def test_embedded_identifier_quoting(alltypes): t = alltypes - expr = t[[(t.double_col * 2).name("double(fun)")]]["double(fun)"].sum() + expr = t.select((t.double_col * 2).name("double(fun)"))["double(fun)"].sum() expr.execute() @@ -134,7 +134,7 @@ def test_builtins(con, alltypes): proj_exprs = [expr.name("e%d" % i) for i, expr in enumerate(exprs)] - projection = table[proj_exprs] + projection = table.select(proj_exprs) projection.limit(10).execute() _check_impala_output_types_match(con, projection) @@ -352,7 +352,7 @@ def test_filter_predicates(con): expr = t for pred in predicates: - expr = expr[pred(expr)].select(expr) + expr = expr.filter(pred(expr)).select(expr) expr.execute() @@ -420,7 +420,7 @@ def test_decimal_timestamp_builtins(con): proj_exprs = [expr.name("e%d" % i) for i, expr in enumerate(exprs)] - projection = table[proj_exprs].limit(10) + projection = table.select(proj_exprs).limit(10) projection.execute() @@ -520,7 +520,7 @@ def test_analytic_functions(alltypes): def test_anti_join_self_reference_works(con, alltypes): t = alltypes.limit(100) t2 = t.view() - case = t[-((t.string_col == t2.string_col).any())] + case = t.filter(~((t.string_col == t2.string_col).any())) con.explain(case) @@ -540,7 +540,8 @@ def test_tpch_self_join_failure(con): joined_all = ( region.join(nation, region.r_regionkey == nation.n_regionkey) .join(customer, customer.c_nationkey == nation.n_nationkey) - .join(orders, orders.o_custkey == customer.c_custkey)[fields_of_interest] + .join(orders, orders.o_custkey == customer.c_custkey) + 
.select(fields_of_interest) ) year = joined_all.odate.year().name("year") @@ -554,7 +555,7 @@ def test_tpch_self_join_failure(con): yoy = current.join( prior, ((current.region == prior.region) & (current.year == (prior.year - 1))), - )[current.region, current.year, yoy_change] + ).select(current.region, current.year, yoy_change) # no analysis failure con.explain(yoy) @@ -577,14 +578,15 @@ def test_tpch_correlated_subquery_failure(con): tpch = ( region.join(nation, region.r_regionkey == nation.n_regionkey) .join(customer, customer.c_nationkey == nation.n_nationkey) - .join(orders, orders.o_custkey == customer.c_custkey)[fields_of_interest] + .join(orders, orders.o_custkey == customer.c_custkey) + .select(fields_of_interest) ) t2 = tpch.view() - conditional_avg = t2[(t2.region == tpch.region)].amount.mean() + conditional_avg = t2.filter(t2.region == tpch.region).amount.mean() amount_filter = tpch.amount > conditional_avg - expr = tpch[amount_filter].limit(0) + expr = tpch.filter(amount_filter).limit(0) # impala can't plan this because its correlated subquery implementation is # broken: it cannot detect the outer reference inside the inner query @@ -622,7 +624,7 @@ def test_unions_with_ctes(con, alltypes): ) expr2 = expr1.view() - join1 = expr1.join(expr2, expr1.string_col == expr2.string_col)[[expr1]] + join1 = expr1.join(expr2, expr1.string_col == expr2.string_col).select(expr1) join2 = join1.view() expr = join1.union(join2) @@ -665,12 +667,12 @@ def test_where_with_timestamp(snapshot): def test_filter_with_analytic(snapshot): x = ibis.table(ibis.schema([("col", "int32")]), "x") - with_filter_col = x[x.columns + [ibis.null().name("filter")]] - filtered = with_filter_col[with_filter_col["filter"].isnull()] - subquery = filtered[filtered.columns] + with_filter_col = x.select(x.columns + [ibis.null().name("filter")]) + filtered = with_filter_col.filter(with_filter_col["filter"].isnull()) + subquery = filtered.select(filtered.columns) - with_analytic = subquery[["col", subquery.count().name("analytic")]] - expr = with_analytic[with_analytic.columns] + with_analytic = subquery.select("col", subquery.count().name("analytic")) + expr = with_analytic.select(with_analytic.columns) snapshot.assert_match(ibis.impala.compile(expr), "out.sql") diff --git a/ibis/backends/impala/tests/test_in_not_in.py b/ibis/backends/impala/tests/test_in_not_in.py index ceeb3aebe0027..c3f65230b7e50 100644 --- a/ibis/backends/impala/tests/test_in_not_in.py +++ b/ibis/backends/impala/tests/test_in_not_in.py @@ -33,6 +33,6 @@ def test_literal_in_fields(table, method_name, snapshot): def test_isin_notin_in_select(table, method_name, snapshot): values = ["foo", "bar"] method = getattr(table.g, method_name) - filtered = table[method(values)] + filtered = table.filter(method(values)) result = translate(filtered) snapshot.assert_match(result, "out.sql") diff --git a/ibis/backends/impala/tests/test_partition.py b/ibis/backends/impala/tests/test_partition.py index a44ff89213642..52fe8a9b8bb5c 100644 --- a/ibis/backends/impala/tests/test_partition.py +++ b/ibis/backends/impala/tests/test_partition.py @@ -111,7 +111,9 @@ def test_insert_select_partitioned_table(con, df, temp_table, unpart_t): unique_keys = df[part_keys].drop_duplicates() for i, (year, month) in enumerate(unique_keys.itertuples(index=False)): - select_stmt = unpart_t[(unpart_t.year == year) & (unpart_t.month == month)] + select_stmt = unpart_t.filter( + (unpart_t.year == year) & (unpart_t.month == month) + ) # test both styles of insert if i: @@ -132,7 +134,7 @@ def 
tmp_parted(con): def test_create_partitioned_table_from_expr(con, alltypes, tmp_parted): t = alltypes - expr = t[t.id <= 10][["id", "double_col", "month", "year"]] + expr = t.filter(t.id <= 10)[["id", "double_col", "month", "year"]] name = tmp_parted con.create_table(name, expr, partition=[t.year]) new = con.table(name) diff --git a/ibis/backends/impala/tests/test_sql.py b/ibis/backends/impala/tests/test_sql.py index 65c125a8f4579..a5f72375542e3 100644 --- a/ibis/backends/impala/tests/test_sql.py +++ b/ibis/backends/impala/tests/test_sql.py @@ -28,7 +28,7 @@ def test_join_no_predicates_for_impala(con, join_type, snapshot): t1 = con.table("star1") t2 = con.table("star2") - joined = getattr(t1, join_type)(t2)[[t1]] + joined = getattr(t1, join_type)(t2).select(t1) result = ibis.to_sql(joined, dialect="impala") snapshot.assert_match(result, "out.sql") @@ -76,8 +76,8 @@ def test_nested_join_multiple_ctes(snapshot): movies = ibis.table(dict(movieid="int64", title="string"), name="movies") expr = ratings.timestamp.cast("timestamp") - ratings2 = ratings["userid", "movieid", "rating", expr.name("datetime")] - joined2 = ratings2.join(movies, ["movieid"])[ratings2, movies["title"]] + ratings2 = ratings.select("userid", "movieid", "rating", expr.name("datetime")) + joined2 = ratings2.join(movies, ["movieid"]).select(ratings2, movies["title"]) joined3 = joined2.filter([joined2.userid == 118205, joined2.datetime.year() > 2001]) top_user_old_movie_ids = joined3.filter( [joined3.userid == 118205, joined3.datetime.year() < 2009] @@ -85,7 +85,7 @@ def test_nested_join_multiple_ctes(snapshot): # projection from a filter was hiding an insidious bug, so we're disabling # that for now see issue #1295 cond = joined3.movieid.isin(top_user_old_movie_ids.movieid) - result = joined3[cond] + result = joined3.filter(cond) compiled_result = ibis.to_sql(result, dialect="impala") snapshot.assert_match(compiled_result, "out.sql") @@ -109,7 +109,7 @@ def test_join_with_nested_or_condition(snapshot): t2 = t1.view() joined = t1.join(t2, [t1.a == t2.a, (t1.a != t2.b) | (t1.b != t2.a)]) - expr = joined[t1] + expr = joined.select(t1) result = ibis.to_sql(expr, dialect="impala") snapshot.assert_match(result, "out.sql") @@ -119,7 +119,7 @@ def test_join_with_nested_xor_condition(snapshot): t2 = t1.view() joined = t1.join(t2, [t1.a == t2.a, (t1.a != t2.b) ^ (t1.b != t2.a)]) - expr = joined[t1] + expr = joined.select(t1) result = ibis.to_sql(expr, dialect="impala") snapshot.assert_match(result, "out.sql") @@ -128,7 +128,7 @@ def test_join_with_nested_xor_condition(snapshot): def test_is_parens(method, snapshot): t = ibis.table([("a", "string"), ("b", "string")], "table") func = operator.methodcaller(method) - expr = t[func(t.a) == func(t.b)] + expr = t.filter(func(t.a) == func(t.b)) result = ibis.to_sql(expr, dialect="impala") snapshot.assert_match(result, "out.sql") @@ -136,7 +136,7 @@ def test_is_parens(method, snapshot): def test_is_parens_identical_to(snapshot): t = ibis.table([("a", "string"), ("b", "string")], "table") - expr = t[t.a.identical_to(None) == t.b.identical_to(None)] + expr = t.filter(t.a.identical_to(None) == t.b.identical_to(None)) result = ibis.to_sql(expr, dialect="impala") snapshot.assert_match(result, "out.sql") @@ -147,37 +147,37 @@ def test_join_aliasing(snapshot): [("a", "int64"), ("b", "int64"), ("c", "int64")], name="test_table" ) test = test.mutate(d=test.a + 20) - test2 = test[test.d, test.c] + test2 = test.select(test.d, test.c) idx = (test2.d / 15).cast("int64").name("idx") test3 = 
test2.group_by([test2.d, idx, test2.c]).aggregate(row_count=test2.count()) test3_totals = test3.group_by(test3.d).aggregate(total=test3.row_count.sum()) - test4 = test3.join(test3_totals, test3.d == test3_totals.d)[ + test4 = test3.join(test3_totals, test3.d == test3_totals.d).select( test3, test3_totals.total - ] - test5 = test4[test4.row_count < test4.total / 2] + ) + test5 = test4.filter(test4.row_count < test4.total / 2) agg = ( test.group_by([test.d, test.b]) .aggregate(count=test.count(), unique=test.c.nunique()) .view() ) - result = agg.join(test5, agg.d == test5.d)[agg, test5.total] + result = agg.join(test5, agg.d == test5.d).select(agg, test5.total) result = ibis.to_sql(result, dialect="impala") snapshot.assert_match(result, "out.sql") def test_multiple_filters(snapshot): t = ibis.table([("a", "int64"), ("b", "string")], name="t0") - filt = t[t.a < 100] - expr = filt[filt.a == filt.a.max()] + filt = t.filter(t.a < 100) + expr = filt.filter(filt.a == filt.a.max()) result = ibis.to_sql(expr, dialect="impala") snapshot.assert_match(result, "out.sql") def test_multiple_filters2(snapshot): t = ibis.table([("a", "int64"), ("b", "string")], name="t0") - filt = t[t.a < 100] - expr = filt[filt.a == filt.a.max()] - expr = expr[expr.b == "a"] + filt = t.filter(t.a < 100) + expr = filt.filter(filt.a == filt.a.max()) + expr = expr.filter(expr.b == "a") result = ibis.to_sql(expr, dialect="impala") snapshot.assert_match(result, "out.sql") @@ -250,7 +250,8 @@ def tpch(region, nation, customer, orders): return ( region.join(nation, region.r_regionkey == nation.n_regionkey) .join(customer, customer.c_nationkey == nation.n_nationkey) - .join(orders, orders.o_custkey == customer.c_custkey)[fields_of_interest] + .join(orders, orders.o_custkey == customer.c_custkey) + .select(fields_of_interest) ) @@ -259,18 +260,20 @@ def test_join_key_name(tpch, snapshot): pre_sizes = tpch.group_by(year).size() t2 = tpch.view() - conditional_avg = t2[t2.region == tpch.region].o_totalprice.mean().name("mean") + conditional_avg = ( + t2.filter(t2.region == tpch.region).o_totalprice.mean().name("mean") + ) amount_filter = tpch.o_totalprice > conditional_avg - post_sizes = tpch[amount_filter].group_by(year).size() + post_sizes = tpch.filter(amount_filter).group_by(year).size() percent = (post_sizes[1] / pre_sizes[1].cast("double")).name("fraction") - expr = pre_sizes.join(post_sizes, pre_sizes.year == post_sizes.year)[ + expr = pre_sizes.join(post_sizes, pre_sizes.year == post_sizes.year).select( pre_sizes.year, pre_sizes[1].name("pre_count"), post_sizes[1].name("post_count"), percent, - ] + ) result = ibis.impala.compile(expr) snapshot.assert_match(result, "out.sql") @@ -281,11 +284,11 @@ def test_join_key_name2(tpch, snapshot): pre_sizes = tpch.group_by(year).size() post_sizes = tpch.group_by(year).size().view() - expr = pre_sizes.join(post_sizes, pre_sizes.year == post_sizes.year)[ + expr = pre_sizes.join(post_sizes, pre_sizes.year == post_sizes.year).select( pre_sizes.year, pre_sizes[1].name("pre_count"), post_sizes[1].name("post_count"), - ] + ) result = ibis.impala.compile(expr) snapshot.assert_match(result, "out.sql") diff --git a/ibis/backends/impala/tests/test_value_exprs.py b/ibis/backends/impala/tests/test_value_exprs.py index becef1317143f..f590e87df5bb9 100644 --- a/ibis/backends/impala/tests/test_value_exprs.py +++ b/ibis/backends/impala/tests/test_value_exprs.py @@ -175,11 +175,11 @@ def test_timestamp_extract_field(table, field, snapshot): def test_sql_extract(table, snapshot): # integration with SQL 
translation - expr = table[ + expr = table.select( table.i.year().name("year"), table.i.month().name("month"), table.i.day().name("day"), - ] + ) result = ibis.to_sql(expr, dialect="impala") snapshot.assert_match(result, "out.sql") @@ -252,8 +252,8 @@ def test_correlated_predicate_subquery(table, snapshot): t1 = t0.view() # both are valid constructions - expr1 = t0[t0.g == t1.g] - expr2 = t1[t0.g == t1.g] + expr1 = t0.filter(t0.g == t1.g) + expr2 = t1.filter(t0.g == t1.g) snapshot.assert_match(translate(expr1), "out1.sql") snapshot.assert_match(translate(expr2), "out2.sql") diff --git a/ibis/backends/impala/tests/test_window.py b/ibis/backends/impala/tests/test_window.py index aeac63b38ee61..0d9356e282a6b 100644 --- a/ibis/backends/impala/tests/test_window.py +++ b/ibis/backends/impala/tests/test_window.py @@ -22,7 +22,7 @@ def assert_sql_equal(expr, snapshot, out="out.sql"): def test_aggregate_in_projection(alltypes, snapshot): t = alltypes - proj = t[t, (t.f / t.f.sum()).name("normed_f")] + proj = t.select(t, (t.f / t.f.sum()).name("normed_f")) assert_sql_equal(proj, snapshot) @@ -93,7 +93,7 @@ def test_nested_analytic_function(alltypes, snapshot): def test_rank_functions(alltypes, snapshot): t = alltypes - proj = t[t.g, t.f.rank().name("minr"), t.f.dense_rank().name("denser")] + proj = t.select(t.g, t.f.rank().name("minr"), t.f.dense_rank().name("denser")) assert_sql_equal(proj, snapshot) @@ -113,7 +113,7 @@ def test_order_by_desc(alltypes, snapshot): w = window(order_by=ibis.desc(t.f)) - proj = t[t.f, ibis.row_number().over(w).name("revrank")] + proj = t.select(t.f, ibis.row_number().over(w).name("revrank")) assert_sql_equal(proj, snapshot, "out1.sql") expr = t.group_by("g").order_by(ibis.desc(t.f))[t.d.lag().name("foo"), t.a.max()] diff --git a/ibis/backends/mssql/tests/test_client.py b/ibis/backends/mssql/tests/test_client.py index 24a8a2fb1a4d9..95b12c2972b73 100644 --- a/ibis/backends/mssql/tests/test_client.py +++ b/ibis/backends/mssql/tests/test_client.py @@ -159,7 +159,7 @@ def count_big(x, where: bool = True) -> int: expr = count_big(ft.id) expr = count_big(ft.id, where=ft.id == 1) - assert expr.execute() == ft[ft.id == 1].count().execute() + assert expr.execute() == ft.filter(ft.id == 1).count().execute() @pytest.mark.parametrize("string", ["a", " ", "a ", " a", ""]) diff --git a/ibis/backends/pandas/tests/test_arrays.py b/ibis/backends/pandas/tests/test_arrays.py index 98d1bb6fcd8df..9b657eb9cf3c7 100644 --- a/ibis/backends/pandas/tests/test_arrays.py +++ b/ibis/backends/pandas/tests/test_arrays.py @@ -74,7 +74,7 @@ def test_array_collect_grouped(t, df): def test_array_collect_rolling_partitioned(t, df): window = ibis.trailing_window(1, order_by=t.plain_int64) colexpr = t.plain_float64.collect().over(window) - expr = t["dup_strings", "plain_int64", colexpr.name("collected")] + expr = t.select("dup_strings", "plain_int64", colexpr.name("collected")) result = expr.execute() expected = pd.DataFrame( { @@ -134,7 +134,7 @@ def test_array_slice_scalar(client, start, stop): @pytest.mark.parametrize("index", [1, 3, 4, 11, -11]) def test_array_index(t, df, index): - expr = t[t.array_of_float64[index].name("indexed")] + expr = t.select(t.array_of_float64[index].name("indexed")) result = expr.execute() expected = pd.DataFrame( { diff --git a/ibis/backends/pandas/tests/test_join.py b/ibis/backends/pandas/tests/test_join.py index c4f730e84ea04..4d44efd1c63a2 100644 --- a/ibis/backends/pandas/tests/test_join.py +++ b/ibis/backends/pandas/tests/test_join.py @@ -17,16 +17,16 @@ 
@mutating_join_type def test_join(how, left, right, df1, df2): - expr = left.join(right, left.key == right.key, how=how)[ + expr = left.join(right, left.key == right.key, how=how).select( left, right.other_value, right.key3 - ] + ) result = expr.execute() expected = pd.merge(df1, df2, how=how, on="key") tm.assert_frame_equal(result[expected.columns], expected) def test_cross_join(left, right, df1, df2): - expr = left.cross_join(right)[left, right.other_value, right.key3] + expr = left.cross_join(right).select(left, right.other_value, right.key3) result = expr.execute() expected = pd.merge( df1.assign(dummy=1), df2.assign(dummy=1), how="inner", on="dummy" @@ -37,14 +37,14 @@ def test_cross_join(left, right, df1, df2): @mutating_join_type def test_join_project_left_table(how, left, right, df1, df2): - expr = left.join(right, left.key == right.key, how=how)[left, right.key3] + expr = left.join(right, left.key == right.key, how=how).select(left, right.key3) result = expr.execute() expected = pd.merge(df1, df2, how=how, on="key")[list(left.columns) + ["key3"]] tm.assert_frame_equal(result[expected.columns], expected) def test_cross_join_project_left_table(left, right, df1, df2): - expr = left.cross_join(right)[left, right.key3] + expr = left.cross_join(right).select(left, right.key3) result = expr.execute() expected = pd.merge( df1.assign(dummy=1), df2.assign(dummy=1), how="inner", on="dummy" @@ -67,9 +67,9 @@ def test_cross_join_project_left_table(left, right, df1, df2): ], ) def test_join_with_multiple_predicates(how, left, right, df1, df2): - expr = left.join(right, [left.key == right.key, left.key2 == right.key3], how=how)[ - left, right.key3, right.other_value - ] + expr = left.join( + right, [left.key == right.key, left.key2 == right.key3], how=how + ).select(left, right.key3, right.other_value) result = expr.execute() expected = pd.merge( df1, @@ -110,7 +110,9 @@ def test_join_with_multiple_predicates(how, left, right, df1, df2): ) def test_join_with_multiple_predicates_written_as_one(how, left, right, df1, df2): predicate = (left.key == right.key) & (left.key2 == right.key3) - expr = left.join(right, predicate, how=how)[left, right.key3, right.other_value] + expr = left.join(right, predicate, how=how).select( + left, right.key3, right.other_value + ) result = expr.execute() expected = pd.merge( df1, df2, how=how, left_on=["key", "key2"], right_on=["key", "key3"] @@ -155,7 +157,9 @@ def test_join_with_duplicate_non_key_columns_not_selected(how, left, right, df1, left = left.mutate(x=left.value * 2) right = right.mutate(x=right.other_value * 3) right = right[["key", "other_value"]] - expr = left.join(right, left.key == right.key, how=how)[left, right.other_value] + expr = left.join(right, left.key == right.key, how=how).select( + left, right.other_value + ) result = expr.execute() expected = pd.merge( df1.assign(x=df1.value * 2), @@ -169,7 +173,7 @@ def test_join_with_duplicate_non_key_columns_not_selected(how, left, right, df1, @mutating_join_type def test_join_with_post_expression_selection(how, left, right, df1, df2): join = left.join(right, left.key == right.key, how=how) - expr = join[left.key, left.value, right.other_value] + expr = join.select(left.key, left.value, right.other_value) result = expr.execute() expected = pd.merge(df1, df2, on="key", how=how)[["key", "value", "other_value"]] tm.assert_frame_equal(result[expected.columns], expected) @@ -181,8 +185,8 @@ def test_join_with_post_expression_filter(how, left): rhs = left[["key2", "value"]] joined = lhs.join(rhs, "key2", 
how=how) - projected = joined[lhs, rhs.value] - expr = projected[projected.value == 4] + projected = joined.select(lhs, rhs.value) + expr = projected.filter(projected.value == 4) result = expr.execute() df1 = lhs.execute() @@ -200,12 +204,12 @@ def test_multi_join_with_post_expression_filter(how, left, df1): rhs2 = left[["key2", "value"]].rename(value2="value") joined = lhs.join(rhs, "key2", how=how) - projected = joined[lhs, rhs.value] - filtered = projected[projected.value == 4] + projected = joined.select(lhs, rhs.value) + filtered = projected.filter(projected.value == 4) joined2 = filtered.join(rhs2, "key2") - projected2 = joined2[filtered.key, rhs2.value2] - expr = projected2[projected2.value2 == 3] + projected2 = joined2.select(filtered.key, rhs2.value2) + expr = projected2.filter(projected2.value2 == 3) result = expr.execute() @@ -224,7 +228,7 @@ def test_multi_join_with_post_expression_filter(how, left, df1): def test_join_with_non_trivial_key(how, left, right, df1, df2): # also test that the order of operands in the predicate doesn't matter join = left.join(right, right.key.length() == left.key.length(), how=how) - expr = join[left.key, left.value, right.other_value] + expr = join.select(left.key, left.value, right.other_value) result = expr.execute() expected = ( @@ -244,8 +248,8 @@ def test_join_with_non_trivial_key(how, left, right, df1, df2): def test_join_with_non_trivial_key_project_table(how, left, right, df1, df2): # also test that the order of operands in the predicate doesn't matter join = left.join(right, right.key.length() == left.key.length(), how=how) - expr = join[left, right.other_value] - expr = expr[expr.key.length() == 1] + expr = join.select(left, right.other_value) + expr = expr.filter(expr.key.length() == 1) result = expr.execute() expected = ( @@ -267,7 +271,7 @@ def test_join_with_project_right_duplicate_column(client, how, left, df1, df3): # also test that the order of operands in the predicate doesn't matter right = client.table("df3") join = left.join(right, ["key"], how=how) - expr = join[left.key, right.key2, right.other_value] + expr = join.select(left.key, right.key2, right.other_value) result = expr.execute() expected = ( @@ -283,7 +287,7 @@ def test_join_with_window_function(players_base, players_df, batting, batting_df # this should be semi_join tbl = batting.left_join(players, ["playerID"]) - t = tbl[batting.G, batting.playerID, batting.teamID] + t = tbl.select(batting.G, batting.playerID, batting.teamID) expr = t.group_by(t.teamID).mutate( team_avg=lambda d: d.G.mean(), demeaned_by_player=lambda d: d.G - d.G.mean(), diff --git a/ibis/backends/pandas/tests/test_operations.py b/ibis/backends/pandas/tests/test_operations.py index b116995c22bdf..6e56472a92642 100644 --- a/ibis/backends/pandas/tests/test_operations.py +++ b/ibis/backends/pandas/tests/test_operations.py @@ -28,7 +28,9 @@ def test_literal(client): def test_selection(t, df): - expr = t[((t.plain_strings == "a") | (t.plain_int64 == 3)) & (t.dup_strings == "d")] + expr = t.filter( + ((t.plain_strings == "a") | (t.plain_int64 == 3)) & (t.dup_strings == "d") + ) result = expr.execute() expected = df[ ((df.plain_strings == "a") | (df.plain_int64 == 3)) & (df.dup_strings == "d") @@ -45,12 +47,10 @@ def test_mutate(t, df): def test_project_scope_does_not_override(t, df): col = t.plain_int64 - expr = t[ - [ - col.name("new_col"), - col.sum().over(ibis.window(group_by="dup_strings")).name("grouped"), - ] - ] + expr = t.select( + col.name("new_col"), + 
col.sum().over(ibis.window(group_by="dup_strings")).name("grouped"), + ) result = expr.execute() expected = pd.concat( [ diff --git a/ibis/backends/pandas/tests/test_window.py b/ibis/backends/pandas/tests/test_window.py index d588120b8fd4f..a0cf0f4e3eed0 100644 --- a/ibis/backends/pandas/tests/test_window.py +++ b/ibis/backends/pandas/tests/test_window.py @@ -172,7 +172,7 @@ def test_players(players, players_df): def test_batting_filter_mean(batting, batting_df): - expr = batting[batting.G > batting.G.mean()] + expr = batting.filter(batting.G > batting.G.mean()) result = expr.execute() expected = batting_df[batting_df.G > batting_df.G.mean()].reset_index(drop=True) tm.assert_frame_equal(result[expected.columns], expected) @@ -361,7 +361,7 @@ def test_mutate_with_window_after_join(sort_kind): left, right = map(con.table, ("left", "right")) joined = left.outer_join(right, left.ints == right.group) - proj = joined[left, right.value] + proj = joined.select(left, right.value) expr = proj.group_by("ints").mutate(sum=proj.value.sum()) result = expr.execute() expected = pd.DataFrame( @@ -390,7 +390,7 @@ def test_mutate_scalar_with_window_after_join(): left, right = map(con.table, ("left", "right")) joined = left.outer_join(right, left.ints == right.group) - proj = joined[left, right.value] + proj = joined.select(left, right.value) expr = proj.mutate(sum=proj.value.sum(), const=ibis.literal(1)) result = expr.execute() expected = pd.DataFrame( @@ -416,8 +416,8 @@ def test_project_scalar_after_join(): left, right = map(con.table, ("left", "right")) joined = left.outer_join(right, left.ints == right.group) - proj = joined[left, right.value] - expr = proj[proj.value.sum().name("sum"), ibis.literal(1).name("const")] + proj = joined.select(left, right.value) + expr = proj.select(proj.value.sum().name("sum"), ibis.literal(1).name("const")) result = expr.execute() expected = pd.DataFrame( { diff --git a/ibis/backends/postgres/tests/test_functions.py b/ibis/backends/postgres/tests/test_functions.py index e0c148057ca29..0c277fa634b5d 100644 --- a/ibis/backends/postgres/tests/test_functions.py +++ b/ibis/backends/postgres/tests/test_functions.py @@ -647,7 +647,7 @@ def test_not_exists(alltypes, df): t = alltypes t2 = t.view() - expr = t[~((t.string_col == t2.string_col).any())] + expr = t.filter(~((t.string_col == t2.string_col).any())) result = expr.execute() left, right = df, t2.execute() @@ -855,7 +855,7 @@ def test_window_with_arithmetic(alltypes, df): def test_anonymous_aggregate(alltypes, df): t = alltypes - expr = t[t.double_col > t.double_col.mean()] + expr = t.filter(t.double_col > t.double_col.mean()) result = expr.execute() expected = df[df.double_col > df.double_col.mean()].reset_index(drop=True) tm.assert_frame_equal(result, expected) @@ -908,7 +908,7 @@ def test_array_collect(array_types): @pytest.mark.parametrize("index", [0, 1, 3, 4, 11, -1, -3, -4, -11]) def test_array_index(array_types, index): - expr = array_types[array_types.y[index].name("indexed")] + expr = array_types.select(array_types.y[index].name("indexed")) result = expr.execute() expected = pd.DataFrame( { diff --git a/ibis/backends/postgres/tests/test_geospatial.py b/ibis/backends/postgres/tests/test_geospatial.py index 9821f9d3ec26e..574cb1e6e6d84 100644 --- a/ibis/backends/postgres/tests/test_geospatial.py +++ b/ibis/backends/postgres/tests/test_geospatial.py @@ -232,7 +232,7 @@ def test_get_point(geotable, expr_fn, expected): # boundaries with the contains predicate. Work around this by adding a # small buffer. 
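# buffer(0.01) turns the zero-width linestring into a thin polygon, so points lying exactly on the line test as contained.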
expr = geotable["geo_linestring"].buffer(0.01).contains(arg) - result = geotable[geotable, expr.name("tmp")].execute()["tmp"] + result = geotable.select(geotable, expr.name("tmp")).execute()["tmp"] testing.assert_almost_equal(result, expected, decimal=2) @@ -257,7 +257,7 @@ def test_area(con, geotable): ) def test_srid(geotable, condition, expected): """Testing for geo spatial srid operation.""" - expr = geotable[geotable.id, condition(geotable).name("tmp")] + expr = geotable.select(geotable.id, condition(geotable).name("tmp")) result = expr.execute()["tmp"][[0]] assert np.all(result == expected) @@ -275,7 +275,7 @@ def test_srid(geotable, condition, expected): ) def test_set_srid(geotable, condition, expected): """Testing for geo spatial set_srid operation.""" - expr = geotable[geotable.id, condition(geotable).name("tmp")] + expr = geotable.select(geotable.id, condition(geotable).name("tmp")) result = expr.execute()["tmp"][[0]] assert np.all(result == expected) @@ -305,7 +305,7 @@ def test_set_srid(geotable, condition, expected): ) def test_transform(geotable, condition, expected): """Testing for geo spatial transform operation.""" - expr = geotable[geotable.id, condition(geotable).name("tmp")] + expr = geotable.select(geotable.id, condition(geotable).name("tmp")) result = expr.execute()["tmp"][[0]] assert np.all(result == expected) @@ -325,7 +325,7 @@ def test_transform(geotable, condition, expected): def test_cast_geography(geotable, expr_fn): """Testing for geo spatial transform operation.""" p = expr_fn(geotable).cast("geography") - expr = geotable[geotable.id, p.distance(p).name("tmp")] + expr = geotable.select(geotable.id, p.distance(p).name("tmp")) result = expr.execute()["tmp"][[0]] # distance from a point to a same point should be 0 assert np.all(result == 0) @@ -346,7 +346,7 @@ def test_cast_geography(geotable, expr_fn): def test_cast_geometry(geotable, expr_fn): """Testing for geo spatial transform operation.""" p = expr_fn(geotable).cast("geometry") - expr = geotable[geotable.id, p.distance(p).name("tmp")] + expr = geotable.select(geotable.id, p.distance(p).name("tmp")) result = expr.execute()["tmp"][[0]] # distance from a point to a same point should be 0 assert np.all(result == 0) diff --git a/ibis/backends/postgres/tests/test_json.py b/ibis/backends/postgres/tests/test_json.py index 219562b81cb88..a3e11838a48c6 100644 --- a/ibis/backends/postgres/tests/test_json.py +++ b/ibis/backends/postgres/tests/test_json.py @@ -23,7 +23,7 @@ def jsonb_t(con): @pytest.mark.parametrize("data", [param({"status": True}, id="status")]) def test_json(data, alltypes): lit = ibis.literal(json.dumps(data), type="json").name("tmp") - expr = alltypes[[alltypes.id, lit]].head(1) + expr = alltypes.select(alltypes.id, lit).head(1) df = expr.execute() assert df["tmp"].iloc[0] == data diff --git a/ibis/backends/postgres/tests/test_postgis.py b/ibis/backends/postgres/tests/test_postgis.py index c1c1b6484715d..9a20356a2553f 100644 --- a/ibis/backends/postgres/tests/test_postgis.py +++ b/ibis/backends/postgres/tests/test_postgis.py @@ -21,7 +21,7 @@ def test_load_geodata(con): def test_empty_select(geotable): - expr = geotable[geotable.geo_point.geo_equals(geotable.geo_linestring)] + expr = geotable.filter(geotable.geo_point.geo_equals(geotable.geo_linestring)) result = expr.execute() assert len(result) == 0 diff --git a/ibis/backends/postgres/tests/test_string.py b/ibis/backends/postgres/tests/test_string.py index d069c293f2f3b..25237fb8a65d5 100644 --- a/ibis/backends/postgres/tests/test_string.py +++ 
b/ibis/backends/postgres/tests/test_string.py @@ -15,6 +15,6 @@ @pytest.mark.usefixtures("con") def test_special_strings(alltypes, data, data_type): lit = ibis.literal(data, type=data_type).name("tmp") - expr = alltypes[[alltypes.id, lit]].head(1) + expr = alltypes.select(alltypes.id, lit).head(1) df = expr.execute() assert df["tmp"].iloc[0] == uuid.UUID(data) diff --git a/ibis/backends/postgres/tests/test_udf.py b/ibis/backends/postgres/tests/test_udf.py index 59a494a0cc5f6..0c56392c04bff 100644 --- a/ibis/backends/postgres/tests/test_udf.py +++ b/ibis/backends/postgres/tests/test_udf.py @@ -85,7 +85,9 @@ def test_existing_sql_udf(con_for_udf, test_database, table): """Test creating ibis UDF object based on existing UDF in the database.""" # Create ibis UDF objects referring to UDFs already created in the database custom_length_udf = con_for_udf.function("custom_len", database=test_database) - result_obj = table[table, custom_length_udf(table["user_name"]).name("custom_len")] + result_obj = table.select( + table, custom_length_udf(table["user_name"]).name("custom_len") + ) result = result_obj.execute() assert result["custom_len"].sum() == result["name_length"].sum() @@ -93,7 +95,9 @@ def test_existing_sql_udf(con_for_udf, test_database, table): def test_existing_plpython_udf(con_for_udf, test_database, table): # Create ibis UDF objects referring to UDFs already created in the database py_length_udf = con_for_udf.function("pylen", database=test_database) - result_obj = table[table, py_length_udf(table["user_name"]).name("custom_len")] + result_obj = table.select( + table, py_length_udf(table["user_name"]).name("custom_len") + ) result = result_obj.execute() assert result["custom_len"].sum() == result["name_length"].sum() diff --git a/ibis/backends/pyspark/tests/test_array.py b/ibis/backends/pyspark/tests/test_array.py index 8d45e24e9358e..b253b084cc70b 100644 --- a/ibis/backends/pyspark/tests/test_array.py +++ b/ibis/backends/pyspark/tests/test_array.py @@ -82,7 +82,7 @@ def test_array_slice_scalar(con, start, stop): @pytest.mark.parametrize("index", [1, 3, 4, 11, -11]) def test_array_index(t, df, index): - expr = t[t.array_int[index].name("indexed")] + expr = t.select(t.array_int[index].name("indexed")) result = expr.execute() expected = pd.DataFrame( diff --git a/ibis/backends/pyspark/tests/test_ddl.py b/ibis/backends/pyspark/tests/test_ddl.py index 64720c06e8bf1..975d2840ca0ee 100644 --- a/ibis/backends/pyspark/tests/test_ddl.py +++ b/ibis/backends/pyspark/tests/test_ddl.py @@ -134,16 +134,16 @@ def test_insert_validate_types(con, alltypes, test_data_db, temp_table): database=db, ) - to_insert = expr[ + to_insert = expr.select( expr.tinyint_col, expr.smallint_col.name("int_col"), expr.string_col - ] + ) con.insert(temp_table, to_insert.limit(10)) - to_insert = expr[ + to_insert = expr.select( expr.tinyint_col, expr.smallint_col.cast("int32").name("int_col"), expr.string_col, - ] + ) con.insert(temp_table, to_insert.limit(10)) diff --git a/ibis/backends/pyspark/tests/test_null.py b/ibis/backends/pyspark/tests/test_null.py index 048330d6b39ab..b4ee62dcb7e7d 100644 --- a/ibis/backends/pyspark/tests/test_null.py +++ b/ibis/backends/pyspark/tests/test_null.py @@ -11,7 +11,7 @@ def test_isnull(con): table_pandas = table.execute() for col, _ in table_pandas.items(): - result = table[table[col].isnull()].execute().reset_index(drop=True) + result = table.filter(table[col].isnull()).execute().reset_index(drop=True) expected = table_pandas[table_pandas[col].isnull()].reset_index(drop=True) 
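# reset_index(drop=True) on both sides makes the frames comparable regardless of the row labels left over from filtering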
tm.assert_frame_equal(result, expected) @@ -21,6 +21,6 @@ def test_notnull(con): table_pandas = table.execute() for col, _ in table_pandas.items(): - result = table[table[col].notnull()].execute().reset_index(drop=True) + result = table.filter(table[col].notnull()).execute().reset_index(drop=True) expected = table_pandas[table_pandas[col].notnull()].reset_index(drop=True) tm.assert_frame_equal(result, expected) diff --git a/ibis/backends/risingwave/tests/test_functions.py b/ibis/backends/risingwave/tests/test_functions.py index 48550824022ab..d26fc39f53909 100644 --- a/ibis/backends/risingwave/tests/test_functions.py +++ b/ibis/backends/risingwave/tests/test_functions.py @@ -448,7 +448,7 @@ def test_not_exists(alltypes, df): t = alltypes t2 = t.view() - expr = t[~((t.string_col == t2.string_col).any())] + expr = t.filter(~((t.string_col == t2.string_col).any())) result = expr.execute() left, right = df, t2.execute() @@ -615,7 +615,7 @@ def test_window_with_arithmetic(alltypes, df): def test_anonymous_aggregate(alltypes, df): t = alltypes - expr = t[t.double_col > t.double_col.mean()] + expr = t.filter(t.double_col > t.double_col.mean()) result = expr.execute() expected = df[df.double_col > df.double_col.mean()].reset_index(drop=True) tm.assert_frame_equal(result, expected) diff --git a/ibis/backends/risingwave/tests/test_json.py b/ibis/backends/risingwave/tests/test_json.py index 6f6d04b58f7a6..92cb197672f67 100644 --- a/ibis/backends/risingwave/tests/test_json.py +++ b/ibis/backends/risingwave/tests/test_json.py @@ -13,6 +13,6 @@ @pytest.mark.parametrize("data", [param({"status": True}, id="status")]) def test_json(data, alltypes): lit = ibis.literal(json.dumps(data), type="json").name("tmp") - expr = alltypes[[alltypes.id, lit]].head(1) + expr = alltypes.select(alltypes.id, lit).head(1) df = expr.execute() assert df["tmp"].iloc[0] == data diff --git a/ibis/backends/tests/test_aggregation.py b/ibis/backends/tests/test_aggregation.py index a44d95cb08ace..19e1207ee405e 100644 --- a/ibis/backends/tests/test_aggregation.py +++ b/ibis/backends/tests/test_aggregation.py @@ -1596,7 +1596,7 @@ def test_agg_sort(alltypes): def test_filter(backend, alltypes, df): expr = ( - alltypes[_.string_col == "1"] + alltypes.filter(_.string_col == "1") .mutate(x=L(1, "int64")) .group_by(_.x) .aggregate(sum=_.double_col.sum()) diff --git a/ibis/backends/tests/test_client.py b/ibis/backends/tests/test_client.py index bfb97a5c046ab..a2480a83d608e 100644 --- a/ibis/backends/tests/test_client.py +++ b/ibis/backends/tests/test_client.py @@ -218,7 +218,7 @@ def test_load_data(backend, con, temp_table, lamduh): [ param(lambda t: t.string_col, [("string_col", dt.String)], id="column"), param( - lambda t: t[t.string_col, t.bigint_col], + lambda t: t.select(t.string_col, t.bigint_col), [("string_col", dt.String), ("bigint_col", dt.Int64)], id="table", ), diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index 8dbcefaacdc07..23f5d83fdd924 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -174,7 +174,7 @@ def test_isna(backend, alltypes, col, value, filt): table = alltypes.select(**{col: value}) df = table.execute() - result = table[filt(table[col])].execute().reset_index(drop=True) + result = table.filter(filt(table[col])).execute().reset_index(drop=True) expected = df[df[col].isna()].reset_index(drop=True) backend.assert_frame_equal(result, expected) @@ -255,7 +255,7 @@ def test_identical_to(backend, alltypes, sorted_df): dt = 
df[["tinyint_col", "double_col"]] ident = sorted_alltypes.tinyint_col.identical_to(sorted_alltypes.double_col) - expr = sorted_alltypes["id", ident.name("tmp")].order_by("id") + expr = sorted_alltypes.select("id", ident.name("tmp")).order_by("id") result = expr.execute().tmp expected = (dt.tinyint_col.isnull() & dt.double_col.isnull()) | ( @@ -280,9 +280,9 @@ def test_identical_to(backend, alltypes, sorted_df): @pytest.mark.notimpl(["druid"]) def test_isin(backend, alltypes, sorted_df, column, elements): sorted_alltypes = alltypes.order_by("id") - expr = sorted_alltypes[ + expr = sorted_alltypes.select( "id", sorted_alltypes[column].isin(elements).name("tmp") - ].order_by("id") + ).order_by("id") result = expr.execute().tmp expected = sorted_df[column].isin(elements) @@ -304,9 +304,9 @@ def test_isin(backend, alltypes, sorted_df, column, elements): @pytest.mark.notimpl(["druid"]) def test_notin(backend, alltypes, sorted_df, column, elements): sorted_alltypes = alltypes.order_by("id") - expr = sorted_alltypes[ + expr = sorted_alltypes.select( "id", sorted_alltypes[column].notin(elements).name("tmp") - ].order_by("id") + ).order_by("id") result = expr.execute().tmp expected = ~sorted_df[column].isin(elements) @@ -339,7 +339,7 @@ def test_notin(backend, alltypes, sorted_df, column, elements): @pytest.mark.notimpl(["druid"]) def test_filter(backend, alltypes, sorted_df, predicate_fn, expected_fn): sorted_alltypes = alltypes.order_by("id") - table = sorted_alltypes[predicate_fn(sorted_alltypes)].order_by("id") + table = sorted_alltypes.filter(predicate_fn(sorted_alltypes)).order_by("id") result = table.execute() expected = sorted_df[expected_fn(sorted_df)] @@ -427,8 +427,8 @@ def test_select_filter_mutate(backend, alltypes, df): ) # Actual test - t = t[t.columns] - t = t[~t["float_col"].isnan()] + t = t.select(t.columns) + t = t.filter(~t["float_col"].isnan()) t = t.mutate(float_col=t["float_col"].cast("float64")) result = t.execute() @@ -956,7 +956,7 @@ def test_table_describe_large(con): ], ) def test_isin_notin(backend, alltypes, df, ibis_op, pandas_op): - expr = alltypes[ibis_op] + expr = alltypes.filter(ibis_op) expected = df.loc[pandas_op(df)].sort_values(["id"]).reset_index(drop=True) result = expr.execute().sort_values(["id"]).reset_index(drop=True) backend.assert_frame_equal(result, expected) @@ -990,7 +990,7 @@ def test_isin_notin(backend, alltypes, df, ibis_op, pandas_op): ], ) def test_isin_notin_column_expr(backend, alltypes, df, ibis_op, pandas_op): - expr = alltypes[ibis_op].order_by("id") + expr = alltypes.filter(ibis_op).order_by("id") expected = df[pandas_op(df)].sort_values(["id"]).reset_index(drop=True) result = expr.execute() backend.assert_frame_equal(result, expected) @@ -1078,11 +1078,6 @@ def test_interactive(alltypes, monkeypatch): repr(expr) -def test_correlated_subquery(alltypes): - expr = alltypes[_.double_col > _.view().double_col] - assert expr.compile() is not None - - @pytest.mark.notimpl(["polars", "pyspark"]) @pytest.mark.notimpl( ["risingwave"], @@ -1090,8 +1085,8 @@ def test_correlated_subquery(alltypes): reason='DataFrame.iloc[:, 0] (column name="playerID") are different', ) def test_uncorrelated_subquery(backend, batting, batting_df): - subset_batting = batting[batting.yearID <= 2000] - expr = batting[_.yearID == subset_batting.yearID.max()]["playerID", "yearID"] + subset_batting = batting.filter(batting.yearID <= 2000) + expr = batting.filter(_.yearID == subset_batting.yearID.max())["playerID", "yearID"] result = expr.execute() expected = 
batting_df[batting_df.yearID == 2000][["playerID", "yearID"]] @@ -1124,10 +1119,10 @@ def test_int_scalar(alltypes): def test_exists(batting, awards_players, method_name): years = [1980, 1981] batting_years = [1871, *years] - batting = batting[batting.yearID.isin(batting_years)] - awards_players = awards_players[awards_players.yearID.isin(years)] + batting = batting.filter(batting.yearID.isin(batting_years)) + awards_players = awards_players.filter(awards_players.yearID.isin(years)) method = methodcaller(method_name) - expr = batting[method(batting.yearID == awards_players.yearID)] + expr = batting.filter(method(batting.yearID == awards_players.yearID)) result = expr.execute() assert not result.empty diff --git a/ibis/backends/tests/test_interactive.py b/ibis/backends/tests/test_interactive.py index 276377a249d9e..bd19507f4a7b5 100644 --- a/ibis/backends/tests/test_interactive.py +++ b/ibis/backends/tests/test_interactive.py @@ -86,6 +86,6 @@ def test_interactive_non_compilable_repr_does_not_fail(table): def test_isin_rule_suppressed_exception_repr_not_fail(table): bool_clause = table["string_col"].notin(["1", "4", "7"]) - expr = table[bool_clause]["string_col"].value_counts() + expr = table.filter(bool_clause)["string_col"].value_counts() repr(expr) diff --git a/ibis/backends/tests/test_join.py b/ibis/backends/tests/test_join.py index f10ecb782eb1b..6e8d09e014692 100644 --- a/ibis/backends/tests/test_join.py +++ b/ibis/backends/tests/test_join.py @@ -64,8 +64,8 @@ def check_eq(left, right, how, **kwargs): ) @pytest.mark.notimpl(["druid"]) def test_mutating_join(backend, batting, awards_players, how): - left = batting[batting.yearID == 2015] - right = awards_players[awards_players.lgID == "NL"].drop("yearID", "lgID") + left = batting.filter(batting.yearID == 2015) + right = awards_players.filter(awards_players.lgID == "NL").drop("yearID", "lgID") left_df = left.execute() right_df = right.execute() @@ -114,8 +114,8 @@ def test_mutating_join(backend, batting, awards_players, how): @pytest.mark.notimpl(["dask", "druid"]) @pytest.mark.notyet(["flink"], reason="Flink doesn't support semi joins or anti joins") def test_filtering_join(backend, batting, awards_players, how): - left = batting[batting.yearID == 2015] - right = awards_players[awards_players.lgID == "NL"].drop("yearID", "lgID") + left = batting.filter(batting.yearID == 2015) + right = awards_players.filter(awards_players.lgID == "NL").drop("yearID", "lgID") left_df = left.execute() right_df = right.execute() @@ -142,10 +142,10 @@ def test_filtering_join(backend, batting, awards_players, how): def test_join_then_filter_no_column_overlap(awards_players, batting): - left = batting[batting.yearID == 2015] + left = batting.filter(batting.yearID == 2015) year = left.yearID.name("year") - left = left[year, "RBI"] - right = awards_players[awards_players.lgID == "NL"] + left = left.select(year, "RBI") + right = awards_players.filter(awards_players.lgID == "NL") expr = left.join(right, left.year == right.yearID) filters = [expr.RBI == 9] @@ -196,8 +196,8 @@ def test_semi_join_topk(con, batting, awards_players, func): reason="postgres can't handle null types columns", ) def test_join_with_pandas(batting, awards_players): - batting_filt = batting[lambda t: t.yearID < 1900] - awards_players_filt = awards_players[lambda t: t.yearID < 1900].execute() + batting_filt = batting.filter(lambda t: t.yearID < 1900) + awards_players_filt = awards_players.filter(lambda t: t.yearID < 1900).execute() assert isinstance(awards_players_filt, pd.DataFrame) expr = 
batting_filt.join(awards_players_filt, "yearID") df = expr.execute() @@ -205,10 +205,10 @@ def test_join_with_pandas(batting, awards_players): def test_join_with_pandas_non_null_typed_columns(batting, awards_players): - batting_filt = batting[lambda t: t.yearID < 1900][["yearID"]] - awards_players_filt = awards_players[lambda t: t.yearID < 1900][ - ["yearID"] - ].execute() + batting_filt = batting.filter(lambda t: t.yearID < 1900).select("yearID") + awards_players_filt = ( + awards_players.filter(lambda t: t.yearID < 1900).select("yearID").execute() + ) # ensure that none of the columns of either table have type null batting_schema = batting_filt.schema() diff --git a/ibis/backends/tests/test_sql.py b/ibis/backends/tests/test_sql.py index 72ca7b12a4d61..2085e27502905 100644 --- a/ibis/backends/tests/test_sql.py +++ b/ibis/backends/tests/test_sql.py @@ -88,7 +88,7 @@ def test_cte_refs_in_topo_order(backend, snapshot): @pytest.mark.never(["pandas", "dask", "polars"], reason="not SQL", raises=ValueError) def test_isin_bug(con, snapshot): t = ibis.table(dict(x="int"), name="t") - good = t[t.x > 2].x + good = t.filter(t.x > 2).x expr = t.x.isin(good) snapshot.assert_match(str(ibis.to_sql(expr, dialect=con.name)), "out.sql") diff --git a/ibis/backends/tests/test_struct.py b/ibis/backends/tests/test_struct.py index 8757175f6b606..b00fff2ae047e 100644 --- a/ibis/backends/tests/test_struct.py +++ b/ibis/backends/tests/test_struct.py @@ -120,7 +120,7 @@ def test_collect_into_struct(alltypes): t = alltypes expr = ( - t[_.string_col.isin(("0", "1"))] + t.filter(_.string_col.isin(("0", "1"))) .group_by(group="string_col") .agg( val=lambda t: ibis.struct( diff --git a/ibis/backends/tests/test_temporal.py b/ibis/backends/tests/test_temporal.py index b1a907a50e4de..ee08a0083835f 100644 --- a/ibis/backends/tests/test_temporal.py +++ b/ibis/backends/tests/test_temporal.py @@ -1037,7 +1037,7 @@ def test_interval_add_cast_scalar(backend, alltypes): def test_interval_add_cast_column(backend, alltypes, df): timestamp_date = alltypes.timestamp_col.date() delta = alltypes.bigint_col.cast("interval('D')") - expr = alltypes["id", (timestamp_date + delta).name("tmp")] + expr = alltypes.select("id", (timestamp_date + delta).name("tmp")) result = expr.execute().sort_values("id").reset_index().tmp df = df.sort_values("id").reset_index(drop=True) expected = ( @@ -1702,7 +1702,7 @@ def test_interval_literal(con, backend): def test_date_column_from_ymd(backend, con, alltypes, df): c = alltypes.timestamp_col expr = ibis.date(c.year(), c.month(), c.day()) - tbl = alltypes[expr.name("timestamp_col")] + tbl = alltypes.select(expr.name("timestamp_col")) result = con.execute(tbl) golden = df.timestamp_col.dt.date.astype(result.timestamp_col.dtype) @@ -1719,7 +1719,7 @@ def test_timestamp_column_from_ymdhms(backend, con, alltypes, df): expr = ibis.timestamp( c.year(), c.month(), c.day(), c.hour(), c.minute(), c.second() ) - tbl = alltypes[expr.name("timestamp_col")] + tbl = alltypes.select(expr.name("timestamp_col")) result = con.execute(tbl) golden = df.timestamp_col.dt.floor("s").astype(result.timestamp_col.dtype) diff --git a/ibis/backends/tests/test_window.py b/ibis/backends/tests/test_window.py index a2664eef47075..7d21fb9f7fb17 100644 --- a/ibis/backends/tests/test_window.py +++ b/ibis/backends/tests/test_window.py @@ -610,7 +610,7 @@ def test_grouped_unbounded_window( def test_simple_ungrouped_unbound_following_window( backend, alltypes, ibis_method, pandas_fn ): - t = alltypes[alltypes.double_col < 50].order_by("id") + t 
= alltypes.filter(alltypes.double_col < 50).order_by("id") df = t.execute() w = ibis.window(rows=(0, None), order_by=t.id) @@ -635,7 +635,7 @@ def test_simple_ungrouped_unbound_following_window( reason="Feature is not yet implemented: Window function with empty PARTITION BY is not supported yet", ) def test_simple_ungrouped_window_with_scalar_order_by(alltypes): - t = alltypes[alltypes.double_col < 50].order_by("id") + t = alltypes.filter(alltypes.double_col < 50).order_by("id") w = ibis.window(rows=(0, None), order_by=ibis.null()) expr = t.double_col.sum().over(w).name("double_col") # hard to reproduce this in pandas, so just test that it actually executes diff --git a/ibis/backends/tests/tpc/ds/test_queries.py b/ibis/backends/tests/tpc/ds/test_queries.py index 4520ee9544b38..540503cd42e34 100644 --- a/ibis/backends/tests/tpc/ds/test_queries.py +++ b/ibis/backends/tests/tpc/ds/test_queries.py @@ -3480,11 +3480,12 @@ def agg_sales_net_by_month(sales, ns, sales_expr, net_expr): ) .join(date_dim, sales[f"{ns}_sold_date_sk"] == date_dim.d_date_sk) .join(time_dim, sales[f"{ns}_sold_time_sk"] == time_dim.t_time_sk) - .join(ship_mode, sales[f"{ns}_ship_mode_sk"] == ship_mode.sm_ship_mode_sk)[ - (_.d_year == 2001) - & (_.t_time.between(30838, 30838 + 28800)) - & (_.sm_carrier.isin(["DHL", "BARIAN"])) - ] + .join(ship_mode, sales[f"{ns}_ship_mode_sk"] == ship_mode.sm_ship_mode_sk) + .filter( + (_.d_year == 2001), + (_.t_time.between(30838, 30838 + 28800)), + (_.sm_carrier.isin(["DHL", "BARIAN"])), + ) .group_by( "w_warehouse_name", "w_warehouse_sq_ft", diff --git a/ibis/backends/tests/tpc/h/test_queries.py b/ibis/backends/tests/tpc/h/test_queries.py index f51817a31a76c..57c1384d9338a 100644 --- a/ibis/backends/tests/tpc/h/test_queries.py +++ b/ibis/backends/tests/tpc/h/test_queries.py @@ -66,9 +66,9 @@ def test_02(part, supplier, partsupp, nation, region): .join(region, nation.n_regionkey == region.r_regionkey) ) - subexpr = subexpr[ + subexpr = subexpr.filter( (subexpr.r_name == REGION) & (expr.p_partkey == subexpr.ps_partkey) - ] + ) filters = [ expr.p_size == SIZE, @@ -210,7 +210,7 @@ def test_07(supplier, lineitem, orders, customer, nation): q = q.join(n1, supplier.s_nationkey == n1.n_nationkey) q = q.join(n2, customer.c_nationkey == n2.n_nationkey) - q = q[ + q = q.select( n1.n_name.name("supp_nation"), n2.n_name.name("cust_nation"), lineitem.l_shipdate, @@ -218,7 +218,7 @@ def test_07(supplier, lineitem, orders, customer, nation): lineitem.l_discount, lineitem.l_shipdate.year().name("l_year"), (lineitem.l_extendedprice * (1 - lineitem.l_discount)).name("volume"), - ] + ) q = q.filter( [ @@ -255,14 +255,14 @@ def test_08(part, supplier, region, lineitem, orders, customer, nation): q = q.join(region, n1.n_regionkey == region.r_regionkey) q = q.join(n2, supplier.s_nationkey == n2.n_nationkey) - q = q[ + q = q.select( orders.o_orderdate.year().name("o_year"), (lineitem.l_extendedprice * (1 - lineitem.l_discount)).name("volume"), n2.n_name.name("nation"), region.r_name, orders.o_orderdate, part.p_type, - ] + ) q = q.filter( [ @@ -297,14 +297,14 @@ def test_09(part, supplier, lineitem, partsupp, orders, nation): q = q.join(orders, orders.o_orderkey == lineitem.l_orderkey) q = q.join(nation, supplier.s_nationkey == nation.n_nationkey) - q = q[ + q = q.select( (q.l_extendedprice * (1 - q.l_discount) - q.ps_supplycost * q.l_quantity).name( "amount" ), q.o_orderdate.year().name("o_year"), q.n_name.name("nation"), q.p_name, - ] + ) q = q.filter([q.p_name.like("%" + COLOR + "%")]) @@ -494,7 +494,7 @@ 
def test_15(lineitem, supplier): q = supplier.join(qrev, supplier.s_suppkey == qrev.l_suppkey) q = q.filter([q.total_revenue == qrev.total_revenue.max()]) - q = q[q.s_suppkey, q.s_name, q.s_address, q.s_phone, q.total_revenue] + q = q.select(q.s_suppkey, q.s_name, q.s_address, q.s_phone, q.total_revenue) return q.order_by([q.s_suppkey]) @@ -679,7 +679,7 @@ def test_20(supplier, nation, partsupp, part, lineitem): q1 = q1.filter([q1.n_name == NATION, q1.s_suppkey.isin(q2.ps_suppkey)]) - q1 = q1[q1.s_name, q1.s_address] + q1 = q1.select(q1.s_name, q1.s_address) return q1.order_by(q1.s_name) @@ -704,7 +704,7 @@ def test_21(supplier, lineitem, orders, nation): q = q.join(lineitem, supplier.s_suppkey == lineitem.l_suppkey) q = q.join(orders, orders.o_orderkey == lineitem.l_orderkey) q = q.join(nation, supplier.s_nationkey == nation.n_nationkey) - q = q[ + q = q.select( q.l_orderkey.name("l1_orderkey"), q.o_orderstatus, q.l_receiptdate, @@ -712,7 +712,7 @@ def test_21(supplier, lineitem, orders, nation): q.l_suppkey.name("l1_suppkey"), q.s_name, q.n_name, - ] + ) q = q.filter( [ q.o_orderstatus == "F", @@ -764,9 +764,9 @@ def test_22(customer, orders): ~(orders.o_custkey == customer.c_custkey).any(), ] ) - custsale = custsale[ + custsale = custsale.select( customer.c_phone.substr(0, 2).name("cntrycode"), customer.c_acctbal - ] + ) gq = custsale.group_by(custsale.cntrycode) outerq = gq.aggregate(numcust=custsale.count(), totacctbal=custsale.c_acctbal.sum()) diff --git a/ibis/expr/tests/test_format.py b/ibis/expr/tests/test_format.py index 806eed5369312..6fe8b3cddf94f 100644 --- a/ibis/expr/tests/test_format.py +++ b/ibis/expr/tests/test_format.py @@ -89,7 +89,7 @@ def test_format_multiple_join_with_projection(snapshot): table3 = ibis.table([("bar_id", "string"), ("value2", "double")], "three") - filtered = table[table["f"] > 0] + filtered = table.filter(table["f"] > 0) pred1 = filtered["foo_id"] == table2["foo_id"] pred2 = filtered["bar_id"] == table3["bar_id"] @@ -98,7 +98,7 @@ def test_format_multiple_join_with_projection(snapshot): j2 = j1.inner_join(table3, [pred2]) # Project out the desired fields - view = j2[[filtered, table2["value1"], table3["value2"]]] + view = j2.select(filtered, table2["value1"], table3["value2"]) # it works! 
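# repr of the multi-join projection should render without raising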
result = repr(view) @@ -112,7 +112,7 @@ def test_memoize_filtered_table(snapshot): ) dests = ["ORD", "JFK", "SFO"] - t = airlines[airlines.dest.isin(dests)] + t = airlines.filter(airlines.dest.isin(dests)) delay_filter = t.dest.topk(10, by=t.arrdelay.mean()) result = repr(delay_filter) @@ -149,11 +149,11 @@ def test_memoize_filtered_tables_in_join(snapshot): metric = purchases.amount.sum().name("total") agged = purchases.group_by(["region", "kind"]).aggregate(metric) - left = agged[agged.kind == "foo"] - right = agged[agged.kind == "bar"] + left = agged.filter(agged.kind == "foo") + right = agged.filter(agged.kind == "bar") cond = left.region == right.region - joined = left.join(right, cond)[left, right.total.name("right_total")] + joined = left.join(right, cond).select(left, right.total.name("right_total")) result = repr(joined) snapshot.assert_match(result, "repr.txt") @@ -179,7 +179,7 @@ def test_scalar_parameter_formatting(): def test_same_column_multiple_aliases(snapshot): table = ibis.table([("col", "int64")], name="t") - expr = table[table.col.name("fakealias1"), table.col.name("fakealias2")] + expr = table.select(table.col.name("fakealias1"), table.col.name("fakealias2")) result = repr(expr) assert "UnboundTable: t" in result @@ -412,7 +412,7 @@ def values(self): return {} table = MyRelation(alltypes, kind="foo").to_expr() - expr = table[table, table.a.name("a2")] + expr = table.select(table, table.a.name("a2")) result = repr(expr) snapshot.assert_match(result, "repr.txt") @@ -441,7 +441,7 @@ def shape(self): def test_format_show_variables(monkeypatch, alltypes, snapshot): monkeypatch.setattr(ibis.options.repr, "show_variables", True) - filtered = alltypes[alltypes.f > 0] + filtered = alltypes.filter(alltypes.f > 0) ordered = filtered.order_by("f") projected = ordered[["a", "b", "f"]] diff --git a/ibis/expr/tests/test_newrels.py b/ibis/expr/tests/test_newrels.py index 34f76ae7763ed..f996c51f5dcaf 100644 --- a/ibis/expr/tests/test_newrels.py +++ b/ibis/expr/tests/test_newrels.py @@ -769,7 +769,7 @@ def test_join_predicate_dereferencing(): table2 = ibis.table({"foo_id": str, "value1": float, "value3": float}) table3 = ibis.table({"bar_id": str, "value2": float}) - filtered = table[table["f"] > 0] + filtered = table.filter(table["f"] > 0) # dereference table.foo_id to filtered.foo_id j1 = filtered.left_join(table2, table["foo_id"] == table2["foo_id"]) @@ -793,7 +793,7 @@ def test_join_predicate_dereferencing(): j1 = filtered.left_join(table2, table["foo_id"] == table2["foo_id"]) j2 = j1.inner_join(table3, filtered["bar_id"] == table3["bar_id"]) - view = j2[[filtered, table2["value1"], table3["value2"]]] + view = j2.select(filtered, table2["value1"], table3["value2"]) with join_tables(j2) as (r1, r2, r3): expected = ops.JoinChain( first=r1, @@ -1148,7 +1148,7 @@ def test_self_join_view(): def test_self_join_with_view_projection(): t1 = ibis.table(schema={"x": "int", "y": "int", "z": "str"}) t2 = t1.view() - expr = t1.inner_join(t2, ["x"])[[t1]] + expr = t1.inner_join(t2, ["x"]).select(t1) with join_tables(expr) as (r1, r2): expected = ops.JoinChain( @@ -1200,7 +1200,7 @@ def test_join_chain_gets_reused_and_continued_after_a_select(): c = ibis.table(name="c", schema={"e": "int64", "f": "string"}) ab = a.join(b, [a.a == b.c]) - abc = ab[a.b, b.d].join(c, [a.a == c.e]) + abc = ab.select(a.b, b.d).join(c, [a.a == c.e]) with join_tables(abc) as (r1, r2, r3): expected = ops.JoinChain( @@ -1442,8 +1442,8 @@ def test_join_between_joins(): ) t4 = ibis.table([("key3", "string"), ("value4", 
"double")], "fourth") - left = t1.inner_join(t2, [("key1", "key1")])[t1, t2.value2] - right = t3.inner_join(t4, [("key3", "key3")])[t3, t4.value4] + left = t1.inner_join(t2, [("key1", "key1")]).select(t1, t2.value2) + right = t3.inner_join(t4, [("key3", "key3")]).select(t3, t4.value4) joined = left.inner_join(right, left.key2 == right.key2) @@ -1535,7 +1535,7 @@ def test_join_with_compound_predicate(): (t1.a + t1.a != t2.b) & (t1.b + t1.b != t2.a), ], ) - expr = joined[t1] + expr = joined.select(t1) with join_tables(joined) as (r1, r2): expected = ops.JoinChain( first=r1, diff --git a/ibis/expr/tests/test_visualize.py b/ibis/expr/tests/test_visualize.py index a77b65628051d..2f5df98af2966 100644 --- a/ibis/expr/tests/test_visualize.py +++ b/ibis/expr/tests/test_visualize.py @@ -30,8 +30,8 @@ def key(node): lambda t: t.a, lambda t: t.a + t.b, lambda t: t.a + t.b > 3**t.a, - lambda t: t[(t.a + t.b * 2 * t.b / t.b**3 > 4) & (t.b > 5)], - lambda t: t[(t.a + t.b * 2 * t.b / t.b**3 > 4) & (t.b > 5)] + lambda t: t.filter((t.a + t.b * 2 * t.b / t.b**3 > 4) & (t.b > 5)), + lambda t: t.filter((t.a + t.b * 2 * t.b / t.b**3 > 4) & (t.b > 5)) .group_by("c") .aggregate(amean=lambda f: f.a.mean(), bsum=lambda f: f.b.sum()), ], @@ -86,7 +86,7 @@ def test_join(how): left = ibis.table([("a", "int64"), ("b", "string")]) right = ibis.table([("b", "string"), ("c", "int64")]) joined = left.join(right, left.b == right.b, how=how) - result = joined[left.a, right.c] + result = joined.select(left.a, right.c) graph = viz.to_graph(result) assert key(result.op()) in graph.source @@ -134,7 +134,7 @@ def test_asof_join(): right = right.mutate(foo=1) joined = api.asof_join(left, right, "time") - result = joined[left, right.foo] + result = joined.select(left, right.foo) graph = viz.to_graph(result) assert key(result.op()) in graph.source diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py index d0fe76d10aa13..56745f33be7c4 100644 --- a/ibis/expr/types/relations.py +++ b/ibis/expr/types/relations.py @@ -3,6 +3,7 @@ import itertools import operator import re +import warnings from collections import deque from collections.abc import Callable, Iterable, Iterator, Mapping, Sequence from keyword import iskeyword @@ -559,16 +560,16 @@ def preview( console_width=console_width, ) - def __getitem__(self, what): - """Select items from a table expression. - - This method implements square bracket syntax for table expressions, - including various forms of projection and filtering. + def __getitem__(self, what: str | int | slice | Sequence[str | int]): + """Select one or more columns or rows from a table expression. Parameters ---------- what - Selection object. This can be a variety of types including strings, ints, lists. + What to select. Options are: + - A `str` column name or `int` column index to select a single column. + - A sequence of column names or indices to select multiple columns. + - A slice to select a subset of rows. 
Returns ------- @@ -579,10 +580,8 @@ def __getitem__(self, what): Examples -------- >>> import ibis - >>> import ibis.selectors as s - >>> from ibis import _ >>> ibis.options.interactive = True - >>> t = ibis.examples.penguins.fetch() + >>> t = ibis.examples.penguins.fetch().head() >>> t ┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━┓ ┃ species ┃ island ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ … ┃ @@ -594,15 +593,9 @@ def __getitem__(self, what): │ Adelie │ Torgersen │ 40.3 │ 18.0 │ 195 │ … │ │ Adelie │ Torgersen │ NULL │ NULL │ NULL │ … │ │ Adelie │ Torgersen │ 36.7 │ 19.3 │ 193 │ … │ - │ Adelie │ Torgersen │ 39.3 │ 20.6 │ 190 │ … │ - │ Adelie │ Torgersen │ 38.9 │ 17.8 │ 181 │ … │ - │ Adelie │ Torgersen │ 39.2 │ 19.6 │ 195 │ … │ - │ Adelie │ Torgersen │ 34.1 │ 18.1 │ 193 │ … │ - │ Adelie │ Torgersen │ 42.0 │ 20.2 │ 190 │ … │ - │ … │ … │ … │ … │ … │ … │ └─────────┴───────────┴────────────────┴───────────────┴───────────────────┴───┘ - Return a column by name + Select a single column by name: >>> t["island"] ┏━━━━━━━━━━━┓ @@ -615,15 +608,9 @@ def __getitem__(self, what): │ Torgersen │ │ Torgersen │ │ Torgersen │ - │ Torgersen │ - │ Torgersen │ - │ Torgersen │ - │ Torgersen │ - │ Torgersen │ - │ … │ └───────────┘ - Return the second column, starting from index 0 + Select a single column by index: >>> t.columns[1] 'island' @@ -638,105 +625,11 @@ def __getitem__(self, what): │ Torgersen │ │ Torgersen │ │ Torgersen │ - │ Torgersen │ - │ Torgersen │ - │ Torgersen │ - │ Torgersen │ - │ Torgersen │ - │ … │ └───────────┘ - Extract a range of rows - - >>> t[:2] - ┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━┓ - ┃ species ┃ island ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ … ┃ - ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━┩ - │ string │ string │ float64 │ float64 │ int64 │ … │ - ├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼───┤ - │ Adelie │ Torgersen │ 39.1 │ 18.7 │ 181 │ … │ - │ Adelie │ Torgersen │ 39.5 │ 17.4 │ 186 │ … │ - └─────────┴───────────┴────────────────┴───────────────┴───────────────────┴───┘ - >>> t[:5] - ┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━┓ - ┃ species ┃ island ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ … ┃ - ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━┩ - │ string │ string │ float64 │ float64 │ int64 │ … │ - ├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼───┤ - │ Adelie │ Torgersen │ 39.1 │ 18.7 │ 181 │ … │ - │ Adelie │ Torgersen │ 39.5 │ 17.4 │ 186 │ … │ - │ Adelie │ Torgersen │ 40.3 │ 18.0 │ 195 │ … │ - │ Adelie │ Torgersen │ NULL │ NULL │ NULL │ … │ - │ Adelie │ Torgersen │ 36.7 │ 19.3 │ 193 │ … │ - └─────────┴───────────┴────────────────┴───────────────┴───────────────────┴───┘ - >>> t[2:5] - ┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━┓ - ┃ species ┃ island ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ … ┃ - ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━┩ - │ string │ string │ float64 │ float64 │ int64 │ … │ - ├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼───┤ - │ Adelie │ Torgersen │ 40.3 │ 18.0 │ 195 │ … │ - │ Adelie │ Torgersen │ NULL │ NULL │ NULL │ … │ - │ Adelie │ Torgersen │ 36.7 │ 19.3 │ 193 │ … │ - └─────────┴───────────┴────────────────┴───────────────┴───────────────────┴───┘ + Select 
multiple columns by name: - Some backends support negative slice indexing - - >>> t[-5:] # last 5 rows - ┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━┓ - ┃ species ┃ island ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ … ┃ - ┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━┩ - │ string │ string │ float64 │ float64 │ int64 │ … │ - ├───────────┼────────┼────────────────┼───────────────┼───────────────────┼───┤ - │ Chinstrap │ Dream │ 55.8 │ 19.8 │ 207 │ … │ - │ Chinstrap │ Dream │ 43.5 │ 18.1 │ 202 │ … │ - │ Chinstrap │ Dream │ 49.6 │ 18.2 │ 193 │ … │ - │ Chinstrap │ Dream │ 50.8 │ 19.0 │ 210 │ … │ - │ Chinstrap │ Dream │ 50.2 │ 18.7 │ 198 │ … │ - └───────────┴────────┴────────────────┴───────────────┴───────────────────┴───┘ - >>> t[-5:-3] # last 5th to 3rd rows - ┏━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━┓ - ┃ species ┃ island ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ … ┃ - ┡━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━┩ - │ string │ string │ float64 │ float64 │ int64 │ … │ - ├───────────┼────────┼────────────────┼───────────────┼───────────────────┼───┤ - │ Chinstrap │ Dream │ 55.8 │ 19.8 │ 207 │ … │ - │ Chinstrap │ Dream │ 43.5 │ 18.1 │ 202 │ … │ - └───────────┴────────┴────────────────┴───────────────┴───────────────────┴───┘ - >>> t[2:-2] # chop off the first two and last two rows - ┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━┓ - ┃ species ┃ island ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ … ┃ - ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━┩ - │ string │ string │ float64 │ float64 │ int64 │ … │ - ├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼───┤ - │ Adelie │ Torgersen │ 40.3 │ 18.0 │ 195 │ … │ - │ Adelie │ Torgersen │ NULL │ NULL │ NULL │ … │ - │ Adelie │ Torgersen │ 36.7 │ 19.3 │ 193 │ … │ - │ Adelie │ Torgersen │ 39.3 │ 20.6 │ 190 │ … │ - │ Adelie │ Torgersen │ 38.9 │ 17.8 │ 181 │ … │ - │ Adelie │ Torgersen │ 39.2 │ 19.6 │ 195 │ … │ - │ Adelie │ Torgersen │ 34.1 │ 18.1 │ 193 │ … │ - │ Adelie │ Torgersen │ 42.0 │ 20.2 │ 190 │ … │ - │ Adelie │ Torgersen │ 37.8 │ 17.1 │ 186 │ … │ - │ Adelie │ Torgersen │ 37.8 │ 17.3 │ 180 │ … │ - │ … │ … │ … │ … │ … │ … │ - └─────────┴───────────┴────────────────┴───────────────┴───────────────────┴───┘ - - Select columns - - >>> t[["island", "bill_length_mm"]].head() - ┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓ - ┃ island ┃ bill_length_mm ┃ - ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩ - │ string │ float64 │ - ├───────────┼────────────────┤ - │ Torgersen │ 39.1 │ - │ Torgersen │ 39.5 │ - │ Torgersen │ 40.3 │ - │ Torgersen │ NULL │ - │ Torgersen │ 36.7 │ - └───────────┴────────────────┘ - >>> t["island", "bill_length_mm"].head() + >>> t[["island", "bill_length_mm"]] ┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓ ┃ island ┃ bill_length_mm ┃ ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩ @@ -748,37 +641,10 @@ def __getitem__(self, what): │ Torgersen │ NULL │ │ Torgersen │ 36.7 │ └───────────┴────────────────┘ - >>> t[_.island, _.bill_length_mm].head() - ┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓ - ┃ island ┃ bill_length_mm ┃ - ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩ - │ string │ float64 │ - ├───────────┼────────────────┤ - │ Torgersen │ 39.1 │ - │ Torgersen │ 39.5 │ - │ Torgersen │ 40.3 │ - │ Torgersen │ NULL │ - │ Torgersen │ 36.7 │ - └───────────┴────────────────┘ - - Filtering - >>> t[t.island.lower() != "torgersen"].head() - 
┏━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━┓ - ┃ species ┃ island ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ … ┃ - ┡━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━┩ - │ string │ string │ float64 │ float64 │ int64 │ … │ - ├─────────┼────────┼────────────────┼───────────────┼───────────────────┼───┤ - │ Adelie │ Biscoe │ 37.8 │ 18.3 │ 174 │ … │ - │ Adelie │ Biscoe │ 37.7 │ 18.7 │ 180 │ … │ - │ Adelie │ Biscoe │ 35.9 │ 19.2 │ 189 │ … │ - │ Adelie │ Biscoe │ 38.2 │ 18.1 │ 185 │ … │ - │ Adelie │ Biscoe │ 38.8 │ 17.2 │ 180 │ … │ - └─────────┴────────┴────────────────┴───────────────┴───────────────────┴───┘ + Select a range of rows: - Selectors - - >>> t[~s.numeric() | (s.numeric() & ~s.c("year"))].head() + >>> t[:2] ┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━┓ ┃ species ┃ island ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ … ┃ ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━┩ @@ -786,45 +652,45 @@ def __getitem__(self, what): ├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼───┤ │ Adelie │ Torgersen │ 39.1 │ 18.7 │ 181 │ … │ │ Adelie │ Torgersen │ 39.5 │ 17.4 │ 186 │ … │ + └─────────┴───────────┴────────────────┴───────────────┴───────────────────┴───┘ + >>> t[2:5] + ┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━┓ + ┃ species ┃ island ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ … ┃ + ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━┩ + │ string │ string │ float64 │ float64 │ int64 │ … │ + ├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼───┤ │ Adelie │ Torgersen │ 40.3 │ 18.0 │ 195 │ … │ │ Adelie │ Torgersen │ NULL │ NULL │ NULL │ … │ │ Adelie │ Torgersen │ 36.7 │ 19.3 │ 193 │ … │ └─────────┴───────────┴────────────────┴───────────────┴───────────────────┴───┘ - >>> t[s.r["bill_length_mm":"body_mass_g"]].head() - ┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ - ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ body_mass_g ┃ - ┡━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ - │ float64 │ float64 │ int64 │ int64 │ - ├────────────────┼───────────────┼───────────────────┼─────────────┤ - │ 39.1 │ 18.7 │ 181 │ 3750 │ - │ 39.5 │ 17.4 │ 186 │ 3800 │ - │ 40.3 │ 18.0 │ 195 │ 3250 │ - │ NULL │ NULL │ NULL │ NULL │ - │ 36.7 │ 19.3 │ 193 │ 3450 │ - └────────────────┴───────────────┴───────────────────┴─────────────┘ """ from ibis.expr.types.logical import BooleanValue - if isinstance(what, slice): - limit, offset = util.slice_to_limit_offset(what, self.count()) - return self.limit(limit, offset=offset) - # skip the self.bind call for single column access with strings or ints - # because dereferencing has significant overhead - elif isinstance(what, str): + if isinstance(what, str): return ops.Field(self.op(), what).to_expr() elif isinstance(what, int): return ops.Field(self.op(), self.columns[what]).to_expr() + elif isinstance(what, slice): + limit, offset = util.slice_to_limit_offset(what, self.count()) + return self.limit(limit, offset=offset) args = [ self.columns[arg] if isinstance(arg, int) else arg for arg in util.promote_list(what) ] + if util.all_of(args, str): + return self.select(args) + + # Once this deprecation is removed, we'll want to error here instead. 
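+ # For example, t[t.a > 0] should be written as t.filter(t.a > 0), and + # t[t.a, (t.b + 1).name("c")] as t.select(t.a, (t.b + 1).name("c")).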
+ warnings.warn( + "Selecting/filtering arbitrary expressions in `Table.__getitem__` is " + "deprecated and will be removed in version 10.0. Please use " + "`Table.select` or `Table.filter` instead.", + FutureWarning, + ) values = self.bind(args) - if isinstance(what, (str, int)): - assert len(values) == 1 - return values[0] - elif util.all_of(values, BooleanValue): + if util.all_of(values, BooleanValue): return self.filter(values) else: return self.select(values) @@ -2923,7 +2789,7 @@ def unpack(self, *columns: str) -> Table: result_columns.extend(expr[field] for field in expr.names) else: result_columns.append(column) - return self[result_columns] + return self.select(result_columns) def info(self) -> Table: """Return summary information about a table. diff --git a/ibis/tests/benchmarks/test_benchmarks.py b/ibis/tests/benchmarks/test_benchmarks.py index 0ff9dee0db2e4..ee5a76b8d160a 100644 --- a/ibis/tests/benchmarks/test_benchmarks.py +++ b/ibis/tests/benchmarks/test_benchmarks.py @@ -53,7 +53,7 @@ def t(): def make_base(t): - return t[ + return t.filter( ( (t.year > 2016) | ((t.year == 2016) & (t.month > 6)) @@ -80,7 +80,7 @@ def make_base(t): & (t.minute <= 5) ) ) - ] + ) @pytest.fixture(scope="module") @@ -394,9 +394,9 @@ def tpc_h02(part, supplier, partsupp, nation, region): .join(region, nation.n_regionkey == region.r_regionkey) ) - subexpr = subexpr[ + subexpr = subexpr.filter( (subexpr.r_name == REGION) & (expr.p_partkey == subexpr.ps_partkey) - ] + ) filters = [ expr.p_size == SIZE, @@ -529,7 +529,7 @@ def eq(a, b): def multiple_joins(table, num_joins): for _ in range(num_joins): table = table.mutate(dummy=ibis.literal("")) - table = table.left_join(table.view(), ["dummy"])[[table]] + table = table.left_join(table.view(), ["dummy"]).select(table) @pytest.mark.parametrize("num_joins", [1, 10]) diff --git a/ibis/tests/expr/test_analysis.py b/ibis/tests/expr/test_analysis.py index 527bbab84c8f9..6536d761a82d7 100644 --- a/ibis/tests/expr/test_analysis.py +++ b/ibis/tests/expr/test_analysis.py @@ -17,7 +17,7 @@ def test_rewrite_join_projection_without_other_ops(con): table2 = con.table("star2") table3 = con.table("star3") - filtered = table[table["f"] > 0] + filtered = table.filter(table["f"] > 0) pred1 = table["foo_id"] == table2["foo_id"] pred2 = filtered["bar_id"] == table3["bar_id"] @@ -25,7 +25,7 @@ def test_rewrite_join_projection_without_other_ops(con): j1 = filtered.left_join(table2, [pred1]) j2 = j1.inner_join(table3, [pred2]) # Project out the desired fields - view = j2[[filtered, table2["value1"], table3["value2"]]] + view = j2.select(filtered, table2["value1"], table3["value2"]) with join_tables(j2) as (r1, r2, r3): # Construct the thing we expect to obtain @@ -90,11 +90,11 @@ def test_filter_on_projected_field(con): .join(orders, orders.o_custkey == customer.c_custkey) ) - tpch = all_join[fields_of_interest] + tpch = all_join.select(*fields_of_interest) # Correlated subquery, yikes! 
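# t2 is a self-referencing view of tpch, so filtering it on tpch.region makes the mean correlated with each outer row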
t2 = tpch.view() - conditional_avg = t2[(t2.region == tpch.region)].amount.mean() + conditional_avg = t2.filter(t2.region == tpch.region).amount.mean() # `amount` is part of the projection above as an aliased field amount_filter = tpch.amount > conditional_avg @@ -116,7 +116,7 @@ def test_join_predicate_from_derived_raises(): table2 = ibis.table([("key", "string"), ("value", "double")], "bar_table") filter_pred = table["f"] > 0 - table3 = table[filter_pred] + table3 = table.filter(filter_pred) with pytest.raises(com.IntegrityError, match="they belong to another relation"): # TODO(kszucs): could be smarter actually and rewrite the predicate @@ -153,8 +153,8 @@ def test_filter_self_join(): metrics={"total": purchases.amount.sum()}, ) - left = agged[agged.kind == "foo"] - right = agged[agged.kind == "bar"] + left = agged.filter(agged.kind == "foo") + right = agged.filter(agged.kind == "bar") assert left.op() == ops.Filter( parent=agged, predicates=[agged.kind == "foo"], @@ -186,11 +186,13 @@ def test_filter_self_join(): def test_is_ancestor_analytic(): x = ibis.table(ibis.schema([("col", "int32")]), "x") - with_filter_col = x[x.columns + [ibis.null().name("filter")]] - filtered = with_filter_col[with_filter_col["filter"].isnull()] - subquery = filtered[filtered.columns] + with_filter_col = x.select(x.columns + [ibis.null().name("filter")]) + filtered = with_filter_col.filter(with_filter_col["filter"].isnull()) + subquery = filtered.select(filtered.columns) - with_analytic = subquery[subquery.columns + [subquery.count().name("analytic")]] + with_analytic = subquery.select( + subquery.columns + [subquery.count().name("analytic")] + ) assert not subquery.op().equals(with_analytic.op()) @@ -252,10 +254,10 @@ def test_select_filter_mutate_fusion(): t = ibis.table(ibis.schema([("col", "float32")]), "t") - t1 = t[["col"]] + t1 = t.select("col") assert t1.op() == ops.Project(parent=t, values={"col": t.col}) - t2 = t1[t1["col"].isnan()] + t2 = t1.filter(t1["col"].isnan()) assert t2.op() == ops.Filter(parent=t1, predicates=[t1.col.isnan()]) t3 = t2.mutate(col=t2["col"].cast("int32")) diff --git a/ibis/tests/expr/test_analytics.py b/ibis/tests/expr/test_analytics.py index c6c57c3b2e5a7..ab2f6a17b7af3 100644 --- a/ibis/tests/expr/test_analytics.py +++ b/ibis/tests/expr/test_analytics.py @@ -44,7 +44,7 @@ def test_category_project(alltypes): t = alltypes tier = t.double_col.bucket([0, 50, 100]).name("tier") - expr = t[tier, t] + expr = t.select(tier, t) assert isinstance(expr.tier, ir.IntegerColumn) @@ -99,7 +99,7 @@ def test_histogram(alltypes): def test_topk_analysis_bug(airlines): # GH #398 dests = ["ORD", "JFK", "SFO"] - t = airlines[airlines.dest.isin(dests)] + t = airlines.filter(airlines.dest.isin(dests)) filtered = t.semi_join(t.origin.topk(10, by=t.arrdelay.mean()), "origin") assert filtered is not None diff --git a/ibis/tests/expr/test_case.py b/ibis/tests/expr/test_case.py index dbd0b9d21746f..97bfcba5d6649 100644 --- a/ibis/tests/expr/test_case.py +++ b/ibis/tests/expr/test_case.py @@ -211,7 +211,7 @@ def test_case_mixed_type(): expr = ( t0.three.case().when(0, "low").when(1, "high").else_("null").end().name("label") ) - result = t0[expr] + result = t0.select(expr) assert result["label"].type().equals(dt.string) diff --git a/ibis/tests/expr/test_format_sql_operations.py b/ibis/tests/expr/test_format_sql_operations.py index 4025aa11cb528..500866ad86b20 100644 --- a/ibis/tests/expr/test_format_sql_operations.py +++ b/ibis/tests/expr/test_format_sql_operations.py @@ -31,7 +31,7 @@ def 
test_memoize_database_table(con, snapshot): table2 = con.table("test2") filter_pred = table["f"] > 0 - table3 = table[filter_pred] + table3 = table.filter(filter_pred) join_pred = table3["g"] == table2["key"] joined = table2.inner_join(table3, [join_pred]) @@ -56,7 +56,7 @@ def test_memoize_insert_sort_key(con, snapshot): dest_avg=t.arrdelay.mean(), dev=t.arrdelay - t.arrdelay.mean() ) - worst = expr[expr.dev.notnull()].order_by(ibis.desc("dev")).limit(10) + worst = expr.filter(expr.dev.notnull()).order_by(ibis.desc("dev")).limit(10) result = repr(worst) assert result.count("airlines") == 1 diff --git a/ibis/tests/expr/test_struct.py b/ibis/tests/expr/test_struct.py index 92013d6c63908..9b6d8914dff6f 100644 --- a/ibis/tests/expr/test_struct.py +++ b/ibis/tests/expr/test_struct.py @@ -62,11 +62,11 @@ def test_struct_pickle(): def test_lift(t): - assert t.a.lift().equals(t[_.a.b, _.a.c]) + assert t.a.lift().equals(t.select(_.a.b, _.a.c)) def test_unpack_from_table(t): - assert t.unpack("a").equals(t[_.a.b, _.a.c, _.d]) + assert t.unpack("a").equals(t.select(_.a.b, _.a.c, _.d)) def test_lift_join(t, s): @@ -86,7 +86,7 @@ def test_lift_join(t, s): def test_unpack_join_from_table(t, s): join = t.join(s, t.d == s.a.g) result = join.unpack("a_right") - expected = join[_.a, _.d, _.a_right.f, _.a_right.g] + expected = join.select(_.a, _.d, _.a_right.f, _.a_right.g) assert result.equals(expected) diff --git a/ibis/tests/expr/test_table.py b/ibis/tests/expr/test_table.py index ba9e7b218001a..6d802403dca9b 100644 --- a/ibis/tests/expr/test_table.py +++ b/ibis/tests/expr/test_table.py @@ -97,7 +97,7 @@ def test_getitem_column_select(table): def test_select_using_selector(table): - expr = table[s.numeric()] + expr = table.select(s.numeric()) expected = table.select( table.a, table.b, @@ -124,9 +124,8 @@ def test_getitem_attribute(table): result = table.a assert_equal(result, table["a"]) - # Project and add a name that conflicts with a Table built-in - # attribute - view = table[[table, table["a"].name("schema")]] + # Project and add a name that conflicts with a Table built-in attribute + view = table.mutate(schema=table.a) assert not isinstance(view.schema, Column) @@ -176,7 +175,7 @@ def test_projection_with_exprs(table): col_exprs = [table["b"].log().name("log_b"), mean_diff.name("mean_diff")] - proj = table[col_exprs + ["g"]] + proj = table.select(col_exprs + ["g"]) schema = proj.schema() assert schema.names == ("log_b", "mean_diff", "g") assert schema.types == (dt.double, dt.double, dt.string) @@ -219,7 +218,7 @@ def test_projection_with_star_expr(table): t = table # it lives! 
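# passing the table itself projects all of its columns alongside the added expression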
- proj = t[t, new_expr] + proj = t.select(t, new_expr) repr(proj) ex_names = table.schema().names + ("bigger_a",) @@ -228,14 +227,35 @@ def test_projection_with_star_expr(table): # cannot pass an invalid table expression t2 = t.aggregate([t["a"].sum().name("sum(a)")], by=["g"]) with pytest.raises(IntegrityError): - t[[t2]] + t.select(t2) # TODO: there may be some ways this can be invalid -def test_projection_convenient_syntax(table): - proj = table[table, table["a"].name("foo")] - proj2 = table[[table, table["a"].name("foo")]] - assert_equal(proj, proj2) +def test_deprecated_getitem_select_filter(table): + # Select + sol1 = table.select(table, table.a.name("foo")) + with pytest.warns(FutureWarning): + e1 = table[table, table["a"].name("foo")] + e2 = table[[table, table["a"].name("foo")]] + + assert_equal(e1, sol1) + assert_equal(e2, sol1) + + # Select with selector + sol2 = table.select(s.numeric()) + with pytest.warns(FutureWarning): + e3 = table[s.numeric()] + + assert_equal(e3, sol2) + + # Filter + sol3 = table.filter(table.a > 10, table.a < 20) + with pytest.warns(FutureWarning): + e4 = table[table.a > 10, table.a < 20] + e5 = table[[table.a > 10, table.a < 20]] + + assert_equal(e4, sol3) + assert_equal(e5, sol3) def test_projection_mutate_analysis_bug(con): @@ -243,7 +263,7 @@ def test_projection_mutate_analysis_bug(con): t = con.table("airlines") - filtered = t[t.depdelay.notnull()] + filtered = t.filter(t.depdelay.notnull()) leg = ibis.literal("-").join([t.origin, t.dest]) mutated = filtered.mutate(leg=leg) @@ -251,19 +271,6 @@ def test_projection_mutate_analysis_bug(con): mutated["year", "month", "day", "depdelay", "leg"] -def test_projection_self(table): - result = table[table] - expected = table.select(table) - - assert_equal(result, expected) - - -def test_projection_array_expr(table): - result = table[table.a] - expected = table[[table.a]] - assert_equal(result, expected) - - @pytest.mark.parametrize("empty", [list(), dict()]) def test_projection_no_expr(table, empty): with pytest.raises(com.IbisTypeError, match="must select at least one"): @@ -299,7 +306,7 @@ def test_mutate(table): kw5=ibis.literal(9), kw6=ibis.literal("ten"), ) - expected = table[ + expected = table.select( table, (table.a + 1).name("x1"), table.b.sum().name("x2"), @@ -313,7 +320,7 @@ def test_mutate(table): (table.a + 8).name("kw4"), ibis.literal(9).name("kw5"), ibis.literal("ten").name("kw6"), - ] + ) assert_equal(expr, expected) @@ -322,7 +329,7 @@ def test_mutate_alter_existing_columns(table): foo = table.d * 2 expr = table.mutate(f=new_f, foo=foo) - expected = table[ + expected = table.select( "a", "b", "c", @@ -335,7 +342,7 @@ def test_mutate_alter_existing_columns(table): "j", "k", foo.name("foo"), - ] + ) assert_equal(expr, expected) @@ -345,25 +352,11 @@ def test_replace_column(): expr = tb.b.cast("int32") tb2 = tb.mutate(b=expr) - expected = tb[tb.a, expr.name("b"), tb.c] + expected = tb.select(tb.a, expr.name("b"), tb.c) assert_equal(tb2, expected) -def test_filter_no_list(table): - pred = table.a > 5 - - result = table.filter(pred) - expected = table[pred] - assert_equal(result, expected) - - -def test_add_predicate(table): - pred = table["a"] > 5 - result = table[pred] - assert isinstance(result.op(), ops.Filter) - - def test_invalid_predicate(table, schema): # a lookalike table2 = api.table(schema, name="bar") @@ -379,12 +372,12 @@ def test_add_predicate_coalesce(table): pred1 = table["a"] > 5 pred2 = table["b"] > 0 - result = simplify(table[pred1][pred2].op()).to_expr() + result = 
simplify(table.filter(pred1).filter(pred2).op()).to_expr() expected = table.filter([pred1, pred2]) assert_equal(result, expected) # 59, if we are not careful, we can obtain broken refs - subset = table[pred1] + subset = table.filter(pred1) result = simplify(subset.filter([subset["b"] > 0]).op()).to_expr() assert_equal(result, expected) @@ -392,7 +385,7 @@ def test_add_predicate_coalesce(table): def test_repr_same_but_distinct_objects(con): t = con.table("test1") t_copy = con.table("test1") - table2 = t[t_copy["f"] > 0] + table2 = t.filter(t_copy["f"] > 0) result = repr(table2) assert result.count("DatabaseTable") == 1 @@ -402,10 +395,10 @@ def test_filter_fusion_distinct_table_objects(con): t = con.table("test1") tt = con.table("test1") - expr = t[t.f > 0][t.c > 0] - expr2 = t[t.f > 0][tt.c > 0] - expr3 = t[tt.f > 0][tt.c > 0] - expr4 = t[tt.f > 0][t.c > 0] + expr = t.filter(t.f > 0).filter(t.c > 0) + expr2 = t.filter(t.f > 0).filter(tt.c > 0) + expr3 = t.filter(tt.f > 0).filter(tt.c > 0) + expr4 = t.filter(tt.f > 0).filter(t.c > 0) assert_equal(expr, expr2) assert repr(expr) == repr(expr2) @@ -1095,18 +1088,6 @@ def test_join_combo_with_projection(table): repr(proj) -def test_join_getitem_projection(con): - region = con.table("tpch_region") - nation = con.table("tpch_nation") - - pred = region.r_regionkey == nation.n_regionkey - joined = region.inner_join(nation, pred) - - result = joined[nation] - expected = joined.select(nation) - assert_equal(result, expected) - - def test_self_join(table): # Self-joins are problematic with this design because column # expressions may reference either the left or right For example: @@ -1127,7 +1108,7 @@ def test_self_join(table): joined = left.inner_join(right, [right["g"] == left["g"]]) # Project out left table schema - proj = joined[[left]] + proj = joined.select(left) assert_equal(proj.schema(), left.schema()) # Try aggregating on top of joined @@ -1148,18 +1129,6 @@ def test_self_join_no_view_convenience(table): assert result.columns == expected_cols -def test_join_reference_bug(con): - # GH#403 - orders = con.table("tpch_orders") - customer = con.table("tpch_customer") - lineitem = con.table("tpch_lineitem") - - items = orders.join(lineitem, orders.o_orderkey == lineitem.l_orderkey)[ - lineitem, orders.o_custkey, orders.o_orderpriority - ].join(customer, [("o_custkey", "c_custkey")]) - items["o_orderpriority"].value_counts() - - def test_join_project_after(table): # e.g. 
# @@ -1494,21 +1463,12 @@ def test_unresolved_existence_predicate(t1, t2): filtered = t2.filter(t1.key1 == t2.key1) subquery = ops.ExistsSubquery(filtered) expected = ops.Filter(parent=t1, predicates=[subquery]) - assert t1[expr].op() == expected + assert t1.filter(expr).op() == expected filtered = t1.filter(t1.key1 == t2.key1) subquery = ops.ExistsSubquery(filtered) expected = ops.Filter(parent=t2, predicates=[subquery]) - assert t2[expr].op() == expected - - -def test_resolve_existence_predicate(t1, t2): - expr = t1[(t1.key1 == t2.key1).any()] - op = expr.op() - assert isinstance(op, ops.Filter) - - pred = op.predicates[0].to_expr() - assert isinstance(pred.op(), ops.ExistsSubquery) + assert t2.filter(expr).op() == expected def test_aggregate_metrics(table): @@ -1564,11 +1524,8 @@ def test_filter(table): m = table.mutate(foo=table.f * 2, bar=table.e / 2) result = m.filter(lambda x: x.foo > 10) - result2 = m[lambda x: x.foo > 10] - expected = m[m.foo > 10] - + expected = m.filter(m.foo > 10) assert_equal(result, expected) - assert_equal(result2, expected) result = m.filter([lambda x: x.foo > 10, lambda x: x.bar < 0]) expected = m.filter([m.foo > 10, m.bar < 0]) @@ -1602,10 +1559,8 @@ def f(x): return (x.foo * 2).name("bar") result = m.select([f, "f"]) - result2 = m[f, "f"] expected = m.select([f(m), "f"]) assert_equal(result, expected) - assert_equal(result2, expected) def test_mutate2(table): @@ -1774,20 +1729,14 @@ def test_merge_as_of_allows_overlapping_columns(): name="t", ) - signal_one = table[ + signal_one = table.filter( table["field"].contains("signal_one") & table["field"].contains("current") - ] - signal_one = signal_one[ - "value", "timestamp_received", "field" - ] # select columns we care about + )["value", "timestamp_received", "field"] signal_one = signal_one.rename(current="value", signal_one="field") - signal_two = table[ + signal_two = table.filter( table["field"].contains("signal_two") & table["field"].contains("voltage") - ] - signal_two = signal_two[ - "value", "timestamp_received", "field" - ] # select columns we care about + )["value", "timestamp_received", "field"] signal_two = signal_two.rename(voltage="value", signal_two="field") merged = signal_one.asof_join(signal_two, "timestamp_received") @@ -1806,7 +1755,7 @@ def test_select_from_unambiguous_join_with_strings(): t = ibis.table([("a", "int64"), ("b", "string")]) s = ibis.table([("b", "int64"), ("c", "string")]) joined = t.left_join(s, [t.b == s.c]) - expr = joined[t, "c"] + expr = joined.select(t, "c") assert expr.columns == ["a", "b", "c"] @@ -1913,12 +1862,6 @@ def test_default_backend_with_unbound_table(): assert expr.execute() -def test_array_string_compare(): - t = ibis.table(schema=dict(by="string", words="array"), name="t") - expr = t[t.by == "foo"].mutate(words=_.words.unnest()).filter(_.words == "the") - assert expr is not None - - @pytest.mark.parametrize("value", [True, False]) def test_filter_with_literal(value): t = ibis.table(dict(a="string")) diff --git a/ibis/tests/expr/test_value_exprs.py b/ibis/tests/expr/test_value_exprs.py index cc5a756437e57..e95bda04d864e 100644 --- a/ibis/tests/expr/test_value_exprs.py +++ b/ibis/tests/expr/test_value_exprs.py @@ -291,7 +291,7 @@ def test_isin_notin_list(table, container): def test_value_counts(table, string_col): bool_clause = table[string_col].notin(["1", "4", "7"]) - expr = table[bool_clause][string_col].value_counts() + expr = table.filter(bool_clause)[string_col].value_counts() assert isinstance(expr, ir.Table) @@ -1362,7 +1362,7 @@ def 
test_select_on_unambiguous_join(join_method): def test_chained_select_on_join(): t = ibis.table([("a", dt.int64)], name="t") s = ibis.table([("a", dt.int64), ("b", dt.string)], name="s") - join = t.join(s)[t.a, s.b] + join = t.join(s).select(t.a, s.b) expr1 = join["a", "b"] expr2 = join.select(["a", "b"]) assert expr1.equals(expr2) @@ -1376,7 +1376,7 @@ def test_repr_list_of_lists(): def test_repr_list_of_lists_in_table(): t = ibis.table([("a", "int64")], name="t") lit = ibis.literal([[1]]) - expr = t[t, lit.name("array_of_array")] + expr = t.select(t, lit.name("array_of_array")) repr(expr) @@ -1504,7 +1504,7 @@ def test_deferred_r_ops(op_name, expected_left, expected_right): right = _.a op = getattr(operator, op_name) - expr = t[op(left, right).name("b")] + expr = t.select(op(left, right).name("b")) node = expr.op().values["b"] assert node.left.equals(expected_left(t).op()) assert node.right.equals(expected_right(t).op()) @@ -1675,7 +1675,7 @@ def test_rowid_only_physical_tables(): table = ibis.table({"x": "int", "y": "string"}, name="t") table.rowid() # works - table[table.rowid(), table.x].filter(_.x > 10) # works + table.select(table.rowid(), table.x).filter(_.x > 10) # works with pytest.raises(com.IbisTypeError, match="only valid for physical tables"): table.filter(table.x > 0).rowid() diff --git a/ibis/tests/expr/test_window_frames.py b/ibis/tests/expr/test_window_frames.py index 5f6ac6dd0fb6a..7477544f3da97 100644 --- a/ibis/tests/expr/test_window_frames.py +++ b/ibis/tests/expr/test_window_frames.py @@ -511,7 +511,9 @@ def metric(x): return x.arrdelay.mean().name("avg_delay") annual_delay = ( - t[t.dest.isin(["JFK", "SFO"])].group_by(["dest", "year"]).aggregate(metric) + t.filter(t.dest.isin(["JFK", "SFO"])) + .group_by(["dest", "year"]) + .aggregate(metric) ) what = annual_delay.group_by("dest") enriched = what.mutate(grand_avg=annual_delay.avg_delay.mean()) @@ -521,7 +523,7 @@ def metric(x): .name("grand_avg") .over(ibis.window(group_by=annual_delay.dest)) ) - expected = annual_delay[annual_delay, expr] + expected = annual_delay.select(annual_delay, expr) assert enriched.equals(expected)
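
Taken together, the test updates above all apply one migration recipe. A minimal sketch of that recipe follows; it is illustrative only and not part of the patch itself — the toy table, column names, and variable names are assumptions:

```python
import ibis
import ibis.selectors as s

t = ibis.table({"a": "int64", "b": "string"}, name="t")

# Filtering via __getitem__ now raises a FutureWarning; use .filter instead:
filtered = t.filter(t.a > 5)  # was: t[t.a > 5]

# Expression or selector projection via __getitem__ also warns; use .select:
proj = t.select(t, (t.a + 1).name("x"))  # was: t[t, (t.a + 1).name("x")]
nums = t.select(s.numeric())             # was: t[s.numeric()]

# Adding or replacing columns usually reads better as .mutate:
with_x = t.mutate(x=t.a + 1)

# Plain column-name selection via __getitem__ is unchanged:
col = t["a"]
cols = t["a", "b"]
```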