From be451a466f8f60b20fc02e89c6f07742f3eeacb5 Mon Sep 17 00:00:00 2001 From: Jim Crist-Harif Date: Thu, 28 Mar 2024 13:08:58 -0500 Subject: [PATCH] depr(api): deprecate `how` parameter to `arbitrary` --- ibis/backends/bigquery/compiler.py | 9 +- ibis/backends/clickhouse/compiler.py | 10 +- ibis/backends/datafusion/compiler.py | 2 +- ibis/backends/druid/compiler.py | 1 - ibis/backends/duckdb/compiler.py | 1 + ibis/backends/exasol/compiler.py | 1 - ibis/backends/flink/compiler.py | 1 - ibis/backends/impala/compiler.py | 1 - ibis/backends/mssql/compiler.py | 1 - ibis/backends/mysql/compiler.py | 1 - ibis/backends/oracle/compiler.py | 1 - ibis/backends/pandas/executor.py | 7 +- ibis/backends/postgres/compiler.py | 1 + ibis/backends/pyspark/compiler.py | 13 +- ibis/backends/risingwave/compiler.py | 1 + ibis/backends/snowflake/compiler.py | 12 +- ibis/backends/sql/compiler.py | 7 - ibis/backends/sqlite/compiler.py | 13 +- ibis/backends/sqlite/udf.py | 6 +- .../test_union_aliasing/duckdb/out.sql | 2 +- .../test_union_aliasing/risingwave/out.sql | 68 -------- ibis/backends/tests/test_aggregation.py | 156 ++++-------------- ibis/backends/tests/test_generic.py | 31 +--- ibis/backends/tests/test_sql.py | 7 +- ibis/backends/trino/compiler.py | 8 +- ibis/expr/operations/reductions.py | 6 +- ibis/expr/types/generic.py | 31 ++-- ibis/expr/types/relations.py | 6 +- ibis/tests/expr/test_value_exprs.py | 13 +- 29 files changed, 90 insertions(+), 327 deletions(-) delete mode 100644 ibis/backends/tests/snapshots/test_sql/test_union_aliasing/risingwave/out.sql diff --git a/ibis/backends/bigquery/compiler.py b/ibis/backends/bigquery/compiler.py index eb24ccfdf1d7b..b8a8ac7dd400d 100644 --- a/ibis/backends/bigquery/compiler.py +++ b/ibis/backends/bigquery/compiler.py @@ -71,6 +71,7 @@ class BigQueryCompiler(SQLGlotCompiler): ) SIMPLE_OPS = { + ops.Arbitrary: "any_value", ops.StringAscii: "ascii", ops.BitAnd: "bit_and", ops.BitOr: "bit_or", @@ -496,14 +497,6 @@ def visit_Last(self, op, *, arg, where): array = self.f.array_reverse(self.f.array_agg(sge.IgnoreNulls(this=arg))) return array[self.f.safe_offset(0)] - def visit_Arbitrary(self, op, *, arg, how, where): - if how != "first": - raise com.UnsupportedOperationError( - f"{how!r} value not supported for arbitrary in BigQuery" - ) - - return self.agg.any_value(arg, where=where) - def visit_ArrayFilter(self, op, *, arg, body, param): return self.f.array( sg.select(param).from_(self._unnest(arg, as_=param)).where(body) diff --git a/ibis/backends/clickhouse/compiler.py b/ibis/backends/clickhouse/compiler.py index ba24173c6ae19..07ae3fa79bede 100644 --- a/ibis/backends/clickhouse/compiler.py +++ b/ibis/backends/clickhouse/compiler.py @@ -41,6 +41,7 @@ class ClickHouseCompiler(SQLGlotCompiler): ops.Any: "max", ops.ApproxCountDistinct: "uniqHLL12", ops.ApproxMedian: "median", + ops.Arbitrary: "any", ops.ArgMax: "argMax", ops.ArgMin: "argMin", ops.ArrayCollect: "groupArray", @@ -202,15 +203,6 @@ def visit_Correlation(self, op, *, left, right, how, where): ) return self.agg.corr(left, right, where=where) - def visit_Arbitrary(self, op, *, arg, how, where): - if how == "first": - return self.agg.any(arg, where=where) - elif how == "last": - return self.agg.anyLast(arg, where=where) - else: - assert how == "heavy" - return self.agg.anyHeavy(arg, where=where) - def visit_Substring(self, op, *, arg, start, length): # Clickhouse is 1-indexed suffix = (length,) * (length is not None) diff --git a/ibis/backends/datafusion/compiler.py b/ibis/backends/datafusion/compiler.py index 17a724fdf2407..3684302feda75 100644 --- a/ibis/backends/datafusion/compiler.py +++ b/ibis/backends/datafusion/compiler.py @@ -29,7 +29,6 @@ class DataFusionCompiler(SQLGlotCompiler): UNSUPPORTED_OPERATIONS = frozenset( ( - ops.Arbitrary, ops.ArgMax, ops.ArgMin, ops.ArrayDistinct, @@ -65,6 +64,7 @@ class DataFusionCompiler(SQLGlotCompiler): ) SIMPLE_OPS = { + ops.Arbitrary: "first_value", ops.ApproxMedian: "approx_median", ops.ArrayRemove: "array_remove_all", ops.BitAnd: "bit_and", diff --git a/ibis/backends/druid/compiler.py b/ibis/backends/druid/compiler.py index b7d7d3302ae14..7b652e4b3d0d9 100644 --- a/ibis/backends/druid/compiler.py +++ b/ibis/backends/druid/compiler.py @@ -32,7 +32,6 @@ class DruidCompiler(SQLGlotCompiler): UNSUPPORTED_OPERATIONS = frozenset( ( ops.ApproxMedian, - ops.Arbitrary, ops.ArgMax, ops.ArgMin, ops.ArrayCollect, diff --git a/ibis/backends/duckdb/compiler.py b/ibis/backends/duckdb/compiler.py index d2f8280adb597..28c8a33315a32 100644 --- a/ibis/backends/duckdb/compiler.py +++ b/ibis/backends/duckdb/compiler.py @@ -34,6 +34,7 @@ class DuckDBCompiler(SQLGlotCompiler): type_mapper = DuckDBType SIMPLE_OPS = { + ops.Arbitrary: "any_value", ops.ArrayPosition: "list_indexof", ops.BitAnd: "bit_and", ops.BitOr: "bit_or", diff --git a/ibis/backends/exasol/compiler.py b/ibis/backends/exasol/compiler.py index 0f102998438b7..4ba725fd1a0a7 100644 --- a/ibis/backends/exasol/compiler.py +++ b/ibis/backends/exasol/compiler.py @@ -36,7 +36,6 @@ class ExasolCompiler(SQLGlotCompiler): ( ops.AnalyticVectorizedUDF, ops.ApproxMedian, - ops.Arbitrary, ops.ArgMax, ops.ArgMin, ops.ArrayCollect, diff --git a/ibis/backends/flink/compiler.py b/ibis/backends/flink/compiler.py index 4ee2ae4115f9b..2f41c9807045d 100644 --- a/ibis/backends/flink/compiler.py +++ b/ibis/backends/flink/compiler.py @@ -35,7 +35,6 @@ class FlinkCompiler(SQLGlotCompiler): ( ops.AnalyticVectorizedUDF, ops.ApproxMedian, - ops.Arbitrary, ops.ArgMax, ops.ArgMin, ops.ArrayCollect, diff --git a/ibis/backends/impala/compiler.py b/ibis/backends/impala/compiler.py index 8b0285a5dae6d..13a11d6147686 100644 --- a/ibis/backends/impala/compiler.py +++ b/ibis/backends/impala/compiler.py @@ -29,7 +29,6 @@ class ImpalaCompiler(SQLGlotCompiler): UNSUPPORTED_OPERATIONS = frozenset( ( - ops.Arbitrary, ops.ArgMax, ops.ArgMin, ops.ArrayCollect, diff --git a/ibis/backends/mssql/compiler.py b/ibis/backends/mssql/compiler.py index 15fd79340d1bf..9d84a1330e214 100644 --- a/ibis/backends/mssql/compiler.py +++ b/ibis/backends/mssql/compiler.py @@ -73,7 +73,6 @@ class MSSQLCompiler(SQLGlotCompiler): UNSUPPORTED_OPERATIONS = frozenset( ( ops.ApproxMedian, - ops.Arbitrary, ops.ArgMax, ops.ArgMin, ops.ArrayCollect, diff --git a/ibis/backends/mysql/compiler.py b/ibis/backends/mysql/compiler.py index e5b0d02833b24..e835975671ac8 100644 --- a/ibis/backends/mysql/compiler.py +++ b/ibis/backends/mysql/compiler.py @@ -70,7 +70,6 @@ def POS_INF(self): UNSUPPORTED_OPERATIONS = frozenset( ( ops.ApproxMedian, - ops.Arbitrary, ops.ArgMax, ops.ArgMin, ops.ArrayCollect, diff --git a/ibis/backends/oracle/compiler.py b/ibis/backends/oracle/compiler.py index d797e3688e20e..e15eb8aec9f7f 100644 --- a/ibis/backends/oracle/compiler.py +++ b/ibis/backends/oracle/compiler.py @@ -49,7 +49,6 @@ class OracleCompiler(SQLGlotCompiler): UNSUPPORTED_OPERATIONS = frozenset( ( - ops.Arbitrary, ops.ArgMax, ops.ArgMin, ops.ArrayCollect, diff --git a/ibis/backends/pandas/executor.py b/ibis/backends/pandas/executor.py index d1a3f8d73d0de..2478b27de1833 100644 --- a/ibis/backends/pandas/executor.py +++ b/ibis/backends/pandas/executor.py @@ -267,12 +267,7 @@ def agg(df): @classmethod def visit(cls, op: ops.Arbitrary, arg, where, how): - if how == "first": - return cls.agg(cls.kernels.reductions[ops.First], arg, where) - elif how == "last": - return cls.agg(cls.kernels.reductions[ops.Last], arg, where) - else: - raise OperationNotDefinedError(f"Arbitrary {how!r} is not supported") + return cls.agg(cls.kernels.reductions[ops.First], arg, where) @classmethod def visit(cls, op: ops.ArgMin | ops.ArgMax, arg, key, where): diff --git a/ibis/backends/postgres/compiler.py b/ibis/backends/postgres/compiler.py index 7d794984375b3..a0511bf5eb37e 100644 --- a/ibis/backends/postgres/compiler.py +++ b/ibis/backends/postgres/compiler.py @@ -41,6 +41,7 @@ class PostgresCompiler(SQLGlotCompiler): ) SIMPLE_OPS = { + ops.Arbitrary: "first", # could use any_value for postgres>=16 ops.ArrayCollect: "array_agg", ops.ArrayRemove: "array_remove", ops.BitAnd: "bit_and", diff --git a/ibis/backends/pyspark/compiler.py b/ibis/backends/pyspark/compiler.py index 4e61b604f2d20..70babdff0cea5 100644 --- a/ibis/backends/pyspark/compiler.py +++ b/ibis/backends/pyspark/compiler.py @@ -238,18 +238,11 @@ def visit_Last(self, op, *, arg, where): arg = self.if_(where, arg, NULL) return sge.IgnoreNulls(this=self.f.last(arg)) - def visit_Arbitrary(self, op, *, arg, how, where): + def visit_Arbitrary(self, op, *, arg, where): + # For Spark>=3.4 we could use any_value here if where is not None: arg = self.if_(where, arg, NULL) - if how == "first": - return sge.IgnoreNulls(this=self.f.first(arg)) - elif how == "last": - return sge.IgnoreNulls(this=self.f.last(arg)) - else: - raise com.UnsupportedOperationError( - f"PySpark backend does not support arbitrary with how={how}. " - "Supported values are `first` and `last`." - ) + return sge.IgnoreNulls(this=self.f.first(arg)) def visit_Median(self, op, *, arg, where): return self.agg.percentile(arg, 0.5, where=where) diff --git a/ibis/backends/risingwave/compiler.py b/ibis/backends/risingwave/compiler.py index 066493e7d4689..32a3941ba2c3c 100644 --- a/ibis/backends/risingwave/compiler.py +++ b/ibis/backends/risingwave/compiler.py @@ -22,6 +22,7 @@ class RisingwaveCompiler(PostgresCompiler): UNSUPPORTED_OPERATIONS = frozenset( ( + ops.Arbitrary, ops.DateFromYMD, ops.Mode, ops.RandomUUID, diff --git a/ibis/backends/snowflake/compiler.py b/ibis/backends/snowflake/compiler.py index 8b980aa9aa8f3..c1c152294f12c 100644 --- a/ibis/backends/snowflake/compiler.py +++ b/ibis/backends/snowflake/compiler.py @@ -56,8 +56,9 @@ class SnowflakeCompiler(SQLGlotCompiler): ) SIMPLE_OPS = { - ops.Any: "max", ops.All: "min", + ops.Any: "max", + ops.Arbitrary: "any_value", ops.ArrayDistinct: "array_distinct", ops.ArrayFlatten: "array_flatten", ops.ArrayIndex: "get", @@ -380,15 +381,6 @@ def visit_TimestampBucket(self, op, *, arg, interval, offset): return self.f.time_slice(arg, interval.value, interval.dtype.unit.name) - def visit_Arbitrary(self, op, *, arg, how, where): - if how == "first": - return self.f.get(self.agg.array_agg(arg, where=where), 0) - elif how == "last": - expr = self.agg.array_agg(arg, where=where) - return self.f.get(expr, self.f.array_size(expr) - 1) - else: - raise com.UnsupportedOperationError("how must be 'first' or 'last'") - def visit_ArraySlice(self, op, *, arg, start, stop): if start is None: start = 0 diff --git a/ibis/backends/sql/compiler.py b/ibis/backends/sql/compiler.py index 1822140a4ce35..dfa7ea7b06d15 100644 --- a/ibis/backends/sql/compiler.py +++ b/ibis/backends/sql/compiler.py @@ -930,13 +930,6 @@ def visit_VarianceStandardDevCovariance(self, op, *, how, where, **kw): visit_VarianceStandardDevCovariance ) - def visit_Arbitrary(self, op, *, arg, how, where): - if how == "heavy": - raise com.UnsupportedOperationError( - f"how='heavy' not supported in the {self.dialect} backend" - ) - return self.agg[how](arg, where=where) - def visit_SimpleCase(self, op, *, base=None, cases, results, default): return sge.Case( this=base, ifs=list(map(self.if_, cases, results)), default=default diff --git a/ibis/backends/sqlite/compiler.py b/ibis/backends/sqlite/compiler.py index b991040242f56..80554c7510935 100644 --- a/ibis/backends/sqlite/compiler.py +++ b/ibis/backends/sqlite/compiler.py @@ -70,6 +70,7 @@ class SQLiteCompiler(SQLGlotCompiler): ) SIMPLE_OPS = { + ops.Arbitrary: "_ibis_first", ops.RegexReplace: "_ibis_regex_replace", ops.RegexExtract: "_ibis_regex_extract", ops.RegexSearch: "_ibis_regex_search", @@ -92,8 +93,8 @@ class SQLiteCompiler(SQLGlotCompiler): ops.BitOr: "_ibis_bit_or", ops.BitAnd: "_ibis_bit_and", ops.BitXor: "_ibis_bit_xor", - ops.First: "_ibis_arbitrary_first", - ops.Last: "_ibis_arbitrary_last", + ops.First: "_ibis_first", + ops.Last: "_ibis_last", ops.Mode: "_ibis_mode", ops.Time: "time", ops.Date: "date", @@ -213,14 +214,6 @@ def visit_RandomScalar(self, op): def visit_Cot(self, op, *, arg): return 1 / self.f.tan(arg) - def visit_Arbitrary(self, op, *, arg, how, where): - if op.how == "heavy": - raise com.OperationNotDefinedError( - "how='heavy' not implemented for the SQLite backend" - ) - - return self._aggregate(f"_ibis_arbitrary_{how}", arg, where=where) - def visit_ArgMin(self, *args, **kwargs): return self._visit_arg_reduction("min", *args, **kwargs) diff --git a/ibis/backends/sqlite/udf.py b/ibis/backends/sqlite/udf.py index ee47ff704c24e..64dfa15666da1 100644 --- a/ibis/backends/sqlite/udf.py +++ b/ibis/backends/sqlite/udf.py @@ -438,7 +438,7 @@ def __init__(self): super().__init__(operator.xor) -class _ibis_arbitrary(abc.ABC): +class _ibis_first_last(abc.ABC): def __init__(self) -> None: self.value = None @@ -450,14 +450,14 @@ def finalize(self) -> int | None: @udaf -class _ibis_arbitrary_first(_ibis_arbitrary): +class _ibis_first(_ibis_first_last): def step(self, value): if self.value is None: self.value = value @udaf -class _ibis_arbitrary_last(_ibis_arbitrary): +class _ibis_last(_ibis_first_last): def step(self, value): if value is not None: self.value = value diff --git a/ibis/backends/tests/snapshots/test_sql/test_union_aliasing/duckdb/out.sql b/ibis/backends/tests/snapshots/test_sql/test_union_aliasing/duckdb/out.sql index 147f962504157..bf5a2c3daa7de 100644 --- a/ibis/backends/tests/snapshots/test_sql/test_union_aliasing/duckdb/out.sql +++ b/ibis/backends/tests/snapshots/test_sql/test_union_aliasing/duckdb/out.sql @@ -1,7 +1,7 @@ WITH "t5" AS ( SELECT "t4"."field_of_study", - FIRST("t4"."diff") AS "diff" + ANY_VALUE("t4"."diff") AS "diff" FROM ( SELECT "t3"."field_of_study", diff --git a/ibis/backends/tests/snapshots/test_sql/test_union_aliasing/risingwave/out.sql b/ibis/backends/tests/snapshots/test_sql/test_union_aliasing/risingwave/out.sql deleted file mode 100644 index f61357b240666..0000000000000 --- a/ibis/backends/tests/snapshots/test_sql/test_union_aliasing/risingwave/out.sql +++ /dev/null @@ -1,68 +0,0 @@ -WITH "t5" AS ( - SELECT - "t4"."field_of_study", - FIRST("t4"."diff") AS "diff" - FROM ( - SELECT - "t3"."field_of_study", - "t3"."years", - "t3"."degrees", - "t3"."earliest_degrees", - "t3"."latest_degrees", - "t3"."latest_degrees" - "t3"."earliest_degrees" AS "diff" - FROM ( - SELECT - "t2"."field_of_study", - "t2"."years", - "t2"."degrees", - FIRST_VALUE("t2"."degrees") OVER (PARTITION BY "t2"."field_of_study" ORDER BY "t2"."years" ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS "earliest_degrees", - LAST_VALUE("t2"."degrees") OVER (PARTITION BY "t2"."field_of_study" ORDER BY "t2"."years" ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS "latest_degrees" - FROM ( - SELECT - "t1"."field_of_study", - CAST(JSONB_EXTRACT_PATH(TO_JSONB("t1"."__pivoted__"), 'f1') AS VARCHAR) AS "years", - CAST(JSONB_EXTRACT_PATH(TO_JSONB("t1"."__pivoted__"), 'f2') AS BIGINT) AS "degrees" - FROM ( - SELECT - "t0"."field_of_study", - UNNEST( - ARRAY[ROW(CAST('1970-71' AS VARCHAR), CAST("t0"."1970-71" AS BIGINT)), ROW(CAST('1975-76' AS VARCHAR), CAST("t0"."1975-76" AS BIGINT)), ROW(CAST('1980-81' AS VARCHAR), CAST("t0"."1980-81" AS BIGINT)), ROW(CAST('1985-86' AS VARCHAR), CAST("t0"."1985-86" AS BIGINT)), ROW(CAST('1990-91' AS VARCHAR), CAST("t0"."1990-91" AS BIGINT)), ROW(CAST('1995-96' AS VARCHAR), CAST("t0"."1995-96" AS BIGINT)), ROW(CAST('2000-01' AS VARCHAR), CAST("t0"."2000-01" AS BIGINT)), ROW(CAST('2005-06' AS VARCHAR), CAST("t0"."2005-06" AS BIGINT)), ROW(CAST('2010-11' AS VARCHAR), CAST("t0"."2010-11" AS BIGINT)), ROW(CAST('2011-12' AS VARCHAR), CAST("t0"."2011-12" AS BIGINT)), ROW(CAST('2012-13' AS VARCHAR), CAST("t0"."2012-13" AS BIGINT)), ROW(CAST('2013-14' AS VARCHAR), CAST("t0"."2013-14" AS BIGINT)), ROW(CAST('2014-15' AS VARCHAR), CAST("t0"."2014-15" AS BIGINT)), ROW(CAST('2015-16' AS VARCHAR), CAST("t0"."2015-16" AS BIGINT)), ROW(CAST('2016-17' AS VARCHAR), CAST("t0"."2016-17" AS BIGINT)), ROW(CAST('2017-18' AS VARCHAR), CAST("t0"."2017-18" AS BIGINT)), ROW(CAST('2018-19' AS VARCHAR), CAST("t0"."2018-19" AS BIGINT)), ROW(CAST('2019-20' AS VARCHAR), CAST("t0"."2019-20" AS BIGINT))] - ) AS "__pivoted__" - FROM "humanities" AS "t0" - ) AS "t1" - ) AS "t2" - ) AS "t3" - ) AS "t4" - GROUP BY - 1 -) -SELECT - "t11"."field_of_study", - "t11"."diff" -FROM ( - SELECT - * - FROM ( - SELECT - "t6"."field_of_study", - "t6"."diff" - FROM "t5" AS "t6" - ORDER BY - "t6"."diff" DESC NULLS LAST - LIMIT 10 - ) AS "t9" - UNION ALL - SELECT - * - FROM ( - SELECT - "t6"."field_of_study", - "t6"."diff" - FROM "t5" AS "t6" - WHERE - "t6"."diff" < 0 - ORDER BY - "t6"."diff" ASC - LIMIT 10 - ) AS "t10" -) AS "t11" \ No newline at end of file diff --git a/ibis/backends/tests/test_aggregation.py b/ibis/backends/tests/test_aggregation.py index 248736af547e5..c08d9dce12d82 100644 --- a/ibis/backends/tests/test_aggregation.py +++ b/ibis/backends/tests/test_aggregation.py @@ -506,128 +506,6 @@ def mean_and_std(v): pytest.mark.notimpl(["polars"], raises=com.OperationNotDefinedError), ], ), - param( - lambda t, where: t.double_col.arbitrary(where=where), - lambda t, where: t.double_col[where].iloc[0], - id="arbitrary_default", - marks=[ - pytest.mark.notimpl( - [ - "impala", - "mysql", - "polars", - "datafusion", - "mssql", - "druid", - "oracle", - "exasol", - "flink", - ], - raises=com.OperationNotDefinedError, - ), - pytest.mark.notimpl( - ["risingwave"], - raises=PsycoPg2InternalError, - ), - ], - ), - param( - lambda t, where: t.double_col.arbitrary(how="first", where=where), - lambda t, where: t.double_col[where].iloc[0], - id="arbitrary_first", - marks=[ - pytest.mark.notimpl( - [ - "impala", - "mysql", - "polars", - "datafusion", - "mssql", - "druid", - "oracle", - "exasol", - "flink", - ], - raises=com.OperationNotDefinedError, - ), - pytest.mark.notimpl( - ["risingwave"], - raises=PsycoPg2InternalError, - ), - ], - ), - param( - lambda t, where: t.double_col.arbitrary(how="last", where=where), - lambda t, where: t.double_col[where].iloc[-1], - id="arbitrary_last", - marks=[ - pytest.mark.notimpl( - [ - "impala", - "mysql", - "polars", - "datafusion", - "mssql", - "druid", - "oracle", - "exasol", - "flink", - ], - raises=com.OperationNotDefinedError, - ), - pytest.mark.notimpl( - ["bigquery", "trino"], - raises=com.UnsupportedOperationError, - reason="backend only supports the `first` option for `.arbitrary()", - ), - pytest.mark.notimpl( - ["risingwave"], - raises=PsycoPg2InternalError, - ), - ], - ), - param( - lambda t, where: t.double_col.arbitrary(how="heavy", where=where), - lambda t, where: t.double_col[where].iloc[8], - id="arbitrary_heavy", - # only clickhouse implements this option - marks=[ - pytest.mark.notimpl( - [ - "dask", - "datafusion", - "druid", - "impala", - "mssql", - "mysql", - "oracle", - "pandas", - "polars", - "sqlite", - "exasol", - "flink", - ], - raises=com.OperationNotDefinedError, - ), - pytest.mark.notimpl( - [ - "bigquery", - "duckdb", - "postgres", - "risingwave", - "pyspark", - "trino", - ], - raises=com.UnsupportedOperationError, - reason="how='heavy' not supported in the backend", - ), - pytest.mark.notimpl( - ["snowflake"], - raises=com.UnsupportedOperationError, - reason="Snowflake only supports the `first` option for `.arbitrary()", - ), - ], - ), param( lambda t, where: t.double_col.first(where=where), lambda t, where: t.double_col[where].iloc[0], @@ -783,6 +661,40 @@ def test_reduction_ops( np.testing.assert_array_equal(result, expected) +@pytest.mark.notimpl( + [ + "impala", + "mysql", + "polars", + "datafusion", + "mssql", + "druid", + "oracle", + "exasol", + "flink", + "risingwave", + ], + raises=com.OperationNotDefinedError, +) +@pytest.mark.parametrize("filtered", [False, True]) +def test_arbitrary(backend, alltypes, df, filtered): + # Arbitrary chooses a non-null arbitrary value. To ensure we can test for + # _something_ we create a column that is a mix of nulls and a single value + # (or a single value after filtering is applied). + if filtered: + new = alltypes.int_col.cases([(3, 30), (4, 40)]) + where = _.int_col == 3 + else: + new = (alltypes.int_col == 3).ifelse(30, None) + where = None + + t = alltypes.mutate(new=new) + + expr = t.new.arbitrary(where=where) + res = expr.execute() + assert res == 30 + + @pytest.mark.parametrize( ("ibis_cond", "pandas_cond"), [ diff --git a/ibis/backends/tests/test_generic.py b/ibis/backends/tests/test_generic.py index e492e0e5eb04d..2d6925ae42e78 100644 --- a/ibis/backends/tests/test_generic.py +++ b/ibis/backends/tests/test_generic.py @@ -1112,34 +1112,12 @@ def test_pivot_wider(backend): ), ], ) -@pytest.mark.parametrize( - "keep", - [ - "first", - param( - "last", - marks=pytest.mark.notimpl( - ["bigquery", "trino"], - raises=com.UnsupportedOperationError, - reason="backend doesn't support how='last'", - ), - ), - ], -) +@pytest.mark.parametrize("keep", ["first", "last"]) @pytest.mark.notimpl( ["druid", "impala", "oracle"], - raises=( - NotImplementedError, - OracleDatabaseError, - com.OperationNotDefinedError, - ), + raises=(NotImplementedError, OracleDatabaseError, com.OperationNotDefinedError), reason="arbitrary not implemented in the backend", ) -@pytest.mark.notimpl( - ["datafusion"], - raises=com.OperationNotDefinedError, - reason="backend doesn't implement window functions", -) @pytest.mark.notimpl( ["polars"], raises=com.OperationNotDefinedError, @@ -1209,11 +1187,6 @@ def test_distinct_on_keep(backend, on, keep): ["exasol"], raises=com.OperationNotDefinedError, ) -@pytest.mark.notimpl( - ["datafusion"], - raises=com.OperationNotDefinedError, - reason="backend doesn't implement window functions", -) @pytest.mark.notimpl( ["polars"], raises=com.OperationNotDefinedError, diff --git a/ibis/backends/tests/test_sql.py b/ibis/backends/tests/test_sql.py index a5edcf3c5db81..0bf6ce5304ac9 100644 --- a/ibis/backends/tests/test_sql.py +++ b/ibis/backends/tests/test_sql.py @@ -108,18 +108,13 @@ def test_isin_bug(con, snapshot): raises=NotImplementedError, ) @pytest.mark.notyet( - ["datafusion", "exasol"], + ["datafusion", "exasol", "oracle", "flink", "risingwave"], reason="no unnest support", raises=exc.OperationNotDefinedError, ) @pytest.mark.notyet( ["sqlite", "mysql", "druid", "impala", "mssql"], reason="no unnest support upstream" ) -@pytest.mark.notimpl( - ["oracle", "flink"], - reason="unnest not yet implemented", - raises=exc.OperationNotDefinedError, -) @pytest.mark.parametrize("backend_name", _get_backends_to_test()) def test_union_aliasing(backend_name, snapshot): if backend_name == "snowflake": diff --git a/ibis/backends/trino/compiler.py b/ibis/backends/trino/compiler.py index a32cc72f15272..5e4b52ebac13c 100644 --- a/ibis/backends/trino/compiler.py +++ b/ibis/backends/trino/compiler.py @@ -39,6 +39,7 @@ class TrinoCompiler(SQLGlotCompiler): ) SIMPLE_OPS = { + ops.Arbitrary: "any_value", ops.Pi: "pi", ops.E: "e", ops.RegexReplace: "regexp_replace", @@ -122,13 +123,6 @@ def visit_Correlation(self, op, *, left, right, how, where): return self.agg.corr(left, right, where=where) - def visit_Arbitrary(self, op, *, arg, how, where): - if how != "first": - raise com.UnsupportedOperationError( - 'Trino only supports how="first" for `arbitrary` reduction' - ) - return self.agg.arbitrary(arg, where=where) - def visit_BitXor(self, op, *, arg, where): a, b = map(sg.to_identifier, "ab") input_fn = combine_fn = sge.Lambda( diff --git a/ibis/expr/operations/reductions.py b/ibis/expr/operations/reductions.py index e96327412d08e..e96c5850d3510 100644 --- a/ibis/expr/operations/reductions.py +++ b/ibis/expr/operations/reductions.py @@ -53,8 +53,12 @@ def relations(self): @public class Arbitrary(Filterable, Reduction): + """Retrieve an arbitrary element. + + Returns a non-null value unless the column is empty or all values are NULL. + """ + arg: Column[dt.Any] - how: Literal["first", "last", "heavy"] dtype = rlz.dtype_like("arg") diff --git a/ibis/expr/types/generic.py b/ibis/expr/types/generic.py index db191f915a213..7da5f63c98b69 100644 --- a/ibis/expr/types/generic.py +++ b/ibis/expr/types/generic.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections.abc import Iterable, Sequence -from typing import TYPE_CHECKING, Any, Literal +from typing import TYPE_CHECKING, Any from public import public @@ -14,7 +14,7 @@ from ibis.common.grounds import Singleton from ibis.expr.rewrites import rewrite_window_input from ibis.expr.types.core import Expr, _binop, _FixedTextJupyterMixin -from ibis.util import deprecated +from ibis.util import deprecated, warn_deprecated if TYPE_CHECKING: import pandas as pd @@ -1832,33 +1832,34 @@ def topk( return table.aggregate(metric, by=[self]).order_by(metric.desc()).limit(k) def arbitrary( - self, - where: ir.BooleanValue | None = None, - how: Literal["first", "last", "heavy"] = "first", + self, where: ir.BooleanValue | None = None, how: Any = None ) -> Scalar: """Select an arbitrary value in a column. + Returns an arbitrary (nondeterministic, backend-specific) value from + the column. The value will be non-NULL, except if the column is empty + or all values are NULL. + Parameters ---------- where A filter expression how - The method to use for selecting the element. - - * `"first"`: Select the first non-`NULL` element - * `"last"`: Select the last non-`NULL` element - * `"heavy"`: Select a frequently occurring value using the heavy - hitters algorithm. `"heavy"` is only supported by Clickhouse - backend. + DEPRECATED Returns ------- Scalar An expression """ - return ops.Arbitrary( - self, how=how, where=self._bind_reduction_filter(where) - ).to_expr() + if how is not None: + warn_deprecated( + name="how", + as_of="9.0", + removed_in="10.0", + instead="call `first` or `last` explicitly", + ) + return ops.Arbitrary(self, where=self._bind_reduction_filter(where)).to_expr() def count(self, where: ir.BooleanValue | None = None) -> ir.IntegerScalar: """Compute the number of rows in an expression. diff --git a/ibis/expr/types/relations.py b/ibis/expr/types/relations.py index a9caf47bc0dbe..1a64341e91277 100644 --- a/ibis/expr/types/relations.py +++ b/ibis/expr/types/relations.py @@ -1342,16 +1342,16 @@ def distinct( if keep is None: having = lambda t: t.count() == 1 - how = "first" + method = "first" elif keep in ("first", "last"): having = None - how = keep + method = keep else: raise com.IbisError( f"Invalid value for `keep`: {keep!r}, must be 'first', 'last' or None" ) - aggs = {col.get_name(): col.arbitrary(how=how) for col in (~on).expand(self)} + aggs = {col.get_name(): getattr(col, method)() for col in (~on).expand(self)} gb = self.group_by(on) if having is not None: diff --git a/ibis/tests/expr/test_value_exprs.py b/ibis/tests/expr/test_value_exprs.py index 140cf4e031ba1..b962daf624b64 100644 --- a/ibis/tests/expr/test_value_exprs.py +++ b/ibis/tests/expr/test_value_exprs.py @@ -493,18 +493,23 @@ def test_negate_boolean_column(table, op): assert isinstance(result.op(), ops.Not) -@pytest.mark.parametrize("column", ["a", "b", "c", "d", "e", "f", "g", "h"]) -@pytest.mark.parametrize("how", ["first", "last", "heavy"]) +@pytest.mark.parametrize("column", ["a", "b"]) @pytest.mark.parametrize("condition_fn", [lambda t: None, lambda t: t.a > 8]) -def test_arbitrary(table, column, how, condition_fn): +def test_arbitrary(table, column, condition_fn): col = table[column] where = condition_fn(table) - expr = col.arbitrary(how=how, where=where) + expr = col.arbitrary(where=where) assert expr.type() == col.type() assert isinstance(expr, ir.Scalar) assert isinstance(expr.op(), ops.Arbitrary) +def test_arbitrary_how_deprecated(table): + with pytest.warns(FutureWarning, match="v9.0"): + out = table.a.arbitrary(how="last") + assert isinstance(out.op(), ops.Arbitrary) + + @pytest.mark.parametrize( ["column", "operation"], [