Ensure comparisons with pyints and integer series always succeed (#16532

) When Python integers are compared to a series of integers, the result can always be correctly defined no matter the values of the Python integer. This was always a very mild issue. But with NumPy 2 behavior not upcasting the computation result type based on the value anymore, even things like: ``` cudf.Series([1, 2, 3], dtype="int8") < 1000 ``` would fail. (Similar paths could be taken for other integer scalars, but there would be mostly nice for performance.) N.B. NumPy/pandas also support exact comparisons when mixing e.g. uint64 and int64. This is another rare exception that cudf currently does not support. Closes gh-16282 Authors: - Sebastian Berg (https://github.com/seberg) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: #16532
rapidsai · Aug 13, 2024 · cf3fabf · cf3fabf
1 parent 5780c4d
commit cf3fabf
Show file tree

Hide file tree

Showing 2 changed files with 81 additions and 14 deletions.
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
@@ -199,16 +199,53 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
             np.bool_: np.float32,
         }
 
+        out_dtype = None
         if op in {"__truediv__", "__rtruediv__"}:
             # Division with integer types results in a suitable float.
             if truediv_type := int_float_dtype_mapping.get(self.dtype.type):
                 return self.astype(truediv_type)._binaryop(other, op)
+        elif op in {
+            "__lt__",
+            "__gt__",
+            "__le__",
+            "__ge__",
+            "__eq__",
+            "__ne__",
+        }:
+            out_dtype = "bool"
+
+            # If `other` is a Python integer and it is out-of-bounds
+            # promotion could fail but we can trivially define the result
+            # in terms of `notnull` or `NULL_NOT_EQUALS`.
+            if type(other) is int and self.dtype.kind in "iu":  # noqa: E721
+                truthiness = None
+                iinfo = np.iinfo(self.dtype)
+                if iinfo.min > other:
+                    truthiness = op in {"__ne__", "__gt__", "__ge__"}
+                elif iinfo.max < other:
+                    truthiness = op in {"__ne__", "__lt__", "__le__"}
+
+                # Compare with minimum value so that the result is true/false
+                if truthiness is True:
+                    other = iinfo.min
+                    op = "__ge__"
+                elif truthiness is False:
+                    other = iinfo.min
+                    op = "__lt__"
+
+        elif op in {"NULL_EQUALS", "NULL_NOT_EQUALS"}:
+            out_dtype = "bool"
 
         reflect, op = self._check_reflected_op(op)
         if (other := self._wrap_binop_normalization(other)) is NotImplemented:
             return NotImplemented
-        out_dtype = self.dtype
-        if other is not None:
+
+        if out_dtype is not None:
+            pass  # out_dtype was already set to bool
+        if other is None:
+            # not a binary operator, so no need to promote
+            out_dtype = self.dtype
+        elif out_dtype is None:
             out_dtype = np.result_type(self.dtype, other.dtype)
             if op in {"__mod__", "__floordiv__"}:
                 tmp = self if reflect else other
@@ -225,17 +262,6 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
                             out_dtype = cudf.dtype("float64")
                     elif is_scalar(tmp) and tmp == 0:
                         out_dtype = cudf.dtype("float64")
-        if op in {
-            "__lt__",
-            "__gt__",
-            "__le__",
-            "__ge__",
-            "__eq__",
-            "__ne__",
-            "NULL_EQUALS",
-            "NULL_NOT_EQUALS",
-        }:
-            out_dtype = "bool"
 
         if op in {"__and__", "__or__", "__xor__"}:
             if self.dtype.kind == "f" or other.dtype.kind == "f":
@@ -247,7 +273,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
             if self.dtype.kind == "b" or other.dtype.kind == "b":
                 out_dtype = "bool"
 
-        if (
+        elif (
             op == "__pow__"
             and self.dtype.kind in "iu"
             and (is_integer(other) or other.dtype.kind in "iu")

diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
@@ -290,6 +290,47 @@ def test_series_compare(cmpop, obj_class, dtype):
     np.testing.assert_equal(result3.to_numpy(), cmpop(arr1, arr2))
 
 
+@pytest.mark.parametrize(
+    "dtype,val",
+    [("int8", 200), ("int32", 2**32), ("uint8", -128), ("uint64", -1)],
+)
+@pytest.mark.parametrize(
+    "op",
+    [
+        operator.eq,
+        operator.ne,
+        operator.lt,
+        operator.le,
+        operator.gt,
+        operator.ge,
+    ],
+)
+@pytest.mark.parametrize("reverse", [False, True])
+def test_series_compare_integer(dtype, val, op, reverse):
+    # Tests that these actually work, even though they are out of bound.
+    force_cast_val = np.array(val).astype(dtype)
+    sr = Series(
+        [np.iinfo(dtype).min, np.iinfo(dtype).max, force_cast_val, None],
+        dtype=dtype,
+    )
+
+    if reverse:
+        _op = op
+
+        def op(x, y):
+            return _op(y, x)
+
+    # We expect the same result as comparing to a value within range (e.g. 0)
+    # except that a NULL value evaluates to False
+    if op(0, val):
+        expected = Series([True, True, True, None])
+    else:
+        expected = Series([False, False, False, None])
+
+    res = op(sr, val)
+    assert_eq(res, expected)
+
+
 def _series_compare_nulls_typegen():
     return [
         *combinations_with_replacement(DATETIME_TYPES, 2),