diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d3cbf3bd695..54b42b1f6de 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1068,6 +1068,10 @@ def replace( if regex and isinstance(pat, re.Pattern): pat = pat.pattern + pa_repl = pa.scalar(repl) + if not pa.types.is_string(pa_repl.type): + raise TypeError(f"repl must be a str, not {type(repl).__name__}.") + # Pandas forces non-regex replace when pat is a single-character with acquire_spill_lock(): if regex is True and len(pat) > 1: @@ -1076,14 +1080,14 @@ def replace( plc.strings.regex_program.RegexProgram.create( pat, plc.strings.regex_flags.RegexFlags.DEFAULT ), - pa_scalar_to_plc_scalar(pa.scalar(repl)), + pa_scalar_to_plc_scalar(pa_repl), n, ) else: plc_result = plc.strings.replace.replace( self._column.to_pylibcudf(mode="read"), pa_scalar_to_plc_scalar(pa.scalar(pat)), - pa_scalar_to_plc_scalar(pa.scalar(repl)), + pa_scalar_to_plc_scalar(pa_repl), n, ) result = Column.from_pylibcudf(plc_result) @@ -2416,13 +2420,19 @@ def get(self, i: int = 0) -> SeriesOrIndex: 2 f dtype: object """ + str_lens = self.len() if i < 0: next_index = i - 1 step = -1 + to_mask = str_lens < abs(i) # type: ignore[operator] else: next_index = i + 1 step = 1 - return self.slice(i, next_index, step) + to_mask = str_lens <= i # type: ignore[operator] + result = self.slice(i, next_index, step) + if to_mask.any(): # type: ignore[union-attr] + result[to_mask] = cudf.NA # type: ignore[index] + return result def get_json_object( self, @@ -3933,19 +3943,18 @@ def isspace(self) -> SeriesOrIndex: def _starts_ends_with( self, method: Callable[[plc.Column, plc.Column | plc.Scalar], plc.Column], - pat: str | Sequence, + pat: str | tuple[str, ...], ) -> SeriesOrIndex: - if pat is None: - raise TypeError( - f"expected a string or a sequence-like object, not " - f"{type(pat).__name__}" - ) - elif is_scalar(pat): + if isinstance(pat, str): plc_pat = pa_scalar_to_plc_scalar(pa.scalar(pat, type=pa.string())) - else: + elif isinstance(pat, tuple) and all(isinstance(p, str) for p in pat): plc_pat = column.as_column(pat, dtype="str").to_pylibcudf( mode="read" ) + else: + raise TypeError( + f"expected a string or tuple, not {type(pat).__name__}" + ) with acquire_spill_lock(): plc_result = method( self._column.to_pylibcudf(mode="read"), plc_pat @@ -3953,7 +3962,7 @@ def _starts_ends_with( result = Column.from_pylibcudf(plc_result) return self._return_or_inplace(result) - def endswith(self, pat: str | Sequence) -> SeriesOrIndex: + def endswith(self, pat: str | tuple[str, ...]) -> SeriesOrIndex: """ Test if the end of each string element matches a pattern. @@ -3997,7 +4006,7 @@ def endswith(self, pat: str | Sequence) -> SeriesOrIndex: """ return self._starts_ends_with(plc.strings.find.ends_with, pat) - def startswith(self, pat: str | Sequence) -> SeriesOrIndex: + def startswith(self, pat: str | tuple[str, ...]) -> SeriesOrIndex: """ Test if the start of each string element matches a pattern. @@ -4299,6 +4308,8 @@ def index( if (result == -1).any(): raise ValueError("substring not found") + elif cudf.get_option("mode.pandas_compatible"): + return result.astype(np.dtype(np.int64)) else: return result @@ -4359,6 +4370,8 @@ def rindex( if (result == -1).any(): raise ValueError("substring not found") + elif cudf.get_option("mode.pandas_compatible"): + return result.astype(np.dtype(np.int64)) else: return result diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index bdc9e695844..809fedfde7b 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import json import re @@ -2048,26 +2048,26 @@ def test_string_starts_ends(data, pat): [ ( ["abc", "xyz", "a", "ab", "123", "097"], - ["abc", "x", "a", "b", "3", "7"], + ("abc", "x", "a", "b", "3", "7"), ), - (["A B", "1.5", "3,000"], ["A ", ".", ","]), - (["23", "³", "⅕", ""], ["23", "³", "⅕", ""]), - ([" ", "\t\r\n ", ""], ["d", "\n ", ""]), + (["A B", "1.5", "3,000"], ("A ", ".", ",")), + (["23", "³", "⅕", ""], ("23", "³", "⅕", "")), + ([" ", "\t\r\n ", ""], ("d", "\n ", "")), ( ["$", "B", "Aab$", "$$ca", "C$B$", "cat"], - ["$", "$", "a", "<", "(", "#"], + ("$", "$", "a", "<", "(", "#"), ), ( ["line to be wrapped", "another line to be wrapped"], - ["another", "wrapped"], + ("another", "wrapped"), ), ( ["hello", "there", "world", "+1234", "-1234", None, "accént", ""], - ["hsdjfk", None, "ll", "+", "-", "w", "-", "én"], + ("hsdjfk", "", "ll", "+", "-", "w", "-", "én"), ), ( ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], - ["1. Ant. ", "2. Bee!\n", "3. Cat?\t", None], + ("1. Ant. ", "2. Bee!\n", "3. Cat?\t", ""), ), ], ) @@ -3539,3 +3539,39 @@ def test_string_reduction_error(): lfunc_args_and_kwargs=([], {"skipna": False}), rfunc_args_and_kwargs=([], {"skipna": False}), ) + + +def test_getitem_out_of_bounds(): + data = ["123", "12", "1"] + pd_ser = pd.Series(data) + cudf_ser = cudf.Series(data) + expected = pd_ser.str[2] + result = cudf_ser.str[2] + assert_eq(result, expected) + + expected = pd_ser.str[-2] + result = cudf_ser.str[-2] + assert_eq(result, expected) + + +@pytest.mark.parametrize("method", ["startswith", "endswith"]) +@pytest.mark.parametrize("pat", [None, (1, 2), pd.Series([1])]) +def test_startsendwith_invalid_pat(method, pat): + ser = cudf.Series(["1"]) + with pytest.raises(TypeError): + getattr(ser.str, method)(pat) + + +@pytest.mark.parametrize("method", ["rindex", "index"]) +def test_index_int64_pandas_compat(method): + data = ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"] + with cudf.option_context("mode.pandas_compatible", True): + result = getattr(cudf.Series(data).str, method)("E", 4, 8) + expected = getattr(pd.Series(data).str, method)("E", 4, 8) + assert_eq(result, expected) + + +def test_replace_invalid_scalar_repl(): + ser = cudf.Series(["1"]) + with pytest.raises(TypeError): + ser.str.replace("1", 2)