From d3f1611959fa01a41913eda8b3fa9d2c6b5baeea Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Sun, 10 Nov 2024 16:05:55 +0800 Subject: [PATCH 1/4] clib.conversion._to_numpy: Add tests for pandas.Series with pandas string dtypes --- pygmt/tests/test_clib_to_numpy.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/pygmt/tests/test_clib_to_numpy.py b/pygmt/tests/test_clib_to_numpy.py index 3624ed2be8d..f5cc3a6d24c 100644 --- a/pygmt/tests/test_clib_to_numpy.py +++ b/pygmt/tests/test_clib_to_numpy.py @@ -10,6 +10,7 @@ import pytest from packaging.version import Version from pygmt.clib.conversion import _to_numpy +from pygmt.helpers.testing import skip_if_no try: import pyarrow as pa @@ -161,6 +162,31 @@ def test_to_numpy_pandas_series_numpy_dtypes_numeric(dtype, expected_dtype): npt.assert_array_equal(result, series) +@pytest.mark.parametrize( + "dtype", + [ + None, + np.str_, + "U10", + "string[python]", + pytest.param("string[pyarrow]", marks=skip_if_no(package="pyarrow")), + pytest.param("string[pyarrow_numpy]", marks=skip_if_no(package="pyarrow")), + ], +) +def test_to_numpy_pandas_series_pandas_dtypes_string(dtype): + """ + Test the _to_numpy function with pandas.Series of pandas string types. + + In pandas, string arrays can be specified in multiple ways. + + Reference: https://pandas.pydata.org/docs/reference/api/pandas.StringDtype.html + """ + array = pd.Series(["abc", "defg", "12345"], dtype=dtype) + result = _to_numpy(array) + _check_result(result, np.str_) + npt.assert_array_equal(result, array) + + ######################################################################################## # Test the _to_numpy function with PyArrow arrays. # From 01ba31786ddf424027b3c3472339cd43d6fb49d5 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Sun, 10 Nov 2024 16:18:50 +0800 Subject: [PATCH 2/4] Mapping 'string' to np.str_ explicitly --- pygmt/clib/conversion.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pygmt/clib/conversion.py b/pygmt/clib/conversion.py index af8eb3458d4..7b60b5e77e9 100644 --- a/pygmt/clib/conversion.py +++ b/pygmt/clib/conversion.py @@ -158,6 +158,9 @@ def _to_numpy(data: Any) -> np.ndarray: """ # Mapping of unsupported dtypes to the expected NumPy dtype. dtypes: dict[str, type] = { + # For pandas string dtype, "string[python]", "string[pyarrow]" and + # "string[pyarrow_numpy]". + "string": np.str_, "date32[day][pyarrow]": np.datetime64, "date64[ms][pyarrow]": np.datetime64, } From dac7e8eda7632464f556eaae5f1ac8ac34c9b6bd Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Sun, 10 Nov 2024 16:23:57 +0800 Subject: [PATCH 3/4] Try to convert np.object_ into string type --- pygmt/clib/conversion.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pygmt/clib/conversion.py b/pygmt/clib/conversion.py index 7b60b5e77e9..c4fa39ecdba 100644 --- a/pygmt/clib/conversion.py +++ b/pygmt/clib/conversion.py @@ -2,6 +2,7 @@ Functions to convert data types into ctypes friendly formats. """ +import contextlib import ctypes as ctp import warnings from collections.abc import Sequence @@ -178,6 +179,11 @@ def _to_numpy(data: Any) -> np.ndarray: else: vec_dtype = str(getattr(data, "dtype", "")) array = np.ascontiguousarray(data, dtype=dtypes.get(vec_dtype)) + + # Check if a np.object_ array can be converted to np.str_. + if array.dtype == np.object_: + with contextlib.suppress(TypeError, ValueError): + return np.ascontiguousarray(array, dtype=np.str_) return array From 4364e0d93bf70298925b2b20e49c04d97770b946 Mon Sep 17 00:00:00 2001 From: Dongdong Tian Date: Sun, 10 Nov 2024 16:46:13 +0800 Subject: [PATCH 4/4] Remove the workaround in PR #684 --- pygmt/clib/session.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pygmt/clib/session.py b/pygmt/clib/session.py index 10c8770adaa..69922edc6dc 100644 --- a/pygmt/clib/session.py +++ b/pygmt/clib/session.py @@ -1475,7 +1475,7 @@ def virtualfile_from_vectors( # 2 columns contains coordinates like longitude, latitude, or datetime string # types. for col, array in enumerate(arrays[2:]): - if pd.api.types.is_string_dtype(array.dtype): + if np.issubdtype(array.dtype, np.str_): columns = col + 2 break @@ -1506,9 +1506,9 @@ def virtualfile_from_vectors( strings = string_arrays[0] elif len(string_arrays) > 1: strings = np.array( - [" ".join(vals) for vals in zip(*string_arrays, strict=True)] + [" ".join(vals) for vals in zip(*string_arrays, strict=True)], + dtype=np.str_, ) - strings = np.asanyarray(a=strings, dtype=np.str_) self.put_strings( dataset, family="GMT_IS_VECTOR|GMT_IS_DUPLICATE", strings=strings )