Skip to content

Commit

Permalink
Add strings.convert.convert_urls APIs to pylibcudf (#17003)
Browse files Browse the repository at this point in the history
Contributes to #15162

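Also, I believe the C++ docstrings for `url_encode` and `url_decode` were swapped (each `@brief` described the other function); this change corrects them, but it could use a second look.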

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - https://github.com/brandon-b-miller
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)

URL: #17003
  • Loading branch information
mroeschke authored Oct 9, 2024
1 parent 349ba5d commit 5b931ac
Show file tree
Hide file tree
Showing 9 changed files with 121 additions and 34 deletions.
4 changes: 2 additions & 2 deletions cpp/include/cudf/strings/convert/convert_urls.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ namespace strings {
*/

/**
* @brief Decodes each string using URL encoding.
* @brief Encodes each string using URL encoding.
*
* Converts mostly non-ascii characters and control characters into UTF-8 hex code-points
* prefixed with '%'. For example, the space character must be converted to characters '%20' where
Expand All @@ -49,7 +49,7 @@ std::unique_ptr<column> url_encode(
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Encodes each string using URL encoding.
* @brief Decodes each string using URL encoding.
*
* Converts all character sequences starting with '%' into character code-points
* interpreting the 2 following characters as hex values to create the code-point.
Expand Down
36 changes: 7 additions & 29 deletions python/cudf/cudf/_lib/strings/convert/convert_urls.pyx
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
import pylibcudf as plc

from cudf.core.buffer import acquire_spill_lock

from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.column.column_view cimport column_view
from pylibcudf.libcudf.strings.convert.convert_urls cimport (
url_decode as cpp_url_decode,
url_encode as cpp_url_encode,
)

from cudf._lib.column cimport Column


Expand All @@ -28,17 +20,10 @@ def url_decode(Column source_strings):
-------
URL decoded string column
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_url_decode(
source_view
))

return Column.from_unique_ptr(
move(c_result)
plc_column = plc.strings.convert.convert_urls.url_decode(
source_strings.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(plc_column)


@acquire_spill_lock()
Expand All @@ -57,14 +42,7 @@ def url_encode(Column source_strings):
-------
URL encoded string column
"""
cdef unique_ptr[column] c_result
cdef column_view source_view = source_strings.view()

with nogil:
c_result = move(cpp_url_encode(
source_view
))

return Column.from_unique_ptr(
move(c_result)
plc_column = plc.strings.convert.convert_urls.url_encode(
source_strings.to_pylibcudf(mode="read")
)
return Column.from_pylibcudf(plc_column)
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view
cdef extern from "cudf/strings/convert/convert_urls.hpp" namespace \
"cudf::strings" nogil:
cdef unique_ptr[column] url_encode(
column_view input_col) except +
column_view input) except +

cdef unique_ptr[column] url_decode(
column_view input_col) except +
column_view input) except +
2 changes: 1 addition & 1 deletion python/pylibcudf/pylibcudf/strings/convert/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# =============================================================================

set(cython_sources convert_booleans.pyx convert_datetime.pyx convert_durations.pyx
convert_fixed_point.pyx convert_ipv4.pyx
convert_fixed_point.pyx convert_ipv4.pyx convert_urls.pyx
)

set(linked_libraries cudf::cudf)
Expand Down
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/convert/__init__.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ from . cimport (
convert_durations,
convert_fixed_point,
convert_ipv4,
convert_urls,
)
1 change: 1 addition & 0 deletions python/pylibcudf/pylibcudf/strings/convert/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
convert_durations,
convert_fixed_point,
convert_ipv4,
convert_urls,
)
8 changes: 8 additions & 0 deletions python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from pylibcudf.column cimport Column


# Declarations for the URL conversion functions defined in convert_urls.pyx.
# The parameter is spelled ``input`` to match those definitions.
cpdef Column url_encode(Column input)

cpdef Column url_decode(Column input)
63 changes: 63 additions & 0 deletions python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright (c) 2024, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
from pylibcudf.libcudf.column.column cimport column
from pylibcudf.libcudf.strings.convert cimport convert_urls as cpp_convert_urls


cpdef Column url_encode(Column input):
    """
    URL-encode every string of a strings column.

    For details, see :cpp:func:`cudf::strings::url_encode`

    Parameters
    ----------
    input : Column
        Strings instance for this operation.

    Returns
    -------
    Column
        New strings column.
    """
    cdef unique_ptr[column] c_encoded

    # Release the GIL for the duration of the libcudf call.
    with nogil:
        c_encoded = move(cpp_convert_urls.url_encode(input.view()))

    # Hand ownership of the libcudf column over to a pylibcudf Column.
    return Column.from_libcudf(move(c_encoded))


cpdef Column url_decode(Column input):
    """
    URL-decode every string of a strings column.

    For details, see :cpp:func:`cudf::strings::url_decode`

    Parameters
    ----------
    input : Column
        Strings instance for this operation.

    Returns
    -------
    Column
        New strings column.
    """
    cdef unique_ptr[column] c_decoded

    # Release the GIL for the duration of the libcudf call.
    with nogil:
        c_decoded = move(cpp_convert_urls.url_decode(input.view()))

    # Hand ownership of the libcudf column over to a pylibcudf Column.
    return Column.from_libcudf(move(c_decoded))
36 changes: 36 additions & 0 deletions python/pylibcudf/pylibcudf/tests/test_string_convert_urls.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
import urllib

import pyarrow as pa
import pylibcudf as plc
from utils import assert_column_eq


def test_url_encode():
    """Compare ``url_encode`` with ``urllib.parse.quote(url, safe="")``.

    The input holds one string and one null; the null is passed through
    unchanged when building the expected array.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import quote

    data = ["/home/nfs", None]
    result = plc.strings.convert.convert_urls.url_encode(
        plc.interop.from_arrow(pa.array(data))
    )
    expected = pa.array(
        [quote(url, safe="") if isinstance(url, str) else url for url in data]
    )
    assert_column_eq(result, expected)


def test_url_decode():
    """Compare ``url_decode`` with ``urllib.parse.unquote``.

    The input mixes upper- and lower-case hex escapes (``%2F`` and ``%2f``)
    and includes one null, which is passed through unchanged when building
    the expected array.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not
    # guarantee that ``urllib.parse`` has been loaded.
    from urllib.parse import unquote

    data = ["%2Fhome%2fnfs", None]
    result = plc.strings.convert.convert_urls.url_decode(
        plc.interop.from_arrow(pa.array(data))
    )
    expected = pa.array(
        [unquote(url) if isinstance(url, str) else url for url in data]
    )
    assert_column_eq(result, expected)

0 comments on commit 5b931ac

Please sign in to comment.