From b0c1b7b82ccdf1a7e4159cb3bffa1984092440d4 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 14 Nov 2023 12:48:32 -0500
Subject: [PATCH] Add BytePairEncoder class to cuDF (#13891)

Adds a new BytePairEncoding class to cuDF
```
>>> import cudf
>>> from cudf.core.byte_pair_encoding import BytePairEncoder
>>> mps = cudf.read_text('merges.txt', delimiter='\n', strip_delimiters=True)
>>> bpe = BytePairEncoder(mps)
>>> str_series = cudf.Series(['This is a sentence', 'thisisit'])
>>> bpe(str_series)
0    This is a sent ence
1             this is it
dtype: object
```
This class wraps the existing `nvtext::byte_pair_encoding` APIs to load the merge-pairs data and encode a column of strings.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/13891
---
 .../cudf/_lib/cpp/nvtext/byte_pair_encode.pxd | 24 ++++++++
 python/cudf/cudf/_lib/nvtext/CMakeLists.txt   |  4 +-
 .../cudf/_lib/nvtext/byte_pair_encode.pyx     | 50 ++++++++++++++++
 python/cudf/cudf/core/byte_pair_encoding.py   | 59 +++++++++++++++++++
 .../cudf/cudf/tests/text/test_text_methods.py | 41 +++++++++++++
 5 files changed, 176 insertions(+), 2 deletions(-)
 create mode 100644 python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
 create mode 100644 python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
 create mode 100644 python/cudf/cudf/core/byte_pair_encoding.py

diff --git a/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
new file mode 100644
index 00000000000..e678e4e84db
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
@@ -0,0 +1,24 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.scalar.scalar cimport string_scalar
+
+
+cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil:
+
+    cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs":
+        pass
+
+    cdef unique_ptr[bpe_merge_pairs] load_merge_pairs(
+        const column_view &merge_pairs
+    ) except +
+
+    cdef unique_ptr[column] byte_pair_encoding(
+        const column_view &strings,
+        const bpe_merge_pairs &merge_pairs,
+        const string_scalar &separator
+    ) except +
diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
index d4e2392ee04..d7cbdeb5bda 100644
--- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
@@ -13,8 +13,8 @@
 # =============================================================================
 
 set(cython_sources
-    edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx
-    replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
+    byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
+    ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
 )
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
new file mode 100644
index 00000000000..cfc76afa8a5
--- /dev/null
+++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
@@ -0,0 +1,50 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+
+from cudf.core.buffer import acquire_spill_lock
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.column cimport Column
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.nvtext.byte_pair_encode cimport (
+    bpe_merge_pairs as cpp_bpe_merge_pairs,
+    byte_pair_encoding as cpp_byte_pair_encoding,
+    load_merge_pairs as cpp_load_merge_pairs,
+)
+from cudf._lib.cpp.scalar.scalar cimport string_scalar
+from cudf._lib.scalar cimport DeviceScalar
+
+
+cdef class BPEMergePairs:
+    cdef unique_ptr[cpp_bpe_merge_pairs] c_obj
+
+    def __cinit__(self, Column merge_pairs):
+        cdef column_view c_pairs = merge_pairs.view()
+        with nogil:
+            self.c_obj = move(cpp_load_merge_pairs(c_pairs))
+
+
+@acquire_spill_lock()
+def byte_pair_encoding(
+    Column strings,
+    BPEMergePairs merge_pairs,
+    object separator
+):
+    cdef column_view c_strings = strings.view()
+    cdef DeviceScalar d_separator = separator.device_value
+    cdef const string_scalar* c_separator = <const string_scalar*>d_separator\
+        .get_raw_ptr()
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = move(
+            cpp_byte_pair_encoding(
+                c_strings,
+                merge_pairs.c_obj.get()[0],
+                c_separator[0]
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
new file mode 100644
index 00000000000..4c881022ecf
--- /dev/null
+++ b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from __future__ import annotations
+
+import cudf
+from cudf._lib.nvtext.byte_pair_encode import (
+    BPEMergePairs as cpp_merge_pairs,
+    byte_pair_encoding as cpp_byte_pair_encoding,
+)
+
+
+class BytePairEncoder:
+    """
+    Given a merge pairs strings series, performs byte pair encoding on
+    a strings series using the provided separator.
+
+    Parameters
+    ----------
+    merges_pairs : str
+        Strings column of merge pairs
+
+    Returns
+    -------
+    BytePairEncoder
+    """
+
+    def __init__(self, merges_pair: "cudf.Series"):
+        self.merge_pairs = cpp_merge_pairs(merges_pair._column)
+
+    def __call__(self, text, separator: str = " "):
+        """
+
+        Parameters
+        ----------
+        text : cudf string series
+            The strings to be encoded.
+
+        Returns
+        -------
+        Encoded strings
+
+        Examples
+        --------
+        >>> import cudf
+        >>> from cudf.core.byte_pair_encoding import BytePairEncoder
+        >>> mps = cudf.Series(["e n", "i t", "i s", "e s", "en t",
+        ...                    "c e", "es t", "en ce", "T h", "Th is",
+        ...                    "t est", "s ent", "t h", "th is"])
+        >>> bpe = BytePairEncoder(mps)
+        >>> str_series = cudf.Series(['This is the sentence', 'thisisit'])
+        >>> bpe(str_series)
+        0    This is a sent ence
+        1             this is it
+        dtype: object
+        """
+        sep = cudf.Scalar(separator, dtype="str")
+        result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep)
+
+        return cudf.Series(result)
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index e565df8f3da..2dccd583b23 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -7,6 +7,7 @@
 import pytest
 
 import cudf
+from cudf.core.byte_pair_encoding import BytePairEncoder
 from cudf.core.tokenize_vocabulary import TokenizeVocabulary
 from cudf.testing._utils import assert_eq
 
@@ -1024,3 +1025,43 @@ def test_jaccard_index_random_strings():
 
     actual = str1.str.jaccard_index(str2, jaccard_width)
     assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "separator, input, results",
+    [
+        (" ", "thetestsentence", "the test sent ence"),
+        ("_", "sentenceistest", "sent_ence_is_test"),
+        ("$", "istestsentencehere", "is$test$sent$ence$he$r$e"),
+    ],
+)
+def test_byte_pair_encoding(separator, input, results):
+    pairs_table = cudf.Series(
+        [
+            "t he",
+            "h e",
+            "e n",
+            "i t",
+            "i s",
+            "e s",
+            "en t",
+            "c e",
+            "es t",
+            "en ce",
+            "t h",
+            "h i",
+            "th is",
+            "t est",
+            "s i",
+            "s ent",
+        ]
+    )
+    encoder = BytePairEncoder(pairs_table)
+
+    strings = cudf.Series([input, None, "", input])
+
+    expected = cudf.Series([results, None, "", results])
+
+    actual = encoder(strings, separator)
+    assert type(expected) == type(actual)
+    assert_eq(expected, actual)