diff --git a/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd new file mode 100644 index 00000000000..e678e4e84db --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd @@ -0,0 +1,24 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.scalar.scalar cimport string_scalar + + +cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil: + + cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs": + pass + + cdef unique_ptr[bpe_merge_pairs] load_merge_pairs( + const column_view &merge_pairs + ) except + + + cdef unique_ptr[column] byte_pair_encoding( + const column_view &strings, + const bpe_merge_pairs &merge_pairs, + const string_scalar &separator + ) except + diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt index d4e2392ee04..d7cbdeb5bda 100644 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt @@ -13,8 +13,8 @@ # ============================================================================= set(cython_sources - edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx - replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx + byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx + ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx ) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx new file mode 100644 index 00000000000..cfc76afa8a5 --- /dev/null +++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx @@ -0,0 +1,50 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + + +from cudf.core.buffer import acquire_spill_lock + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.nvtext.byte_pair_encode cimport ( + bpe_merge_pairs as cpp_bpe_merge_pairs, + byte_pair_encoding as cpp_byte_pair_encoding, + load_merge_pairs as cpp_load_merge_pairs, +) +from cudf._lib.cpp.scalar.scalar cimport string_scalar +from cudf._lib.scalar cimport DeviceScalar + + +cdef class BPEMergePairs: + cdef unique_ptr[cpp_bpe_merge_pairs] c_obj + + def __cinit__(self, Column merge_pairs): + cdef column_view c_pairs = merge_pairs.view() + with nogil: + self.c_obj = move(cpp_load_merge_pairs(c_pairs)) + + +@acquire_spill_lock() +def byte_pair_encoding( + Column strings, + BPEMergePairs merge_pairs, + object separator +): + cdef column_view c_strings = strings.view() + cdef DeviceScalar d_separator = separator.device_value + cdef const string_scalar* c_separator = d_separator\ + .get_raw_ptr() + cdef unique_ptr[column] c_result + with nogil: + c_result = move( + cpp_byte_pair_encoding( + c_strings, + merge_pairs.c_obj.get()[0], + c_separator[0] + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py new file mode 100644 index 00000000000..4c881022ecf --- /dev/null +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -0,0 +1,59 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from __future__ import annotations + +import cudf +from cudf._lib.nvtext.byte_pair_encode import ( + BPEMergePairs as cpp_merge_pairs, + byte_pair_encoding as cpp_byte_pair_encoding, +) + + +class BytePairEncoder: + """ + Given a merge pairs strings series, performs byte pair encoding on + a strings series using the provided separator. + + Parameters + ---------- + merges_pairs : str + Strings column of merge pairs + + Returns + ------- + BytePairEncoder + """ + + def __init__(self, merges_pair: "cudf.Series"): + self.merge_pairs = cpp_merge_pairs(merges_pair._column) + + def __call__(self, text, separator: str = " "): + """ + + Parameters + ---------- + text : cudf string series + The strings to be encoded. + + Returns + ------- + Encoded strings + + Examples + -------- + >>> import cudf + >>> from cudf.core.byte_pair_encoding import BytePairEncoder + >>> mps = cudf.Series(["e n", "i t", "i s", "e s", "en t", + ... "c e", "es t", "en ce", "T h", "Th is", + ... "t est", "s ent", "t h", "th is"]) + >>> bpe = BytePairEncoder(mps) + >>> str_series = cudf.Series(['This is the sentence', 'thisisit']) + >>> bpe(str_series) + 0 This is a sent ence + 1 this is it + dtype: object + """ + sep = cudf.Scalar(separator, dtype="str") + result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) + + return cudf.Series(result) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index e565df8f3da..2dccd583b23 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -7,6 +7,7 @@ import pytest import cudf +from cudf.core.byte_pair_encoding import BytePairEncoder from cudf.core.tokenize_vocabulary import TokenizeVocabulary from cudf.testing._utils import assert_eq @@ -1024,3 +1025,43 @@ def test_jaccard_index_random_strings(): actual = str1.str.jaccard_index(str2, jaccard_width) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "separator, input, results", + [ + (" ", "thetestsentence", "the test sent ence"), + ("_", "sentenceistest", "sent_ence_is_test"), + ("$", "istestsentencehere", "is$test$sent$ence$he$r$e"), + ], +) +def test_byte_pair_encoding(separator, input, results): + pairs_table = cudf.Series( + [ + "t he", + "h e", + "e n", + "i t", + "i s", + "e s", + "en t", + "c e", + "es t", + "en ce", + "t h", + "h i", + "th is", + "t est", + "s i", + "s ent", + ] + ) + encoder = BytePairEncoder(pairs_table) + + strings = cudf.Series([input, None, "", input]) + + expected = cudf.Series([results, None, "", results]) + + actual = encoder(strings, separator) + assert type(expected) == type(actual) + assert_eq(expected, actual)