From 042a138c56101a5a9a2198fa636ec4e57c1c9f96 Mon Sep 17 00:00:00 2001
From: Oliver Holworthy <oholworthy@nvidia.com>
Date: Wed, 10 May 2023 15:52:49 +0100
Subject: [PATCH 1/5] Add test for workflow with unicode string inputs

---
 .../triton/ops/workflow/test_ensemble.py      | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py b/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py
index d7174db1c..b0dd57df7 100644
--- a/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py
+++ b/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py
@@ -366,3 +366,39 @@ def test_workflow_dtypes(tmpdir):
                 )
                 for key, value in expected_response.items():
                     np.testing.assert_array_equal(response[key], value)
+
+
+
+@pytest.mark.skipif(not TRITON_SERVER_PATH, reason="triton server not found")
+def test_workflow_with_string_input(tmpdir):
+    """This test checks that we can pass strings with unicode characters to a workflow in Triton."""
+    df = make_df(
+        {
+            "a": ["椅子", "καρέκλα", "כִּסֵא", "chair"]
+        }
+    )
+    dataset = Dataset(df)
+    workflow_ops = ["a"] >> wf_ops.Categorify()
+    workflow = Workflow(workflow_ops)
+    workflow.fit(dataset)
+
+    workflow_node = workflow.input_schema.column_names >> workflow_op.TransformWorkflow(workflow)
+    wkflow_ensemble = ensemble.Ensemble(workflow_node, workflow.input_schema)
+    ensemble_config, node_configs = wkflow_ensemble.export(tmpdir)
+
+    with run_triton_server(tmpdir) as client:
+        for model_name in [ensemble_config.name, node_configs[0].name]:
+            request_dict = {
+                "a": np.array(["椅子", "καρέκλα", "כִּסֵא", "chair"], dtype="object"),
+            }
+            expected_response = {
+                "a": np.array([1, 2, 3, 4], dtype="int32"),
+            }
+            schema = workflow.input_schema
+            input_table = TensorTable(request_dict)
+            output_names = ["a"]
+            response = send_triton_request(
+                schema, input_table, output_names, client=client, triton_model=model_name
+            )
+            assert set(expected_response["a"].tolist()) == set(response["a"].tolist())
+

From 980c1a548a06239213abbbaba85fa56e15cd46d2 Mon Sep 17 00:00:00 2001
From: Oliver Holworthy <oholworthy@nvidia.com>
Date: Wed, 10 May 2023 15:56:56 +0100
Subject: [PATCH 2/5] Reformat test_ensemble.py

---
 .../dag/runtimes/triton/ops/workflow/test_ensemble.py     | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py b/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py
index b0dd57df7..9baf406f8 100644
--- a/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py
+++ b/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py
@@ -368,15 +368,10 @@ def test_workflow_dtypes(tmpdir):
                     np.testing.assert_array_equal(response[key], value)
 
 
-
 @pytest.mark.skipif(not TRITON_SERVER_PATH, reason="triton server not found")
 def test_workflow_with_string_input(tmpdir):
     """This test checks that we can pass strings with unicode characters to a workflow in Triton."""
-    df = make_df(
-        {
-            "a": ["椅子", "καρέκλα", "כִּסֵא", "chair"]
-        }
-    )
+    df = make_df({"a": ["椅子", "καρέκλα", "כִּסֵא", "chair"]})
     dataset = Dataset(df)
     workflow_ops = ["a"] >> wf_ops.Categorify()
     workflow = Workflow(workflow_ops)
@@ -401,4 +396,3 @@ def test_workflow_with_string_input(tmpdir):
                 schema, input_table, output_names, client=client, triton_model=model_name
             )
             assert set(expected_response["a"].tolist()) == set(response["a"].tolist())
-

From 926a6ce34d3534b33e47c8aecc30674ccb44d423 Mon Sep 17 00:00:00 2001
From: Oliver Holworthy <oholworthy@nvidia.com>
Date: Wed, 10 May 2023 16:07:23 +0100
Subject: [PATCH 3/5] Remove string dtype coercion

---
 merlin/systems/triton/__init__.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/merlin/systems/triton/__init__.py b/merlin/systems/triton/__init__.py
index dc5256602..54efd45a4 100644
--- a/merlin/systems/triton/__init__.py
+++ b/merlin/systems/triton/__init__.py
@@ -150,9 +150,6 @@ def _convert_tensor(t):
     out = t.as_numpy()
     if len(out.shape) == 2:
         out = out[:, 0]
-    # cudf doesn't seem to handle dtypes like |S15 or object that well
-    if is_string_dtype(out.dtype):
-        out = out.astype("str")
     return out
 
 

From 02d25a72bac499bf107dd572a0addfd2edd22cd5 Mon Sep 17 00:00:00 2001
From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com>
Date: Fri, 12 May 2023 21:06:38 +0100
Subject: [PATCH 4/5] Add string coercion to ensure we have unicode string
 array

---
 merlin/systems/triton/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/merlin/systems/triton/__init__.py b/merlin/systems/triton/__init__.py
index 54efd45a4..a5ef854a2 100644
--- a/merlin/systems/triton/__init__.py
+++ b/merlin/systems/triton/__init__.py
@@ -15,6 +15,7 @@
 import json
 import os
 
+import numpy as np
 import pandas as pd
 
 # this needs to be before any modules that import protobuf
@@ -150,6 +151,10 @@ def _convert_tensor(t):
     out = t.as_numpy()
     if len(out.shape) == 2:
         out = out[:, 0]
+
+    # coerce byte string arrays to unicode strings
+    if is_string_dtype(out.dtype):
+        out = np.char.decode(out.astype(bytes))
     return out
 
 

From 2c866a410bb04ddb960f4f396969a34badf8762f Mon Sep 17 00:00:00 2001
From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com>
Date: Wed, 21 Jun 2023 20:14:19 +0100
Subject: [PATCH 5/5] Use LambdaOp instead of Categorify in string test

---
 .../systems/dag/runtimes/triton/ops/workflow/test_ensemble.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py b/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py
index 049d97d01..6aeede8b7 100644
--- a/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py
+++ b/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py
@@ -385,7 +385,7 @@ def test_workflow_with_string_input(tmpdir):
     """This test checks that we can pass strings with unicode characters to a workflow in Triton."""
     df = make_df({"a": ["椅子", "καρέκλα", "כִּסֵא", "chair"]})
     dataset = Dataset(df)
-    workflow_ops = ["a"] >> wf_ops.Categorify()
+    workflow_ops = ["a"] >> wf_ops.LambdaOp(lambda s: s.str.len())
     workflow = Workflow(workflow_ops)
     workflow.fit(dataset)
 
@@ -399,7 +399,7 @@ def test_workflow_with_string_input(tmpdir):
                 "a": np.array(["椅子", "καρέκλα", "כִּסֵא", "chair"], dtype="object"),
             }
             expected_response = {
-                "a": np.array([1, 2, 3, 4], dtype="int32"),
+                "a": np.array([2, 7, 6, 5], dtype="int64"),
             }
             schema = workflow.input_schema
             input_table = TensorTable(request_dict)