From 042a138c56101a5a9a2198fa636ec4e57c1c9f96 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy Date: Wed, 10 May 2023 15:52:49 +0100 Subject: [PATCH 1/5] Add test for workflow with unicode string inputs --- .../triton/ops/workflow/test_ensemble.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py b/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py index d7174db1c..b0dd57df7 100644 --- a/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py +++ b/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py @@ -366,3 +366,39 @@ def test_workflow_dtypes(tmpdir): ) for key, value in expected_response.items(): np.testing.assert_array_equal(response[key], value) + + + +@pytest.mark.skipif(not TRITON_SERVER_PATH, reason="triton server not found") +def test_workflow_with_string_input(tmpdir): + """This test checks that we can pass strings with unicode characters to a workflow in Triton.""" + df = make_df( + { + "a": ["椅子", "καρέκλα", "כִּסֵא", "chair"] + } + ) + dataset = Dataset(df) + workflow_ops = ["a"] >> wf_ops.Categorify() + workflow = Workflow(workflow_ops) + workflow.fit(dataset) + + workflow_node = workflow.input_schema.column_names >> workflow_op.TransformWorkflow(workflow) + wkflow_ensemble = ensemble.Ensemble(workflow_node, workflow.input_schema) + ensemble_config, node_configs = wkflow_ensemble.export(tmpdir) + + with run_triton_server(tmpdir) as client: + for model_name in [ensemble_config.name, node_configs[0].name]: + request_dict = { + "a": np.array(["椅子", "καρέκλα", "כִּסֵא", "chair"], dtype="object"), + } + expected_response = { + "a": np.array([1, 2, 3, 4], dtype="int32"), + } + schema = workflow.input_schema + input_table = TensorTable(request_dict) + output_names = ["a"] + response = send_triton_request( + schema, input_table, output_names, client=client, triton_model=model_name + ) + assert set(expected_response["a"].tolist()) == set(response["a"].tolist()) + From 980c1a548a06239213abbbaba85fa56e15cd46d2 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy Date: Wed, 10 May 2023 15:56:56 +0100 Subject: [PATCH 2/5] Reformat test_ensemble.py --- .../dag/runtimes/triton/ops/workflow/test_ensemble.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py b/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py index b0dd57df7..9baf406f8 100644 --- a/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py +++ b/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py @@ -368,15 +368,10 @@ def test_workflow_dtypes(tmpdir): np.testing.assert_array_equal(response[key], value) - @pytest.mark.skipif(not TRITON_SERVER_PATH, reason="triton server not found") def test_workflow_with_string_input(tmpdir): """This test checks that we can pass strings with unicode characters to a workflow in Triton.""" - df = make_df( - { - "a": ["椅子", "καρέκλα", "כִּסֵא", "chair"] - } - ) + df = make_df({"a": ["椅子", "καρέκλα", "כִּסֵא", "chair"]}) dataset = Dataset(df) workflow_ops = ["a"] >> wf_ops.Categorify() workflow = Workflow(workflow_ops) @@ -401,4 +396,3 @@ def test_workflow_with_string_input(tmpdir): schema, input_table, output_names, client=client, triton_model=model_name ) assert set(expected_response["a"].tolist()) == set(response["a"].tolist()) - From 926a6ce34d3534b33e47c8aecc30674ccb44d423 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy Date: Wed, 10 May 2023 16:07:23 +0100 Subject: [PATCH 3/5] Remove string dtype coercion --- merlin/systems/triton/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/merlin/systems/triton/__init__.py b/merlin/systems/triton/__init__.py index dc5256602..54efd45a4 100644 --- a/merlin/systems/triton/__init__.py +++ b/merlin/systems/triton/__init__.py @@ -150,9 +150,6 @@ def _convert_tensor(t): out = t.as_numpy() if len(out.shape) == 2: out = out[:, 0] - # cudf doesn't seem to handle dtypes like |S15 or object that well - if is_string_dtype(out.dtype): - out = out.astype("str") return out From 02d25a72bac499bf107dd572a0addfd2edd22cd5 Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Fri, 12 May 2023 21:06:38 +0100 Subject: [PATCH 4/5] Add string coercion to ensure we have unicode string array --- merlin/systems/triton/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/merlin/systems/triton/__init__.py b/merlin/systems/triton/__init__.py index 54efd45a4..a5ef854a2 100644 --- a/merlin/systems/triton/__init__.py +++ b/merlin/systems/triton/__init__.py @@ -15,6 +15,7 @@ import json import os +import numpy as np import pandas as pd # this needs to be before any modules that import protobuf @@ -150,6 +151,10 @@ def _convert_tensor(t): out = t.as_numpy() if len(out.shape) == 2: out = out[:, 0] + + # coerce byte string arrays to unicode strings + if is_string_dtype(out.dtype): + out = np.char.decode(out.astype(bytes)) return out From 2c866a410bb04ddb960f4f396969a34badf8762f Mon Sep 17 00:00:00 2001 From: Oliver Holworthy <1216955+oliverholworthy@users.noreply.github.com> Date: Wed, 21 Jun 2023 20:14:19 +0100 Subject: [PATCH 5/5] Use LambdaOp instead of Categorify in string test --- .../systems/dag/runtimes/triton/ops/workflow/test_ensemble.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py b/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py index 049d97d01..6aeede8b7 100644 --- a/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py +++ b/tests/unit/systems/dag/runtimes/triton/ops/workflow/test_ensemble.py @@ -385,7 +385,7 @@ def test_workflow_with_string_input(tmpdir): """This test checks that we can pass strings with unicode characters to a workflow in Triton.""" df = make_df({"a": ["椅子", "καρέκλα", "כִּסֵא", "chair"]}) dataset = Dataset(df) - workflow_ops = ["a"] >> wf_ops.Categorify() + workflow_ops = ["a"] >> wf_ops.LambdaOp(lambda s: s.str.len()) workflow = Workflow(workflow_ops) workflow.fit(dataset) @@ -399,7 +399,7 @@ def test_workflow_with_string_input(tmpdir): "a": np.array(["椅子", "καρέκλα", "כִּסֵא", "chair"], dtype="object"), } expected_response = { - "a": np.array([1, 2, 3, 4], dtype="int32"), + "a": np.array([2, 7, 6, 5], dtype="int64"), } schema = workflow.input_schema input_table = TensorTable(request_dict)