From 72a5f390ee8b0946828c3968b91c60ebbe5f8ebe Mon Sep 17 00:00:00 2001 From: Mike Wilson Date: Mon, 23 Oct 2023 17:39:03 +0000 Subject: [PATCH] Adding IP6 characters into valid character list for URI parsing Signed-off-by: Mike Wilson --- src/main/cpp/src/parse_uri.cu | 4 +- src/main/cpp/tests/parse_uri.cpp | 47 ++++++++++ .../nvidia/spark/rapids/jni/ParseURITest.java | 85 ++++++++++++++++++- 3 files changed, 133 insertions(+), 3 deletions(-) diff --git a/src/main/cpp/src/parse_uri.cu b/src/main/cpp/src/parse_uri.cu index 54e79ab022..49595956cd 100644 --- a/src/main/cpp/src/parse_uri.cu +++ b/src/main/cpp/src/parse_uri.cu @@ -44,8 +44,8 @@ constexpr bool is_valid_character(char ch, bool alphanum_only) if (ch >= 'A' && ch <= 'Z') return true; // A-Z if (ch >= 'a' && ch <= 'z') return true; // a-z } else { - if (ch >= '!' && ch <= ';' && ch != '"') return true; // 0-9 and !#%&'()*+,-./ - if (ch >= '=' && ch <= 'Z' && ch != '>') return true; // A-Z and =?@ + if (ch >= '!' && ch <= ':' && ch != '"') return true; // 0-9 and !#%&'()*+,-./: + if (ch >= '=' && ch <= ']' && ch != '>') return true; // A-Z and =?@[] if (ch >= '_' && ch <= 'z' && ch != '`') return true; // a-z and _ } return false; diff --git a/src/main/cpp/tests/parse_uri.cpp b/src/main/cpp/tests/parse_uri.cpp index 3ff14a6075..a182e5d426 100644 --- a/src/main/cpp/tests/parse_uri.cpp +++ b/src/main/cpp/tests/parse_uri.cpp @@ -91,5 +91,52 @@ TEST_F(ParseURIProtocolTests, SparkEdges) "https"}, {1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIProtocolTests, IP6) +{ + cudf::test::strings_column_wrapper col({ + "https://[fe80::]", + "https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]", + "https://[2001:0DB8:85A3:0000:0000:8A2E:0370:7334]", + "https://[2001:db8::1:0]", + "http://[2001:db8::2:1]", + "https://[::1]", + "https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:443", + }); + auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper expected({"https", "https", "https", "https", "http", "https", "https"}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIProtocolTests, IP4) +{ + cudf::test::strings_column_wrapper col({ + "https://192.168.1.100/", + "https://192.168.1.100:8443/", + "https://192.168.1.100.5/", + "https://192.168.1/", + "https://280.100.1.1/", + }); + auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper expected({"https", "https", "https", "https", "https"}); + + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); +} + +TEST_F(ParseURIProtocolTests, UTF8) +{ + cudf::test::strings_column_wrapper col({ + "https://nvidia.com/%4EV%49%44%49%41", + "http://%77%77%77.%4EV%49%44%49%41.com", + }); + auto result = spark_rapids_jni::parse_uri_to_protocol(cudf::strings_column_view{col}); + + cudf::test::strings_column_wrapper expected({"https", "http"}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected); } \ No newline at end of file diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java index 7289d110b2..752eb8c708 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java @@ -26,7 +26,7 @@ public class ParseURITest { @Test - void parseURIToProtocolTest() { + void parseURIToProtocolSparkTest() { String[] testData = {"https://nvidia.com/https&#://nvidia.com", "https://http://www.nvidia.com", "filesystemmagicthing://bob.yaml", @@ -50,6 +50,89 @@ void parseURIToProtocolTest() { "http//www.nvidia.com/q", "", null}; + + String[] expectedStrings = new String[testData.length]; + for (int i=0; i