diff --git a/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java
new file mode 100644
index 0000000000..bfb5bc484c
--- /dev/null
+++ b/src/main/java/com/nvidia/spark/rapids/jni/ParseURI.java
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.jni;
+
+import ai.rapids.cudf.ColumnVector;
+import ai.rapids.cudf.ColumnView;
+import ai.rapids.cudf.DType;
+import ai.rapids.cudf.NativeDepsLoader;
+
+/**
+ * Extracts components of URIs stored in string columns by calling into native code.
+ *
+ * <p>The native dependencies are loaded once, when this class is initialized.
+ */
+public class ParseURI {
+  static {
+    // Load the native dependencies before any native method of this class can be called.
+    NativeDepsLoader.loadNativeDeps();
+  }
+
+  /**
+   * Parse protocol for each URI from the incoming column.
+   *
+   * <p>The input type check is a Java {@code assert}, so it only runs when assertions are
+   * enabled (for example with {@code -ea}).
+   *
+   * @param URIColumn The input strings column in which each row contains a URI.
+   * @return A string column with protocol data extracted.
+   */
+  public static ColumnVector parseURIProtocol(ColumnView URIColumn) {
+    assert URIColumn.getType().equals(DType.STRING) : "Input type must be String";
+    return new ColumnVector(parseProtocol(URIColumn.getNativeView()));
+  }
+
+  /**
+   * Native implementation behind {@link #parseURIProtocol(ColumnView)}.
+   *
+   * @param inputColumnHandle native view handle of the input column, as obtained from
+   *                          {@link ColumnView#getNativeView()}.
+   * @return native handle that the caller wraps in the resulting {@link ColumnVector}.
+   */
+  private static native long parseProtocol(long inputColumnHandle);
+}
diff --git a/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java
new file mode 100644
index 0000000000..6d55e49130
--- /dev/null
+++ b/src/test/java/com/nvidia/spark/rapids/jni/ParseURITest.java
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +import java.net.URI; +import java.net.URISyntaxException; + +import org.junit.jupiter.api.Test; + +import ai.rapids.cudf.AssertUtils; +import ai.rapids.cudf.ColumnVector; + +public class ParseURITest { + @Test + void parseURIToProtocolTest() { + String[] testData = {"https://nvidia.com/https&#://nvidia.com", + "https://http://www.nvidia.com", + "filesystemmagicthing://bob.yaml", + "nvidia.com:8080", + "http://thisisinvalid.data/due/to-the_character%s/inside*the#url`~", + "file:/absolute/path", + "//www.nvidia.com", + "#bob", + "#this%doesnt#make//sense://to/me", + "HTTP:&bob", + "/absolute/path", + "http://%77%77%77.%4EV%49%44%49%41.com", + "https:://broken.url", + "https://www.nvidia.com/q/This%20is%20a%20query", + "http:/www.nvidia.com", + "http://:www.nvidia.com/", + "http:///nvidia.com/q", + "https://www.nvidia.com:8080/q", + "https://www.nvidia.com#8080", + "file://path/to/cool/file", + "http//www.nvidia.com/q"}; + + String[] expectedStrings = new String[testData.length]; + for (int i=0; i