Skip to content

Commit

Permalink
Add JNI Support for Multi-line Delimiters and Include Test (#17139)
Browse files Browse the repository at this point in the history
This PR introduces the changes to the cuDF JNI layer that are needed to support the issue described in [NVIDIA/spark-rapids#11554](https://github.com/NVIDIA/spark-rapids/issues/11554). For further information, refer to the details in the [comment](https://github.com/NVIDIA/spark-rapids/issues/11554) on that issue.

Issue #15961 adds support for handling multiple line delimiters. This PR exposes that functionality through JNI, where it was previously missing, and adds a test to validate the change.

Authors:
  - Suraj Aralihalli (https://github.com/SurajAralihalli)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: #17139
  • Loading branch information
SurajAralihalli authored Oct 23, 2024
1 parent 3126f77 commit f0c6a04
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 1 deletion.
11 changes: 10 additions & 1 deletion java/src/main/java/ai/rapids/cudf/RegexFlag.java
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,16 @@ public enum RegexFlag {
DEFAULT(0), // default
MULTILINE(8), // the '^' and '$' honor new-line characters
DOTALL(16), // the '.' matching includes new-line characters
ASCII(256); // use only ASCII when matching built-in character classes
ASCII(256), // use only ASCII when matching built-in character classes
/**
* EXT_NEWLINE(512): Extends line delimiters to include the following Unicode characters
* - NEXT_LINE ('\u0085')
* - LINE_SEPARATOR ('\u2028')
* - PARAGRAPH_SEPARATOR ('\u2029')
* - CARRIAGE_RETURN ('\r')
* - NEW_LINE ('\n')
*/
EXT_NEWLINE(512);

final int nativeId; // Native id, for use with libcudf.
private RegexFlag(int nativeId) { // Only constant values should be used
Expand Down
38 changes: 38 additions & 0 deletions java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicInteger;
Expand Down Expand Up @@ -3877,6 +3878,43 @@ void testExtractRe() {
}
}

@Test
void testExtractReWithMultiLineDelimiters() {
  // Line terminators exercised by this test, named so the input rows below stay readable.
  final String nextLine = "\u0085";
  final String lineSeparator = "\u2028";
  final String paragraphSeparator = "\u2029";
  final String carriageReturn = "\r";
  final String newLine = "\n";

  // Captures "boo" followed by one or more ':' characters, anchored by '$'.
  final String pattern = "(boo:+)$";

  try (ColumnVector input = ColumnVector.fromStrings(
           "boo:" + nextLine + "boo::" + lineSeparator + "boo:::",
           "boo:::" + lineSeparator + "zzé" + carriageReturn + "lll",
           "boo::",
           "",
           "boo::" + newLine,
           "boo::" + carriageReturn,
           "boo:" + nextLine + "boo::" + paragraphSeparator,
           "boo:" + newLine + "boo::" + lineSeparator,
           "boo:" + nextLine + "boo::" + nextLine);
       // Expected with EXT_NEWLINE: every row ending in "boo:+" plus a single trailing
       // terminator of any of the kinds above still yields a capture.
       Table expectedExtNewline = new Table.TestBuilder()
           .column("boo:::", null, "boo::", null, "boo::", "boo::", "boo::", "boo::", "boo::")
           .build();
       // Expected with DEFAULT: of the rows with a trailing terminator, only the one
       // ending in '\n' yields a capture; the others produce null.
       Table expectedDefault = new Table.TestBuilder()
           .column("boo:::", null, "boo::", null, "boo::", null, null, null, null)
           .build()) {

    RegexProgram extNewlineProgram =
        new RegexProgram(pattern, EnumSet.of(RegexFlag.EXT_NEWLINE));
    try (Table found = input.extractRe(extNewlineProgram)) {
      assertColumnsAreEqual(expectedExtNewline.getColumns()[0], found.getColumns()[0]);
    }

    RegexProgram defaultProgram =
        new RegexProgram(pattern, EnumSet.of(RegexFlag.DEFAULT));
    try (Table found = input.extractRe(defaultProgram)) {
      assertColumnsAreEqual(expectedDefault.getColumns()[0], found.getColumns()[0]);
    }
  }
}


@Test
void testExtractAllRecord() {
String pattern = "([ab])(\\d)";
Expand Down

0 comments on commit f0c6a04

Please sign in to comment.