Skip to content

Commit

Permalink
Add in support for setting delim when parsing JSON through java (#16867
Browse files Browse the repository at this point in the history
…) (#16880)

This is a back-port of #16867 to 24.10.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Alessandro Bellina (https://github.com/abellina)

URL: #16880
  • Loading branch information
revans2 authored Sep 24, 2024
1 parent 8b12cf4 commit 6badd6b
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 8 deletions.
16 changes: 16 additions & 0 deletions java/src/main/java/ai/rapids/cudf/JSONOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public final class JSONOptions extends ColumnFilterOptions {
private final boolean allowLeadingZeros;
private final boolean allowNonNumericNumbers;
private final boolean allowUnquotedControlChars;
private final byte lineDelimiter;

private JSONOptions(Builder builder) {
super(builder);
Expand All @@ -52,6 +53,11 @@ private JSONOptions(Builder builder) {
allowLeadingZeros = builder.allowLeadingZeros;
allowNonNumericNumbers = builder.allowNonNumericNumbers;
allowUnquotedControlChars = builder.allowUnquotedControlChars;
lineDelimiter = builder.lineDelimiter;
}

/**
 * Get the line delimiter stored in these options.
 * @return the delimiter as a single byte
 */
public byte getLineDelimiter() {
  return this.lineDelimiter;
}

public boolean isDayFirst() {
Expand Down Expand Up @@ -123,6 +129,16 @@ public static final class Builder extends ColumnFilterOptions.Builder<JSONOptio
private boolean mixedTypesAsStrings = false;
private boolean keepQuotes = false;

// Line delimiter byte carried by the builder; stays '\n' unless withLineDelimiter replaces it.
private byte lineDelimiter = '\n';

/**
 * Set the line delimiter to use when reading JSON.
 * @param delimiter the delimiter character; it must not be larger than
 *                  {@link Byte#MAX_VALUE} because it is stored as a single byte
 * @return this for chaining
 * @throws IllegalArgumentException if delimiter is larger than {@link Byte#MAX_VALUE}
 */
public Builder withLineDelimiter(char delimiter) {
  // char is unsigned, so only the upper bound needs checking before narrowing to byte.
  final boolean fitsInByte = delimiter <= Byte.MAX_VALUE;
  if (!fitsInByte) {
    throw new IllegalArgumentException(
        "Only basic ASCII values are supported as line delimiters " + delimiter);
  }
  this.lineDelimiter = (byte) delimiter;
  return this;
}

/**
* Should json validation be strict or not
*/
Expand Down
19 changes: 14 additions & 5 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,8 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
boolean strictValidation,
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl) throws CudfException;
boolean allowUnquotedControl,
byte lineDelimiter) throws CudfException;

private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
int[] dTypeIds, int[] dTypeScales,
Expand All @@ -272,6 +273,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl,
byte lineDelimiter,
long dsHandle) throws CudfException;

private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines,
Expand All @@ -284,6 +286,7 @@ private static native long readAndInferJSONFromDataSource(boolean dayFirst, bool
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl,
byte lineDelimiter,
long dsHandle) throws CudfException;

private static native long readAndInferJSON(long address, long length,
Expand All @@ -297,7 +300,8 @@ private static native long readAndInferJSON(long address, long length,
boolean strictValidation,
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl) throws CudfException;
boolean allowUnquotedControl,
byte lineDelimiter) throws CudfException;

/**
* Read in Parquet formatted data.
Expand Down Expand Up @@ -1321,7 +1325,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
opts.strictValidation(),
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars()))) {
opts.unquotedControlChars(),
opts.getLineDelimiter()))) {

return gatherJSONColumns(schema, twm, -1);
}
Expand Down Expand Up @@ -1404,7 +1409,8 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
opts.strictValidation(),
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars()));
opts.unquotedControlChars(),
opts.getLineDelimiter()));
}

/**
Expand All @@ -1426,6 +1432,7 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) {
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
opts.getLineDelimiter(),
dsHandle));
return twm;
} finally {
Expand Down Expand Up @@ -1479,7 +1486,8 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
opts.strictValidation(),
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars()))) {
opts.unquotedControlChars(),
opts.getLineDelimiter()))) {
return gatherJSONColumns(schema, twm, emptyRowCount);
}
}
Expand Down Expand Up @@ -1518,6 +1526,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
opts.getLineDelimiter(),
dsHandle))) {
return gatherJSONColumns(schema, twm, emptyRowCount);
} finally {
Expand Down
12 changes: 10 additions & 2 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1627,6 +1627,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control,
jbyte line_delimiter,
jlong ds_handle)
{
JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
Expand All @@ -1646,6 +1647,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
.normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
.normalize_whitespace(static_cast<bool>(normalize_whitespace))
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
if (strict_validation) {
Expand Down Expand Up @@ -1676,7 +1678,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
jboolean strict_validation,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control)
jboolean allow_unquoted_control,
jbyte line_delimiter)
{
JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
if (buffer_length <= 0) {
Expand All @@ -1700,6 +1703,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
.normalize_whitespace(static_cast<bool>(normalize_whitespace))
.strict_validation(strict_validation)
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.keep_quotes(keep_quotes);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
Expand Down Expand Up @@ -1814,6 +1818,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control,
jbyte line_delimiter,
jlong ds_handle)
{
JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
Expand Down Expand Up @@ -1848,6 +1853,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
.normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
.normalize_whitespace(static_cast<bool>(normalize_whitespace))
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
if (strict_validation) {
Expand Down Expand Up @@ -1908,7 +1914,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
jboolean strict_validation,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control)
jboolean allow_unquoted_control,
jbyte line_delimiter)
{
bool read_buffer = true;
if (buffer == 0) {
Expand Down Expand Up @@ -1957,6 +1964,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
.normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
.normalize_whitespace(static_cast<bool>(normalize_whitespace))
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
if (strict_validation) {
Expand Down
19 changes: 18 additions & 1 deletion java/src/test/java/ai/rapids/cudf/TableTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;

import java.io.*;
Expand Down Expand Up @@ -656,6 +655,24 @@ void testJSONValidationUnquotedControl() {
}
}

// Two JSON records, each terminated by a NUL ('\0') byte. Every string value carries an
// embedded '\n', so the data only parses as two rows when '\n' is not the line delimiter.
// NOTE(review): the name says "CR" but the buffer holds no carriage returns - confirm intent.
private static final byte[] CR_JSON_TEST_BUFFER = ("{\"a\":\"12\n3\"}\0" +
"{\"a\":\"AB\nC\"}\0").getBytes(StandardCharsets.UTF_8);

/**
 * Reads CR_JSON_TEST_BUFFER with '\0' configured as the line delimiter and checks that the
 * result is one string column "a" whose values keep their embedded '\n' characters.
 */
@Test
void testReadJSONDelim() {
  Schema stringColumnSchema = Schema.builder().addColumn(DType.STRING, "a").build();
  // Line mode with NUL as the delimiter, so the '\n' inside each value is plain data.
  JSONOptions nulDelimitedOpts = JSONOptions.builder()
      .withLines(true)
      .withLineDelimiter('\0')
      .build();
  try (Table expectedTable = new Table.TestBuilder()
           .column("12\n3", "AB\nC")
           .build();
       Table parsedTable = Table.readJSON(stringColumnSchema, nulDelimitedOpts, CR_JSON_TEST_BUFFER)) {
    assertTablesAreEqual(expectedTable, parsedTable);
  }
}

private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" +
"{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" +
"{\"d\":[1,2,3]}\n" +
Expand Down

0 comments on commit 6badd6b

Please sign in to comment.