Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/branch-24.10' into dask-cudf-arrow-filesystem
Browse files Browse the repository at this point in the history
  • Loading branch information
rjzamora committed Sep 24, 2024
2 parents 8cfe71e + b3518ab commit badf359
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 3 deletions.
12 changes: 12 additions & 0 deletions java/src/main/java/ai/rapids/cudf/JSONOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ public final class JSONOptions extends ColumnFilterOptions {
private final boolean allowLeadingZeros;
private final boolean allowNonNumericNumbers;
private final boolean allowUnquotedControlChars;
private final boolean cudfPruneSchema;
private final byte lineDelimiter;

private JSONOptions(Builder builder) {
Expand All @@ -53,9 +54,14 @@ private JSONOptions(Builder builder) {
allowLeadingZeros = builder.allowLeadingZeros;
allowNonNumericNumbers = builder.allowNonNumericNumbers;
allowUnquotedControlChars = builder.allowUnquotedControlChars;
cudfPruneSchema = builder.cudfPruneSchema;
lineDelimiter = builder.lineDelimiter;
}

/**
 * Reports whether schema pruning was requested through
 * {@link Builder#withCudfPruneSchema(boolean)}.
 *
 * @return the value of the prune-schema flag copied from the builder
 */
public boolean shouldCudfPruneSchema() {
  return this.cudfPruneSchema;
}

/**
 * Returns the line delimiter configured for these options.
 *
 * @return the delimiter as a single byte, copied from the builder
 */
public byte getLineDelimiter() {
  return this.lineDelimiter;
}
Expand Down Expand Up @@ -129,8 +135,14 @@ public static final class Builder extends ColumnFilterOptions.Builder<JSONOptio
private boolean mixedTypesAsStrings = false;
private boolean keepQuotes = false;

private boolean cudfPruneSchema = false;
private byte lineDelimiter = '\n';

/**
 * Sets the schema-pruning flag carried by the built {@code JSONOptions}.
 * The flag is {@code false} unless this method is called.
 *
 * @param prune value to store in the flag
 * @return this builder, so calls can be chained
 */
public Builder withCudfPruneSchema(boolean prune) {
  this.cudfPruneSchema = prune;
  return this;
}

public Builder withLineDelimiter(char delimiter) {
if (delimiter > Byte.MAX_VALUE) {
throw new IllegalArgumentException("Only basic ASCII values are supported as line delimiters " + delimiter);
Expand Down
17 changes: 17 additions & 0 deletions java/src/main/java/ai/rapids/cudf/Table.java
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl,
boolean pruneColumns,
byte lineDelimiter) throws CudfException;

private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
Expand All @@ -273,6 +274,7 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
boolean allowLeadingZeros,
boolean allowNonNumericNumbers,
boolean allowUnquotedControl,
boolean pruneColumns,
byte lineDelimiter,
long dsHandle) throws CudfException;

Expand Down Expand Up @@ -1312,6 +1314,10 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp
* @return the file parsed as a table on the GPU.
*/
public static Table readJSON(Schema schema, JSONOptions opts, File path) {
// only prune the schema if one is provided
boolean cudfPruneSchema = schema.getColumnNames() != null &&
schema.getColumnNames().length != 0 &&
opts.shouldCudfPruneSchema();
try (TableWithMeta twm = new TableWithMeta(
readJSON(schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(),
schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(),
Expand All @@ -1326,6 +1332,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
cudfPruneSchema,
opts.getLineDelimiter()))) {

return gatherJSONColumns(schema, twm, -1);
Expand Down Expand Up @@ -1472,6 +1479,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
assert len > 0;
assert len <= buffer.length - offset;
assert offset >= 0 && offset < buffer.length;
// only prune the schema if one is provided
boolean cudfPruneSchema = schema.getColumnNames() != null &&
schema.getColumnNames().length != 0 &&
opts.shouldCudfPruneSchema();
try (TableWithMeta twm = new TableWithMeta(readJSON(
schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(),
schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null,
Expand All @@ -1487,6 +1498,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
cudfPruneSchema,
opts.getLineDelimiter()))) {
return gatherJSONColumns(schema, twm, emptyRowCount);
}
Expand All @@ -1513,6 +1525,10 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
*/
public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) {
long dsHandle = DataSourceHelper.createWrapperDataSource(ds);
// only prune the schema if one is provided
boolean cudfPruneSchema = schema.getColumnNames() != null &&
schema.getColumnNames().length != 0 &&
opts.shouldCudfPruneSchema();
try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(),
schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(),
opts.isDayFirst(),
Expand All @@ -1526,6 +1542,7 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int
opts.leadingZerosAllowed(),
opts.nonNumericNumbersAllowed(),
opts.unquotedControlChars(),
cudfPruneSchema,
opts.getLineDelimiter(),
dsHandle))) {
return gatherJSONColumns(schema, twm, emptyRowCount);
Expand Down
12 changes: 9 additions & 3 deletions java/src/main/native/src/TableJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1649,7 +1649,8 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(false);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down Expand Up @@ -1703,6 +1704,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
.normalize_whitespace(static_cast<bool>(normalize_whitespace))
.strict_validation(strict_validation)
.mixed_types_as_string(mixed_types_as_string)
.prune_columns(false)
.delimiter(static_cast<char>(line_delimiter))
.keep_quotes(keep_quotes);
if (strict_validation) {
Expand Down Expand Up @@ -1818,6 +1820,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control,
jboolean prune_columns,
jbyte line_delimiter,
jlong ds_handle)
{
Expand Down Expand Up @@ -1855,7 +1858,8 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(prune_columns);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down Expand Up @@ -1915,6 +1919,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
jboolean allow_leading_zeros,
jboolean allow_nonnumeric_numbers,
jboolean allow_unquoted_control,
jboolean prune_columns,
jbyte line_delimiter)
{
bool read_buffer = true;
Expand Down Expand Up @@ -1966,7 +1971,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
.mixed_types_as_string(mixed_types_as_string)
.delimiter(static_cast<char>(line_delimiter))
.strict_validation(strict_validation)
.keep_quotes(keep_quotes);
.keep_quotes(keep_quotes)
.prune_columns(prune_columns);
if (strict_validation) {
opts.numeric_leading_zeros(allow_leading_zeros)
.nonnumeric_numbers(allow_nonnumeric_numbers)
Expand Down

0 comments on commit badf359

Please sign in to comment.