Skip to content

Commit

Permalink
Rewrite docs and change function name
Browse files Browse the repository at this point in the history
Signed-off-by: Nghia Truong <[email protected]>
  • Loading branch information
ttnghia committed Nov 15, 2024
1 parent 32edcbf commit fe8e359
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 52 deletions.
32 changes: 16 additions & 16 deletions src/main/cpp/src/JSONUtilsJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,15 +222,15 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_fromJSONToStructs(JNIEnv* env,
}

JNIEXPORT jlong JNICALL
Java_com_nvidia_spark_rapids_jni_JSONUtils_convertDataType(JNIEnv* env,
jclass,
jlong j_input,
jintArray j_num_children,
jintArray j_types,
jintArray j_scales,
jintArray j_precisions,
jboolean allow_nonnumeric_numbers,
jboolean is_us_locale)
Java_com_nvidia_spark_rapids_jni_JSONUtils_convertFromStrings(JNIEnv* env,
jclass,
jlong j_input,
jintArray j_num_children,
jintArray j_types,
jintArray j_scales,
jintArray j_precisions,
jboolean allow_nonnumeric_numbers,
jboolean is_us_locale)
{
JNI_NULL_CHECK(env, j_input, "j_input is null", 0);
JNI_NULL_CHECK(env, j_num_children, "j_num_children is null", 0);
Expand All @@ -253,13 +253,13 @@ Java_com_nvidia_spark_rapids_jni_JSONUtils_convertDataType(JNIEnv* env,
CUDF_EXPECTS(num_children.size() == precisions.size(), "Invalid schema data: precisions.");

return cudf::jni::ptr_as_jlong(
spark_rapids_jni::convert_data_type(cudf::strings_column_view{*input_cv},
num_children,
types,
scales,
precisions,
allow_nonnumeric_numbers,
is_us_locale)
spark_rapids_jni::convert_from_strings(cudf::strings_column_view{*input_cv},
num_children,
types,
scales,
precisions,
allow_nonnumeric_numbers,
is_us_locale)
.release());
}
CATCH_STD(env, 0);
Expand Down
36 changes: 18 additions & 18 deletions src/main/cpp/src/from_json_to_structs.cu
Original file line number Diff line number Diff line change
Expand Up @@ -260,11 +260,11 @@ std::unique_ptr<cudf::column> cast_strings_to_integers(cudf::column_view const&
mr);
}

// Build a new strings column, removing the invalid rows.
auto chars_data = cudf::strings::detail::make_chars_buffer(
offsets_column->view(), bytes, string_pairs.begin(), string_count, stream, mr);

// Don't care about the null mask, as nulls imply empty strings, which will also result in
// nulls.
// Don't care about the null mask, as nulls imply empty strings, which will also result in nulls.
auto const sanitized_input =
cudf::make_strings_column(string_count, std::move(offsets_column), chars_data.release(), 0, {});

Expand Down Expand Up @@ -345,8 +345,7 @@ std::pair<std::unique_ptr<cudf::column>, bool> try_remove_quotes_for_floats(
auto [offsets_column, bytes] =
cudf::strings::detail::make_offsets_child_column(size_it, size_it + string_count, stream, mr);

// If the output has the same total bytes, the input should not be changed.
// That is because when removing quotes, we always reduce the number of characters.
// If the output has the same total bytes, the output should be the same as the input.
if (bytes == input_sv.chars_size(stream)) { return {nullptr, false}; }

auto chars_data = cudf::strings::detail::make_chars_buffer(
Expand All @@ -372,14 +371,16 @@ std::unique_ptr<cudf::column> cast_strings_to_floats(cudf::column_view const& in
if (string_count == 0) { return cudf::make_empty_column(output_type); }

if (allow_nonnumeric_numbers) {
// Non-numeric numbers are always quoted.
auto const [removed_quotes, success] = try_remove_quotes_for_floats(input, stream, mr);
return string_to_float(output_type,
cudf::strings_column_view{success ? removed_quotes->view() : input},
false,
/*ansi_mode*/ false,
stream,
mr);
}
return string_to_float(output_type, cudf::strings_column_view{input}, false, stream, mr);
return string_to_float(
output_type, cudf::strings_column_view{input}, /*ansi_mode*/ false, stream, mr);
}

// TODO there is a bug here around 0 https://github.com/NVIDIA/spark-rapids/issues/10898
Expand Down Expand Up @@ -480,7 +481,7 @@ std::unique_ptr<cudf::column> cast_strings_to_decimals(cudf::column_view const&
cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());
auto chars_data = rmm::device_uvector<char>(bytes, stream, mr);

// Since the strings store decimal numbers, they should be very short.
// Since the strings store decimal numbers, they should not be very long.
// As such, using one thread per string should be good.
thrust::for_each(rmm::exec_policy_nosync(stream),
thrust::make_counting_iterator(0),
Expand Down Expand Up @@ -574,8 +575,7 @@ std::pair<std::unique_ptr<cudf::column>, bool> try_remove_quotes(
auto [offsets_column, bytes] =
cudf::strings::detail::make_offsets_child_column(size_it, size_it + string_count, stream, mr);

// If the output has the same total bytes, the input should not be changed.
// That is because when removing quotes, we always reduce the number of characters.
// If the output has the same total bytes, the output should be the same as the input.
if (bytes == input.chars_size(stream)) { return {nullptr, false}; }

auto chars_data = cudf::strings::detail::make_chars_buffer(
Expand Down Expand Up @@ -910,15 +910,15 @@ std::unique_ptr<cudf::column> from_json_to_structs(cudf::strings_column_view con
mr);
}

std::unique_ptr<cudf::column> convert_data_type(cudf::strings_column_view const& input,
std::vector<int> const& num_children,
std::vector<int> const& types,
std::vector<int> const& scales,
std::vector<int> const& precisions,
bool allow_nonnumeric_numbers,
bool is_us_locale,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
std::unique_ptr<cudf::column> convert_from_strings(cudf::strings_column_view const& input,
std::vector<int> const& num_children,
std::vector<int> const& types,
std::vector<int> const& scales,
std::vector<int> const& precisions,
bool allow_nonnumeric_numbers,
bool is_us_locale,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
CUDF_FUNC_RANGE();

Expand Down
4 changes: 2 additions & 2 deletions src/main/cpp/src/json_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,11 @@ std::unique_ptr<cudf::column> from_json_to_structs(
rmm::device_async_resource_ref mr = cudf::get_current_device_resource());

/**
* @brief Convert the input strings column into a desired type given by a data schema.
* @brief Convert from a strings column to a column with the desired type given by a data schema.
*
* The given column schema is specified as data arrays flattened in depth-first-search order.
*/
std::unique_ptr<cudf::column> convert_data_type(
std::unique_ptr<cudf::column> convert_from_strings(
cudf::strings_column_view const& input,
std::vector<int> const& num_children,
std::vector<int> const& types,
Expand Down
44 changes: 28 additions & 16 deletions src/main/java/com/nvidia/spark/rapids/jni/JSONUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -167,11 +167,22 @@ public static ColumnVector extractRawMapFromJsonString(ColumnView input, JSONOpt

/**
* Parse JSON strings into a struct column following the given data schema.
* <p>
* Many JSON options in the given {@code opts} parameter are not passed down to the native
* code, because they are hard-coded with the same values in both the plugin code and the
* native code. Specifically:<br>
* - {@code RecoverWithNull: true}<br>
* - {@code MixedTypesAsStrings: true}<br>
* - {@code NormalizeWhitespace: true}<br>
* - {@code KeepQuotes: true}<br>
* - {@code StrictValidation: true}<br>
* - {@code Experimental: true}
*
* @param input The input strings column in which each row specifies a json object
* @param schema The schema of the output struct column
* @param opts The options for parsing JSON strings
* @param isUSLocale Whether the current locale is US locale
* @param isUSLocale Whether the current locale is US locale, used when converting strings to
* decimal types
* @return A struct column in which each row is parsed from the corresponding json string
*/
public static ColumnVector fromJSONToStructs(ColumnView input, Schema schema, JSONOptions opts,
Expand All @@ -191,19 +202,21 @@ public static ColumnVector fromJSONToStructs(ColumnView input, Schema schema, JS
}

/**
* Convert the data type of a strings column to the desired type given by a data schema.
* Convert from a strings column to a column with the desired type given by a data schema.
*
* @param input The input strings column
* @param schema The schema of the output column
* @param allowedNonNumericNumbers Whether non-numeric numbers are allowed
* @param isUSLocale Whether the current locale is US locale
* @param allowedNonNumericNumbers Whether non-numeric numbers are allowed, used when converting
* strings to float types
* @param isUSLocale Whether the current locale is US locale, used when converting strings to
* decimal types
* @return A column with the desired data type
*/
public static ColumnVector convertDataType(ColumnView input, Schema schema,
boolean allowedNonNumericNumbers,
boolean isUSLocale) {
public static ColumnVector convertFromStrings(ColumnView input, Schema schema,
boolean allowedNonNumericNumbers,
boolean isUSLocale) {
assert (input.getType().equals(DType.STRING)) : "Input must be of STRING type";
return new ColumnVector(convertDataType(input.getNativeView(),
return new ColumnVector(convertFromStrings(input.getNativeView(),
schema.getFlattenedNumChildren(),
schema.getFlattenedTypeIds(),
schema.getFlattenedTypeScales(),
Expand Down Expand Up @@ -242,7 +255,6 @@ private static native long[] getJsonObjectMultiplePaths(long input,
long memoryBudgetBytes,
int parallelOverride);


private static native long extractRawMapFromJsonString(long input,
boolean normalizeSingleQuotes,
boolean leadingZerosAllowed,
Expand All @@ -261,13 +273,13 @@ private static native long fromJSONToStructs(long input,
boolean unquotedControlChars,
boolean isUSLocale);

private static native long convertDataType(long input,
int[] numChildren,
int[] typeIds,
int[] typeScales,
int[] typePrecision,
boolean nonNumericNumbersAllowed,
boolean isUSLocale);
/**
 * Native entry point behind the public {@code convertFromStrings} wrapper: converts a strings
 * column to a column of the type described by a flattened schema.
 *
 * @param input                    native view handle of the input strings column
 * @param numChildren              the schema's flattened number-of-children array
 * @param typeIds                  the schema's flattened type ids
 * @param typeScales               the schema's flattened type scales
 * @param typePrecision            presumably the schema's flattened type precisions; the native
 *                                 side requires it to have the same length as numChildren
 * @param nonNumericNumbersAllowed whether non-numeric numbers are allowed, used when converting
 *                                 strings to float types
 * @param isUSLocale               whether the current locale is US locale, used when converting
 *                                 strings to decimal types
 * @return native handle of the converted column; the caller wraps it in a ColumnVector
 */
private static native long convertFromStrings(long input,
int[] numChildren,
int[] typeIds,
int[] typeScales,
int[] typePrecision,
boolean nonNumericNumbersAllowed,
boolean isUSLocale);

// Native entry point for quote removal; the implementation is not visible in this view.
// NOTE(review): presumably `input` is a native column view handle and the returned long is a
// native column handle, as with the other native methods in this class — confirm.
// NOTE(review): `nullifyIfNotQuoted` presumably makes unquoted rows null — confirm against the
// native implementation.
private static native long removeQuotes(long input, boolean nullifyIfNotQuoted);
}

0 comments on commit fe8e359

Please sign in to comment.