-
Notifications
You must be signed in to change notification settings - Fork 68
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
New implementation of getJsonObject #1893
Changes from 20 commits
0c67f0b
e3dfdaa
72d2994
1f06b34
304bf71
d23d1e3
5b922ce
d312a7a
50481ab
c971f61
e7801bd
3c561f7
848c92b
2ae268a
1febcdd
a9bb79f
67ae598
44d5d2d
3abc3fc
02d4cfa
a204efa
9686754
01e6029
fa02fc9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
/* | ||
* Copyright (c) 2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include "cudf_jni_apis.hpp" | ||
#include "get_json_object.hpp" | ||
|
||
#include <cudf/strings/strings_column_view.hpp> | ||
|
||
#include <vector> | ||
|
||
using path_instruction_type = spark_rapids_jni::path_instruction_type; | ||
|
||
extern "C" { | ||
JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_getJsonObject( | ||
JNIEnv* env, jclass, jlong input_column, jobjectArray path_instructions) | ||
{ | ||
JNI_NULL_CHECK(env, input_column, "input column is null", 0); | ||
JNI_NULL_CHECK(env, path_instructions, "path_instructions is null", 0); | ||
try { | ||
cudf::jni::auto_set_device(env); | ||
auto const n_column_view = reinterpret_cast<cudf::column_view const*>(input_column); | ||
auto const n_strings_col_view = cudf::strings_column_view{*n_column_view}; | ||
|
||
std::vector<std::tuple<path_instruction_type, std::string, int64_t>> instructions; | ||
int size = env->GetArrayLength(path_instructions); | ||
for (int i = 0; i < size; i++) { | ||
jobject instruction = env->GetObjectArrayElement(path_instructions, i); | ||
JNI_NULL_CHECK(env, instruction, "path_instruction is null", 0); | ||
jclass instruction_class = env->GetObjectClass(instruction); | ||
JNI_NULL_CHECK(env, instruction_class, "instruction_class is null", 0); | ||
|
||
jfieldID field_id = env->GetFieldID(instruction_class, "type", "I"); | ||
JNI_NULL_CHECK(env, field_id, "field_id is null", 0); | ||
jint type = env->GetIntField(instruction, field_id); | ||
path_instruction_type instruction_type = static_cast<path_instruction_type>(type); | ||
|
||
field_id = env->GetFieldID(instruction_class, "name", "Ljava/lang/String;"); | ||
JNI_NULL_CHECK(env, field_id, "field_id is null", 0); | ||
jstring name = (jstring)env->GetObjectField(instruction, field_id); | ||
JNI_NULL_CHECK(env, name, "name is null", 0); | ||
const char* name_str = env->GetStringUTFChars(name, JNI_FALSE); | ||
|
||
field_id = env->GetFieldID(instruction_class, "index", "J"); | ||
JNI_NULL_CHECK(env, field_id, "field_id is null", 0); | ||
jlong index = env->GetLongField(instruction, field_id); | ||
|
||
instructions.emplace_back(instruction_type, name_str, index); | ||
|
||
env->ReleaseStringUTFChars(name, name_str); | ||
} | ||
|
||
return cudf::jni::release_as_jlong( | ||
spark_rapids_jni::get_json_object(n_strings_col_view, instructions)); | ||
} | ||
CATCH_STD(env, 0); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -800,15 +800,15 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha | |||||
if (sign) { result[index++] = '-'; } | ||||||
|
||||||
uint64_t output = v.mantissa; | ||||||
uint32_t const olength = decimal_length(output); | ||||||
int32_t exp = v.exponent + static_cast<int32_t>(olength) - 1; | ||||||
int32_t const olength = decimal_length(output); | ||||||
int32_t exp = v.exponent + olength - 1; | ||||||
bool scientificNotation = (exp < -3) || (exp >= 7); | ||||||
|
||||||
// Values in the interval [1E-3, 1E7) are special. | ||||||
if (scientificNotation) { | ||||||
// Print in the format x.xxxxxE-yy. | ||||||
for (uint32_t i = 0; i < olength - 1; ++i) { | ||||||
uint32_t const c = output % 10; | ||||||
for (int i = 0; i < olength - 1; ++i) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not necessary to change this, but we could have gotten away with the following:
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Alternatively, let's use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will do in a follow-on. |
||||||
int const c = output % 10; | ||||||
output /= 10; | ||||||
result[index + olength - i] = (char)('0' + c); | ||||||
} | ||||||
|
@@ -845,7 +845,7 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha | |||||
output /= 10; | ||||||
index++; | ||||||
} | ||||||
} else if (exp + 1 >= olength) { | ||||||
} else if (exp + 1 >= static_cast<int32_t>(olength)) { | ||||||
// Decimal dot is after any of the digits. | ||||||
for (int i = 0; i < olength; i++) { | ||||||
result[index + olength - i - 1] = (char)('0' + output % 10); | ||||||
|
@@ -880,7 +880,7 @@ __device__ inline int d2s_size(floating_decimal_64 const v, bool const sign) | |||||
if (sign) { index++; } | ||||||
|
||||||
uint64_t output = v.mantissa; | ||||||
uint32_t const olength = decimal_length(output); | ||||||
int32_t const olength = decimal_length(output); | ||||||
int32_t exp = v.exponent + static_cast<int32_t>(olength) - 1; | ||||||
bool scientificNotation = (exp < -3) || (exp >= 7); | ||||||
|
||||||
|
@@ -920,7 +920,7 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha | |||||
if (sign) { result[index++] = '-'; } | ||||||
|
||||||
uint32_t output = v.mantissa; | ||||||
uint32_t const olength = decimal_length(output); | ||||||
int32_t const olength = decimal_length(output); | ||||||
int32_t exp = v.exponent + olength - 1; | ||||||
bool scientificNotation = (exp < -3) || (exp >= 7); | ||||||
|
||||||
|
@@ -995,7 +995,7 @@ __device__ inline int f2s_size(floating_decimal_32 const v, bool const sign) | |||||
if (sign) { index++; } | ||||||
|
||||||
uint32_t output = v.mantissa; | ||||||
uint32_t const olength = decimal_length(output); | ||||||
int32_t const olength = decimal_length(output); | ||||||
int32_t exp = v.exponent + olength - 1; | ||||||
bool scientificNotation = (exp < -3) || (exp >= 7); | ||||||
|
||||||
|
@@ -1149,6 +1149,57 @@ __device__ inline int compute_f2s_size(float value) | |||||
return f2s_size(v, sign); | ||||||
} | ||||||
|
||||||
//===== special inf handling for json ===== | ||||||
|
||||||
__device__ inline int copy_special_str_json(char* const result, | ||||||
bool const sign, | ||||||
bool const exponent, | ||||||
bool const mantissa) | ||||||
{ | ||||||
// no NaN in json | ||||||
if (exponent) { | ||||||
if (sign) { | ||||||
memcpy(result, "\"-Infinity\"", 11); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nitpick: Feel free to ignore: I had to count the characters to make sure There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will do in a follow-on. |
||||||
return 11; | ||||||
} else { | ||||||
memcpy(result, "\"Infinity\"", 10); | ||||||
return 10; | ||||||
} | ||||||
} | ||||||
if (sign) { | ||||||
memcpy(result, "-0.0", 4); | ||||||
return 4; | ||||||
} else { | ||||||
memcpy(result, "0.0", 3); | ||||||
return 3; | ||||||
} | ||||||
} | ||||||
|
||||||
__device__ inline int special_str_size_json(bool const sign, | ||||||
bool const exponent, | ||||||
bool const mantissa) | ||||||
{ | ||||||
// no NaN in json | ||||||
if (exponent) { return sign + 10; } | ||||||
return sign + 3; | ||||||
} | ||||||
|
||||||
__device__ inline int d2s_buffered_n_json(double f, char* result) | ||||||
{ | ||||||
bool sign = false, special = false; | ||||||
floating_decimal_64 v = d2d(f, sign, special); | ||||||
if (special) { return copy_special_str_json(result, sign, v.exponent, v.mantissa); } | ||||||
return to_chars(v, sign, result); | ||||||
} | ||||||
|
||||||
__device__ inline int compute_d2s_size_json(double value) | ||||||
{ | ||||||
bool sign = false, special = false; | ||||||
floating_decimal_64 v = d2d(value, sign, special); | ||||||
if (special) { return special_str_size_json(sign, v.exponent, v.mantissa); } | ||||||
return d2s_size(v, sign); | ||||||
} | ||||||
|
||||||
} // namespace | ||||||
|
||||||
//===== APIs ===== | ||||||
|
@@ -1223,9 +1274,9 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const | |||||
using U = std::conditional_t<std::is_same_v<T, floating_decimal_32>, uint32_t, uint64_t>; | ||||||
int index = 0; | ||||||
if (sign) { result[index++] = '-'; } | ||||||
U output = v.mantissa; | ||||||
uint32_t const olength = decimal_length(output); | ||||||
int32_t exp = v.exponent + static_cast<int32_t>(olength) - 1; | ||||||
U output = v.mantissa; | ||||||
int32_t const olength = decimal_length(output); | ||||||
int32_t exp = v.exponent + static_cast<int32_t>(olength) - 1; | ||||||
if (exp < 0) { | ||||||
// Decimal dot is before any of the digits. | ||||||
int index_for_carrier = index; | ||||||
|
@@ -1291,7 +1342,7 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const | |||||
} | ||||||
} else { | ||||||
// 0 <= exp < olength - 1 | ||||||
uint32_t temp_d = digits, tailing_zero = 0; | ||||||
int32_t temp_d = digits, tailing_zero = 0; | ||||||
if (exp + digits + 1 > olength) { | ||||||
temp_d = olength - exp - 1; | ||||||
tailing_zero = digits - temp_d; | ||||||
|
@@ -1301,10 +1352,10 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const | |||||
U integer = rounded_output / pow10; | ||||||
U decimal = rounded_output % pow10; | ||||||
// calculate integer length after format to cover carry case | ||||||
uint32_t integer_len = decimal_length(integer); | ||||||
uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; | ||||||
uint32_t sep_cnt = 0; | ||||||
int rev_index = 0; | ||||||
int32_t integer_len = decimal_length(integer); | ||||||
int32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; | ||||||
int32_t sep_cnt = 0; | ||||||
int rev_index = 0; | ||||||
for (int i = 0; i < integer_len; i++) { | ||||||
if (sep_cnt == 3) { | ||||||
result[formated_integer_len - (rev_index++) - 1] = ','; | ||||||
|
@@ -1338,9 +1389,9 @@ __device__ inline int format_size(T const v, bool const sign, int digits) | |||||
using U = std::conditional_t<std::is_same_v<T, floating_decimal_32>, uint32_t, uint64_t>; | ||||||
int index = 0; | ||||||
if (sign) { index++; } | ||||||
U output = v.mantissa; | ||||||
uint32_t const olength = decimal_length(output); | ||||||
int32_t exp = v.exponent + static_cast<int32_t>(olength) - 1; | ||||||
U output = v.mantissa; | ||||||
int32_t const olength = decimal_length(output); | ||||||
int32_t exp = v.exponent + static_cast<int32_t>(olength) - 1; | ||||||
if (exp < 0) { | ||||||
index += 2 + digits; | ||||||
} else if (exp + 1 >= olength) { | ||||||
|
@@ -1424,4 +1475,15 @@ __device__ inline int format_float(double value, int digits, bool is_float, char | |||||
} | ||||||
} | ||||||
|
||||||
//===== json_parser utility ===== | ||||||
|
||||||
__device__ inline int double_normalization(double value, char* output) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's consider renaming this to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will do in a follow-on. |
||||||
{ | ||||||
if (output == nullptr) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I haven't fully grasped the control flow, so this might be a silly observation: We might be inadvertently checking In a follow-up, there might be value in separating those call chains. Computing the sizes can be done separately from actually writing the normalized value. Just a thought for future consideration. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will fix in a follow-on |
||||||
return compute_d2s_size_json(value); | ||||||
} else { | ||||||
return d2s_buffered_n_json(value, output); | ||||||
} | ||||||
} | ||||||
|
||||||
} // namespace spark_rapids_jni::ftos_converter |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: can we have a follow on where we change this to be 3 arrays? one that is an int array for the type, another that is a string array, and the last one would be a long array? This would reduce the number of reflection calls that would need to be done. This is very minor.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Humnn, probably I was the one that asked for using array of structs instead of individual arrays. But yeah for data transfer from Java to C++ then individual arrays seem to be better. I'm sorry @res-life.
Edit: I have a better idea. We want to have array of structs because that is simpler to iterate through the instructions. With individual arrays, we still can use
thrust::zip_iterator
to zip together multiple arrays with just one iterator and iterate through them easily. Of course this is not required to be in this PR but future work.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
thrust::zip_iterator
looks good, will do in a follow-on