NVIDIA · thirtiseven · Mar 27, 2024 · Mar 10, 2024 · Mar 12, 2024 · Mar 12, 2024
diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt
@@ -156,6 +156,7 @@ add_library(
   src/GpuTimeZoneDBJni.cpp
   src/HashJni.cpp
   src/HistogramJni.cpp
+  src/JSONUtilsJni.cpp
   src/MapUtilsJni.cpp
   src/NativeParquetJni.cpp
   src/ParseURIJni.cpp
@@ -170,6 +171,7 @@ add_library(
   src/cast_string_to_float.cu
   src/datetime_rebase.cu
   src/decimal_utils.cu
+  src/get_json_object.cu
   src/histogram.cu
   src/map_utils.cu
   src/murmur_hash.cu

diff --git a/src/main/cpp/src/JSONUtilsJni.cpp b/src/main/cpp/src/JSONUtilsJni.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cudf_jni_apis.hpp"
+#include "get_json_object.hpp"
+
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <vector>
+
+using path_instruction_type = spark_rapids_jni::path_instruction_type;
+
+extern "C" {
+/**
+ * JNI entry point backing JSONUtils.getJsonObject.
+ *
+ * Converts the Java array of path-instruction objects into native
+ * (type, name, index) tuples and passes them, together with the input strings
+ * column, to spark_rapids_jni::get_json_object.
+ *
+ * @param env JNI environment.
+ * @param input_column native handle that is reinterpreted as a
+ *        `cudf::column_view const*`.
+ * @param path_instructions array of objects exposing an int field `type`, a
+ *        String field `name` and a long field `index`.
+ * @return handle of the result column released to Java, or 0 when a Java
+ *         exception has been raised.
+ */
+JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_JSONUtils_getJsonObject(
+  JNIEnv* env, jclass, jlong input_column, jobjectArray path_instructions)
+{
+  JNI_NULL_CHECK(env, input_column, "input column is null", 0);
+  JNI_NULL_CHECK(env, path_instructions, "path_instructions is null", 0);
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const n_column_view      = reinterpret_cast<cudf::column_view const*>(input_column);
+    auto const n_strings_col_view = cudf::strings_column_view{*n_column_view};
+
+    std::vector<std::tuple<path_instruction_type, std::string, int64_t>> instructions;
+    jsize const size = env->GetArrayLength(path_instructions);
+    instructions.reserve(size);
+    for (jsize i = 0; i < size; i++) {
+      jobject instruction = env->GetObjectArrayElement(path_instructions, i);
+      JNI_NULL_CHECK(env, instruction, "path_instruction is null", 0);
+      jclass instruction_class = env->GetObjectClass(instruction);
+      JNI_NULL_CHECK(env, instruction_class, "instruction_class is null", 0);
+
+      jfieldID field_id = env->GetFieldID(instruction_class, "type", "I");
+      JNI_NULL_CHECK(env, field_id, "field_id is null", 0);
+      jint type                              = env->GetIntField(instruction, field_id);
+      path_instruction_type instruction_type = static_cast<path_instruction_type>(type);
+
+      // Read the scalar `index` before touching the string so that no
+      // early-return check sits between acquiring and releasing the UTF chars.
+      field_id = env->GetFieldID(instruction_class, "index", "J");
+      JNI_NULL_CHECK(env, field_id, "field_id is null", 0);
+      jlong index = env->GetLongField(instruction, field_id);
+
+      field_id = env->GetFieldID(instruction_class, "name", "Ljava/lang/String;");
+      JNI_NULL_CHECK(env, field_id, "field_id is null", 0);
+      jstring name = static_cast<jstring>(env->GetObjectField(instruction, field_id));
+      JNI_NULL_CHECK(env, name, "name is null", 0);
+
+      // The second argument is an optional `jboolean* isCopy` out-parameter,
+      // not a flag; pass nullptr because the copy status is not needed.
+      const char* name_str = env->GetStringUTFChars(name, nullptr);
+      // A null result means the JVM could not provide the chars and has
+      // already raised an exception; just unwind back to Java.
+      if (name_str == nullptr) { return 0; }
+
+      try {
+        instructions.emplace_back(instruction_type, name_str, index);
+      } catch (...) {
+        // Do not leak the JVM-owned chars if building the tuple throws.
+        env->ReleaseStringUTFChars(name, name_str);
+        throw;
+      }
+      env->ReleaseStringUTFChars(name, name_str);
+
+      // Free per-iteration local references so that long paths cannot
+      // exhaust the JVM local reference table.
+      env->DeleteLocalRef(name);
+      env->DeleteLocalRef(instruction_class);
+      env->DeleteLocalRef(instruction);
+    }
+
+    return cudf::jni::release_as_jlong(
+      spark_rapids_jni::get_json_object(n_strings_col_view, instructions));
+  }
+  CATCH_STD(env, 0);
+}
+}
diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh
@@ -800,15 +800,15 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha
   if (sign) { result[index++] = '-'; }
 
   uint64_t output         = v.mantissa;
-  uint32_t const olength  = decimal_length(output);
-  int32_t exp             = v.exponent + static_cast<int32_t>(olength) - 1;
+  int32_t const olength   = decimal_length(output);
+  int32_t exp             = v.exponent + olength - 1;
   bool scientificNotation = (exp < -3) || (exp >= 7);
 
   // Values in the interval [1E-3, 1E7) are special.
   if (scientificNotation) {
     // Print in the format x.xxxxxE-yy.
-    for (uint32_t i = 0; i < olength - 1; ++i) {
-      uint32_t const c = output % 10;
+    for (int i = 0; i < olength - 1; ++i) {
-    for (int i = 0; i < olength - 1; ++i) {
+    for (auto i = 0; i < olength - 1; ++i) {
-    for (int i = 0; i < olength - 1; ++i) {
+    for (auto i = 0; i < olength - 1; ++i) {
+      int const c = output % 10;
       output /= 10;
       result[index + olength - i] = (char)('0' + c);
     }
@@ -845,7 +845,7 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha
         output /= 10;
         index++;
       }
-    } else if (exp + 1 >= olength) {
+    } else if (exp + 1 >= static_cast<int32_t>(olength)) {
       // Decimal dot is after any of the digits.
       for (int i = 0; i < olength; i++) {
         result[index + olength - i - 1] = (char)('0' + output % 10);
@@ -880,7 +880,7 @@ __device__ inline int d2s_size(floating_decimal_64 const v, bool const sign)
   if (sign) { index++; }
 
   uint64_t output         = v.mantissa;
-  uint32_t const olength  = decimal_length(output);
+  int32_t const olength   = decimal_length(output);
   int32_t exp             = v.exponent + static_cast<int32_t>(olength) - 1;
   bool scientificNotation = (exp < -3) || (exp >= 7);
 
@@ -920,7 +920,7 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha
   if (sign) { result[index++] = '-'; }
 
   uint32_t output         = v.mantissa;
-  uint32_t const olength  = decimal_length(output);
+  int32_t const olength   = decimal_length(output);
   int32_t exp             = v.exponent + olength - 1;
   bool scientificNotation = (exp < -3) || (exp >= 7);
 
@@ -995,7 +995,7 @@ __device__ inline int f2s_size(floating_decimal_32 const v, bool const sign)
   if (sign) { index++; }
 
   uint32_t output         = v.mantissa;
-  uint32_t const olength  = decimal_length(output);
+  int32_t const olength   = decimal_length(output);
   int32_t exp             = v.exponent + olength - 1;
   bool scientificNotation = (exp < -3) || (exp >= 7);
 
@@ -1149,6 +1149,57 @@ __device__ inline int compute_f2s_size(float value)
   return f2s_size(v, sign);
 }
 
+//===== special inf handling for json =====
+
+/**
+ * Writes the JSON text for a "special" floating point value into `result`.
+ *
+ * When `exponent` is true the quoted Infinity form is written, otherwise a
+ * zero is written; `sign` selects the negative variant. `mantissa` is not
+ * consulted because this JSON flavour has no NaN output. No terminating NUL
+ * is written.
+ *
+ * @return number of characters copied into `result`.
+ */
+__device__ inline int copy_special_str_json(char* const result,
+                                            bool const sign,
+                                            bool const exponent,
+                                            bool const mantissa)
+{
+  char const* text = nullptr;
+  int length       = 0;
+  if (exponent) {
+    // Infinity is emitted as a quoted string.
+    text   = sign ? "\"-Infinity\"" : "\"Infinity\"";
+    length = sign ? 11 : 10;
+  } else {
+    text   = sign ? "-0.0" : "0.0";
+    length = sign ? 4 : 3;
+  }
+  memcpy(result, text, length);
+  return length;
+}
+
+/**
+ * Returns the number of characters copy_special_str_json writes for the same
+ * arguments: 10 for the quoted Infinity form or 3 for zero, plus one when
+ * `sign` is set. `mantissa` is not consulted (no NaN in JSON).
+ */
+__device__ inline int special_str_size_json(bool const sign,
+                                            bool const exponent,
+                                            bool const mantissa)
+{
+  int const unsigned_len = exponent ? 10 : 3;
+  int const sign_len     = sign ? 1 : 0;
+  return unsigned_len + sign_len;
+}
+
+/**
+ * Converts `f` to its JSON text form in `result` and returns the number of
+ * characters written. Values that d2d flags as special go through
+ * copy_special_str_json; all others go through to_chars.
+ */
+__device__ inline int d2s_buffered_n_json(double f, char* result)
+{
+  bool is_negative = false;
+  bool is_special  = false;
+  floating_decimal_64 const decimal = d2d(f, is_negative, is_special);
+  return is_special ? copy_special_str_json(result, is_negative, decimal.exponent, decimal.mantissa)
+                    : to_chars(decimal, is_negative, result);
+}
+
+/**
+ * Returns the number of characters in the JSON text form of `value` without
+ * writing anything. Values that d2d flags as special are sized by
+ * special_str_size_json; all others by d2s_size.
+ */
+__device__ inline int compute_d2s_size_json(double value)
+{
+  bool is_negative = false;
+  bool is_special  = false;
+  floating_decimal_64 const decimal = d2d(value, is_negative, is_special);
+  return is_special ? special_str_size_json(is_negative, decimal.exponent, decimal.mantissa)
+                    : d2s_size(decimal, is_negative);
+}
+
 }  // namespace
 
 //===== APIs =====
@@ -1223,9 +1274,9 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const
   using U   = std::conditional_t<std::is_same_v<T, floating_decimal_32>, uint32_t, uint64_t>;
   int index = 0;
   if (sign) { result[index++] = '-'; }
-  U output               = v.mantissa;
-  uint32_t const olength = decimal_length(output);
-  int32_t exp            = v.exponent + static_cast<int32_t>(olength) - 1;
+  U output              = v.mantissa;
+  int32_t const olength = decimal_length(output);
+  int32_t exp           = v.exponent + static_cast<int32_t>(olength) - 1;
   if (exp < 0) {
     // Decimal dot is before any of the digits.
     int index_for_carrier = index;
@@ -1291,7 +1342,7 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const
     }
   } else {
     // 0 <= exp < olength - 1
-    uint32_t temp_d = digits, tailing_zero = 0;
+    int32_t temp_d = digits, tailing_zero = 0;
     if (exp + digits + 1 > olength) {
       temp_d       = olength - exp - 1;
       tailing_zero = digits - temp_d;
@@ -1301,10 +1352,10 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const
     U integer        = rounded_output / pow10;
     U decimal        = rounded_output % pow10;
     // calculate integer length after format to cover carry case
-    uint32_t integer_len          = decimal_length(integer);
-    uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3;
-    uint32_t sep_cnt              = 0;
-    int rev_index                 = 0;
+    int32_t integer_len          = decimal_length(integer);
+    int32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3;
+    int32_t sep_cnt              = 0;
+    int rev_index                = 0;
     for (int i = 0; i < integer_len; i++) {
       if (sep_cnt == 3) {
         result[formated_integer_len - (rev_index++) - 1] = ',';
@@ -1338,9 +1389,9 @@ __device__ inline int format_size(T const v, bool const sign, int digits)
   using U   = std::conditional_t<std::is_same_v<T, floating_decimal_32>, uint32_t, uint64_t>;
   int index = 0;
   if (sign) { index++; }
-  U output               = v.mantissa;
-  uint32_t const olength = decimal_length(output);
-  int32_t exp            = v.exponent + static_cast<int32_t>(olength) - 1;
+  U output              = v.mantissa;
+  int32_t const olength = decimal_length(output);
+  int32_t exp           = v.exponent + static_cast<int32_t>(olength) - 1;
   if (exp < 0) {
     index += 2 + digits;
   } else if (exp + 1 >= olength) {
@@ -1424,4 +1475,15 @@ __device__ inline int format_float(double value, int digits, bool is_float, char
   }
 }
 
+//===== json_parser utility =====
+
+/**
+ * Two-mode helper for the JSON text form of `value`.
+ *
+ * With a null `output` only the required length is computed; otherwise the
+ * text is written into `output`. Either way the character count is returned.
+ */
+__device__ inline int double_normalization(double value, char* output)
+{
+  return output != nullptr ? d2s_buffered_n_json(value, output) : compute_d2s_size_json(value);
+}
+
 }  // namespace spark_rapids_jni::ftos_converter