Skip to content

Commit

Permalink
enable list to be forced as string in JSON reader. (#16472)
Browse files Browse the repository at this point in the history
closes #15278

This PR allows a list-typed column (in addition to a struct-typed column) to be forced to a string in the JSON reader when the mixed-types-as-string option is enabled and the user-provided schema specifies that column as a string.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: #16472
  • Loading branch information
karthikeyann authored Aug 9, 2024
1 parent 1bbe440 commit 2c8de62
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 45 deletions.
22 changes: 12 additions & 10 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -567,22 +567,22 @@ void make_device_json_column(device_span<SymbolT const> input,
thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0);
};

auto initialize_json_columns = [&](auto i, auto& col) {
if (column_categories[i] == NC_ERR || column_categories[i] == NC_FN) {
auto initialize_json_columns = [&](auto i, auto& col, auto column_category) {
if (column_category == NC_ERR || column_category == NC_FN) {
return;
} else if (column_categories[i] == NC_VAL || column_categories[i] == NC_STR) {
} else if (column_category == NC_VAL || column_category == NC_STR) {
col.string_offsets.resize(max_row_offsets[i] + 1, stream);
col.string_lengths.resize(max_row_offsets[i] + 1, stream);
init_to_zero(col.string_offsets);
init_to_zero(col.string_lengths);
} else if (column_categories[i] == NC_LIST) {
} else if (column_category == NC_LIST) {
col.child_offsets.resize(max_row_offsets[i] + 2, stream);
init_to_zero(col.child_offsets);
}
col.num_rows = max_row_offsets[i] + 1;
col.validity =
cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr);
col.type = to_json_col_type(column_categories[i]);
col.type = to_json_col_type(column_category);
};

auto reinitialize_as_string = [&](auto i, auto& col) {
Expand Down Expand Up @@ -764,21 +764,23 @@ void make_device_json_column(device_span<SymbolT const> input,
}
}

auto this_column_category = column_categories[this_col_id];
if (is_enabled_mixed_types_as_string) {
// get path of this column, check if it is a struct forced as string, and enforce it
// get path of this column, check if it is a struct/list forced as string, and enforce it
auto const nt = tree_path.get_path(this_col_id);
std::optional<data_type> const user_dtype = get_path_data_type(nt, options);
if (column_categories[this_col_id] == NC_STRUCT and user_dtype.has_value() and
user_dtype.value().id() == type_id::STRING) {
if ((column_categories[this_col_id] == NC_STRUCT or
column_categories[this_col_id] == NC_LIST) and
user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) {
is_mixed_type_column[this_col_id] = 1;
column_categories[this_col_id] = NC_STR;
this_column_category = NC_STR;
}
}

CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name);
// move into parent
device_json_column col(stream, mr);
initialize_json_columns(this_col_id, col);
initialize_json_columns(this_col_id, col, this_column_category);
auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second;
CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent");
if (not replaced) parent_col.column_order.push_back(name);
Expand Down
113 changes: 78 additions & 35 deletions cpp/tests/io/json/json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2351,7 +2351,7 @@ TEST_F(JsonReaderTest, MapTypes)
// Testing function for mixed types in JSON (for spark json reader)
auto test_fn = [](std::string_view json_string, bool lines, std::vector<type_id> types) {
std::map<std::string, cudf::io::schema_element> dtype_schema{
{"foo1", {data_type{type_id::STRING}}}, // list won't be a string
{"foo1", {data_type{type_id::STRING}}}, // list forced as a string
{"foo2", {data_type{type_id::STRING}}}, // struct forced as a string
{"1", {data_type{type_id::STRING}}},
{"2", {data_type{type_id::STRING}}},
Expand All @@ -2378,17 +2378,17 @@ TEST_F(JsonReaderTest, MapTypes)
test_fn(R"([{ "foo1": [1,2,3], "bar": 123 },
{ "foo2": { "a": 1 }, "bar": 456 }])",
false,
{type_id::LIST, type_id::INT32, type_id::STRING});
{type_id::STRING, type_id::INT32, type_id::STRING});
// jsonl
test_fn(R"( { "foo1": [1,2,3], "bar": 123 }
{ "foo2": { "a": 1 }, "bar": 456 })",
true,
{type_id::LIST, type_id::INT32, type_id::STRING});
{type_id::STRING, type_id::INT32, type_id::STRING});
// jsonl-array
test_fn(R"([123, [1,2,3]]
[456, null, { "a": 1 }])",
true,
{type_id::INT64, type_id::LIST, type_id::STRING});
{type_id::INT64, type_id::STRING, type_id::STRING});
// json-array
test_fn(R"([[[1,2,3], null, 123],
[null, { "a": 1 }, 456 ]])",
Expand Down Expand Up @@ -2678,38 +2678,81 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilter)

TEST_F(JsonReaderTest, JSONMixedTypeChildren)
{
std::string const json_str = R"(
{ "Root": { "Key": [ { "EE": "A" } ] } }
{ "Root": { "Key": { } } }
{ "Root": { "Key": [{ "YY": 1}] } }
)";
// Column "EE" is created and destroyed
// Column "YY" should not be created

cudf::io::json_reader_options options =
cudf::io::json_reader_options::builder(cudf::io::source_info{json_str.c_str(), json_str.size()})
.lines(true)
.recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
.normalize_single_quotes(true)
.normalize_whitespace(false)
.mixed_types_as_string(true)
.keep_quotes(true);

auto result = cudf::io::read_json(options);
// struct mixed.
{
std::string const json_str = R"(
{ "Root": { "Key": [ { "EE": "A" } ] } }
{ "Root": { "Key": { } } }
{ "Root": { "Key": [{ "YY": 1}] } }
)";
// Column "EE" is created and destroyed
// Column "YY" should not be created

cudf::io::json_reader_options options =
cudf::io::json_reader_options::builder(
cudf::io::source_info{json_str.c_str(), json_str.size()})
.lines(true)
.recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
.normalize_single_quotes(true)
.normalize_whitespace(false)
.mixed_types_as_string(true)
.keep_quotes(true);

auto result = cudf::io::read_json(options);

ASSERT_EQ(result.tbl->num_columns(), 1);
ASSERT_EQ(result.metadata.schema_info.size(), 1);
EXPECT_EQ(result.metadata.schema_info[0].name, "Root");
ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1);
EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key");
ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2);
EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets");
// types
EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT);
EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING);
cudf::test::strings_column_wrapper expected(
{R"([ { "EE": "A" } ])", "{ }", R"([{ "YY": 1}])"});

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0));
}

ASSERT_EQ(result.tbl->num_columns(), 1);
ASSERT_EQ(result.metadata.schema_info.size(), 1);
EXPECT_EQ(result.metadata.schema_info[0].name, "Root");
ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1);
EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key");
ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2);
EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets");
// types
EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT);
EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING);
cudf::test::strings_column_wrapper expected({R"([ { "EE": "A" } ])", "{ }", R"([{ "YY": 1}])"});

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0));
// list mixed.
{
std::string const json_str = R"(
{ "Root": { "Key": [ { "EE": "A" } ] } }
{ "Root": { "Key": "abc" } }
{ "Root": { "Key": [{ "YY": 1}] } }
)";
// Column "EE" is created and destroyed
// Column "YY" should not be created

cudf::io::json_reader_options options =
cudf::io::json_reader_options::builder(
cudf::io::source_info{json_str.c_str(), json_str.size()})
.lines(true)
.recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
.normalize_single_quotes(true)
.normalize_whitespace(false)
.mixed_types_as_string(true)
.keep_quotes(true);

auto result = cudf::io::read_json(options);

ASSERT_EQ(result.tbl->num_columns(), 1);
ASSERT_EQ(result.metadata.schema_info.size(), 1);
EXPECT_EQ(result.metadata.schema_info[0].name, "Root");
ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1);
EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key");
ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2);
EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets");
// types
EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT);
EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING);
cudf::test::strings_column_wrapper expected(
{R"([ { "EE": "A" } ])", "\"abc\"", R"([{ "YY": 1}])"});

CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0));
}
}

CUDF_TEST_PROGRAM_MAIN()

0 comments on commit 2c8de62

Please sign in to comment.