Skip to content

Commit

Permalink
[c++] Fix bug in nnz of variant-indexed dataframes
Browse files Browse the repository at this point in the history
  • Loading branch information
johnkerl committed Sep 19, 2024
1 parent 4a13a82 commit 265c9b4
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 1 deletion.
19 changes: 19 additions & 0 deletions libtiledbsoma/src/soma/soma_array.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1205,13 +1205,32 @@ uint64_t SOMAArray::nnz() {
// compute total_cell_num while going through the loop
uint64_t total_cell_num = 0;
std::vector<std::array<uint64_t, 2>> non_empty_domains(fragment_count);

// The loop after this only works if dim 0 is int64 soma_joinid.
// That's the case for _almost_ all SOMADataFrame objects, but
// not the "variant-indexed" ones: the SOMA spec only requires
// that soma_joinid be present as a dim or an attr.
auto dim = tiledb_schema()->domain().dimension(0);
auto dim_name = dim.name();
auto type_code = dim.type();
if (dim_name != "soma_joinid" || type_code != TILEDB_INT64) {
LOG_DEBUG(fmt::format(
"[SOMAArray::nnz] dim 0 (type={} name={}) isn't int64 "
"soma_joind: using _nnz_slow",
tiledb::impl::type_to_str(type_code),
dim_name));
return _nnz_slow();
}

for (uint32_t i = 0; i < fragment_count; i++) {
// TODO[perf]: Reading fragment info is not supported on TileDB Cloud
// yet, but reading one fragment at a time will be slow. Is there
// another way?
total_cell_num += fragment_info.cell_num(relevant_fragments[i]);

fragment_info.get_non_empty_domain(
relevant_fragments[i], 0, &non_empty_domains[i]);

LOG_DEBUG(fmt::format(
"[SOMAArray] fragment {} non-empty domain = [{}, {}]",
i,
Expand Down
28 changes: 27 additions & 1 deletion libtiledbsoma/test/unit_soma_dataframe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ TEST_CASE_METHOD(
REQUIRE(
soma_dataframe->index_column_names() ==
expected_index_column_names);
REQUIRE(soma_dataframe->count() == 0);
REQUIRE(soma_dataframe->nnz() == 0);
soma_dataframe->close();

std::vector<int64_t> d0(10);
Expand Down Expand Up @@ -531,8 +531,14 @@ TEST_CASE_METHOD(

soma_dataframe->close();

REQUIRE(soma_dataframe->nnz() == 0);

write_sjid_u32_str_data_from(0);

REQUIRE(soma_dataframe->nnz() == 2);
write_sjid_u32_str_data_from(8);
REQUIRE(soma_dataframe->nnz() == 4);

// Resize
auto new_shape = std::vector<int64_t>({SOMA_JOINID_RESIZE_DIM_MAX});

Expand Down Expand Up @@ -637,9 +643,15 @@ TEST_CASE_METHOD(
REQUIRE(actual.value() == expect);
soma_dataframe->close();

REQUIRE(soma_dataframe->nnz() == 0);

// Write
write_sjid_u32_str_data_from(0);

REQUIRE(soma_dataframe->nnz() == 2);
write_sjid_u32_str_data_from(8);
REQUIRE(soma_dataframe->nnz() == 4);

// Check shape after write
soma_dataframe = open(OpenMode::read);
expect = dim_infos[0].dim_max + 1;
Expand Down Expand Up @@ -755,9 +767,15 @@ TEST_CASE_METHOD(
REQUIRE(actual.value() == expect);
soma_dataframe->close();

REQUIRE(soma_dataframe->nnz() == 0);

// Write
write_sjid_u32_str_data_from(0);

REQUIRE(soma_dataframe->nnz() == 2);
write_sjid_u32_str_data_from(8);
REQUIRE(soma_dataframe->nnz() == 4);

// Check shape after write
soma_dataframe = open(OpenMode::read);
expect = dim_infos[0].dim_max + 1;
Expand Down Expand Up @@ -870,9 +888,17 @@ TEST_CASE_METHOD(
REQUIRE(!actual.has_value());
soma_dataframe->close();

REQUIRE(soma_dataframe->nnz() == 0);

// Write
write_sjid_u32_str_data_from(0);

REQUIRE(soma_dataframe->nnz() == 2);
write_sjid_u32_str_data_from(8);
// soma_joinid is not a dim here and so the second write is an overwrite
// of the first here
REQUIRE(soma_dataframe->nnz() == 2);

// Check shape after write
soma_dataframe = open(OpenMode::read);
actual = soma_dataframe->maybe_soma_joinid_shape();
Expand Down

0 comments on commit 265c9b4

Please sign in to comment.