From 983f3491918ab1677ac8743b7d755153be01d189 Mon Sep 17 00:00:00 2001 From: Shaun Reed Date: Wed, 21 Jun 2023 11:25:52 -0400 Subject: [PATCH 01/25] Changes to examples for testing + TODO: Revert this commit --- examples/quickstart_dimension_labels.py | 8 +- .../quickstart_dimension_labels_string.py | 179 ++++++++++++++++++ examples/string_float_int_dimensions.py | 3 + 3 files changed, 186 insertions(+), 4 deletions(-) create mode 100644 examples/quickstart_dimension_labels_string.py diff --git a/examples/quickstart_dimension_labels.py b/examples/quickstart_dimension_labels.py index 4fade0bde9..998223799f 100644 --- a/examples/quickstart_dimension_labels.py +++ b/examples/quickstart_dimension_labels.py @@ -104,9 +104,9 @@ def read_array(uri: str): else: # Only create and write to the array if it doesn't already exist. - if tiledb.object_type(ARRAY_NAME) != "array": - create_array(ARRAY_NAME) - write_array(ARRAY_NAME) - + if tiledb.object_type(ARRAY_NAME) == "array": + tiledb.Array.delete_array(ARRAY_NAME) + create_array(ARRAY_NAME) + write_array(ARRAY_NAME) # Read from the array and print output. read_array(ARRAY_NAME) diff --git a/examples/quickstart_dimension_labels_string.py b/examples/quickstart_dimension_labels_string.py new file mode 100644 index 0000000000..6e27c03005 --- /dev/null +++ b/examples/quickstart_dimension_labels_string.py @@ -0,0 +1,179 @@ +# quickstart_dense.py +# +# LICENSE +# +# The MIT License +# +# Copyright (c) 2023 TileDB, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +# +# DESCRIPTION +# +# Please refer to the TileDB and TileDB-Py documentation for more information: +# https://docs.tiledb.com/main/how-to +# https://tiledb-inc-tiledb.readthedocs-hosted.com/projects/tiledb-py/en/stable/python-api.html +# +# When run, this program will create a simple 1D dense array with a dimension label, write +# some data to it, and read a slice of the data back. 
+# + +import numpy as np +import tiledb + +def create_array(uri: str): + """Create array schema with dimension labels""" + dim1 = tiledb.Dim("d1", domain=(1, 4), dtype=np.int32) + dim2 = tiledb.Dim("d2", domain=(1, 5), dtype=np.int32) + # TODO: Using np.bytes_ here was not converting correctly in DataType.from_numpy() + # + dimension.py overrides bytes_->TILEDB_STRING_ASCII and does not use DataType.from_numpy() + dim_labels = { + 0: {"l1": dim1.create_label_schema("increasing", "ascii")}, + 1: { + "l2": dim2.create_label_schema("increasing", np.int64), + "l3": dim2.create_label_schema("increasing", np.float64), + }, + } + dom = tiledb.Domain(dim1, dim2) + + # Var-sized attributes seems to work without any issues. + att1 = tiledb.Attr("a1", var=True, dtype=np.bytes_) + att2 = tiledb.Attr("a2", var=True, dtype=np.int64) + schema = tiledb.ArraySchema(sparse=False, domain=dom, attrs=(att1, att2), dim_labels=dim_labels) + tiledb.Array.create(uri, schema) + + +def write_array(uri: str): + """Write attribute and label data to the array""" + a1_data = np.array( + [ + "a", + "bb", + "ccc", + "dddd", + "eeeee", + "a", + "bb", + "ccc", + "d", + "eeeeeeeeee", + "a", + "bb", + "ccc", + "d", + "eeeeeeeeee", + "a", + "bb", + "ccc", + "d", + "eeeeeeeeee", + # "a", + # "bb", + # "ccc", + # "d", + # "eeeeeeeeee", + ] + ).reshape(4, 5) + + a2_data = np.array( + [ + np.repeat(1, 1).astype(np.int64), + np.repeat(2, 2).astype(np.int64), + np.repeat(3, 3).astype(np.int64), + np.repeat(4, 4).astype(np.int64), + np.repeat(5, 5).astype(np.int64), + + np.repeat(1, 5).astype(np.int64), + np.repeat(2, 4).astype(np.int64), + np.repeat(3, 3).astype(np.int64), + np.repeat(4, 2).astype(np.int64), + np.repeat(5, 1).astype(np.int64), + + np.repeat(1, 1).astype(np.int64), + np.repeat(2, 2).astype(np.int64), + np.repeat(3, 3).astype(np.int64), + np.repeat(4, 4).astype(np.int64), + np.repeat(5, 5).astype(np.int64), + + np.repeat(1, 5).astype(np.int64), + np.repeat(2, 4).astype(np.int64), + 
np.repeat(3, 1).astype(np.int64), + np.repeat(4, 2).astype(np.int64), + np.repeat(5, 3).astype(np.int64), + + # np.repeat(1, 1).astype(np.int64), + # np.repeat(2, 1).astype(np.int64), + # np.repeat(3, 5).astype(np.int64), + # np.repeat(4, 5).astype(np.int64), + # np.repeat(5, 10).astype(np.int64), + ], dtype=object + ).reshape(4, 5) + + # l1_data = np.array(["a", "bb", "ccc", "dddd", "eeeee"]) + l1_data = np.array(["a", "bb", "ccc", "ddd"]) + l2_data = np.arange(-2, 3) + l3_data = np.linspace(-1.0, 1.0, 5) + with tiledb.open(uri, "w") as array: + array[:] = {"a1": a1_data, "a2": a2_data, "l1": l1_data, "l2": l2_data, "l3": l3_data} + + +def read_array(uri: str): + """Read the array from the dimension label""" + + with tiledb.open(uri, "r") as array: + # data1 = array.label_index(["l2"])[1, 1:2] + # print("Reading array on [[1, -1:1]] with label 'l2' on dim2") + # for name, value in data1.items(): + # print(f" '{name}'={value}") + + # data2 = array.label_index(["l1", "l2"])[4:5, -2:2] + # print("Reading array on [[4:5, -2:2]] with label 'l1' on dim1 and 'l2' on dim2") + # for name, value in data2.items(): + # print(f" '{name}'={value}") + + # Should read all data + print("Reading array on [['a':'ddd']] with label 'l1' on dim1") + data3 = array.label_index(["l1"])["a":"ddd"] + for name, value in data3.items(): + print(f" '{name}'={value}") + + +if __name__ == "__main__": + # Name of the array to create. + ARRAY_NAME = "/home/shaun/Documents/Arrays/quickstart_labels_string_py" + conf = tiledb.Config({ + "sm.io_concurrency_level": "1", + "sm.compute_concurrency_level": "1", + }) + tiledb.default_ctx(conf) + + LIBVERSION = tiledb.libtiledb.version() + vfs = tiledb.VFS() + + if LIBVERSION[0] == 2 and LIBVERSION[1] < 15: + print( + f"Dimension labels requires libtiledb version >= 2.15.0. 
Current version is" + f" {LIBVERSION[0]}.{LIBVERSION[1]}.{LIBVERSION[2]}" + ) + else: + if vfs.is_dir(ARRAY_NAME): + vfs.remove_dir(ARRAY_NAME) + create_array(ARRAY_NAME) + write_array(ARRAY_NAME) + read_array(ARRAY_NAME) diff --git a/examples/string_float_int_dimensions.py b/examples/string_float_int_dimensions.py index d1fb03b95d..edda458cc6 100644 --- a/examples/string_float_int_dimensions.py +++ b/examples/string_float_int_dimensions.py @@ -39,6 +39,9 @@ import tiledb path = "sparse_mixed_demo" +vfs = tiledb.VFS() +if vfs.is_dir(path): + vfs.remove_dir(path) dom = tiledb.Domain( *[ From 0ec4a060803fcda28afbaf11355291bb1ebfc156 Mon Sep 17 00:00:00 2001 From: Shaun Reed Date: Thu, 22 Jun 2023 16:17:27 -0400 Subject: [PATCH 02/25] WIP --- tiledb/core.cc | 62 +++++++++++++++++++++++--------- tiledb/dimension_label_schema.py | 3 ++ tiledb/multirange_indexing.py | 22 ++++++------ 3 files changed, 60 insertions(+), 27 deletions(-) diff --git a/tiledb/core.cc b/tiledb/core.cc index 55fa95e14d..2da17792f7 100644 --- a/tiledb/core.cc +++ b/tiledb/core.cc @@ -467,6 +467,14 @@ class PyQuery { return array_schema_->has_attribute(name); } + bool is_dimension_label(std::string name) { +#if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15 + return ArraySchemaExperimental::has_dimension_label(ctx_, *array_schema_, name); +#else + return false; +#endif + } + bool is_var(std::string name) { if (is_dimension(name)) { auto dim = domain_->dimension(name); @@ -474,6 +482,12 @@ class PyQuery { } else if (is_attribute(name)) { auto attr = array_schema_->attribute(name); return attr.cell_val_num() == TILEDB_VAR_NUM; +#if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15 + } else if (is_dimension_label(name)) { + auto dim_label = ArraySchemaExperimental::dimension_label(ctx_, + *array_schema_, name); + return dim_label.label_cell_val_num() == TILEDB_VAR_NUM; +#endif } else { TPY_ERROR_LOC("Unknown buffer type for is_var check (expected attribute " "or dimension)") @@ -481,7 
+495,7 @@ class PyQuery { } bool is_nullable(std::string name) { - if (is_dimension(name)) { + if (is_dimension(name) || is_dimension_label(name)) { return false; } @@ -498,6 +512,13 @@ class PyQuery { } else if (is_attribute(name)) { type = array_schema_->attribute(name).type(); cell_val_num = array_schema_->attribute(name).cell_val_num(); +#if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15 + } else if (is_dimension_label(name)) { + auto dim_label = ArraySchemaExperimental::dimension_label(ctx_, + *array_schema_, name); + type = dim_label.label_type(); + cell_val_num = dim_label.label_cell_val_num(); +#endif } else { TPY_ERROR_LOC("Unknown buffer '" + name + "'"); } @@ -621,28 +642,34 @@ class PyQuery { #if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15 void alloc_label_buffer(std::string &label_name, uint64_t ncells) { - auto dim_label = ArraySchemaExperimental::dimension_label( ctx_, *array_schema_, label_name); + std::cout << "label_name = " << label_name << std::endl; + std::cout << "\tncells = " << ncells << std::endl; tiledb_datatype_t type = dim_label.label_type(); uint32_t cell_val_num = dim_label.label_cell_val_num(); uint64_t cell_nbytes = tiledb_datatype_size(type); - if (cell_val_num != TILEDB_VAR_NUM) { + std::cout << "\tcell_nbytes = " << cell_nbytes << std::endl; + bool var = cell_val_num == TILEDB_VAR_NUM; + bool nullable = false; + uint64_t buf_nbytes = 0; + + if (!var) { + std::cout << "\tcell_val_num = " << cell_val_num << std::endl; cell_nbytes *= cell_val_num; + std::cout << "\tcell_nbytes *= cell_val_num = " << cell_nbytes << std::endl; + buf_nbytes = ncells * cell_nbytes; + std::cout << "\tbuf_nbytes = ncells * cell_nbytes = " << buf_nbytes << std::endl; } else { - throw TileDBError( - "reading variable length dimension labels is not yet supported"); + // TODO: I think we still need est_result_size here. + // + Given range ['a', 'ddd'], I don't see another way to calculate the label data size between 'a' and 'ddd'. 
+ buf_nbytes = 9; // Full label data for this hard-coded example is ['a', 'bb', 'ccc', 'ddd'] } - auto dtype = tiledb_dtype(type, cell_val_num); - uint64_t buf_nbytes = ncells * cell_nbytes; - uint64_t offsets_num = 0; + uint64_t offsets_num = var ? ncells : 0; uint64_t validity_num = 0; - bool var = cell_val_num == TILEDB_VAR_NUM; - bool nullable = false; - buffers_order_.push_back(label_name); buffers_.insert( {label_name, BufferInfo(label_name, buf_nbytes, type, cell_val_num, @@ -763,16 +790,17 @@ class PyQuery { if ((Py_ssize_t)(buf.data_vals_read * buf.elem_nbytes) > (Py_ssize_t)buf.data.size()) { - throw TileDBError("After read query, data buffer out of bounds: " + - name); + throw TileDBError("After read query, data buffer out of bounds: " + name + " (" + + std::to_string(buf.data_vals_read * buf.elem_nbytes) + " > " + + std::to_string(buf.data.size()) + ")"); } if ((Py_ssize_t)buf.offsets_read > buf.offsets.size()) { - throw TileDBError("After read query, offsets buffer out of bounds: " + - name); + throw TileDBError("After read query, offsets buffer out of bounds: " + name + " (" + + std::to_string(buf.offsets_read) + " > " + std::to_string(buf.offsets.size()) + ")"); } if ((Py_ssize_t)buf.validity_vals_read > buf.validity.size()) { - throw TileDBError("After read query, validity buffer out of bounds: " + - name); + throw TileDBError("After read query, validity buffer out of bounds: " + name + " (" + + std::to_string(buf.validity_vals_read) + " > " + std::to_string(buf.validity.size()) + ")"); } } } diff --git a/tiledb/dimension_label_schema.py b/tiledb/dimension_label_schema.py index d6e953a9dd..fbb1cb964b 100644 --- a/tiledb/dimension_label_schema.py +++ b/tiledb/dimension_label_schema.py @@ -35,6 +35,9 @@ def __init__( # Get DataType and DataOrder objects _label_order = DataOrder[order] _label_dtype = DataType.from_numpy(label_dtype) + # TOOD: Fix from_numpy for np.bytes_ (?) 
+ if _label_dtype.tiledb_type == lt.DataType.CHAR: + _label_dtype = DataType(np.bytes_, lt.DataType.STRING_ASCII, lt.TILEDB_VAR_NUM) _dim_dtype = DataType.from_numpy(dim_dtype) # Convert the tile extent (if set) diff --git a/tiledb/multirange_indexing.py b/tiledb/multirange_indexing.py index 1b58c8d21b..e0b7a66d60 100644 --- a/tiledb/multirange_indexing.py +++ b/tiledb/multirange_indexing.py @@ -486,10 +486,6 @@ def __init__( self._labels: Dict[int, str] = {} for label_name in labels: dim_label = array.schema.dim_label(label_name) - if dim_label.isvar: - raise NotImplementedError( - "querying by variable length labels is not yet implemented" - ) dim_idx = dim_label.dim_index if dim_idx in self._labels: raise TileDBError( @@ -624,16 +620,22 @@ def _get_pyquery_results( pyquery: PyQuery, schema: ArraySchema ) -> Dict[str, np.ndarray]: result_dict = OrderedDict() - for name, item in pyquery.results().items(): + res = pyquery.results() + # TODO: There are no offsets at item[1] for the label result buffer, resulting in exception from numpy in else case. + # + Var size labels should have len(item[1]) > 0; We should not hit the else case below. + for name, item in res.items(): if len(item[1]) > 0: arr = pyquery.unpack_buffer(name, item[0], item[1]) else: arr = item[0] - arr.dtype = ( - schema.attr_or_dim_dtype(name) - if not schema.has_dim_label(name) - else schema.dim_label(name).dtype - ) + if schema.has_dim_label(name): + if schema.dim_label(name).isvar: + # arr.dtype = np.uint8 # TODO: Revert all changes here. This is just hard-coded for POC. 
+ arr = pyquery.unpack_buffer(name, item[0], [0, 1, 3, 6]) + else: + arr.dtype = schema.dim_label(name).dtype + else: + arr.dtype = schema.attr_or_dim_dtype(name) result_dict[name if name != "__attr" else ""] = arr return result_dict From 7398457ac9f14b43e44b020a93bebba47e48298c Mon Sep 17 00:00:00 2001 From: Shaun Reed Date: Fri, 23 Jun 2023 14:02:36 -0400 Subject: [PATCH 03/25] Allocate label var size buffers --- .../quickstart_dimension_labels_string.py | 23 ++++++++-------- tiledb/cc/query.cc | 2 ++ tiledb/core.cc | 27 +++++++------------ tiledb/dimension_label_schema.py | 4 +-- tiledb/multirange_indexing.py | 12 +++++++-- 5 files changed, 35 insertions(+), 33 deletions(-) diff --git a/examples/quickstart_dimension_labels_string.py b/examples/quickstart_dimension_labels_string.py index 6e27c03005..9a2c3df2ab 100644 --- a/examples/quickstart_dimension_labels_string.py +++ b/examples/quickstart_dimension_labels_string.py @@ -41,10 +41,9 @@ def create_array(uri: str): """Create array schema with dimension labels""" dim1 = tiledb.Dim("d1", domain=(1, 4), dtype=np.int32) dim2 = tiledb.Dim("d2", domain=(1, 5), dtype=np.int32) - # TODO: Using np.bytes_ here was not converting correctly in DataType.from_numpy() - # + dimension.py overrides bytes_->TILEDB_STRING_ASCII and does not use DataType.from_numpy() + # TODO: Test label query with N var-size labels. 
dim_labels = { - 0: {"l1": dim1.create_label_schema("increasing", "ascii")}, + 0: {"l1": dim1.create_label_schema("increasing", np.bytes_)}, 1: { "l2": dim2.create_label_schema("increasing", np.int64), "l3": dim2.create_label_schema("increasing", np.float64), @@ -137,15 +136,15 @@ def read_array(uri: str): """Read the array from the dimension label""" with tiledb.open(uri, "r") as array: - # data1 = array.label_index(["l2"])[1, 1:2] - # print("Reading array on [[1, -1:1]] with label 'l2' on dim2") - # for name, value in data1.items(): - # print(f" '{name}'={value}") - - # data2 = array.label_index(["l1", "l2"])[4:5, -2:2] - # print("Reading array on [[4:5, -2:2]] with label 'l1' on dim1 and 'l2' on dim2") - # for name, value in data2.items(): - # print(f" '{name}'={value}") + data1 = array.label_index(["l2"])[1, 1:2] + print("Reading array on [[1, -1:1]] with label 'l2' on dim2") + for name, value in data1.items(): + print(f" '{name}'={value}") + + data2 = array.label_index(["l1", "l2"])["a":"ddd", -2:2] + print("Reading array on [[4:5, -2:2]] with label 'l1' on dim1 and 'l2' on dim2") + for name, value in data2.items(): + print(f" '{name}'={value}") # Should read all data print("Reading array on [['a':'ddd']] with label 'l1' on dim1") diff --git a/tiledb/cc/query.cc b/tiledb/cc/query.cc index 7320084ac7..9d515dc13c 100644 --- a/tiledb/cc/query.cc +++ b/tiledb/cc/query.cc @@ -50,6 +50,8 @@ void init_query(py::module &m) { .def("has_results", &Query::has_results) + .def("est_result_size_var", &Query::est_result_size_var) + .def("is_complete", [](const Query &query) { return query.query_status() == Query::Status::COMPLETE; diff --git a/tiledb/core.cc b/tiledb/core.cc index 2da17792f7..897ac35188 100644 --- a/tiledb/core.cc +++ b/tiledb/core.cc @@ -314,7 +314,7 @@ class PyQuery { tiledb_layout_t layout_ = TILEDB_ROW_MAJOR; // label buffer list - std::vector> label_input_buffer_data_; + std::vector> label_input_buffer_data_; py::object pyschema_; @@ -641,34 +641,27 @@ 
class PyQuery { } #if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15 - void alloc_label_buffer(std::string &label_name, uint64_t ncells) { + void alloc_label_buffer(std::string &label_name, uint64_t ncells, uint64_t var_size) { auto dim_label = ArraySchemaExperimental::dimension_label( ctx_, *array_schema_, label_name); - std::cout << "label_name = " << label_name << std::endl; - std::cout << "\tncells = " << ncells << std::endl; tiledb_datatype_t type = dim_label.label_type(); uint32_t cell_val_num = dim_label.label_cell_val_num(); uint64_t cell_nbytes = tiledb_datatype_size(type); - std::cout << "\tcell_nbytes = " << cell_nbytes << std::endl; bool var = cell_val_num == TILEDB_VAR_NUM; bool nullable = false; uint64_t buf_nbytes = 0; + uint64_t offsets_num = 0; + uint64_t validity_num = 0; if (!var) { - std::cout << "\tcell_val_num = " << cell_val_num << std::endl; cell_nbytes *= cell_val_num; - std::cout << "\tcell_nbytes *= cell_val_num = " << cell_nbytes << std::endl; buf_nbytes = ncells * cell_nbytes; - std::cout << "\tbuf_nbytes = ncells * cell_nbytes = " << buf_nbytes << std::endl; } else { - // TODO: I think we still need est_result_size here. - // + Given range ['a', 'ddd'], I don't see another way to calculate the label data size between 'a' and 'ddd'. - buf_nbytes = 9; // Full label data for this hard-coded example is ['a', 'bb', 'ccc', 'ddd'] + buf_nbytes = var_size; + offsets_num = ncells; } - uint64_t offsets_num = var ? 
ncells : 0; - uint64_t validity_num = 0; buffers_order_.push_back(label_name); buffers_.insert( @@ -676,14 +669,14 @@ class PyQuery { offsets_num, validity_num, var, nullable)}); } #else - void alloc_label_buffer(std::string &, uint64_t) { + void alloc_label_buffer(std::string &, uint64_t, uint64_t) { throw TileDBError( "Using dimension labels requires libtiledb version 2.15.0 or greater"); } #endif - void add_label_buffer(std::string &label_name, uint64_t ncells) { - label_input_buffer_data_.push_back({label_name, ncells}); + void add_label_buffer(std::string &label_name, uint64_t ncells, uint64_t var_size) { + label_input_buffer_data_.push_back({label_name, ncells, var_size}); } py::object get_buffers() { @@ -965,7 +958,7 @@ class PyQuery { // allocate buffers for label dimensions for (auto &label_data : label_input_buffer_data_) { - alloc_label_buffer(label_data.first, label_data.second); + alloc_label_buffer(std::get<0>(label_data), std::get<1>(label_data), std::get<2>(label_data)); } // allocate buffers for attributes diff --git a/tiledb/dimension_label_schema.py b/tiledb/dimension_label_schema.py index fbb1cb964b..a93fa73dc9 100644 --- a/tiledb/dimension_label_schema.py +++ b/tiledb/dimension_label_schema.py @@ -35,9 +35,9 @@ def __init__( # Get DataType and DataOrder objects _label_order = DataOrder[order] _label_dtype = DataType.from_numpy(label_dtype) - # TOOD: Fix from_numpy for np.bytes_ (?) - if _label_dtype.tiledb_type == lt.DataType.CHAR: + if np.issubdtype(label_dtype, np.bytes_): _label_dtype = DataType(np.bytes_, lt.DataType.STRING_ASCII, lt.TILEDB_VAR_NUM) + _dim_dtype = DataType.from_numpy(dim_dtype) # Convert the tile extent (if set) diff --git a/tiledb/multirange_indexing.py b/tiledb/multirange_indexing.py index e0b7a66d60..b9f650e6b7 100644 --- a/tiledb/multirange_indexing.py +++ b/tiledb/multirange_indexing.py @@ -519,6 +519,12 @@ def _run_query(self) -> Dict[str, np.ndarray]: # query and update the pyquery with the actual dimensions. 
if self.label_query is not None and not self.label_query.is_complete(): self.label_query.submit() + + est_var_size = [0, 0] + for label_name in self._labels.values(): + if self.array.schema.dim_label(label_name).isvar: + est_var_size = self.label_query.est_result_size_var(label_name) + if not self.label_query.is_complete(): raise TileDBError("failed to get dimension ranges from labels") label_subarray = self.label_query.subarray() @@ -537,7 +543,7 @@ def _run_query(self) -> Dict[str, np.ndarray]: for dim_idx, label_name in self._labels.items(): if self.result_shape is None: raise TileDBError("failed to compute subarray shape") - self.pyquery.add_label_buffer(label_name, self.result_shape[dim_idx]) + self.pyquery.add_label_buffer(label_name, self.result_shape[dim_idx], est_var_size[1]) return super()._run_query() @@ -631,7 +637,9 @@ def _get_pyquery_results( if schema.has_dim_label(name): if schema.dim_label(name).isvar: # arr.dtype = np.uint8 # TODO: Revert all changes here. This is just hard-coded for POC. 
- arr = pyquery.unpack_buffer(name, item[0], [0, 1, 3, 6]) + # arr = pyquery.unpack_buffer(name, item[0], [0, 1]) # 'bb' + # arr = pyquery.unpack_buffer(name, item[0], [0, 1, 3]) # 'ccc' + arr = pyquery.unpack_buffer(name, item[0], [0, 1, 3, 6]) # 'ddd' else: arr.dtype = schema.dim_label(name).dtype else: From 33fd00aa2cf8e22c8e3f28a9c69058482dd22fea Mon Sep 17 00:00:00 2001 From: Shaun Reed Date: Fri, 23 Jun 2023 15:30:49 -0400 Subject: [PATCH 04/25] Merge alloc_buffer and alloc_label_buffer --- .../quickstart_dimension_labels_string.py | 4 +- tiledb/core.cc | 132 ++++++++---------- 2 files changed, 61 insertions(+), 75 deletions(-) diff --git a/examples/quickstart_dimension_labels_string.py b/examples/quickstart_dimension_labels_string.py index 9a2c3df2ab..c361ce73ac 100644 --- a/examples/quickstart_dimension_labels_string.py +++ b/examples/quickstart_dimension_labels_string.py @@ -136,13 +136,13 @@ def read_array(uri: str): """Read the array from the dimension label""" with tiledb.open(uri, "r") as array: - data1 = array.label_index(["l2"])[1, 1:2] + data1 = array.label_index(["l2"])[1, -1:1] print("Reading array on [[1, -1:1]] with label 'l2' on dim2") for name, value in data1.items(): print(f" '{name}'={value}") data2 = array.label_index(["l1", "l2"])["a":"ddd", -2:2] - print("Reading array on [[4:5, -2:2]] with label 'l1' on dim1 and 'l2' on dim2") + print("Reading array on [['a':'ddd', -2:2]] with label 'l1' on dim1 and 'l2' on dim2") for name, value in data2.items(): print(f" '{name}'={value}") diff --git a/tiledb/core.cc b/tiledb/core.cc index 897ac35188..1c12e0a88a 100644 --- a/tiledb/core.cc +++ b/tiledb/core.cc @@ -314,7 +314,7 @@ class PyQuery { tiledb_layout_t layout_ = TILEDB_ROW_MAJOR; // label buffer list - std::vector> label_input_buffer_data_; + std::unordered_map> label_input_buffer_data_; py::object pyschema_; @@ -564,51 +564,72 @@ class PyQuery { } void alloc_buffer(std::string name) { - tiledb_datatype_t type; uint32_t cell_val_num; - 
std::tie(type, cell_val_num) = buffer_type(name); - uint64_t cell_nbytes = tiledb_datatype_size(type); - if (cell_val_num != TILEDB_VAR_NUM) - cell_nbytes *= cell_val_num; - auto dtype = tiledb_dtype(type, cell_val_num); - + uint64_t cell_nbytes; + bool var; + bool nullable; uint64_t buf_nbytes = 0; uint64_t offsets_num = 0; uint64_t validity_num = 0; - - bool var = is_var(name); - bool nullable = is_nullable(name); bool dense = array_schema_->array_type() == TILEDB_DENSE; + if (is_dimension_label(name)) { +#if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15 + auto dim_label = ArraySchemaExperimental::dimension_label(ctx_, *array_schema_, name); + type = dim_label.label_type(); + cell_val_num = dim_label.label_cell_val_num(); + var = cell_val_num == TILEDB_VAR_NUM; + nullable = false; - if (retries_ < 1 && dense) { - // we must not call after submitting - if (nullable && var) { - auto sizes = query_->est_result_size_var_nullable(name); - offsets_num = sizes[0]; - buf_nbytes = sizes[1]; - validity_num = sizes[2] / sizeof(uint8_t); - } else if (nullable && !var) { - auto sizes = query_->est_result_size_nullable(name); - buf_nbytes = sizes[0]; - validity_num = sizes[1] / sizeof(uint8_t); - } else if (!nullable && var) { - auto size_pair = query_->est_result_size_var(name); + cell_nbytes = tiledb_datatype_size(type); + uint64_t ncells = label_input_buffer_data_[name].first; + + if (!var) { + cell_nbytes *= cell_val_num; + buf_nbytes = ncells * cell_nbytes; + } else { + buf_nbytes = label_input_buffer_data_[name].second; + offsets_num = ncells; + } +#endif + } else { + std::tie(type, cell_val_num) = buffer_type(name); + cell_nbytes = tiledb_datatype_size(type); + if (cell_val_num != TILEDB_VAR_NUM) { + cell_nbytes *= cell_val_num; + } + var = is_var(name); + nullable = is_nullable(name); + + if (retries_ < 1 && dense) { + // we must not call after submitting + if (nullable && var) { + auto sizes = query_->est_result_size_var_nullable(name); + offsets_num = 
sizes[0]; + buf_nbytes = sizes[1]; + validity_num = sizes[2] / sizeof(uint8_t); + } else if (nullable && !var) { + auto sizes = query_->est_result_size_nullable(name); + buf_nbytes = sizes[0]; + validity_num = sizes[1] / sizeof(uint8_t); + } else if (!nullable && var) { + auto size_pair = query_->est_result_size_var(name); #if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR < 2 - buf_nbytes = size_pair.first; - offsets_num = size_pair.second; + buf_nbytes = size_pair.first; + offsets_num = size_pair.second; #else - buf_nbytes = size_pair[0]; - offsets_num = size_pair[1]; + buf_nbytes = size_pair[0]; + offsets_num = size_pair[1]; #endif - } else { // !nullable && !var - buf_nbytes = query_->est_result_size(name); - } + } else { // !nullable && !var + buf_nbytes = query_->est_result_size(name); + } - // Add extra offset to estimate in order to avoid incomplete resubmit - // libtiledb 2.7.* does not include extra element in estimate. - // Remove this section after resolution of SC-16301. - offsets_num += (var && use_arrow_) ? 1 : 0; + // Add extra offset to estimate in order to avoid incomplete resubmit + // libtiledb 2.7.* does not include extra element in estimate. + // Remove this section after resolution of SC-16301. + offsets_num += (var && use_arrow_) ? 
1 : 0; + } } // - for sparse arrays: don't try to allocate more than alloc_max_bytes_ @@ -640,43 +661,8 @@ class PyQuery { validity_num, var, nullable)}); } -#if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15 - void alloc_label_buffer(std::string &label_name, uint64_t ncells, uint64_t var_size) { - auto dim_label = ArraySchemaExperimental::dimension_label( - ctx_, *array_schema_, label_name); - - tiledb_datatype_t type = dim_label.label_type(); - uint32_t cell_val_num = dim_label.label_cell_val_num(); - uint64_t cell_nbytes = tiledb_datatype_size(type); - bool var = cell_val_num == TILEDB_VAR_NUM; - bool nullable = false; - uint64_t buf_nbytes = 0; - uint64_t offsets_num = 0; - uint64_t validity_num = 0; - - if (!var) { - cell_nbytes *= cell_val_num; - buf_nbytes = ncells * cell_nbytes; - } else { - buf_nbytes = var_size; - offsets_num = ncells; - } - - - buffers_order_.push_back(label_name); - buffers_.insert( - {label_name, BufferInfo(label_name, buf_nbytes, type, cell_val_num, - offsets_num, validity_num, var, nullable)}); - } -#else - void alloc_label_buffer(std::string &, uint64_t, uint64_t) { - throw TileDBError( - "Using dimension labels requires libtiledb version 2.15.0 or greater"); - } -#endif - void add_label_buffer(std::string &label_name, uint64_t ncells, uint64_t var_size) { - label_input_buffer_data_.push_back({label_name, ncells, var_size}); + label_input_buffer_data_[label_name] = {ncells, var_size}; } py::object get_buffers() { @@ -957,8 +943,8 @@ class PyQuery { } // allocate buffers for label dimensions - for (auto &label_data : label_input_buffer_data_) { - alloc_label_buffer(std::get<0>(label_data), std::get<1>(label_data), std::get<2>(label_data)); + for (const auto &label_data : label_input_buffer_data_) { + alloc_buffer(label_data.first); } // allocate buffers for attributes From 187f7ed71374ccc6079aa9778e316a8d87cee19f Mon Sep 17 00:00:00 2001 From: Shaun Reed Date: Mon, 26 Jun 2023 15:49:00 -0400 Subject: [PATCH 05/25] Fix 
missing offset buffer data + Bug was in core Query::result_buffer_elements_nullable --- .../quickstart_dimension_labels_string.py | 4 ++-- tiledb/multirange_indexing.py | 20 ++++++------------- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/examples/quickstart_dimension_labels_string.py b/examples/quickstart_dimension_labels_string.py index c361ce73ac..b75b053dd5 100644 --- a/examples/quickstart_dimension_labels_string.py +++ b/examples/quickstart_dimension_labels_string.py @@ -141,8 +141,8 @@ def read_array(uri: str): for name, value in data1.items(): print(f" '{name}'={value}") - data2 = array.label_index(["l1", "l2"])["a":"ddd", -2:2] - print("Reading array on [['a':'ddd', -2:2]] with label 'l1' on dim1 and 'l2' on dim2") + data2 = array.label_index(["l1", "l2"])["a":"ccc", -2:2] + print("Reading array on [['a':'ccc', -2:2]] with label 'l1' on dim1 and 'l2' on dim2") for name, value in data2.items(): print(f" '{name}'={value}") diff --git a/tiledb/multirange_indexing.py b/tiledb/multirange_indexing.py index b9f650e6b7..b0378dafad 100644 --- a/tiledb/multirange_indexing.py +++ b/tiledb/multirange_indexing.py @@ -626,24 +626,16 @@ def _get_pyquery_results( pyquery: PyQuery, schema: ArraySchema ) -> Dict[str, np.ndarray]: result_dict = OrderedDict() - res = pyquery.results() - # TODO: There are no offsets at item[1] for the label result buffer, resulting in exception from numpy in else case. - # + Var size labels should have len(item[1]) > 0; We should not hit the else case below. - for name, item in res.items(): + for name, item in pyquery.results().items(): if len(item[1]) > 0: arr = pyquery.unpack_buffer(name, item[0], item[1]) else: arr = item[0] - if schema.has_dim_label(name): - if schema.dim_label(name).isvar: - # arr.dtype = np.uint8 # TODO: Revert all changes here. This is just hard-coded for POC. 
- # arr = pyquery.unpack_buffer(name, item[0], [0, 1]) # 'bb' - # arr = pyquery.unpack_buffer(name, item[0], [0, 1, 3]) # 'ccc' - arr = pyquery.unpack_buffer(name, item[0], [0, 1, 3, 6]) # 'ddd' - else: - arr.dtype = schema.dim_label(name).dtype - else: - arr.dtype = schema.attr_or_dim_dtype(name) + arr.dtype = ( + schema.attr_or_dim_dtype(name) + if not schema.has_dim_label(name) + else schema.dim_label(name).dtype + ) result_dict[name if name != "__attr" else ""] = arr return result_dict From 92dbe408155c8221cacea0f426bc14d78ae00d08 Mon Sep 17 00:00:00 2001 From: Shaun Reed Date: Tue, 27 Jun 2023 11:42:09 -0400 Subject: [PATCH 06/25] Add UTs --- tiledb/dimension_label_schema.py | 8 ++- tiledb/tests/test_dimension_label.py | 102 +++++++++++++++++++++++++-- 2 files changed, 100 insertions(+), 10 deletions(-) diff --git a/tiledb/dimension_label_schema.py b/tiledb/dimension_label_schema.py index a93fa73dc9..dc1365de32 100644 --- a/tiledb/dimension_label_schema.py +++ b/tiledb/dimension_label_schema.py @@ -34,9 +34,11 @@ def __init__( # Get DataType and DataOrder objects _label_order = DataOrder[order] - _label_dtype = DataType.from_numpy(label_dtype) - if np.issubdtype(label_dtype, np.bytes_): - _label_dtype = DataType(np.bytes_, lt.DataType.STRING_ASCII, lt.TILEDB_VAR_NUM) + _label_dtype = DataType.from_numpy( + np.dtype(label_dtype) if label_dtype not in ("ascii", "blob") else label_dtype) + np_dtype = _label_dtype.np_dtype + if np.issubdtype(np_dtype, np.bytes_) or np.issubdtype(np_dtype, np.str_): + _label_dtype = DataType(np_dtype, lt.DataType.STRING_ASCII, lt.TILEDB_VAR_NUM) _dim_dtype = DataType.from_numpy(dim_dtype) diff --git a/tiledb/tests/test_dimension_label.py b/tiledb/tests/test_dimension_label.py index 6edc346597..ac99caca4b 100644 --- a/tiledb/tests/test_dimension_label.py +++ b/tiledb/tests/test_dimension_label.py @@ -148,12 +148,15 @@ def test_add_to_array_schema_dim_dtype_mismatch(self): tiledb.libtiledb.version()[0] == 2 and 
tiledb.libtiledb.version()[1] < 15, reason="dimension labels requires libtiledb version 2.15 or greater", ) - def test_dimension_label_round_trip_dense_array(self): + @pytest.mark.parametrize("var", [True, False]) + def test_dimension_label_round_trip_dense_array(self, var): # Create array schema with dimension labels dim = tiledb.Dim("d1", domain=(1, 10)) dom = tiledb.Domain(dim) att = tiledb.Attr("a1", dtype=np.int64) dim_labels = {0: {"l1": dim.create_label_schema("increasing", np.int64)}} + if var: + dim_labels = {0: {"l1": dim.create_label_schema("increasing", np.bytes_)}} schema = tiledb.ArraySchema(domain=dom, attrs=(att,), dim_labels=dim_labels) # Create array @@ -163,6 +166,8 @@ def test_dimension_label_round_trip_dense_array(self): # Write data to the array and the label attr_data = np.arange(1, 11) label_data = np.arange(-9, 10, 2) + if var: + label_data = np.array([str(chr(ord('a') + c) * (10 - c)).encode("utf-8") for c in range(10)]) with tiledb.open(uri, "w") as array: array[:] = {"a1": attr_data, "l1": label_data} @@ -181,7 +186,8 @@ def test_dimension_label_round_trip_dense_array(self): indexer = array.label_index(["l1"]) # Read full array - result = indexer[-100:100] + result = indexer[label_data[0]:label_data[-1]] + np.testing.assert_array_equal(result["a1"], attr_data) np.testing.assert_array_equal(result["l1"], label_data) @@ -192,11 +198,18 @@ def test_dimension_label_round_trip_dense_array(self): assert result["a1"][0] == attr_data[index] assert result["l1"][0] == label_index + for index in range(10): + label_index = label_data[index:] + result = indexer[label_index[0]:label_index[-1]] + np.testing.assert_array_equal(result["a1"], attr_data[index:]) + np.testing.assert_array_equal(result["l1"], label_index) + @pytest.mark.skipif( tiledb.libtiledb.version()[0] == 2 and tiledb.libtiledb.version()[1] < 15, reason="dimension labels requires libtiledb version 2.15 or greater", ) - def test_dimension_label_round_trip_multidim_dense_array(self): + 
@pytest.mark.parametrize("var", [True, False]) + def test_dimension_label_round_trip_multidim_dense_array(self, var): # Create array schema with dimension labels dim1 = tiledb.Dim("x_index", domain=(1, 8)) dim2 = tiledb.Dim("y_index", domain=(1, 8)) @@ -204,7 +217,7 @@ def test_dimension_label_round_trip_multidim_dense_array(self): att = tiledb.Attr("value", dtype=np.int64) dim_labels = { 0: { - "x1": dim1.create_label_schema("increasing", np.float64), + "x1": dim1.create_label_schema("increasing", np.float64 if not var else "U"), "x2": dim1.create_label_schema("decreasing", np.int64), }, 1: { @@ -220,6 +233,8 @@ def test_dimension_label_round_trip_multidim_dense_array(self): # Write data to the array and the label attr_data = np.reshape(np.arange(1, 65), (8, 8)) x1_data = np.linspace(-1.0, 1.0, 8) + if var: + x1_data = np.array([str(chr(ord('a') + c - 1) * c).encode('utf-8') for c in range(1, 9)]) x2_data = np.arange(8, 0, -1) y1_data = np.arange(9, 17) with tiledb.open(uri, "w") as array: @@ -233,7 +248,7 @@ def test_dimension_label_round_trip_multidim_dense_array(self): # Test querying by label with tiledb.open(uri, "r") as array: # Read full array: labels on both ranges - result = array.label_index(["x1", "y1"])[-1.0:1.0, 9:17] + result = array.label_index(["x1", "y1"])[x1_data[0]:x1_data[-1], 9:17] np.testing.assert_array_equal(result["value"], attr_data) np.testing.assert_array_equal(result["x1"], x1_data) np.testing.assert_array_equal(result["y1"], y1_data) @@ -261,12 +276,13 @@ def test_dimension_label_round_trip_multidim_dense_array(self): tiledb.libtiledb.version()[0] == 2 and tiledb.libtiledb.version()[1] < 15, reason="dimension labels requires libtiledb version 2.15 or greater", ) - def test_dimension_label_round_trip_sparse_array(self): + @pytest.mark.parametrize("var", [True, False]) + def test_dimension_label_round_trip_sparse_array(self, var): # Create array schema with dimension labels dim = tiledb.Dim("index", domain=(1, 10)) dom = 
tiledb.Domain(dim) att = tiledb.Attr("value", dtype=np.int64) - dim_labels = {0: {"l1": dim.create_label_schema("increasing", np.int64)}} + dim_labels = {0: {"l1": dim.create_label_schema("increasing", np.int64 if not var else "ascii")}} schema = tiledb.ArraySchema( domain=dom, attrs=(att,), dim_labels=dim_labels, sparse=True ) @@ -279,6 +295,8 @@ def test_dimension_label_round_trip_sparse_array(self): index_data = np.arange(1, 11) attr_data = np.arange(11, 21) label_data = np.arange(-10, 0) + if var: + label_data = np.array([str(chr(ord('a') + c) * (10 - c)).encode('utf-8') for c in range(10)]) with tiledb.open(uri, "w") as array: array[index_data] = {"value": attr_data, "l1": label_data} @@ -290,3 +308,73 @@ def test_dimension_label_round_trip_sparse_array(self): with tiledb.open(dim_label.uri, "r") as label1: output_label_data = label1[:][dim_label.label_attr_name] np.testing.assert_array_equal(output_label_data, label_data) + + @pytest.mark.skipif( + tiledb.libtiledb.version()[0] == 2 and tiledb.libtiledb.version()[1] < 15, + reason="dimension labels requires libtiledb version 2.15 or greater", + ) + def test_dimension_label_round_trip_dense_var(self): + # Create array schema with dimension labels + dims = [ + tiledb.Dim("d1", domain=(1, 10), dtype=np.int64), + tiledb.Dim("d2", domain=(1, 10), dtype=np.int64), + ] + dom = tiledb.Domain(*dims) + att = tiledb.Attr("value", var=True, dtype="S") + dim_labels = { + 0: { + "l1": dims[0].create_label_schema("increasing", np.float32), + }, + 1: { + "l2": dims[1].create_label_schema("decreasing", np.int32), + "l3": dims[1].create_label_schema("increasing", np.bytes_), + }, + } + + schema = tiledb.ArraySchema( + domain=dom, attrs=(att,), dim_labels=dim_labels, sparse=False + ) + + # Create array + uri = self.path("dense_array_with_var_label2") + tiledb.Array.create(uri, schema) + + # Write data to the array and the label + attr_data = np.array( + [[str(chr(ord('z') - c) * (10 - c)).encode('utf-8') for c in range(10)] for 
i in range(10)]) + l1_data = np.arange(10, dtype=np.float32) + l2_data = np.arange(10, 0, -1, dtype=np.int32) + l3_data = np.array([str(chr(ord('a') + c) * (c + 1)).encode('utf-8') for c in range(10)]) + + with tiledb.open(uri, "w") as array: + array[:, :] = {"value": attr_data, "l1": l1_data, "l2": l2_data, "l3": l3_data} + + # Load the array schema and get the URI of the dimension label + schema = tiledb.ArraySchema.load(uri) + for label_name, label_data in {"l1": l1_data, "l2": l2_data, "l3": l3_data}.items(): + dim_label = schema.dim_label(label_name) + # Read and check the data directly from the dimension label + with tiledb.open(dim_label.uri, "r") as label: + output_label_data = label[:][dim_label.label_attr_name] + np.testing.assert_array_equal(output_label_data, label_data) + + with tiledb.open(uri, "r") as array: + indexer = array.label_index([label_name]) + lower = min(label_data[0], label_data[-1]) + upper = max(label_data[0], label_data[-1]) + if label_name == "l1": + all_data = indexer[lower:upper] + else: + all_data = indexer[:, lower:upper] + np.testing.assert_array_equal(all_data[label_name], label_data) + np.testing.assert_array_equal(all_data["value"], attr_data) + + # Slice array with varying sizes. + for index in range(10): + label_index = label_data[index:] + if label_name == "l1": + result = indexer[lower:upper] + else: + result = indexer[:, lower:upper] + np.testing.assert_array_equal(result["value"][index:], attr_data[index:]) + np.testing.assert_array_equal(result[label_name][index:], label_index) From f1af354bb771ccdb11a6504608d20f043013ba3a Mon Sep 17 00:00:00 2001 From: Shaun Reed Date: Tue, 27 Jun 2023 11:43:33 -0400 Subject: [PATCH 07/25] Revert "Changes to examples for testing" This reverts commit 32df5f37ac6a1fed520c86d66bf6ae1fb19bff81. 
--- examples/quickstart_dimension_labels.py | 8 +- .../quickstart_dimension_labels_string.py | 178 ------------------ examples/string_float_int_dimensions.py | 3 - 3 files changed, 4 insertions(+), 185 deletions(-) delete mode 100644 examples/quickstart_dimension_labels_string.py diff --git a/examples/quickstart_dimension_labels.py b/examples/quickstart_dimension_labels.py index 998223799f..4fade0bde9 100644 --- a/examples/quickstart_dimension_labels.py +++ b/examples/quickstart_dimension_labels.py @@ -104,9 +104,9 @@ def read_array(uri: str): else: # Only create and write to the array if it doesn't already exist. - if tiledb.object_type(ARRAY_NAME) == "array": - tiledb.Array.delete_array(ARRAY_NAME) - create_array(ARRAY_NAME) - write_array(ARRAY_NAME) + if tiledb.object_type(ARRAY_NAME) != "array": + create_array(ARRAY_NAME) + write_array(ARRAY_NAME) + # Read from the array and print output. read_array(ARRAY_NAME) diff --git a/examples/quickstart_dimension_labels_string.py b/examples/quickstart_dimension_labels_string.py deleted file mode 100644 index b75b053dd5..0000000000 --- a/examples/quickstart_dimension_labels_string.py +++ /dev/null @@ -1,178 +0,0 @@ -# quickstart_dense.py -# -# LICENSE -# -# The MIT License -# -# Copyright (c) 2023 TileDB, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. 
-# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. -# -# DESCRIPTION -# -# Please refer to the TileDB and TileDB-Py documentation for more information: -# https://docs.tiledb.com/main/how-to -# https://tiledb-inc-tiledb.readthedocs-hosted.com/projects/tiledb-py/en/stable/python-api.html -# -# When run, this program will create a simple 1D dense array with a dimension label, write -# some data to it, and read a slice of the data back. -# - -import numpy as np -import tiledb - -def create_array(uri: str): - """Create array schema with dimension labels""" - dim1 = tiledb.Dim("d1", domain=(1, 4), dtype=np.int32) - dim2 = tiledb.Dim("d2", domain=(1, 5), dtype=np.int32) - # TODO: Test label query with N var-size labels. - dim_labels = { - 0: {"l1": dim1.create_label_schema("increasing", np.bytes_)}, - 1: { - "l2": dim2.create_label_schema("increasing", np.int64), - "l3": dim2.create_label_schema("increasing", np.float64), - }, - } - dom = tiledb.Domain(dim1, dim2) - - # Var-sized attributes seems to work without any issues. 
- att1 = tiledb.Attr("a1", var=True, dtype=np.bytes_) - att2 = tiledb.Attr("a2", var=True, dtype=np.int64) - schema = tiledb.ArraySchema(sparse=False, domain=dom, attrs=(att1, att2), dim_labels=dim_labels) - tiledb.Array.create(uri, schema) - - -def write_array(uri: str): - """Write attribute and label data to the array""" - a1_data = np.array( - [ - "a", - "bb", - "ccc", - "dddd", - "eeeee", - "a", - "bb", - "ccc", - "d", - "eeeeeeeeee", - "a", - "bb", - "ccc", - "d", - "eeeeeeeeee", - "a", - "bb", - "ccc", - "d", - "eeeeeeeeee", - # "a", - # "bb", - # "ccc", - # "d", - # "eeeeeeeeee", - ] - ).reshape(4, 5) - - a2_data = np.array( - [ - np.repeat(1, 1).astype(np.int64), - np.repeat(2, 2).astype(np.int64), - np.repeat(3, 3).astype(np.int64), - np.repeat(4, 4).astype(np.int64), - np.repeat(5, 5).astype(np.int64), - - np.repeat(1, 5).astype(np.int64), - np.repeat(2, 4).astype(np.int64), - np.repeat(3, 3).astype(np.int64), - np.repeat(4, 2).astype(np.int64), - np.repeat(5, 1).astype(np.int64), - - np.repeat(1, 1).astype(np.int64), - np.repeat(2, 2).astype(np.int64), - np.repeat(3, 3).astype(np.int64), - np.repeat(4, 4).astype(np.int64), - np.repeat(5, 5).astype(np.int64), - - np.repeat(1, 5).astype(np.int64), - np.repeat(2, 4).astype(np.int64), - np.repeat(3, 1).astype(np.int64), - np.repeat(4, 2).astype(np.int64), - np.repeat(5, 3).astype(np.int64), - - # np.repeat(1, 1).astype(np.int64), - # np.repeat(2, 1).astype(np.int64), - # np.repeat(3, 5).astype(np.int64), - # np.repeat(4, 5).astype(np.int64), - # np.repeat(5, 10).astype(np.int64), - ], dtype=object - ).reshape(4, 5) - - # l1_data = np.array(["a", "bb", "ccc", "dddd", "eeeee"]) - l1_data = np.array(["a", "bb", "ccc", "ddd"]) - l2_data = np.arange(-2, 3) - l3_data = np.linspace(-1.0, 1.0, 5) - with tiledb.open(uri, "w") as array: - array[:] = {"a1": a1_data, "a2": a2_data, "l1": l1_data, "l2": l2_data, "l3": l3_data} - - -def read_array(uri: str): - """Read the array from the dimension label""" - - with 
tiledb.open(uri, "r") as array: - data1 = array.label_index(["l2"])[1, -1:1] - print("Reading array on [[1, -1:1]] with label 'l2' on dim2") - for name, value in data1.items(): - print(f" '{name}'={value}") - - data2 = array.label_index(["l1", "l2"])["a":"ccc", -2:2] - print("Reading array on [['a':'ccc', -2:2]] with label 'l1' on dim1 and 'l2' on dim2") - for name, value in data2.items(): - print(f" '{name}'={value}") - - # Should read all data - print("Reading array on [['a':'ddd']] with label 'l1' on dim1") - data3 = array.label_index(["l1"])["a":"ddd"] - for name, value in data3.items(): - print(f" '{name}'={value}") - - -if __name__ == "__main__": - # Name of the array to create. - ARRAY_NAME = "/home/shaun/Documents/Arrays/quickstart_labels_string_py" - conf = tiledb.Config({ - "sm.io_concurrency_level": "1", - "sm.compute_concurrency_level": "1", - }) - tiledb.default_ctx(conf) - - LIBVERSION = tiledb.libtiledb.version() - vfs = tiledb.VFS() - - if LIBVERSION[0] == 2 and LIBVERSION[1] < 15: - print( - f"Dimension labels requires libtiledb version >= 2.15.0. 
Current version is" - f" {LIBVERSION[0]}.{LIBVERSION[1]}.{LIBVERSION[2]}" - ) - else: - if vfs.is_dir(ARRAY_NAME): - vfs.remove_dir(ARRAY_NAME) - create_array(ARRAY_NAME) - write_array(ARRAY_NAME) - read_array(ARRAY_NAME) diff --git a/examples/string_float_int_dimensions.py b/examples/string_float_int_dimensions.py index edda458cc6..d1fb03b95d 100644 --- a/examples/string_float_int_dimensions.py +++ b/examples/string_float_int_dimensions.py @@ -39,9 +39,6 @@ import tiledb path = "sparse_mixed_demo" -vfs = tiledb.VFS() -if vfs.is_dir(path): - vfs.remove_dir(path) dom = tiledb.Domain( *[ From 71388f785f3dc47106c473cd012a2c4983359c82 Mon Sep 17 00:00:00 2001 From: Shaun Reed Date: Tue, 27 Jun 2023 11:59:41 -0400 Subject: [PATCH 08/25] Test with CI --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6fc1dda9a7..5dbd9a6a20 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ ### DO NOT USE ON CI # Target branch: Note that this should be set to the current core release, not `dev` -TILEDB_VERSION = "2.15.4" +TILEDB_VERSION = "smr/sc-29317/py-var-size-dim-labels" # allow overriding w/ environment variable TILEDB_VERSION = os.environ.get("TILEDB_VERSION") or TILEDB_VERSION From aef844d66e46e60eb61805871cc9aa6dd14d5f6d Mon Sep 17 00:00:00 2001 From: Shaun Reed Date: Tue, 27 Jun 2023 12:04:20 -0400 Subject: [PATCH 09/25] Format --- tiledb/dimension_label_schema.py | 9 +++- tiledb/multirange_indexing.py | 4 +- tiledb/tests/test_dimension_label.py | 61 +++++++++++++++++++++------- 3 files changed, 57 insertions(+), 17 deletions(-) diff --git a/tiledb/dimension_label_schema.py b/tiledb/dimension_label_schema.py index dc1365de32..f107dcb109 100644 --- a/tiledb/dimension_label_schema.py +++ b/tiledb/dimension_label_schema.py @@ -35,10 +35,15 @@ def __init__( # Get DataType and DataOrder objects _label_order = DataOrder[order] _label_dtype = DataType.from_numpy( - np.dtype(label_dtype) if label_dtype not in ("ascii", "blob") 
else label_dtype) + np.dtype(label_dtype) + if label_dtype not in ("ascii", "blob") + else label_dtype + ) np_dtype = _label_dtype.np_dtype if np.issubdtype(np_dtype, np.bytes_) or np.issubdtype(np_dtype, np.str_): - _label_dtype = DataType(np_dtype, lt.DataType.STRING_ASCII, lt.TILEDB_VAR_NUM) + _label_dtype = DataType( + np_dtype, lt.DataType.STRING_ASCII, lt.TILEDB_VAR_NUM + ) _dim_dtype = DataType.from_numpy(dim_dtype) diff --git a/tiledb/multirange_indexing.py b/tiledb/multirange_indexing.py index b0378dafad..a93bccb513 100644 --- a/tiledb/multirange_indexing.py +++ b/tiledb/multirange_indexing.py @@ -543,7 +543,9 @@ def _run_query(self) -> Dict[str, np.ndarray]: for dim_idx, label_name in self._labels.items(): if self.result_shape is None: raise TileDBError("failed to compute subarray shape") - self.pyquery.add_label_buffer(label_name, self.result_shape[dim_idx], est_var_size[1]) + self.pyquery.add_label_buffer( + label_name, self.result_shape[dim_idx], est_var_size[1] + ) return super()._run_query() diff --git a/tiledb/tests/test_dimension_label.py b/tiledb/tests/test_dimension_label.py index ac99caca4b..a3e77e8515 100644 --- a/tiledb/tests/test_dimension_label.py +++ b/tiledb/tests/test_dimension_label.py @@ -167,7 +167,9 @@ def test_dimension_label_round_trip_dense_array(self, var): attr_data = np.arange(1, 11) label_data = np.arange(-9, 10, 2) if var: - label_data = np.array([str(chr(ord('a') + c) * (10 - c)).encode("utf-8") for c in range(10)]) + label_data = np.array( + [str(chr(ord("a") + c) * (10 - c)).encode("utf-8") for c in range(10)] + ) with tiledb.open(uri, "w") as array: array[:] = {"a1": attr_data, "l1": label_data} @@ -186,7 +188,7 @@ def test_dimension_label_round_trip_dense_array(self, var): indexer = array.label_index(["l1"]) # Read full array - result = indexer[label_data[0]:label_data[-1]] + result = indexer[label_data[0] : label_data[-1]] np.testing.assert_array_equal(result["a1"], attr_data) np.testing.assert_array_equal(result["l1"], 
label_data) @@ -200,7 +202,7 @@ def test_dimension_label_round_trip_dense_array(self, var): for index in range(10): label_index = label_data[index:] - result = indexer[label_index[0]:label_index[-1]] + result = indexer[label_index[0] : label_index[-1]] np.testing.assert_array_equal(result["a1"], attr_data[index:]) np.testing.assert_array_equal(result["l1"], label_index) @@ -217,7 +219,9 @@ def test_dimension_label_round_trip_multidim_dense_array(self, var): att = tiledb.Attr("value", dtype=np.int64) dim_labels = { 0: { - "x1": dim1.create_label_schema("increasing", np.float64 if not var else "U"), + "x1": dim1.create_label_schema( + "increasing", np.float64 if not var else "U" + ), "x2": dim1.create_label_schema("decreasing", np.int64), }, 1: { @@ -234,7 +238,9 @@ def test_dimension_label_round_trip_multidim_dense_array(self, var): attr_data = np.reshape(np.arange(1, 65), (8, 8)) x1_data = np.linspace(-1.0, 1.0, 8) if var: - x1_data = np.array([str(chr(ord('a') + c - 1) * c).encode('utf-8') for c in range(1, 9)]) + x1_data = np.array( + [str(chr(ord("a") + c - 1) * c).encode("utf-8") for c in range(1, 9)] + ) x2_data = np.arange(8, 0, -1) y1_data = np.arange(9, 17) with tiledb.open(uri, "w") as array: @@ -248,7 +254,7 @@ def test_dimension_label_round_trip_multidim_dense_array(self, var): # Test querying by label with tiledb.open(uri, "r") as array: # Read full array: labels on both ranges - result = array.label_index(["x1", "y1"])[x1_data[0]:x1_data[-1], 9:17] + result = array.label_index(["x1", "y1"])[x1_data[0] : x1_data[-1], 9:17] np.testing.assert_array_equal(result["value"], attr_data) np.testing.assert_array_equal(result["x1"], x1_data) np.testing.assert_array_equal(result["y1"], y1_data) @@ -282,7 +288,13 @@ def test_dimension_label_round_trip_sparse_array(self, var): dim = tiledb.Dim("index", domain=(1, 10)) dom = tiledb.Domain(dim) att = tiledb.Attr("value", dtype=np.int64) - dim_labels = {0: {"l1": dim.create_label_schema("increasing", np.int64 if not 
var else "ascii")}} + dim_labels = { + 0: { + "l1": dim.create_label_schema( + "increasing", np.int64 if not var else "ascii" + ) + } + } schema = tiledb.ArraySchema( domain=dom, attrs=(att,), dim_labels=dim_labels, sparse=True ) @@ -296,7 +308,9 @@ def test_dimension_label_round_trip_sparse_array(self, var): attr_data = np.arange(11, 21) label_data = np.arange(-10, 0) if var: - label_data = np.array([str(chr(ord('a') + c) * (10 - c)).encode('utf-8') for c in range(10)]) + label_data = np.array( + [str(chr(ord("a") + c) * (10 - c)).encode("utf-8") for c in range(10)] + ) with tiledb.open(uri, "w") as array: array[index_data] = {"value": attr_data, "l1": label_data} @@ -341,17 +355,32 @@ def test_dimension_label_round_trip_dense_var(self): # Write data to the array and the label attr_data = np.array( - [[str(chr(ord('z') - c) * (10 - c)).encode('utf-8') for c in range(10)] for i in range(10)]) + [ + [str(chr(ord("z") - c) * (10 - c)).encode("utf-8") for c in range(10)] + for i in range(10) + ] + ) l1_data = np.arange(10, dtype=np.float32) l2_data = np.arange(10, 0, -1, dtype=np.int32) - l3_data = np.array([str(chr(ord('a') + c) * (c + 1)).encode('utf-8') for c in range(10)]) + l3_data = np.array( + [str(chr(ord("a") + c) * (c + 1)).encode("utf-8") for c in range(10)] + ) with tiledb.open(uri, "w") as array: - array[:, :] = {"value": attr_data, "l1": l1_data, "l2": l2_data, "l3": l3_data} + array[:, :] = { + "value": attr_data, + "l1": l1_data, + "l2": l2_data, + "l3": l3_data, + } # Load the array schema and get the URI of the dimension label schema = tiledb.ArraySchema.load(uri) - for label_name, label_data in {"l1": l1_data, "l2": l2_data, "l3": l3_data}.items(): + for label_name, label_data in { + "l1": l1_data, + "l2": l2_data, + "l3": l3_data, + }.items(): dim_label = schema.dim_label(label_name) # Read and check the data directly from the dimension label with tiledb.open(dim_label.uri, "r") as label: @@ -376,5 +405,9 @@ def 
test_dimension_label_round_trip_dense_var(self): result = indexer[lower:upper] else: result = indexer[:, lower:upper] - np.testing.assert_array_equal(result["value"][index:], attr_data[index:]) - np.testing.assert_array_equal(result[label_name][index:], label_index) + np.testing.assert_array_equal( + result["value"][index:], attr_data[index:] + ) + np.testing.assert_array_equal( + result[label_name][index:], label_index + ) From c7c5c9f8076e26aadeef75c46cabcd8631a9422a Mon Sep 17 00:00:00 2001 From: Shaun Reed Date: Tue, 27 Jun 2023 12:46:32 -0400 Subject: [PATCH 10/25] Fix UT --- setup.py | 2 +- tiledb/tests/test_dimension_label.py | 15 +++++++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 5dbd9a6a20..d98eadcfae 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ ### DO NOT USE ON CI # Target branch: Note that this should be set to the current core release, not `dev` -TILEDB_VERSION = "smr/sc-29317/py-var-size-dim-labels" +TILEDB_VERSION = "4903c723c194d4ba948cb2958e45f65e8369f287" # TODO: Revert # allow overriding w/ environment variable TILEDB_VERSION = os.environ.get("TILEDB_VERSION") or TILEDB_VERSION diff --git a/tiledb/tests/test_dimension_label.py b/tiledb/tests/test_dimension_label.py index a3e77e8515..06ea4b2732 100644 --- a/tiledb/tests/test_dimension_label.py +++ b/tiledb/tests/test_dimension_label.py @@ -401,13 +401,20 @@ def test_dimension_label_round_trip_dense_var(self): # Slice array with varying sizes. 
for index in range(10): label_index = label_data[index:] + lower = min(label_index[0], label_index[-1]) + upper = max(label_index[0], label_index[-1]) if label_name == "l1": result = indexer[lower:upper] + # Check against dim1 + np.testing.assert_array_equal( + result["value"], attr_data[index:, :] + ) else: result = indexer[:, lower:upper] + # Check against dim2 + np.testing.assert_array_equal( + result["value"], attr_data[:, index:] + ) np.testing.assert_array_equal( - result["value"][index:], attr_data[index:] - ) - np.testing.assert_array_equal( - result[label_name][index:], label_index + result[label_name], label_index ) From 1567ea94109bccbe2da59c2545c6f7cacbf582e6 Mon Sep 17 00:00:00 2001 From: Shaun Reed Date: Tue, 27 Jun 2023 13:18:08 -0400 Subject: [PATCH 11/25] Format --- setup.py | 2 +- tiledb/core.cc | 37 +++++++++++++++++----------- tiledb/tests/test_dimension_label.py | 4 +-- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/setup.py b/setup.py index d98eadcfae..3827fbefd9 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ ### DO NOT USE ON CI # Target branch: Note that this should be set to the current core release, not `dev` -TILEDB_VERSION = "4903c723c194d4ba948cb2958e45f65e8369f287" # TODO: Revert +TILEDB_VERSION = "107803e42eef7be9d9935b178d6a10d67eac14f3" # TODO: Revert # allow overriding w/ environment variable TILEDB_VERSION = os.environ.get("TILEDB_VERSION") or TILEDB_VERSION diff --git a/tiledb/core.cc b/tiledb/core.cc index 1c12e0a88a..6070f7e6f4 100644 --- a/tiledb/core.cc +++ b/tiledb/core.cc @@ -314,7 +314,8 @@ class PyQuery { tiledb_layout_t layout_ = TILEDB_ROW_MAJOR; // label buffer list - std::unordered_map> label_input_buffer_data_; + std::unordered_map> + label_input_buffer_data_; py::object pyschema_; @@ -469,7 +470,8 @@ class PyQuery { bool is_dimension_label(std::string name) { #if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15 - return ArraySchemaExperimental::has_dimension_label(ctx_, 
*array_schema_, name); + return ArraySchemaExperimental::has_dimension_label(ctx_, *array_schema_, + name); #else return false; #endif @@ -484,8 +486,8 @@ class PyQuery { return attr.cell_val_num() == TILEDB_VAR_NUM; #if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15 } else if (is_dimension_label(name)) { - auto dim_label = ArraySchemaExperimental::dimension_label(ctx_, - *array_schema_, name); + auto dim_label = + ArraySchemaExperimental::dimension_label(ctx_, *array_schema_, name); return dim_label.label_cell_val_num() == TILEDB_VAR_NUM; #endif } else { @@ -514,8 +516,8 @@ class PyQuery { cell_val_num = array_schema_->attribute(name).cell_val_num(); #if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15 } else if (is_dimension_label(name)) { - auto dim_label = ArraySchemaExperimental::dimension_label(ctx_, - *array_schema_, name); + auto dim_label = + ArraySchemaExperimental::dimension_label(ctx_, *array_schema_, name); type = dim_label.label_type(); cell_val_num = dim_label.label_cell_val_num(); #endif @@ -575,7 +577,8 @@ class PyQuery { bool dense = array_schema_->array_type() == TILEDB_DENSE; if (is_dimension_label(name)) { #if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15 - auto dim_label = ArraySchemaExperimental::dimension_label(ctx_, *array_schema_, name); + auto dim_label = + ArraySchemaExperimental::dimension_label(ctx_, *array_schema_, name); type = dim_label.label_type(); cell_val_num = dim_label.label_cell_val_num(); var = cell_val_num == TILEDB_VAR_NUM; @@ -661,7 +664,8 @@ class PyQuery { validity_num, var, nullable)}); } - void add_label_buffer(std::string &label_name, uint64_t ncells, uint64_t var_size) { + void add_label_buffer(std::string &label_name, uint64_t ncells, + uint64_t var_size) { label_input_buffer_data_[label_name] = {ncells, var_size}; } @@ -769,17 +773,20 @@ class PyQuery { if ((Py_ssize_t)(buf.data_vals_read * buf.elem_nbytes) > (Py_ssize_t)buf.data.size()) { - throw TileDBError("After read query, data 
buffer out of bounds: " + name + " (" - + std::to_string(buf.data_vals_read * buf.elem_nbytes) + " > " - + std::to_string(buf.data.size()) + ")"); + throw TileDBError( + "After read query, data buffer out of bounds: " + name + " (" + + std::to_string(buf.data_vals_read * buf.elem_nbytes) + " > " + + std::to_string(buf.data.size()) + ")"); } if ((Py_ssize_t)buf.offsets_read > buf.offsets.size()) { - throw TileDBError("After read query, offsets buffer out of bounds: " + name + " (" - + std::to_string(buf.offsets_read) + " > " + std::to_string(buf.offsets.size()) + ")"); + throw TileDBError("After read query, offsets buffer out of bounds: " + + name + " (" + std::to_string(buf.offsets_read) + + " > " + std::to_string(buf.offsets.size()) + ")"); } if ((Py_ssize_t)buf.validity_vals_read > buf.validity.size()) { - throw TileDBError("After read query, validity buffer out of bounds: " + name + " (" - + std::to_string(buf.validity_vals_read) + " > " + std::to_string(buf.validity.size()) + ")"); + throw TileDBError("After read query, validity buffer out of bounds: " + + name + " (" + std::to_string(buf.validity_vals_read) + + " > " + std::to_string(buf.validity.size()) + ")"); } } } diff --git a/tiledb/tests/test_dimension_label.py b/tiledb/tests/test_dimension_label.py index 06ea4b2732..0daeab2e36 100644 --- a/tiledb/tests/test_dimension_label.py +++ b/tiledb/tests/test_dimension_label.py @@ -415,6 +415,4 @@ def test_dimension_label_round_trip_dense_var(self): np.testing.assert_array_equal( result["value"], attr_data[:, index:] ) - np.testing.assert_array_equal( - result[label_name], label_index - ) + np.testing.assert_array_equal(result[label_name], label_index) From 045bd97fed2c33e4b037d605012761afe285960c Mon Sep 17 00:00:00 2001 From: "J.P. 
Dark" <24235303+jp-dark@users.noreply.github.com> Date: Tue, 27 Jun 2023 14:50:18 -0400 Subject: [PATCH 12/25] Add string dimension label example --- examples/string_dimension_labels.py | 92 +++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 examples/string_dimension_labels.py diff --git a/examples/string_dimension_labels.py b/examples/string_dimension_labels.py new file mode 100644 index 0000000000..443896626e --- /dev/null +++ b/examples/string_dimension_labels.py @@ -0,0 +1,92 @@ +# string_dimension_labels.py +# +# LICENSE +# +# The MIT License +# +# Copyright (c) 2023 TileDB, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE.
+# +# DESCRIPTION +# +# Please refer to the TileDB and TileDB-Py documentation for more information: +# https://docs.tiledb.com/main/how-to +# https://tiledb-inc-tiledb.readthedocs-hosted.com/projects/tiledb-py/en/stable/python-api.html +# +# When run, this program will create a simple 2D dense array with a string dimension +# label on the column dimension, and read a slice back with the dimension label. +# + +import numpy as np + +import tiledb + + +def create_array(uri: str): + """Create array schema with a dimension label on the columns""" + dim1 = tiledb.Dim("row", domain=(1, 5)) + dim2 = tiledb.Dim("column", domain=(1, 5)) + dom = tiledb.Domain(dim1, dim2) + att = tiledb.Attr("a1", dtype=np.int64) + dim_labels = {1: {"name": dim2.create_label_schema("increasing", "ascii")}} + schema = tiledb.ArraySchema(domain=dom, attrs=(att,), dim_labels=dim_labels) + tiledb.Array.create(uri, schema) + + +def write_array(uri: str): + """Write attribute and label data to the array""" + a1_data = np.reshape(np.arange(1, 26), (5, 5)) + label_data = np.array(["alpha", "beta", "gamma", "kappa", "omega"]) + with tiledb.open(uri, "w") as array: + array[:, :] = {"a1": a1_data, "name": label_data} + + +def read_array(uri: str): + """Read the array from the dimension label""" + + with tiledb.open(uri, "r") as array: + data = array.label_index(["name"])[1, "beta":"kappa"] + print( + "Reading array on [[1, 'beta':'kappa']] with label 'name' on dimension 'col'" + ) + for name, value in data.items(): + print(f" '{name}'={value}") + + +if __name__ == "__main__": + # Name of the array to create. + ARRAY_NAME = "string_dimension_labels" + + LIBVERSION = tiledb.libtiledb.version() + + if LIBVERSION[0] == 2 and LIBVERSION[1] < 15: + print( + f"Dimension labels requires libtiledb version >= 2.15.0. Current version is" + f" {LIBVERSION[0]}.{LIBVERSION[1]}.{LIBVERSION[2]}" + ) + + else: + # Only create and write to the array if it doesn't already exist.
+ if tiledb.object_type(ARRAY_NAME) != "array": + create_array(ARRAY_NAME) + write_array(ARRAY_NAME) + + # Read from the array and print output. + read_array(ARRAY_NAME) From 8dfea3eac3843a6d2a47bdf5ec4c9a81aa9e6d74 Mon Sep 17 00:00:00 2001 From: Shaun Reed Date: Wed, 28 Jun 2023 13:00:27 -0400 Subject: [PATCH 13/25] Changes to use experimental APIs from core --- setup.py | 2 +- tiledb/cc/query.cc | 15 +++++++++++++++ tiledb/core.cc | 4 +++- tiledb/multirange_indexing.py | 2 +- 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 3827fbefd9..f4a5a7b796 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ ### DO NOT USE ON CI # Target branch: Note that this should be set to the current core release, not `dev` -TILEDB_VERSION = "107803e42eef7be9d9935b178d6a10d67eac14f3" # TODO: Revert +TILEDB_VERSION = "88a73c8a14ebf41ed75b6e60ed86f7e835e5da91" # TODO: Revert # allow overriding w/ environment variable TILEDB_VERSION = os.environ.get("TILEDB_VERSION") or TILEDB_VERSION diff --git a/tiledb/cc/query.cc b/tiledb/cc/query.cc index 9d515dc13c..772fcec392 100644 --- a/tiledb/cc/query.cc +++ b/tiledb/cc/query.cc @@ -1,4 +1,5 @@ #include // C++ +#include #include "common.h" @@ -50,6 +51,20 @@ void init_query(py::module &m) { .def("has_results", &Query::has_results) +#if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 16 + .def("est_result_size_var_label", + [](const Query & query, const std::string& attr_name, bool label_data) { + return QueryExperimental::est_result_size_var_label(query, attr_name, label_data); + }) +#else + .def("est_result_size_var_label", + [](const Query & query, const std::string& attr_name, bool label_data) { + throw TileDBError("Estimate result size for dimension label data queries requires libtiledb version 2.15.0 " + "or greater"); + }) +#endif + + // For dimension labels, experimental variant above adds support to retrieve underlying data query estimates. 
.def("est_result_size_var", &Query::est_result_size_var) .def("is_complete", diff --git a/tiledb/core.cc b/tiledb/core.cc index 6070f7e6f4..4dbd3ee064 100644 --- a/tiledb/core.cc +++ b/tiledb/core.cc @@ -722,7 +722,9 @@ class PyQuery { } void update_read_elem_num() { -#if TILEDB_VERSION_MAJOR >= 2 && TILEDB_VERSION_MINOR >= 3 +#if TILEDB_VERSION_MAJOR >= 2 && TILEDB_VERSION_MINOR >= 16 + auto result_elements = QueryExperimental::result_buffer_elements_nullable(*query_); +#elif TILEDB_VERSION_MAJOR >= 2 && TILEDB_VERSION_MINOR >= 3 // needs https://github.com/TileDB-Inc/TileDB/pull/2238 auto result_elements = query_->result_buffer_elements_nullable(); #else diff --git a/tiledb/multirange_indexing.py b/tiledb/multirange_indexing.py index a93bccb513..83126f0e2f 100644 --- a/tiledb/multirange_indexing.py +++ b/tiledb/multirange_indexing.py @@ -523,7 +523,7 @@ def _run_query(self) -> Dict[str, np.ndarray]: est_var_size = [0, 0] for label_name in self._labels.values(): if self.array.schema.dim_label(label_name).isvar: - est_var_size = self.label_query.est_result_size_var(label_name) + est_var_size = self.label_query.est_result_size_var_label(label_name, False) if not self.label_query.is_complete(): raise TileDBError("failed to get dimension ranges from labels") From 38881e16f9953856459ee6259c28dfda0b49fe05 Mon Sep 17 00:00:00 2001 From: Shaun Reed Date: Wed, 28 Jun 2023 13:08:29 -0400 Subject: [PATCH 14/25] Format --- tiledb/cc/query.cc | 11 +++++++---- tiledb/core.cc | 5 +++-- tiledb/multirange_indexing.py | 4 +++- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tiledb/cc/query.cc b/tiledb/cc/query.cc index 772fcec392..5301a8877b 100644 --- a/tiledb/cc/query.cc +++ b/tiledb/cc/query.cc @@ -53,9 +53,11 @@ void init_query(py::module &m) { #if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 16 .def("est_result_size_var_label", - [](const Query & query, const std::string& attr_name, bool label_data) { - return 
QueryExperimental::est_result_size_var_label(query, attr_name, label_data); - }) + [](const Query &query, const std::string &attr_name, + bool label_data) { + return QueryExperimental::est_result_size_var_label( + query, attr_name, label_data); + }) #else .def("est_result_size_var_label", [](const Query & query, const std::string& attr_name, bool label_data) { @@ -64,7 +66,8 @@ void init_query(py::module &m) { }) #endif - // For dimension labels, experimental variant above adds support to retrieve underlying data query estimates. + // For dimension labels, experimental variant above adds support to + // retrieve underlying data query estimates. .def("est_result_size_var", &Query::est_result_size_var) .def("is_complete", diff --git a/tiledb/core.cc b/tiledb/core.cc index 4dbd3ee064..1d97114a90 100644 --- a/tiledb/core.cc +++ b/tiledb/core.cc @@ -723,7 +723,8 @@ class PyQuery { void update_read_elem_num() { #if TILEDB_VERSION_MAJOR >= 2 && TILEDB_VERSION_MINOR >= 16 - auto result_elements = QueryExperimental::result_buffer_elements_nullable(*query_); + auto result_elements = + QueryExperimental::result_buffer_elements_nullable(*query_); #elif TILEDB_VERSION_MAJOR >= 2 && TILEDB_VERSION_MINOR >= 3 // needs https://github.com/TileDB-Inc/TileDB/pull/2238 auto result_elements = query_->result_buffer_elements_nullable(); @@ -1410,7 +1411,7 @@ class PyQuery { } } -}; // class PyQuery +}; // namespace tiledbpy void init_stats() { g_stats.reset(new StatsInfo()); diff --git a/tiledb/multirange_indexing.py b/tiledb/multirange_indexing.py index 83126f0e2f..6aae75dac9 100644 --- a/tiledb/multirange_indexing.py +++ b/tiledb/multirange_indexing.py @@ -523,7 +523,9 @@ def _run_query(self) -> Dict[str, np.ndarray]: est_var_size = [0, 0] for label_name in self._labels.values(): if self.array.schema.dim_label(label_name).isvar: - est_var_size = self.label_query.est_result_size_var_label(label_name, False) + est_var_size = self.label_query.est_result_size_var_label( + label_name, False 
+ ) if not self.label_query.is_complete(): raise TileDBError("failed to get dimension ranges from labels") From 39b02c1b16a632a76cbe920917b2b54f86bd70d1 Mon Sep 17 00:00:00 2001 From: Shaun Reed Date: Thu, 29 Jun 2023 11:41:42 -0400 Subject: [PATCH 15/25] Remove calls to est_result_size --- setup.py | 2 +- tiledb/cc/query.cc | 19 ------------------- tiledb/core.cc | 13 +++++-------- tiledb/multirange_indexing.py | 11 +---------- 4 files changed, 7 insertions(+), 38 deletions(-) diff --git a/setup.py b/setup.py index f4a5a7b796..4243ba6a69 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ ### DO NOT USE ON CI # Target branch: Note that this should be set to the current core release, not `dev` -TILEDB_VERSION = "88a73c8a14ebf41ed75b6e60ed86f7e835e5da91" # TODO: Revert +TILEDB_VERSION = "c1bf4e0eccf2b2fde72a32cdd240928b7da8a64a" # TODO: Revert # allow overriding w/ environment variable TILEDB_VERSION = os.environ.get("TILEDB_VERSION") or TILEDB_VERSION diff --git a/tiledb/cc/query.cc b/tiledb/cc/query.cc index 5301a8877b..145277c33f 100644 --- a/tiledb/cc/query.cc +++ b/tiledb/cc/query.cc @@ -51,25 +51,6 @@ void init_query(py::module &m) { .def("has_results", &Query::has_results) -#if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 16 - .def("est_result_size_var_label", - [](const Query &query, const std::string &attr_name, - bool label_data) { - return QueryExperimental::est_result_size_var_label( - query, attr_name, label_data); - }) -#else - .def("est_result_size_var_label", - [](const Query & query, const std::string& attr_name, bool label_data) { - throw TileDBError("Estimate result size for dimension label data queries requires libtiledb version 2.15.0 " - "or greater"); - }) -#endif - - // For dimension labels, experimental variant above adds support to - // retrieve underlying data query estimates. 
- .def("est_result_size_var", &Query::est_result_size_var) - .def("is_complete", [](const Query &query) { return query.query_status() == Query::Status::COMPLETE; diff --git a/tiledb/core.cc b/tiledb/core.cc index 1d97114a90..ee440baac0 100644 --- a/tiledb/core.cc +++ b/tiledb/core.cc @@ -314,8 +314,7 @@ class PyQuery { tiledb_layout_t layout_ = TILEDB_ROW_MAJOR; // label buffer list - std::unordered_map> - label_input_buffer_data_; + std::unordered_map label_input_buffer_data_; py::object pyschema_; @@ -585,15 +584,14 @@ class PyQuery { nullable = false; cell_nbytes = tiledb_datatype_size(type); - uint64_t ncells = label_input_buffer_data_[name].first; + uint64_t ncells = label_input_buffer_data_[name]; if (!var) { cell_nbytes *= cell_val_num; - buf_nbytes = ncells * cell_nbytes; } else { - buf_nbytes = label_input_buffer_data_[name].second; offsets_num = ncells; } + buf_nbytes = ncells * cell_nbytes; #endif } else { std::tie(type, cell_val_num) = buffer_type(name); @@ -664,9 +662,8 @@ class PyQuery { validity_num, var, nullable)}); } - void add_label_buffer(std::string &label_name, uint64_t ncells, - uint64_t var_size) { - label_input_buffer_data_[label_name] = {ncells, var_size}; + void add_label_buffer(std::string &label_name, uint64_t ncells) { + label_input_buffer_data_[label_name] = ncells; } py::object get_buffers() { diff --git a/tiledb/multirange_indexing.py b/tiledb/multirange_indexing.py index 6aae75dac9..dc0354d8d3 100644 --- a/tiledb/multirange_indexing.py +++ b/tiledb/multirange_indexing.py @@ -520,13 +520,6 @@ def _run_query(self) -> Dict[str, np.ndarray]: if self.label_query is not None and not self.label_query.is_complete(): self.label_query.submit() - est_var_size = [0, 0] - for label_name in self._labels.values(): - if self.array.schema.dim_label(label_name).isvar: - est_var_size = self.label_query.est_result_size_var_label( - label_name, False - ) - if not self.label_query.is_complete(): raise TileDBError("failed to get dimension ranges from 
labels") label_subarray = self.label_query.subarray() @@ -545,9 +538,7 @@ def _run_query(self) -> Dict[str, np.ndarray]: for dim_idx, label_name in self._labels.items(): if self.result_shape is None: raise TileDBError("failed to compute subarray shape") - self.pyquery.add_label_buffer( - label_name, self.result_shape[dim_idx], est_var_size[1] - ) + self.pyquery.add_label_buffer(label_name, self.result_shape[dim_idx]) return super()._run_query() From bb10cf7aeb1f3ca7078e16405a32a7a36047a531 Mon Sep 17 00:00:00 2001 From: Shaun Reed Date: Thu, 6 Jul 2023 09:59:32 -0400 Subject: [PATCH 16/25] Update to use result_buffer_elements_nullable_labels rename --- setup.py | 2 +- tiledb/core.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 4243ba6a69..1e97a5d140 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ ### DO NOT USE ON CI # Target branch: Note that this should be set to the current core release, not `dev` -TILEDB_VERSION = "c1bf4e0eccf2b2fde72a32cdd240928b7da8a64a" # TODO: Revert +TILEDB_VERSION = "a2ab3c2d29296cb78e51c87888a45b2ed901d8a3" # TODO: Revert # allow overriding w/ environment variable TILEDB_VERSION = os.environ.get("TILEDB_VERSION") or TILEDB_VERSION diff --git a/tiledb/core.cc b/tiledb/core.cc index ee440baac0..5047dbe1b1 100644 --- a/tiledb/core.cc +++ b/tiledb/core.cc @@ -721,7 +721,7 @@ class PyQuery { void update_read_elem_num() { #if TILEDB_VERSION_MAJOR >= 2 && TILEDB_VERSION_MINOR >= 16 auto result_elements = - QueryExperimental::result_buffer_elements_nullable(*query_); + QueryExperimental::result_buffer_elements_nullable_labels(*query_); #elif TILEDB_VERSION_MAJOR >= 2 && TILEDB_VERSION_MINOR >= 3 // needs https://github.com/TileDB-Inc/TileDB/pull/2238 auto result_elements = query_->result_buffer_elements_nullable(); From 6e8b4df11cd5401eccc49aa957bfa4f36059602b Mon Sep 17 00:00:00 2001 From: Isaiah Norton Date: Wed, 12 Jul 2023 15:49:56 -0400 Subject: [PATCH 17/25] Test --- 
tiledb/tests/cc/test_cc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tiledb/tests/cc/test_cc.py b/tiledb/tests/cc/test_cc.py index 20ef3b88ee..366b261552 100644 --- a/tiledb/tests/cc/test_cc.py +++ b/tiledb/tests/cc/test_cc.py @@ -1,5 +1,6 @@ import os import tempfile +import time import numpy as np import pytest @@ -141,6 +142,7 @@ def test_array(): arrw = lt.Array(ctx, uri, lt.QueryType.WRITE) arrw.delete_metadata("key") + time.sleep(0.1) arrw.close() arr = lt.Array(ctx, uri, lt.QueryType.READ) From 6247dbf81ac0894decd407fe0d73a2d5adb29a0a Mon Sep 17 00:00:00 2001 From: Isaiah Norton Date: Wed, 12 Jul 2023 16:57:09 -0400 Subject: [PATCH 18/25] Use 2.16.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1e97a5d140..bd692b0b6e 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ ### DO NOT USE ON CI # Target branch: Note that this should be set to the current core release, not `dev` -TILEDB_VERSION = "a2ab3c2d29296cb78e51c87888a45b2ed901d8a3" # TODO: Revert +TILEDB_VERSION = "2.16.0" # TODO: Revert # allow overriding w/ environment variable TILEDB_VERSION = os.environ.get("TILEDB_VERSION") or TILEDB_VERSION From 862fac06514ebf78a64d3aa5f97b872f833bf9d9 Mon Sep 17 00:00:00 2001 From: Isaiah Norton Date: Thu, 13 Jul 2023 15:43:00 -0400 Subject: [PATCH 19/25] DEBUG --- tiledb/tests/cc/test_cc.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tiledb/tests/cc/test_cc.py b/tiledb/tests/cc/test_cc.py index 366b261552..29c723a80b 100644 --- a/tiledb/tests/cc/test_cc.py +++ b/tiledb/tests/cc/test_cc.py @@ -144,8 +144,15 @@ def test_array(): arrw.delete_metadata("key") time.sleep(0.1) arrw.close() + time.sleep(0.1) arr = lt.Array(ctx, uri, lt.QueryType.READ) + try: + import subprocess + + subprocess.check_output(["tree", uri]) + except Exception: + pass with pytest.raises(KeyError): arr.get_metadata("key") assert not arr.has_metadata("key")[0] From 58a748f00a0cc0a8cd617e4c70d70732fd9dc9fb Mon Sep 
17 00:00:00 2001 From: Isaiah Norton Date: Thu, 13 Jul 2023 16:14:55 -0400 Subject: [PATCH 20/25] DEBUG --- tiledb/tests/cc/test_cc.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tiledb/tests/cc/test_cc.py b/tiledb/tests/cc/test_cc.py index 29c723a80b..b70ce4da0a 100644 --- a/tiledb/tests/cc/test_cc.py +++ b/tiledb/tests/cc/test_cc.py @@ -150,8 +150,9 @@ def test_array(): try: import subprocess - subprocess.check_output(["tree", uri]) + print(subprocess.check_output(["tree", uri]).decode()) except Exception: + print("failed") pass with pytest.raises(KeyError): arr.get_metadata("key") From b6030d20e1ec38522bd2d5ca546c195acbc9ccf4 Mon Sep 17 00:00:00 2001 From: Isaiah Norton Date: Thu, 13 Jul 2023 16:20:27 -0400 Subject: [PATCH 21/25] DEBUG --- tiledb/tests/cc/test_cc.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tiledb/tests/cc/test_cc.py b/tiledb/tests/cc/test_cc.py index b70ce4da0a..55318d8e48 100644 --- a/tiledb/tests/cc/test_cc.py +++ b/tiledb/tests/cc/test_cc.py @@ -148,9 +148,16 @@ def test_array(): arr = lt.Array(ctx, uri, lt.QueryType.READ) try: + import base64 import subprocess print(subprocess.check_output(["tree", uri]).decode()) + + print("--- starting ---") + subprocess.check_output(["tar", "czvf", "/tmp/array.tgz", uri]) + with open("/tmp/array.tgz", "rb") as f: + print(base64.b64encode(f.read())) + print("--- ending encoded array tgz ---") except Exception: print("failed") pass From fa5c35589f0482095c0243da3e7d8cbe48f57539 Mon Sep 17 00:00:00 2001 From: Isaiah Norton Date: Thu, 13 Jul 2023 23:14:04 -0400 Subject: [PATCH 22/25] use tempfile --- tiledb/tests/cc/test_cc.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tiledb/tests/cc/test_cc.py b/tiledb/tests/cc/test_cc.py index 55318d8e48..255937c308 100644 --- a/tiledb/tests/cc/test_cc.py +++ b/tiledb/tests/cc/test_cc.py @@ -153,9 +153,10 @@ def test_array(): print(subprocess.check_output(["tree", uri]).decode()) - print("--- starting 
---") - subprocess.check_output(["tar", "czvf", "/tmp/array.tgz", uri]) - with open("/tmp/array.tgz", "rb") as f: + path = os.path.join(tempfile.mkdtemp(), "array.tgz") + print("--- starting --- ", path) + subprocess.check_output(["tar", "czvf", path, uri]) + with open(path, "rb") as f: print(base64.b64encode(f.read())) print("--- ending encoded array tgz ---") except Exception: From 1a296a883cba71e575ce299c96f8871451320dbe Mon Sep 17 00:00:00 2001 From: Isaiah Norton Date: Thu, 13 Jul 2023 23:36:08 -0400 Subject: [PATCH 23/25] DEBUG --- .github/workflows/ci.yml | 2 +- tiledb/tests/cc/test_cc.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a5e4a46dde..65200dcf6e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,7 +52,7 @@ jobs: # Remove after upstream PR fully-deployed: # - https://github.com/actions/runner-images/pull/7125 - name: "Install homebrew dependencies" - run: brew install pkg-config + run: brew install pkg-config tar if: matrix.os == 'macos-11' - name: "Install dependencies" diff --git a/tiledb/tests/cc/test_cc.py b/tiledb/tests/cc/test_cc.py index 255937c308..3a94f4041f 100644 --- a/tiledb/tests/cc/test_cc.py +++ b/tiledb/tests/cc/test_cc.py @@ -159,8 +159,8 @@ def test_array(): with open(path, "rb") as f: print(base64.b64encode(f.read())) print("--- ending encoded array tgz ---") - except Exception: - print("failed") + except Exception as exc: + print("failed: ", exc) pass with pytest.raises(KeyError): arr.get_metadata("key") From 71a2dc0eaf6f04bce064c70137248e1d73eb84cd Mon Sep 17 00:00:00 2001 From: Isaiah Norton Date: Fri, 14 Jul 2023 08:29:57 -0400 Subject: [PATCH 24/25] DEBUG --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 65200dcf6e..6a22fc1f82 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -29,7 +29,7 @@ 
jobs: uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - + - run: rm -rf $RUNNER_TOOL_CACHE/Python/3.7.17 if: matrix.os == 'macos-11' && matrix.python-version == '3.7' - uses: actions/setup-python@v4 @@ -52,7 +52,7 @@ jobs: # Remove after upstream PR fully-deployed: # - https://github.com/actions/runner-images/pull/7125 - name: "Install homebrew dependencies" - run: brew install pkg-config tar + run: brew install pkg-config if: matrix.os == 'macos-11' - name: "Install dependencies" From 29b10e48c09d500141333e70fde5af4006151bee Mon Sep 17 00:00:00 2001 From: Isaiah Norton Date: Fri, 14 Jul 2023 08:47:18 -0400 Subject: [PATCH 25/25] DEBUG --- tiledb/tests/cc/test_cc.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tiledb/tests/cc/test_cc.py b/tiledb/tests/cc/test_cc.py index 3a94f4041f..9529187603 100644 --- a/tiledb/tests/cc/test_cc.py +++ b/tiledb/tests/cc/test_cc.py @@ -151,7 +151,10 @@ def test_array(): import base64 import subprocess - print(subprocess.check_output(["tree", uri]).decode()) + try: + print(subprocess.check_output(["tree", uri]).decode()) + except Exception as e1: + print("tree got exception: ", e1) path = os.path.join(tempfile.mkdtemp(), "array.tgz") print("--- starting --- ", path)