Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support variable length dimension label reads #1802

Closed
wants to merge 25 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ jobs:
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}

- run: rm -rf $RUNNER_TOOL_CACHE/Python/3.7.17
if: matrix.os == 'macos-11' && matrix.python-version == '3.7'
- uses: actions/setup-python@v4
Expand Down
92 changes: 92 additions & 0 deletions examples/string_dimension_labels.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# string_dimension_labels.py
#
# LICENSE
#
# The MIT License
#
# Copyright (c) 2023 TileDB, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# DESCRIPTION
#
# Please refer to the TileDB and TileDB-Py documentation for more information:
# https://docs.tiledb.com/main/how-to
# https://tiledb-inc-tiledb.readthedocs-hosted.com/projects/tiledb-py/en/stable/python-api.html
#
# When run, this program will create a simple 2D dense array with a string dimension
# label on the column dimension, and read a slice back using that dimension label.
#

import numpy as np

import tiledb


def create_array(uri: str):
    """Create a 5x5 array at ``uri`` whose column dimension carries a label.

    The array has two dimensions, "row" and "column", each with domain (1, 5),
    and a single int64 attribute "a1". A label named "name" is attached to
    dimension index 1 (the "column" dimension) using the "increasing" order
    and the "ascii" label type.
    """
    row_dim = tiledb.Dim("row", domain=(1, 5))
    col_dim = tiledb.Dim("column", domain=(1, 5))
    attribute = tiledb.Attr("a1", dtype=np.int64)

    # dim_labels maps a dimension index to {label name: label schema}.
    name_label = col_dim.create_label_schema("increasing", "ascii")

    schema = tiledb.ArraySchema(
        domain=tiledb.Domain(row_dim, col_dim),
        attrs=(attribute,),
        dim_labels={1: {"name": name_label}},
    )
    tiledb.Array.create(uri, schema)


def write_array(uri: str):
    """Fill the array at ``uri`` with attribute values and column label values."""
    # The integers 1..25 arranged as a 5x5 grid for attribute "a1".
    values = np.arange(1, 26).reshape(5, 5)
    # One label per column, listed in ascending order.
    names = np.array(["alpha", "beta", "gamma", "kappa", "omega"])

    with tiledb.open(uri, "w") as arr:
        # Writing the full array; the "name" key supplies the label data.
        arr[:, :] = {"a1": values, "name": names}


def read_array(uri: str):
    """Read a slice of the array at ``uri`` through the "name" dimension label.

    Selects row index 1 and the label range "beta" through "kappa" on the
    labelled dimension, then prints each returned field and its values.
    """

    with tiledb.open(uri, "r") as array:
        # label_index(["name"]) lets the second index be given as label values
        # instead of integer coordinates.
        data = array.label_index(["name"])[1, "beta":"kappa"]
        # The labelled dimension is called "column" in the schema, so report
        # that name rather than an abbreviation that does not exist.
        print(
            "Reading array on [[1, 'beta':'kappa']] with label 'name' on dimension 'column'"
        )
        for name, value in data.items():
            print(f" '{name}'={value}")


if __name__ == "__main__":
    # Name of the array to create.
    ARRAY_NAME = "string_dimension_labels"

    LIBVERSION = tiledb.libtiledb.version()

    # Compare (major, minor) as a tuple so that every release older than 2.15
    # is rejected, not only older releases within the 2.x series.
    if tuple(LIBVERSION[:2]) < (2, 15):
        print(
            "Dimension labels requires libtiledb version >= 2.15.0. Current version is"
            f" {LIBVERSION[0]}.{LIBVERSION[1]}.{LIBVERSION[2]}"
        )

    else:
        # Only create and write to the array if it doesn't already exist.
        if tiledb.object_type(ARRAY_NAME) != "array":
            create_array(ARRAY_NAME)
            write_array(ARRAY_NAME)

        # Read from the array and print output.
        read_array(ARRAY_NAME)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
### DO NOT USE ON CI

# Target branch: Note that this should be set to the current core release, not `dev`
TILEDB_VERSION = "2.15.4"
TILEDB_VERSION = "2.16.0" # TODO: Revert

# allow overriding w/ environment variable
TILEDB_VERSION = os.environ.get("TILEDB_VERSION") or TILEDB_VERSION
Expand Down
1 change: 1 addition & 0 deletions tiledb/cc/query.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include <tiledb/tiledb> // C++
#include <tiledb/tiledb_experimental>

#include "common.h"

Expand Down
178 changes: 96 additions & 82 deletions tiledb/core.cc
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ class PyQuery {
tiledb_layout_t layout_ = TILEDB_ROW_MAJOR;

// label buffer list
std::vector<std::pair<string, uint64_t>> label_input_buffer_data_;
std::unordered_map<string, uint64_t> label_input_buffer_data_;

py::object pyschema_;

Expand Down Expand Up @@ -467,21 +467,36 @@ class PyQuery {
return array_schema_->has_attribute(name);
}

// Reports whether `name` is a dimension label in the array schema.
// When the version guard below is not satisfied at compile time, this
// always returns false.
bool is_dimension_label(std::string name) {
#if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15
  const bool has_label = ArraySchemaExperimental::has_dimension_label(
      ctx_, *array_schema_, name);
  return has_label;
#else
  return false;
#endif
}

// Reports whether the buffer called `name` has a cell value number of
// TILEDB_VAR_NUM. The name is looked up as a dimension first, then as an
// attribute, and (when the version guard is satisfied) as a dimension label.
// A name matching none of these is reported through TPY_ERROR_LOC.
bool is_var(std::string name) {
  if (is_dimension(name)) {
    auto dim = domain_->dimension(name);
    return dim.cell_val_num() == TILEDB_VAR_NUM;
  } else if (is_attribute(name)) {
    auto attr = array_schema_->attribute(name);
    return attr.cell_val_num() == TILEDB_VAR_NUM;
#if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15
  } else if (is_dimension_label(name)) {
    auto dim_label =
        ArraySchemaExperimental::dimension_label(ctx_, *array_schema_, name);
    return dim_label.label_cell_val_num() == TILEDB_VAR_NUM;
#endif
  } else {
    // Name the offending buffer and list every kind of field that is
    // accepted, including dimension labels.
    TPY_ERROR_LOC("Unknown buffer '" + name +
                  "' for is_var check (expected attribute, dimension, or "
                  "dimension label)");
  }
}

bool is_nullable(std::string name) {
if (is_dimension(name)) {
if (is_dimension(name) || is_dimension_label(name)) {
return false;
}

Expand All @@ -498,6 +513,13 @@ class PyQuery {
} else if (is_attribute(name)) {
type = array_schema_->attribute(name).type();
cell_val_num = array_schema_->attribute(name).cell_val_num();
#if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15
} else if (is_dimension_label(name)) {
auto dim_label =
ArraySchemaExperimental::dimension_label(ctx_, *array_schema_, name);
type = dim_label.label_type();
cell_val_num = dim_label.label_cell_val_num();
#endif
} else {
TPY_ERROR_LOC("Unknown buffer '" + name + "'");
}
Expand Down Expand Up @@ -543,51 +565,72 @@ class PyQuery {
}

void alloc_buffer(std::string name) {

tiledb_datatype_t type;
uint32_t cell_val_num;
std::tie(type, cell_val_num) = buffer_type(name);
uint64_t cell_nbytes = tiledb_datatype_size(type);
if (cell_val_num != TILEDB_VAR_NUM)
cell_nbytes *= cell_val_num;
auto dtype = tiledb_dtype(type, cell_val_num);

uint64_t cell_nbytes;
bool var;
bool nullable;
uint64_t buf_nbytes = 0;
uint64_t offsets_num = 0;
uint64_t validity_num = 0;

bool var = is_var(name);
bool nullable = is_nullable(name);
bool dense = array_schema_->array_type() == TILEDB_DENSE;

if (retries_ < 1 && dense) {
// we must not call after submitting
if (nullable && var) {
auto sizes = query_->est_result_size_var_nullable(name);
offsets_num = sizes[0];
buf_nbytes = sizes[1];
validity_num = sizes[2] / sizeof(uint8_t);
} else if (nullable && !var) {
auto sizes = query_->est_result_size_nullable(name);
buf_nbytes = sizes[0];
validity_num = sizes[1] / sizeof(uint8_t);
} else if (!nullable && var) {
auto size_pair = query_->est_result_size_var(name);
if (is_dimension_label(name)) {
#if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15
auto dim_label =
ArraySchemaExperimental::dimension_label(ctx_, *array_schema_, name);
type = dim_label.label_type();
cell_val_num = dim_label.label_cell_val_num();
var = cell_val_num == TILEDB_VAR_NUM;
nullable = false;

cell_nbytes = tiledb_datatype_size(type);
uint64_t ncells = label_input_buffer_data_[name];

if (!var) {
cell_nbytes *= cell_val_num;
} else {
offsets_num = ncells;
}
buf_nbytes = ncells * cell_nbytes;
#endif
} else {
std::tie(type, cell_val_num) = buffer_type(name);
cell_nbytes = tiledb_datatype_size(type);
if (cell_val_num != TILEDB_VAR_NUM) {
cell_nbytes *= cell_val_num;
}
var = is_var(name);
nullable = is_nullable(name);

if (retries_ < 1 && dense) {
// we must not call after submitting
if (nullable && var) {
auto sizes = query_->est_result_size_var_nullable(name);
offsets_num = sizes[0];
buf_nbytes = sizes[1];
validity_num = sizes[2] / sizeof(uint8_t);
} else if (nullable && !var) {
auto sizes = query_->est_result_size_nullable(name);
buf_nbytes = sizes[0];
validity_num = sizes[1] / sizeof(uint8_t);
} else if (!nullable && var) {
auto size_pair = query_->est_result_size_var(name);
#if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR < 2
buf_nbytes = size_pair.first;
offsets_num = size_pair.second;
buf_nbytes = size_pair.first;
offsets_num = size_pair.second;
#else
buf_nbytes = size_pair[0];
offsets_num = size_pair[1];
buf_nbytes = size_pair[0];
offsets_num = size_pair[1];
#endif
} else { // !nullable && !var
buf_nbytes = query_->est_result_size(name);
}
} else { // !nullable && !var
buf_nbytes = query_->est_result_size(name);
}

// Add extra offset to estimate in order to avoid incomplete resubmit
// libtiledb 2.7.* does not include extra element in estimate.
// Remove this section after resolution of SC-16301.
offsets_num += (var && use_arrow_) ? 1 : 0;
// Add extra offset to estimate in order to avoid incomplete resubmit
// libtiledb 2.7.* does not include extra element in estimate.
// Remove this section after resolution of SC-16301.
offsets_num += (var && use_arrow_) ? 1 : 0;
}
}

// - for sparse arrays: don't try to allocate more than alloc_max_bytes_
Expand Down Expand Up @@ -619,44 +662,8 @@ class PyQuery {
validity_num, var, nullable)});
}

#if TILEDB_VERSION_MAJOR == 2 && TILEDB_VERSION_MINOR >= 15
void alloc_label_buffer(std::string &label_name, uint64_t ncells) {

auto dim_label = ArraySchemaExperimental::dimension_label(
ctx_, *array_schema_, label_name);

tiledb_datatype_t type = dim_label.label_type();
uint32_t cell_val_num = dim_label.label_cell_val_num();
uint64_t cell_nbytes = tiledb_datatype_size(type);
if (cell_val_num != TILEDB_VAR_NUM) {
cell_nbytes *= cell_val_num;
} else {
throw TileDBError(
"reading variable length dimension labels is not yet supported");
}
auto dtype = tiledb_dtype(type, cell_val_num);

uint64_t buf_nbytes = ncells * cell_nbytes;
uint64_t offsets_num = 0;
uint64_t validity_num = 0;

bool var = cell_val_num == TILEDB_VAR_NUM;
bool nullable = false;

buffers_order_.push_back(label_name);
buffers_.insert(
{label_name, BufferInfo(label_name, buf_nbytes, type, cell_val_num,
offsets_num, validity_num, var, nullable)});
}
#else
void alloc_label_buffer(std::string &, uint64_t) {
throw TileDBError(
"Using dimension labels requires libtiledb version 2.15.0 or greater");
}
#endif

void add_label_buffer(std::string &label_name, uint64_t ncells) {
label_input_buffer_data_.push_back({label_name, ncells});
label_input_buffer_data_[label_name] = ncells;
}

py::object get_buffers() {
Expand Down Expand Up @@ -712,7 +719,10 @@ class PyQuery {
}

void update_read_elem_num() {
#if TILEDB_VERSION_MAJOR >= 2 && TILEDB_VERSION_MINOR >= 3
#if TILEDB_VERSION_MAJOR >= 2 && TILEDB_VERSION_MINOR >= 16
auto result_elements =
QueryExperimental::result_buffer_elements_nullable_labels(*query_);
#elif TILEDB_VERSION_MAJOR >= 2 && TILEDB_VERSION_MINOR >= 3
// needs https://github.com/TileDB-Inc/TileDB/pull/2238
auto result_elements = query_->result_buffer_elements_nullable();
#else
Expand Down Expand Up @@ -763,16 +773,20 @@ class PyQuery {

if ((Py_ssize_t)(buf.data_vals_read * buf.elem_nbytes) >
(Py_ssize_t)buf.data.size()) {
throw TileDBError("After read query, data buffer out of bounds: " +
name);
throw TileDBError(
"After read query, data buffer out of bounds: " + name + " (" +
std::to_string(buf.data_vals_read * buf.elem_nbytes) + " > " +
std::to_string(buf.data.size()) + ")");
}
if ((Py_ssize_t)buf.offsets_read > buf.offsets.size()) {
throw TileDBError("After read query, offsets buffer out of bounds: " +
name);
name + " (" + std::to_string(buf.offsets_read) +
" > " + std::to_string(buf.offsets.size()) + ")");
}
if ((Py_ssize_t)buf.validity_vals_read > buf.validity.size()) {
throw TileDBError("After read query, validity buffer out of bounds: " +
name);
name + " (" + std::to_string(buf.validity_vals_read) +
" > " + std::to_string(buf.validity.size()) + ")");
}
}
}
Expand Down Expand Up @@ -936,8 +950,8 @@ class PyQuery {
}

// allocate buffers for label dimensions
for (auto &label_data : label_input_buffer_data_) {
alloc_label_buffer(label_data.first, label_data.second);
for (const auto &label_data : label_input_buffer_data_) {
alloc_buffer(label_data.first);
}

// allocate buffers for attributes
Expand Down Expand Up @@ -1394,7 +1408,7 @@ class PyQuery {
}
}

}; // class PyQuery
}; // namespace tiledbpy

void init_stats() {
g_stats.reset(new StatsInfo());
Expand Down
Loading